X-Git-Url: https://git.openssl.org/?p=openssl.git;a=blobdiff_plain;f=crypto%2Faes%2Fasm%2Faesni-sha256-x86_64.pl;h=74ec8449528bc5b51427d37b28c85b0341614e27;hp=19b0433b3b1bdbac254cc6c2af30bd548f0c5442;hb=08d09628d2c9f3ef599399d8cad021a07ab98347;hpb=82c4a079395223c0063100854a0981976db9848f diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl index 19b0433b3b..74ec844952 100644 --- a/crypto/aes/asm/aesni-sha256-x86_64.pl +++ b/crypto/aes/asm/aesni-sha256-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov for the OpenSSL @@ -25,9 +32,10 @@ # Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% # Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% # Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% +# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40% # Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% # -# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that +# (*) there are XOP, AVX1 and AVX2 code paths, meaning that # Westmere is omitted from loop, this is because gain was not # estimated high enough to justify the effort; # (**) these are EVP-free results, results obtained with 'speed @@ -59,14 +67,14 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && $avx = ($1>=10) + ($1>=12); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } $shaext=$avx; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $func="aesni_cbc_sha256_enc"; @@ -101,7 +109,7 @@ $_key="16*$SZ+3*8(%rsp)"; $_ivp="16*$SZ+4*8(%rsp)"; $_ctx="16*$SZ+5*8(%rsp)"; $_in0="16*$SZ+6*8(%rsp)"; -$_rsp="16*$SZ+7*8(%rsp)"; +$_rsp="`16*$SZ+7*8`(%rsp)"; $framesz=16*$SZ+8*8; $code=<<___; @@ -139,11 +147,8 @@ $code.=<<___ if ($avx>1); je ${func}_avx2 ___ $code.=<<___; - and \$`1<<30`,%eax # mask "Intel CPU" bit - and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits - or %eax,%r10d - cmp \$`1<<28|1<<9|1<<30`,%r10d - je ${func}_avx + and \$`1<<28`,%r10d # check for AVX + jnz ${func}_avx ud2 ___ } @@ -334,15 +339,23 @@ $code.=<<___; .type ${func}_xop,\@function,6 .align 64 ${func}_xop: +.cfi_startproc .Lxop_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame @@ -358,7 +371,8 @@ ${func}_xop: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -596,6 +610,7 @@ $code.=<<___; mov $_ivp,$ivp mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ @@ -612,15 +627,23 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_xop: ret +.cfi_endproc .size ${func}_xop,.-${func}_xop ___ ###################################################################### @@ -632,15 +655,23 @@ $code.=<<___; .type ${func}_avx,\@function,6 .align 64 ${func}_avx: +.cfi_startproc .Lavx_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame @@ -656,7 +687,8 @@ ${func}_avx: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -847,6 +879,7 @@ $code.=<<___; mov $_ivp,$ivp mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ @@ -863,15 +896,23 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: ret +.cfi_endproc .size ${func}_avx,.-${func}_avx ___ @@ -879,7 +920,7 @@ if ($avx>1) {{ ###################################################################### # AVX2+BMI code path # -my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp my $PUSH8=8*2*$SZ; use integer; @@ -928,15 +969,23 @@ $code.=<<___; .type ${func}_avx2,\@function,6 .align 64 ${func}_avx2: +.cfi_startproc .Lavx2_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp and \$-256*$SZ,%rsp # align stack frame add \$`2*$SZ*($rounds-8)`,%rsp @@ -953,7 +1002,8 @@ ${func}_avx2: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -1184,6 +1234,7 @@ $code.=<<___; lea ($Tbl),%rsp mov $_ivp,$ivp mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ @@ -1200,15 +1251,23 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: ret +.cfi_endproc .size ${func}_avx2,.-${func}_avx2 ___ }} @@ -1302,6 +1361,7 @@ $code.=<<___; mov 240($key),$rounds sub $in0,$out movups ($key),$rndkey0 # $key[0] + movups ($ivp),$iv # load IV movups 16($key),$rndkey[0] # forward reference lea 112($key),$key # size optimization @@ -1564,7 +1624,6 @@ ___ $code.=<<___; mov %rax,%rsi # put aside Rsp mov 16*$SZ+7*8(%rax),%rax # pull $_rsp - lea 48(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp