X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Faes%2Fasm%2Faesni-sha256-x86_64.pl;h=5521766a6a7dcf3f3df589b610f4784a17c0b306;hp=b6ad7b29ed43c63056402b0cc52fa6df08c56253;hb=HEAD;hpb=619b94667cc7a097f6d1e2123c4f4c2c85afb8f7 diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl index b6ad7b29ed..5c8bb0fbcc 100644 --- a/crypto/aes/asm/aesni-sha256-x86_64.pl +++ b/crypto/aes/asm/aesni-sha256-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov for the OpenSSL @@ -21,21 +28,26 @@ # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched # subroutine: # -# AES-128/-192/-256+SHA256 this(**)gain -# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% -# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% -# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% -# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% +# AES-128/-192/-256+SHA256 this(**) gain +# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% +# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% +# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% +# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40% +# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% +# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54% +# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60% # -# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that +# (*) there are XOP, AVX1 and AVX2 code paths, meaning that # Westmere is omitted from loop, this is because gain was not # estimated high enough to justify the effort; # (**) these are EVP-free results, results obtained with 'speed # -evp aes-256-cbc-hmac-sha256' will vary by percent or two; +# (***) these are SHAEXT results; -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); @@ -56,10 +68,18 @@ if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); + $avx = ($1>=10) + ($1>=12); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); } -open OUT,"| \"$^X\" $xlate $flavour $output"; +$shaext=$avx; ### set to zero if compiling for 1.0.1 +$avx=1 if (!$shaext && $avx); + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; *STDOUT=*OUT; $func="aesni_cbc_sha256_enc"; @@ -94,7 +114,7 @@ $_key="16*$SZ+3*8(%rsp)"; $_ivp="16*$SZ+4*8(%rsp)"; $_ctx="16*$SZ+5*8(%rsp)"; $_in0="16*$SZ+6*8(%rsp)"; -$_rsp="16*$SZ+7*8(%rsp)"; +$_rsp="`16*$SZ+7*8`(%rsp)"; $framesz=16*$SZ+8*8; $code=<<___; @@ -105,18 +125,22 @@ $code=<<___; .type $func,\@abi-omnipotent .align 16 $func: +.cfi_startproc ___ -$code.=<<___ if ($avx); + if ($avx) { +$code.=<<___; lea OPENSSL_ia32cap_P(%rip),%r11 mov \$1,%eax cmp \$0,`$win64?"%rcx":"%rdi"` je .Lprobe mov 0(%r11),%eax mov 4(%r11),%r10 - +___ +$code.=<<___ if ($shaext); bt \$61,%r10 # check for SHA jc ${func}_shaext - +___ +$code.=<<___; mov %r10,%r11 shr \$32,%r11 @@ -128,14 +152,12 @@ $code.=<<___ if ($avx>1); cmp \$`1<<8|1<<5|1<<3`,%r11d je ${func}_avx2 ___ -$code.=<<___ if ($avx); - and \$`1<<30`,%eax # mask "Intel CPU" bit - and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits - or %eax,%r10d - cmp \$`1<<28|1<<9|1<<30`,%r10d - je ${func}_avx +$code.=<<___; + and \$`1<<28`,%r10d # check for AVX + jnz ${func}_avx ud2 ___ + } $code.=<<___; xor %eax,%eax cmp \$0,`$win64?"%rcx":"%rdi"` @@ -143,8 +165,10 @@ $code.=<<___; ud2 .Lprobe: ret +.cfi_endproc .size $func,.-$func +.section .rodata align=64 .align 64 .type $TABLE,\@object $TABLE: @@ -187,6 +211,7 @@ $TABLE: .long 0,0,0,0, 0,0,0,0 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by " .align 64 +.previous ___ ###################################################################### @@ -323,15 +348,23 @@ $code.=<<___; .type ${func}_xop,\@function,6 .align 64 ${func}_xop: +.cfi_startproc .Lxop_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame @@ -347,7 +380,8 @@ ${func}_xop: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -585,6 +619,7 @@ $code.=<<___; mov $_ivp,$ivp mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ @@ -601,15 +636,23 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_xop: ret +.cfi_endproc .size 
${func}_xop,.-${func}_xop ___ ###################################################################### @@ -621,15 +664,23 @@ $code.=<<___; .type ${func}_avx,\@function,6 .align 64 ${func}_avx: +.cfi_startproc .Lavx_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame @@ -645,7 +696,8 @@ ${func}_avx: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -836,6 +888,7 @@ $code.=<<___; mov $_ivp,$ivp mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ @@ -852,15 +905,23 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: ret +.cfi_endproc .size ${func}_avx,.-${func}_avx ___ @@ -868,7 +929,7 @@ if ($avx>1) {{ ###################################################################### # AVX2+BMI code path # -my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp my $PUSH8=8*2*$SZ; use integer; @@ -917,15 +978,23 @@ $code.=<<___; .type ${func}_avx2,\@function,6 .align 64 ${func}_avx2: +.cfi_startproc .Lavx2_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp and \$-256*$SZ,%rsp # align stack frame add \$`2*$SZ*($rounds-8)`,%rsp @@ -942,7 +1011,8 @@ ${func}_avx2: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -1020,7 +1090,23 @@ $code.=<<___; vmovdqa $t0,0x00(%rsp) xor $a1,$a1 vmovdqa $t1,0x20(%rsp) +___ +$code.=<<___ if (!$win64); +# temporarily use %rsi as frame pointer + mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 +___ +$code.=<<___; lea -$PUSH8(%rsp),%rsp +___ +$code.=<<___ if (!$win64); +# the frame info is at $_rsp, but the stack is moving... 
+# so a second frame pointer is saved at -8(%rsp) +# that is in the red zone + mov %rsi,-8(%rsp) +.cfi_cfa_expression %rsp-8,deref,+8 +___ +$code.=<<___; mov $B,$a3 vmovdqa $t2,0x00(%rsp) xor $C,$a3 # magic @@ -1042,7 +1128,17 @@ my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 96 instructions my $base = "+2*$PUSH8(%rsp)"; - &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); + if (($j%2)==0) { + &lea ("%rsp","-$PUSH8(%rsp)"); +$code.=<<___ if (!$win64); +.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 +# copy secondary frame pointer to new location again at -8(%rsp) + pushq $PUSH8-8(%rsp) +.cfi_cfa_expression %rsp,deref,+8 + lea 8(%rsp),%rsp +.cfi_cfa_expression %rsp-8,deref,+8 +___ + } foreach (Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); @@ -1168,36 +1264,47 @@ $code.=<<___; jbe .Loop_avx2 lea (%rsp),$Tbl +# temporarily use $Tbl as index to $_rsp +# this avoids the need to save a secondary frame pointer at -8(%rsp) +.cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8 .Ldone_avx2: - lea ($Tbl),%rsp - mov $_ivp,$ivp - mov $_rsp,%rsi + mov 16*$SZ+4*8($Tbl),$ivp + mov 16*$SZ+7*8($Tbl),%rsi +.cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); - movaps `$framesz+16*0`(%rsp),%xmm6 - movaps `$framesz+16*1`(%rsp),%xmm7 - movaps `$framesz+16*2`(%rsp),%xmm8 - movaps `$framesz+16*3`(%rsp),%xmm9 - movaps `$framesz+16*4`(%rsp),%xmm10 - movaps `$framesz+16*5`(%rsp),%xmm11 - movaps `$framesz+16*6`(%rsp),%xmm12 - movaps `$framesz+16*7`(%rsp),%xmm13 - movaps `$framesz+16*8`(%rsp),%xmm14 - movaps `$framesz+16*9`(%rsp),%xmm15 + movaps `$framesz+16*0`($Tbl),%xmm6 + movaps `$framesz+16*1`($Tbl),%xmm7 + movaps `$framesz+16*2`($Tbl),%xmm8 + movaps `$framesz+16*3`($Tbl),%xmm9 + movaps `$framesz+16*4`($Tbl),%xmm10 + movaps `$framesz+16*5`($Tbl),%xmm11 + movaps `$framesz+16*6`($Tbl),%xmm12 + movaps `$framesz+16*7`($Tbl),%xmm13 + movaps `$framesz+16*8`($Tbl),%xmm14 + movaps `$framesz+16*9`($Tbl),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx2: ret +.cfi_endproc .size ${func}_avx2,.-${func}_avx2 ___ }} @@ -1259,16 +1366,18 @@ ___ $r++; unshift(@rndkey,pop(@rndkey)); }; +if ($shaext) { +my $Tbl="%rax"; + $code.=<<___; .type ${func}_shaext,\@function,6 .align 32 ${func}_shaext: - mov %rsp,%rax +.cfi_startproc mov `($win64?56:8)`(%rsp),$inp # load 7th argument - push %rbx ___ $code.=<<___ if ($win64); - lea `-4*16`(%rsp),%rsp + lea `-8-10*16`(%rsp),%rsp movaps %xmm6,-8-10*16(%rax) movaps %xmm7,-8-9*16(%rax) movaps %xmm8,-8-8*16(%rax) @@ -1290,6 +1399,7 @@ $code.=<<___; mov 240($key),$rounds sub $in0,$out movups ($key),$rndkey0 # $key[0] + movups ($ivp),$iv # load IV movups 16($key),$rndkey[0] # forward reference lea 112($key),$key # size optimization @@ -1465,35 +1575,36 @@ $code.=<<___; movdqu $CDGH,16($ctx) ___ $code.=<<___ if ($win64); - movaps -8-10*16(%rax),%xmm6 - movaps -8-9*16(%rax),%xmm7 - movaps -8-8*16(%rax),%xmm8 - movaps -8-7*16(%rax),%xmm9 - movaps -8-6*16(%rax),%xmm10 - movaps -8-5*16(%rax),%xmm11 - movaps -8-4*16(%rax),%xmm12 - movaps -8-3*16(%rax),%xmm13 - movaps -8-2*16(%rax),%xmm14 - movaps -8-1*16(%rax),%xmm15 + movaps 0*16(%rsp),%xmm6 + 
movaps 1*16(%rsp),%xmm7 + movaps 2*16(%rsp),%xmm8 + movaps 3*16(%rsp),%xmm9 + movaps 4*16(%rsp),%xmm10 + movaps 5*16(%rsp),%xmm11 + movaps 6*16(%rsp),%xmm12 + movaps 7*16(%rsp),%xmm13 + movaps 8*16(%rsp),%xmm14 + movaps 9*16(%rsp),%xmm15 + lea 8+10*16(%rsp),%rsp .Lepilogue_shaext: ___ $code.=<<___; - mov -8(%rax),%rbx - mov %rax,%rsp ret +.cfi_endproc .size ${func}_shaext,.-${func}_shaext ___ +} }}}}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { +if ($win64 && $avx) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; -$code.=<<___ if ($avx); +$code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 @@ -1527,6 +1638,19 @@ se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue ___ +$code.=<<___ if ($shaext); + lea aesni_cbc_sha256_enc_shaext(%rip),%r10 + cmp %r10,%rbx + jb .Lnot_in_shaext + + lea (%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + lea 168(%rax),%rax # adjust stack pointer + jmp .Lin_prologue +.Lnot_in_shaext: +___ $code.=<<___ if ($avx>1); lea .Lavx2_shortcut(%rip),%r10 cmp %r10,%rbx # context->Rip1); .rva .LSEH_end_${func}_avx2 .rva .LSEH_info_${func}_avx2 ___ -$code.=<<___ if ($avx); +$code.=<<___ if ($shaext); + .rva .LSEH_begin_${func}_shaext + .rva .LSEH_end_${func}_shaext + .rva .LSEH_info_${func}_shaext +___ +$code.=<<___; .section .xdata .align 8 .LSEH_info_${func}_xop: @@ -1632,6 +1760,12 @@ $code.=<<___ if ($avx>1); .rva se_handler .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] ___ +$code.=<<___ if ($shaext); +.LSEH_info_${func}_shaext: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] +___ } #################################################################### @@ -1654,7 +1788,7 @@ sub rex { sub sha256op38 { my $instr = shift; - if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x0f,0x38); rex(\@opcode,$2,$1); push @opcode,$opcodelet{$instr}; @@ -1669,4 +1803,4 @@ sub rex { $code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem; print $code; -close STDOUT; +close STDOUT or die "error closing STDOUT: $!";
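
For reference, the capability probes touched in this patch boil down to a small decision: an ml64 banner of version 10 or newer enables the AVX path and 12 or newer the AVX2 path (raised from 11), a newly added clang/LLVM banner probe enables AVX at 3.0 and AVX2 above 3.0, and the SHAEXT path rides on the same $avx gate. The following is a minimal standalone Perl sketch of that logic, not part of the patch; the tool banners are hypothetical examples, and only the thresholds visible in the hunks above are mirrored.

#!/usr/bin/env perl
# Sketch only: replays the $avx/$shaext probe logic from the hunks above
# against canned (hypothetical) tool banners instead of running ml64/cc.
use strict;
use warnings;

my $avx = 0;

# MASM probe: AVX needs ml64 >= 10, AVX2 needs >= 12 after this change.
my $ml64_banner = "Microsoft (R) Macro Assembler (x64) Version 14.00";   # hypothetical banner
if ($ml64_banner =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

# clang/LLVM probe added by the patch: AVX at 3.0, AVX2 above 3.0.
my $cc_banner = "clang version 3.4 (tags/RELEASE_34/final)";             # hypothetical banner
if (!$avx && $cc_banner =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# SHAEXT code is emitted whenever any AVX path is; the patch notes it
# should be set to zero when building for OpenSSL 1.0.1.
my $shaext = $avx;
$avx = 1 if (!$shaext && $avx);

printf "avx=%d shaext=%d\n", $avx, $shaext;   # prints "avx=2 shaext=2" here

The patch also replaces the old "flavour then output" positional parsing with expressions that sniff @ARGV: the last argument is taken as $output if it looks like a file, and the first as $flavour if it does not. A second small sketch, again not part of the patch and using made-up argument lists, walks the same two expressions through three typical invocations.

# Sketch only: classify @ARGV the way the new $output/$flavour hunk does.
use strict;
use warnings;

for my $argv (["elf", "foo.S"], ["foo.S"], ["mingw64"]) {
	local @ARGV = @$argv;
	# identical expressions to the hunk above
	my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop(@ARGV)   : undef;
	my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift(@ARGV) : undef;
	printf "args=(%s) -> flavour=%s output=%s\n",
	       join(",", @$argv), $flavour // "undef", $output // "undef";
}
# prints:
#   args=(elf,foo.S) -> flavour=elf output=foo.S
#   args=(foo.S) -> flavour=undef output=foo.S
#   args=(mingw64) -> flavour=mingw64 output=undef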