X-Git-Url: https://git.openssl.org/?a=blobdiff_plain;f=crypto%2Fsha%2Fasm%2Fsha1-x86_64.pl;h=73ac8ee5cecec6dd98495ff9ce12ff7e4a55c0a0;hb=54f8f9a1edfcf1a2d2df801728e462841e2c316e;hp=6dc64a2eadf0121efd15f310a16cb6d649e6a892;hpb=977f32e85241cba8be53e44dade32231e8a91718;p=openssl.git

diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index 6dc64a2ead..73ac8ee5ce 100755
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -73,13 +80,17 @@
 # Sandy Bridge	7.70	6.10/+26%	4.99/+54%
 # Ivy Bridge	6.06	4.67/+30%	4.60/+32%
 # Haswell	5.45	4.15/+31%	3.57/+53%
+# Skylake	5.18	4.06/+28%	3.54/+46%
 # Bulldozer	9.11	5.95/+53%
+# Ryzen		4.75	3.80/+24%	1.93/+150%(**)
 # VIA Nano	9.32	7.15/+30%
 # Atom		10.3	9.17/+12%
 # Silvermont	13.1(*)	9.37/+40%
+# Goldmont	8.13	6.42/+27%	1.70/+380%(**)
 #
 # (*)	obviously suboptimal result, nothing was done about it,
 #	because SSSE3 code is compiled unconditionally;
+# (**)	SHAEXT result
 
 $flavour = shift;
 $output  = shift;
@@ -107,10 +118,14 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	   $avx = ($1>=10) + ($1>=11);
 }
 
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([2-9]\.[0-9]+)/) {
+	$avx = ($2>=3.0) + ($2>3.0);
+}
+
 $shaext=1;	### set to zero if compiling for 1.0.1
 
 $avx=1 if (!$shaext && $avx);
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
 $ctx="%rdi";	# 1st arg
@@ -243,6 +258,7 @@ $code.=<<___;
 .type	sha1_block_data_order,\@function,3
 .align	16
 sha1_block_data_order:
+.cfi_startproc
 	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
 	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
 	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
@@ -250,7 +266,7 @@ sha1_block_data_order:
 	jz	.Lialu
 ___
 $code.=<<___ if ($shaext);
-	test	\$`1<<29`,%r10d # check SHA bit
+	test	\$`1<<29`,%r10d		# check SHA bit
 	jnz	_shaext_shortcut
 ___
 $code.=<<___ if ($avx>1);
@@ -271,17 +287,24 @@ $code.=<<___;
 .align	16
 .Lialu:
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	mov	%rdi,$ctx	# reassigned argument
 	sub	\$`8+16*4`,%rsp
 	mov	%rsi,$inp	# reassigned argument
 	and	\$-64,%rsp
 	mov	%rdx,$num	# reassigned argument
 	mov	%rax,`16*4`(%rsp)
+.cfi_cfa_expression	%rsp+64,deref,+8
 .Lprologue:
 
 	mov	0($ctx),$A
@@ -315,14 +338,22 @@ $code.=<<___;
 	jnz	.Lloop
 
 	mov	`16*4`(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lepilogue:
 	ret
+.cfi_endproc
 .size	sha1_block_data_order,.-sha1_block_data_order
 ___
 if ($shaext) {{{
@@ -338,6 +369,7 @@ $code.=<<___;
 .align	32
 sha1_block_data_order_shaext:
 _shaext_shortcut:
+.cfi_startproc
 ___
 $code.=<<___ if ($win64);
 	lea	`-8-4*16`(%rsp),%rsp
@@ -368,9 +400,9 @@ $code.=<<___;
 .align	16
 .Loop_shaext:
 	dec	$num
-	lea	0x40($inp),%rax		# next input block
+	lea	0x40($inp),%r8		# next input block
 	paddd	@MSG[0],$E
-	cmovne	%rax,$inp
+	cmovne	%r8,$inp
 	movdqa	$ABCD,$ABCD_SAVE	# offload $ABCD
 ___
 for($i=0;$i<20-4;$i+=2) {
@@ -435,6 +467,7 @@ $code.=<<___ if ($win64);
 .Lepilogue_shaext:
 ___
 $code.=<<___;
+.cfi_endproc
 	ret
 .size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
 ___
@@ -448,7 +481,8 @@ my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
 my @T=("%esi","%edi");
 my $j=0;
 my $rx=0;
-my $K_XX_XX="%r11";
+my $K_XX_XX="%r14";
+my $fp="%r11";
 
 my $_rol=sub { &rol(@_) };
 my $_ror=sub { &ror(@_) };
@@ -469,25 +503,31 @@ $code.=<<___;
 .align	16
 sha1_block_data_order_ssse3:
 _ssse3_shortcut:
-	mov	%rsp,%rax
+.cfi_startproc
+	mov	%rsp,$fp	# frame pointer
+.cfi_def_cfa_register	$fp
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13		# redundant, done to share Win64 SE handler
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
 ___
 $code.=<<___ if ($win64);
-	movaps	%xmm6,-40-6*16(%rax)
-	movaps	%xmm7,-40-5*16(%rax)
-	movaps	%xmm8,-40-4*16(%rax)
-	movaps	%xmm9,-40-3*16(%rax)
-	movaps	%xmm10,-40-2*16(%rax)
-	movaps	%xmm11,-40-1*16(%rax)
+	movaps	%xmm6,-40-6*16($fp)
+	movaps	%xmm7,-40-5*16($fp)
+	movaps	%xmm8,-40-4*16($fp)
+	movaps	%xmm9,-40-3*16($fp)
+	movaps	%xmm10,-40-2*16($fp)
+	movaps	%xmm11,-40-1*16($fp)
 .Lprologue_ssse3:
 ___
 $code.=<<___;
-	mov	%rax,%r14	# original %rsp
 	and	\$-64,%rsp
 	mov	%rdi,$ctx	# reassigned argument
 	mov	%rsi,$inp	# reassigned argument
@@ -537,7 +577,7 @@ sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
   $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 }
 
-sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
 { use integer;
   my $body = shift;
   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
@@ -894,23 +934,29 @@ $code.=<<___;
 	mov	$E,16($ctx)
 ___
 $code.=<<___ if ($win64);
-	movaps	-40-6*16(%r14),%xmm6
-	movaps	-40-5*16(%r14),%xmm7
-	movaps	-40-4*16(%r14),%xmm8
-	movaps	-40-3*16(%r14),%xmm9
-	movaps	-40-2*16(%r14),%xmm10
-	movaps	-40-1*16(%r14),%xmm11
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
 ___
 $code.=<<___;
-	lea	(%r14),%rsi
-	mov	-40(%rsi),%r14
-	mov	-32(%rsi),%r13
-	mov	-24(%rsi),%r12
-	mov	-16(%rsi),%rbp
-	mov	-8(%rsi),%rbx
-	lea	(%rsi),%rsp
+	mov	-40($fp),%r14
+.cfi_restore	%r14
+	mov	-32($fp),%r13
+.cfi_restore	%r13
+	mov	-24($fp),%r12
+.cfi_restore	%r12
+	mov	-16($fp),%rbp
+.cfi_restore	%rbp
+	mov	-8($fp),%rbx
+.cfi_restore	%rbx
+	lea	($fp),%rsp
+.cfi_def_cfa_register	%rsp
 .Lepilogue_ssse3:
 	ret
+.cfi_endproc
 .size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
 ___
 
@@ -931,26 +977,32 @@ $code.=<<___;
 .align	16
 sha1_block_data_order_avx:
 _avx_shortcut:
-	mov	%rsp,%rax
+.cfi_startproc
+	mov	%rsp,$fp
+.cfi_def_cfa_register	$fp
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13		# redundant, done to share Win64 SE handler
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
-	vmovaps	%xmm6,-40-6*16(%rax)
-	vmovaps	%xmm7,-40-5*16(%rax)
-	vmovaps	%xmm8,-40-4*16(%rax)
-	vmovaps	%xmm9,-40-3*16(%rax)
-	vmovaps	%xmm10,-40-2*16(%rax)
-	vmovaps	%xmm11,-40-1*16(%rax)
+	vmovaps	%xmm6,-40-6*16($fp)
+	vmovaps	%xmm7,-40-5*16($fp)
+	vmovaps	%xmm8,-40-4*16($fp)
+	vmovaps	%xmm9,-40-3*16($fp)
+	vmovaps	%xmm10,-40-2*16($fp)
+	vmovaps	%xmm11,-40-1*16($fp)
 .Lprologue_avx:
 ___
 $code.=<<___;
-	mov	%rax,%r14	# original %rsp
 	and	\$-64,%rsp
 	mov	%rdi,$ctx	# reassigned argument
 	mov	%rsi,$inp	# reassigned argument
@@ -990,7 +1042,7 @@ $code.=<<___;
 	jmp	.Loop_avx
 ___
 
-sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
+sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
 { use integer;
   my $body = shift;
   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
@@ -1258,23 +1310,29 @@ $code.=<<___;
 	mov	$E,16($ctx)
 ___
 $code.=<<___ if ($win64);
-	movaps	-40-6*16(%r14),%xmm6
-	movaps	-40-5*16(%r14),%xmm7
-	movaps	-40-4*16(%r14),%xmm8
-	movaps	-40-3*16(%r14),%xmm9
-	movaps	-40-2*16(%r14),%xmm10
-	movaps	-40-1*16(%r14),%xmm11
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
 ___
 $code.=<<___;
-	lea	(%r14),%rsi
-	mov	-40(%rsi),%r14
-	mov	-32(%rsi),%r13
-	mov	-24(%rsi),%r12
-	mov	-16(%rsi),%rbp
-	mov	-8(%rsi),%rbx
-	lea	(%rsi),%rsp
+	mov	-40($fp),%r14
+.cfi_restore	%r14
+	mov	-32($fp),%r13
+.cfi_restore	%r13
+	mov	-24($fp),%r12
+.cfi_restore	%r12
+	mov	-16($fp),%rbp
+.cfi_restore	%rbp
+	mov	-8($fp),%rbx
+.cfi_restore	%rbx
+	lea	($fp),%rsp
+.cfi_def_cfa_register	%rsp
 .Lepilogue_avx:
 	ret
+.cfi_endproc
 .size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
 ___
 
@@ -1298,26 +1356,32 @@ $code.=<<___;
 .align	16
 sha1_block_data_order_avx2:
 _avx2_shortcut:
-	mov	%rsp,%rax
+.cfi_startproc
+	mov	%rsp,$fp
+.cfi_def_cfa_register	$fp
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
 	lea	-6*16(%rsp),%rsp
-	vmovaps	%xmm6,-40-6*16(%rax)
-	vmovaps	%xmm7,-40-5*16(%rax)
-	vmovaps	%xmm8,-40-4*16(%rax)
-	vmovaps	%xmm9,-40-3*16(%rax)
-	vmovaps	%xmm10,-40-2*16(%rax)
-	vmovaps	%xmm11,-40-1*16(%rax)
+	vmovaps	%xmm6,-40-6*16($fp)
+	vmovaps	%xmm7,-40-5*16($fp)
+	vmovaps	%xmm8,-40-4*16($fp)
+	vmovaps	%xmm9,-40-3*16($fp)
+	vmovaps	%xmm10,-40-2*16($fp)
+	vmovaps	%xmm11,-40-1*16($fp)
 .Lprologue_avx2:
 ___
 $code.=<<___;
-	mov	%rax,%r14	# original %rsp
 	mov	%rdi,$ctx	# reassigned argument
 	mov	%rsi,$inp	# reassigned argument
 	mov	%rdx,$num	# reassigned argument
@@ -1462,7 +1526,7 @@ sub bodyx_40_59 ()	# 10 instructions, 3 cycles critical path
 	)
 }
 
-sub Xupdate_avx2_16_31()	# recall that $Xi starts wtih 4
+sub Xupdate_avx2_16_31()	# recall that $Xi starts with 4
 { use integer;
   my $body = shift;
   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
@@ -1737,23 +1801,29 @@ $code.=<<___;
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
-	movaps	-40-6*16(%r14),%xmm6
-	movaps	-40-5*16(%r14),%xmm7
-	movaps	-40-4*16(%r14),%xmm8
-	movaps	-40-3*16(%r14),%xmm9
-	movaps	-40-2*16(%r14),%xmm10
-	movaps	-40-1*16(%r14),%xmm11
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
 ___
 $code.=<<___;
-	lea	(%r14),%rsi
-	mov	-40(%rsi),%r14
-	mov	-32(%rsi),%r13
-	mov	-24(%rsi),%r12
-	mov	-16(%rsi),%rbp
-	mov	-8(%rsi),%rbx
-	lea	(%rsi),%rsp
+	mov	-40($fp),%r14
+.cfi_restore	%r14
+	mov	-32($fp),%r13
+.cfi_restore	%r13
+	mov	-24($fp),%r12
+.cfi_restore	%r12
+	mov	-16($fp),%rbp
+.cfi_restore	%rbp
+	mov	-8($fp),%rbx
+.cfi_restore	%rbx
+	lea	($fp),%rsp
+.cfi_def_cfa_register	%rsp
 .Lepilogue_avx2:
 	ret
+.cfi_endproc
 .size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
 ___
 }
@@ -1831,7 +1901,9 @@ se_handler:
 	jmp	.Lcommon_seh_tail
 .size	se_handler,.-se_handler
 
+___
+$code.=<<___ if ($shaext);
 .type	shaext_handler,\@abi-omnipotent
 .align	16
 shaext_handler:
@@ -1864,7 +1936,9 @@ shaext_handler:
 	jmp	.Lcommon_seh_tail
 .size	shaext_handler,.-shaext_handler
 
+___
+$code.=<<___;
 .type	ssse3_handler,\@abi-omnipotent
 .align	16
 ssse3_handler:
@@ -1890,15 +1964,13 @@ ssse3_handler:
 	cmp	%r10,%rbx		# context->Rip<prologue label
 	jb	.Lcommon_seh_tail
 
-	mov	152($context),%rax	# pull context->Rsp
+	mov	208($context),%rax	# pull context->R11
 
 	mov	4(%r11),%r10d		# HandlerData[1]
 	lea	(%rsi,%r10),%r10	# epilogue label
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
 	jae	.Lcommon_seh_tail
 
-	mov	232($context),%rax	# pull context->R14
-
 	lea	-40-6*16(%rax),%rsi
 	lea	512($context),%rdi	# &context.Xmm6
 	mov	\$12,%ecx
@@ -1987,9 +2059,13 @@ $code.=<<___;
 .LSEH_info_sha1_block_data_order:
 	.byte	9,0,0,0
 	.rva	se_handler
+___
+$code.=<<___ if ($shaext);
 .LSEH_info_sha1_block_data_order_shaext:
 	.byte	9,0,0,0
 	.rva	shaext_handler
+___
+$code.=<<___;
 .LSEH_info_sha1_block_data_order_ssse3:
 	.byte	9,0,0,0
 	.rva	ssse3_handler
@@ -2030,8 +2106,12 @@ sub sha1op38 {
 		"sha1msg1"	=> 0xc9,
 		"sha1msg2"	=> 0xca	);
 
-    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
+    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
+      my $rex=0;
+      $rex|=0x04			if ($2>=8);
+      $rex|=0x01			if ($1>=8);
+      unshift @opcode,0x40|$rex		if ($rex);
      push @opcode,$opcodelet{$instr};
      push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
      return ".byte\t".join(',',@opcode);
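
A note on the final hunk: it widens sha1op38's operand match from %xmm0-%xmm7 to %xmm0-%xmm15 and prepends a REX prefix whenever an extended register appears. $2 (the destination) occupies the ModR/M reg field, so it drives REX.R (0x04), while $1 (the source) occupies the r/m field and drives REX.B (0x01). The sketch below reproduces that byte layout stand-alone; encode_sha1op() and the sample call are illustrative only, not subs from sha1-x86_64.pl, and the opcode map is assumed to match sha1op38's full table (whose "sha1nexte" entry sits just above the visible context).

#!/usr/bin/env perl
# Sketch only: mirrors the REX/ModR/M emission the hunk above adds to
# sha1op38.  encode_sha1op() is a hypothetical stand-alone helper.
use strict;
use warnings;

my %opcodelet = (	# 0x0f 0x38 opcode map, following sha1op38
	"sha1nexte"	=> 0xc8,
	"sha1msg1"	=> 0xc9,
	"sha1msg2"	=> 0xca	);

sub encode_sha1op {
    my ($instr,$src,$dst) = @_;	# AT&T order: "$instr %xmm$src,%xmm$dst"
    return undef unless defined($opcodelet{$instr});

    my @opcode = (0x0f,0x38);
    my $rex = 0;
    $rex |= 0x04 if ($dst>=8);			# REX.R: ModR/M reg field ($2)
    $rex |= 0x01 if ($src>=8);			# REX.B: ModR/M r/m field ($1)
    unshift @opcode,0x40|$rex if ($rex);	# prefix only for xmm8-xmm15
    push @opcode,$opcodelet{$instr};
    push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# mod=11: register-register
    return join(',',map { sprintf("0x%02x",$_) } @opcode);
}

# sha1msg1 %xmm10,%xmm3: the extended source register sets REX.B
print encode_sha1op("sha1msg1",10,3),"\n";	# 0x41,0x0f,0x38,0xc9,0xda

sha1rnds4 is absent from this map because it carries an immediate byte and is emitted by a separate helper in the same file.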