X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Faes%2Fasm%2Faesni-x86_64.pl;h=31c80ae6bc53a30995483712c2a850b76cd94ac4;hp=27bb47c32634e7d7dfc909fe7abe682361921719;hb=214368ffee5736836e2dbb80a16a4fbd85f0eaf9;hpb=6c79faaa9dd288bfda72831a9ef22ca01fa482d4 diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index 27bb47c326..31c80ae6bc 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # # ==================================================================== -# Written by Andy Polyakov for the OpenSSL +# Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -129,8 +129,8 @@ # # Further data for other parallelizable modes: # -# CBC decrypt 1.16 0.93 0.93 -# CTR 1.14 0.91 0.77 +# CBC decrypt 1.16 0.93 0.74 +# CTR 1.14 0.91 0.74 # # Well, given 3x column it's probably inappropriate to call the limit # asymptotic, if it can be surpassed, isn't it? What happens there? @@ -153,16 +153,24 @@ # April 2011 # -# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing -# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like +# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like # in CTR mode AES instruction interleave factor was chosen to be 6x. ###################################################################### -# For reference, AMD Bulldozer spends 5.77 cycles per byte processed -# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70 -# in ECB, 0.71 in CTR, 0.95 in XTS... This means that aes[enc|dec] -# instruction latency is 9 cycles and that they can be issued every -# cycle. 
+# Current large-block performance in cycles per byte processed with +# 128-bit key (less is better). +# +# CBC en-/decrypt CTR XTS ECB +# Westmere 3.77/1.25 1.25 1.25 1.26 +# * Bridge 5.07/0.74 0.75 0.90 0.85 +# Haswell 4.44/0.63 0.63 0.73 0.63 +# Atom 5.75/3.54 3.56 4.12 3.87(*) +# Bulldozer 5.77/0.70 0.72 0.90 0.70 +# +# (*) Atom ECB result is suboptimal because of penalties incurred +# by operations on %xmm8-15. As ECB is not considered +# critical, nothing was done to mitigate the problem. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for @@ -187,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups"; ("%rdi","%rsi","%rdx","%rcx"); # Unix order $code=".text\n"; +$code.=".extern OPENSSL_ia32cap_P\n"; $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... @@ -279,10 +288,49 @@ ___ # every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. -# aes[enc|dec] latency in next processor generation is 8, but the -# instructions can be scheduled every cycle. Optimal interleave for -# new processor is therefore 8x... +# This is why it originally made no sense to implement 2x subroutine. +# But times change and it became appropriate to spend extra 192 bytes +# on 2x subroutine on account of Atom Silvermont. For processors that +# can schedule aes[enc|dec] every cycle, optimal interleave factor +# equals the corresponding instruction latency. 8x is optimal for +# * Bridge and "super-optimal" for other Intel CPUs... + +sub aesni_generate2 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-1] is cipher/clear text... 
+$code.=<<___; +.type _aesni_${dir}rypt2,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt2: + $movkey ($key),$rndkey0 + shl \$4,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + add \$16,%rax + +.L${dir}_loop2: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + $movkey -16($key,%rax),$rndkey0 + jnz .L${dir}_loop2 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + ret +.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 +___ +} sub aesni_generate3 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* @@ -292,25 +340,26 @@ $code.=<<___; .align 16 _aesni_${dir}rypt3: $movkey ($key),$rndkey0 - shr \$1,$rounds + shl \$4,$rounds $movkey 16($key),$rndkey1 - lea 32($key),$key xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 xorps $rndkey0,$inout2 - $movkey ($key),$rndkey0 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + add \$16,%rax .L${dir}_loop3: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 - dec $rounds aes${dir} $rndkey1,$inout2 - $movkey 16($key),$rndkey1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 - lea 32($key),$key aes${dir} $rndkey0,$inout2 - $movkey ($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop3 aes${dir} $rndkey1,$inout0 @@ -336,28 +385,30 @@ $code.=<<___; .align 16 _aesni_${dir}rypt4: $movkey ($key),$rndkey0 - shr \$1,$rounds + shl \$4,$rounds $movkey 16($key),$rndkey1 - lea 32($key),$key xorps $rndkey0,$inout0 xorps $rndkey0,$inout1 xorps $rndkey0,$inout2 xorps $rndkey0,$inout3 - $movkey ($key),$rndkey0 + $movkey 32($key),$rndkey0 + lea 32($key,$rounds),$key + neg %rax # $rounds + .byte 0x0f,0x1f,0x00 + add \$16,%rax 
.L${dir}_loop4: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 - dec $rounds aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 - $movkey 16($key),$rndkey1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 - lea 32($key),$key aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 - $movkey ($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop4 aes${dir} $rndkey1,$inout0 @@ -381,43 +432,43 @@ $code.=<<___; .align 16 _aesni_${dir}rypt6: $movkey ($key),$rndkey0 - shr \$1,$rounds + shl \$4,$rounds $movkey 16($key),$rndkey1 - lea 32($key),$key xorps $rndkey0,$inout0 pxor $rndkey0,$inout1 - aes${dir} $rndkey1,$inout0 pxor $rndkey0,$inout2 + aes${dir} $rndkey1,$inout0 + lea 32($key,$rounds),$key + neg %rax # $rounds aes${dir} $rndkey1,$inout1 pxor $rndkey0,$inout3 - aes${dir} $rndkey1,$inout2 pxor $rndkey0,$inout4 - aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout2 pxor $rndkey0,$inout5 - dec $rounds + add \$16,%rax + aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 - $movkey ($key),$rndkey0 aes${dir} $rndkey1,$inout5 + $movkey -16($key,%rax),$rndkey0 jmp .L${dir}_loop6_enter .align 16 .L${dir}_loop6: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 - dec $rounds aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 aes${dir} $rndkey1,$inout5 -.L${dir}_loop6_enter: # happens to be 16-byte aligned - $movkey 16($key),$rndkey1 +.L${dir}_loop6_enter: + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 - lea 32($key),$key aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 aes${dir} $rndkey0,$inout4 aes${dir} $rndkey0,$inout5 - $movkey ($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop6 aes${dir} $rndkey1,$inout0 @@ -445,52 +496,51 @@ $code.=<<___; .align 16 _aesni_${dir}rypt8: $movkey ($key),$rndkey0 - shr \$1,$rounds + shl \$4,$rounds $movkey 16($key),$rndkey1 - lea 32($key),$key xorps 
$rndkey0,$inout0 xorps $rndkey0,$inout1 - aes${dir} $rndkey1,$inout0 pxor $rndkey0,$inout2 - aes${dir} $rndkey1,$inout1 pxor $rndkey0,$inout3 - aes${dir} $rndkey1,$inout2 pxor $rndkey0,$inout4 - aes${dir} $rndkey1,$inout3 + lea 32($key,$rounds),$key + neg %rax # $rounds + aes${dir} $rndkey1,$inout0 + add \$16,%rax pxor $rndkey0,$inout5 - dec $rounds - aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout1 pxor $rndkey0,$inout6 - aes${dir} $rndkey1,$inout5 pxor $rndkey0,$inout7 - $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 aes${dir} $rndkey1,$inout6 aes${dir} $rndkey1,$inout7 - $movkey 16($key),$rndkey1 + $movkey -16($key,%rax),$rndkey0 jmp .L${dir}_loop8_enter .align 16 .L${dir}_loop8: aes${dir} $rndkey1,$inout0 aes${dir} $rndkey1,$inout1 - dec $rounds aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 aes${dir} $rndkey1,$inout4 aes${dir} $rndkey1,$inout5 aes${dir} $rndkey1,$inout6 aes${dir} $rndkey1,$inout7 - $movkey 16($key),$rndkey1 -.L${dir}_loop8_enter: # happens to be 16-byte aligned +.L${dir}_loop8_enter: + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 - lea 32($key),$key aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 aes${dir} $rndkey0,$inout4 aes${dir} $rndkey0,$inout5 aes${dir} $rndkey0,$inout6 aes${dir} $rndkey0,$inout7 - $movkey ($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .L${dir}_loop8 aes${dir} $rndkey1,$inout0 @@ -513,6 +563,8 @@ _aesni_${dir}rypt8: .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 ___ } +&aesni_generate2("enc") if ($PREFIX eq "aesni"); +&aesni_generate2("dec"); &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); @@ -634,8 +686,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_enc_two: - xorps $inout2,$inout2 - call _aesni_encrypt3 + call _aesni_encrypt2 movups $inout0,($out) movups $inout1,0x10($out) 
jmp .Lecb_ret @@ -771,8 +822,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_dec_two: - xorps $inout2,$inout2 - call _aesni_decrypt3 + call _aesni_decrypt2 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret @@ -829,7 +879,8 @@ ___ { my $cmac="%r9"; # 6th argument -my $increment="%xmm6"; +my $increment="%xmm9"; +my $iv="%xmm6"; my $bswap_mask="%xmm7"; $code.=<<___; @@ -852,49 +903,49 @@ $code.=<<___; movdqa .Lincrement64(%rip),$increment movdqa .Lbswap_mask(%rip),$bswap_mask - shr \$1,$rounds + shl \$4,$rounds + mov \$16,$rnds_ lea 0($key),$key_ movdqu ($cmac),$inout1 movdqa $iv,$inout0 - mov $rounds,$rnds_ + lea 32($key,$rounds),$key # end of key schedule pshufb $bswap_mask,$iv + sub %rax,%r10 # twisted $rounds jmp .Lccm64_enc_outer .align 16 .Lccm64_enc_outer: $movkey ($key_),$rndkey0 - mov $rnds_,$rounds + mov %r10,%rax movups ($inp),$in0 # load inp xorps $rndkey0,$inout0 # counter $movkey 16($key_),$rndkey1 xorps $in0,$rndkey0 - lea 32($key_),$key xorps $rndkey0,$inout1 # cmac^=inp - $movkey ($key),$rndkey0 + $movkey 32($key_),$rndkey0 .Lccm64_enc2_loop: aesenc $rndkey1,$inout0 - dec $rounds aesenc $rndkey1,$inout1 - $movkey 16($key),$rndkey1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aesenc $rndkey0,$inout0 - lea 32($key),$key aesenc $rndkey0,$inout1 - $movkey 0($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .Lccm64_enc2_loop aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 paddq $increment,$iv + dec $len aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 - dec $len lea 16($inp),$inp xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 movups $in0,($out) # save output - lea 16($out),$out pshufb $bswap_mask,$inout0 + lea 16($out),$out jnz .Lccm64_enc_outer movups $inout1,($cmac) @@ -940,15 +991,19 @@ $code.=<<___; ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; + shl \$4,$rnds_ + mov \$16,$rounds movups ($inp),$in0 # load inp paddq $increment,$iv lea 16($inp),$inp + sub %r10,%rax # twisted $rounds + lea 32($key_,$rnds_),$key # 
end of key schedule + mov %rax,%r10 jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_outer: xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 - mov $rnds_,$rounds movups $in0,($out) # save output lea 16($out),$out pshufb $bswap_mask,$inout0 @@ -957,36 +1012,36 @@ $code.=<<___; jz .Lccm64_dec_break $movkey ($key_),$rndkey0 - shr \$1,$rounds + mov %r10,%rax $movkey 16($key_),$rndkey1 xorps $rndkey0,$in0 - lea 32($key_),$key xorps $rndkey0,$inout0 xorps $in0,$inout1 # cmac^=out - $movkey ($key),$rndkey0 - + $movkey 32($key_),$rndkey0 + jmp .Lccm64_dec2_loop +.align 16 .Lccm64_dec2_loop: aesenc $rndkey1,$inout0 - dec $rounds aesenc $rndkey1,$inout1 - $movkey 16($key),$rndkey1 + $movkey ($key,%rax),$rndkey1 + add \$32,%rax aesenc $rndkey0,$inout0 - lea 32($key),$key aesenc $rndkey0,$inout1 - $movkey 0($key),$rndkey0 + $movkey -16($key,%rax),$rndkey0 jnz .Lccm64_dec2_loop movups ($inp),$in0 # load inp paddq $increment,$iv aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 - lea 16($inp),$inp aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 + lea 16($inp),$inp jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_break: #xorps $in0,$inout1 # cmac^=out + mov 240($key_),$rounds ___ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); $code.=<<___; @@ -1014,7 +1069,7 @@ ___ # does not update *ivec! (see crypto/modes/ctr128.c for details) # # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, -# http://rt.openssl.org/Ticket/Display.html?id=3031&user=guest&pass=guest. +# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. # Keywords are full unroll and modulo-schedule counter calculations # with zero-round key xor. 
{ @@ -1058,35 +1113,39 @@ $code.=<<___; mov 12($key),$key0 # 0-round key LSB movdqa $inout0,0x00(%rsp) # populate counter block bswap $ctr - movdqa $inout0,0x10(%rsp) - movdqa $inout0,0x20(%rsp) - movdqa $inout0,0x30(%rsp) + movdqa $inout0,$inout1 + movdqa $inout0,$inout2 + movdqa $inout0,$inout3 movdqa $inout0,0x40(%rsp) movdqa $inout0,0x50(%rsp) movdqa $inout0,0x60(%rsp) + mov %rdx,%r10 # borrow %rdx movdqa $inout0,0x70(%rsp) - mov 240($key),$rounds # key->rounds - - lea 1($ctr),%r9 - lea 2($ctr),%r10 - bswap %r9d - bswap %r10d - xor $key0,%r9d - xor $key0,%r10d - mov %r9d,0x10+12(%rsp) - lea 3($ctr),%r9 - mov %r10d,0x20+12(%rsp) - bswap %r9d + lea 1($ctr),%rax + lea 2($ctr),%rdx + bswap %eax + bswap %edx + xor $key0,%eax + xor $key0,%edx + pinsrd \$3,%eax,$inout1 + lea 3($ctr),%rax + movdqa $inout1,0x10(%rsp) + pinsrd \$3,%edx,$inout2 + bswap %eax + mov %r10,%rdx # restore %rdx lea 4($ctr),%r10 - xor $key0,%r9d + movdqa $inout2,0x20(%rsp) + xor $key0,%eax bswap %r10d - mov %r9d,0x30+12(%rsp) + pinsrd \$3,%eax,$inout3 xor $key0,%r10d + movdqa $inout3,0x30(%rsp) lea 5($ctr),%r9 mov %r10d,0x40+12(%rsp) bswap %r9d lea 6($ctr),%r10 + mov 240($key),$rounds # key->rounds xor $key0,%r9d bswap %r10d mov %r9d,0x50+12(%rsp) @@ -1094,24 +1153,117 @@ $code.=<<___; lea 7($ctr),%r9 mov %r10d,0x60+12(%rsp) bswap %r9d + mov OPENSSL_ia32cap_P+4(%rip),%r10d xor $key0,%r9d + and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE mov %r9d,0x70+12(%rsp) $movkey 0x10($key),$rndkey1 - movdqa 0x10(%rsp),$inout1 - movdqa 0x20(%rsp),$inout2 - movdqa 0x30(%rsp),$inout3 movdqa 0x40(%rsp),$inout4 movdqa 0x50(%rsp),$inout5 cmp \$8,$len jb .Lctr32_tail + sub \$6,$len + cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE + je .Lctr32_6x + lea 0x80($key),$key # size optimization - sub \$8,$len + sub \$2,$len jmp .Lctr32_loop8 +.align 16 +.Lctr32_6x: + shl \$4,$rounds + mov \$48,$rnds_ + bswap $key0 + lea 32($key,$rounds),$key # end of key schedule + sub %rax,%r10 # twisted $rounds + jmp .Lctr32_loop6 + 
+.align 16 +.Lctr32_loop6: + add \$6,$ctr + $movkey -48($key,$rnds_),$rndkey0 + aesenc $rndkey1,$inout0 + mov $ctr,%eax + xor $key0,%eax + aesenc $rndkey1,$inout1 + movbe %eax,`0x00+12`(%rsp) + lea 1($ctr),%eax + aesenc $rndkey1,$inout2 + xor $key0,%eax + movbe %eax,`0x10+12`(%rsp) + aesenc $rndkey1,$inout3 + lea 2($ctr),%eax + xor $key0,%eax + aesenc $rndkey1,$inout4 + movbe %eax,`0x20+12`(%rsp) + lea 3($ctr),%eax + aesenc $rndkey1,$inout5 + $movkey -32($key,$rnds_),$rndkey1 + xor $key0,%eax + + aesenc $rndkey0,$inout0 + movbe %eax,`0x30+12`(%rsp) + lea 4($ctr),%eax + aesenc $rndkey0,$inout1 + xor $key0,%eax + movbe %eax,`0x40+12`(%rsp) + aesenc $rndkey0,$inout2 + lea 5($ctr),%eax + xor $key0,%eax + aesenc $rndkey0,$inout3 + movbe %eax,`0x50+12`(%rsp) + mov %r10,%rax # mov $rnds_,$rounds + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey -16($key,$rnds_),$rndkey0 + + call .Lenc_loop6 + + movdqu ($inp),$inout6 + movdqu 0x10($inp),$inout7 + movdqu 0x20($inp),$in0 + movdqu 0x30($inp),$in1 + movdqu 0x40($inp),$in2 + movdqu 0x50($inp),$in3 + lea 0x60($inp),$inp + $movkey -64($key,$rnds_),$rndkey1 + pxor $inout0,$inout6 + movaps 0x00(%rsp),$inout0 + pxor $inout1,$inout7 + movaps 0x10(%rsp),$inout1 + pxor $inout2,$in0 + movaps 0x20(%rsp),$inout2 + pxor $inout3,$in1 + movaps 0x30(%rsp),$inout3 + pxor $inout4,$in2 + movaps 0x40(%rsp),$inout4 + pxor $inout5,$in3 + movaps 0x50(%rsp),$inout5 + movdqu $inout6,($out) + movdqu $inout7,0x10($out) + movdqu $in0,0x20($out) + movdqu $in1,0x30($out) + movdqu $in2,0x40($out) + movdqu $in3,0x50($out) + lea 0x60($out),$out + + sub \$6,$len + jnc .Lctr32_loop6 + + add \$6,$len + jz .Lctr32_done + + lea -48($rnds_),$rounds + lea -80($key,$rnds_),$key # restore $key + neg $rounds + shr \$4,$rounds # restore $rounds + jmp .Lctr32_tail + .align 32 .Lctr32_loop8: add \$8,$ctr @@ -1124,6 +1276,7 @@ $code.=<<___; $movkey 0x20-0x80($key),$rndkey0 aesenc $rndkey1,$inout2 xor $key0,%r9d + nop aesenc $rndkey1,$inout3 mov 
%r9d,0x00+12(%rsp) lea 1($ctr),%r9 @@ -1136,11 +1289,12 @@ ___ for($i=2;$i<8;$i++) { my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; $code.=<<___; + bswap %r9d aesenc $rndkeyx,$inout0 aesenc $rndkeyx,$inout1 - bswap %r9d - aesenc $rndkeyx,$inout2 xor $key0,%r9d + .byte 0x66,0x90 + aesenc $rndkeyx,$inout2 aesenc $rndkeyx,$inout3 mov %r9d,`0x10*($i-1)`+12(%rsp) lea $i($ctr),%r9 @@ -1152,21 +1306,21 @@ $code.=<<___; ___ } $code.=<<___; + bswap %r9d aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 - bswap %r9d aesenc $rndkey0,$inout2 xor $key0,%r9d + movdqu 0x00($inp),$in0 aesenc $rndkey0,$inout3 mov %r9d,0x70+12(%rsp) + cmp \$11,$rounds aesenc $rndkey0,$inout4 aesenc $rndkey0,$inout5 aesenc $rndkey0,$inout6 - movdqu 0x00($inp),$in0 aesenc $rndkey0,$inout7 $movkey 0xa0-0x80($key),$rndkey0 - cmp \$11,$rounds jb .Lctr32_enc_done aesenc $rndkey1,$inout0 @@ -1209,48 +1363,50 @@ $code.=<<___; aesenc $rndkey0,$inout6 aesenc $rndkey0,$inout7 $movkey 0xe0-0x80($key),$rndkey0 + jmp .Lctr32_enc_done +.align 16 .Lctr32_enc_done: - aesenc $rndkey1,$inout0 movdqu 0x10($inp),$in1 pxor $rndkey0,$in0 - aesenc $rndkey1,$inout1 movdqu 0x20($inp),$in2 pxor $rndkey0,$in1 - aesenc $rndkey1,$inout2 movdqu 0x30($inp),$in3 pxor $rndkey0,$in2 - aesenc $rndkey1,$inout3 movdqu 0x40($inp),$in4 pxor $rndkey0,$in3 - aesenc $rndkey1,$inout4 movdqu 0x50($inp),$in5 pxor $rndkey0,$in4 - aesenc $rndkey1,$inout5 pxor $rndkey0,$in5 + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 aesenc $rndkey1,$inout7 movdqu 0x60($inp),$rndkey1 + lea 0x80($inp),$inp aesenclast $in0,$inout0 pxor $rndkey0,$rndkey1 - movdqu 0x70($inp),$in0 - lea 0x80($inp),$inp + movdqu 0x70-0x80($inp),$in0 aesenclast $in1,$inout1 pxor $rndkey0,$in0 movdqa 0x00(%rsp),$in1 # load next counter block aesenclast $in2,$inout2 - movdqa 0x10(%rsp),$in2 aesenclast $in3,$inout3 + movdqa 0x10(%rsp),$in2 movdqa 0x20(%rsp),$in3 
aesenclast $in4,$inout4 - movdqa 0x30(%rsp),$in4 aesenclast $in5,$inout5 + movdqa 0x30(%rsp),$in4 movdqa 0x40(%rsp),$in5 aesenclast $rndkey1,$inout6 movdqa 0x50(%rsp),$rndkey0 - aesenclast $in0,$inout7 $movkey 0x10-0x80($key),$rndkey1 + aesenclast $in0,$inout7 movups $inout0,($out) # store output movdqa $in1,$inout0 @@ -1278,41 +1434,42 @@ $code.=<<___; .Lctr32_tail: lea 16($key),$key cmp \$4,$len - jbe .Lctr32_loop4 + jb .Lctr32_loop3 + je .Lctr32_loop4 + shl \$4,$rounds movdqa 0x60(%rsp),$inout6 + pxor $inout7,$inout7 $movkey 16($key),$rndkey0 aesenc $rndkey1,$inout0 - lea 16($key),$key aesenc $rndkey1,$inout1 - shr \$1,$rounds + lea 32-16($key,$rounds),$key + neg %rax aesenc $rndkey1,$inout2 - dec $rounds + add \$16,%rax + movups ($inp),$in0 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 + movups 0x10($inp),$in1 + movups 0x20($inp),$in2 aesenc $rndkey1,$inout5 aesenc $rndkey1,$inout6 - pxor $inout7,$inout7 - $movkey 16($key),$rndkey1 call .Lenc_loop8_enter - movups ($inp),$in0 - movups 0x10($inp),$in1 - movups 0x20($inp),$in2 - xorps $in0,$inout0 - movups 0x30($inp),$in3 - xorps $in1,$inout1 - movups 0x40($inp),$in0 - xorps $in2,$inout2 - movups $inout0,($out) - xorps $in3,$inout3 - movups $inout1,0x10($out) - xorps $in0,$inout4 - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) + movdqu 0x30($inp),$in3 + pxor $in0,$inout0 + movdqu 0x40($inp),$in0 + pxor $in1,$inout1 + movdqu $inout0,($out) + pxor $in2,$inout2 + movdqu $inout1,0x10($out) + pxor $in3,$inout3 + movdqu $inout2,0x20($out) + pxor $in0,$inout4 + movdqu $inout3,0x30($out) + movdqu $inout4,0x40($out) cmp \$6,$len jb .Lctr32_done @@ -1330,16 +1487,43 @@ $code.=<<___; .Lctr32_loop4: aesenc $rndkey1,$inout0 lea 16($key),$key + dec $rounds aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 $movkey ($key),$rndkey1 - dec $rounds jnz .Lctr32_loop4 aesenclast $rndkey1,$inout0 aesenclast $rndkey1,$inout1 + movups ($inp),$in0 + movups 0x10($inp),$in1 
aesenclast $rndkey1,$inout2 aesenclast $rndkey1,$inout3 + movups 0x20($inp),$in2 + movups 0x30($inp),$in3 + + xorps $in0,$inout0 + movups $inout0,($out) + xorps $in1,$inout1 + movups $inout1,0x10($out) + pxor $in2,$inout2 + movdqu $inout2,0x20($out) + pxor $in3,$inout3 + movdqu $inout3,0x30($out) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop3: + aesenc $rndkey1,$inout0 + lea 16($key),$key + dec $rounds + aesenc $rndkey1,$inout1 + aesenc $rndkey1,$inout2 + $movkey ($key),$rndkey1 + jnz .Lctr32_loop3 + aesenclast $rndkey1,$inout0 + aesenclast $rndkey1,$inout1 + aesenclast $rndkey1,$inout2 movups ($inp),$in0 xorps $in0,$inout0 @@ -1355,12 +1539,6 @@ $code.=<<___; movups 0x20($inp),$in2 xorps $in2,$inout2 movups $inout2,0x20($out) - cmp \$4,$len - jb .Lctr32_done - - movups 0x30($inp),$in3 - xorps $in3,$inout3 - movups $inout3,0x30($out) jmp .Lctr32_done .align 16 @@ -1408,7 +1586,7 @@ ___ my @tweak=map("%xmm$_",(10..15)); my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); -my $frame_size = 0x60 + ($win64?160:0); +my $frame_size = 0x70 + ($win64?160:0); $code.=<<___; .globl aesni_xts_encrypt @@ -1435,220 +1613,259 @@ $code.=<<___ if ($win64); ___ $code.=<<___; lea -8(%rax),%rbp - movups ($ivp),@tweak[5] # load clear-text tweak + movups ($ivp),$inout0 # load clear-text tweak mov 240(%r8),$rounds # key2->rounds mov 240($key),$rnds_ # key1->rounds ___ # generate the tweak - &aesni_generate1("enc",$key2,$rounds,@tweak[5]); + &aesni_generate1("enc",$key2,$rounds,$inout0); $code.=<<___; + $movkey ($key),$rndkey0 # zero round key mov $key,$key_ # backup $key mov $rnds_,$rounds # backup $rounds + shl \$4,$rnds_ mov $len,$len_ # backup $len and \$-16,$len + $movkey 16($key,$rnds_),$rndkey1 # last round key + movdqa .Lxts_magic(%rip),$twmask - pxor $twtmp,$twtmp - pcmpgtd @tweak[5],$twtmp # broadcast upper bits + movdqa $inout0,@tweak[5] + pshufd \$0x5f,$inout0,$twres + pxor $rndkey0,$rndkey1 ___ + # alternative tweak calculation 
algorithm is based on suggestions + # by Shay Gueron. psrad doesn't conflict with AES-NI instructions + # and should help in the future... for ($i=0;$i<4;$i++) { $code.=<<___; - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp + movdqa $twres,$twtmp + paddd $twres,$twres movdqa @tweak[5],@tweak[$i] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - pand $twmask,$twres # isolate carry and residue - pcmpgtd @tweak[5],$twtmp # broadcat upper bits - pxor $twres,@tweak[5] + psrad \$31,$twtmp # broadcast upper bits + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp + pxor $rndkey0,@tweak[$i] + pxor $twtmp,@tweak[5] ___ } $code.=<<___; + movdqa @tweak[5],@tweak[4] + psrad \$31,$twres + paddq @tweak[5],@tweak[5] + pand $twmask,$twres + pxor $rndkey0,@tweak[4] + pxor $twres,@tweak[5] + movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] + sub \$16*6,$len jc .Lxts_enc_short - shr \$1,$rounds - sub \$1,$rounds - mov $rounds,$rnds_ + mov \$16+96,$rounds + lea 32($key_,$rnds_),$key # end of key schedule + sub %r10,%rax # twisted $rounds + $movkey 16($key_),$rndkey1 + mov %rax,%r10 # backup twisted $rounds + lea .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop -.align 16 +.align 32 .Lxts_enc_grandloop: - pshufd \$0x13,$twtmp,$twres - movdqa @tweak[5],@tweak[4] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak movdqu `16*0`($inp),$inout0 # load input - pand $twmask,$twres # isolate carry and residue + movdqa $rndkey0,$twmask movdqu `16*1`($inp),$inout1 - pxor $twres,@tweak[5] - + pxor @tweak[0],$inout0 movdqu `16*2`($inp),$inout2 - pxor @tweak[0],$inout0 # input^=tweak - movdqu `16*3`($inp),$inout3 pxor @tweak[1],$inout1 - movdqu `16*4`($inp),$inout4 + aesenc $rndkey1,$inout0 + movdqu `16*3`($inp),$inout3 pxor @tweak[2],$inout2 - movdqu `16*5`($inp),$inout5 - lea `16*6`($inp),$inp + aesenc $rndkey1,$inout1 + movdqu `16*4`($inp),$inout4 pxor @tweak[3],$inout3 - $movkey ($key_),$rndkey0 + aesenc $rndkey1,$inout2 + movdqu `16*5`($inp),$inout5 + pxor @tweak[5],$twmask # round[0]^=tweak[5] + 
movdqa 0x60(%rsp),$twres # load round[0]^round[last] pxor @tweak[4],$inout4 - pxor @tweak[5],$inout5 + aesenc $rndkey1,$inout3 + $movkey 32($key_),$rndkey0 + lea `16*6`($inp),$inp + pxor $twmask,$inout5 - # inline _aesni_encrypt6 and interleave first and last rounds - # with own code... - $movkey 16($key_),$rndkey1 - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks - aesenc $rndkey1,$inout0 - lea 32($key_),$key - pxor $rndkey0,$inout2 - movdqa @tweak[1],`16*1`(%rsp) - aesenc $rndkey1,$inout1 - pxor $rndkey0,$inout3 - movdqa @tweak[2],`16*2`(%rsp) - aesenc $rndkey1,$inout2 - pxor $rndkey0,$inout4 - movdqa @tweak[3],`16*3`(%rsp) - aesenc $rndkey1,$inout3 - pxor $rndkey0,$inout5 - $movkey ($key),$rndkey0 - dec $rounds - movdqa @tweak[4],`16*4`(%rsp) + pxor $twres,@tweak[0] aesenc $rndkey1,$inout4 - movdqa @tweak[5],`16*5`(%rsp) + pxor $twres,@tweak[1] + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key aesenc $rndkey1,$inout5 - pxor $twtmp,$twtmp - pcmpgtd @tweak[5],$twtmp - jmp .Lxts_enc_loop6_enter + $movkey 48($key_),$rndkey1 + pxor $twres,@tweak[2] -.align 16 + aesenc $rndkey0,$inout0 + pxor $twres,@tweak[3] + movdqa @tweak[1],`16*1`(%rsp) + aesenc $rndkey0,$inout1 + pxor $twres,@tweak[4] + movdqa @tweak[2],`16*2`(%rsp) + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + pxor $twres,$twmask + movdqa @tweak[4],`16*4`(%rsp) + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey 64($key_),$rndkey0 + movdqa $twmask,`16*5`(%rsp) + pshufd \$0x5f,@tweak[5],$twres + jmp .Lxts_enc_loop6 +.align 32 .Lxts_enc_loop6: aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 - dec $rounds aesenc $rndkey1,$inout2 aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 aesenc $rndkey1,$inout5 -.Lxts_enc_loop6_enter: - $movkey 16($key),$rndkey1 + $movkey -64($key,%rax),$rndkey1 + add \$32,%rax + aesenc $rndkey0,$inout0 aesenc $rndkey0,$inout1 - lea 32($key),$key aesenc $rndkey0,$inout2 aesenc $rndkey0,$inout3 aesenc 
$rndkey0,$inout4 aesenc $rndkey0,$inout5 - $movkey ($key),$rndkey0 + $movkey -80($key,%rax),$rndkey0 jnz .Lxts_enc_loop6 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqa (%r8),$twmask + movdqa $twres,$twtmp + paddd $twres,$twres aesenc $rndkey1,$inout0 - pand $twmask,$twres # isolate carry and residue + paddq @tweak[5],@tweak[5] + psrad \$31,$twtmp aesenc $rndkey1,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcast upper bits + pand $twmask,$twtmp + $movkey ($key_),@tweak[0] # load round[0] aesenc $rndkey1,$inout2 - pxor $twres,@tweak[5] aesenc $rndkey1,$inout3 aesenc $rndkey1,$inout4 + pxor $twtmp,@tweak[5] + movaps @tweak[0],@tweak[1] # copy round[0] aesenc $rndkey1,$inout5 - $movkey 16($key),$rndkey1 + $movkey -64($key),$rndkey1 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[0] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqa $twres,$twtmp aesenc $rndkey0,$inout0 - pand $twmask,$twres # isolate carry and residue + paddd $twres,$twres + pxor @tweak[5],@tweak[0] aesenc $rndkey0,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + psrad \$31,$twtmp + paddq @tweak[5],@tweak[5] aesenc $rndkey0,$inout2 - pxor $twres,@tweak[5] aesenc $rndkey0,$inout3 + pand $twmask,$twtmp + movaps @tweak[1],@tweak[2] aesenc $rndkey0,$inout4 + pxor $twtmp,@tweak[5] + movdqa $twres,$twtmp aesenc $rndkey0,$inout5 - $movkey 32($key),$rndkey0 + $movkey -48($key),$rndkey0 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[1] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + paddd $twres,$twres aesenc $rndkey1,$inout0 - pand $twmask,$twres # isolate carry and residue + pxor @tweak[5],@tweak[1] + psrad \$31,$twtmp aesenc $rndkey1,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp aesenc $rndkey1,$inout2 - pxor $twres,@tweak[5] aesenc $rndkey1,$inout3 + movdqa @tweak[3],`16*3`(%rsp) + pxor $twtmp,@tweak[5] aesenc 
$rndkey1,$inout4 + movaps @tweak[2],@tweak[3] + movdqa $twres,$twtmp aesenc $rndkey1,$inout5 + $movkey -32($key),$rndkey1 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[2] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - aesenclast $rndkey0,$inout0 - pand $twmask,$twres # isolate carry and residue - aesenclast $rndkey0,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits - aesenclast $rndkey0,$inout2 - pxor $twres,@tweak[5] - aesenclast $rndkey0,$inout3 - aesenclast $rndkey0,$inout4 - aesenclast $rndkey0,$inout5 - - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[3] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - xorps `16*0`(%rsp),$inout0 # output^=tweak - pand $twmask,$twres # isolate carry and residue - xorps `16*1`(%rsp),$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits - pxor $twres,@tweak[5] + paddd $twres,$twres + aesenc $rndkey0,$inout0 + pxor @tweak[5],@tweak[2] + psrad \$31,$twtmp + aesenc $rndkey0,$inout1 + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + pxor $twtmp,@tweak[5] + movaps @tweak[3],@tweak[4] + aesenc $rndkey0,$inout5 - xorps `16*2`(%rsp),$inout2 - movups $inout0,`16*0`($out) # write output - xorps `16*3`(%rsp),$inout3 - movups $inout1,`16*1`($out) - xorps `16*4`(%rsp),$inout4 - movups $inout2,`16*2`($out) - xorps `16*5`(%rsp),$inout5 - movups $inout3,`16*3`($out) - mov $rnds_,$rounds # restore $rounds - movups $inout4,`16*4`($out) - movups $inout5,`16*5`($out) - lea `16*6`($out),$out - sub \$16*6,$len - jnc .Lxts_enc_grandloop + movdqa $twres,$rndkey0 + paddd $twres,$twres + aesenc $rndkey1,$inout0 + pxor @tweak[5],@tweak[3] + psrad \$31,$rndkey0 + aesenc $rndkey1,$inout1 + paddq @tweak[5],@tweak[5] + pand $twmask,$rndkey0 + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + pxor $rndkey0,@tweak[5] + $movkey ($key_),$rndkey0 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey 
16($key_),$rndkey1 + + pxor @tweak[5],@tweak[4] + aesenclast `16*0`(%rsp),$inout0 + psrad \$31,$twres + paddq @tweak[5],@tweak[5] + aesenclast `16*1`(%rsp),$inout1 + aesenclast `16*2`(%rsp),$inout2 + pand $twmask,$twres + mov %r10,%rax # restore $rounds + aesenclast `16*3`(%rsp),$inout3 + aesenclast `16*4`(%rsp),$inout4 + aesenclast `16*5`(%rsp),$inout5 + pxor $twres,@tweak[5] + + lea `16*6`($out),$out + movups $inout0,`-16*6`($out) # write output + movups $inout1,`-16*5`($out) + movups $inout2,`-16*4`($out) + movups $inout3,`-16*3`($out) + movups $inout4,`-16*2`($out) + movups $inout5,`-16*1`($out) + sub \$16*6,$len + jnc .Lxts_enc_grandloop - lea 3($rounds,$rounds),$rounds # restore original value + mov \$16+96,$rounds + sub $rnds_,$rounds mov $key_,$key # restore $key - mov $rounds,$rnds_ # backup $rounds + shr \$4,$rounds # restore original value .Lxts_enc_short: + mov $rounds,$rnds_ # backup $rounds + pxor $rndkey0,@tweak[0] add \$16*6,$len jz .Lxts_enc_done + pxor $rndkey0,@tweak[1] cmp \$0x20,$len jb .Lxts_enc_one + pxor $rndkey0,@tweak[2] je .Lxts_enc_two + pxor $rndkey0,@tweak[3] cmp \$0x40,$len jb .Lxts_enc_three + pxor $rndkey0,@tweak[4] je .Lxts_enc_four - pshufd \$0x13,$twtmp,$twres - movdqa @tweak[5],@tweak[4] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - movdqu ($inp),$inout0 - pand $twmask,$twres # isolate carry and residue - movdqu 16*1($inp),$inout1 - pxor $twres,@tweak[5] - + movdqu ($inp),$inout0 + movdqu 16*1($inp),$inout1 movdqu 16*2($inp),$inout2 pxor @tweak[0],$inout0 movdqu 16*3($inp),$inout3 @@ -1697,7 +1914,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 - call _aesni_encrypt3 + call _aesni_encrypt2 xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] @@ -1743,15 +1960,15 @@ $code.=<<___; call _aesni_encrypt4 - xorps @tweak[0],$inout0 - movdqa @tweak[5],@tweak[0] - xorps @tweak[1],$inout1 - xorps @tweak[2],$inout2 - movups $inout0,($out) - xorps @tweak[3],$inout3 - movups $inout1,16*1($out) - movups $inout2,16*2($out) - 
movups $inout3,16*3($out) + pxor @tweak[0],$inout0 + movdqa @tweak[4],@tweak[0] + pxor @tweak[1],$inout1 + pxor @tweak[2],$inout2 + movdqu $inout0,($out) + pxor @tweak[3],$inout3 + movdqu $inout1,16*1($out) + movdqu $inout2,16*2($out) + movdqu $inout3,16*3($out) lea 16*4($out),$out jmp .Lxts_enc_done @@ -1830,12 +2047,12 @@ $code.=<<___ if ($win64); ___ $code.=<<___; lea -8(%rax),%rbp - movups ($ivp),@tweak[5] # load clear-text tweak + movups ($ivp),$inout0 # load clear-text tweak mov 240($key2),$rounds # key2->rounds mov 240($key),$rnds_ # key1->rounds ___ # generate the tweak - &aesni_generate1("enc",$key2,$rounds,@tweak[5]); + &aesni_generate1("enc",$key2,$rounds,$inout0); $code.=<<___; xor %eax,%eax # if ($len%16) len-=16; test \$15,$len @@ -1843,213 +2060,249 @@ $code.=<<___; shl \$4,%rax sub %rax,$len + $movkey ($key),$rndkey0 # zero round key mov $key,$key_ # backup $key mov $rnds_,$rounds # backup $rounds + shl \$4,$rnds_ mov $len,$len_ # backup $len and \$-16,$len + $movkey 16($key,$rnds_),$rndkey1 # last round key + movdqa .Lxts_magic(%rip),$twmask - pxor $twtmp,$twtmp - pcmpgtd @tweak[5],$twtmp # broadcast upper bits + movdqa $inout0,@tweak[5] + pshufd \$0x5f,$inout0,$twres + pxor $rndkey0,$rndkey1 ___ for ($i=0;$i<4;$i++) { $code.=<<___; - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp + movdqa $twres,$twtmp + paddd $twres,$twres movdqa @tweak[5],@tweak[$i] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - pand $twmask,$twres # isolate carry and residue - pcmpgtd @tweak[5],$twtmp # broadcat upper bits - pxor $twres,@tweak[5] + psrad \$31,$twtmp # broadcast upper bits + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp + pxor $rndkey0,@tweak[$i] + pxor $twtmp,@tweak[5] ___ } $code.=<<___; + movdqa @tweak[5],@tweak[4] + psrad \$31,$twres + paddq @tweak[5],@tweak[5] + pand $twmask,$twres + pxor $rndkey0,@tweak[4] + pxor $twres,@tweak[5] + movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] + sub \$16*6,$len jc .Lxts_dec_short - shr \$1,$rounds - sub 
\$1,$rounds - mov $rounds,$rnds_ + mov \$16+96,$rounds + lea 32($key_,$rnds_),$key # end of key schedule + sub %r10,%rax # twisted $rounds + $movkey 16($key_),$rndkey1 + mov %rax,%r10 # backup twisted $rounds + lea .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop -.align 16 +.align 32 .Lxts_dec_grandloop: - pshufd \$0x13,$twtmp,$twres - movdqa @tweak[5],@tweak[4] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak movdqu `16*0`($inp),$inout0 # load input - pand $twmask,$twres # isolate carry and residue + movdqa $rndkey0,$twmask movdqu `16*1`($inp),$inout1 - pxor $twres,@tweak[5] - + pxor @tweak[0],$inout0 movdqu `16*2`($inp),$inout2 - pxor @tweak[0],$inout0 # input^=tweak - movdqu `16*3`($inp),$inout3 pxor @tweak[1],$inout1 - movdqu `16*4`($inp),$inout4 + aesdec $rndkey1,$inout0 + movdqu `16*3`($inp),$inout3 pxor @tweak[2],$inout2 - movdqu `16*5`($inp),$inout5 - lea `16*6`($inp),$inp + aesdec $rndkey1,$inout1 + movdqu `16*4`($inp),$inout4 pxor @tweak[3],$inout3 - $movkey ($key_),$rndkey0 + aesdec $rndkey1,$inout2 + movdqu `16*5`($inp),$inout5 + pxor @tweak[5],$twmask # round[0]^=tweak[5] + movdqa 0x60(%rsp),$twres # load round[0]^round[last] pxor @tweak[4],$inout4 - pxor @tweak[5],$inout5 + aesdec $rndkey1,$inout3 + $movkey 32($key_),$rndkey0 + lea `16*6`($inp),$inp + pxor $twmask,$inout5 - # inline _aesni_decrypt6 and interleave first and last rounds - # with own code... 
- $movkey 16($key_),$rndkey1 - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks - aesdec $rndkey1,$inout0 - lea 32($key_),$key - pxor $rndkey0,$inout2 - movdqa @tweak[1],`16*1`(%rsp) - aesdec $rndkey1,$inout1 - pxor $rndkey0,$inout3 - movdqa @tweak[2],`16*2`(%rsp) - aesdec $rndkey1,$inout2 - pxor $rndkey0,$inout4 - movdqa @tweak[3],`16*3`(%rsp) - aesdec $rndkey1,$inout3 - pxor $rndkey0,$inout5 - $movkey ($key),$rndkey0 - dec $rounds - movdqa @tweak[4],`16*4`(%rsp) + pxor $twres,@tweak[0] aesdec $rndkey1,$inout4 - movdqa @tweak[5],`16*5`(%rsp) + pxor $twres,@tweak[1] + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key aesdec $rndkey1,$inout5 - pxor $twtmp,$twtmp - pcmpgtd @tweak[5],$twtmp - jmp .Lxts_dec_loop6_enter + $movkey 48($key_),$rndkey1 + pxor $twres,@tweak[2] -.align 16 + aesdec $rndkey0,$inout0 + pxor $twres,@tweak[3] + movdqa @tweak[1],`16*1`(%rsp) + aesdec $rndkey0,$inout1 + pxor $twres,@tweak[4] + movdqa @tweak[2],`16*2`(%rsp) + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + pxor $twres,$twmask + movdqa @tweak[4],`16*4`(%rsp) + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey 64($key_),$rndkey0 + movdqa $twmask,`16*5`(%rsp) + pshufd \$0x5f,@tweak[5],$twres + jmp .Lxts_dec_loop6 +.align 32 .Lxts_dec_loop6: aesdec $rndkey1,$inout0 aesdec $rndkey1,$inout1 - dec $rounds aesdec $rndkey1,$inout2 aesdec $rndkey1,$inout3 aesdec $rndkey1,$inout4 aesdec $rndkey1,$inout5 -.Lxts_dec_loop6_enter: - $movkey 16($key),$rndkey1 + $movkey -64($key,%rax),$rndkey1 + add \$32,%rax + aesdec $rndkey0,$inout0 aesdec $rndkey0,$inout1 - lea 32($key),$key aesdec $rndkey0,$inout2 aesdec $rndkey0,$inout3 aesdec $rndkey0,$inout4 aesdec $rndkey0,$inout5 - $movkey ($key),$rndkey0 + $movkey -80($key,%rax),$rndkey0 jnz .Lxts_dec_loop6 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqa (%r8),$twmask + movdqa $twres,$twtmp + paddd $twres,$twres aesdec 
$rndkey1,$inout0 - pand $twmask,$twres # isolate carry and residue + paddq @tweak[5],@tweak[5] + psrad \$31,$twtmp aesdec $rndkey1,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcast upper bits + pand $twmask,$twtmp + $movkey ($key_),@tweak[0] # load round[0] aesdec $rndkey1,$inout2 - pxor $twres,@tweak[5] aesdec $rndkey1,$inout3 aesdec $rndkey1,$inout4 + pxor $twtmp,@tweak[5] + movaps @tweak[0],@tweak[1] # copy round[0] aesdec $rndkey1,$inout5 - $movkey 16($key),$rndkey1 + $movkey -64($key),$rndkey1 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[0] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqa $twres,$twtmp aesdec $rndkey0,$inout0 - pand $twmask,$twres # isolate carry and residue + paddd $twres,$twres + pxor @tweak[5],@tweak[0] aesdec $rndkey0,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + psrad \$31,$twtmp + paddq @tweak[5],@tweak[5] aesdec $rndkey0,$inout2 - pxor $twres,@tweak[5] aesdec $rndkey0,$inout3 + pand $twmask,$twtmp + movaps @tweak[1],@tweak[2] aesdec $rndkey0,$inout4 + pxor $twtmp,@tweak[5] + movdqa $twres,$twtmp aesdec $rndkey0,$inout5 - $movkey 32($key),$rndkey0 + $movkey -48($key),$rndkey0 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[1] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak + paddd $twres,$twres aesdec $rndkey1,$inout0 - pand $twmask,$twres # isolate carry and residue + pxor @tweak[5],@tweak[1] + psrad \$31,$twtmp aesdec $rndkey1,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp aesdec $rndkey1,$inout2 - pxor $twres,@tweak[5] aesdec $rndkey1,$inout3 + movdqa @tweak[3],`16*3`(%rsp) + pxor $twtmp,@tweak[5] aesdec $rndkey1,$inout4 + movaps @tweak[2],@tweak[3] + movdqa $twres,$twtmp aesdec $rndkey1,$inout5 + $movkey -32($key),$rndkey1 - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[2] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - aesdeclast $rndkey0,$inout0 - pand $twmask,$twres # 
isolate carry and residue - aesdeclast $rndkey0,$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits - aesdeclast $rndkey0,$inout2 - pxor $twres,@tweak[5] - aesdeclast $rndkey0,$inout3 - aesdeclast $rndkey0,$inout4 - aesdeclast $rndkey0,$inout5 - - pshufd \$0x13,$twtmp,$twres - pxor $twtmp,$twtmp - movdqa @tweak[5],@tweak[3] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - xorps `16*0`(%rsp),$inout0 # output^=tweak - pand $twmask,$twres # isolate carry and residue - xorps `16*1`(%rsp),$inout1 - pcmpgtd @tweak[5],$twtmp # broadcat upper bits + paddd $twres,$twres + aesdec $rndkey0,$inout0 + pxor @tweak[5],@tweak[2] + psrad \$31,$twtmp + aesdec $rndkey0,$inout1 + paddq @tweak[5],@tweak[5] + pand $twmask,$twtmp + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + pxor $twtmp,@tweak[5] + movaps @tweak[3],@tweak[4] + aesdec $rndkey0,$inout5 + + movdqa $twres,$rndkey0 + paddd $twres,$twres + aesdec $rndkey1,$inout0 + pxor @tweak[5],@tweak[3] + psrad \$31,$rndkey0 + aesdec $rndkey1,$inout1 + paddq @tweak[5],@tweak[5] + pand $twmask,$rndkey0 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + pxor $rndkey0,@tweak[5] + $movkey ($key_),$rndkey0 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey 16($key_),$rndkey1 + + pxor @tweak[5],@tweak[4] + aesdeclast `16*0`(%rsp),$inout0 + psrad \$31,$twres + paddq @tweak[5],@tweak[5] + aesdeclast `16*1`(%rsp),$inout1 + aesdeclast `16*2`(%rsp),$inout2 + pand $twmask,$twres + mov %r10,%rax # restore $rounds + aesdeclast `16*3`(%rsp),$inout3 + aesdeclast `16*4`(%rsp),$inout4 + aesdeclast `16*5`(%rsp),$inout5 pxor $twres,@tweak[5] - xorps `16*2`(%rsp),$inout2 - movups $inout0,`16*0`($out) # write output - xorps `16*3`(%rsp),$inout3 - movups $inout1,`16*1`($out) - xorps `16*4`(%rsp),$inout4 - movups $inout2,`16*2`($out) - xorps `16*5`(%rsp),$inout5 - movups $inout3,`16*3`($out) - mov $rnds_,$rounds # restore $rounds - movups $inout4,`16*4`($out) - movups $inout5,`16*5`($out) lea `16*6`($out),$out + 
movups $inout0,`-16*6`($out) # write output + movups $inout1,`-16*5`($out) + movups $inout2,`-16*4`($out) + movups $inout3,`-16*3`($out) + movups $inout4,`-16*2`($out) + movups $inout5,`-16*1`($out) sub \$16*6,$len jnc .Lxts_dec_grandloop - lea 3($rounds,$rounds),$rounds # restore original value + mov \$16+96,$rounds + sub $rnds_,$rounds mov $key_,$key # restore $key - mov $rounds,$rnds_ # backup $rounds + shr \$4,$rounds # restore original value .Lxts_dec_short: + mov $rounds,$rnds_ # backup $rounds + pxor $rndkey0,@tweak[0] + pxor $rndkey0,@tweak[1] add \$16*6,$len jz .Lxts_dec_done + pxor $rndkey0,@tweak[2] cmp \$0x20,$len jb .Lxts_dec_one + pxor $rndkey0,@tweak[3] je .Lxts_dec_two + pxor $rndkey0,@tweak[4] cmp \$0x40,$len jb .Lxts_dec_three je .Lxts_dec_four - pshufd \$0x13,$twtmp,$twres - movdqa @tweak[5],@tweak[4] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - movdqu ($inp),$inout0 - pand $twmask,$twres # isolate carry and residue - movdqu 16*1($inp),$inout1 - pxor $twres,@tweak[5] - + movdqu ($inp),$inout0 + movdqu 16*1($inp),$inout1 movdqu 16*2($inp),$inout2 pxor @tweak[0],$inout0 movdqu 16*3($inp),$inout3 @@ -2108,7 +2361,7 @@ $code.=<<___; xorps @tweak[0],$inout0 xorps @tweak[1],$inout1 - call _aesni_decrypt3 + call _aesni_decrypt2 xorps @tweak[0],$inout0 movdqa @tweak[2],@tweak[0] @@ -2134,7 +2387,7 @@ $code.=<<___; xorps @tweak[0],$inout0 movdqa @tweak[3],@tweak[0] xorps @tweak[1],$inout1 - movdqa @tweak[5],@tweak[1] + movdqa @tweak[4],@tweak[1] xorps @tweak[2],$inout2 movups $inout0,($out) movups $inout1,16*1($out) @@ -2144,14 +2397,8 @@ $code.=<<___; .align 16 .Lxts_dec_four: - pshufd \$0x13,$twtmp,$twres - movdqa @tweak[5],@tweak[4] - paddq @tweak[5],@tweak[5] # psllq 1,$tweak - movups ($inp),$inout0 - pand $twmask,$twres # isolate carry and residue - movups 16*1($inp),$inout1 - pxor $twres,@tweak[5] - + movups ($inp),$inout0 + movups 16*1($inp),$inout1 movups 16*2($inp),$inout2 xorps @tweak[0],$inout0 movups 16*3($inp),$inout3 @@ -2162,16 +2409,16 
@@ $code.=<<___; call _aesni_decrypt4 - xorps @tweak[0],$inout0 + pxor @tweak[0],$inout0 movdqa @tweak[4],@tweak[0] - xorps @tweak[1],$inout1 + pxor @tweak[1],$inout1 movdqa @tweak[5],@tweak[1] - xorps @tweak[2],$inout2 - movups $inout0,($out) - xorps @tweak[3],$inout3 - movups $inout1,16*1($out) - movups $inout2,16*2($out) - movups $inout3,16*3($out) + pxor @tweak[2],$inout2 + movdqu $inout0,($out) + pxor @tweak[3],$inout3 + movdqu $inout1,16*1($out) + movdqu $inout2,16*2($out) + movdqu $inout3,16*3($out) lea 16*4($out),$out jmp .Lxts_dec_done @@ -2242,7 +2489,10 @@ ___ # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); { -my $frame_size = 0x10 + ($win64?0x40:0); # used in decrypt +my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt +my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); +my $inp_=$key_; + $code.=<<___; .globl ${PREFIX}_cbc_encrypt .type ${PREFIX}_cbc_encrypt,\@function,6 @@ -2308,248 +2558,359 @@ $code.=<<___ if ($win64); movaps %xmm7,0x20(%rsp) movaps %xmm8,0x30(%rsp) movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) .Lcbc_decrypt_body: ___ $code.=<<___; lea -8(%rax),%rbp movups ($ivp),$iv mov $rnds_,$rounds - cmp \$0x70,$len + cmp \$0x50,$len jbe .Lcbc_dec_tail - shr \$1,$rnds_ - sub \$0x70,$len - mov $rnds_,$rounds - movaps $iv,(%rsp) + + $movkey ($key),$rndkey0 + movdqu 0x00($inp),$inout0 # load input + movdqu 0x10($inp),$inout1 + movdqa $inout0,$in0 + movdqu 0x20($inp),$inout2 + movdqa $inout1,$in1 + movdqu 0x30($inp),$inout3 + movdqa $inout2,$in2 + movdqu 0x40($inp),$inout4 + movdqa $inout3,$in3 + movdqu 0x50($inp),$inout5 + movdqa $inout4,$in4 + mov OPENSSL_ia32cap_P+4(%rip),%r9d + cmp \$0x70,$len + jbe .Lcbc_dec_six_or_seven + + and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE + sub \$0x50,$len + cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE + je .Lcbc_dec_loop6_enter + 
sub \$0x20,$len + lea 0x70($key),$key # size optimization jmp .Lcbc_dec_loop8_enter .align 16 .Lcbc_dec_loop8: - movaps $rndkey0,(%rsp) # save IV movups $inout7,($out) lea 0x10($out),$out .Lcbc_dec_loop8_enter: - $movkey ($key),$rndkey0 - movups ($inp),$inout0 # load input - movups 0x10($inp),$inout1 - $movkey 16($key),$rndkey1 + movdqu 0x60($inp),$inout6 + pxor $rndkey0,$inout0 + movdqu 0x70($inp),$inout7 + pxor $rndkey0,$inout1 + $movkey 0x10-0x70($key),$rndkey1 + pxor $rndkey0,$inout2 + xor $inp_,$inp_ + cmp \$0x70,$len # is there at least 0x60 bytes ahead? + pxor $rndkey0,$inout3 + pxor $rndkey0,$inout4 + pxor $rndkey0,$inout5 + pxor $rndkey0,$inout6 - lea 32($key),$key - movdqu 0x20($inp),$inout2 - xorps $rndkey0,$inout0 - movdqu 0x30($inp),$inout3 - xorps $rndkey0,$inout1 - movdqu 0x40($inp),$inout4 aesdec $rndkey1,$inout0 - pxor $rndkey0,$inout2 - movdqu 0x50($inp),$inout5 + pxor $rndkey0,$inout7 + $movkey 0x20-0x70($key),$rndkey0 aesdec $rndkey1,$inout1 - pxor $rndkey0,$inout3 - movdqu 0x60($inp),$inout6 aesdec $rndkey1,$inout2 - pxor $rndkey0,$inout4 - movdqu 0x70($inp),$inout7 aesdec $rndkey1,$inout3 - pxor $rndkey0,$inout5 - dec $rounds aesdec $rndkey1,$inout4 - pxor $rndkey0,$inout6 aesdec $rndkey1,$inout5 - pxor $rndkey0,$inout7 - $movkey ($key),$rndkey0 aesdec $rndkey1,$inout6 + setnc ${inp_}b + shl \$7,$inp_ aesdec $rndkey1,$inout7 - $movkey 16($key),$rndkey1 + add $inp,$inp_ + $movkey 0x30-0x70($key),$rndkey1 +___ +for($i=1;$i<12;$i++) { +my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; +$code.=<<___ if ($i==7); + cmp \$11,$rounds +___ +$code.=<<___; + aesdec $rndkeyx,$inout0 + aesdec $rndkeyx,$inout1 + aesdec $rndkeyx,$inout2 + aesdec $rndkeyx,$inout3 + aesdec $rndkeyx,$inout4 + aesdec $rndkeyx,$inout5 + aesdec $rndkeyx,$inout6 + aesdec $rndkeyx,$inout7 + $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx +___ +$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); + nop +___ +$code.=<<___ if ($i==7); + jb .Lcbc_dec_done +___ +$code.=<<___ if ($i==9); + je .Lcbc_dec_done 
+___ +$code.=<<___ if ($i==11); + jmp .Lcbc_dec_done +___ +} +$code.=<<___; +.align 16 +.Lcbc_dec_done: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + pxor $rndkey0,$iv + pxor $rndkey0,$in0 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + pxor $rndkey0,$in1 + pxor $rndkey0,$in2 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + pxor $rndkey0,$in3 + pxor $rndkey0,$in4 + aesdec $rndkey1,$inout6 + aesdec $rndkey1,$inout7 + movdqu 0x50($inp),$rndkey1 - call .Ldec_loop8_enter + aesdeclast $iv,$inout0 + movdqu 0x60($inp),$iv # borrow $iv + pxor $rndkey0,$rndkey1 + aesdeclast $in0,$inout1 + pxor $rndkey0,$iv + movdqu 0x70($inp),$rndkey0 # next IV + aesdeclast $in1,$inout2 + lea 0x80($inp),$inp + movdqu 0x00($inp_),$in0 + aesdeclast $in2,$inout3 + aesdeclast $in3,$inout4 + movdqu 0x10($inp_),$in1 + movdqu 0x20($inp_),$in2 + aesdeclast $in4,$inout5 + aesdeclast $rndkey1,$inout6 + movdqu 0x30($inp_),$in3 + movdqu 0x40($inp_),$in4 + aesdeclast $iv,$inout7 + movdqa $rndkey0,$iv # return $iv + movdqu 0x50($inp_),$rndkey1 + $movkey -0x70($key),$rndkey0 + + movups $inout0,($out) # store output + movdqa $in0,$inout0 + movups $inout1,0x10($out) + movdqa $in1,$inout1 + movups $inout2,0x20($out) + movdqa $in2,$inout2 + movups $inout3,0x30($out) + movdqa $in3,$inout3 + movups $inout4,0x40($out) + movdqa $in4,$inout4 + movups $inout5,0x50($out) + movdqa $rndkey1,$inout5 + movups $inout6,0x60($out) + lea 0x70($out),$out - movups ($inp),$rndkey1 # re-load input - movups 0x10($inp),$rndkey0 - xorps (%rsp),$inout0 # ^= IV - xorps $rndkey1,$inout1 - movups 0x20($inp),$rndkey1 - xorps $rndkey0,$inout2 - movups 0x30($inp),$rndkey0 - xorps $rndkey1,$inout3 - movups 0x40($inp),$rndkey1 - xorps $rndkey0,$inout4 - movups 0x50($inp),$rndkey0 - xorps $rndkey1,$inout5 - movups 0x60($inp),$rndkey1 - xorps $rndkey0,$inout6 - movups 0x70($inp),$rndkey0 # IV - xorps $rndkey1,$inout7 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) 
- mov $rnds_,$rounds # restore $rounds - movups $inout4,0x40($out) - mov $key_,$key # restore $key - movups $inout5,0x50($out) - lea 0x80($inp),$inp - movups $inout6,0x60($out) - lea 0x70($out),$out sub \$0x80,$len ja .Lcbc_dec_loop8 movaps $inout7,$inout0 - movaps $rndkey0,$iv + lea -0x70($key),$key add \$0x70,$len jle .Lcbc_dec_tail_collected - movups $inout0,($out) - lea 1($rnds_,$rnds_),$rounds + movups $inout7,($out) + lea 0x10($out),$out + cmp \$0x50,$len + jbe .Lcbc_dec_tail + + movaps $in0,$inout0 +.Lcbc_dec_six_or_seven: + cmp \$0x60,$len + ja .Lcbc_dec_seven + + movaps $inout5,$inout6 + call _aesni_decrypt6 + pxor $iv,$inout0 # ^= IV + movaps $inout6,$iv + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + pxor $in2,$inout3 + movdqu $inout2,0x20($out) + pxor $in3,$inout4 + movdqu $inout3,0x30($out) + pxor $in4,$inout5 + movdqu $inout4,0x40($out) + lea 0x50($out),$out + movdqa $inout5,$inout0 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_seven: + movups 0x60($inp),$inout6 + xorps $inout7,$inout7 + call _aesni_decrypt8 + movups 0x50($inp),$inout7 + pxor $iv,$inout0 # ^= IV + movups 0x60($inp),$iv + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + pxor $in2,$inout3 + movdqu $inout2,0x20($out) + pxor $in3,$inout4 + movdqu $inout3,0x30($out) + pxor $in4,$inout5 + movdqu $inout4,0x40($out) + pxor $inout7,$inout6 + movdqu $inout5,0x50($out) + lea 0x60($out),$out + movdqa $inout6,$inout0 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_loop6: + movups $inout5,($out) + lea 0x10($out),$out + movdqu 0x00($inp),$inout0 # load input + movdqu 0x10($inp),$inout1 + movdqa $inout0,$in0 + movdqu 0x20($inp),$inout2 + movdqa $inout1,$in1 + movdqu 0x30($inp),$inout3 + movdqa $inout2,$in2 + movdqu 0x40($inp),$inout4 + movdqa $inout3,$in3 + movdqu 0x50($inp),$inout5 + movdqa $inout4,$in4 +.Lcbc_dec_loop6_enter: + lea 0x60($inp),$inp + movdqa $inout5,$inout6 + + call 
_aesni_decrypt6 + + pxor $iv,$inout0 # ^= IV + movdqa $inout6,$iv + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + pxor $in2,$inout3 + movdqu $inout2,0x20($out) + pxor $in3,$inout4 + mov $key_,$key + movdqu $inout3,0x30($out) + pxor $in4,$inout5 + mov $rnds_,$rounds + movdqu $inout4,0x40($out) + lea 0x50($out),$out + sub \$0x60,$len + ja .Lcbc_dec_loop6 + + movdqa $inout5,$inout0 + add \$0x50,$len + jle .Lcbc_dec_tail_collected + movups $inout5,($out) lea 0x10($out),$out + .Lcbc_dec_tail: movups ($inp),$inout0 - movaps $inout0,$in0 - cmp \$0x10,$len + sub \$0x10,$len jbe .Lcbc_dec_one movups 0x10($inp),$inout1 - movaps $inout1,$in1 - cmp \$0x20,$len + movaps $inout0,$in0 + sub \$0x10,$len jbe .Lcbc_dec_two movups 0x20($inp),$inout2 - movaps $inout2,$in2 - cmp \$0x30,$len + movaps $inout1,$in1 + sub \$0x10,$len jbe .Lcbc_dec_three movups 0x30($inp),$inout3 - cmp \$0x40,$len + movaps $inout2,$in2 + sub \$0x10,$len jbe .Lcbc_dec_four movups 0x40($inp),$inout4 - cmp \$0x50,$len - jbe .Lcbc_dec_five - - movups 0x50($inp),$inout5 - cmp \$0x60,$len - jbe .Lcbc_dec_six - - movups 0x60($inp),$inout6 - movaps $iv,(%rsp) # save IV - call _aesni_decrypt8 - movups ($inp),$rndkey1 - movups 0x10($inp),$rndkey0 - xorps (%rsp),$inout0 # ^= IV - xorps $rndkey1,$inout1 - movups 0x20($inp),$rndkey1 - xorps $rndkey0,$inout2 - movups 0x30($inp),$rndkey0 - xorps $rndkey1,$inout3 - movups 0x40($inp),$rndkey1 - xorps $rndkey0,$inout4 - movups 0x50($inp),$rndkey0 - xorps $rndkey1,$inout5 - movups 0x60($inp),$iv # IV - xorps $rndkey0,$inout6 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - movups $inout5,0x50($out) - lea 0x60($out),$out - movaps $inout6,$inout0 - sub \$0x70,$len + movaps $inout3,$in3 + movaps $inout4,$in4 + xorps $inout5,$inout5 + call _aesni_decrypt6 + pxor $iv,$inout0 + movaps $in4,$iv + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor 
$in1,$inout2 + movdqu $inout1,0x10($out) + pxor $in2,$inout3 + movdqu $inout2,0x20($out) + pxor $in3,$inout4 + movdqu $inout3,0x30($out) + lea 0x40($out),$out + movdqa $inout4,$inout0 + sub \$0x10,$len jmp .Lcbc_dec_tail_collected + .align 16 .Lcbc_dec_one: + movaps $inout0,$in0 ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; xorps $iv,$inout0 movaps $in0,$iv - sub \$0x10,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_two: - xorps $inout2,$inout2 - call _aesni_decrypt3 - xorps $iv,$inout0 - xorps $in0,$inout1 - movups $inout0,($out) + movaps $inout1,$in1 + call _aesni_decrypt2 + pxor $iv,$inout0 movaps $in1,$iv - movaps $inout1,$inout0 + pxor $in0,$inout1 + movdqu $inout0,($out) + movdqa $inout1,$inout0 lea 0x10($out),$out - sub \$0x20,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_three: + movaps $inout2,$in2 call _aesni_decrypt3 - xorps $iv,$inout0 - xorps $in0,$inout1 - movups $inout0,($out) - xorps $in1,$inout2 - movups $inout1,0x10($out) + pxor $iv,$inout0 movaps $in2,$iv - movaps $inout2,$inout0 + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + movdqa $inout2,$inout0 lea 0x20($out),$out - sub \$0x30,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_four: + movaps $inout3,$in3 call _aesni_decrypt4 - xorps $iv,$inout0 - movups 0x30($inp),$iv - xorps $in0,$inout1 - movups $inout0,($out) - xorps $in1,$inout2 - movups $inout1,0x10($out) - xorps $in2,$inout3 - movups $inout2,0x20($out) - movaps $inout3,$inout0 + pxor $iv,$inout0 + movaps $in3,$iv + pxor $in0,$inout1 + movdqu $inout0,($out) + pxor $in1,$inout2 + movdqu $inout1,0x10($out) + pxor $in2,$inout3 + movdqu $inout2,0x20($out) + movdqa $inout3,$inout0 lea 0x30($out),$out - sub \$0x40,$len - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_five: - xorps $inout5,$inout5 - call _aesni_decrypt6 - movups 0x10($inp),$rndkey1 - movups 0x20($inp),$rndkey0 - xorps $iv,$inout0 - xorps $in0,$inout1 - xorps $rndkey1,$inout2 - movups 
0x30($inp),$rndkey1 - xorps $rndkey0,$inout3 - movups 0x40($inp),$iv - xorps $rndkey1,$inout4 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - lea 0x40($out),$out - movaps $inout4,$inout0 - sub \$0x50,$len - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_six: - call _aesni_decrypt6 - movups 0x10($inp),$rndkey1 - movups 0x20($inp),$rndkey0 - xorps $iv,$inout0 - xorps $in0,$inout1 - xorps $rndkey1,$inout2 - movups 0x30($inp),$rndkey1 - xorps $rndkey0,$inout3 - movups 0x40($inp),$rndkey0 - xorps $rndkey1,$inout4 - movups 0x50($inp),$iv - xorps $rndkey0,$inout5 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - movups $inout4,0x40($out) - lea 0x50($out),$out - movaps $inout5,$inout0 - sub \$0x60,$len jmp .Lcbc_dec_tail_collected + .align 16 .Lcbc_dec_tail_collected: - and \$15,$len movups $iv,($ivp) + and \$15,$len jnz .Lcbc_dec_tail_partial movups $inout0,($out) jmp .Lcbc_dec_ret @@ -2569,6 +2930,12 @@ $code.=<<___ if ($win64); movaps 0x20(%rsp),%xmm7 movaps 0x30(%rsp),%xmm8 movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 ___ $code.=<<___; lea (%rbp),%rsp @@ -2991,7 +3358,7 @@ cbc_se_handler: lea 16(%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 - mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq .Lcommon_rbp_tail: @@ -3156,11 +3523,30 @@ sub aesni { push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } + elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + my $off = $2; + push @opcode,0x44 if ($3>=8); + push 
@opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M + push @opcode,($off=~/^0/?oct($off):$off)&0xff; + return ".byte\t".join(',',@opcode); + } return $line; } +sub movbe { + ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; +} + $code =~ s/\`([^\`]*)\`/eval($1)/gem; $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; +#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact +$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; print $code;