X-Git-Url: https://git.openssl.org/?p=openssl.git;a=blobdiff_plain;f=crypto%2Faes%2Fasm%2Faesni-x86_64.pl;h=ea9cf9404d7bde451b210b72bc105ce1ae0d578e;hp=b1b1c5ed5ab8098b443534d61fa09ae7717ffa92;hb=f8501464cc8fd8b7b4983462944a1894b157d735;hpb=d7d119a3c9b8a26a552c7a8e3e1659db924646eb

diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index b1b1c5ed5a..ea9cf9404d 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -77,7 +77,85 @@
 # overhead affects small-block performance, as well as OFB and CFB
 # results. Differences are not large, most common coefficients are
 # 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
-# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB, CTR)...
+# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
+
+# January 2011
+#
+# While the Westmere processor features 6-cycle latency for aes[enc|dec]
+# instructions, which can be scheduled every second cycle, Sandy
+# Bridge spends 8 cycles per instruction, but it can schedule them
+# every cycle. This means that code targeting Westmere would perform
+# suboptimally on Sandy Bridge. Therefore this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. The relative improvement might appear modest, 8% on
+# Westmere, but in absolute terms it's 3.77 cycles per byte encrypted
+# with a 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These
+# numbers should be compared to the asymptotic limits of 3.75 for
+# Westmere and 5.00 for Sandy Bridge. Actually, the fact that they get
+# this close to the asymptotic limits is quite amazing. Indeed, the
+# limit is calculated as latency times number of rounds, 10 for a
+# 128-bit key, divided by 16, the number of bytes in a block; in other
+# words it accounts *solely* for aesenc instructions. But there are
+# extra instructions, and numbers so close to the asymptotic limits
+# mean that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is that possible? It is possible thanks to
+# the out-of-order execution logic, which manages to overlap the
+# post-processing of the previous block, things like saving the
+# output, with the actual encryption of the current block, as well as
+# the pre-processing of the current block, things like fetching input
+# and xor-ing it with the 0-round element of the key schedule, with
+# the actual encryption of the previous block. Keep this in mind...
+#
+# For parallelizable modes, such as ECB, CBC decrypt and CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks, in which case the asymptotic limit for such
+# modes can be obtained by dividing the above-mentioned numbers by the
+# AES instructions' interleave factor. Westmere can execute at most 3
+# instructions at a time, meaning that the optimal interleave factor
+# is 3, and that's where the "magic" number of 1.25 comes from.
+# "Optimal interleave factor" means that increasing the interleave
+# factor further does not improve performance. The formula has proven
+# to reflect reality pretty well on Westmere... Sandy Bridge on the
+# other hand can execute up to 8 AES instructions at a time, so how
+# does varying the interleave factor affect performance? A sketch of
+# the limit arithmetic follows, then the measured numbers.
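+#
+# As a back-of-envelope check, the limit arithmetic above can be
+# spelled out as follows (a sketch for illustration only, not part of
+# this file; serial_limit/parallel_limit are hypothetical helpers):
+#
+#	sub serial_limit {	# non-parallelizable modes, e.g. CBC encrypt
+#	    my ($latency,$rounds)=@_;
+#	    return $latency*$rounds/16;	# cycles per byte, aesenc only
+#	}
+#	sub parallel_limit {	# parallelizable modes, e.g. ECB, CTR
+#	    my ($latency,$rounds,$ileave)=@_;
+#	    return serial_limit($latency,$rounds)/$ileave;
+#	}
+#	serial_limit(6,10);		# 3.75  - Westmere, 128-bit key
+#	serial_limit(8,10);		# 5.00  - Sandy Bridge
+#	parallel_limit(6,10,3);		# 1.25  - the Westmere "magic" number
+#	parallel_limit(8,10,3);		# 1.67  - Sandy Bridge, 3x interleave
+#	parallel_limit(8,10,6);		# 0.83  - Sandy Bridge, 6x interleave
+#	parallel_limit(8,10,8);		# 0.625 - Sandy Bridge, 8x interleave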
+# Here is the table for ECB (numbers are cycles per byte processed
+# with a 128-bit key):
+#
+# instruction interleave factor		3x	6x	8x
+# theoretical asymptotic limit		1.67	0.83	0.625
+# measured performance for 8KB block	1.05	0.86	0.84
+#
+# "as if" interleave factor		4.7x	5.8x	6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt				1.16	0.93	0.93
+# CTR					1.14	0.91	n/a
+#
+# Well, given the 3x column, it's probably inappropriate to call the
+# limit asymptotic if it can be surpassed, isn't it? What happens
+# there? Rewind to the CBC paragraph for the answer. Yes, out-of-order
+# execution magic is responsible for this. The processor overlaps not
+# only the additional instructions with the AES ones, but even AES
+# instructions processing adjacent triplets of independent blocks. In
+# the 6x case the additional instructions still claim a
+# disproportionately small amount of additional cycles, but in the 8x
+# case the number of instructions must be a tad too high for the
+# out-of-order logic to cope with, and the AES unit remains
+# underutilized... As you can see, 8x interleave is hardly
+# justifiable, so there is no need to feel bad that 32-bit
+# aesni-x86.pl utilizes 6x interleave because of its limited register
+# bank capacity.
+#
+# Higher interleave factors do have a negative impact on Westmere
+# performance. While for ECB mode it's negligible, ~1.5%, other
+# parallelizable modes perform ~5% worse, which is outweighed by a
+# ~25% improvement on Sandy Bridge. To balance the regression on
+# Westmere, CTR mode was implemented with a 6x aesenc interleave
+# factor.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
+# one byte out of 8KB with a 128-bit key, Sandy Bridge - 0.97. Just
+# like in CTR mode, the AES instruction interleave factor was chosen
+# to be 6x.
 
 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
@@ -114,12 +192,14 @@ $rnds_="%r10d";	# backup copy for $rounds
 $key_="%r11";		# backup copy for $key
 
 # %xmm register layout
-$inout0="%xmm0";	$inout1="%xmm1";
-$inout2="%xmm2";	$inout3="%xmm3";
-$rndkey0="%xmm4";	$rndkey1="%xmm5";
-
-$iv="%xmm6";		$in0="%xmm7";	# used in CBC decrypt, CTR, ...
-$in1="%xmm8";		$in2="%xmm9";
+$rndkey0="%xmm0";	$rndkey1="%xmm1";
+$inout0="%xmm2";	$inout1="%xmm3";
+$inout2="%xmm4";	$inout3="%xmm5";
+$inout4="%xmm6";	$inout5="%xmm7";
+$inout6="%xmm8";	$inout7="%xmm9";
+
+$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
+$in0="%xmm8";		$iv="%xmm9";
 
 # Inline version of internal aesni_[en|de]crypt1.
 #
@@ -127,13 +207,22 @@ $in1="%xmm8";	$in2="%xmm9";
 # cycles which take care of loop variables...
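#
# For orientation, the single-block expansion generated below looks
# roughly like this for "enc" without the new $ivec argument (a sketch
# inferred from the surrounding code, not authoritative output; the
# exact text depends on $PREFIX and $movkey, and N is the per-call
# sequence number $sn):
#
#	movups	($key),%xmm0		# $rndkey0
#	movups	16($key),%xmm1		# $rndkey1
#	lea	32($key),$key
#	xorps	%xmm0,%xmm2		# $inout0 ^= round-0 key
#	.Loop_enc1_N:
#	aesenc	%xmm1,%xmm2
#	dec	$rounds
#	movups	($key),%xmm1		# next round key
#	lea	16($key),$key
#	jnz	.Loop_enc1_N
#	aesenclast	%xmm1,%xmm2	# final round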
{ my $sn; sub aesni_generate1 { -my ($p,$key,$rounds,$inout)=@_; $inout=$inout0 if (!defined($inout)); +my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); ++$sn; $code.=<<___; $movkey ($key),$rndkey0 $movkey 16($key),$rndkey1 +___ +$code.=<<___ if (defined($ivec)); + xorps $rndkey0,$ivec + lea 32($key),$key + xorps $ivec,$inout +___ +$code.=<<___ if (!defined($ivec)); lea 32($key),$key - pxor $rndkey0,$inout + xorps $rndkey0,$inout +___ +$code.=<<___; .Loop_${p}1_$sn: aes${p} $rndkey1,$inout dec $rounds @@ -153,7 +242,7 @@ $code.=<<___; .align 16 ${PREFIX}_encrypt: movups ($inp),$inout0 # load input - mov 240($key),$rounds # pull $rounds + mov 240($key),$rounds # key->rounds ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; @@ -166,7 +255,7 @@ $code.=<<___; .align 16 ${PREFIX}_decrypt: movups ($inp),$inout0 # load input - mov 240($key),$rounds # pull $rounds + mov 240($key),$rounds # key->rounds ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; @@ -176,16 +265,16 @@ $code.=<<___; ___ } -# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave -# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] -# latency is 6, it turned out that it can be scheduled only every -# *second* cycle. Thus 3x interleave is the one providing optimal +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave +# factor. Why 3x subroutine were originally used in loops? Even though +# aes[enc|dec] latency was originally 6, it could be scheduled only +# every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. As soon -# as/if Intel improves throughput by making it possible to schedule -# the instructions in question *every* cycles I would have to -# implement 6x interleave and use it in loop... +# This is why it makes no sense to implement 2x subroutine. +# aes[enc|dec] latency in next processor generation is 8, but the +# instructions can be scheduled every cycle. Optimal interleave for +# new processor is therefore 8x... sub aesni_generate3 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* @@ -198,9 +287,9 @@ _aesni_${dir}rypt3: shr \$1,$rounds $movkey 16($key),$rndkey1 lea 32($key),$key - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - pxor $rndkey0,$inout2 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + xorps $rndkey0,$inout2 $movkey ($key),$rndkey0 .L${dir}_loop3: @@ -242,11 +331,11 @@ _aesni_${dir}rypt4: shr \$1,$rounds $movkey 16($key),$rndkey1 lea 32($key),$key - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - pxor $rndkey0,$inout2 - pxor $rndkey0,$inout3 - $movkey ($key),$rndkey0 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + xorps $rndkey0,$inout2 + xorps $rndkey0,$inout3 + $movkey ($key),$rndkey0 .L${dir}_loop4: aes${dir} $rndkey1,$inout0 @@ -275,10 +364,155 @@ _aesni_${dir}rypt4: .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 ___ } +sub aesni_generate6 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-5] is cipher/clear text... 
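+#
+# A note on the shr \$1,$rounds in the bodies below (an annotation;
+# round(), last_round(), $k[] and $i are hypothetical names for
+# illustration only): 240($key) holds rounds-1, an odd 9/11/13 (cf. the
+# "rounds-1 after _aesni_set_encrypt_key" comment further down), and it
+# is halved because every pass through .L${dir}_loopN applies two round
+# keys, $rndkey1 and then $rndkey0. Schematically, for a 128-bit key:
+#
+#	my $r = 9 >> 1;				# 4
+#	my $i = 2;
+#	round($k[1]); $r--;			# peeled into the prologue
+#	while (1) {
+#		round($k[$i++]);		# .L${dir}_loopN_enter
+#		last if $r == 0;
+#		round($k[$i++]); $r--;		# .L${dir}_loopN
+#	}
+#	round($k[$i++]);			# straight-line tail...
+#	last_round($k[$i]);			# ...ends with aes${dir}last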
+$code.=<<___; +.type _aesni_${dir}rypt6,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt6: + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + lea 32($key),$key + xorps $rndkey0,$inout0 + pxor $rndkey0,$inout1 + aes${dir} $rndkey1,$inout0 + pxor $rndkey0,$inout2 + aes${dir} $rndkey1,$inout1 + pxor $rndkey0,$inout3 + aes${dir} $rndkey1,$inout2 + pxor $rndkey0,$inout4 + aes${dir} $rndkey1,$inout3 + pxor $rndkey0,$inout5 + dec $rounds + aes${dir} $rndkey1,$inout4 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout5 + jmp .L${dir}_loop6_enter +.align 16 +.L${dir}_loop6: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + dec $rounds + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 +.L${dir}_loop6_enter: # happens to be 16-byte aligned + $movkey 16($key),$rndkey1 + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + lea 32($key),$key + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + $movkey ($key),$rndkey0 + jnz .L${dir}_loop6 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + ret +.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 +___ +} +sub aesni_generate8 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-7] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt8,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt8: + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + lea 32($key),$key + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + aes${dir} $rndkey1,$inout0 + pxor $rndkey0,$inout2 + aes${dir} $rndkey1,$inout1 + pxor $rndkey0,$inout3 + aes${dir} $rndkey1,$inout2 + pxor $rndkey0,$inout4 + aes${dir} $rndkey1,$inout3 + pxor $rndkey0,$inout5 + dec $rounds + aes${dir} $rndkey1,$inout4 + pxor $rndkey0,$inout6 + aes${dir} $rndkey1,$inout5 + pxor $rndkey0,$inout7 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + $movkey 16($key),$rndkey1 + jmp .L${dir}_loop8_enter +.align 16 +.L${dir}_loop8: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + dec $rounds + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + $movkey 16($key),$rndkey1 +.L${dir}_loop8_enter: # happens to be 16-byte aligned + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + lea 32($key),$key + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + aes${dir} $rndkey0,$inout6 + aes${dir} $rndkey0,$inout7 + $movkey ($key),$rndkey0 + jnz .L${dir}_loop8 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + aes${dir}last $rndkey0,$inout6 + aes${dir}last 
$rndkey0,$inout7 + ret +.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 +___ +} &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); +&aesni_generate6("enc") if ($PREFIX eq "aesni"); +&aesni_generate6("dec"); +&aesni_generate8("enc") if ($PREFIX eq "aesni"); +&aesni_generate8("dec"); if ($PREFIX eq "aesni") { ######################################################################## @@ -290,37 +524,73 @@ $code.=<<___; .type aesni_ecb_encrypt,\@function,5 .align 16 aesni_ecb_encrypt: - cmp \$16,$len # check length - jb .Lecb_ret - - mov 240($key),$rounds # pull $rounds and \$-16,$len + jz .Lecb_ret + + mov 240($key),$rounds # key->rounds + $movkey ($key),$rndkey0 mov $key,$key_ # backup $key mov $rounds,$rnds_ # backup $rounds test %r8d,%r8d # 5th argument jz .Lecb_decrypt #--------------------------- ECB ENCRYPT ------------------------------# - cmp \$0x40,$len - jbe .Lecb_enc_tail - sub \$0x40,$len - jmp .Lecb_enc_loop3 + cmp \$0x80,$len + jb .Lecb_enc_tail + + movdqu ($inp),$inout0 + movdqu 0x10($inp),$inout1 + movdqu 0x20($inp),$inout2 + movdqu 0x30($inp),$inout3 + movdqu 0x40($inp),$inout4 + movdqu 0x50($inp),$inout5 + movdqu 0x60($inp),$inout6 + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp + sub \$0x80,$len + jmp .Lecb_enc_loop8_enter .align 16 -.Lecb_enc_loop3: - movups ($inp),$inout0 - movups 0x10($inp),$inout1 - movups 0x20($inp),$inout2 - call _aesni_encrypt3 - lea 0x30($inp),$inp +.Lecb_enc_loop8: movups $inout0,($out) + mov $key_,$key # restore $key + movdqu ($inp),$inout0 mov $rnds_,$rounds # restore $rounds movups $inout1,0x10($out) + movdqu 0x10($inp),$inout1 + movups $inout2,0x20($out) + movdqu 0x20($inp),$inout2 + movups $inout3,0x30($out) + movdqu 0x30($inp),$inout3 + movups $inout4,0x40($out) + movdqu 0x40($inp),$inout4 + movups $inout5,0x50($out) + movdqu 0x50($inp),$inout5 + movups $inout6,0x60($out) + movdqu 0x60($inp),$inout6 + movups $inout7,0x70($out) + lea 0x80($out),$out + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp +.Lecb_enc_loop8_enter: + + call _aesni_encrypt8 + + sub \$0x80,$len + jnc .Lecb_enc_loop8 + + movups $inout0,($out) mov $key_,$key # restore $key + movups $inout1,0x10($out) + mov $rnds_,$rounds # restore $rounds movups $inout2,0x20($out) - lea 0x30($out),$out - sub \$0x30,$len - ja .Lecb_enc_loop3 + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) + movups $inout7,0x70($out) + lea 0x80($out),$out + add \$0x80,$len + jz .Lecb_ret - add \$0x40,$len .Lecb_enc_tail: movups ($inp),$inout0 cmp \$0x20,$len @@ -328,14 +598,24 @@ aesni_ecb_encrypt: movups 0x10($inp),$inout1 je .Lecb_enc_two movups 0x20($inp),$inout2 - cmp \$0x30,$len - je .Lecb_enc_three + cmp \$0x40,$len + jb .Lecb_enc_three movups 0x30($inp),$inout3 - call _aesni_encrypt4 + je .Lecb_enc_four + movups 0x40($inp),$inout4 + cmp \$0x60,$len + jb .Lecb_enc_five + movups 0x50($inp),$inout5 + je .Lecb_enc_six + movdqu 0x60($inp),$inout6 + call _aesni_encrypt8 movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) jmp .Lecb_ret .align 16 .Lecb_enc_one: @@ -346,7 +626,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_enc_two: - pxor $inout2,$inout2 + xorps $inout2,$inout2 call _aesni_encrypt3 movups $inout0,($out) movups $inout1,0x10($out) @@ -358,30 +638,95 @@ $code.=<<___; movups $inout1,0x10($out) movups 
$inout2,0x20($out) jmp .Lecb_ret +.align 16 +.Lecb_enc_four: + call _aesni_encrypt4 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_five: + xorps $inout5,$inout5 + call _aesni_encrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_six: + call _aesni_encrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + jmp .Lecb_ret #--------------------------- ECB DECRYPT ------------------------------# .align 16 .Lecb_decrypt: - cmp \$0x40,$len - jbe .Lecb_dec_tail - sub \$0x40,$len - jmp .Lecb_dec_loop3 + cmp \$0x80,$len + jb .Lecb_dec_tail + + movdqu ($inp),$inout0 + movdqu 0x10($inp),$inout1 + movdqu 0x20($inp),$inout2 + movdqu 0x30($inp),$inout3 + movdqu 0x40($inp),$inout4 + movdqu 0x50($inp),$inout5 + movdqu 0x60($inp),$inout6 + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp + sub \$0x80,$len + jmp .Lecb_dec_loop8_enter .align 16 -.Lecb_dec_loop3: - movups ($inp),$inout0 - movups 0x10($inp),$inout1 - movups 0x20($inp),$inout2 - call _aesni_decrypt3 - lea 0x30($inp),$inp +.Lecb_dec_loop8: movups $inout0,($out) + mov $key_,$key # restore $key + movdqu ($inp),$inout0 mov $rnds_,$rounds # restore $rounds movups $inout1,0x10($out) + movdqu 0x10($inp),$inout1 + movups $inout2,0x20($out) + movdqu 0x20($inp),$inout2 + movups $inout3,0x30($out) + movdqu 0x30($inp),$inout3 + movups $inout4,0x40($out) + movdqu 0x40($inp),$inout4 + movups $inout5,0x50($out) + movdqu 0x50($inp),$inout5 + movups $inout6,0x60($out) + movdqu 0x60($inp),$inout6 + movups $inout7,0x70($out) + lea 0x80($out),$out + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp +.Lecb_dec_loop8_enter: + + call _aesni_decrypt8 + + $movkey ($key_),$rndkey0 + sub \$0x80,$len + jnc .Lecb_dec_loop8 + + movups $inout0,($out) mov $key_,$key # restore $key + movups $inout1,0x10($out) + mov $rnds_,$rounds # restore $rounds movups $inout2,0x20($out) - lea 0x30($out),$out - sub \$0x30,$len - ja .Lecb_dec_loop3 + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) + movups $inout7,0x70($out) + lea 0x80($out),$out + add \$0x80,$len + jz .Lecb_ret - add \$0x40,$len .Lecb_dec_tail: movups ($inp),$inout0 cmp \$0x20,$len @@ -389,14 +734,25 @@ $code.=<<___; movups 0x10($inp),$inout1 je .Lecb_dec_two movups 0x20($inp),$inout2 - cmp \$0x30,$len - je .Lecb_dec_three + cmp \$0x40,$len + jb .Lecb_dec_three movups 0x30($inp),$inout3 - call _aesni_decrypt4 + je .Lecb_dec_four + movups 0x40($inp),$inout4 + cmp \$0x60,$len + jb .Lecb_dec_five + movups 0x50($inp),$inout5 + je .Lecb_dec_six + movups 0x60($inp),$inout6 + $movkey ($key),$rndkey0 + call _aesni_decrypt8 movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) jmp .Lecb_ret .align 16 .Lecb_dec_one: @@ -407,7 +763,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_dec_two: - pxor $inout2,$inout2 + xorps $inout2,$inout2 call _aesni_decrypt3 movups $inout0,($out) movups $inout1,0x10($out) @@ -418,6 +774,34 @@ $code.=<<___; movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_four: + call _aesni_decrypt4 + movups $inout0,($out) + movups 
$inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_five: + xorps $inout5,$inout5 + call _aesni_decrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_six: + call _aesni_decrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) .Lecb_ret: ret @@ -467,25 +851,45 @@ $code.=<<___; movdqa $iv,$inout0 .Lccm64_enc_outer: - movdqu ($inp),$in0 # load inp + movups ($inp),$in0 # load inp pshufb $bswap_mask,$inout0 mov $key_,$key mov $rnds_,$rounds - pxor $in0,$inout1 # cmac^=inp - pxor $inout2,$inout2 - call _aesni_encrypt3 + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$in0 + lea 32($key),$key + xorps $rndkey0,$inout0 + xorps $inout1,$in0 # cmac^=inp + $movkey ($key),$rndkey0 + +.Lccm64_enc2_loop: + aesenc $rndkey1,$inout0 + dec $rounds + aesenc $rndkey1,$inout1 + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + lea 32($key),$key + aesenc $rndkey0,$inout1 + $movkey 0($key),$rndkey0 + jnz .Lccm64_enc2_loop + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenclast $rndkey0,$inout0 + aesenclast $rndkey0,$inout1 paddq $increment,$iv dec $len lea 16($inp),$inp - pxor $inout0,$in0 # inp ^= E(iv) + xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 - movdqu $in0,($out) # save output + movups $in0,($out) # save output lea 16($out),$out jnz .Lccm64_enc_outer - movdqu $inout1,($cmac) + movups $inout1,($cmac) ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 @@ -529,24 +933,42 @@ ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; .Lccm64_dec_outer: - movdqu ($inp),$in0 # load inp paddq $increment,$iv - dec $len - lea 16($inp),$inp - pxor $inout0,$in0 + movups ($inp),$in0 # load inp + xorps $inout0,$in0 movdqa $iv,$inout0 + lea 16($inp),$inp + pshufb $bswap_mask,$inout0 mov $key_,$key mov $rnds_,$rounds - pshufb $bswap_mask,$inout0 - movdqu $in0,($out) + movups $in0,($out) lea 16($out),$out - pxor $in0,$inout1 # cmac^=out + sub \$1,$len jz .Lccm64_dec_break - pxor $inout2,$inout2 - call _aesni_encrypt3 + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$in0 + lea 32($key),$key + xorps $rndkey0,$inout0 + xorps $in0,$inout1 # cmac^=out + $movkey ($key),$rndkey0 +.Lccm64_dec2_loop: + aesenc $rndkey1,$inout0 + dec $rounds + aesenc $rndkey1,$inout1 + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + lea 32($key),$key + aesenc $rndkey0,$inout1 + $movkey 0($key),$rndkey0 + jnz .Lccm64_dec2_loop + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenclast $rndkey0,$inout0 jmp .Lccm64_dec_outer .align 16 @@ -554,7 +976,7 @@ $code.=<<___; ___ &aesni_generate1("enc",$key,$rounds,$inout1); $code.=<<___; - movdqu $inout1,($cmac) + movups $inout1,($cmac) ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 @@ -577,208 +999,1122 @@ ___ # Handles only complete blocks, operates on 32-bit counter and # does not update *ivec! 
(see engine/eng_aesni.c for details) # -my $increment="%xmm10"; -my $bswap_mask="%xmm11"; +{ +my $reserved = $win64?0:-0x28; +my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11)); +my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14"); +my $bswap_mask="%xmm15"; + +$code.=<<___; +.globl aesni_ctr32_encrypt_blocks +.type aesni_ctr32_encrypt_blocks,\@function,5 +.align 16 +aesni_ctr32_encrypt_blocks: +___ +$code.=<<___ if ($win64); + lea -0xc8(%rsp),%rsp + movaps %xmm6,0x20(%rsp) + movaps %xmm7,0x30(%rsp) + movaps %xmm8,0x40(%rsp) + movaps %xmm9,0x50(%rsp) + movaps %xmm10,0x60(%rsp) + movaps %xmm11,0x70(%rsp) + movaps %xmm12,0x80(%rsp) + movaps %xmm13,0x90(%rsp) + movaps %xmm14,0xa0(%rsp) + movaps %xmm15,0xb0(%rsp) +.Lctr32_body: +___ +$code.=<<___; + cmp \$1,$len + je .Lctr32_one_shortcut + + movdqu ($ivp),$ivec + movdqa .Lbswap_mask(%rip),$bswap_mask + xor $rounds,$rounds + pextrd \$3,$ivec,$rnds_ # pull 32-bit counter + pinsrd \$3,$rounds,$ivec # wipe 32-bit counter + + mov 240($key),$rounds # key->rounds + bswap $rnds_ + pxor $iv0,$iv0 # vector of 3 32-bit counters + pxor $iv1,$iv1 # vector of 3 32-bit counters + pinsrd \$0,$rnds_,$iv0 + lea 3($rnds_),$key_ + pinsrd \$0,$key_,$iv1 + inc $rnds_ + pinsrd \$1,$rnds_,$iv0 + inc $key_ + pinsrd \$1,$key_,$iv1 + inc $rnds_ + pinsrd \$2,$rnds_,$iv0 + inc $key_ + pinsrd \$2,$key_,$iv1 + movdqa $iv0,$reserved(%rsp) + pshufb $bswap_mask,$iv0 + movdqa $iv1,`$reserved+0x10`(%rsp) + pshufb $bswap_mask,$iv1 + + pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword + pshufd \$`2<<6`,$iv0,$inout1 + pshufd \$`1<<6`,$iv0,$inout2 + cmp \$6,$len + jb .Lctr32_tail + shr \$1,$rounds + mov $key,$key_ # backup $key + mov $rounds,$rnds_ # backup $rounds + sub \$6,$len + jmp .Lctr32_loop6 + +.align 16 +.Lctr32_loop6: + pshufd \$`3<<6`,$iv1,$inout3 + por $ivec,$inout0 # merge counter-less ivec + $movkey ($key_),$rndkey0 + pshufd \$`2<<6`,$iv1,$inout4 + por $ivec,$inout1 + $movkey 16($key_),$rndkey1 + pshufd \$`1<<6`,$iv1,$inout5 + por $ivec,$inout2 + por $ivec,$inout3 + xorps $rndkey0,$inout0 + por $ivec,$inout4 + por $ivec,$inout5 + + # inline _aesni_encrypt6 and interleave last rounds + # with own code... 
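+	# (annotation) Each counter block is assembled from two halves:
+	# ivec keeps the IV with its counter dword wiped, while iv0/iv1
+	# carry 3+3 host-order counters that pshufb byte-swaps and
+	# pshufd moves into the top dword; the por instructions above
+	# merge the halves, in effect:
+	#
+	#	block[i] = iv_low96 | bswap32(ctr+i) << 96
+	#
+	# Interleaving the aesenc rounds below with the counter
+	# increments, byte swaps and input loads keeps the AES unit busy
+	# while that bookkeeping retires, as discussed at the top of the
+	# file.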
+ + pxor $rndkey0,$inout1 + aesenc $rndkey1,$inout0 + lea 32($key_),$key + pxor $rndkey0,$inout2 + aesenc $rndkey1,$inout1 + movdqa .Lincrement32(%rip),$iv1 + pxor $rndkey0,$inout3 + aesenc $rndkey1,$inout2 + movdqa $reserved(%rsp),$iv0 + pxor $rndkey0,$inout4 + aesenc $rndkey1,$inout3 + pxor $rndkey0,$inout5 + $movkey ($key),$rndkey0 + dec $rounds + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + jmp .Lctr32_enc_loop6_enter +.align 16 +.Lctr32_enc_loop6: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + dec $rounds + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 +.Lctr32_enc_loop6_enter: + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + lea 32($key),$key + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey ($key),$rndkey0 + jnz .Lctr32_enc_loop6 + + aesenc $rndkey1,$inout0 + paddd $iv1,$iv0 # increment counter vector + aesenc $rndkey1,$inout1 + paddd `$reserved+0x10`(%rsp),$iv1 + aesenc $rndkey1,$inout2 + movdqa $iv0,$reserved(%rsp) # save counter vector + aesenc $rndkey1,$inout3 + movdqa $iv1,`$reserved+0x10`(%rsp) + aesenc $rndkey1,$inout4 + pshufb $bswap_mask,$iv0 # byte swap + aesenc $rndkey1,$inout5 + pshufb $bswap_mask,$iv1 + + aesenclast $rndkey0,$inout0 + movups ($inp),$in0 # load input + aesenclast $rndkey0,$inout1 + movups 0x10($inp),$in1 + aesenclast $rndkey0,$inout2 + movups 0x20($inp),$in2 + aesenclast $rndkey0,$inout3 + movups 0x30($inp),$in3 + aesenclast $rndkey0,$inout4 + movups 0x40($inp),$rndkey1 + aesenclast $rndkey0,$inout5 + movups 0x50($inp),$rndkey0 + lea 0x60($inp),$inp + + xorps $inout0,$in0 # xor + pshufd \$`3<<6`,$iv0,$inout0 + xorps $inout1,$in1 + pshufd \$`2<<6`,$iv0,$inout1 + movups $in0,($out) # store output + xorps $inout2,$in2 + pshufd \$`1<<6`,$iv0,$inout2 + movups $in1,0x10($out) + xorps $inout3,$in3 + movups $in2,0x20($out) + xorps $inout4,$rndkey1 + movups $in3,0x30($out) + xorps $inout5,$rndkey0 + movups $rndkey1,0x40($out) + movups $rndkey0,0x50($out) + lea 0x60($out),$out + mov $rnds_,$rounds + sub \$6,$len + jnc .Lctr32_loop6 + + add \$6,$len + jz .Lctr32_done + mov $key_,$key # restore $key + lea 1($rounds,$rounds),$rounds # restore original value + +.Lctr32_tail: + por $ivec,$inout0 + movups ($inp),$in0 + cmp \$2,$len + jb .Lctr32_one + + por $ivec,$inout1 + movups 0x10($inp),$in1 + je .Lctr32_two + + pshufd \$`3<<6`,$iv1,$inout3 + por $ivec,$inout2 + movups 0x20($inp),$in2 + cmp \$4,$len + jb .Lctr32_three + + pshufd \$`2<<6`,$iv1,$inout4 + por $ivec,$inout3 + movups 0x30($inp),$in3 + je .Lctr32_four + + por $ivec,$inout4 + xorps $inout5,$inout5 + + call _aesni_encrypt6 + + movups 0x40($inp),$rndkey1 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + xorps $inout2,$in2 + movups $in1,0x10($out) + xorps $inout3,$in3 + movups $in2,0x20($out) + xorps $inout4,$rndkey1 + movups $in3,0x30($out) + movups $rndkey1,0x40($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_one_shortcut: + movups ($ivp),$inout0 + movups ($inp),$in0 + mov 240($key),$rounds # key->rounds +.Lctr32_one: +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + xorps $inout0,$in0 + movups $in0,($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_two: + xorps $inout2,$inout2 + call _aesni_encrypt3 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + movups $in1,0x10($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_three: + call _aesni_encrypt3 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + 
xorps $inout2,$in2 + movups $in1,0x10($out) + movups $in2,0x20($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_four: + call _aesni_encrypt4 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + xorps $inout2,$in2 + movups $in1,0x10($out) + xorps $inout3,$in3 + movups $in2,0x20($out) + movups $in3,0x30($out) + +.Lctr32_done: +___ +$code.=<<___ if ($win64); + movaps 0x20(%rsp),%xmm6 + movaps 0x30(%rsp),%xmm7 + movaps 0x40(%rsp),%xmm8 + movaps 0x50(%rsp),%xmm9 + movaps 0x60(%rsp),%xmm10 + movaps 0x70(%rsp),%xmm11 + movaps 0x80(%rsp),%xmm12 + movaps 0x90(%rsp),%xmm13 + movaps 0xa0(%rsp),%xmm14 + movaps 0xb0(%rsp),%xmm15 + lea 0xc8(%rsp),%rsp +.Lctr32_ret: +___ +$code.=<<___; + ret +.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks +___ +} + +###################################################################### +# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2 +# const unsigned char iv[16]); +# +{ +my @tweak=map("%xmm$_",(10..15)); +my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); +my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); +my $frame_size = 0x68 + ($win64?160:0); + +$code.=<<___; +.globl aesni_xts_encrypt +.type aesni_xts_encrypt,\@function,6 +.align 16 +aesni_xts_encrypt: + lea -$frame_size(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,0x60(%rsp) + movaps %xmm7,0x70(%rsp) + movaps %xmm8,0x80(%rsp) + movaps %xmm9,0x90(%rsp) + movaps %xmm10,0xa0(%rsp) + movaps %xmm11,0xb0(%rsp) + movaps %xmm12,0xc0(%rsp) + movaps %xmm13,0xd0(%rsp) + movaps %xmm14,0xe0(%rsp) + movaps %xmm15,0xf0(%rsp) +.Lxts_enc_body: +___ +$code.=<<___; + movups ($ivp),@tweak[5] # load clear-text tweak + mov 240(%r8),$rounds # key2->rounds + mov 240($key),$rnds_ # key1->rounds +___ + # generate the tweak + &aesni_generate1("enc",$key2,$rounds,@tweak[5]); +$code.=<<___; + mov $key,$key_ # backup $key + mov $rnds_,$rounds # backup $rounds + mov $len,$len_ # backup $len + and \$-16,$len + + movdqa .Lxts_magic(%rip),$twmask + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp # broadcast upper bits +___ + for ($i=0;$i<4;$i++) { + $code.=<<___; + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[$i] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + pand $twmask,$twres # isolate carry and residue + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] +___ + } +$code.=<<___; + sub \$16*6,$len + jc .Lxts_enc_short + + shr \$1,$rounds + sub \$1,$rounds + mov $rounds,$rnds_ + jmp .Lxts_enc_grandloop + +.align 16 +.Lxts_enc_grandloop: + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu `16*0`($inp),$inout0 # load input + pand $twmask,$twres # isolate carry and residue + movdqu `16*1`($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu `16*2`($inp),$inout2 + pxor @tweak[0],$inout0 # input^=tweak + movdqu `16*3`($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu `16*4`($inp),$inout4 + pxor @tweak[2],$inout2 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + pxor @tweak[3],$inout3 + $movkey ($key_),$rndkey0 + pxor @tweak[4],$inout4 + pxor @tweak[5],$inout5 + + # inline _aesni_encrypt6 and interleave first and last rounds + # with own code... 
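+	# (annotation, not emitted code) The recurring pshufd/pand/pxor
+	# pattern around .Lxts_magic computes the tweak update, i.e.
+	# multiplication by alpha in GF(2^128) modulo x^128+x^7+x^2+x+1.
+	# A sketch over 16 little-endian tweak bytes t[0..15]:
+	#
+	#	carry = (t[15] & 0x80) ? 0x87 : 0
+	#	for i in 15 .. 1:
+	#		t[i] = ((t[i] << 1) | (t[i-1] >> 7)) & 0xff
+	#	t[0] = ((t[0] << 1) ^ carry) & 0xff
+	#
+	# pcmpgtd broadcasts the tweak's sign bits, pshufd with immediate
+	# 0x13 routes the two relevant ones, pand against .Lxts_magic
+	# isolates the 0x87 feedback and the cross-qword carry, paddq
+	# doubles both 64-bit halves, and pxor folds the fix-up back in.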
+ $movkey 16($key_),$rndkey1 + pxor $rndkey0,$inout0 + pxor $rndkey0,$inout1 + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks + aesenc $rndkey1,$inout0 + lea 32($key_),$key + pxor $rndkey0,$inout2 + movdqa @tweak[1],`16*1`(%rsp) + aesenc $rndkey1,$inout1 + pxor $rndkey0,$inout3 + movdqa @tweak[2],`16*2`(%rsp) + aesenc $rndkey1,$inout2 + pxor $rndkey0,$inout4 + movdqa @tweak[3],`16*3`(%rsp) + aesenc $rndkey1,$inout3 + pxor $rndkey0,$inout5 + $movkey ($key),$rndkey0 + dec $rounds + movdqa @tweak[4],`16*4`(%rsp) + aesenc $rndkey1,$inout4 + movdqa @tweak[5],`16*5`(%rsp) + aesenc $rndkey1,$inout5 + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp + jmp .Lxts_enc_loop6_enter + +.align 16 +.Lxts_enc_loop6: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + dec $rounds + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 +.Lxts_enc_loop6_enter: + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + lea 32($key),$key + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey ($key),$rndkey0 + jnz .Lxts_enc_loop6 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenc $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenc $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcast upper bits + aesenc $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey 16($key),$rndkey1 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[0] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenc $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenc $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesenc $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey 32($key),$rndkey0 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[1] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenc $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenc $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesenc $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[2] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenclast $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenclast $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesenclast $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesenclast $rndkey0,$inout3 + aesenclast $rndkey0,$inout4 + aesenclast $rndkey0,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[3] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + xorps `16*0`(%rsp),$inout0 # output^=tweak + pand $twmask,$twres # isolate carry and residue + xorps `16*1`(%rsp),$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] + + xorps `16*2`(%rsp),$inout2 + movups $inout0,`16*0`($out) # write output + xorps `16*3`(%rsp),$inout3 + movups $inout1,`16*1`($out) + xorps `16*4`(%rsp),$inout4 + movups $inout2,`16*2`($out) + xorps `16*5`(%rsp),$inout5 + movups $inout3,`16*3`($out) + mov $rnds_,$rounds # restore $rounds + movups $inout4,`16*4`($out) + movups $inout5,`16*5`($out) + lea `16*6`($out),$out + sub \$16*6,$len + jnc 
.Lxts_enc_grandloop + + lea 3($rounds,$rounds),$rounds # restore original value + mov $key_,$key # restore $key + mov $rounds,$rnds_ # backup $rounds + +.Lxts_enc_short: + add \$16*6,$len + jz .Lxts_enc_done + + cmp \$0x20,$len + jb .Lxts_enc_one + je .Lxts_enc_two + + cmp \$0x40,$len + jb .Lxts_enc_three + je .Lxts_enc_four + + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu ($inp),$inout0 + pand $twmask,$twres # isolate carry and residue + movdqu 16*1($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu 16*2($inp),$inout2 + pxor @tweak[0],$inout0 + movdqu 16*3($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu 16*4($inp),$inout4 + lea 16*5($inp),$inp + pxor @tweak[2],$inout2 + pxor @tweak[3],$inout3 + pxor @tweak[4],$inout4 + + call _aesni_encrypt6 + + xorps @tweak[0],$inout0 + movdqa @tweak[5],@tweak[0] + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movdqu $inout0,($out) + xorps @tweak[3],$inout3 + movdqu $inout1,16*1($out) + xorps @tweak[4],$inout4 + movdqu $inout2,16*2($out) + movdqu $inout3,16*3($out) + movdqu $inout4,16*4($out) + lea 16*5($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_one: + movups ($inp),$inout0 + lea 16*1($inp),$inp + xorps @tweak[0],$inout0 +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movdqa @tweak[1],@tweak[0] + movups $inout0,($out) + lea 16*1($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_two: + movups ($inp),$inout0 + movups 16($inp),$inout1 + lea 32($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + + call _aesni_encrypt3 + + xorps @tweak[0],$inout0 + movdqa @tweak[2],@tweak[0] + xorps @tweak[1],$inout1 + movups $inout0,($out) + movups $inout1,16*1($out) + lea 16*2($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_three: + movups ($inp),$inout0 + movups 16*1($inp),$inout1 + movups 16*2($inp),$inout2 + lea 16*3($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + + call _aesni_encrypt3 + + xorps @tweak[0],$inout0 + movdqa @tweak[3],@tweak[0] + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movups $inout0,($out) + movups $inout1,16*1($out) + movups $inout2,16*2($out) + lea 16*3($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_four: + movups ($inp),$inout0 + movups 16*1($inp),$inout1 + movups 16*2($inp),$inout2 + xorps @tweak[0],$inout0 + movups 16*3($inp),$inout3 + lea 16*4($inp),$inp + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + xorps @tweak[3],$inout3 + + call _aesni_encrypt4 + + xorps @tweak[0],$inout0 + movdqa @tweak[5],@tweak[0] + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movups $inout0,($out) + xorps @tweak[3],$inout3 + movups $inout1,16*1($out) + movups $inout2,16*2($out) + movups $inout3,16*3($out) + lea 16*4($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_done: + and \$15,$len_ + jz .Lxts_enc_ret + mov $len_,$len + +.Lxts_enc_steal: + movzb ($inp),%eax # borrow $rounds ... + movzb -16($out),%ecx # ... 
and $key + lea 1($inp),$inp + mov %al,-16($out) + mov %cl,0($out) + lea 1($out),$out + sub \$1,$len + jnz .Lxts_enc_steal + + sub $len_,$out # rewind $out + mov $key_,$key # restore $key + mov $rnds_,$rounds # restore $rounds + + movups -16($out),$inout0 + xorps @tweak[0],$inout0 +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movups $inout0,-16($out) + +.Lxts_enc_ret: +___ +$code.=<<___ if ($win64); + movaps 0x60(%rsp),%xmm6 + movaps 0x70(%rsp),%xmm7 + movaps 0x80(%rsp),%xmm8 + movaps 0x90(%rsp),%xmm9 + movaps 0xa0(%rsp),%xmm10 + movaps 0xb0(%rsp),%xmm11 + movaps 0xc0(%rsp),%xmm12 + movaps 0xd0(%rsp),%xmm13 + movaps 0xe0(%rsp),%xmm14 + movaps 0xf0(%rsp),%xmm15 +___ +$code.=<<___; + lea $frame_size(%rsp),%rsp +.Lxts_enc_epilogue: + ret +.size aesni_xts_encrypt,.-aesni_xts_encrypt +___ $code.=<<___; -.globl aesni_ctr32_encrypt_blocks -.type aesni_ctr32_encrypt_blocks,\@function,5 +.globl aesni_xts_decrypt +.type aesni_xts_decrypt,\@function,6 .align 16 -aesni_ctr32_encrypt_blocks: +aesni_xts_decrypt: + lea -$frame_size(%rsp),%rsp ___ $code.=<<___ if ($win64); - lea -0x68(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) - movaps %xmm8,0x20(%rsp) - movaps %xmm9,0x30(%rsp) - movaps %xmm10,0x40(%rsp) - movaps %xmm11,0x50(%rsp) - -.Lctr32_body: + movaps %xmm6,0x60(%rsp) + movaps %xmm7,0x70(%rsp) + movaps %xmm8,0x80(%rsp) + movaps %xmm9,0x90(%rsp) + movaps %xmm10,0xa0(%rsp) + movaps %xmm11,0xb0(%rsp) + movaps %xmm12,0xc0(%rsp) + movaps %xmm13,0xd0(%rsp) + movaps %xmm14,0xe0(%rsp) + movaps %xmm15,0xf0(%rsp) +.Lxts_dec_body: ___ $code.=<<___; - cmp \$1,$len - je .Lctr32_one_shortcut - - movdqu ($ivp),$inout3 - movdqa .Lincrement32(%rip),$increment - movdqa .Lbswap_mask(%rip),$bswap_mask - xor $rounds,$rounds - pextrd \$3,$inout3,$rnds_ # pull 32-bit counter - pinsrd \$3,$rounds,$inout3 # wipe 32-bit counter + movups ($ivp),@tweak[5] # load clear-text tweak + mov 240($key2),$rounds # key2->rounds + mov 240($key),$rnds_ # key1->rounds +___ + # generate the tweak + &aesni_generate1("enc",$key2,$rounds,@tweak[5]); +$code.=<<___; + xor %eax,%eax # if ($len%16) len-=16; + test \$15,$len + setnz %al + shl \$4,%rax + sub %rax,$len + + mov $key,$key_ # backup $key + mov $rnds_,$rounds # backup $rounds + mov $len,$len_ # backup $len + and \$-16,$len - mov 240($key),$rounds # key->rounds - pxor $iv,$iv # vector of 3 32-bit counters - bswap $rnds_ - pinsrd \$0,$rnds_,$iv - inc $rnds_ - pinsrd \$1,$rnds_,$iv - inc $rnds_ - pinsrd \$2,$rnds_,$iv - pshufb $bswap_mask,$iv + movdqa .Lxts_magic(%rip),$twmask + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp # broadcast upper bits +___ + for ($i=0;$i<4;$i++) { + $code.=<<___; + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[$i] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + pand $twmask,$twres # isolate carry and residue + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] +___ + } +$code.=<<___; + sub \$16*6,$len + jc .Lxts_dec_short - cmp \$4,$len - jbe .Lctr32_tail + shr \$1,$rounds + sub \$1,$rounds mov $rounds,$rnds_ - mov $key,$key_ - sub \$4,$len + jmp .Lxts_dec_grandloop -.Lctr32_loop3: - pshufd \$`3<<6`,$iv,$inout0 # place counter to upper dword - pshufd \$`2<<6`,$iv,$inout1 - por $inout3,$inout0 # merge counter-less ivec - pshufd \$`1<<6`,$iv,$inout2 - por $inout3,$inout1 - por $inout3,$inout2 - - # inline _aesni_encrypt3 and interleave last round +.align 16 +.Lxts_dec_grandloop: + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq 
@tweak[5],@tweak[5] # psllq 1,$tweak + movdqu `16*0`($inp),$inout0 # load input + pand $twmask,$twres # isolate carry and residue + movdqu `16*1`($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu `16*2`($inp),$inout2 + pxor @tweak[0],$inout0 # input^=tweak + movdqu `16*3`($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu `16*4`($inp),$inout4 + pxor @tweak[2],$inout2 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + pxor @tweak[3],$inout3 + $movkey ($key_),$rndkey0 + pxor @tweak[4],$inout4 + pxor @tweak[5],$inout5 + + # inline _aesni_decrypt6 and interleave first and last rounds # with own code... + $movkey 16($key_),$rndkey1 + pxor $rndkey0,$inout0 + pxor $rndkey0,$inout1 + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks + aesdec $rndkey1,$inout0 + lea 32($key_),$key + pxor $rndkey0,$inout2 + movdqa @tweak[1],`16*1`(%rsp) + aesdec $rndkey1,$inout1 + pxor $rndkey0,$inout3 + movdqa @tweak[2],`16*2`(%rsp) + aesdec $rndkey1,$inout2 + pxor $rndkey0,$inout4 + movdqa @tweak[3],`16*3`(%rsp) + aesdec $rndkey1,$inout3 + pxor $rndkey0,$inout5 + $movkey ($key),$rndkey0 + dec $rounds + movdqa @tweak[4],`16*4`(%rsp) + aesdec $rndkey1,$inout4 + movdqa @tweak[5],`16*5`(%rsp) + aesdec $rndkey1,$inout5 + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp + jmp .Lxts_dec_loop6_enter - $movkey ($key),$rndkey0 - shr \$1,$rounds - $movkey 16($key),$rndkey1 - lea 32($key),$key - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - pxor $rndkey0,$inout2 - $movkey ($key),$rndkey0 - jmp .Lctr32_enc_loop3 .align 16 -.Lctr32_enc_loop3: - aesenc $rndkey1,$inout0 - aesenc $rndkey1,$inout1 +.Lxts_dec_loop6: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 dec $rounds - aesenc $rndkey1,$inout2 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 +.Lxts_dec_loop6_enter: $movkey 16($key),$rndkey1 - aesenc $rndkey0,$inout0 - aesenc $rndkey0,$inout1 + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 lea 32($key),$key - aesenc $rndkey0,$inout2 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 $movkey ($key),$rndkey0 - jnz .Lctr32_enc_loop3 + jnz .Lxts_dec_loop6 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdec $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdec $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcast upper bits + aesdec $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey 16($key),$rndkey1 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[0] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdec $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdec $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesdec $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey 32($key),$rndkey0 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[1] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdec $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdec $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesdec $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[2] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdeclast 
$rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdeclast $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesdeclast $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesdeclast $rndkey0,$inout3 + aesdeclast $rndkey0,$inout4 + aesdeclast $rndkey0,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[3] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + xorps `16*0`(%rsp),$inout0 # output^=tweak + pand $twmask,$twres # isolate carry and residue + xorps `16*1`(%rsp),$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] + + xorps `16*2`(%rsp),$inout2 + movups $inout0,`16*0`($out) # write output + xorps `16*3`(%rsp),$inout3 + movups $inout1,`16*1`($out) + xorps `16*4`(%rsp),$inout4 + movups $inout2,`16*2`($out) + xorps `16*5`(%rsp),$inout5 + movups $inout3,`16*3`($out) + mov $rnds_,$rounds # restore $rounds + movups $inout4,`16*4`($out) + movups $inout5,`16*5`($out) + lea `16*6`($out),$out + sub \$16*6,$len + jnc .Lxts_dec_grandloop + + lea 3($rounds,$rounds),$rounds # restore original value + mov $key_,$key # restore $key + mov $rounds,$rnds_ # backup $rounds + +.Lxts_dec_short: + add \$16*6,$len + jz .Lxts_dec_done - aesenc $rndkey1,$inout0 - aesenc $rndkey1,$inout1 - aesenc $rndkey1,$inout2 + cmp \$0x20,$len + jb .Lxts_dec_one + je .Lxts_dec_two - pshufb $bswap_mask,$iv - movdqu ($inp),$in0 - aesenclast $rndkey0,$inout0 - movdqu 0x10($inp),$in1 - paddd $increment,$iv - aesenclast $rndkey0,$inout1 - movdqu 0x20($inp),$in2 - pshufb $bswap_mask,$iv - aesenclast $rndkey0,$inout2 - lea 0x30($inp),$inp + cmp \$0x40,$len + jb .Lxts_dec_three + je .Lxts_dec_four + + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu ($inp),$inout0 + pand $twmask,$twres # isolate carry and residue + movdqu 16*1($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu 16*2($inp),$inout2 + pxor @tweak[0],$inout0 + movdqu 16*3($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu 16*4($inp),$inout4 + lea 16*5($inp),$inp + pxor @tweak[2],$inout2 + pxor @tweak[3],$inout3 + pxor @tweak[4],$inout4 + + call _aesni_decrypt6 + + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movdqu $inout0,($out) + xorps @tweak[3],$inout3 + movdqu $inout1,16*1($out) + xorps @tweak[4],$inout4 + movdqu $inout2,16*2($out) + pxor $twtmp,$twtmp + movdqu $inout3,16*3($out) + pcmpgtd @tweak[5],$twtmp + movdqu $inout4,16*4($out) + lea 16*5($out),$out + pshufd \$0x13,$twtmp,@tweak[1] # $twres + and \$15,$len_ + jz .Lxts_dec_ret + + movdqa @tweak[5],@tweak[0] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + pand $twmask,@tweak[1] # isolate carry and residue + pxor @tweak[5],@tweak[1] + jmp .Lxts_dec_done2 - mov $key_,$key - pxor $inout0,$in0 - sub \$3,$len - mov $rnds_,$rounds - pxor $inout1,$in1 - movdqu $in0,($out) - pxor $inout2,$in2 - movdqu $in1,0x10($out) - movdqu $in2,0x20($out) - lea 0x30($out),$out - ja .Lctr32_loop3 +.align 16 +.Lxts_dec_one: + movups ($inp),$inout0 + lea 16*1($inp),$inp + xorps @tweak[0],$inout0 +___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movdqa @tweak[1],@tweak[0] + movups $inout0,($out) + movdqa @tweak[2],@tweak[1] + lea 16*1($out),$out + jmp .Lxts_dec_done - pextrd \$1,$iv,$rnds_ # might need last counter value - add \$4,$len - bswap $rnds_ +.align 16 +.Lxts_dec_two: + movups ($inp),$inout0 + movups 16($inp),$inout1 + lea 32($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 -.Lctr32_tail: - pshufd 
\$`3<<6`,$iv,$inout0 - pshufd \$`2<<6`,$iv,$inout1 - por $inout3,$inout0 - movdqu ($inp),$in0 - cmp \$2,$len - jb .Lctr32_one - lea 1($rnds_),$rnds_ - pshufd \$`1<<6`,$iv,$inout2 - por $inout3,$inout1 - movdqu 0x10($inp),$in1 - je .Lctr32_two - bswap $rnds_ - por $inout3,$inout2 - movdqu 0x20($inp),$in2 - cmp \$3,$len - je .Lctr32_three + call _aesni_decrypt3 - pinsrd \$3,$rnds_,$inout3 # compose last counter value - movdqu 0x30($inp),$iv + xorps @tweak[0],$inout0 + movdqa @tweak[2],@tweak[0] + xorps @tweak[1],$inout1 + movdqa @tweak[3],@tweak[1] + movups $inout0,($out) + movups $inout1,16*1($out) + lea 16*2($out),$out + jmp .Lxts_dec_done - call _aesni_encrypt4 +.align 16 +.Lxts_dec_three: + movups ($inp),$inout0 + movups 16*1($inp),$inout1 + movups 16*2($inp),$inout2 + lea 16*3($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 - pxor $inout0,$in0 - pxor $inout1,$in1 - pxor $inout2,$in2 - movdqu $in0,($out) - pxor $inout3,$iv - movdqu $in1,0x10($out) - movdqu $in2,0x20($out) - movdqu $iv,0x30($out) - jmp .Lctr32_done + call _aesni_decrypt3 -.align 16 -.Lctr32_one_shortcut: - movdqu ($ivp),$inout0 - movdqu ($inp),$in0 - mov 240($key),$rounds # key->rounds -.Lctr32_one: -___ - &aesni_generate1("enc",$key,$rounds); -$code.=<<___; - pxor $inout0,$in0 - movdqu $in0,($out) - jmp .Lctr32_done + xorps @tweak[0],$inout0 + movdqa @tweak[3],@tweak[0] + xorps @tweak[1],$inout1 + movdqa @tweak[5],@tweak[1] + xorps @tweak[2],$inout2 + movups $inout0,($out) + movups $inout1,16*1($out) + movups $inout2,16*2($out) + lea 16*3($out),$out + jmp .Lxts_dec_done .align 16 -.Lctr32_two: - pxor $inout2,$inout2 - call _aesni_encrypt3 - pxor $inout0,$in0 - pxor $inout1,$in1 - movdqu $in0,($out) - movdqu $in1,0x10($out) - jmp .Lctr32_done +.Lxts_dec_four: + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movups ($inp),$inout0 + pand $twmask,$twres # isolate carry and residue + movups 16*1($inp),$inout1 + pxor $twres,@tweak[5] + + movups 16*2($inp),$inout2 + xorps @tweak[0],$inout0 + movups 16*3($inp),$inout3 + lea 16*4($inp),$inp + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + xorps @tweak[3],$inout3 + + call _aesni_decrypt4 + + xorps @tweak[0],$inout0 + movdqa @tweak[4],@tweak[0] + xorps @tweak[1],$inout1 + movdqa @tweak[5],@tweak[1] + xorps @tweak[2],$inout2 + movups $inout0,($out) + xorps @tweak[3],$inout3 + movups $inout1,16*1($out) + movups $inout2,16*2($out) + movups $inout3,16*3($out) + lea 16*4($out),$out + jmp .Lxts_dec_done .align 16 -.Lctr32_three: - call _aesni_encrypt3 - pxor $inout0,$in0 - pxor $inout1,$in1 - pxor $inout2,$in2 - movdqu $in0,($out) - movdqu $in1,0x10($out) - movdqu $in2,0x20($out) +.Lxts_dec_done: + and \$15,$len_ + jz .Lxts_dec_ret +.Lxts_dec_done2: + mov $len_,$len + mov $key_,$key # restore $key + mov $rnds_,$rounds # restore $rounds -.Lctr32_done: + movups ($inp),$inout0 + xorps @tweak[1],$inout0 +___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + xorps @tweak[1],$inout0 + movups $inout0,($out) + +.Lxts_dec_steal: + movzb 16($inp),%eax # borrow $rounds ... + movzb ($out),%ecx # ... 
and $key + lea 1($inp),$inp + mov %al,($out) + mov %cl,16($out) + lea 1($out),$out + sub \$1,$len + jnz .Lxts_dec_steal + + sub $len_,$out # rewind $out + mov $key_,$key # restore $key + mov $rnds_,$rounds # restore $rounds + + movups ($out),$inout0 + xorps @tweak[0],$inout0 ___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movups $inout0,($out) +.Lxts_dec_ret: +___ $code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - movaps 0x20(%rsp),%xmm8 - movaps 0x30(%rsp),%xmm9 - movaps 0x40(%rsp),%xmm10 - movaps 0x50(%rsp),%xmm11 - lea 0x68(%rsp),%rsp -.Lctr32_ret: + movaps 0x60(%rsp),%xmm6 + movaps 0x70(%rsp),%xmm7 + movaps 0x80(%rsp),%xmm8 + movaps 0x90(%rsp),%xmm9 + movaps 0xa0(%rsp),%xmm10 + movaps 0xb0(%rsp),%xmm11 + movaps 0xc0(%rsp),%xmm12 + movaps 0xd0(%rsp),%xmm13 + movaps 0xe0(%rsp),%xmm14 + movaps 0xf0(%rsp),%xmm15 ___ $code.=<<___; + lea $frame_size(%rsp),%rsp +.Lxts_dec_epilogue: ret -.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks +.size aesni_xts_decrypt,.-aesni_xts_decrypt ___ -}} +} }} ######################################################################## # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); -$reserved = $win64?0x40:-0x18; # used in decrypt +{ +my $reserved = $win64?0x40:-0x18; # used in decrypt $code.=<<___; .globl ${PREFIX}_cbc_encrypt .type ${PREFIX}_cbc_encrypt,\@function,6 @@ -787,12 +2123,12 @@ ${PREFIX}_cbc_encrypt: test $len,$len # check length jz .Lcbc_ret - mov 240($key),$rnds_ # pull $rounds + mov 240($key),$rnds_ # key->rounds mov $key,$key_ # backup $key test %r9d,%r9d # 6th argument jz .Lcbc_decrypt #--------------------------- CBC ENCRYPT ------------------------------# - movdqu ($ivp),$inout0 # load iv as initial state + movups ($ivp),$inout0 # load iv as initial state mov $rnds_,$rounds cmp \$16,$len jb .Lcbc_enc_tail @@ -800,11 +2136,11 @@ ${PREFIX}_cbc_encrypt: jmp .Lcbc_enc_loop .align 16 .Lcbc_enc_loop: - movdqu ($inp),$inout1 # load input + movups ($inp),$inout1 # load input lea 16($inp),$inp - pxor $inout1,$inout0 + #xorps $inout1,$inout0 ___ - &aesni_generate1("enc",$key,$rounds); + &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); $code.=<<___; mov $rnds_,$rounds # restore $rounds mov $key_,$key # restore $key @@ -846,106 +2182,251 @@ ___ $code.=<<___; movups ($ivp),$iv mov $rnds_,$rounds - cmp \$0x40,$len + cmp \$0x70,$len jbe .Lcbc_dec_tail - sub \$0x40,$len - jmp .Lcbc_dec_loop3 + shr \$1,$rnds_ + sub \$0x70,$len + mov $rnds_,$rounds + movaps $iv,$reserved(%rsp) + jmp .Lcbc_dec_loop8_enter .align 16 -.Lcbc_dec_loop3: - movups ($inp),$inout0 +.Lcbc_dec_loop8: + movaps $rndkey0,$reserved(%rsp) # save IV + movups $inout7,($out) + lea 0x10($out),$out +.Lcbc_dec_loop8_enter: + $movkey ($key),$rndkey0 + movups ($inp),$inout0 # load input movups 0x10($inp),$inout1 - movups 0x20($inp),$inout2 - movaps $inout0,$in0 - movaps $inout1,$in1 - movaps $inout2,$in2 - - call _aesni_decrypt3 + $movkey 16($key),$rndkey1 - sub \$0x30,$len - lea 0x30($inp),$inp - lea 0x30($out),$out - pxor $iv,$inout0 - pxor $in0,$inout1 - movaps $in2,$iv - pxor $in1,$inout2 - movdqu $inout0,-0x30($out) - mov $rnds_,$rounds # restore $rounds - movdqu $inout1,-0x20($out) - mov $key_,$key # restore $key - movdqu $inout2,-0x10($out) - ja .Lcbc_dec_loop3 + lea 32($key),$key + movdqu 0x20($inp),$inout2 + xorps $rndkey0,$inout0 + movdqu 0x30($inp),$inout3 + xorps $rndkey0,$inout1 + movdqu 0x40($inp),$inout4 + aesdec 
$rndkey1,$inout0 + pxor $rndkey0,$inout2 + movdqu 0x50($inp),$inout5 + aesdec $rndkey1,$inout1 + pxor $rndkey0,$inout3 + movdqu 0x60($inp),$inout6 + aesdec $rndkey1,$inout2 + pxor $rndkey0,$inout4 + movdqu 0x70($inp),$inout7 + aesdec $rndkey1,$inout3 + pxor $rndkey0,$inout5 + dec $rounds + aesdec $rndkey1,$inout4 + pxor $rndkey0,$inout6 + aesdec $rndkey1,$inout5 + pxor $rndkey0,$inout7 + $movkey ($key),$rndkey0 + aesdec $rndkey1,$inout6 + aesdec $rndkey1,$inout7 + $movkey 16($key),$rndkey1 - add \$0x40,$len - movups $iv,($ivp) + call .Ldec_loop8_enter + + movups ($inp),$rndkey1 # re-load input + movups 0x10($inp),$rndkey0 + xorps $reserved(%rsp),$inout0 # ^= IV + xorps $rndkey1,$inout1 + movups 0x20($inp),$rndkey1 + xorps $rndkey0,$inout2 + movups 0x30($inp),$rndkey0 + xorps $rndkey1,$inout3 + movups 0x40($inp),$rndkey1 + xorps $rndkey0,$inout4 + movups 0x50($inp),$rndkey0 + xorps $rndkey1,$inout5 + movups 0x60($inp),$rndkey1 + xorps $rndkey0,$inout6 + movups 0x70($inp),$rndkey0 # IV + xorps $rndkey1,$inout7 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + mov $rnds_,$rounds # restore $rounds + movups $inout4,0x40($out) + mov $key_,$key # restore $key + movups $inout5,0x50($out) + lea 0x80($inp),$inp + movups $inout6,0x60($out) + lea 0x70($out),$out + sub \$0x80,$len + ja .Lcbc_dec_loop8 + + movaps $inout7,$inout0 + movaps $rndkey0,$iv + add \$0x70,$len + jle .Lcbc_dec_tail_collected + movups $inout0,($out) + lea 1($rnds_,$rnds_),$rounds + lea 0x10($out),$out .Lcbc_dec_tail: movups ($inp),$inout0 movaps $inout0,$in0 cmp \$0x10,$len jbe .Lcbc_dec_one + movups 0x10($inp),$inout1 movaps $inout1,$in1 cmp \$0x20,$len jbe .Lcbc_dec_two + movups 0x20($inp),$inout2 movaps $inout2,$in2 cmp \$0x30,$len jbe .Lcbc_dec_three + movups 0x30($inp),$inout3 - call _aesni_decrypt4 - pxor $iv,$inout0 - movups 0x30($inp),$iv - pxor $in0,$inout1 - movdqu $inout0,($out) - pxor $in1,$inout2 - movdqu $inout1,0x10($out) - pxor $in2,$inout3 - movdqu $inout2,0x20($out) - movdqa $inout3,$inout0 - lea 0x30($out),$out + cmp \$0x40,$len + jbe .Lcbc_dec_four + + movups 0x40($inp),$inout4 + cmp \$0x50,$len + jbe .Lcbc_dec_five + + movups 0x50($inp),$inout5 + cmp \$0x60,$len + jbe .Lcbc_dec_six + + movups 0x60($inp),$inout6 + movaps $iv,$reserved(%rsp) # save IV + call _aesni_decrypt8 + movups ($inp),$rndkey1 + movups 0x10($inp),$rndkey0 + xorps $reserved(%rsp),$inout0 # ^= IV + xorps $rndkey1,$inout1 + movups 0x20($inp),$rndkey1 + xorps $rndkey0,$inout2 + movups 0x30($inp),$rndkey0 + xorps $rndkey1,$inout3 + movups 0x40($inp),$rndkey1 + xorps $rndkey0,$inout4 + movups 0x50($inp),$rndkey0 + xorps $rndkey1,$inout5 + movups 0x60($inp),$iv # IV + xorps $rndkey0,$inout6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + lea 0x60($out),$out + movaps $inout6,$inout0 + sub \$0x70,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_one: ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; - pxor $iv,$inout0 + xorps $iv,$inout0 movaps $in0,$iv + sub \$0x10,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_two: - pxor $inout2,$inout2 + xorps $inout2,$inout2 call _aesni_decrypt3 - pxor $iv,$inout0 - pxor $in0,$inout1 - movdqu $inout0,($out) + xorps $iv,$inout0 + xorps $in0,$inout1 + movups $inout0,($out) movaps $in1,$iv - movdqa $inout1,$inout0 + movaps $inout1,$inout0 lea 0x10($out),$out + sub \$0x20,$len jmp .Lcbc_dec_tail_collected .align 16 
@@ -963,7 +2444,7 @@ $code.=<<___;
 	ret
.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
-
+}
# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
#				    int bits, AES_KEY *key)
{ my ($inp,$bits,$key) = @_4args;
@@ -975,7 +2456,7 @@ $code.=<<___;
.align	16
${PREFIX}_set_decrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
-	call	_aesni_set_encrypt_key
+	call	__aesni_set_encrypt_key
 	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
 	test	%eax,%eax
 	jnz	.Ldec_key_ret
@@ -1024,7 +2505,7 @@ $code.=<<___;
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
-_aesni_set_encrypt_key:
+__aesni_set_encrypt_key:
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
 	mov	\$-1,%rax
 	test	$inp,$inp
@@ -1033,7 +2514,7 @@ _aesni_set_encrypt_key:
 	jz	.Lenc_key_ret

 	movups	($inp),%xmm0		# pull first 128 bits of *userKey
-	pxor	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
+	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
 	lea	16($key),%rax
 	cmp	\$256,$bits
 	je	.L14rounds
@@ -1148,11 +2629,11 @@ _aesni_set_encrypt_key:
 	lea	16(%rax),%rax
.Lkey_expansion_128_cold:
 	shufps	\$0b00010000,%xmm0,%xmm4
-	pxor	%xmm4, %xmm0
+	xorps	%xmm4, %xmm0
 	shufps	\$0b10001100,%xmm0,%xmm4
-	pxor	%xmm4, %xmm0
-	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
-	pxor	%xmm1,%xmm0
+	xorps	%xmm4, %xmm0
+	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
+	xorps	%xmm1,%xmm0
 	ret

.align	16
@@ -1163,11 +2644,11 @@ _aesni_set_encrypt_key:
 	movaps	%xmm2, %xmm5
.Lkey_expansion_192b_warm:
 	shufps	\$0b00010000,%xmm0,%xmm4
-	movaps	%xmm2,%xmm3
-	pxor	%xmm4,%xmm0
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
 	shufps	\$0b10001100,%xmm0,%xmm4
 	pslldq	\$4,%xmm3
-	pxor	%xmm4,%xmm0
+	xorps	%xmm4,%xmm0
 	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
 	pxor	%xmm3,%xmm2
 	pxor	%xmm1,%xmm0
@@ -1191,11 +2672,11 @@ _aesni_set_encrypt_key:
 	lea	16(%rax),%rax
.Lkey_expansion_256a_cold:
 	shufps	\$0b00010000,%xmm0,%xmm4
-	pxor	%xmm4,%xmm0
+	xorps	%xmm4,%xmm0
 	shufps	\$0b10001100,%xmm0,%xmm4
-	pxor	%xmm4,%xmm0
-	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
-	pxor	%xmm1,%xmm0
+	xorps	%xmm4,%xmm0
+	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
+	xorps	%xmm1,%xmm0
 	ret

.align	16
@@ -1204,13 +2685,14 @@ _aesni_set_encrypt_key:
 	lea	16(%rax),%rax

 	shufps	\$0b00010000,%xmm2,%xmm4
-	pxor	%xmm4,%xmm2
+	xorps	%xmm4,%xmm2
 	shufps	\$0b10001100,%xmm2,%xmm4
-	pxor	%xmm4,%xmm2
-	pshufd	\$0b10101010,%xmm1,%xmm1	# critical path
-	pxor	%xmm1,%xmm2
+	xorps	%xmm4,%xmm2
+	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
+	xorps	%xmm1,%xmm2
 	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}
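The key-schedule hunks above swap the SSE2 integer forms pxor/pshufd for their SSE equivalents xorps/shufps, presumably for the one-byte-shorter encodings (no 66h prefix); the expansion logic itself is unchanged. Every .Lkey_expansion_* flavour follows the same pattern: prefix-XOR the words of the previous round key into each other, then XOR in the word that aeskeygenassist rotated, substituted and combined with rcon. The 128-bit case in intrinsics, written with the textbook pslldq formulation rather than the shufps folding trick used above (a sketch; rcon must be an immediate, so real code unrolls one call per round):

	#include <wmmintrin.h>	/* AES-NI intrinsics */

	static __m128i expand_round_128(__m128i key, __m128i assist)
	{
	    assist = _mm_shuffle_epi32(assist, 0xff);         /* broadcast rcon word */
	    key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); /* prefix-XOR of the   */
	    key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); /* four 32-bit words   */
	    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	    return _mm_xor_si128(key, assist);
	}

	/* e.g. round key 1 from round key 0:
	   k1 = expand_round_128(k0, _mm_aeskeygenassist_si128(k0, 0x01)); */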
@@ -1219,9 +2701,12 @@ $code.=<<___;
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
-	.long	3,3,3,0
+	.long	6,6,6,0
.Lincrement64:
	.long	1,0,0,0
+.Lxts_magic:
+	.long	0x87,0,1,0
+
.asciz	"AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
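Two of the new constants deserve a note. The .Lincrement32 change from 3,3,3,0 to 6,6,6,0 simply tracks the CTR code, which now steps six counter values at a time instead of three. .Lxts_magic is the GF(2^128) reduction constant for the XTS tweak: advancing the tweak from one block to the next multiplies it by x, i.e. shifts the 128-bit value left by one bit, and a carry out of bit 127 folds back in as x^7+x^2+x+1 = 0x87. A portable byte-wise sketch of one tweak step (the assembly keeps the tweak in an %xmm register and vectorizes the same operation with .Lxts_magic):

	#include <stdint.h>

	/* multiply the tweak by x in GF(2^128), little-endian byte order */
	static void xts_next_tweak(uint8_t t[16])
	{
	    unsigned carry = 0;
	    for (int i = 0; i < 16; i++) {
	        unsigned msb = t[i] >> 7;   /* bit shifted out of this byte */
	        t[i] = (uint8_t)((t[i] << 1) | carry);
	        carry = msb;
	    }
	    if (carry)
	        t[0] ^= 0x87;               /* x^128 = x^7 + x^2 + x + 1 */
	}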
@@ -1253,12 +2738,8 @@ ecb_se_handler:
 	sub	\$64,%rsp

 	mov	152($context),%rax	# pull context->Rsp
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-	jmp	.Lcommon_seh_exit
+	jmp	.Lcommon_seh_tail
.size	ecb_se_handler,.-ecb_se_handler

.type	ccm64_se_handler,\@abi-omnipotent
@@ -1279,34 +2760,27 @@ ccm64_se_handler:
 	mov	248($context),%rbx	# pull context->Rip

 	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),$r11		# disp->HandlerData
+	mov	56($disp),%r11		# disp->HandlerData

 	mov	0(%r11),%r10d		# HandlerData[0]
 	lea	(%rsi,%r10),%r10	# prologue label
 	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lin_ccm64_prologue
+	jb	.Lcommon_seh_tail

 	mov	152($context),%rax	# pull context->Rsp

 	mov	4(%r11),%r10d		# HandlerData[1]
 	lea	(%rsi,%r10),%r10	# epilogue label
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lin_ccm64_prologue
+	jae	.Lcommon_seh_tail

-	lea	0(%rax),%rsi		# top of stack
+	lea	0(%rax),%rsi		# %xmm save area
 	lea	512($context),%rdi	# &context.Xmm6
 	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
 	.long	0xa548f3fc		# cld; rep movsq
 	lea	0x58(%rax),%rax		# adjust stack pointer

-.Lin_ccm64_prologue:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-	jmp	.Lcommon_seh_exit
+	jmp	.Lcommon_seh_tail
.size	ccm64_se_handler,.-ccm64_se_handler

.type	ctr32_se_handler,\@abi-omnipotent
@@ -1328,29 +2802,63 @@ ctr32_se_handler:
 	lea	.Lctr32_body(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip<"prologue" label
-	jb	.Lin_ctr32_prologue
+	jb	.Lcommon_seh_tail

 	mov	152($context),%rax	# pull context->Rsp

 	lea	.Lctr32_ret(%rip),%r10
 	cmp	%r10,%rbx
-	jae	.Lin_ctr32_prologue
+	jae	.Lcommon_seh_tail

-	lea	0(%rax),%rsi		# top of stack
+	lea	0x20(%rax),%rsi		# %xmm save area
 	lea	512($context),%rdi	# &context.Xmm6
-	mov	\$12,%ecx		# 6*sizeof(%xmm0)/sizeof(%rax)
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
 	.long	0xa548f3fc		# cld; rep movsq
-	lea	0x68(%rax),%rax		# adjust stack pointer
-
-.Lin_ctr32_prologue:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
+	lea	0xc8(%rax),%rax		# adjust stack pointer

-	jmp	.Lcommon_seh_exit
+	jmp	.Lcommon_seh_tail
.size	ctr32_se_handler,.-ctr32_se_handler
+
+.type	xts_se_handler,\@abi-omnipotent
+.align	16
+xts_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	0x60(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0x68+160(%rax),%rax	# adjust stack pointer
+
+	jmp	.Lcommon_seh_tail
+.size	xts_se_handler,.-xts_se_handler
___
$code.=<<___;
.type	cbc_se_handler,\@abi-omnipotent
@@ -1372,7 +2880,7 @@ cbc_se_handler:
 	lea	.Lcbc_decrypt(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip<"prologue" label
-	jb	.Lin_cbc_prologue
+	jb	.Lcommon_seh_tail

 	lea	.Lcbc_decrypt_body(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip<cbc decrypt body
 	jb	.Lrestore_cbc_rax

 	lea	.Lcbc_ret(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip>="epilogue" label
-	jae	.Lin_cbc_prologue
+	jae	.Lcommon_seh_tail

 	lea	0(%rax),%rsi		# top of stack
 	lea	512($context),%rdi	# &context.Xmm6
 	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
 	.long	0xa548f3fc		# cld; rep movsq
 	lea	0x58(%rax),%rax		# adjust stack pointer
-	jmp	.Lin_cbc_prologue
+	jmp	.Lcommon_seh_tail

.Lrestore_cbc_rax:
 	mov	120($context),%rax
-.Lin_cbc_prologue:
+
+.Lcommon_seh_tail:
 	mov	8(%rax),%rdi
 	mov	16(%rax),%rsi
 	mov	%rax,152($context)	# restore context->Rsp
 	mov	%rsi,168($context)	# restore context->Rsi
 	mov	%rdi,176($context)	# restore context->Rdi

-.Lcommon_seh_exit:
-
 	mov	40($disp),%rdi		# disp->ContextRecord
 	mov	$context,%rsi		# context
 	mov	\$154,%ecx		# sizeof(CONTEXT)
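With every handler now falling through to .Lcommon_seh_tail, the per-function differences reduce to three numbers: where the %xmm save area sits relative to the stack pointer, how many quadwords to copy back, and how large a frame to pop. The shape of that contract, rendered as C against the documented Win64 CONTEXT/DISPATCHER_CONTEXT layout (a sketch only, not the shipped tail, which also restores Rsi/Rdi and copies the whole CONTEXT out through disp->ContextRecord; the 0x60 and 0x68+160 offsets are the ones xts_se_handler uses):

	#include <windows.h>
	#include <string.h>

	EXCEPTION_DISPOSITION xts_handler_sketch(EXCEPTION_RECORD *rec,
	                                         ULONG64 frame, CONTEXT *ctx,
	                                         DISPATCHER_CONTEXT *disp)
	{
	    const DWORD *data = (const DWORD *)disp->HandlerData;
	    ULONG64 prologue = disp->ImageBase + data[0];   /* HandlerData[0] */
	    ULONG64 epilogue = disp->ImageBase + data[1];   /* HandlerData[1] */

	    /* only between the prologue and epilogue labels is the frame live */
	    if (ctx->Rip >= prologue && ctx->Rip < epilogue) {
	        memcpy(&ctx->Xmm6, (char *)ctx->Rsp + 0x60, 10 * 16);
	        ctx->Rsp += 0x68 + 160;                     /* pop the %xmm frame */
	    }
	    return ExceptionContinueSearch;
	}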
@@ -1443,17 +2950,23 @@ $code.=<<___ if ($PREFIX eq "aesni");
 	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
 	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
-	.rva	.LSEH_info_ccm64
-	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
+	.rva	.LSEH_info_ccm64_enc

 	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
 	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
-	.rva	.LSEH_info_ccm64
-	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
+	.rva	.LSEH_info_ccm64_dec

 	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
 	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
 	.rva	.LSEH_info_ctr32
+
+	.rva	.LSEH_begin_aesni_xts_encrypt
+	.rva	.LSEH_end_aesni_xts_encrypt
+	.rva	.LSEH_info_xts_enc
+
+	.rva	.LSEH_begin_aesni_xts_decrypt
+	.rva	.LSEH_end_aesni_xts_decrypt
+	.rva	.LSEH_info_xts_dec
___
$code.=<<___;
 	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
@@ -1474,12 +2987,25 @@ $code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
 	.byte	9,0,0,0
 	.rva	ecb_se_handler
-.LSEH_info_ccm64:
+.LSEH_info_ccm64_enc:
+	.byte	9,0,0,0
+	.rva	ccm64_se_handler
+	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
+.LSEH_info_ccm64_dec:
 	.byte	9,0,0,0
 	.rva	ccm64_se_handler
+	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
.LSEH_info_ctr32:
 	.byte	9,0,0,0
 	.rva	ctr32_se_handler
+.LSEH_info_xts_enc:
+	.byte	9,0,0,0
+	.rva	xts_se_handler
+	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
+.LSEH_info_xts_dec:
+	.byte	9,0,0,0
+	.rva	xts_se_handler
+	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.LSEH_info_cbc: