-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
-# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
#
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
-# instructions' interleave factor. Westmere can execute at most 3
+# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 come from. "Optimal
# interleave factor" means that increase of interleave factor does
#
# Further data for other parallelizable modes:
#
-# CBC decrypt 1.16 0.93 0.93
-# CTR 1.14 0.91 n/a
+# CBC decrypt 1.16 0.93 0.74
+# CTR 1.14 0.91 0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# April 2011
#
-# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
-# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
+# chosen to be 6x.
+
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+# CBC en-/decrypt CTR XTS ECB OCB
+# Westmere 3.77/1.25 1.25 1.25 1.26
+# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
+# Haswell 4.44/0.63 0.63 0.73 0.63 0.70
+# Skylake 2.62/0.63 0.63 0.63 0.63
+# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
+# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
+# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
+# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49
+#
+# (*) Atom Silvermont ECB result is suboptimal because of penalties
+# incurred by operations on %xmm8-15. As ECB is not considered
+# critical, nothing was done to mitigate the problem.
+
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-x86_64.pl:-)
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+# Pipe everything printed to STDOUT through the perlasm translator.
+# Interpreter, translator and output paths are quoted so that paths with
+# spaces survive the shell. Die if the pipe cannot be set up; otherwise
+# the script would run to completion without emitting anything.
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code=".text\n";
+$code.=".extern OPENSSL_ia32cap_P\n";
$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
movups $inout0,($out) # output
+ pxor $inout0,$inout0
ret
.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
movups $inout0,($out) # output
+ pxor $inout0,$inout0
ret
.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine.
-# aes[enc|dec] latency in next processor generation is 8, but the
-# instructions can be scheduled every cycle. Optimal interleave for
-# new processor is therefore 8x...
+# This is why it originally made no sense to implement a 2x subroutine.
+# But times change and it became appropriate to spend an extra 192 bytes
+# on a 2x subroutine on account of Atom Silvermont. For processors that
+# can schedule aes[enc|dec] every cycle, the optimal interleave factor
+# equals the corresponding instruction's latency. 8x is optimal for
+# * Bridge and "super-optimal" for other Intel CPUs...
+
+sub aesni_generate2 {
+my $dir=shift;
+# Emits _aesni_[en|de]crypt2. It takes in $key and $rounds, which are *not*
+# preserved; $inout[0-1] carry the two cipher/clear text blocks in and out.
+$code.=<<___;
+.type _aesni_${dir}rypt2,\@abi-omnipotent
+.align 16
+_aesni_${dir}rypt2:
+ $movkey ($key),$rndkey0 # round key 0
+ shl \$4,$rounds # scale rounds to a byte count, 16 bytes per round key
+ $movkey 16($key),$rndkey1 # round key 1
+ xorps $rndkey0,$inout0 # xor round key 0 into both blocks
+ xorps $rndkey0,$inout1
+ $movkey 32($key),$rndkey0 # round key 2
+ lea 32($key,$rounds),$key # advance key pointer by 32 plus the scaled rounds
+ neg %rax # $rounds
+ add \$16,%rax # start offset, stepped up to zero by the loop below
+
+.L${dir}_loop2:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1 # next round key
+ add \$32,%rax # two round keys are consumed per iteration
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0 # round key after that
+ jnz .L${dir}_loop2 # repeat until the offset stepped above reaches zero
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir}last $rndkey0,$inout0 # final round
+ aes${dir}last $rndkey0,$inout1
+ ret
+.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+___
+}
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
.align 16
_aesni_${dir}rypt3:
$movkey ($key),$rndkey0
- shr \$1,$rounds
+ shl \$4,$rounds
$movkey 16($key),$rndkey1
- lea 32($key),$key
xorps $rndkey0,$inout0
xorps $rndkey0,$inout1
xorps $rndkey0,$inout2
- $movkey ($key),$rndkey0
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
.L${dir}_loop3:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
- dec $rounds
aes${dir} $rndkey1,$inout2
- $movkey 16($key),$rndkey1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aes${dir} $rndkey0,$inout0
aes${dir} $rndkey0,$inout1
- lea 32($key),$key
aes${dir} $rndkey0,$inout2
- $movkey ($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .L${dir}_loop3
aes${dir} $rndkey1,$inout0
.align 16
_aesni_${dir}rypt4:
$movkey ($key),$rndkey0
- shr \$1,$rounds
+ shl \$4,$rounds
$movkey 16($key),$rndkey1
- lea 32($key),$key
xorps $rndkey0,$inout0
xorps $rndkey0,$inout1
xorps $rndkey0,$inout2
xorps $rndkey0,$inout3
- $movkey ($key),$rndkey0
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ .byte 0x0f,0x1f,0x00
+ add \$16,%rax
.L${dir}_loop4:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
- dec $rounds
aes${dir} $rndkey1,$inout2
aes${dir} $rndkey1,$inout3
- $movkey 16($key),$rndkey1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aes${dir} $rndkey0,$inout0
aes${dir} $rndkey0,$inout1
- lea 32($key),$key
aes${dir} $rndkey0,$inout2
aes${dir} $rndkey0,$inout3
- $movkey ($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .L${dir}_loop4
aes${dir} $rndkey1,$inout0
.align 16
_aesni_${dir}rypt6:
$movkey ($key),$rndkey0
- shr \$1,$rounds
+ shl \$4,$rounds
$movkey 16($key),$rndkey1
- lea 32($key),$key
xorps $rndkey0,$inout0
pxor $rndkey0,$inout1
- aes${dir} $rndkey1,$inout0
pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
aes${dir} $rndkey1,$inout1
pxor $rndkey0,$inout3
- aes${dir} $rndkey1,$inout2
pxor $rndkey0,$inout4
- aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout2
pxor $rndkey0,$inout5
- dec $rounds
- aes${dir} $rndkey1,$inout4
- $movkey ($key),$rndkey0
- aes${dir} $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
jmp .L${dir}_loop6_enter
.align 16
.L${dir}_loop6:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
- dec $rounds
aes${dir} $rndkey1,$inout2
+.L${dir}_loop6_enter:
aes${dir} $rndkey1,$inout3
aes${dir} $rndkey1,$inout4
aes${dir} $rndkey1,$inout5
-.L${dir}_loop6_enter: # happens to be 16-byte aligned
- $movkey 16($key),$rndkey1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aes${dir} $rndkey0,$inout0
aes${dir} $rndkey0,$inout1
- lea 32($key),$key
aes${dir} $rndkey0,$inout2
aes${dir} $rndkey0,$inout3
aes${dir} $rndkey0,$inout4
aes${dir} $rndkey0,$inout5
- $movkey ($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .L${dir}_loop6
aes${dir} $rndkey1,$inout0
.align 16
_aesni_${dir}rypt8:
$movkey ($key),$rndkey0
- shr \$1,$rounds
+ shl \$4,$rounds
$movkey 16($key),$rndkey1
- lea 32($key),$key
xorps $rndkey0,$inout0
xorps $rndkey0,$inout1
- aes${dir} $rndkey1,$inout0
pxor $rndkey0,$inout2
- aes${dir} $rndkey1,$inout1
pxor $rndkey0,$inout3
- aes${dir} $rndkey1,$inout2
pxor $rndkey0,$inout4
- aes${dir} $rndkey1,$inout3
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ aes${dir} $rndkey1,$inout0
pxor $rndkey0,$inout5
- dec $rounds
- aes${dir} $rndkey1,$inout4
pxor $rndkey0,$inout6
- aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout1
pxor $rndkey0,$inout7
- $movkey ($key),$rndkey0
- aes${dir} $rndkey1,$inout6
- aes${dir} $rndkey1,$inout7
- $movkey 16($key),$rndkey1
- jmp .L${dir}_loop8_enter
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop8_inner
.align 16
.L${dir}_loop8:
aes${dir} $rndkey1,$inout0
aes${dir} $rndkey1,$inout1
- dec $rounds
+.L${dir}_loop8_inner:
aes${dir} $rndkey1,$inout2
aes${dir} $rndkey1,$inout3
aes${dir} $rndkey1,$inout4
aes${dir} $rndkey1,$inout5
aes${dir} $rndkey1,$inout6
aes${dir} $rndkey1,$inout7
- $movkey 16($key),$rndkey1
-.L${dir}_loop8_enter: # happens to be 16-byte aligned
+.L${dir}_loop8_enter:
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aes${dir} $rndkey0,$inout0
aes${dir} $rndkey0,$inout1
- lea 32($key),$key
aes${dir} $rndkey0,$inout2
aes${dir} $rndkey0,$inout3
aes${dir} $rndkey0,$inout4
aes${dir} $rndkey0,$inout5
aes${dir} $rndkey0,$inout6
aes${dir} $rndkey0,$inout7
- $movkey ($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .L${dir}_loop8
aes${dir} $rndkey1,$inout0
.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
.type aesni_ecb_encrypt,\@function,5
.align 16
aesni_ecb_encrypt:
- and \$-16,$len
- jz .Lecb_ret
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # offload $inout4..7
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+ and \$-16,$len # if ($len<16)
+ jz .Lecb_ret # return
mov 240($key),$rounds # key->rounds
$movkey ($key),$rndkey0
test %r8d,%r8d # 5th argument
jz .Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
- cmp \$0x80,$len
- jb .Lecb_enc_tail
+ cmp \$0x80,$len # if ($len<8*16)
+ jb .Lecb_enc_tail # short input
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
movdqu 0x10($inp),$inout1
movdqu 0x20($inp),$inout2
movdqu 0x30($inp),$inout3
movdqu 0x50($inp),$inout5
movdqu 0x60($inp),$inout6
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
- sub \$0x80,$len
+ lea 0x80($inp),$inp # $inp+=8*16
+ sub \$0x80,$len # $len-=8*16 (can be zero)
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
mov $key_,$key # restore $key
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
mov $rnds_,$rounds # restore $rounds
movups $inout1,0x10($out)
movdqu 0x10($inp),$inout1
movups $inout6,0x60($out)
movdqu 0x60($inp),$inout6
movups $inout7,0x70($out)
- lea 0x80($out),$out
+ lea 0x80($out),$out # $out+=8*16
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
+ lea 0x80($inp),$inp # $inp+=8*16
.Lecb_enc_loop8_enter:
call _aesni_encrypt8
sub \$0x80,$len
- jnc .Lecb_enc_loop8
+ jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
mov $key_,$key # restore $key
movups $inout1,0x10($out)
mov $rnds_,$rounds # restore $rounds
movups $inout5,0x50($out)
movups $inout6,0x60($out)
movups $inout7,0x70($out)
- lea 0x80($out),$out
- add \$0x80,$len
- jz .Lecb_ret
+ lea 0x80($out),$out # $out+=8*16
+ add \$0x80,$len # restore real remaining $len
+ jz .Lecb_ret # done if ($len==0)
-.Lecb_enc_tail:
+.Lecb_enc_tail: # $len is less than 8*16
movups ($inp),$inout0
cmp \$0x20,$len
jb .Lecb_enc_one
movups 0x50($inp),$inout5
je .Lecb_enc_six
movdqu 0x60($inp),$inout6
+ xorps $inout7,$inout7
call _aesni_encrypt8
- movups $inout0,($out)
+ movups $inout0,($out) # store 7 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
- movups $inout0,($out)
+ movups $inout0,($out) # store one output block
jmp .Lecb_ret
.align 16
.Lecb_enc_two:
- xorps $inout2,$inout2
- call _aesni_encrypt3
- movups $inout0,($out)
+ call _aesni_encrypt2
+ movups $inout0,($out) # store 2 output blocks
movups $inout1,0x10($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_three:
call _aesni_encrypt3
- movups $inout0,($out)
+ movups $inout0,($out) # store 3 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_four:
call _aesni_encrypt4
- movups $inout0,($out)
+ movups $inout0,($out) # store 4 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
.Lecb_enc_five:
xorps $inout5,$inout5
call _aesni_encrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 5 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
.align 16
.Lecb_enc_six:
call _aesni_encrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 6 output blocks
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
\f#--------------------------- ECB DECRYPT ------------------------------#
.align 16
.Lecb_decrypt:
- cmp \$0x80,$len
- jb .Lecb_dec_tail
+ cmp \$0x80,$len # if ($len<8*16)
+ jb .Lecb_dec_tail # short input
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
movdqu 0x10($inp),$inout1
movdqu 0x20($inp),$inout2
movdqu 0x30($inp),$inout3
movdqu 0x50($inp),$inout5
movdqu 0x60($inp),$inout6
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
- sub \$0x80,$len
+ lea 0x80($inp),$inp # $inp+=8*16
+ sub \$0x80,$len # $len-=8*16 (can be zero)
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
mov $key_,$key # restore $key
- movdqu ($inp),$inout0
+ movdqu ($inp),$inout0 # load 8 input blocks
mov $rnds_,$rounds # restore $rounds
movups $inout1,0x10($out)
movdqu 0x10($inp),$inout1
movups $inout6,0x60($out)
movdqu 0x60($inp),$inout6
movups $inout7,0x70($out)
- lea 0x80($out),$out
+ lea 0x80($out),$out # $out+=8*16
movdqu 0x70($inp),$inout7
- lea 0x80($inp),$inp
+ lea 0x80($inp),$inp # $inp+=8*16
.Lecb_dec_loop8_enter:
call _aesni_decrypt8
$movkey ($key_),$rndkey0
sub \$0x80,$len
- jnc .Lecb_dec_loop8
+ jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
- movups $inout0,($out)
+ movups $inout0,($out) # store 8 output blocks
+ pxor $inout0,$inout0 # clear register bank
mov $key_,$key # restore $key
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
mov $rnds_,$rounds # restore $rounds
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
movups $inout5,0x50($out)
+ pxor $inout5,$inout5
movups $inout6,0x60($out)
+ pxor $inout6,$inout6
movups $inout7,0x70($out)
- lea 0x80($out),$out
- add \$0x80,$len
- jz .Lecb_ret
+ pxor $inout7,$inout7
+ lea 0x80($out),$out # $out+=8*16
+ add \$0x80,$len # restore real remaining $len
+ jz .Lecb_ret # done if ($len==0)
.Lecb_dec_tail:
movups ($inp),$inout0
je .Lecb_dec_six
movups 0x60($inp),$inout6
$movkey ($key),$rndkey0
+ xorps $inout7,$inout7
call _aesni_decrypt8
- movups $inout0,($out)
+ movups $inout0,($out) # store 7 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
movups $inout5,0x50($out)
+ pxor $inout5,$inout5
movups $inout6,0x60($out)
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
- movups $inout0,($out)
+ movups $inout0,($out) # store one output block
+ pxor $inout0,$inout0 # clear register bank
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
- xorps $inout2,$inout2
- call _aesni_decrypt3
- movups $inout0,($out)
+ call _aesni_decrypt2
+ movups $inout0,($out) # store 2 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
- movups $inout0,($out)
+ movups $inout0,($out) # store 3 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
- movups $inout0,($out)
+ movups $inout0,($out) # store 4 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps $inout5,$inout5
call _aesni_decrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 5 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
+ pxor $inout5,$inout5
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
- movups $inout0,($out)
+ movups $inout0,($out) # store 6 output blocks
+ pxor $inout0,$inout0 # clear register bank
movups $inout1,0x10($out)
+ pxor $inout1,$inout1
movups $inout2,0x20($out)
+ pxor $inout2,$inout2
movups $inout3,0x30($out)
+ pxor $inout3,$inout3
movups $inout4,0x40($out)
+ pxor $inout4,$inout4
movups $inout5,0x50($out)
+ pxor $inout5,$inout5
.Lecb_ret:
+ xorps $rndkey0,$rndkey0 # %xmm0
+ pxor $rndkey1,$rndkey1
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+.Lecb_enc_ret:
+___
+$code.=<<___;
ret
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
___
{
my $cmac="%r9"; # 6th argument
-my $increment="%xmm6";
+my $increment="%xmm9";
+my $iv="%xmm6";
my $bswap_mask="%xmm7";
$code.=<<___;
___
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in0
+ movaps %xmm9,0x30(%rsp) # $increment
.Lccm64_enc_body:
___
$code.=<<___;
movdqa .Lincrement64(%rip),$increment
movdqa .Lbswap_mask(%rip),$bswap_mask
- shr \$1,$rounds
+ shl \$4,$rounds
+ mov \$16,$rnds_
lea 0($key),$key_
movdqu ($cmac),$inout1
movdqa $iv,$inout0
- mov $rounds,$rnds_
+ lea 32($key,$rounds),$key # end of key schedule
+ pshufb $bswap_mask,$iv
+ sub %rax,%r10 # twisted $rounds
jmp .Lccm64_enc_outer
.align 16
.Lccm64_enc_outer:
$movkey ($key_),$rndkey0
- mov $rnds_,$rounds
+ mov %r10,%rax
movups ($inp),$in0 # load inp
xorps $rndkey0,$inout0 # counter
$movkey 16($key_),$rndkey1
xorps $in0,$rndkey0
- lea 32($key_),$key
xorps $rndkey0,$inout1 # cmac^=inp
- $movkey ($key),$rndkey0
+ $movkey 32($key_),$rndkey0
.Lccm64_enc2_loop:
aesenc $rndkey1,$inout0
- dec $rounds
aesenc $rndkey1,$inout1
- $movkey 16($key),$rndkey1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aesenc $rndkey0,$inout0
- lea 32($key),$key
aesenc $rndkey0,$inout1
- $movkey 0($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .Lccm64_enc2_loop
- pshufb $bswap_mask,$iv
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
paddq $increment,$iv
+ dec $len # $len-- ($len is in blocks)
aesenclast $rndkey0,$inout0
aesenclast $rndkey0,$inout1
- dec $len
lea 16($inp),$inp
xorps $inout0,$in0 # inp ^= E(iv)
movdqa $iv,$inout0
movups $in0,($out) # save output
- lea 16($out),$out
- pshufb $bswap_mask,$iv
- jnz .Lccm64_enc_outer
-
- movups $inout1,($cmac)
+ pshufb $bswap_mask,$inout0
+ lea 16($out),$out # $out+=16
+ jnz .Lccm64_enc_outer # loop if ($len!=0)
+
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
lea 0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
___
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in0
+ movaps %xmm9,0x30(%rsp) # $increment
.Lccm64_dec_body:
___
$code.=<<___;
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
+ shl \$4,$rnds_
+ mov \$16,$rounds
movups ($inp),$in0 # load inp
paddq $increment,$iv
- pshufb $bswap_mask,$iv
- lea 16($inp),$inp
+ lea 16($inp),$inp # $inp+=16
+ sub %r10,%rax # twisted $rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ mov %rax,%r10
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_outer:
xorps $inout0,$in0 # inp ^= E(iv)
movdqa $iv,$inout0
- mov $rnds_,$rounds
movups $in0,($out) # save output
- lea 16($out),$out
+ lea 16($out),$out # $out+=16
+ pshufb $bswap_mask,$inout0
- sub \$1,$len
- jz .Lccm64_dec_break
+ sub \$1,$len # $len-- ($len is in blocks)
+ jz .Lccm64_dec_break # if ($len==0) break
$movkey ($key_),$rndkey0
- shr \$1,$rounds
+ mov %r10,%rax
$movkey 16($key_),$rndkey1
xorps $rndkey0,$in0
- lea 32($key_),$key
xorps $rndkey0,$inout0
xorps $in0,$inout1 # cmac^=out
- $movkey ($key),$rndkey0
-
+ $movkey 32($key_),$rndkey0
+ jmp .Lccm64_dec2_loop
+.align 16
.Lccm64_dec2_loop:
aesenc $rndkey1,$inout0
- dec $rounds
aesenc $rndkey1,$inout1
- $movkey 16($key),$rndkey1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
aesenc $rndkey0,$inout0
- lea 32($key),$key
aesenc $rndkey0,$inout1
- $movkey 0($key),$rndkey0
+ $movkey -16($key,%rax),$rndkey0
jnz .Lccm64_dec2_loop
- movups ($inp),$in0 # load inp
+ movups ($inp),$in0 # load input
paddq $increment,$iv
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
- pshufb $bswap_mask,$iv
- lea 16($inp),$inp
aesenclast $rndkey0,$inout0
aesenclast $rndkey0,$inout1
+ lea 16($inp),$inp # $inp+=16
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_break:
#xorps $in0,$inout1 # cmac^=out
+ mov 240($key_),$rounds
___
&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
- movups $inout1,($cmac)
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
lea 0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
-# does not update *ivec! (see engine/eng_aesni.c for details)
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
+# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+# Keywords are full unroll and modulo-schedule counter calculations
+# with zero-round key xor.
{
-my $reserved = $win64?0:-0x28;
-my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
-my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
-my $bswap_mask="%xmm15";
+my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
+my ($key0,$ctr)=("%ebp","${ivp}d");
+my $frame_size = 0x80 + ($win64?160:0);
$code.=<<___;
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,\@function,5
.align 16
aesni_ctr32_encrypt_blocks:
+.cfi_startproc
+ cmp \$1,$len
+ jne .Lctr32_bulk
+
+ # handle single block without allocating stack frame,
+ # useful when handling edges
+ movups ($ivp),$inout0
+ movups ($inp),$inout1
+ mov 240($key),%edx # key->rounds
+___
+ &aesni_generate1("enc",$key,"%edx");
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ xorps $inout1,$inout0
+ pxor $inout1,$inout1
+ movups $inout0,($out)
+ xorps $inout0,$inout0
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
+ lea (%rsp),$key_ # use $key_ as frame pointer
+.cfi_def_cfa_register $key_
+ push %rbp
+.cfi_push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- lea -0xc8(%rsp),%rsp
- movaps %xmm6,0x20(%rsp)
- movaps %xmm7,0x30(%rsp)
- movaps %xmm8,0x40(%rsp)
- movaps %xmm9,0x50(%rsp)
- movaps %xmm10,0x60(%rsp)
- movaps %xmm11,0x70(%rsp)
- movaps %xmm12,0x80(%rsp)
- movaps %xmm13,0x90(%rsp)
- movaps %xmm14,0xa0(%rsp)
- movaps %xmm15,0xb0(%rsp)
+ movaps %xmm6,-0xa8($key_) # offload everything
+ movaps %xmm7,-0x98($key_)
+ movaps %xmm8,-0x88($key_)
+ movaps %xmm9,-0x78($key_)
+ movaps %xmm10,-0x68($key_)
+ movaps %xmm11,-0x58($key_)
+ movaps %xmm12,-0x48($key_)
+ movaps %xmm13,-0x38($key_)
+ movaps %xmm14,-0x28($key_)
+ movaps %xmm15,-0x18($key_)
.Lctr32_body:
___
$code.=<<___;
- cmp \$1,$len
- je .Lctr32_one_shortcut
-
- movdqu ($ivp),$ivec
- movdqa .Lbswap_mask(%rip),$bswap_mask
- xor $rounds,$rounds
- pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
- pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
+ # 8 16-byte words on top of stack are counter values
+ # xor-ed with zero-round key
+
+ movdqu ($ivp),$inout0
+ movdqu ($key),$rndkey0
+ mov 12($ivp),$ctr # counter LSB
+ pxor $rndkey0,$inout0
+ mov 12($key),$key0 # 0-round key LSB
+ movdqa $inout0,0x00(%rsp) # populate counter block
+ bswap $ctr
+ movdqa $inout0,$inout1
+ movdqa $inout0,$inout2
+ movdqa $inout0,$inout3
+ movdqa $inout0,0x40(%rsp)
+ movdqa $inout0,0x50(%rsp)
+ movdqa $inout0,0x60(%rsp)
+ mov %rdx,%r10 # about to borrow %rdx
+ movdqa $inout0,0x70(%rsp)
+
+ lea 1($ctr),%rax
+ lea 2($ctr),%rdx
+ bswap %eax
+ bswap %edx
+ xor $key0,%eax
+ xor $key0,%edx
+ pinsrd \$3,%eax,$inout1
+ lea 3($ctr),%rax
+ movdqa $inout1,0x10(%rsp)
+ pinsrd \$3,%edx,$inout2
+ bswap %eax
+ mov %r10,%rdx # restore %rdx
+ lea 4($ctr),%r10
+ movdqa $inout2,0x20(%rsp)
+ xor $key0,%eax
+ bswap %r10d
+ pinsrd \$3,%eax,$inout3
+ xor $key0,%r10d
+ movdqa $inout3,0x30(%rsp)
+ lea 5($ctr),%r9
+ mov %r10d,0x40+12(%rsp)
+ bswap %r9d
+ lea 6($ctr),%r10
mov 240($key),$rounds # key->rounds
- bswap $rnds_
- pxor $iv0,$iv0 # vector of 3 32-bit counters
- pxor $iv1,$iv1 # vector of 3 32-bit counters
- pinsrd \$0,$rnds_,$iv0
- lea 3($rnds_),$key_
- pinsrd \$0,$key_,$iv1
- inc $rnds_
- pinsrd \$1,$rnds_,$iv0
- inc $key_
- pinsrd \$1,$key_,$iv1
- inc $rnds_
- pinsrd \$2,$rnds_,$iv0
- inc $key_
- pinsrd \$2,$key_,$iv1
- movdqa $iv0,$reserved(%rsp)
- pshufb $bswap_mask,$iv0
- movdqa $iv1,`$reserved+0x10`(%rsp)
- pshufb $bswap_mask,$iv1
-
- pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
- pshufd \$`2<<6`,$iv0,$inout1
- pshufd \$`1<<6`,$iv0,$inout2
- cmp \$6,$len
- jb .Lctr32_tail
- shr \$1,$rounds
- mov $key,$key_ # backup $key
- mov $rounds,$rnds_ # backup $rounds
- sub \$6,$len
+ xor $key0,%r9d
+ bswap %r10d
+ mov %r9d,0x50+12(%rsp)
+ xor $key0,%r10d
+ lea 7($ctr),%r9
+ mov %r10d,0x60+12(%rsp)
+ bswap %r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
+ xor $key0,%r9d
+ and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
+ mov %r9d,0x70+12(%rsp)
+
+ $movkey 0x10($key),$rndkey1
+
+ movdqa 0x40(%rsp),$inout4
+ movdqa 0x50(%rsp),$inout5
+
+ cmp \$8,$len # $len is in blocks
+ jb .Lctr32_tail # short input if ($len<8)
+
+ sub \$6,$len # $len is biased by -6
+ cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
+ je .Lctr32_6x # [which denotes Atom Silvermont]
+
+ lea 0x80($key),$key # size optimization
+ sub \$2,$len # $len is biased by -8
+ jmp .Lctr32_loop8
+
+.align 16
+.Lctr32_6x:
+ shl \$4,$rounds
+ mov \$48,$rnds_
+ bswap $key0
+ lea 32($key,$rounds),$key # end of key schedule
+ sub %rax,%r10 # twisted $rounds
jmp .Lctr32_loop6
.align 16
.Lctr32_loop6:
- pshufd \$`3<<6`,$iv1,$inout3
- por $ivec,$inout0 # merge counter-less ivec
- $movkey ($key_),$rndkey0
- pshufd \$`2<<6`,$iv1,$inout4
- por $ivec,$inout1
- $movkey 16($key_),$rndkey1
- pshufd \$`1<<6`,$iv1,$inout5
- por $ivec,$inout2
- por $ivec,$inout3
- xorps $rndkey0,$inout0
- por $ivec,$inout4
- por $ivec,$inout5
-
- # inline _aesni_encrypt6 and interleave last rounds
- # with own code...
+ add \$6,$ctr # next counter value
+ $movkey -48($key,$rnds_),$rndkey0
+ aesenc $rndkey1,$inout0
+ mov $ctr,%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout1
+ movbe %eax,`0x00+12`(%rsp) # store next counter value
+ lea 1($ctr),%eax
+ aesenc $rndkey1,$inout2
+ xor $key0,%eax
+ movbe %eax,`0x10+12`(%rsp)
+ aesenc $rndkey1,$inout3
+ lea 2($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout4
+ movbe %eax,`0x20+12`(%rsp)
+ lea 3($ctr),%eax
+ aesenc $rndkey1,$inout5
+ $movkey -32($key,$rnds_),$rndkey1
+ xor $key0,%eax
- pxor $rndkey0,$inout1
+ aesenc $rndkey0,$inout0
+ movbe %eax,`0x30+12`(%rsp)
+ lea 4($ctr),%eax
+ aesenc $rndkey0,$inout1
+ xor $key0,%eax
+ movbe %eax,`0x40+12`(%rsp)
+ aesenc $rndkey0,$inout2
+ lea 5($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey0,$inout3
+ movbe %eax,`0x50+12`(%rsp)
+ mov %r10,%rax # mov $rnds_,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,$rnds_),$rndkey0
+
+ call .Lenc_loop6
+
+ movdqu ($inp),$inout6 # load 6 input blocks
+ movdqu 0x10($inp),$inout7
+ movdqu 0x20($inp),$in0
+ movdqu 0x30($inp),$in1
+ movdqu 0x40($inp),$in2
+ movdqu 0x50($inp),$in3
+ lea 0x60($inp),$inp # $inp+=6*16
+ $movkey -64($key,$rnds_),$rndkey1
+ pxor $inout0,$inout6 # inp^=E(ctr)
+ movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
+ pxor $inout1,$inout7
+ movaps 0x10(%rsp),$inout1
+ pxor $inout2,$in0
+ movaps 0x20(%rsp),$inout2
+ pxor $inout3,$in1
+ movaps 0x30(%rsp),$inout3
+ pxor $inout4,$in2
+ movaps 0x40(%rsp),$inout4
+ pxor $inout5,$in3
+ movaps 0x50(%rsp),$inout5
+ movdqu $inout6,($out) # store 6 output blocks
+ movdqu $inout7,0x10($out)
+ movdqu $in0,0x20($out)
+ movdqu $in1,0x30($out)
+ movdqu $in2,0x40($out)
+ movdqu $in3,0x50($out)
+ lea 0x60($out),$out # $out+=6*16
+
+ sub \$6,$len
+ jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
+
+ add \$6,$len # restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+
+ lea -48($rnds_),$rounds
+ lea -80($key,$rnds_),$key # restore $key
+ neg $rounds
+ shr \$4,$rounds # restore $rounds
+ jmp .Lctr32_tail
+
+.align 32
+.Lctr32_loop8:
+ add \$8,$ctr # next counter value
+ movdqa 0x60(%rsp),$inout6
aesenc $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
+ mov $ctr,%r9d
+ movdqa 0x70(%rsp),$inout7
aesenc $rndkey1,$inout1
- movdqa .Lincrement32(%rip),$iv1
- pxor $rndkey0,$inout3
+ bswap %r9d
+ $movkey 0x20-0x80($key),$rndkey0
aesenc $rndkey1,$inout2
- movdqa $reserved(%rsp),$iv0
- pxor $rndkey0,$inout4
+ xor $key0,%r9d
+ nop
aesenc $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
+ mov %r9d,0x00+12(%rsp) # store next counter value
+ lea 1($ctr),%r9
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
- jmp .Lctr32_enc_loop6_enter
-.align 16
-.Lctr32_enc_loop6:
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0x30-0x80($key),$rndkey1
+___
+for($i=2;$i<8;$i++) {
+my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
+$code.=<<___;
+ bswap %r9d
+ aesenc $rndkeyx,$inout0
+ aesenc $rndkeyx,$inout1
+ xor $key0,%r9d
+ .byte 0x66,0x90
+ aesenc $rndkeyx,$inout2
+ aesenc $rndkeyx,$inout3
+ mov %r9d,`0x10*($i-1)`+12(%rsp)
+ lea $i($ctr),%r9
+ aesenc $rndkeyx,$inout4
+ aesenc $rndkeyx,$inout5
+ aesenc $rndkeyx,$inout6
+ aesenc $rndkeyx,$inout7
+ $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
+___
+}
+$code.=<<___;
+ bswap %r9d
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ xor $key0,%r9d
+ movdqu 0x00($inp),$in0 # start loading input
+ aesenc $rndkey0,$inout3
+ mov %r9d,0x70+12(%rsp)
+ cmp \$11,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xa0-0x80($key),$rndkey0
+
+ jb .Lctr32_enc_done
+
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
- dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
-.Lctr32_enc_loop6_enter:
- $movkey 16($key),$rndkey1
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xb0-0x80($key),$rndkey1
+
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
- lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
- $movkey ($key),$rndkey0
- jnz .Lctr32_enc_loop6
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xc0-0x80($key),$rndkey0
+ je .Lctr32_enc_done
aesenc $rndkey1,$inout0
- paddd $iv1,$iv0 # increment counter vector
aesenc $rndkey1,$inout1
- paddd `$reserved+0x10`(%rsp),$iv1
aesenc $rndkey1,$inout2
- movdqa $iv0,$reserved(%rsp) # save counter vector
aesenc $rndkey1,$inout3
- movdqa $iv1,`$reserved+0x10`(%rsp)
aesenc $rndkey1,$inout4
- pshufb $bswap_mask,$iv0 # byte swap
aesenc $rndkey1,$inout5
- pshufb $bswap_mask,$iv1
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xd0-0x80($key),$rndkey1
- aesenclast $rndkey0,$inout0
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xe0-0x80($key),$rndkey0
+ jmp .Lctr32_enc_done
+
+.align 16
+.Lctr32_enc_done:
+ movdqu 0x10($inp),$in1
+ pxor $rndkey0,$in0 # input^=round[last]
+ movdqu 0x20($inp),$in2
+ pxor $rndkey0,$in1
+ movdqu 0x30($inp),$in3
+ pxor $rndkey0,$in2
+ movdqu 0x40($inp),$in4
+ pxor $rndkey0,$in3
+ movdqu 0x50($inp),$in5
+ pxor $rndkey0,$in4
+ pxor $rndkey0,$in5
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
+ lea 0x80($inp),$inp # $inp+=8*16
+
+ aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
+ pxor $rndkey0,$rndkey1 # borrowed $rndkey
+ movdqu 0x70-0x80($inp),$in0
+ aesenclast $in1,$inout1
+ pxor $rndkey0,$in0
+ movdqa 0x00(%rsp),$in1 # load next counter block
+ aesenclast $in2,$inout2
+ aesenclast $in3,$inout3
+ movdqa 0x10(%rsp),$in2
+ movdqa 0x20(%rsp),$in3
+ aesenclast $in4,$inout4
+ aesenclast $in5,$inout5
+ movdqa 0x30(%rsp),$in4
+ movdqa 0x40(%rsp),$in5
+ aesenclast $rndkey1,$inout6
+ movdqa 0x50(%rsp),$rndkey0
+ $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
+ aesenclast $in0,$inout7
+
+ movups $inout0,($out) # store 8 output blocks
+ movdqa $in1,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in2,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in3,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in4,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in5,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey0,$inout5
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+
+ sub \$8,$len
+ jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
+
+ add \$8,$len # restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+ lea -0x80($key),$key
+
+.Lctr32_tail:
+ # note that at this point $inout0..5 are populated with
+ # counter values xor-ed with 0-round key
+ lea 16($key),$key
+ cmp \$4,$len
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
+
+ # if ($len>4) compute 7 E(counter)
+ shl \$4,$rounds
+ movdqa 0x60(%rsp),$inout6
+ pxor $inout7,$inout7
+
+ $movkey 16($key),$rndkey0
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
+ neg %rax
+ aesenc $rndkey1,$inout2
+ add \$16,%rax # prepare for .Lenc_loop8_enter
+ movups ($inp),$in0
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ movups 0x10($inp),$in1 # pre-load input
+ movups 0x20($inp),$in2
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+
+ call .Lenc_loop8_enter
+
+ movdqu 0x30($inp),$in3
+ pxor $in0,$inout0
+ movdqu 0x40($inp),$in0
+ pxor $in1,$inout1
+ movdqu $inout0,($out) # store output
+ pxor $in2,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in3,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in0,$inout4
+ movdqu $inout3,0x30($out)
+ movdqu $inout4,0x40($out)
+ cmp \$6,$len
+ jb .Lctr32_done # $len was 5, stop store
+
+ movups 0x50($inp),$in1
+ xorps $in1,$inout5
+ movups $inout5,0x50($out)
+ je .Lctr32_done # $len was 6, stop store
+
+ movups 0x60($inp),$in2
+ xorps $in2,$inout6
+ movups $inout6,0x60($out)
+ jmp .Lctr32_done # $len was 7, stop store
+
+.align 32
+.Lctr32_loop4:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop4
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
movups ($inp),$in0 # load input
- aesenclast $rndkey0,$inout1
movups 0x10($inp),$in1
- aesenclast $rndkey0,$inout2
+ aesenclast $rndkey1,$inout2
+ aesenclast $rndkey1,$inout3
movups 0x20($inp),$in2
- aesenclast $rndkey0,$inout3
movups 0x30($inp),$in3
- aesenclast $rndkey0,$inout4
- movups 0x40($inp),$rndkey1
- aesenclast $rndkey0,$inout5
- movups 0x50($inp),$rndkey0
- lea 0x60($inp),$inp
-
- xorps $inout0,$in0 # xor
- pshufd \$`3<<6`,$iv0,$inout0
- xorps $inout1,$in1
- pshufd \$`2<<6`,$iv0,$inout1
- movups $in0,($out) # store output
- xorps $inout2,$in2
- pshufd \$`1<<6`,$iv0,$inout2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- xorps $inout4,$rndkey1
- movups $in3,0x30($out)
- xorps $inout5,$rndkey0
- movups $rndkey1,0x40($out)
- movups $rndkey0,0x50($out)
- lea 0x60($out),$out
- mov $rnds_,$rounds
- sub \$6,$len
- jnc .Lctr32_loop6
-
- add \$6,$len
- jz .Lctr32_done
- mov $key_,$key # restore $key
- lea 1($rounds,$rounds),$rounds # restore original value
-.Lctr32_tail:
- por $ivec,$inout0
- movups ($inp),$in0
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ pxor $in2,$inout2
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout3
+ movdqu $inout3,0x30($out)
+ jmp .Lctr32_done # $len was 4, stop store
+
+.align 32
+.Lctr32_loop3:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop3
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ aesenclast $rndkey1,$inout2
+
+ movups ($inp),$in0 # load input
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
cmp \$2,$len
- jb .Lctr32_one
+ jb .Lctr32_done # $len was 1, stop store
- por $ivec,$inout1
movups 0x10($inp),$in1
- je .Lctr32_two
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ je .Lctr32_done # $len was 2, stop store
- pshufd \$`3<<6`,$iv1,$inout3
- por $ivec,$inout2
movups 0x20($inp),$in2
- cmp \$4,$len
- jb .Lctr32_three
-
- pshufd \$`2<<6`,$iv1,$inout4
- por $ivec,$inout3
- movups 0x30($inp),$in3
- je .Lctr32_four
+ xorps $in2,$inout2
+ movups $inout2,0x20($out) # $len was 3, stop store
- por $ivec,$inout4
- xorps $inout5,$inout5
-
- call _aesni_encrypt6
-
- movups 0x40($inp),$rndkey1
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- xorps $inout4,$rndkey1
- movups $in3,0x30($out)
- movups $rndkey1,0x40($out)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_one_shortcut:
- movups ($ivp),$inout0
- movups ($inp),$in0
- mov 240($key),$rounds # key->rounds
-.Lctr32_one:
+.Lctr32_done:
+ xorps %xmm0,%xmm0 # clear register bank
+ xor $key0,$key0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
___
- &aesni_generate1("enc",$key,$rounds);
-$code.=<<___;
- xorps $inout0,$in0
- movups $in0,($out)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_two:
- xorps $inout2,$inout2
- call _aesni_encrypt3
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- movups $in1,0x10($out)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_three:
- call _aesni_encrypt3
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- movups $in2,0x20($out)
- jmp .Lctr32_done
-
-.align 16
-.Lctr32_four:
- call _aesni_encrypt4
- xorps $inout0,$in0
- xorps $inout1,$in1
- movups $in0,($out)
- xorps $inout2,$in2
- movups $in1,0x10($out)
- xorps $inout3,$in3
- movups $in2,0x20($out)
- movups $in3,0x30($out)
-
-.Lctr32_done:
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,0x70(%rsp)
+ pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps 0x20(%rsp),%xmm6
- movaps 0x30(%rsp),%xmm7
- movaps 0x40(%rsp),%xmm8
- movaps 0x50(%rsp),%xmm9
- movaps 0x60(%rsp),%xmm10
- movaps 0x70(%rsp),%xmm11
- movaps 0x80(%rsp),%xmm12
- movaps 0x90(%rsp),%xmm13
- movaps 0xa0(%rsp),%xmm14
- movaps 0xb0(%rsp),%xmm15
- lea 0xc8(%rsp),%rsp
-.Lctr32_ret:
+ movaps -0xa8($key_),%xmm6
+ movaps %xmm0,-0xa8($key_) # clear stack
+ movaps -0x98($key_),%xmm7
+ movaps %xmm0,-0x98($key_)
+ movaps -0x88($key_),%xmm8
+ movaps %xmm0,-0x88($key_)
+ movaps -0x78($key_),%xmm9
+ movaps %xmm0,-0x78($key_)
+ movaps -0x68($key_),%xmm10
+ movaps %xmm0,-0x68($key_)
+ movaps -0x58($key_),%xmm11
+ movaps %xmm0,-0x58($key_)
+ movaps -0x48($key_),%xmm12
+ movaps %xmm0,-0x48($key_)
+ movaps -0x38($key_),%xmm13
+ movaps %xmm0,-0x38($key_)
+ movaps -0x28($key_),%xmm14
+ movaps %xmm0,-0x28($key_)
+ movaps -0x18($key_),%xmm15
+ movaps %xmm0,-0x18($key_)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+ movaps %xmm0,0x70(%rsp)
___
$code.=<<___;
+ mov -8($key_),%rbp
+.cfi_restore %rbp
+ lea ($key_),%rsp
+.cfi_def_cfa_register %rsp
+.Lctr32_epilogue:
ret
+.cfi_endproc
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
-my $frame_size = 0x68 + ($win64?160:0);
+my $frame_size = 0x70 + ($win64?160:0);
+my $key_ = "%rbp"; # override so that we can use %r11 as FP
$code.=<<___;
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,\@function,6
.align 16
aesni_xts_encrypt:
- lea -$frame_size(%rsp),%rsp
+.cfi_startproc
+ lea (%rsp),%r11 # frame pointer
+.cfi_def_cfa_register %r11
+ push %rbp
+.cfi_push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,0x60(%rsp)
- movaps %xmm7,0x70(%rsp)
- movaps %xmm8,0x80(%rsp)
- movaps %xmm9,0x90(%rsp)
- movaps %xmm10,0xa0(%rsp)
- movaps %xmm11,0xb0(%rsp)
- movaps %xmm12,0xc0(%rsp)
- movaps %xmm13,0xd0(%rsp)
- movaps %xmm14,0xe0(%rsp)
- movaps %xmm15,0xf0(%rsp)
+ movaps %xmm6,-0xa8(%r11) # offload everything
+ movaps %xmm7,-0x98(%r11)
+ movaps %xmm8,-0x88(%r11)
+ movaps %xmm9,-0x78(%r11)
+ movaps %xmm10,-0x68(%r11)
+ movaps %xmm11,-0x58(%r11)
+ movaps %xmm12,-0x48(%r11)
+ movaps %xmm13,-0x38(%r11)
+ movaps %xmm14,-0x28(%r11)
+ movaps %xmm15,-0x18(%r11)
.Lxts_enc_body:
___
$code.=<<___;
- movups ($ivp),@tweak[5] # load clear-text tweak
+ movups ($ivp),$inout0 # load clear-text tweak
mov 240(%r8),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
___
# generate the tweak
- &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+ &aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
+ $movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+
movdqa .Lxts_magic(%rip),$twmask
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ movdqa $inout0,@tweak[5]
+ pshufd \$0x5f,$inout0,$twres
+ pxor $rndkey0,$rndkey1
___
+ # alternative tweak calculation algorithm is based on suggestions
+ # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
+ # and should help in the future...
for ($i=0;$i<4;$i++) {
$code.=<<___;
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
movdqa @tweak[5],@tweak[$i]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- pand $twmask,$twres # isolate carry and residue
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- pxor $twres,@tweak[5]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
___
}
$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
sub \$16*6,$len
- jc .Lxts_enc_short
+ jc .Lxts_enc_short # if $len-=6*16 borrowed
- shr \$1,$rounds
- sub \$1,$rounds
- mov $rounds,$rnds_
+ mov \$16+96,$rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ sub %r10,%rax # twisted $rounds
+ $movkey 16($key_),$rndkey1
+ mov %rax,%r10 # backup twisted $rounds
+ lea .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
-.align 16
+.align 32
.Lxts_enc_grandloop:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
- pand $twmask,$twres # isolate carry and residue
+ movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
- pxor $twres,@tweak[5]
-
+ pxor @tweak[0],$inout0 # input^=tweak^round[0]
movdqu `16*2`($inp),$inout2
- pxor @tweak[0],$inout0 # input^=tweak
- movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
- movdqu `16*4`($inp),$inout4
+ aesenc $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
- movdqu `16*5`($inp),$inout5
- lea `16*6`($inp),$inp
+ aesenc $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
- $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
- pxor @tweak[5],$inout5
+ aesenc $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
- # inline _aesni_encrypt6 and interleave first and last rounds
- # with own code...
- $movkey 16($key_),$rndkey1
- pxor $rndkey0,$inout0
- pxor $rndkey0,$inout1
- movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
- aesenc $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
- movdqa @tweak[1],`16*1`(%rsp)
- aesenc $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqa @tweak[2],`16*2`(%rsp)
- aesenc $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqa @tweak[3],`16*3`(%rsp)
- aesenc $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
- movdqa @tweak[4],`16*4`(%rsp)
+ pxor $twres,@tweak[0] # calculate tweaks^round[last]
aesenc $rndkey1,$inout4
- movdqa @tweak[5],`16*5`(%rsp)
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
aesenc $rndkey1,$inout5
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp
- jmp .Lxts_enc_loop6_enter
+ $movkey 48($key_),$rndkey1
+ pxor $twres,@tweak[2]
-.align 16
+ aesenc $rndkey0,$inout0
+ pxor $twres,@tweak[3]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey0,$inout1
+ pxor $twres,@tweak[4]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ movdqa $twmask,`16*5`(%rsp)
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_enc_loop6
+.align 32
.Lxts_enc_loop6:
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
- dec $rounds
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
-.Lxts_enc_loop6_enter:
- $movkey 16($key),$rndkey1
+ $movkey -64($key,%rax),$rndkey1
+ add \$32,%rax
+
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
- lea 32($key),$key
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
- $movkey ($key),$rndkey0
+ $movkey -80($key,%rax),$rndkey0
jnz .Lxts_enc_loop6
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa (%r8),$twmask # start calculating next tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesenc $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
aesenc $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
aesenc $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[0],@tweak[1] # copy round[0]
aesenc $rndkey1,$inout5
- $movkey 16($key),$rndkey1
+ $movkey -64($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[0]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
aesenc $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddd $twres,$twres
+ pxor @tweak[5],@tweak[0]
aesenc $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ psrad \$31,$twtmp
+ paddq @tweak[5],@tweak[5]
aesenc $rndkey0,$inout2
- pxor $twres,@tweak[5]
aesenc $rndkey0,$inout3
+ pand $twmask,$twtmp
+ movaps @tweak[1],@tweak[2]
aesenc $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movdqa $twres,$twtmp
aesenc $rndkey0,$inout5
- $movkey 32($key),$rndkey0
+ $movkey -48($key),$rndkey0
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[1]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ paddd $twres,$twres
aesenc $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
aesenc $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesenc $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesenc $rndkey1,$inout3
+ movdqa @tweak[3],`16*3`(%rsp)
+ pxor $twtmp,@tweak[5]
aesenc $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
+ movdqa $twres,$twtmp
aesenc $rndkey1,$inout5
+ $movkey -32($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[2]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- aesenclast $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
- aesenclast $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- aesenclast $rndkey0,$inout2
- pxor $twres,@tweak[5]
- aesenclast $rndkey0,$inout3
- aesenclast $rndkey0,$inout4
- aesenclast $rndkey0,$inout5
-
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[3]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- xorps `16*0`(%rsp),$inout0 # output^=tweak
- pand $twmask,$twres # isolate carry and residue
- xorps `16*1`(%rsp),$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddd $twres,$twres
+ aesenc $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesenc $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[3],@tweak[4]
+ aesenc $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesenc $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ aesenclast `16*0`(%rsp),$inout0
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ aesenclast `16*1`(%rsp),$inout1
+ aesenclast `16*2`(%rsp),$inout2
+ pand $twmask,$twres
+ mov %r10,%rax # restore $rounds
+ aesenclast `16*3`(%rsp),$inout3
+ aesenclast `16*4`(%rsp),$inout4
+ aesenclast `16*5`(%rsp),$inout5
pxor $twres,@tweak[5]
- xorps `16*2`(%rsp),$inout2
- movups $inout0,`16*0`($out) # write output
- xorps `16*3`(%rsp),$inout3
- movups $inout1,`16*1`($out)
- xorps `16*4`(%rsp),$inout4
- movups $inout2,`16*2`($out)
- xorps `16*5`(%rsp),$inout5
- movups $inout3,`16*3`($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,`16*4`($out)
- movups $inout5,`16*5`($out)
- lea `16*6`($out),$out
+ lea `16*6`($out),$out # $out+=6*16
+ movups $inout0,`-16*6`($out) # store 6 output blocks
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
sub \$16*6,$len
- jnc .Lxts_enc_grandloop
+ jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
- lea 3($rounds,$rounds),$rounds # restore original value
+ mov \$16+96,$rounds
+ sub $rnds_,$rounds
mov $key_,$key # restore $key
- mov $rounds,$rnds_ # backup $rounds
+ shr \$4,$rounds # restore original value
.Lxts_enc_short:
- add \$16*6,$len
- jz .Lxts_enc_done
+ # at this point @tweak[0..5] are populated with tweak values
+ mov $rounds,$rnds_ # backup $rounds
+ pxor $rndkey0,@tweak[0]
+ add \$16*6,$len # restore real remaining $len
+ jz .Lxts_enc_done # done if ($len==0)
+ pxor $rndkey0,@tweak[1]
cmp \$0x20,$len
- jb .Lxts_enc_one
- je .Lxts_enc_two
+ jb .Lxts_enc_one # $len is 1*16
+ pxor $rndkey0,@tweak[2]
+ je .Lxts_enc_two # $len is 2*16
+ pxor $rndkey0,@tweak[3]
cmp \$0x40,$len
- jb .Lxts_enc_three
- je .Lxts_enc_four
-
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movdqu ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movdqu 16*1($inp),$inout1
- pxor $twres,@tweak[5]
+ jb .Lxts_enc_three # $len is 3*16
+ pxor $rndkey0,@tweak[4]
+ je .Lxts_enc_four # $len is 4*16
+ movdqu ($inp),$inout0 # $len is 5*16
+ movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
pxor @tweak[1],$inout1
movdqu 16*4($inp),$inout4
- lea 16*5($inp),$inp
+ lea 16*5($inp),$inp # $inp+=5*16
pxor @tweak[2],$inout2
pxor @tweak[3],$inout3
pxor @tweak[4],$inout4
+ pxor $inout5,$inout5
call _aesni_encrypt6
movdqa @tweak[5],@tweak[0]
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
- movdqu $inout0,($out)
+ movdqu $inout0,($out) # store 5 output blocks
xorps @tweak[3],$inout3
movdqu $inout1,16*1($out)
xorps @tweak[4],$inout4
movdqu $inout2,16*2($out)
movdqu $inout3,16*3($out)
movdqu $inout4,16*4($out)
- lea 16*5($out),$out
+ lea 16*5($out),$out # $out+=5*16
jmp .Lxts_enc_done
.align 16
.Lxts_enc_one:
movups ($inp),$inout0
- lea 16*1($inp),$inp
+ lea 16*1($inp),$inp # inp+=1*16
xorps @tweak[0],$inout0
___
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[1],@tweak[0]
- movups $inout0,($out)
- lea 16*1($out),$out
+ movups $inout0,($out) # store one output block
+ lea 16*1($out),$out # $out+=1*16
jmp .Lxts_enc_done
.align 16
.Lxts_enc_two:
movups ($inp),$inout0
movups 16($inp),$inout1
- lea 32($inp),$inp
+ lea 32($inp),$inp # $inp+=2*16
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
- call _aesni_encrypt3
+ call _aesni_encrypt2
xorps @tweak[0],$inout0
movdqa @tweak[2],@tweak[0]
xorps @tweak[1],$inout1
- movups $inout0,($out)
+ movups $inout0,($out) # store 2 output blocks
movups $inout1,16*1($out)
- lea 16*2($out),$out
+ lea 16*2($out),$out # $out+=2*16
jmp .Lxts_enc_done
.align 16
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
- lea 16*3($inp),$inp
+ lea 16*3($inp),$inp # $inp+=3*16
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
- movups $inout0,($out)
+ movups $inout0,($out) # store 3 output blocks
movups $inout1,16*1($out)
movups $inout2,16*2($out)
- lea 16*3($out),$out
+ lea 16*3($out),$out # $out+=3*16
jmp .Lxts_enc_done
.align 16
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
- lea 16*4($inp),$inp
+ lea 16*4($inp),$inp # $inp+=4*16
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
xorps @tweak[3],$inout3
call _aesni_encrypt4
- xorps @tweak[0],$inout0
- movdqa @tweak[5],@tweak[0]
- xorps @tweak[1],$inout1
- xorps @tweak[2],$inout2
- movups $inout0,($out)
- xorps @tweak[3],$inout3
- movups $inout1,16*1($out)
- movups $inout2,16*2($out)
- movups $inout3,16*3($out)
- lea 16*4($out),$out
+ pxor @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ pxor @tweak[1],$inout1
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out) # store 4 output blocks
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ lea 16*4($out),$out # $out+=4*16
jmp .Lxts_enc_done
.align 16
.Lxts_enc_done:
- and \$15,$len_
+ and \$15,$len_ # see if $len%16 is 0
jz .Lxts_enc_ret
mov $len_,$len
movups $inout0,-16($out)
.Lxts_enc_ret:
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps 0x60(%rsp),%xmm6
- movaps 0x70(%rsp),%xmm7
- movaps 0x80(%rsp),%xmm8
- movaps 0x90(%rsp),%xmm9
- movaps 0xa0(%rsp),%xmm10
- movaps 0xb0(%rsp),%xmm11
- movaps 0xc0(%rsp),%xmm12
- movaps 0xd0(%rsp),%xmm13
- movaps 0xe0(%rsp),%xmm14
- movaps 0xf0(%rsp),%xmm15
-___
-$code.=<<___;
- lea $frame_size(%rsp),%rsp
+ movaps -0xa8(%r11),%xmm6
+ movaps %xmm0,-0xa8(%r11) # clear stack
+ movaps -0x98(%r11),%xmm7
+ movaps %xmm0,-0x98(%r11)
+ movaps -0x88(%r11),%xmm8
+ movaps %xmm0,-0x88(%r11)
+ movaps -0x78(%r11),%xmm9
+ movaps %xmm0,-0x78(%r11)
+ movaps -0x68(%r11),%xmm10
+ movaps %xmm0,-0x68(%r11)
+ movaps -0x58(%r11),%xmm11
+ movaps %xmm0,-0x58(%r11)
+ movaps -0x48(%r11),%xmm12
+ movaps %xmm0,-0x48(%r11)
+ movaps -0x38(%r11),%xmm13
+ movaps %xmm0,-0x38(%r11)
+ movaps -0x28(%r11),%xmm14
+ movaps %xmm0,-0x28(%r11)
+ movaps -0x18(%r11),%xmm15
+ movaps %xmm0,-0x18(%r11)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+___
+$code.=<<___;
+ mov -8(%r11),%rbp
+.cfi_restore %rbp
+ lea (%r11),%rsp
+.cfi_def_cfa_register %rsp
.Lxts_enc_epilogue:
ret
+.cfi_endproc
.size aesni_xts_encrypt,.-aesni_xts_encrypt
___
.type aesni_xts_decrypt,\@function,6
.align 16
aesni_xts_decrypt:
- lea -$frame_size(%rsp),%rsp
+.cfi_startproc
+ lea (%rsp),%r11 # frame pointer
+.cfi_def_cfa_register %r11
+ push %rbp
+.cfi_push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,0x60(%rsp)
- movaps %xmm7,0x70(%rsp)
- movaps %xmm8,0x80(%rsp)
- movaps %xmm9,0x90(%rsp)
- movaps %xmm10,0xa0(%rsp)
- movaps %xmm11,0xb0(%rsp)
- movaps %xmm12,0xc0(%rsp)
- movaps %xmm13,0xd0(%rsp)
- movaps %xmm14,0xe0(%rsp)
- movaps %xmm15,0xf0(%rsp)
+ movaps %xmm6,-0xa8(%r11) # offload everything
+ movaps %xmm7,-0x98(%r11)
+ movaps %xmm8,-0x88(%r11)
+ movaps %xmm9,-0x78(%r11)
+ movaps %xmm10,-0x68(%r11)
+ movaps %xmm11,-0x58(%r11)
+ movaps %xmm12,-0x48(%r11)
+ movaps %xmm13,-0x38(%r11)
+ movaps %xmm14,-0x28(%r11)
+ movaps %xmm15,-0x18(%r11)
.Lxts_dec_body:
___
$code.=<<___;
- movups ($ivp),@tweak[5] # load clear-text tweak
+ movups ($ivp),$inout0 # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
___
# generate the tweak
- &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+ &aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
xor %eax,%eax # if ($len%16) len-=16;
test \$15,$len
shl \$4,%rax
sub %rax,$len
+ $movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+
movdqa .Lxts_magic(%rip),$twmask
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ movdqa $inout0,@tweak[5]
+ pshufd \$0x5f,$inout0,$twres
+ pxor $rndkey0,$rndkey1
___
for ($i=0;$i<4;$i++) {
$code.=<<___;
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
movdqa @tweak[5],@tweak[$i]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- pand $twmask,$twres # isolate carry and residue
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- pxor $twres,@tweak[5]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
___
}
$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
sub \$16*6,$len
- jc .Lxts_dec_short
+ jc .Lxts_dec_short # if $len-=6*16 borrowed
- shr \$1,$rounds
- sub \$1,$rounds
- mov $rounds,$rnds_
+ mov \$16+96,$rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ sub %r10,%rax # twisted $rounds
+ $movkey 16($key_),$rndkey1
+ mov %rax,%r10 # backup twisted $rounds
+ lea .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
-.align 16
+.align 32
.Lxts_dec_grandloop:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
movdqu `16*0`($inp),$inout0 # load input
- pand $twmask,$twres # isolate carry and residue
+ movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
- pxor $twres,@tweak[5]
-
+ pxor @tweak[0],$inout0 # input^=tweak^round[0]
movdqu `16*2`($inp),$inout2
- pxor @tweak[0],$inout0 # input^=tweak
- movdqu `16*3`($inp),$inout3
pxor @tweak[1],$inout1
- movdqu `16*4`($inp),$inout4
+ aesdec $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
- movdqu `16*5`($inp),$inout5
- lea `16*6`($inp),$inp
+ aesdec $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
- $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
- pxor @tweak[5],$inout5
+ aesdec $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
- # inline _aesni_decrypt6 and interleave first and last rounds
- # with own code...
- $movkey 16($key_),$rndkey1
- pxor $rndkey0,$inout0
- pxor $rndkey0,$inout1
- movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
- aesdec $rndkey1,$inout0
- lea 32($key_),$key
- pxor $rndkey0,$inout2
- movdqa @tweak[1],`16*1`(%rsp)
- aesdec $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqa @tweak[2],`16*2`(%rsp)
- aesdec $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqa @tweak[3],`16*3`(%rsp)
- aesdec $rndkey1,$inout3
- pxor $rndkey0,$inout5
- $movkey ($key),$rndkey0
- dec $rounds
- movdqa @tweak[4],`16*4`(%rsp)
+ pxor $twres,@tweak[0] # calculate tweaks^round[last]
aesdec $rndkey1,$inout4
- movdqa @tweak[5],`16*5`(%rsp)
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
aesdec $rndkey1,$inout5
- pxor $twtmp,$twtmp
- pcmpgtd @tweak[5],$twtmp
- jmp .Lxts_dec_loop6_enter
+ $movkey 48($key_),$rndkey1
+ pxor $twres,@tweak[2]
-.align 16
+ aesdec $rndkey0,$inout0
+ pxor $twres,@tweak[3]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey0,$inout1
+ pxor $twres,@tweak[4]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ movdqa $twmask,`16*5`(%rsp)
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_dec_loop6
+.align 32
.Lxts_dec_loop6:
aesdec $rndkey1,$inout0
aesdec $rndkey1,$inout1
- dec $rounds
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
-.Lxts_dec_loop6_enter:
- $movkey 16($key),$rndkey1
+ $movkey -64($key,%rax),$rndkey1
+ add \$32,%rax
+
aesdec $rndkey0,$inout0
aesdec $rndkey0,$inout1
- lea 32($key),$key
aesdec $rndkey0,$inout2
aesdec $rndkey0,$inout3
aesdec $rndkey0,$inout4
aesdec $rndkey0,$inout5
- $movkey ($key),$rndkey0
+ $movkey -80($key,%rax),$rndkey0
jnz .Lxts_dec_loop6
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa (%r8),$twmask # start calculating next tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
aesdec $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
aesdec $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcast upper bits
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
aesdec $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[0],@tweak[1] # copy round[0]
aesdec $rndkey1,$inout5
- $movkey 16($key),$rndkey1
+ $movkey -64($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[0]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ movdqa $twres,$twtmp
aesdec $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
+ paddd $twres,$twres
+ pxor @tweak[5],@tweak[0]
aesdec $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ psrad \$31,$twtmp
+ paddq @tweak[5],@tweak[5]
aesdec $rndkey0,$inout2
- pxor $twres,@tweak[5]
aesdec $rndkey0,$inout3
+ pand $twmask,$twtmp
+ movaps @tweak[1],@tweak[2]
aesdec $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movdqa $twres,$twtmp
aesdec $rndkey0,$inout5
- $movkey 32($key),$rndkey0
+ $movkey -48($key),$rndkey0
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[1]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ paddd $twres,$twres
aesdec $rndkey1,$inout0
- pand $twmask,$twres # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
aesdec $rndkey1,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
aesdec $rndkey1,$inout2
- pxor $twres,@tweak[5]
aesdec $rndkey1,$inout3
+ movdqa @tweak[3],`16*3`(%rsp)
+ pxor $twtmp,@tweak[5]
aesdec $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
+ movdqa $twres,$twtmp
aesdec $rndkey1,$inout5
+ $movkey -32($key),$rndkey1
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[2]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- aesdeclast $rndkey0,$inout0
- pand $twmask,$twres # isolate carry and residue
- aesdeclast $rndkey0,$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
- aesdeclast $rndkey0,$inout2
- pxor $twres,@tweak[5]
- aesdeclast $rndkey0,$inout3
- aesdeclast $rndkey0,$inout4
- aesdeclast $rndkey0,$inout5
-
- pshufd \$0x13,$twtmp,$twres
- pxor $twtmp,$twtmp
- movdqa @tweak[5],@tweak[3]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- xorps `16*0`(%rsp),$inout0 # output^=tweak
- pand $twmask,$twres # isolate carry and residue
- xorps `16*1`(%rsp),$inout1
- pcmpgtd @tweak[5],$twtmp # broadcat upper bits
+ paddd $twres,$twres
+ aesdec $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesdec $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[3],@tweak[4]
+ aesdec $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesdec $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ aesdeclast `16*0`(%rsp),$inout0
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ aesdeclast `16*1`(%rsp),$inout1
+ aesdeclast `16*2`(%rsp),$inout2
+ pand $twmask,$twres
+ mov %r10,%rax # restore $rounds
+ aesdeclast `16*3`(%rsp),$inout3
+ aesdeclast `16*4`(%rsp),$inout4
+ aesdeclast `16*5`(%rsp),$inout5
pxor $twres,@tweak[5]
- xorps `16*2`(%rsp),$inout2
- movups $inout0,`16*0`($out) # write output
- xorps `16*3`(%rsp),$inout3
- movups $inout1,`16*1`($out)
- xorps `16*4`(%rsp),$inout4
- movups $inout2,`16*2`($out)
- xorps `16*5`(%rsp),$inout5
- movups $inout3,`16*3`($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,`16*4`($out)
- movups $inout5,`16*5`($out)
- lea `16*6`($out),$out
+ lea `16*6`($out),$out # $out+=6*16
+ movups $inout0,`-16*6`($out) # store 6 output blocks
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
sub \$16*6,$len
- jnc .Lxts_dec_grandloop
+ jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
- lea 3($rounds,$rounds),$rounds # restore original value
+ mov \$16+96,$rounds
+ sub $rnds_,$rounds
mov $key_,$key # restore $key
- mov $rounds,$rnds_ # backup $rounds
+ shr \$4,$rounds # restore original value
.Lxts_dec_short:
- add \$16*6,$len
- jz .Lxts_dec_done
+ # at this point @tweak[0..5] are populated with tweak values
+ mov $rounds,$rnds_ # backup $rounds
+ pxor $rndkey0,@tweak[0]
+ pxor $rndkey0,@tweak[1]
+ add \$16*6,$len # restore real remaining $len
+ jz .Lxts_dec_done # done if ($len==0)
+ pxor $rndkey0,@tweak[2]
cmp \$0x20,$len
- jb .Lxts_dec_one
- je .Lxts_dec_two
+ jb .Lxts_dec_one # $len is 1*16
+ pxor $rndkey0,@tweak[3]
+ je .Lxts_dec_two # $len is 2*16
+ pxor $rndkey0,@tweak[4]
cmp \$0x40,$len
- jb .Lxts_dec_three
- je .Lxts_dec_four
-
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movdqu ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movdqu 16*1($inp),$inout1
- pxor $twres,@tweak[5]
+ jb .Lxts_dec_three # $len is 3*16
+ je .Lxts_dec_four # $len is 4*16
+ movdqu ($inp),$inout0 # $len is 5*16
+ movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
pxor @tweak[1],$inout1
movdqu 16*4($inp),$inout4
- lea 16*5($inp),$inp
+ lea 16*5($inp),$inp # $inp+=5*16
pxor @tweak[2],$inout2
pxor @tweak[3],$inout3
pxor @tweak[4],$inout4
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
- movdqu $inout0,($out)
+ movdqu $inout0,($out) # store 5 output blocks
xorps @tweak[3],$inout3
movdqu $inout1,16*1($out)
xorps @tweak[4],$inout4
movdqu $inout3,16*3($out)
pcmpgtd @tweak[5],$twtmp
movdqu $inout4,16*4($out)
- lea 16*5($out),$out
+ lea 16*5($out),$out # $out+=5*16
pshufd \$0x13,$twtmp,@tweak[1] # $twres
and \$15,$len_
jz .Lxts_dec_ret
.align 16
.Lxts_dec_one:
movups ($inp),$inout0
- lea 16*1($inp),$inp
+ lea 16*1($inp),$inp # $inp+=1*16
xorps @tweak[0],$inout0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[1],@tweak[0]
- movups $inout0,($out)
+ movups $inout0,($out) # store one output block
movdqa @tweak[2],@tweak[1]
- lea 16*1($out),$out
+ lea 16*1($out),$out # $out+=1*16
jmp .Lxts_dec_done
.align 16
.Lxts_dec_two:
movups ($inp),$inout0
movups 16($inp),$inout1
- lea 32($inp),$inp
+ lea 32($inp),$inp # $inp+=2*16
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
- call _aesni_decrypt3
+ call _aesni_decrypt2
xorps @tweak[0],$inout0
movdqa @tweak[2],@tweak[0]
xorps @tweak[1],$inout1
movdqa @tweak[3],@tweak[1]
- movups $inout0,($out)
+ movups $inout0,($out) # store 2 output blocks
movups $inout1,16*1($out)
- lea 16*2($out),$out
+ lea 16*2($out),$out # $out+=2*16
jmp .Lxts_dec_done
.align 16
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
- lea 16*3($inp),$inp
+ lea 16*3($inp),$inp # $inp+=3*16
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
xorps @tweak[0],$inout0
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
- movdqa @tweak[5],@tweak[1]
+ movdqa @tweak[4],@tweak[1]
xorps @tweak[2],$inout2
- movups $inout0,($out)
+ movups $inout0,($out) # store 3 output blocks
movups $inout1,16*1($out)
movups $inout2,16*2($out)
- lea 16*3($out),$out
+ lea 16*3($out),$out # $out+=3*16
jmp .Lxts_dec_done
.align 16
.Lxts_dec_four:
- pshufd \$0x13,$twtmp,$twres
- movdqa @tweak[5],@tweak[4]
- paddq @tweak[5],@tweak[5] # psllq 1,$tweak
- movups ($inp),$inout0
- pand $twmask,$twres # isolate carry and residue
- movups 16*1($inp),$inout1
- pxor $twres,@tweak[5]
-
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
- lea 16*4($inp),$inp
+ lea 16*4($inp),$inp # $inp+=4*16
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
xorps @tweak[3],$inout3
call _aesni_decrypt4
- xorps @tweak[0],$inout0
+ pxor @tweak[0],$inout0
movdqa @tweak[4],@tweak[0]
- xorps @tweak[1],$inout1
+ pxor @tweak[1],$inout1
movdqa @tweak[5],@tweak[1]
- xorps @tweak[2],$inout2
- movups $inout0,($out)
- xorps @tweak[3],$inout3
- movups $inout1,16*1($out)
- movups $inout2,16*2($out)
- movups $inout3,16*3($out)
- lea 16*4($out),$out
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out) # store 4 output blocks
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ lea 16*4($out),$out # $out+=4*16
jmp .Lxts_dec_done
.align 16
.Lxts_dec_done:
- and \$15,$len_
+ and \$15,$len_ # see if $len%16 is 0
jz .Lxts_dec_ret
.Lxts_dec_done2:
mov $len_,$len
movups $inout0,($out)
.Lxts_dec_ret:
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps 0x60(%rsp),%xmm6
- movaps 0x70(%rsp),%xmm7
- movaps 0x80(%rsp),%xmm8
- movaps 0x90(%rsp),%xmm9
- movaps 0xa0(%rsp),%xmm10
- movaps 0xb0(%rsp),%xmm11
- movaps 0xc0(%rsp),%xmm12
- movaps 0xd0(%rsp),%xmm13
- movaps 0xe0(%rsp),%xmm14
- movaps 0xf0(%rsp),%xmm15
-___
-$code.=<<___;
- lea $frame_size(%rsp),%rsp
+ movaps -0xa8(%r11),%xmm6
+ movaps %xmm0,-0xa8(%r11) # clear stack
+ movaps -0x98(%r11),%xmm7
+ movaps %xmm0,-0x98(%r11)
+ movaps -0x88(%r11),%xmm8
+ movaps %xmm0,-0x88(%r11)
+ movaps -0x78(%r11),%xmm9
+ movaps %xmm0,-0x78(%r11)
+ movaps -0x68(%r11),%xmm10
+ movaps %xmm0,-0x68(%r11)
+ movaps -0x58(%r11),%xmm11
+ movaps %xmm0,-0x58(%r11)
+ movaps -0x48(%r11),%xmm12
+ movaps %xmm0,-0x48(%r11)
+ movaps -0x38(%r11),%xmm13
+ movaps %xmm0,-0x38(%r11)
+ movaps -0x28(%r11),%xmm14
+ movaps %xmm0,-0x28(%r11)
+ movaps -0x18(%r11),%xmm15
+ movaps %xmm0,-0x18(%r11)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+___
+$code.=<<___;
+ mov -8(%r11),%rbp
+.cfi_restore %rbp
+ lea (%r11),%rsp
+.cfi_def_cfa_register %rsp
.Lxts_dec_epilogue:
ret
+.cfi_endproc
.size aesni_xts_decrypt,.-aesni_xts_decrypt
___
-} }}
+}
+\f
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+# const AES_KEY *key, unsigned int start_block_num,
+# unsigned char offset_i[16], const unsigned char L_[][16],
+# unsigned char checksum[16]);
+#
+{
+my @offset=map("%xmm$_",(10..15)); # six offset registers, xmm10 through xmm15
+my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); # running checksum; round[0]^round[last] once a prologue has run
+my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
+my ($L_p,$checksum_p) = ("%rbx","%rbp"); # 7th and 8th arguments, fetched from the stack in the prologues
+my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); # ntz(block number)*16 byte indices into the L_ table
+my $seventh_arg = $win64 ? 56 : 8; # displacement of the 7th argument from the entry stack pointer
+my $blocks = $len; # the length register carries a count of 16-byte blocks here
+
+$code.=<<___;
+.globl aesni_ocb_encrypt
+.type aesni_ocb_encrypt,\@function,6
+.align 32
+aesni_ocb_encrypt: # OCB encryption of whole 16-byte blocks, arguments as in the prototype comment above
+.cfi_startproc
+ lea (%rsp),%rax # remember entry stack pointer to reach the stack arguments below
+ push %rbx # save the five registers reused for L_ pointer, checksum pointer and table indices
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp # room for ten 16-byte xmm save slots
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_enc_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+ mov $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+ mov 240($key),$rnds_ # field at offset 240 of the key structure, presumably the round count - TODO confirm
+ mov $key,$key_ # keep a pointer to the start of the key schedule
+ shl \$4,$rnds_ # scale by 16, the size of one round key
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds # NOTE(review): assumes rounds aliases rax and rnds_ aliases r10, cf. the sub below - confirm
+ lea 32($key_,$rnds_),$key # key now points just past round[last]
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_enc_odd
+
+ bsf $block_num,$i1 # even start: ntz(block number) selects the L_ entry
+ add \$1,$block_num
+ shl \$4,$i1 # ntz(block) -> table offset
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5] # helper returns the new offset_i ^ round[last] in the borrowed register
+ movups $inout0,($out)
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_enc_done # that single block was all there was
+
+.Locb_enc_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num # block number after the next group of six
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_enc_short # borrow: fewer than six blocks left
+ jmp .Locb_enc_grandloop
+
+.align 32
+.Locb_enc_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_encrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_enc_grandloop # no borrow: another full group of six is available
+
+.Locb_enc_short:
+ add \$6,$blocks # undo the last subtraction, 0..5 blocks remain
+ jz .Locb_enc_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_enc_one # exactly one block
+ movdqu `16*1`($inp),$inout1
+ je .Locb_enc_two # flags still from the cmp above
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_enc_three # exactly three blocks
+ movdqu `16*3`($inp),$inout3
+ je .Locb_enc_four # flags still from the cmp above
+
+ movdqu `16*4`($inp),$inout4 # five blocks remain
+ pxor $inout5,$inout5 # zero sixth lane, it adds nothing to the checksum
+
+ call __ocb_encrypt6
+
+ movdqa @offset[4],@offset[5] # running offset is that of the fifth, last real block
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5] # new running offset returned by the helper
+ movups $inout0,`16*0`($out)
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_two:
+ pxor $inout2,$inout2 # zero the two unused lanes of the 4-block helper
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[1],@offset[5] # running offset is that of the second block
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_three:
+ pxor $inout3,$inout3 # zero the unused fourth lane
+
+ call __ocb_encrypt4
+
+ movdqa @offset[2],@offset[5] # running offset is that of the third block
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_four:
+ call __ocb_encrypt4
+
+ movdqa @offset[3],@offset[5] # running offset is that of the fourth block
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+
+.Locb_enc_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ lea 0x28(%rsp),%rax # 0x28 is five 8-byte pushes, rax is the entry stack pointer again
+.cfi_def_cfa %rax,8
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6 # restore xmm6 through xmm15 saved in the prologue
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax # skip the xmm save area and the five pushes
+.Locb_enc_pop:
+___
+$code.=<<___;
+ mov -40(%rax),%r14 # restore the registers pushed in the prologue
+.cfi_restore %r14
+ mov -32(%rax),%r13
+.cfi_restore %r13
+ mov -24(%rax),%r12
+.cfi_restore %r12
+ mov -16(%rax),%rbp
+.cfi_restore %rbp
+ mov -8(%rax),%rbx
+.cfi_restore %rbx
+ lea (%rax),%rsp # release the frame
+.cfi_def_cfa_register %rsp
+.Locb_enc_epilogue:
+ ret
+.cfi_endproc
+.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type __ocb_encrypt6,\@abi-omnipotent
+.align 32
+__ocb_encrypt6: # encrypt six blocks held in the inout registers, updating checksum, offsets and table indices
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1] # L_ table entry for the 2nd block
+ movdqa @offset[0],@offset[2] # L_0 for the 3rd block
+ movdqu ($L_p,$i3),@offset[3] # L_ table entry for the 4th block
+ movdqa @offset[0],@offset[4] # L_0 for the 5th block
+ pxor @offset[5],@offset[0] # offset of the 1st block, chained from the previous offset
+ movdqu ($L_p,$i5),@offset[5] # L_ table entry for the 6th block
+ pxor @offset[0],@offset[1] # each further offset is chained from the one before it
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor $inout4,$checksum
+ pxor @offset[4],$inout4
+ pxor $inout5,$checksum
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0 # round[2]
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num # advance block number for the following group
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesenc $rndkey1,$inout0 # round[1] is expected in this register on entry
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1] # same conversion to offset_i ^ round[last] for the other lanes
+ pxor $rndkey0l,@offset[2]
+ aesenc $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesenc $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1 # round[3]
+ pxor $rndkey0l,@offset[5]
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0 # round[4]
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_enc_loop6
+
+.align 32
+.Locb_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1 # next round key, addressed back from the end of the schedule
+ add \$32,%rax # two round keys per iteration, flags are consumed by the jnz below
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop6 # until the index reaches zero
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1 # reload round[1] for the next call
+ shl \$4,$i5 # ntz(block) -> table offset, deferred from above
+
+ aesenclast @offset[0],$inout0 # last round, the key operand carries offset_i ^ round[last]
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ aesenclast @offset[4],$inout4
+ aesenclast @offset[5],$inout5
+ ret
+.size __ocb_encrypt6,.-__ocb_encrypt6
+
+.type __ocb_encrypt4,\@abi-omnipotent
+.align 32
+__ocb_encrypt4: # encrypt four blocks in the first four inout registers, block number and table indices are left untouched
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1] # L_ table entry for the 2nd block
+ movdqa @offset[0],@offset[2] # L_0 for the 3rd block
+ movdqu ($L_p,$i3),@offset[3] # L_ table entry for the 4th block
+ pxor @offset[5],@offset[0] # offset of the 1st block, chained from the previous offset
+ pxor @offset[0],@offset[1] # each further offset is chained from the one before it
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0 # round[2]
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesenc $rndkey1,$inout0 # round[1] is expected in this register on entry
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1 # round[3]
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0 # round[4]
+ jmp .Locb_enc_loop4
+
+.align 32
+.Locb_enc_loop4:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1 # next round key, addressed back from the end of the schedule
+ add \$32,%rax # two round keys per iteration, flags are consumed by the jnz below
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop4 # until the index reaches zero
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1 # reload round[1]
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast @offset[0],$inout0 # last round, the key operand carries offset_i ^ round[last]
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ ret
+.size __ocb_encrypt4,.-__ocb_encrypt4
+
+.type __ocb_encrypt1,\@abi-omnipotent
+.align 32
+__ocb_encrypt1: # encrypt one block, the borrowed sixth inout register carries the L_ entry in and the new offset out
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0 # round[2]
+
+ aesenc $rndkey1,$inout0 # round[1] is expected in this register on entry
+ $movkey 48($key_),$rndkey1 # round[3]
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesenc $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0 # round[4]
+ jmp .Locb_enc_loop1
+
+.align 32
+.Locb_enc_loop1:
+ aesenc $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1 # next round key, addressed back from the end of the schedule
+ add \$32,%rax # two round keys per iteration, flags are consumed by the jnz below
+
+ aesenc $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop1 # until the index reaches zero
+
+ aesenc $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast $inout5,$inout0 # last round, the key operand carries offset_i ^ round[last]
+ ret
+.size __ocb_encrypt1,.-__ocb_encrypt1
+
+.globl aesni_ocb_decrypt
+.type aesni_ocb_decrypt,\@function,6
+.align 32
+aesni_ocb_decrypt: # OCB decryption of whole 16-byte blocks, the checksum is accumulated over the decrypted output
+.cfi_startproc
+ lea (%rsp),%rax # remember entry stack pointer to reach the stack arguments below
+ push %rbx # save the five registers reused for L_ pointer, checksum pointer and table indices
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp # room for ten 16-byte xmm save slots
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_dec_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+ mov $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+ mov 240($key),$rnds_ # field at offset 240 of the key structure, presumably the round count - TODO confirm
+ mov $key,$key_ # keep a pointer to the start of the key schedule
+ shl \$4,$rnds_ # scale by 16, the size of one round key
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds # NOTE(review): assumes rounds aliases rax and rnds_ aliases r10, cf. the sub below - confirm
+ lea 32($key_,$rnds_),$key # key now points just past round[last]
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_dec_odd
+
+ bsf $block_num,$i1 # even start: ntz(block number) selects the L_ entry
+ add \$1,$block_num
+ shl \$4,$i1 # ntz(block) -> table offset
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5] # helper returns the new offset_i ^ round[last] in the borrowed register
+ movups $inout0,($out)
+ xorps $inout0,$checksum # accumulate checksum
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_dec_done # that single block was all there was
+
+.Locb_dec_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num # block number after the next group of six
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_dec_short # borrow: fewer than six blocks left
+ jmp .Locb_dec_grandloop
+
+.align 32
+.Locb_dec_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_decrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+ movups $inout5,`16*5`($out)
+ pxor $inout5,$checksum
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_dec_grandloop # no borrow: another full group of six is available
+
+.Locb_dec_short:
+ add \$6,$blocks # undo the last subtraction, 0..5 blocks remain
+ jz .Locb_dec_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_dec_one # exactly one block
+ movdqu `16*1`($inp),$inout1
+ je .Locb_dec_two # flags still from the cmp above
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_dec_three # exactly three blocks
+ movdqu `16*3`($inp),$inout3
+ je .Locb_dec_four # flags still from the cmp above
+
+ movdqu `16*4`($inp),$inout4 # five blocks remain
+ pxor $inout5,$inout5 # dummy sixth lane, its result is neither stored nor added to the checksum
+
+ call __ocb_decrypt6
+
+ movdqa @offset[4],@offset[5] # running offset is that of the fifth, last real block
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5] # new running offset returned by the helper
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_two:
+ pxor $inout2,$inout2 # dummy lanes for the 4-block helper, their results are discarded
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[1],@offset[5] # running offset is that of the second block
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_three:
+ pxor $inout3,$inout3 # dummy fourth lane, its result is discarded
+
+ call __ocb_decrypt4
+
+ movdqa @offset[2],@offset[5] # running offset is that of the third block
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ xorps $inout2,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_four:
+ call __ocb_decrypt4
+
+ movdqa @offset[3],@offset[5] # running offset is that of the fourth block
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+
+.Locb_dec_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ lea 0x28(%rsp),%rax # 0x28 is five 8-byte pushes, rax is the entry stack pointer again
+.cfi_def_cfa %rax,8
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6 # restore xmm6 through xmm15 saved in the prologue
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax # skip the xmm save area and the five pushes
+.Locb_dec_pop:
+___
+$code.=<<___;
+ mov -40(%rax),%r14 # restore the registers pushed in the prologue
+.cfi_restore %r14
+ mov -32(%rax),%r13
+.cfi_restore %r13
+ mov -24(%rax),%r12
+.cfi_restore %r12
+ mov -16(%rax),%rbp
+.cfi_restore %rbp
+ mov -8(%rax),%rbx
+.cfi_restore %rbx
+ lea (%rax),%rsp # release the frame
+.cfi_def_cfa_register %rsp
+.Locb_dec_epilogue:
+ ret
+.cfi_endproc
+.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type __ocb_decrypt6,\@abi-omnipotent
+.align 32
+__ocb_decrypt6: # decrypt six blocks held in the inout registers, updating offsets and table indices but not the checksum
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1] # L_ table entry for the 2nd block
+ movdqa @offset[0],@offset[2] # L_0 for the 3rd block
+ movdqu ($L_p,$i3),@offset[3] # L_ table entry for the 4th block
+ movdqa @offset[0],@offset[4] # L_0 for the 5th block
+ pxor @offset[5],@offset[0] # offset of the 1st block, chained from the previous offset
+ movdqu ($L_p,$i5),@offset[5] # L_ table entry for the 6th block
+ pxor @offset[0],@offset[1] # each further offset is chained from the one before it
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor @offset[4],$inout4
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0 # round[2]
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num # advance block number for the following group
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesdec $rndkey1,$inout0 # round[1] is expected in this register on entry
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1] # same conversion to offset_i ^ round[last] for the other lanes
+ pxor $rndkey0l,@offset[2]
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesdec $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1 # round[3]
+ pxor $rndkey0l,@offset[5]
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0 # round[4]
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_dec_loop6
+
+.align 32
+.Locb_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1 # next round key, addressed back from the end of the schedule
+ add \$32,%rax # two round keys per iteration, flags are consumed by the jnz below
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop6 # until the index reaches zero
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1 # reload round[1] for the next call
+ shl \$4,$i5 # ntz(block) -> table offset, deferred from above
+
+ aesdeclast @offset[0],$inout0 # last round, the key operand carries offset_i ^ round[last]
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ aesdeclast @offset[4],$inout4
+ aesdeclast @offset[5],$inout5
+ ret
+.size __ocb_decrypt6,.-__ocb_decrypt6
+
+.type __ocb_decrypt4,\@abi-omnipotent
+.align 32
+__ocb_decrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop4
+
+.align 32
+.Locb_dec_loop4:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop4
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast @offset[0],$inout0
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ ret
+.size __ocb_decrypt4,.-__ocb_decrypt4
+
+.type __ocb_decrypt1,\@abi-omnipotent
+.align 32
+__ocb_decrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesdec $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesdec $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop1
+
+.align 32
+.Locb_dec_loop1:
+ aesdec $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop1
+
+ aesdec $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast $inout5,$inout0
+ ret
+.size __ocb_decrypt1,.-__ocb_decrypt1
+___
+} }}
\f
########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
{
-my $reserved = $win64?0x40:-0x18; # used in decrypt
+my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
+my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
+
$code.=<<___;
.globl ${PREFIX}_cbc_encrypt
.type ${PREFIX}_cbc_encrypt,\@function,6
.align 16
${PREFIX}_cbc_encrypt:
+.cfi_startproc
test $len,$len # check length
jz .Lcbc_ret
jnc .Lcbc_enc_loop
add \$16,$len
jnz .Lcbc_enc_tail
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
movups $inout0,($ivp)
+ pxor $inout0,$inout0
+ pxor $inout1,$inout1
jmp .Lcbc_ret
.Lcbc_enc_tail:
\f#--------------------------- CBC DECRYPT ------------------------------#
.align 16
.Lcbc_decrypt:
+ cmp \$16,$len
+ jne .Lcbc_decrypt_bulk
+
+ # handle single block without allocating stack frame,
+ # useful in ciphertext stealing mode
+ movdqu ($inp),$inout0 # load input
+ movdqu ($ivp),$inout1 # load iv
+ movdqa $inout0,$inout2 # future iv
+___
+ &aesni_generate1("dec",$key,$rnds_);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movdqu $inout2,($ivp) # store iv
+ xorps $inout1,$inout0 # ^=iv
+ pxor $inout1,$inout1
+ movups $inout0,($out) # store output
+ pxor $inout0,$inout0
+ jmp .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
+ lea (%rsp),%r11 # frame pointer
+.cfi_def_cfa_register %r11
+ push %rbp
+.cfi_push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp)
- movaps %xmm7,0x10(%rsp)
- movaps %xmm8,0x20(%rsp)
- movaps %xmm9,0x30(%rsp)
+ movaps %xmm6,0x10(%rsp)
+ movaps %xmm7,0x20(%rsp)
+ movaps %xmm8,0x30(%rsp)
+ movaps %xmm9,0x40(%rsp)
+ movaps %xmm10,0x50(%rsp)
+ movaps %xmm11,0x60(%rsp)
+ movaps %xmm12,0x70(%rsp)
+ movaps %xmm13,0x80(%rsp)
+ movaps %xmm14,0x90(%rsp)
+ movaps %xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___
+
+my $inp_=$key_="%rbp"; # reassign $key_
+
+$code.=<<___;
+ mov $key,$key_ # [re-]backup $key [after reassignment]
+ movups ($ivp),$iv
+ mov $rnds_,$rounds
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_tail
+
+ $movkey ($key),$rndkey0
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+ mov OPENSSL_ia32cap_P+4(%rip),%r9d
+ cmp \$0x70,$len
+ jbe .Lcbc_dec_six_or_seven
+
+ and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
+ sub \$0x50,$len # $len is biased by -5*16
+ cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
+ je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
+ sub \$0x20,$len # $len is biased by -7*16
+ lea 0x70($key),$key # size optimization
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movups $inout7,($out)
+ lea 0x10($out),$out
+.Lcbc_dec_loop8_enter:
+ movdqu 0x60($inp),$inout6
+ pxor $rndkey0,$inout0
+ movdqu 0x70($inp),$inout7
+ pxor $rndkey0,$inout1
+ $movkey 0x10-0x70($key),$rndkey1
+ pxor $rndkey0,$inout2
+ mov \$-1,$inp_
+ cmp \$0x70,$len # is there at least 0x60 bytes ahead?
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ pxor $rndkey0,$inout5
+ pxor $rndkey0,$inout6
+
+ aesdec $rndkey1,$inout0
+ pxor $rndkey0,$inout7
+ $movkey 0x20-0x70($key),$rndkey0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ aesdec $rndkey1,$inout6
+ adc \$0,$inp_
+ and \$128,$inp_
+ aesdec $rndkey1,$inout7
+ add $inp,$inp_
+ $movkey 0x30-0x70($key),$rndkey1
+___
+for($i=1;$i<12;$i++) {
+my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
+$code.=<<___ if ($i==7);
+ cmp \$11,$rounds
+___
+$code.=<<___;
+ aesdec $rndkeyx,$inout0
+ aesdec $rndkeyx,$inout1
+ aesdec $rndkeyx,$inout2
+ aesdec $rndkeyx,$inout3
+ aesdec $rndkeyx,$inout4
+ aesdec $rndkeyx,$inout5
+ aesdec $rndkeyx,$inout6
+ aesdec $rndkeyx,$inout7
+ $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
+___
+$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
+ nop
+___
+$code.=<<___ if ($i==7);
+ jb .Lcbc_dec_done
+___
+$code.=<<___ if ($i==9);
+ je .Lcbc_dec_done
+___
+$code.=<<___ if ($i==11);
+ jmp .Lcbc_dec_done
+___
+}
$code.=<<___;
- movups ($ivp),$iv
- mov $rnds_,$rounds
- cmp \$0x70,$len
- jbe .Lcbc_dec_tail
- shr \$1,$rnds_
- sub \$0x70,$len
- mov $rnds_,$rounds
- movaps $iv,$reserved(%rsp)
- jmp .Lcbc_dec_loop8_enter
.align 16
-.Lcbc_dec_loop8:
- movaps $rndkey0,$reserved(%rsp) # save IV
- movups $inout7,($out)
- lea 0x10($out),$out
-.Lcbc_dec_loop8_enter:
- $movkey ($key),$rndkey0
- movups ($inp),$inout0 # load input
- movups 0x10($inp),$inout1
- $movkey 16($key),$rndkey1
-
- lea 32($key),$key
- movdqu 0x20($inp),$inout2
- xorps $rndkey0,$inout0
- movdqu 0x30($inp),$inout3
- xorps $rndkey0,$inout1
- movdqu 0x40($inp),$inout4
+.Lcbc_dec_done:
aesdec $rndkey1,$inout0
- pxor $rndkey0,$inout2
- movdqu 0x50($inp),$inout5
aesdec $rndkey1,$inout1
- pxor $rndkey0,$inout3
- movdqu 0x60($inp),$inout6
+ pxor $rndkey0,$iv
+ pxor $rndkey0,$in0
aesdec $rndkey1,$inout2
- pxor $rndkey0,$inout4
- movdqu 0x70($inp),$inout7
aesdec $rndkey1,$inout3
- pxor $rndkey0,$inout5
- dec $rounds
+ pxor $rndkey0,$in1
+ pxor $rndkey0,$in2
aesdec $rndkey1,$inout4
- pxor $rndkey0,$inout6
aesdec $rndkey1,$inout5
- pxor $rndkey0,$inout7
- $movkey ($key),$rndkey0
+ pxor $rndkey0,$in3
+ pxor $rndkey0,$in4
aesdec $rndkey1,$inout6
aesdec $rndkey1,$inout7
- $movkey 16($key),$rndkey1
-
- call .Ldec_loop8_enter
+ movdqu 0x50($inp),$rndkey1
+
+ aesdeclast $iv,$inout0
+ movdqu 0x60($inp),$iv # borrow $iv
+ pxor $rndkey0,$rndkey1
+ aesdeclast $in0,$inout1
+ pxor $rndkey0,$iv
+ movdqu 0x70($inp),$rndkey0 # next IV
+ aesdeclast $in1,$inout2
+ lea 0x80($inp),$inp
+ movdqu 0x00($inp_),$in0
+ aesdeclast $in2,$inout3
+ aesdeclast $in3,$inout4
+ movdqu 0x10($inp_),$in1
+ movdqu 0x20($inp_),$in2
+ aesdeclast $in4,$inout5
+ aesdeclast $rndkey1,$inout6
+ movdqu 0x30($inp_),$in3
+ movdqu 0x40($inp_),$in4
+ aesdeclast $iv,$inout7
+ movdqa $rndkey0,$iv # return $iv
+ movdqu 0x50($inp_),$rndkey1
+ $movkey -0x70($key),$rndkey0
+
+ movups $inout0,($out) # store output
+ movdqa $in0,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in1,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in2,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in3,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in4,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey1,$inout5
+ movups $inout6,0x60($out)
+ lea 0x70($out),$out
- movups ($inp),$rndkey1 # re-load input
- movups 0x10($inp),$rndkey0
- xorps $reserved(%rsp),$inout0 # ^= IV
- xorps $rndkey1,$inout1
- movups 0x20($inp),$rndkey1
- xorps $rndkey0,$inout2
- movups 0x30($inp),$rndkey0
- xorps $rndkey1,$inout3
- movups 0x40($inp),$rndkey1
- xorps $rndkey0,$inout4
- movups 0x50($inp),$rndkey0
- xorps $rndkey1,$inout5
- movups 0x60($inp),$rndkey1
- xorps $rndkey0,$inout6
- movups 0x70($inp),$rndkey0 # IV
- xorps $rndkey1,$inout7
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- mov $rnds_,$rounds # restore $rounds
- movups $inout4,0x40($out)
- mov $key_,$key # restore $key
- movups $inout5,0x50($out)
- lea 0x80($inp),$inp
- movups $inout6,0x60($out)
- lea 0x70($out),$out
sub \$0x80,$len
ja .Lcbc_dec_loop8
movaps $inout7,$inout0
- movaps $rndkey0,$iv
+ lea -0x70($key),$key
add \$0x70,$len
- jle .Lcbc_dec_tail_collected
- movups $inout0,($out)
- lea 1($rnds_,$rnds_),$rounds
+ jle .Lcbc_dec_clear_tail_collected
+ movups $inout7,($out)
+ lea 0x10($out),$out
+ cmp \$0x50,$len
+ jbe .Lcbc_dec_tail
+
+ movaps $in0,$inout0
+.Lcbc_dec_six_or_seven:
+ cmp \$0x60,$len
+ ja .Lcbc_dec_seven
+
+ movaps $inout5,$inout6
+ call _aesni_decrypt6
+ pxor $iv,$inout0 # ^= IV
+ movaps $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ pxor $inout4,$inout4
+ lea 0x50($out),$out
+ movdqa $inout5,$inout0
+ pxor $inout5,$inout5
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+ movups 0x60($inp),$inout6
+ xorps $inout7,$inout7
+ call _aesni_decrypt8
+ movups 0x50($inp),$inout7
+ pxor $iv,$inout0 # ^= IV
+ movups 0x60($inp),$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ pxor $in4,$inout5
+ movdqu $inout4,0x40($out)
+ pxor $inout4,$inout4
+ pxor $inout7,$inout6
+ movdqu $inout5,0x50($out)
+ pxor $inout5,$inout5
+ lea 0x60($out),$out
+ movdqa $inout6,$inout0
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_loop6:
+ movups $inout5,($out)
+ lea 0x10($out),$out
+ movdqu 0x00($inp),$inout0 # load input
+ movdqu 0x10($inp),$inout1
+ movdqa $inout0,$in0
+ movdqu 0x20($inp),$inout2
+ movdqa $inout1,$in1
+ movdqu 0x30($inp),$inout3
+ movdqa $inout2,$in2
+ movdqu 0x40($inp),$inout4
+ movdqa $inout3,$in3
+ movdqu 0x50($inp),$inout5
+ movdqa $inout4,$in4
+.Lcbc_dec_loop6_enter:
+ lea 0x60($inp),$inp
+ movdqa $inout5,$inout6
+
+ call _aesni_decrypt6
+
+ pxor $iv,$inout0 # ^= IV
+ movdqa $inout6,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout4
+ mov $key_,$key
+ movdqu $inout3,0x30($out)
+ pxor $in4,$inout5
+ mov $rnds_,$rounds
+ movdqu $inout4,0x40($out)
+ lea 0x50($out),$out
+ sub \$0x60,$len
+ ja .Lcbc_dec_loop6
+
+ movdqa $inout5,$inout0
+ add \$0x50,$len
+ jle .Lcbc_dec_clear_tail_collected
+ movups $inout5,($out)
lea 0x10($out),$out
+
.Lcbc_dec_tail:
movups ($inp),$inout0
- movaps $inout0,$in0
- cmp \$0x10,$len
- jbe .Lcbc_dec_one
+ sub \$0x10,$len
+ jbe .Lcbc_dec_one # $len is 1*16 or less
movups 0x10($inp),$inout1
- movaps $inout1,$in1
- cmp \$0x20,$len
- jbe .Lcbc_dec_two
+ movaps $inout0,$in0
+ sub \$0x10,$len
+ jbe .Lcbc_dec_two # $len is 2*16 or less
movups 0x20($inp),$inout2
- movaps $inout2,$in2
- cmp \$0x30,$len
- jbe .Lcbc_dec_three
+ movaps $inout1,$in1
+ sub \$0x10,$len
+ jbe .Lcbc_dec_three # $len is 3*16 or less
movups 0x30($inp),$inout3
- cmp \$0x40,$len
- jbe .Lcbc_dec_four
-
- movups 0x40($inp),$inout4
- cmp \$0x50,$len
- jbe .Lcbc_dec_five
-
- movups 0x50($inp),$inout5
- cmp \$0x60,$len
- jbe .Lcbc_dec_six
+ movaps $inout2,$in2
+ sub \$0x10,$len
+ jbe .Lcbc_dec_four # $len is 4*16 or less
- movups 0x60($inp),$inout6
- movaps $iv,$reserved(%rsp) # save IV
- call _aesni_decrypt8
- movups ($inp),$rndkey1
- movups 0x10($inp),$rndkey0
- xorps $reserved(%rsp),$inout0 # ^= IV
- xorps $rndkey1,$inout1
- movups 0x20($inp),$rndkey1
- xorps $rndkey0,$inout2
- movups 0x30($inp),$rndkey0
- xorps $rndkey1,$inout3
- movups 0x40($inp),$rndkey1
- xorps $rndkey0,$inout4
- movups 0x50($inp),$rndkey0
- xorps $rndkey1,$inout5
- movups 0x60($inp),$iv # IV
- xorps $rndkey0,$inout6
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- movups $inout4,0x40($out)
- movups $inout5,0x50($out)
- lea 0x60($out),$out
- movaps $inout6,$inout0
- sub \$0x70,$len
+ movups 0x40($inp),$inout4 # $len is 5*16 or less
+ movaps $inout3,$in3
+ movaps $inout4,$in4
+ xorps $inout5,$inout5
+ call _aesni_decrypt6
+ pxor $iv,$inout0
+ movaps $in4,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ pxor $in3,$inout4
+ movdqu $inout3,0x30($out)
+ pxor $inout3,$inout3
+ lea 0x40($out),$out
+ movdqa $inout4,$inout0
+ pxor $inout4,$inout4
+ pxor $inout5,$inout5
+ sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
+
.align 16
.Lcbc_dec_one:
+ movaps $inout0,$in0
___
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps $iv,$inout0
movaps $in0,$iv
- sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
- xorps $inout2,$inout2
- call _aesni_decrypt3
- xorps $iv,$inout0
- xorps $in0,$inout1
- movups $inout0,($out)
+ movaps $inout1,$in1
+ call _aesni_decrypt2
+ pxor $iv,$inout0
movaps $in1,$iv
- movaps $inout1,$inout0
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ movdqa $inout1,$inout0
+ pxor $inout1,$inout1 # clear register bank
lea 0x10($out),$out
- sub \$0x20,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
+ movaps $inout2,$in2
call _aesni_decrypt3
- xorps $iv,$inout0
- xorps $in0,$inout1
- movups $inout0,($out)
- xorps $in1,$inout2
- movups $inout1,0x10($out)
+ pxor $iv,$inout0
movaps $in2,$iv
- movaps $inout2,$inout0
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ movdqa $inout2,$inout0
+ pxor $inout2,$inout2
lea 0x20($out),$out
- sub \$0x30,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
+ movaps $inout3,$in3
call _aesni_decrypt4
- xorps $iv,$inout0
- movups 0x30($inp),$iv
- xorps $in0,$inout1
- movups $inout0,($out)
- xorps $in1,$inout2
- movups $inout1,0x10($out)
- xorps $in2,$inout3
- movups $inout2,0x20($out)
- movaps $inout3,$inout0
+ pxor $iv,$inout0
+ movaps $in3,$iv
+ pxor $in0,$inout1
+ movdqu $inout0,($out)
+ pxor $in1,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $inout1,$inout1 # clear register bank
+ pxor $in2,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $inout2,$inout2
+ movdqa $inout3,$inout0
+ pxor $inout3,$inout3
lea 0x30($out),$out
- sub \$0x40,$len
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_five:
- xorps $inout5,$inout5
- call _aesni_decrypt6
- movups 0x10($inp),$rndkey1
- movups 0x20($inp),$rndkey0
- xorps $iv,$inout0
- xorps $in0,$inout1
- xorps $rndkey1,$inout2
- movups 0x30($inp),$rndkey1
- xorps $rndkey0,$inout3
- movups 0x40($inp),$iv
- xorps $rndkey1,$inout4
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- lea 0x40($out),$out
- movaps $inout4,$inout0
- sub \$0x50,$len
- jmp .Lcbc_dec_tail_collected
-.align 16
-.Lcbc_dec_six:
- call _aesni_decrypt6
- movups 0x10($inp),$rndkey1
- movups 0x20($inp),$rndkey0
- xorps $iv,$inout0
- xorps $in0,$inout1
- xorps $rndkey1,$inout2
- movups 0x30($inp),$rndkey1
- xorps $rndkey0,$inout3
- movups 0x40($inp),$rndkey0
- xorps $rndkey1,$inout4
- movups 0x50($inp),$iv
- xorps $rndkey0,$inout5
- movups $inout0,($out)
- movups $inout1,0x10($out)
- movups $inout2,0x20($out)
- movups $inout3,0x30($out)
- movups $inout4,0x40($out)
- lea 0x50($out),$out
- movaps $inout5,$inout0
- sub \$0x60,$len
jmp .Lcbc_dec_tail_collected
+
.align 16
+.Lcbc_dec_clear_tail_collected:
+ pxor $inout1,$inout1 # clear register bank
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+___
+$code.=<<___ if (!$win64);
+ pxor $inout4,$inout4 # %xmm6..9
+ pxor $inout5,$inout5
+ pxor $inout6,$inout6
+ pxor $inout7,$inout7
+___
+$code.=<<___;
.Lcbc_dec_tail_collected:
- and \$15,$len
movups $iv,($ivp)
+ and \$15,$len
jnz .Lcbc_dec_tail_partial
movups $inout0,($out)
+ pxor $inout0,$inout0
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
- movaps $inout0,$reserved(%rsp)
+ movaps $inout0,(%rsp)
+ pxor $inout0,$inout0
mov \$16,%rcx
mov $out,%rdi
sub $len,%rcx
- lea $reserved(%rsp),%rsi
- .long 0x9066A4F3 # rep movsb
+ lea (%rsp),%rsi
+ .long 0x9066A4F3 # rep movsb
+ movdqa $inout0,(%rsp)
.Lcbc_dec_ret:
+ xorps $rndkey0,$rndkey0 # %xmm0
+ pxor $rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- lea 0x58(%rsp),%rsp
+ movaps 0x10(%rsp),%xmm6
+ movaps %xmm0,0x10(%rsp) # clear stack
+ movaps 0x20(%rsp),%xmm7
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm8
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm9
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm10
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm11
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm12
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm13
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm14
+ movaps %xmm0,0x90(%rsp)
+ movaps 0xa0(%rsp),%xmm15
+ movaps %xmm0,0xa0(%rsp)
___
$code.=<<___;
+ mov -8(%r11),%rbp
+.cfi_restore %rbp
+ lea (%r11),%rsp
+.cfi_def_cfa_register %rsp
.Lcbc_ret:
ret
+.cfi_endproc
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
} \f
-# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
# int bits, AES_KEY *key)
+#
+# input: $inp user-supplied key
+# $bits $inp length in bits
+# $key pointer to key schedule
+# output: %eax 0 denoting success, -1 or -2 - failure (see C)
+# *$key key schedule
+#
{ my ($inp,$bits,$key) = @_4args;
$bits =~ s/%r/%e/;
.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align 16
${PREFIX}_set_decrypt_key:
+.cfi_startproc
.byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+.cfi_adjust_cfa_offset 8
call __aesni_set_encrypt_key
shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
test %eax,%eax
$movkey ($key),%xmm0 # inverse middle
aesimc %xmm0,%xmm0
+ pxor %xmm1,%xmm1
$movkey %xmm0,($inp)
+ pxor %xmm0,%xmm0
.Ldec_key_ret:
add \$8,%rsp
+.cfi_adjust_cfa_offset -8
ret
+.cfi_endproc
.LSEH_end_set_decrypt_key:
.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___
# Vinodh Gopal <vinodh.gopal@intel.com>
# Kahraman Akdemir
#
-# Agressively optimized in respect to aeskeygenassist's critical path
+# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
+# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
+# int bits, AES_KEY * const key);
+#
+# input: $inp user-supplied key
+# $bits $inp length in bits
+# $key pointer to key schedule
+# output: %eax 0 denoting success, -1 or -2 - failure (see C)
+# $bits rounds-1 (used in aesni_set_decrypt_key)
+# *$key key schedule
+# $key pointer to key schedule (used in
+# aesni_set_decrypt_key)
+#
+# Subroutine is frame-less, which means that only volatile registers
+# are used. Note that it's declared "abi-omnipotent", which means that
+# the number of volatile registers is smaller on Windows.
+#
$code.=<<___;
.globl ${PREFIX}_set_encrypt_key
.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align 16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
+.cfi_startproc
.byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+.cfi_adjust_cfa_offset 8
mov \$-1,%rax
test $inp,$inp
jz .Lenc_key_ret
test $key,$key
jz .Lenc_key_ret
+ mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
movups ($inp),%xmm0 # pull first 128 bits of *userKey
xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
- lea 16($key),%rax
+ and OPENSSL_ia32cap_P+4(%rip),%r10d
+ lea 16($key),%rax # %rax is used as modifiable copy of $key
cmp \$256,$bits
je .L14rounds
cmp \$192,$bits
.L10rounds:
mov \$9,$bits # 10 rounds for 128-bit key
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L10rounds_alt
+
$movkey %xmm0,($key) # round 0
aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
call .Lkey_expansion_128_cold
xor %eax,%eax
jmp .Lenc_key_ret
+.align 16
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ mov \$8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,($key)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+ lea 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ dec %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ mov $bits,96(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
.align 16
.L12rounds:
movq 16($inp),%xmm2 # remaining 1/3 of *userKey
mov \$11,$bits # 12 rounds for 192
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L12rounds_alt
+
$movkey %xmm0,($key) # round 0
aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
call .Lkey_expansion_192a_cold
xor %rax, %rax
jmp .Lenc_key_ret
+.align 16
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$8,%r10d
+ movdqu %xmm0,($key)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+ pslld \$1, %xmm4
+ lea 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd \$0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ dec %r10d
+ jnz .Loop_key192
+
+ mov $bits,32(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
.align 16
.L14rounds:
 movups 16($inp),%xmm2 # remaining half of *userKey
mov \$13,$bits # 14 rounds for 256
lea 16(%rax),%rax
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L14rounds_alt
+
$movkey %xmm0,($key) # round 0
$movkey %xmm2,16($key) # round 1
aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
xor %rax,%rax
jmp .Lenc_key_ret
+.align 16
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$7,%r10d
+ movdqu %xmm0,0($key)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16($key)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld \$1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ dec %r10d
+ jz .Ldone_key256
+
+ pshufd \$0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+ aesenclast %xmm3,%xmm2
+
+ movdqa %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ lea 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ mov $bits,16(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
.align 16
.Lbad_keybits:
mov \$-2,%rax
.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
add \$8,%rsp
+.cfi_adjust_cfa_offset -8
ret
+.cfi_endproc
.LSEH_end_set_encrypt_key:
\f
.align 16
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
+.Lincrement1:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+ .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+ .long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+ .long 1,1,1,1
+.Lkey_rcon1b:
+ .long 0x1b,0x1b,0x1b,0x1b
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
.extern __imp_RtlVirtualUnwind
___
$code.=<<___ if ($PREFIX eq "aesni");
-.type ecb_se_handler,\@abi-omnipotent
-.align 16
-ecb_se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 152($context),%rax # pull context->Rsp
-
- jmp .Lcommon_seh_tail
-.size ecb_se_handler,.-ecb_se_handler
-
-.type ccm64_se_handler,\@abi-omnipotent
+.type ecb_ccm64_se_handler,\@abi-omnipotent
.align 16
-ccm64_se_handler:
+ecb_ccm64_se_handler:
push %rsi
push %rdi
push %rbx
lea 0x58(%rax),%rax # adjust stack pointer
jmp .Lcommon_seh_tail
-.size ccm64_se_handler,.-ccm64_se_handler
+.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
-.type ctr32_se_handler,\@abi-omnipotent
+.type ctr_xts_se_handler,\@abi-omnipotent
.align 16
-ctr32_se_handler:
+ctr_xts_se_handler:
push %rsi
push %rdi
push %rbx
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
- lea .Lctr32_body(%rip),%r10
- cmp %r10,%rbx # context->Rip<"prologue" label
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
- lea .Lctr32_ret(%rip),%r10
- cmp %r10,%rbx
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- lea 0x20(%rax),%rsi # %xmm save area
- lea 512($context),%rdi # &context.Xmm6
+ mov 208($context),%rax # pull context->R11
+
+ lea -0xa8(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0xc8(%rax),%rax # adjust stack pointer
+ mov -8(%rax),%rbp # restore saved %rbp
+ mov %rbp,160($context) # restore context->Rbp
jmp .Lcommon_seh_tail
-.size ctr32_se_handler,.-ctr32_se_handler
+.size ctr_xts_se_handler,.-ctr_xts_se_handler
-.type xts_se_handler,\@abi-omnipotent
+.type ocb_se_handler,\@abi-omnipotent
.align 16
-xts_se_handler:
+ocb_se_handler:
push %rsi
push %rdi
push %rbx
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
- mov 152($context),%rax # pull context->Rsp
-
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- lea 0x60(%rax),%rsi # %xmm save area
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10
+ cmp %r10,%rbx # context->Rip>=pop label
+ jae .Locb_no_xmm
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea (%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0x68+160(%rax),%rax # adjust stack pointer
+ lea 0xa0+0x28(%rax),%rax
+
+.Locb_no_xmm:
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
jmp .Lcommon_seh_tail
-.size xts_se_handler,.-xts_se_handler
+.size ocb_se_handler,.-ocb_se_handler
___
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
mov 152($context),%rax # pull context->Rsp
mov 248($context),%rbx # pull context->Rip
- lea .Lcbc_decrypt(%rip),%r10
+ lea .Lcbc_decrypt_bulk(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lcommon_seh_tail
+ mov 120($context),%rax # pull context->Rax
+
lea .Lcbc_decrypt_body(%rip),%r10
cmp %r10,%rbx # context->Rip<cbc_decrypt_body
- jb .Lrestore_cbc_rax
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
lea .Lcbc_ret(%rip),%r10
cmp %r10,%rbx # context->Rip>="epilogue" label
jae .Lcommon_seh_tail
- lea 0(%rax),%rsi # top of stack
+ lea 16(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
- mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0x58(%rax),%rax # adjust stack pointer
- jmp .Lcommon_seh_tail
-.Lrestore_cbc_rax:
- mov 120($context),%rax
+ mov 208($context),%rax # pull context->R11
+
+ mov -8(%rax),%rbp # restore saved %rbp
+ mov %rbp,160($context) # restore context->Rbp
.Lcommon_seh_tail:
mov 8(%rax),%rdi
.rva .LSEH_begin_aesni_xts_decrypt
.rva .LSEH_end_aesni_xts_decrypt
.rva .LSEH_info_xts_dec
+
+ .rva .LSEH_begin_aesni_ocb_encrypt
+ .rva .LSEH_end_aesni_ocb_encrypt
+ .rva .LSEH_info_ocb_enc
+
+ .rva .LSEH_begin_aesni_ocb_decrypt
+ .rva .LSEH_end_aesni_ocb_decrypt
+ .rva .LSEH_info_ocb_dec
___
$code.=<<___;
.rva .LSEH_begin_${PREFIX}_cbc_encrypt
$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
.byte 9,0,0,0
- .rva ecb_se_handler
+ .rva ecb_ccm64_se_handler
+ .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
.LSEH_info_ccm64_enc:
.byte 9,0,0,0
- .rva ccm64_se_handler
+ .rva ecb_ccm64_se_handler
.rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
.LSEH_info_ccm64_dec:
.byte 9,0,0,0
- .rva ccm64_se_handler
+ .rva ecb_ccm64_se_handler
.rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
.LSEH_info_ctr32:
.byte 9,0,0,0
- .rva ctr32_se_handler
+ .rva ctr_xts_se_handler
+ .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
.LSEH_info_xts_enc:
.byte 9,0,0,0
- .rva xts_se_handler
+ .rva ctr_xts_se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.LSEH_info_xts_dec:
.byte 9,0,0,0
- .rva xts_se_handler
+ .rva ctr_xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+.LSEH_info_ocb_enc:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
+ .rva .Locb_enc_pop
+ .long 0
+.LSEH_info_ocb_dec:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
+ .rva .Locb_dec_pop
+ .long 0
___
$code.=<<___;
.LSEH_info_cbc:
push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
return ".byte\t".join(',',@opcode);
}
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
return $line;
}
+sub movbe {  # hand-encode "movbe %eax,disp(%rsp)": returns a .byte directive; sole argument is the decimal displacement captured by the substitution below
+ ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;  # 0F 38 F1 = MOVBE store, ModR/M 0x44 + SIB 0x24 = disp8(%rsp) with %eax; displacement appended as-is (assumes it fits in one byte)
+}
+
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
+$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
print $code;