From f8501464cc8fd8b7b4983462944a1894b157d735 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 22 May 2011 18:38:00 +0000 Subject: [PATCH 1/1] aesni-x86[_64].pl: optimize for Sandy Bridge and add XTS mode. --- crypto/aes/asm/aesni-x86.pl | 1634 ++++++++++++++++++----- crypto/aes/asm/aesni-x86_64.pl | 2251 ++++++++++++++++++++++++++------ 2 files changed, 3212 insertions(+), 673 deletions(-) diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl index 1a1bf539cd..712149ab4b 100644 --- a/crypto/aes/asm/aesni-x86.pl +++ b/crypto/aes/asm/aesni-x86.pl @@ -27,7 +27,21 @@ # Lower ratios for smaller block sizes are perfectly understandable, # because function call overhead is higher in 32-bit mode. Largest # 8-KB block performance is virtually same: 32-bit code is less than -# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. +# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. + +# January 2011 +# +# See aesni-x86_64.pl for details. Unlike x86_64 version this module +# interleaves at most 6 aes[enc|dec] instructions, because there are +# not enough registers for 8x interleave [which should be optimal for +# Sandy Bridge]. Actually, performance results for 6x interleave +# factor presented in aesni-x86_64.pl (except for CTR) are for this +# module. + +# April 2011 +# +# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for @@ -51,14 +65,14 @@ $out="edi"; $rounds_="ebx"; # backup copy for $rounds $key_="ebp"; # backup copy for $key -$inout0="xmm0"; -$inout1="xmm1"; -$inout2="xmm2"; -$rndkey0="xmm3"; -$rndkey1="xmm4"; -$ivec="xmm5"; -$in0="xmm6"; -$in1="xmm7"; $inout3="xmm7"; +$rndkey0="xmm0"; +$rndkey1="xmm1"; +$inout0="xmm2"; +$inout1="xmm3"; +$inout2="xmm4"; +$inout3="xmm5"; $in1="xmm5"; +$inout4="xmm6"; $in0="xmm6"; +$inout5="xmm7"; $ivec="xmm7"; # AESNI extenstion sub aeskeygenassist @@ -80,13 +94,15 @@ sub aesdeclast { aescommon(0xdf,@_); } # Inline version of internal aesni_[en|de]crypt1 { my $sn; sub aesni_inline_generate1 -{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); +{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); $sn++; - &movdqu ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($ivec,$rndkey0) if (defined($ivec)); &lea ($key,&DWP(32,$key)); - &pxor ($inout,$rndkey0); + &xorps ($inout,$ivec) if (defined($ivec)); + &xorps ($inout,$rndkey0) if (!defined($ivec)); &set_label("${p}1_loop_$sn"); eval"&aes${p} ($inout,$rndkey1)"; &dec ($rounds); @@ -100,9 +116,9 @@ sub aesni_generate1 # fully unrolled loop { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); &function_begin_B("_aesni_${p}rypt1"); - &movdqu ($rndkey0,&QWP(0,$key)); + &movups ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(0x10,$key)); - &pxor ($inout,$rndkey0); + &xorps ($inout,$rndkey0); &$movekey ($rndkey0,&QWP(0x20,$key)); &lea ($key,&DWP(0x30,$key)); &cmp ($rounds,11); @@ -147,7 +163,7 @@ sub aesni_generate1 # fully unrolled loop &function_begin_B("${PREFIX}_encrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); - &movdqu ($inout0,&QWP(0,"eax")); + &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); if ($inline) @@ -163,7 +179,7 @@ sub aesni_generate1 # fully unrolled loop &function_begin_B("${PREFIX}_decrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); - &movdqu ($inout0,&QWP(0,"eax")); + &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); if ($inline) @@ -174,16 +190,19 @@ sub aesni_generate1 # fully unrolled loop &ret (); &function_end_B("${PREFIX}_decrypt"); -# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave -# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] -# latency is 6, it turned out that it can be scheduled only every -# *second* cycle. Thus 3x interleave is the one providing optimal +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave +# factor. Why 3x subroutine were originally used in loops? Even though +# aes[enc|dec] latency was originally 6, it could be scheduled only +# every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. As soon -# as/if Intel improves throughput by making it possible to schedule -# the instructions in question *every* cycles I would have to -# implement 6x interleave and use it in loop... +# This is why it makes no sense to implement 2x subroutine. +# aes[enc|dec] latency in next processor generation is 8, but the +# instructions can be scheduled every cycle. Optimal interleave for +# new processor is therefore 8x, but it's unfeasible to accommodate it +# in XMM registers addreassable in 32-bit mode and therefore 6x is +# used instead... + sub aesni_generate3 { my $p=shift; @@ -192,7 +211,7 @@ sub aesni_generate3 &shr ($rounds,1); &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(32,$key)); - &pxor ($inout0,$rndkey0); + &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); @@ -231,13 +250,13 @@ sub aesni_generate4 &$movekey ($rndkey1,&QWP(16,$key)); &shr ($rounds,1); &lea ($key,&DWP(32,$key)); - &pxor ($inout0,$rndkey0); + &xorps ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); &pxor ($inout2,$rndkey0); &pxor ($inout3,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); - &set_label("${p}3_loop"); + &set_label("${p}4_loop"); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; &dec ($rounds); @@ -250,7 +269,7 @@ sub aesni_generate4 eval"&aes${p} ($inout2,$rndkey0)"; eval"&aes${p} ($inout3,$rndkey0)"; &$movekey ($rndkey0,&QWP(0,$key)); - &jnz (&label("${p}3_loop")); + &jnz (&label("${p}4_loop")); eval"&aes${p} ($inout0,$rndkey1)"; eval"&aes${p} ($inout1,$rndkey1)"; @@ -263,10 +282,73 @@ sub aesni_generate4 &ret(); &function_end_B("_aesni_${p}rypt4"); } + +sub aesni_generate6 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt6"); + &static_label("_aesni_${p}rypt6_enter"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key)); + &lea ($key,&DWP(32,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); # pxor does better here + eval"&aes${p} ($inout0,$rndkey1)"; + &pxor ($inout2,$rndkey0); + eval"&aes${p} ($inout1,$rndkey1)"; + &pxor ($inout3,$rndkey0); + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + &pxor ($inout4,$rndkey0); + eval"&aes${p} ($inout3,$rndkey1)"; + &pxor ($inout5,$rndkey0); + eval"&aes${p} ($inout4,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout5,$rndkey1)"; + &jmp (&label("_aesni_${p}rypt6_enter")); + + &set_label("${p}6_loop",16); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout4,$rndkey1)"; + eval"&aes${p} ($inout5,$rndkey1)"; + &set_label("_aesni_${p}rypt6_enter",16); + &$movekey ($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + eval"&aes${p} ($inout4,$rndkey0)"; + eval"&aes${p} ($inout5,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("${p}6_loop")); + + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout4,$rndkey1)"; + eval"&aes${p} ($inout5,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + eval"&aes${p}last ($inout4,$rndkey0)"; + eval"&aes${p}last ($inout5,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt6"); +} &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); +&aesni_generate6("enc") if ($PREFIX eq "aesni"); +&aesni_generate6("dec"); if ($PREFIX eq "aesni") { ###################################################################### @@ -278,37 +360,62 @@ if ($PREFIX eq "aesni") { &mov ($out,&wparam(1)); &mov ($len,&wparam(2)); &mov ($key,&wparam(3)); - &mov ($rounds,&wparam(4)); - &cmp ($len,16); - &jb (&label("ecb_ret")); + &mov ($rounds_,&wparam(4)); &and ($len,-16); - &test ($rounds,$rounds) + &jz (&label("ecb_ret")); &mov ($rounds,&DWP(240,$key)); + &test ($rounds_,$rounds_); + &jz (&label("ecb_decrypt")); + &mov ($key_,$key); # backup $key &mov ($rounds_,$rounds); # backup $rounds - &jz (&label("ecb_decrypt")); + &cmp ($len,0x60); + &jb (&label("ecb_enc_tail")); + + &movdqu ($inout0,&QWP(0,$inp)); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &sub ($len,0x60); + &jmp (&label("ecb_enc_loop6_enter")); + +&set_label("ecb_enc_loop6",16); + &movups (&QWP(0,$out),$inout0); + &movdqu ($inout0,&QWP(0,$inp)); + &movups (&QWP(0x10,$out),$inout1); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movups (&QWP(0x20,$out),$inout2); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movups (&QWP(0x30,$out),$inout3); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movups (&QWP(0x40,$out),$inout4); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); +&set_label("ecb_enc_loop6_enter"); - &cmp ($len,0x40); - &jbe (&label("ecb_enc_tail")); - &sub ($len,0x40); - &jmp (&label("ecb_enc_loop3")); + &call ("_aesni_encrypt6"); -&set_label("ecb_enc_loop3",16); - &movups ($inout0,&QWP(0,$inp)); - &movups ($inout1,&QWP(0x10,$inp)); - &movups ($inout2,&QWP(0x20,$inp)); - &call ("_aesni_encrypt3"); - &lea ($inp,&DWP(0x30,$inp)); - &movups (&QWP(0,$out),$inout0); &mov ($key,$key_); # restore $key - &movups (&QWP(0x10,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds + &sub ($len,0x60); + &jnc (&label("ecb_enc_loop6")); + + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); - &lea ($out,&DWP(0x30,$out)); - &sub ($len,0x30); - &ja (&label("ecb_enc_loop3")); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &add ($len,0x60); + &jz (&label("ecb_ret")); - &add ($len,0x40); &set_label("ecb_enc_tail"); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x20); @@ -316,14 +423,18 @@ if ($PREFIX eq "aesni") { &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_enc_two")); &movups ($inout2,&QWP(0x20,$inp)); - &cmp ($len,0x30); - &je (&label("ecb_enc_three")); + &cmp ($len,0x40); + &jb (&label("ecb_enc_three")); &movups ($inout3,&QWP(0x30,$inp)); - &call ("_aesni_encrypt4"); + &je (&label("ecb_enc_four")); + &movups ($inout4,&QWP(0x40,$inp)); + &xorps ($inout5,$inout5); + &call ("_aesni_encrypt6"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); jmp (&label("ecb_ret")); &set_label("ecb_enc_one",16); @@ -335,7 +446,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_enc_two",16); - &pxor ($inout2,$inout2); + &xorps ($inout2,$inout2); &call ("_aesni_encrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); @@ -347,29 +458,65 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &jmp (&label("ecb_ret")); + +&set_label("ecb_enc_four",16); + &call ("_aesni_encrypt4"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &jmp (&label("ecb_ret")); ###################################################################### &set_label("ecb_decrypt",16); - &cmp ($len,0x40); - &jbe (&label("ecb_dec_tail")); - &sub ($len,0x40); - &jmp (&label("ecb_dec_loop3")); - -&set_label("ecb_dec_loop3",16); - &movups ($inout0,&QWP(0,$inp)); - &movups ($inout1,&QWP(0x10,$inp)); - &movups ($inout2,&QWP(0x20,$inp)); - &call ("_aesni_decrypt3"); - &lea ($inp,&DWP(0x30,$inp)); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &cmp ($len,0x60); + &jb (&label("ecb_dec_tail")); + + &movdqu ($inout0,&QWP(0,$inp)); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &sub ($len,0x60); + &jmp (&label("ecb_dec_loop6_enter")); + +&set_label("ecb_dec_loop6",16); &movups (&QWP(0,$out),$inout0); - &mov ($key,$key_); # restore $key + &movdqu ($inout0,&QWP(0,$inp)); &movups (&QWP(0x10,$out),$inout1); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movups (&QWP(0x20,$out),$inout2); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movups (&QWP(0x30,$out),$inout3); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movups (&QWP(0x40,$out),$inout4); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); +&set_label("ecb_dec_loop6_enter"); + + &call ("_aesni_decrypt6"); + + &mov ($key,$key_); # restore $key &mov ($rounds,$rounds_); # restore $rounds + &sub ($len,0x60); + &jnc (&label("ecb_dec_loop6")); + + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); - &lea ($out,&DWP(0x30,$out)); - &sub ($len,0x30); - &ja (&label("ecb_dec_loop3")); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &add ($len,0x60); + &jz (&label("ecb_ret")); - &add ($len,0x40); &set_label("ecb_dec_tail"); &movups ($inout0,&QWP(0,$inp)); &cmp ($len,0x20); @@ -377,14 +524,18 @@ if ($PREFIX eq "aesni") { &movups ($inout1,&QWP(0x10,$inp)); &je (&label("ecb_dec_two")); &movups ($inout2,&QWP(0x20,$inp)); - &cmp ($len,0x30); - &je (&label("ecb_dec_three")); + &cmp ($len,0x40); + &jb (&label("ecb_dec_three")); &movups ($inout3,&QWP(0x30,$inp)); - &call ("_aesni_decrypt4"); + &je (&label("ecb_dec_four")); + &movups ($inout4,&QWP(0x40,$inp)); + &xorps ($inout5,$inout5); + &call ("_aesni_decrypt6"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); &jmp (&label("ecb_ret")); &set_label("ecb_dec_one",16); @@ -396,7 +547,7 @@ if ($PREFIX eq "aesni") { &jmp (&label("ecb_ret")); &set_label("ecb_dec_two",16); - &pxor ($inout2,$inout2); + &xorps ($inout2,$inout2); &call ("_aesni_decrypt3"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); @@ -407,6 +558,14 @@ if ($PREFIX eq "aesni") { &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_four",16); + &call ("_aesni_decrypt4"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); &set_label("ecb_ret"); &function_end("aesni_ecb_encrypt"); @@ -420,6 +579,7 @@ if ($PREFIX eq "aesni") { # does not update *ivec! Nor does it finalize CMAC value # (see engine/eng_aesni.c for details) # +{ my $cmac=$inout1; &function_begin("aesni_ccm64_encrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); @@ -433,7 +593,7 @@ if ($PREFIX eq "aesni") { &mov (&DWP(48,"esp"),$key_); &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec - &movdqu ($inout1,&QWP(0,$rounds)); # load cmac + &movdqu ($cmac,&QWP(0,$rounds)); # load cmac # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); @@ -458,27 +618,47 @@ if ($PREFIX eq "aesni") { &movdqa ($inout0,$ivec); &set_label("ccm64_enc_outer"); - &movdqu ($in0,&QWP(0,$inp)); - &pshufb ($inout0,$inout3); - &mov ($key,$key_); - &mov ($rounds,$rounds_); - &pxor ($inout1,$in0); # cmac^=inp - &pxor ($inout2,$inout2); + &movups ($in0,&QWP(0,$inp)); + &pshufb ($inout0,$inout3); + &mov ($key,$key_); + &mov ($rounds,$rounds_); - &call ("_aesni_encrypt3"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($in0,$rndkey0); + &lea ($key,&DWP(32,$key)); + &xorps ($inout0,$rndkey0); + &xorps ($cmac,$in0); # cmac^=inp + &$movekey ($rndkey0,&QWP(0,$key)); + +&set_label("ccm64_enc2_loop"); + &aesenc ($inout0,$rndkey1); + &dec ($rounds); + &aesenc ($cmac,$rndkey1); + &$movekey ($rndkey1,&QWP(16,$key)); + &aesenc ($inout0,$rndkey0); + &lea ($key,&DWP(32,$key)); + &aesenc ($cmac,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("ccm64_enc2_loop")); + &aesenc ($inout0,$rndkey1); + &aesenc ($cmac,$rndkey1); + &aesenclast ($inout0,$rndkey0); + &aesenclast ($cmac,$rndkey0); &paddq ($ivec,&QWP(16,"esp")); &dec ($len); &lea ($inp,&DWP(16,$inp)); - &pxor ($in0,$inout0); # inp^=E(ivec) + &xorps ($in0,$inout0); # inp^=E(ivec) &movdqa ($inout0,$ivec); - &movdqu (&QWP(0,$out),$in0); + &movups (&QWP(0,$out),$in0); &lea ($out,&DWP(16,$out)); &jnz (&label("ccm64_enc_outer")); &mov ("esp",&DWP(48,"esp")); &mov ($out,&wparam(5)); - &movdqu (&QWP(0,$out),$inout1); + &movups (&QWP(0,$out),$cmac); &function_end("aesni_ccm64_encrypt_blocks"); &function_begin("aesni_ccm64_decrypt_blocks"); @@ -494,7 +674,7 @@ if ($PREFIX eq "aesni") { &mov (&DWP(48,"esp"),$key_); &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec - &movdqu ($inout1,&QWP(0,$rounds)); # load cmac + &movdqu ($cmac,&QWP(0,$rounds)); # load cmac # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); @@ -524,35 +704,56 @@ if ($PREFIX eq "aesni") { { &call ("_aesni_encrypt1"); } &set_label("ccm64_dec_outer"); - &movdqu ($in0,&QWP(0,$inp)); &paddq ($ivec,&QWP(16,"esp")); - &dec ($len); - &lea ($inp,&QWP(16,$inp)); - &pxor ($in0,$inout0); + &movups ($in0,&QWP(0,$inp)); # load inp + &xorps ($in0,$inout0); &movdqa ($inout0,$ivec); + &lea ($inp,&QWP(16,$inp)); + &pshufb ($inout0,$inout3); &mov ($key,$key_); &mov ($rounds,$rounds_); - &pshufb ($inout0,$inout3); - &movdqu (&QWP(0,$out),$in0); + &movups (&QWP(0,$out),$in0); &lea ($out,&DWP(16,$out)); + &sub ($len,1); &jz (&label("ccm64_dec_break")); - &pxor ($inout2,$inout2); - &call ("_aesni_encrypt3"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($in0,$rndkey0); + &lea ($key,&DWP(32,$key)); + &xorps ($inout0,$rndkey0); + &xorps ($cmac,$in0); # cmac^=out + &$movekey ($rndkey0,&QWP(0,$key)); +&set_label("ccm64_dec2_loop"); + &aesenc ($inout0,$rndkey1); + &dec ($rounds); + &aesenc ($cmac,$rndkey1); + &$movekey ($rndkey1,&QWP(16,$key)); + &aesenc ($inout0,$rndkey0); + &lea ($key,&DWP(32,$key)); + &aesenc ($cmac,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("ccm64_dec2_loop")); + &aesenc ($inout0,$rndkey1); + &aesenc ($cmac,$rndkey1); + &aesenclast ($inout0,$rndkey0); + &aesenclast ($cmac,$rndkey0); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_break",16); if ($inline) - { &aesni_inline_generate1("enc",$inout1); } + { &aesni_inline_generate1("enc",$cmac,$in0); } else - { &call ("_aesni_encrypt1",$inout1); } + { &call ("_aesni_encrypt1",$cmac); } &mov ("esp",&DWP(48,"esp")); &mov ($out,&wparam(5)); - &movdqu (&QWP(0,$out),$inout1); + &movups (&QWP(0,$out),$cmac); &function_end("aesni_ccm64_decrypt_blocks"); +} ###################################################################### # void aesni_ctr32_encrypt_blocks (const void *in, void *out, @@ -562,6 +763,14 @@ if ($PREFIX eq "aesni") { # Handles only complete blocks, operates on 32-bit counter and # does not update *ivec! (see engine/eng_aesni.c for details) # +# stack layout: +# 0 pshufb mask +# 16 vector addend: 0,6,6,6 +# 32 counter-less ivec +# 48 1st triplet of counter vector +# 64 2nd triplet of counter vector +# 80 saved %esp + &function_begin("aesni_ctr32_encrypt_blocks"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); @@ -569,14 +778,14 @@ if ($PREFIX eq "aesni") { &mov ($key,&wparam(3)); &mov ($rounds_,&wparam(4)); &mov ($key_,"esp"); - &sub ("esp",60); + &sub ("esp",88); &and ("esp",-16); # align stack - &mov (&DWP(48,"esp"),$key_); + &mov (&DWP(80,"esp"),$key_); &cmp ($len,1); &je (&label("ctr32_one_shortcut")); - &movups ($inout3,&QWP(0,$rounds_)); # load ivec + &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); @@ -585,139 +794,167 @@ if ($PREFIX eq "aesni") { &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack - &mov ($rounds,3); + &mov ($rounds,6); &xor ($key_,$key_); &mov (&DWP(16,"esp"),$rounds); &mov (&DWP(20,"esp"),$rounds); &mov (&DWP(24,"esp"),$rounds); &mov (&DWP(28,"esp"),$key_); - &pextrd ($rounds_,$inout3,3); # pull 32-bit counter - &pinsrd ($inout3,$key_,3); # wipe 32-bit counter + &pextrd ($rounds_,$inout5,3); # pull 32-bit counter + &pinsrd ($inout5,$key_,3); # wipe 32-bit counter &mov ($rounds,&DWP(240,$key)); # key->rounds - &movdqa ($rndkey0,&QWP(0,"esp")); # load byte-swap mask - # $ivec is vector of 3 32-bit counters - &pxor ($ivec,$ivec); + # compose 2 vectors of 3x32-bit counters &bswap ($rounds_); - &pinsrd ($ivec,$rounds_,0); + &pxor ($rndkey1,$rndkey1); + &pxor ($rndkey0,$rndkey0); + &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask + &pinsrd ($rndkey1,$rounds_,0); + &lea ($key_,&DWP(3,$rounds_)); + &pinsrd ($rndkey0,$key_,0); &inc ($rounds_); - &pinsrd ($ivec,$rounds_,1); + &pinsrd ($rndkey1,$rounds_,1); + &inc ($key_); + &pinsrd ($rndkey0,$key_,1); &inc ($rounds_); - &pinsrd ($ivec,$rounds_,2); - &pshufb ($ivec,$rndkey0); # byte swap - - &cmp ($len,4); - &jbe (&label("ctr32_tail")); - &movdqa (&QWP(32,"esp"),$inout3); # save counter-less ivec - &mov ($rounds_,$rounds); - &mov ($key_,$key); - &sub ($len,4); - &jmp (&label("ctr32_loop3")); - -&set_label("ctr32_loop3",16); - &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword - &pshufd ($inout1,$ivec,2<<6); - &por ($inout0,$inout3); # merge counter-less ivec - &pshufd ($inout2,$ivec,1<<6); - &por ($inout1,$inout3); - &por ($inout2,$inout3); - - # inline _aesni_encrypt3 and interleave last round - # with own code... - - &$movekey ($rndkey0,&QWP(0,$key)); - &shr ($rounds,1); - &$movekey ($rndkey1,&QWP(16,$key)); - &lea ($key,&DWP(32,$key)); + &pinsrd ($rndkey1,$rounds_,2); + &inc ($key_); + &pinsrd ($rndkey0,$key_,2); + &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet + &pshufb ($rndkey1,$inout0); # byte swap + &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet + &pshufb ($rndkey0,$inout0); # byte swap + + &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword + &pshufd ($inout1,$rndkey1,2<<6); + &cmp ($len,6); + &jb (&label("ctr32_tail")); + &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec + &shr ($rounds,1); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &sub ($len,6); + &jmp (&label("ctr32_loop6")); + +&set_label("ctr32_loop6",16); + &pshufd ($inout2,$rndkey1,1<<6); + &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec + &pshufd ($inout3,$rndkey0,3<<6); + &por ($inout0,$rndkey1); # merge counter-less ivec + &pshufd ($inout4,$rndkey0,2<<6); + &por ($inout1,$rndkey1); + &pshufd ($inout5,$rndkey0,1<<6); + &por ($inout2,$rndkey1); + &por ($inout3,$rndkey1); + &por ($inout4,$rndkey1); + &por ($inout5,$rndkey1); + + # inlining _aesni_encrypt6's prologue gives ~4% improvement... + &$movekey ($rndkey0,&QWP(0,$key_)); + &$movekey ($rndkey1,&QWP(16,$key_)); + &lea ($key,&DWP(32,$key_)); + &dec ($rounds); &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); - &pxor ($inout2,$rndkey0); - &$movekey ($rndkey0,&QWP(0,$key)); - -&set_label("ctr32_enc_loop3"); &aesenc ($inout0,$rndkey1); + &pxor ($inout2,$rndkey0); &aesenc ($inout1,$rndkey1); - &dec ($rounds); + &pxor ($inout3,$rndkey0); &aesenc ($inout2,$rndkey1); - &$movekey ($rndkey1,&QWP(16,$key)); - &aesenc ($inout0,$rndkey0); - &aesenc ($inout1,$rndkey0); - &lea ($key,&DWP(32,$key)); - &aesenc ($inout2,$rndkey0); + &pxor ($inout4,$rndkey0); + &aesenc ($inout3,$rndkey1); + &pxor ($inout5,$rndkey0); + &aesenc ($inout4,$rndkey1); &$movekey ($rndkey0,&QWP(0,$key)); - &jnz (&label("ctr32_enc_loop3")); + &aesenc ($inout5,$rndkey1); - &aesenc ($inout0,$rndkey1); - &aesenc ($inout1,$rndkey1); - &aesenc ($inout2,$rndkey1); - &movdqa ($rndkey1,&QWP(0,"esp")); # load byte-swap mask + &call (&label("_aesni_encrypt6_enter")); + + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout1,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &movdqa ($rndkey0,&QWP(16,"esp")); # load increment + &xorps ($inout2,$rndkey1); + &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + + &paddd ($rndkey1,$rndkey0); # 1st triplet increment + &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment + &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask + + &movups ($inout1,&QWP(0x30,$inp)); + &movups ($inout2,&QWP(0x40,$inp)); + &xorps ($inout3,$inout1); + &movups ($inout1,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet + &pshufb ($rndkey1,$inout0); # byte swap + &xorps ($inout4,$inout2); + &movups (&QWP(0x30,$out),$inout3); + &xorps ($inout5,$inout1); + &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet + &pshufb ($rndkey0,$inout0); # byte swap + &movups (&QWP(0x40,$out),$inout4); + &pshufd ($inout0,$rndkey1,3<<6); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); - &aesenclast ($inout0,$rndkey0); - &pshufb ($ivec,$rndkey1); # byte swap - &movdqu ($in0,&QWP(0,$inp)); - &aesenclast ($inout1,$rndkey0); - &paddd ($ivec,&QWP(16,"esp")); # counter increment - &movdqu ($in1,&QWP(0x10,$inp)); - &aesenclast ($inout2,$rndkey0); - &pshufb ($ivec,$rndkey1); # byte swap - &movdqu ($rndkey0,&QWP(0x20,$inp)); - &lea ($inp,&DWP(0x30,$inp)); - - &pxor ($in0,$inout0); - &mov ($key,$key_); - &pxor ($in1,$inout1); - &movdqu (&QWP(0,$out),$in0); - &pxor ($rndkey0,$inout2); - &movdqu (&QWP(0x10,$out),$in1); - &movdqu (&QWP(0x20,$out),$rndkey0); - &movdqa ($inout3,&QWP(32,"esp")); # load counter-less ivec - - &sub ($len,3); - &lea ($out,&DWP(0x30,$out)); &mov ($rounds,$rounds_); - &ja (&label("ctr32_loop3")); + &pshufd ($inout1,$rndkey1,2<<6); + &sub ($len,6); + &jnc (&label("ctr32_loop6")); - &pextrd ($rounds_,$ivec,1); # might need last counter value - &add ($len,4); - &bswap ($rounds_); + &add ($len,6); + &jz (&label("ctr32_ret")); + &mov ($key,$key_); + &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec &set_label("ctr32_tail"); - &pshufd ($inout0,$ivec,3<<6); - &pshufd ($inout1,$ivec,2<<6); - &por ($inout0,$inout3); + &por ($inout0,$inout5); &cmp ($len,2); &jb (&label("ctr32_one")); - &lea ($rounds_,&DWP(1,$rounds_)); - &pshufd ($inout2,$ivec,1<<6); - &por ($inout1,$inout3); - &je (&label("ctr32_two")); - &bswap ($rounds_); - &por ($inout2,$inout3); - &cmp ($len,3); - &je (&label("ctr32_three")); - - &pinsrd ($inout3,$rounds_,3); # compose last counter value - &call ("_aesni_encrypt4"); + &pshufd ($inout2,$rndkey1,1<<6); + &por ($inout1,$inout5); + &je (&label("ctr32_two")); - &movdqu ($in0,&QWP(0,$inp)); - &movdqu ($rndkey1,&QWP(0x10,$inp)); - &pxor ($in0,$inout0); - &movdqu ($rndkey0,&QWP(0x20,$inp)); - &pxor ($rndkey1,$inout1); - &movdqu ($ivec,&QWP(0x30,$inp)); - &pxor ($rndkey0,$inout2); - &movdqu (&QWP(0,$out),$in0); - &pxor ($ivec,$inout3); - &movdqu (&QWP(0x10,$out),$rndkey1); - &movdqu (&QWP(0x20,$out),$rndkey0); - &movdqu (&QWP(0x30,$out),$ivec); + &pshufd ($inout3,$rndkey0,3<<6); + &por ($inout2,$inout5); + &cmp ($len,4); + &jb (&label("ctr32_three")); + + &pshufd ($inout4,$rndkey0,2<<6); + &por ($inout3,$inout5); + &je (&label("ctr32_four")); + + &por ($inout4,$inout5); + &call ("_aesni_encrypt6"); + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout1,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout2,$rndkey1); + &movups ($rndkey1,&QWP(0x40,$inp)); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout4,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); &jmp (&label("ctr32_ret")); &set_label("ctr32_one_shortcut",16); - &movdqu ($inout0,&QWP(0,$rounds_)); # load ivec + &movups ($inout0,&QWP(0,$rounds_)); # load ivec &mov ($rounds,&DWP(240,$key)); &set_label("ctr32_one"); @@ -725,37 +962,757 @@ if ($PREFIX eq "aesni") { { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } - &movdqu ($in0,&QWP(0,$inp)); - &pxor ($in0,$inout0); - &movdqu (&QWP(0,$out),$in0); + &movups ($in0,&QWP(0,$inp)); + &xorps ($in0,$inout0); + &movups (&QWP(0,$out),$in0); &jmp (&label("ctr32_ret")); &set_label("ctr32_two",16); - &pxor ($inout2,$inout2); &call ("_aesni_encrypt3"); - &movdqu ($in0,&QWP(0,$inp)); - &movdqu ($in1,&QWP(0x10,$inp)); - &pxor ($in0,$inout0); - &pxor ($in1,$inout1); - &movdqu (&QWP(0,$out),$in0); - &movdqu (&QWP(0x10,$out),$in1); + &movups ($inout3,&QWP(0,$inp)); + &movups ($inout4,&QWP(0x10,$inp)); + &xorps ($inout0,$inout3); + &xorps ($inout1,$inout4); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); &jmp (&label("ctr32_ret")); &set_label("ctr32_three",16); &call ("_aesni_encrypt3"); - &movdqu ($in0,&QWP(0,$inp)); - &movdqu ($in1,&QWP(0x10,$inp)); - &movdqu ($rndkey1,&QWP(0x20,$inp)); - &pxor ($in0,$inout0); - &pxor ($in1,$inout1); - &movdqu (&QWP(0,$out),$in0); - &pxor ($rndkey1,$inout2); - &movdqu (&QWP(0x10,$out),$in1); - &movdqu (&QWP(0x20,$out),$rndkey1); + &movups ($inout3,&QWP(0,$inp)); + &movups ($inout4,&QWP(0x10,$inp)); + &xorps ($inout0,$inout3); + &movups ($inout5,&QWP(0x20,$inp)); + &xorps ($inout1,$inout4); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$inout5); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_four",16); + &call ("_aesni_encrypt4"); + &movups ($inout4,&QWP(0,$inp)); + &movups ($inout5,&QWP(0x10,$inp)); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout0,$inout4); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout1,$inout5); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); &set_label("ctr32_ret"); - &mov ("esp",&DWP(48,"esp")); + &mov ("esp",&DWP(80,"esp")); &function_end("aesni_ctr32_encrypt_blocks"); + +###################################################################### +# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2 +# const unsigned char iv[16]); +# +{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); + +&function_begin("aesni_xts_encrypt"); + &mov ($key,&wparam(4)); # key2 + &mov ($inp,&wparam(5)); # clear-text tweak + + &mov ($rounds,&DWP(240,$key)); # key2->rounds + &movups ($inout0,&QWP(0,$inp)); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); # key1 + + &mov ($key_,"esp"); + &sub ("esp",16*7+8); + &mov ($rounds,&DWP(240,$key)); # key1->rounds + &and ("esp",-16); # align stack + + &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant + &mov (&DWP(16*6+4,"esp"),0); + &mov (&DWP(16*6+8,"esp"),1); + &mov (&DWP(16*6+12,"esp"),0); + &mov (&DWP(16*7+0,"esp"),$len); # save original $len + &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp + + &movdqa ($tweak,$inout0); + &pxor ($twtmp,$twtmp); + &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + + &and ($len,-16); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &sub ($len,16*6); + &jc (&label("xts_enc_short")); + + &shr ($rounds,1); + &mov ($rounds_,$rounds); + &jmp (&label("xts_enc_loop6")); + +&set_label("xts_enc_loop6",16); + for ($i=0;$i<4;$i++) { + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa (&QWP(16*$i,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + } + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*$i++,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &$movekey ($rndkey0,&QWP(0,$key_)); + &pand ($inout5,$twmask); # isolate carry and residue + &movups ($inout0,&QWP(0,$inp)); # load input + &pxor ($inout5,$tweak); + + # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] + &movdqu ($inout1,&QWP(16*1,$inp)); + &xorps ($inout0,$rndkey0); # input^=rndkey[0] + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout1,$rndkey0); + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout2,$rndkey0); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout3,$rndkey0); + &movdqu ($rndkey1,&QWP(16*5,$inp)); + &pxor ($inout4,$rndkey0); + &lea ($inp,&DWP(16*6,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak + &pxor ($inout5,$rndkey1); + + &$movekey ($rndkey1,&QWP(16,$key_)); + &lea ($key,&DWP(32,$key_)); + &pxor ($inout1,&QWP(16*1,"esp")); + &aesenc ($inout0,$rndkey1); + &pxor ($inout2,&QWP(16*2,"esp")); + &aesenc ($inout1,$rndkey1); + &pxor ($inout3,&QWP(16*3,"esp")); + &dec ($rounds); + &aesenc ($inout2,$rndkey1); + &pxor ($inout4,&QWP(16*4,"esp")); + &aesenc ($inout3,$rndkey1); + &pxor ($inout5,$rndkey0); + &aesenc ($inout4,$rndkey1); + &$movekey ($rndkey0,&QWP(0,$key)); + &aesenc ($inout5,$rndkey1); + &call (&label("_aesni_encrypt6_enter")); + + &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak + &pxor ($twtmp,$twtmp); + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &xorps ($inout1,&QWP(16*1,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*2,$out),$inout2); + &xorps ($inout4,&QWP(16*4,"esp")); + &movups (&QWP(16*3,$out),$inout3); + &xorps ($inout5,$tweak); + &movups (&QWP(16*4,$out),$inout4); + &pshufd ($twres,$twtmp,0x13); + &movups (&QWP(16*5,$out),$inout5); + &lea ($out,&DWP(16*6,$out)); + &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 + + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov ($rounds,$rounds_); # restore $rounds + &pxor ($tweak,$twres); + + &sub ($len,16*6); + &jnc (&label("xts_enc_loop6")); + + &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &mov ($key,$key_); # restore $key + &mov ($rounds_,$rounds); + +&set_label("xts_enc_short"); + &add ($len,16*6); + &jz (&label("xts_enc_done6x")); + + &movdqa ($inout3,$tweak); # put aside previous tweak + &cmp ($len,0x20); + &jb (&label("xts_enc_one")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &je (&label("xts_enc_two")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout4,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &cmp ($len,0x40); + &jb (&label("xts_enc_three")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout5,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &movdqa (&QWP(16*0,"esp"),$inout3); + &movdqa (&QWP(16*1,"esp"),$inout4); + &je (&label("xts_enc_four")); + + &movdqa (&QWP(16*2,"esp"),$inout5); + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*3,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($inout0,1); + &pand ($inout5,$twmask); # isolate carry and residue + &pxor ($inout5,$tweak); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout1,&QWP(16*1,"esp")); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout2,&QWP(16*2,"esp")); + &lea ($inp,&DWP(16*5,$inp)); + &pxor ($inout3,&QWP(16*3,"esp")); + &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak + &pxor ($inout4,$inout5); + + &call ("_aesni_encrypt6"); + + &movaps ($tweak,&QWP(16*4,"esp")); # last tweak + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout4,$tweak); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &movups (&QWP(16*4,$out),$inout4); + &lea ($out,&DWP(16*5,$out)); + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_one",16); + &movups ($inout0,&QWP(16*0,$inp)); # load input + &lea ($inp,&DWP(16*1,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(16*0,$out),$inout0); # write output + &lea ($out,&DWP(16*1,$out)); + + &movdqa ($tweak,$inout3); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_two",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &lea ($inp,&DWP(16*2,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout2); + + &call ("_aesni_encrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &lea ($out,&DWP(16*2,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_three",16); + &movaps ($inout5,$tweak); # put aside last tweak + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &lea ($inp,&DWP(16*3,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + + &call ("_aesni_encrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &lea ($out,&DWP(16*3,$out)); + + &movdqa ($tweak,$inout5); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_four",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movups ($inout3,&QWP(16*3,$inp)); + &lea ($inp,&DWP(16*4,$inp)); + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &xorps ($inout3,$inout4); + + &call ("_aesni_encrypt4"); + + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,$inout4); + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &lea ($out,&DWP(16*4,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_done6x",16); # $tweak is pre-calculated + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &and ($len,15); + &jz (&label("xts_enc_ret")); + &movdqa ($inout3,$tweak); + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &jmp (&label("xts_enc_steal")); + +&set_label("xts_enc_done",16); + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &pxor ($twtmp,$twtmp); + &and ($len,15); + &jz (&label("xts_enc_ret")); + + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &pshufd ($inout3,$twtmp,0x13); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue + &pxor ($inout3,$tweak); + +&set_label("xts_enc_steal"); + &movz ($rounds,&BP(0,$inp)); + &movz ($key,&BP(-16,$out)); + &lea ($inp,&DWP(1,$inp)); + &mov (&BP(-16,$out),&LB($rounds)); + &mov (&BP(0,$out),&LB($key)); + &lea ($out,&DWP(1,$out)); + &sub ($len,1); + &jnz (&label("xts_enc_steal")); + + &sub ($out,&DWP(16*7+0,"esp")); # rewind $out + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + + &movups ($inout0,&QWP(-16,$out)); # load input + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(-16,$out),$inout0); # write output + +&set_label("xts_enc_ret"); + &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp +&function_end("aesni_xts_encrypt"); + +&function_begin("aesni_xts_decrypt"); + &mov ($key,&wparam(4)); # key2 + &mov ($inp,&wparam(5)); # clear-text tweak + + &mov ($rounds,&DWP(240,$key)); # key2->rounds + &movups ($inout0,&QWP(0,$inp)); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); # key1 + + &mov ($key_,"esp"); + &sub ("esp",16*7+8); + &and ("esp",-16); # align stack + + &xor ($rounds_,$rounds_); # if(len%16) len-=16; + &test ($len,15); + &setnz (&LB($rounds_)); + &shl ($rounds_,4); + &sub ($len,$rounds_); + + &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant + &mov (&DWP(16*6+4,"esp"),0); + &mov (&DWP(16*6+8,"esp"),1); + &mov (&DWP(16*6+12,"esp"),0); + &mov (&DWP(16*7+0,"esp"),$len); # save original $len + &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp + + &mov ($rounds,&DWP(240,$key)); # key1->rounds + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + + &movdqa ($tweak,$inout0); + &pxor ($twtmp,$twtmp); + &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + + &and ($len,-16); + &sub ($len,16*6); + &jc (&label("xts_dec_short")); + + &shr ($rounds,1); + &mov ($rounds_,$rounds); + &jmp (&label("xts_dec_loop6")); + +&set_label("xts_dec_loop6",16); + for ($i=0;$i<4;$i++) { + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa (&QWP(16*$i,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + } + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*$i++,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &$movekey ($rndkey0,&QWP(0,$key_)); + &pand ($inout5,$twmask); # isolate carry and residue + &movups ($inout0,&QWP(0,$inp)); # load input + &pxor ($inout5,$tweak); + + # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] + &movdqu ($inout1,&QWP(16*1,$inp)); + &xorps ($inout0,$rndkey0); # input^=rndkey[0] + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout1,$rndkey0); + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout2,$rndkey0); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout3,$rndkey0); + &movdqu ($rndkey1,&QWP(16*5,$inp)); + &pxor ($inout4,$rndkey0); + &lea ($inp,&DWP(16*6,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak + &pxor ($inout5,$rndkey1); + + &$movekey ($rndkey1,&QWP(16,$key_)); + &lea ($key,&DWP(32,$key_)); + &pxor ($inout1,&QWP(16*1,"esp")); + &aesdec ($inout0,$rndkey1); + &pxor ($inout2,&QWP(16*2,"esp")); + &aesdec ($inout1,$rndkey1); + &pxor ($inout3,&QWP(16*3,"esp")); + &dec ($rounds); + &aesdec ($inout2,$rndkey1); + &pxor ($inout4,&QWP(16*4,"esp")); + &aesdec ($inout3,$rndkey1); + &pxor ($inout5,$rndkey0); + &aesdec ($inout4,$rndkey1); + &$movekey ($rndkey0,&QWP(0,$key)); + &aesdec ($inout5,$rndkey1); + &call (&label("_aesni_decrypt6_enter")); + + &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak + &pxor ($twtmp,$twtmp); + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &xorps ($inout1,&QWP(16*1,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*2,$out),$inout2); + &xorps ($inout4,&QWP(16*4,"esp")); + &movups (&QWP(16*3,$out),$inout3); + &xorps ($inout5,$tweak); + &movups (&QWP(16*4,$out),$inout4); + &pshufd ($twres,$twtmp,0x13); + &movups (&QWP(16*5,$out),$inout5); + &lea ($out,&DWP(16*6,$out)); + &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 + + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov ($rounds,$rounds_); # restore $rounds + &pxor ($tweak,$twres); + + &sub ($len,16*6); + &jnc (&label("xts_dec_loop6")); + + &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &mov ($key,$key_); # restore $key + &mov ($rounds_,$rounds); + +&set_label("xts_dec_short"); + &add ($len,16*6); + &jz (&label("xts_dec_done6x")); + + &movdqa ($inout3,$tweak); # put aside previous tweak + &cmp ($len,0x20); + &jb (&label("xts_dec_one")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &je (&label("xts_dec_two")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout4,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &cmp ($len,0x40); + &jb (&label("xts_dec_three")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout5,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &movdqa (&QWP(16*0,"esp"),$inout3); + &movdqa (&QWP(16*1,"esp"),$inout4); + &je (&label("xts_dec_four")); + + &movdqa (&QWP(16*2,"esp"),$inout5); + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*3,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($inout0,1); + &pand ($inout5,$twmask); # isolate carry and residue + &pxor ($inout5,$tweak); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout1,&QWP(16*1,"esp")); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout2,&QWP(16*2,"esp")); + &lea ($inp,&DWP(16*5,$inp)); + &pxor ($inout3,&QWP(16*3,"esp")); + &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak + &pxor ($inout4,$inout5); + + &call ("_aesni_decrypt6"); + + &movaps ($tweak,&QWP(16*4,"esp")); # last tweak + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout4,$tweak); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &movups (&QWP(16*4,$out),$inout4); + &lea ($out,&DWP(16*5,$out)); + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_one",16); + &movups ($inout0,&QWP(16*0,$inp)); # load input + &lea ($inp,&DWP(16*1,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(16*0,$out),$inout0); # write output + &lea ($out,&DWP(16*1,$out)); + + &movdqa ($tweak,$inout3); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_two",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &lea ($inp,&DWP(16*2,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + + &call ("_aesni_decrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &lea ($out,&DWP(16*2,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_three",16); + &movaps ($inout5,$tweak); # put aside last tweak + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &lea ($inp,&DWP(16*3,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + + &call ("_aesni_decrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &lea ($out,&DWP(16*3,$out)); + + &movdqa ($tweak,$inout5); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_four",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movups ($inout3,&QWP(16*3,$inp)); + &lea ($inp,&DWP(16*4,$inp)); + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &xorps ($inout3,$inout4); + + &call ("_aesni_decrypt4"); + + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,$inout4); + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &lea ($out,&DWP(16*4,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_done6x",16); # $tweak is pre-calculated + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &and ($len,15); + &jz (&label("xts_dec_ret")); + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &jmp (&label("xts_dec_only_one_more")); + +&set_label("xts_dec_done",16); + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &pxor ($twtmp,$twtmp); + &and ($len,15); + &jz (&label("xts_dec_ret")); + + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($twmask,&QWP(16*6,"esp")); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + +&set_label("xts_dec_only_one_more"); + &pshufd ($inout3,$twtmp,0x13); + &movdqa ($inout4,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($inout3,$twmask); # isolate carry and residue + &pxor ($inout3,$tweak); + + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + + &movups ($inout0,&QWP(0,$inp)); # load input + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(0,$out),$inout0); # write output + +&set_label("xts_dec_steal"); + &movz ($rounds,&BP(16,$inp)); + &movz ($key,&BP(0,$out)); + &lea ($inp,&DWP(1,$inp)); + &mov (&BP(0,$out),&LB($rounds)); + &mov (&BP(16,$out),&LB($key)); + &lea ($out,&DWP(1,$out)); + &sub ($len,1); + &jnz (&label("xts_dec_steal")); + + &sub ($out,&DWP(16*7+0,"esp")); # rewind $out + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + + &movups ($inout0,&QWP(0,$out)); # load input + &xorps ($inout0,$inout4); # input^=tweak + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$inout4); # output^=tweak + &movups (&QWP(0,$out),$inout0); # write output + +&set_label("xts_dec_ret"); + &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp +&function_end("aesni_xts_decrypt"); +} } ###################################################################### @@ -764,34 +1721,38 @@ if ($PREFIX eq "aesni") { # unsigned char *ivp,const int enc); &function_begin("${PREFIX}_cbc_encrypt"); &mov ($inp,&wparam(0)); + &mov ($rounds_,"esp"); &mov ($out,&wparam(1)); + &sub ($rounds_,24); &mov ($len,&wparam(2)); + &and ($rounds_,-16); &mov ($key,&wparam(3)); &mov ($key_,&wparam(4)); &test ($len,$len); - &jz (&label("cbc_ret")); + &jz (&label("cbc_abort")); &cmp (&wparam(5),0); - &movdqu ($ivec,&QWP(0,$key_)); # load IV + &xchg ($rounds_,"esp"); # alloca + &movups ($ivec,&QWP(0,$key_)); # load IV &mov ($rounds,&DWP(240,$key)); - &mov ($key_,$key); # backup $key - &mov ($rounds_,$rounds); # backup $rounds + &mov ($key_,$key); # backup $key + &mov (&DWP(16,"esp"),$rounds_); # save original %esp + &mov ($rounds_,$rounds); # backup $rounds &je (&label("cbc_decrypt")); - &movdqa ($inout0,$ivec); + &movaps ($inout0,$ivec); &cmp ($len,16); &jb (&label("cbc_enc_tail")); &sub ($len,16); &jmp (&label("cbc_enc_loop")); &set_label("cbc_enc_loop",16); - &movdqu ($ivec,&QWP(0,$inp)); + &movups ($ivec,&QWP(0,$inp)); # input actually &lea ($inp,&DWP(16,$inp)); - &pxor ($inout0,$ivec); if ($inline) - { &aesni_inline_generate1("enc"); } + { &aesni_inline_generate1("enc",$inout0,$ivec); } else - { &call ("_aesni_encrypt1"); } + { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } &mov ($rounds,$rounds_); # restore $rounds &mov ($key,$key_); # restore $key &movups (&QWP(0,$out),$inout0); # store output @@ -817,61 +1778,97 @@ if ($PREFIX eq "aesni") { &jmp (&label("cbc_enc_loop")); ###################################################################### &set_label("cbc_decrypt",16); - &cmp ($len,0x40); + &cmp ($len,0x50); &jbe (&label("cbc_dec_tail")); - &sub ($len,0x40); - &jmp (&label("cbc_dec_loop3")); + &movaps (&QWP(0,"esp"),$ivec); # save IV + &sub ($len,0x50); + &jmp (&label("cbc_dec_loop6_enter")); -&set_label("cbc_dec_loop3",16); - &movups ($inout0,&QWP(0,$inp)); - &movups ($inout1,&QWP(0x10,$inp)); - &movups ($inout2,&QWP(0x20,$inp)); - &movaps ($in0,$inout0); - &movaps ($in1,$inout1); - - &call ("_aesni_decrypt3"); - - &pxor ($inout0,$ivec); - &pxor ($inout1,$in0); - &movdqu ($ivec,&QWP(0x20,$inp)); - &lea ($inp,&DWP(0x30,$inp)); - &pxor ($inout2,$in1); - &movdqu (&QWP(0,$out),$inout0); - &mov ($rounds,$rounds_) # restore $rounds - &movdqu (&QWP(0x10,$out),$inout1); - &mov ($key,$key_); # restore $key - &movdqu (&QWP(0x20,$out),$inout2); - &lea ($out,&DWP(0x30,$out)); - &sub ($len,0x30); - &ja (&label("cbc_dec_loop3")); - - &add ($len,0x40); +&set_label("cbc_dec_loop6",16); + &movaps (&QWP(0,"esp"),$rndkey0); # save IV + &movups (&QWP(0,$out),$inout5); + &lea ($out,&DWP(0x10,$out)); +&set_label("cbc_dec_loop6_enter"); + &movdqu ($inout0,&QWP(0,$inp)); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movdqu ($inout5,&QWP(0x50,$inp)); + + &call ("_aesni_decrypt6"); + + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,&QWP(0,"esp")); # ^=IV + &xorps ($inout1,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout2,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout3,$rndkey1); + &movups ($rndkey1,&QWP(0x40,$inp)); + &xorps ($inout4,$rndkey0); + &movups ($rndkey0,&QWP(0x50,$inp)); # IV + &xorps ($inout5,$rndkey1); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &lea ($inp,&DWP(0x60,$inp)); + &movups (&QWP(0x20,$out),$inout2); + &mov ($rounds,$rounds_) # restore $rounds + &movups (&QWP(0x30,$out),$inout3); + &mov ($key,$key_); # restore $key + &movups (&QWP(0x40,$out),$inout4); + &lea ($out,&DWP(0x50,$out)); + &sub ($len,0x60); + &ja (&label("cbc_dec_loop6")); + + &movaps ($inout0,$inout5); + &movaps ($ivec,$rndkey0); + &add ($len,0x50); + &jle (&label("cbc_dec_tail_collected")); + &movups (&QWP(0,$out),$inout0); + &lea ($out,&DWP(0x10,$out)); &set_label("cbc_dec_tail"); &movups ($inout0,&QWP(0,$inp)); &movaps ($in0,$inout0); &cmp ($len,0x10); &jbe (&label("cbc_dec_one")); + &movups ($inout1,&QWP(0x10,$inp)); &movaps ($in1,$inout1); &cmp ($len,0x20); &jbe (&label("cbc_dec_two")); + &movups ($inout2,&QWP(0x20,$inp)); &cmp ($len,0x30); &jbe (&label("cbc_dec_three")); + &movups ($inout3,&QWP(0x30,$inp)); - &call ("_aesni_decrypt4"); - &movdqu ($rndkey0,&QWP(0x10,$inp)); - &movdqu ($rndkey1,&QWP(0x20,$inp)); - &pxor ($inout0,$ivec); - &pxor ($inout1,$in0); - &movdqu ($ivec,&QWP(0x30,$inp)); - &movdqu (&QWP(0,$out),$inout0); - &pxor ($inout2,$rndkey0); - &pxor ($inout3,$rndkey1); - &movdqu (&QWP(0x10,$out),$inout1); - &movdqu (&QWP(0x20,$out),$inout2); - &movdqa ($inout0,$inout3); - &lea ($out,&DWP(0x30,$out)); + &cmp ($len,0x40); + &jbe (&label("cbc_dec_four")); + + &movups ($inout4,&QWP(0x40,$inp)); + &movaps (&QWP(0,"esp"),$ivec); # save IV + &movups ($inout0,&QWP(0,$inp)); + &xorps ($inout5,$inout5); + &call ("_aesni_decrypt6"); + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,&QWP(0,"esp")); # ^= IV + &xorps ($inout1,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout2,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout3,$rndkey1); + &movups ($ivec,&QWP(0x40,$inp)); # IV + &xorps ($inout4,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &lea ($out,&DWP(0x40,$out)); + &movaps ($inout0,$inout4); + &sub ($len,0x50); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_one",16); @@ -879,51 +1876,70 @@ if ($PREFIX eq "aesni") { { &aesni_inline_generate1("dec"); } else { &call ("_aesni_decrypt1"); } - &pxor ($inout0,$ivec); - &movdqa ($ivec,$in0); + &xorps ($inout0,$ivec); + &movaps ($ivec,$in0); + &sub ($len,0x10); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_two",16); - &pxor ($inout2,$inout2); + &xorps ($inout2,$inout2); &call ("_aesni_decrypt3"); - &pxor ($inout0,$ivec); - &pxor ($inout1,$in0); - &movdqu (&QWP(0,$out),$inout0); - &movdqa ($inout0,$inout1); - &movdqa ($ivec,$in1); + &xorps ($inout0,$ivec); + &xorps ($inout1,$in0); + &movups (&QWP(0,$out),$inout0); + &movaps ($inout0,$inout1); &lea ($out,&DWP(0x10,$out)); + &movaps ($ivec,$in1); + &sub ($len,0x20); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_three",16); &call ("_aesni_decrypt3"); - &pxor ($inout0,$ivec); - &pxor ($inout1,$in0); - &pxor ($inout2,$in1); - &movdqu (&QWP(0,$out),$inout0); - &movdqu (&QWP(0x10,$out),$inout1); - &movdqa ($inout0,$inout2); - &movdqu ($ivec,&QWP(0x20,$inp)); + &xorps ($inout0,$ivec); + &xorps ($inout1,$in0); + &xorps ($inout2,$in1); + &movups (&QWP(0,$out),$inout0); + &movaps ($inout0,$inout2); + &movups (&QWP(0x10,$out),$inout1); &lea ($out,&DWP(0x20,$out)); + &movups ($ivec,&QWP(0x20,$inp)); + &sub ($len,0x30); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_four",16); + &call ("_aesni_decrypt4"); + &movups ($rndkey1,&QWP(0x10,$inp)); + &movups ($rndkey0,&QWP(0x20,$inp)); + &xorps ($inout0,$ivec); + &movups ($ivec,&QWP(0x30,$inp)); + &xorps ($inout1,$in0); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0x20,$out),$inout2); + &lea ($out,&DWP(0x30,$out)); + &movaps ($inout0,$inout3); + &sub ($len,0x40); &set_label("cbc_dec_tail_collected"); &and ($len,15); &jnz (&label("cbc_dec_tail_partial")); - &movdqu (&QWP(0,$out),$inout0); + &movups (&QWP(0,$out),$inout0); &jmp (&label("cbc_ret")); &set_label("cbc_dec_tail_partial",16); - &mov ($key_,"esp"); - &sub ("esp",16); - &and ("esp",-16); - &movdqa (&QWP(0,"esp"),$inout0); + &movaps (&QWP(0,"esp"),$inout0); + &mov ("ecx",16); &mov ($inp,"esp"); - &mov ("ecx",$len); + &sub ("ecx",$len); &data_word(0xA4F3F689); # rep movsb - &mov ("esp",$key_); &set_label("cbc_ret"); + &mov ("esp",&DWP(16,"esp")); # pull original %esp &mov ($key_,&wparam(4)); &movups (&QWP(0,$key_),$ivec); # output IV +&set_label("cbc_abort"); &function_end("${PREFIX}_cbc_encrypt"); ###################################################################### @@ -945,7 +1961,7 @@ if ($PREFIX eq "aesni") { &jz (&label("bad_pointer")); &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey - &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 + &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 &lea ($key,&DWP(16,$key)); &cmp ($rounds,256); &je (&label("14rounds")); @@ -987,11 +2003,11 @@ if ($PREFIX eq "aesni") { &lea ($key,&DWP(16,$key)); &set_label("key_128_cold"); &shufps ("xmm4","xmm0",0b00010000); - &pxor ("xmm0","xmm4"); - &shufps ("xmm4","xmm0",0b10001100,); - &pxor ("xmm0","xmm4"); - &pshufd ("xmm1","xmm1",0b11111111); # critical path - &pxor ("xmm0","xmm1"); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &xorps ("xmm0","xmm4"); + &shufps ("xmm1","xmm1",0b11111111); # critical path + &xorps ("xmm0","xmm1"); &ret(); &set_label("12rounds",16); @@ -1026,11 +2042,11 @@ if ($PREFIX eq "aesni") { &movaps ("xmm5","xmm2"); &set_label("key_192b_warm"); &shufps ("xmm4","xmm0",0b00010000); - &movaps ("xmm3","xmm2"); - &pxor ("xmm0","xmm4"); + &movdqa ("xmm3","xmm2"); + &xorps ("xmm0","xmm4"); &shufps ("xmm4","xmm0",0b10001100); &pslldq ("xmm3",4); - &pxor ("xmm0","xmm4"); + &xorps ("xmm0","xmm4"); &pshufd ("xmm1","xmm1",0b01010101); # critical path &pxor ("xmm2","xmm3"); &pxor ("xmm0","xmm1"); @@ -1089,11 +2105,11 @@ if ($PREFIX eq "aesni") { &lea ($key,&DWP(16,$key)); &set_label("key_256a_cold"); &shufps ("xmm4","xmm0",0b00010000); - &pxor ("xmm0","xmm4"); + &xorps ("xmm0","xmm4"); &shufps ("xmm4","xmm0",0b10001100); - &pxor ("xmm0","xmm4"); - &pshufd ("xmm1","xmm1",0b11111111); # critical path - &pxor ("xmm0","xmm1"); + &xorps ("xmm0","xmm4"); + &shufps ("xmm1","xmm1",0b11111111); # critical path + &xorps ("xmm0","xmm1"); &ret(); &set_label("key_256b",16); @@ -1101,11 +2117,11 @@ if ($PREFIX eq "aesni") { &lea ($key,&DWP(16,$key)); &shufps ("xmm4","xmm2",0b00010000); - &pxor ("xmm2","xmm4"); + &xorps ("xmm2","xmm4"); &shufps ("xmm4","xmm2",0b10001100); - &pxor ("xmm2","xmm4"); - &pshufd ("xmm1","xmm1",0b10101010); # critical path - &pxor ("xmm2","xmm1"); + &xorps ("xmm2","xmm4"); + &shufps ("xmm1","xmm1",0b10101010); # critical path + &xorps ("xmm2","xmm1"); &ret(); &set_label("bad_pointer",4); diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index 4602068ffc..ea9cf9404d 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -18,7 +18,7 @@ # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte # processed with 128-bit key. And given their throughput asymptotic # performance for parallelizable modes is 1.25 cycles per byte. Being -# asymptotic limit is not something you commonly achieve in reality, +# asymptotic limit it's not something you commonly achieve in reality, # but how close does one get? Below are results collected for # different modes and block sized. Pairs of numbers are for en-/ # decryption. @@ -79,6 +79,84 @@ # 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one # observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... +# January 2011 +# +# While Westmere processor features 6 cycles latency for aes[enc|dec] +# instructions, which can be scheduled every second cycle, Sandy +# Bridge spends 8 cycles per instruction, but it can schedule them +# every cycle. This means that code targeting Westmere would perform +# suboptimally on Sandy Bridge. Therefore this update. +# +# In addition, non-parallelizable CBC encrypt (as well as CCM) is +# optimized. Relative improvement might appear modest, 8% on Westmere, +# but in absolute terms it's 3.77 cycles per byte encrypted with +# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers +# should be compared to asymptotic limits of 3.75 for Westmere and +# 5.00 for Sandy Bridge. Actually, the fact that they get this close +# to asymptotic limits is quite amazing. Indeed, the limit is +# calculated as latency times number of rounds, 10 for 128-bit key, +# and divided by 16, the number of bytes in block, or in other words +# it accounts *solely* for aesenc instructions. But there are extra +# instructions, and numbers so close to the asymptotic limits mean +# that it's as if it takes as little as *one* additional cycle to +# execute all of them. How is it possible? It is possible thanks to +# out-of-order execution logic, which manages to overlap post- +# processing of previous block, things like saving the output, with +# actual encryption of current block, as well as pre-processing of +# current block, things like fetching input and xor-ing it with +# 0-round element of the key schedule, with actual encryption of +# previous block. Keep this in mind... +# +# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher +# performance is achieved by interleaving instructions working on +# independent blocks. In which case asymptotic limit for such modes +# can be obtained by dividing above mentioned numbers by AES +# instructions' interleave factor. Westmere can execute at most 3 +# instructions at a time, meaning that optimal interleave factor is 3, +# and that's where the "magic" number of 1.25 come from. "Optimal +# interleave factor" means that increase of interleave factor does +# not improve performance. The formula has proven to reflect reality +# pretty well on Westmere... Sandy Bridge on the other hand can +# execute up to 8 AES instructions at a time, so how does varying +# interleave factor affect the performance? Here is table for ECB +# (numbers are cycles per byte processed with 128-bit key): +# +# instruction interleave factor 3x 6x 8x +# theoretical asymptotic limit 1.67 0.83 0.625 +# measured performance for 8KB block 1.05 0.86 0.84 +# +# "as if" interleave factor 4.7x 5.8x 6.0x +# +# Further data for other parallelizable modes: +# +# CBC decrypt 1.16 0.93 0.93 +# CTR 1.14 0.91 n/a +# +# Well, given 3x column it's probably inappropriate to call the limit +# asymptotic, if it can be surpassed, isn't it? What happens there? +# Rewind to CBC paragraph for the answer. Yes, out-of-order execution +# magic is responsible for this. Processor overlaps not only the +# additional instructions with AES ones, but even AES instuctions +# processing adjacent triplets of independent blocks. In the 6x case +# additional instructions still claim disproportionally small amount +# of additional cycles, but in 8x case number of instructions must be +# a tad too high for out-of-order logic to cope with, and AES unit +# remains underutilized... As you can see 8x interleave is hardly +# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl +# utilizies 6x interleave because of limited register bank capacity. +# +# Higher interleave factors do have negative impact on Westmere +# performance. While for ECB mode it's negligible ~1.5%, other +# parallelizables perform ~5% worse, which is outweighed by ~25% +# improvement on Sandy Bridge. To balance regression on Westmere +# CTR mode was implemented with 6x aesenc interleave factor. + +# April 2011 +# +# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like +# in CTR mode AES instruction interleave factor was chosen to be 6x. + $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for # crypto/aes/asm/aes-x86_64.pl:-) @@ -114,12 +192,14 @@ $rnds_="%r10d"; # backup copy for $rounds $key_="%r11"; # backup copy for $key # %xmm register layout -$inout0="%xmm0"; $inout1="%xmm1"; -$inout2="%xmm2"; $inout3="%xmm3"; -$rndkey0="%xmm4"; $rndkey1="%xmm5"; - -$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt, CTR, ... -$in1="%xmm8"; $in2="%xmm9"; +$rndkey0="%xmm0"; $rndkey1="%xmm1"; +$inout0="%xmm2"; $inout1="%xmm3"; +$inout2="%xmm4"; $inout3="%xmm5"; +$inout4="%xmm6"; $inout5="%xmm7"; +$inout6="%xmm8"; $inout7="%xmm9"; + +$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... +$in0="%xmm8"; $iv="%xmm9"; # Inline version of internal aesni_[en|de]crypt1. # @@ -127,13 +207,22 @@ $in1="%xmm8"; $in2="%xmm9"; # cycles which take care of loop variables... { my $sn; sub aesni_generate1 { -my ($p,$key,$rounds,$inout)=@_; $inout=$inout0 if (!defined($inout)); +my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); ++$sn; $code.=<<___; - movdqu ($key),$rndkey0 + $movkey ($key),$rndkey0 $movkey 16($key),$rndkey1 +___ +$code.=<<___ if (defined($ivec)); + xorps $rndkey0,$ivec + lea 32($key),$key + xorps $ivec,$inout +___ +$code.=<<___ if (!defined($ivec)); lea 32($key),$key - pxor $rndkey0,$inout + xorps $rndkey0,$inout +___ +$code.=<<___; .Loop_${p}1_$sn: aes${p} $rndkey1,$inout dec $rounds @@ -152,8 +241,8 @@ $code.=<<___; .type ${PREFIX}_encrypt,\@abi-omnipotent .align 16 ${PREFIX}_encrypt: - movdqu ($inp),$inout0 # load input - mov 240($key),$rounds # pull $rounds + movups ($inp),$inout0 # load input + mov 240($key),$rounds # key->rounds ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; @@ -165,8 +254,8 @@ $code.=<<___; .type ${PREFIX}_decrypt,\@abi-omnipotent .align 16 ${PREFIX}_decrypt: - movdqu ($inp),$inout0 # load input - mov 240($key),$rounds # pull $rounds + movups ($inp),$inout0 # load input + mov 240($key),$rounds # key->rounds ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; @@ -176,16 +265,16 @@ $code.=<<___; ___ } -# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave -# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] -# latency is 6, it turned out that it can be scheduled only every -# *second* cycle. Thus 3x interleave is the one providing optimal +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave +# factor. Why 3x subroutine were originally used in loops? Even though +# aes[enc|dec] latency was originally 6, it could be scheduled only +# every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. As soon -# as/if Intel improves throughput by making it possible to schedule -# the instructions in question *every* cycles I would have to -# implement 6x interleave and use it in loop... +# This is why it makes no sense to implement 2x subroutine. +# aes[enc|dec] latency in next processor generation is 8, but the +# instructions can be scheduled every cycle. Optimal interleave for +# new processor is therefore 8x... sub aesni_generate3 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* @@ -198,9 +287,9 @@ _aesni_${dir}rypt3: shr \$1,$rounds $movkey 16($key),$rndkey1 lea 32($key),$key - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - pxor $rndkey0,$inout2 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + xorps $rndkey0,$inout2 $movkey ($key),$rndkey0 .L${dir}_loop3: @@ -242,11 +331,11 @@ _aesni_${dir}rypt4: shr \$1,$rounds $movkey 16($key),$rndkey1 lea 32($key),$key - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - pxor $rndkey0,$inout2 - pxor $rndkey0,$inout3 - $movkey ($key),$rndkey0 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + xorps $rndkey0,$inout2 + xorps $rndkey0,$inout3 + $movkey ($key),$rndkey0 .L${dir}_loop4: aes${dir} $rndkey1,$inout0 @@ -275,10 +364,155 @@ _aesni_${dir}rypt4: .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 ___ } +sub aesni_generate6 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-5] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt6,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt6: + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + lea 32($key),$key + xorps $rndkey0,$inout0 + pxor $rndkey0,$inout1 + aes${dir} $rndkey1,$inout0 + pxor $rndkey0,$inout2 + aes${dir} $rndkey1,$inout1 + pxor $rndkey0,$inout3 + aes${dir} $rndkey1,$inout2 + pxor $rndkey0,$inout4 + aes${dir} $rndkey1,$inout3 + pxor $rndkey0,$inout5 + dec $rounds + aes${dir} $rndkey1,$inout4 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout5 + jmp .L${dir}_loop6_enter +.align 16 +.L${dir}_loop6: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + dec $rounds + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 +.L${dir}_loop6_enter: # happens to be 16-byte aligned + $movkey 16($key),$rndkey1 + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + lea 32($key),$key + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + $movkey ($key),$rndkey0 + jnz .L${dir}_loop6 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + ret +.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 +___ +} +sub aesni_generate8 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-7] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt8,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt8: + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + lea 32($key),$key + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + aes${dir} $rndkey1,$inout0 + pxor $rndkey0,$inout2 + aes${dir} $rndkey1,$inout1 + pxor $rndkey0,$inout3 + aes${dir} $rndkey1,$inout2 + pxor $rndkey0,$inout4 + aes${dir} $rndkey1,$inout3 + pxor $rndkey0,$inout5 + dec $rounds + aes${dir} $rndkey1,$inout4 + pxor $rndkey0,$inout6 + aes${dir} $rndkey1,$inout5 + pxor $rndkey0,$inout7 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + $movkey 16($key),$rndkey1 + jmp .L${dir}_loop8_enter +.align 16 +.L${dir}_loop8: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + dec $rounds + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + $movkey 16($key),$rndkey1 +.L${dir}_loop8_enter: # happens to be 16-byte aligned + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + lea 32($key),$key + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + aes${dir} $rndkey0,$inout6 + aes${dir} $rndkey0,$inout7 + $movkey ($key),$rndkey0 + jnz .L${dir}_loop8 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + aes${dir}last $rndkey0,$inout6 + aes${dir}last $rndkey0,$inout7 + ret +.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 +___ +} &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); +&aesni_generate6("enc") if ($PREFIX eq "aesni"); +&aesni_generate6("dec"); +&aesni_generate8("enc") if ($PREFIX eq "aesni"); +&aesni_generate8("dec"); if ($PREFIX eq "aesni") { ######################################################################## @@ -290,37 +524,73 @@ $code.=<<___; .type aesni_ecb_encrypt,\@function,5 .align 16 aesni_ecb_encrypt: - cmp \$16,$len # check length - jb .Lecb_ret - - mov 240($key),$rounds # pull $rounds and \$-16,$len + jz .Lecb_ret + + mov 240($key),$rounds # key->rounds + $movkey ($key),$rndkey0 mov $key,$key_ # backup $key mov $rounds,$rnds_ # backup $rounds test %r8d,%r8d # 5th argument jz .Lecb_decrypt #--------------------------- ECB ENCRYPT ------------------------------# - cmp \$0x40,$len - jbe .Lecb_enc_tail - sub \$0x40,$len - jmp .Lecb_enc_loop3 + cmp \$0x80,$len + jb .Lecb_enc_tail + + movdqu ($inp),$inout0 + movdqu 0x10($inp),$inout1 + movdqu 0x20($inp),$inout2 + movdqu 0x30($inp),$inout3 + movdqu 0x40($inp),$inout4 + movdqu 0x50($inp),$inout5 + movdqu 0x60($inp),$inout6 + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp + sub \$0x80,$len + jmp .Lecb_enc_loop8_enter .align 16 -.Lecb_enc_loop3: - movups ($inp),$inout0 - movups 0x10($inp),$inout1 - movups 0x20($inp),$inout2 - call _aesni_encrypt3 - lea 0x30($inp),$inp +.Lecb_enc_loop8: movups $inout0,($out) + mov $key_,$key # restore $key + movdqu ($inp),$inout0 mov $rnds_,$rounds # restore $rounds movups $inout1,0x10($out) + movdqu 0x10($inp),$inout1 + movups $inout2,0x20($out) + movdqu 0x20($inp),$inout2 + movups $inout3,0x30($out) + movdqu 0x30($inp),$inout3 + movups $inout4,0x40($out) + movdqu 0x40($inp),$inout4 + movups $inout5,0x50($out) + movdqu 0x50($inp),$inout5 + movups $inout6,0x60($out) + movdqu 0x60($inp),$inout6 + movups $inout7,0x70($out) + lea 0x80($out),$out + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp +.Lecb_enc_loop8_enter: + + call _aesni_encrypt8 + + sub \$0x80,$len + jnc .Lecb_enc_loop8 + + movups $inout0,($out) mov $key_,$key # restore $key + movups $inout1,0x10($out) + mov $rnds_,$rounds # restore $rounds movups $inout2,0x20($out) - lea 0x30($out),$out - sub \$0x30,$len - ja .Lecb_enc_loop3 + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) + movups $inout7,0x70($out) + lea 0x80($out),$out + add \$0x80,$len + jz .Lecb_ret - add \$0x40,$len .Lecb_enc_tail: movups ($inp),$inout0 cmp \$0x20,$len @@ -328,14 +598,24 @@ aesni_ecb_encrypt: movups 0x10($inp),$inout1 je .Lecb_enc_two movups 0x20($inp),$inout2 - cmp \$0x30,$len - je .Lecb_enc_three + cmp \$0x40,$len + jb .Lecb_enc_three movups 0x30($inp),$inout3 - call _aesni_encrypt4 + je .Lecb_enc_four + movups 0x40($inp),$inout4 + cmp \$0x60,$len + jb .Lecb_enc_five + movups 0x50($inp),$inout5 + je .Lecb_enc_six + movdqu 0x60($inp),$inout6 + call _aesni_encrypt8 movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) jmp .Lecb_ret .align 16 .Lecb_enc_one: @@ -346,7 +626,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_enc_two: - pxor $inout2,$inout2 + xorps $inout2,$inout2 call _aesni_encrypt3 movups $inout0,($out) movups $inout1,0x10($out) @@ -358,30 +638,95 @@ $code.=<<___; movups $inout1,0x10($out) movups $inout2,0x20($out) jmp .Lecb_ret +.align 16 +.Lecb_enc_four: + call _aesni_encrypt4 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_five: + xorps $inout5,$inout5 + call _aesni_encrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_six: + call _aesni_encrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + jmp .Lecb_ret #--------------------------- ECB DECRYPT ------------------------------# .align 16 .Lecb_decrypt: - cmp \$0x40,$len - jbe .Lecb_dec_tail - sub \$0x40,$len - jmp .Lecb_dec_loop3 + cmp \$0x80,$len + jb .Lecb_dec_tail + + movdqu ($inp),$inout0 + movdqu 0x10($inp),$inout1 + movdqu 0x20($inp),$inout2 + movdqu 0x30($inp),$inout3 + movdqu 0x40($inp),$inout4 + movdqu 0x50($inp),$inout5 + movdqu 0x60($inp),$inout6 + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp + sub \$0x80,$len + jmp .Lecb_dec_loop8_enter .align 16 -.Lecb_dec_loop3: - movups ($inp),$inout0 - movups 0x10($inp),$inout1 - movups 0x20($inp),$inout2 - call _aesni_decrypt3 - lea 0x30($inp),$inp +.Lecb_dec_loop8: movups $inout0,($out) + mov $key_,$key # restore $key + movdqu ($inp),$inout0 mov $rnds_,$rounds # restore $rounds movups $inout1,0x10($out) + movdqu 0x10($inp),$inout1 + movups $inout2,0x20($out) + movdqu 0x20($inp),$inout2 + movups $inout3,0x30($out) + movdqu 0x30($inp),$inout3 + movups $inout4,0x40($out) + movdqu 0x40($inp),$inout4 + movups $inout5,0x50($out) + movdqu 0x50($inp),$inout5 + movups $inout6,0x60($out) + movdqu 0x60($inp),$inout6 + movups $inout7,0x70($out) + lea 0x80($out),$out + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp +.Lecb_dec_loop8_enter: + + call _aesni_decrypt8 + + $movkey ($key_),$rndkey0 + sub \$0x80,$len + jnc .Lecb_dec_loop8 + + movups $inout0,($out) mov $key_,$key # restore $key + movups $inout1,0x10($out) + mov $rnds_,$rounds # restore $rounds movups $inout2,0x20($out) - lea 0x30($out),$out - sub \$0x30,$len - ja .Lecb_dec_loop3 + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) + movups $inout7,0x70($out) + lea 0x80($out),$out + add \$0x80,$len + jz .Lecb_ret - add \$0x40,$len .Lecb_dec_tail: movups ($inp),$inout0 cmp \$0x20,$len @@ -389,14 +734,25 @@ $code.=<<___; movups 0x10($inp),$inout1 je .Lecb_dec_two movups 0x20($inp),$inout2 - cmp \$0x30,$len - je .Lecb_dec_three + cmp \$0x40,$len + jb .Lecb_dec_three movups 0x30($inp),$inout3 - call _aesni_decrypt4 + je .Lecb_dec_four + movups 0x40($inp),$inout4 + cmp \$0x60,$len + jb .Lecb_dec_five + movups 0x50($inp),$inout5 + je .Lecb_dec_six + movups 0x60($inp),$inout6 + $movkey ($key),$rndkey0 + call _aesni_decrypt8 movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) jmp .Lecb_ret .align 16 .Lecb_dec_one: @@ -407,7 +763,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_dec_two: - pxor $inout2,$inout2 + xorps $inout2,$inout2 call _aesni_decrypt3 movups $inout0,($out) movups $inout1,0x10($out) @@ -418,6 +774,34 @@ $code.=<<___; movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_four: + call _aesni_decrypt4 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_five: + xorps $inout5,$inout5 + call _aesni_decrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_six: + call _aesni_decrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) .Lecb_ret: ret @@ -467,25 +851,45 @@ $code.=<<___; movdqa $iv,$inout0 .Lccm64_enc_outer: - movdqu ($inp),$in0 # load inp + movups ($inp),$in0 # load inp pshufb $bswap_mask,$inout0 mov $key_,$key mov $rnds_,$rounds - pxor $in0,$inout1 # cmac^=inp - pxor $inout2,$inout2 - call _aesni_encrypt3 + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$in0 + lea 32($key),$key + xorps $rndkey0,$inout0 + xorps $inout1,$in0 # cmac^=inp + $movkey ($key),$rndkey0 + +.Lccm64_enc2_loop: + aesenc $rndkey1,$inout0 + dec $rounds + aesenc $rndkey1,$inout1 + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + lea 32($key),$key + aesenc $rndkey0,$inout1 + $movkey 0($key),$rndkey0 + jnz .Lccm64_enc2_loop + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenclast $rndkey0,$inout0 + aesenclast $rndkey0,$inout1 paddq $increment,$iv dec $len lea 16($inp),$inp - pxor $inout0,$in0 # inp ^= E(iv) + xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 - movdqu $in0,($out) # save output + movups $in0,($out) # save output lea 16($out),$out jnz .Lccm64_enc_outer - movdqu $inout1,($cmac) + movups $inout1,($cmac) ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 @@ -529,24 +933,42 @@ ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; .Lccm64_dec_outer: - movdqu ($inp),$in0 # load inp paddq $increment,$iv - dec $len - lea 16($inp),$inp - pxor $inout0,$in0 + movups ($inp),$in0 # load inp + xorps $inout0,$in0 movdqa $iv,$inout0 + lea 16($inp),$inp + pshufb $bswap_mask,$inout0 mov $key_,$key mov $rnds_,$rounds - pshufb $bswap_mask,$inout0 - movdqu $in0,($out) + movups $in0,($out) lea 16($out),$out - pxor $in0,$inout1 # cmac^=out + sub \$1,$len jz .Lccm64_dec_break - pxor $inout2,$inout2 - call _aesni_encrypt3 + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + xorps $rndkey0,$in0 + lea 32($key),$key + xorps $rndkey0,$inout0 + xorps $in0,$inout1 # cmac^=out + $movkey ($key),$rndkey0 +.Lccm64_dec2_loop: + aesenc $rndkey1,$inout0 + dec $rounds + aesenc $rndkey1,$inout1 + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + lea 32($key),$key + aesenc $rndkey0,$inout1 + $movkey 0($key),$rndkey0 + jnz .Lccm64_dec2_loop + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + aesenclast $rndkey0,$inout0 jmp .Lccm64_dec_outer .align 16 @@ -554,7 +976,7 @@ $code.=<<___; ___ &aesni_generate1("enc",$key,$rounds,$inout1); $code.=<<___; - movdqu $inout1,($cmac) + movups $inout1,($cmac) ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 @@ -566,219 +988,1133 @@ $code.=<<___ if ($win64); ___ $code.=<<___; ret -.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks +.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks +___ +} +###################################################################### +# void aesni_ctr32_encrypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec); +# +# Handles only complete blocks, operates on 32-bit counter and +# does not update *ivec! (see engine/eng_aesni.c for details) +# +{ +my $reserved = $win64?0:-0x28; +my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11)); +my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14"); +my $bswap_mask="%xmm15"; + +$code.=<<___; +.globl aesni_ctr32_encrypt_blocks +.type aesni_ctr32_encrypt_blocks,\@function,5 +.align 16 +aesni_ctr32_encrypt_blocks: +___ +$code.=<<___ if ($win64); + lea -0xc8(%rsp),%rsp + movaps %xmm6,0x20(%rsp) + movaps %xmm7,0x30(%rsp) + movaps %xmm8,0x40(%rsp) + movaps %xmm9,0x50(%rsp) + movaps %xmm10,0x60(%rsp) + movaps %xmm11,0x70(%rsp) + movaps %xmm12,0x80(%rsp) + movaps %xmm13,0x90(%rsp) + movaps %xmm14,0xa0(%rsp) + movaps %xmm15,0xb0(%rsp) +.Lctr32_body: +___ +$code.=<<___; + cmp \$1,$len + je .Lctr32_one_shortcut + + movdqu ($ivp),$ivec + movdqa .Lbswap_mask(%rip),$bswap_mask + xor $rounds,$rounds + pextrd \$3,$ivec,$rnds_ # pull 32-bit counter + pinsrd \$3,$rounds,$ivec # wipe 32-bit counter + + mov 240($key),$rounds # key->rounds + bswap $rnds_ + pxor $iv0,$iv0 # vector of 3 32-bit counters + pxor $iv1,$iv1 # vector of 3 32-bit counters + pinsrd \$0,$rnds_,$iv0 + lea 3($rnds_),$key_ + pinsrd \$0,$key_,$iv1 + inc $rnds_ + pinsrd \$1,$rnds_,$iv0 + inc $key_ + pinsrd \$1,$key_,$iv1 + inc $rnds_ + pinsrd \$2,$rnds_,$iv0 + inc $key_ + pinsrd \$2,$key_,$iv1 + movdqa $iv0,$reserved(%rsp) + pshufb $bswap_mask,$iv0 + movdqa $iv1,`$reserved+0x10`(%rsp) + pshufb $bswap_mask,$iv1 + + pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword + pshufd \$`2<<6`,$iv0,$inout1 + pshufd \$`1<<6`,$iv0,$inout2 + cmp \$6,$len + jb .Lctr32_tail + shr \$1,$rounds + mov $key,$key_ # backup $key + mov $rounds,$rnds_ # backup $rounds + sub \$6,$len + jmp .Lctr32_loop6 + +.align 16 +.Lctr32_loop6: + pshufd \$`3<<6`,$iv1,$inout3 + por $ivec,$inout0 # merge counter-less ivec + $movkey ($key_),$rndkey0 + pshufd \$`2<<6`,$iv1,$inout4 + por $ivec,$inout1 + $movkey 16($key_),$rndkey1 + pshufd \$`1<<6`,$iv1,$inout5 + por $ivec,$inout2 + por $ivec,$inout3 + xorps $rndkey0,$inout0 + por $ivec,$inout4 + por $ivec,$inout5 + + # inline _aesni_encrypt6 and interleave last rounds + # with own code... + + pxor $rndkey0,$inout1 + aesenc $rndkey1,$inout0 + lea 32($key_),$key + pxor $rndkey0,$inout2 + aesenc $rndkey1,$inout1 + movdqa .Lincrement32(%rip),$iv1 + pxor $rndkey0,$inout3 + aesenc $rndkey1,$inout2 + movdqa $reserved(%rsp),$iv0 + pxor $rndkey0,$inout4 + aesenc $rndkey1,$inout3 + pxor $rndkey0,$inout5 + $movkey ($key),$rndkey0 + dec $rounds + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + jmp .Lctr32_enc_loop6_enter +.align 16 +.Lctr32_enc_loop6: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + dec $rounds + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 +.Lctr32_enc_loop6_enter: + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + lea 32($key),$key + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey ($key),$rndkey0 + jnz .Lctr32_enc_loop6 + + aesenc $rndkey1,$inout0 + paddd $iv1,$iv0 # increment counter vector + aesenc $rndkey1,$inout1 + paddd `$reserved+0x10`(%rsp),$iv1 + aesenc $rndkey1,$inout2 + movdqa $iv0,$reserved(%rsp) # save counter vector + aesenc $rndkey1,$inout3 + movdqa $iv1,`$reserved+0x10`(%rsp) + aesenc $rndkey1,$inout4 + pshufb $bswap_mask,$iv0 # byte swap + aesenc $rndkey1,$inout5 + pshufb $bswap_mask,$iv1 + + aesenclast $rndkey0,$inout0 + movups ($inp),$in0 # load input + aesenclast $rndkey0,$inout1 + movups 0x10($inp),$in1 + aesenclast $rndkey0,$inout2 + movups 0x20($inp),$in2 + aesenclast $rndkey0,$inout3 + movups 0x30($inp),$in3 + aesenclast $rndkey0,$inout4 + movups 0x40($inp),$rndkey1 + aesenclast $rndkey0,$inout5 + movups 0x50($inp),$rndkey0 + lea 0x60($inp),$inp + + xorps $inout0,$in0 # xor + pshufd \$`3<<6`,$iv0,$inout0 + xorps $inout1,$in1 + pshufd \$`2<<6`,$iv0,$inout1 + movups $in0,($out) # store output + xorps $inout2,$in2 + pshufd \$`1<<6`,$iv0,$inout2 + movups $in1,0x10($out) + xorps $inout3,$in3 + movups $in2,0x20($out) + xorps $inout4,$rndkey1 + movups $in3,0x30($out) + xorps $inout5,$rndkey0 + movups $rndkey1,0x40($out) + movups $rndkey0,0x50($out) + lea 0x60($out),$out + mov $rnds_,$rounds + sub \$6,$len + jnc .Lctr32_loop6 + + add \$6,$len + jz .Lctr32_done + mov $key_,$key # restore $key + lea 1($rounds,$rounds),$rounds # restore original value + +.Lctr32_tail: + por $ivec,$inout0 + movups ($inp),$in0 + cmp \$2,$len + jb .Lctr32_one + + por $ivec,$inout1 + movups 0x10($inp),$in1 + je .Lctr32_two + + pshufd \$`3<<6`,$iv1,$inout3 + por $ivec,$inout2 + movups 0x20($inp),$in2 + cmp \$4,$len + jb .Lctr32_three + + pshufd \$`2<<6`,$iv1,$inout4 + por $ivec,$inout3 + movups 0x30($inp),$in3 + je .Lctr32_four + + por $ivec,$inout4 + xorps $inout5,$inout5 + + call _aesni_encrypt6 + + movups 0x40($inp),$rndkey1 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + xorps $inout2,$in2 + movups $in1,0x10($out) + xorps $inout3,$in3 + movups $in2,0x20($out) + xorps $inout4,$rndkey1 + movups $in3,0x30($out) + movups $rndkey1,0x40($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_one_shortcut: + movups ($ivp),$inout0 + movups ($inp),$in0 + mov 240($key),$rounds # key->rounds +.Lctr32_one: +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + xorps $inout0,$in0 + movups $in0,($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_two: + xorps $inout2,$inout2 + call _aesni_encrypt3 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + movups $in1,0x10($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_three: + call _aesni_encrypt3 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + xorps $inout2,$in2 + movups $in1,0x10($out) + movups $in2,0x20($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_four: + call _aesni_encrypt4 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + xorps $inout2,$in2 + movups $in1,0x10($out) + xorps $inout3,$in3 + movups $in2,0x20($out) + movups $in3,0x30($out) + +.Lctr32_done: +___ +$code.=<<___ if ($win64); + movaps 0x20(%rsp),%xmm6 + movaps 0x30(%rsp),%xmm7 + movaps 0x40(%rsp),%xmm8 + movaps 0x50(%rsp),%xmm9 + movaps 0x60(%rsp),%xmm10 + movaps 0x70(%rsp),%xmm11 + movaps 0x80(%rsp),%xmm12 + movaps 0x90(%rsp),%xmm13 + movaps 0xa0(%rsp),%xmm14 + movaps 0xb0(%rsp),%xmm15 + lea 0xc8(%rsp),%rsp +.Lctr32_ret: +___ +$code.=<<___; + ret +.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks +___ +} + +###################################################################### +# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2 +# const unsigned char iv[16]); +# +{ +my @tweak=map("%xmm$_",(10..15)); +my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); +my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); +my $frame_size = 0x68 + ($win64?160:0); + +$code.=<<___; +.globl aesni_xts_encrypt +.type aesni_xts_encrypt,\@function,6 +.align 16 +aesni_xts_encrypt: + lea -$frame_size(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,0x60(%rsp) + movaps %xmm7,0x70(%rsp) + movaps %xmm8,0x80(%rsp) + movaps %xmm9,0x90(%rsp) + movaps %xmm10,0xa0(%rsp) + movaps %xmm11,0xb0(%rsp) + movaps %xmm12,0xc0(%rsp) + movaps %xmm13,0xd0(%rsp) + movaps %xmm14,0xe0(%rsp) + movaps %xmm15,0xf0(%rsp) +.Lxts_enc_body: +___ +$code.=<<___; + movups ($ivp),@tweak[5] # load clear-text tweak + mov 240(%r8),$rounds # key2->rounds + mov 240($key),$rnds_ # key1->rounds +___ + # generate the tweak + &aesni_generate1("enc",$key2,$rounds,@tweak[5]); +$code.=<<___; + mov $key,$key_ # backup $key + mov $rnds_,$rounds # backup $rounds + mov $len,$len_ # backup $len + and \$-16,$len + + movdqa .Lxts_magic(%rip),$twmask + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp # broadcast upper bits +___ + for ($i=0;$i<4;$i++) { + $code.=<<___; + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[$i] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + pand $twmask,$twres # isolate carry and residue + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] +___ + } +$code.=<<___; + sub \$16*6,$len + jc .Lxts_enc_short + + shr \$1,$rounds + sub \$1,$rounds + mov $rounds,$rnds_ + jmp .Lxts_enc_grandloop + +.align 16 +.Lxts_enc_grandloop: + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu `16*0`($inp),$inout0 # load input + pand $twmask,$twres # isolate carry and residue + movdqu `16*1`($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu `16*2`($inp),$inout2 + pxor @tweak[0],$inout0 # input^=tweak + movdqu `16*3`($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu `16*4`($inp),$inout4 + pxor @tweak[2],$inout2 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + pxor @tweak[3],$inout3 + $movkey ($key_),$rndkey0 + pxor @tweak[4],$inout4 + pxor @tweak[5],$inout5 + + # inline _aesni_encrypt6 and interleave first and last rounds + # with own code... + $movkey 16($key_),$rndkey1 + pxor $rndkey0,$inout0 + pxor $rndkey0,$inout1 + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks + aesenc $rndkey1,$inout0 + lea 32($key_),$key + pxor $rndkey0,$inout2 + movdqa @tweak[1],`16*1`(%rsp) + aesenc $rndkey1,$inout1 + pxor $rndkey0,$inout3 + movdqa @tweak[2],`16*2`(%rsp) + aesenc $rndkey1,$inout2 + pxor $rndkey0,$inout4 + movdqa @tweak[3],`16*3`(%rsp) + aesenc $rndkey1,$inout3 + pxor $rndkey0,$inout5 + $movkey ($key),$rndkey0 + dec $rounds + movdqa @tweak[4],`16*4`(%rsp) + aesenc $rndkey1,$inout4 + movdqa @tweak[5],`16*5`(%rsp) + aesenc $rndkey1,$inout5 + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp + jmp .Lxts_enc_loop6_enter + +.align 16 +.Lxts_enc_loop6: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + dec $rounds + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 +.Lxts_enc_loop6_enter: + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + lea 32($key),$key + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey ($key),$rndkey0 + jnz .Lxts_enc_loop6 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenc $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenc $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcast upper bits + aesenc $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey 16($key),$rndkey1 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[0] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenc $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenc $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesenc $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey 32($key),$rndkey0 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[1] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenc $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenc $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesenc $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[2] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenclast $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenclast $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesenclast $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesenclast $rndkey0,$inout3 + aesenclast $rndkey0,$inout4 + aesenclast $rndkey0,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[3] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + xorps `16*0`(%rsp),$inout0 # output^=tweak + pand $twmask,$twres # isolate carry and residue + xorps `16*1`(%rsp),$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] + + xorps `16*2`(%rsp),$inout2 + movups $inout0,`16*0`($out) # write output + xorps `16*3`(%rsp),$inout3 + movups $inout1,`16*1`($out) + xorps `16*4`(%rsp),$inout4 + movups $inout2,`16*2`($out) + xorps `16*5`(%rsp),$inout5 + movups $inout3,`16*3`($out) + mov $rnds_,$rounds # restore $rounds + movups $inout4,`16*4`($out) + movups $inout5,`16*5`($out) + lea `16*6`($out),$out + sub \$16*6,$len + jnc .Lxts_enc_grandloop + + lea 3($rounds,$rounds),$rounds # restore original value + mov $key_,$key # restore $key + mov $rounds,$rnds_ # backup $rounds + +.Lxts_enc_short: + add \$16*6,$len + jz .Lxts_enc_done + + cmp \$0x20,$len + jb .Lxts_enc_one + je .Lxts_enc_two + + cmp \$0x40,$len + jb .Lxts_enc_three + je .Lxts_enc_four + + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu ($inp),$inout0 + pand $twmask,$twres # isolate carry and residue + movdqu 16*1($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu 16*2($inp),$inout2 + pxor @tweak[0],$inout0 + movdqu 16*3($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu 16*4($inp),$inout4 + lea 16*5($inp),$inp + pxor @tweak[2],$inout2 + pxor @tweak[3],$inout3 + pxor @tweak[4],$inout4 + + call _aesni_encrypt6 + + xorps @tweak[0],$inout0 + movdqa @tweak[5],@tweak[0] + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movdqu $inout0,($out) + xorps @tweak[3],$inout3 + movdqu $inout1,16*1($out) + xorps @tweak[4],$inout4 + movdqu $inout2,16*2($out) + movdqu $inout3,16*3($out) + movdqu $inout4,16*4($out) + lea 16*5($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_one: + movups ($inp),$inout0 + lea 16*1($inp),$inp + xorps @tweak[0],$inout0 +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movdqa @tweak[1],@tweak[0] + movups $inout0,($out) + lea 16*1($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_two: + movups ($inp),$inout0 + movups 16($inp),$inout1 + lea 32($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + + call _aesni_encrypt3 + + xorps @tweak[0],$inout0 + movdqa @tweak[2],@tweak[0] + xorps @tweak[1],$inout1 + movups $inout0,($out) + movups $inout1,16*1($out) + lea 16*2($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_three: + movups ($inp),$inout0 + movups 16*1($inp),$inout1 + movups 16*2($inp),$inout2 + lea 16*3($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + + call _aesni_encrypt3 + + xorps @tweak[0],$inout0 + movdqa @tweak[3],@tweak[0] + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movups $inout0,($out) + movups $inout1,16*1($out) + movups $inout2,16*2($out) + lea 16*3($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_four: + movups ($inp),$inout0 + movups 16*1($inp),$inout1 + movups 16*2($inp),$inout2 + xorps @tweak[0],$inout0 + movups 16*3($inp),$inout3 + lea 16*4($inp),$inp + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + xorps @tweak[3],$inout3 + + call _aesni_encrypt4 + + xorps @tweak[0],$inout0 + movdqa @tweak[5],@tweak[0] + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movups $inout0,($out) + xorps @tweak[3],$inout3 + movups $inout1,16*1($out) + movups $inout2,16*2($out) + movups $inout3,16*3($out) + lea 16*4($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_done: + and \$15,$len_ + jz .Lxts_enc_ret + mov $len_,$len + +.Lxts_enc_steal: + movzb ($inp),%eax # borrow $rounds ... + movzb -16($out),%ecx # ... and $key + lea 1($inp),$inp + mov %al,-16($out) + mov %cl,0($out) + lea 1($out),$out + sub \$1,$len + jnz .Lxts_enc_steal + + sub $len_,$out # rewind $out + mov $key_,$key # restore $key + mov $rnds_,$rounds # restore $rounds + + movups -16($out),$inout0 + xorps @tweak[0],$inout0 +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movups $inout0,-16($out) + +.Lxts_enc_ret: +___ +$code.=<<___ if ($win64); + movaps 0x60(%rsp),%xmm6 + movaps 0x70(%rsp),%xmm7 + movaps 0x80(%rsp),%xmm8 + movaps 0x90(%rsp),%xmm9 + movaps 0xa0(%rsp),%xmm10 + movaps 0xb0(%rsp),%xmm11 + movaps 0xc0(%rsp),%xmm12 + movaps 0xd0(%rsp),%xmm13 + movaps 0xe0(%rsp),%xmm14 + movaps 0xf0(%rsp),%xmm15 +___ +$code.=<<___; + lea $frame_size(%rsp),%rsp +.Lxts_enc_epilogue: + ret +.size aesni_xts_encrypt,.-aesni_xts_encrypt ___ -} -###################################################################### -# void aesni_ctr32_encrypt_blocks (const void *in, void *out, -# size_t blocks, const AES_KEY *key, -# const char *ivec); -# -# Handles only complete blocks, operates on 32-bit counter and -# does not update *ivec! (see engine/eng_aesni.c for details) -# -my $increment="%xmm10"; -my $bswap_mask="%xmm11"; $code.=<<___; -.globl aesni_ctr32_encrypt_blocks -.type aesni_ctr32_encrypt_blocks,\@function,5 +.globl aesni_xts_decrypt +.type aesni_xts_decrypt,\@function,6 .align 16 -aesni_ctr32_encrypt_blocks: +aesni_xts_decrypt: + lea -$frame_size(%rsp),%rsp ___ $code.=<<___ if ($win64); - lea -0x68(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) - movaps %xmm8,0x20(%rsp) - movaps %xmm9,0x30(%rsp) - movaps %xmm10,0x40(%rsp) - movaps %xmm11,0x50(%rsp) - -.Lctr32_body: + movaps %xmm6,0x60(%rsp) + movaps %xmm7,0x70(%rsp) + movaps %xmm8,0x80(%rsp) + movaps %xmm9,0x90(%rsp) + movaps %xmm10,0xa0(%rsp) + movaps %xmm11,0xb0(%rsp) + movaps %xmm12,0xc0(%rsp) + movaps %xmm13,0xd0(%rsp) + movaps %xmm14,0xe0(%rsp) + movaps %xmm15,0xf0(%rsp) +.Lxts_dec_body: ___ $code.=<<___; - cmp \$1,$len - je .Lctr32_one_shortcut - - movdqu ($ivp),$inout3 - movdqa .Lincrement32(%rip),$increment - movdqa .Lbswap_mask(%rip),$bswap_mask - xor $rounds,$rounds - pextrd \$3,$inout3,$rnds_ # pull 32-bit counter - pinsrd \$3,$rounds,$inout3 # wipe 32-bit counter + movups ($ivp),@tweak[5] # load clear-text tweak + mov 240($key2),$rounds # key2->rounds + mov 240($key),$rnds_ # key1->rounds +___ + # generate the tweak + &aesni_generate1("enc",$key2,$rounds,@tweak[5]); +$code.=<<___; + xor %eax,%eax # if ($len%16) len-=16; + test \$15,$len + setnz %al + shl \$4,%rax + sub %rax,$len + + mov $key,$key_ # backup $key + mov $rnds_,$rounds # backup $rounds + mov $len,$len_ # backup $len + and \$-16,$len - mov 240($key),$rounds # key->rounds - pxor $iv,$iv # vector of 3 32-bit counters - bswap $rnds_ - pinsrd \$0,$rnds_,$iv - inc $rnds_ - pinsrd \$1,$rnds_,$iv - inc $rnds_ - pinsrd \$2,$rnds_,$iv - pshufb $bswap_mask,$iv + movdqa .Lxts_magic(%rip),$twmask + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp # broadcast upper bits +___ + for ($i=0;$i<4;$i++) { + $code.=<<___; + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[$i] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + pand $twmask,$twres # isolate carry and residue + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] +___ + } +$code.=<<___; + sub \$16*6,$len + jc .Lxts_dec_short - cmp \$4,$len - jbe .Lctr32_tail + shr \$1,$rounds + sub \$1,$rounds mov $rounds,$rnds_ - mov $key,$key_ - sub \$4,$len - -.Lctr32_loop3: - pshufd \$`3<<6`,$iv,$inout0 # place counter to upper dword - pshufd \$`2<<6`,$iv,$inout1 - por $inout3,$inout0 # merge counter-less ivec - pshufd \$`1<<6`,$iv,$inout2 - por $inout3,$inout1 - por $inout3,$inout2 + jmp .Lxts_dec_grandloop - # inline _aesni_encrypt3 and interleave last round +.align 16 +.Lxts_dec_grandloop: + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu `16*0`($inp),$inout0 # load input + pand $twmask,$twres # isolate carry and residue + movdqu `16*1`($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu `16*2`($inp),$inout2 + pxor @tweak[0],$inout0 # input^=tweak + movdqu `16*3`($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu `16*4`($inp),$inout4 + pxor @tweak[2],$inout2 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + pxor @tweak[3],$inout3 + $movkey ($key_),$rndkey0 + pxor @tweak[4],$inout4 + pxor @tweak[5],$inout5 + + # inline _aesni_decrypt6 and interleave first and last rounds # with own code... + $movkey 16($key_),$rndkey1 + pxor $rndkey0,$inout0 + pxor $rndkey0,$inout1 + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks + aesdec $rndkey1,$inout0 + lea 32($key_),$key + pxor $rndkey0,$inout2 + movdqa @tweak[1],`16*1`(%rsp) + aesdec $rndkey1,$inout1 + pxor $rndkey0,$inout3 + movdqa @tweak[2],`16*2`(%rsp) + aesdec $rndkey1,$inout2 + pxor $rndkey0,$inout4 + movdqa @tweak[3],`16*3`(%rsp) + aesdec $rndkey1,$inout3 + pxor $rndkey0,$inout5 + $movkey ($key),$rndkey0 + dec $rounds + movdqa @tweak[4],`16*4`(%rsp) + aesdec $rndkey1,$inout4 + movdqa @tweak[5],`16*5`(%rsp) + aesdec $rndkey1,$inout5 + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp + jmp .Lxts_dec_loop6_enter - $movkey ($key),$rndkey0 - shr \$1,$rounds - $movkey 16($key),$rndkey1 - lea 32($key),$key - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - pxor $rndkey0,$inout2 - $movkey ($key),$rndkey0 - jmp .Lctr32_enc_loop3 .align 16 -.Lctr32_enc_loop3: - aesenc $rndkey1,$inout0 - aesenc $rndkey1,$inout1 +.Lxts_dec_loop6: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 dec $rounds - aesenc $rndkey1,$inout2 + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 +.Lxts_dec_loop6_enter: $movkey 16($key),$rndkey1 - aesenc $rndkey0,$inout0 - aesenc $rndkey0,$inout1 + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 lea 32($key),$key - aesenc $rndkey0,$inout2 + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 $movkey ($key),$rndkey0 - jnz .Lctr32_enc_loop3 + jnz .Lxts_dec_loop6 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdec $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdec $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcast upper bits + aesdec $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey 16($key),$rndkey1 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[0] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdec $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdec $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesdec $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey 32($key),$rndkey0 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[1] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdec $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdec $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesdec $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[2] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdeclast $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdeclast $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesdeclast $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesdeclast $rndkey0,$inout3 + aesdeclast $rndkey0,$inout4 + aesdeclast $rndkey0,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[3] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + xorps `16*0`(%rsp),$inout0 # output^=tweak + pand $twmask,$twres # isolate carry and residue + xorps `16*1`(%rsp),$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] + + xorps `16*2`(%rsp),$inout2 + movups $inout0,`16*0`($out) # write output + xorps `16*3`(%rsp),$inout3 + movups $inout1,`16*1`($out) + xorps `16*4`(%rsp),$inout4 + movups $inout2,`16*2`($out) + xorps `16*5`(%rsp),$inout5 + movups $inout3,`16*3`($out) + mov $rnds_,$rounds # restore $rounds + movups $inout4,`16*4`($out) + movups $inout5,`16*5`($out) + lea `16*6`($out),$out + sub \$16*6,$len + jnc .Lxts_dec_grandloop + + lea 3($rounds,$rounds),$rounds # restore original value + mov $key_,$key # restore $key + mov $rounds,$rnds_ # backup $rounds + +.Lxts_dec_short: + add \$16*6,$len + jz .Lxts_dec_done - aesenc $rndkey1,$inout0 - aesenc $rndkey1,$inout1 - aesenc $rndkey1,$inout2 + cmp \$0x20,$len + jb .Lxts_dec_one + je .Lxts_dec_two - pshufb $bswap_mask,$iv - movdqu ($inp),$in0 - aesenclast $rndkey0,$inout0 - movdqu 0x10($inp),$in1 - paddd $increment,$iv - aesenclast $rndkey0,$inout1 - movdqu 0x20($inp),$in2 - pshufb $bswap_mask,$iv - aesenclast $rndkey0,$inout2 - lea 0x30($inp),$inp + cmp \$0x40,$len + jb .Lxts_dec_three + je .Lxts_dec_four + + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu ($inp),$inout0 + pand $twmask,$twres # isolate carry and residue + movdqu 16*1($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu 16*2($inp),$inout2 + pxor @tweak[0],$inout0 + movdqu 16*3($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu 16*4($inp),$inout4 + lea 16*5($inp),$inp + pxor @tweak[2],$inout2 + pxor @tweak[3],$inout3 + pxor @tweak[4],$inout4 + + call _aesni_decrypt6 + + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movdqu $inout0,($out) + xorps @tweak[3],$inout3 + movdqu $inout1,16*1($out) + xorps @tweak[4],$inout4 + movdqu $inout2,16*2($out) + pxor $twtmp,$twtmp + movdqu $inout3,16*3($out) + pcmpgtd @tweak[5],$twtmp + movdqu $inout4,16*4($out) + lea 16*5($out),$out + pshufd \$0x13,$twtmp,@tweak[1] # $twres + and \$15,$len_ + jz .Lxts_dec_ret + + movdqa @tweak[5],@tweak[0] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + pand $twmask,@tweak[1] # isolate carry and residue + pxor @tweak[5],@tweak[1] + jmp .Lxts_dec_done2 - mov $key_,$key - pxor $inout0,$in0 - sub \$3,$len - mov $rnds_,$rounds - pxor $inout1,$in1 - movdqu $in0,($out) - pxor $inout2,$in2 - movdqu $in1,0x10($out) - movdqu $in2,0x20($out) - lea 0x30($out),$out - ja .Lctr32_loop3 +.align 16 +.Lxts_dec_one: + movups ($inp),$inout0 + lea 16*1($inp),$inp + xorps @tweak[0],$inout0 +___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movdqa @tweak[1],@tweak[0] + movups $inout0,($out) + movdqa @tweak[2],@tweak[1] + lea 16*1($out),$out + jmp .Lxts_dec_done - pextrd \$1,$iv,$rnds_ # might need last counter value - add \$4,$len - bswap $rnds_ +.align 16 +.Lxts_dec_two: + movups ($inp),$inout0 + movups 16($inp),$inout1 + lea 32($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 -.Lctr32_tail: - pshufd \$`3<<6`,$iv,$inout0 - pshufd \$`2<<6`,$iv,$inout1 - por $inout3,$inout0 - movdqu ($inp),$in0 - cmp \$2,$len - jb .Lctr32_one - lea 1($rnds_),$rnds_ - pshufd \$`1<<6`,$iv,$inout2 - por $inout3,$inout1 - movdqu 0x10($inp),$in1 - je .Lctr32_two - bswap $rnds_ - por $inout3,$inout2 - movdqu 0x20($inp),$in2 - cmp \$3,$len - je .Lctr32_three + call _aesni_decrypt3 - pinsrd \$3,$rnds_,$inout3 # compose last counter value - movdqu 0x30($inp),$iv + xorps @tweak[0],$inout0 + movdqa @tweak[2],@tweak[0] + xorps @tweak[1],$inout1 + movdqa @tweak[3],@tweak[1] + movups $inout0,($out) + movups $inout1,16*1($out) + lea 16*2($out),$out + jmp .Lxts_dec_done - call _aesni_encrypt4 +.align 16 +.Lxts_dec_three: + movups ($inp),$inout0 + movups 16*1($inp),$inout1 + movups 16*2($inp),$inout2 + lea 16*3($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 - pxor $inout0,$in0 - pxor $inout1,$in1 - pxor $inout2,$in2 - movdqu $in0,($out) - pxor $inout3,$iv - movdqu $in1,0x10($out) - movdqu $in2,0x20($out) - movdqu $iv,0x30($out) - jmp .Lctr32_done + call _aesni_decrypt3 -.align 16 -.Lctr32_one_shortcut: - movdqu ($ivp),$inout0 - movdqu ($inp),$in0 - mov 240($key),$rounds # key->rounds -.Lctr32_one: -___ - &aesni_generate1("enc",$key,$rounds); -$code.=<<___; - pxor $inout0,$in0 - movdqu $in0,($out) - jmp .Lctr32_done + xorps @tweak[0],$inout0 + movdqa @tweak[3],@tweak[0] + xorps @tweak[1],$inout1 + movdqa @tweak[5],@tweak[1] + xorps @tweak[2],$inout2 + movups $inout0,($out) + movups $inout1,16*1($out) + movups $inout2,16*2($out) + lea 16*3($out),$out + jmp .Lxts_dec_done .align 16 -.Lctr32_two: - pxor $inout2,$inout2 - call _aesni_encrypt3 - pxor $inout0,$in0 - pxor $inout1,$in1 - movdqu $in0,($out) - movdqu $in1,0x10($out) - jmp .Lctr32_done +.Lxts_dec_four: + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movups ($inp),$inout0 + pand $twmask,$twres # isolate carry and residue + movups 16*1($inp),$inout1 + pxor $twres,@tweak[5] + + movups 16*2($inp),$inout2 + xorps @tweak[0],$inout0 + movups 16*3($inp),$inout3 + lea 16*4($inp),$inp + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + xorps @tweak[3],$inout3 + + call _aesni_decrypt4 + + xorps @tweak[0],$inout0 + movdqa @tweak[4],@tweak[0] + xorps @tweak[1],$inout1 + movdqa @tweak[5],@tweak[1] + xorps @tweak[2],$inout2 + movups $inout0,($out) + xorps @tweak[3],$inout3 + movups $inout1,16*1($out) + movups $inout2,16*2($out) + movups $inout3,16*3($out) + lea 16*4($out),$out + jmp .Lxts_dec_done .align 16 -.Lctr32_three: - call _aesni_encrypt3 - pxor $inout0,$in0 - pxor $inout1,$in1 - pxor $inout2,$in2 - movdqu $in0,($out) - movdqu $in1,0x10($out) - movdqu $in2,0x20($out) +.Lxts_dec_done: + and \$15,$len_ + jz .Lxts_dec_ret +.Lxts_dec_done2: + mov $len_,$len + mov $key_,$key # restore $key + mov $rnds_,$rounds # restore $rounds -.Lctr32_done: + movups ($inp),$inout0 + xorps @tweak[1],$inout0 +___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + xorps @tweak[1],$inout0 + movups $inout0,($out) + +.Lxts_dec_steal: + movzb 16($inp),%eax # borrow $rounds ... + movzb ($out),%ecx # ... and $key + lea 1($inp),$inp + mov %al,($out) + mov %cl,16($out) + lea 1($out),$out + sub \$1,$len + jnz .Lxts_dec_steal + + sub $len_,$out # rewind $out + mov $key_,$key # restore $key + mov $rnds_,$rounds # restore $rounds + + movups ($out),$inout0 + xorps @tweak[0],$inout0 ___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movups $inout0,($out) +.Lxts_dec_ret: +___ $code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - movaps 0x20(%rsp),%xmm8 - movaps 0x30(%rsp),%xmm9 - movaps 0x40(%rsp),%xmm10 - movaps 0x50(%rsp),%xmm11 - lea 0x68(%rsp),%rsp -.Lctr32_ret: + movaps 0x60(%rsp),%xmm6 + movaps 0x70(%rsp),%xmm7 + movaps 0x80(%rsp),%xmm8 + movaps 0x90(%rsp),%xmm9 + movaps 0xa0(%rsp),%xmm10 + movaps 0xb0(%rsp),%xmm11 + movaps 0xc0(%rsp),%xmm12 + movaps 0xd0(%rsp),%xmm13 + movaps 0xe0(%rsp),%xmm14 + movaps 0xf0(%rsp),%xmm15 ___ $code.=<<___; + lea $frame_size(%rsp),%rsp +.Lxts_dec_epilogue: ret -.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks +.size aesni_xts_decrypt,.-aesni_xts_decrypt ___ -}} +} }} ######################################################################## # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); -$reserved = $win64?0x40:-0x18; # used in decrypt +{ +my $reserved = $win64?0x40:-0x18; # used in decrypt $code.=<<___; .globl ${PREFIX}_cbc_encrypt .type ${PREFIX}_cbc_encrypt,\@function,6 @@ -787,12 +2123,12 @@ ${PREFIX}_cbc_encrypt: test $len,$len # check length jz .Lcbc_ret - mov 240($key),$rnds_ # pull $rounds + mov 240($key),$rnds_ # key->rounds mov $key,$key_ # backup $key test %r9d,%r9d # 6th argument jz .Lcbc_decrypt #--------------------------- CBC ENCRYPT ------------------------------# - movdqu ($ivp),$inout0 # load iv as initial state + movups ($ivp),$inout0 # load iv as initial state mov $rnds_,$rounds cmp \$16,$len jb .Lcbc_enc_tail @@ -800,11 +2136,11 @@ ${PREFIX}_cbc_encrypt: jmp .Lcbc_enc_loop .align 16 .Lcbc_enc_loop: - movdqu ($inp),$inout1 # load input + movups ($inp),$inout1 # load input lea 16($inp),$inp - pxor $inout1,$inout0 + #xorps $inout1,$inout0 ___ - &aesni_generate1("enc",$key,$rounds); + &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); $code.=<<___; mov $rnds_,$rounds # restore $rounds mov $key_,$key # restore $key @@ -846,106 +2182,251 @@ ___ $code.=<<___; movups ($ivp),$iv mov $rnds_,$rounds - cmp \$0x40,$len + cmp \$0x70,$len jbe .Lcbc_dec_tail - sub \$0x40,$len - jmp .Lcbc_dec_loop3 + shr \$1,$rnds_ + sub \$0x70,$len + mov $rnds_,$rounds + movaps $iv,$reserved(%rsp) + jmp .Lcbc_dec_loop8_enter .align 16 -.Lcbc_dec_loop3: - movups ($inp),$inout0 +.Lcbc_dec_loop8: + movaps $rndkey0,$reserved(%rsp) # save IV + movups $inout7,($out) + lea 0x10($out),$out +.Lcbc_dec_loop8_enter: + $movkey ($key),$rndkey0 + movups ($inp),$inout0 # load input movups 0x10($inp),$inout1 - movups 0x20($inp),$inout2 - movaps $inout0,$in0 - movaps $inout1,$in1 - movaps $inout2,$in2 - - call _aesni_decrypt3 + $movkey 16($key),$rndkey1 - sub \$0x30,$len - lea 0x30($inp),$inp - lea 0x30($out),$out - pxor $iv,$inout0 - pxor $in0,$inout1 - movaps $in2,$iv - pxor $in1,$inout2 - movdqu $inout0,-0x30($out) - mov $rnds_,$rounds # restore $rounds - movdqu $inout1,-0x20($out) - mov $key_,$key # restore $key - movdqu $inout2,-0x10($out) - ja .Lcbc_dec_loop3 + lea 32($key),$key + movdqu 0x20($inp),$inout2 + xorps $rndkey0,$inout0 + movdqu 0x30($inp),$inout3 + xorps $rndkey0,$inout1 + movdqu 0x40($inp),$inout4 + aesdec $rndkey1,$inout0 + pxor $rndkey0,$inout2 + movdqu 0x50($inp),$inout5 + aesdec $rndkey1,$inout1 + pxor $rndkey0,$inout3 + movdqu 0x60($inp),$inout6 + aesdec $rndkey1,$inout2 + pxor $rndkey0,$inout4 + movdqu 0x70($inp),$inout7 + aesdec $rndkey1,$inout3 + pxor $rndkey0,$inout5 + dec $rounds + aesdec $rndkey1,$inout4 + pxor $rndkey0,$inout6 + aesdec $rndkey1,$inout5 + pxor $rndkey0,$inout7 + $movkey ($key),$rndkey0 + aesdec $rndkey1,$inout6 + aesdec $rndkey1,$inout7 + $movkey 16($key),$rndkey1 - add \$0x40,$len - movups $iv,($ivp) + call .Ldec_loop8_enter + + movups ($inp),$rndkey1 # re-load input + movups 0x10($inp),$rndkey0 + xorps $reserved(%rsp),$inout0 # ^= IV + xorps $rndkey1,$inout1 + movups 0x20($inp),$rndkey1 + xorps $rndkey0,$inout2 + movups 0x30($inp),$rndkey0 + xorps $rndkey1,$inout3 + movups 0x40($inp),$rndkey1 + xorps $rndkey0,$inout4 + movups 0x50($inp),$rndkey0 + xorps $rndkey1,$inout5 + movups 0x60($inp),$rndkey1 + xorps $rndkey0,$inout6 + movups 0x70($inp),$rndkey0 # IV + xorps $rndkey1,$inout7 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + mov $rnds_,$rounds # restore $rounds + movups $inout4,0x40($out) + mov $key_,$key # restore $key + movups $inout5,0x50($out) + lea 0x80($inp),$inp + movups $inout6,0x60($out) + lea 0x70($out),$out + sub \$0x80,$len + ja .Lcbc_dec_loop8 + + movaps $inout7,$inout0 + movaps $rndkey0,$iv + add \$0x70,$len + jle .Lcbc_dec_tail_collected + movups $inout0,($out) + lea 1($rnds_,$rnds_),$rounds + lea 0x10($out),$out .Lcbc_dec_tail: movups ($inp),$inout0 movaps $inout0,$in0 cmp \$0x10,$len jbe .Lcbc_dec_one + movups 0x10($inp),$inout1 movaps $inout1,$in1 cmp \$0x20,$len jbe .Lcbc_dec_two + movups 0x20($inp),$inout2 movaps $inout2,$in2 cmp \$0x30,$len jbe .Lcbc_dec_three + movups 0x30($inp),$inout3 - call _aesni_decrypt4 - pxor $iv,$inout0 - movups 0x30($inp),$iv - pxor $in0,$inout1 - movdqu $inout0,($out) - pxor $in1,$inout2 - movdqu $inout1,0x10($out) - pxor $in2,$inout3 - movdqu $inout2,0x20($out) - movdqa $inout3,$inout0 - lea 0x30($out),$out + cmp \$0x40,$len + jbe .Lcbc_dec_four + + movups 0x40($inp),$inout4 + cmp \$0x50,$len + jbe .Lcbc_dec_five + + movups 0x50($inp),$inout5 + cmp \$0x60,$len + jbe .Lcbc_dec_six + + movups 0x60($inp),$inout6 + movaps $iv,$reserved(%rsp) # save IV + call _aesni_decrypt8 + movups ($inp),$rndkey1 + movups 0x10($inp),$rndkey0 + xorps $reserved(%rsp),$inout0 # ^= IV + xorps $rndkey1,$inout1 + movups 0x20($inp),$rndkey1 + xorps $rndkey0,$inout2 + movups 0x30($inp),$rndkey0 + xorps $rndkey1,$inout3 + movups 0x40($inp),$rndkey1 + xorps $rndkey0,$inout4 + movups 0x50($inp),$rndkey0 + xorps $rndkey1,$inout5 + movups 0x60($inp),$iv # IV + xorps $rndkey0,$inout6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + lea 0x60($out),$out + movaps $inout6,$inout0 + sub \$0x70,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_one: ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; - pxor $iv,$inout0 + xorps $iv,$inout0 movaps $in0,$iv + sub \$0x10,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_two: - pxor $inout2,$inout2 + xorps $inout2,$inout2 call _aesni_decrypt3 - pxor $iv,$inout0 - pxor $in0,$inout1 - movdqu $inout0,($out) + xorps $iv,$inout0 + xorps $in0,$inout1 + movups $inout0,($out) movaps $in1,$iv - movdqa $inout1,$inout0 + movaps $inout1,$inout0 lea 0x10($out),$out + sub \$0x20,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_three: call _aesni_decrypt3 - pxor $iv,$inout0 - pxor $in0,$inout1 - movdqu $inout0,($out) - pxor $in1,$inout2 - movdqu $inout1,0x10($out) + xorps $iv,$inout0 + xorps $in0,$inout1 + movups $inout0,($out) + xorps $in1,$inout2 + movups $inout1,0x10($out) movaps $in2,$iv - movdqa $inout2,$inout0 + movaps $inout2,$inout0 lea 0x20($out),$out + sub \$0x30,$len + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_four: + call _aesni_decrypt4 + xorps $iv,$inout0 + movups 0x30($inp),$iv + xorps $in0,$inout1 + movups $inout0,($out) + xorps $in1,$inout2 + movups $inout1,0x10($out) + xorps $in2,$inout3 + movups $inout2,0x20($out) + movaps $inout3,$inout0 + lea 0x30($out),$out + sub \$0x40,$len + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_five: + xorps $inout5,$inout5 + call _aesni_decrypt6 + movups 0x10($inp),$rndkey1 + movups 0x20($inp),$rndkey0 + xorps $iv,$inout0 + xorps $in0,$inout1 + xorps $rndkey1,$inout2 + movups 0x30($inp),$rndkey1 + xorps $rndkey0,$inout3 + movups 0x40($inp),$iv + xorps $rndkey1,$inout4 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + lea 0x40($out),$out + movaps $inout4,$inout0 + sub \$0x50,$len + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_six: + call _aesni_decrypt6 + movups 0x10($inp),$rndkey1 + movups 0x20($inp),$rndkey0 + xorps $iv,$inout0 + xorps $in0,$inout1 + xorps $rndkey1,$inout2 + movups 0x30($inp),$rndkey1 + xorps $rndkey0,$inout3 + movups 0x40($inp),$rndkey0 + xorps $rndkey1,$inout4 + movups 0x50($inp),$iv + xorps $rndkey0,$inout5 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + lea 0x50($out),$out + movaps $inout5,$inout0 + sub \$0x60,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_tail_collected: and \$15,$len movups $iv,($ivp) jnz .Lcbc_dec_tail_partial - movdqu $inout0,($out) + movups $inout0,($out) jmp .Lcbc_dec_ret .align 16 .Lcbc_dec_tail_partial: movaps $inout0,$reserved(%rsp) + mov \$16,%rcx mov $out,%rdi - mov $len,%rcx + sub $len,%rcx lea $reserved(%rsp),%rsi .long 0x9066A4F3 # rep movsb @@ -963,7 +2444,7 @@ $code.=<<___; ret .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt ___ - +} # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, # int bits, AES_KEY *key) { my ($inp,$bits,$key) = @_4args; @@ -1033,7 +2514,7 @@ __aesni_set_encrypt_key: jz .Lenc_key_ret movups ($inp),%xmm0 # pull first 128 bits of *userKey - pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0 + xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 lea 16($key),%rax cmp \$256,$bits je .L14rounds @@ -1148,11 +2629,11 @@ __aesni_set_encrypt_key: lea 16(%rax),%rax .Lkey_expansion_128_cold: shufps \$0b00010000,%xmm0,%xmm4 - pxor %xmm4, %xmm0 + xorps %xmm4, %xmm0 shufps \$0b10001100,%xmm0,%xmm4 - pxor %xmm4, %xmm0 - pshufd \$0b11111111,%xmm1,%xmm1 # critical path - pxor %xmm1,%xmm0 + xorps %xmm4, %xmm0 + shufps \$0b11111111,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm0 ret .align 16 @@ -1163,11 +2644,11 @@ __aesni_set_encrypt_key: movaps %xmm2, %xmm5 .Lkey_expansion_192b_warm: shufps \$0b00010000,%xmm0,%xmm4 - movaps %xmm2,%xmm3 - pxor %xmm4,%xmm0 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 shufps \$0b10001100,%xmm0,%xmm4 pslldq \$4,%xmm3 - pxor %xmm4,%xmm0 + xorps %xmm4,%xmm0 pshufd \$0b01010101,%xmm1,%xmm1 # critical path pxor %xmm3,%xmm2 pxor %xmm1,%xmm0 @@ -1191,11 +2672,11 @@ __aesni_set_encrypt_key: lea 16(%rax),%rax .Lkey_expansion_256a_cold: shufps \$0b00010000,%xmm0,%xmm4 - pxor %xmm4,%xmm0 + xorps %xmm4,%xmm0 shufps \$0b10001100,%xmm0,%xmm4 - pxor %xmm4,%xmm0 - pshufd \$0b11111111,%xmm1,%xmm1 # critical path - pxor %xmm1,%xmm0 + xorps %xmm4,%xmm0 + shufps \$0b11111111,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm0 ret .align 16 @@ -1204,13 +2685,14 @@ __aesni_set_encrypt_key: lea 16(%rax),%rax shufps \$0b00010000,%xmm2,%xmm4 - pxor %xmm4,%xmm2 + xorps %xmm4,%xmm2 shufps \$0b10001100,%xmm2,%xmm4 - pxor %xmm4,%xmm2 - pshufd \$0b10101010,%xmm1,%xmm1 # critical path - pxor %xmm1,%xmm2 + xorps %xmm4,%xmm2 + shufps \$0b10101010,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm2 ret .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key +.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key ___ } @@ -1219,9 +2701,12 @@ $code.=<<___; .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lincrement32: - .long 3,3,3,0 + .long 6,6,6,0 .Lincrement64: .long 1,0,0,0 +.Lxts_magic: + .long 0x87,0,1,0 + .asciz "AES for Intel AES-NI, CRYPTOGAMS by " .align 64 ___ @@ -1253,12 +2738,8 @@ ecb_se_handler: sub \$64,%rsp mov 152($context),%rax # pull context->Rsp - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - jmp .Lcommon_seh_exit + jmp .Lcommon_seh_tail .size ecb_se_handler,.-ecb_se_handler .type ccm64_se_handler,\@abi-omnipotent @@ -1284,29 +2765,22 @@ ccm64_se_handler: mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lin_ccm64_prologue + jae .Lcommon_seh_tail - lea 0(%rax),%rsi # top of stack + lea 0(%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq lea 0x58(%rax),%rax # adjust stack pointer -.Lin_ccm64_prologue: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - jmp .Lcommon_seh_exit + jmp .Lcommon_seh_tail .size ccm64_se_handler,.-ccm64_se_handler .type ctr32_se_handler,\@abi-omnipotent @@ -1328,29 +2802,63 @@ ctr32_se_handler: lea .Lctr32_body(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label - jb .Lin_ctr32_prologue + jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp lea .Lctr32_ret(%rip),%r10 cmp %r10,%rbx - jae .Lin_ctr32_prologue + jae .Lcommon_seh_tail - lea 0(%rax),%rsi # top of stack + lea 0x20(%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 - mov \$12,%ecx # 6*sizeof(%xmm0)/sizeof(%rax) + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq - lea 0x68(%rax),%rax # adjust stack pointer - -.Lin_ctr32_prologue: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi + lea 0xc8(%rax),%rax # adjust stack pointer - jmp .Lcommon_seh_exit + jmp .Lcommon_seh_tail .size ctr32_se_handler,.-ctr32_se_handler + +.type xts_se_handler,\@abi-omnipotent +.align 16 +xts_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue lable + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 0x60(%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0x68+160(%rax),%rax # adjust stack pointer + + jmp .Lcommon_seh_tail +.size xts_se_handler,.-xts_se_handler ___ $code.=<<___; .type cbc_se_handler,\@abi-omnipotent @@ -1372,7 +2880,7 @@ cbc_se_handler: lea .Lcbc_decrypt(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label - jb .Lin_cbc_prologue + jb .Lcommon_seh_tail lea .Lcbc_decrypt_body(%rip),%r10 cmp %r10,%rbx # context->RipRip>="epilogue" label - jae .Lin_cbc_prologue + jae .Lcommon_seh_tail lea 0(%rax),%rsi # top of stack lea 512($context),%rdi # &context.Xmm6 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq lea 0x58(%rax),%rax # adjust stack pointer - jmp .Lin_cbc_prologue + jmp .Lcommon_seh_tail .Lrestore_cbc_rax: mov 120($context),%rax -.Lin_cbc_prologue: + +.Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi -.Lcommon_seh_exit: - mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) @@ -1452,6 +2959,14 @@ $code.=<<___ if ($PREFIX eq "aesni"); .rva .LSEH_begin_aesni_ctr32_encrypt_blocks .rva .LSEH_end_aesni_ctr32_encrypt_blocks .rva .LSEH_info_ctr32 + + .rva .LSEH_begin_aesni_xts_encrypt + .rva .LSEH_end_aesni_xts_encrypt + .rva .LSEH_info_xts_enc + + .rva .LSEH_begin_aesni_xts_decrypt + .rva .LSEH_end_aesni_xts_decrypt + .rva .LSEH_info_xts_dec ___ $code.=<<___; .rva .LSEH_begin_${PREFIX}_cbc_encrypt @@ -1483,6 +2998,14 @@ $code.=<<___ if ($PREFIX eq "aesni"); .LSEH_info_ctr32: .byte 9,0,0,0 .rva ctr32_se_handler +.LSEH_info_xts_enc: + .byte 9,0,0,0 + .rva xts_se_handler + .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] +.LSEH_info_xts_dec: + .byte 9,0,0,0 + .rva xts_se_handler + .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] ___ $code.=<<___; .LSEH_info_cbc: -- 2.34.1