From: Andy Polyakov Date: Sun, 7 Aug 2011 17:47:56 +0000 (+0000) Subject: aes/asm/aesni-*.pl: fix CCM and further optimize it. X-Git-Tag: OpenSSL-fips-2_0-rc1~232 X-Git-Url: https://git.openssl.org/gitweb/?a=commitdiff_plain;ds=sidebyside;h=267b481c47a937d926aca4a9c866af7397fc040d;p=openssl.git aes/asm/aesni-*.pl: fix CCM and further optimize it. modes/ccm128.c: minor branch optimization. --- diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl index b3c8d1f60a..f95bf520d3 100644 --- a/crypto/aes/asm/aesni-x86.pl +++ b/crypto/aes/asm/aesni-x86.pl @@ -594,6 +594,7 @@ if ($PREFIX eq "aesni") { &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($cmac,&QWP(0,$rounds)); # load cmac + &mov ($rounds,&DWP(240,$key)); # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); @@ -602,34 +603,29 @@ if ($PREFIX eq "aesni") { &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack - &mov ($rounds,1); + &mov ($rounds_,1); &xor ($key_,$key_); - &mov (&DWP(16,"esp"),$rounds); + &mov (&DWP(16,"esp"),$rounds_); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); - &movdqa ($inout3,&QWP(0,"esp")); - &pshufb ($ivec,$inout3); # keep iv in reverse order - - &mov ($rounds,&DWP(240,$key)); - &mov ($key_,$key); - &mov ($rounds_,$rounds); + &shr ($rounds,1); + &lea ($key_,&DWP(0,$key)); &movdqa ($inout0,$ivec); + &mov ($rounds_,$rounds); + &movdqa ($inout3,&QWP(0,"esp")); &set_label("ccm64_enc_outer"); - &movups ($in0,&QWP(0,$inp)); - &pshufb ($inout0,$inout3); - &mov ($key,$key_); + &$movekey ($rndkey0,&QWP(0,$key_)); &mov ($rounds,$rounds_); + &movups ($in0,&QWP(0,$inp)); - &$movekey ($rndkey0,&QWP(0,$key)); - &shr ($rounds,1); - &$movekey ($rndkey1,&QWP(16,$key)); - &xorps ($in0,$rndkey0); - &lea ($key,&DWP(32,$key)); &xorps ($inout0,$rndkey0); - &xorps ($cmac,$in0); # cmac^=inp + &$movekey ($rndkey1,&QWP(16,$key_)); + &xorps ($rndkey0,$in0); + &lea ($key,&DWP(32,$key_)); + &xorps 
($cmac,$rndkey0); # cmac^=inp &$movekey ($rndkey0,&QWP(0,$key)); &set_label("ccm64_enc2_loop"); @@ -642,18 +638,20 @@ if ($PREFIX eq "aesni") { &aesenc ($cmac,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("ccm64_enc2_loop")); + &pshufb ($ivec,$inout3); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); + &paddq ($ivec,&QWP(16,"esp")); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); - &paddq ($ivec,&QWP(16,"esp")); &dec ($len); &lea ($inp,&DWP(16,$inp)); &xorps ($in0,$inout0); # inp^=E(ivec) &movdqa ($inout0,$ivec); - &movups (&QWP(0,$out),$in0); + &movups (&QWP(0,$out),$in0); # save output &lea ($out,&DWP(16,$out)); + &pshufb ($ivec,$inout3); &jnz (&label("ccm64_enc_outer")); &mov ("esp",&DWP(48,"esp")); @@ -675,6 +673,7 @@ if ($PREFIX eq "aesni") { &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($cmac,&QWP(0,$rounds)); # load cmac + &mov ($rounds,&DWP(240,$key)); # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); @@ -683,46 +682,45 @@ if ($PREFIX eq "aesni") { &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack - &mov ($rounds,1); + &mov ($rounds_,1); &xor ($key_,$key_); - &mov (&DWP(16,"esp"),$rounds); + &mov (&DWP(16,"esp"),$rounds_); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &movdqa ($inout3,&QWP(0,"esp")); # bswap mask &movdqa ($inout0,$ivec); - &pshufb ($ivec,$inout3); # keep iv in reverse order - &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); &mov ($rounds_,$rounds); + &pshufb ($ivec,$inout3); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } - -&set_label("ccm64_dec_outer"); - &paddq ($ivec,&QWP(16,"esp")); &movups ($in0,&QWP(0,$inp)); # load inp - &xorps ($in0,$inout0); - &movdqa ($inout0,$ivec); + &paddq ($ivec,&QWP(16,"esp")); + &pshufb ($ivec,$inout3); &lea ($inp,&QWP(16,$inp)); - &pshufb ($inout0,$inout3); - &mov ($key,$key_); + &jmp (&label("ccm64_dec_outer")); + 
+&set_label("ccm64_dec_outer",16); + &xorps ($in0,$inout0); # inp ^= E(ivec) + &movdqa ($inout0,$ivec); &mov ($rounds,$rounds_); - &movups (&QWP(0,$out),$in0); + &movups (&QWP(0,$out),$in0); # save output &lea ($out,&DWP(16,$out)); &sub ($len,1); &jz (&label("ccm64_dec_break")); - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(0,$key_)); &shr ($rounds,1); - &$movekey ($rndkey1,&QWP(16,$key)); + &$movekey ($rndkey1,&QWP(16,$key_)); &xorps ($in0,$rndkey0); - &lea ($key,&DWP(32,$key)); + &lea ($key,&DWP(32,$key_)); &xorps ($inout0,$rndkey0); &xorps ($cmac,$in0); # cmac^=out &$movekey ($rndkey0,&QWP(0,$key)); @@ -737,13 +735,18 @@ if ($PREFIX eq "aesni") { &aesenc ($cmac,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("ccm64_dec2_loop")); + &movups ($in0,&QWP(0,$inp)); # load inp + &paddq ($ivec,&QWP(16,"esp")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); + &pshufb ($ivec,$inout3); + &lea ($inp,&QWP(16,$inp)); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_break",16); + &mov ($key,$key_); if ($inline) { &aesni_inline_generate1("enc",$cmac,$in0); } else diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index ae0ad7f809..98c0dd55bf 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -821,8 +821,8 @@ ___ { my $cmac="%r9"; # 6th argument -my $increment="%xmm8"; -my $bswap_mask="%xmm9"; +my $increment="%xmm6"; +my $bswap_mask="%xmm7"; $code.=<<___; .globl aesni_ccm64_encrypt_blocks @@ -839,30 +839,28 @@ $code.=<<___ if ($win64); .Lccm64_enc_body: ___ $code.=<<___; + mov 240($key),$rounds # key->rounds movdqu ($ivp),$iv - movdqu ($cmac),$inout1 movdqa .Lincrement64(%rip),$increment movdqa .Lbswap_mask(%rip),$bswap_mask - pshufb $bswap_mask,$iv # keep iv in reverse order - mov 240($key),$rounds # key->rounds - mov $key,$key_ - mov $rounds,$rnds_ + shr \$1,$rounds + lea 0($key),$key_ + movdqu ($cmac),$inout1 
movdqa $iv,$inout0 - + mov $rounds,$rnds_ + jmp .Lccm64_enc_outer +.align 16 .Lccm64_enc_outer: - movups ($inp),$in0 # load inp - pshufb $bswap_mask,$inout0 - mov $key_,$key + $movkey ($key_),$rndkey0 mov $rnds_,$rounds + movups ($inp),$in0 # load inp - $movkey ($key),$rndkey0 - shr \$1,$rounds - $movkey 16($key),$rndkey1 - xorps $rndkey0,$in0 - lea 32($key),$key - xorps $rndkey0,$inout0 - xorps $inout1,$in0 # cmac^=inp + xorps $rndkey0,$inout0 # counter + $movkey 16($key_),$rndkey1 + xorps $in0,$rndkey0 + lea 32($key_),$key + xorps $rndkey0,$inout1 # cmac^=inp $movkey ($key),$rndkey0 .Lccm64_enc2_loop: @@ -875,18 +873,20 @@ $code.=<<___; aesenc $rndkey0,$inout1 $movkey 0($key),$rndkey0 jnz .Lccm64_enc2_loop + pshufb $bswap_mask,$iv aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 + paddq $increment,$iv aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 - paddq $increment,$iv dec $len lea 16($inp),$inp xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 movups $in0,($out) # save output lea 16($out),$out + pshufb $bswap_mask,$iv jnz .Lccm64_enc_outer movups $inout1,($cmac) @@ -919,39 +919,40 @@ $code.=<<___ if ($win64); .Lccm64_dec_body: ___ $code.=<<___; - movdqu ($ivp),$iv + mov 240($key),$rounds # key->rounds + movups ($ivp),$iv movdqu ($cmac),$inout1 movdqa .Lincrement64(%rip),$increment movdqa .Lbswap_mask(%rip),$bswap_mask - mov 240($key),$rounds # key->rounds - movdqa $iv,$inout0 - pshufb $bswap_mask,$iv # keep iv in reverse order + movaps $iv,$inout0 mov $rounds,$rnds_ mov $key,$key_ + pshufb $bswap_mask,$iv ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; -.Lccm64_dec_outer: - paddq $increment,$iv movups ($inp),$in0 # load inp - xorps $inout0,$in0 - movdqa $iv,$inout0 + paddq $increment,$iv + pshufb $bswap_mask,$iv lea 16($inp),$inp - pshufb $bswap_mask,$inout0 - mov $key_,$key + jmp .Lccm64_dec_outer +.align 16 +.Lccm64_dec_outer: + xorps $inout0,$in0 # inp ^= E(iv) + movdqa $iv,$inout0 mov $rnds_,$rounds - movups $in0,($out) + movups 
$in0,($out) # save output lea 16($out),$out sub \$1,$len jz .Lccm64_dec_break - $movkey ($key),$rndkey0 + $movkey ($key_),$rndkey0 shr \$1,$rounds - $movkey 16($key),$rndkey1 + $movkey 16($key_),$rndkey1 xorps $rndkey0,$in0 - lea 32($key),$key + lea 32($key_),$key xorps $rndkey0,$inout0 xorps $in0,$inout1 # cmac^=out $movkey ($key),$rndkey0 @@ -966,15 +967,21 @@ $code.=<<___; aesenc $rndkey0,$inout1 $movkey 0($key),$rndkey0 jnz .Lccm64_dec2_loop + movups ($inp),$in0 # load inp + paddq $increment,$iv aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 + pshufb $bswap_mask,$iv + lea 16($inp),$inp aesenclast $rndkey0,$inout0 + aesenclast $rndkey0,$inout1 jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_break: + #xorps $in0,$inout1 # cmac^=out ___ - &aesni_generate1("enc",$key,$rounds,$inout1); + &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); $code.=<<___; movups $inout1,($cmac) ___ diff --git a/crypto/modes/ccm128.c b/crypto/modes/ccm128.c index 001fdff658..c9b35e5b35 100644 --- a/crypto/modes/ccm128.c +++ b/crypto/modes/ccm128.c @@ -356,10 +356,10 @@ int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, inp += n; out += n; len -= n; + if (len) ctr64_add(ctx->nonce.c,n/16); } if (len) { - if (n) ctr64_add(ctx->nonce.c,n/16); for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; (*block)(ctx->cmac.c,ctx->cmac.c,key); (*block)(ctx->nonce.c,scratch.c,key); @@ -409,10 +409,10 @@ int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, inp += n; out += n; len -= n; + if (len) ctr64_add(ctx->nonce.c,n/16); } if (len) { - if (n) ctr64_add(ctx->nonce.c,n/16); (*block)(ctx->nonce.c,scratch.c,key); for (i=0; i<len; ++i) ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);