+\f
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+# const AES_KEY *key1, const AES_KEY *key2
+# const unsigned char iv[16]);
+#
+{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
+
+&function_begin("aesni_xts_encrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &and ("esp",-16); # align stack
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+ &sub ($len,16*6);
+ &jc (&label("xts_enc_short"));
+
+ &shr ($rounds,1);
+ &mov ($rounds_,$rounds);
+ &jmp (&label("xts_enc_loop6"));
+
+&set_label("xts_enc_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+ # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &aesenc ($inout0,$rndkey1);
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesenc ($inout1,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &dec ($rounds);
+ &aesenc ($inout2,$rndkey1);
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesenc ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesenc ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesenc ($inout5,$rndkey1);
+ &call (&label("_aesni_encrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov ($rounds,$rounds_); # restore $rounds
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_enc_loop6"));
+
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_enc_short");
+ &add ($len,16*6);
+ &jz (&label("xts_enc_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_enc_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_enc_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_enc_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_enc_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($inout0,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_encrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout2);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_encrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_encrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_enc_done"));
+
+&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+ &movdqa ($inout3,$tweak);
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_enc_steal"));
+
+&set_label("xts_enc_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_enc_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($inout3,$twtmp,0x13);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+&set_label("xts_enc_steal");
+ &movz ($rounds,&BP(0,$inp));
+ &movz ($key,&BP(-16,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(-16,$out),&LB($rounds));
+ &mov (&BP(0,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_enc_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(-16,$out)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(-16,$out),$inout0); # write output
+
+&set_label("xts_enc_ret");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_encrypt");
+
+&function_begin("aesni_xts_decrypt");
+ &mov ($key,&wparam(4)); # key2
+ &mov ($inp,&wparam(5)); # clear-text tweak
+
+ &mov ($rounds,&DWP(240,$key)); # key2->rounds
+ &movups ($inout0,&QWP(0,$inp));
+ if ($inline)
+ { &aesni_inline_generate1("enc"); }
+ else
+ { &call ("_aesni_encrypt1"); }
+
+ &mov ($inp,&wparam(0));
+ &mov ($out,&wparam(1));
+ &mov ($len,&wparam(2));
+ &mov ($key,&wparam(3)); # key1
+
+ &mov ($key_,"esp");
+ &sub ("esp",16*7+8);
+ &and ("esp",-16); # align stack
+
+ &xor ($rounds_,$rounds_); # if(len%16) len-=16;
+ &test ($len,15);
+ &setnz (&LB($rounds_));
+ &shl ($rounds_,4);
+ &sub ($len,$rounds_);
+
+ &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
+ &mov (&DWP(16*6+4,"esp"),0);
+ &mov (&DWP(16*6+8,"esp"),1);
+ &mov (&DWP(16*6+12,"esp"),0);
+ &mov (&DWP(16*7+0,"esp"),$len); # save original $len
+ &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
+
+ &mov ($rounds,&DWP(240,$key)); # key1->rounds
+ &mov ($key_,$key); # backup $key
+ &mov ($rounds_,$rounds); # backup $rounds
+
+ &movdqa ($tweak,$inout0);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+
+ &and ($len,-16);
+ &sub ($len,16*6);
+ &jc (&label("xts_dec_short"));
+
+ &shr ($rounds,1);
+ &mov ($rounds_,$rounds);
+ &jmp (&label("xts_dec_loop6"));
+
+&set_label("xts_dec_loop6",16);
+ for ($i=0;$i<4;$i++) {
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa (&QWP(16*$i,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ }
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*$i++,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &$movekey ($rndkey0,&QWP(0,$key_));
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &pxor ($inout5,$tweak);
+
+ # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &xorps ($inout0,$rndkey0); # input^=rndkey[0]
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout1,$rndkey0);
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout2,$rndkey0);
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout3,$rndkey0);
+ &movdqu ($rndkey1,&QWP(16*5,$inp));
+ &pxor ($inout4,$rndkey0);
+ &lea ($inp,&DWP(16*6,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
+ &pxor ($inout5,$rndkey1);
+
+ &$movekey ($rndkey1,&QWP(16,$key_));
+ &lea ($key,&DWP(32,$key_));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &aesdec ($inout0,$rndkey1);
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &aesdec ($inout1,$rndkey1);
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &dec ($rounds);
+ &aesdec ($inout2,$rndkey1);
+ &pxor ($inout4,&QWP(16*4,"esp"));
+ &aesdec ($inout3,$rndkey1);
+ &pxor ($inout5,$rndkey0);
+ &aesdec ($inout4,$rndkey1);
+ &$movekey ($rndkey0,&QWP(0,$key));
+ &aesdec ($inout5,$rndkey1);
+ &call (&label("_aesni_decrypt6_enter"));
+
+ &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
+ &pxor ($twtmp,$twtmp);
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*2,$out),$inout2);
+ &xorps ($inout4,&QWP(16*4,"esp"));
+ &movups (&QWP(16*3,$out),$inout3);
+ &xorps ($inout5,$tweak);
+ &movups (&QWP(16*4,$out),$inout4);
+ &pshufd ($twres,$twtmp,0x13);
+ &movups (&QWP(16*5,$out),$inout5);
+ &lea ($out,&DWP(16*6,$out));
+ &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
+
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov ($rounds,$rounds_); # restore $rounds
+ &pxor ($tweak,$twres);
+
+ &sub ($len,16*6);
+ &jnc (&label("xts_dec_loop6"));
+
+ &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds_,$rounds);
+
+&set_label("xts_dec_short");
+ &add ($len,16*6);
+ &jz (&label("xts_dec_done6x"));
+
+ &movdqa ($inout3,$tweak); # put aside previous tweak
+ &cmp ($len,0x20);
+ &jb (&label("xts_dec_one"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &je (&label("xts_dec_two"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &cmp ($len,0x40);
+ &jb (&label("xts_dec_three"));
+
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($inout5,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+ &movdqa (&QWP(16*0,"esp"),$inout3);
+ &movdqa (&QWP(16*1,"esp"),$inout4);
+ &je (&label("xts_dec_four"));
+
+ &movdqa (&QWP(16*2,"esp"),$inout5);
+ &pshufd ($inout5,$twtmp,0x13);
+ &movdqa (&QWP(16*3,"esp"),$tweak);
+ &paddq ($tweak,$tweak); # &psllq($inout0,1);
+ &pand ($inout5,$twmask); # isolate carry and residue
+ &pxor ($inout5,$tweak);
+
+ &movdqu ($inout0,&QWP(16*0,$inp)); # load input
+ &movdqu ($inout1,&QWP(16*1,$inp));
+ &movdqu ($inout2,&QWP(16*2,$inp));
+ &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movdqu ($inout3,&QWP(16*3,$inp));
+ &pxor ($inout1,&QWP(16*1,"esp"));
+ &movdqu ($inout4,&QWP(16*4,$inp));
+ &pxor ($inout2,&QWP(16*2,"esp"));
+ &lea ($inp,&DWP(16*5,$inp));
+ &pxor ($inout3,&QWP(16*3,"esp"));
+ &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
+ &pxor ($inout4,$inout5);
+
+ &call ("_aesni_decrypt6");
+
+ &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,&QWP(16*2,"esp"));
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,&QWP(16*3,"esp"));
+ &movups (&QWP(16*1,$out),$inout1);
+ &xorps ($inout4,$tweak);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &movups (&QWP(16*4,$out),$inout4);
+ &lea ($out,&DWP(16*5,$out));
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_one",16);
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &lea ($inp,&DWP(16*1,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &lea ($out,&DWP(16*1,$out));
+
+ &movdqa ($tweak,$inout3); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_two",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &lea ($inp,&DWP(16*2,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &lea ($out,&DWP(16*2,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_three",16);
+ &movaps ($inout5,$tweak); # put aside last tweak
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &lea ($inp,&DWP(16*3,$inp));
+ &xorps ($inout0,$inout3); # input^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+
+ &call ("_aesni_decrypt3");
+
+ &xorps ($inout0,$inout3); # output^=tweak
+ &xorps ($inout1,$inout4);
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &lea ($out,&DWP(16*3,$out));
+
+ &movdqa ($tweak,$inout5); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_four",16);
+ &movaps ($inout4,$tweak); # put aside last tweak
+
+ &movups ($inout0,&QWP(16*0,$inp)); # load input
+ &movups ($inout1,&QWP(16*1,$inp));
+ &movups ($inout2,&QWP(16*2,$inp));
+ &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
+ &movups ($inout3,&QWP(16*3,$inp));
+ &lea ($inp,&DWP(16*4,$inp));
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &xorps ($inout3,$inout4);
+
+ &call ("_aesni_decrypt4");
+
+ &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
+ &xorps ($inout1,&QWP(16*1,"esp"));
+ &xorps ($inout2,$inout5);
+ &movups (&QWP(16*0,$out),$inout0); # write output
+ &xorps ($inout3,$inout4);
+ &movups (&QWP(16*1,$out),$inout1);
+ &movups (&QWP(16*2,$out),$inout2);
+ &movups (&QWP(16*3,$out),$inout3);
+ &lea ($out,&DWP(16*4,$out));
+
+ &movdqa ($tweak,$inout4); # last tweak
+ &jmp (&label("xts_dec_done"));
+
+&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &jmp (&label("xts_dec_only_one_more"));
+
+&set_label("xts_dec_done",16);
+ &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
+ &pxor ($twtmp,$twtmp);
+ &and ($len,15);
+ &jz (&label("xts_dec_ret"));
+
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
+ &pshufd ($twres,$twtmp,0x13);
+ &pxor ($twtmp,$twtmp);
+ &movdqa ($twmask,&QWP(16*6,"esp"));
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($twres,$twmask); # isolate carry and residue
+ &pcmpgtd($twtmp,$tweak); # broadcast upper bits
+ &pxor ($tweak,$twres);
+
+&set_label("xts_dec_only_one_more");
+ &pshufd ($inout3,$twtmp,0x13);
+ &movdqa ($inout4,$tweak); # put aside previous tweak
+ &paddq ($tweak,$tweak); # &psllq($tweak,1);
+ &pand ($inout3,$twmask); # isolate carry and residue
+ &pxor ($inout3,$tweak);
+
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$inp)); # load input
+ &xorps ($inout0,$inout3); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout3); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_steal");
+ &movz ($rounds,&BP(16,$inp));
+ &movz ($key,&BP(0,$out));
+ &lea ($inp,&DWP(1,$inp));
+ &mov (&BP(0,$out),&LB($rounds));
+ &mov (&BP(16,$out),&LB($key));
+ &lea ($out,&DWP(1,$out));
+ &sub ($len,1);
+ &jnz (&label("xts_dec_steal"));
+
+ &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
+ &mov ($key,$key_); # restore $key
+ &mov ($rounds,$rounds_); # restore $rounds
+
+ &movups ($inout0,&QWP(0,$out)); # load input
+ &xorps ($inout0,$inout4); # input^=tweak
+ if ($inline)
+ { &aesni_inline_generate1("dec"); }
+ else
+ { &call ("_aesni_decrypt1"); }
+ &xorps ($inout0,$inout4); # output^=tweak
+ &movups (&QWP(0,$out),$inout0); # write output
+
+&set_label("xts_dec_ret");
+ &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
+&function_end("aesni_xts_decrypt");
+}