+ xorps $rndkey0,$rndkey0 # %xmm0
+ pxor $rndkey1,$rndkey1
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+.Lecb_enc_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+\f
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize the CMAC value
+# (see engine/eng_aesni.c for details)
+#
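+# A hypothetical caller-side sketch (illustrative only; B_0/counter
+# formatting and CMAC finalization remain the caller's job, as in
+# engine/eng_aesni.c):
+#
+#	AES_KEY ks;
+#	char ctr[16], cmac[16];		/* pre-formatted per the CCM spec */
+#	AES_set_encrypt_key(key, 128, &ks);
+#	aesni_ccm64_encrypt_blocks(in, out, blocks, &ks, ctr, cmac);
+#	/* ctr is left untouched, cmac holds the yet-unfinalized MAC */
+#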
+{
+my $cmac="%r9"; # 6th argument
+
+my $increment="%xmm9";
+my $iv="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl aesni_ccm64_encrypt_blocks
+.type aesni_ccm64_encrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in0
+ movaps %xmm9,0x30(%rsp) # $increment
+.Lccm64_enc_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movdqu ($ivp),$iv
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ shl \$4,$rounds
+ mov \$16,$rnds_
+ lea 0($key),$key_
+ movdqu ($cmac),$inout1
+ movdqa $iv,$inout0
+ lea 32($key,$rounds),$key # end of key schedule
+ pshufb $bswap_mask,$iv
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lccm64_enc_outer
+.align 16
+.Lccm64_enc_outer:
+ $movkey ($key_),$rndkey0
+ mov %r10,%rax
+ movups ($inp),$in0 # load inp
+
+ xorps $rndkey0,$inout0 # counter
+ $movkey 16($key_),$rndkey1
+ xorps $in0,$rndkey0
+ xorps $rndkey0,$inout1 # cmac^=inp
+ $movkey 32($key_),$rndkey0
+
+.Lccm64_enc2_loop:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Lccm64_enc2_loop
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ paddq $increment,$iv
+ dec $len # $len-- ($len is in blocks)
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+
+ lea 16($inp),$inp
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ pshufb $bswap_mask,$inout0
+ lea 16($out),$out # $out+=16
+ jnz .Lccm64_enc_outer # loop if ($len!=0)
+
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+$code.=<<___;
+.globl aesni_ccm64_decrypt_blocks
+.type aesni_ccm64_decrypt_blocks,\@function,6
+.align 16
+aesni_ccm64_decrypt_blocks:
+___
+$code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+	movaps	%xmm8,0x20(%rsp)	# $in0
+ movaps %xmm9,0x30(%rsp) # $increment
+.Lccm64_dec_body:
+___
+$code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movups ($ivp),$iv
+ movdqu ($cmac),$inout1
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ movaps $iv,$inout0
+ mov $rounds,$rnds_
+ mov $key,$key_
+ pshufb $bswap_mask,$iv
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ shl \$4,$rnds_
+ mov \$16,$rounds
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ lea 16($inp),$inp # $inp+=16
+ sub %r10,%rax # twisted $rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ mov %rax,%r10
+ jmp .Lccm64_dec_outer
+.align 16
+.Lccm64_dec_outer:
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ lea 16($out),$out # $out+=16
+ pshufb $bswap_mask,$inout0
+
+ sub \$1,$len # $len-- ($len is in blocks)
+ jz .Lccm64_dec_break # if ($len==0) break
+
+ $movkey ($key_),$rndkey0
+ mov %r10,%rax
+ $movkey 16($key_),$rndkey1
+ xorps $rndkey0,$in0
+ xorps $rndkey0,$inout0
+ xorps $in0,$inout1 # cmac^=out
+ $movkey 32($key_),$rndkey0
+ jmp .Lccm64_dec2_loop
+.align 16
+.Lccm64_dec2_loop:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Lccm64_dec2_loop
+ movups ($inp),$in0 # load input
+ paddq $increment,$iv
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+ lea 16($inp),$inp # $inp+=16
+ jmp .Lccm64_dec_outer
+
+.align 16
+.Lccm64_dec_break:
+	#xorps	$in0,$inout1		# cmac^=out (folded into aesni_generate1 below)
+ mov 240($key_),$rounds
+___
+ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
+___
+$code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+ ret
+.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}\f
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+# size_t blocks, const AES_KEY *key,
+# const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see crypto/modes/ctr128.c for details)
+#
+# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+# Key ideas are full unrolling and modulo-scheduled counter
+# calculations with zero-round key xor.
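+#
+# In scalar terms the zero-round key xor amounts to the following
+# (an illustrative sketch only, not part of this module; bswap32 and
+# the blk[] layout are assumptions, rd_key is the AES_KEY schedule):
+#
+#	uint32_t key0 = key->rd_key[3];		/* 0-round key LSB */
+#	for (int i = 0; i < 8; i++)		/* modulo-scheduled vs. */
+#		blk[i].lsb = bswap32(ctr + i) ^ key0; /* in-flight AES rounds */
+#	/* rounds 1..last-1 then run on blk[]; aesenclast folds in the
+#	   input xor-ed with round[last] */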
+{
+my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
+my ($key0,$ctr)=("${key_}d","${ivp}d");
+my $frame_size = 0x80 + ($win64?160:0);
+
+$code.=<<___;
+.globl aesni_ctr32_encrypt_blocks
+.type aesni_ctr32_encrypt_blocks,\@function,5
+.align 16
+aesni_ctr32_encrypt_blocks:
+ cmp \$1,$len
+ jne .Lctr32_bulk
+
+ # handle single block without allocating stack frame,
+ # useful when handling edges
+ movups ($ivp),$inout0
+ movups ($inp),$inout1
+ mov 240($key),%edx # key->rounds
+___
+ &aesni_generate1("enc",$key,"%edx");
+$code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ xorps $inout1,$inout0
+ pxor $inout1,$inout1
+ movups $inout0,($out)
+ xorps $inout0,$inout0
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
+.Lctr32_body:
+___
+$code.=<<___;
+ lea -8(%rax),%rbp
+
+ # 8 16-byte words on top of stack are counter values
+ # xor-ed with zero-round key
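+	# (layout: 0x00-0x70(%rsp) hold the blocks for ctr+0..ctr+7;
+	#  each slot's top dword, bytes 12-15, is bswap(counter)^key0)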
+
+ movdqu ($ivp),$inout0
+ movdqu ($key),$rndkey0
+ mov 12($ivp),$ctr # counter LSB
+ pxor $rndkey0,$inout0
+ mov 12($key),$key0 # 0-round key LSB
+ movdqa $inout0,0x00(%rsp) # populate counter block
+ bswap $ctr
+ movdqa $inout0,$inout1
+ movdqa $inout0,$inout2
+ movdqa $inout0,$inout3
+ movdqa $inout0,0x40(%rsp)
+ movdqa $inout0,0x50(%rsp)
+ movdqa $inout0,0x60(%rsp)
+ mov %rdx,%r10 # about to borrow %rdx
+ movdqa $inout0,0x70(%rsp)
+
+ lea 1($ctr),%rax
+ lea 2($ctr),%rdx
+ bswap %eax
+ bswap %edx
+ xor $key0,%eax
+ xor $key0,%edx
+ pinsrd \$3,%eax,$inout1
+ lea 3($ctr),%rax
+ movdqa $inout1,0x10(%rsp)
+ pinsrd \$3,%edx,$inout2
+ bswap %eax
+ mov %r10,%rdx # restore %rdx
+ lea 4($ctr),%r10
+ movdqa $inout2,0x20(%rsp)
+ xor $key0,%eax
+ bswap %r10d
+ pinsrd \$3,%eax,$inout3
+ xor $key0,%r10d
+ movdqa $inout3,0x30(%rsp)
+ lea 5($ctr),%r9
+ mov %r10d,0x40+12(%rsp)
+ bswap %r9d
+ lea 6($ctr),%r10
+ mov 240($key),$rounds # key->rounds
+ xor $key0,%r9d
+ bswap %r10d
+ mov %r9d,0x50+12(%rsp)
+ xor $key0,%r10d
+ lea 7($ctr),%r9
+ mov %r10d,0x60+12(%rsp)
+ bswap %r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
+ xor $key0,%r9d
+ and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
+ mov %r9d,0x70+12(%rsp)
+
+ $movkey 0x10($key),$rndkey1
+
+ movdqa 0x40(%rsp),$inout4
+ movdqa 0x50(%rsp),$inout5
+
+ cmp \$8,$len # $len is in blocks
+ jb .Lctr32_tail # short input if ($len<8)
+
+ sub \$6,$len # $len is biased by -6
+ cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
+ je .Lctr32_6x # [which denotes Atom Silvermont]
+
+ lea 0x80($key),$key # size optimization
+ sub \$2,$len # $len is biased by -8
+ jmp .Lctr32_loop8
+
+.align 16
+.Lctr32_6x:
+ shl \$4,$rounds
+ mov \$48,$rnds_
+ bswap $key0
+ lea 32($key,$rounds),$key # end of key schedule
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ add \$6,$ctr # next counter value
+ $movkey -48($key,$rnds_),$rndkey0
+ aesenc $rndkey1,$inout0
+ mov $ctr,%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout1
+ movbe %eax,`0x00+12`(%rsp) # store next counter value
+ lea 1($ctr),%eax
+ aesenc $rndkey1,$inout2
+ xor $key0,%eax
+ movbe %eax,`0x10+12`(%rsp)
+ aesenc $rndkey1,$inout3
+ lea 2($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout4
+ movbe %eax,`0x20+12`(%rsp)
+ lea 3($ctr),%eax
+ aesenc $rndkey1,$inout5
+ $movkey -32($key,$rnds_),$rndkey1
+ xor $key0,%eax
+
+ aesenc $rndkey0,$inout0
+ movbe %eax,`0x30+12`(%rsp)
+ lea 4($ctr),%eax
+ aesenc $rndkey0,$inout1
+ xor $key0,%eax
+ movbe %eax,`0x40+12`(%rsp)
+ aesenc $rndkey0,$inout2
+ lea 5($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey0,$inout3
+ movbe %eax,`0x50+12`(%rsp)
+ mov %r10,%rax # mov $rnds_,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,$rnds_),$rndkey0
+
+ call .Lenc_loop6
+
+ movdqu ($inp),$inout6 # load 6 input blocks
+ movdqu 0x10($inp),$inout7
+ movdqu 0x20($inp),$in0
+ movdqu 0x30($inp),$in1
+ movdqu 0x40($inp),$in2
+ movdqu 0x50($inp),$in3
+ lea 0x60($inp),$inp # $inp+=6*16
+ $movkey -64($key,$rnds_),$rndkey1
+ pxor $inout0,$inout6 # inp^=E(ctr)
+ movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
+ pxor $inout1,$inout7
+ movaps 0x10(%rsp),$inout1
+ pxor $inout2,$in0
+ movaps 0x20(%rsp),$inout2
+ pxor $inout3,$in1
+ movaps 0x30(%rsp),$inout3
+ pxor $inout4,$in2
+ movaps 0x40(%rsp),$inout4
+ pxor $inout5,$in3
+ movaps 0x50(%rsp),$inout5
+ movdqu $inout6,($out) # store 6 output blocks
+ movdqu $inout7,0x10($out)
+ movdqu $in0,0x20($out)
+ movdqu $in1,0x30($out)
+ movdqu $in2,0x40($out)
+ movdqu $in3,0x50($out)
+ lea 0x60($out),$out # $out+=6*16
+
+ sub \$6,$len
+ jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
+
+ add \$6,$len # restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+
+ lea -48($rnds_),$rounds
+ lea -80($key,$rnds_),$key # restore $key
+ neg $rounds
+ shr \$4,$rounds # restore $rounds
+ jmp .Lctr32_tail
+
+.align 32
+.Lctr32_loop8:
+ add \$8,$ctr # next counter value
+ movdqa 0x60(%rsp),$inout6
+ aesenc $rndkey1,$inout0
+ mov $ctr,%r9d
+ movdqa 0x70(%rsp),$inout7
+ aesenc $rndkey1,$inout1
+ bswap %r9d
+ $movkey 0x20-0x80($key),$rndkey0
+ aesenc $rndkey1,$inout2
+ xor $key0,%r9d
+ nop
+ aesenc $rndkey1,$inout3
+ mov %r9d,0x00+12(%rsp) # store next counter value
+ lea 1($ctr),%r9
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0x30-0x80($key),$rndkey1
+___
+for($i=2;$i<8;$i++) {
+my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
+$code.=<<___;
+ bswap %r9d
+ aesenc $rndkeyx,$inout0
+ aesenc $rndkeyx,$inout1
+ xor $key0,%r9d
+	.byte	0x66,0x90		# 2-byte nop
+ aesenc $rndkeyx,$inout2
+ aesenc $rndkeyx,$inout3
+ mov %r9d,`0x10*($i-1)`+12(%rsp)
+ lea $i($ctr),%r9
+ aesenc $rndkeyx,$inout4
+ aesenc $rndkeyx,$inout5
+ aesenc $rndkeyx,$inout6
+ aesenc $rndkeyx,$inout7
+ $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
+___
+}
+$code.=<<___;
+ bswap %r9d
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ xor $key0,%r9d
+ movdqu 0x00($inp),$in0 # start loading input
+ aesenc $rndkey0,$inout3
+ mov %r9d,0x70+12(%rsp)
+ cmp \$11,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xa0-0x80($key),$rndkey0
+
+ jb .Lctr32_enc_done
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xb0-0x80($key),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xc0-0x80($key),$rndkey0
+ je .Lctr32_enc_done
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xd0-0x80($key),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xe0-0x80($key),$rndkey0
+ jmp .Lctr32_enc_done
+
+.align 16
+.Lctr32_enc_done:
+ movdqu 0x10($inp),$in1
+ pxor $rndkey0,$in0 # input^=round[last]
+ movdqu 0x20($inp),$in2
+ pxor $rndkey0,$in1
+ movdqu 0x30($inp),$in3
+ pxor $rndkey0,$in2
+ movdqu 0x40($inp),$in4
+ pxor $rndkey0,$in3
+ movdqu 0x50($inp),$in5
+ pxor $rndkey0,$in4
+ pxor $rndkey0,$in5
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
+ lea 0x80($inp),$inp # $inp+=8*16
+
+ aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
+ pxor $rndkey0,$rndkey1 # borrowed $rndkey
+ movdqu 0x70-0x80($inp),$in0
+ aesenclast $in1,$inout1
+ pxor $rndkey0,$in0
+ movdqa 0x00(%rsp),$in1 # load next counter block
+ aesenclast $in2,$inout2
+ aesenclast $in3,$inout3
+ movdqa 0x10(%rsp),$in2
+ movdqa 0x20(%rsp),$in3
+ aesenclast $in4,$inout4
+ aesenclast $in5,$inout5
+ movdqa 0x30(%rsp),$in4
+ movdqa 0x40(%rsp),$in5
+ aesenclast $rndkey1,$inout6
+ movdqa 0x50(%rsp),$rndkey0
+	$movkey	0x10-0x80($key),$rndkey1	# real 1st-round key
+ aesenclast $in0,$inout7
+
+ movups $inout0,($out) # store 8 output blocks
+ movdqa $in1,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in2,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in3,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in4,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in5,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey0,$inout5
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+
+ sub \$8,$len
+ jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
+
+	add	\$8,$len		# restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+ lea -0x80($key),$key
+
+.Lctr32_tail:
+ # note that at this point $inout0..5 are populated with
+ # counter values xor-ed with 0-round key
+ lea 16($key),$key
+ cmp \$4,$len
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
+
+ # if ($len>4) compute 7 E(counter)
+ shl \$4,$rounds
+ movdqa 0x60(%rsp),$inout6
+ pxor $inout7,$inout7
+
+ $movkey 16($key),$rndkey0
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+	lea	32-16($key,$rounds),$key	# prepare for .Lenc_loop8_enter
+ neg %rax
+ aesenc $rndkey1,$inout2
+ add \$16,%rax # prepare for .Lenc_loop8_enter
+ movups ($inp),$in0
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ movups 0x10($inp),$in1 # pre-load input
+ movups 0x20($inp),$in2
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+
+ call .Lenc_loop8_enter
+
+ movdqu 0x30($inp),$in3
+ pxor $in0,$inout0
+ movdqu 0x40($inp),$in0
+ pxor $in1,$inout1
+ movdqu $inout0,($out) # store output
+ pxor $in2,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in3,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in0,$inout4
+ movdqu $inout3,0x30($out)
+ movdqu $inout4,0x40($out)
+ cmp \$6,$len
+	jb	.Lctr32_done			# $len was 5; done storing
+
+ movups 0x50($inp),$in1
+ xorps $in1,$inout5
+ movups $inout5,0x50($out)
+	je	.Lctr32_done			# $len was 6; done storing
+
+ movups 0x60($inp),$in2
+ xorps $in2,$inout6
+ movups $inout6,0x60($out)
+	jmp	.Lctr32_done			# $len was 7; done storing
+
+.align 32
+.Lctr32_loop4:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop4
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ movups ($inp),$in0 # load input
+ movups 0x10($inp),$in1
+ aesenclast $rndkey1,$inout2
+ aesenclast $rndkey1,$inout3
+ movups 0x20($inp),$in2
+ movups 0x30($inp),$in3
+
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ pxor $in2,$inout2
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout3
+ movdqu $inout3,0x30($out)
+	jmp	.Lctr32_done			# $len was 4; done storing
+
+.align 32
+.Lctr32_loop3:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop3
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ aesenclast $rndkey1,$inout2
+
+ movups ($inp),$in0 # load input
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ cmp \$2,$len
+	jb	.Lctr32_done			# $len was 1; done storing
+
+ movups 0x10($inp),$in1
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+	je	.Lctr32_done			# $len was 2; done storing
+
+ movups 0x20($inp),$in2
+ xorps $in2,$inout2
+	movups	$inout2,0x20($out)		# $len was 3; done storing
+
+.Lctr32_done:
+	xorps	%xmm0,%xmm0		# clear register bank
+ xor $key0,$key0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,0x70(%rsp)
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
+ movaps %xmm0,-0x20(%rbp)
+ movaps -0x10(%rbp),%xmm15
+ movaps %xmm0,-0x10(%rbp)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+ movaps %xmm0,0x70(%rsp)
+___
+$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+.Lctr32_epilogue:
+ ret
+.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+\f
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#			const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
+#
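+# A hypothetical caller-side sketch (illustrative only; iv carries the
+# sector/tweak number, key2 encrypts it, key1 processes the payload):
+#
+#	AES_KEY k1, k2;
+#	unsigned char iv[16] = {0};
+#	memcpy(iv, &sector, sizeof(sector));	/* 64-bit sector number */
+#	AES_set_encrypt_key(key1, 128, &k1);
+#	AES_set_encrypt_key(key2, 128, &k2);
+#	aesni_xts_encrypt(in, out, len, &k1, &k2, iv);
+#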
+{
+my @tweak=map("%xmm$_",(10..15));
+my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
+my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
+my $frame_size = 0x70 + ($win64?160:0);
+
+$code.=<<___;
+.globl aesni_xts_encrypt
+.type aesni_xts_encrypt,\@function,6
+.align 16
+aesni_xts_encrypt:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
+.Lxts_enc_body:
+___
+$code.=<<___;
+ lea -8(%rax),%rbp
+ movups ($ivp),$inout0 # load clear-text tweak
+	mov	240($key2),$rounds	# key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,$inout0);
+$code.=<<___;
+ $movkey ($key),$rndkey0 # zero round key
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+
+ movdqa .Lxts_magic(%rip),$twmask
+ movdqa $inout0,@tweak[5]
+ pshufd \$0x5f,$inout0,$twres
+ pxor $rndkey0,$rndkey1
+___
+ # alternative tweak calculation algorithm is based on suggestions
+ # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
+ # and should help in the future...
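+	# scalar equivalent of one tweak update, for reference:
+	#	carry = tweak >> 127;
+	#	tweak = (tweak << 1) ^ (carry ? 0x87 : 0);
+	# the pshufd/psrad/pand sequence below derives both the 0x87
+	# reduction and the low->high qword carry from the broadcast
+	# sign bits in a single pand with .Lxts_magic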
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ movdqa @tweak[5],@tweak[$i]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
+___
+ }
+$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
+ sub \$16*6,$len
+ jc .Lxts_enc_short # if $len-=6*16 borrowed
+
+ mov \$16+96,$rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ sub %r10,%rax # twisted $rounds
+ $movkey 16($key_),$rndkey1
+ mov %rax,%r10 # backup twisted $rounds
+ lea .Lxts_magic(%rip),%r8
+ jmp .Lxts_enc_grandloop
+
+.align 32
+.Lxts_enc_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqa $rndkey0,$twmask
+ movdqu `16*1`($inp),$inout1
+ pxor @tweak[0],$inout0 # input^=tweak^round[0]
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[1],$inout1
+ aesenc $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[2],$inout2
+ aesenc $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[3],$inout3
+ aesenc $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
+ pxor @tweak[4],$inout4
+ aesenc $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
+
+	pxor	$twres,@tweak[0]	# calculate tweaks^round[last]
+ aesenc $rndkey1,$inout4
+ pxor $twres,@tweak[1]
+ movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
+ aesenc $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $twres,@tweak[2]
+
+ aesenc $rndkey0,$inout0
+ pxor $twres,@tweak[3]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesenc $rndkey0,$inout1
+ pxor $twres,@tweak[4]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ movdqa $twmask,`16*5`(%rsp)
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_enc_loop6
+.align 32
+.Lxts_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey -64($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -80($key,%rax),$rndkey0
+ jnz .Lxts_enc_loop6
+
+ movdqa (%r8),$twmask # start calculating next tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
+ aesenc $rndkey1,$inout1
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[0],@tweak[1] # copy round[0]
+ aesenc $rndkey1,$inout5
+ $movkey -64($key),$rndkey1
+
+ movdqa $twres,$twtmp
+ aesenc $rndkey0,$inout0
+ paddd $twres,$twres
+ pxor @tweak[5],@tweak[0]
+ aesenc $rndkey0,$inout1
+ psrad \$31,$twtmp
+ paddq @tweak[5],@tweak[5]
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ pand $twmask,$twtmp
+ movaps @tweak[1],@tweak[2]
+ aesenc $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movdqa $twres,$twtmp
+ aesenc $rndkey0,$inout5
+ $movkey -48($key),$rndkey0
+
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
+ aesenc $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ movdqa @tweak[3],`16*3`(%rsp)
+ pxor $twtmp,@tweak[5]
+ aesenc $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
+ movdqa $twres,$twtmp
+ aesenc $rndkey1,$inout5
+ $movkey -32($key),$rndkey1
+
+ paddd $twres,$twres
+ aesenc $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesenc $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[3],@tweak[4]
+ aesenc $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesenc $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesenc $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ aesenclast `16*0`(%rsp),$inout0
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ aesenclast `16*1`(%rsp),$inout1
+ aesenclast `16*2`(%rsp),$inout2
+ pand $twmask,$twres
+ mov %r10,%rax # restore $rounds
+ aesenclast `16*3`(%rsp),$inout3
+ aesenclast `16*4`(%rsp),$inout4
+ aesenclast `16*5`(%rsp),$inout5
+ pxor $twres,@tweak[5]
+
+ lea `16*6`($out),$out # $out+=6*16
+ movups $inout0,`-16*6`($out) # store 6 output blocks
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
+ sub \$16*6,$len
+ jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
+
+ mov \$16+96,$rounds
+ sub $rnds_,$rounds
+ mov $key_,$key # restore $key
+ shr \$4,$rounds # restore original value
+
+.Lxts_enc_short:
+	# at this point @tweak[0..5] are populated with tweak values
+ mov $rounds,$rnds_ # backup $rounds
+ pxor $rndkey0,@tweak[0]
+ add \$16*6,$len # restore real remaining $len
+ jz .Lxts_enc_done # done if ($len==0)
+
+ pxor $rndkey0,@tweak[1]
+ cmp \$0x20,$len
+ jb .Lxts_enc_one # $len is 1*16
+ pxor $rndkey0,@tweak[2]
+ je .Lxts_enc_two # $len is 2*16
+
+ pxor $rndkey0,@tweak[3]
+ cmp \$0x40,$len
+ jb .Lxts_enc_three # $len is 3*16
+ pxor $rndkey0,@tweak[4]
+ je .Lxts_enc_four # $len is 4*16
+
+ movdqu ($inp),$inout0 # $len is 5*16
+ movdqu 16*1($inp),$inout1
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp # $inp+=5*16
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+ pxor $inout5,$inout5
+
+ call _aesni_encrypt6
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[5],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out) # store 5 output blocks
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out # $out+=5*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp # inp+=1*16
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out) # store one output block
+ lea 16*1($out),$out # $out+=1*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp # $inp+=2*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_encrypt2
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movups $inout0,($out) # store 2 output blocks
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out # $out+=2*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp # $inp+=3*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_encrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movups $inout0,($out) # store 3 output blocks
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out # $out+=3*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_four:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp # $inp+=4*16
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_encrypt4
+
+ pxor @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ pxor @tweak[1],$inout1
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out) # store 4 output blocks
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ lea 16*4($out),$out # $out+=4*16
+ jmp .Lxts_enc_done
+
+.align 16
+.Lxts_enc_done:
+ and \$15,$len_ # see if $len%16 is 0
+ jz .Lxts_enc_ret
+ mov $len_,$len
+
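+	# ciphertext stealing: splice the $len%16 plaintext tail bytes
+	# into the head of the last complete ciphertext block, emit the
+	# displaced ciphertext bytes as the partial final block, then
+	# re-encrypt the patched block below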
+.Lxts_enc_steal:
+ movzb ($inp),%eax # borrow $rounds ...
+ movzb -16($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,-16($out)
+ mov %cl,0($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_enc_steal
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups -16($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,-16($out)
+
+.Lxts_enc_ret:
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
+ movaps %xmm0,-0x20(%rbp)
+ movaps -0x10(%rbp),%xmm15
+ movaps %xmm0,-0x10(%rbp)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+___
+$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+.Lxts_enc_epilogue:
+ ret
+.size aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl aesni_xts_decrypt
+.type aesni_xts_decrypt,\@function,6
+.align 16
+aesni_xts_decrypt:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+___
+$code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
+.Lxts_dec_body:
+___
+$code.=<<___;
+ lea -8(%rax),%rbp
+ movups ($ivp),$inout0 # load clear-text tweak
+ mov 240($key2),$rounds # key2->rounds
+ mov 240($key),$rnds_ # key1->rounds
+___
+ # generate the tweak
+ &aesni_generate1("enc",$key2,$rounds,$inout0);
+$code.=<<___;
+ xor %eax,%eax # if ($len%16) len-=16;
+ test \$15,$len
+ setnz %al
+ shl \$4,%rax
+ sub %rax,$len
+
+ $movkey ($key),$rndkey0 # zero round key
+ mov $key,$key_ # backup $key
+ mov $rnds_,$rounds # backup $rounds
+ shl \$4,$rnds_
+ mov $len,$len_ # backup $len
+ and \$-16,$len
+
+ $movkey 16($key,$rnds_),$rndkey1 # last round key
+
+ movdqa .Lxts_magic(%rip),$twmask
+ movdqa $inout0,@tweak[5]
+ pshufd \$0x5f,$inout0,$twres
+ pxor $rndkey0,$rndkey1
+___
+ for ($i=0;$i<4;$i++) {
+ $code.=<<___;
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ movdqa @tweak[5],@tweak[$i]
+ psrad \$31,$twtmp # broadcast upper bits
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ pxor $rndkey0,@tweak[$i]
+ pxor $twtmp,@tweak[5]
+___
+ }
+$code.=<<___;
+ movdqa @tweak[5],@tweak[4]
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twres
+ pxor $rndkey0,@tweak[4]
+ pxor $twres,@tweak[5]
+ movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
+
+ sub \$16*6,$len
+ jc .Lxts_dec_short # if $len-=6*16 borrowed
+
+ mov \$16+96,$rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ sub %r10,%rax # twisted $rounds
+ $movkey 16($key_),$rndkey1
+ mov %rax,%r10 # backup twisted $rounds
+ lea .Lxts_magic(%rip),%r8
+ jmp .Lxts_dec_grandloop
+
+.align 32
+.Lxts_dec_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqa $rndkey0,$twmask
+ movdqu `16*1`($inp),$inout1
+	pxor	@tweak[0],$inout0	# input^=tweak^round[0]
+ movdqu `16*2`($inp),$inout2
+ pxor @tweak[1],$inout1
+ aesdec $rndkey1,$inout0
+ movdqu `16*3`($inp),$inout3
+ pxor @tweak[2],$inout2
+ aesdec $rndkey1,$inout1
+ movdqu `16*4`($inp),$inout4
+ pxor @tweak[3],$inout3
+ aesdec $rndkey1,$inout2
+ movdqu `16*5`($inp),$inout5
+ pxor @tweak[5],$twmask # round[0]^=tweak[5]
+ movdqa 0x60(%rsp),$twres # load round[0]^round[last]
+ pxor @tweak[4],$inout4
+ aesdec $rndkey1,$inout3
+ $movkey 32($key_),$rndkey0
+ lea `16*6`($inp),$inp
+ pxor $twmask,$inout5
+
+	pxor	$twres,@tweak[0]	# calculate tweaks^round[last]
+ aesdec $rndkey1,$inout4
+ pxor $twres,@tweak[1]
+	movdqa	@tweak[0],`16*0`(%rsp)	# put aside tweaks^round[last]
+ aesdec $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $twres,@tweak[2]
+
+ aesdec $rndkey0,$inout0
+ pxor $twres,@tweak[3]
+ movdqa @tweak[1],`16*1`(%rsp)
+ aesdec $rndkey0,$inout1
+ pxor $twres,@tweak[4]
+ movdqa @tweak[2],`16*2`(%rsp)
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ pxor $twres,$twmask
+ movdqa @tweak[4],`16*4`(%rsp)
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ movdqa $twmask,`16*5`(%rsp)
+ pshufd \$0x5f,@tweak[5],$twres
+ jmp .Lxts_dec_loop6
+.align 32
+.Lxts_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey -64($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey -80($key,%rax),$rndkey0
+ jnz .Lxts_dec_loop6
+
+ movdqa (%r8),$twmask # start calculating next tweak
+ movdqa $twres,$twtmp
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ paddq @tweak[5],@tweak[5]
+ psrad \$31,$twtmp
+ aesdec $rndkey1,$inout1
+ pand $twmask,$twtmp
+ $movkey ($key_),@tweak[0] # load round[0]
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[0],@tweak[1] # copy round[0]
+ aesdec $rndkey1,$inout5
+ $movkey -64($key),$rndkey1
+
+ movdqa $twres,$twtmp
+ aesdec $rndkey0,$inout0
+ paddd $twres,$twres
+ pxor @tweak[5],@tweak[0]
+ aesdec $rndkey0,$inout1
+ psrad \$31,$twtmp
+ paddq @tweak[5],@tweak[5]
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ pand $twmask,$twtmp
+ movaps @tweak[1],@tweak[2]
+ aesdec $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movdqa $twres,$twtmp
+ aesdec $rndkey0,$inout5
+ $movkey -48($key),$rndkey0
+
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ pxor @tweak[5],@tweak[1]
+ psrad \$31,$twtmp
+ aesdec $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ movdqa @tweak[3],`16*3`(%rsp)
+ pxor $twtmp,@tweak[5]
+ aesdec $rndkey1,$inout4
+ movaps @tweak[2],@tweak[3]
+ movdqa $twres,$twtmp
+ aesdec $rndkey1,$inout5
+ $movkey -32($key),$rndkey1
+
+ paddd $twres,$twres
+ aesdec $rndkey0,$inout0
+ pxor @tweak[5],@tweak[2]
+ psrad \$31,$twtmp
+ aesdec $rndkey0,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$twtmp
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ pxor $twtmp,@tweak[5]
+ movaps @tweak[3],@tweak[4]
+ aesdec $rndkey0,$inout5
+
+ movdqa $twres,$rndkey0
+ paddd $twres,$twres
+ aesdec $rndkey1,$inout0
+ pxor @tweak[5],@tweak[3]
+ psrad \$31,$rndkey0
+ aesdec $rndkey1,$inout1
+ paddq @tweak[5],@tweak[5]
+ pand $twmask,$rndkey0
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0,@tweak[5]
+ $movkey ($key_),$rndkey0
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+
+ pxor @tweak[5],@tweak[4]
+ aesdeclast `16*0`(%rsp),$inout0
+ psrad \$31,$twres
+ paddq @tweak[5],@tweak[5]
+ aesdeclast `16*1`(%rsp),$inout1
+ aesdeclast `16*2`(%rsp),$inout2
+ pand $twmask,$twres
+ mov %r10,%rax # restore $rounds
+ aesdeclast `16*3`(%rsp),$inout3
+ aesdeclast `16*4`(%rsp),$inout4
+ aesdeclast `16*5`(%rsp),$inout5
+ pxor $twres,@tweak[5]
+
+ lea `16*6`($out),$out # $out+=6*16
+ movups $inout0,`-16*6`($out) # store 6 output blocks
+ movups $inout1,`-16*5`($out)
+ movups $inout2,`-16*4`($out)
+ movups $inout3,`-16*3`($out)
+ movups $inout4,`-16*2`($out)
+ movups $inout5,`-16*1`($out)
+ sub \$16*6,$len
+ jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
+
+ mov \$16+96,$rounds
+ sub $rnds_,$rounds
+ mov $key_,$key # restore $key
+ shr \$4,$rounds # restore original value
+
+.Lxts_dec_short:
+	# at this point @tweak[0..5] are populated with tweak values
+ mov $rounds,$rnds_ # backup $rounds
+ pxor $rndkey0,@tweak[0]
+ pxor $rndkey0,@tweak[1]
+ add \$16*6,$len # restore real remaining $len
+ jz .Lxts_dec_done # done if ($len==0)
+
+ pxor $rndkey0,@tweak[2]
+ cmp \$0x20,$len
+ jb .Lxts_dec_one # $len is 1*16
+ pxor $rndkey0,@tweak[3]
+ je .Lxts_dec_two # $len is 2*16
+
+ pxor $rndkey0,@tweak[4]
+ cmp \$0x40,$len
+ jb .Lxts_dec_three # $len is 3*16
+ je .Lxts_dec_four # $len is 4*16
+
+ movdqu ($inp),$inout0 # $len is 5*16
+ movdqu 16*1($inp),$inout1
+ movdqu 16*2($inp),$inout2
+ pxor @tweak[0],$inout0
+ movdqu 16*3($inp),$inout3
+ pxor @tweak[1],$inout1
+ movdqu 16*4($inp),$inout4
+ lea 16*5($inp),$inp # $inp+=5*16
+ pxor @tweak[2],$inout2
+ pxor @tweak[3],$inout3
+ pxor @tweak[4],$inout4
+
+ call _aesni_decrypt6
+
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ movdqu $inout0,($out) # store 5 output blocks
+ xorps @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ xorps @tweak[4],$inout4
+ movdqu $inout2,16*2($out)
+ pxor $twtmp,$twtmp
+ movdqu $inout3,16*3($out)
+ pcmpgtd @tweak[5],$twtmp
+ movdqu $inout4,16*4($out)
+ lea 16*5($out),$out # $out+=5*16
+ pshufd \$0x13,$twtmp,@tweak[1] # $twres
+ and \$15,$len_
+ jz .Lxts_dec_ret
+
+ movdqa @tweak[5],@tweak[0]
+ paddq @tweak[5],@tweak[5] # psllq 1,$tweak
+ pand $twmask,@tweak[1] # isolate carry and residue
+ pxor @tweak[5],@tweak[1]
+ jmp .Lxts_dec_done2
+
+.align 16
+.Lxts_dec_one:
+ movups ($inp),$inout0
+ lea 16*1($inp),$inp # $inp+=1*16
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movdqa @tweak[1],@tweak[0]
+ movups $inout0,($out) # store one output block
+ movdqa @tweak[2],@tweak[1]
+ lea 16*1($out),$out # $out+=1*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_two:
+ movups ($inp),$inout0
+ movups 16($inp),$inout1
+ lea 32($inp),$inp # $inp+=2*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+
+ call _aesni_decrypt2
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[2],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[3],@tweak[1]
+ movups $inout0,($out) # store 2 output blocks
+ movups $inout1,16*1($out)
+ lea 16*2($out),$out # $out+=2*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_three:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ lea 16*3($inp),$inp # $inp+=3*16
+ xorps @tweak[0],$inout0
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+
+ call _aesni_decrypt3
+
+ xorps @tweak[0],$inout0
+ movdqa @tweak[3],@tweak[0]
+ xorps @tweak[1],$inout1
+ movdqa @tweak[4],@tweak[1]
+ xorps @tweak[2],$inout2
+ movups $inout0,($out) # store 3 output blocks
+ movups $inout1,16*1($out)
+ movups $inout2,16*2($out)
+ lea 16*3($out),$out # $out+=3*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_four:
+ movups ($inp),$inout0
+ movups 16*1($inp),$inout1
+ movups 16*2($inp),$inout2
+ xorps @tweak[0],$inout0
+ movups 16*3($inp),$inout3
+ lea 16*4($inp),$inp # $inp+=4*16
+ xorps @tweak[1],$inout1
+ xorps @tweak[2],$inout2
+ xorps @tweak[3],$inout3
+
+ call _aesni_decrypt4
+
+ pxor @tweak[0],$inout0
+ movdqa @tweak[4],@tweak[0]
+ pxor @tweak[1],$inout1
+ movdqa @tweak[5],@tweak[1]
+ pxor @tweak[2],$inout2
+ movdqu $inout0,($out) # store 4 output blocks
+ pxor @tweak[3],$inout3
+ movdqu $inout1,16*1($out)
+ movdqu $inout2,16*2($out)
+ movdqu $inout3,16*3($out)
+ lea 16*4($out),$out # $out+=4*16
+ jmp .Lxts_dec_done
+
+.align 16
+.Lxts_dec_done:
+ and \$15,$len_ # see if $len%16 is 0
+ jz .Lxts_dec_ret
+.Lxts_dec_done2:
+ mov $len_,$len
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($inp),$inout0
+ xorps @tweak[1],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[1],$inout0
+ movups $inout0,($out)
+
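+	# ciphertext stealing: splice the ciphertext tail into the head of
+	# the block just decrypted, emit the displaced bytes as the
+	# plaintext tail, then re-decrypt the patched block with @tweak[0]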
+.Lxts_dec_steal:
+ movzb 16($inp),%eax # borrow $rounds ...
+ movzb ($out),%ecx # ... and $key
+ lea 1($inp),$inp
+ mov %al,($out)
+ mov %cl,16($out)
+ lea 1($out),$out
+ sub \$1,$len
+ jnz .Lxts_dec_steal
+
+ sub $len_,$out # rewind $out
+ mov $key_,$key # restore $key
+ mov $rnds_,$rounds # restore $rounds
+
+ movups ($out),$inout0
+ xorps @tweak[0],$inout0
+___
+ &aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+ xorps @tweak[0],$inout0
+ movups $inout0,($out)
+
+.Lxts_dec_ret:
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+___
+$code.=<<___ if ($win64);
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
+ movaps %xmm0,-0x20(%rbp)
+ movaps -0x10(%rbp),%xmm15
+ movaps %xmm0,-0x10(%rbp)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+___
+$code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+.Lxts_dec_epilogue: