X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Faes%2Fasm%2Faesni-x86_64.pl;h=31c80ae6bc53a30995483712c2a850b76cd94ac4;hp=27bb47c32634e7d7dfc909fe7abe682361921719;hb=214368ffee5736836e2dbb80a16a4fbd85f0eaf9;hpb=6c79faaa9dd288bfda72831a9ef22ca01fa482d4

diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 27bb47c326..31c80ae6bc 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 #
 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
@@ -129,8 +129,8 @@
 #
 # Further data for other parallelizable modes:
 #
-# CBC decrypt				1.16	0.93	0.93
-# CTR					1.14	0.91	0.77
+# CBC decrypt				1.16	0.93	0.74
+# CTR					1.14	0.91	0.74
 #
 # Well, given 3x column it's probably inappropriate to call the limit
 # asymptotic, if it can be surpassed, isn't it? What happens there?
@@ -153,16 +153,24 @@
 
 # April 2011
 #
-# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
-# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
 # in CTR mode AES instruction interleave factor was chosen to be 6x.
 
 ######################################################################
-# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
-# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
-# in ECB, 0.71 in CTR, 0.95 in XTS... This means that aes[enc|dec]
-# instruction latency is 9 cycles and that they can be issued every
-# cycle.
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#		CBC en-/decrypt	CTR	XTS	ECB
+# Westmere	3.77/1.25	1.25	1.25	1.26
+# * Bridge	5.07/0.74	0.75	0.90	0.85
+# Haswell	4.44/0.63	0.63	0.73	0.63
+# Atom		5.75/3.54	3.56	4.12	3.87(*)
+# Bulldozer	5.77/0.70	0.72	0.90	0.70
+#
+# (*)	Atom ECB result is suboptimal because of penalties incurred
+#	by operations on %xmm8-15. As ECB is not considered
+#	critical, nothing was done to mitigate the problem.
 
 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
@@ -187,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
 		("%rdi","%rsi","%rdx","%rcx");	# Unix order
 
 $code=".text\n";
+$code.=".extern	OPENSSL_ia32cap_P\n";
 
 $rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
@@ -279,10 +288,49 @@ ___
 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
 # utilization, i.e. when subroutine's throughput is virtually same as
 # of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine.
-# aes[enc|dec] latency in next processor generation is 8, but the
-# instructions can be scheduled every cycle. Optimal interleave for
-# new processor is therefore 8x...
+# This is why it originally made no sense to implement 2x subroutine.
+# But times change and it became appropriate to spend extra 192 bytes
+# on 2x subroutine on Atom Silvermont account. For processors that
+# can schedule aes[enc|dec] every cycle optimal interleave factor
+# equals to corresponding instructions latency. 8x is optimal for
+# * Bridge and "super-optimal" for other Intel CPUs... 
+
+sub aesni_generate2 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-1] is cipher/clear text...
+$code.=<<___;
+.type	_aesni_${dir}rypt2,\@abi-omnipotent
+.align	16
+_aesni_${dir}rypt2:
+	$movkey	($key),$rndkey0
+	shl	\$4,$rounds
+	$movkey	16($key),$rndkey1
+	xorps	$rndkey0,$inout0
+	xorps	$rndkey0,$inout1
+	$movkey	32($key),$rndkey0
+	lea	32($key,$rounds),$key
+	neg	%rax				# $rounds
+	add	\$16,%rax
+
+.L${dir}_loop2:
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
+	aes${dir}	$rndkey0,$inout0
+	aes${dir}	$rndkey0,$inout1
+	$movkey		-16($key,%rax),$rndkey0
+	jnz		.L${dir}_loop2
+
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	aes${dir}last	$rndkey0,$inout0
+	aes${dir}last	$rndkey0,$inout1
+	ret
+.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+___
+}
 sub aesni_generate3 {
 my $dir=shift;
 # As already mentioned it takes in $key and $rounds, which are *not*
@@ -292,25 +340,26 @@ $code.=<<___;
 .align	16
 _aesni_${dir}rypt3:
 	$movkey	($key),$rndkey0
-	shr	\$1,$rounds
+	shl	\$4,$rounds
 	$movkey	16($key),$rndkey1
-	lea	32($key),$key
 	xorps	$rndkey0,$inout0
 	xorps	$rndkey0,$inout1
 	xorps	$rndkey0,$inout2
-	$movkey		($key),$rndkey0
+	$movkey	32($key),$rndkey0
+	lea	32($key,$rounds),$key
+	neg	%rax				# $rounds
+	add	\$16,%rax
 
 .L${dir}_loop3:
 	aes${dir}	$rndkey1,$inout0
 	aes${dir}	$rndkey1,$inout1
-	dec		$rounds
 	aes${dir}	$rndkey1,$inout2
-	$movkey		16($key),$rndkey1
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
 	aes${dir}	$rndkey0,$inout0
 	aes${dir}	$rndkey0,$inout1
-	lea		32($key),$key
 	aes${dir}	$rndkey0,$inout2
-	$movkey		($key),$rndkey0
+	$movkey		-16($key,%rax),$rndkey0
 	jnz		.L${dir}_loop3
 
 	aes${dir}	$rndkey1,$inout0
@@ -336,28 +385,30 @@ $code.=<<___;
 .align	16
 _aesni_${dir}rypt4:
 	$movkey	($key),$rndkey0
-	shr	\$1,$rounds
+	shl	\$4,$rounds
 	$movkey	16($key),$rndkey1
-	lea	32($key),$key
 	xorps	$rndkey0,$inout0
 	xorps	$rndkey0,$inout1
 	xorps	$rndkey0,$inout2
 	xorps	$rndkey0,$inout3
-	$movkey	($key),$rndkey0
+	$movkey	32($key),$rndkey0
+	lea	32($key,$rounds),$key
+	neg	%rax				# $rounds
+	.byte	0x0f,0x1f,0x00
+	add	\$16,%rax
 
 .L${dir}_loop4:
 	aes${dir}	$rndkey1,$inout0
 	aes${dir}	$rndkey1,$inout1
-	dec		$rounds
 	aes${dir}	$rndkey1,$inout2
 	aes${dir}	$rndkey1,$inout3
-	$movkey		16($key),$rndkey1
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
 	aes${dir}	$rndkey0,$inout0
 	aes${dir}	$rndkey0,$inout1
-	lea		32($key),$key
 	aes${dir}	$rndkey0,$inout2
 	aes${dir}	$rndkey0,$inout3
-	$movkey		($key),$rndkey0
+	$movkey		-16($key,%rax),$rndkey0
 	jnz		.L${dir}_loop4
 
 	aes${dir}	$rndkey1,$inout0
@@ -381,43 +432,43 @@ $code.=<<___;
 .align	16
 _aesni_${dir}rypt6:
 	$movkey		($key),$rndkey0
-	shr		\$1,$rounds
+	shl		\$4,$rounds
 	$movkey		16($key),$rndkey1
-	lea		32($key),$key
 	xorps		$rndkey0,$inout0
 	pxor		$rndkey0,$inout1
-	aes${dir}	$rndkey1,$inout0
 	pxor		$rndkey0,$inout2
+	aes${dir}	$rndkey1,$inout0
+	lea		32($key,$rounds),$key
+	neg		%rax			# $rounds
 	aes${dir}	$rndkey1,$inout1
 	pxor		$rndkey0,$inout3
-	aes${dir}	$rndkey1,$inout2
 	pxor		$rndkey0,$inout4
-	aes${dir}	$rndkey1,$inout3
+	aes${dir}	$rndkey1,$inout2
 	pxor		$rndkey0,$inout5
-	dec		$rounds
+	add		\$16,%rax
+	aes${dir}	$rndkey1,$inout3
 	aes${dir}	$rndkey1,$inout4
-	$movkey		($key),$rndkey0
 	aes${dir}	$rndkey1,$inout5
+	$movkey		-16($key,%rax),$rndkey0
 	jmp		.L${dir}_loop6_enter
 .align	16
 .L${dir}_loop6:
 	aes${dir}	$rndkey1,$inout0
 	aes${dir}	$rndkey1,$inout1
-	dec		$rounds
 	aes${dir}	$rndkey1,$inout2
 	aes${dir}	$rndkey1,$inout3
 	aes${dir}	$rndkey1,$inout4
 	aes${dir}	$rndkey1,$inout5
-.L${dir}_loop6_enter:				# happens to be 16-byte aligned
-	$movkey		16($key),$rndkey1
+.L${dir}_loop6_enter:
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
 	aes${dir}	$rndkey0,$inout0
 	aes${dir}	$rndkey0,$inout1
-	lea		32($key),$key
 	aes${dir}	$rndkey0,$inout2
 	aes${dir}	$rndkey0,$inout3
 	aes${dir}	$rndkey0,$inout4
 	aes${dir}	$rndkey0,$inout5
-	$movkey		($key),$rndkey0
+	$movkey		-16($key,%rax),$rndkey0
 	jnz		.L${dir}_loop6
 
 	aes${dir}	$rndkey1,$inout0
@@ -445,52 +496,51 @@ $code.=<<___;
 .align	16
 _aesni_${dir}rypt8:
 	$movkey		($key),$rndkey0
-	shr		\$1,$rounds
+	shl		\$4,$rounds
 	$movkey		16($key),$rndkey1
-	lea		32($key),$key
 	xorps		$rndkey0,$inout0
 	xorps		$rndkey0,$inout1
-	aes${dir}	$rndkey1,$inout0
 	pxor		$rndkey0,$inout2
-	aes${dir}	$rndkey1,$inout1
 	pxor		$rndkey0,$inout3
-	aes${dir}	$rndkey1,$inout2
 	pxor		$rndkey0,$inout4
-	aes${dir}	$rndkey1,$inout3
+	lea		32($key,$rounds),$key
+	neg		%rax			# $rounds
+	aes${dir}	$rndkey1,$inout0
+	add		\$16,%rax
 	pxor		$rndkey0,$inout5
-	dec		$rounds
-	aes${dir}	$rndkey1,$inout4
+	aes${dir}	$rndkey1,$inout1
 	pxor		$rndkey0,$inout6
-	aes${dir}	$rndkey1,$inout5
 	pxor		$rndkey0,$inout7
-	$movkey		($key),$rndkey0
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	aes${dir}	$rndkey1,$inout4
+	aes${dir}	$rndkey1,$inout5
 	aes${dir}	$rndkey1,$inout6
 	aes${dir}	$rndkey1,$inout7
-	$movkey		16($key),$rndkey1
+	$movkey		-16($key,%rax),$rndkey0
 	jmp		.L${dir}_loop8_enter
 .align	16
 .L${dir}_loop8:
 	aes${dir}	$rndkey1,$inout0
 	aes${dir}	$rndkey1,$inout1
-	dec		$rounds
 	aes${dir}	$rndkey1,$inout2
 	aes${dir}	$rndkey1,$inout3
 	aes${dir}	$rndkey1,$inout4
 	aes${dir}	$rndkey1,$inout5
 	aes${dir}	$rndkey1,$inout6
 	aes${dir}	$rndkey1,$inout7
-	$movkey		16($key),$rndkey1
-.L${dir}_loop8_enter:				# happens to be 16-byte aligned
+.L${dir}_loop8_enter:
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
 	aes${dir}	$rndkey0,$inout0
 	aes${dir}	$rndkey0,$inout1
-	lea		32($key),$key
 	aes${dir}	$rndkey0,$inout2
 	aes${dir}	$rndkey0,$inout3
 	aes${dir}	$rndkey0,$inout4
 	aes${dir}	$rndkey0,$inout5
 	aes${dir}	$rndkey0,$inout6
 	aes${dir}	$rndkey0,$inout7
-	$movkey		($key),$rndkey0
+	$movkey		-16($key,%rax),$rndkey0
 	jnz		.L${dir}_loop8
 
 	aes${dir}	$rndkey1,$inout0
@@ -513,6 +563,8 @@ _aesni_${dir}rypt8:
 .size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
 ___
 }
+&aesni_generate2("enc") if ($PREFIX eq "aesni");
+&aesni_generate2("dec");
 &aesni_generate3("enc") if ($PREFIX eq "aesni");
 &aesni_generate3("dec");
 &aesni_generate4("enc") if ($PREFIX eq "aesni");
@@ -634,8 +686,7 @@ $code.=<<___;
 	jmp	.Lecb_ret
 .align	16
 .Lecb_enc_two:
-	xorps	$inout2,$inout2
-	call	_aesni_encrypt3
+	call	_aesni_encrypt2
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	jmp	.Lecb_ret
@@ -771,8 +822,7 @@ $code.=<<___;
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_two:
-	xorps	$inout2,$inout2
-	call	_aesni_decrypt3
+	call	_aesni_decrypt2
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	jmp	.Lecb_ret
@@ -829,7 +879,8 @@ ___
 {
 my $cmac="%r9";	# 6th argument
 
-my $increment="%xmm6";
+my $increment="%xmm9";
+my $iv="%xmm6";
 my $bswap_mask="%xmm7";
 
 $code.=<<___;
@@ -852,49 +903,49 @@ $code.=<<___;
 	movdqa	.Lincrement64(%rip),$increment
 	movdqa	.Lbswap_mask(%rip),$bswap_mask
 
-	shr	\$1,$rounds
+	shl	\$4,$rounds
+	mov	\$16,$rnds_
 	lea	0($key),$key_
 	movdqu	($cmac),$inout1
 	movdqa	$iv,$inout0
-	mov	$rounds,$rnds_
+	lea	32($key,$rounds),$key		# end of key schedule
 	pshufb	$bswap_mask,$iv
+	sub	%rax,%r10			# twisted $rounds
 	jmp	.Lccm64_enc_outer
 .align	16
 .Lccm64_enc_outer:
 	$movkey	($key_),$rndkey0
-	mov	$rnds_,$rounds
+	mov	%r10,%rax
 	movups	($inp),$in0			# load inp
 
 	xorps	$rndkey0,$inout0		# counter
 	$movkey	16($key_),$rndkey1
 	xorps	$in0,$rndkey0
-	lea	32($key_),$key
 	xorps	$rndkey0,$inout1		# cmac^=inp
-	$movkey	($key),$rndkey0
+	$movkey	32($key_),$rndkey0
 
 .Lccm64_enc2_loop:
 	aesenc	$rndkey1,$inout0
-	dec	$rounds
 	aesenc	$rndkey1,$inout1
-	$movkey	16($key),$rndkey1
+	$movkey	($key,%rax),$rndkey1
+	add	\$32,%rax
 	aesenc	$rndkey0,$inout0
-	lea	32($key),$key
 	aesenc	$rndkey0,$inout1
-	$movkey	0($key),$rndkey0
+	$movkey	-16($key,%rax),$rndkey0
 	jnz	.Lccm64_enc2_loop
 	aesenc	$rndkey1,$inout0
 	aesenc	$rndkey1,$inout1
 	paddq	$increment,$iv
+	dec	$len
 	aesenclast	$rndkey0,$inout0
 	aesenclast	$rndkey0,$inout1
 
-	dec	$len
 	lea	16($inp),$inp
 	xorps	$inout0,$in0			# inp ^= E(iv)
 	movdqa	$iv,$inout0
 	movups	$in0,($out)			# save output
-	lea	16($out),$out
 	pshufb	$bswap_mask,$inout0
+	lea	16($out),$out
 	jnz	.Lccm64_enc_outer
 
 	movups	$inout1,($cmac)
@@ -940,15 +991,19 @@ $code.=<<___;
 ___
 	&aesni_generate1("enc",$key,$rounds);
 $code.=<<___;
+	shl	\$4,$rnds_
+	mov	\$16,$rounds
 	movups	($inp),$in0			# load inp
 	paddq	$increment,$iv
 	lea	16($inp),$inp
+	sub	%r10,%rax			# twisted $rounds
+	lea	32($key_,$rnds_),$key		# end of key schedule
+	mov	%rax,%r10
 	jmp	.Lccm64_dec_outer
 .align	16
 .Lccm64_dec_outer:
 	xorps	$inout0,$in0			# inp ^= E(iv)
 	movdqa	$iv,$inout0
-	mov	$rnds_,$rounds
 	movups	$in0,($out)			# save output
 	lea	16($out),$out
 	pshufb	$bswap_mask,$inout0
@@ -957,36 +1012,36 @@ $code.=<<___;
 	jz	.Lccm64_dec_break
 
 	$movkey	($key_),$rndkey0
-	shr	\$1,$rounds
+	mov	%r10,%rax
 	$movkey	16($key_),$rndkey1
 	xorps	$rndkey0,$in0
-	lea	32($key_),$key
 	xorps	$rndkey0,$inout0
 	xorps	$in0,$inout1			# cmac^=out
-	$movkey	($key),$rndkey0
-
+	$movkey	32($key_),$rndkey0
+	jmp	.Lccm64_dec2_loop
+.align	16
 .Lccm64_dec2_loop:
 	aesenc	$rndkey1,$inout0
-	dec	$rounds
 	aesenc	$rndkey1,$inout1
-	$movkey	16($key),$rndkey1
+	$movkey	($key,%rax),$rndkey1
+	add	\$32,%rax
 	aesenc	$rndkey0,$inout0
-	lea	32($key),$key
 	aesenc	$rndkey0,$inout1
-	$movkey	0($key),$rndkey0
+	$movkey	-16($key,%rax),$rndkey0
 	jnz	.Lccm64_dec2_loop
 	movups	($inp),$in0			# load inp
 	paddq	$increment,$iv
 	aesenc	$rndkey1,$inout0
 	aesenc	$rndkey1,$inout1
-	lea	16($inp),$inp
 	aesenclast	$rndkey0,$inout0
 	aesenclast	$rndkey0,$inout1
+	lea	16($inp),$inp
 	jmp	.Lccm64_dec_outer
 
 .align	16
 .Lccm64_dec_break:
 	#xorps	$in0,$inout1			# cmac^=out
+	mov	240($key_),$rounds
 ___
 	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
 $code.=<<___;
@@ -1014,7 +1069,7 @@ ___
 # does not update *ivec! (see crypto/modes/ctr128.c for details)
 #
 # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
-# http://rt.openssl.org/Ticket/Display.html?id=3031&user=guest&pass=guest.
+# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
 # Keywords are full unroll and modulo-schedule counter calculations
 # with zero-round key xor.
 {
@@ -1058,35 +1113,39 @@ $code.=<<___;
 	mov	12($key),$key0			# 0-round key LSB
 	movdqa	$inout0,0x00(%rsp)		# populate counter block
 	bswap	$ctr
-	movdqa	$inout0,0x10(%rsp)
-	movdqa	$inout0,0x20(%rsp)
-	movdqa	$inout0,0x30(%rsp)
+	movdqa	$inout0,$inout1
+	movdqa	$inout0,$inout2
+	movdqa	$inout0,$inout3
 	movdqa	$inout0,0x40(%rsp)
 	movdqa	$inout0,0x50(%rsp)
 	movdqa	$inout0,0x60(%rsp)
+	mov	%rdx,%r10			# borrow %rdx
 	movdqa	$inout0,0x70(%rsp)
 
-	mov	240($key),$rounds		# key->rounds
-
-	lea	1($ctr),%r9
-	 lea	2($ctr),%r10
-	bswap	%r9d
-	 bswap	%r10d
-	xor	$key0,%r9d
-	 xor	$key0,%r10d
-	mov	%r9d,0x10+12(%rsp)
-	lea	3($ctr),%r9
-	 mov	%r10d,0x20+12(%rsp)
-	bswap	%r9d
+	lea	1($ctr),%rax
+	 lea	2($ctr),%rdx
+	bswap	%eax
+	 bswap	%edx
+	xor	$key0,%eax
+	 xor	$key0,%edx
+	pinsrd	\$3,%eax,$inout1
+	lea	3($ctr),%rax
+	movdqa	$inout1,0x10(%rsp)
+	 pinsrd	\$3,%edx,$inout2
+	bswap	%eax
+	 mov	%r10,%rdx			# restore %rdx
 	 lea	4($ctr),%r10
-	xor	$key0,%r9d
+	 movdqa	$inout2,0x20(%rsp)
+	xor	$key0,%eax
 	 bswap	%r10d
-	mov	%r9d,0x30+12(%rsp)
+	pinsrd	\$3,%eax,$inout3
 	 xor	$key0,%r10d
+	movdqa	$inout3,0x30(%rsp)
 	lea	5($ctr),%r9
 	 mov	%r10d,0x40+12(%rsp)
 	bswap	%r9d
 	 lea	6($ctr),%r10
+	mov	240($key),$rounds		# key->rounds
 	xor	$key0,%r9d
 	 bswap	%r10d
 	mov	%r9d,0x50+12(%rsp)
@@ -1094,24 +1153,117 @@ $code.=<<___;
 	lea	7($ctr),%r9
 	 mov	%r10d,0x60+12(%rsp)
 	bswap	%r9d
+	 mov	OPENSSL_ia32cap_P+4(%rip),%r10d 
 	xor	$key0,%r9d
+	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
 	mov	%r9d,0x70+12(%rsp)
 
 	$movkey	0x10($key),$rndkey1
 
-	movdqa	0x10(%rsp),$inout1
-	movdqa	0x20(%rsp),$inout2
-	movdqa	0x30(%rsp),$inout3
 	movdqa	0x40(%rsp),$inout4
 	movdqa	0x50(%rsp),$inout5
 
 	cmp	\$8,$len
 	jb	.Lctr32_tail
 
+	sub	\$6,$len
+	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
+	je	.Lctr32_6x
+
 	lea	0x80($key),$key		# size optimization
-	sub	\$8,$len
+	sub	\$2,$len
 	jmp	.Lctr32_loop8
 
+.align	16
+.Lctr32_6x:
+	shl	\$4,$rounds
+	mov	\$48,$rnds_
+	bswap	$key0
+	lea	32($key,$rounds),$key	# end of key schedule
+	sub	%rax,%r10		# twisted $rounds
+	jmp	.Lctr32_loop6
+
+.align	16
+.Lctr32_loop6:
+	 add	\$6,$ctr
+	$movkey	-48($key,$rnds_),$rndkey0
+	aesenc	$rndkey1,$inout0
+	 mov	$ctr,%eax
+	 xor	$key0,%eax
+	aesenc	$rndkey1,$inout1
+	 movbe	%eax,`0x00+12`(%rsp)
+	 lea	1($ctr),%eax
+	aesenc	$rndkey1,$inout2
+	 xor	$key0,%eax
+	 movbe	%eax,`0x10+12`(%rsp)
+	aesenc	$rndkey1,$inout3
+	 lea	2($ctr),%eax
+	 xor	$key0,%eax
+	aesenc	$rndkey1,$inout4
+	 movbe	%eax,`0x20+12`(%rsp)
+	 lea	3($ctr),%eax
+	aesenc	$rndkey1,$inout5
+	$movkey	-32($key,$rnds_),$rndkey1
+	 xor	$key0,%eax
+
+	aesenc	$rndkey0,$inout0
+	 movbe	%eax,`0x30+12`(%rsp)
+	 lea	4($ctr),%eax
+	aesenc	$rndkey0,$inout1
+	 xor	$key0,%eax
+	 movbe	%eax,`0x40+12`(%rsp)
+	aesenc	$rndkey0,$inout2
+	 lea	5($ctr),%eax
+	 xor	$key0,%eax
+	aesenc	$rndkey0,$inout3
+	 movbe	%eax,`0x50+12`(%rsp)
+	 mov	%r10,%rax		# mov	$rnds_,$rounds
+	aesenc	$rndkey0,$inout4
+	aesenc	$rndkey0,$inout5
+	$movkey	-16($key,$rnds_),$rndkey0
+
+	call	.Lenc_loop6
+
+	movdqu	($inp),$inout6
+	movdqu	0x10($inp),$inout7
+	movdqu	0x20($inp),$in0
+	movdqu	0x30($inp),$in1
+	movdqu	0x40($inp),$in2
+	movdqu	0x50($inp),$in3
+	lea	0x60($inp),$inp
+	$movkey	-64($key,$rnds_),$rndkey1
+	pxor	$inout0,$inout6
+	movaps	0x00(%rsp),$inout0
+	pxor	$inout1,$inout7
+	movaps	0x10(%rsp),$inout1
+	pxor	$inout2,$in0
+	movaps	0x20(%rsp),$inout2
+	pxor	$inout3,$in1
+	movaps	0x30(%rsp),$inout3
+	pxor	$inout4,$in2
+	movaps	0x40(%rsp),$inout4
+	pxor	$inout5,$in3
+	movaps	0x50(%rsp),$inout5
+	movdqu	$inout6,($out)
+	movdqu	$inout7,0x10($out)
+	movdqu	$in0,0x20($out)
+	movdqu	$in1,0x30($out)
+	movdqu	$in2,0x40($out)
+	movdqu	$in3,0x50($out)
+	lea	0x60($out),$out
+	
+	sub	\$6,$len
+	jnc	.Lctr32_loop6
+
+	add	\$6,$len
+	jz	.Lctr32_done
+
+	lea	-48($rnds_),$rounds
+	lea	-80($key,$rnds_),$key	# restore $key
+	neg	$rounds
+	shr	\$4,$rounds		# restore $rounds
+	jmp	.Lctr32_tail
+
 .align	32
 .Lctr32_loop8:
 	 add		\$8,$ctr
@@ -1124,6 +1276,7 @@ $code.=<<___;
 	$movkey		0x20-0x80($key),$rndkey0
 	aesenc		$rndkey1,$inout2
 	 xor		$key0,%r9d
+	 nop
 	aesenc		$rndkey1,$inout3
 	 mov		%r9d,0x00+12(%rsp)
 	 lea		1($ctr),%r9
@@ -1136,11 +1289,12 @@ ___
 for($i=2;$i<8;$i++) {
 my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
 $code.=<<___;
+	 bswap		%r9d
 	aesenc		$rndkeyx,$inout0
 	aesenc		$rndkeyx,$inout1
-	 bswap		%r9d
-	aesenc		$rndkeyx,$inout2
 	 xor		$key0,%r9d
+	 .byte		0x66,0x90
+	aesenc		$rndkeyx,$inout2
 	aesenc		$rndkeyx,$inout3
 	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
 	 lea		$i($ctr),%r9
@@ -1152,21 +1306,21 @@ $code.=<<___;
 ___
 }
 $code.=<<___;
+	 bswap		%r9d
 	aesenc		$rndkey0,$inout0
 	aesenc		$rndkey0,$inout1
-	 bswap		%r9d
 	aesenc		$rndkey0,$inout2
 	 xor		$key0,%r9d
+	 movdqu		0x00($inp),$in0
 	aesenc		$rndkey0,$inout3
 	 mov		%r9d,0x70+12(%rsp)
+	 cmp		\$11,$rounds
 	aesenc		$rndkey0,$inout4
 	aesenc		$rndkey0,$inout5
 	aesenc		$rndkey0,$inout6
-	 movdqu		0x00($inp),$in0
 	aesenc		$rndkey0,$inout7
 	$movkey		0xa0-0x80($key),$rndkey0
 
-	cmp		\$11,$rounds
 	jb		.Lctr32_enc_done
 
 	aesenc		$rndkey1,$inout0
@@ -1209,48 +1363,50 @@ $code.=<<___;
 	aesenc		$rndkey0,$inout6
 	aesenc		$rndkey0,$inout7
 	$movkey		0xe0-0x80($key),$rndkey0
+	jmp		.Lctr32_enc_done
 
+.align	16
 .Lctr32_enc_done:
-	aesenc		$rndkey1,$inout0
 	movdqu		0x10($inp),$in1
 	pxor		$rndkey0,$in0
-	aesenc		$rndkey1,$inout1
 	movdqu		0x20($inp),$in2
 	pxor		$rndkey0,$in1
-	aesenc		$rndkey1,$inout2
 	movdqu		0x30($inp),$in3
 	pxor		$rndkey0,$in2
-	aesenc		$rndkey1,$inout3
 	movdqu		0x40($inp),$in4
 	pxor		$rndkey0,$in3
-	aesenc		$rndkey1,$inout4
 	movdqu		0x50($inp),$in5
 	pxor		$rndkey0,$in4
-	aesenc		$rndkey1,$inout5
 	pxor		$rndkey0,$in5
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
 	aesenc		$rndkey1,$inout6
 	aesenc		$rndkey1,$inout7
 	movdqu		0x60($inp),$rndkey1
+	lea		0x80($inp),$inp
 
 	aesenclast	$in0,$inout0
 	pxor		$rndkey0,$rndkey1
-	movdqu		0x70($inp),$in0
-	lea		0x80($inp),$inp
+	movdqu		0x70-0x80($inp),$in0
 	aesenclast	$in1,$inout1
 	pxor		$rndkey0,$in0
 	movdqa		0x00(%rsp),$in1		# load next counter block
 	aesenclast	$in2,$inout2
-	movdqa		0x10(%rsp),$in2
 	aesenclast	$in3,$inout3
+	movdqa		0x10(%rsp),$in2
 	movdqa		0x20(%rsp),$in3
 	aesenclast	$in4,$inout4
-	movdqa		0x30(%rsp),$in4
 	aesenclast	$in5,$inout5
+	movdqa		0x30(%rsp),$in4
 	movdqa		0x40(%rsp),$in5
 	aesenclast	$rndkey1,$inout6
 	movdqa		0x50(%rsp),$rndkey0
-	aesenclast	$in0,$inout7
 	$movkey		0x10-0x80($key),$rndkey1
+	aesenclast	$in0,$inout7
 
 	movups		$inout0,($out)		# store output
 	movdqa		$in1,$inout0
@@ -1278,41 +1434,42 @@ $code.=<<___;
 .Lctr32_tail:
 	lea	16($key),$key
 	cmp	\$4,$len
-	jbe	.Lctr32_loop4
+	jb	.Lctr32_loop3
+	je	.Lctr32_loop4
 
+	shl		\$4,$rounds
 	movdqa		0x60(%rsp),$inout6
+	pxor		$inout7,$inout7
 
 	$movkey		16($key),$rndkey0
 	aesenc		$rndkey1,$inout0
-	lea		16($key),$key
 	aesenc		$rndkey1,$inout1
-	shr		\$1,$rounds
+	lea		32-16($key,$rounds),$key
+	neg		%rax
 	aesenc		$rndkey1,$inout2
-	dec		$rounds
+	add		\$16,%rax
+	 movups		($inp),$in0
 	aesenc		$rndkey1,$inout3
 	aesenc		$rndkey1,$inout4
+	 movups		0x10($inp),$in1
+	 movups		0x20($inp),$in2
 	aesenc		$rndkey1,$inout5
 	aesenc		$rndkey1,$inout6
-	pxor		$inout7,$inout7
-	$movkey		16($key),$rndkey1
 
 	call            .Lenc_loop8_enter
 
-	movups	($inp),$in0
-	movups	0x10($inp),$in1
-	movups	0x20($inp),$in2
-	xorps	$in0,$inout0
-	movups	0x30($inp),$in3
-	xorps	$in1,$inout1
-	movups	0x40($inp),$in0
-	xorps	$in2,$inout2
-	movups	$inout0,($out)
-	xorps	$in3,$inout3
-	movups	$inout1,0x10($out)
-	xorps	$in0,$inout4
-	movups	$inout2,0x20($out)
-	movups	$inout3,0x30($out)
-	movups	$inout4,0x40($out)
+	movdqu	0x30($inp),$in3
+	pxor	$in0,$inout0
+	movdqu	0x40($inp),$in0
+	pxor	$in1,$inout1
+	movdqu	$inout0,($out)
+	pxor	$in2,$inout2
+	movdqu	$inout1,0x10($out)
+	pxor	$in3,$inout3
+	movdqu	$inout2,0x20($out)
+	pxor	$in0,$inout4
+	movdqu	$inout3,0x30($out)
+	movdqu	$inout4,0x40($out)
 	cmp	\$6,$len
 	jb	.Lctr32_done
 
@@ -1330,16 +1487,43 @@ $code.=<<___;
 .Lctr32_loop4:
 	aesenc		$rndkey1,$inout0
 	lea		16($key),$key
+	dec		$rounds
 	aesenc		$rndkey1,$inout1
 	aesenc		$rndkey1,$inout2
 	aesenc		$rndkey1,$inout3
 	$movkey		($key),$rndkey1
-	dec		$rounds
 	jnz		.Lctr32_loop4
 	aesenclast	$rndkey1,$inout0
 	aesenclast	$rndkey1,$inout1
+	 movups		($inp),$in0
+	 movups		0x10($inp),$in1
 	aesenclast	$rndkey1,$inout2
 	aesenclast	$rndkey1,$inout3
+	 movups		0x20($inp),$in2
+	 movups		0x30($inp),$in3
+
+	xorps	$in0,$inout0
+	movups	$inout0,($out)
+	xorps	$in1,$inout1
+	movups	$inout1,0x10($out)
+	pxor	$in2,$inout2
+	movdqu	$inout2,0x20($out)
+	pxor	$in3,$inout3
+	movdqu	$inout3,0x30($out)
+	jmp	.Lctr32_done
+
+.align	32
+.Lctr32_loop3:
+	aesenc		$rndkey1,$inout0
+	lea		16($key),$key
+	dec		$rounds
+	aesenc		$rndkey1,$inout1
+	aesenc		$rndkey1,$inout2
+	$movkey		($key),$rndkey1
+	jnz		.Lctr32_loop3
+	aesenclast	$rndkey1,$inout0
+	aesenclast	$rndkey1,$inout1
+	aesenclast	$rndkey1,$inout2
 
 	movups	($inp),$in0
 	xorps	$in0,$inout0
@@ -1355,12 +1539,6 @@ $code.=<<___;
 	movups	0x20($inp),$in2
 	xorps	$in2,$inout2
 	movups	$inout2,0x20($out)
-	cmp	\$4,$len
-	jb	.Lctr32_done
-
-	movups	0x30($inp),$in3
-	xorps	$in3,$inout3
-	movups	$inout3,0x30($out)
 	jmp	.Lctr32_done
 
 .align	16
@@ -1408,7 +1586,7 @@ ___
 my @tweak=map("%xmm$_",(10..15));
 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
 my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
-my $frame_size = 0x60 + ($win64?160:0);
+my $frame_size = 0x70 + ($win64?160:0);
 
 $code.=<<___;
 .globl	aesni_xts_encrypt
@@ -1435,220 +1613,259 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	lea	-8(%rax),%rbp
-	movups	($ivp),@tweak[5]		# load clear-text tweak
+	movups	($ivp),$inout0			# load clear-text tweak
 	mov	240(%r8),$rounds		# key2->rounds
 	mov	240($key),$rnds_		# key1->rounds
 ___
 	# generate the tweak
-	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+	&aesni_generate1("enc",$key2,$rounds,$inout0);
 $code.=<<___;
+	$movkey	($key),$rndkey0			# zero round key
 	mov	$key,$key_			# backup $key
 	mov	$rnds_,$rounds			# backup $rounds
+	shl	\$4,$rnds_
 	mov	$len,$len_			# backup $len
 	and	\$-16,$len
 
+	$movkey	16($key,$rnds_),$rndkey1	# last round key
+
 	movdqa	.Lxts_magic(%rip),$twmask
-	pxor	$twtmp,$twtmp
-	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+	movdqa	$inout0,@tweak[5]
+	pshufd	\$0x5f,$inout0,$twres
+	pxor	$rndkey0,$rndkey1
 ___
+    # alternative tweak calculation algorithm is based on suggestions
+    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
+    # and should help in the future...
     for ($i=0;$i<4;$i++) {
     $code.=<<___;
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
+	movdqa	$twres,$twtmp
+	paddd	$twres,$twres
 	movdqa	@tweak[5],@tweak[$i]
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
-	pand	$twmask,$twres			# isolate carry and residue
-	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
-	pxor	$twres,@tweak[5]
+	psrad	\$31,$twtmp			# broadcast upper bits
+	paddq	@tweak[5],@tweak[5]
+	pand	$twmask,$twtmp
+	pxor	$rndkey0,@tweak[$i]
+	pxor	$twtmp,@tweak[5]
 ___
     }
 $code.=<<___;
+	movdqa	@tweak[5],@tweak[4]
+	psrad	\$31,$twres
+	paddq	@tweak[5],@tweak[5]
+	pand	$twmask,$twres
+	pxor	$rndkey0,@tweak[4]
+	pxor	$twres,@tweak[5]
+	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
+
 	sub	\$16*6,$len
 	jc	.Lxts_enc_short
 
-	shr	\$1,$rounds
-	sub	\$1,$rounds
-	mov	$rounds,$rnds_
+	mov	\$16+96,$rounds
+	lea	32($key_,$rnds_),$key		# end of key schedule
+	sub	%r10,%rax			# twisted $rounds
+	$movkey	16($key_),$rndkey1
+	mov	%rax,%r10			# backup twisted $rounds
+	lea	.Lxts_magic(%rip),%r8
 	jmp	.Lxts_enc_grandloop
 
-.align	16
+.align	32
 .Lxts_enc_grandloop:
-	pshufd	\$0x13,$twtmp,$twres
-	movdqa	@tweak[5],@tweak[4]
-	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
 	movdqu	`16*0`($inp),$inout0		# load input
-	pand	$twmask,$twres			# isolate carry and residue
+	movdqa	$rndkey0,$twmask
 	movdqu	`16*1`($inp),$inout1
-	pxor	$twres,@tweak[5]
-
+	pxor	@tweak[0],$inout0
 	movdqu	`16*2`($inp),$inout2
-	pxor	@tweak[0],$inout0		# input^=tweak
-	movdqu	`16*3`($inp),$inout3
 	pxor	@tweak[1],$inout1
-	movdqu	`16*4`($inp),$inout4
+	 aesenc		$rndkey1,$inout0
+	movdqu	`16*3`($inp),$inout3
 	pxor	@tweak[2],$inout2
-	movdqu	`16*5`($inp),$inout5
-	lea	`16*6`($inp),$inp
+	 aesenc		$rndkey1,$inout1
+	movdqu	`16*4`($inp),$inout4
 	pxor	@tweak[3],$inout3
-	$movkey		($key_),$rndkey0
+	 aesenc		$rndkey1,$inout2
+	movdqu	`16*5`($inp),$inout5
+	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
+	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
 	pxor	@tweak[4],$inout4
-	pxor	@tweak[5],$inout5
+	 aesenc		$rndkey1,$inout3
+	$movkey	32($key_),$rndkey0
+	lea	`16*6`($inp),$inp
+	pxor	$twmask,$inout5
 
-	# inline _aesni_encrypt6 and interleave first and last rounds
-	# with own code...
-	$movkey		16($key_),$rndkey1
-	pxor		$rndkey0,$inout0
-	pxor		$rndkey0,$inout1
-	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
-	aesenc		$rndkey1,$inout0
-	lea		32($key_),$key
-	pxor		$rndkey0,$inout2
-	 movdqa	@tweak[1],`16*1`(%rsp)
-	aesenc		$rndkey1,$inout1
-	pxor		$rndkey0,$inout3
-	 movdqa	@tweak[2],`16*2`(%rsp)
-	aesenc		$rndkey1,$inout2
-	pxor		$rndkey0,$inout4
-	 movdqa	@tweak[3],`16*3`(%rsp)
-	aesenc		$rndkey1,$inout3
-	pxor		$rndkey0,$inout5
-	$movkey		($key),$rndkey0
-	dec		$rounds
-	 movdqa	@tweak[4],`16*4`(%rsp)
+	 pxor	$twres,@tweak[0]
 	aesenc		$rndkey1,$inout4
-	 movdqa	@tweak[5],`16*5`(%rsp)
+	 pxor	$twres,@tweak[1]
+	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
 	aesenc		$rndkey1,$inout5
-	pxor	$twtmp,$twtmp
-	pcmpgtd	@tweak[5],$twtmp
-	jmp		.Lxts_enc_loop6_enter
+	$movkey		48($key_),$rndkey1
+	 pxor	$twres,@tweak[2]
 
-.align	16
+	aesenc		$rndkey0,$inout0
+	 pxor	$twres,@tweak[3]
+	 movdqa	@tweak[1],`16*1`(%rsp)
+	aesenc		$rndkey0,$inout1
+	 pxor	$twres,@tweak[4]
+	 movdqa	@tweak[2],`16*2`(%rsp)
+	aesenc		$rndkey0,$inout2
+	aesenc		$rndkey0,$inout3
+	 pxor	$twres,$twmask
+	 movdqa	@tweak[4],`16*4`(%rsp)
+	aesenc		$rndkey0,$inout4
+	aesenc		$rndkey0,$inout5
+	$movkey		64($key_),$rndkey0
+	 movdqa	$twmask,`16*5`(%rsp)
+	pshufd	\$0x5f,@tweak[5],$twres
+	jmp	.Lxts_enc_loop6
+.align	32
 .Lxts_enc_loop6:
 	aesenc		$rndkey1,$inout0
 	aesenc		$rndkey1,$inout1
-	dec		$rounds
 	aesenc		$rndkey1,$inout2
 	aesenc		$rndkey1,$inout3
 	aesenc		$rndkey1,$inout4
 	aesenc		$rndkey1,$inout5
-.Lxts_enc_loop6_enter:
-	$movkey		16($key),$rndkey1
+	$movkey		-64($key,%rax),$rndkey1
+	add		\$32,%rax
+
 	aesenc		$rndkey0,$inout0
 	aesenc		$rndkey0,$inout1
-	lea		32($key),$key
 	aesenc		$rndkey0,$inout2
 	aesenc		$rndkey0,$inout3
 	aesenc		$rndkey0,$inout4
 	aesenc		$rndkey0,$inout5
-	$movkey		($key),$rndkey0
+	$movkey		-80($key,%rax),$rndkey0
 	jnz		.Lxts_enc_loop6
 
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	movdqa	(%r8),$twmask
+	movdqa	$twres,$twtmp
+	paddd	$twres,$twres
 	 aesenc		$rndkey1,$inout0
-	pand	$twmask,$twres			# isolate carry and residue
+	paddq	@tweak[5],@tweak[5]
+	psrad	\$31,$twtmp
 	 aesenc		$rndkey1,$inout1
-	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+	pand	$twmask,$twtmp
+	$movkey	($key_),@tweak[0]		# load round[0]
 	 aesenc		$rndkey1,$inout2
-	pxor	$twres,@tweak[5]
 	 aesenc		$rndkey1,$inout3
 	 aesenc		$rndkey1,$inout4
+	pxor	$twtmp,@tweak[5]
+	movaps	@tweak[0],@tweak[1]		# copy round[0]
 	 aesenc		$rndkey1,$inout5
-	 $movkey	16($key),$rndkey1
+	 $movkey	-64($key),$rndkey1
 
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
-	movdqa	@tweak[5],@tweak[0]
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	movdqa	$twres,$twtmp
 	 aesenc		$rndkey0,$inout0
-	pand	$twmask,$twres			# isolate carry and residue
+	paddd	$twres,$twres
+	pxor	@tweak[5],@tweak[0]
 	 aesenc		$rndkey0,$inout1
-	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	psrad	\$31,$twtmp
+	paddq	@tweak[5],@tweak[5]
 	 aesenc		$rndkey0,$inout2
-	pxor	$twres,@tweak[5]
 	 aesenc		$rndkey0,$inout3
+	pand	$twmask,$twtmp
+	movaps	@tweak[1],@tweak[2]
 	 aesenc		$rndkey0,$inout4
+	pxor	$twtmp,@tweak[5]
+	movdqa	$twres,$twtmp
 	 aesenc		$rndkey0,$inout5
-	 $movkey	32($key),$rndkey0
+	 $movkey	-48($key),$rndkey0
 
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
-	movdqa	@tweak[5],@tweak[1]
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	paddd	$twres,$twres
 	 aesenc		$rndkey1,$inout0
-	pand	$twmask,$twres			# isolate carry and residue
+	pxor	@tweak[5],@tweak[1]
+	psrad	\$31,$twtmp
 	 aesenc		$rndkey1,$inout1
-	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	paddq	@tweak[5],@tweak[5]
+	pand	$twmask,$twtmp
 	 aesenc		$rndkey1,$inout2
-	pxor	$twres,@tweak[5]
 	 aesenc		$rndkey1,$inout3
+	 movdqa	@tweak[3],`16*3`(%rsp)
+	pxor	$twtmp,@tweak[5]
 	 aesenc		$rndkey1,$inout4
+	movaps	@tweak[2],@tweak[3]
+	movdqa	$twres,$twtmp
 	 aesenc		$rndkey1,$inout5
+	 $movkey	-32($key),$rndkey1
 
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
-	movdqa	@tweak[5],@tweak[2]
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
-	 aesenclast	$rndkey0,$inout0
-	pand	$twmask,$twres			# isolate carry and residue
-	 aesenclast	$rndkey0,$inout1
-	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
-	 aesenclast	$rndkey0,$inout2
-	pxor	$twres,@tweak[5]
-	 aesenclast	$rndkey0,$inout3
-	 aesenclast	$rndkey0,$inout4
-	 aesenclast	$rndkey0,$inout5
-
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
-	movdqa	@tweak[5],@tweak[3]
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
-	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
-	pand	$twmask,$twres			# isolate carry and residue
-	 xorps	`16*1`(%rsp),$inout1
-	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
-	pxor	$twres,@tweak[5]
+	paddd	$twres,$twres
+	 aesenc		$rndkey0,$inout0
+	pxor	@tweak[5],@tweak[2]
+	psrad	\$31,$twtmp
+	 aesenc		$rndkey0,$inout1
+	paddq	@tweak[5],@tweak[5]
+	pand	$twmask,$twtmp
+	 aesenc		$rndkey0,$inout2
+	 aesenc		$rndkey0,$inout3
+	 aesenc		$rndkey0,$inout4
+	pxor	$twtmp,@tweak[5]
+	movaps	@tweak[3],@tweak[4]
+	 aesenc		$rndkey0,$inout5
 
-	xorps	`16*2`(%rsp),$inout2
-	movups	$inout0,`16*0`($out)		# write output
-	xorps	`16*3`(%rsp),$inout3
-	movups	$inout1,`16*1`($out)
-	xorps	`16*4`(%rsp),$inout4
-	movups	$inout2,`16*2`($out)
-	xorps	`16*5`(%rsp),$inout5
-	movups	$inout3,`16*3`($out)
-	mov	$rnds_,$rounds			# restore $rounds
-	movups	$inout4,`16*4`($out)
-	movups	$inout5,`16*5`($out)
-	lea	`16*6`($out),$out
-	sub	\$16*6,$len
-	jnc	.Lxts_enc_grandloop
+	movdqa	$twres,$rndkey0
+	paddd	$twres,$twres
+	 aesenc		$rndkey1,$inout0
+	pxor	@tweak[5],@tweak[3]
+	psrad	\$31,$rndkey0
+	 aesenc		$rndkey1,$inout1
+	paddq	@tweak[5],@tweak[5]
+	pand	$twmask,$rndkey0
+	 aesenc		$rndkey1,$inout2
+	 aesenc		$rndkey1,$inout3
+	pxor	$rndkey0,@tweak[5]
+	$movkey		($key_),$rndkey0
+	 aesenc		$rndkey1,$inout4
+	 aesenc		$rndkey1,$inout5
+	$movkey		16($key_),$rndkey1
+
+	pxor	@tweak[5],@tweak[4]
+	 aesenclast	`16*0`(%rsp),$inout0
+	psrad	\$31,$twres
+	paddq	@tweak[5],@tweak[5]
+	 aesenclast	`16*1`(%rsp),$inout1
+	 aesenclast	`16*2`(%rsp),$inout2
+	pand	$twmask,$twres
+	mov	%r10,%rax			# restore $rounds
+	 aesenclast	`16*3`(%rsp),$inout3
+	 aesenclast	`16*4`(%rsp),$inout4
+	 aesenclast	`16*5`(%rsp),$inout5
+	pxor	$twres,@tweak[5]
+
+	lea	`16*6`($out),$out
+	movups	$inout0,`-16*6`($out)		# write output
+	movups	$inout1,`-16*5`($out)
+	movups	$inout2,`-16*4`($out)
+	movups	$inout3,`-16*3`($out)
+	movups	$inout4,`-16*2`($out)
+	movups	$inout5,`-16*1`($out)
+	sub	\$16*6,$len
+	jnc	.Lxts_enc_grandloop
 
-	lea	3($rounds,$rounds),$rounds	# restore original value
+	mov	\$16+96,$rounds
+	sub	$rnds_,$rounds
 	mov	$key_,$key			# restore $key
-	mov	$rounds,$rnds_			# backup $rounds
+	shr	\$4,$rounds			# restore original value
 
 .Lxts_enc_short:
+	mov	$rounds,$rnds_			# backup $rounds
+	pxor	$rndkey0,@tweak[0]
 	add	\$16*6,$len
 	jz	.Lxts_enc_done
 
+	pxor	$rndkey0,@tweak[1]
 	cmp	\$0x20,$len
 	jb	.Lxts_enc_one
+	pxor	$rndkey0,@tweak[2]
 	je	.Lxts_enc_two
 
+	pxor	$rndkey0,@tweak[3]
 	cmp	\$0x40,$len
 	jb	.Lxts_enc_three
+	pxor	$rndkey0,@tweak[4]
 	je	.Lxts_enc_four
 
-	pshufd	\$0x13,$twtmp,$twres
-	movdqa	@tweak[5],@tweak[4]
-	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
-	 movdqu	($inp),$inout0
-	pand	$twmask,$twres			# isolate carry and residue
-	 movdqu	16*1($inp),$inout1
-	pxor	$twres,@tweak[5]
-
+	movdqu	($inp),$inout0
+	movdqu	16*1($inp),$inout1
 	movdqu	16*2($inp),$inout2
 	pxor	@tweak[0],$inout0
 	movdqu	16*3($inp),$inout3
@@ -1697,7 +1914,7 @@ $code.=<<___;
 	xorps	@tweak[0],$inout0
 	xorps	@tweak[1],$inout1
 
-	call	_aesni_encrypt3
+	call	_aesni_encrypt2
 
 	xorps	@tweak[0],$inout0
 	movdqa	@tweak[2],@tweak[0]
@@ -1743,15 +1960,15 @@ $code.=<<___;
 
 	call	_aesni_encrypt4
 
-	xorps	@tweak[0],$inout0
-	movdqa	@tweak[5],@tweak[0]
-	xorps	@tweak[1],$inout1
-	xorps	@tweak[2],$inout2
-	movups	$inout0,($out)
-	xorps	@tweak[3],$inout3
-	movups	$inout1,16*1($out)
-	movups	$inout2,16*2($out)
-	movups	$inout3,16*3($out)
+	pxor	@tweak[0],$inout0
+	movdqa	@tweak[4],@tweak[0]
+	pxor	@tweak[1],$inout1
+	pxor	@tweak[2],$inout2
+	movdqu	$inout0,($out)
+	pxor	@tweak[3],$inout3
+	movdqu	$inout1,16*1($out)
+	movdqu	$inout2,16*2($out)
+	movdqu	$inout3,16*3($out)
 	lea	16*4($out),$out
 	jmp	.Lxts_enc_done
 
@@ -1830,12 +2047,12 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	lea	-8(%rax),%rbp
-	movups	($ivp),@tweak[5]		# load clear-text tweak
+	movups	($ivp),$inout0			# load clear-text tweak
 	mov	240($key2),$rounds		# key2->rounds
 	mov	240($key),$rnds_		# key1->rounds
 ___
 	# generate the tweak
-	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+	&aesni_generate1("enc",$key2,$rounds,$inout0);
 $code.=<<___;
 	xor	%eax,%eax			# if ($len%16) len-=16;
 	test	\$15,$len
@@ -1843,213 +2060,249 @@ $code.=<<___;
 	shl	\$4,%rax
 	sub	%rax,$len
 
+	$movkey	($key),$rndkey0			# zero round key
 	mov	$key,$key_			# backup $key
 	mov	$rnds_,$rounds			# backup $rounds
+	shl	\$4,$rnds_
 	mov	$len,$len_			# backup $len
 	and	\$-16,$len
 
+	$movkey	16($key,$rnds_),$rndkey1	# last round key
+
 	movdqa	.Lxts_magic(%rip),$twmask
-	pxor	$twtmp,$twtmp
-	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+	movdqa	$inout0,@tweak[5]
+	pshufd	\$0x5f,$inout0,$twres
+	pxor	$rndkey0,$rndkey1
 ___
     for ($i=0;$i<4;$i++) {
     $code.=<<___;
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
+	movdqa	$twres,$twtmp
+	paddd	$twres,$twres
 	movdqa	@tweak[5],@tweak[$i]
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
-	pand	$twmask,$twres			# isolate carry and residue
-	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
-	pxor	$twres,@tweak[5]
+	psrad	\$31,$twtmp			# broadcast upper bits
+	paddq	@tweak[5],@tweak[5]
+	pand	$twmask,$twtmp
+	pxor	$rndkey0,@tweak[$i]
+	pxor	$twtmp,@tweak[5]
 ___
     }
 $code.=<<___;
+	movdqa	@tweak[5],@tweak[4]
+	psrad	\$31,$twres
+	paddq	@tweak[5],@tweak[5]
+	pand	$twmask,$twres
+	pxor	$rndkey0,@tweak[4]
+	pxor	$twres,@tweak[5]
+	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
+
 	sub	\$16*6,$len
 	jc	.Lxts_dec_short
 
-	shr	\$1,$rounds
-	sub	\$1,$rounds
-	mov	$rounds,$rnds_
+	mov	\$16+96,$rounds
+	lea	32($key_,$rnds_),$key		# end of key schedule
+	sub	%r10,%rax			# twisted $rounds
+	$movkey	16($key_),$rndkey1
+	mov	%rax,%r10			# backup twisted $rounds
+	lea	.Lxts_magic(%rip),%r8
 	jmp	.Lxts_dec_grandloop
 
-.align	16
+.align	32
 .Lxts_dec_grandloop:
-	pshufd	\$0x13,$twtmp,$twres
-	movdqa	@tweak[5],@tweak[4]
-	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
 	movdqu	`16*0`($inp),$inout0		# load input
-	pand	$twmask,$twres			# isolate carry and residue
+	movdqa	$rndkey0,$twmask
 	movdqu	`16*1`($inp),$inout1
-	pxor	$twres,@tweak[5]
-
+	pxor	@tweak[0],$inout0
 	movdqu	`16*2`($inp),$inout2
-	pxor	@tweak[0],$inout0		# input^=tweak
-	movdqu	`16*3`($inp),$inout3
 	pxor	@tweak[1],$inout1
-	movdqu	`16*4`($inp),$inout4
+	 aesdec		$rndkey1,$inout0
+	movdqu	`16*3`($inp),$inout3
 	pxor	@tweak[2],$inout2
-	movdqu	`16*5`($inp),$inout5
-	lea	`16*6`($inp),$inp
+	 aesdec		$rndkey1,$inout1
+	movdqu	`16*4`($inp),$inout4
 	pxor	@tweak[3],$inout3
-	$movkey		($key_),$rndkey0
+	 aesdec		$rndkey1,$inout2
+	movdqu	`16*5`($inp),$inout5
+	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
+	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
 	pxor	@tweak[4],$inout4
-	pxor	@tweak[5],$inout5
+	 aesdec		$rndkey1,$inout3
+	$movkey	32($key_),$rndkey0
+	lea	`16*6`($inp),$inp
+	pxor	$twmask,$inout5
 
-	# inline _aesni_decrypt6 and interleave first and last rounds
-	# with own code...
-	$movkey		16($key_),$rndkey1
-	pxor		$rndkey0,$inout0
-	pxor		$rndkey0,$inout1
-	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
-	aesdec		$rndkey1,$inout0
-	lea		32($key_),$key
-	pxor		$rndkey0,$inout2
-	 movdqa	@tweak[1],`16*1`(%rsp)
-	aesdec		$rndkey1,$inout1
-	pxor		$rndkey0,$inout3
-	 movdqa	@tweak[2],`16*2`(%rsp)
-	aesdec		$rndkey1,$inout2
-	pxor		$rndkey0,$inout4
-	 movdqa	@tweak[3],`16*3`(%rsp)
-	aesdec		$rndkey1,$inout3
-	pxor		$rndkey0,$inout5
-	$movkey		($key),$rndkey0
-	dec		$rounds
-	 movdqa	@tweak[4],`16*4`(%rsp)
+	 pxor	$twres,@tweak[0]
 	aesdec		$rndkey1,$inout4
-	 movdqa	@tweak[5],`16*5`(%rsp)
+	 pxor	$twres,@tweak[1]
+	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
 	aesdec		$rndkey1,$inout5
-	pxor	$twtmp,$twtmp
-	pcmpgtd	@tweak[5],$twtmp
-	jmp		.Lxts_dec_loop6_enter
+	$movkey		48($key_),$rndkey1
+	 pxor	$twres,@tweak[2]
 
-.align	16
+	aesdec		$rndkey0,$inout0
+	 pxor	$twres,@tweak[3]
+	 movdqa	@tweak[1],`16*1`(%rsp)
+	aesdec		$rndkey0,$inout1
+	 pxor	$twres,@tweak[4]
+	 movdqa	@tweak[2],`16*2`(%rsp)
+	aesdec		$rndkey0,$inout2
+	aesdec		$rndkey0,$inout3
+	 pxor	$twres,$twmask
+	 movdqa	@tweak[4],`16*4`(%rsp)
+	aesdec		$rndkey0,$inout4
+	aesdec		$rndkey0,$inout5
+	$movkey		64($key_),$rndkey0
+	 movdqa	$twmask,`16*5`(%rsp)
+	pshufd	\$0x5f,@tweak[5],$twres
+	jmp	.Lxts_dec_loop6
+.align	32
 .Lxts_dec_loop6:
 	aesdec		$rndkey1,$inout0
 	aesdec		$rndkey1,$inout1
-	dec		$rounds
 	aesdec		$rndkey1,$inout2
 	aesdec		$rndkey1,$inout3
 	aesdec		$rndkey1,$inout4
 	aesdec		$rndkey1,$inout5
-.Lxts_dec_loop6_enter:
-	$movkey		16($key),$rndkey1
+	$movkey		-64($key,%rax),$rndkey1
+	add		\$32,%rax
+
 	aesdec		$rndkey0,$inout0
 	aesdec		$rndkey0,$inout1
-	lea		32($key),$key
 	aesdec		$rndkey0,$inout2
 	aesdec		$rndkey0,$inout3
 	aesdec		$rndkey0,$inout4
 	aesdec		$rndkey0,$inout5
-	$movkey		($key),$rndkey0
+	$movkey		-80($key,%rax),$rndkey0
 	jnz		.Lxts_dec_loop6
 
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	movdqa	(%r8),$twmask
+	movdqa	$twres,$twtmp
+	paddd	$twres,$twres
 	 aesdec		$rndkey1,$inout0
-	pand	$twmask,$twres			# isolate carry and residue
+	paddq	@tweak[5],@tweak[5]
+	psrad	\$31,$twtmp
 	 aesdec		$rndkey1,$inout1
-	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+	pand	$twmask,$twtmp
+	$movkey	($key_),@tweak[0]		# load round[0]
 	 aesdec		$rndkey1,$inout2
-	pxor	$twres,@tweak[5]
 	 aesdec		$rndkey1,$inout3
 	 aesdec		$rndkey1,$inout4
+	pxor	$twtmp,@tweak[5]
+	movaps	@tweak[0],@tweak[1]		# copy round[0]
 	 aesdec		$rndkey1,$inout5
-	 $movkey	16($key),$rndkey1
+	 $movkey	-64($key),$rndkey1
 
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
-	movdqa	@tweak[5],@tweak[0]
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	movdqa	$twres,$twtmp
 	 aesdec		$rndkey0,$inout0
-	pand	$twmask,$twres			# isolate carry and residue
+	paddd	$twres,$twres
+	pxor	@tweak[5],@tweak[0]
 	 aesdec		$rndkey0,$inout1
-	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	psrad	\$31,$twtmp
+	paddq	@tweak[5],@tweak[5]
 	 aesdec		$rndkey0,$inout2
-	pxor	$twres,@tweak[5]
 	 aesdec		$rndkey0,$inout3
+	pand	$twmask,$twtmp
+	movaps	@tweak[1],@tweak[2]
 	 aesdec		$rndkey0,$inout4
+	pxor	$twtmp,@tweak[5]
+	movdqa	$twres,$twtmp
 	 aesdec		$rndkey0,$inout5
-	 $movkey	32($key),$rndkey0
+	 $movkey	-48($key),$rndkey0
 
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
-	movdqa	@tweak[5],@tweak[1]
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	paddd	$twres,$twres
 	 aesdec		$rndkey1,$inout0
-	pand	$twmask,$twres			# isolate carry and residue
+	pxor	@tweak[5],@tweak[1]
+	psrad	\$31,$twtmp
 	 aesdec		$rndkey1,$inout1
-	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	paddq	@tweak[5],@tweak[5]
+	pand	$twmask,$twtmp
 	 aesdec		$rndkey1,$inout2
-	pxor	$twres,@tweak[5]
 	 aesdec		$rndkey1,$inout3
+	 movdqa	@tweak[3],`16*3`(%rsp)
+	pxor	$twtmp,@tweak[5]
 	 aesdec		$rndkey1,$inout4
+	movaps	@tweak[2],@tweak[3]
+	movdqa	$twres,$twtmp
 	 aesdec		$rndkey1,$inout5
+	 $movkey	-32($key),$rndkey1
 
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
-	movdqa	@tweak[5],@tweak[2]
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
-	 aesdeclast	$rndkey0,$inout0
-	pand	$twmask,$twres			# isolate carry and residue
-	 aesdeclast	$rndkey0,$inout1
-	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
-	 aesdeclast	$rndkey0,$inout2
-	pxor	$twres,@tweak[5]
-	 aesdeclast	$rndkey0,$inout3
-	 aesdeclast	$rndkey0,$inout4
-	 aesdeclast	$rndkey0,$inout5
-
-	pshufd	\$0x13,$twtmp,$twres
-	pxor	$twtmp,$twtmp
-	movdqa	@tweak[5],@tweak[3]
-	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
-	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
-	pand	$twmask,$twres			# isolate carry and residue
-	 xorps	`16*1`(%rsp),$inout1
-	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	paddd	$twres,$twres
+	 aesdec		$rndkey0,$inout0
+	pxor	@tweak[5],@tweak[2]
+	psrad	\$31,$twtmp
+	 aesdec		$rndkey0,$inout1
+	paddq	@tweak[5],@tweak[5]
+	pand	$twmask,$twtmp
+	 aesdec		$rndkey0,$inout2
+	 aesdec		$rndkey0,$inout3
+	 aesdec		$rndkey0,$inout4
+	pxor	$twtmp,@tweak[5]
+	movaps	@tweak[3],@tweak[4]
+	 aesdec		$rndkey0,$inout5
+
+	movdqa	$twres,$rndkey0
+	paddd	$twres,$twres
+	 aesdec		$rndkey1,$inout0
+	pxor	@tweak[5],@tweak[3]
+	psrad	\$31,$rndkey0
+	 aesdec		$rndkey1,$inout1
+	paddq	@tweak[5],@tweak[5]
+	pand	$twmask,$rndkey0
+	 aesdec		$rndkey1,$inout2
+	 aesdec		$rndkey1,$inout3
+	pxor	$rndkey0,@tweak[5]
+	$movkey		($key_),$rndkey0
+	 aesdec		$rndkey1,$inout4
+	 aesdec		$rndkey1,$inout5
+	$movkey		16($key_),$rndkey1
+
+	pxor	@tweak[5],@tweak[4]
+	 aesdeclast	`16*0`(%rsp),$inout0
+	psrad	\$31,$twres
+	paddq	@tweak[5],@tweak[5]
+	 aesdeclast	`16*1`(%rsp),$inout1
+	 aesdeclast	`16*2`(%rsp),$inout2
+	pand	$twmask,$twres
+	mov	%r10,%rax			# restore $rounds
+	 aesdeclast	`16*3`(%rsp),$inout3
+	 aesdeclast	`16*4`(%rsp),$inout4
+	 aesdeclast	`16*5`(%rsp),$inout5
 	pxor	$twres,@tweak[5]
 
-	xorps	`16*2`(%rsp),$inout2
-	movups	$inout0,`16*0`($out)		# write output
-	xorps	`16*3`(%rsp),$inout3
-	movups	$inout1,`16*1`($out)
-	xorps	`16*4`(%rsp),$inout4
-	movups	$inout2,`16*2`($out)
-	xorps	`16*5`(%rsp),$inout5
-	movups	$inout3,`16*3`($out)
-	mov	$rnds_,$rounds			# restore $rounds
-	movups	$inout4,`16*4`($out)
-	movups	$inout5,`16*5`($out)
 	lea	`16*6`($out),$out
+	movups	$inout0,`-16*6`($out)		# write output
+	movups	$inout1,`-16*5`($out)
+	movups	$inout2,`-16*4`($out)
+	movups	$inout3,`-16*3`($out)
+	movups	$inout4,`-16*2`($out)
+	movups	$inout5,`-16*1`($out)
 	sub	\$16*6,$len
 	jnc	.Lxts_dec_grandloop
 
-	lea	3($rounds,$rounds),$rounds	# restore original value
+	mov	\$16+96,$rounds
+	sub	$rnds_,$rounds
 	mov	$key_,$key			# restore $key
-	mov	$rounds,$rnds_			# backup $rounds
+	shr	\$4,$rounds			# restore original value
 
 .Lxts_dec_short:
+	mov	$rounds,$rnds_			# backup $rounds
+	pxor	$rndkey0,@tweak[0]
+	pxor	$rndkey0,@tweak[1]
 	add	\$16*6,$len
 	jz	.Lxts_dec_done
 
+	pxor	$rndkey0,@tweak[2]
 	cmp	\$0x20,$len
 	jb	.Lxts_dec_one
+	pxor	$rndkey0,@tweak[3]
 	je	.Lxts_dec_two
 
+	pxor	$rndkey0,@tweak[4]
 	cmp	\$0x40,$len
 	jb	.Lxts_dec_three
 	je	.Lxts_dec_four
 
-	pshufd	\$0x13,$twtmp,$twres
-	movdqa	@tweak[5],@tweak[4]
-	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
-	 movdqu	($inp),$inout0
-	pand	$twmask,$twres			# isolate carry and residue
-	 movdqu	16*1($inp),$inout1
-	pxor	$twres,@tweak[5]
-
+	movdqu	($inp),$inout0
+	movdqu	16*1($inp),$inout1
 	movdqu	16*2($inp),$inout2
 	pxor	@tweak[0],$inout0
 	movdqu	16*3($inp),$inout3
@@ -2108,7 +2361,7 @@ $code.=<<___;
 	xorps	@tweak[0],$inout0
 	xorps	@tweak[1],$inout1
 
-	call	_aesni_decrypt3
+	call	_aesni_decrypt2
 
 	xorps	@tweak[0],$inout0
 	movdqa	@tweak[2],@tweak[0]
@@ -2134,7 +2387,7 @@ $code.=<<___;
 	xorps	@tweak[0],$inout0
 	movdqa	@tweak[3],@tweak[0]
 	xorps	@tweak[1],$inout1
-	movdqa	@tweak[5],@tweak[1]
+	movdqa	@tweak[4],@tweak[1]
 	xorps	@tweak[2],$inout2
 	movups	$inout0,($out)
 	movups	$inout1,16*1($out)
@@ -2144,14 +2397,8 @@ $code.=<<___;
 
 .align	16
 .Lxts_dec_four:
-	pshufd	\$0x13,$twtmp,$twres
-	movdqa	@tweak[5],@tweak[4]
-	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
-	 movups	($inp),$inout0
-	pand	$twmask,$twres			# isolate carry and residue
-	 movups	16*1($inp),$inout1
-	pxor	$twres,@tweak[5]
-
+	movups	($inp),$inout0
+	movups	16*1($inp),$inout1
 	movups	16*2($inp),$inout2
 	xorps	@tweak[0],$inout0
 	movups	16*3($inp),$inout3
@@ -2162,16 +2409,16 @@ $code.=<<___;
 
 	call	_aesni_decrypt4
 
-	xorps	@tweak[0],$inout0
+	pxor	@tweak[0],$inout0
 	movdqa	@tweak[4],@tweak[0]
-	xorps	@tweak[1],$inout1
+	pxor	@tweak[1],$inout1
 	movdqa	@tweak[5],@tweak[1]
-	xorps	@tweak[2],$inout2
-	movups	$inout0,($out)
-	xorps	@tweak[3],$inout3
-	movups	$inout1,16*1($out)
-	movups	$inout2,16*2($out)
-	movups	$inout3,16*3($out)
+	pxor	@tweak[2],$inout2
+	movdqu	$inout0,($out)
+	pxor	@tweak[3],$inout3
+	movdqu	$inout1,16*1($out)
+	movdqu	$inout2,16*2($out)
+	movdqu	$inout3,16*3($out)
 	lea	16*4($out),$out
 	jmp	.Lxts_dec_done
 
@@ -2242,7 +2489,10 @@ ___
 #			    size_t length, const AES_KEY *key,
 #			    unsigned char *ivp,const int enc);
 {
-my $frame_size = 0x10 + ($win64?0x40:0);	# used in decrypt
+my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
+my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
+my $inp_=$key_;
+
 $code.=<<___;
 .globl	${PREFIX}_cbc_encrypt
 .type	${PREFIX}_cbc_encrypt,\@function,6
@@ -2308,248 +2558,359 @@ $code.=<<___ if ($win64);
 	movaps	%xmm7,0x20(%rsp)
 	movaps	%xmm8,0x30(%rsp)
 	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
 .Lcbc_decrypt_body:
 ___
 $code.=<<___;
 	lea	-8(%rax),%rbp
 	movups	($ivp),$iv
 	mov	$rnds_,$rounds
-	cmp	\$0x70,$len
+	cmp	\$0x50,$len
 	jbe	.Lcbc_dec_tail
-	shr	\$1,$rnds_
-	sub	\$0x70,$len
-	mov	$rnds_,$rounds
-	movaps	$iv,(%rsp)
+
+	$movkey	($key),$rndkey0
+	movdqu	0x00($inp),$inout0	# load input
+	movdqu	0x10($inp),$inout1
+	movdqa	$inout0,$in0
+	movdqu	0x20($inp),$inout2
+	movdqa	$inout1,$in1
+	movdqu	0x30($inp),$inout3
+	movdqa	$inout2,$in2
+	movdqu	0x40($inp),$inout4
+	movdqa	$inout3,$in3
+	movdqu	0x50($inp),$inout5
+	movdqa	$inout4,$in4
+	mov	OPENSSL_ia32cap_P+4(%rip),%r9d
+	cmp	\$0x70,$len
+	jbe	.Lcbc_dec_six_or_seven
+
+	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE	
+	sub	\$0x50,$len
+	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
+	je	.Lcbc_dec_loop6_enter
+	sub	\$0x20,$len
+	lea	0x70($key),$key		# size optimization
 	jmp	.Lcbc_dec_loop8_enter
 .align	16
 .Lcbc_dec_loop8:
-	movaps	$rndkey0,(%rsp)			# save IV
 	movups	$inout7,($out)
 	lea	0x10($out),$out
 .Lcbc_dec_loop8_enter:
-	$movkey		($key),$rndkey0
-	movups	($inp),$inout0			# load input
-	movups	0x10($inp),$inout1
-	$movkey		16($key),$rndkey1
+	movdqu		0x60($inp),$inout6
+	pxor		$rndkey0,$inout0
+	movdqu		0x70($inp),$inout7
+	pxor		$rndkey0,$inout1
+	$movkey		0x10-0x70($key),$rndkey1
+	pxor		$rndkey0,$inout2
+	xor		$inp_,$inp_
+	cmp		\$0x70,$len	# is there at least 0x60 bytes ahead?
+	pxor		$rndkey0,$inout3
+	pxor		$rndkey0,$inout4
+	pxor		$rndkey0,$inout5
+	pxor		$rndkey0,$inout6
 
-	lea		32($key),$key
-	movdqu	0x20($inp),$inout2
-	xorps		$rndkey0,$inout0
-	movdqu	0x30($inp),$inout3
-	xorps		$rndkey0,$inout1
-	movdqu	0x40($inp),$inout4
 	aesdec		$rndkey1,$inout0
-	pxor		$rndkey0,$inout2
-	movdqu	0x50($inp),$inout5
+	pxor		$rndkey0,$inout7
+	$movkey		0x20-0x70($key),$rndkey0
 	aesdec		$rndkey1,$inout1
-	pxor		$rndkey0,$inout3
-	movdqu	0x60($inp),$inout6
 	aesdec		$rndkey1,$inout2
-	pxor		$rndkey0,$inout4
-	movdqu	0x70($inp),$inout7
 	aesdec		$rndkey1,$inout3
-	pxor		$rndkey0,$inout5
-	dec		$rounds
 	aesdec		$rndkey1,$inout4
-	pxor		$rndkey0,$inout6
 	aesdec		$rndkey1,$inout5
-	pxor		$rndkey0,$inout7
-	$movkey		($key),$rndkey0
 	aesdec		$rndkey1,$inout6
+	setnc		${inp_}b
+	shl		\$7,$inp_
 	aesdec		$rndkey1,$inout7
-	$movkey		16($key),$rndkey1
+	add		$inp,$inp_
+	$movkey		0x30-0x70($key),$rndkey1
+___
+for($i=1;$i<12;$i++) {
+my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
+$code.=<<___	if ($i==7);
+	cmp		\$11,$rounds
+___
+$code.=<<___;
+	aesdec		$rndkeyx,$inout0
+	aesdec		$rndkeyx,$inout1
+	aesdec		$rndkeyx,$inout2
+	aesdec		$rndkeyx,$inout3
+	aesdec		$rndkeyx,$inout4
+	aesdec		$rndkeyx,$inout5
+	aesdec		$rndkeyx,$inout6
+	aesdec		$rndkeyx,$inout7
+	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
+___
+$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
+	nop
+___
+$code.=<<___	if ($i==7);
+	jb		.Lcbc_dec_done
+___
+$code.=<<___	if ($i==9);
+	je		.Lcbc_dec_done
+___
+$code.=<<___	if ($i==11);
+	jmp		.Lcbc_dec_done
+___
+}
+$code.=<<___;
+.align	16
+.Lcbc_dec_done:
+	aesdec		$rndkey1,$inout0
+	aesdec		$rndkey1,$inout1
+	pxor		$rndkey0,$iv
+	pxor		$rndkey0,$in0
+	aesdec		$rndkey1,$inout2
+	aesdec		$rndkey1,$inout3
+	pxor		$rndkey0,$in1
+	pxor		$rndkey0,$in2
+	aesdec		$rndkey1,$inout4
+	aesdec		$rndkey1,$inout5
+	pxor		$rndkey0,$in3
+	pxor		$rndkey0,$in4
+	aesdec		$rndkey1,$inout6
+	aesdec		$rndkey1,$inout7
+	movdqu		0x50($inp),$rndkey1
 
-	call		.Ldec_loop8_enter
+	aesdeclast	$iv,$inout0
+	movdqu		0x60($inp),$iv		# borrow $iv
+	pxor		$rndkey0,$rndkey1
+	aesdeclast	$in0,$inout1
+	pxor		$rndkey0,$iv
+	movdqu		0x70($inp),$rndkey0	# next IV
+	aesdeclast	$in1,$inout2
+	lea		0x80($inp),$inp
+	movdqu		0x00($inp_),$in0
+	aesdeclast	$in2,$inout3
+	aesdeclast	$in3,$inout4
+	movdqu		0x10($inp_),$in1
+	movdqu		0x20($inp_),$in2
+	aesdeclast	$in4,$inout5
+	aesdeclast	$rndkey1,$inout6
+	movdqu		0x30($inp_),$in3
+	movdqu		0x40($inp_),$in4
+	aesdeclast	$iv,$inout7
+	movdqa		$rndkey0,$iv		# return $iv
+	movdqu		0x50($inp_),$rndkey1
+	$movkey		-0x70($key),$rndkey0
+
+	movups		$inout0,($out)		# store output
+	movdqa		$in0,$inout0
+	movups		$inout1,0x10($out)
+	movdqa		$in1,$inout1
+	movups		$inout2,0x20($out)
+	movdqa		$in2,$inout2
+	movups		$inout3,0x30($out)
+	movdqa		$in3,$inout3
+	movups		$inout4,0x40($out)
+	movdqa		$in4,$inout4
+	movups		$inout5,0x50($out)
+	movdqa		$rndkey1,$inout5
+	movups		$inout6,0x60($out)
+	lea		0x70($out),$out
 
-	movups	($inp),$rndkey1		# re-load input
-	movups	0x10($inp),$rndkey0
-	xorps	(%rsp),$inout0		# ^= IV
-	xorps	$rndkey1,$inout1
-	movups	0x20($inp),$rndkey1
-	xorps	$rndkey0,$inout2
-	movups	0x30($inp),$rndkey0
-	xorps	$rndkey1,$inout3
-	movups	0x40($inp),$rndkey1
-	xorps	$rndkey0,$inout4
-	movups	0x50($inp),$rndkey0
-	xorps	$rndkey1,$inout5
-	movups	0x60($inp),$rndkey1
-	xorps	$rndkey0,$inout6
-	movups	0x70($inp),$rndkey0	# IV
-	xorps	$rndkey1,$inout7
-	movups	$inout0,($out)
-	movups	$inout1,0x10($out)
-	movups	$inout2,0x20($out)
-	movups	$inout3,0x30($out)
-	mov	$rnds_,$rounds		# restore $rounds
-	movups	$inout4,0x40($out)
-	mov	$key_,$key		# restore $key
-	movups	$inout5,0x50($out)
-	lea	0x80($inp),$inp
-	movups	$inout6,0x60($out)
-	lea	0x70($out),$out
 	sub	\$0x80,$len
 	ja	.Lcbc_dec_loop8
 
 	movaps	$inout7,$inout0
-	movaps	$rndkey0,$iv
+	lea	-0x70($key),$key
 	add	\$0x70,$len
 	jle	.Lcbc_dec_tail_collected
-	movups	$inout0,($out)
-	lea	1($rnds_,$rnds_),$rounds
+	movups	$inout7,($out)
+	lea	0x10($out),$out
+	cmp	\$0x50,$len
+	jbe	.Lcbc_dec_tail
+
+	movaps	$in0,$inout0
+.Lcbc_dec_six_or_seven:
+	cmp	\$0x60,$len
+	ja	.Lcbc_dec_seven
+
+	movaps	$inout5,$inout6
+	call	_aesni_decrypt6
+	pxor	$iv,$inout0		# ^= IV
+	movaps	$inout6,$iv
+	pxor	$in0,$inout1
+	movdqu	$inout0,($out)
+	pxor	$in1,$inout2
+	movdqu	$inout1,0x10($out)
+	pxor	$in2,$inout3
+	movdqu	$inout2,0x20($out)
+	pxor	$in3,$inout4
+	movdqu	$inout3,0x30($out)
+	pxor	$in4,$inout5
+	movdqu	$inout4,0x40($out)
+	lea	0x50($out),$out
+	movdqa	$inout5,$inout0
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_seven:
+	movups	0x60($inp),$inout6
+	xorps	$inout7,$inout7
+	call	_aesni_decrypt8
+	movups	0x50($inp),$inout7
+	pxor	$iv,$inout0		# ^= IV
+	movups	0x60($inp),$iv
+	pxor	$in0,$inout1
+	movdqu	$inout0,($out)
+	pxor	$in1,$inout2
+	movdqu	$inout1,0x10($out)
+	pxor	$in2,$inout3
+	movdqu	$inout2,0x20($out)
+	pxor	$in3,$inout4
+	movdqu	$inout3,0x30($out)
+	pxor	$in4,$inout5
+	movdqu	$inout4,0x40($out)
+	pxor	$inout7,$inout6
+	movdqu	$inout5,0x50($out)
+	lea	0x60($out),$out
+	movdqa	$inout6,$inout0
+	jmp	.Lcbc_dec_tail_collected
+
+.align	16
+.Lcbc_dec_loop6:
+	movups	$inout5,($out)
+	lea	0x10($out),$out
+	movdqu	0x00($inp),$inout0	# load input
+	movdqu	0x10($inp),$inout1
+	movdqa	$inout0,$in0
+	movdqu	0x20($inp),$inout2
+	movdqa	$inout1,$in1
+	movdqu	0x30($inp),$inout3
+	movdqa	$inout2,$in2
+	movdqu	0x40($inp),$inout4
+	movdqa	$inout3,$in3
+	movdqu	0x50($inp),$inout5
+	movdqa	$inout4,$in4
+.Lcbc_dec_loop6_enter:
+	lea	0x60($inp),$inp
+	movdqa	$inout5,$inout6
+
+	call	_aesni_decrypt6
+
+	pxor	$iv,$inout0		# ^= IV
+	movdqa	$inout6,$iv
+	pxor	$in0,$inout1
+	movdqu	$inout0,($out)
+	pxor	$in1,$inout2
+	movdqu	$inout1,0x10($out)
+	pxor	$in2,$inout3
+	movdqu	$inout2,0x20($out)
+	pxor	$in3,$inout4
+	mov	$key_,$key
+	movdqu	$inout3,0x30($out)
+	pxor	$in4,$inout5
+	mov	$rnds_,$rounds
+	movdqu	$inout4,0x40($out)
+	lea	0x50($out),$out
+	sub	\$0x60,$len
+	ja	.Lcbc_dec_loop6
+
+	movdqa	$inout5,$inout0
+	add	\$0x50,$len
+	jle	.Lcbc_dec_tail_collected
+	movups	$inout5,($out)
 	lea	0x10($out),$out
+
 .Lcbc_dec_tail:
 	movups	($inp),$inout0
-	movaps	$inout0,$in0
-	cmp	\$0x10,$len
+	sub	\$0x10,$len
 	jbe	.Lcbc_dec_one
 
 	movups	0x10($inp),$inout1
-	movaps	$inout1,$in1
-	cmp	\$0x20,$len
+	movaps	$inout0,$in0
+	sub	\$0x10,$len
 	jbe	.Lcbc_dec_two
 
 	movups	0x20($inp),$inout2
-	movaps	$inout2,$in2
-	cmp	\$0x30,$len
+	movaps	$inout1,$in1
+	sub	\$0x10,$len
 	jbe	.Lcbc_dec_three
 
 	movups	0x30($inp),$inout3
-	cmp	\$0x40,$len
+	movaps	$inout2,$in2
+	sub	\$0x10,$len
 	jbe	.Lcbc_dec_four
 
 	movups	0x40($inp),$inout4
-	cmp	\$0x50,$len
-	jbe	.Lcbc_dec_five
-
-	movups	0x50($inp),$inout5
-	cmp	\$0x60,$len
-	jbe	.Lcbc_dec_six
-
-	movups	0x60($inp),$inout6
-	movaps	$iv,(%rsp)		# save IV
-	call	_aesni_decrypt8
-	movups	($inp),$rndkey1
-	movups	0x10($inp),$rndkey0
-	xorps	(%rsp),$inout0		# ^= IV
-	xorps	$rndkey1,$inout1
-	movups	0x20($inp),$rndkey1
-	xorps	$rndkey0,$inout2
-	movups	0x30($inp),$rndkey0
-	xorps	$rndkey1,$inout3
-	movups	0x40($inp),$rndkey1
-	xorps	$rndkey0,$inout4
-	movups	0x50($inp),$rndkey0
-	xorps	$rndkey1,$inout5
-	movups	0x60($inp),$iv		# IV
-	xorps	$rndkey0,$inout6
-	movups	$inout0,($out)
-	movups	$inout1,0x10($out)
-	movups	$inout2,0x20($out)
-	movups	$inout3,0x30($out)
-	movups	$inout4,0x40($out)
-	movups	$inout5,0x50($out)
-	lea	0x60($out),$out
-	movaps	$inout6,$inout0
-	sub	\$0x70,$len
+	movaps	$inout3,$in3
+	movaps	$inout4,$in4
+	xorps	$inout5,$inout5
+	call	_aesni_decrypt6
+	pxor	$iv,$inout0
+	movaps	$in4,$iv
+	pxor	$in0,$inout1
+	movdqu	$inout0,($out)
+	pxor	$in1,$inout2
+	movdqu	$inout1,0x10($out)
+	pxor	$in2,$inout3
+	movdqu	$inout2,0x20($out)
+	pxor	$in3,$inout4
+	movdqu	$inout3,0x30($out)
+	lea	0x40($out),$out
+	movdqa	$inout4,$inout0
+	sub	\$0x10,$len
 	jmp	.Lcbc_dec_tail_collected
+
 .align	16
 .Lcbc_dec_one:
+	movaps	$inout0,$in0
 ___
 	&aesni_generate1("dec",$key,$rounds);
 $code.=<<___;
 	xorps	$iv,$inout0
 	movaps	$in0,$iv
-	sub	\$0x10,$len
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_two:
-	xorps	$inout2,$inout2
-	call	_aesni_decrypt3
-	xorps	$iv,$inout0
-	xorps	$in0,$inout1
-	movups	$inout0,($out)
+	movaps	$inout1,$in1
+	call	_aesni_decrypt2
+	pxor	$iv,$inout0
 	movaps	$in1,$iv
-	movaps	$inout1,$inout0
+	pxor	$in0,$inout1
+	movdqu	$inout0,($out)
+	movdqa	$inout1,$inout0
 	lea	0x10($out),$out
-	sub	\$0x20,$len
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_three:
+	movaps	$inout2,$in2
 	call	_aesni_decrypt3
-	xorps	$iv,$inout0
-	xorps	$in0,$inout1
-	movups	$inout0,($out)
-	xorps	$in1,$inout2
-	movups	$inout1,0x10($out)
+	pxor	$iv,$inout0
 	movaps	$in2,$iv
-	movaps	$inout2,$inout0
+	pxor	$in0,$inout1
+	movdqu	$inout0,($out)
+	pxor	$in1,$inout2
+	movdqu	$inout1,0x10($out)
+	movdqa	$inout2,$inout0
 	lea	0x20($out),$out
-	sub	\$0x30,$len
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_four:
+	movaps	$inout3,$in3
 	call	_aesni_decrypt4
-	xorps	$iv,$inout0
-	movups	0x30($inp),$iv
-	xorps	$in0,$inout1
-	movups	$inout0,($out)
-	xorps	$in1,$inout2
-	movups	$inout1,0x10($out)
-	xorps	$in2,$inout3
-	movups	$inout2,0x20($out)
-	movaps	$inout3,$inout0
+	pxor	$iv,$inout0
+	movaps	$in3,$iv
+	pxor	$in0,$inout1
+	movdqu	$inout0,($out)
+	pxor	$in1,$inout2
+	movdqu	$inout1,0x10($out)
+	pxor	$in2,$inout3
+	movdqu	$inout2,0x20($out)
+	movdqa	$inout3,$inout0
 	lea	0x30($out),$out
-	sub	\$0x40,$len
-	jmp	.Lcbc_dec_tail_collected
-.align	16
-.Lcbc_dec_five:
-	xorps	$inout5,$inout5
-	call	_aesni_decrypt6
-	movups	0x10($inp),$rndkey1
-	movups	0x20($inp),$rndkey0
-	xorps	$iv,$inout0
-	xorps	$in0,$inout1
-	xorps	$rndkey1,$inout2
-	movups	0x30($inp),$rndkey1
-	xorps	$rndkey0,$inout3
-	movups	0x40($inp),$iv
-	xorps	$rndkey1,$inout4
-	movups	$inout0,($out)
-	movups	$inout1,0x10($out)
-	movups	$inout2,0x20($out)
-	movups	$inout3,0x30($out)
-	lea	0x40($out),$out
-	movaps	$inout4,$inout0
-	sub	\$0x50,$len
-	jmp	.Lcbc_dec_tail_collected
-.align	16
-.Lcbc_dec_six:
-	call	_aesni_decrypt6
-	movups	0x10($inp),$rndkey1
-	movups	0x20($inp),$rndkey0
-	xorps	$iv,$inout0
-	xorps	$in0,$inout1
-	xorps	$rndkey1,$inout2
-	movups	0x30($inp),$rndkey1
-	xorps	$rndkey0,$inout3
-	movups	0x40($inp),$rndkey0
-	xorps	$rndkey1,$inout4
-	movups	0x50($inp),$iv
-	xorps	$rndkey0,$inout5
-	movups	$inout0,($out)
-	movups	$inout1,0x10($out)
-	movups	$inout2,0x20($out)
-	movups	$inout3,0x30($out)
-	movups	$inout4,0x40($out)
-	lea	0x50($out),$out
-	movaps	$inout5,$inout0
-	sub	\$0x60,$len
 	jmp	.Lcbc_dec_tail_collected
+
 .align	16
 .Lcbc_dec_tail_collected:
-	and	\$15,$len
 	movups	$iv,($ivp)
+	and	\$15,$len
 	jnz	.Lcbc_dec_tail_partial
 	movups	$inout0,($out)
 	jmp	.Lcbc_dec_ret
@@ -2569,6 +2930,12 @@ $code.=<<___ if ($win64);
 	movaps	0x20(%rsp),%xmm7
 	movaps	0x30(%rsp),%xmm8
 	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
 ___
 $code.=<<___;
 	lea	(%rbp),%rsp
@@ -2991,7 +3358,7 @@ cbc_se_handler:
 
 	lea	16(%rax),%rsi		# %xmm save area
 	lea	512($context),%rdi	# &context.Xmm6
-	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
 	.long	0xa548f3fc		# cld; rep movsq
 
 .Lcommon_rbp_tail:
@@ -3156,11 +3523,30 @@ sub aesni {
 	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
 	return ".byte\t".join(',',@opcode);
     }
+    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+	my %opcodelet = (
+		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
+		"aesdec" => 0xde,	"aesdeclast" => 0xdf
+	);
+	return undef if (!defined($opcodelet{$1}));
+	my $off = $2;
+	push @opcode,0x44 if ($3>=8);
+	push @opcode,0x0f,0x38,$opcodelet{$1};
+	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M
+	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+	return ".byte\t".join(',',@opcode);
+    }
     return $line;
 }
 
+sub movbe {
+	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
+}
+
 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
+$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
 
 print $code;