From 84e7485bfb22847d43e85e02b4d9e5e393fa34ec Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Tue, 23 Aug 2011 20:53:34 +0000
Subject: [PATCH] Add RC4-MD5 and AESNI-SHA1 "stitched" implementations [from
 HEAD].

---
 CHANGES                             |    5 +
 Configure                           |    2 +-
 TABLE                               |   44 +-
 crypto/aes/Makefile                 |    2 +
 crypto/aes/asm/aesni-sha1-x86_64.pl | 1249 +++++++++++++++++++++++++++
 crypto/evp/Makefile                 |   31 +-
 crypto/evp/c_allc.c                 |    7 +
 crypto/evp/e_aes_cbc_hmac_sha1.c    |  403 +++++++++
 crypto/evp/e_rc4_hmac_md5.c         |  292 +++++++
 crypto/evp/evp.h                    |    7 +
 crypto/evp/names.c                  |    3 +
 crypto/objects/obj_dat.h            |   21 +-
 crypto/objects/obj_mac.h            |   16 +
 crypto/objects/obj_mac.num          |    4 +
 crypto/objects/objects.txt          |    6 +
 crypto/rc4/Makefile                 |    2 +
 crypto/rc4/asm/rc4-md5-x86_64.pl    |  631 ++++++++++++++
 ssl/ssl_algs.c                      |    7 +
 18 files changed, 2704 insertions(+), 28 deletions(-)
 create mode 100644 crypto/aes/asm/aesni-sha1-x86_64.pl
 create mode 100644 crypto/evp/e_aes_cbc_hmac_sha1.c
 create mode 100644 crypto/evp/e_rc4_hmac_md5.c
 create mode 100644 crypto/rc4/asm/rc4-md5-x86_64.pl

diff --git a/CHANGES b/CHANGES
index 68e7bc4a82..7673c680f9 100644
--- a/CHANGES
+++ b/CHANGES
@@ -4,6 +4,11 @@
 
  Changes between 1.0.0e and 1.0.1  [xx XXX xxxx]
 
+  *) Add RC4-MD5 and AESNI-SHA1 "stitched" implementations.
+
+     This work was sponsored by Intel.
+     [Andy Polyakov]
+
   *) Add GCM support to TLS library. Some custom code is needed to split
      the IV between the fixed (from PRF) and explicit (from TLS record)
      portions. This adds all GCM ciphersuites supported by RFC5288 and 
diff --git a/Configure b/Configure
index 2c5f214222..d2755ae5a2 100755
--- a/Configure
+++ b/Configure
@@ -127,7 +127,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-58
 
 my $x86_elf_asm="$x86_asm:elf";
 
-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o";
+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o";
 my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::void";
 my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::void";
 my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
diff --git a/TABLE b/TABLE
index 9d3dd012ba..94772e4670 100644
--- a/TABLE
+++ b/TABLE
@@ -290,12 +290,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -755,12 +755,12 @@ $bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = bn_asm.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -1344,12 +1344,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -1499,12 +1499,12 @@ $bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = bn_asm.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -2243,12 +2243,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -2429,12 +2429,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -2491,12 +2491,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -3948,12 +3948,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -4041,12 +4041,12 @@ $bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -4971,12 +4971,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -5002,12 +5002,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
diff --git a/crypto/aes/Makefile b/crypto/aes/Makefile
index 5012b947a4..91d8a19336 100644
--- a/crypto/aes/Makefile
+++ b/crypto/aes/Makefile
@@ -57,6 +57,8 @@ aes-x86_64.s: asm/aes-x86_64.pl
 	$(PERL) asm/aes-x86_64.pl $(PERLASM_SCHEME) > $@
 aesni-x86_64.s: asm/aesni-x86_64.pl
 	$(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@
+aesni-sha1-x86_64.s:	asm/aesni-sha1-x86_64.pl
+	$(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@
 
 aes-sparcv9.s: asm/aes-sparcv9.pl
 	$(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 0000000000..c6f6b3334a
--- /dev/null
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -0,0 +1,1249 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encrypt exhibit *very* low instruction-level
+# parallelism, interleaving it with another algorithm would allow to
+# utilize processor resources better and achieve better performance.
+# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
+# AESNI code is weaved into it. Below are performance numbers in
+# cycles per processed byte, less is better, for standalone AESNI-CBC
+# encrypt, sum of the latter and standalone SHA1, and "stitched"
+# subroutine:
+#
+#		AES-128-CBC	+SHA1		stitch      gain
+# Westmere	3.77[+5.6]	9.37		6.65	    +41%
+# Sandy Bridge	5.05[+5.2(6.3)]	10.25(11.35)	6.16(7.08)  +67%(+60%)
+#
+#		AES-192-CBC
+# Westmere	4.51		10.11		6.97	    +45%
+# Sandy Bridge	6.05		11.25(12.35)	6.34(7.27)  +77%(+70%)
+#
+#		AES-256-CBC
+# Westmere	5.25		10.85		7.25	    +50%
+# Sandy Bridge	7.05		12.25(13.35)	7.06(7.70)  +74%(+73%)
+#
+# (*)	There are two code paths: SSSE3 and AVX. See sha1-568.pl for
+#	background information. Above numbers in parentheses are SSSE3
+#	results collected on AVX-capable CPU, i.e. apply on OSes that
+#	don't support AVX.
+#
+# Needless to mention that it makes no sense to implement "stitched"
+# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
+# fully utilize parallelism, so stitching would not give any gain
+# anyway. Well, there might be some, e.g. because of better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+#		AES-128-CBC	AES-192-CBC	AES-256-CBC
+# Westmere	1.31		1.55		1.80
+# Sandy Bridge	0.93		1.06		1.22
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+	   $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+	   $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+	   $1>=10);
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# void aesni_cbc_sha1_enc(const void *inp,
+#			void *out,
+#			size_t length,
+#			const AES_KEY *key,
+#			unsigned char *iv,
+#			SHA_CTX *ctx,
+#			const void *in0);
+
+$code.=<<___;
+.text
+.extern	OPENSSL_ia32cap_P
+
+.globl	aesni_cbc_sha1_enc
+.type	aesni_cbc_sha1_enc,\@abi-omnipotent
+.align	16
+aesni_cbc_sha1_enc:
+	# caller should check for SSSE3 and AES-NI bits
+	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
+	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+	and	\$`1<<28`,%r11d		# mask AVX bit
+	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
+	or	%r11d,%r10d
+	cmp	\$`1<<28|1<<30`,%r10d
+	je	aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+	jmp	aesni_cbc_sha1_enc_ssse3
+	ret
+.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0;
+my $K_XX_XX="%r11";
+my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type	aesni_cbc_sha1_enc_ssse3,\@function,6
+.align	16
+aesni_cbc_sha1_enc_ssse3:
+	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
+	#shr	\$6,$len			# debugging artefact
+	#jz	.Lepilogue_ssse3		# debugging artefact
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
+	#mov	$in0,$inp			# debugging artefact
+	#lea	64(%rsp),$ctx			# debugging artefact
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,96+0(%rsp)
+	movaps	%xmm7,96+16(%rsp)
+	movaps	%xmm8,96+32(%rsp)
+	movaps	%xmm9,96+48(%rsp)
+	movaps	%xmm10,96+64(%rsp)
+	movaps	%xmm11,96+80(%rsp)
+	movaps	%xmm12,96+96(%rsp)
+	movaps	%xmm13,96+112(%rsp)
+	movaps	%xmm14,96+128(%rsp)
+	movaps	%xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+	mov	$in0,%r12			# reassign arguments
+	mov	$out,%r13
+	mov	$len,%r14
+	mov	$key,%r15
+	movdqu	($ivp),$iv			# load IV
+	mov	$ivp,88(%rsp)			# save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+	shl	\$6,$len
+	sub	$in0,$out
+	mov	240($key),$rounds
+	add	$inp,$len		# end of input
+
+	lea	K_XX_XX(%rip),$K_XX_XX
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+
+	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
+	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	movdqu	16($inp),@X[-3&7]
+	movdqu	32($inp),@X[-2&7]
+	movdqu	48($inp),@X[-1&7]
+	pshufb	@X[2],@X[-4&7]		# byte swap
+	add	\$64,$inp
+	pshufb	@X[2],@X[-3&7]
+	pshufb	@X[2],@X[-2&7]
+	pshufb	@X[2],@X[-1&7]
+	paddd	@Tx[1],@X[-4&7]		# add K_00_19
+	paddd	@Tx[1],@X[-3&7]
+	paddd	@Tx[1],@X[-2&7]
+	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
+	psubd	@Tx[1],@X[-4&7]		# restore X[]
+	movdqa	@X[-3&7],16(%rsp)
+	psubd	@Tx[1],@X[-3&7]
+	movdqa	@X[-2&7],32(%rsp)
+	psubd	@Tx[1],@X[-2&7]
+	movups	($key),$rndkey0		# $key[0]
+	movups	16($key),$rndkey[0]	# forward reference
+	jmp	.Loop_ssse3
+___
+
+my $aesenc=sub {
+  use integer;
+  my ($n,$k)=($r/10,$r%10);
+    if ($k==0) {
+      $code.=<<___;
+	movups		`16*$n`($in0),$in		# load input
+	xorps		$rndkey0,$in
+___
+      $code.=<<___ if ($n);
+	movups		$iv,`16*($n-1)`($out,$in0)	# write output
+___
+      $code.=<<___;
+	xorps		$in,$iv
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*$k`($key),$rndkey[1]
+___
+    } elsif ($k==9) {
+      $sn++;
+      $code.=<<___;
+	cmp		\$11,$rounds
+	jb		.Laesenclast$sn
+	movups		`32+16*($k+0)`($key),$rndkey[1]
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*($k+1)`($key),$rndkey[0]
+	aesenc		$rndkey[1],$iv
+	je		.Laesenclast$sn
+	movups		`32+16*($k+2)`($key),$rndkey[1]
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*($k+3)`($key),$rndkey[0]
+	aesenc		$rndkey[1],$iv
+.Laesenclast$sn:
+	aesenclast	$rndkey[0],$iv
+	movups		16($key),$rndkey[1]		# forward reference
+___
+    } else {
+      $code.=<<___;
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*$k`($key),$rndkey[1]
+___
+    }
+    $r++;	unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@X[0],@X[-3&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@Tx[0],@X[-1&7]);
+	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&movdqa	(@Tx[2],@X[0]);
+	&movdqa	(@Tx[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
+	&paddd	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@Tx[1],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[2],30);
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslld	(@Tx[1],2);
+	&pxor	(@X[0],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
+	 eval(shift(@insns));		# body_20_39
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	if ($Xi%5) {
+	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+	}
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&movdqa	(@Tx[0],@X[0]);
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pslld	(@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	&psrld	(@Tx[0],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$len);
+	&je	(".Ldone_ssse3");
+
+	unshift(@Tx,pop(@Tx));
+
+	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
+	&movdqu	(@X[-4&7],"0($inp)");		# load input
+	&movdqu	(@X[-3&7],"16($inp)");
+	&movdqu	(@X[-2&7],"32($inp)");
+	&movdqu	(@X[-1&7],"48($inp)");
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pshufb	(@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&paddd	(@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psubd	(@X[($Xi-4)&7],@Tx[1]);
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+  use integer;
+  my ($k,$n);
+  my @r=(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
+	'&xor	($c,$d);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&xor	($c,$d);',	# restore $c
+	'&xor	(@T[0],$d);',
+	'&add	($e,$a);',
+	'&$_ror	($b,$j?7:2);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+	$n = scalar(@r);
+	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
+	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
+	$jj++;
+    return @r;
+}
+
+sub body_20_39 () {
+  use integer;
+  my ($k,$n);
+  my @r=(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
+	'&xor	(@T[0],$d);',	# ($b^$d)
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&xor	(@T[0],$c);',	# ($b^$d^$c)
+	'&add	($e,$a);',
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+	$n = scalar(@r);
+	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
+	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
+	$jj++;
+    return @r;
+}
+
+sub body_40_59 () {
+  use integer;
+  my ($k,$n);
+  my @r=(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&mov	(@T[1],$c);',
+	'&xor	($c,$d);',
+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
+	'&and	(@T[1],$d);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[1]);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&add	($e,@T[0]);',
+	'&xor	($c,$d);',	# restore $c
+	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+	$n = scalar(@r);
+	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
+	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
+	$jj++;
+    return @r;
+}
+$code.=<<___;
+.align	16
+.Loop_ssse3:
+___
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+				$saved_r=$r; @saved_rndkey=@rndkey;
+
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+	movups	$iv,48($out,$in0)		# write output
+	lea	64($in0),$in0
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	jmp	.Loop_ssse3
+
+.align	16
+.Ldone_ssse3:
+___
+				$jj=$j=$saved_j; @V=@saved_V;
+				$r=$saved_r;     @rndkey=@saved_rndkey;
+
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+	movups	$iv,48($out,$in0)		# write output
+	mov	88(%rsp),$ivp			# restore $ivp
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	movups	$iv,($ivp)			# write IV
+___
+$code.=<<___ if ($win64);
+	movaps	96+0(%rsp),%xmm6
+	movaps	96+16(%rsp),%xmm7
+	movaps	96+32(%rsp),%xmm8
+	movaps	96+48(%rsp),%xmm9
+	movaps	96+64(%rsp),%xmm10
+	movaps	96+80(%rsp),%xmm11
+	movaps	96+96(%rsp),%xmm12
+	movaps	96+112(%rsp),%xmm13
+	movaps	96+128(%rsp),%xmm14
+	movaps	96+144(%rsp),%xmm15
+___
+$code.=<<___;
+	lea	`104+($win64?10*16:0)`(%rsp),%rsi
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lepilogue_ssse3:
+	ret
+.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+$j=$jj=$r=$sn=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type	aesni_cbc_sha1_enc_avx,\@function,6
+.align	16
+aesni_cbc_sha1_enc_avx:
+	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
+	#shr	\$6,$len			# debugging artefact
+	#jz	.Lepilogue_avx			# debugging artefact
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
+	#mov	$in0,$inp			# debugging artefact
+	#lea	64(%rsp),$ctx			# debugging artefact
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,96+0(%rsp)
+	movaps	%xmm7,96+16(%rsp)
+	movaps	%xmm8,96+32(%rsp)
+	movaps	%xmm9,96+48(%rsp)
+	movaps	%xmm10,96+64(%rsp)
+	movaps	%xmm11,96+80(%rsp)
+	movaps	%xmm12,96+96(%rsp)
+	movaps	%xmm13,96+112(%rsp)
+	movaps	%xmm14,96+128(%rsp)
+	movaps	%xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+	vzeroall
+	mov	$in0,%r12			# reassign arguments
+	mov	$out,%r13
+	mov	$len,%r14
+	mov	$key,%r15
+	vmovdqu	($ivp),$iv			# load IV
+	mov	$ivp,88(%rsp)			# save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+	shl	\$6,$len
+	sub	$in0,$out
+	mov	240($key),$rounds
+	add	\$112,$key		# size optimization
+	add	$inp,$len		# end of input
+
+	lea	K_XX_XX(%rip),$K_XX_XX
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+
+	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
+	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	vmovdqu	16($inp),@X[-3&7]
+	vmovdqu	32($inp),@X[-2&7]
+	vmovdqu	48($inp),@X[-1&7]
+	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
+	add	\$64,$inp
+	vpshufb	@X[2],@X[-3&7],@X[-3&7]
+	vpshufb	@X[2],@X[-2&7],@X[-2&7]
+	vpshufb	@X[2],@X[-1&7],@X[-1&7]
+	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
+	vpaddd	@Tx[1],@X[-3&7],@X[1]
+	vpaddd	@Tx[1],@X[-2&7],@X[2]
+	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
+	vmovdqa	@X[1],16(%rsp)
+	vmovdqa	@X[2],32(%rsp)
+	vmovups	-112($key),$rndkey0	# $key[0]
+	vmovups	16-112($key),$rndkey[0]	# forward reference
+	jmp	.Loop_avx
+___
+
+my $aesenc=sub {
+  use integer;
+  my ($n,$k)=($r/10,$r%10);
+    if ($k==0) {
+      $code.=<<___;
+	vmovups		`16*$n`($in0),$in		# load input
+	vxorps		$rndkey0,$in,$in
+___
+      $code.=<<___ if ($n);
+	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
+___
+      $code.=<<___;
+	vxorps		$in,$iv,$iv
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*$k-112`($key),$rndkey[1]
+___
+    } elsif ($k==9) {
+      $sn++;
+      $code.=<<___;
+	cmp		\$11,$rounds
+	jb		.Lvaesenclast$sn
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
+	vaesenc		$rndkey[1],$iv,$iv
+	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
+	je		.Lvaesenclast$sn
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
+	vaesenc		$rndkey[1],$iv,$iv
+	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+	vaesenclast	$rndkey[0],$iv,$iv
+	vmovups		16-112($key),$rndkey[1]		# forward reference
+___
+    } else {
+      $code.=<<___;
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*$k-112`($key),$rndkey[1]
+___
+    }
+    $r++;	unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[0],@X[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[1],@Tx[2],30);
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslld	(@Tx[2],@Tx[2],2);
+	&vpxor	(@X[0],@X[0],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	if ($Xi%5) {
+	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+	}
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpsrld	(@Tx[0],@X[0],30);
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$len);
+	&je	(".Ldone_avx");
+
+	unshift(@Tx,pop(@Tx));
+
+	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
+	&vmovdqu(@X[-4&7],"0($inp)");		# load input
+	&vmovdqu(@X[-3&7],"16($inp)");
+	&vmovdqu(@X[-2&7],"32($inp)");
+	&vmovdqu(@X[-1&7],"48($inp)");
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align	16
+.Loop_avx:
+___
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+				$saved_r=$r; @saved_rndkey=@rndkey;
+
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+	vmovups	$iv,48($out,$in0)		# write output
+	lea	64($in0),$in0
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	jmp	.Loop_avx
+
+.align	16
+.Ldone_avx:
+___
+				$jj=$j=$saved_j; @V=@saved_V;
+				$r=$saved_r;     @rndkey=@saved_rndkey;
+
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+	vmovups	$iv,48($out,$in0)		# write output
+	mov	88(%rsp),$ivp			# restore $ivp
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	vmovups	$iv,($ivp)			# write IV
+	vzeroall
+___
+$code.=<<___ if ($win64);
+	movaps	96+0(%rsp),%xmm6
+	movaps	96+16(%rsp),%xmm7
+	movaps	96+32(%rsp),%xmm8
+	movaps	96+48(%rsp),%xmm9
+	movaps	96+64(%rsp),%xmm10
+	movaps	96+80(%rsp),%xmm11
+	movaps	96+96(%rsp),%xmm12
+	movaps	96+112(%rsp),%xmm13
+	movaps	96+128(%rsp),%xmm14
+	movaps	96+144(%rsp),%xmm15
+___
+$code.=<<___;
+	lea	`104+($win64?10*16:0)`(%rsp),%rsi
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lepilogue_avx:
+	ret
+.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+}
+$code.=<<___;
+.align	64
+K_XX_XX:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
+
+.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	ssse3_handler,\@abi-omnipotent
+.align	16
+ssse3_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	96(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	`104+10*16`(%rax),%rax	# adjust stack pointer
+
+	mov	0(%rax),%r15
+	mov	8(%rax),%r14
+	mov	16(%rax),%r13
+	mov	24(%rax),%r12
+	mov	32(%rax),%rbp
+	mov	40(%rax),%rbx
+	lea	48(%rax),%rax
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	ssse3_handler,.-ssse3_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
+	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
+	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
+	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
+	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+  local *opcode=shift;
+  my ($dst,$src)=@_;
+  my $rex=0;
+
+    $rex|=0x04			if($dst>=8);
+    $rex|=0x01			if($src>=8);
+    push @opcode,$rex|0x40	if($rex);
+}
+
+sub aesni {
+  my $line=shift;
+  my @opcode=(0x66);
+
+    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+	my %opcodelet = (
+		"aesenc" => 0xdc,	"aesenclast" => 0xdd
+	);
+	return undef if (!defined($opcodelet{$1}));
+	rex(\@opcode,$3,$2);
+	push @opcode,0x0f,0x38,$opcodelet{$1};
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
+	return ".byte\t".join(',',@opcode);
+    }
+    return $line;
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/crypto/evp/Makefile b/crypto/evp/Makefile
index 3de27503d1..aca6c6ce96 100644
--- a/crypto/evp/Makefile
+++ b/crypto/evp/Makefile
@@ -28,7 +28,8 @@ LIBSRC= encode.c digest.c evp_enc.c evp_key.c evp_acnf.c \
 	bio_md.c bio_b64.c bio_enc.c evp_err.c e_null.c \
 	c_all.c c_allc.c c_alld.c evp_lib.c bio_ok.c \
 	evp_pkey.c evp_pbe.c p5_crpt.c p5_crpt2.c \
-	e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c evp_fips.c
+	e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c evp_fips.c	\
+	e_aes_cbc_hmac_sha1.c e_rc4_hmac_md5.c
 
 LIBOBJ=	encode.o digest.o evp_enc.o evp_key.o evp_acnf.o \
 	e_des.o e_bf.o e_idea.o e_des3.o e_camellia.o\
@@ -40,7 +41,8 @@ LIBOBJ=	encode.o digest.o evp_enc.o evp_key.o evp_acnf.o \
 	bio_md.o bio_b64.o bio_enc.o evp_err.o e_null.o \
 	c_all.o c_allc.o c_alld.o evp_lib.o bio_ok.o \
 	evp_pkey.o evp_pbe.o p5_crpt.o p5_crpt2.o \
-	e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o evp_fips.o
+	e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o evp_fips.o \
+	e_aes_cbc_hmac_sha1.o e_rc4_hmac_md5.o
 
 SRC= $(LIBSRC)
 
@@ -195,6 +197,20 @@ e_aes.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
 e_aes.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
 e_aes.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
 e_aes.o: ../modes/modes_lcl.h e_aes.c evp_locl.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/aes.h ../../include/openssl/asn1.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/bio.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/crypto.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/e_os2.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/evp.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/obj_mac.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/objects.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/opensslconf.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/opensslv.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/ossl_typ.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/safestack.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/sha.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/stack.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/symhacks.h e_aes_cbc_hmac_sha1.c
 e_bf.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h
 e_bf.o: ../../include/openssl/blowfish.h ../../include/openssl/buffer.h
 e_bf.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
@@ -281,6 +297,17 @@ e_rc4.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
 e_rc4.o: ../../include/openssl/ossl_typ.h ../../include/openssl/rc4.h
 e_rc4.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
 e_rc4.o: ../../include/openssl/symhacks.h ../cryptlib.h e_rc4.c evp_locl.h
+e_rc4_hmac_md5.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h
+e_rc4_hmac_md5.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+e_rc4_hmac_md5.o: ../../include/openssl/evp.h ../../include/openssl/md5.h
+e_rc4_hmac_md5.o: ../../include/openssl/obj_mac.h
+e_rc4_hmac_md5.o: ../../include/openssl/objects.h
+e_rc4_hmac_md5.o: ../../include/openssl/opensslconf.h
+e_rc4_hmac_md5.o: ../../include/openssl/opensslv.h
+e_rc4_hmac_md5.o: ../../include/openssl/ossl_typ.h ../../include/openssl/rc4.h
+e_rc4_hmac_md5.o: ../../include/openssl/safestack.h
+e_rc4_hmac_md5.o: ../../include/openssl/stack.h
+e_rc4_hmac_md5.o: ../../include/openssl/symhacks.h e_rc4_hmac_md5.c
 e_rc5.o: ../../e_os.h ../../include/openssl/bio.h
 e_rc5.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
 e_rc5.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c
index 395a164c8f..8765cfbf92 100644
--- a/crypto/evp/c_allc.c
+++ b/crypto/evp/c_allc.c
@@ -98,6 +98,9 @@ void OpenSSL_add_all_ciphers(void)
 #ifndef OPENSSL_NO_RC4
 	EVP_add_cipher(EVP_rc4());
 	EVP_add_cipher(EVP_rc4_40());
+#ifndef OPENSSL_NO_MD5
+	EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
 #endif
 
 #ifndef OPENSSL_NO_IDEA
@@ -190,6 +193,10 @@ void OpenSSL_add_all_ciphers(void)
 	EVP_add_cipher(EVP_aes_256_gcm());
 	EVP_add_cipher_alias(SN_aes_256_cbc,"AES256");
 	EVP_add_cipher_alias(SN_aes_256_cbc,"aes256");
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+	EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+	EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
 #endif
 
 #ifndef OPENSSL_NO_CAMELLIA
diff --git a/crypto/evp/e_aes_cbc_hmac_sha1.c b/crypto/evp/e_aes_cbc_hmac_sha1.c
new file mode 100644
index 0000000000..78a0135470
--- /dev/null
+++ b/crypto/evp/e_aes_cbc_hmac_sha1.c
@@ -0,0 +1,403 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/aes.h>
+#include <openssl/sha.h>
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER	0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD		0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY	0x17
+#endif
+
+#if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1)
+#define EVP_CIPH_FLAG_DEFAULT_ASN1 0
+#endif
+
+#define TLS1_1_VERSION 0x0302
+
+typedef struct
+    {
+    AES_KEY		ks;
+    SHA_CTX		head,tail,md;
+    size_t		payload_length;	/* AAD length in decrypt case */
+    union {
+	unsigned int	tls_ver;
+    	unsigned char	tls_aad[16];	/* 13 used */
+    } aux;
+    } EVP_AES_HMAC_SHA1;
+
+#if	defined(AES_ASM) &&	( \
+	defined(__x86_64)	|| defined(__x86_64__)	|| \
+	defined(_M_AMD64)	|| defined(_M_X64)	|| \
+	defined(__INTEL__)	)
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+#define AESNI_CAPABLE   (1<<(57-32))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+			      AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+			      AES_KEY *key);
+
+void aesni_cbc_encrypt(const unsigned char *in,
+			   unsigned char *out,
+			   size_t length,
+			   const AES_KEY *key,
+			   unsigned char *ivec, int enc);
+
+void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks,
+		const AES_KEY *key, unsigned char iv[16],
+		SHA_CTX *ctx,const void *in0);
+
+#define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data)
+
+static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx,
+			const unsigned char *inkey,
+			const unsigned char *iv, int enc)
+	{
+	EVP_AES_HMAC_SHA1 *key = data(ctx);
+	int ret;
+
+	if (enc)
+		ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks);
+	else
+		ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks);
+
+	SHA1_Init(&key->head);	/* handy when benchmarking */
+	key->tail = key->head;
+	key->md   = key->head;
+
+	key->payload_length = 0;
+
+	return ret<0?0:1;
+	}
+
+#define	STITCHED_CALL
+
+#if !defined(STITCHED_CALL)
+#define	aes_off 0
+#endif
+
+void sha1_block_data_order (void *c,const void *p,size_t len);
+
+static void sha1_update(SHA_CTX *c,const void *data,size_t len)
+{	const unsigned char *ptr = data;
+	size_t res;
+
+	if ((res = c->num)) {
+		res = SHA_CBLOCK-res;
+		if (len<res) res=len;
+		SHA1_Update (c,ptr,res);
+		ptr += res;
+		len -= res;
+	}
+
+	res = len % SHA_CBLOCK;
+	len -= res;
+
+	if (len) {
+		sha1_block_data_order(c,ptr,len/SHA_CBLOCK);
+
+		ptr += len;
+		c->Nh += len>>29;
+		c->Nl += len<<=3;
+		if (c->Nl<(unsigned int)len) c->Nh++;
+	}
+
+	if (res)
+		SHA1_Update(c,ptr,res);
+}
+
+#define SHA1_Update sha1_update
+
+static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		      const unsigned char *in, size_t len)
+	{
+	EVP_AES_HMAC_SHA1 *key = data(ctx);
+	unsigned int l;
+	size_t	plen = key->payload_length,
+		iv = 0,		/* explicit IV in TLS 1.1 and later */
+		sha_off = 0;
+#if defined(STITCHED_CALL)
+	size_t	aes_off = 0,
+		blocks;
+
+	sha_off = SHA_CBLOCK-key->md.num;
+#endif
+
+	if (len%AES_BLOCK_SIZE) return 0;
+
+	if (ctx->encrypt) {
+		if (plen==0)
+			plen = len;
+		else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE))
+			return 0;
+		else if (key->aux.tls_ver >= TLS1_1_VERSION)
+			iv = AES_BLOCK_SIZE;
+
+#if defined(STITCHED_CALL)
+		if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) {
+			SHA1_Update(&key->md,in+iv,sha_off);
+
+			aesni_cbc_sha1_enc(in,out,blocks,&key->ks,
+				ctx->iv,&key->md,in+iv+sha_off);
+			blocks *= SHA_CBLOCK;
+			aes_off += blocks;
+			sha_off += blocks;
+			key->md.Nh += blocks>>29;
+			key->md.Nl += blocks<<=3;
+			if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+		} else {
+			sha_off = 0;
+		}
+#endif
+		sha_off += iv;
+		SHA1_Update(&key->md,in+sha_off,plen-sha_off);
+
+		if (plen!=len)	{	/* "TLS" mode of operation */
+			if (in!=out)
+				memcpy(out+aes_off,in+aes_off,plen-aes_off);
+
+			/* calculate HMAC and append it to payload */
+			SHA1_Final(out+plen,&key->md);
+			key->md = key->tail;
+			SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH);
+			SHA1_Final(out+plen,&key->md);
+
+			/* pad the payload|hmac */
+			plen += SHA_DIGEST_LENGTH;
+			for (l=len-plen-1;plen<len;plen++) out[plen]=l;
+			/* encrypt HMAC|padding at once */
+			aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off,
+					&key->ks,ctx->iv,1);
+		} else {
+			aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off,
+					&key->ks,ctx->iv,1);
+		}
+	} else {
+		unsigned char mac[SHA_DIGEST_LENGTH];
+
+		/* decrypt HMAC|padding at once */
+		aesni_cbc_encrypt(in,out,len,
+				&key->ks,ctx->iv,0);
+
+		if (plen) {	/* "TLS" mode of operation */
+			/* figure out payload length */
+			if (len<(out[len-1]+1+SHA_DIGEST_LENGTH))
+				return 0;
+
+			len -= (out[len-1]+1+SHA_DIGEST_LENGTH);
+
+			if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3])
+			    >= TLS1_1_VERSION) {
+				len -= AES_BLOCK_SIZE;
+				iv = AES_BLOCK_SIZE;
+			}
+
+			key->aux.tls_aad[plen-2] = len>>8;
+			key->aux.tls_aad[plen-1] = len;
+
+			/* calculate HMAC and verify it */
+			key->md = key->head;
+			SHA1_Update(&key->md,key->aux.tls_aad,plen);
+			SHA1_Update(&key->md,out+iv,len);
+			SHA1_Final(mac,&key->md);
+
+			key->md = key->tail;
+			SHA1_Update(&key->md,mac,SHA_DIGEST_LENGTH);
+			SHA1_Final(mac,&key->md);
+
+			if (memcmp(out+iv+len,mac,SHA_DIGEST_LENGTH))
+				return 0;
+		} else {
+			SHA1_Update(&key->md,out,len);
+		}
+	}
+
+	key->payload_length = 0;
+
+	return 1;
+	}
+
+static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+	{
+	EVP_AES_HMAC_SHA1 *key = data(ctx);
+
+	switch (type)
+		{
+	case EVP_CTRL_AEAD_SET_MAC_KEY:
+		{
+		unsigned int  i;
+		unsigned char hmac_key[64];
+
+		memset (hmac_key,0,sizeof(hmac_key));
+
+		if (arg > sizeof(hmac_key)) {
+			SHA1_Init(&key->head);
+			SHA1_Update(&key->head,ptr,arg);
+			SHA1_Final(hmac_key,&key->head);
+		} else {
+			memcpy(hmac_key,ptr,arg);
+		}
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36;		/* ipad */
+		SHA1_Init(&key->head);
+		SHA1_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36^0x5c;	/* opad */
+		SHA1_Init(&key->tail);
+		SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+		return 1;
+		}
+	case EVP_CTRL_AEAD_TLS1_AAD:
+		{
+		unsigned char *p=ptr;
+		unsigned int   len=p[arg-2]<<8|p[arg-1];
+
+		if (ctx->encrypt)
+			{
+			key->payload_length = len;
+			if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) {
+				len -= AES_BLOCK_SIZE;
+				p[arg-2] = len>>8;
+				p[arg-1] = len;
+			}
+			key->md = key->head;
+			SHA1_Update(&key->md,p,arg);
+
+			return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)
+				- len);
+			}
+		else
+			{
+			if (arg>13) arg = 13;
+			memcpy(key->aux.tls_aad,ptr,arg);
+			key->payload_length = arg;
+
+			return SHA_DIGEST_LENGTH;
+			}
+		}
+	default:
+		return -1;
+		}
+	}
+
+static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher =
+	{
+#ifdef NID_aes_128_cbc_hmac_sha1
+	NID_aes_128_cbc_hmac_sha1,
+#else
+	NID_undef,
+#endif
+	16,16,16,
+	EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+	aesni_cbc_hmac_sha1_init_key,
+	aesni_cbc_hmac_sha1_cipher,
+	NULL,
+	sizeof(EVP_AES_HMAC_SHA1),
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+	aesni_cbc_hmac_sha1_ctrl,
+	NULL
+	};
+
+static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher =
+	{
+#ifdef NID_aes_256_cbc_hmac_sha1
+	NID_aes_256_cbc_hmac_sha1,
+#else
+	NID_undef,
+#endif
+	16,32,16,
+	EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+	aesni_cbc_hmac_sha1_init_key,
+	aesni_cbc_hmac_sha1_cipher,
+	NULL,
+	sizeof(EVP_AES_HMAC_SHA1),
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+	aesni_cbc_hmac_sha1_ctrl,
+	NULL
+	};
+
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+	{
+	return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE?
+		&aesni_128_cbc_hmac_sha1_cipher:NULL);
+	}
+
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+	{
+	return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE?
+		&aesni_256_cbc_hmac_sha1_cipher:NULL);
+	}
+#else
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+	{
+	return NULL;
+	}
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+	{
+	return NULL;
+	}
+#endif
+#endif
diff --git a/crypto/evp/e_rc4_hmac_md5.c b/crypto/evp/e_rc4_hmac_md5.c
new file mode 100644
index 0000000000..1fa2aa2b84
--- /dev/null
+++ b/crypto/evp/e_rc4_hmac_md5.c
@@ -0,0 +1,292 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/rc4.h>
+#include <openssl/md5.h>
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER	0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD		0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY	0x17
+#endif
+
+/* FIXME: surely this is available elsewhere? */
+#define EVP_RC4_KEY_SIZE		16
+
+typedef struct
+    {
+    RC4_KEY		ks;
+    MD5_CTX		head,tail,md;
+    size_t		payload_length;
+    } EVP_RC4_HMAC_MD5;
+
+void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out,
+		MD5_CTX *ctx,const void *inp,size_t blocks);
+
+#define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data)
+
+static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx,
+			const unsigned char *inkey,
+			const unsigned char *iv, int enc)
+	{
+	EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+	RC4_set_key(&key->ks,EVP_CIPHER_CTX_key_length(ctx),
+		    inkey);
+
+	MD5_Init(&key->head);	/* handy when benchmarking */
+	key->tail = key->head;
+	key->md   = key->head;
+
+	key->payload_length = 0;
+
+	return 1;
+	}
+
+#if	!defined(OPENSSL_NO_ASM) &&	( \
+	defined(__x86_64)	|| defined(__x86_64__)	|| \
+	defined(_M_AMD64)	|| defined(_M_X64)	|| \
+	defined(__INTEL__)		)
+#define	STITCHED_CALL
+#endif
+
+#if !defined(STITCHED_CALL)
+#define	rc4_off 0
+#define	md5_off 0
+#endif
+
+static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		      const unsigned char *in, size_t len)
+	{
+	EVP_RC4_HMAC_MD5 *key = data(ctx);
+#if defined(STITCHED_CALL)
+	size_t	rc4_off = 32-1-(key->ks.x&(32-1)),	/* 32 is $MOD from rc4_md5-x86_64.pl */
+		md5_off = MD5_CBLOCK-key->md.num,
+		blocks;
+	unsigned int l;
+#endif
+	size_t	plen = key->payload_length;
+
+	if (plen && len!=(plen+MD5_DIGEST_LENGTH)) return 0;
+
+	if (ctx->encrypt) {
+		if (plen==0) plen = len;
+#if defined(STITCHED_CALL)
+		/* cipher has to "fall behind" */
+		if (rc4_off>md5_off) md5_off+=MD5_CBLOCK;
+
+		if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK)) {
+			MD5_Update(&key->md,in,md5_off);
+			RC4(&key->ks,rc4_off,in,out);
+
+			rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+				&key->md,in+md5_off,blocks);
+			blocks *= MD5_CBLOCK;
+			rc4_off += blocks;
+			md5_off += blocks;
+			key->md.Nh += blocks>>29;
+			key->md.Nl += blocks<<=3;
+			if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+		} else {
+			rc4_off = 0;
+			md5_off = 0;
+		}
+#endif
+		MD5_Update(&key->md,in+md5_off,plen-md5_off);
+
+		if (plen!=len) {	/* "TLS" mode of operation */
+			if (in!=out)
+				memcpy(out+rc4_off,in+rc4_off,plen-rc4_off);
+
+			/* calculate HMAC and append it to payload */
+			MD5_Final(out+plen,&key->md);
+			key->md = key->tail;
+			MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH);
+			MD5_Final(out+plen,&key->md);
+			/* encrypt HMAC at once */
+			RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off);
+		} else {
+			RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+		}
+	} else {
+		unsigned char mac[MD5_DIGEST_LENGTH];
+#if defined(STITCHED_CALL)
+		/* digest has to "fall behind" */
+		if (md5_off>rc4_off)	rc4_off += 2*MD5_CBLOCK;
+		else			rc4_off += MD5_CBLOCK;
+
+		if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK)) {
+			RC4(&key->ks,rc4_off,in,out);
+			MD5_Update(&key->md,out,md5_off);
+
+			rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+				&key->md,out+md5_off,blocks);
+			blocks *= MD5_CBLOCK;
+			rc4_off += blocks;
+			md5_off += blocks;
+			l = (key->md.Nl+(blocks<<3))&0xffffffffU;
+			if (l<key->md.Nl) key->md.Nh++;
+			key->md.Nl  = l;
+			key->md.Nh += blocks>>29;
+		} else {
+			md5_off=0;
+			rc4_off=0;
+		}
+#endif
+		/* decrypt HMAC at once */
+		RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+		if (plen) {	/* "TLS" mode of operation */
+			MD5_Update(&key->md,out+md5_off,plen-md5_off);
+
+			/* calculate HMAC and verify it */
+			MD5_Final(mac,&key->md);
+			key->md = key->tail;
+			MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH);
+			MD5_Final(mac,&key->md);
+
+			if (memcmp(out+plen,mac,MD5_DIGEST_LENGTH))
+				return 0;
+		} else {
+			MD5_Update(&key->md,out+md5_off,len-md5_off);
+		}
+	}
+
+	key->payload_length = 0;
+
+	return 1;
+	}
+
+static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+	{
+	EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+	switch (type)
+		{
+	case EVP_CTRL_AEAD_SET_MAC_KEY:
+		{
+		unsigned int  i;
+		unsigned char hmac_key[64];
+
+		memset (hmac_key,0,sizeof(hmac_key));
+
+		if (arg > sizeof(hmac_key)) {
+			MD5_Init(&key->head);
+			MD5_Update(&key->head,ptr,arg);
+			MD5_Final(hmac_key,&key->head);
+		} else {
+			memcpy(hmac_key,ptr,arg);
+		}
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36;		/* ipad */
+		MD5_Init(&key->head);
+		MD5_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36^0x5c;	/* opad */
+		MD5_Init(&key->tail);
+		MD5_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+		return 1;
+		}
+	case EVP_CTRL_AEAD_TLS1_AAD:
+		{
+		unsigned char *p=ptr;
+		unsigned int   len=p[arg-2]<<8|p[arg-1];
+
+		if (!ctx->encrypt)
+			{
+			len -= MD5_DIGEST_LENGTH;
+			p[arg-2] = len>>8;
+			p[arg-1] = len;
+			}
+		key->payload_length=len;
+		key->md = key->head;
+		MD5_Update(&key->md,p,arg);
+
+		return MD5_DIGEST_LENGTH;
+		}
+	default:
+		return -1;
+		}
+	}
+
+static EVP_CIPHER r4_hmac_md5_cipher=
+	{
+#ifdef NID_rc4_hmac_md5
+	NID_rc4_hmac_md5,
+#else
+	NID_undef,
+#endif
+	1,EVP_RC4_KEY_SIZE,0,
+	EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER,
+	rc4_hmac_md5_init_key,
+	rc4_hmac_md5_cipher,
+	NULL,
+	sizeof(EVP_RC4_HMAC_MD5),
+	NULL,
+	NULL,
+	rc4_hmac_md5_ctrl,
+	NULL
+	};
+
+const EVP_CIPHER *EVP_rc4_hmac_md5(void)
+	{
+	return(&r4_hmac_md5_cipher);
+	}
+#endif
diff --git a/crypto/evp/evp.h b/crypto/evp/evp.h
index 365eeabf9a..1fd95dd125 100644
--- a/crypto/evp/evp.h
+++ b/crypto/evp/evp.h
@@ -739,6 +739,9 @@ const EVP_MD *EVP_dev_crypto_md5(void);
 #ifndef OPENSSL_NO_RC4
 const EVP_CIPHER *EVP_rc4(void);
 const EVP_CIPHER *EVP_rc4_40(void);
+#ifndef OPENSSL_NO_MD5
+const EVP_CIPHER *EVP_rc4_hmac_md5(void);
+#endif
 #endif
 #ifndef OPENSSL_NO_IDEA
 const EVP_CIPHER *EVP_idea_ecb(void);
@@ -810,6 +813,10 @@ const EVP_CIPHER *EVP_aes_256_ctr(void);
 const EVP_CIPHER *EVP_aes_256_gcm(void);
 const EVP_CIPHER *EVP_aes_256_ccm(void);
 const EVP_CIPHER *EVP_aes_256_xts(void);
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void);
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void);
+#endif
 #endif
 #ifndef OPENSSL_NO_CAMELLIA
 const EVP_CIPHER *EVP_camellia_128_ecb(void);
diff --git a/crypto/evp/names.c b/crypto/evp/names.c
index 67c73c0bab..6311ad7cfb 100644
--- a/crypto/evp/names.c
+++ b/crypto/evp/names.c
@@ -65,6 +65,9 @@
 int EVP_add_cipher(const EVP_CIPHER *c)
 	{
 	int r;
+
+	if (c == NULL) return 0;
+
 	OPENSSL_init();
 
 	r=OBJ_NAME_add(OBJ_nid2sn(c->nid),OBJ_NAME_TYPE_CIPHER_METH,(const char *)c);
diff --git a/crypto/objects/obj_dat.h b/crypto/objects/obj_dat.h
index 8d1100b321..7dd278f5b8 100644
--- a/crypto/objects/obj_dat.h
+++ b/crypto/objects/obj_dat.h
@@ -62,9 +62,9 @@
  * [including the GNU Public Licence.]
  */
 
-#define NUM_NID 915
-#define NUM_SN 908
-#define NUM_LN 908
+#define NUM_NID 919
+#define NUM_SN 912
+#define NUM_LN 912
 #define NUM_OBJ 856
 
 static const unsigned char lvalues[5971]={
@@ -2397,11 +2397,19 @@ static const ASN1_OBJECT nid_objs[NUM_NID]={
 {"RSASSA-PSS","rsassaPss",NID_rsassaPss,9,&(lvalues[5961]),0},
 {"AES-128-XTS","aes-128-xts",NID_aes_128_xts,0,NULL,0},
 {"AES-256-XTS","aes-256-xts",NID_aes_256_xts,0,NULL,0},
+{"RC4-HMAC-MD5","rc4-hmac-md5",NID_rc4_hmac_md5,0,NULL,0},
+{"AES-128-CBC-HMAC-SHA1","aes-128-cbc-hmac-sha1",
+	NID_aes_128_cbc_hmac_sha1,0,NULL,0},
+{"AES-192-CBC-HMAC-SHA1","aes-192-cbc-hmac-sha1",
+	NID_aes_192_cbc_hmac_sha1,0,NULL,0},
+{"AES-256-CBC-HMAC-SHA1","aes-256-cbc-hmac-sha1",
+	NID_aes_256_cbc_hmac_sha1,0,NULL,0},
 };
 
 static const unsigned int sn_objs[NUM_SN]={
 364,	/* "AD_DVCS" */
 419,	/* "AES-128-CBC" */
+916,	/* "AES-128-CBC-HMAC-SHA1" */
 421,	/* "AES-128-CFB" */
 650,	/* "AES-128-CFB1" */
 653,	/* "AES-128-CFB8" */
@@ -2410,6 +2418,7 @@ static const unsigned int sn_objs[NUM_SN]={
 420,	/* "AES-128-OFB" */
 913,	/* "AES-128-XTS" */
 423,	/* "AES-192-CBC" */
+917,	/* "AES-192-CBC-HMAC-SHA1" */
 425,	/* "AES-192-CFB" */
 651,	/* "AES-192-CFB1" */
 654,	/* "AES-192-CFB8" */
@@ -2417,6 +2426,7 @@ static const unsigned int sn_objs[NUM_SN]={
 422,	/* "AES-192-ECB" */
 424,	/* "AES-192-OFB" */
 427,	/* "AES-256-CBC" */
+918,	/* "AES-256-CBC-HMAC-SHA1" */
 429,	/* "AES-256-CFB" */
 652,	/* "AES-256-CFB1" */
 655,	/* "AES-256-CFB8" */
@@ -2540,6 +2550,7 @@ static const unsigned int sn_objs[NUM_SN]={
 40,	/* "RC2-OFB" */
  5,	/* "RC4" */
 97,	/* "RC4-40" */
+915,	/* "RC4-HMAC-MD5" */
 120,	/* "RC5-CBC" */
 122,	/* "RC5-CFB" */
 121,	/* "RC5-ECB" */
@@ -3455,6 +3466,7 @@ static const unsigned int ln_objs[NUM_LN]={
 364,	/* "ad dvcs" */
 606,	/* "additional verification" */
 419,	/* "aes-128-cbc" */
+916,	/* "aes-128-cbc-hmac-sha1" */
 896,	/* "aes-128-ccm" */
 421,	/* "aes-128-cfb" */
 650,	/* "aes-128-cfb1" */
@@ -3465,6 +3477,7 @@ static const unsigned int ln_objs[NUM_LN]={
 420,	/* "aes-128-ofb" */
 913,	/* "aes-128-xts" */
 423,	/* "aes-192-cbc" */
+917,	/* "aes-192-cbc-hmac-sha1" */
 899,	/* "aes-192-ccm" */
 425,	/* "aes-192-cfb" */
 651,	/* "aes-192-cfb1" */
@@ -3474,6 +3487,7 @@ static const unsigned int ln_objs[NUM_LN]={
 898,	/* "aes-192-gcm" */
 424,	/* "aes-192-ofb" */
 427,	/* "aes-256-cbc" */
+918,	/* "aes-256-cbc-hmac-sha1" */
 902,	/* "aes-256-ccm" */
 429,	/* "aes-256-cfb" */
 652,	/* "aes-256-cfb1" */
@@ -3978,6 +3992,7 @@ static const unsigned int ln_objs[NUM_LN]={
 40,	/* "rc2-ofb" */
  5,	/* "rc4" */
 97,	/* "rc4-40" */
+915,	/* "rc4-hmac-md5" */
 120,	/* "rc5-cbc" */
 122,	/* "rc5-cfb" */
 121,	/* "rc5-ecb" */
diff --git a/crypto/objects/obj_mac.h b/crypto/objects/obj_mac.h
index 6de8c70479..02fc4095aa 100644
--- a/crypto/objects/obj_mac.h
+++ b/crypto/objects/obj_mac.h
@@ -4009,3 +4009,19 @@
 #define LN_cmac		"cmac"
 #define NID_cmac		894
 
+#define SN_rc4_hmac_md5		"RC4-HMAC-MD5"
+#define LN_rc4_hmac_md5		"rc4-hmac-md5"
+#define NID_rc4_hmac_md5		915
+
+#define SN_aes_128_cbc_hmac_sha1		"AES-128-CBC-HMAC-SHA1"
+#define LN_aes_128_cbc_hmac_sha1		"aes-128-cbc-hmac-sha1"
+#define NID_aes_128_cbc_hmac_sha1		916
+
+#define SN_aes_192_cbc_hmac_sha1		"AES-192-CBC-HMAC-SHA1"
+#define LN_aes_192_cbc_hmac_sha1		"aes-192-cbc-hmac-sha1"
+#define NID_aes_192_cbc_hmac_sha1		917
+
+#define SN_aes_256_cbc_hmac_sha1		"AES-256-CBC-HMAC-SHA1"
+#define LN_aes_256_cbc_hmac_sha1		"aes-256-cbc-hmac-sha1"
+#define NID_aes_256_cbc_hmac_sha1		918
+
diff --git a/crypto/objects/obj_mac.num b/crypto/objects/obj_mac.num
index cbd77f3a41..a50aa57709 100644
--- a/crypto/objects/obj_mac.num
+++ b/crypto/objects/obj_mac.num
@@ -912,3 +912,7 @@ mgf1		911
 rsassaPss		912
 aes_128_xts		913
 aes_256_xts		914
+rc4_hmac_md5		915
+aes_128_cbc_hmac_sha1		916
+aes_192_cbc_hmac_sha1		917
+aes_256_cbc_hmac_sha1		918
diff --git a/crypto/objects/objects.txt b/crypto/objects/objects.txt
index 1bf3ad6eed..183806e39f 100644
--- a/crypto/objects/objects.txt
+++ b/crypto/objects/objects.txt
@@ -1283,3 +1283,9 @@ kisa 1 6                : SEED-OFB      : seed-ofb
 			: HMAC				: hmac
 # Nor CMAC either
 			: CMAC				: cmac
+
+# Synthetic composite ciphersuites
+			: RC4-HMAC-MD5			: rc4-hmac-md5
+			: AES-128-CBC-HMAC-SHA1		: aes-128-cbc-hmac-sha1
+			: AES-192-CBC-HMAC-SHA1		: aes-192-cbc-hmac-sha1
+			: AES-256-CBC-HMAC-SHA1		: aes-256-cbc-hmac-sha1
diff --git a/crypto/rc4/Makefile b/crypto/rc4/Makefile
index cec0b0221c..b52704d167 100644
--- a/crypto/rc4/Makefile
+++ b/crypto/rc4/Makefile
@@ -46,6 +46,8 @@ rc4-586.s:	asm/rc4-586.pl ../perlasm/x86asm.pl
 
 rc4-x86_64.s: asm/rc4-x86_64.pl
 	$(PERL) asm/rc4-x86_64.pl $(PERLASM_SCHEME) > $@
+rc4-md5-x86_64.s:	asm/rc4-md5-x86_64.pl
+	$(PERL) asm/rc4-md5-x86_64.pl $(PERLASM_SCHEME) > $@
 
 rc4-ia64.S: asm/rc4-ia64.pl
 	$(PERL) asm/rc4-ia64.pl $(CFLAGS) > $@
diff --git a/crypto/rc4/asm/rc4-md5-x86_64.pl b/crypto/rc4/asm/rc4-md5-x86_64.pl
new file mode 100644
index 0000000000..7f684092d4
--- /dev/null
+++ b/crypto/rc4/asm/rc4-md5-x86_64.pl
@@ -0,0 +1,631 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# June 2011
+#
+# This is RC4+MD5 "stitch" implementation. The idea, as spelled in
+# http://download.intel.com/design/intarch/papers/323686.pdf, is that
+# since both algorithms exhibit instruction-level parallelism, ILP,
+# below theoretical maximum, interleaving them would allow to utilize
+# processor resources better and achieve better performance. RC4
+# instruction sequence is virtually identical to rc4-x86_64.pl, which
+# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin
+# and Jim Guilford of Intel. MD5 is fresh implementation aiming to
+# minimize register usage, which was used as "main thread" with RC4
+# weaved into it, one RC4 round per one MD5 round. In addition to the
+# stiched subroutine the script can generate standalone replacement
+# md5_block_asm_data_order and RC4. Below are performance numbers in
+# cycles per processed byte, less is better, for these the standalone
+# subroutines, sum of them, and stitched one:
+#
+#		RC4	MD5	RC4+MD5	stitch	gain
+# Opteron	6.5(*)	5.4	11.9	7.0	+70%(*)
+# Core2		6.5	5.8	12.3	7.7	+60%
+# Westmere	4.3	5.2	9.5	7.0	+36%
+# Sandy Bridge	4.2	5.5	9.7	6.8	+43%
+# Atom		9.3	6.5	15.8	11.1	+42%
+#
+# (*)	rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
+#	is +53%...
+
+my ($rc4,$md5)=(1,1);	# what to generate?
+my $D="#" if (!$md5);	# if set to "#", MD5 is stitched into RC4(),
+			# but its result is discarded. Idea here is
+			# to be able to use 'openssl speed rc4' for
+			# benchmarking the stitched subroutine... 
+
+my $flavour = shift;
+my $output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);
+
+if ($rc4 && !$md5) {
+  ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
+  $func="RC4";				$nargs=4;
+} elsif ($md5 && !$rc4) {
+  ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
+  $func="md5_block_asm_data_order";	$nargs=3;
+} else {
+  ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+  $func="rc4_md5_enc";			$nargs=6;
+  # void rc4_md5_enc(
+  #		RC4_KEY *key,		#
+  #		const void *in0,	# RC4 input
+  #		void *out,		# RC4 output
+  #		MD5_CTX *ctx,		#
+  #		const void *inp,	# MD5 input
+  #		size_t len);		# number of 64-byte blocks
+}
+
+my @K=(	0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
+	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
+	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
+	0x6b901122,0xfd987193,0xa679438e,0x49b40821,
+
+	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
+	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
+	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
+	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
+
+	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
+	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
+	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
+	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
+
+	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
+	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
+	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
+	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391	);
+
+my @V=("%r8d","%r9d","%r10d","%r11d");	# MD5 registers
+my $tmp="%r12d";
+
+my @XX=("%rbp","%rsi");			# RC4 registers
+my @TX=("%rax","%rbx");
+my $YY="%rcx";
+my $TY="%rdx";
+
+my $MOD=32;				# 16, 32 or 64
+
+$code.=<<___;
+.text
+.align 16
+
+.globl	$func
+.type	$func,\@function,$nargs
+$func:
+	cmp	\$0,$len
+	je	.Labort
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	sub	\$40,%rsp
+.Lbody:
+___
+if ($rc4) {
+$code.=<<___;
+$D#md5#	mov	$ctx,%r11		# reassign arguments
+	mov	$len,%r12
+	mov	$in0,%r13
+	mov	$out,%r14
+$D#md5#	mov	$inp,%r15
+___
+    $ctx="%r11"	if ($md5);		# reassign arguments
+    $len="%r12";
+    $in0="%r13";
+    $out="%r14";
+    $inp="%r15"	if ($md5);
+    $inp=$in0	if (!$md5);
+$code.=<<___;
+	xor	$XX[0],$XX[0]
+	xor	$YY,$YY
+
+	lea	8($dat),$dat
+	mov	-8($dat),$XX[0]#b
+	mov	-4($dat),$YY#b
+
+	inc	$XX[0]#b
+	sub	$in0,$out
+	movl	($dat,$XX[0],4),$TX[0]#d
+___
+$code.=<<___ if (!$md5);
+	xor	$TX[1],$TX[1]
+	test	\$-128,$len
+	jz	.Loop1
+	sub	$XX[0],$TX[1]
+	and	\$`$MOD-1`,$TX[1]
+	jz	.Loop${MOD}_is_hot
+	sub	$TX[1],$len
+.Loop${MOD}_warmup:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($in0),$TY#b
+	movb	$TY#b,($out,$in0)
+	lea	1($in0),$in0
+	dec	$TX[1]
+	jnz	.Loop${MOD}_warmup
+
+	mov	$YY,$TX[1]
+	xor	$YY,$YY
+	mov	$TX[1]#b,$YY#b
+
+.Loop${MOD}_is_hot:
+	mov	$len,32(%rsp)		# save original $len
+	shr	\$6,$len		# number of 64-byte blocks
+___
+  if ($D && !$md5) {			# stitch in dummy MD5
+    $md5=1;
+    $ctx="%r11";
+    $inp="%r15";
+    $code.=<<___;
+	mov	%rsp,$ctx
+	mov	$in0,$inp
+___
+  }
+}
+$code.=<<___;
+#rc4#	add	$TX[0]#b,$YY#b
+#rc4#	lea	($dat,$XX[0],4),$XX[1]
+	shl	\$6,$len
+	add	$inp,$len		# pointer to the end of input
+	mov	$len,16(%rsp)
+
+#md5#	mov	$ctx,24(%rsp)		# save pointer to MD5_CTX
+#md5#	mov	0*4($ctx),$V[0]		# load current hash value from MD5_CTX
+#md5#	mov	1*4($ctx),$V[1]
+#md5#	mov	2*4($ctx),$V[2]
+#md5#	mov	3*4($ctx),$V[3]
+	jmp	.Loop
+
+.align	16
+.Loop:
+#md5#	mov	$V[0],0*4(%rsp)		# put aside current hash value
+#md5#	mov	$V[1],1*4(%rsp)
+#md5#	mov	$V[2],2*4(%rsp)
+#md5#	mov	$V[3],$tmp		# forward reference
+#md5#	mov	$V[3],3*4(%rsp)
+___
+
+sub R0 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot0=(7,12,17,22);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	($in0),%xmm2\n"		if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$c,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	and	$b,$tmp
+#md5#	add	4*`$j`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#md5#	xor	$d,$tmp
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#md5#	add	$tmp,$a
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot0[$j%4],$a
+#md5#	mov	`$j==15?"$b":"$c"`,$tmp		# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm2
+	pxor	%xmm1,%xmm2
+___
+}
+sub R1 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot1=(5,9,14,20);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	16($in0),%xmm3\n"	if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$b,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	and	$d,$tmp
+#md5#	add	4*`((1+5*$j)%16)`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#md5#	xor	$c,$tmp
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#md5#	add	$tmp,$a
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot1[$j%4],$a
+#md5#	mov	`$j==15?"$c":"$b"`,$tmp		# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+___
+}
+sub R2 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot2=(4,11,16,23);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	32($in0),%xmm4\n"	if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$c,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	xor	$b,$tmp
+#md5#	add	4*`((5+3*$j)%16)`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#md5#	add	$tmp,$a
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot2[$j%4],$a
+#md5#	mov	`$j==15?"\\\$-1":"$c"`,$tmp	# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm4
+	pxor	%xmm1,%xmm4
+___
+}
+sub R3 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot3=(6,10,15,21);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	48($in0),%xmm5\n"	if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$d,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	or	$b,$tmp
+#md5#	add	4*`((7*$j)%16)`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#md5#	xor	$c,$tmp
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#md5#	add	$tmp,$a
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot3[$j%4],$a
+#md5#	mov	\$-1,$tmp			# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	mov	$XX[0],$XX[1]
+	xor	$XX[0],$XX[0]			# keyword to partial register
+	mov	$XX[1]#b,$XX[0]#b
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm5
+	pxor	%xmm1,%xmm5
+___
+}
+
+my $i=0;
+for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+
+$code.=<<___;
+#md5#	add	0*4(%rsp),$V[0]		# accumulate hash value
+#md5#	add	1*4(%rsp),$V[1]
+#md5#	add	2*4(%rsp),$V[2]
+#md5#	add	3*4(%rsp),$V[3]
+
+#rc4#	movdqu	%xmm2,($out,$in0)	# write RC4 output
+#rc4#	movdqu	%xmm3,16($out,$in0)
+#rc4#	movdqu	%xmm4,32($out,$in0)
+#rc4#	movdqu	%xmm5,48($out,$in0)
+#md5#	lea	64($inp),$inp
+#rc4#	lea	64($in0),$in0
+	cmp	16(%rsp),$inp		# are we done?
+	jb	.Loop
+
+#md5#	mov	24(%rsp),$len		# restore pointer to MD5_CTX
+#rc4#	sub	$TX[0]#b,$YY#b		# correct $YY
+#md5#	mov	$V[0],0*4($len)		# write MD5_CTX
+#md5#	mov	$V[1],1*4($len)
+#md5#	mov	$V[2],2*4($len)
+#md5#	mov	$V[3],3*4($len)
+___
+$code.=<<___ if ($rc4 && (!$md5 || $D));
+	mov	32(%rsp),$len		# restore original $len
+	and	\$63,$len		# remaining bytes
+	jnz	.Loop1
+	jmp	.Ldone
+	
+.align	16
+.Loop1:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($in0),$TY#b
+	movb	$TY#b,($out,$in0)
+	lea	1($in0),$in0
+	dec	$len
+	jnz	.Loop1
+
+.Ldone:
+___
+$code.=<<___;
+#rc4#	sub	\$1,$XX[0]#b
+#rc4#	movl	$XX[0]#d,-8($dat)
+#rc4#	movl	$YY#d,-4($dat)
+
+	mov	40(%rsp),%r15
+	mov	48(%rsp),%r14
+	mov	56(%rsp),%r13
+	mov	64(%rsp),%r12
+	mov	72(%rsp),%rbp
+	mov	80(%rsp),%rbx
+	lea	88(%rsp),%rsp
+.Lepilogue:
+.Labort:
+	ret
+.size $func,.-$func
+___
+
+if ($rc4 && $D) {	# sole purpose of this section is to provide
+			# option to use the generated module as drop-in
+			# replacement for rc4-x86_64.pl for debugging
+			# and testing purposes...
+my ($idx,$ido)=("%r8","%r9");
+my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");
+
+$code.=<<___;
+.globl	RC4_set_key
+.type	RC4_set_key,\@function,3
+.align	16
+RC4_set_key:
+	lea	8($dat),$dat
+	lea	($inp,$len),$inp
+	neg	$len
+	mov	$len,%rcx
+	xor	%eax,%eax
+	xor	$ido,$ido
+	xor	%r10,%r10
+	xor	%r11,%r11
+	jmp	.Lw1stloop
+
+.align	16
+.Lw1stloop:
+	mov	%eax,($dat,%rax,4)
+	add	\$1,%al
+	jnc	.Lw1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lw2ndloop:
+	mov	($dat,$ido,4),%r10d
+	add	($inp,$len,1),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx,4),%r11d
+	cmovz	%rcx,$len
+	mov	%r10d,($dat,$idx,4)
+	mov	%r11d,($dat,$ido,4)
+	add	\$1,$ido#b
+	jnc	.Lw2ndloop
+
+	xor	%eax,%eax
+	mov	%eax,-8($dat)
+	mov	%eax,-4($dat)
+	ret
+.size	RC4_set_key,.-RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,\@abi-omnipotent
+.align	16
+RC4_options:
+	lea	.Lopts(%rip),%rax
+	ret
+.align	64
+.Lopts:
+.asciz	"rc4(64x,int)"
+.align	64
+.size	RC4_options,.-RC4_options
+___
+}
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lbody(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lbody
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lin_prologue
+
+	mov	40(%rax),%r15
+	mov	48(%rax),%r14
+	mov	56(%rax),%r13
+	mov	64(%rax),%r12
+	mov	72(%rax),%rbp
+	mov	80(%rax),%rbx
+	lea	88(%rax),%rax
+
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R12
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_$func
+	.rva	.LSEH_end_$func
+	.rva	.LSEH_info_$func
+
+.section	.xdata
+.align	8
+.LSEH_info_$func:
+	.byte	9,0,0,0
+	.rva	se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+    if ($reg =~ /%r[0-9]+/)     { $reg .= $conv; }
+    elsif ($conv eq "b")        { $reg =~ s/%[er]([^x]+)x?/%$1l/;       }
+    elsif ($conv eq "w")        { $reg =~ s/%[er](.+)/%$1/;             }
+    elsif ($conv eq "d")        { $reg =~ s/%[er](.+)/%e$1/;            }
+    return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/pinsrw\s+\$0,/movd	/gm;
+
+$code =~ s/#md5#//gm	if ($md5);
+$code =~ s/#rc4#//gm	if ($rc4);
+
+print $code;
+
+close STDOUT;
diff --git a/ssl/ssl_algs.c b/ssl/ssl_algs.c
index 0967b2dfe4..24976383e0 100644
--- a/ssl/ssl_algs.c
+++ b/ssl/ssl_algs.c
@@ -73,6 +73,9 @@ int SSL_library_init(void)
 #endif
 #ifndef OPENSSL_NO_RC4
 	EVP_add_cipher(EVP_rc4());
+#if !defined(OPENSSL_NO_MD5) && (defined(__x86_64) || defined(__x86_64__))
+	EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
 #endif  
 #ifndef OPENSSL_NO_RC2
 	EVP_add_cipher(EVP_rc2_cbc());
@@ -85,6 +88,10 @@ int SSL_library_init(void)
 	EVP_add_cipher(EVP_aes_128_cbc());
 	EVP_add_cipher(EVP_aes_192_cbc());
 	EVP_add_cipher(EVP_aes_256_cbc());
+#if !defined(OPENSSL_NO_SHA) && !defined(OPNESSL_NO_SHA1)
+	EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+	EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
 #endif
 #ifndef OPENSSL_NO_CAMELLIA
 	EVP_add_cipher(EVP_camellia_128_cbc());
-- 
2.34.1