Add RC4-MD5 and AESNI-SHA1 "stitched" implementations.
author Andy Polyakov <appro@openssl.org>
Tue, 23 Aug 2011 20:51:38 +0000 (20:51 +0000)
committer Andy Polyakov <appro@openssl.org>
Tue, 23 Aug 2011 20:51:38 +0000 (20:51 +0000)
18 files changed:
CHANGES
Configure
TABLE
crypto/aes/Makefile
crypto/aes/asm/aesni-sha1-x86_64.pl [new file with mode: 0644]
crypto/evp/Makefile
crypto/evp/c_allc.c
crypto/evp/e_aes_cbc_hmac_sha1.c [new file with mode: 0644]
crypto/evp/e_rc4_hmac_md5.c [new file with mode: 0644]
crypto/evp/evp.h
crypto/evp/names.c
crypto/objects/obj_dat.h
crypto/objects/obj_mac.h
crypto/objects/obj_mac.num
crypto/objects/objects.txt
crypto/rc4/Makefile
crypto/rc4/asm/rc4-md5-x86_64.pl [new file with mode: 0644]
ssl/ssl_algs.c

diff --git a/CHANGES b/CHANGES
index 65bbce6..48537e8 100644 (file)
--- a/CHANGES
+++ b/CHANGES
   
  Changes between 1.0.0e and 1.0.1  [xx XXX xxxx]
 
+  *) Add RC4-MD5 and AESNI-SHA1 "stitched" implementations.
+
+     This work was sponsored by Intel.
+     [Andy Polyakov]
+
   *) Redirect HMAC and CMAC operations to FIPS module in FIPS mode. If an
      ENGINE is used then we cannot handle that in the FIPS module so we
      keep original code iff non-FIPS operations are allowed.
index 9d9bd72..fc793b0 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -127,7 +127,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5
 
 my $x86_elf_asm="$x86_asm:elf";
 
-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
 my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:void";
 my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o:void";
 my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void";
diff --git a/TABLE b/TABLE
index b5aea36..d6940be 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -299,12 +299,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -779,12 +779,12 @@ $bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -1387,12 +1387,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -1547,12 +1547,12 @@ $bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -2315,12 +2315,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -2507,12 +2507,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -2571,12 +2571,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -4075,12 +4075,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -4235,12 +4235,12 @@ $bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -5195,12 +5195,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
@@ -5227,12 +5227,12 @@ $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
 $bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o aesni-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
 $cast_obj     = 
-$rc4_obj      = rc4-x86_64.o
+$rc4_obj      = rc4-x86_64.o rc4-md5-x86_64.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = wp-x86_64.o
index cf1b3e5..ae16e65 100644 (file)
@@ -57,6 +57,8 @@ aes-x86_64.s: asm/aes-x86_64.pl
        $(PERL) asm/aes-x86_64.pl $(PERLASM_SCHEME) > $@
 aesni-x86_64.s: asm/aesni-x86_64.pl
        $(PERL) asm/aesni-x86_64.pl $(PERLASM_SCHEME) > $@
+aesni-sha1-x86_64.s:   asm/aesni-sha1-x86_64.pl
+       $(PERL) asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME) > $@
 
 aes-sparcv9.s: asm/aes-sparcv9.pl
        $(PERL) asm/aes-sparcv9.pl $(CFLAGS) > $@
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644 (file)
index 0000000..c6f6b33
--- /dev/null
@@ -0,0 +1,1249 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encrypt exhibits *very* low instruction-level
+# parallelism, interleaving it with another algorithm allows better
+# utilization of processor resources and hence better performance.
+# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
+# AESNI code is weaved into it. Below are performance numbers in
+# cycles per processed byte, less is better, for standalone AESNI-CBC
+# encrypt, sum of the latter and standalone SHA1, and "stitched"
+# subroutine:
+#
+#              AES-128-CBC     +SHA1           stitch      gain
+# Westmere     3.77[+5.6]      9.37            6.65        +41%
+# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35)    6.16(7.08)  +67%(+60%)
+#
+#              AES-192-CBC
+# Westmere     4.51            10.11           6.97        +45%
+# Sandy Bridge 6.05            11.25(12.35)    6.34(7.27)  +77%(+70%)
+#
+#              AES-256-CBC
+# Westmere     5.25            10.85           7.25        +50%
+# Sandy Bridge 7.05            12.25(13.35)    7.06(7.70)  +74%(+73%)
+#
+# (*)  There are two code paths: SSSE3 and AVX. See sha1-586.pl for
+#      background information. Above numbers in parentheses are SSSE3
+#      results collected on AVX-capable CPU, i.e. apply on OSes that
+#      don't support AVX.
+#
+# Needless to mention, it makes no sense to implement a "stitched"
+# *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already
+# fully utilize parallelism, so stitching would not give any gain
+# anyway. Well, there might be some, e.g. because of better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+#              AES-128-CBC     AES-192-CBC     AES-256-CBC
+# Westmere     1.31            1.55            1.80
+# Sandy Bridge 0.93            1.06            1.22
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+               =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+          $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+          $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+          `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+          $1>=10);
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# void aesni_cbc_sha1_enc(const void *inp,
+#                      void *out,
+#                      size_t length,
+#                      const AES_KEY *key,
+#                      unsigned char *iv,
+#                      SHA_CTX *ctx,
+#                      const void *in0);
+
+$code.=<<___;
+.text
+.extern        OPENSSL_ia32cap_P
+
+.globl aesni_cbc_sha1_enc
+.type  aesni_cbc_sha1_enc,\@abi-omnipotent
+.align 16
+aesni_cbc_sha1_enc:
+       # caller should check for SSSE3 and AES-NI bits
+       mov     OPENSSL_ia32cap_P+0(%rip),%r10d
+       mov     OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+       and     \$`1<<28`,%r11d         # mask AVX bit
+       and     \$`1<<30`,%r10d         # mask "Intel CPU" bit
+       or      %r11d,%r10d
+       cmp     \$`1<<28|1<<30`,%r10d
+       je      aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+       jmp     aesni_cbc_sha1_enc_ssse3
+       ret
+.size  aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");   # size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0;
+my $K_XX_XX="%r11";
+my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+
+sub AUTOLOAD()         # thunk [simplified] 32-bit style perlasm: any undefined &mnemonic(...) call appends one instruction line to $code
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;         # strip the package qualifier; the bare sub name is used as the mnemonic
+  my $arg = pop;                                       # last call argument is emitted as the first operand
+    $arg = "\$$arg" if ($arg*1 eq $arg);               # a purely numeric argument becomes a $-prefixed immediate
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";     # remaining arguments follow in reversed call order
+}
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type  aesni_cbc_sha1_enc_ssse3,\@function,6
+.align 16
+aesni_cbc_sha1_enc_ssse3:
+       mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
+       #shr    \$6,$len                        # debugging artefact
+       #jz     .Lepilogue_ssse3                # debugging artefact
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       lea     `-104-($win64?10*16:0)`(%rsp),%rsp
+       #mov    $in0,$inp                       # debugging artefact
+       #lea    64(%rsp),$ctx                   # debugging artefact
+___
+$code.=<<___ if ($win64);
+       movaps  %xmm6,96+0(%rsp)
+       movaps  %xmm7,96+16(%rsp)
+       movaps  %xmm8,96+32(%rsp)
+       movaps  %xmm9,96+48(%rsp)
+       movaps  %xmm10,96+64(%rsp)
+       movaps  %xmm11,96+80(%rsp)
+       movaps  %xmm12,96+96(%rsp)
+       movaps  %xmm13,96+112(%rsp)
+       movaps  %xmm14,96+128(%rsp)
+       movaps  %xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+       mov     $in0,%r12                       # reassign arguments
+       mov     $out,%r13
+       mov     $len,%r14
+       mov     $key,%r15
+       movdqu  ($ivp),$iv                      # load IV
+       mov     $ivp,88(%rsp)                   # save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+       shl     \$6,$len
+       sub     $in0,$out
+       mov     240($key),$rounds
+       add     $inp,$len               # end of input
+
+       lea     K_XX_XX(%rip),$K_XX_XX
+       mov     0($ctx),$A              # load context
+       mov     4($ctx),$B
+       mov     8($ctx),$C
+       mov     12($ctx),$D
+       mov     $B,@T[0]                # magic seed
+       mov     16($ctx),$E
+
+       movdqa  64($K_XX_XX),@X[2]      # pbswap mask
+       movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
+       movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
+       movdqu  16($inp),@X[-3&7]
+       movdqu  32($inp),@X[-2&7]
+       movdqu  48($inp),@X[-1&7]
+       pshufb  @X[2],@X[-4&7]          # byte swap
+       add     \$64,$inp
+       pshufb  @X[2],@X[-3&7]
+       pshufb  @X[2],@X[-2&7]
+       pshufb  @X[2],@X[-1&7]
+       paddd   @Tx[1],@X[-4&7]         # add K_00_19
+       paddd   @Tx[1],@X[-3&7]
+       paddd   @Tx[1],@X[-2&7]
+       movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
+       psubd   @Tx[1],@X[-4&7]         # restore X[]
+       movdqa  @X[-3&7],16(%rsp)
+       psubd   @Tx[1],@X[-3&7]
+       movdqa  @X[-2&7],32(%rsp)
+       psubd   @Tx[1],@X[-2&7]
+       movups  ($key),$rndkey0         # $key[0]
+       movups  16($key),$rndkey[0]     # forward reference
+       jmp     .Loop_ssse3
+___
+
+my $aesenc=sub {       # each call appends one step of the AES-CBC encryption to $code; invoked from the round-body snippets
+  use integer;
+  my ($n,$k)=($r/10,$r%10);    # $r counts calls so far: $n is the 16-byte block index, $k the step within that block
+    if ($k==0) {               # first step of a block: load and whiten input, store the previous block's output (if any)
+      $code.=<<___;
+       movups          `16*$n`($in0),$in               # load input
+       xorps           $rndkey0,$in
+___
+      $code.=<<___ if ($n);
+       movups          $iv,`16*($n-1)`($out,$in0)      # write output
+___
+      $code.=<<___;
+       xorps           $in,$iv
+       aesenc          $rndkey[0],$iv
+       movups          `32+16*$k`($key),$rndkey[1]
+___
+    } elsif ($k==9) {          # last step of a block: extra rounds depending on $rounds, then aesenclast
+      $sn++;                   # running number that keeps the .Laesenclast labels unique
+      $code.=<<___;
+       cmp             \$11,$rounds
+       jb              .Laesenclast$sn
+       movups          `32+16*($k+0)`($key),$rndkey[1]
+       aesenc          $rndkey[0],$iv
+       movups          `32+16*($k+1)`($key),$rndkey[0]
+       aesenc          $rndkey[1],$iv
+       je              .Laesenclast$sn
+       movups          `32+16*($k+2)`($key),$rndkey[1]
+       aesenc          $rndkey[0],$iv
+       movups          `32+16*($k+3)`($key),$rndkey[0]
+       aesenc          $rndkey[1],$iv
+.Laesenclast$sn:
+       aesenclast      $rndkey[0],$iv
+       movups          16($key),$rndkey[1]             # forward reference
+___
+    } else {                   # intermediate steps: one aesenc plus the load of the following round key
+      $code.=<<___;
+       aesenc          $rndkey[0],$iv
+       movups          `32+16*$k`($key),$rndkey[1]
+___
+    }
+    $r++;      unshift(@rndkey,pop(@rndkey));  # advance the step counter and swap the two round-key registers
+};
+
+sub Xupdate_ssse3_16_31()              # recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;            # code ref that returns the instruction snippets of one round
+  my @insns = (&$body,&$body,&$body,&$body);   # 40 instructions
+  my ($a,$b,$c,$d,$e);         # assigned from @V inside the eval'ed round snippets
+
+       &movdqa (@X[0],@X[-3&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &movdqa (@Tx[0],@X[-1&7]);
+       &palignr(@X[0],@X[-4&7],8);     # compose "X[-14]" in "X[0]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+         &paddd        (@Tx[1],@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &psrldq (@Tx[0],4);             # "X[-3]", 3 dwords
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &pxor   (@X[0],@X[-4&7]);       # "X[0]"^="X[-16]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pxor   (@Tx[0],@X[-2&7]);      # "X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &movdqa (@Tx[2],@X[0]);
+       &movdqa (@Tx[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pslldq (@Tx[2],12);            # "X[0]"<<96, extract one dword
+       &paddd  (@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &psrld  (@Tx[0],31);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &movdqa (@Tx[1],@Tx[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &psrld  (@Tx[2],30);
+       &por    (@X[0],@Tx[0]);         # "X[0]"<<<=1
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pslld  (@Tx[1],2);
+       &pxor   (@X[0],@Tx[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &movdqa       (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@Tx[1]);         # "X[0]"^=("X[0]">>96)<<<2
+
+        foreach (@insns) { eval; }     # remaining instructions [if any]
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+               push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()      # emits the SSE update of the next "X[0]" vector, interleaved with the snippets of four round bodies
+{ use integer;
+  my $body = shift;            # code ref that returns the instruction snippets of one round
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);         # assigned from @V inside the eval'ed round snippets
+
+       &movdqa (@Tx[0],@X[-1&7])       if ($Xi==8);
+        eval(shift(@insns));           # body_20_39
+       &pxor   (@X[0],@X[-4&7]);       # "X[0]"="X[-32]"^"X[-16]"
+       &palignr(@Tx[0],@X[-2&7],8);    # compose "X[-6]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+
+       &pxor   (@X[0],@X[-7&7]);       # "X[0]"^="X[-28]"
+        eval(shift(@insns));
+        eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);    # take one more snippet unless the next one is a rotate
+       if ($Xi%5) {
+         &movdqa       (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+       } else {                        # ... or load next one
+         &movdqa       (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+       }
+         &paddd        (@Tx[1],@X[-1&7]);
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-6]"
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+
+       &movdqa (@Tx[0],@X[0]);
+         &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &pslld  (@X[0],2);
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+       &psrld  (@Tx[0],30);
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &por    (@X[0],@Tx[0]);         # "X[0]"<<<=2
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+         &movdqa       (@Tx[1],@X[0])  if ($Xi<19);
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+
+        foreach (@insns) { eval; }     # remaining instructions
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+               push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()         # emits the last X[]+K transfer, the end-of-input test and the loads for the next 64-byte input chunk
+{ use integer;
+  my $body = shift;            # code ref that returns the instruction snippets of one round
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);         # assigned from @V inside the eval'ed round snippets
+
+        eval(shift(@insns));
+         &paddd        (@Tx[1],@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+         &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+        foreach (@insns) { eval; }             # remaining instructions
+
+       &cmp    ($inp,$len);                    # $len was turned into the end-of-input pointer in the prologue
+       &je     (".Ldone_ssse3");               # nothing left to preload: branch to the tail code
+
+       unshift(@Tx,pop(@Tx));                  # rotate @Tx back by one position
+
+       &movdqa (@X[2],"64($K_XX_XX)");         # pbswap mask
+       &movdqa (@Tx[1],"0($K_XX_XX)");         # K_00_19
+       &movdqu (@X[-4&7],"0($inp)");           # load input
+       &movdqu (@X[-3&7],"16($inp)");
+       &movdqu (@X[-2&7],"32($inp)");
+       &movdqu (@X[-1&7],"48($inp)");
+       &pshufb (@X[-4&7],@X[2]);               # byte swap
+       &add    ($inp,64);
+
+  $Xi=0;                                       # Xloop_ssse3 indexes the stack slots starting from 0
+}
+
+sub Xloop_ssse3()              # emits byte swap and X[]+K store for one preloaded input vector, interleaved with four round bodies
+{ use integer;
+  my $body = shift;            # code ref that returns the instruction snippets of one round
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);         # assigned from @V inside the eval'ed round snippets
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &pshufb (@X[($Xi-3)&7],@X[2]);          # byte swap the following vector using the mask held in @X[2]
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &paddd  (@X[($Xi-4)&7],@Tx[1]);         # add the round constant held in @Tx[1]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);  # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &psubd  (@X[($Xi-4)&7],@Tx[1]);         # subtract the constant again to restore X[]
+
+       foreach (@insns) { eval; }              # remaining instructions
+  $Xi++;
+}
+
+sub Xtail_ssse3()              # emits four round bodies back to back, with no SSE work interleaved
+{ use integer;
+  my $body = shift;            # code ref that returns the instruction snippets of one round
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  my ($a,$b,$c,$d,$e);         # assigned from @V inside the eval'ed round snippets
+
+       foreach (@insns) { eval; }              # evaluate every snippet in order
+}
+
+sub body_00_19 () {            # returns the list of Perl snippets that, when eval'ed, emit one round of this type
+  use integer;
+  my ($k,$n);                  # $n: snippets per round, $k: global snippet slot of the next AES step
+  my @r=(
+       '($a,$b,$c,$d,$e)=@V;'.
+       '&add   ($e,eval(4*($j&15))."(%rsp)");',        # X[]+K xfer
+       '&xor   ($c,$d);',
+       '&mov   (@T[1],$a);',   # $b in next round
+       '&$_rol ($a,5);',
+       '&and   (@T[0],$c);',   # ($b&($c^$d))
+       '&xor   ($c,$d);',      # restore $c
+       '&xor   (@T[0],$d);',
+       '&add   ($e,$a);',
+       '&$_ror ($b,$j?7:2);',  # $b>>>2
+       '&add   ($e,@T[0]);'    .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+       );
+       $n = scalar(@r);
+       $k = (($jj+1)*12/20)*20*$n/12;  # 12 aesencs per these 20 rounds
+       @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n);        # append the AES step to the snippet at that slot when it falls in this round
+       $jj++;                          # $jj counts round bodies generated so far
+    return @r;
+}
+
+sub body_20_39 () {            # returns the list of Perl snippets that, when eval'ed, emit one round of this type
+  use integer;
+  my ($k,$n);                  # $n: snippets per round, $k: global snippet slot of the next AES step
+  my @r=(
+       '($a,$b,$c,$d,$e)=@V;'.
+       '&add   ($e,eval(4*($j++&15))."(%rsp)");',      # X[]+K xfer
+       '&xor   (@T[0],$d);',   # ($b^$d)
+       '&mov   (@T[1],$a);',   # $b in next round
+       '&$_rol ($a,5);',
+       '&xor   (@T[0],$c);',   # ($b^$d^$c)
+       '&add   ($e,$a);',
+       '&$_ror ($b,7);',       # $b>>>2
+       '&add   ($e,@T[0]);'    .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+       );
+       $n = scalar(@r);
+       $k = (($jj+1)*8/20)*20*$n/8;    # 8 aesencs per these 20 rounds
+       @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n);        # append the AES step to the snippet at that slot when it falls in this round
+       $jj++;                          # $jj counts round bodies generated so far
+    return @r;
+}
+
+sub body_40_59 () {            # returns the list of Perl snippets that, when eval'ed, emit one round of this type
+  use integer;
+  my ($k,$n);                  # $n: snippets per round, $k: global snippet slot of the next AES step
+  my @r=(
+       '($a,$b,$c,$d,$e)=@V;'.
+       '&mov   (@T[1],$c);',
+       '&xor   ($c,$d);',
+       '&add   ($e,eval(4*($j++&15))."(%rsp)");',      # X[]+K xfer
+       '&and   (@T[1],$d);',
+       '&and   (@T[0],$c);',   # ($b&($c^$d))
+       '&$_ror ($b,7);',       # $b>>>2
+       '&add   ($e,@T[1]);',
+       '&mov   (@T[1],$a);',   # $b in next round
+       '&$_rol ($a,5);',
+       '&add   ($e,@T[0]);',
+       '&xor   ($c,$d);',      # restore $c
+       '&add   ($e,$a);'       .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+       );
+       $n = scalar(@r);
+       $k=(($jj+1)*12/20)*20*$n/12;    # 12 aesencs per these 20 rounds
+       @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n);        # append the AES step to the snippet at that slot when it falls in this round
+       $jj++;                          # $jj counts round bodies generated so far
+    return @r;
+}
+$code.=<<___;
+.align 16
+.Loop_ssse3:
+___
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_16_31(\&body_00_19);
+       &Xupdate_ssse3_32_79(\&body_00_19);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_40_59);
+       &Xupdate_ssse3_32_79(\&body_20_39);
+       &Xuplast_ssse3_80(\&body_20_39);        # can jump to "done"
+
+                               $saved_j=$j; @saved_V=@V;
+                               $saved_r=$r; @saved_rndkey=@rndkey;
+
+       &Xloop_ssse3(\&body_20_39);
+       &Xloop_ssse3(\&body_20_39);
+       &Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+       movups  $iv,48($out,$in0)               # write output
+       lea     64($in0),$in0
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       add     12($ctx),$D
+       mov     $A,0($ctx)
+       add     16($ctx),$E
+       mov     @T[0],4($ctx)
+       mov     @T[0],$B                        # magic seed
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+       jmp     .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+___
+                               $jj=$j=$saved_j; @V=@saved_V;
+                               $r=$saved_r;     @rndkey=@saved_rndkey;
+
+       &Xtail_ssse3(\&body_20_39);
+       &Xtail_ssse3(\&body_20_39);
+       &Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+       movups  $iv,48($out,$in0)               # write output
+       mov     88(%rsp),$ivp                   # restore $ivp
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       mov     $A,0($ctx)
+       add     12($ctx),$D
+       mov     @T[0],4($ctx)
+       add     16($ctx),$E
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+       movups  $iv,($ivp)                      # write IV
+___
+$code.=<<___ if ($win64);
+       movaps  96+0(%rsp),%xmm6
+       movaps  96+16(%rsp),%xmm7
+       movaps  96+32(%rsp),%xmm8
+       movaps  96+48(%rsp),%xmm9
+       movaps  96+64(%rsp),%xmm10
+       movaps  96+80(%rsp),%xmm11
+       movaps  96+96(%rsp),%xmm12
+       movaps  96+112(%rsp),%xmm13
+       movaps  96+128(%rsp),%xmm14
+       movaps  96+144(%rsp),%xmm15
+___
+$code.=<<___;
+       lea     `104+($win64?10*16:0)`(%rsp),%rsi
+       mov     0(%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lepilogue_ssse3:
+       ret
+.size  aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+$j=$jj=$r=$sn=0;
+
+if ($avx) {
+# Register assignment for the AVX code path as seen on function entry.
+# $inp is not passed in a register: the first emitted instruction loads
+# it from the stack as the 7th argument.
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+# $Xi counts 4-dword message-schedule groups; the Xupdate_* subs use it to
+# pick the stack slot (16*(($Xi-1)&3)) and the K_XX_XX row (16*($Xi/5)).
+my $Xi=4;
+# @X is the rotating window of eight schedule registers, @Tx three vector
+# temporaries, @V the five SHA-1 working variables and @T scalar scratch
+# (%esi/%edi are free because $in0/$out are copied to %r12/%r13 below).
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");   # size optimization
+my @T=("%esi","%edi");
+
+# Rotates are emitted as double-precision shifts with the same register
+# used for both operands.
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+# Function entry: fetch the 7th argument, push callee-saved GPRs and carve
+# out the stack frame (104 bytes, plus 10*16 for xmm6-15 on Win64).
+$code.=<<___;
+.type  aesni_cbc_sha1_enc_avx,\@function,6
+.align 16
+aesni_cbc_sha1_enc_avx:
+       mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
+       #shr    \$6,$len                        # debugging artefact
+       #jz     .Lepilogue_avx                  # debugging artefact
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       lea     `-104-($win64?10*16:0)`(%rsp),%rsp
+       #mov    $in0,$inp                       # debugging artefact
+       #lea    64(%rsp),$ctx                   # debugging artefact
+___
+# Win64 only: spill xmm6-15 at 96(%rsp) and mark the end of the prologue
+# with .Lprologue_avx, which the SEH unwind data refers to.
+$code.=<<___ if ($win64);
+       movaps  %xmm6,96+0(%rsp)
+       movaps  %xmm7,96+16(%rsp)
+       movaps  %xmm8,96+32(%rsp)
+       movaps  %xmm9,96+48(%rsp)
+       movaps  %xmm10,96+64(%rsp)
+       movaps  %xmm11,96+80(%rsp)
+       movaps  %xmm12,96+96(%rsp)
+       movaps  %xmm13,96+112(%rsp)
+       movaps  %xmm14,96+128(%rsp)
+       movaps  %xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+# Copy the first four arguments into %r12-%r15, load the IV and park $ivp
+# at 88(%rsp) because its register is reused for the round count below.
+$code.=<<___;
+       vzeroall
+       mov     $in0,%r12                       # reassign arguments
+       mov     $out,%r13
+       mov     $len,%r14
+       mov     $key,%r15
+       vmovdqu ($ivp),$iv                      # load IV
+       mov     $ivp,88(%rsp)                   # save $ivp
+___
+# From here on the Perl names follow the register copies emitted above;
+# $rounds is the 32-bit view of the register that used to hold $ivp.
+my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
+my $rounds="${ivp}d";
+# Convert $len (64-byte blocks) into an end-of-input pointer, turn $out
+# into an offset relative to $in0 (stores use ($out,$in0) addressing),
+# bias $key by 112, load the SHA-1 state, then byte-swap the first input
+# block and pre-add K_00_19 for the first three schedule groups.
+$code.=<<___;
+       shl     \$6,$len
+       sub     $in0,$out
+       mov     240($key),$rounds
+       add     \$112,$key              # size optimization
+       add     $inp,$len               # end of input
+
+       lea     K_XX_XX(%rip),$K_XX_XX
+       mov     0($ctx),$A              # load context
+       mov     4($ctx),$B
+       mov     8($ctx),$C
+       mov     12($ctx),$D
+       mov     $B,@T[0]                # magic seed
+       mov     16($ctx),$E
+
+       vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
+       vmovdqa 0($K_XX_XX),@Tx[1]      # K_00_19
+       vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
+       vmovdqu 16($inp),@X[-3&7]
+       vmovdqu 32($inp),@X[-2&7]
+       vmovdqu 48($inp),@X[-1&7]
+       vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
+       add     \$64,$inp
+       vpshufb @X[2],@X[-3&7],@X[-3&7]
+       vpshufb @X[2],@X[-2&7],@X[-2&7]
+       vpshufb @X[2],@X[-1&7],@X[-1&7]
+       vpaddd  @Tx[1],@X[-4&7],@X[0]   # add K_00_19
+       vpaddd  @Tx[1],@X[-3&7],@X[1]
+       vpaddd  @Tx[1],@X[-2&7],@X[2]
+       vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
+       vmovdqa @X[1],16(%rsp)
+       vmovdqa @X[2],32(%rsp)
+       vmovups -112($key),$rndkey0     # $key[0]
+       vmovups 16-112($key),$rndkey[0] # forward reference
+       jmp     .Loop_avx
+___
+
+# Emit the next step of the AES-CBC encryption that is interleaved with
+# the SHA-1 rounds.  The global counter $r selects the step: $n=$r/10 is
+# the 16-byte block within the current 64-byte chunk, $k=$r%10 the step
+# within that block.
+#   $k==0   load plaintext block $n, store the previous ciphertext block
+#           (when $n is non-zero), xor into $iv and run the first round;
+#   $k==9   finish the block: the run-time compare of $rounds against 11
+#           selects zero, two or four extra vaesenc rounds before
+#           vaesenclast, then the second round key is reloaded for the
+#           next block;
+#   else    one vaesenc round plus the load of the following round key.
+# Every call advances $r and swaps the two @rndkey registers; $sn makes
+# the local .Lvaesenclast labels unique.
+my $aesenc=sub {
+  use integer;
+  my ($n,$k)=($r/10,$r%10);
+    if ($k==0) {
+      $code.=<<___;
+       vmovups         `16*$n`($in0),$in               # load input
+       vxorps          $rndkey0,$in,$in
+___
+      $code.=<<___ if ($n);
+       vmovups         $iv,`16*($n-1)`($out,$in0)      # write output
+___
+      $code.=<<___;
+       vxorps          $in,$iv,$iv
+       vaesenc         $rndkey[0],$iv,$iv
+       vmovups         `32+16*$k-112`($key),$rndkey[1]
+___
+    } elsif ($k==9) {
+      $sn++;
+      $code.=<<___;
+       cmp             \$11,$rounds
+       jb              .Lvaesenclast$sn
+       vaesenc         $rndkey[0],$iv,$iv
+       vmovups         `32+16*($k+0)-112`($key),$rndkey[1]
+       vaesenc         $rndkey[1],$iv,$iv
+       vmovups         `32+16*($k+1)-112`($key),$rndkey[0]
+       je              .Lvaesenclast$sn
+       vaesenc         $rndkey[0],$iv,$iv
+       vmovups         `32+16*($k+2)-112`($key),$rndkey[1]
+       vaesenc         $rndkey[1],$iv,$iv
+       vmovups         `32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+       vaesenclast     $rndkey[0],$iv,$iv
+       vmovups         16-112($key),$rndkey[1]         # forward reference
+___
+    } else {
+      $code.=<<___;
+       vaesenc         $rndkey[0],$iv,$iv
+       vmovups         `32+16*$k-112`($key),$rndkey[1]
+___
+    }
+    $r++;      unshift(@rndkey,pop(@rndkey));
+};
+
+# Xupdate_avx_16_31($body): emit one 4-dword message-schedule update into
+# @X[0], interleaved with the scalar instruction snippets of four rounds.
+# $body is a code ref returning the snippets of one round as strings; they
+# are eval'ed one by one between the vector instructions.  The X[]+K
+# vector for the previous group is stored to the stack for the integer
+# rounds and the K_XX_XX row for the next group is loaded into @Tx[2].
+# On exit $Xi is advanced and @X/@Tx are rotated by one position.
+sub Xupdate_avx_16_31()                # recall that $Xi starts with 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 40 instructions
+  # presumably the names the eval'ed snippets refer to (body_* is defined
+  # outside this view)
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+         &vpaddd       (@Tx[1],@Tx[1],@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpsrldq(@Tx[0],@X[-1&7],4);    # "X[-3]", 3 dwords
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"^="X[-16]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);       # "X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpsrld (@Tx[0],@X[0],31);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpslldq(@Tx[2],@X[0],12);              # "X[0]"<<96, extract one dword
+       &vpaddd (@X[0],@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpsrld (@Tx[1],@Tx[2],30);
+       &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpslld (@Tx[2],@Tx[2],2);
+       &vpxor  (@X[0],@X[0],@Tx[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[2]);           # "X[0]"^=("X[0]">>96)<<<2
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vmovdqa      (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+
+        foreach (@insns) { eval; }     # remaining instructions [if any]
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+               push(@Tx,shift(@Tx));
+}
+
+# Xupdate_avx_32_79($body): emit one 4-dword message-schedule update into
+# @X[0], interleaved with the scalar snippets of four rounds supplied by
+# the code ref $body.  The K constant is either carried over from @Tx[1]
+# or, when $Xi is a multiple of 5, reloaded from the K_XX_XX table.  The
+# X[]+K vector of the previous group is stored to the stack for the
+# integer rounds.  On exit $Xi is advanced and @X/@Tx are rotated.
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 to 48 instructions
+  # presumably the names the eval'ed snippets refer to (body_* is defined
+  # outside this view)
+  my ($a,$b,$c,$d,$e);
+
+       &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);  # compose "X[-6]"
+       &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"="X[-32]"^"X[-16]"
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+
+       &vpxor  (@X[0],@X[0],@X[-7&7]);         # "X[0]"^="X[-28]"
+        eval(shift(@insns));
+        # consume one more snippet here only if it is not a rotate
+        eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
+       if ($Xi%5) {
+         &vmovdqa      (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+       } else {                        # ... or load next one
+         &vmovdqa      (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+       }
+         &vpaddd       (@Tx[1],@Tx[1],@X[-1&7]);
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-6]"
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+
+       &vpsrld (@Tx[0],@X[0],30);
+         &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &vpslld (@X[0],@X[0],2);
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # ror
+        eval(shift(@insns));
+
+       &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=2
+        eval(shift(@insns));           # body_20_39
+        eval(shift(@insns));
+         &vmovdqa      (@Tx[1],@X[0])  if ($Xi<19);
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));           # rol
+        eval(shift(@insns));
+
+        foreach (@insns) { eval; }     # remaining instructions
+
+  $Xi++;       push(@X,shift(@X));     # "rotate" X[]
+               push(@Tx,shift(@Tx));
+}
+
+# Xuplast_avx_80($body): emit the last X[]+K transfer of a 64-byte block
+# together with four rounds' worth of scalar snippets from the code ref
+# $body, then the end-of-input test.  When more input remains, the code
+# emitted after the test reloads the byte-swap mask and K_00_19, loads the
+# next 64 bytes and byte-swaps the first 16 of them; otherwise control
+# jumps to .Ldone_avx.  Resets $Xi to 0 for the Xloop_avx calls that
+# follow.
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  # presumably the names the eval'ed snippets refer to (body_* is defined
+  # outside this view)
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+         &vpaddd       (@Tx[1],@Tx[1],@X[-1&7]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+         # VEX-encoded store, like every other X[]+K transfer in the AVX
+         # path; a legacy-SSE movdqa does not belong in this routine
+         &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+
+        foreach (@insns) { eval; }             # remaining instructions
+
+       &cmp    ($inp,$len);
+       &je     (".Ldone_avx");
+
+       unshift(@Tx,pop(@Tx));
+
+       &vmovdqa(@X[2],"64($K_XX_XX)");         # pbswap mask
+       &vmovdqa(@Tx[1],"0($K_XX_XX)");         # K_00_19
+       &vmovdqu(@X[-4&7],"0($inp)");           # load input
+       &vmovdqu(@X[-3&7],"16($inp)");
+       &vmovdqu(@X[-2&7],"32($inp)");
+       &vmovdqu(@X[-1&7],"48($inp)");
+       &vpshufb(@X[-4&7],@X[-4&7],@X[2]);      # byte swap
+       &add    ($inp,64);
+
+  $Xi=0;
+}
+
+# Xloop_avx($body): emitted on the "more input follows" path.  While the
+# scalar snippets of four rounds from the code ref $body are eval'ed, the
+# next input group is byte-swapped, group $Xi gets the K constant held in
+# @Tx[1] added, and the sum is stored to stack slot 16*$Xi for the integer
+# rounds of the next block.  Advances $Xi; @X is not rotated here.
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  # presumably the names the eval'ed snippets refer to (body_* is defined
+  # outside this view)
+  my ($a,$b,$c,$d,$e);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);      # X[]+K xfer to IALU
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       foreach (@insns) { eval; }
+  $Xi++;
+}
+
+# Xtail_avx($body): emitted on the "no more input" path.  Evaluates the
+# scalar snippets of four rounds from the code ref $body with no vector
+# instructions interleaved.
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);   # 32 instructions
+  # presumably the names the eval'ed snippets refer to (body_* is defined
+  # outside this view)
+  my ($a,$b,$c,$d,$e);
+
+       foreach (@insns) { eval; }
+}
+
+# Main loop of the AVX path: each iteration processes one 64-byte block.
+$code.=<<___;
+.align 16
+.Loop_avx:
+___
+# Sixteen schedule updates plus Xuplast, four rounds' worth of scalar
+# code each.
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_16_31(\&body_00_19);
+       &Xupdate_avx_32_79(\&body_00_19);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_40_59);
+       &Xupdate_avx_32_79(\&body_20_39);
+       &Xuplast_avx_80(\&body_20_39);  # can jump to "done"
+
+                               # Snapshot the generator state: the code after
+                               # .Ldone_avx is generated from this same point.
+                               $saved_j=$j; @saved_V=@V;
+                               $saved_r=$r; @saved_rndkey=@rndkey;
+
+# "More input" path: last rounds overlapped with preparing the next block.
+       &Xloop_avx(\&body_20_39);
+       &Xloop_avx(\&body_20_39);
+       &Xloop_avx(\&body_20_39);
+
+# Store the last ciphertext block of this chunk, advance the input
+# pointer, fold the working variables into the SHA-1 context and loop.
+$code.=<<___;
+       vmovups $iv,48($out,$in0)               # write output
+       lea     64($in0),$in0
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       add     12($ctx),$D
+       mov     $A,0($ctx)
+       add     16($ctx),$E
+       mov     @T[0],4($ctx)
+       mov     @T[0],$B                        # magic seed
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+       jmp     .Loop_avx
+
+.align 16
+.Ldone_avx:
+___
+                               # Rewind to the snapshot taken before Xloop_avx so
+                               # the tail continues from the same round/key state.
+                               $jj=$j=$saved_j; @V=@saved_V;
+                               $r=$saved_r;     @rndkey=@saved_rndkey;
+
+# "No more input" path: the same rounds without any look-ahead work.
+       &Xtail_avx(\&body_20_39);
+       &Xtail_avx(\&body_20_39);
+       &Xtail_avx(\&body_20_39);
+
+# Final stores: last ciphertext block, SHA-1 context and the updated IV
+# (through the $ivp pointer saved at 88(%rsp)); vzeroall then clears the
+# vector registers.
+$code.=<<___;
+       vmovups $iv,48($out,$in0)               # write output
+       mov     88(%rsp),$ivp                   # restore $ivp
+
+       add     0($ctx),$A                      # update context
+       add     4($ctx),@T[0]
+       add     8($ctx),$C
+       mov     $A,0($ctx)
+       add     12($ctx),$D
+       mov     @T[0],4($ctx)
+       add     16($ctx),$E
+       mov     $C,8($ctx)
+       mov     $D,12($ctx)
+       mov     $E,16($ctx)
+       vmovups $iv,($ivp)                      # write IV
+       vzeroall
+___
+# Win64 only: reload the xmm6-15 values spilled by the prologue.
+$code.=<<___ if ($win64);
+       movaps  96+0(%rsp),%xmm6
+       movaps  96+16(%rsp),%xmm7
+       movaps  96+32(%rsp),%xmm8
+       movaps  96+48(%rsp),%xmm9
+       movaps  96+64(%rsp),%xmm10
+       movaps  96+80(%rsp),%xmm11
+       movaps  96+96(%rsp),%xmm12
+       movaps  96+112(%rsp),%xmm13
+       movaps  96+128(%rsp),%xmm14
+       movaps  96+144(%rsp),%xmm15
+___
+# Epilogue: pop the frame and the six callee-saved GPRs in reverse push
+# order; .Lepilogue_avx is the upper bound used by the SEH unwind data.
+$code.=<<___;
+       lea     `104+($win64?10*16:0)`(%rsp),%rsi
+       mov     0(%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lepilogue_avx:
+       ret
+.size  aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+}
+$code.=<<___;
+.align 64
+K_XX_XX:
+.long  0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
+.long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
+.long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
+.long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap mask
+
+.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+
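+# Win64 structured-exception unwind handler, shared by the SSSE3 and AVX
+# entry points (both .xdata records below name it).  HandlerData[] holds
+# the RVAs of the function's prologue and epilogue labels; when the
+# faulting Rip lies between them, the handler copies the saved xmm6-15
+# and rbx, rbp, r12-r15 from the function's stack frame back into the
+# CONTEXT record before calling RtlVirtualUnwind.
+#
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)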
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+.type  ssse3_handler,\@abi-omnipotent
+.align 16
+ssse3_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
+
+       lea     96(%rax),%rsi
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$20,%ecx
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     `104+10*16`(%rax),%rax  # adjust stack pointer
+
+       mov     0(%rax),%r15
+       mov     8(%rax),%r14
+       mov     16(%rax),%r13
+       mov     24(%rax),%r12
+       mov     32(%rax),%rbp
+       mov     40(%rax),%rbx
+       lea     48(%rax),%rax
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+       mov     %r15,240($context)      # restore context->R15
+
+.Lcommon_seh_tail:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$154,%ecx              # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  ssse3_handler,.-ssse3_handler
+
+.section       .pdata
+.align 4
+       .rva    .LSEH_begin_aesni_cbc_sha1_enc_ssse3
+       .rva    .LSEH_end_aesni_cbc_sha1_enc_ssse3
+       .rva    .LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+       .rva    .LSEH_begin_aesni_cbc_sha1_enc_avx
+       .rva    .LSEH_end_aesni_cbc_sha1_enc_avx
+       .rva    .LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+.section       .xdata
+.align 8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+       .byte   9,0,0,0
+       .rva    ssse3_handler
+       .rva    .Lprologue_ssse3,.Lepilogue_ssse3       # HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+       .byte   9,0,0,0
+       .rva    ssse3_handler
+       .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
+___
+}
+
+####################################################################
+# rex(\@opcode,$dst,$src): append a REX prefix byte to the opcode byte
+# list when either register number is 8 or higher; nothing is appended
+# when both are below 8.  $dst sets bit 0x04, $src sets bit 0x01.
+sub rex {
+  my ($opcode,$dst,$src)=@_;
+  my $prefix=0;
+
+    $prefix|=0x04              if($dst>=8);
+    $prefix|=0x01              if($src>=8);
+    push(@$opcode,0x40|$prefix) if($prefix);
+}
+
+# aesni($line): translate a register-to-register "aesenc"/"aesenclast"
+# instruction in AT&T syntax into an equivalent ".byte" directive.
+# A line that does not look like "aesXXX %xmmN,%xmmM" is returned
+# unchanged; a matching line with any other aes* mnemonic yields undef.
+sub aesni {
+  my ($insn)=@_;
+  my %aes_opcode = (
+       "aesenc" => 0xdc,       "aesenclast" => 0xdd
+  );
+
+    my ($mnemonic,$src,$dst) =
+       ($insn=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/)
+       or return $insn;
+
+    return undef if (!defined($aes_opcode{$mnemonic}));
+
+    my @bytes=(0x66);
+    rex(\@bytes,$dst,$src);
+    push @bytes,0x0f,0x38,$aes_opcode{$mnemonic},
+               0xc0|($src&7)|(($dst&7)<<3);    # ModR/M
+    return ".byte\t".join(',',@bytes);
+}
+
+# Evaluate every `...` expression embedded in the generated text.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+# Replace aes* instructions with .byte sequences via aesni().  The \b
+# keeps the VEX mnemonics (vaes*) from matching; anything that follows
+# the last %xmmN on a matched line is dropped.
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;
index ee350a9..6b2dac1 100644 (file)
@@ -28,7 +28,8 @@ LIBSRC= encode.c digest.c evp_enc.c evp_key.c evp_acnf.c \
        bio_md.c bio_b64.c bio_enc.c evp_err.c e_null.c \
        c_all.c c_allc.c c_alld.c evp_lib.c bio_ok.c \
        evp_pkey.c evp_pbe.c p5_crpt.c p5_crpt2.c \
-       e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c
+       e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c \
+       e_aes_cbc_hmac_sha1.c e_rc4_hmac_md5.c
 
 LIBOBJ=        encode.o digest.o evp_enc.o evp_key.o evp_acnf.o \
        e_des.o e_bf.o e_idea.o e_des3.o e_camellia.o\
@@ -40,7 +41,8 @@ LIBOBJ=       encode.o digest.o evp_enc.o evp_key.o evp_acnf.o \
        bio_md.o bio_b64.o bio_enc.o evp_err.o e_null.o \
        c_all.o c_allc.o c_alld.o evp_lib.o bio_ok.o \
        evp_pkey.o evp_pbe.o p5_crpt.o p5_crpt2.o \
-       e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o
+       e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o \
+       e_aes_cbc_hmac_sha1.o e_rc4_hmac_md5.o
 
 SRC= $(LIBSRC)
 
@@ -195,6 +197,20 @@ e_aes.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
 e_aes.o: ../../include/openssl/rand.h ../../include/openssl/safestack.h
 e_aes.o: ../../include/openssl/stack.h ../../include/openssl/symhacks.h
 e_aes.o: ../modes/modes_lcl.h e_aes.c evp_locl.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/aes.h ../../include/openssl/asn1.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/bio.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/crypto.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/e_os2.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/evp.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/obj_mac.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/objects.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/opensslconf.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/opensslv.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/ossl_typ.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/safestack.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/sha.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/stack.h
+e_aes_cbc_hmac_sha1.o: ../../include/openssl/symhacks.h e_aes_cbc_hmac_sha1.c
 e_bf.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h
 e_bf.o: ../../include/openssl/blowfish.h ../../include/openssl/buffer.h
 e_bf.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
@@ -281,6 +297,17 @@ e_rc4.o: ../../include/openssl/opensslconf.h ../../include/openssl/opensslv.h
 e_rc4.o: ../../include/openssl/ossl_typ.h ../../include/openssl/rc4.h
 e_rc4.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
 e_rc4.o: ../../include/openssl/symhacks.h ../cryptlib.h e_rc4.c
+e_rc4_hmac_md5.o: ../../include/openssl/asn1.h ../../include/openssl/bio.h
+e_rc4_hmac_md5.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+e_rc4_hmac_md5.o: ../../include/openssl/evp.h ../../include/openssl/md5.h
+e_rc4_hmac_md5.o: ../../include/openssl/obj_mac.h
+e_rc4_hmac_md5.o: ../../include/openssl/objects.h
+e_rc4_hmac_md5.o: ../../include/openssl/opensslconf.h
+e_rc4_hmac_md5.o: ../../include/openssl/opensslv.h
+e_rc4_hmac_md5.o: ../../include/openssl/ossl_typ.h ../../include/openssl/rc4.h
+e_rc4_hmac_md5.o: ../../include/openssl/safestack.h
+e_rc4_hmac_md5.o: ../../include/openssl/stack.h
+e_rc4_hmac_md5.o: ../../include/openssl/symhacks.h e_rc4_hmac_md5.c
 e_rc5.o: ../../e_os.h ../../include/openssl/bio.h
 e_rc5.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
 e_rc5.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h
index 395a164..8765cfb 100644 (file)
@@ -98,6 +98,9 @@ void OpenSSL_add_all_ciphers(void)
 #ifndef OPENSSL_NO_RC4
        EVP_add_cipher(EVP_rc4());
        EVP_add_cipher(EVP_rc4_40());
+#ifndef OPENSSL_NO_MD5
+       EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
 #endif
 
 #ifndef OPENSSL_NO_IDEA
@@ -190,6 +193,10 @@ void OpenSSL_add_all_ciphers(void)
        EVP_add_cipher(EVP_aes_256_gcm());
        EVP_add_cipher_alias(SN_aes_256_cbc,"AES256");
        EVP_add_cipher_alias(SN_aes_256_cbc,"aes256");
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+       EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+       EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
 #endif
 
 #ifndef OPENSSL_NO_CAMELLIA
diff --git a/crypto/evp/e_aes_cbc_hmac_sha1.c b/crypto/evp/e_aes_cbc_hmac_sha1.c
new file mode 100644 (file)
index 0000000..78a0135
--- /dev/null
@@ -0,0 +1,403 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/aes.h>
+#include <openssl/sha.h>
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER      0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD         0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY      0x17
+#endif
+
+#if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1)
+#define EVP_CIPH_FLAG_DEFAULT_ASN1 0
+#endif
+
+#define TLS1_1_VERSION 0x0302
+
+/*
+ * Per-cipher-context state of the combined AES-CBC + HMAC-SHA1 cipher.
+ *   ks     AES key schedule filled in by the init_key routine.
+ *   head   SHA-1 state copied into md before payload data is hashed.
+ *   tail   SHA-1 state copied into md before the inner digest is hashed.
+ *          (head/tail are presumably the keyed inner/outer HMAC states,
+ *          set up outside the code visible here -- confirm.)
+ *   md     running SHA-1 state.
+ */
+typedef struct
+    {
+    AES_KEY            ks;
+    SHA_CTX            head,tail,md;
+    size_t             payload_length; /* AAD length in decrypt case */
+    union {
+       unsigned int    tls_ver;
+       unsigned char   tls_aad[16];    /* 13 used */
+    } aux;
+    } EVP_AES_HMAC_SHA1;
+
+#if    defined(AES_ASM) &&     ( \
+       defined(__x86_64)       || defined(__x86_64__)  || \
+       defined(_M_AMD64)       || defined(_M_X64)      || \
+       defined(__INTEL__)      )
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+#define AESNI_CAPABLE   (1<<(57-32))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+                             AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+                             AES_KEY *key);
+
+void aesni_cbc_encrypt(const unsigned char *in,
+                          unsigned char *out,
+                          size_t length,
+                          const AES_KEY *key,
+                          unsigned char *ivec, int enc);
+
+void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks,
+               const AES_KEY *key, unsigned char iv[16],
+               SHA_CTX *ctx,const void *in0);
+
+#define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data)
+
+/*
+ * Key-setup callback: expands inkey into an AES-NI encrypt or decrypt
+ * schedule (selected by enc) and resets all three SHA-1 states and the
+ * pending payload length.  The iv argument is not used here.
+ * Returns 1 on success, 0 when the key-schedule routine reports a
+ * negative result.
+ */
+static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx,
+                       const unsigned char *inkey,
+                       const unsigned char *iv, int enc)
+       {
+       EVP_AES_HMAC_SHA1 *key = data(ctx);
+       int ret = enc
+               ? aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks)
+               : aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks);
+
+       /* start all three states from a plain SHA-1 init; handy when
+        * benchmarking */
+       SHA1_Init(&key->head);
+       key->md = key->tail = key->head;
+
+       key->payload_length = 0;
+
+       return ret>=0;
+       }
+
+#define        STITCHED_CALL
+
+#if !defined(STITCHED_CALL)
+#define        aes_off 0
+#endif
+
+void sha1_block_data_order (void *c,const void *p,size_t len);
+
+/*
+ * Drop-in replacement for SHA1_Update() that hands whole 64-byte blocks
+ * straight to sha1_block_data_order().  A partially filled block buffer
+ * (c->num != 0) is topped up through the regular SHA1_Update() first,
+ * and a trailing partial block is buffered the same way.  For the bulk
+ * part the 64-bit message bit count in c->Nh:c->Nl is maintained here.
+ */
+static void sha1_update(SHA_CTX *c,const void *data,size_t len)
+{      const unsigned char *ptr = data;
+       size_t buffered = c->num;
+       size_t fill, tail, bulk, bits;
+
+       if (buffered != 0) {
+               /* complete, or at least extend, the pending block */
+               fill = SHA_CBLOCK-buffered;
+               if (fill > len) fill = len;
+               SHA1_Update (c,ptr,fill);
+               ptr += fill;
+               len -= fill;
+       }
+
+       tail = len % SHA_CBLOCK;
+       bulk = len - tail;
+
+       if (bulk != 0) {
+               sha1_block_data_order(c,ptr,bulk/SHA_CBLOCK);
+
+               ptr += bulk;
+               /* add bulk*8 to the bit counter, carrying into Nh */
+               bits = bulk<<3;
+               c->Nh += bulk>>29;
+               c->Nl += bits;
+               if (c->Nl<(unsigned int)bits) c->Nh++;
+       }
+
+       if (tail != 0)
+               SHA1_Update(c,ptr,tail);
+}
+
+#define SHA1_Update sha1_update
+
+static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                     const unsigned char *in, size_t len)
+       {
+       EVP_AES_HMAC_SHA1 *key = data(ctx);
+       unsigned int l;
+       size_t  plen = key->payload_length,
+               iv = 0,         /* explicit IV in TLS 1.1 and later */
+               sha_off = 0;
+#if defined(STITCHED_CALL)
+       size_t  aes_off = 0,
+               blocks;
+
+       sha_off = SHA_CBLOCK-key->md.num;
+#endif
+
+       if (len%AES_BLOCK_SIZE) return 0;
+
+       if (ctx->encrypt) {
+               if (plen==0)
+                       plen = len;
+               else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE))
+                       return 0;
+               else if (key->aux.tls_ver >= TLS1_1_VERSION)
+                       iv = AES_BLOCK_SIZE;
+
+#if defined(STITCHED_CALL)
+               if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) {
+                       SHA1_Update(&key->md,in+iv,sha_off);
+
+                       aesni_cbc_sha1_enc(in,out,blocks,&key->ks,
+                               ctx->iv,&key->md,in+iv+sha_off);
+                       blocks *= SHA_CBLOCK;
+                       aes_off += blocks;
+                       sha_off += blocks;
+                       key->md.Nh += blocks>>29;
+                       key->md.Nl += blocks<<=3;
+                       if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+               } else {
+                       sha_off = 0;
+               }
+#endif
+               sha_off += iv;
+               SHA1_Update(&key->md,in+sha_off,plen-sha_off);
+
+               if (plen!=len)  {       /* "TLS" mode of operation */
+                       if (in!=out)
+                               memcpy(out+aes_off,in+aes_off,plen-aes_off);
+
+                       /* calculate HMAC and append it to payload */
+                       SHA1_Final(out+plen,&key->md);
+                       key->md = key->tail;
+                       SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH);
+                       SHA1_Final(out+plen,&key->md);
+
+                       /* pad the payload|hmac */
+                       plen += SHA_DIGEST_LENGTH;
+                       for (l=len-plen-1;plen<len;plen++) out[plen]=l;
+                       /* encrypt HMAC|padding at once */
+                       aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off,
+                                       &key->ks,ctx->iv,1);
+               } else {
+                       aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off,
+                                       &key->ks,ctx->iv,1);
+               }
+       } else {
+               unsigned char mac[SHA_DIGEST_LENGTH];
+
+               /* decrypt HMAC|padding at once */
+               aesni_cbc_encrypt(in,out,len,
+                               &key->ks,ctx->iv,0);
+
+               if (plen) {     /* "TLS" mode of operation */
+                       /* figure out payload length */
+                       if (len<(out[len-1]+1+SHA_DIGEST_LENGTH))
+                               return 0;
+
+                       len -= (out[len-1]+1+SHA_DIGEST_LENGTH);
+
+                       if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3])
+                           >= TLS1_1_VERSION) {
+                               len -= AES_BLOCK_SIZE;
+                               iv = AES_BLOCK_SIZE;
+                       }
+
+                       key->aux.tls_aad[plen-2] = len>>8;
+                       key->aux.tls_aad[plen-1] = len;
+
+                       /* calculate HMAC and verify it */
+                       key->md = key->head;
+                       SHA1_Update(&key->md,key->aux.tls_aad,plen);
+                       SHA1_Update(&key->md,out+iv,len);
+                       SHA1_Final(mac,&key->md);
+
+                       key->md = key->tail;
+                       SHA1_Update(&key->md,mac,SHA_DIGEST_LENGTH);
+                       SHA1_Final(mac,&key->md);
+
+                       if (memcmp(out+iv+len,mac,SHA_DIGEST_LENGTH))
+                               return 0;
+               } else {
+                       SHA1_Update(&key->md,out,len);
+               }
+       }
+
+       key->payload_length = 0;
+
+       return 1;
+       }
+
+/*
+ * Control hook of the AES-CBC + HMAC-SHA1 composite cipher.
+ *
+ * EVP_CTRL_AEAD_SET_MAC_KEY
+ *     ptr/arg carry the HMAC key. A key longer than the 64-byte buffer is
+ *     replaced by its SHA1 digest; the zero-padded result is used to
+ *     precompute the inner (key->head) and outer (key->tail) hash states.
+ *     Returns 1, or 0 if arg is negative.
+ *
+ * EVP_CTRL_AEAD_TLS1_AAD
+ *     ptr/arg carry the record header; its last four bytes are read as a
+ *     16-bit version followed by a 16-bit length. Returns -1 if arg is too
+ *     short to hold those fields.
+ *     Encrypt: remembers the length, subtracts the explicit IV for
+ *     TLS 1.1 and later (writing the adjusted length back into ptr),
+ *     starts key->md from key->head and hashes the header. Returns the
+ *     number of bytes MAC and padding add to the payload, or 0 if the
+ *     record is too short to contain the explicit IV.
+ *     Decrypt: stores up to 13 header bytes for the cipher routine and
+ *     returns SHA_DIGEST_LENGTH.
+ *
+ * Any other type returns -1.
+ */
+static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+       {
+       EVP_AES_HMAC_SHA1 *key = data(ctx);
+
+       switch (type)
+               {
+       case EVP_CTRL_AEAD_SET_MAC_KEY:
+               {
+               unsigned int  i;
+               unsigned char hmac_key[64];
+
+               /* a negative length would turn into a huge size_t below */
+               if (arg < 0) return 0;
+
+               memset (hmac_key,0,sizeof(hmac_key));
+
+               /* compare as int: sizeof() would promote arg to unsigned */
+               if (arg > (int)sizeof(hmac_key)) {
+                       /* over-long key: use its digest instead */
+                       SHA1_Init(&key->head);
+                       SHA1_Update(&key->head,ptr,arg);
+                       SHA1_Final(hmac_key,&key->head);
+               } else {
+                       memcpy(hmac_key,ptr,arg);
+               }
+
+               for (i=0;i<sizeof(hmac_key);i++)
+                       hmac_key[i] ^= 0x36;            /* ipad */
+               SHA1_Init(&key->head);
+               SHA1_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+               /* buffer holds key^0x36, so this leaves key^0x5c */
+               for (i=0;i<sizeof(hmac_key);i++)
+                       hmac_key[i] ^= 0x36^0x5c;       /* opad */
+               SHA1_Init(&key->tail);
+               SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+               return 1;
+               }
+       case EVP_CTRL_AEAD_TLS1_AAD:
+               {
+               unsigned char *p=ptr;
+               unsigned int   len;
+
+               /* version and length occupy the last four bytes */
+               if (arg < 4) return -1;
+
+               len=p[arg-2]<<8|p[arg-1];
+
+               if (ctx->encrypt)
+                       {
+                       key->payload_length = len;
+                       if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) {
+                               if (len < AES_BLOCK_SIZE) {
+                                       /* no room for the explicit IV:
+                                        * refuse instead of letting len
+                                        * wrap, and forget the record */
+                                       key->payload_length = 0;
+                                       return 0;
+                               }
+                               len -= AES_BLOCK_SIZE;
+                               p[arg-2] = len>>8;
+                               p[arg-1] = len;
+                       }
+                       key->md = key->head;
+                       SHA1_Update(&key->md,p,arg);
+
+                       /* padded size of payload|MAC minus the payload */
+                       return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)
+                               - len);
+                       }
+               else
+                       {
+                       if (arg>13) arg = 13;
+                       memcpy(key->aux.tls_aad,ptr,arg);
+                       /* on decrypt payload_length carries the AAD size */
+                       key->payload_length = arg;
+
+                       return SHA_DIGEST_LENGTH;
+                       }
+               }
+       default:
+               return -1;
+               }
+       }
+
+/*
+ * Method table for the AES-128-CBC + HMAC-SHA1 composite cipher.
+ * The NID degrades to NID_undef when NID_aes_128_cbc_hmac_sha1 is not
+ * defined by the object headers.
+ */
+static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher =
+       {
+#ifdef NID_aes_128_cbc_hmac_sha1
+       NID_aes_128_cbc_hmac_sha1,
+#else
+       NID_undef,
+#endif
+       16,16,16,       /* presumably block size, key length, IV length - confirm against EVP_CIPHER */
+       EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+       aesni_cbc_hmac_sha1_init_key,
+       aesni_cbc_hmac_sha1_cipher,
+       NULL,
+       sizeof(EVP_AES_HMAC_SHA1),      /* size of the per-context state */
+       /* both ASN.1 hooks evaluate to NULL whenever
+        * EVP_CIPH_FLAG_DEFAULT_ASN1 is non-zero */
+       EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+       EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+       aesni_cbc_hmac_sha1_ctrl,
+       NULL
+       };
+
+/*
+ * Method table for the AES-256-CBC + HMAC-SHA1 composite cipher.
+ * Identical to the 128-bit table except for the NID and the second size
+ * field (32 instead of 16).
+ */
+static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher =
+       {
+#ifdef NID_aes_256_cbc_hmac_sha1
+       NID_aes_256_cbc_hmac_sha1,
+#else
+       NID_undef,
+#endif
+       16,32,16,       /* presumably block size, key length, IV length - confirm against EVP_CIPHER */
+       EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+       aesni_cbc_hmac_sha1_init_key,
+       aesni_cbc_hmac_sha1_cipher,
+       NULL,
+       sizeof(EVP_AES_HMAC_SHA1),      /* size of the per-context state */
+       /* both ASN.1 hooks evaluate to NULL whenever
+        * EVP_CIPH_FLAG_DEFAULT_ASN1 is non-zero */
+       EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+       EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+       aesni_cbc_hmac_sha1_ctrl,
+       NULL
+       };
+
+/*
+ * Returns the AES-128-CBC + HMAC-SHA1 method table when the AESNI_CAPABLE
+ * bit is set in OPENSSL_ia32cap_P[1], and NULL otherwise.
+ */
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+       {
+       if (OPENSSL_ia32cap_P[1]&AESNI_CAPABLE)
+               return &aesni_128_cbc_hmac_sha1_cipher;
+
+       return NULL;
+       }
+
+/*
+ * Returns the AES-256-CBC + HMAC-SHA1 method table when the AESNI_CAPABLE
+ * bit is set in OPENSSL_ia32cap_P[1], and NULL otherwise.
+ */
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+       {
+       if (OPENSSL_ia32cap_P[1]&AESNI_CAPABLE)
+               return &aesni_256_cbc_hmac_sha1_cipher;
+
+       return NULL;
+       }
+#else
+/*
+ * Fallback from the #else branch (the governing #if is not visible in
+ * this part of the file): the composite cipher is unavailable, so callers
+ * always get NULL.
+ */
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+       {
+       return NULL;
+       }
+/* Fallback counterpart for the 256-bit variant: always NULL. */
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+       {
+       return NULL;
+       }
+#endif
+#endif
diff --git a/crypto/evp/e_rc4_hmac_md5.c b/crypto/evp/e_rc4_hmac_md5.c
new file mode 100644 (file)
index 0000000..1fa2aa2
--- /dev/null
@@ -0,0 +1,292 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/rc4.h>
+#include <openssl/md5.h>
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER      0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD         0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY      0x17
+#endif
+
+/* FIXME: surely this is available elsewhere? */
+#define EVP_RC4_KEY_SIZE               16
+
+/* Per-context state of the RC4 + HMAC-MD5 composite cipher. */
+typedef struct
+    {
+    RC4_KEY            ks;             /* RC4 key schedule, set by the init_key hook */
+    MD5_CTX            head,tail,md;   /* head/tail: MD5 states after hashing the
+                                        * ipad/opad-masked key (EVP_CTRL_AEAD_SET_MAC_KEY);
+                                        * md: running digest of the current record */
+    size_t             payload_length; /* set by EVP_CTRL_AEAD_TLS1_AAD, reset to 0
+                                        * after a successful cipher call; 0 means
+                                        * no TLS record is pending */
+    } EVP_RC4_HMAC_MD5;
+
+void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out,
+               MD5_CTX *ctx,const void *inp,size_t blocks);
+
+#define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data)
+
+/*
+ * Key-setup hook: builds the RC4 key schedule from inkey (using the key
+ * length configured on ctx) and resets all three MD5 states and the pending
+ * payload length. iv and enc are not used. Always returns 1.
+ */
+static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx,
+                       const unsigned char *inkey,
+                       const unsigned char *iv, int enc)
+       {
+       EVP_RC4_HMAC_MD5 *state = data(ctx);
+       const int key_bytes = EVP_CIPHER_CTX_key_length(ctx);
+
+       /* no TLS record is pending yet */
+       state->payload_length = 0;
+
+       /* give head, tail and md a valid initial MD5 state even before a
+        * MAC key is installed (handy when benchmarking) */
+       MD5_Init(&state->head);
+       state->md = state->tail = state->head;
+
+       RC4_set_key(&state->ks,key_bytes,inkey);
+
+       return 1;
+       }
+
+#if    !defined(OPENSSL_NO_ASM) &&     ( \
+       defined(__x86_64)       || defined(__x86_64__)  || \
+       defined(_M_AMD64)       || defined(_M_X64)      || \
+       defined(__INTEL__)              )
+#define        STITCHED_CALL
+#endif
+
+#if !defined(STITCHED_CALL)
+#define        rc4_off 0
+#define        md5_off 0
+#endif
+
+/*
+ * Cipher body of the RC4 + HMAC-MD5 composite.
+ *
+ * When key->payload_length is non-zero ("TLS" mode) len must equal the
+ * payload length plus MD5_DIGEST_LENGTH:
+ *   encrypt: hashes the payload, appends the HMAC and RC4-encrypts
+ *            payload|HMAC into out;
+ *   decrypt: RC4-decrypts in into out, recomputes the HMAC over the
+ *            payload and compares it with the trailing MAC.
+ * When payload_length is zero the data is simply run through RC4 and fed
+ * to the running MD5 state (plaintext is hashed in both directions).
+ *
+ * With STITCHED_CALL the bulk of the data goes through rc4_md5_enc();
+ * rc4_off/md5_off track how many bytes RC4 resp. MD5 have consumed.
+ *
+ * Returns 1 on success, 0 on a length mismatch or a MAC failure.
+ */
+static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                     const unsigned char *in, size_t len)
+       {
+       EVP_RC4_HMAC_MD5 *key = data(ctx);
+#if defined(STITCHED_CALL)
+       size_t  rc4_off = 32-1-(key->ks.x&(32-1)),      /* 32 is $MOD from rc4_md5-x86_64.pl */
+               md5_off = MD5_CBLOCK-key->md.num,       /* bytes left in the buffered MD5 block */
+               blocks;
+       unsigned int l;
+#endif
+       size_t  plen = key->payload_length;
+
+       /* reject a payload length whose sum with the MAC size would wrap
+        * around size_t before comparing it with len */
+       if (plen && (plen>(~(size_t)0-MD5_DIGEST_LENGTH) ||
+                    len!=(plen+MD5_DIGEST_LENGTH))) return 0;
+
+       if (ctx->encrypt) {
+               if (plen==0) plen = len;
+#if defined(STITCHED_CALL)
+               /* cipher has to "fall behind" */
+               if (rc4_off>md5_off) md5_off+=MD5_CBLOCK;
+
+               if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK)) {
+                       /* bring both algorithms to their starting offsets */
+                       MD5_Update(&key->md,in,md5_off);
+                       RC4(&key->ks,rc4_off,in,out);
+
+                       rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+                               &key->md,in+md5_off,blocks);
+                       blocks *= MD5_CBLOCK;
+                       rc4_off += blocks;
+                       md5_off += blocks;
+                       /* add the stitched bytes to the MD5 bit counter
+                        * (presumably rc4_md5_enc leaves Nl/Nh alone) */
+                       key->md.Nh += blocks>>29;
+                       key->md.Nl += blocks<<=3;
+                       if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+               } else {
+                       rc4_off = 0;
+                       md5_off = 0;
+               }
+#endif
+               MD5_Update(&key->md,in+md5_off,plen-md5_off);
+
+               if (plen!=len) {        /* "TLS" mode of operation */
+                       if (in!=out)
+                               memcpy(out+rc4_off,in+rc4_off,plen-rc4_off);
+
+                       /* calculate HMAC and append it to payload */
+                       MD5_Final(out+plen,&key->md);
+                       key->md = key->tail;
+                       MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH);
+                       MD5_Final(out+plen,&key->md);
+                       /* encrypt HMAC at once */
+                       RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off);
+               } else {
+                       RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+               }
+       } else {
+               unsigned char mac[MD5_DIGEST_LENGTH];
+#if defined(STITCHED_CALL)
+               /* digest has to "fall behind" */
+               if (md5_off>rc4_off)    rc4_off += 2*MD5_CBLOCK;
+               else                    rc4_off += MD5_CBLOCK;
+
+               if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK)) {
+                       /* decrypt ahead so MD5 always reads plaintext */
+                       RC4(&key->ks,rc4_off,in,out);
+                       MD5_Update(&key->md,out,md5_off);
+
+                       rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+                               &key->md,out+md5_off,blocks);
+                       blocks *= MD5_CBLOCK;
+                       rc4_off += blocks;
+                       md5_off += blocks;
+                       /* add the stitched bytes to the MD5 bit counter */
+                       l = (key->md.Nl+(blocks<<3))&0xffffffffU;
+                       if (l<key->md.Nl) key->md.Nh++;
+                       key->md.Nl  = l;
+                       key->md.Nh += blocks>>29;
+               } else {
+                       md5_off=0;
+                       rc4_off=0;
+               }
+#endif
+               /* decrypt HMAC at once */
+               RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+               if (plen) {     /* "TLS" mode of operation */
+                       unsigned int  i;
+                       unsigned char diff = 0;
+
+                       MD5_Update(&key->md,out+md5_off,plen-md5_off);
+
+                       /* calculate HMAC and verify it */
+                       MD5_Final(mac,&key->md);
+                       key->md = key->tail;
+                       MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH);
+                       MD5_Final(mac,&key->md);
+
+                       /* accumulate all differences so the comparison
+                        * time does not reveal the first bad byte */
+                       for (i=0;i<MD5_DIGEST_LENGTH;i++)
+                               diff |= out[plen+i]^mac[i];
+                       if (diff)
+                               return 0;
+               } else {
+                       MD5_Update(&key->md,out+md5_off,len-md5_off);
+               }
+       }
+
+       key->payload_length = 0;
+
+       return 1;
+       }
+
+/*
+ * Control hook of the RC4 + HMAC-MD5 composite cipher.
+ *
+ * EVP_CTRL_AEAD_SET_MAC_KEY
+ *     ptr/arg carry the HMAC key. A key longer than the 64-byte buffer is
+ *     replaced by its MD5 digest; the zero-padded result is used to
+ *     precompute the inner (key->head) and outer (key->tail) hash states.
+ *     Returns 1, or 0 if arg is negative.
+ *
+ * EVP_CTRL_AEAD_TLS1_AAD
+ *     ptr/arg carry the record header, whose last two bytes are the
+ *     big-endian record length. On decrypt the MAC size is subtracted and
+ *     the adjusted length written back into ptr. The length is remembered
+ *     in key->payload_length, key->md is restarted from key->head and the
+ *     header is hashed. Returns MD5_DIGEST_LENGTH.
+ *     Returns -1 if arg is too short to hold the length field or, on
+ *     decrypt, if the record is shorter than the MAC. In both cases
+ *     payload_length is set to a value no real buffer length can match,
+ *     so a following cipher call fails even when the caller ignores this
+ *     return value.
+ *
+ * Any other type returns -1.
+ */
+static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+       {
+       EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+       switch (type)
+               {
+       case EVP_CTRL_AEAD_SET_MAC_KEY:
+               {
+               unsigned int  i;
+               unsigned char hmac_key[64];
+
+               /* a negative length would turn into a huge size_t below */
+               if (arg < 0) return 0;
+
+               memset (hmac_key,0,sizeof(hmac_key));
+
+               /* compare as int: sizeof() would promote arg to unsigned */
+               if (arg > (int)sizeof(hmac_key)) {
+                       /* over-long key: use its digest instead */
+                       MD5_Init(&key->head);
+                       MD5_Update(&key->head,ptr,arg);
+                       MD5_Final(hmac_key,&key->head);
+               } else {
+                       memcpy(hmac_key,ptr,arg);
+               }
+
+               for (i=0;i<sizeof(hmac_key);i++)
+                       hmac_key[i] ^= 0x36;            /* ipad */
+               MD5_Init(&key->head);
+               MD5_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+               /* buffer holds key^0x36, so this leaves key^0x5c */
+               for (i=0;i<sizeof(hmac_key);i++)
+                       hmac_key[i] ^= 0x36^0x5c;       /* opad */
+               MD5_Init(&key->tail);
+               MD5_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+               return 1;
+               }
+       case EVP_CTRL_AEAD_TLS1_AAD:
+               {
+               unsigned char *p=ptr;
+               unsigned int   len;
+
+               /* the length field occupies the last two bytes */
+               if (arg < 2)
+                       {
+                       /* plen+MD5_DIGEST_LENGTH becomes the maximum
+                        * size_t, which no buffer length can equal */
+                       key->payload_length = ~(size_t)0-MD5_DIGEST_LENGTH;
+                       return -1;
+                       }
+
+               len=p[arg-2]<<8|p[arg-1];
+
+               if (!ctx->encrypt)
+                       {
+                       if (len < MD5_DIGEST_LENGTH)
+                               {
+                               /* record cannot even hold the MAC: fail
+                                * closed instead of letting len wrap */
+                               key->payload_length = ~(size_t)0-MD5_DIGEST_LENGTH;
+                               return -1;
+                               }
+                       len -= MD5_DIGEST_LENGTH;
+                       p[arg-2] = len>>8;
+                       p[arg-1] = len;
+                       }
+               key->payload_length=len;
+               key->md = key->head;
+               MD5_Update(&key->md,p,arg);
+
+               return MD5_DIGEST_LENGTH;
+               }
+       default:
+               return -1;
+               }
+       }
+
+/*
+ * Method table for the RC4 + HMAC-MD5 composite cipher.
+ * The NID degrades to NID_undef when NID_rc4_hmac_md5 is not defined by
+ * the object headers.
+ */
+static EVP_CIPHER r4_hmac_md5_cipher=
+       {
+#ifdef NID_rc4_hmac_md5
+       NID_rc4_hmac_md5,
+#else
+       NID_undef,
+#endif
+       1,EVP_RC4_KEY_SIZE,0,   /* presumably block size, key length, IV length - confirm against EVP_CIPHER */
+       EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER,
+       rc4_hmac_md5_init_key,
+       rc4_hmac_md5_cipher,
+       NULL,
+       sizeof(EVP_RC4_HMAC_MD5),       /* size of the per-context state */
+       NULL,
+       NULL,
+       rc4_hmac_md5_ctrl,
+       NULL
+       };
+
+/* Accessor for the RC4 + HMAC-MD5 method table; never returns NULL. */
+const EVP_CIPHER *EVP_rc4_hmac_md5(void)
+       {
+       const EVP_CIPHER *method = &r4_hmac_md5_cipher;
+
+       return method;
+       }
+#endif
index d6cf616..82e762f 100644 (file)
@@ -766,6 +766,9 @@ const EVP_MD *EVP_dev_crypto_md5(void);
 #ifndef OPENSSL_NO_RC4
 const EVP_CIPHER *EVP_rc4(void);
 const EVP_CIPHER *EVP_rc4_40(void);
+#ifndef OPENSSL_NO_MD5
+const EVP_CIPHER *EVP_rc4_hmac_md5(void);
+#endif
 #endif
 #ifndef OPENSSL_NO_IDEA
 const EVP_CIPHER *EVP_idea_ecb(void);
@@ -837,6 +840,10 @@ const EVP_CIPHER *EVP_aes_256_ctr(void);
 const EVP_CIPHER *EVP_aes_256_ccm(void);
 const EVP_CIPHER *EVP_aes_256_gcm(void);
 const EVP_CIPHER *EVP_aes_256_xts(void);
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void);
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void);
+#endif
 #endif
 #ifndef OPENSSL_NO_CAMELLIA
 const EVP_CIPHER *EVP_camellia_128_ecb(void);
index f2869f5..33dea16 100644 (file)
@@ -66,6 +66,8 @@ int EVP_add_cipher(const EVP_CIPHER *c)
        {
        int r;
 
+       if (c == NULL) return(0);
+
        r=OBJ_NAME_add(OBJ_nid2sn(c->nid),OBJ_NAME_TYPE_CIPHER_METH,(const char *)c);
        if (r == 0) return(0);
        check_defer(c->nid);
index 8d1100b..7dd278f 100644 (file)
@@ -62,9 +62,9 @@
  * [including the GNU Public Licence.]
  */
 
-#define NUM_NID 915
-#define NUM_SN 908
-#define NUM_LN 908
+#define NUM_NID 919
+#define NUM_SN 912
+#define NUM_LN 912
 #define NUM_OBJ 856
 
 static const unsigned char lvalues[5971]={
@@ -2397,11 +2397,19 @@ static const ASN1_OBJECT nid_objs[NUM_NID]={
 {"RSASSA-PSS","rsassaPss",NID_rsassaPss,9,&(lvalues[5961]),0},
 {"AES-128-XTS","aes-128-xts",NID_aes_128_xts,0,NULL,0},
 {"AES-256-XTS","aes-256-xts",NID_aes_256_xts,0,NULL,0},
+{"RC4-HMAC-MD5","rc4-hmac-md5",NID_rc4_hmac_md5,0,NULL,0},
+{"AES-128-CBC-HMAC-SHA1","aes-128-cbc-hmac-sha1",
+       NID_aes_128_cbc_hmac_sha1,0,NULL,0},
+{"AES-192-CBC-HMAC-SHA1","aes-192-cbc-hmac-sha1",
+       NID_aes_192_cbc_hmac_sha1,0,NULL,0},
+{"AES-256-CBC-HMAC-SHA1","aes-256-cbc-hmac-sha1",
+       NID_aes_256_cbc_hmac_sha1,0,NULL,0},
 };
 
 static const unsigned int sn_objs[NUM_SN]={
 364,   /* "AD_DVCS" */
 419,   /* "AES-128-CBC" */
+916,   /* "AES-128-CBC-HMAC-SHA1" */
 421,   /* "AES-128-CFB" */
 650,   /* "AES-128-CFB1" */
 653,   /* "AES-128-CFB8" */
@@ -2410,6 +2418,7 @@ static const unsigned int sn_objs[NUM_SN]={
 420,   /* "AES-128-OFB" */
 913,   /* "AES-128-XTS" */
 423,   /* "AES-192-CBC" */
+917,   /* "AES-192-CBC-HMAC-SHA1" */
 425,   /* "AES-192-CFB" */
 651,   /* "AES-192-CFB1" */
 654,   /* "AES-192-CFB8" */
@@ -2417,6 +2426,7 @@ static const unsigned int sn_objs[NUM_SN]={
 422,   /* "AES-192-ECB" */
 424,   /* "AES-192-OFB" */
 427,   /* "AES-256-CBC" */
+918,   /* "AES-256-CBC-HMAC-SHA1" */
 429,   /* "AES-256-CFB" */
 652,   /* "AES-256-CFB1" */
 655,   /* "AES-256-CFB8" */
@@ -2540,6 +2550,7 @@ static const unsigned int sn_objs[NUM_SN]={
 40,    /* "RC2-OFB" */
  5,    /* "RC4" */
 97,    /* "RC4-40" */
+915,   /* "RC4-HMAC-MD5" */
 120,   /* "RC5-CBC" */
 122,   /* "RC5-CFB" */
 121,   /* "RC5-ECB" */
@@ -3455,6 +3466,7 @@ static const unsigned int ln_objs[NUM_LN]={
 364,   /* "ad dvcs" */
 606,   /* "additional verification" */
 419,   /* "aes-128-cbc" */
+916,   /* "aes-128-cbc-hmac-sha1" */
 896,   /* "aes-128-ccm" */
 421,   /* "aes-128-cfb" */
 650,   /* "aes-128-cfb1" */
@@ -3465,6 +3477,7 @@ static const unsigned int ln_objs[NUM_LN]={
 420,   /* "aes-128-ofb" */
 913,   /* "aes-128-xts" */
 423,   /* "aes-192-cbc" */
+917,   /* "aes-192-cbc-hmac-sha1" */
 899,   /* "aes-192-ccm" */
 425,   /* "aes-192-cfb" */
 651,   /* "aes-192-cfb1" */
@@ -3474,6 +3487,7 @@ static const unsigned int ln_objs[NUM_LN]={
 898,   /* "aes-192-gcm" */
 424,   /* "aes-192-ofb" */
 427,   /* "aes-256-cbc" */
+918,   /* "aes-256-cbc-hmac-sha1" */
 902,   /* "aes-256-ccm" */
 429,   /* "aes-256-cfb" */
 652,   /* "aes-256-cfb1" */
@@ -3978,6 +3992,7 @@ static const unsigned int ln_objs[NUM_LN]={
 40,    /* "rc2-ofb" */
  5,    /* "rc4" */
 97,    /* "rc4-40" */
+915,   /* "rc4-hmac-md5" */
 120,   /* "rc5-cbc" */
 122,   /* "rc5-cfb" */
 121,   /* "rc5-ecb" */
index 6de8c70..02fc409 100644 (file)
 #define LN_cmac                "cmac"
 #define NID_cmac               894
 
+#define SN_rc4_hmac_md5                "RC4-HMAC-MD5"
+#define LN_rc4_hmac_md5                "rc4-hmac-md5"
+#define NID_rc4_hmac_md5               915
+
+#define SN_aes_128_cbc_hmac_sha1               "AES-128-CBC-HMAC-SHA1"
+#define LN_aes_128_cbc_hmac_sha1               "aes-128-cbc-hmac-sha1"
+#define NID_aes_128_cbc_hmac_sha1              916
+
+#define SN_aes_192_cbc_hmac_sha1               "AES-192-CBC-HMAC-SHA1"
+#define LN_aes_192_cbc_hmac_sha1               "aes-192-cbc-hmac-sha1"
+#define NID_aes_192_cbc_hmac_sha1              917
+
+#define SN_aes_256_cbc_hmac_sha1               "AES-256-CBC-HMAC-SHA1"
+#define LN_aes_256_cbc_hmac_sha1               "aes-256-cbc-hmac-sha1"
+#define NID_aes_256_cbc_hmac_sha1              918
+
index cbd77f3..a50aa57 100644 (file)
@@ -912,3 +912,7 @@ mgf1                911
 rsassaPss              912
 aes_128_xts            913
 aes_256_xts            914
+rc4_hmac_md5           915
+aes_128_cbc_hmac_sha1          916
+aes_192_cbc_hmac_sha1          917
+aes_256_cbc_hmac_sha1          918
index 1bf3ad6..183806e 100644 (file)
@@ -1283,3 +1283,9 @@ kisa 1 6                : SEED-OFB      : seed-ofb
                        : HMAC                          : hmac
 # Nor CMAC either
                        : CMAC                          : cmac
+
+# Synthetic composite ciphersuites
+                       : RC4-HMAC-MD5                  : rc4-hmac-md5
+                       : AES-128-CBC-HMAC-SHA1         : aes-128-cbc-hmac-sha1
+                       : AES-192-CBC-HMAC-SHA1         : aes-192-cbc-hmac-sha1
+                       : AES-256-CBC-HMAC-SHA1         : aes-256-cbc-hmac-sha1
index 3e17089..3f5aaea 100644 (file)
@@ -46,6 +46,8 @@ rc4-586.s:    asm/rc4-586.pl ../perlasm/x86asm.pl
 
 rc4-x86_64.s: asm/rc4-x86_64.pl
        $(PERL) asm/rc4-x86_64.pl $(PERLASM_SCHEME) > $@
+rc4-md5-x86_64.s:      asm/rc4-md5-x86_64.pl
+       $(PERL) asm/rc4-md5-x86_64.pl $(PERLASM_SCHEME) > $@
 
 rc4-ia64.S: asm/rc4-ia64.pl
        $(PERL) asm/rc4-ia64.pl $(CFLAGS) > $@
diff --git a/crypto/rc4/asm/rc4-md5-x86_64.pl b/crypto/rc4/asm/rc4-md5-x86_64.pl
new file mode 100644 (file)
index 0000000..7f68409
--- /dev/null
@@ -0,0 +1,631 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# June 2011
+#
+# This is RC4+MD5 "stitch" implementation. The idea, as spelled in
+# http://download.intel.com/design/intarch/papers/323686.pdf, is that
+# since both algorithms exhibit instruction-level parallelism, ILP,
+# below theoretical maximum, interleaving them would allow to utilize
+# processor resources better and achieve better performance. RC4
+# instruction sequence is virtually identical to rc4-x86_64.pl, which
+# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin
+# and Jim Guilford of Intel. MD5 is fresh implementation aiming to
+# minimize register usage, which was used as "main thread" with RC4
+# weaved into it, one RC4 round per one MD5 round. In addition to the
+# stitched subroutine the script can generate standalone replacements
+# for md5_block_asm_data_order and RC4. Below are performance numbers in
+# cycles per processed byte, less is better, for the standalone
+# subroutines, their sum, and the stitched one:
+#
+#              RC4     MD5     RC4+MD5 stitch  gain
+# Opteron      6.5(*)  5.4     11.9    7.0     +70%(*)
+# Core2                6.5     5.8     12.3    7.7     +60%
+# Westmere     4.3     5.2     9.5     7.0     +36%
+# Sandy Bridge 4.2     5.5     9.7     6.8     +43%
+# Atom         9.3     6.5     15.8    11.1    +42%
+#
+# (*)  rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
+#      is +53%...
+
+my ($rc4,$md5)=(1,1);  # what to generate?
+
+# If set to "#", MD5 is stitched into RC4(), but its result is
+# discarded. Idea here is to be able to use 'openssl speed rc4' for
+# benchmarking the stitched subroutine...
+# Declared unconditionally: "my $x = ... if COND" has undefined
+# behaviour according to perlsyn.
+my $D;
+$D="#" if (!$md5);
+
+# Command line is "[flavour] output"; a lone argument containing a dot
+# is taken to be the output file name.
+my $flavour = shift;
+my $output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Look for the translator next to this script, then in ../../perlasm.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Everything printed from here on is piped through the translator.
+# Interpreter and script paths are quoted so that directories with
+# spaces work, and a failure to start the pipe is fatal instead of
+# silently producing no output.
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
+    or die "can't call $xlate: $!";
+
+# Logical parameters of the generated routine, plus its name and
+# argument count. Which of them are bound depends on what is generated.
+my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);
+
+if ($rc4 && !$md5) {
+  # standalone RC4: key, length, input, output
+  $func  = "RC4";
+  $nargs = 4;
+  ($dat,$len,$in0,$out) = qw(%rdi %rsi %rdx %rcx);
+}
+elsif ($md5 && !$rc4) {
+  # standalone MD5 block function: context, input, length
+  $func  = "md5_block_asm_data_order";
+  $nargs = 3;
+  ($ctx,$inp,$len) = qw(%rdi %rsi %rdx);
+}
+else {
+  # stitched routine:
+  #
+  # void rc4_md5_enc(
+  #            RC4_KEY *key,           #
+  #            const void *in0,        # RC4 input
+  #            void *out,              # RC4 output
+  #            MD5_CTX *ctx,           #
+  #            const void *inp,        # MD5 input
+  #            size_t len);            # number of 64-byte blocks
+  $func  = "rc4_md5_enc";
+  $nargs = 6;
+  ($dat,$in0,$out,$ctx,$inp,$len) = qw(%rdi %rsi %rdx %rcx %r8 %r9);
+}
+
+# The 64 per-round additive constants of MD5 (table T of RFC 1321),
+# sixteen per group; the round generators emit them as \$$K[$i].
+my @K=(        0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
+       0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
+       0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
+       0x6b901122,0xfd987193,0xa679438e,0x49b40821,
+
+       0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
+       0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
+       0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
+       0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
+
+       0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
+       0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
+       0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
+       0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
+
+       0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
+       0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
+       0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
+       0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391     );
+
+# Registers holding the four 32-bit MD5 state words, plus the scratch
+# register used while evaluating the round function.
+my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers
+my $tmp="%r12d";
+
+# $XX[0] is loaded with the RC4 x index and $XX[1] with the address
+# ($dat,$XX[0],4); @TX receive values read from the state table; $YY is
+# loaded with the y index and $TY is a temporary for the swap.
+my @XX=("%rbp","%rsi");                        # RC4 registers
+my @TX=("%rax","%rbx");
+my $YY="%rcx";
+my $TY="%rdx";
+
+# The x index is advanced in strides of $MOD by the round generators.
+# NOTE(review): e_rc4_hmac_md5.c hard-codes 32 for its alignment
+# computation and refers to this variable - keep the two in sync.
+my $MOD=32;                            # 16, 32 or 64
+
+# Emit the function header: return early through .Labort when $len is
+# zero, save %rbx, %rbp and %r12-%r15, and reserve 40 bytes of stack.
+# Text inside the heredocs is generated assembly; lines tagged #md5# or
+# #rc4# are presumably selected by a later pass that is not part of this
+# section.
+$code.=<<___;
+.text
+.align 16
+
+.globl $func
+.type  $func,\@function,$nargs
+$func:
+       cmp     \$0,$len
+       je      .Labort
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       sub     \$40,%rsp
+.Lbody:
+___
+# RC4 needs the argument registers for its own state, so the arguments
+# are first copied to %r11-%r15.
+if ($rc4) {
+$code.=<<___;
+$D#md5#        mov     $ctx,%r11               # reassign arguments
+       mov     $len,%r12
+       mov     $in0,%r13
+       mov     $out,%r14
+$D#md5#        mov     $inp,%r15
+___
+    # Rebind the Perl-side names to the registers just loaded. Without
+    # MD5 the "MD5 input" is simply the RC4 input.
+    $ctx="%r11"        if ($md5);              # reassign arguments
+    $len="%r12";
+    $in0="%r13";
+    $out="%r14";
+    $inp="%r15"        if ($md5);
+    $inp=$in0  if (!$md5);
+# Load x and y from the two words at the start of the key structure,
+# step $dat past them, pre-increment x, turn $out into an offset relative
+# to $in0, and preload the table entry at x.
+$code.=<<___;
+       xor     $XX[0],$XX[0]
+       xor     $YY,$YY
+
+       lea     8($dat),$dat
+       mov     -8($dat),$XX[0]#b
+       mov     -4($dat),$YY#b
+
+       inc     $XX[0]#b
+       sub     $in0,$out
+       movl    ($dat,$XX[0],4),$TX[0]#d
+___
+# Standalone RC4 only: inputs below 128 bytes branch to .Loop1; otherwise
+# bytes are processed one at a time until x is a multiple of $MOD, then
+# the original length is saved and converted to a count of 64-byte blocks.
+$code.=<<___ if (!$md5);
+       xor     $TX[1],$TX[1]
+       test    \$-128,$len
+       jz      .Loop1
+       sub     $XX[0],$TX[1]
+       and     \$`$MOD-1`,$TX[1]
+       jz      .Loop${MOD}_is_hot
+       sub     $TX[1],$len
+.Loop${MOD}_warmup:
+       add     $TX[0]#b,$YY#b
+       movl    ($dat,$YY,4),$TY#d
+       movl    $TX[0]#d,($dat,$YY,4)
+       movl    $TY#d,($dat,$XX[0],4)
+       add     $TY#b,$TX[0]#b
+       inc     $XX[0]#b
+       movl    ($dat,$TX[0],4),$TY#d
+       movl    ($dat,$XX[0],4),$TX[0]#d
+       xorb    ($in0),$TY#b
+       movb    $TY#b,($out,$in0)
+       lea     1($in0),$in0
+       dec     $TX[1]
+       jnz     .Loop${MOD}_warmup
+
+       mov     $YY,$TX[1]
+       xor     $YY,$YY
+       mov     $TX[1]#b,$YY#b
+
+.Loop${MOD}_is_hot:
+       mov     $len,32(%rsp)           # save original $len
+       shr     \$6,$len                # number of 64-byte blocks
+___
+  # Benchmark variant: MD5 is generated after all, with the stack as its
+  # context and the RC4 input as its data.
+  if ($D && !$md5) {                   # stitch in dummy MD5
+    $md5=1;
+    $ctx="%r11";
+    $inp="%r15";
+    $code.=<<___;
+       mov     %rsp,$ctx
+       mov     $in0,$inp
+___
+  }
+}
+# Common loop set-up: turn the block count into an end-of-input pointer
+# kept at 16(%rsp), keep the MD5_CTX pointer at 24(%rsp), load the four
+# hash words, and at the top of .Loop put them aside at 0..15(%rsp).
+$code.=<<___;
+#rc4#  add     $TX[0]#b,$YY#b
+#rc4#  lea     ($dat,$XX[0],4),$XX[1]
+       shl     \$6,$len
+       add     $inp,$len               # pointer to the end of input
+       mov     $len,16(%rsp)
+
+#md5#  mov     $ctx,24(%rsp)           # save pointer to MD5_CTX
+#md5#  mov     0*4($ctx),$V[0]         # load current hash value from MD5_CTX
+#md5#  mov     1*4($ctx),$V[1]
+#md5#  mov     2*4($ctx),$V[2]
+#md5#  mov     3*4($ctx),$V[3]
+       jmp     .Loop
+
+.align 16
+.Loop:
+#md5#  mov     $V[0],0*4(%rsp)         # put aside current hash value
+#md5#  mov     $V[1],1*4(%rsp)
+#md5#  mov     $V[2],2*4(%rsp)
+#md5#  mov     $V[3],$tmp              # forward reference
+#md5#  mov     $V[3],3*4(%rsp)
+___
+
+# R0($i,$a,$b,$c,$d) appends to $code one step of the first group of MD5
+# rounds interleaved with one RC4 step.
+#   $i           round number; $j=$i%16 selects message word 4*$j($inp)
+#                and the rotate count, $k=$i%$MOD the slot relative to
+#                $XX[1]
+#   $a,$b,$c,$d  registers holding the MD5 state words for this round
+# MD5 part: $tmp ^= $c; $tmp &= $b; $tmp ^= $d, then
+# $a = rol($a + word + $K[$i] + $tmp, rot) + $b. $tmp is expected to have
+# been preloaded by the preceding "forward reference" move (presumably
+# with $d - confirm against the caller).
+# RC4 part: swaps the table entries at x and y and inserts the selected
+# table entry into %xmm0 (even $j) or %xmm1 (odd $j) with pinsrw. On the
+# last step of a group ($j==15) the 16 input bytes at ($in0) are loaded
+# into %xmm2 and XORed with %xmm0 and with %xmm1 shifted left by 8 bits.
+sub R0 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot0=(7,12,17,22);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="   movdqu  ($in0),%xmm2\n"         if ($rc4 && $j==15);
+    # end of a $MOD-step stride: advance the x index
+    $code.="   add     \$$MOD,$XX[0]#b\n"      if ($rc4 && $j==15 && $k==$MOD-1);
+    # first use of each accumulator in this group: clear it
+    $code.="   pxor    $xmm,$xmm\n"            if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#  movl    ($dat,$YY,4),$TY#d
+#md5#  xor     $c,$tmp
+#rc4#  movl    $TX[0]#d,($dat,$YY,4)
+#md5#  and     $b,$tmp
+#md5#  add     4*`$j`($inp),$a
+#rc4#  add     $TY#b,$TX[0]#b
+#rc4#  movl    `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#  add     \$$K[$i],$a
+#md5#  xor     $d,$tmp
+#rc4#  movz    $TX[0]#b,$TX[0]#d
+#rc4#  movl    $TY#d,4*$k($XX[1])
+#md5#  add     $tmp,$a
+#rc4#  add     $TX[1]#b,$YY#b
+#md5#  rol     \$$rot0[$j%4],$a
+#md5#  mov     `$j==15?"$b":"$c"`,$tmp         # forward reference
+#rc4#  pinsrw  \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#  add     $b,$a
+___
+    # end of a stride: reduce y to its low byte and recompute $XX[1]
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+       mov     $YY,$XX[1]
+       xor     $YY,$YY                         # keyword to partial register
+       mov     $XX[1]#b,$YY#b
+       lea     ($dat,$XX[0],4),$XX[1]
+___
+    # end of a 16-step group: combine the collected bytes with the input
+    $code.=<<___ if ($rc4 && $j==15);
+       psllq   \$8,%xmm1
+       pxor    %xmm0,%xmm2
+       pxor    %xmm1,%xmm2
+___
+}
+# R1($i,$a,$b,$c,$d) appends to $code one step of the second group of MD5
+# rounds interleaved with one RC4 step.
+#   $i           round number; $j=$i%16 selects message word
+#                4*((1+5*$j)%16)($inp) and the rotate count, $k=$i%$MOD
+#                the slot relative to $XX[1]
+#   $a,$b,$c,$d  registers holding the MD5 state words for this round
+# MD5 part: $tmp ^= $b; $tmp &= $d; $tmp ^= $c, then
+# $a = rol($a + word + $K[$i] + $tmp, rot) + $b. $tmp is expected to have
+# been preloaded by the preceding "forward reference" move (presumably
+# with $c - confirm against the caller).
+# RC4 part: same step as in R0; on the last step of the group ($j==15)
+# the 16 input bytes at 16($in0) are loaded into %xmm3 and XORed with
+# %xmm0 and with %xmm1 shifted left by 8 bits.
+sub R1 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot1=(5,9,14,20);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="   movdqu  16($in0),%xmm3\n"       if ($rc4 && $j==15);
+    # end of a $MOD-step stride: advance the x index
+    $code.="   add     \$$MOD,$XX[0]#b\n"      if ($rc4 && $j==15 && $k==$MOD-1);
+    # first use of each accumulator in this group: clear it
+    $code.="   pxor    $xmm,$xmm\n"            if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#  movl    ($dat,$YY,4),$TY#d
+#md5#  xor     $b,$tmp
+#rc4#  movl    $TX[0]#d,($dat,$YY,4)
+#md5#  and     $d,$tmp
+#md5#  add     4*`((1+5*$j)%16)`($inp),$a
+#rc4#  add     $TY#b,$TX[0]#b
+#rc4#  movl    `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#  add     \$$K[$i],$a
+#md5#  xor     $c,$tmp
+#rc4#  movz    $TX[0]#b,$TX[0]#d
+#rc4#  movl    $TY#d,4*$k($XX[1])
+#md5#  add     $tmp,$a
+#rc4#  add     $TX[1]#b,$YY#b
+#md5#  rol     \$$rot1[$j%4],$a
+#md5#  mov     `$j==15?"$c":"$b"`,$tmp         # forward reference
+#rc4#  pinsrw  \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#  add     $b,$a
+___
+    # end of a stride: reduce y to its low byte and recompute $XX[1]
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+       mov     $YY,$XX[1]
+       xor     $YY,$YY                         # keyword to partial register
+       mov     $XX[1]#b,$YY#b
+       lea     ($dat,$XX[0],4),$XX[1]
+___
+    # end of a 16-step group: combine the collected bytes with the input
+    $code.=<<___ if ($rc4 && $j==15);
+       psllq   \$8,%xmm1
+       pxor    %xmm0,%xmm3
+       pxor    %xmm1,%xmm3
+___
+}
+# R2 - append one step of the third MD5 round (called with $i = 32..47) to
+# $code, interleaved with one step of RC4.  Arguments are as for R0.
+#
+# On entry $tmp must hold $d.  The emitted MD5 instructions compute
+#   a = b + rol(a + (d ^ c ^ b) + X[(5+3*j)%16] + K[i], rot2[j%4])
+# and preload $tmp for the following step; on the last step $tmp is set to
+# all ones (-1), which is what R3 expects on entry.  The finished 16 RC4
+# bytes are combined with the input at 32($in0) in %xmm4.
+sub R2 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot2=(4,11,16,23);
+  # $j: step within the round; $k: step modulo $MOD (defined outside this
+  # excerpt); $xmm: accumulator for this step (%xmm0 even, %xmm1 odd)
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    # last step of the round: fetch the third 16 input bytes
+    $code.="   movdqu  32($in0),%xmm4\n"       if ($rc4 && $j==15);
+    # end of round coinciding with the end of a $MOD window: advance $XX[0]
+    $code.="   add     \$$MOD,$XX[0]#b\n"      if ($rc4 && $j==15 && $k==$MOD-1);
+    # first use of each accumulator within the round: clear it
+    $code.="   pxor    $xmm,$xmm\n"            if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#  movl    ($dat,$YY,4),$TY#d
+#md5#  xor     $c,$tmp
+#rc4#  movl    $TX[0]#d,($dat,$YY,4)
+#md5#  xor     $b,$tmp
+#md5#  add     4*`((5+3*$j)%16)`($inp),$a
+#rc4#  add     $TY#b,$TX[0]#b
+#rc4#  movl    `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#  add     \$$K[$i],$a
+#rc4#  movz    $TX[0]#b,$TX[0]#d
+#md5#  add     $tmp,$a
+#rc4#  movl    $TY#d,4*$k($XX[1])
+#rc4#  add     $TX[1]#b,$YY#b
+#md5#  rol     \$$rot2[$j%4],$a
+#md5#  mov     `$j==15?"\\\$-1":"$c"`,$tmp     # forward reference
+#rc4#  pinsrw  \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#  add     $b,$a
+___
+    # Window rollover: keep only the low byte of $YY and recompute
+    # $XX[1] = $dat + 4*$XX[0].
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+       mov     $YY,$XX[1]
+       xor     $YY,$YY                         # keyword to partial register
+       mov     $XX[1]#b,$YY#b
+       lea     ($dat,$XX[0],4),$XX[1]
+___
+    # End of round: merge both accumulators into the input held in %xmm4.
+    $code.=<<___ if ($rc4 && $j==15);
+       psllq   \$8,%xmm1
+       pxor    %xmm0,%xmm4
+       pxor    %xmm1,%xmm4
+___
+}
+# R3 - append one step of the fourth MD5 round (called with $i = 48..63) to
+# $code, interleaved with one step of RC4.  Arguments are as for R0.
+#
+# On entry $tmp must hold all ones (-1).  The emitted MD5 instructions
+# compute
+#   a = b + rol(a + (c ^ (b | ~d)) + X[(7*j)%16] + K[i], rot3[j%4])
+# and reset $tmp to -1 after every step.  The finished 16 RC4 bytes are
+# combined with the input at 48($in0) in %xmm5.
+#
+# Unlike R0..R2, the last step of this round always renormalises both
+# $XX[0] and $YY to their low bytes and recomputes $XX[1], regardless of
+# where the step falls within the $MOD window.
+sub R3 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot3=(6,10,15,21);
+  # $j: step within the round; $k: step modulo $MOD (defined outside this
+  # excerpt); $xmm: accumulator for this step (%xmm0 even, %xmm1 odd)
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    # last step of the round: fetch the fourth 16 input bytes
+    $code.="   movdqu  48($in0),%xmm5\n"       if ($rc4 && $j==15);
+    # end of round coinciding with the end of a $MOD window: advance $XX[0]
+    $code.="   add     \$$MOD,$XX[0]#b\n"      if ($rc4 && $j==15 && $k==$MOD-1);
+    # first use of each accumulator within the round: clear it
+    $code.="   pxor    $xmm,$xmm\n"            if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#  movl    ($dat,$YY,4),$TY#d
+#md5#  xor     $d,$tmp
+#rc4#  movl    $TX[0]#d,($dat,$YY,4)
+#md5#  or      $b,$tmp
+#md5#  add     4*`((7*$j)%16)`($inp),$a
+#rc4#  add     $TY#b,$TX[0]#b
+#rc4#  movl    `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#  add     \$$K[$i],$a
+#rc4#  movz    $TX[0]#b,$TX[0]#d
+#md5#  xor     $c,$tmp
+#rc4#  movl    $TY#d,4*$k($XX[1])
+#md5#  add     $tmp,$a
+#rc4#  add     $TX[1]#b,$YY#b
+#md5#  rol     \$$rot3[$j%4],$a
+#md5#  mov     \$-1,$tmp                       # forward reference
+#rc4#  pinsrw  \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#  add     $b,$a
+___
+    # End of the 64-byte block: reduce $XX[0] and $YY to their low bytes
+    # (using $XX[1] as scratch), recompute $XX[1] = $dat + 4*$XX[0], then
+    # merge both accumulators into the input held in %xmm5.
+    $code.=<<___ if ($rc4 && $j==15);
+       mov     $XX[0],$XX[1]
+       xor     $XX[0],$XX[0]                   # keyword to partial register
+       mov     $XX[1]#b,$XX[0]#b
+       mov     $YY,$XX[1]
+       xor     $YY,$YY                         # keyword to partial register
+       mov     $XX[1]#b,$YY#b
+       lea     ($dat,$XX[0],4),$XX[1]
+       psllq   \$8,%xmm1
+       pxor    %xmm0,%xmm5
+       pxor    %xmm1,%xmm5
+___
+}
+
+# Unroll all 64 steps: each of the four round generators is invoked for 16
+# consecutive values of $i (0..15, 16..31, 32..47, 48..63).  After every
+# step the MD5 working registers are rotated one position to the right and
+# the RC4 temporaries one position to the left, so that the next step sees
+# them in the roles it expects.
+my $i=0;
+foreach my $emit_step (\&R0,\&R1,\&R2,\&R3) {
+    foreach my $step (1..16) {
+        $emit_step->($i,@V);
+        unshift(@V,pop(@V));            # last register becomes the first
+        push(@TX,shift(@TX));           # first temporary becomes the last
+        $i++;
+    }
+}
+
+# Tail of the 64-byte main loop.  The emitted code adds the hash value that
+# was put aside at 0..12(%rsp) at the loop head back into the working
+# registers, stores the four RC4 result vectors %xmm2..%xmm5 at
+# ($out,$in0) (so $out is used as a base that $in0 indexes), advances both
+# input pointers by 64 and repeats while $inp is below the end pointer
+# saved at 16(%rsp).  After the loop the MD5_CTX pointer is reloaded from
+# 24(%rsp) into $len and the hash value is written back to it.
+$code.=<<___;
+#md5#  add     0*4(%rsp),$V[0]         # accumulate hash value
+#md5#  add     1*4(%rsp),$V[1]
+#md5#  add     2*4(%rsp),$V[2]
+#md5#  add     3*4(%rsp),$V[3]
+
+#rc4#  movdqu  %xmm2,($out,$in0)       # write RC4 output
+#rc4#  movdqu  %xmm3,16($out,$in0)
+#rc4#  movdqu  %xmm4,32($out,$in0)
+#rc4#  movdqu  %xmm5,48($out,$in0)
+#md5#  lea     64($inp),$inp
+#rc4#  lea     64($in0),$in0
+       cmp     16(%rsp),$inp           # are we done?
+       jb      .Loop
+
+#md5#  mov     24(%rsp),$len           # restore pointer to MD5_CTX
+#rc4#  sub     $TX[0]#b,$YY#b          # correct $YY
+#md5#  mov     $V[0],0*4($len)         # write MD5_CTX
+#md5#  mov     $V[1],1*4($len)
+#md5#  mov     $V[2],2*4($len)
+#md5#  mov     $V[3],3*4($len)
+___
+# Byte-at-a-time RC4 loop for the remaining (length mod 64) bytes.  It is
+# emitted only for an RC4 build that is either RC4-only (!$md5) or has the
+# $D option set; the original length is reloaded from 32(%rsp).
+$code.=<<___ if ($rc4 && (!$md5 || $D));
+       mov     32(%rsp),$len           # restore original $len
+       and     \$63,$len               # remaining bytes
+       jnz     .Loop1
+       jmp     .Ldone
+       
+.align 16
+.Loop1:
+       add     $TX[0]#b,$YY#b
+       movl    ($dat,$YY,4),$TY#d
+       movl    $TX[0]#d,($dat,$YY,4)
+       movl    $TY#d,($dat,$XX[0],4)
+       add     $TY#b,$TX[0]#b
+       inc     $XX[0]#b
+       movl    ($dat,$TX[0],4),$TY#d
+       movl    ($dat,$XX[0],4),$TX[0]#d
+       xorb    ($in0),$TY#b
+       movb    $TY#b,($out,$in0)
+       lea     1($in0),$in0
+       dec     $len
+       jnz     .Loop1
+
+.Ldone:
+___
+# Function epilogue: store the RC4 indices ($XX[0] minus one, and $YY) in
+# the two 32-bit slots just below $dat, restore the callee-saved registers
+# from 40..80(%rsp) and release the 88-byte frame.  The Win64 exception
+# handler emitted further down relies on this exact frame layout and on the
+# .Lepilogue label.
+$code.=<<___;
+#rc4#  sub     \$1,$XX[0]#b
+#rc4#  movl    $XX[0]#d,-8($dat)
+#rc4#  movl    $YY#d,-4($dat)
+
+       mov     40(%rsp),%r15
+       mov     48(%rsp),%r14
+       mov     56(%rsp),%r13
+       mov     64(%rsp),%r12
+       mov     72(%rsp),%rbp
+       mov     80(%rsp),%rbx
+       lea     88(%rsp),%rsp
+.Lepilogue:
+.Labort:
+       ret
+.size $func,.-$func
+___
+
+if ($rc4 && $D) {      # sole purpose of this section is to provide
+                       # option to use the generated module as drop-in
+                       # replacement for rc4-x86_64.pl for debugging
+                       # and testing purposes...
+# Scratch index registers used by the key-schedule loops.
+my ($idx,$ido)=("%r8","%r9");
+# Registers holding the three arguments of RC4_set_key; these lexicals
+# shadow the outer variables of the same names for the rest of this block.
+my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");
+
+# RC4_set_key as emitted below:
+#   - advances $dat by 8 bytes, past the two 32-bit index slots that are
+#     zeroed at -8($dat) and -4($dat) before returning;
+#   - .Lw1stloop stores the value i into 32-bit slot i for i = 0..255 (the
+#     loop ends when the byte counter in %al wraps and sets carry);
+#   - .Lw2ndloop, for each of the 256 positions $ido, adds the next key
+#     byte and the current table entry to the byte index $idx and swaps the
+#     entries at $ido and $idx.  $len is kept negative and counts up; when
+#     it reaches zero cmovz reloads it from %rcx, so the key wraps around.
+# RC4_options returns the address of the string "rc4(64x,int)".
+$code.=<<___;
+.globl RC4_set_key
+.type  RC4_set_key,\@function,3
+.align 16
+RC4_set_key:
+       lea     8($dat),$dat
+       lea     ($inp,$len),$inp
+       neg     $len
+       mov     $len,%rcx
+       xor     %eax,%eax
+       xor     $ido,$ido
+       xor     %r10,%r10
+       xor     %r11,%r11
+       jmp     .Lw1stloop
+
+.align 16
+.Lw1stloop:
+       mov     %eax,($dat,%rax,4)
+       add     \$1,%al
+       jnc     .Lw1stloop
+
+       xor     $ido,$ido
+       xor     $idx,$idx
+.align 16
+.Lw2ndloop:
+       mov     ($dat,$ido,4),%r10d
+       add     ($inp,$len,1),$idx#b
+       add     %r10b,$idx#b
+       add     \$1,$len
+       mov     ($dat,$idx,4),%r11d
+       cmovz   %rcx,$len
+       mov     %r10d,($dat,$idx,4)
+       mov     %r11d,($dat,$ido,4)
+       add     \$1,$ido#b
+       jnc     .Lw2ndloop
+
+       xor     %eax,%eax
+       mov     %eax,-8($dat)
+       mov     %eax,-4($dat)
+       ret
+.size  RC4_set_key,.-RC4_set_key
+
+.globl RC4_options
+.type  RC4_options,\@abi-omnipotent
+.align 16
+RC4_options:
+       lea     .Lopts(%rip),%rax
+       ret
+.align 64
+.Lopts:
+.asciz "rc4(64x,int)"
+.align 64
+.size  RC4_options,.-RC4_options
+___
+}
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+#
+# Win64 only: emit a language-specific exception handler for $func together
+# with the .pdata/.xdata records that register it.
+#
+# The handler decides from context->Rip where the fault happened:
+#   - before .Lbody (defined in the function prologue, outside this
+#     excerpt): nothing needs recovering and context->Rax is taken as the
+#     frame base;
+#   - at or after .Lepilogue: the registers have already been restored and
+#     context->Rsp is taken as the frame base;
+#   - in between: the callee-saved registers are read back from offsets
+#     40..80 of context->Rsp and the frame base is Rsp+88.  This mirrors
+#     the epilogue emitted above and must be kept in sync with it.
+# It then reloads %rdi/%rsi from 8 and 16 bytes above the frame base
+# (presumably where the Win64 prologue produced by the translator saved
+# them - confirm against x86_64-xlate.pl), copies the updated CONTEXT (154
+# quadwords, moved with rep movsq) into disp->ContextRecord, calls
+# RtlVirtualUnwind and returns ExceptionContinueSearch.
+if ($win64) {
+# Names of the handler's four arguments; only $context and $disp are
+# referenced by the code emitted below.
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+.type  se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       lea     .Lbody(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<.Lbody
+       jb      .Lin_prologue
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       lea     .Lepilogue(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip>=.Lepilogue
+       jae     .Lin_prologue
+
+       mov     40(%rax),%r15
+       mov     48(%rax),%r14
+       mov     56(%rax),%r13
+       mov     64(%rax),%r12
+       mov     72(%rax),%rbp
+       mov     80(%rax),%rbx
+       lea     88(%rax),%rax
+
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+       mov     %r15,240($context)      # restore context->R15
+
+.Lin_prologue:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$154,%ecx              # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  se_handler,.-se_handler
+
+.section       .pdata
+.align 4
+       .rva    .LSEH_begin_$func
+       .rva    .LSEH_end_$func
+       .rva    .LSEH_info_$func
+
+.section       .xdata
+.align 8
+.LSEH_info_$func:
+       .byte   9,0,0,0
+       .rva    se_handler
+___
+}
+
+# reg_part - translate a 64-bit (or 32-bit) register name into the name of
+# one of its narrower parts.
+#
+#   $name   register name including the leading '%', e.g. "%rax" or "%r10"
+#   $width  "b" (byte), "w" (word) or "d" (doubleword)
+#
+# Numbered registers simply get the width letter appended (%r10 -> %r10d).
+# Legacy registers are renamed: %rax -> %al / %ax / %eax,
+# %rsi -> %sil / %si / %esi.  Any other width returns the name unchanged.
+sub reg_part {
+    my ($name,$width)=@_;
+
+    # %r8..%r15: the sub-register is spelled with a suffix
+    return $name.$width if ($name =~ /%r[0-9]+/);
+
+    # legacy registers: one rewrite rule per requested width
+    my %rename=(
+        "b" => sub { my $r=shift; $r =~ s/%[er]([^x]+)x?/%$1l/; $r },
+        "w" => sub { my $r=shift; $r =~ s/%[er](.+)/%$1/;       $r },
+        "d" => sub { my $r=shift; $r =~ s/%[er](.+)/%e$1/;      $r },
+    );
+    my $rule=$rename{$width};
+
+    return $rule ? $rule->($name) : $name;
+}
+
+# Post-process the accumulated template.  The passes are order-dependent:
+#   1. "%reg#b", "%reg#w" and "%reg#d" are replaced by the name of the
+#      corresponding sub-register (see reg_part);
+#   2. every `...` span is evaluated as a Perl expression and replaced by
+#      its value (offsets, immediates and operand choices computed above);
+#   3. "pinsrw \$0," is rewritten to "movd", which can only be recognised
+#      once pass 2 has produced the literal index 0.
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/pinsrw\s+\$0,/movd  /gm;
+
+# Activate the instruction streams that were requested.  A tag that is not
+# stripped stays in the output, leaving its line prefixed with '#'
+# (presumably ignored as a comment downstream - confirm with the
+# translator).
+$code =~ s/#md5#//gm   if ($md5);
+$code =~ s/#rc4#//gm   if ($rc4);
+
+print $code;
+
+# Buffered write errors only surface when the handle is closed, and for a
+# piped handle close also reports a failing child through $?, so the
+# result must not be ignored: a truncated module should fail the build.
+close STDOUT or die "error closing STDOUT: $! (status $?)";
index 0967b2d..21d8d54 100644 (file)
@@ -73,6 +73,9 @@ int SSL_library_init(void)
 #endif
 #ifndef OPENSSL_NO_RC4
        EVP_add_cipher(EVP_rc4());
+#ifndef OPENSSL_NO_MD5
+       EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
 #endif  
 #ifndef OPENSSL_NO_RC2
        EVP_add_cipher(EVP_rc2_cbc());
@@ -85,6 +88,10 @@ int SSL_library_init(void)
        EVP_add_cipher(EVP_aes_128_cbc());
        EVP_add_cipher(EVP_aes_192_cbc());
        EVP_add_cipher(EVP_aes_256_cbc());
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+       EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+       EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
 #endif
 #ifndef OPENSSL_NO_CAMELLIA
        EVP_add_cipher(EVP_camellia_128_cbc());