Remove x86/x86_64 BSAES and AES_ASM support
author Bernd Edlinger <bernd.edlinger@hotmail.de>
Fri, 23 Aug 2019 08:17:31 +0000 (10:17 +0200)
committer Bernd Edlinger <bernd.edlinger@hotmail.de>
Sat, 7 Sep 2019 08:26:48 +0000 (10:26 +0200)
This leaves VPAES and AESNI support.
The VPAES performance is comparable but BSAES is not
completely constant time. There are table lookups
using secret key data in AES_set_encrypt/decrypt_key
and in ctr mode short data uses the non-constant
time AES_encrypt function instead of bit-slicing.
Furthermore the AES_ASM is by far outperformed
by recent GCC versions.
Since BSAES calls back to AES_ASM for short
data blocks the performance on those is also
worse than the pure software implementation.

Fixes: #9640
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/9675)

Configurations/00-base-templates.conf
crypto/aes/asm/aes-586.pl [deleted file]
crypto/aes/asm/aes-x86_64.pl [deleted file]
crypto/aes/asm/bsaes-x86_64.pl [deleted file]
crypto/evp/e_aes.c

index 5fd995cb3392eb819c0bf0cb7f3bd2a02fb72d60..e01dc63a8bf40855ea9137e13c8f835be1dc9547 100644 (file)
@@ -198,7 +198,7 @@ my %targets=(
        bn_asm_src      => "bn-586.s co-586.s x86-mont.s x86-gf2m.s",
        ec_asm_src      => "ecp_nistz256.c ecp_nistz256-x86.s",
        des_asm_src     => "des-586.s crypt586.s",
-       aes_asm_src     => "aes-586.s vpaes-x86.s aesni-x86.s",
+       aes_asm_src     => "aes_core.c aes_cbc.c vpaes-x86.s aesni-x86.s",
        bf_asm_src      => "bf-586.s",
        md5_asm_src     => "md5-586.s",
        cast_asm_src    => "cast-586.s",
@@ -223,7 +223,7 @@ my %targets=(
        cpuid_asm_src   => "x86_64cpuid.s",
        bn_asm_src      => "asm/x86_64-gcc.c x86_64-mont.s x86_64-mont5.s x86_64-gf2m.s rsaz_exp.c rsaz-x86_64.s rsaz-avx2.s",
        ec_asm_src      => "ecp_nistz256.c ecp_nistz256-x86_64.s x25519-x86_64.s",
-       aes_asm_src     => "aes-x86_64.s vpaes-x86_64.s bsaes-x86_64.s aesni-x86_64.s aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s",
+       aes_asm_src     => "aes_core.c aes_cbc.c vpaes-x86_64.s aesni-x86_64.s aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s",
        md5_asm_src     => "md5-x86_64.s",
        sha1_asm_src    => "sha1-x86_64.s sha256-x86_64.s sha512-x86_64.s sha1-mb-x86_64.s sha256-mb-x86_64.s",
        rc4_asm_src     => "rc4-x86_64.s rc4-md5-x86_64.s",
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
deleted file mode 100755 (executable)
index 29059ed..0000000
+++ /dev/null
@@ -1,3000 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Version 4.3.
-#
-# You might fail to appreciate this module performance from the first
-# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
-# to be *the* best Intel C compiler without -KPIC, performance appears
-# to be virtually identical... But try to re-configure with shared
-# library support... Aha! Intel compiler "suddenly" lags behind by 30%
-# [on P4, more on others]:-) And if compared to position-independent
-# code generated by GNU C, this code performs *more* than *twice* as
-# fast! Yes, all this buzz about PIC means that unlike other hand-
-# coded implementations, this one was explicitly designed to be safe
-# to use even in shared library context... This also means that this
-# code isn't necessarily absolutely fastest "ever," because in order
-# to achieve position independence an extra register has to be
-# off-loaded to stack, which affects the benchmark result.
-#
-# Special note about instruction choice. Do you recall RC4_INT code
-# performing poorly on P4? It might be the time to figure out why.
-# RC4_INT code implies effective address calculations in base+offset*4
-# form. Trouble is that it seems that offset scaling turned to be
-# critical path... At least eliminating scaling resulted in 2.8x RC4
-# performance improvement [as you might recall]. As AES code is hungry
-# for scaling too, I [try to] avoid the latter by favoring off-by-2
-# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
-#
-# As was shown by Dean Gaudet, the above note turned out to be
-# void. Performance improvement with off-by-2 shifts was observed on
-# intermediate implementation, which was spilling yet another register
-# to stack... Final offset*4 code below runs just a tad faster on P4,
-# but exhibits up to 10% improvement on other cores.
-#
-# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
-# This made it possible to implement little-endian variant of the
-# algorithm without modifying the base C code. Motivating factor for
-# the undertaken effort was that it appeared that in tight IA-32
-# register window little-endian flavor could achieve slightly higher
-# Instruction Level Parallelism, and it indeed resulted in up to 15%
-# better performance on most recent µ-archs...
-#
-# Third version adds AES_cbc_encrypt implementation, which resulted in
-# up to 40% performance improvement of CBC benchmark results. 40% was
-# observed on P4 core, where "overall" improvement coefficient, i.e. if
-# compared to PIC generated by GCC and in CBC mode, was observed to be
-# as large as 4x:-) CBC performance is virtually identical to ECB now
-# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
-# Opteron, because certain function prologues and epilogues are
-# effectively taken out of the loop...
-#
-# Version 3.2 implements compressed tables and prefetch of these tables
-# in CBC[!] mode. Former means that 3/4 of table references are now
-# misaligned, which unfortunately has negative impact on elder IA-32
-# implementations, Pentium suffered 30% penalty, PIII - 10%.
-#
-# Version 3.3 avoids L1 cache aliasing between stack frame and
-# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
-# latter is achieved by copying the key schedule to controlled place in
-# stack. This unfortunately has rather strong impact on small block CBC
-# performance, ~2x deterioration on 16-byte block if compared to 3.3.
-#
-# Version 3.5 checks if there is L1 cache aliasing between user-supplied
-# key schedule and S-boxes and abstains from copying the former if
-# there is no. This allows end-user to consciously retain small block
-# performance by aligning key schedule in specific manner.
-#
-# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
-#
-# Current ECB performance numbers for 128-bit key in CPU cycles per
-# processed byte [measure commonly used by AES benchmarkers] are:
-#
-#              small footprint         fully unrolled
-# P4           24                      22
-# AMD K8       20                      19
-# PIII         25                      23
-# Pentium      81                      78
-#
-# Version 3.7 reimplements outer rounds as "compact." Meaning that
-# first and last rounds reference compact 256 bytes S-box. This means
-# that first round consumes a lot more CPU cycles and that encrypt
-# and decrypt performance becomes asymmetric. Encrypt performance
-# drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
-# aggressively pre-fetched.
-#
-# Version 4.0 effectively rolls back to 3.6 and instead implements
-# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
-# which use exclusively 256 byte S-box. These functions are to be
-# called in modes not concealing plain text, such as ECB, or when
-# we're asked to process smaller amount of data [or unconditionally
-# on hyper-threading CPU]. Currently it's called unconditionally from
-# AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
-# still needs to be modified to switch between slower and faster
-# mode when appropriate... But in either case benchmark landscape
-# changes dramatically and below numbers are CPU cycles per processed
-# byte for 128-bit key.
-#
-#              ECB encrypt     ECB decrypt     CBC large chunk
-# P4           52[54]          83[95]          23
-# AMD K8       46[41]          66[70]          18
-# PIII         41[50]          60[77]          24
-# Core 2       31[36]          45[64]          18.5
-# Atom         76[100]         96[138]         60
-# Pentium      115             150             77
-#
-# Version 4.1 switches to compact S-box even in key schedule setup.
-#
-# Version 4.2 prefetches compact S-box in every SSE round or in other
-# words every cache-line is *guaranteed* to be accessed within ~50
-# cycles window. Why just SSE? Because it's needed on hyper-threading
-# CPU! Which is also why it's prefetched with 64 byte stride. Best
-# part is that it has no negative effect on performance:-)
-#
-# Version 4.3 implements switch between compact and non-compact block
-# functions in AES_cbc_encrypt depending on how much data was asked
-# to be processed in one stroke.
-#
-######################################################################
-# Timing attacks are classified in two classes: synchronous when
-# attacker consciously initiates cryptographic operation and collects
-# timing data of various character afterwards, and asynchronous when
-# malicious code is executed on same CPU simultaneously with AES,
-# instruments itself and performs statistical analysis of this data.
-#
-# As far as synchronous attacks go the root to the AES timing
-# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
-# are referred to in single 128-bit block operation. Well, in C
-# implementation with 4 distinct tables it's actually as little as 40
-# references per 256 elements table, but anyway... Secondly, even
-# though S-box elements are clustered into smaller amount of cache-
-# lines, smaller than 160 and even 40, it turned out that for certain
-# plain-text pattern[s] or simply put chosen plain-text and given key
-# few cache-lines remain unaccessed during block operation. Now, if
-# attacker can figure out this access pattern, he can deduct the key
-# [or at least part of it]. The natural way to mitigate this kind of
-# attacks is to minimize the amount of cache-lines in S-box and/or
-# prefetch them to ensure that every one is accessed for more uniform
-# timing. But note that *if* plain-text was concealed in such way that
-# input to block function is distributed *uniformly*, then attack
-# wouldn't apply. Now note that some encryption modes, most notably
-# CBC, do mask the plain-text in this exact way [secure cipher output
-# is distributed uniformly]. Yes, one still might find input that
-# would reveal the information about given key, but if amount of
-# candidate inputs to be tried is larger than amount of possible key
-# combinations then attack becomes infeasible. This is why revised
-# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
-# of data is to be processed in one stroke. The current size limit of
-# 512 bytes is chosen to provide same [diminishingly low] probability
-# for cache-line to remain untouched in large chunk operation with
-# large S-box as for single block operation with compact S-box and
-# surely needs more careful consideration...
-#
-# As for asynchronous attacks. There are two flavours: attacker code
-# being interleaved with AES on hyper-threading CPU at *instruction*
-# level, and two processes time sharing single core. As for latter.
-# Two vectors. 1. Given that attacker process has higher priority,
-# yield execution to process performing AES just before timer fires
-# off the scheduler, immediately regain control of CPU and analyze the
-# cache state. For this attack to be efficient attacker would have to
-# effectively slow down the operation by several *orders* of magnitude,
-# by ratio of time slice to duration of handful of AES rounds, which
-# unlikely to remain unnoticed. Not to mention that this also means
-# that he would spend correspondingly more time to collect enough
-# statistical data to mount the attack. It's probably appropriate to
-# say that if adversary reckons that this attack is beneficial and
-# risks to be noticed, you probably have larger problems having him
-# mere opportunity. In other words suggested code design expects you
-# to preclude/mitigate this attack by overall system security design.
-# 2. Attacker manages to make his code interrupt driven. In order for
-# this kind of attack to be feasible, interrupt rate has to be high
-# enough, again comparable to duration of handful of AES rounds. But
-# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
-# generates interrupts at such raging rate...
-#
-# And now back to the former, hyper-threading CPU or more specifically
-# Intel P4. Recall that asynchronous attack implies that malicious
-# code instruments itself. And naturally instrumentation granularity
-# has be noticeably lower than duration of codepath accessing S-box.
-# Given that all cache-lines are accessed during that time that is.
-# Current implementation accesses *all* cache-lines within ~50 cycles
-# window, which is actually *less* than RDTSC latency on Intel P4!
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-
-$output = pop;
-open OUT,">$output";
-*STDOUT=*OUT;
-
-&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
-&static_label("AES_Te");
-&static_label("AES_Td");
-
-$s0="eax";
-$s1="ebx";
-$s2="ecx";
-$s3="edx";
-$key="edi";
-$acc="esi";
-$tbl="ebp";
-
-# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
-# by caller
-$__ra=&DWP(0,"esp");   # return address
-$__s0=&DWP(4,"esp");   # s0 backing store
-$__s1=&DWP(8,"esp");   # s1 backing store
-$__s2=&DWP(12,"esp");  # s2 backing store
-$__s3=&DWP(16,"esp");  # s3 backing store
-$__key=&DWP(20,"esp"); # pointer to key schedule
-$__end=&DWP(24,"esp"); # pointer to end of key schedule
-$__tbl=&DWP(28,"esp"); # %ebp backing store
-
-# stack frame layout in AES_[en|crypt] routines, which differs from
-# above by 4 and overlaps by %ebp backing store
-$_tbl=&DWP(24,"esp");
-$_esp=&DWP(28,"esp");
-
-sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
-
-$speed_limit=512;      # chunks smaller than $speed_limit are
-                       # processed with compact routine in CBC mode
-$small_footprint=1;    # $small_footprint=1 code is ~5% slower [on
-                       # recent µ-archs], but ~5 times smaller!
-                       # I favor compact code to minimize cache
-                       # contention and in hope to "collect" 5% back
-                       # in real-life applications...
-
-$vertical_spin=0;      # shift "vertically" defaults to 0, because of
-                       # its proof-of-concept status...
-# Note that there is no decvert(), as well as last encryption round is
-# performed with "horizontal" shifts. This is because this "vertical"
-# implementation [one which groups shifts on a given $s[i] to form a
-# "column," unlike "horizontal" one, which groups shifts on different
-# $s[i] to form a "row"] is work in progress. It was observed to run
-# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
-# whole 12% slower:-( So we face a trade-off... Shall it be resolved
-# some day? Till then the code is considered experimental and by
-# default remains dormant...
-
-sub encvert()
-{ my ($te,@s) = @_;
-  my ($v0,$v1) = ($acc,$key);
-
-       &mov    ($v0,$s[3]);                            # copy s3
-       &mov    (&DWP(4,"esp"),$s[2]);                  # save s2
-       &mov    ($v1,$s[0]);                            # copy s0
-       &mov    (&DWP(8,"esp"),$s[1]);                  # save s1
-
-       &movz   ($s[2],&HB($s[0]));
-       &and    ($s[0],0xFF);
-       &mov    ($s[0],&DWP(0,$te,$s[0],8));            # s0>>0
-       &shr    ($v1,16);
-       &mov    ($s[3],&DWP(3,$te,$s[2],8));            # s0>>8
-       &movz   ($s[1],&HB($v1));
-       &and    ($v1,0xFF);
-       &mov    ($s[2],&DWP(2,$te,$v1,8));              # s0>>16
-        &mov   ($v1,$v0);
-       &mov    ($s[1],&DWP(1,$te,$s[1],8));            # s0>>24
-
-       &and    ($v0,0xFF);
-       &xor    ($s[3],&DWP(0,$te,$v0,8));              # s3>>0
-       &movz   ($v0,&HB($v1));
-       &shr    ($v1,16);
-       &xor    ($s[2],&DWP(3,$te,$v0,8));              # s3>>8
-       &movz   ($v0,&HB($v1));
-       &and    ($v1,0xFF);
-       &xor    ($s[1],&DWP(2,$te,$v1,8));              # s3>>16
-        &mov   ($v1,&DWP(4,"esp"));                    # restore s2
-       &xor    ($s[0],&DWP(1,$te,$v0,8));              # s3>>24
-
-       &mov    ($v0,$v1);
-       &and    ($v1,0xFF);
-       &xor    ($s[2],&DWP(0,$te,$v1,8));              # s2>>0
-       &movz   ($v1,&HB($v0));
-       &shr    ($v0,16);
-       &xor    ($s[1],&DWP(3,$te,$v1,8));              # s2>>8
-       &movz   ($v1,&HB($v0));
-       &and    ($v0,0xFF);
-       &xor    ($s[0],&DWP(2,$te,$v0,8));              # s2>>16
-        &mov   ($v0,&DWP(8,"esp"));                    # restore s1
-       &xor    ($s[3],&DWP(1,$te,$v1,8));              # s2>>24
-
-       &mov    ($v1,$v0);
-       &and    ($v0,0xFF);
-       &xor    ($s[1],&DWP(0,$te,$v0,8));              # s1>>0
-       &movz   ($v0,&HB($v1));
-       &shr    ($v1,16);
-       &xor    ($s[0],&DWP(3,$te,$v0,8));              # s1>>8
-       &movz   ($v0,&HB($v1));
-       &and    ($v1,0xFF);
-       &xor    ($s[3],&DWP(2,$te,$v1,8));              # s1>>16
-        &mov   ($key,$__key);                          # reincarnate v1 as key
-       &xor    ($s[2],&DWP(1,$te,$v0,8));              # s1>>24
-}
-
-# Another experimental routine, which features "horizontal spin," but
-# eliminates one reference to stack. Strangely enough runs slower...
-sub enchoriz()
-{ my ($v0,$v1) = ($key,$acc);
-
-       &movz   ($v0,&LB($s0));                 #  3, 2, 1, 0*
-       &rotr   ($s2,8);                        #  8,11,10, 9
-       &mov    ($v1,&DWP(0,$te,$v0,8));        #  0
-       &movz   ($v0,&HB($s1));                 #  7, 6, 5*, 4
-       &rotr   ($s3,16);                       # 13,12,15,14
-       &xor    ($v1,&DWP(3,$te,$v0,8));        #  5
-       &movz   ($v0,&HB($s2));                 #  8,11,10*, 9
-       &rotr   ($s0,16);                       #  1, 0, 3, 2
-       &xor    ($v1,&DWP(2,$te,$v0,8));        # 10
-       &movz   ($v0,&HB($s3));                 # 13,12,15*,14
-       &xor    ($v1,&DWP(1,$te,$v0,8));        # 15, t[0] collected
-       &mov    ($__s0,$v1);                    # t[0] saved
-
-       &movz   ($v0,&LB($s1));                 #  7, 6, 5, 4*
-       &shr    ($s1,16);                       #  -, -, 7, 6
-       &mov    ($v1,&DWP(0,$te,$v0,8));        #  4
-       &movz   ($v0,&LB($s3));                 # 13,12,15,14*
-       &xor    ($v1,&DWP(2,$te,$v0,8));        # 14
-       &movz   ($v0,&HB($s0));                 #  1, 0, 3*, 2
-       &and    ($s3,0xffff0000);               # 13,12, -, -
-       &xor    ($v1,&DWP(1,$te,$v0,8));        #  3
-       &movz   ($v0,&LB($s2));                 #  8,11,10, 9*
-       &or     ($s3,$s1);                      # 13,12, 7, 6
-       &xor    ($v1,&DWP(3,$te,$v0,8));        #  9, t[1] collected
-       &mov    ($s1,$v1);                      #  s[1]=t[1]
-
-       &movz   ($v0,&LB($s0));                 #  1, 0, 3, 2*
-       &shr    ($s2,16);                       #  -, -, 8,11
-       &mov    ($v1,&DWP(2,$te,$v0,8));        #  2
-       &movz   ($v0,&HB($s3));                 # 13,12, 7*, 6
-       &xor    ($v1,&DWP(1,$te,$v0,8));        #  7
-       &movz   ($v0,&HB($s2));                 #  -, -, 8*,11
-       &xor    ($v1,&DWP(0,$te,$v0,8));        #  8
-       &mov    ($v0,$s3);
-       &shr    ($v0,24);                       # 13
-       &xor    ($v1,&DWP(3,$te,$v0,8));        # 13, t[2] collected
-
-       &movz   ($v0,&LB($s2));                 #  -, -, 8,11*
-       &shr    ($s0,24);                       #  1*
-       &mov    ($s2,&DWP(1,$te,$v0,8));        # 11
-       &xor    ($s2,&DWP(3,$te,$s0,8));        #  1
-       &mov    ($s0,$__s0);                    # s[0]=t[0]
-       &movz   ($v0,&LB($s3));                 # 13,12, 7, 6*
-       &shr    ($s3,16);                       #   ,  ,13,12
-       &xor    ($s2,&DWP(2,$te,$v0,8));        #  6
-       &mov    ($key,$__key);                  # reincarnate v0 as key
-       &and    ($s3,0xff);                     #   ,  ,13,12*
-       &mov    ($s3,&DWP(0,$te,$s3,8));        # 12
-       &xor    ($s3,$s2);                      # s[2]=t[3] collected
-       &mov    ($s2,$v1);                      # s[2]=t[2]
-}
-
-# More experimental code... SSE one... Even though this one eliminates
-# *all* references to stack, it's not faster...
-sub sse_encbody()
-{
-       &movz   ($acc,&LB("eax"));              #  0
-       &mov    ("ecx",&DWP(0,$tbl,$acc,8));    #  0
-       &pshufw ("mm2","mm0",0x0d);             #  7, 6, 3, 2
-       &movz   ("edx",&HB("eax"));             #  1
-       &mov    ("edx",&DWP(3,$tbl,"edx",8));   #  1
-       &shr    ("eax",16);                     #  5, 4
-
-       &movz   ($acc,&LB("ebx"));              # 10
-       &xor    ("ecx",&DWP(2,$tbl,$acc,8));    # 10
-       &pshufw ("mm6","mm4",0x08);             # 13,12, 9, 8
-       &movz   ($acc,&HB("ebx"));              # 11
-       &xor    ("edx",&DWP(1,$tbl,$acc,8));    # 11
-       &shr    ("ebx",16);                     # 15,14
-
-       &movz   ($acc,&HB("eax"));              #  5
-       &xor    ("ecx",&DWP(3,$tbl,$acc,8));    #  5
-       &movq   ("mm3",QWP(16,$key));
-       &movz   ($acc,&HB("ebx"));              # 15
-       &xor    ("ecx",&DWP(1,$tbl,$acc,8));    # 15
-       &movd   ("mm0","ecx");                  # t[0] collected
-
-       &movz   ($acc,&LB("eax"));              #  4
-       &mov    ("ecx",&DWP(0,$tbl,$acc,8));    #  4
-       &movd   ("eax","mm2");                  #  7, 6, 3, 2
-       &movz   ($acc,&LB("ebx"));              # 14
-       &xor    ("ecx",&DWP(2,$tbl,$acc,8));    # 14
-       &movd   ("ebx","mm6");                  # 13,12, 9, 8
-
-       &movz   ($acc,&HB("eax"));              #  3
-       &xor    ("ecx",&DWP(1,$tbl,$acc,8));    #  3
-       &movz   ($acc,&HB("ebx"));              #  9
-       &xor    ("ecx",&DWP(3,$tbl,$acc,8));    #  9
-       &movd   ("mm1","ecx");                  # t[1] collected
-
-       &movz   ($acc,&LB("eax"));              #  2
-       &mov    ("ecx",&DWP(2,$tbl,$acc,8));    #  2
-       &shr    ("eax",16);                     #  7, 6
-       &punpckldq      ("mm0","mm1");          # t[0,1] collected
-       &movz   ($acc,&LB("ebx"));              #  8
-       &xor    ("ecx",&DWP(0,$tbl,$acc,8));    #  8
-       &shr    ("ebx",16);                     # 13,12
-
-       &movz   ($acc,&HB("eax"));              #  7
-       &xor    ("ecx",&DWP(1,$tbl,$acc,8));    #  7
-       &pxor   ("mm0","mm3");
-       &movz   ("eax",&LB("eax"));             #  6
-       &xor    ("edx",&DWP(2,$tbl,"eax",8));   #  6
-       &pshufw ("mm1","mm0",0x08);             #  5, 4, 1, 0
-       &movz   ($acc,&HB("ebx"));              # 13
-       &xor    ("ecx",&DWP(3,$tbl,$acc,8));    # 13
-       &xor    ("ecx",&DWP(24,$key));          # t[2]
-       &movd   ("mm4","ecx");                  # t[2] collected
-       &movz   ("ebx",&LB("ebx"));             # 12
-       &xor    ("edx",&DWP(0,$tbl,"ebx",8));   # 12
-       &shr    ("ecx",16);
-       &movd   ("eax","mm1");                  #  5, 4, 1, 0
-       &mov    ("ebx",&DWP(28,$key));          # t[3]
-       &xor    ("ebx","edx");
-       &movd   ("mm5","ebx");                  # t[3] collected
-       &and    ("ebx",0xffff0000);
-       &or     ("ebx","ecx");
-
-       &punpckldq      ("mm4","mm5");          # t[2,3] collected
-}
-
-######################################################################
-# "Compact" block function
-######################################################################
-
-sub enccompact()
-{ my $Fn = \&mov;
-  while ($#_>5) { pop(@_); $Fn=sub{}; }
-  my ($i,$te,@s)=@_;
-  my $tmp = $key;
-  my $out = $i==3?$s[0]:$acc;
-
-       # $Fn is used in first compact round and its purpose is to
-       # void restoration of some values from stack, so that after
-       # 4xenccompact with extra argument $key value is left there...
-       if ($i==3)  {   &$Fn    ($key,$__key);                  }##%edx
-       else        {   &mov    ($out,$s[0]);                   }
-                       &and    ($out,0xFF);
-       if ($i==1)  {   &shr    ($s[0],16);                     }#%ebx[1]
-       if ($i==2)  {   &shr    ($s[0],24);                     }#%ecx[2]
-                       &movz   ($out,&BP(-128,$te,$out,1));
-
-       if ($i==3)  {   $tmp=$s[1];                             }##%eax
-                       &movz   ($tmp,&HB($s[1]));
-                       &movz   ($tmp,&BP(-128,$te,$tmp,1));
-                       &shl    ($tmp,8);
-                       &xor    ($out,$tmp);
-
-       if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],$__s0);         }##%ebx
-       else        {   &mov    ($tmp,$s[2]);
-                       &shr    ($tmp,16);                      }
-       if ($i==2)  {   &and    ($s[1],0xFF);                   }#%edx[2]
-                       &and    ($tmp,0xFF);
-                       &movz   ($tmp,&BP(-128,$te,$tmp,1));
-                       &shl    ($tmp,16);
-                       &xor    ($out,$tmp);
-
-       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],$__s1);         }##%ecx
-       elsif($i==2){   &movz   ($tmp,&HB($s[3]));              }#%ebx[2]
-       else        {   &mov    ($tmp,$s[3]);
-                       &shr    ($tmp,24);                      }
-                       &movz   ($tmp,&BP(-128,$te,$tmp,1));
-                       &shl    ($tmp,24);
-                       &xor    ($out,$tmp);
-       if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }
-       if ($i==3)  {   &mov    ($s[3],$acc);                   }
-       &comment();
-}
-
-sub enctransform()
-{ my @s = ($s0,$s1,$s2,$s3);
-  my $i = shift;
-  my $tmp = $tbl;
-  my $r2  = $key ;
-
-       &and    ($tmp,$s[$i]);
-       &lea    ($r2,&DWP(0,$s[$i],$s[$i]));
-       &mov    ($acc,$tmp);
-       &shr    ($tmp,7);
-       &and    ($r2,0xfefefefe);
-       &sub    ($acc,$tmp);
-       &mov    ($tmp,$s[$i]);
-       &and    ($acc,0x1b1b1b1b);
-       &rotr   ($tmp,16);
-       &xor    ($acc,$r2);     # r2
-       &mov    ($r2,$s[$i]);
-
-       &xor    ($s[$i],$acc);  # r0 ^ r2
-       &rotr   ($r2,16+8);
-       &xor    ($acc,$tmp);
-       &rotl   ($s[$i],24);
-       &xor    ($acc,$r2);
-       &mov    ($tmp,0x80808080)       if ($i!=1);
-       &xor    ($s[$i],$acc);  # ROTATE(r2^r0,24) ^ r2
-}
-
-&function_begin_B("_x86_AES_encrypt_compact");
-       # note that caller is expected to allocate stack frame for me!
-       &mov    ($__key,$key);                  # save key
-
-       &xor    ($s0,&DWP(0,$key));             # xor with key
-       &xor    ($s1,&DWP(4,$key));
-       &xor    ($s2,&DWP(8,$key));
-       &xor    ($s3,&DWP(12,$key));
-
-       &mov    ($acc,&DWP(240,$key));          # load key->rounds
-       &lea    ($acc,&DWP(-2,$acc,$acc));
-       &lea    ($acc,&DWP(0,$key,$acc,8));
-       &mov    ($__end,$acc);                  # end of key schedule
-
-       # prefetch Te4
-       &mov    ($key,&DWP(0-128,$tbl));
-       &mov    ($acc,&DWP(32-128,$tbl));
-       &mov    ($key,&DWP(64-128,$tbl));
-       &mov    ($acc,&DWP(96-128,$tbl));
-       &mov    ($key,&DWP(128-128,$tbl));
-       &mov    ($acc,&DWP(160-128,$tbl));
-       &mov    ($key,&DWP(192-128,$tbl));
-       &mov    ($acc,&DWP(224-128,$tbl));
-
-       &set_label("loop",16);
-
-               &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
-               &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
-               &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
-               &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
-               &mov    ($tbl,0x80808080);
-               &enctransform(2);
-               &enctransform(3);
-               &enctransform(0);
-               &enctransform(1);
-               &mov    ($key,$__key);
-               &mov    ($tbl,$__tbl);
-               &add    ($key,16);              # advance rd_key
-               &xor    ($s0,&DWP(0,$key));
-               &xor    ($s1,&DWP(4,$key));
-               &xor    ($s2,&DWP(8,$key));
-               &xor    ($s3,&DWP(12,$key));
-
-       &cmp    ($key,$__end);
-       &mov    ($__key,$key);
-       &jb     (&label("loop"));
-
-       &enccompact(0,$tbl,$s0,$s1,$s2,$s3);
-       &enccompact(1,$tbl,$s1,$s2,$s3,$s0);
-       &enccompact(2,$tbl,$s2,$s3,$s0,$s1);
-       &enccompact(3,$tbl,$s3,$s0,$s1,$s2);
-
-       &xor    ($s0,&DWP(16,$key));
-       &xor    ($s1,&DWP(20,$key));
-       &xor    ($s2,&DWP(24,$key));
-       &xor    ($s3,&DWP(28,$key));
-
-       &ret    ();
-&function_end_B("_x86_AES_encrypt_compact");
-
-######################################################################
-# "Compact" SSE block function.
-######################################################################
-#
-# Performance is not actually extraordinary in comparison to pure
-# x86 code. In particular encrypt performance is virtually the same.
-# Decrypt performance on the other hand is 15-20% better on newer
-# µ-archs [but we're thankful for *any* improvement here], and ~50%
-# better on PIII:-) And additionally on the pros side this code
-# eliminates redundant references to stack and thus relieves/
-# minimizes the pressure on the memory bus.
-#
-# MMX register layout                           lsb
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |          mm4          |          mm0          |
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |     s3    |     s2    |     s1    |     s0    |
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
-# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-#
-# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
-# In this terms encryption and decryption "compact" permutation
-# matrices can be depicted as following:
-#
-# encryption              lsb  # decryption              lsb
-# +----++----+----+----+----+  # +----++----+----+----+----+
-# | t0 || 15 | 10 |  5 |  0 |  # | t0 ||  7 | 10 | 13 |  0 |
-# +----++----+----+----+----+  # +----++----+----+----+----+
-# | t1 ||  3 | 14 |  9 |  4 |  # | t1 || 11 | 14 |  1 |  4 |
-# +----++----+----+----+----+  # +----++----+----+----+----+
-# | t2 ||  7 |  2 | 13 |  8 |  # | t2 || 15 |  2 |  5 |  8 |
-# +----++----+----+----+----+  # +----++----+----+----+----+
-# | t3 || 11 |  6 |  1 | 12 |  # | t3 ||  3 |  6 |  9 | 12 |
-# +----++----+----+----+----+  # +----++----+----+----+----+
-#
-######################################################################
-# Why not xmm registers? Short answer: it was actually tested and
-# was not any faster, but on the *contrary*, most notably on Intel CPUs.
-# Longer answer: the main advantage of using mm registers is that movd
-# latency is lower, especially on Intel P4. While there are twice as
-# many arithmetic instructions, they can be scheduled every cycle,
-# and not every second one as when they operate on xmm registers,
-# so that "arithmetic throughput" remains virtually the same. And
-# finally the code can be executed even on older SSE-only CPUs:-)
-
-sub sse_enccompact()   # emits one "compact" round step: byte lookups at -128($tbl) plus the byte permutation, on the state held in mm0/mm4
-{                      # output: t[0],t[1] packed in mm0 and t[2],t[3] packed in mm4; eax/ebx/ecx/edx, $acc and $key are scratch
-       &pshufw ("mm1","mm0",0x08);             #  5, 4, 1, 0
-       &pshufw ("mm5","mm4",0x0d);             # 15,14,11,10
-       &movd   ("eax","mm1");                  #  5, 4, 1, 0
-       &movd   ("ebx","mm5");                  # 15,14,11,10
-       &mov    ($__key,$key);                  # save $key: the register doubles as a table index below
-       # t[0] is assembled in ecx from bytes 0,5,10,15; t[3] is started in edx with bytes 1 and 11
-       &movz   ($acc,&LB("eax"));              #  0
-       &movz   ("edx",&HB("eax"));             #  1
-       &pshufw ("mm2","mm0",0x0d);             #  7, 6, 3, 2
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
-       &movz   ($key,&LB("ebx"));              # 10
-       &movz   ("edx",&BP(-128,$tbl,"edx",1)); #  1
-       &shr    ("eax",16);                     #  5, 4
-       &shl    ("edx",8);                      #  1
-       # each looked-up byte is shifted to its target position and or-ed into the word being collected
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 10
-       &movz   ($key,&HB("ebx"));              # 11
-       &shl    ($acc,16);                      # 10
-       &pshufw ("mm6","mm4",0x08);             # 13,12, 9, 8
-       &or     ("ecx",$acc);                   # 10
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 11
-       &movz   ($key,&HB("eax"));              #  5
-       &shl    ($acc,24);                      # 11
-       &shr    ("ebx",16);                     # 15,14
-       &or     ("edx",$acc);                   # 11
-       # finish t[0] with bytes 5 and 15
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  5
-       &movz   ($key,&HB("ebx"));              # 15
-       &shl    ($acc,8);                       #  5
-       &or     ("ecx",$acc);                   #  5
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 15
-       &movz   ($key,&LB("eax"));              #  4
-       &shl    ($acc,24);                      # 15
-       &or     ("ecx",$acc);                   # 15
-       # t[0] is parked in mm0; ecx is then reused to assemble t[1] from bytes 4,9,14,3
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  4
-       &movz   ($key,&LB("ebx"));              # 14
-       &movd   ("eax","mm2");                  #  7, 6, 3, 2
-       &movd   ("mm0","ecx");                  # t[0] collected
-       &movz   ("ecx",&BP(-128,$tbl,$key,1));  # 14
-       &movz   ($key,&HB("eax"));              #  3
-       &shl    ("ecx",16);                     # 14
-       &movd   ("ebx","mm6");                  # 13,12, 9, 8
-       &or     ("ecx",$acc);                   # 14
-       # remaining t[1] bytes: 3 and 9
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  3
-       &movz   ($key,&HB("ebx"));              #  9
-       &shl    ($acc,24);                      #  3
-       &or     ("ecx",$acc);                   #  3
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  9
-       &movz   ($key,&LB("ebx"));              #  8
-       &shl    ($acc,8);                       #  9
-       &shr    ("ebx",16);                     # 13,12
-       &or     ("ecx",$acc);                   #  9
-       # t[1] is parked in mm1; ecx is then reused to assemble t[2] from bytes 8,13,2,7
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  8
-       &movz   ($key,&LB("eax"));              #  2
-       &shr    ("eax",16);                     #  7, 6
-       &movd   ("mm1","ecx");                  # t[1] collected
-       &movz   ("ecx",&BP(-128,$tbl,$key,1));  #  2
-       &movz   ($key,&HB("eax"));              #  7
-       &shl    ("ecx",16);                     #  2
-       &and    ("eax",0xff);                   #  6
-       &or     ("ecx",$acc);                   #  2
-       # merge the two finished words into one MMX register: mm0 = t[1]:t[0]
-       &punpckldq      ("mm0","mm1");          # t[0,1] collected
-       # finish t[2] in ecx (bytes 7,13) and t[3] in edx (bytes 6,12)
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  7
-       &movz   ($key,&HB("ebx"));              # 13
-       &shl    ($acc,24);                      #  7
-       &and    ("ebx",0xff);                   # 12
-       &movz   ("eax",&BP(-128,$tbl,"eax",1)); #  6
-       &or     ("ecx",$acc);                   #  7
-       &shl    ("eax",16);                     #  6
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 13
-       &or     ("edx","eax");                  #  6
-       &shl    ($acc,8);                       # 13
-       &movz   ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
-       &or     ("ecx",$acc);                   # 13
-       &or     ("edx","ebx");                  # 12
-       &mov    ($key,$__key);                  # restore the $key value saved at entry
-       &movd   ("mm4","ecx");                  # t[2] collected
-       &movd   ("mm5","edx");                  # t[3] collected
-       # mm4 = t[3]:t[2]
-       &punpckldq      ("mm4","mm5");          # t[2,3] collected
-}
-
-                                       if (!$x86only) {        # this MMX/SSE routine is only generated when $x86only is false
-&function_begin_B("_sse_AES_encrypt_compact");                  # in: state in mm0 (bytes 0-7) and mm4 (bytes 8-15), $key = key schedule, $tbl = biased byte-table pointer; out: mm0/mm4
-       &pxor   ("mm0",&QWP(0,$key));   #  7, 6, 5, 4, 3, 2, 1, 0
-       &pxor   ("mm4",&QWP(8,$key));   # 15,14,13,12,11,10, 9, 8
-       # the two pxor's above apply the first 16 bytes of the key schedule to the state
-       # note that caller is expected to allocate stack frame for me!
-       &mov    ($acc,&DWP(240,$key));          # load key->rounds
-       &lea    ($acc,&DWP(-2,$acc,$acc));      # $acc = 2*rounds-2
-       &lea    ($acc,&DWP(0,$key,$acc,8));     # $acc = $key + 16*(rounds-1)
-       &mov    ($__end,$acc);                  # end of key schedule
-       # 8 bytes of 0x1b are kept at 8(%esp); the loop reloads them with movq
-       &mov    ($s0,0x1b1b1b1b);               # magic constant
-       &mov    (&DWP(8,"esp"),$s0);
-       &mov    (&DWP(12,"esp"),$s0);
-       # the loads below read one dword every 32 bytes across 256 bytes; the loaded values are not consumed
-       # prefetch Te4
-       &mov    ($s0,&DWP(0-128,$tbl));
-       &mov    ($s1,&DWP(32-128,$tbl));
-       &mov    ($s2,&DWP(64-128,$tbl));
-       &mov    ($s3,&DWP(96-128,$tbl));
-       &mov    ($s0,&DWP(128-128,$tbl));
-       &mov    ($s1,&DWP(160-128,$tbl));
-       &mov    ($s2,&DWP(192-128,$tbl));
-       &mov    ($s3,&DWP(224-128,$tbl));
-       # one iteration per round; the loop is left once $key has advanced past $__end
-       &set_label("loop",16);
-               &sse_enccompact();              # table lookups + byte permutation, result back in mm0/mm4
-               &add    ($key,16);              # step to the next 16-byte round key
-               &cmp    ($key,$__end);
-               &ja     (&label("out"));        # final round: skip the mixing step below
-               # mixing step (presumably AES MixColumns -- TODO confirm); r2 is r0 with every byte doubled and 0x1b xor-ed in where the top bit was set
-               &movq   ("mm2",&QWP(8,"esp"));  # mm2 = 0x1b in every byte
-               &pxor   ("mm3","mm3");          &pxor   ("mm7","mm7");
-               &movq   ("mm1","mm0");          &movq   ("mm5","mm4");  # r0
-               &pcmpgtb("mm3","mm0");          &pcmpgtb("mm7","mm4");  # 0xff in bytes whose top bit is set
-               &pand   ("mm3","mm2");          &pand   ("mm7","mm2");  # -> 0x1b in those bytes
-               &pshufw ("mm2","mm0",0xb1);     &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
-               &paddb  ("mm0","mm0");          &paddb  ("mm4","mm4");  # byte-wise doubling
-               &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # = r2
-               &pshufw ("mm3","mm2",0xb1);     &pshufw ("mm7","mm6",0xb1);# r0
-               &pxor   ("mm1","mm0");          &pxor   ("mm5","mm4");  # r0^r2
-               &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");  # ^= ROTATE(r0,16)
-               # pslld 8 + psrld 24 on copies of r0, both xor-ed in, form a 32-bit rotate of each dword
-               &movq   ("mm2","mm3");          &movq   ("mm6","mm7");
-               &pslld  ("mm3",8);              &pslld  ("mm7",8);
-               &psrld  ("mm2",24);             &psrld  ("mm6",24);
-               &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= r0<<8
-               &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");  # ^= r0>>24
-               # same idea for r0^r2 (psrld 8 + pslld 24); the integer loads interleaved here read the table again and their values are not consumed
-               &movq   ("mm3","mm1");          &movq   ("mm7","mm5");
-               &movq   ("mm2",&QWP(0,$key));   &movq   ("mm6",&QWP(8,$key));   # fetch this round's key
-               &psrld  ("mm1",8);              &psrld  ("mm5",8);
-               &mov    ($s0,&DWP(0-128,$tbl));
-               &pslld  ("mm3",24);             &pslld  ("mm7",24);
-               &mov    ($s1,&DWP(64-128,$tbl));
-               &pxor   ("mm0","mm1");          &pxor   ("mm4","mm5");  # ^= (r2^r0)<<8
-               &mov    ($s2,&DWP(128-128,$tbl));
-               &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= (r2^r0)>>24
-               &mov    ($s3,&DWP(192-128,$tbl));
-               # xor in the round key fetched into mm2/mm6 above
-               &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");
-       &jmp    (&label("loop"));
-       # last round: no mixing step, only the xor with the 16 key bytes at $key
-       &set_label("out",16);
-       &pxor   ("mm0",&QWP(0,$key));
-       &pxor   ("mm4",&QWP(8,$key));
-       # result is returned in mm0/mm4
-       &ret    ();
-&function_end_B("_sse_AES_encrypt_compact");
-                                       }
-
-######################################################################
-# Vanilla block function.
-######################################################################
-
-sub encstep()  # emits the code computing one output word of a full (non-final) round from four table lookups
-{ my ($i,$te,@s) = @_;         # $i: word index 0..3, $te: table base register, @s: the four state registers, rotated by the caller so $s[0] supplies the low byte
-  my $tmp = $key;              # $key serves as the scratch index register for $i<3
-  my $out = $i==3?$s[0]:$acc;  # the word is built in $acc, except for $i==3 where it is built in place in $s[0]
-       # table entries are addressed with scale 8; byte offsets 0,3,2,1 into an entry are used for the four lookups
-       # lines marked with #%e?x[i] denote "reordered" instructions...
-       if ($i==3)  {   &mov    ($key,$__key);                  }##%edx
-       else        {   &mov    ($out,$s[0]);
-                       &and    ($out,0xFF);                    }
-       if ($i==1)  {   &shr    ($s[0],16);                     }#%ebx[1]
-       if ($i==2)  {   &shr    ($s[0],24);                     }#%ecx[2]
-                       &mov    ($out,&DWP(0,$te,$out,8));
-       # second lookup: indexed by bits 8..15 of $s[1], entry read at byte offset 3
-       if ($i==3)  {   $tmp=$s[1];                             }##%eax
-                       &movz   ($tmp,&HB($s[1]));
-                       &xor    ($out,&DWP(3,$te,$tmp,8));
-       # third lookup: indexed by bits 16..23 of $s[2], entry read at byte offset 2; for $i==3 $s[1] is reloaded from $__s0
-       if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],$__s0);         }##%ebx
-       else        {   &mov    ($tmp,$s[2]);
-                       &shr    ($tmp,16);                      }
-       if ($i==2)  {   &and    ($s[1],0xFF);                   }#%edx[2]
-                       &and    ($tmp,0xFF);
-                       &xor    ($out,&DWP(2,$te,$tmp,8));
-       # fourth lookup: indexed by the top byte of $s[3], entry read at byte offset 1; for $i==3 $s[2] is reloaded from $__s1
-       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],$__s1);         }##%ecx
-       elsif($i==2){   &movz   ($tmp,&HB($s[3]));              }#%ebx[2]
-       else        {   &mov    ($tmp,$s[3]);
-                       &shr    ($tmp,24)                       }
-                       &xor    ($out,&DWP(1,$te,$tmp,8));
-       if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }       # words 0 and 1 are spilled to 4(%esp) and 8(%esp) -- presumably the $__s0/$__s1 slots, confirm
-       if ($i==3)  {   &mov    ($s[3],$acc);                   }       # $acc still holds the word built by the $i==2 step
-                       &comment();
-}
-
-sub enclast()  # emits the code computing one output word of the final round: four table lookups, each masked down to a single byte lane
-{ my ($i,$te,@s)=@_;           # $i: word index 0..3, $te: table base register, @s: the four state registers, rotated by the caller
-  my $tmp = $key;              # $key serves as the scratch index register for $i<3
-  my $out = $i==3?$s[0]:$acc;  # the word is built in $acc, except for $i==3 where it is built in place in $s[0]
-       # lane 0: indexed by the low byte of $s[0]; entry read at byte offset 2 and masked to bits 0..7
-       if ($i==3)  {   &mov    ($key,$__key);                  }##%edx
-       else        {   &mov    ($out,$s[0]);                   }
-                       &and    ($out,0xFF);
-       if ($i==1)  {   &shr    ($s[0],16);                     }#%ebx[1]
-       if ($i==2)  {   &shr    ($s[0],24);                     }#%ecx[2]
-                       &mov    ($out,&DWP(2,$te,$out,8));
-                       &and    ($out,0x000000ff);
-       # lane 1: indexed by bits 8..15 of $s[1]; entry read at offset 0 and masked to bits 8..15
-       if ($i==3)  {   $tmp=$s[1];                             }##%eax
-                       &movz   ($tmp,&HB($s[1]));
-                       &mov    ($tmp,&DWP(0,$te,$tmp,8));
-                       &and    ($tmp,0x0000ff00);
-                       &xor    ($out,$tmp);
-       # lane 2: indexed by bits 16..23 of $s[2]; entry read at offset 0 and masked to bits 16..23
-       if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],$__s0);         }##%ebx
-       else        {   &mov    ($tmp,$s[2]);
-                       &shr    ($tmp,16);                      }
-       if ($i==2)  {   &and    ($s[1],0xFF);                   }#%edx[2]
-                       &and    ($tmp,0xFF);
-                       &mov    ($tmp,&DWP(0,$te,$tmp,8));
-                       &and    ($tmp,0x00ff0000);
-                       &xor    ($out,$tmp);
-       # lane 3: indexed by the top byte of $s[3]; entry read at offset 2 and masked to bits 24..31
-       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],$__s1);         }##%ecx
-       elsif($i==2){   &movz   ($tmp,&HB($s[3]));              }#%ebx[2]
-       else        {   &mov    ($tmp,$s[3]);
-                       &shr    ($tmp,24);                      }
-                       &mov    ($tmp,&DWP(2,$te,$tmp,8));
-                       &and    ($tmp,0xff000000);
-                       &xor    ($out,$tmp);
-       if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }       # words 0 and 1 are spilled to 4(%esp) and 8(%esp)
-       if ($i==3)  {   &mov    ($s[3],$acc);                   }       # $acc still holds the word built by the $i==2 step
-}
-
-&function_begin_B("_x86_AES_encrypt");                 # in: state words in $s0..$s3, $key = key schedule, $tbl = table base; out: $s0..$s3
-       if ($vertical_spin) {
-               # I need high parts of volatile registers to be accessible...
-               &exch   ($s1="edi",$key="ebx");        # also rebinds the Perl variables $s1 and $key for the code emitted below
-               &mov    ($s2="esi",$acc="ecx");        # likewise rebinds $s2 and $acc
-       }
-       # $__key is reloaded into $key by encstep/enclast for $i==3, so it must always hold the current pointer
-       # note that caller is expected to allocate stack frame for me!
-       &mov    ($__key,$key);                  # save key
-       # initial round-key xor
-       &xor    ($s0,&DWP(0,$key));             # xor with key
-       &xor    ($s1,&DWP(4,$key));
-       &xor    ($s2,&DWP(8,$key));
-       &xor    ($s3,&DWP(12,$key));
-       # the round count is read from byte offset 240 of the key structure
-       &mov    ($acc,&DWP(240,$key));          # load key->rounds
-       # two code shapes: a compact loop ($small_footprint) or fully unrolled rounds with entry labels per key size
-       if ($small_footprint) {
-           &lea        ($acc,&DWP(-2,$acc,$acc));      # $acc = 2*rounds-2
-           &lea        ($acc,&DWP(0,$key,$acc,8));     # $acc = $key + 16*(rounds-1)
-           &mov        ($__end,$acc);          # end of key schedule
-           # loop body = one full round followed by the xor with the next round key
-           &set_label("loop",16);
-               if ($vertical_spin) {
-                   &encvert($tbl,$s0,$s1,$s2,$s3);
-               } else {
-                   &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-                   &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-                   &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-                   &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-               }
-               &add    ($key,16);              # advance rd_key
-               &xor    ($s0,&DWP(0,$key));
-               &xor    ($s1,&DWP(4,$key));
-               &xor    ($s2,&DWP(8,$key));
-               &xor    ($s3,&DWP(12,$key));
-           &cmp        ($key,$__end);
-           &mov        ($__key,$key);          # keep the saved copy in sync (mov does not disturb the flags set by cmp)
-           &jb         (&label("loop"));       # repeat while $key is below $__end
-       }
-       else {
-           &cmp        ($acc,10);              # rounds <= 10: enter at "10rounds"
-           &jle        (&label("10rounds"));
-           &cmp        ($acc,12);              # rounds <= 12: enter at "12rounds"
-           &jle        (&label("12rounds"));
-           # otherwise fall through: two extra rounds, then "12rounds", then "10rounds"
-       &set_label("14rounds",4);
-           for ($i=1;$i<3;$i++) {
-               if ($vertical_spin) {
-                   &encvert($tbl,$s0,$s1,$s2,$s3);
-               } else {
-                   &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-                   &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-                   &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-                   &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-               }
-               &xor    ($s0,&DWP(16*$i+0,$key));
-               &xor    ($s1,&DWP(16*$i+4,$key));
-               &xor    ($s2,&DWP(16*$i+8,$key));
-               &xor    ($s3,&DWP(16*$i+12,$key));
-           }
-           &add        ($key,32);              # two round keys consumed
-           &mov        ($__key,$key);          # advance rd_key
-       &set_label("12rounds",4);
-           for ($i=1;$i<3;$i++) {
-               if ($vertical_spin) {
-                   &encvert($tbl,$s0,$s1,$s2,$s3);
-               } else {
-                   &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-                   &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-                   &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-                   &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-               }
-               &xor    ($s0,&DWP(16*$i+0,$key));
-               &xor    ($s1,&DWP(16*$i+4,$key));
-               &xor    ($s2,&DWP(16*$i+8,$key));
-               &xor    ($s3,&DWP(16*$i+12,$key));
-           }
-           &add        ($key,32);              # two round keys consumed
-           &mov        ($__key,$key);          # advance rd_key
-       &set_label("10rounds",4);
-           for ($i=1;$i<10;$i++) {             # nine full rounds, round keys addressed at fixed offsets from $key
-               if ($vertical_spin) {
-                   &encvert($tbl,$s0,$s1,$s2,$s3);
-               } else {
-                   &encstep(0,$tbl,$s0,$s1,$s2,$s3);
-                   &encstep(1,$tbl,$s1,$s2,$s3,$s0);
-                   &encstep(2,$tbl,$s2,$s3,$s0,$s1);
-                   &encstep(3,$tbl,$s3,$s0,$s1,$s2);
-               }
-               &xor    ($s0,&DWP(16*$i+0,$key));
-               &xor    ($s1,&DWP(16*$i+4,$key));
-               &xor    ($s2,&DWP(16*$i+8,$key));
-               &xor    ($s3,&DWP(16*$i+12,$key));
-           }
-       }
-       # final round: enclast instead of encstep, then the last round-key xor
-       if ($vertical_spin) {
-           # "reincarnate" some registers for "horizontal" spin...
-           &mov        ($s1="ebx",$key="edi");
-           &mov        ($s2="ecx",$acc="esi");
-       }
-       &enclast(0,$tbl,$s0,$s1,$s2,$s3);
-       &enclast(1,$tbl,$s1,$s2,$s3,$s0);
-       &enclast(2,$tbl,$s2,$s3,$s0,$s1);
-       &enclast(3,$tbl,$s3,$s0,$s1,$s2);
-       # the loop variant advanced $key every round, so one more 16-byte step reaches the last key; the unrolled variant skips the nine keys used at "10rounds" plus one
-       &add    ($key,$small_footprint?16:160);
-       &xor    ($s0,&DWP(0,$key));
-       &xor    ($s1,&DWP(4,$key));
-       &xor    ($s2,&DWP(8,$key));
-       &xor    ($s3,&DWP(12,$key));
-       # result is returned in $s0..$s3
-       &ret    ();
-
-&set_label("AES_Te",64);       # Yes! I keep it in the code segment!
-       &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
-       &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
-       &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
-       &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
-       &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
-       &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
-       &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
-       &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
-       &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
-       &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
-       &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
-       &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
-       &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
-       &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
-       &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
-       &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
-       &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
-       &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
-       &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
-       &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
-       &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
-       &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
-       &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
-       &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
-       &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
-       &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
-       &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
-       &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
-       &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
-       &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
-       &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
-       &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
-       &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
-       &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
-       &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
-       &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
-       &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
-       &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
-       &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
-       &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
-       &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
-       &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
-       &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
-       &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
-       &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
-       &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
-       &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
-       &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
-       &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
-       &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
-       &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
-       &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
-       &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
-       &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
-       &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
-       &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
-       &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
-       &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
-       &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
-       &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
-       &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
-       &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
-       &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
-       &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
-
-#Te4   # four copies of Te4 to choose from to avoid L1 aliasing
-       &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-       &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-       &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-       &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-       &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-       &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-       &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-       &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-       &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-       &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-       &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-       &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-       &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-       &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-       &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-       &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-       &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-       &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-       &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-       &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-       &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-       &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-       &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-       &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-       &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-       &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-       &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-       &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-       &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-       &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-       &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-       &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-       &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-       &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-       &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-       &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-       &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-       &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-       &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-       &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-       &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-       &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-       &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-       &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-       &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-       &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-       &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-       &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-       &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-       &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-       &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-       &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-       &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-       &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-       &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-       &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-       &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-       &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-       &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-       &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-       &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-       &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-       &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-       &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-       &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-       &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-       &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-       &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-       &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-       &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-       &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-       &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-       &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-       &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-       &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-       &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-       &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-       &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-       &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-       &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-       &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-       &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-       &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-       &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-       &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-       &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-       &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-       &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-       &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-       &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-       &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-       &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-       &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-       &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-       &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-       &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-       &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-       &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-       &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-       &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-       &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-       &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-       &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-       &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-       &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-       &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-       &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-       &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-       &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-       &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-       &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-       &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-       &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-       &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-       &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-       &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-       &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-       &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-       &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-       &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-       &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-       &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-       &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-       &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-       &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-       &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-       &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-       &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-#rcon:
-       &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
-       &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
-       &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
-       &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
-&function_end_B("_x86_AES_encrypt");
-
-# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
-&function_begin("AES_encrypt");                         # public entry point: builds an aligned stack frame, picks a Te4 copy, then dispatches to the SSE or plain x86 block routine
-       &mov    ($acc,&wparam(0));              # load inp
-       &mov    ($key,&wparam(2));              # load key
-       # the original stack pointer is kept in $s0 until it can be stored in the new frame
-       &mov    ($s0,"esp");
-       &sub    ("esp",36);
-       &and    ("esp",-64);                    # align to cache-line
-       # the frame is moved down by a multiple of 64 bytes (at most 0x3C0) derived from the distance between $key and the aligned esp
-       # place stack frame just "above" the key schedule
-       &lea    ($s1,&DWP(-64-63,$key));
-       &sub    ($s1,"esp");
-       &neg    ($s1);
-       &and    ($s1,0x3C0);    # modulo 1024, but aligned to cache-line
-       &sub    ("esp",$s1);
-       &add    ("esp",4);      # 4 is reserved for caller's return address
-       &mov    ($_esp,$s0);                    # save stack pointer
-       # call/pop obtains the current code address so the table can be addressed relative to it
-       &call   (&label("pic_point"));          # make it PIC!
-       &set_label("pic_point");
-       &blindpop($tbl);
-       &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);      # $s0 = address of the capability vector tested below
-       &lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));              # $tbl = address of AES_Te
-       # 0x300 selects an offset of 0/256/512/768; 2048 skips ahead to the Te4 copies (presumably the size of the Te table -- confirm) and 128 is the bias undone by the -128 displacements in the compact code
-       # pick Te4 copy which can't "overlap" with stack frame or key schedule
-       &lea    ($s1,&DWP(768-4,"esp"));
-       &sub    ($s1,$tbl);
-       &and    ($s1,0x300);
-       &lea    ($tbl,&DWP(2048+128,$tbl,$s1));
-       # SSE path, generated only when $x86only is false: state travels in mm0/mm4
-                                       if (!$x86only) {
-       &bt     (&DWP(0,$s0),25);       # check for SSE bit
-       &jnc    (&label("x86"));        # bit clear: use the integer-only path
-       # load the 16 input bytes into mm0/mm4
-       &movq   ("mm0",&QWP(0,$acc));
-       &movq   ("mm4",&QWP(8,$acc));
-       &call   ("_sse_AES_encrypt_compact");
-       &mov    ("esp",$_esp);                  # restore stack pointer
-       &mov    ($acc,&wparam(1));              # load out
-       &movq   (&QWP(0,$acc),"mm0");           # write output data
-       &movq   (&QWP(8,$acc),"mm4");
-       &emms   ();                             # leave MMX state before returning
-       &function_end_A();                      # presumably emits the epilogue/return for this path; the function continues at "x86" below
-                                       }
-       &set_label("x86",16);                   # integer-only path: state travels in $s0..$s3
-       &mov    ($_tbl,$tbl);                   # stash the table pointer in the frame
-       &mov    ($s0,&DWP(0,$acc));             # load input data
-       &mov    ($s1,&DWP(4,$acc));
-       &mov    ($s2,&DWP(8,$acc));
-       &mov    ($s3,&DWP(12,$acc));
-       &call   ("_x86_AES_encrypt_compact");
-       &mov    ("esp",$_esp);                  # restore stack pointer
-       &mov    ($acc,&wparam(1));              # load out
-       &mov    (&DWP(0,$acc),$s0);             # write output data
-       &mov    (&DWP(4,$acc),$s1);
-       &mov    (&DWP(8,$acc),$s2);
-       &mov    (&DWP(12,$acc),$s3);
-&function_end("AES_encrypt");
-
-#--------------------------------------------------------------------#
-
-######################################################################
-# "Compact" block function
-######################################################################
-
-sub deccompact()               # emit one output word of a table-only ("compact") round
-{ my $Fn = \&mov;              # emitter used for the optional $i==3 reloads
-  while ($#_>5) { pop(@_); $Fn=sub{}; }        # extra trailing argument => $Fn emits nothing
-  my ($i,$td,@s)=@_;           # output word index, table register, four source registers
-  my $tmp = $key;              # scratch register; for $i==3 the source registers are reused instead
-  my $out = $i==3?$s[0]:$acc;  # word 3 is assembled in $s[0], the others in $acc
-       # $out = T[b0($s[0])] ^ T[b1($s[1])]<<8 ^ T[b2($s[2])]<<16 ^ T[b3($s[3])]<<24,
-       # where bN() is byte N of a register and T is the byte table at -128($td).
-       # $Fn emits the $i==3 reloads of $key, $s[2] and $s[3] from $__key, $__s1
-       # and $__s0.  A caller that passes an extra trailing argument gets a no-op
-       # $Fn, so those three reloads are not emitted at all.
-       if($i==3)   {   &$Fn    ($key,$__key);                  }
-       else        {   &mov    ($out,$s[0]);                   }
-                       &and    ($out,0xFF);
-                       &movz   ($out,&BP(-128,$td,$out,1));
-
-       if ($i==3)  {   $tmp=$s[1];                             }
-                       &movz   ($tmp,&HB($s[1]));
-                       &movz   ($tmp,&BP(-128,$td,$tmp,1));
-                       &shl    ($tmp,8);
-                       &xor    ($out,$tmp);
-
-       if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],$acc);          }       # $s[1] is consumed; it receives $acc
-       else        {   &mov    ($tmp,$s[2]);                   }
-                       &shr    ($tmp,16);
-                       &and    ($tmp,0xFF);
-                       &movz   ($tmp,&BP(-128,$td,$tmp,1));
-                       &shl    ($tmp,16);
-                       &xor    ($out,$tmp);
-
-       if ($i==3)  {   $tmp=$s[3]; &$Fn ($s[2],$__s1);         }
-       else        {   &mov    ($tmp,$s[3]);                   }
-                       &shr    ($tmp,24);
-                       &movz   ($tmp,&BP(-128,$td,$tmp,1));
-                       &shl    ($tmp,24);
-                       &xor    ($out,$tmp);
-       if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }       # words 0 and 1 go to 4(esp) and 8(esp)
-       if ($i==3)  {   &$Fn    ($s[3],$__s0);                  }
-}
-
-# must be called with 2,3,0,1 as argument sequence!!!
-sub dectransform()             # emit the in-register transform of state word $s[$i]
-{ my @s = ($s0,$s1,$s2,$s3);
-  my $i = shift;               # index of the state word to transform
-  my $tmp = $key;              # $key and $acc serve as scratch
-  my $tp2 = $s[($i+2)%4]; $tp2 = $s[2] if ($i==1);     # tp2/tp4 borrow other state registers
-  my $tp4 = $s[($i+3)%4]; $tp4 = $s[3] if ($i==1);
-  my $tp8 = $tbl;              # tp8 is built in $tbl, so the table pointer is clobbered
-       # tp2 = per-byte doubling of $s[$i]: shift left, mask with 0xfe.., xor 0x1b into bytes whose top bit was set
-       &mov    ($tmp,0x80808080);
-       &and    ($tmp,$s[$i]);
-       &mov    ($acc,$tmp);
-       &shr    ($tmp,7);
-       &lea    ($tp2,&DWP(0,$s[$i],$s[$i]));
-       &sub    ($acc,$tmp);
-       &and    ($tp2,0xfefefefe);
-       &and    ($acc,0x1b1b1b1b);
-       &xor    ($tp2,$acc);
-       &mov    ($tmp,0x80808080);
-       # tp4 = the same doubling applied to tp2
-       &and    ($tmp,$tp2);
-       &mov    ($acc,$tmp);
-       &shr    ($tmp,7);
-       &lea    ($tp4,&DWP(0,$tp2,$tp2));
-       &sub    ($acc,$tmp);
-       &and    ($tp4,0xfefefefe);
-       &and    ($acc,0x1b1b1b1b);
-        &xor   ($tp2,$s[$i]);  # tp2^tp1
-       &xor    ($tp4,$acc);
-       &mov    ($tmp,0x80808080);
-       # tp8 = the same doubling applied to tp4
-       &and    ($tmp,$tp4);
-       &mov    ($acc,$tmp);
-       &shr    ($tmp,7);
-       &lea    ($tp8,&DWP(0,$tp4,$tp4));
-       &sub    ($acc,$tmp);
-       &and    ($tp8,0xfefefefe);
-       &and    ($acc,0x1b1b1b1b);
-        &xor   ($tp4,$s[$i]);  # tp4^tp1
-        &rotl  ($s[$i],8);     # = ROTATE(tp1,8)
-       &xor    ($tp8,$acc);
-       # combine the rotated terms into $s[$i]
-       &xor    ($s[$i],$tp2);
-       &xor    ($tp2,$tp8);
-       &xor    ($s[$i],$tp4);
-       &xor    ($tp4,$tp8);
-       &rotl   ($tp2,24);
-       &xor    ($s[$i],$tp8);  # ^= tp8^(tp4^tp1)^(tp2^tp1)
-       &rotl   ($tp4,16);
-       &xor    ($s[$i],$tp2);  # ^= ROTATE(tp8^tp2^tp1,24)
-       &rotl   ($tp8,8);
-       &xor    ($s[$i],$tp4);  # ^= ROTATE(tp8^tp4^tp1,16)
-        &mov   ($s[0],$__s0)                   if($i==2); #prefetch $s0
-        &mov   ($s[1],$__s1)                   if($i==3); #prefetch $s1
-        &mov   ($s[2],$__s2)                   if($i==1);
-       &xor    ($s[$i],$tp8);  # ^= ROTATE(tp8,8)
-       # the last call ($i==1) reloads $s[2]/$s[3]; results for $i>=2 are stored at 4+4*$i(esp)
-       &mov    ($s[3],$__s3)                   if($i==1);
-       &mov    (&DWP(4+4*$i,"esp"),$s[$i])     if($i>=2);
-}
-
-&function_begin_B("_x86_AES_decrypt_compact"); # in/out: state in $s0..$s3; key schedule in $key, byte table in $tbl
-       # note that caller is expected to allocate stack frame for me!
-       &mov    ($__key,$key);                  # save key
-
-       &xor    ($s0,&DWP(0,$key));             # xor with key
-       &xor    ($s1,&DWP(4,$key));
-       &xor    ($s2,&DWP(8,$key));
-       &xor    ($s3,&DWP(12,$key));
-
-       &mov    ($acc,&DWP(240,$key));          # load key->rounds
-       # $__end = $key + 16*(rounds-1): 2*rounds-2 from the first lea, scaled by 8 in the second
-       &lea    ($acc,&DWP(-2,$acc,$acc));
-       &lea    ($acc,&DWP(0,$key,$acc,8));
-       &mov    ($__end,$acc);                  # end of key schedule
-
-       # prefetch Td4: touch every 32nd byte of the 256 bytes at -128($tbl); the loaded values are not used
-       &mov    ($key,&DWP(0-128,$tbl));
-       &mov    ($acc,&DWP(32-128,$tbl));
-       &mov    ($key,&DWP(64-128,$tbl));
-       &mov    ($acc,&DWP(96-128,$tbl));
-       &mov    ($key,&DWP(128-128,$tbl));
-       &mov    ($acc,&DWP(160-128,$tbl));
-       &mov    ($key,&DWP(192-128,$tbl));
-       &mov    ($acc,&DWP(224-128,$tbl));
-
-       &set_label("loop",16);
-               # full round: four word substitutions (the trailing 1 suppresses their reloads), then four transforms
-               &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
-               &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
-               &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
-               &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
-               &dectransform(2);
-               &dectransform(3);
-               &dectransform(0);
-               &dectransform(1);
-               &mov    ($key,$__key);          # $key and $tbl were used as scratch above
-               &mov    ($tbl,$__tbl);
-               &add    ($key,16);              # advance rd_key
-               &xor    ($s0,&DWP(0,$key));
-               &xor    ($s1,&DWP(4,$key));
-               &xor    ($s2,&DWP(8,$key));
-               &xor    ($s3,&DWP(12,$key));
-       # repeat while the advanced $key is still below $__end
-       &cmp    ($key,$__end);
-       &mov    ($__key,$key);
-       &jb     (&label("loop"));
-       # last round: substitution only, this time with the $i==3 reloads emitted
-       &deccompact(0,$tbl,$s0,$s3,$s2,$s1);
-       &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
-       &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
-       &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
-       # final key addition uses the 16 bytes following the current round key
-       &xor    ($s0,&DWP(16,$key));
-       &xor    ($s1,&DWP(20,$key));
-       &xor    ($s2,&DWP(24,$key));
-       &xor    ($s3,&DWP(28,$key));
-
-       &ret    ();
-&function_end_B("_x86_AES_decrypt_compact");
-
-######################################################################
-# "Compact" SSE block function.
-######################################################################
-
-sub sse_deccompact()           # emit byte substitution of the 16-byte state in mm0/mm4 via the table at -128($tbl)
-{      # numbers in the trailing comments are state byte indices (mm0 holds 0..7, mm4 holds 8..15)
-       &pshufw ("mm1","mm0",0x0c);             #  7, 6, 1, 0
-       &pshufw ("mm5","mm4",0x09);             # 13,12,11,10
-       &movd   ("eax","mm1");                  #  7, 6, 1, 0
-       &movd   ("ebx","mm5");                  # 13,12,11,10
-       &mov    ($__key,$key);
-       # $key was saved above: it serves as a scratch index register until it is restored near the end
-       &movz   ($acc,&LB("eax"));              #  0
-       &movz   ("edx",&HB("eax"));             #  1
-       &pshufw ("mm2","mm0",0x06);             #  3, 2, 5, 4
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
-       &movz   ($key,&LB("ebx"));              # 10
-       &movz   ("edx",&BP(-128,$tbl,"edx",1)); #  1
-       &shr    ("eax",16);                     #  7, 6
-       &shl    ("edx",8);                      #  1
-
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 10
-       &movz   ($key,&HB("ebx"));              # 11
-       &shl    ($acc,16);                      # 10
-       &pshufw ("mm6","mm4",0x03);             # 9, 8,15,14
-       &or     ("ecx",$acc);                   # 10
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 11
-       &movz   ($key,&HB("eax"));              #  7
-       &shl    ($acc,24);                      # 11
-       &shr    ("ebx",16);                     # 13,12
-       &or     ("edx",$acc);                   # 11
-
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  7
-       &movz   ($key,&HB("ebx"));              # 13
-       &shl    ($acc,24);                      #  7
-       &or     ("ecx",$acc);                   #  7
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 13
-       &movz   ($key,&LB("eax"));              #  6
-       &shl    ($acc,8);                       # 13
-       &movd   ("eax","mm2");                  #  3, 2, 5, 4
-       &or     ("ecx",$acc);                   # 13
-
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  6
-       &movz   ($key,&LB("ebx"));              # 12
-       &shl    ($acc,16);                      #  6
-       &movd   ("ebx","mm6");                  #  9, 8,15,14
-       &movd   ("mm0","ecx");                  # t[0] collected
-       &movz   ("ecx",&BP(-128,$tbl,$key,1));  # 12
-       &movz   ($key,&LB("eax"));              #  4
-       &or     ("ecx",$acc);                   # 12
-
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  4
-       &movz   ($key,&LB("ebx"));              # 14
-       &or     ("edx",$acc);                   #  4
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 14
-       &movz   ($key,&HB("eax"));              #  5
-       &shl    ($acc,16);                      # 14
-       &shr    ("eax",16);                     #  3, 2
-       &or     ("edx",$acc);                   # 14
-
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  5
-       &movz   ($key,&HB("ebx"));              # 15
-       &shr    ("ebx",16);                     #  9, 8
-       &shl    ($acc,8);                       #  5
-       &movd   ("mm1","edx");                  # t[1] collected
-       &movz   ("edx",&BP(-128,$tbl,$key,1));  # 15
-       &movz   ($key,&HB("ebx"));              #  9
-       &shl    ("edx",24);                     # 15
-       &and    ("ebx",0xff);                   #  8
-       &or     ("edx",$acc);                   # 15
-
-       &punpckldq      ("mm0","mm1");          # t[0,1] collected
-
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  9
-       &movz   ($key,&LB("eax"));              #  2
-       &shl    ($acc,8);                       #  9
-       &movz   ("eax",&HB("eax"));             #  3
-       &movz   ("ebx",&BP(-128,$tbl,"ebx",1)); #  8
-       &or     ("ecx",$acc);                   #  9
-       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  2
-       &or     ("edx","ebx");                  #  8
-       &shl    ($acc,16);                      #  2
-       &movz   ("eax",&BP(-128,$tbl,"eax",1)); #  3
-       &or     ("edx",$acc);                   #  2
-       &shl    ("eax",24);                     #  3
-       &or     ("ecx","eax");                  #  3
-       &mov    ($key,$__key);                  # restore the key pointer saved at the top
-       &movd   ("mm4","edx");                  # t[2] collected
-       &movd   ("mm5","ecx");                  # t[3] collected
-
-       &punpckldq      ("mm4","mm5");          # t[2,3] collected
-}
-
-                                       if (!$x86only) {        # generated only when $x86only is not set
-&function_begin_B("_sse_AES_decrypt_compact"); # in/out: block in mm0/mm4; key schedule in $key, byte table in $tbl
-       &pxor   ("mm0",&QWP(0,$key));   #  7, 6, 5, 4, 3, 2, 1, 0
-       &pxor   ("mm4",&QWP(8,$key));   # 15,14,13,12,11,10, 9, 8
-
-       # note that caller is expected to allocate stack frame for me!
-       &mov    ($acc,&DWP(240,$key));          # load key->rounds
-       &lea    ($acc,&DWP(-2,$acc,$acc));
-       &lea    ($acc,&DWP(0,$key,$acc,8));
-       &mov    ($__end,$acc);                  # end of key schedule
-       # 8(esp)..15(esp) receives 0x1b in every byte; the loop reads it back as one quadword
-       &mov    ($s0,0x1b1b1b1b);               # magic constant
-       &mov    (&DWP(8,"esp"),$s0);
-       &mov    (&DWP(12,"esp"),$s0);
-
-       # prefetch Td4
-       &mov    ($s0,&DWP(0-128,$tbl));
-       &mov    ($s1,&DWP(32-128,$tbl));
-       &mov    ($s2,&DWP(64-128,$tbl));
-       &mov    ($s3,&DWP(96-128,$tbl));
-       &mov    ($s0,&DWP(128-128,$tbl));
-       &mov    ($s1,&DWP(160-128,$tbl));
-       &mov    ($s2,&DWP(192-128,$tbl));
-       &mov    ($s3,&DWP(224-128,$tbl));
-
-       &set_label("loop",16);
-               &sse_deccompact();
-               &add    ($key,16);
-               &cmp    ($key,$__end);
-               &ja     (&label("out"));        # $key above $__end: leave before the transform
-
-               # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
-               &movq   ("mm3","mm0");          &movq   ("mm7","mm4");
-               &movq   ("mm2","mm0",1);        &movq   ("mm6","mm4",1);
-               &movq   ("mm1","mm0");          &movq   ("mm5","mm4");
-               &pshufw ("mm0","mm0",0xb1);     &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
-               &pslld  ("mm2",8);              &pslld  ("mm6",8);
-               &psrld  ("mm3",8);              &psrld  ("mm7",8);
-               &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");  # ^= tp0<<8
-               &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= tp0>>8
-               &pslld  ("mm2",16);             &pslld  ("mm6",16);
-               &psrld  ("mm3",16);             &psrld  ("mm7",16);
-               &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");  # ^= tp0<<24
-               &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= tp0>>24
-               # tp2: pcmpgtb against zero flags bytes with the top bit set; paddb doubles each byte and flagged bytes get 0x1b xored in
-               &movq   ("mm3",&QWP(8,"esp"));
-               &pxor   ("mm2","mm2");          &pxor   ("mm6","mm6");
-               &pcmpgtb("mm2","mm1");          &pcmpgtb("mm6","mm5");
-               &pand   ("mm2","mm3");          &pand   ("mm6","mm3");
-               &paddb  ("mm1","mm1");          &paddb  ("mm5","mm5");
-               &pxor   ("mm1","mm2");          &pxor   ("mm5","mm6");  # tp2
-               &movq   ("mm3","mm1");          &movq   ("mm7","mm5");
-               &movq   ("mm2","mm1");          &movq   ("mm6","mm5");
-               &pxor   ("mm0","mm1");          &pxor   ("mm4","mm5");  # ^= tp2
-               &pslld  ("mm3",24);             &pslld  ("mm7",24);
-               &psrld  ("mm2",8);              &psrld  ("mm6",8);
-               &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= tp2<<24
-               &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");  # ^= tp2>>8
-               # tp4: the same doubling applied to tp2, with the 0x1b constant now kept in mm2
-               &movq   ("mm2",&QWP(8,"esp"));
-               &pxor   ("mm3","mm3");          &pxor   ("mm7","mm7");
-               &pcmpgtb("mm3","mm1");          &pcmpgtb("mm7","mm5");
-               &pand   ("mm3","mm2");          &pand   ("mm7","mm2");
-               &paddb  ("mm1","mm1");          &paddb  ("mm5","mm5");
-               &pxor   ("mm1","mm3");          &pxor   ("mm5","mm7");  # tp4
-               &pshufw ("mm3","mm1",0xb1);     &pshufw ("mm7","mm5",0xb1);
-               &pxor   ("mm0","mm1");          &pxor   ("mm4","mm5");  # ^= tp4
-               &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= ROTATE(tp4,16)
-               # tp8: the same doubling applied to tp4 (mm2 still holds the 0x1b constant)
-               &pxor   ("mm3","mm3");          &pxor   ("mm7","mm7");
-               &pcmpgtb("mm3","mm1");          &pcmpgtb("mm7","mm5");
-               &pand   ("mm3","mm2");          &pand   ("mm7","mm2");
-               &paddb  ("mm1","mm1");          &paddb  ("mm5","mm5");
-               &pxor   ("mm1","mm3");          &pxor   ("mm5","mm7");  # tp8
-               &pxor   ("mm0","mm1");          &pxor   ("mm4","mm5");  # ^= tp8
-               &movq   ("mm3","mm1");          &movq   ("mm7","mm5");
-               &pshufw ("mm2","mm1",0xb1);     &pshufw ("mm6","mm5",0xb1);
-               &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");  # ^= ROTATE(tp8,16)
-               &pslld  ("mm1",8);              &pslld  ("mm5",8);
-               &psrld  ("mm3",8);              &psrld  ("mm7",8);
-               &movq   ("mm2",&QWP(0,$key));   &movq   ("mm6",&QWP(8,$key));
-               &pxor   ("mm0","mm1");          &pxor   ("mm4","mm5");  # ^= tp8<<8
-               &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= tp8>>8
-               &mov    ($s0,&DWP(0-128,$tbl)); # loaded value is not used here; presumably a table touch like the prefetch above
-               &pslld  ("mm1",16);             &pslld  ("mm5",16);
-               &mov    ($s1,&DWP(64-128,$tbl));
-               &psrld  ("mm3",16);             &psrld  ("mm7",16);
-               &mov    ($s2,&DWP(128-128,$tbl));
-               &pxor   ("mm0","mm1");          &pxor   ("mm4","mm5");  # ^= tp8<<24
-               &mov    ($s3,&DWP(192-128,$tbl));
-               &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= tp8>>24
-               # add the round key that was fetched into mm2/mm6 above
-               &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");
-       &jmp    (&label("loop"));
-
-       &set_label("out",16);
-       &pxor   ("mm0",&QWP(0,$key));           # final key addition
-       &pxor   ("mm4",&QWP(8,$key));
-
-       &ret    ();
-&function_end_B("_sse_AES_decrypt_compact");
-                                       }
-
-######################################################################
-# Vanilla block function.
-######################################################################
-
-sub decstep()                  # emit one output word of a full round using the dword table at $td
-{ my ($i,$td,@s) = @_;         # output word index, table register, four source registers
-  my $tmp = $key;              # scratch register; for $i==3 the source registers are reused instead
-  my $out = $i==3?$s[0]:$acc;  # word 3 is assembled in $s[0], the others in $acc
-       # $out = D0[b0($s[0])] ^ D3[b1($s[1])] ^ D2[b2($s[2])] ^ D1[b3($s[3])], Dk[n] = dword at k+8*n($td)
-       # no instructions are reordered, as performance appears
-       # optimal... or rather that all attempts to reorder didn't
-       # result in better performance [which by the way is not a
-       # bit lower than encryption].
-       if($i==3)   {   &mov    ($key,$__key);                  }
-       else        {   &mov    ($out,$s[0]);                   }
-                       &and    ($out,0xFF);
-                       &mov    ($out,&DWP(0,$td,$out,8));
-
-       if ($i==3)  {   $tmp=$s[1];                             }
-                       &movz   ($tmp,&HB($s[1]));
-                       &xor    ($out,&DWP(3,$td,$tmp,8));
-
-       if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],$acc);          }       # $s[1] is consumed; it receives $acc
-       else        {   &mov    ($tmp,$s[2]);                   }
-                       &shr    ($tmp,16);
-                       &and    ($tmp,0xFF);
-                       &xor    ($out,&DWP(2,$td,$tmp,8));
-
-       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],$__s1);         }
-       else        {   &mov    ($tmp,$s[3]);                   }
-                       &shr    ($tmp,24);
-                       &xor    ($out,&DWP(1,$td,$tmp,8));
-       if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }       # words 0 and 1 go to 4(esp) and 8(esp)
-       if ($i==3)  {   &mov    ($s[3],$__s0);                  }
-                       &comment();
-}
-
-sub declast()                  # emit one output word of the last round using byte-table lookups only
-{ my ($i,$td,@s)=@_;           # output word index, table register, four source registers
-  my $tmp = $key;              # scratch register; for $i==3 the source registers are reused instead
-  my $out = $i==3?$s[0]:$acc;  # word 3 is assembled in $s[0], the others in $acc
-       # $i==0 advances $td by 2048, touching every 32nd byte of the 256 bytes there; $i==3 undoes the advance
-       if($i==0)   {   &lea    ($td,&DWP(2048+128,$td));
-                       &mov    ($tmp,&DWP(0-128,$td));
-                       &mov    ($acc,&DWP(32-128,$td));
-                       &mov    ($tmp,&DWP(64-128,$td));
-                       &mov    ($acc,&DWP(96-128,$td));
-                       &mov    ($tmp,&DWP(128-128,$td));
-                       &mov    ($acc,&DWP(160-128,$td));
-                       &mov    ($tmp,&DWP(192-128,$td));
-                       &mov    ($acc,&DWP(224-128,$td));
-                       &lea    ($td,&DWP(-128,$td));           }
-       if($i==3)   {   &mov    ($key,$__key);                  }
-       else        {   &mov    ($out,$s[0]);                   }
-                       &and    ($out,0xFF);
-                       &movz   ($out,&BP(0,$td,$out,1));
-
-       if ($i==3)  {   $tmp=$s[1];                             }
-                       &movz   ($tmp,&HB($s[1]));
-                       &movz   ($tmp,&BP(0,$td,$tmp,1));
-                       &shl    ($tmp,8);
-                       &xor    ($out,$tmp);
-
-       if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],$acc);          }       # $s[1] is consumed; it receives $acc
-       else        {   &mov    ($tmp,$s[2]);                   }
-                       &shr    ($tmp,16);
-                       &and    ($tmp,0xFF);
-                       &movz   ($tmp,&BP(0,$td,$tmp,1));
-                       &shl    ($tmp,16);
-                       &xor    ($out,$tmp);
-
-       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],$__s1);         }
-       else        {   &mov    ($tmp,$s[3]);                   }
-                       &shr    ($tmp,24);
-                       &movz   ($tmp,&BP(0,$td,$tmp,1));
-                       &shl    ($tmp,24);
-                       &xor    ($out,$tmp);
-       if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }       # words 0 and 1 go to 4(esp) and 8(esp)
-       if ($i==3)  {   &mov    ($s[3],$__s0);
-                       &lea    ($td,&DWP(-2048,$td));          }
-}
-
-&function_begin_B("_x86_AES_decrypt");
-       # note that caller is expected to allocate stack frame for me!
-       &mov    ($__key,$key);                  # save key
-
-       &xor    ($s0,&DWP(0,$key));             # xor with key
-       &xor    ($s1,&DWP(4,$key));
-       &xor    ($s2,&DWP(8,$key));
-       &xor    ($s3,&DWP(12,$key));
-
-       &mov    ($acc,&DWP(240,$key));          # load key->rounds
-
-       if ($small_footprint) {
-           &lea        ($acc,&DWP(-2,$acc,$acc));
-           &lea        ($acc,&DWP(0,$key,$acc,8));
-           &mov        ($__end,$acc);          # end of key schedule
-           &set_label("loop",16);
-               &decstep(0,$tbl,$s0,$s3,$s2,$s1);
-               &decstep(1,$tbl,$s1,$s0,$s3,$s2);
-               &decstep(2,$tbl,$s2,$s1,$s0,$s3);
-               &decstep(3,$tbl,$s3,$s2,$s1,$s0);
-               &add    ($key,16);              # advance rd_key
-               &xor    ($s0,&DWP(0,$key));
-               &xor    ($s1,&DWP(4,$key));
-               &xor    ($s2,&DWP(8,$key));
-               &xor    ($s3,&DWP(12,$key));
-           &cmp        ($key,$__end);
-           &mov        ($__key,$key);
-           &jb         (&label("loop"));
-       }
-       else {
-           &cmp        ($acc,10);
-           &jle        (&label("10rounds"));
-           &cmp        ($acc,12);
-           &jle        (&label("12rounds"));
-
-       &set_label("14rounds",4);
-           for ($i=1;$i<3;$i++) {
-               &decstep(0,$tbl,$s0,$s3,$s2,$s1);
-               &decstep(1,$tbl,$s1,$s0,$s3,$s2);
-               &decstep(2,$tbl,$s2,$s1,$s0,$s3);
-               &decstep(3,$tbl,$s3,$s2,$s1,$s0);
-               &xor    ($s0,&DWP(16*$i+0,$key));
-               &xor    ($s1,&DWP(16*$i+4,$key));
-               &xor    ($s2,&DWP(16*$i+8,$key));
-               &xor    ($s3,&DWP(16*$i+12,$key));
-           }
-           &add        ($key,32);
-           &mov        ($__key,$key);          # advance rd_key
-       &set_label("12rounds",4);
-           for ($i=1;$i<3;$i++) {
-               &decstep(0,$tbl,$s0,$s3,$s2,$s1);
-               &decstep(1,$tbl,$s1,$s0,$s3,$s2);
-               &decstep(2,$tbl,$s2,$s1,$s0,$s3);
-               &decstep(3,$tbl,$s3,$s2,$s1,$s0);
-               &xor    ($s0,&DWP(16*$i+0,$key));
-               &xor    ($s1,&DWP(16*$i+4,$key));
-               &xor    ($s2,&DWP(16*$i+8,$key));
-               &xor    ($s3,&DWP(16*$i+12,$key));
-           }
-           &add        ($key,32);
-           &mov        ($__key,$key);          # advance rd_key
-       &set_label("10rounds",4);
-           for ($i=1;$i<10;$i++) {
-               &decstep(0,$tbl,$s0,$s3,$s2,$s1);
-               &decstep(1,$tbl,$s1,$s0,$s3,$s2);
-               &decstep(2,$tbl,$s2,$s1,$s0,$s3);
-               &decstep(3,$tbl,$s3,$s2,$s1,$s0);
-               &xor    ($s0,&DWP(16*$i+0,$key));
-               &xor    ($s1,&DWP(16*$i+4,$key));
-               &xor    ($s2,&DWP(16*$i+8,$key));
-               &xor    ($s3,&DWP(16*$i+12,$key));
-           }
-       }
-
-       &declast(0,$tbl,$s0,$s3,$s2,$s1);
-       &declast(1,$tbl,$s1,$s0,$s3,$s2);
-       &declast(2,$tbl,$s2,$s1,$s0,$s3);
-       &declast(3,$tbl,$s3,$s2,$s1,$s0);
-
-       &add    ($key,$small_footprint?16:160);
-       &xor    ($s0,&DWP(0,$key));
-       &xor    ($s1,&DWP(4,$key));
-       &xor    ($s2,&DWP(8,$key));
-       &xor    ($s3,&DWP(12,$key));
-
-       &ret    ();
-
-&set_label("AES_Td",64);       # Yes! I keep it in the code segment!
-       &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
-       &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
-       &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
-       &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
-       &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
-       &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
-       &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
-       &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
-       &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
-       &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
-       &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
-       &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
-       &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
-       &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
-       &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
-       &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
-       &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
-       &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
-       &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
-       &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
-       &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
-       &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
-       &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
-       &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
-       &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
-       &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
-       &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
-       &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
-       &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
-       &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
-       &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
-       &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
-       &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
-       &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
-       &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
-       &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
-       &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
-       &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
-       &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
-       &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
-       &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
-       &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
-       &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
-       &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
-       &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
-       &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
-       &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
-       &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
-       &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
-       &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
-       &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
-       &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
-       &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
-       &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
-       &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
-       &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
-       &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
-       &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
-       &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
-       &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
-       &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
-       &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
-       &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
-       &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
-
-#Td4:  # four copies of Td4 to choose from to avoid L1 aliasing
-       &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-       &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-       &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-       &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-       &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-       &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-       &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-       &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-       &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-       &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-       &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-       &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-       &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-       &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-       &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-       &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-       &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-       &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-       &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-       &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-       &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-       &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-       &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-       &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-       &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-       &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-       &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-       &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-       &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-       &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-       &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-       &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-
-       &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-       &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-       &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-       &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-       &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-       &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-       &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-       &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-       &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-       &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-       &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-       &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-       &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-       &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-       &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-       &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-       &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-       &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-       &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-       &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-       &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-       &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-       &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-       &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-       &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-       &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-       &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-       &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-       &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-       &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-       &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-       &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-
-       &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-       &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-       &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-       &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-       &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-       &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-       &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-       &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-       &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-       &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-       &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-       &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-       &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-       &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-       &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-       &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-       &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-       &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-       &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-       &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-       &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-       &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-       &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-       &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-       &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-       &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-       &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-       &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-       &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-       &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-       &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-       &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-
-       &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-       &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-       &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-       &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-       &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-       &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-       &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-       &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-       &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-       &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-       &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-       &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-       &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-       &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-       &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-       &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-       &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-       &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-       &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-       &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-       &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-       &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-       &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-       &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-       &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-       &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-       &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-       &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-       &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-       &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-       &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-       &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-&function_end_B("_x86_AES_decrypt");
-
-# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);   decrypts the 16 bytes at inp into out
-&function_begin("AES_decrypt");
-       &mov    ($acc,&wparam(0));              # load inp
-       &mov    ($key,&wparam(2));              # load key
-
-       &mov    ($s0,"esp");                    # remember the incoming stack pointer
-       &sub    ("esp",36);
-       &and    ("esp",-64);                    # align to cache-line
-
-       # place stack frame just "above" the key schedule
-       &lea    ($s1,&DWP(-64-63,$key));
-       &sub    ($s1,"esp");
-       &neg    ($s1);
-       &and    ($s1,0x3C0);    # modulo 1024, but aligned to cache-line
-       &sub    ("esp",$s1);
-       &add    ("esp",4);      # 4 is reserved for caller's return address
-       &mov    ($_esp,$s0);    # save stack pointer
-
-       &call   (&label("pic_point"));          # make it PIC!
-       &set_label("pic_point");
-       &blindpop($tbl);                        # $tbl = run-time address of pic_point
-       &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);      # $s0 -> capability word, tested below
-       &lea    ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));     # $tbl = address of AES_Td
-
-       # pick Td4 copy which can't "overlap" with stack frame or key schedule
-       &lea    ($s1,&DWP(768-4,"esp"));
-       &sub    ($s1,$tbl);
-       &and    ($s1,0x300);
-       &lea    ($tbl,&DWP(2048+128,$tbl,$s1));
-
-                                       if (!$x86only) {
-       &bt     (&DWP(0,$s0),25);       # check for SSE bit
-       &jnc    (&label("x86"));        # bit clear: use the integer-only path
-
-       &movq   ("mm0",&QWP(0,$acc));   # load the 16 input bytes into mm0/mm4
-       &movq   ("mm4",&QWP(8,$acc));
-       &call   ("_sse_AES_decrypt_compact");
-       &mov    ("esp",$_esp);                  # restore stack pointer
-       &mov    ($acc,&wparam(1));              # load out
-       &movq   (&QWP(0,$acc),"mm0");           # write output data
-       &movq   (&QWP(8,$acc),"mm4");
-       &emms   ();                             # leave MMX state before returning
-       &function_end_A();                      # return here; the function body continues below
-                                       }
-       &set_label("x86",16);
-       &mov    ($_tbl,$tbl);                   # save table pointer in the frame
-       &mov    ($s0,&DWP(0,$acc));             # load input data
-       &mov    ($s1,&DWP(4,$acc));
-       &mov    ($s2,&DWP(8,$acc));
-       &mov    ($s3,&DWP(12,$acc));
-       &call   ("_x86_AES_decrypt_compact");
-       &mov    ("esp",$_esp);                  # restore stack pointer
-       &mov    ($acc,&wparam(1));              # load out
-       &mov    (&DWP(0,$acc),$s0);             # write output data
-       &mov    (&DWP(4,$acc),$s1);
-       &mov    (&DWP(8,$acc),$s2);
-       &mov    (&DWP(12,$acc),$s3);
-&function_end("AES_decrypt");
-
-# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
-#                      size_t length, const AES_KEY *key,
-#                      unsigned char *ivp,const int enc);
-{      # lexical scope for the frame-slot variables used by AES_cbc_encrypt
-# stack frame layout
-#             -4(%esp)         # return address         0(%esp)
-#              0(%esp)         # s0 backing store       4(%esp)
-#              4(%esp)         # s1 backing store       8(%esp)
-#              8(%esp)         # s2 backing store      12(%esp)
-#             12(%esp)         # s3 backing store      16(%esp)
-#             16(%esp)         # key backup            20(%esp)
-#             20(%esp)         # end of key schedule   24(%esp)
-#             24(%esp)         # %ebp backup           28(%esp)
-#             28(%esp)         # %esp backup
-my $_inp=&DWP(32,"esp");       # copy of wparam(0)
-my $_out=&DWP(36,"esp");       # copy of wparam(1)
-my $_len=&DWP(40,"esp");       # copy of wparam(2)
-my $_key=&DWP(44,"esp");       # copy of wparam(3)
-my $_ivp=&DWP(48,"esp");       # copy of wparam(4)
-my $_tmp=&DWP(52,"esp");       # volatile variable
-#
-my $ivec=&DWP(60,"esp");       # ivec[16]
-my $aes_key=&DWP(76,"esp");    # copy of aes_key
-my $mark=&DWP(76+240,"esp");   # copy of aes_key->rounds
-
-&function_begin("AES_cbc_encrypt");    # CBC-process len bytes from inp to out and update the iv at ivp
-       &mov    ($s2 eq "ecx"? $s2 : "",&wparam(2));    # load len
-       &cmp    ($s2,0);
-       &je     (&label("drop_out"));           # len==0: nothing to do
-
-       &call   (&label("pic_point"));          # make it PIC!
-       &set_label("pic_point");
-       &blindpop($tbl);                        # $tbl = run-time address of pic_point
-       &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
-
-       &cmp    (&wparam(5),0);                 # enc flag
-       &lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));     # start with $tbl = AES_Te
-       &jne    (&label("picked_te"));          # enc!=0: keep AES_Te
-       &lea    ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));        # enc==0: switch to AES_Td
-       &set_label("picked_te");
-
-       # one can argue if this is required
-       &pushf  ();
-       &cld    ();
-
-       &cmp    ($s2,$speed_limit);
-       &jb     (&label("slow_way"));           # len below $speed_limit
-       &test   ($s2,15);
-       &jnz    (&label("slow_way"));           # len is not a multiple of 16
-                                       if (!$x86only) {
-       &bt     (&DWP(0,$s0),28);       # check for hyper-threading bit
-       &jc     (&label("slow_way"));
-                                       }
-       # pre-allocate aligned stack frame...
-       &lea    ($acc,&DWP(-80-244,"esp"));
-       &and    ($acc,-64);
-
-       # ... and make sure it doesn't alias with $tbl modulo 4096
-       &mov    ($s0,$tbl);
-       &lea    ($s1,&DWP(2048+256,$tbl));
-       &mov    ($s3,$acc);
-       &and    ($s0,0xfff);            # s = %ebp&0xfff
-       &and    ($s1,0xfff);            # e = (%ebp+2048+256)&0xfff
-       &and    ($s3,0xfff);            # p = %esp&0xfff
-
-       &cmp    ($s3,$s1);              # if (p>=e) %esp -= (p-e);
-       &jb     (&label("tbl_break_out"));
-       &sub    ($s3,$s1);
-       &sub    ($acc,$s3);
-       &jmp    (&label("tbl_ok"));
-       &set_label("tbl_break_out",4);  # else %esp -= (p-s)&0xfff + framesz;
-       &sub    ($s3,$s0);
-       &and    ($s3,0xfff);
-       &add    ($s3,384);              # the "framesz" term of the formula above
-       &sub    ($acc,$s3);
-       &set_label("tbl_ok",4);
-
-       &lea    ($s3,&wparam(0));       # obtain pointer to parameter block
-       &exch   ("esp",$acc);           # allocate stack frame
-       &add    ("esp",4);              # reserve for return address!
-       &mov    ($_tbl,$tbl);           # save %ebp
-       &mov    ($_esp,$acc);           # save %esp
-
-       &mov    ($s0,&DWP(0,$s3));      # load inp
-       &mov    ($s1,&DWP(4,$s3));      # load out
-       #&mov   ($s2,&DWP(8,$s3));      # load len
-       &mov    ($key,&DWP(12,$s3));    # load key
-       &mov    ($acc,&DWP(16,$s3));    # load ivp
-       &mov    ($s3,&DWP(20,$s3));     # load enc flag
-
-       &mov    ($_inp,$s0);            # save copy of inp
-       &mov    ($_out,$s1);            # save copy of out
-       &mov    ($_len,$s2);            # save copy of len
-       &mov    ($_key,$key);           # save copy of key
-       &mov    ($_ivp,$acc);           # save copy of ivp
-
-       &mov    ($mark,0);              # copy of aes_key->rounds = 0;
-       # do we copy key schedule to stack?
-       &mov    ($s1 eq "ebx" ? $s1 : "",$key);
-       &mov    ($s2 eq "ecx" ? $s2 : "",244/4);        # dword count for the rep movsd below
-       &sub    ($s1,$tbl);
-       &mov    ("esi",$key);
-       &and    ($s1,0xfff);            # (key - tbl) modulo 4096
-       &lea    ("edi",$aes_key);
-       &cmp    ($s1,2048+256);
-       &jb     (&label("do_copy"));    # offset below 2048+256: copy
-       &cmp    ($s1,4096-244);
-       &jb     (&label("skip_copy"));  # offset in [2048+256, 4096-244): use key in place
-       &set_label("do_copy",4);
-               &mov    ($_key,"edi");  # use the on-stack copy from now on
-               &data_word(0xA5F3F689); # rep movsd
-       &set_label("skip_copy");
-
-       &mov    ($key,16);              # 16 iterations of 128 bytes = 2048 bytes touched
-       &set_label("prefetch_tbl",4);
-               &mov    ($s0,&DWP(0,$tbl));
-               &mov    ($s1,&DWP(32,$tbl));
-               &mov    ($s2,&DWP(64,$tbl));
-               &mov    ($acc,&DWP(96,$tbl));
-               &lea    ($tbl,&DWP(128,$tbl));
-               &sub    ($key,1);
-       &jnz    (&label("prefetch_tbl"));
-       &sub    ($tbl,2048);            # rewind $tbl to its value before the loop
-
-       &mov    ($acc,$_inp);
-       &mov    ($key,$_ivp);
-
-       &cmp    ($s3,0);                # enc flag
-       &je     (&label("fast_decrypt"));
-
-#----------------------------- ENCRYPT -----------------------------#
-       &mov    ($s0,&DWP(0,$key));             # load iv
-       &mov    ($s1,&DWP(4,$key));
-
-       &set_label("fast_enc_loop",16);
-               &mov    ($s2,&DWP(8,$key));
-               &mov    ($s3,&DWP(12,$key));
-
-               &xor    ($s0,&DWP(0,$acc));     # xor input data
-               &xor    ($s1,&DWP(4,$acc));
-               &xor    ($s2,&DWP(8,$acc));
-               &xor    ($s3,&DWP(12,$acc));
-
-               &mov    ($key,$_key);           # load key
-               &call   ("_x86_AES_encrypt");
-
-               &mov    ($acc,$_inp);           # load inp
-               &mov    ($key,$_out);           # load out
-
-               &mov    (&DWP(0,$key),$s0);     # save output data
-               &mov    (&DWP(4,$key),$s1);
-               &mov    (&DWP(8,$key),$s2);
-               &mov    (&DWP(12,$key),$s3);
-
-               &lea    ($acc,&DWP(16,$acc));   # advance inp
-               &mov    ($s2,$_len);            # load len
-               &mov    ($_inp,$acc);           # save inp
-               &lea    ($s3,&DWP(16,$key));    # advance out
-               &mov    ($_out,$s3);            # save out
-               &sub    ($s2,16);               # decrease len
-               &mov    ($_len,$s2);            # save len
-       &jnz    (&label("fast_enc_loop"));      # flags still come from the sub above
-       &mov    ($acc,$_ivp);           # load ivp
-       &mov    ($s2,&DWP(8,$key));     # restore last 2 dwords
-       &mov    ($s3,&DWP(12,$key));
-       &mov    (&DWP(0,$acc),$s0);     # save ivec
-       &mov    (&DWP(4,$acc),$s1);
-       &mov    (&DWP(8,$acc),$s2);
-       &mov    (&DWP(12,$acc),$s3);
-
-       &cmp    ($mark,0);              # was the key schedule copied?
-       &mov    ("edi",$_key);
-       &je     (&label("skip_ezero"));
-       # zero copy of key schedule
-       &mov    ("ecx",240/4);
-       &xor    ("eax","eax");
-       &align  (4);
-       &data_word(0xABF3F689);         # rep stosd
-       &set_label("skip_ezero");
-       &mov    ("esp",$_esp);
-       &popf   ();
-    &set_label("drop_out");
-       &function_end_A();
-       &pushf  ();                     # kludge, never executed
-
-#----------------------------- DECRYPT -----------------------------#
-&set_label("fast_decrypt",16);
-
-       &cmp    ($acc,$_out);
-       &je     (&label("fast_dec_in_place"));  # in-place processing...
-
-       &mov    ($_tmp,$key);           # $_tmp holds the pointer to the current iv
-
-       &align  (4);
-       &set_label("fast_dec_loop",16);
-               &mov    ($s0,&DWP(0,$acc));     # read input
-               &mov    ($s1,&DWP(4,$acc));
-               &mov    ($s2,&DWP(8,$acc));
-               &mov    ($s3,&DWP(12,$acc));
-
-               &mov    ($key,$_key);           # load key
-               &call   ("_x86_AES_decrypt");
-
-               &mov    ($key,$_tmp);           # load ivp
-               &mov    ($acc,$_len);           # load len (unused: $acc is reloaded with inp below)
-               &xor    ($s0,&DWP(0,$key));     # xor iv
-               &xor    ($s1,&DWP(4,$key));
-               &xor    ($s2,&DWP(8,$key));
-               &xor    ($s3,&DWP(12,$key));
-
-               &mov    ($key,$_out);           # load out
-               &mov    ($acc,$_inp);           # load inp
-
-               &mov    (&DWP(0,$key),$s0);     # write output
-               &mov    (&DWP(4,$key),$s1);
-               &mov    (&DWP(8,$key),$s2);
-               &mov    (&DWP(12,$key),$s3);
-
-               &mov    ($s2,$_len);            # load len
-               &mov    ($_tmp,$acc);           # save ivp (the input block just processed is the next iv)
-               &lea    ($acc,&DWP(16,$acc));   # advance inp
-               &mov    ($_inp,$acc);           # save inp
-               &lea    ($key,&DWP(16,$key));   # advance out
-               &mov    ($_out,$key);           # save out
-               &sub    ($s2,16);               # decrease len
-               &mov    ($_len,$s2);            # save len
-       &jnz    (&label("fast_dec_loop"));
-       &mov    ($key,$_tmp);           # load temp ivp
-       &mov    ($acc,$_ivp);           # load user ivp
-       &mov    ($s0,&DWP(0,$key));     # load iv
-       &mov    ($s1,&DWP(4,$key));
-       &mov    ($s2,&DWP(8,$key));
-       &mov    ($s3,&DWP(12,$key));
-       &mov    (&DWP(0,$acc),$s0);     # copy back to user
-       &mov    (&DWP(4,$acc),$s1);
-       &mov    (&DWP(8,$acc),$s2);
-       &mov    (&DWP(12,$acc),$s3);
-       &jmp    (&label("fast_dec_out"));
-
-    &set_label("fast_dec_in_place",16);
-       &set_label("fast_dec_in_place_loop");
-               &mov    ($s0,&DWP(0,$acc));     # read input
-               &mov    ($s1,&DWP(4,$acc));
-               &mov    ($s2,&DWP(8,$acc));
-               &mov    ($s3,&DWP(12,$acc));
-
-               &lea    ($key,$ivec);
-               &mov    (&DWP(0,$key),$s0);     # copy to temp
-               &mov    (&DWP(4,$key),$s1);
-               &mov    (&DWP(8,$key),$s2);
-               &mov    (&DWP(12,$key),$s3);
-
-               &mov    ($key,$_key);           # load key
-               &call   ("_x86_AES_decrypt");
-
-               &mov    ($key,$_ivp);           # load ivp
-               &mov    ($acc,$_out);           # load out
-               &xor    ($s0,&DWP(0,$key));     # xor iv
-               &xor    ($s1,&DWP(4,$key));
-               &xor    ($s2,&DWP(8,$key));
-               &xor    ($s3,&DWP(12,$key));
-
-               &mov    (&DWP(0,$acc),$s0);     # write output
-               &mov    (&DWP(4,$acc),$s1);
-               &mov    (&DWP(8,$acc),$s2);
-               &mov    (&DWP(12,$acc),$s3);
-
-               &lea    ($acc,&DWP(16,$acc));   # advance out
-               &mov    ($_out,$acc);           # save out
-
-               &lea    ($acc,$ivec);
-               &mov    ($s0,&DWP(0,$acc));     # read temp
-               &mov    ($s1,&DWP(4,$acc));
-               &mov    ($s2,&DWP(8,$acc));
-               &mov    ($s3,&DWP(12,$acc));
-
-               &mov    (&DWP(0,$key),$s0);     # copy iv
-               &mov    (&DWP(4,$key),$s1);
-               &mov    (&DWP(8,$key),$s2);
-               &mov    (&DWP(12,$key),$s3);
-
-               &mov    ($acc,$_inp);           # load inp
-               &mov    ($s2,$_len);            # load len
-               &lea    ($acc,&DWP(16,$acc));   # advance inp
-               &mov    ($_inp,$acc);           # save inp
-               &sub    ($s2,16);               # decrease len
-               &mov    ($_len,$s2);            # save len
-       &jnz    (&label("fast_dec_in_place_loop"));
-
-    &set_label("fast_dec_out",4);
-       &cmp    ($mark,0);              # was the key schedule copied?
-       &mov    ("edi",$_key);
-       &je     (&label("skip_dzero"));
-       # zero copy of key schedule
-       &mov    ("ecx",240/4);
-       &xor    ("eax","eax");
-       &align  (4);
-       &data_word(0xABF3F689);         # rep stosd
-       &set_label("skip_dzero");
-       &mov    ("esp",$_esp);
-       &popf   ();
-       &function_end_A();
-       &pushf  ();                     # kludge, never executed
-
-#--------------------------- SLOW ROUTINE ---------------------------#
-&set_label("slow_way",16);
-
-       &mov    ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
-       &mov    ($key,&wparam(3));      # load key
-
-       # pre-allocate aligned stack frame...
-       &lea    ($acc,&DWP(-80,"esp"));
-       &and    ($acc,-64);
-
-       # ... and make sure it doesn't alias with $key modulo 1024
-       &lea    ($s1,&DWP(-80-63,$key));
-       &sub    ($s1,$acc);
-       &neg    ($s1);
-       &and    ($s1,0x3C0);    # modulo 1024, but aligned to cache-line
-       &sub    ($acc,$s1);
-
-       # pick S-box copy which can't overlap with stack frame or $key
-       &lea    ($s1,&DWP(768,$acc));
-       &sub    ($s1,$tbl);
-       &and    ($s1,0x300);
-       &lea    ($tbl,&DWP(2048+128,$tbl,$s1));
-
-       &lea    ($s3,&wparam(0));       # pointer to parameter block
-
-       &exch   ("esp",$acc);
-       &add    ("esp",4);              # reserve for return address!
-       &mov    ($_tbl,$tbl);           # save %ebp
-       &mov    ($_esp,$acc);           # save %esp
-       &mov    ($_tmp,$s0);            # save OPENSSL_ia32cap
-
-       &mov    ($s0,&DWP(0,$s3));      # load inp
-       &mov    ($s1,&DWP(4,$s3));      # load out
-       #&mov   ($s2,&DWP(8,$s3));      # load len
-       #&mov   ($key,&DWP(12,$s3));    # load key
-       &mov    ($acc,&DWP(16,$s3));    # load ivp
-       &mov    ($s3,&DWP(20,$s3));     # load enc flag
-
-       &mov    ($_inp,$s0);            # save copy of inp
-       &mov    ($_out,$s1);            # save copy of out
-       &mov    ($_len,$s2);            # save copy of len
-       &mov    ($_key,$key);           # save copy of key
-       &mov    ($_ivp,$acc);           # save copy of ivp
-
-       &mov    ($key,$acc);            # $key = ivp
-       &mov    ($acc,$s0);             # $acc = inp
-
-       &cmp    ($s3,0);                # enc flag
-       &je     (&label("slow_decrypt"));
-
-#--------------------------- SLOW ENCRYPT ---------------------------#
-       &cmp    ($s2,16);
-       &mov    ($s3,$s1);              # $s3 = out
-       &jb     (&label("slow_enc_tail"));      # fewer than 16 bytes in total
-
-                                       if (!$x86only) {
-       &bt     ($_tmp,25);             # check for SSE bit
-       &jnc    (&label("slow_enc_x86"));
-
-       &movq   ("mm0",&QWP(0,$key));   # load iv
-       &movq   ("mm4",&QWP(8,$key));
-
-       &set_label("slow_enc_loop_sse",16);
-               &pxor   ("mm0",&QWP(0,$acc));   # xor input data
-               &pxor   ("mm4",&QWP(8,$acc));
-
-               &mov    ($key,$_key);
-               &call   ("_sse_AES_encrypt_compact");
-
-               &mov    ($acc,$_inp);           # load inp
-               &mov    ($key,$_out);           # load out
-               &mov    ($s2,$_len);            # load len
-
-               &movq   (&QWP(0,$key),"mm0");   # save output data
-               &movq   (&QWP(8,$key),"mm4");
-
-               &lea    ($acc,&DWP(16,$acc));   # advance inp
-               &mov    ($_inp,$acc);           # save inp
-               &lea    ($s3,&DWP(16,$key));    # advance out
-               &mov    ($_out,$s3);            # save out
-               &sub    ($s2,16);               # decrease len
-               &cmp    ($s2,16);
-               &mov    ($_len,$s2);            # save len
-       &jae    (&label("slow_enc_loop_sse"));  # at least one more full block
-       &test   ($s2,15);
-       &jnz    (&label("slow_enc_tail"));      # 1..15 bytes remain
-       &mov    ($acc,$_ivp);           # load ivp
-       &movq   (&QWP(0,$acc),"mm0");   # save ivec
-       &movq   (&QWP(8,$acc),"mm4");
-       &emms   ();
-       &mov    ("esp",$_esp);
-       &popf   ();
-       &function_end_A();
-       &pushf  ();                     # kludge, never executed
-                                       }
-    &set_label("slow_enc_x86",16);
-       &mov    ($s0,&DWP(0,$key));     # load iv
-       &mov    ($s1,&DWP(4,$key));
-
-       &set_label("slow_enc_loop_x86",4);
-               &mov    ($s2,&DWP(8,$key));
-               &mov    ($s3,&DWP(12,$key));
-
-               &xor    ($s0,&DWP(0,$acc));     # xor input data
-               &xor    ($s1,&DWP(4,$acc));
-               &xor    ($s2,&DWP(8,$acc));
-               &xor    ($s3,&DWP(12,$acc));
-
-               &mov    ($key,$_key);           # load key
-               &call   ("_x86_AES_encrypt_compact");
-
-               &mov    ($acc,$_inp);           # load inp
-               &mov    ($key,$_out);           # load out
-
-               &mov    (&DWP(0,$key),$s0);     # save output data
-               &mov    (&DWP(4,$key),$s1);
-               &mov    (&DWP(8,$key),$s2);
-               &mov    (&DWP(12,$key),$s3);
-
-               &mov    ($s2,$_len);            # load len
-               &lea    ($acc,&DWP(16,$acc));   # advance inp
-               &mov    ($_inp,$acc);           # save inp
-               &lea    ($s3,&DWP(16,$key));    # advance out
-               &mov    ($_out,$s3);            # save out
-               &sub    ($s2,16);               # decrease len
-               &cmp    ($s2,16);
-               &mov    ($_len,$s2);            # save len
-       &jae    (&label("slow_enc_loop_x86"));  # at least one more full block
-       &test   ($s2,15);
-       &jnz    (&label("slow_enc_tail"));      # 1..15 bytes remain
-       &mov    ($acc,$_ivp);           # load ivp
-       &mov    ($s2,&DWP(8,$key));     # restore last dwords
-       &mov    ($s3,&DWP(12,$key));
-       &mov    (&DWP(0,$acc),$s0);     # save ivec
-       &mov    (&DWP(4,$acc),$s1);
-       &mov    (&DWP(8,$acc),$s2);
-       &mov    (&DWP(12,$acc),$s3);
-
-       &mov    ("esp",$_esp);
-       &popf   ();
-       &function_end_A();
-       &pushf  ();                     # kludge, never executed
-
-    &set_label("slow_enc_tail",16);
-       &emms   ()      if (!$x86only);
-       &mov    ($key eq "edi"? $key:"",$s3);   # load out to edi
-       &mov    ($s1,16);
-       &sub    ($s1,$s2);                      # $s1 = 16-len, the number of bytes to zero
-       &cmp    ($key,$acc eq "esi"? $acc:"");  # compare with inp
-       &je     (&label("enc_in_place"));
-       &align  (4);
-       &data_word(0xA4F3F689); # rep movsb     # copy input
-       &jmp    (&label("enc_skip_in_place"));
-    &set_label("enc_in_place");
-       &lea    ($key,&DWP(0,$key,$s2));        # in place: just step past the len bytes
-    &set_label("enc_skip_in_place");
-       &mov    ($s2,$s1);
-       &xor    ($s0,$s0);
-       &align  (4);
-       &data_word(0xAAF3F689); # rep stosb     # zero tail
-
-       &mov    ($key,$_ivp);                   # restore ivp
-       &mov    ($acc,$s3);                     # output as input
-       &mov    ($s0,&DWP(0,$key));
-       &mov    ($s1,&DWP(4,$key));
-       &mov    ($_len,16);                     # len=16
-       &jmp    (&label("slow_enc_loop_x86"));  # one more spin...
-
-#--------------------------- SLOW DECRYPT ---------------------------#
-&set_label("slow_decrypt",16);
-                                       if (!$x86only) {
-       &bt     ($_tmp,25);             # check for SSE bit
-       &jnc    (&label("slow_dec_loop_x86"));
-
-       &set_label("slow_dec_loop_sse",4);
-               &movq   ("mm0",&QWP(0,$acc));   # read input
-               &movq   ("mm4",&QWP(8,$acc));
-
-               &mov    ($key,$_key);
-               &call   ("_sse_AES_decrypt_compact");
-
-               &mov    ($acc,$_inp);           # load inp
-               &lea    ($s0,$ivec);
-               &mov    ($s1,$_out);            # load out
-               &mov    ($s2,$_len);            # load len
-               &mov    ($key,$_ivp);           # load ivp
-
-               &movq   ("mm1",&QWP(0,$acc));   # re-read input
-               &movq   ("mm5",&QWP(8,$acc));
-
-               &pxor   ("mm0",&QWP(0,$key));   # xor iv
-               &pxor   ("mm4",&QWP(8,$key));
-
-               &movq   (&QWP(0,$key),"mm1");   # copy input to iv
-               &movq   (&QWP(8,$key),"mm5");
-
-               &sub    ($s2,16);               # decrease len
-               &jc     (&label("slow_dec_partial_sse"));       # fewer than 16 bytes were left
-
-               &movq   (&QWP(0,$s1),"mm0");    # write output
-               &movq   (&QWP(8,$s1),"mm4");
-
-               &lea    ($s1,&DWP(16,$s1));     # advance out
-               &mov    ($_out,$s1);            # save out
-               &lea    ($acc,&DWP(16,$acc));   # advance inp
-               &mov    ($_inp,$acc);           # save inp
-               &mov    ($_len,$s2);            # save len
-       &jnz    (&label("slow_dec_loop_sse"));  # flags still come from the sub above
-       &emms   ();
-       &mov    ("esp",$_esp);
-       &popf   ();
-       &function_end_A();
-       &pushf  ();                     # kludge, never executed
-
-    &set_label("slow_dec_partial_sse",16);
-       &movq   (&QWP(0,$s0),"mm0");    # save output to temp
-       &movq   (&QWP(8,$s0),"mm4");
-       &emms   ();
-
-       &add    ($s2 eq "ecx" ? "ecx":"",16);   # undo the sub: ecx = bytes remaining (< 16)
-       &mov    ("edi",$s1);            # out
-       &mov    ("esi",$s0);            # temp
-       &align  (4);
-       &data_word(0xA4F3F689);         # rep movsb # copy partial output
-
-       &mov    ("esp",$_esp);
-       &popf   ();
-       &function_end_A();
-       &pushf  ();                     # kludge, never executed
-                                       }
-       &set_label("slow_dec_loop_x86",16);
-               &mov    ($s0,&DWP(0,$acc));     # read input
-               &mov    ($s1,&DWP(4,$acc));
-               &mov    ($s2,&DWP(8,$acc));
-               &mov    ($s3,&DWP(12,$acc));
-
-               &lea    ($key,$ivec);
-               &mov    (&DWP(0,$key),$s0);     # copy to temp
-               &mov    (&DWP(4,$key),$s1);
-               &mov    (&DWP(8,$key),$s2);
-               &mov    (&DWP(12,$key),$s3);
-
-               &mov    ($key,$_key);           # load key
-               &call   ("_x86_AES_decrypt_compact");
-
-               &mov    ($key,$_ivp);           # load ivp
-               &mov    ($acc,$_len);           # load len
-               &xor    ($s0,&DWP(0,$key));     # xor iv
-               &xor    ($s1,&DWP(4,$key));
-               &xor    ($s2,&DWP(8,$key));
-               &xor    ($s3,&DWP(12,$key));
-
-               &sub    ($acc,16);
-               &jc     (&label("slow_dec_partial_x86"));       # fewer than 16 bytes were left
-
-               &mov    ($_len,$acc);           # save len
-               &mov    ($acc,$_out);           # load out
-
-               &mov    (&DWP(0,$acc),$s0);     # write output
-               &mov    (&DWP(4,$acc),$s1);
-               &mov    (&DWP(8,$acc),$s2);
-               &mov    (&DWP(12,$acc),$s3);
-
-               &lea    ($acc,&DWP(16,$acc));   # advance out
-               &mov    ($_out,$acc);           # save out
-
-               &lea    ($acc,$ivec);
-               &mov    ($s0,&DWP(0,$acc));     # read temp
-               &mov    ($s1,&DWP(4,$acc));
-               &mov    ($s2,&DWP(8,$acc));
-               &mov    ($s3,&DWP(12,$acc));
-
-               &mov    (&DWP(0,$key),$s0);     # copy it to iv
-               &mov    (&DWP(4,$key),$s1);
-               &mov    (&DWP(8,$key),$s2);
-               &mov    (&DWP(12,$key),$s3);
-
-               &mov    ($acc,$_inp);           # load inp
-               &lea    ($acc,&DWP(16,$acc));   # advance inp
-               &mov    ($_inp,$acc);           # save inp
-       &jnz    (&label("slow_dec_loop_x86"));  # flags still come from the sub of 16 above
-       &mov    ("esp",$_esp);
-       &popf   ();
-       &function_end_A();
-       &pushf  ();                     # kludge, never executed
-
-    &set_label("slow_dec_partial_x86",16);
-       &lea    ($acc,$ivec);
-       &mov    (&DWP(0,$acc),$s0);     # save output to temp
-       &mov    (&DWP(4,$acc),$s1);
-       &mov    (&DWP(8,$acc),$s2);
-       &mov    (&DWP(12,$acc),$s3);
-
-       &mov    ($acc,$_inp);
-       &mov    ($s0,&DWP(0,$acc));     # re-read input
-       &mov    ($s1,&DWP(4,$acc));
-       &mov    ($s2,&DWP(8,$acc));
-       &mov    ($s3,&DWP(12,$acc));
-
-       &mov    (&DWP(0,$key),$s0);     # copy it to iv
-       &mov    (&DWP(4,$key),$s1);
-       &mov    (&DWP(8,$key),$s2);
-       &mov    (&DWP(12,$key),$s3);
-
-       &mov    ("ecx",$_len);          # bytes remaining (< 16); $_len was not updated on this path
-       &mov    ("edi",$_out);
-       &lea    ("esi",$ivec);
-       &align  (4);
-       &data_word(0xA4F3F689);         # rep movsb # copy partial output
-
-       &mov    ("esp",$_esp);
-       &popf   ();
-&function_end("AES_cbc_encrypt");
-}
-
-#------------------------------------------------------------------#
-
-sub enckey()   # emit: eax ^= byte-table lookups of the 4 bytes of edx, then ^= dword at 1024-128($tbl,ecx*4); clobbers ebx, edx, esi
-{
-       &movz   ("esi",&LB("edx"));             # rk[i]>>0
-       &movz   ("ebx",&BP(-128,$tbl,"esi",1)); # byte lookup at -128($tbl,esi)
-       &movz   ("esi",&HB("edx"));             # rk[i]>>8
-       &shl    ("ebx",24);                     # looked-up byte 0 goes to bits 24..31
-       &xor    ("eax","ebx");
-
-       &movz   ("ebx",&BP(-128,$tbl,"esi",1)); # looked-up byte 1 stays in bits 0..7
-       &shr    ("edx",16);                     # bring the upper half of edx into dl/dh
-       &movz   ("esi",&LB("edx"));             # rk[i]>>16
-       &xor    ("eax","ebx");
-
-       &movz   ("ebx",&BP(-128,$tbl,"esi",1));
-       &movz   ("esi",&HB("edx"));             # rk[i]>>24
-       &shl    ("ebx",8);                      # looked-up byte 2 goes to bits 8..15
-       &xor    ("eax","ebx");
-
-       &movz   ("ebx",&BP(-128,$tbl,"esi",1));
-       &shl    ("ebx",16);                     # looked-up byte 3 goes to bits 16..23
-       &xor    ("eax","ebx");
-
-       &xor    ("eax",&DWP(1024-128,$tbl,"ecx",4));    # rcon
-}
-
-&function_begin("_x86_AES_set_encrypt_key");
-       &mov    ("esi",&wparam(1));             # user supplied key
-       &mov    ("edi",&wparam(3));             # private key schedule
-
-       &test   ("esi",-1);
-       &jz     (&label("badpointer"));
-       &test   ("edi",-1);
-       &jz     (&label("badpointer"));
-
-       &call   (&label("pic_point"));
-       &set_label("pic_point");
-       &blindpop($tbl);
-       &lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
-       &lea    ($tbl,&DWP(2048+128,$tbl));
-
-       # prefetch Te4
-       &mov    ("eax",&DWP(0-128,$tbl));
-       &mov    ("ebx",&DWP(32-128,$tbl));
-       &mov    ("ecx",&DWP(64-128,$tbl));
-       &mov    ("edx",&DWP(96-128,$tbl));
-       &mov    ("eax",&DWP(128-128,$tbl));
-       &mov    ("ebx",&DWP(160-128,$tbl));
-       &mov    ("ecx",&DWP(192-128,$tbl));
-       &mov    ("edx",&DWP(224-128,$tbl));
-
-       &mov    ("ecx",&wparam(2));             # number of bits in key
-       &cmp    ("ecx",128);
-       &je     (&label("10rounds"));
-       &cmp    ("ecx",192);
-       &je     (&label("12rounds"));
-       &cmp    ("ecx",256);
-       &je     (&label("14rounds"));
-       &mov    ("eax",-2);                     # invalid number of bits
-       &jmp    (&label("exit"));
-
-    &set_label("10rounds");
-       &mov    ("eax",&DWP(0,"esi"));          # copy first 4 dwords
-       &mov    ("ebx",&DWP(4,"esi"));
-       &mov    ("ecx",&DWP(8,"esi"));
-       &mov    ("edx",&DWP(12,"esi"));
-       &mov    (&DWP(0,"edi"),"eax");
-       &mov    (&DWP(4,"edi"),"ebx");
-       &mov    (&DWP(8,"edi"),"ecx");
-       &mov    (&DWP(12,"edi"),"edx");
-
-       &xor    ("ecx","ecx");
-       &jmp    (&label("10shortcut"));
-
-       &align  (4);
-       &set_label("10loop");
-               &mov    ("eax",&DWP(0,"edi"));          # rk[0]
-               &mov    ("edx",&DWP(12,"edi"));         # rk[3]
-       &set_label("10shortcut");
-               &enckey ();
-
-               &mov    (&DWP(16,"edi"),"eax");         # rk[4]
-               &xor    ("eax",&DWP(4,"edi"));
-               &mov    (&DWP(20,"edi"),"eax");         # rk[5]
-               &xor    ("eax",&DWP(8,"edi"));
-               &mov    (&DWP(24,"edi"),"eax");         # rk[6]
-               &xor    ("eax",&DWP(12,"edi"));
-               &mov    (&DWP(28,"edi"),"eax");         # rk[7]
-               &inc    ("ecx");
-               &add    ("edi",16);
-               &cmp    ("ecx",10);
-       &jl     (&label("10loop"));
-
-       &mov    (&DWP(80,"edi"),10);            # setup number of rounds
-       &xor    ("eax","eax");
-       &jmp    (&label("exit"));
-
-    &set_label("12rounds");
-       &mov    ("eax",&DWP(0,"esi"));          # copy first 6 dwords
-       &mov    ("ebx",&DWP(4,"esi"));
-       &mov    ("ecx",&DWP(8,"esi"));
-       &mov    ("edx",&DWP(12,"esi"));
-       &mov    (&DWP(0,"edi"),"eax");
-       &mov    (&DWP(4,"edi"),"ebx");
-       &mov    (&DWP(8,"edi"),"ecx");
-       &mov    (&DWP(12,"edi"),"edx");
-       &mov    ("ecx",&DWP(16,"esi"));
-       &mov    ("edx",&DWP(20,"esi"));
-       &mov    (&DWP(16,"edi"),"ecx");
-       &mov    (&DWP(20,"edi"),"edx");
-
-       &xor    ("ecx","ecx");
-       &jmp    (&label("12shortcut"));
-
-       &align  (4);
-       &set_label("12loop");
-               &mov    ("eax",&DWP(0,"edi"));          # rk[0]
-               &mov    ("edx",&DWP(20,"edi"));         # rk[5]
-       &set_label("12shortcut");
-               &enckey ();
-
-               &mov    (&DWP(24,"edi"),"eax");         # rk[6]
-               &xor    ("eax",&DWP(4,"edi"));
-               &mov    (&DWP(28,"edi"),"eax");         # rk[7]
-               &xor    ("eax",&DWP(8,"edi"));
-               &mov    (&DWP(32,"edi"),"eax");         # rk[8]
-               &xor    ("eax",&DWP(12,"edi"));
-               &mov    (&DWP(36,"edi"),"eax");         # rk[9]
-
-               &cmp    ("ecx",7);
-               &je     (&label("12break"));
-               &inc    ("ecx");
-
-               &xor    ("eax",&DWP(16,"edi"));
-               &mov    (&DWP(40,"edi"),"eax");         # rk[10]
-               &xor    ("eax",&DWP(20,"edi"));
-               &mov    (&DWP(44,"edi"),"eax");         # rk[11]
-
-               &add    ("edi",24);
-       &jmp    (&label("12loop"));
-
-       &set_label("12break");
-       &mov    (&DWP(72,"edi"),12);            # setup number of rounds
-       &xor    ("eax","eax");
-       &jmp    (&label("exit"));
-
-    &set_label("14rounds");
-       &mov    ("eax",&DWP(0,"esi"));          # copy first 8 dwords
-       &mov    ("ebx",&DWP(4,"esi"));
-       &mov    ("ecx",&DWP(8,"esi"));
-       &mov    ("edx",&DWP(12,"esi"));
-       &mov    (&DWP(0,"edi"),"eax");
-       &mov    (&DWP(4,"edi"),"ebx");
-       &mov    (&DWP(8,"edi"),"ecx");
-       &mov    (&DWP(12,"edi"),"edx");
-       &mov    ("eax",&DWP(16,"esi"));
-       &mov    ("ebx",&DWP(20,"esi"));
-       &mov    ("ecx",&DWP(24,"esi"));
-       &mov    ("edx",&DWP(28,"esi"));
-       &mov    (&DWP(16,"edi"),"eax");
-       &mov    (&DWP(20,"edi"),"ebx");
-       &mov    (&DWP(24,"edi"),"ecx");
-       &mov    (&DWP(28,"edi"),"edx");
-
-       &xor    ("ecx","ecx");
-       &jmp    (&label("14shortcut"));
-
-       &align  (4);
-       &set_label("14loop");
-               &mov    ("edx",&DWP(28,"edi"));         # rk[7]
-       &set_label("14shortcut");
-               &mov    ("eax",&DWP(0,"edi"));          # rk[0]
-
-               &enckey ();
-
-               &mov    (&DWP(32,"edi"),"eax");         # rk[8]
-               &xor    ("eax",&DWP(4,"edi"));
-               &mov    (&DWP(36,"edi"),"eax");         # rk[9]
-               &xor    ("eax",&DWP(8,"edi"));
-               &mov    (&DWP(40,"edi"),"eax");         # rk[10]
-               &xor    ("eax",&DWP(12,"edi"));
-               &mov    (&DWP(44,"edi"),"eax");         # rk[11]
-
-               &cmp    ("ecx",6);
-               &je     (&label("14break"));
-               &inc    ("ecx");
-
-               &mov    ("edx","eax");
-               &mov    ("eax",&DWP(16,"edi"));         # rk[4]
-               &movz   ("esi",&LB("edx"));             # rk[11]>>0
-               &movz   ("ebx",&BP(-128,$tbl,"esi",1));
-               &movz   ("esi",&HB("edx"));             # rk[11]>>8
-               &xor    ("eax","ebx");
-
-               &movz   ("ebx",&BP(-128,$tbl,"esi",1));
-               &shr    ("edx",16);
-               &shl    ("ebx",8);
-               &movz   ("esi",&LB("edx"));             # rk[11]>>16
-               &xor    ("eax","ebx");
-
-               &movz   ("ebx",&BP(-128,$tbl,"esi",1));
-               &movz   ("esi",&HB("edx"));             # rk[11]>>24
-               &shl    ("ebx",16);
-               &xor    ("eax","ebx");
-
-               &movz   ("ebx",&BP(-128,$tbl,"esi",1));
-               &shl    ("ebx",24);
-               &xor    ("eax","ebx");
-
-               &mov    (&DWP(48,"edi"),"eax");         # rk[12]
-               &xor    ("eax",&DWP(20,"edi"));
-               &mov    (&DWP(52,"edi"),"eax");         # rk[13]
-               &xor    ("eax",&DWP(24,"edi"));
-               &mov    (&DWP(56,"edi"),"eax");         # rk[14]
-               &xor    ("eax",&DWP(28,"edi"));
-               &mov    (&DWP(60,"edi"),"eax");         # rk[15]
-
-               &add    ("edi",32);
-       &jmp    (&label("14loop"));
-
-       &set_label("14break");
-       &mov    (&DWP(48,"edi"),14);            # setup number of rounds
-       &xor    ("eax","eax");
-       &jmp    (&label("exit"));
-
-    &set_label("badpointer");
-       &mov    ("eax",-1);
-    &set_label("exit");
-&function_end("_x86_AES_set_encrypt_key");
-
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-#                        AES_KEY *key)
-&function_begin_B("AES_set_encrypt_key");
-       &call   ("_x86_AES_set_encrypt_key");
-       &ret    ();
-&function_end_B("AES_set_encrypt_key");
-
-sub deckey()
-{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
-  my $tmp = $tbl;
-
-       &mov    ($tmp,0x80808080);
-       &and    ($tmp,$tp1);
-       &lea    ($tp2,&DWP(0,$tp1,$tp1));
-       &mov    ($acc,$tmp);
-       &shr    ($tmp,7);
-       &sub    ($acc,$tmp);
-       &and    ($tp2,0xfefefefe);
-       &and    ($acc,0x1b1b1b1b);
-       &xor    ($tp2,$acc);
-       &mov    ($tmp,0x80808080);
-
-       &and    ($tmp,$tp2);
-       &lea    ($tp4,&DWP(0,$tp2,$tp2));
-       &mov    ($acc,$tmp);
-       &shr    ($tmp,7);
-       &sub    ($acc,$tmp);
-       &and    ($tp4,0xfefefefe);
-       &and    ($acc,0x1b1b1b1b);
-        &xor   ($tp2,$tp1);    # tp2^tp1
-       &xor    ($tp4,$acc);
-       &mov    ($tmp,0x80808080);
-
-       &and    ($tmp,$tp4);
-       &lea    ($tp8,&DWP(0,$tp4,$tp4));
-       &mov    ($acc,$tmp);
-       &shr    ($tmp,7);
-        &xor   ($tp4,$tp1);    # tp4^tp1
-       &sub    ($acc,$tmp);
-       &and    ($tp8,0xfefefefe);
-       &and    ($acc,0x1b1b1b1b);
-        &rotl  ($tp1,8);       # = ROTATE(tp1,8)
-       &xor    ($tp8,$acc);
-
-       &mov    ($tmp,&DWP(4*($i+1),$key));     # modulo-scheduled load
-
-       &xor    ($tp1,$tp2);
-       &xor    ($tp2,$tp8);
-       &xor    ($tp1,$tp4);
-       &rotl   ($tp2,24);
-       &xor    ($tp4,$tp8);
-       &xor    ($tp1,$tp8);    # ^= tp8^(tp4^tp1)^(tp2^tp1)
-       &rotl   ($tp4,16);
-       &xor    ($tp1,$tp2);    # ^= ROTATE(tp8^tp2^tp1,24)
-       &rotl   ($tp8,8);
-       &xor    ($tp1,$tp4);    # ^= ROTATE(tp8^tp4^tp1,16)
-       &mov    ($tp2,$tmp);
-       &xor    ($tp1,$tp8);    # ^= ROTATE(tp8,8)
-
-       &mov    (&DWP(4*$i,$key),$tp1);
-}
-
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-#                        AES_KEY *key)
-&function_begin_B("AES_set_decrypt_key");
-       &call   ("_x86_AES_set_encrypt_key");
-       &cmp    ("eax",0);
-       &je     (&label("proceed"));
-       &ret    ();
-
-    &set_label("proceed");
-       &push   ("ebp");
-       &push   ("ebx");
-       &push   ("esi");
-       &push   ("edi");
-
-       &mov    ("esi",&wparam(2));
-       &mov    ("ecx",&DWP(240,"esi"));        # pull number of rounds
-       &lea    ("ecx",&DWP(0,"","ecx",4));
-       &lea    ("edi",&DWP(0,"esi","ecx",4));  # pointer to last chunk
-
-       &set_label("invert",4);                 # invert order of chunks
-               &mov    ("eax",&DWP(0,"esi"));
-               &mov    ("ebx",&DWP(4,"esi"));
-               &mov    ("ecx",&DWP(0,"edi"));
-               &mov    ("edx",&DWP(4,"edi"));
-               &mov    (&DWP(0,"edi"),"eax");
-               &mov    (&DWP(4,"edi"),"ebx");
-               &mov    (&DWP(0,"esi"),"ecx");
-               &mov    (&DWP(4,"esi"),"edx");
-               &mov    ("eax",&DWP(8,"esi"));
-               &mov    ("ebx",&DWP(12,"esi"));
-               &mov    ("ecx",&DWP(8,"edi"));
-               &mov    ("edx",&DWP(12,"edi"));
-               &mov    (&DWP(8,"edi"),"eax");
-               &mov    (&DWP(12,"edi"),"ebx");
-               &mov    (&DWP(8,"esi"),"ecx");
-               &mov    (&DWP(12,"esi"),"edx");
-               &add    ("esi",16);
-               &sub    ("edi",16);
-               &cmp    ("esi","edi");
-       &jne    (&label("invert"));
-
-       &mov    ($key,&wparam(2));
-       &mov    ($acc,&DWP(240,$key));          # pull number of rounds
-       &lea    ($acc,&DWP(-2,$acc,$acc));
-       &lea    ($acc,&DWP(0,$key,$acc,8));
-       &mov    (&wparam(2),$acc);
-
-       &mov    ($s0,&DWP(16,$key));            # modulo-scheduled load
-       &set_label("permute",4);                # permute the key schedule
-               &add    ($key,16);
-               &deckey (0,$key,$s0,$s1,$s2,$s3);
-               &deckey (1,$key,$s1,$s2,$s3,$s0);
-               &deckey (2,$key,$s2,$s3,$s0,$s1);
-               &deckey (3,$key,$s3,$s0,$s1,$s2);
-               &cmp    ($key,&wparam(2));
-       &jb     (&label("permute"));
-
-       &xor    ("eax","eax");                  # return success
-&function_end("AES_set_decrypt_key");
-&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
-
-&asm_finish();
-
-close STDOUT;
diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
deleted file mode 100755 (executable)
index d87e201..0000000
+++ /dev/null
@@ -1,2916 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2005-2019 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Version 2.1.
-#
-# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
-# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
-# [you'll notice a lot of resemblance], such as compressed S-boxes
-# in little-endian byte order, prefetch of these tables in CBC mode,
-# as well as avoiding L1 cache aliasing between stack frame and key
-# schedule and already mentioned tables, compressed Td4...
-#
-# Performance in number of cycles per processed byte for 128-bit key:
-#
-#              ECB encrypt     ECB decrypt     CBC large chunk
-# AMD64                33              43              13.0
-# EM64T                38              56              18.6(*)
-# Core 2       30              42              14.5(*)
-# Atom         65              86              32.1(*)
-#
-# (*) with hyper-threading off
-
-$flavour = shift;
-$output  = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
-*STDOUT=*OUT;
-
-$verticalspin=1;       # unlike 32-bit version $verticalspin performs
-                       # ~15% better on both AMD and Intel cores
-$speed_limit=512;      # see aes-586.pl for details
-
-$code=".text\n";
-
-$s0="%eax";
-$s1="%ebx";
-$s2="%ecx";
-$s3="%edx";
-$acc0="%esi";  $mask80="%rsi";
-$acc1="%edi";  $maskfe="%rdi";
-$acc2="%ebp";  $mask1b="%rbp";
-$inp="%r8";
-$out="%r9";
-$t0="%r10d";
-$t1="%r11d";
-$t2="%r12d";
-$rnds="%r13d";
-$sbox="%r14";
-$key="%r15";
-
-sub hi() { my $r=shift;        $r =~ s/%[er]([a-d])x/%\1h/;    $r; }
-sub lo() { my $r=shift;        $r =~ s/%[er]([a-d])x/%\1l/;
-                       $r =~ s/%[er]([sd]i)/%\1l/;
-                       $r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }
-sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
-                       $r =~ s/%r([0-9]+)/%r\1d/;      $r; }
-sub _data_word()
-{ my $i;
-    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
-}
-sub data_word()
-{ my $i;
-  my $last=pop(@_);
-    $code.=".long\t";
-    while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
-    $code.=sprintf"0x%08x\n",$last;
-}
-
-sub data_byte()
-{ my $i;
-  my $last=pop(@_);
-    $code.=".byte\t";
-    while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
-    $code.=sprintf"0x%02x\n",$last&0xff;
-}
-
-sub encvert()
-{ my $t3="%r8d";       # zaps $inp!
-
-$code.=<<___;
-       # favor 3-way issue Opteron pipeline...
-       movzb   `&lo("$s0")`,$acc0
-       movzb   `&lo("$s1")`,$acc1
-       movzb   `&lo("$s2")`,$acc2
-       mov     0($sbox,$acc0,8),$t0
-       mov     0($sbox,$acc1,8),$t1
-       mov     0($sbox,$acc2,8),$t2
-
-       movzb   `&hi("$s1")`,$acc0
-       movzb   `&hi("$s2")`,$acc1
-       movzb   `&lo("$s3")`,$acc2
-       xor     3($sbox,$acc0,8),$t0
-       xor     3($sbox,$acc1,8),$t1
-       mov     0($sbox,$acc2,8),$t3
-
-       movzb   `&hi("$s3")`,$acc0
-       shr     \$16,$s2
-       movzb   `&hi("$s0")`,$acc2
-       xor     3($sbox,$acc0,8),$t2
-       shr     \$16,$s3
-       xor     3($sbox,$acc2,8),$t3
-
-       shr     \$16,$s1
-       lea     16($key),$key
-       shr     \$16,$s0
-
-       movzb   `&lo("$s2")`,$acc0
-       movzb   `&lo("$s3")`,$acc1
-       movzb   `&lo("$s0")`,$acc2
-       xor     2($sbox,$acc0,8),$t0
-       xor     2($sbox,$acc1,8),$t1
-       xor     2($sbox,$acc2,8),$t2
-
-       movzb   `&hi("$s3")`,$acc0
-       movzb   `&hi("$s0")`,$acc1
-       movzb   `&lo("$s1")`,$acc2
-       xor     1($sbox,$acc0,8),$t0
-       xor     1($sbox,$acc1,8),$t1
-       xor     2($sbox,$acc2,8),$t3
-
-       mov     12($key),$s3
-       movzb   `&hi("$s1")`,$acc1
-       movzb   `&hi("$s2")`,$acc2
-       mov     0($key),$s0
-       xor     1($sbox,$acc1,8),$t2
-       xor     1($sbox,$acc2,8),$t3
-
-       mov     4($key),$s1
-       mov     8($key),$s2
-       xor     $t0,$s0
-       xor     $t1,$s1
-       xor     $t2,$s2
-       xor     $t3,$s3
-___
-}
-
-sub enclastvert()
-{ my $t3="%r8d";       # zaps $inp!
-
-$code.=<<___;
-       movzb   `&lo("$s0")`,$acc0
-       movzb   `&lo("$s1")`,$acc1
-       movzb   `&lo("$s2")`,$acc2
-       movzb   2($sbox,$acc0,8),$t0
-       movzb   2($sbox,$acc1,8),$t1
-       movzb   2($sbox,$acc2,8),$t2
-
-       movzb   `&lo("$s3")`,$acc0
-       movzb   `&hi("$s1")`,$acc1
-       movzb   `&hi("$s2")`,$acc2
-       movzb   2($sbox,$acc0,8),$t3
-       mov     0($sbox,$acc1,8),$acc1  #$t0
-       mov     0($sbox,$acc2,8),$acc2  #$t1
-
-       and     \$0x0000ff00,$acc1
-       and     \$0x0000ff00,$acc2
-
-       xor     $acc1,$t0
-       xor     $acc2,$t1
-       shr     \$16,$s2
-
-       movzb   `&hi("$s3")`,$acc0
-       movzb   `&hi("$s0")`,$acc1
-       shr     \$16,$s3
-       mov     0($sbox,$acc0,8),$acc0  #$t2
-       mov     0($sbox,$acc1,8),$acc1  #$t3
-
-       and     \$0x0000ff00,$acc0
-       and     \$0x0000ff00,$acc1
-       shr     \$16,$s1
-       xor     $acc0,$t2
-       xor     $acc1,$t3
-       shr     \$16,$s0
-
-       movzb   `&lo("$s2")`,$acc0
-       movzb   `&lo("$s3")`,$acc1
-       movzb   `&lo("$s0")`,$acc2
-       mov     0($sbox,$acc0,8),$acc0  #$t0
-       mov     0($sbox,$acc1,8),$acc1  #$t1
-       mov     0($sbox,$acc2,8),$acc2  #$t2
-
-       and     \$0x00ff0000,$acc0
-       and     \$0x00ff0000,$acc1
-       and     \$0x00ff0000,$acc2
-
-       xor     $acc0,$t0
-       xor     $acc1,$t1
-       xor     $acc2,$t2
-
-       movzb   `&lo("$s1")`,$acc0
-       movzb   `&hi("$s3")`,$acc1
-       movzb   `&hi("$s0")`,$acc2
-       mov     0($sbox,$acc0,8),$acc0  #$t3
-       mov     2($sbox,$acc1,8),$acc1  #$t0
-       mov     2($sbox,$acc2,8),$acc2  #$t1
-
-       and     \$0x00ff0000,$acc0
-       and     \$0xff000000,$acc1
-       and     \$0xff000000,$acc2
-
-       xor     $acc0,$t3
-       xor     $acc1,$t0
-       xor     $acc2,$t1
-
-       movzb   `&hi("$s1")`,$acc0
-       movzb   `&hi("$s2")`,$acc1
-       mov     16+12($key),$s3
-       mov     2($sbox,$acc0,8),$acc0  #$t2
-       mov     2($sbox,$acc1,8),$acc1  #$t3
-       mov     16+0($key),$s0
-
-       and     \$0xff000000,$acc0
-       and     \$0xff000000,$acc1
-
-       xor     $acc0,$t2
-       xor     $acc1,$t3
-
-       mov     16+4($key),$s1
-       mov     16+8($key),$s2
-       xor     $t0,$s0
-       xor     $t1,$s1
-       xor     $t2,$s2
-       xor     $t3,$s3
-___
-}
-
-sub encstep()
-{ my ($i,@s) = @_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-       if ($i==3) {
-               $tmp0=$s[1];
-               $tmp1=$s[2];
-               $tmp2=$s[3];
-       }
-       $code.="        movzb   ".&lo($s[0]).",$out\n";
-       $code.="        mov     $s[2],$tmp1\n"          if ($i!=3);
-       $code.="        lea     16($key),$key\n"        if ($i==0);
-
-       $code.="        movzb   ".&hi($s[1]).",$tmp0\n";
-       $code.="        mov     0($sbox,$out,8),$out\n";
-
-       $code.="        shr     \$16,$tmp1\n";
-       $code.="        mov     $s[3],$tmp2\n"          if ($i!=3);
-       $code.="        xor     3($sbox,$tmp0,8),$out\n";
-
-       $code.="        movzb   ".&lo($tmp1).",$tmp1\n";
-       $code.="        shr     \$24,$tmp2\n";
-       $code.="        xor     4*$i($key),$out\n";
-
-       $code.="        xor     2($sbox,$tmp1,8),$out\n";
-       $code.="        xor     1($sbox,$tmp2,8),$out\n";
-
-       $code.="        mov     $t0,$s[1]\n"            if ($i==3);
-       $code.="        mov     $t1,$s[2]\n"            if ($i==3);
-       $code.="        mov     $t2,$s[3]\n"            if ($i==3);
-       $code.="\n";
-}
-
-sub enclast()
-{ my ($i,@s)=@_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-       if ($i==3) {
-               $tmp0=$s[1];
-               $tmp1=$s[2];
-               $tmp2=$s[3];
-       }
-       $code.="        movzb   ".&lo($s[0]).",$out\n";
-       $code.="        mov     $s[2],$tmp1\n"          if ($i!=3);
-
-       $code.="        mov     2($sbox,$out,8),$out\n";
-       $code.="        shr     \$16,$tmp1\n";
-       $code.="        mov     $s[3],$tmp2\n"          if ($i!=3);
-
-       $code.="        and     \$0x000000ff,$out\n";
-       $code.="        movzb   ".&hi($s[1]).",$tmp0\n";
-       $code.="        movzb   ".&lo($tmp1).",$tmp1\n";
-       $code.="        shr     \$24,$tmp2\n";
-
-       $code.="        mov     0($sbox,$tmp0,8),$tmp0\n";
-       $code.="        mov     0($sbox,$tmp1,8),$tmp1\n";
-       $code.="        mov     2($sbox,$tmp2,8),$tmp2\n";
-
-       $code.="        and     \$0x0000ff00,$tmp0\n";
-       $code.="        and     \$0x00ff0000,$tmp1\n";
-       $code.="        and     \$0xff000000,$tmp2\n";
-
-       $code.="        xor     $tmp0,$out\n";
-       $code.="        mov     $t0,$s[1]\n"            if ($i==3);
-       $code.="        xor     $tmp1,$out\n";
-       $code.="        mov     $t1,$s[2]\n"            if ($i==3);
-       $code.="        xor     $tmp2,$out\n";
-       $code.="        mov     $t2,$s[3]\n"            if ($i==3);
-       $code.="\n";
-}
-
-$code.=<<___;
-.type  _x86_64_AES_encrypt,\@abi-omnipotent
-.align 16
-_x86_64_AES_encrypt:
-       xor     0($key),$s0                     # xor with key
-       xor     4($key),$s1
-       xor     8($key),$s2
-       xor     12($key),$s3
-
-       mov     240($key),$rnds                 # load key->rounds
-       sub     \$1,$rnds
-       jmp     .Lenc_loop
-.align 16
-.Lenc_loop:
-___
-       if ($verticalspin) { &encvert(); }
-       else {  &encstep(0,$s0,$s1,$s2,$s3);
-               &encstep(1,$s1,$s2,$s3,$s0);
-               &encstep(2,$s2,$s3,$s0,$s1);
-               &encstep(3,$s3,$s0,$s1,$s2);
-       }
-$code.=<<___;
-       sub     \$1,$rnds
-       jnz     .Lenc_loop
-___
-       if ($verticalspin) { &enclastvert(); }
-       else {  &enclast(0,$s0,$s1,$s2,$s3);
-               &enclast(1,$s1,$s2,$s3,$s0);
-               &enclast(2,$s2,$s3,$s0,$s1);
-               &enclast(3,$s3,$s0,$s1,$s2);
-               $code.=<<___;
-               xor     16+0($key),$s0          # xor with key
-               xor     16+4($key),$s1
-               xor     16+8($key),$s2
-               xor     16+12($key),$s3
-___
-       }
-$code.=<<___;
-       .byte   0xf3,0xc3                       # rep ret
-.size  _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
-___
-
-# it's possible to implement this by shifting tN by 8, filling least
-# significant byte with byte load and finally bswap-ing at the end,
-# but such partial register load kills Core 2...
-sub enccompactvert()
-{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-       movzb   `&lo("$s0")`,$t0
-       movzb   `&lo("$s1")`,$t1
-       movzb   `&lo("$s2")`,$t2
-       movzb   `&lo("$s3")`,$t3
-       movzb   `&hi("$s1")`,$acc0
-       movzb   `&hi("$s2")`,$acc1
-       shr     \$16,$s2
-       movzb   `&hi("$s3")`,$acc2
-       movzb   ($sbox,$t0,1),$t0
-       movzb   ($sbox,$t1,1),$t1
-       movzb   ($sbox,$t2,1),$t2
-       movzb   ($sbox,$t3,1),$t3
-
-       movzb   ($sbox,$acc0,1),$t4     #$t0
-       movzb   `&hi("$s0")`,$acc0
-       movzb   ($sbox,$acc1,1),$t5     #$t1
-       movzb   `&lo("$s2")`,$acc1
-       movzb   ($sbox,$acc2,1),$acc2   #$t2
-       movzb   ($sbox,$acc0,1),$acc0   #$t3
-
-       shl     \$8,$t4
-       shr     \$16,$s3
-       shl     \$8,$t5
-       xor     $t4,$t0
-       shr     \$16,$s0
-       movzb   `&lo("$s3")`,$t4
-       shr     \$16,$s1
-       xor     $t5,$t1
-       shl     \$8,$acc2
-       movzb   `&lo("$s0")`,$t5
-       movzb   ($sbox,$acc1,1),$acc1   #$t0
-       xor     $acc2,$t2
-
-       shl     \$8,$acc0
-       movzb   `&lo("$s1")`,$acc2
-       shl     \$16,$acc1
-       xor     $acc0,$t3
-       movzb   ($sbox,$t4,1),$t4       #$t1
-       movzb   `&hi("$s3")`,$acc0
-       movzb   ($sbox,$t5,1),$t5       #$t2
-       xor     $acc1,$t0
-
-       shr     \$8,$s2
-       movzb   `&hi("$s0")`,$acc1
-       shl     \$16,$t4
-       shr     \$8,$s1
-       shl     \$16,$t5
-       xor     $t4,$t1
-       movzb   ($sbox,$acc2,1),$acc2   #$t3
-       movzb   ($sbox,$acc0,1),$acc0   #$t0
-       movzb   ($sbox,$acc1,1),$acc1   #$t1
-       movzb   ($sbox,$s2,1),$s3       #$t3
-       movzb   ($sbox,$s1,1),$s2       #$t2
-
-       shl     \$16,$acc2
-       xor     $t5,$t2
-       shl     \$24,$acc0
-       xor     $acc2,$t3
-       shl     \$24,$acc1
-       xor     $acc0,$t0
-       shl     \$24,$s3
-       xor     $acc1,$t1
-       shl     \$24,$s2
-       mov     $t0,$s0
-       mov     $t1,$s1
-       xor     $t2,$s2
-       xor     $t3,$s3
-___
-}
-
-sub enctransform_ref()
-{ my $sn = shift;
-  my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-       mov     $sn,$acc
-       and     \$0x80808080,$acc
-       mov     $acc,$tmp
-       shr     \$7,$tmp
-       lea     ($sn,$sn),$r2
-       sub     $tmp,$acc
-       and     \$0xfefefefe,$r2
-       and     \$0x1b1b1b1b,$acc
-       mov     $sn,$tmp
-       xor     $acc,$r2
-
-       xor     $r2,$sn
-       rol     \$24,$sn
-       xor     $r2,$sn
-       ror     \$16,$tmp
-       xor     $tmp,$sn
-       ror     \$8,$tmp
-       xor     $tmp,$sn
-___
-}
-
-# unlike decrypt case it does not pay off to parallelize enctransform
-sub enctransform()
-{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
-
-$code.=<<___;
-       mov     \$0x80808080,$t0
-       mov     \$0x80808080,$t1
-       and     $s0,$t0
-       and     $s1,$t1
-       mov     $t0,$acc0
-       mov     $t1,$acc1
-       shr     \$7,$t0
-       lea     ($s0,$s0),$r20
-       shr     \$7,$t1
-       lea     ($s1,$s1),$r21
-       sub     $t0,$acc0
-       sub     $t1,$acc1
-       and     \$0xfefefefe,$r20
-       and     \$0xfefefefe,$r21
-       and     \$0x1b1b1b1b,$acc0
-       and     \$0x1b1b1b1b,$acc1
-       mov     $s0,$t0
-       mov     $s1,$t1
-       xor     $acc0,$r20
-       xor     $acc1,$r21
-
-       xor     $r20,$s0
-       xor     $r21,$s1
-        mov    \$0x80808080,$t2
-       rol     \$24,$s0
-        mov    \$0x80808080,$t3
-       rol     \$24,$s1
-        and    $s2,$t2
-        and    $s3,$t3
-       xor     $r20,$s0
-       xor     $r21,$s1
-        mov    $t2,$acc0
-       ror     \$16,$t0
-        mov    $t3,$acc1
-       ror     \$16,$t1
-        lea    ($s2,$s2),$r20
-        shr    \$7,$t2
-       xor     $t0,$s0
-        shr    \$7,$t3
-       xor     $t1,$s1
-       ror     \$8,$t0
-        lea    ($s3,$s3),$r21
-       ror     \$8,$t1
-        sub    $t2,$acc0
-        sub    $t3,$acc1
-       xor     $t0,$s0
-       xor     $t1,$s1
-
-       and     \$0xfefefefe,$r20
-       and     \$0xfefefefe,$r21
-       and     \$0x1b1b1b1b,$acc0
-       and     \$0x1b1b1b1b,$acc1
-       mov     $s2,$t2
-       mov     $s3,$t3
-       xor     $acc0,$r20
-       xor     $acc1,$r21
-
-       ror     \$16,$t2
-       xor     $r20,$s2
-       ror     \$16,$t3
-       xor     $r21,$s3
-       rol     \$24,$s2
-       mov     0($sbox),$acc0                  # prefetch Te4
-       rol     \$24,$s3
-       xor     $r20,$s2
-       mov     64($sbox),$acc1
-       xor     $r21,$s3
-       mov     128($sbox),$r20
-       xor     $t2,$s2
-       ror     \$8,$t2
-       xor     $t3,$s3
-       ror     \$8,$t3
-       xor     $t2,$s2
-       mov     192($sbox),$r21
-       xor     $t3,$s3
-___
-}
-
-$code.=<<___;
-.type  _x86_64_AES_encrypt_compact,\@abi-omnipotent
-.align 16
-_x86_64_AES_encrypt_compact:
-.cfi_startproc
-       lea     128($sbox),$inp                 # size optimization
-       mov     0-128($inp),$acc1               # prefetch Te4
-       mov     32-128($inp),$acc2
-       mov     64-128($inp),$t0
-       mov     96-128($inp),$t1
-       mov     128-128($inp),$acc1
-       mov     160-128($inp),$acc2
-       mov     192-128($inp),$t0
-       mov     224-128($inp),$t1
-       jmp     .Lenc_loop_compact
-.align 16
-.Lenc_loop_compact:
-               xor     0($key),$s0             # xor with key
-               xor     4($key),$s1
-               xor     8($key),$s2
-               xor     12($key),$s3
-               lea     16($key),$key
-___
-               &enccompactvert();
-$code.=<<___;
-               cmp     16(%rsp),$key
-               je      .Lenc_compact_done
-___
-               &enctransform();
-$code.=<<___;
-       jmp     .Lenc_loop_compact
-.align 16
-.Lenc_compact_done:
-       xor     0($key),$s0
-       xor     4($key),$s1
-       xor     8($key),$s2
-       xor     12($key),$s3
-       .byte   0xf3,0xc3                       # rep ret
-.cfi_endproc
-.size  _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
-___
-
-# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
-$code.=<<___;
-.globl AES_encrypt
-.type  AES_encrypt,\@function,3
-.align 16
-.globl asm_AES_encrypt
-.hidden        asm_AES_encrypt
-asm_AES_encrypt:
-AES_encrypt:
-.cfi_startproc
-       mov     %rsp,%rax
-.cfi_def_cfa_register  %rax
-       push    %rbx
-.cfi_push      %rbx
-       push    %rbp
-.cfi_push      %rbp
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-
-       # allocate frame "above" key schedule
-       lea     -63(%rdx),%rcx  # %rdx is key argument
-       and     \$-64,%rsp
-       sub     %rsp,%rcx
-       neg     %rcx
-       and     \$0x3c0,%rcx
-       sub     %rcx,%rsp
-       sub     \$32,%rsp
-
-       mov     %rsi,16(%rsp)   # save out
-       mov     %rax,24(%rsp)   # save original stack pointer
-.cfi_cfa_expression    %rsp+24,deref,+8
-.Lenc_prologue:
-
-       mov     %rdx,$key
-       mov     240($key),$rnds # load rounds
-
-       mov     0(%rdi),$s0     # load input vector
-       mov     4(%rdi),$s1
-       mov     8(%rdi),$s2
-       mov     12(%rdi),$s3
-
-       shl     \$4,$rnds
-       lea     ($key,$rnds),%rbp
-       mov     $key,(%rsp)     # key schedule
-       mov     %rbp,8(%rsp)    # end of key schedule
-
-       # pick Te4 copy which can't "overlap" with stack frame or key schedule
-       lea     .LAES_Te+2048(%rip),$sbox
-       lea     768(%rsp),%rbp
-       sub     $sbox,%rbp
-       and     \$0x300,%rbp
-       lea     ($sbox,%rbp),$sbox
-
-       call    _x86_64_AES_encrypt_compact
-
-       mov     16(%rsp),$out   # restore out
-       mov     24(%rsp),%rsi   # restore saved stack pointer
-.cfi_def_cfa   %rsi,8
-       mov     $s0,0($out)     # write output vector
-       mov     $s1,4($out)
-       mov     $s2,8($out)
-       mov     $s3,12($out)
-
-       mov     -48(%rsi),%r15
-.cfi_restore   %r15
-       mov     -40(%rsi),%r14
-.cfi_restore   %r14
-       mov     -32(%rsi),%r13
-.cfi_restore   %r13
-       mov     -24(%rsi),%r12
-.cfi_restore   %r12
-       mov     -16(%rsi),%rbp
-.cfi_restore   %rbp
-       mov     -8(%rsi),%rbx
-.cfi_restore   %rbx
-       lea     (%rsi),%rsp
-.cfi_def_cfa_register  %rsp
-.Lenc_epilogue:
-       ret
-.cfi_endproc
-.size  AES_encrypt,.-AES_encrypt
-___
-
-#------------------------------------------------------------------#
-
-sub decvert()
-{ my $t3="%r8d";       # zaps $inp!
-
-$code.=<<___;
-       # favor 3-way issue Opteron pipeline...
-       movzb   `&lo("$s0")`,$acc0
-       movzb   `&lo("$s1")`,$acc1
-       movzb   `&lo("$s2")`,$acc2
-       mov     0($sbox,$acc0,8),$t0
-       mov     0($sbox,$acc1,8),$t1
-       mov     0($sbox,$acc2,8),$t2
-
-       movzb   `&hi("$s3")`,$acc0
-       movzb   `&hi("$s0")`,$acc1
-       movzb   `&lo("$s3")`,$acc2
-       xor     3($sbox,$acc0,8),$t0
-       xor     3($sbox,$acc1,8),$t1
-       mov     0($sbox,$acc2,8),$t3
-
-       movzb   `&hi("$s1")`,$acc0
-       shr     \$16,$s0
-       movzb   `&hi("$s2")`,$acc2
-       xor     3($sbox,$acc0,8),$t2
-       shr     \$16,$s3
-       xor     3($sbox,$acc2,8),$t3
-
-       shr     \$16,$s1
-       lea     16($key),$key
-       shr     \$16,$s2
-
-       movzb   `&lo("$s2")`,$acc0
-       movzb   `&lo("$s3")`,$acc1
-       movzb   `&lo("$s0")`,$acc2
-       xor     2($sbox,$acc0,8),$t0
-       xor     2($sbox,$acc1,8),$t1
-       xor     2($sbox,$acc2,8),$t2
-
-       movzb   `&hi("$s1")`,$acc0
-       movzb   `&hi("$s2")`,$acc1
-       movzb   `&lo("$s1")`,$acc2
-       xor     1($sbox,$acc0,8),$t0
-       xor     1($sbox,$acc1,8),$t1
-       xor     2($sbox,$acc2,8),$t3
-
-       movzb   `&hi("$s3")`,$acc0
-       mov     12($key),$s3
-       movzb   `&hi("$s0")`,$acc2
-       xor     1($sbox,$acc0,8),$t2
-       mov     0($key),$s0
-       xor     1($sbox,$acc2,8),$t3
-
-       xor     $t0,$s0
-       mov     4($key),$s1
-       mov     8($key),$s2
-       xor     $t2,$s2
-       xor     $t1,$s1
-       xor     $t3,$s3
-___
-}
-
-sub declastvert()
-{ my $t3="%r8d";       # zaps $inp!
-
-$code.=<<___;
-       lea     2048($sbox),$sbox       # size optimization
-       movzb   `&lo("$s0")`,$acc0
-       movzb   `&lo("$s1")`,$acc1
-       movzb   `&lo("$s2")`,$acc2
-       movzb   ($sbox,$acc0,1),$t0
-       movzb   ($sbox,$acc1,1),$t1
-       movzb   ($sbox,$acc2,1),$t2
-
-       movzb   `&lo("$s3")`,$acc0
-       movzb   `&hi("$s3")`,$acc1
-       movzb   `&hi("$s0")`,$acc2
-       movzb   ($sbox,$acc0,1),$t3
-       movzb   ($sbox,$acc1,1),$acc1   #$t0
-       movzb   ($sbox,$acc2,1),$acc2   #$t1
-
-       shl     \$8,$acc1
-       shl     \$8,$acc2
-
-       xor     $acc1,$t0
-       xor     $acc2,$t1
-       shr     \$16,$s3
-
-       movzb   `&hi("$s1")`,$acc0
-       movzb   `&hi("$s2")`,$acc1
-       shr     \$16,$s0
-       movzb   ($sbox,$acc0,1),$acc0   #$t2
-       movzb   ($sbox,$acc1,1),$acc1   #$t3
-
-       shl     \$8,$acc0
-       shl     \$8,$acc1
-       shr     \$16,$s1
-       xor     $acc0,$t2
-       xor     $acc1,$t3
-       shr     \$16,$s2
-
-       movzb   `&lo("$s2")`,$acc0
-       movzb   `&lo("$s3")`,$acc1
-       movzb   `&lo("$s0")`,$acc2
-       movzb   ($sbox,$acc0,1),$acc0   #$t0
-       movzb   ($sbox,$acc1,1),$acc1   #$t1
-       movzb   ($sbox,$acc2,1),$acc2   #$t2
-
-       shl     \$16,$acc0
-       shl     \$16,$acc1
-       shl     \$16,$acc2
-
-       xor     $acc0,$t0
-       xor     $acc1,$t1
-       xor     $acc2,$t2
-
-       movzb   `&lo("$s1")`,$acc0
-       movzb   `&hi("$s1")`,$acc1
-       movzb   `&hi("$s2")`,$acc2
-       movzb   ($sbox,$acc0,1),$acc0   #$t3
-       movzb   ($sbox,$acc1,1),$acc1   #$t0
-       movzb   ($sbox,$acc2,1),$acc2   #$t1
-
-       shl     \$16,$acc0
-       shl     \$24,$acc1
-       shl     \$24,$acc2
-
-       xor     $acc0,$t3
-       xor     $acc1,$t0
-       xor     $acc2,$t1
-
-       movzb   `&hi("$s3")`,$acc0
-       movzb   `&hi("$s0")`,$acc1
-       mov     16+12($key),$s3
-       movzb   ($sbox,$acc0,1),$acc0   #$t2
-       movzb   ($sbox,$acc1,1),$acc1   #$t3
-       mov     16+0($key),$s0
-
-       shl     \$24,$acc0
-       shl     \$24,$acc1
-
-       xor     $acc0,$t2
-       xor     $acc1,$t3
-
-       mov     16+4($key),$s1
-       mov     16+8($key),$s2
-       lea     -2048($sbox),$sbox
-       xor     $t0,$s0
-       xor     $t1,$s1
-       xor     $t2,$s2
-       xor     $t3,$s3
-___
-}
-
-sub decstep()
-{ my ($i,@s) = @_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-       $code.="        mov     $s[0],$out\n"           if ($i!=3);
-                       $tmp1=$s[2]                     if ($i==3);
-       $code.="        mov     $s[2],$tmp1\n"          if ($i!=3);
-       $code.="        and     \$0xFF,$out\n";
-
-       $code.="        mov     0($sbox,$out,8),$out\n";
-       $code.="        shr     \$16,$tmp1\n";
-                       $tmp2=$s[3]                     if ($i==3);
-       $code.="        mov     $s[3],$tmp2\n"          if ($i!=3);
-
-                       $tmp0=$s[1]                     if ($i==3);
-       $code.="        movzb   ".&hi($s[1]).",$tmp0\n";
-       $code.="        and     \$0xFF,$tmp1\n";
-       $code.="        shr     \$24,$tmp2\n";
-
-       $code.="        xor     3($sbox,$tmp0,8),$out\n";
-       $code.="        xor     2($sbox,$tmp1,8),$out\n";
-       $code.="        xor     1($sbox,$tmp2,8),$out\n";
-
-       $code.="        mov     $t2,$s[1]\n"            if ($i==3);
-       $code.="        mov     $t1,$s[2]\n"            if ($i==3);
-       $code.="        mov     $t0,$s[3]\n"            if ($i==3);
-       $code.="\n";
-}
-
-sub declast()
-{ my ($i,@s)=@_;
-  my $tmp0=$acc0;
-  my $tmp1=$acc1;
-  my $tmp2=$acc2;
-  my $out=($t0,$t1,$t2,$s[0])[$i];
-
-       $code.="        mov     $s[0],$out\n"           if ($i!=3);
-                       $tmp1=$s[2]                     if ($i==3);
-       $code.="        mov     $s[2],$tmp1\n"          if ($i!=3);
-       $code.="        and     \$0xFF,$out\n";
-
-       $code.="        movzb   2048($sbox,$out,1),$out\n";
-       $code.="        shr     \$16,$tmp1\n";
-                       $tmp2=$s[3]                     if ($i==3);
-       $code.="        mov     $s[3],$tmp2\n"          if ($i!=3);
-
-                       $tmp0=$s[1]                     if ($i==3);
-       $code.="        movzb   ".&hi($s[1]).",$tmp0\n";
-       $code.="        and     \$0xFF,$tmp1\n";
-       $code.="        shr     \$24,$tmp2\n";
-
-       $code.="        movzb   2048($sbox,$tmp0,1),$tmp0\n";
-       $code.="        movzb   2048($sbox,$tmp1,1),$tmp1\n";
-       $code.="        movzb   2048($sbox,$tmp2,1),$tmp2\n";
-
-       $code.="        shl     \$8,$tmp0\n";
-       $code.="        shl     \$16,$tmp1\n";
-       $code.="        shl     \$24,$tmp2\n";
-
-       $code.="        xor     $tmp0,$out\n";
-       $code.="        mov     $t2,$s[1]\n"            if ($i==3);
-       $code.="        xor     $tmp1,$out\n";
-       $code.="        mov     $t1,$s[2]\n"            if ($i==3);
-       $code.="        xor     $tmp2,$out\n";
-       $code.="        mov     $t0,$s[3]\n"            if ($i==3);
-       $code.="\n";
-}
-
-$code.=<<___;
-.type  _x86_64_AES_decrypt,\@abi-omnipotent
-.align 16
-_x86_64_AES_decrypt:
-       xor     0($key),$s0                     # xor with key
-       xor     4($key),$s1
-       xor     8($key),$s2
-       xor     12($key),$s3
-
-       mov     240($key),$rnds                 # load key->rounds
-       sub     \$1,$rnds
-       jmp     .Ldec_loop
-.align 16
-.Ldec_loop:
-___
-       if ($verticalspin) { &decvert(); }
-       else {  &decstep(0,$s0,$s3,$s2,$s1);
-               &decstep(1,$s1,$s0,$s3,$s2);
-               &decstep(2,$s2,$s1,$s0,$s3);
-               &decstep(3,$s3,$s2,$s1,$s0);
-               $code.=<<___;
-               lea     16($key),$key
-               xor     0($key),$s0                     # xor with key
-               xor     4($key),$s1
-               xor     8($key),$s2
-               xor     12($key),$s3
-___
-       }
-$code.=<<___;
-       sub     \$1,$rnds
-       jnz     .Ldec_loop
-___
-       if ($verticalspin) { &declastvert(); }
-       else {  &declast(0,$s0,$s3,$s2,$s1);
-               &declast(1,$s1,$s0,$s3,$s2);
-               &declast(2,$s2,$s1,$s0,$s3);
-               &declast(3,$s3,$s2,$s1,$s0);
-               $code.=<<___;
-               xor     16+0($key),$s0                  # xor with key
-               xor     16+4($key),$s1
-               xor     16+8($key),$s2
-               xor     16+12($key),$s3
-___
-       }
-$code.=<<___;
-       .byte   0xf3,0xc3                       # rep ret
-.size  _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
-___
-
-sub deccompactvert()
-{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
-
-$code.=<<___;
-       movzb   `&lo("$s0")`,$t0
-       movzb   `&lo("$s1")`,$t1
-       movzb   `&lo("$s2")`,$t2
-       movzb   `&lo("$s3")`,$t3
-       movzb   `&hi("$s3")`,$acc0
-       movzb   `&hi("$s0")`,$acc1
-       shr     \$16,$s3
-       movzb   `&hi("$s1")`,$acc2
-       movzb   ($sbox,$t0,1),$t0
-       movzb   ($sbox,$t1,1),$t1
-       movzb   ($sbox,$t2,1),$t2
-       movzb   ($sbox,$t3,1),$t3
-
-       movzb   ($sbox,$acc0,1),$t4     #$t0
-       movzb   `&hi("$s2")`,$acc0
-       movzb   ($sbox,$acc1,1),$t5     #$t1
-       movzb   ($sbox,$acc2,1),$acc2   #$t2
-       movzb   ($sbox,$acc0,1),$acc0   #$t3
-
-       shr     \$16,$s2
-       shl     \$8,$t5
-       shl     \$8,$t4
-       movzb   `&lo("$s2")`,$acc1
-       shr     \$16,$s0
-       xor     $t4,$t0
-       shr     \$16,$s1
-       movzb   `&lo("$s3")`,$t4
-
-       shl     \$8,$acc2
-       xor     $t5,$t1
-       shl     \$8,$acc0
-       movzb   `&lo("$s0")`,$t5
-       movzb   ($sbox,$acc1,1),$acc1   #$t0
-       xor     $acc2,$t2
-       movzb   `&lo("$s1")`,$acc2
-
-       shl     \$16,$acc1
-       xor     $acc0,$t3
-       movzb   ($sbox,$t4,1),$t4       #$t1
-       movzb   `&hi("$s1")`,$acc0
-       movzb   ($sbox,$acc2,1),$acc2   #$t3
-       xor     $acc1,$t0
-       movzb   ($sbox,$t5,1),$t5       #$t2
-       movzb   `&hi("$s2")`,$acc1
-
-       shl     \$16,$acc2
-       shl     \$16,$t4
-       shl     \$16,$t5
-       xor     $acc2,$t3
-       movzb   `&hi("$s3")`,$acc2
-       xor     $t4,$t1
-       shr     \$8,$s0
-       xor     $t5,$t2
-
-       movzb   ($sbox,$acc0,1),$acc0   #$t0
-       movzb   ($sbox,$acc1,1),$s1     #$t1
-       movzb   ($sbox,$acc2,1),$s2     #$t2
-       movzb   ($sbox,$s0,1),$s3       #$t3
-
-       mov     $t0,$s0
-       shl     \$24,$acc0
-       shl     \$24,$s1
-       shl     \$24,$s2
-       xor     $acc0,$s0
-       shl     \$24,$s3
-       xor     $t1,$s1
-       xor     $t2,$s2
-       xor     $t3,$s3
-___
-}
-
-# parallelized version! input is pair of 64-bit values: %rax=s1.s0
-# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
-# %ecx=s2 and %edx=s3.
-sub dectransform()
-{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
-  my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
-  my $prefetch = shift;
-
-$code.=<<___;
-       mov     $mask80,$tp40
-       mov     $mask80,$tp48
-       and     $tp10,$tp40
-       and     $tp18,$tp48
-       mov     $tp40,$acc0
-       mov     $tp48,$acc8
-       shr     \$7,$tp40
-       lea     ($tp10,$tp10),$tp20
-       shr     \$7,$tp48
-       lea     ($tp18,$tp18),$tp28
-       sub     $tp40,$acc0
-       sub     $tp48,$acc8
-       and     $maskfe,$tp20
-       and     $maskfe,$tp28
-       and     $mask1b,$acc0
-       and     $mask1b,$acc8
-       xor     $acc0,$tp20
-       xor     $acc8,$tp28
-       mov     $mask80,$tp80
-       mov     $mask80,$tp88
-
-       and     $tp20,$tp80
-       and     $tp28,$tp88
-       mov     $tp80,$acc0
-       mov     $tp88,$acc8
-       shr     \$7,$tp80
-       lea     ($tp20,$tp20),$tp40
-       shr     \$7,$tp88
-       lea     ($tp28,$tp28),$tp48
-       sub     $tp80,$acc0
-       sub     $tp88,$acc8
-       and     $maskfe,$tp40
-       and     $maskfe,$tp48
-       and     $mask1b,$acc0
-       and     $mask1b,$acc8
-       xor     $acc0,$tp40
-       xor     $acc8,$tp48
-       mov     $mask80,$tp80
-       mov     $mask80,$tp88
-
-       and     $tp40,$tp80
-       and     $tp48,$tp88
-       mov     $tp80,$acc0
-       mov     $tp88,$acc8
-       shr     \$7,$tp80
-        xor    $tp10,$tp20             # tp2^=tp1
-       shr     \$7,$tp88
-        xor    $tp18,$tp28             # tp2^=tp1
-       sub     $tp80,$acc0
-       sub     $tp88,$acc8
-       lea     ($tp40,$tp40),$tp80
-       lea     ($tp48,$tp48),$tp88
-        xor    $tp10,$tp40             # tp4^=tp1
-        xor    $tp18,$tp48             # tp4^=tp1
-       and     $maskfe,$tp80
-       and     $maskfe,$tp88
-       and     $mask1b,$acc0
-       and     $mask1b,$acc8
-       xor     $acc0,$tp80
-       xor     $acc8,$tp88
-
-       xor     $tp80,$tp10             # tp1^=tp8
-       xor     $tp88,$tp18             # tp1^=tp8
-       xor     $tp80,$tp20             # tp2^tp1^=tp8
-       xor     $tp88,$tp28             # tp2^tp1^=tp8
-       mov     $tp10,$acc0
-       mov     $tp18,$acc8
-       xor     $tp80,$tp40             # tp4^tp1^=tp8
-       shr     \$32,$acc0
-       xor     $tp88,$tp48             # tp4^tp1^=tp8
-       shr     \$32,$acc8
-       xor     $tp20,$tp80             # tp8^=tp8^tp2^tp1=tp2^tp1
-       rol     \$8,`&LO("$tp10")`      # ROTATE(tp1^tp8,8)
-       xor     $tp28,$tp88             # tp8^=tp8^tp2^tp1=tp2^tp1
-       rol     \$8,`&LO("$tp18")`      # ROTATE(tp1^tp8,8)
-       xor     $tp40,$tp80             # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
-       rol     \$8,`&LO("$acc0")`      # ROTATE(tp1^tp8,8)
-       xor     $tp48,$tp88             # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
-
-       rol     \$8,`&LO("$acc8")`      # ROTATE(tp1^tp8,8)
-       xor     `&LO("$tp80")`,`&LO("$tp10")`
-       shr     \$32,$tp80
-       xor     `&LO("$tp88")`,`&LO("$tp18")`
-       shr     \$32,$tp88
-       xor     `&LO("$tp80")`,`&LO("$acc0")`
-       xor     `&LO("$tp88")`,`&LO("$acc8")`
-
-       mov     $tp20,$tp80
-       rol     \$24,`&LO("$tp20")`     # ROTATE(tp2^tp1^tp8,24)
-       mov     $tp28,$tp88
-       rol     \$24,`&LO("$tp28")`     # ROTATE(tp2^tp1^tp8,24)
-       shr     \$32,$tp80
-       xor     `&LO("$tp20")`,`&LO("$tp10")`
-       shr     \$32,$tp88
-       xor     `&LO("$tp28")`,`&LO("$tp18")`
-       rol     \$24,`&LO("$tp80")`     # ROTATE(tp2^tp1^tp8,24)
-       mov     $tp40,$tp20
-       rol     \$24,`&LO("$tp88")`     # ROTATE(tp2^tp1^tp8,24)
-       mov     $tp48,$tp28
-       shr     \$32,$tp20
-       xor     `&LO("$tp80")`,`&LO("$acc0")`
-       shr     \$32,$tp28
-       xor     `&LO("$tp88")`,`&LO("$acc8")`
-
-       `"mov   0($sbox),$mask80"       if ($prefetch)`
-       rol     \$16,`&LO("$tp40")`     # ROTATE(tp4^tp1^tp8,16)
-       `"mov   64($sbox),$maskfe"      if ($prefetch)`
-       rol     \$16,`&LO("$tp48")`     # ROTATE(tp4^tp1^tp8,16)
-       `"mov   128($sbox),$mask1b"     if ($prefetch)`
-       rol     \$16,`&LO("$tp20")`     # ROTATE(tp4^tp1^tp8,16)
-       `"mov   192($sbox),$tp80"       if ($prefetch)`
-       xor     `&LO("$tp40")`,`&LO("$tp10")`
-       rol     \$16,`&LO("$tp28")`     # ROTATE(tp4^tp1^tp8,16)
-       xor     `&LO("$tp48")`,`&LO("$tp18")`
-       `"mov   256($sbox),$tp88"       if ($prefetch)`
-       xor     `&LO("$tp20")`,`&LO("$acc0")`
-       xor     `&LO("$tp28")`,`&LO("$acc8")`
-___
-}
-
-$code.=<<___;
-.type  _x86_64_AES_decrypt_compact,\@abi-omnipotent
-.align 16
-_x86_64_AES_decrypt_compact:
-.cfi_startproc
-       lea     128($sbox),$inp                 # size optimization
-       mov     0-128($inp),$acc1               # prefetch Td4
-       mov     32-128($inp),$acc2
-       mov     64-128($inp),$t0
-       mov     96-128($inp),$t1
-       mov     128-128($inp),$acc1
-       mov     160-128($inp),$acc2
-       mov     192-128($inp),$t0
-       mov     224-128($inp),$t1
-       jmp     .Ldec_loop_compact
-
-.align 16
-.Ldec_loop_compact:
-               xor     0($key),$s0             # xor with key
-               xor     4($key),$s1
-               xor     8($key),$s2
-               xor     12($key),$s3
-               lea     16($key),$key
-___
-               &deccompactvert();
-$code.=<<___;
-               cmp     16(%rsp),$key
-               je      .Ldec_compact_done
-
-               mov     256+0($sbox),$mask80
-               shl     \$32,%rbx
-               shl     \$32,%rdx
-               mov     256+8($sbox),$maskfe
-               or      %rbx,%rax
-               or      %rdx,%rcx
-               mov     256+16($sbox),$mask1b
-___
-               &dectransform(1);
-$code.=<<___;
-       jmp     .Ldec_loop_compact
-.align 16
-.Ldec_compact_done:
-       xor     0($key),$s0
-       xor     4($key),$s1
-       xor     8($key),$s2
-       xor     12($key),$s3
-       .byte   0xf3,0xc3                       # rep ret
-.cfi_endproc
-.size  _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
-___
-
-# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
-$code.=<<___;
-.globl AES_decrypt
-.type  AES_decrypt,\@function,3
-.align 16
-.globl asm_AES_decrypt
-.hidden        asm_AES_decrypt
-asm_AES_decrypt:
-AES_decrypt:
-.cfi_startproc
-       mov     %rsp,%rax
-.cfi_def_cfa_register  %rax
-       push    %rbx
-.cfi_push      %rbx
-       push    %rbp
-.cfi_push      %rbp
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-
-       # allocate frame "above" key schedule
-       lea     -63(%rdx),%rcx  # %rdx is key argument
-       and     \$-64,%rsp
-       sub     %rsp,%rcx
-       neg     %rcx
-       and     \$0x3c0,%rcx
-       sub     %rcx,%rsp
-       sub     \$32,%rsp
-
-       mov     %rsi,16(%rsp)   # save out
-       mov     %rax,24(%rsp)   # save original stack pointer
-.cfi_cfa_expression    %rsp+24,deref,+8
-.Ldec_prologue:
-
-       mov     %rdx,$key
-       mov     240($key),$rnds # load rounds
-
-       mov     0(%rdi),$s0     # load input vector
-       mov     4(%rdi),$s1
-       mov     8(%rdi),$s2
-       mov     12(%rdi),$s3
-
-       shl     \$4,$rnds
-       lea     ($key,$rnds),%rbp
-       mov     $key,(%rsp)     # key schedule
-       mov     %rbp,8(%rsp)    # end of key schedule
-
-       # pick Td4 copy which can't "overlap" with stack frame or key schedule
-       lea     .LAES_Td+2048(%rip),$sbox
-       lea     768(%rsp),%rbp
-       sub     $sbox,%rbp
-       and     \$0x300,%rbp
-       lea     ($sbox,%rbp),$sbox
-       shr     \$3,%rbp        # recall "magic" constants!
-       add     %rbp,$sbox
-
-       call    _x86_64_AES_decrypt_compact
-
-       mov     16(%rsp),$out   # restore out
-       mov     24(%rsp),%rsi   # restore saved stack pointer
-.cfi_def_cfa   %rsi,8
-       mov     $s0,0($out)     # write output vector
-       mov     $s1,4($out)
-       mov     $s2,8($out)
-       mov     $s3,12($out)
-
-       mov     -48(%rsi),%r15
-.cfi_restore   %r15
-       mov     -40(%rsi),%r14
-.cfi_restore   %r14
-       mov     -32(%rsi),%r13
-.cfi_restore   %r13
-       mov     -24(%rsi),%r12
-.cfi_restore   %r12
-       mov     -16(%rsi),%rbp
-.cfi_restore   %rbp
-       mov     -8(%rsi),%rbx
-.cfi_restore   %rbx
-       lea     (%rsi),%rsp
-.cfi_def_cfa_register  %rsp
-.Ldec_epilogue:
-       ret
-.cfi_endproc
-.size  AES_decrypt,.-AES_decrypt
-___
-#------------------------------------------------------------------#
-
-sub enckey()
-{
-$code.=<<___;
-       movz    %dl,%esi                # rk[i]>>0
-       movzb   -128(%rbp,%rsi),%ebx
-       movz    %dh,%esi                # rk[i]>>8
-       shl     \$24,%ebx
-       xor     %ebx,%eax
-
-       movzb   -128(%rbp,%rsi),%ebx
-       shr     \$16,%edx
-       movz    %dl,%esi                # rk[i]>>16
-       xor     %ebx,%eax
-
-       movzb   -128(%rbp,%rsi),%ebx
-       movz    %dh,%esi                # rk[i]>>24
-       shl     \$8,%ebx
-       xor     %ebx,%eax
-
-       movzb   -128(%rbp,%rsi),%ebx
-       shl     \$16,%ebx
-       xor     %ebx,%eax
-
-       xor     1024-128(%rbp,%rcx,4),%eax              # rcon
-___
-}
-
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
-#                        AES_KEY *key)
-$code.=<<___;
-.globl AES_set_encrypt_key
-.type  AES_set_encrypt_key,\@function,3
-.align 16
-AES_set_encrypt_key:
-.cfi_startproc
-       push    %rbx
-.cfi_push      %rbx
-       push    %rbp
-.cfi_push      %rbp
-       push    %r12                    # redundant, but allows sharing the
-.cfi_push      %r12
-       push    %r13                    # exception handler...
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-       sub     \$8,%rsp
-.cfi_adjust_cfa_offset 8
-.Lenc_key_prologue:
-
-       call    _x86_64_AES_set_encrypt_key
-
-       mov     40(%rsp),%rbp
-.cfi_restore   %rbp
-       mov     48(%rsp),%rbx
-.cfi_restore   %rbx
-       add     \$56,%rsp
-.cfi_adjust_cfa_offset -56
-.Lenc_key_epilogue:
-       ret
-.cfi_endproc
-.size  AES_set_encrypt_key,.-AES_set_encrypt_key
-
-.type  _x86_64_AES_set_encrypt_key,\@abi-omnipotent
-.align 16
-_x86_64_AES_set_encrypt_key:
-.cfi_startproc
-       mov     %esi,%ecx                       # %ecx=bits
-       mov     %rdi,%rsi                       # %rsi=userKey
-       mov     %rdx,%rdi                       # %rdi=key
-
-       test    \$-1,%rsi
-       jz      .Lbadpointer
-       test    \$-1,%rdi
-       jz      .Lbadpointer
-
-       lea     .LAES_Te(%rip),%rbp
-       lea     2048+128(%rbp),%rbp
-
-       # prefetch Te4
-       mov     0-128(%rbp),%eax
-       mov     32-128(%rbp),%ebx
-       mov     64-128(%rbp),%r8d
-       mov     96-128(%rbp),%edx
-       mov     128-128(%rbp),%eax
-       mov     160-128(%rbp),%ebx
-       mov     192-128(%rbp),%r8d
-       mov     224-128(%rbp),%edx
-
-       cmp     \$128,%ecx
-       je      .L10rounds
-       cmp     \$192,%ecx
-       je      .L12rounds
-       cmp     \$256,%ecx
-       je      .L14rounds
-       mov     \$-2,%rax                       # invalid number of bits
-       jmp     .Lexit
-
-.L10rounds:
-       mov     0(%rsi),%rax                    # copy first 4 dwords
-       mov     8(%rsi),%rdx
-       mov     %rax,0(%rdi)
-       mov     %rdx,8(%rdi)
-
-       shr     \$32,%rdx
-       xor     %ecx,%ecx
-       jmp     .L10shortcut
-.align 4
-.L10loop:
-               mov     0(%rdi),%eax                    # rk[0]
-               mov     12(%rdi),%edx                   # rk[3]
-.L10shortcut:
-___
-               &enckey ();
-$code.=<<___;
-               mov     %eax,16(%rdi)                   # rk[4]
-               xor     4(%rdi),%eax
-               mov     %eax,20(%rdi)                   # rk[5]
-               xor     8(%rdi),%eax
-               mov     %eax,24(%rdi)                   # rk[6]
-               xor     12(%rdi),%eax
-               mov     %eax,28(%rdi)                   # rk[7]
-               add     \$1,%ecx
-               lea     16(%rdi),%rdi
-               cmp     \$10,%ecx
-       jl      .L10loop
-
-       movl    \$10,80(%rdi)                   # setup number of rounds
-       xor     %rax,%rax
-       jmp     .Lexit
-
-.L12rounds:
-       mov     0(%rsi),%rax                    # copy first 6 dwords
-       mov     8(%rsi),%rbx
-       mov     16(%rsi),%rdx
-       mov     %rax,0(%rdi)
-       mov     %rbx,8(%rdi)
-       mov     %rdx,16(%rdi)
-
-       shr     \$32,%rdx
-       xor     %ecx,%ecx
-       jmp     .L12shortcut
-.align 4
-.L12loop:
-               mov     0(%rdi),%eax                    # rk[0]
-               mov     20(%rdi),%edx                   # rk[5]
-.L12shortcut:
-___
-               &enckey ();
-$code.=<<___;
-               mov     %eax,24(%rdi)                   # rk[6]
-               xor     4(%rdi),%eax
-               mov     %eax,28(%rdi)                   # rk[7]
-               xor     8(%rdi),%eax
-               mov     %eax,32(%rdi)                   # rk[8]
-               xor     12(%rdi),%eax
-               mov     %eax,36(%rdi)                   # rk[9]
-
-               cmp     \$7,%ecx
-               je      .L12break
-               add     \$1,%ecx
-
-               xor     16(%rdi),%eax
-               mov     %eax,40(%rdi)                   # rk[10]
-               xor     20(%rdi),%eax
-               mov     %eax,44(%rdi)                   # rk[11]
-
-               lea     24(%rdi),%rdi
-       jmp     .L12loop
-.L12break:
-       movl    \$12,72(%rdi)           # setup number of rounds
-       xor     %rax,%rax
-       jmp     .Lexit
-
-.L14rounds:
-       mov     0(%rsi),%rax                    # copy first 8 dwords
-       mov     8(%rsi),%rbx
-       mov     16(%rsi),%rcx
-       mov     24(%rsi),%rdx
-       mov     %rax,0(%rdi)
-       mov     %rbx,8(%rdi)
-       mov     %rcx,16(%rdi)
-       mov     %rdx,24(%rdi)
-
-       shr     \$32,%rdx
-       xor     %ecx,%ecx
-       jmp     .L14shortcut
-.align 4
-.L14loop:
-               mov     0(%rdi),%eax                    # rk[0]
-               mov     28(%rdi),%edx                   # rk[7]
-.L14shortcut:
-___
-               &enckey ();
-$code.=<<___;
-               mov     %eax,32(%rdi)                   # rk[8]
-               xor     4(%rdi),%eax
-               mov     %eax,36(%rdi)                   # rk[9]
-               xor     8(%rdi),%eax
-               mov     %eax,40(%rdi)                   # rk[10]
-               xor     12(%rdi),%eax
-               mov     %eax,44(%rdi)                   # rk[11]
-
-               cmp     \$6,%ecx
-               je      .L14break
-               add     \$1,%ecx
-
-               mov     %eax,%edx
-               mov     16(%rdi),%eax                   # rk[4]
-               movz    %dl,%esi                        # rk[11]>>0
-               movzb   -128(%rbp,%rsi),%ebx
-               movz    %dh,%esi                        # rk[11]>>8
-               xor     %ebx,%eax
-
-               movzb   -128(%rbp,%rsi),%ebx
-               shr     \$16,%edx
-               shl     \$8,%ebx
-               movz    %dl,%esi                        # rk[11]>>16
-               xor     %ebx,%eax
-
-               movzb   -128(%rbp,%rsi),%ebx
-               movz    %dh,%esi                        # rk[11]>>24
-               shl     \$16,%ebx
-               xor     %ebx,%eax
-
-               movzb   -128(%rbp,%rsi),%ebx
-               shl     \$24,%ebx
-               xor     %ebx,%eax
-
-               mov     %eax,48(%rdi)                   # rk[12]
-               xor     20(%rdi),%eax
-               mov     %eax,52(%rdi)                   # rk[13]
-               xor     24(%rdi),%eax
-               mov     %eax,56(%rdi)                   # rk[14]
-               xor     28(%rdi),%eax
-               mov     %eax,60(%rdi)                   # rk[15]
-
-               lea     32(%rdi),%rdi
-       jmp     .L14loop
-.L14break:
-       movl    \$14,48(%rdi)           # setup number of rounds
-       xor     %rax,%rax
-       jmp     .Lexit
-
-.Lbadpointer:
-       mov     \$-1,%rax
-.Lexit:
-       .byte   0xf3,0xc3                       # rep ret
-.cfi_endproc
-.size  _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
-___
-
-sub deckey_ref()
-{ my ($i,$ptr,$te,$td) = @_;
-  my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
-$code.=<<___;
-       mov     $i($ptr),$tp1
-       mov     $tp1,$acc
-       and     \$0x80808080,$acc
-       mov     $acc,$tp4
-       shr     \$7,$tp4
-       lea     0($tp1,$tp1),$tp2
-       sub     $tp4,$acc
-       and     \$0xfefefefe,$tp2
-       and     \$0x1b1b1b1b,$acc
-       xor     $tp2,$acc
-       mov     $acc,$tp2
-
-       and     \$0x80808080,$acc
-       mov     $acc,$tp8
-       shr     \$7,$tp8
-       lea     0($tp2,$tp2),$tp4
-       sub     $tp8,$acc
-       and     \$0xfefefefe,$tp4
-       and     \$0x1b1b1b1b,$acc
-        xor    $tp1,$tp2               # tp2^tp1
-       xor     $tp4,$acc
-       mov     $acc,$tp4
-
-       and     \$0x80808080,$acc
-       mov     $acc,$tp8
-       shr     \$7,$tp8
-       sub     $tp8,$acc
-       lea     0($tp4,$tp4),$tp8
-        xor    $tp1,$tp4               # tp4^tp1
-       and     \$0xfefefefe,$tp8
-       and     \$0x1b1b1b1b,$acc
-       xor     $acc,$tp8
-
-       xor     $tp8,$tp1               # tp1^tp8
-       rol     \$8,$tp1                # ROTATE(tp1^tp8,8)
-       xor     $tp8,$tp2               # tp2^tp1^tp8
-       xor     $tp8,$tp4               # tp4^tp1^tp8
-       xor     $tp2,$tp8
-       xor     $tp4,$tp8               # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
-
-       xor     $tp8,$tp1
-       rol     \$24,$tp2               # ROTATE(tp2^tp1^tp8,24)
-       xor     $tp2,$tp1
-       rol     \$16,$tp4               # ROTATE(tp4^tp1^tp8,16)
-       xor     $tp4,$tp1
-
-       mov     $tp1,$i($ptr)
-___
-}
-
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
-#                        AES_KEY *key)
-$code.=<<___;
-.globl AES_set_decrypt_key
-.type  AES_set_decrypt_key,\@function,3
-.align 16
-AES_set_decrypt_key:
-.cfi_startproc
-       push    %rbx
-.cfi_push      %rbx
-       push    %rbp
-.cfi_push      %rbp
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-       push    %rdx                    # save key schedule
-.cfi_adjust_cfa_offset 8
-.Ldec_key_prologue:
-
-       call    _x86_64_AES_set_encrypt_key
-       mov     (%rsp),%r8              # restore key schedule
-       cmp     \$0,%eax
-       jne     .Labort
-
-       mov     240(%r8),%r14d          # pull number of rounds
-       xor     %rdi,%rdi
-       lea     (%rdi,%r14d,4),%rcx
-       mov     %r8,%rsi
-       lea     (%r8,%rcx,4),%rdi       # pointer to last chunk
-.align 4
-.Linvert:
-               mov     0(%rsi),%rax
-               mov     8(%rsi),%rbx
-               mov     0(%rdi),%rcx
-               mov     8(%rdi),%rdx
-               mov     %rax,0(%rdi)
-               mov     %rbx,8(%rdi)
-               mov     %rcx,0(%rsi)
-               mov     %rdx,8(%rsi)
-               lea     16(%rsi),%rsi
-               lea     -16(%rdi),%rdi
-               cmp     %rsi,%rdi
-       jne     .Linvert
-
-       lea     .LAES_Te+2048+1024(%rip),%rax   # rcon
-
-       mov     40(%rax),$mask80
-       mov     48(%rax),$maskfe
-       mov     56(%rax),$mask1b
-
-       mov     %r8,$key
-       sub     \$1,%r14d
-.align 4
-.Lpermute:
-               lea     16($key),$key
-               mov     0($key),%rax
-               mov     8($key),%rcx
-___
-               &dectransform ();
-$code.=<<___;
-               mov     %eax,0($key)
-               mov     %ebx,4($key)
-               mov     %ecx,8($key)
-               mov     %edx,12($key)
-               sub     \$1,%r14d
-       jnz     .Lpermute
-
-       xor     %rax,%rax
-.Labort:
-       mov     8(%rsp),%r15
-.cfi_restore   %r15
-       mov     16(%rsp),%r14
-.cfi_restore   %r14
-       mov     24(%rsp),%r13
-.cfi_restore   %r13
-       mov     32(%rsp),%r12
-.cfi_restore   %r12
-       mov     40(%rsp),%rbp
-.cfi_restore   %rbp
-       mov     48(%rsp),%rbx
-.cfi_restore   %rbx
-       add     \$56,%rsp
-.cfi_adjust_cfa_offset -56
-.Ldec_key_epilogue:
-       ret
-.cfi_endproc
-.size  AES_set_decrypt_key,.-AES_set_decrypt_key
-___
-
-# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
-#                      size_t length, const AES_KEY *key,
-#                      unsigned char *ivp,const int enc);
-{
-# stack frame layout
-# -8(%rsp)             return address
-my $keyp="0(%rsp)";            # one to pass as $key
-my $keyend="8(%rsp)";          # &(keyp->rd_key[4*keyp->rounds])
-my $_rsp="16(%rsp)";           # saved %rsp
-my $_inp="24(%rsp)";           # copy of 1st parameter, inp
-my $_out="32(%rsp)";           # copy of 2nd parameter, out
-my $_len="40(%rsp)";           # copy of 3rd parameter, length
-my $_key="48(%rsp)";           # copy of 4th parameter, key
-my $_ivp="56(%rsp)";           # copy of 5th parameter, ivp
-my $ivec="64(%rsp)";           # ivec[16]
-my $aes_key="80(%rsp)";                # copy of aes_key
-my $mark="80+240(%rsp)";       # copy of aes_key->rounds
-
-$code.=<<___;
-.globl AES_cbc_encrypt
-.type  AES_cbc_encrypt,\@function,6
-.align 16
-.extern        OPENSSL_ia32cap_P
-.globl asm_AES_cbc_encrypt
-.hidden        asm_AES_cbc_encrypt
-asm_AES_cbc_encrypt:
-AES_cbc_encrypt:
-.cfi_startproc
-       cmp     \$0,%rdx        # check length
-       je      .Lcbc_epilogue
-       pushfq
-# This could be .cfi_push 49, but libunwind fails on registers it does not
-# recognize. See https://bugzilla.redhat.com/show_bug.cgi?id=217087.
-.cfi_adjust_cfa_offset 8
-       push    %rbx
-.cfi_push      %rbx
-       push    %rbp
-.cfi_push      %rbp
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-.Lcbc_prologue:
-
-       cld
-       mov     %r9d,%r9d       # clear upper half of enc
-
-       lea     .LAES_Te(%rip),$sbox
-       lea     .LAES_Td(%rip),%r10
-       cmp     \$0,%r9
-       cmoveq  %r10,$sbox
-
-.cfi_remember_state
-       mov     OPENSSL_ia32cap_P(%rip),%r10d
-       cmp     \$$speed_limit,%rdx
-       jb      .Lcbc_slow_prologue
-       test    \$15,%rdx
-       jnz     .Lcbc_slow_prologue
-       bt      \$28,%r10d
-       jc      .Lcbc_slow_prologue
-
-       # allocate aligned stack frame...
-       lea     -88-248(%rsp),$key
-       and     \$-64,$key
-
-       # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
-       mov     $sbox,%r10
-       lea     2304($sbox),%r11
-       mov     $key,%r12
-       and     \$0xFFF,%r10    # s = $sbox&0xfff
-       and     \$0xFFF,%r11    # e = ($sbox+2048)&0xfff
-       and     \$0xFFF,%r12    # p = %rsp&0xfff
-
-       cmp     %r11,%r12       # if (p=>e) %rsp =- (p-e);
-       jb      .Lcbc_te_break_out
-       sub     %r11,%r12
-       sub     %r12,$key
-       jmp     .Lcbc_te_ok
-.Lcbc_te_break_out:            # else %rsp -= (p-s)&0xfff + framesz
-       sub     %r10,%r12
-       and     \$0xFFF,%r12
-       add     \$320,%r12
-       sub     %r12,$key
-.align 4
-.Lcbc_te_ok:
-
-       xchg    %rsp,$key
-.cfi_def_cfa_register  $key
-       #add    \$8,%rsp        # reserve for return address!
-       mov     $key,$_rsp      # save %rsp
-.cfi_cfa_expression    $_rsp,deref,+64
-.Lcbc_fast_body:
-       mov     %rdi,$_inp      # save copy of inp
-       mov     %rsi,$_out      # save copy of out
-       mov     %rdx,$_len      # save copy of len
-       mov     %rcx,$_key      # save copy of key
-       mov     %r8,$_ivp       # save copy of ivp
-       movl    \$0,$mark       # copy of aes_key->rounds = 0;
-       mov     %r8,%rbp        # rearrange input arguments
-       mov     %r9,%rbx
-       mov     %rsi,$out
-       mov     %rdi,$inp
-       mov     %rcx,$key
-
-       mov     240($key),%eax          # key->rounds
-       # do we copy key schedule to stack?
-       mov     $key,%r10
-       sub     $sbox,%r10
-       and     \$0xfff,%r10
-       cmp     \$2304,%r10
-       jb      .Lcbc_do_ecopy
-       cmp     \$4096-248,%r10
-       jb      .Lcbc_skip_ecopy
-.align 4
-.Lcbc_do_ecopy:
-               mov     $key,%rsi
-               lea     $aes_key,%rdi
-               lea     $aes_key,$key
-               mov     \$240/8,%ecx
-               .long   0x90A548F3      # rep movsq
-               mov     %eax,(%rdi)     # copy aes_key->rounds
-.Lcbc_skip_ecopy:
-       mov     $key,$keyp      # save key pointer
-
-       mov     \$18,%ecx
-.align 4
-.Lcbc_prefetch_te:
-               mov     0($sbox),%r10
-               mov     32($sbox),%r11
-               mov     64($sbox),%r12
-               mov     96($sbox),%r13
-               lea     128($sbox),$sbox
-               sub     \$1,%ecx
-       jnz     .Lcbc_prefetch_te
-       lea     -2304($sbox),$sbox
-
-       cmp     \$0,%rbx
-       je      .LFAST_DECRYPT
-
-#----------------------------- ENCRYPT -----------------------------#
-       mov     0(%rbp),$s0             # load iv
-       mov     4(%rbp),$s1
-       mov     8(%rbp),$s2
-       mov     12(%rbp),$s3
-
-.align 4
-.Lcbc_fast_enc_loop:
-               xor     0($inp),$s0
-               xor     4($inp),$s1
-               xor     8($inp),$s2
-               xor     12($inp),$s3
-               mov     $keyp,$key      # restore key
-               mov     $inp,$_inp      # if ($verticalspin) save inp
-
-               call    _x86_64_AES_encrypt
-
-               mov     $_inp,$inp      # if ($verticalspin) restore inp
-               mov     $_len,%r10
-               mov     $s0,0($out)
-               mov     $s1,4($out)
-               mov     $s2,8($out)
-               mov     $s3,12($out)
-
-               lea     16($inp),$inp
-               lea     16($out),$out
-               sub     \$16,%r10
-               test    \$-16,%r10
-               mov     %r10,$_len
-       jnz     .Lcbc_fast_enc_loop
-       mov     $_ivp,%rbp      # restore ivp
-       mov     $s0,0(%rbp)     # save ivec
-       mov     $s1,4(%rbp)
-       mov     $s2,8(%rbp)
-       mov     $s3,12(%rbp)
-
-       jmp     .Lcbc_fast_cleanup
-
-#----------------------------- DECRYPT -----------------------------#
-.align 16
-.LFAST_DECRYPT:
-       cmp     $inp,$out
-       je      .Lcbc_fast_dec_in_place
-
-       mov     %rbp,$ivec
-.align 4
-.Lcbc_fast_dec_loop:
-               mov     0($inp),$s0     # read input
-               mov     4($inp),$s1
-               mov     8($inp),$s2
-               mov     12($inp),$s3
-               mov     $keyp,$key      # restore key
-               mov     $inp,$_inp      # if ($verticalspin) save inp
-
-               call    _x86_64_AES_decrypt
-
-               mov     $ivec,%rbp      # load ivp
-               mov     $_inp,$inp      # if ($verticalspin) restore inp
-               mov     $_len,%r10      # load len
-               xor     0(%rbp),$s0     # xor iv
-               xor     4(%rbp),$s1
-               xor     8(%rbp),$s2
-               xor     12(%rbp),$s3
-               mov     $inp,%rbp       # current input, next iv
-
-               sub     \$16,%r10
-               mov     %r10,$_len      # update len
-               mov     %rbp,$ivec      # update ivp
-
-               mov     $s0,0($out)     # write output
-               mov     $s1,4($out)
-               mov     $s2,8($out)
-               mov     $s3,12($out)
-
-               lea     16($inp),$inp
-               lea     16($out),$out
-       jnz     .Lcbc_fast_dec_loop
-       mov     $_ivp,%r12              # load user ivp
-       mov     0(%rbp),%r10            # load iv
-       mov     8(%rbp),%r11
-       mov     %r10,0(%r12)            # copy back to user
-       mov     %r11,8(%r12)
-       jmp     .Lcbc_fast_cleanup
-
-.align 16
-.Lcbc_fast_dec_in_place:
-       mov     0(%rbp),%r10            # copy iv to stack
-       mov     8(%rbp),%r11
-       mov     %r10,0+$ivec
-       mov     %r11,8+$ivec
-.align 4
-.Lcbc_fast_dec_in_place_loop:
-               mov     0($inp),$s0     # load input
-               mov     4($inp),$s1
-               mov     8($inp),$s2
-               mov     12($inp),$s3
-               mov     $keyp,$key      # restore key
-               mov     $inp,$_inp      # if ($verticalspin) save inp
-
-               call    _x86_64_AES_decrypt
-
-               mov     $_inp,$inp      # if ($verticalspin) restore inp
-               mov     $_len,%r10
-               xor     0+$ivec,$s0
-               xor     4+$ivec,$s1
-               xor     8+$ivec,$s2
-               xor     12+$ivec,$s3
-
-               mov     0($inp),%r11    # load input
-               mov     8($inp),%r12
-               sub     \$16,%r10
-               jz      .Lcbc_fast_dec_in_place_done
-
-               mov     %r11,0+$ivec    # copy input to iv
-               mov     %r12,8+$ivec
-
-               mov     $s0,0($out)     # save output [zaps input]
-               mov     $s1,4($out)
-               mov     $s2,8($out)
-               mov     $s3,12($out)
-
-               lea     16($inp),$inp
-               lea     16($out),$out
-               mov     %r10,$_len
-       jmp     .Lcbc_fast_dec_in_place_loop
-.Lcbc_fast_dec_in_place_done:
-       mov     $_ivp,%rdi
-       mov     %r11,0(%rdi)    # copy iv back to user
-       mov     %r12,8(%rdi)
-
-       mov     $s0,0($out)     # save output [zaps input]
-       mov     $s1,4($out)
-       mov     $s2,8($out)
-       mov     $s3,12($out)
-
-.align 4
-.Lcbc_fast_cleanup:
-       cmpl    \$0,$mark       # was the key schedule copied?
-       lea     $aes_key,%rdi
-       je      .Lcbc_exit
-               mov     \$240/8,%ecx
-               xor     %rax,%rax
-               .long   0x90AB48F3      # rep stosq
-
-       jmp     .Lcbc_exit
-
-#--------------------------- SLOW ROUTINE ---------------------------#
-.align 16
-.Lcbc_slow_prologue:
-.cfi_restore_state
-       # allocate aligned stack frame...
-       lea     -88(%rsp),%rbp
-       and     \$-64,%rbp
-       # ... just "above" key schedule
-       lea     -88-63(%rcx),%r10
-       sub     %rbp,%r10
-       neg     %r10
-       and     \$0x3c0,%r10
-       sub     %r10,%rbp
-
-       xchg    %rsp,%rbp
-.cfi_def_cfa_register  %rbp
-       #add    \$8,%rsp        # reserve for return address!
-       mov     %rbp,$_rsp      # save %rsp
-.cfi_cfa_expression    $_rsp,deref,+64
-.Lcbc_slow_body:
-       #mov    %rdi,$_inp      # save copy of inp
-       #mov    %rsi,$_out      # save copy of out
-       #mov    %rdx,$_len      # save copy of len
-       #mov    %rcx,$_key      # save copy of key
-       mov     %r8,$_ivp       # save copy of ivp
-       mov     %r8,%rbp        # rearrange input arguments
-       mov     %r9,%rbx
-       mov     %rsi,$out
-       mov     %rdi,$inp
-       mov     %rcx,$key
-       mov     %rdx,%r10
-
-       mov     240($key),%eax
-       mov     $key,$keyp      # save key pointer
-       shl     \$4,%eax
-       lea     ($key,%rax),%rax
-       mov     %rax,$keyend
-
-       # pick Te4 copy which can't "overlap" with stack frame or key schedule
-       lea     2048($sbox),$sbox
-       lea     768-8(%rsp),%rax
-       sub     $sbox,%rax
-       and     \$0x300,%rax
-       lea     ($sbox,%rax),$sbox
-
-       cmp     \$0,%rbx
-       je      .LSLOW_DECRYPT
-
-#--------------------------- SLOW ENCRYPT ---------------------------#
-       test    \$-16,%r10              # check upon length
-       mov     0(%rbp),$s0             # load iv
-       mov     4(%rbp),$s1
-       mov     8(%rbp),$s2
-       mov     12(%rbp),$s3
-       jz      .Lcbc_slow_enc_tail     # short input...
-
-.align 4
-.Lcbc_slow_enc_loop:
-               xor     0($inp),$s0
-               xor     4($inp),$s1
-               xor     8($inp),$s2
-               xor     12($inp),$s3
-               mov     $keyp,$key      # restore key
-               mov     $inp,$_inp      # save inp
-               mov     $out,$_out      # save out
-               mov     %r10,$_len      # save len
-
-               call    _x86_64_AES_encrypt_compact
-
-               mov     $_inp,$inp      # restore inp
-               mov     $_out,$out      # restore out
-               mov     $_len,%r10      # restore len
-               mov     $s0,0($out)
-               mov     $s1,4($out)
-               mov     $s2,8($out)
-               mov     $s3,12($out)
-
-               lea     16($inp),$inp
-               lea     16($out),$out
-               sub     \$16,%r10
-               test    \$-16,%r10
-       jnz     .Lcbc_slow_enc_loop
-       test    \$15,%r10
-       jnz     .Lcbc_slow_enc_tail
-       mov     $_ivp,%rbp      # restore ivp
-       mov     $s0,0(%rbp)     # save ivec
-       mov     $s1,4(%rbp)
-       mov     $s2,8(%rbp)
-       mov     $s3,12(%rbp)
-
-       jmp     .Lcbc_exit
-
-.align 4
-.Lcbc_slow_enc_tail:
-       mov     %rax,%r11
-       mov     %rcx,%r12
-       mov     %r10,%rcx
-       mov     $inp,%rsi
-       mov     $out,%rdi
-       .long   0x9066A4F3              # rep movsb
-       mov     \$16,%rcx               # zero tail
-       sub     %r10,%rcx
-       xor     %rax,%rax
-       .long   0x9066AAF3              # rep stosb
-       mov     $out,$inp               # this is not a mistake!
-       mov     \$16,%r10               # len=16
-       mov     %r11,%rax
-       mov     %r12,%rcx
-       jmp     .Lcbc_slow_enc_loop     # one more spin...
-#--------------------------- SLOW DECRYPT ---------------------------#
-.align 16
-.LSLOW_DECRYPT:
-       shr     \$3,%rax
-       add     %rax,$sbox              # recall "magic" constants!
-
-       mov     0(%rbp),%r11            # copy iv to stack
-       mov     8(%rbp),%r12
-       mov     %r11,0+$ivec
-       mov     %r12,8+$ivec
-
-.align 4
-.Lcbc_slow_dec_loop:
-               mov     0($inp),$s0     # load input
-               mov     4($inp),$s1
-               mov     8($inp),$s2
-               mov     12($inp),$s3
-               mov     $keyp,$key      # restore key
-               mov     $inp,$_inp      # save inp
-               mov     $out,$_out      # save out
-               mov     %r10,$_len      # save len
-
-               call    _x86_64_AES_decrypt_compact
-
-               mov     $_inp,$inp      # restore inp
-               mov     $_out,$out      # restore out
-               mov     $_len,%r10
-               xor     0+$ivec,$s0
-               xor     4+$ivec,$s1
-               xor     8+$ivec,$s2
-               xor     12+$ivec,$s3
-
-               mov     0($inp),%r11    # load input
-               mov     8($inp),%r12
-               sub     \$16,%r10
-               jc      .Lcbc_slow_dec_partial
-               jz      .Lcbc_slow_dec_done
-
-               mov     %r11,0+$ivec    # copy input to iv
-               mov     %r12,8+$ivec
-
-               mov     $s0,0($out)     # save output [can zap input]
-               mov     $s1,4($out)
-               mov     $s2,8($out)
-               mov     $s3,12($out)
-
-               lea     16($inp),$inp
-               lea     16($out),$out
-       jmp     .Lcbc_slow_dec_loop
-.Lcbc_slow_dec_done:
-       mov     $_ivp,%rdi
-       mov     %r11,0(%rdi)            # copy iv back to user
-       mov     %r12,8(%rdi)
-
-       mov     $s0,0($out)             # save output [can zap input]
-       mov     $s1,4($out)
-       mov     $s2,8($out)
-       mov     $s3,12($out)
-
-       jmp     .Lcbc_exit
-
-.align 4
-.Lcbc_slow_dec_partial:
-       mov     $_ivp,%rdi
-       mov     %r11,0(%rdi)            # copy iv back to user
-       mov     %r12,8(%rdi)
-
-       mov     $s0,0+$ivec             # save output to stack
-       mov     $s1,4+$ivec
-       mov     $s2,8+$ivec
-       mov     $s3,12+$ivec
-
-       mov     $out,%rdi
-       lea     $ivec,%rsi
-       lea     16(%r10),%rcx
-       .long   0x9066A4F3      # rep movsb
-       jmp     .Lcbc_exit
-
-.align 16
-.Lcbc_exit:
-       mov     $_rsp,%rsi
-.cfi_def_cfa   %rsi,64
-       mov     (%rsi),%r15
-.cfi_restore   %r15
-       mov     8(%rsi),%r14
-.cfi_restore   %r14
-       mov     16(%rsi),%r13
-.cfi_restore   %r13
-       mov     24(%rsi),%r12
-.cfi_restore   %r12
-       mov     32(%rsi),%rbp
-.cfi_restore   %rbp
-       mov     40(%rsi),%rbx
-.cfi_restore   %rbx
-       lea     48(%rsi),%rsp
-.cfi_def_cfa   %rsp,16
-.Lcbc_popfq:
-       popfq
-# This could be .cfi_pop 49, but libunwind fails on registers it does not
-# recognize. See https://bugzilla.redhat.com/show_bug.cgi?id=217087.
-.cfi_adjust_cfa_offset -8
-.Lcbc_epilogue:
-       ret
-.cfi_endproc
-.size  AES_cbc_encrypt,.-AES_cbc_encrypt
-___
-}
-
-$code.=<<___;
-.align 64
-.LAES_Te:
-___
-       &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
-       &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
-       &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
-       &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
-       &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
-       &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
-       &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
-       &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
-       &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
-       &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
-       &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
-       &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
-       &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
-       &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
-       &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
-       &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
-       &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
-       &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
-       &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
-       &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
-       &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
-       &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
-       &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
-       &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
-       &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
-       &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
-       &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
-       &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
-       &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
-       &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
-       &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
-       &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
-       &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
-       &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
-       &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
-       &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
-       &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
-       &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
-       &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
-       &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
-       &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
-       &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
-       &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
-       &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
-       &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
-       &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
-       &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
-       &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
-       &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
-       &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
-       &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
-       &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
-       &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
-       &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
-       &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
-       &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
-       &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
-       &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
-       &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
-       &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
-       &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
-       &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
-       &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
-       &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
-
-#Te4   # four copies of Te4 to choose from to avoid L1 aliasing
-       &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-       &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-       &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-       &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-       &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-       &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-       &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-       &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-       &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-       &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-       &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-       &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-       &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-       &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-       &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-       &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-       &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-       &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-       &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-       &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-       &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-       &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-       &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-       &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-       &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-       &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-       &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-       &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-       &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-       &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-       &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-       &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-       &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-       &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-       &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-       &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-       &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-       &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-       &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-       &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-       &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-       &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-       &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-       &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-       &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-       &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-       &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-       &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-       &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-       &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-       &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-       &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-       &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-       &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-       &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-       &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-       &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-       &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-       &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-       &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-       &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-       &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-       &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-       &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-       &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-       &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-       &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-       &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-       &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-       &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-       &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-       &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-       &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-       &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-       &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-       &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-       &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-       &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-       &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-       &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-       &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-       &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-       &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-       &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-       &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-       &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-       &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-       &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-       &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-       &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-       &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-       &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-       &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-       &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-       &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-       &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-
-       &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
-       &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
-       &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
-       &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
-       &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
-       &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
-       &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
-       &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
-       &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
-       &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
-       &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
-       &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
-       &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
-       &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
-       &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
-       &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
-       &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
-       &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
-       &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
-       &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
-       &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
-       &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
-       &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
-       &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
-       &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
-       &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
-       &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
-       &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
-       &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
-       &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
-       &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
-       &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
-#rcon: ten round constants, then the 0x80/0xfe/0x1b byte masks read as qwords at rcon+40/+48/+56
-$code.=<<___;
-       .long   0x00000001, 0x00000002, 0x00000004, 0x00000008
-       .long   0x00000010, 0x00000020, 0x00000040, 0x00000080
-       .long   0x0000001b, 0x00000036, 0x80808080, 0x80808080
-       .long   0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
-___
-$code.=<<___;
-.align 64
-.LAES_Td:
-___
-       &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
-       &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
-       &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
-       &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
-       &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
-       &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
-       &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
-       &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
-       &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
-       &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
-       &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
-       &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
-       &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
-       &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
-       &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
-       &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
-       &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
-       &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
-       &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
-       &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
-       &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
-       &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
-       &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
-       &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
-       &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
-       &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
-       &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
-       &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
-       &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
-       &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
-       &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
-       &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
-       &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
-       &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
-       &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
-       &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
-       &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
-       &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
-       &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
-       &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
-       &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
-       &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
-       &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
-       &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
-       &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
-       &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
-       &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
-       &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
-       &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
-       &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
-       &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
-       &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
-       &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
-       &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
-       &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
-       &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
-       &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
-       &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
-       &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
-       &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
-       &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
-       &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
-       &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
-       &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
-
-#Td4:  # four copies of Td4 to choose from to avoid L1 aliasing
-       &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-       &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-       &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-       &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-       &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-       &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-       &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-       &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-       &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-       &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-       &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-       &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-       &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-       &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-       &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-       &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-       &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-       &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-       &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-       &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-       &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-       &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-       &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-       &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-       &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-       &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-       &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-       &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-       &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-       &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-       &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-       &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-       .long   0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-       .long   0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-___
-       &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-       &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-       &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-       &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-       &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-       &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-       &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-       &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-       &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-       &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-       &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-       &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-       &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-       &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-       &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-       &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-       &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-       &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-       &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-       &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-       &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-       &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-       &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-       &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-       &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-       &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-       &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-       &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-       &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-       &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-       &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-       &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-       .long   0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-       .long   0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-___
-       &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-       &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-       &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-       &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-       &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-       &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-       &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-       &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-       &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-       &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-       &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-       &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-       &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-       &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-       &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-       &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-       &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-       &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-       &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-       &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-       &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-       &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-       &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-       &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-       &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-       &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-       &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-       &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-       &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-       &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-       &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-       &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-       .long   0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-       .long   0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-___
-       &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
-       &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
-       &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
-       &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
-       &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
-       &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
-       &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
-       &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
-       &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
-       &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
-       &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
-       &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
-       &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
-       &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
-       &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
-       &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
-       &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
-       &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
-       &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
-       &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
-       &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
-       &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
-       &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
-       &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
-       &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
-       &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
-       &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
-       &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
-       &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
-       &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
-       &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
-       &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
-$code.=<<___;
-       .long   0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
-       .long   0x1b1b1b1b, 0x1b1b1b1b, 0, 0
-.asciz  "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align 64
-___
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {          # the handlers and the .pdata/.xdata tables below are emitted only when $win64 is set
-$rec="%rcx";           # register holding the handler's 1st argument (rec)
-$frame="%rdx";         # register holding the handler's 2nd argument (frame)
-$context="%r8";        # register holding the handler's 3rd argument (context); interpolated into the text below
-$disp="%r9";           # register holding the handler's 4th argument (disp); interpolated into the text below
-
-$code.=<<___;
-.extern        __imp_RtlVirtualUnwind
-.type  block_se_handler,\@abi-omnipotent
-.align 16
-block_se_handler:
-       push    %rsi
-       push    %rdi
-       push    %rbx
-       push    %rbp
-       push    %r12
-       push    %r13
-       push    %r14
-       push    %r15
-       pushfq
-       sub     \$64,%rsp
-
-       mov     120($context),%rax      # pull context->Rax
-       mov     248($context),%rbx      # pull context->Rip
-
-       mov     8($disp),%rsi           # disp->ImageBase
-       mov     56($disp),%r11          # disp->HandlerData
-
-       mov     0(%r11),%r10d           # HandlerData[0]
-       lea     (%rsi,%r10),%r10        # prologue label
-       cmp     %r10,%rbx               # context->Rip<prologue label
-       jb      .Lin_block_prologue
-
-       mov     152($context),%rax      # pull context->Rsp
-
-       mov     4(%r11),%r10d           # HandlerData[1]
-       lea     (%rsi,%r10),%r10        # epilogue label
-       cmp     %r10,%rbx               # context->Rip>=epilogue label
-       jae     .Lin_block_prologue
-
-       mov     24(%rax),%rax           # pull saved real stack pointer
-
-       mov     -8(%rax),%rbx
-       mov     -16(%rax),%rbp
-       mov     -24(%rax),%r12
-       mov     -32(%rax),%r13
-       mov     -40(%rax),%r14
-       mov     -48(%rax),%r15
-       mov     %rbx,144($context)      # restore context->Rbx
-       mov     %rbp,160($context)      # restore context->Rbp
-       mov     %r12,216($context)      # restore context->R12
-       mov     %r13,224($context)      # restore context->R13
-       mov     %r14,232($context)      # restore context->R14
-       mov     %r15,240($context)      # restore context->R15
-
-.Lin_block_prologue:
-       mov     8(%rax),%rdi
-       mov     16(%rax),%rsi
-       mov     %rax,152($context)      # restore context->Rsp
-       mov     %rsi,168($context)      # restore context->Rsi
-       mov     %rdi,176($context)      # restore context->Rdi
-
-       jmp     .Lcommon_seh_exit
-.size  block_se_handler,.-block_se_handler
-
-.type  key_se_handler,\@abi-omnipotent
-.align 16
-key_se_handler:
-       push    %rsi
-       push    %rdi
-       push    %rbx
-       push    %rbp
-       push    %r12
-       push    %r13
-       push    %r14
-       push    %r15
-       pushfq
-       sub     \$64,%rsp
-
-       mov     120($context),%rax      # pull context->Rax
-       mov     248($context),%rbx      # pull context->Rip
-
-       mov     8($disp),%rsi           # disp->ImageBase
-       mov     56($disp),%r11          # disp->HandlerData
-
-       mov     0(%r11),%r10d           # HandlerData[0]
-       lea     (%rsi,%r10),%r10        # prologue label
-       cmp     %r10,%rbx               # context->Rip<prologue label
-       jb      .Lin_key_prologue
-
-       mov     152($context),%rax      # pull context->Rsp
-
-       mov     4(%r11),%r10d           # HandlerData[1]
-       lea     (%rsi,%r10),%r10        # epilogue label
-       cmp     %r10,%rbx               # context->Rip>=epilogue label
-       jae     .Lin_key_prologue
-
-       lea     56(%rax),%rax
-
-       mov     -8(%rax),%rbx
-       mov     -16(%rax),%rbp
-       mov     -24(%rax),%r12
-       mov     -32(%rax),%r13
-       mov     -40(%rax),%r14
-       mov     -48(%rax),%r15
-       mov     %rbx,144($context)      # restore context->Rbx
-       mov     %rbp,160($context)      # restore context->Rbp
-       mov     %r12,216($context)      # restore context->R12
-       mov     %r13,224($context)      # restore context->R13
-       mov     %r14,232($context)      # restore context->R14
-       mov     %r15,240($context)      # restore context->R15
-
-.Lin_key_prologue:
-       mov     8(%rax),%rdi
-       mov     16(%rax),%rsi
-       mov     %rax,152($context)      # restore context->Rsp
-       mov     %rsi,168($context)      # restore context->Rsi
-       mov     %rdi,176($context)      # restore context->Rdi
-
-       jmp     .Lcommon_seh_exit
-.size  key_se_handler,.-key_se_handler
-
-.type  cbc_se_handler,\@abi-omnipotent
-.align 16
-cbc_se_handler:
-       push    %rsi
-       push    %rdi
-       push    %rbx
-       push    %rbp
-       push    %r12
-       push    %r13
-       push    %r14
-       push    %r15
-       pushfq
-       sub     \$64,%rsp
-
-       mov     120($context),%rax      # pull context->Rax
-       mov     248($context),%rbx      # pull context->Rip
-
-       lea     .Lcbc_prologue(%rip),%r10
-       cmp     %r10,%rbx               # context->Rip<.Lcbc_prologue
-       jb      .Lin_cbc_prologue
-
-       lea     .Lcbc_fast_body(%rip),%r10
-       cmp     %r10,%rbx               # context->Rip<.Lcbc_fast_body
-       jb      .Lin_cbc_frame_setup
-
-       lea     .Lcbc_slow_prologue(%rip),%r10
-       cmp     %r10,%rbx               # context->Rip<.Lcbc_slow_prologue
-       jb      .Lin_cbc_body
-
-       lea     .Lcbc_slow_body(%rip),%r10
-       cmp     %r10,%rbx               # context->Rip<.Lcbc_slow_body
-       jb      .Lin_cbc_frame_setup
-
-.Lin_cbc_body:
-       mov     152($context),%rax      # pull context->Rsp
-
-       lea     .Lcbc_epilogue(%rip),%r10
-       cmp     %r10,%rbx               # context->Rip>=.Lcbc_epilogue
-       jae     .Lin_cbc_prologue
-
-       lea     8(%rax),%rax
-
-       lea     .Lcbc_popfq(%rip),%r10
-       cmp     %r10,%rbx               # context->Rip>=.Lcbc_popfq
-       jae     .Lin_cbc_prologue
-
-       mov     `16-8`(%rax),%rax       # biased $_rsp
-       lea     56(%rax),%rax
-
-.Lin_cbc_frame_setup:
-       mov     -16(%rax),%rbx
-       mov     -24(%rax),%rbp
-       mov     -32(%rax),%r12
-       mov     -40(%rax),%r13
-       mov     -48(%rax),%r14
-       mov     -56(%rax),%r15
-       mov     %rbx,144($context)      # restore context->Rbx
-       mov     %rbp,160($context)      # restore context->Rbp
-       mov     %r12,216($context)      # restore context->R12
-       mov     %r13,224($context)      # restore context->R13
-       mov     %r14,232($context)      # restore context->R14
-       mov     %r15,240($context)      # restore context->R15
-
-.Lin_cbc_prologue:
-       mov     8(%rax),%rdi
-       mov     16(%rax),%rsi
-       mov     %rax,152($context)      # restore context->Rsp
-       mov     %rsi,168($context)      # restore context->Rsi
-       mov     %rdi,176($context)      # restore context->Rdi
-
-.Lcommon_seh_exit:
-
-       mov     40($disp),%rdi          # disp->ContextRecord
-       mov     $context,%rsi           # context
-       mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
-       .long   0xa548f3fc              # cld; rep movsq
-
-       mov     $disp,%rsi
-       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
-       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
-       mov     0(%rsi),%r8             # arg3, disp->ControlPc
-       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
-       mov     40(%rsi),%r10           # disp->ContextRecord
-       lea     56(%rsi),%r11           # &disp->HandlerData
-       lea     24(%rsi),%r12           # &disp->EstablisherFrame
-       mov     %r10,32(%rsp)           # arg5
-       mov     %r11,40(%rsp)           # arg6
-       mov     %r12,48(%rsp)           # arg7
-       mov     %rcx,56(%rsp)           # arg8, (NULL)
-       call    *__imp_RtlVirtualUnwind(%rip)
-
-       mov     \$1,%eax                # ExceptionContinueSearch
-       add     \$64,%rsp
-       popfq
-       pop     %r15
-       pop     %r14
-       pop     %r13
-       pop     %r12
-       pop     %rbp
-       pop     %rbx
-       pop     %rdi
-       pop     %rsi
-       ret
-.size  cbc_se_handler,.-cbc_se_handler
-
-.section       .pdata
-.align 4
-       .rva    .LSEH_begin_AES_encrypt
-       .rva    .LSEH_end_AES_encrypt
-       .rva    .LSEH_info_AES_encrypt
-
-       .rva    .LSEH_begin_AES_decrypt
-       .rva    .LSEH_end_AES_decrypt
-       .rva    .LSEH_info_AES_decrypt
-
-       .rva    .LSEH_begin_AES_set_encrypt_key
-       .rva    .LSEH_end_AES_set_encrypt_key
-       .rva    .LSEH_info_AES_set_encrypt_key
-
-       .rva    .LSEH_begin_AES_set_decrypt_key
-       .rva    .LSEH_end_AES_set_decrypt_key
-       .rva    .LSEH_info_AES_set_decrypt_key
-
-       .rva    .LSEH_begin_AES_cbc_encrypt
-       .rva    .LSEH_end_AES_cbc_encrypt
-       .rva    .LSEH_info_AES_cbc_encrypt
-
-.section       .xdata
-.align 8
-.LSEH_info_AES_encrypt:
-       .byte   9,0,0,0
-       .rva    block_se_handler
-       .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
-.LSEH_info_AES_decrypt:
-       .byte   9,0,0,0
-       .rva    block_se_handler
-       .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
-.LSEH_info_AES_set_encrypt_key:
-       .byte   9,0,0,0
-       .rva    key_se_handler
-       .rva    .Lenc_key_prologue,.Lenc_key_epilogue   # HandlerData[]
-.LSEH_info_AES_set_decrypt_key:
-       .byte   9,0,0,0
-       .rva    key_se_handler
-       .rva    .Ldec_key_prologue,.Ldec_key_epilogue   # HandlerData[]
-.LSEH_info_AES_cbc_encrypt:
-       .byte   9,0,0,0
-       .rva    cbc_se_handler
-___
-}
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;  # replace each `...` span in the generated text with the result of eval'ing it
-
-print $code;   # write the finished text to STDOUT (presumably the translator pipe set up earlier in this file - confirm)
-
-close STDOUT or die "error closing STDOUT: $!";        # buffered-write and pipe failures only surface at close, so check it
diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl
deleted file mode 100644 (file)
index e623427..0000000
+++ /dev/null
@@ -1,3239 +0,0 @@
-#! /usr/bin/env perl
-# Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-
-###################################################################
-### AES-128 [originally in CTR mode]                           ###
-### bitsliced implementation for Intel Core 2 processors       ###
-### requires support of SSE extensions up to SSSE3             ###
-### Author: Emilia Käsper and Peter Schwabe                   ###
-### Date: 2009-03-19                                           ###
-### Public domain                                              ###
-###                                                            ###
-### See http://homes.esat.kuleuven.be/~ekasper/#software for   ###
-### further information.                                       ###
-###################################################################
-#
-# September 2011.
-#
-# Started as a transliteration to "perlasm", the original code has
-# undergone the following changes:
-#
-# - code was made position-independent;
-# - rounds were folded into a loop resulting in >5x size reduction
-#   from 12.5KB to 2.2KB;
-# - the above was possible thanks to a mixcolumns() modification that
-#   allows its output to be fed back to aesenc[last]; this was
-#   achieved at the cost of two additional inter-register moves;
-# - some instruction reordering and interleaving;
-# - this module doesn't implement key setup subroutine, instead it
-#   relies on conversion of "conventional" key schedule as returned
-#   by AES_set_encrypt_key (see discussion below);
-# - first and last round keys are treated differently, which allowed
-#   to skip one shiftrows(), reduce bit-sliced key schedule and
-#   speed-up conversion by 22%;
-# - support for 192- and 256-bit keys was added;
-#
-# Resulting performance in CPU cycles spent to encrypt one byte out
-# of 4096-byte buffer with 128-bit key is:
-#
-#              Emilia's        this(*)         difference
-#
-# Core 2       9.30            8.69            +7%
-# Nehalem(**)  7.63            6.88            +11%
-# Atom         17.1            16.4            +4%
-# Silvermont   -               12.9
-# Goldmont     -               8.85
-#
-# (*)  Comparison is not completely fair, because "this" is ECB,
-#      i.e. no extra processing such as counter values calculation
-#      and xor-ing input as in Emilia's CTR implementation is
-#      performed. However, the CTR calculations stand for not more
-#      than 1% of total time, so comparison is *rather* fair.
-#
-# (**) Results were collected on Westmere, which is considered to
-#      be equivalent to Nehalem for this code.
-#
-# As for the key schedule conversion subroutine: the interface to OpenSSL
-# relies on per-invocation on-the-fly conversion. This naturally
-# has an impact on performance, especially for short inputs. Conversion
-# time in CPU cycles and its ratio to CPU cycles spent in 8x block
-# function is:
-#
-#              conversion      conversion/8x block
-# Core 2       240             0.22
-# Nehalem      180             0.20
-# Atom         430             0.20
-#
-# The ratio values mean that 128-byte blocks will be processed
-# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
-# etc. Then keep in mind that input sizes not divisible by 128 are
-# *effectively* slower, especially shortest ones, e.g. consecutive
-# 144-byte blocks are processed 44% slower than one would expect,
-# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
-# it's still faster than ["hyper-threading-safe" code path in]
-# aes-x86_64.pl on all lengths above 64 bytes...
-#
-# October 2011.
-#
-# Add decryption procedure. Performance in CPU cycles spent to decrypt
-# one byte out of 4096-byte buffer with 128-bit key is:
-#
-# Core 2       9.98
-# Nehalem      7.80
-# Atom         17.9
-# Silvermont   14.0
-# Goldmont     10.2
-#
-# November 2011.
-#
-# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
-# suboptimal, but XTS is meant to be used with larger blocks...
-#
-#                                              <appro@openssl.org>
-
-$flavour = shift;      # first command-line argument
-$output  = shift;      # second command-line argument
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }  # a first argument containing '.' is taken as the output name
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);  # selects the Win64-specific code paths
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";   # pipe all output through the translator; fail loudly if it cannot be started
-*STDOUT=*OUT;          # plain print now feeds the translator pipe
-
-my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");   # only four values are listed, so $ivp starts out undefined
-my @XMM=map("%xmm$_",(15,0..14));      # best on Atom, +10% over (0..15)
-my $ecb=0;     # suppress unreferenced ECB subroutines, spare some space...
-
-{
-my ($key,$rounds,$const)=("%rax","%r10d","%r11");
-
-sub Sbox {             # emits InBasisChange, Inv_GF256 and OutBasisChange back to back
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
-my @b=@_[0..7];        # args 0..7: the eight operands b0..b7 described above
-my @t=@_[8..11];       # args 8..11: handed to Inv_GF256 as its t0-t3 temporaries
-my @s=@_[12..15];      # args 12..15: handed to Inv_GF256 as its s0-s3 temporaries
-       &InBasisChange  (@b);
-       &Inv_GF256      (@b[6,5,0,3,7,1,4,2],@t,@s);    # same order as InBasisChange's documented output
-       &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
-}
-
-sub InBasisChange {    # appends a fixed sequence of 13 pxor instructions to $code
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
-my @b=@_[0..7];        # only the first eight arguments are used; each is interpolated as a pxor operand
-$code.=<<___;
-       pxor    @b[6], @b[5]
-       pxor    @b[1], @b[2]
-       pxor    @b[0], @b[3]
-       pxor    @b[2], @b[6]
-       pxor    @b[0], @b[5]
-
-       pxor    @b[3], @b[6]
-       pxor    @b[7], @b[3]
-       pxor    @b[5], @b[7]
-       pxor    @b[4], @b[3]
-       pxor    @b[5], @b[4]
-       pxor    @b[1], @b[3]
-
-       pxor    @b[7], @b[2]
-       pxor    @b[5], @b[1]
-___
-}
-
-sub OutBasisChange {   # appends a fixed sequence of 11 pxor instructions to $code
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
-my @b=@_[0..7];        # only the first eight arguments are used; each is interpolated as a pxor operand
-$code.=<<___;
-       pxor    @b[6], @b[0]
-       pxor    @b[4], @b[1]
-       pxor    @b[0], @b[2]
-       pxor    @b[6], @b[4]
-       pxor    @b[1], @b[6]
-
-       pxor    @b[5], @b[1]
-       pxor    @b[3], @b[5]
-       pxor    @b[7], @b[3]
-       pxor    @b[5], @b[7]
-       pxor    @b[5], @b[2]
-
-       pxor    @b[7], @b[4]
-___
-}
-
-sub InvSbox {          # emits InvInBasisChange, Inv_GF256 and InvOutBasisChange back to back
-# input in lsb         > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb        > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
-my @b=@_[0..7];        # args 0..7: the eight operands b0..b7 described above
-my @t=@_[8..11];       # args 8..11: handed to Inv_GF256 as its t0-t3 temporaries
-my @s=@_[12..15];      # args 12..15: handed to Inv_GF256 as its s0-s3 temporaries
-       &InvInBasisChange       (@b);
-       &Inv_GF256              (@b[5,1,2,6,3,7,0,4],@t,@s);    # same permutation InvInBasisChange applies to its own arguments
-       &InvOutBasisChange      (@b[3,7,0,4,5,1,2,6]);
-}
-
-sub InvInBasisChange {         # OutBasisChange in reverse
-my @b=@_[5,1,2,6,3,7,0,4];     # arguments are re-ordered on entry; the 11 pxor lines below index the re-ordered list
-$code.=<<___
-       pxor    @b[7], @b[4]
-
-       pxor    @b[5], @b[7]
-       pxor    @b[5], @b[2]
-       pxor    @b[7], @b[3]
-       pxor    @b[3], @b[5]
-       pxor    @b[5], @b[1]
-
-       pxor    @b[1], @b[6]
-       pxor    @b[0], @b[2]
-       pxor    @b[6], @b[4]
-       pxor    @b[6], @b[0]
-       pxor    @b[4], @b[1]
-___
-}
-
-sub InvOutBasisChange {                # InBasisChange in reverse
-my @b=@_[2,5,7,3,6,1,0,4];     # arguments are re-ordered on entry; the 13 pxor lines below index the re-ordered list
-$code.=<<___;
-       pxor    @b[5], @b[1]
-       pxor    @b[7], @b[2]
-
-       pxor    @b[1], @b[3]
-       pxor    @b[5], @b[4]
-       pxor    @b[5], @b[7]
-       pxor    @b[4], @b[3]
-        pxor   @b[0], @b[5]
-       pxor    @b[7], @b[3]
-        pxor   @b[2], @b[6]
-        pxor   @b[1], @b[2]
-       pxor    @b[3], @b[6]
-
-       pxor    @b[0], @b[3]
-       pxor    @b[6], @b[5]
-___
-}
-
-sub Mul_GF4 {          # appends the eight-instruction sequence below to $code
-#;*************************************************************
-#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
-#;*************************************************************
-my ($x0,$x1,$y0,$y1,$t0)=@_;   # x0,x1 are overwritten with the result; y0,y1 are never written to; t0 is scratch
-$code.=<<___;
-       movdqa  $y0, $t0
-       pxor    $y1, $t0
-       pand    $x0, $t0
-       pxor    $x1, $x0
-       pand    $y0, $x1
-       pand    $y1, $x0
-       pxor    $x1, $x0
-       pxor    $t0, $x1
-___
-}
-
-sub Mul_GF4_N {                                # not used, see next subroutine
-# multiply and scale by N
-my ($x0,$x1,$y0,$y1,$t0)=@_;   # same operand roles as Mul_GF4; only the final two pxor instructions differ from it
-$code.=<<___;
-       movdqa  $y0, $t0
-       pxor    $y1, $t0
-       pand    $x0, $t0
-       pxor    $x1, $x0
-       pand    $y0, $x1
-       pand    $y1, $x0
-       pxor    $x0, $x1
-       pxor    $t0, $x0
-___
-}
-
-sub Mul_GF4_N_GF4 {    # appends two independent eight-instruction streams, alternating one instruction from each
-# interleaved Mul_GF4_N and Mul_GF4
-my ($x0,$x1,$y0,$y1,$t0,       # first stream: the Mul_GF4_N sequence on x0,x1,y0,y1 with scratch t0
-    $x2,$x3,$y2,$y3,$t1)=@_;   # second stream: the Mul_GF4 sequence on x2,x3,y2,y3 with scratch t1
-$code.=<<___;
-       movdqa  $y0, $t0
-        movdqa $y2, $t1
-       pxor    $y1, $t0
-        pxor   $y3, $t1
-       pand    $x0, $t0
-        pand   $x2, $t1
-       pxor    $x1, $x0
-        pxor   $x3, $x2
-       pand    $y0, $x1
-        pand   $y2, $x3
-       pand    $y1, $x0
-        pand   $y3, $x2
-       pxor    $x0, $x1
-        pxor   $x3, $x2
-       pxor    $t0, $x0
-        pxor   $t1, $x3
-___
-}
-sub Mul_GF16_2 {       # emits code that combines @x[0..3] and then @x[4..7] with the same @y operand
-my @x=@_[0..7];        # args 0..7: two groups of four operands, all of them overwritten by the emitted code
-my @y=@_[8..11];       # args 8..11: shared operand; y[0],y[1] get y[2],y[3] xored in twice, so they end up unchanged
-my @t=@_[12..15];      # args 12..15: scratch
-$code.=<<___;
-       movdqa  @x[0], @t[0]
-       movdqa  @x[1], @t[1]
-___
-       &Mul_GF4        (@x[0], @x[1], @y[0], @y[1], @t[2]);
-$code.=<<___;
-       pxor    @x[2], @t[0]
-       pxor    @x[3], @t[1]
-       pxor    @y[2], @y[0]
-       pxor    @y[3], @y[1]
-___
-       Mul_GF4_N_GF4   (@t[0], @t[1], @y[0], @y[1], @t[3],
-                        @x[2], @x[3], @y[2], @y[3], @t[2]);
-$code.=<<___;
-       pxor    @t[0], @x[0]
-       pxor    @t[0], @x[2]
-       pxor    @t[1], @x[1]
-       pxor    @t[1], @x[3]
-
-       movdqa  @x[4], @t[0]
-       movdqa  @x[5], @t[1]
-       pxor    @x[6], @t[0]
-       pxor    @x[7], @t[1]
-___
-       &Mul_GF4_N_GF4  (@t[0], @t[1], @y[0], @y[1], @t[3],
-                        @x[6], @x[7], @y[2], @y[3], @t[2]);
-$code.=<<___;
-       pxor    @y[2], @y[0]
-       pxor    @y[3], @y[1]
-___
-       &Mul_GF4        (@x[4], @x[5], @y[0], @y[1], @t[3]);
-$code.=<<___;
-       pxor    @t[0], @x[4]
-       pxor    @t[0], @x[6]
-       pxor    @t[1], @x[5]
-       pxor    @t[1], @x[7]
-___
-}
-# Inv_GF256: append to $code the instruction sequence described by the banner
-# below (input and output in x0-x7; t0-t3 and s0-s3 are temporaries).
-#
-# Arguments (16 register names, positional):
-#   @_[0..7]   -> @x
-#   @_[8..11]  -> @t
-#   @_[12..15] -> @s
-#
-# The sub emits one long straight-line sequence and then finishes by calling
-# Mul_GF16_2 with the intermediate result registers as its second operand.
-sub Inv_GF256 {
-#;********************************************************************
-#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
-#;********************************************************************
-my @x=@_[0..7];
-my @t=@_[8..11];
-my @s=@_[12..15];
-# direct optimizations from hardware
-$code.=<<___;
-       movdqa  @x[4], @t[3]
-       movdqa  @x[5], @t[2]
-       movdqa  @x[1], @t[1]
-       movdqa  @x[7], @s[1]
-       movdqa  @x[0], @s[0]
-
-       pxor    @x[6], @t[3]
-       pxor    @x[7], @t[2]
-       pxor    @x[3], @t[1]
-        movdqa @t[3], @s[2]
-       pxor    @x[6], @s[1]
-        movdqa @t[2], @t[0]
-       pxor    @x[2], @s[0]
-        movdqa @t[3], @s[3]
-
-       por     @t[1], @t[2]
-       por     @s[0], @t[3]
-       pxor    @t[0], @s[3]
-       pand    @s[0], @s[2]
-       pxor    @t[1], @s[0]
-       pand    @t[1], @t[0]
-       pand    @s[0], @s[3]
-       movdqa  @x[3], @s[0]
-       pxor    @x[2], @s[0]
-       pand    @s[0], @s[1]
-       pxor    @s[1], @t[3]
-       pxor    @s[1], @t[2]
-       movdqa  @x[4], @s[1]
-       movdqa  @x[1], @s[0]
-       pxor    @x[5], @s[1]
-       pxor    @x[0], @s[0]
-       movdqa  @s[1], @t[1]
-       pand    @s[0], @s[1]
-       por     @s[0], @t[1]
-       pxor    @s[1], @t[0]
-       pxor    @s[3], @t[3]
-       pxor    @s[2], @t[2]
-       pxor    @s[3], @t[1]
-       movdqa  @x[7], @s[0]
-       pxor    @s[2], @t[0]
-       movdqa  @x[6], @s[1]
-       pxor    @s[2], @t[1]
-       movdqa  @x[5], @s[2]
-       pand    @x[3], @s[0]
-       movdqa  @x[4], @s[3]
-       pand    @x[2], @s[1]
-       pand    @x[1], @s[2]
-       por     @x[0], @s[3]
-       pxor    @s[0], @t[3]
-       pxor    @s[1], @t[2]
-       pxor    @s[2], @t[1]
-       pxor    @s[3], @t[0]
-
-       #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
-
-       # new smaller inversion
-
-       movdqa  @t[3], @s[0]
-       pand    @t[1], @t[3]
-       pxor    @t[2], @s[0]
-
-       movdqa  @t[0], @s[2]
-       movdqa  @s[0], @s[3]
-       pxor    @t[3], @s[2]
-       pand    @s[2], @s[3]
-
-       movdqa  @t[1], @s[1]
-       pxor    @t[2], @s[3]
-       pxor    @t[0], @s[1]
-
-       pxor    @t[2], @t[3]
-
-       pand    @t[3], @s[1]
-
-       movdqa  @s[2], @t[2]
-       pxor    @t[0], @s[1]
-
-       pxor    @s[1], @t[2]
-       pxor    @s[1], @t[1]
-
-       pand    @t[0], @t[2]
-
-       pxor    @t[2], @s[2]
-       pxor    @t[2], @t[1]
-
-       pand    @s[3], @s[2]
-
-       pxor    @s[0], @s[2]
-___
-# output in s3, s2, s1, t1
-
-# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
-
-# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
-# The call below matches the second form above: s3,s2,s1,t1 (the registers
-# holding the result of the sequence just emitted) become Mul_GF16_2's @y, and
-# s0,t0,t2,t3 become its scratch registers.
-       &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
-
-### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
-}
-
-# AES linear components
-
-# ShiftRows: append to $code a combined "add round key + byte shuffle" step.
-#
-# Arguments: eight state register names followed by one register name that
-# holds the pshufb mask (taken as the LAST argument via pop).
-#
-# The emitted code XORs the eight 16-byte values at 0x00..0x70($key) into
-# @x[0..7], applies pshufb with $mask to each of the eight registers, and then
-# advances $key by 0x80. $key and $code are variables from the enclosing scope.
-sub ShiftRows {
-my @x=@_[0..7];
-my $mask=pop;
-$code.=<<___;
-       pxor    0x00($key),@x[0]
-       pxor    0x10($key),@x[1]
-       pxor    0x20($key),@x[2]
-       pxor    0x30($key),@x[3]
-       pshufb  $mask,@x[0]
-       pshufb  $mask,@x[1]
-       pxor    0x40($key),@x[4]
-       pxor    0x50($key),@x[5]
-       pshufb  $mask,@x[2]
-       pshufb  $mask,@x[3]
-       pxor    0x60($key),@x[6]
-       pxor    0x70($key),@x[7]
-       pshufb  $mask,@x[4]
-       pshufb  $mask,@x[5]
-       pshufb  $mask,@x[6]
-       pshufb  $mask,@x[7]
-       lea     0x80($key),$key
-___
-}
-
-# MixColumns: append the MixColumns instruction sequence to $code.
-#
-# Arguments (positional register names):
-#   @_[0..7]   -> @x  state registers, updated in place
-#   @_[8..15]  -> @t  scratch registers
-#   $_[16]     -> $inv  optional flag; when true the alternative tail is
-#                 emitted, which leaves the results in a different register
-#                 arrangement (InvMixColumns passes 1 and documents this as
-#                 "flipped 2<->3 and 4<->6")
-#
-# The common part is emitted unconditionally; exactly one of the two tails
-# below is appended depending on $inv.
-sub MixColumns {
-# modified to emit output in order suitable for feeding back to aesenc[last]
-my @x=@_[0..7];
-my @t=@_[8..15];
-my $inv=$_[16];        # optional; scalar element access (was the one-element slice @_[16])
-$code.=<<___;
-       pshufd  \$0x93, @x[0], @t[0]    # x0 <<< 32
-       pshufd  \$0x93, @x[1], @t[1]
-        pxor   @t[0], @x[0]            # x0 ^ (x0 <<< 32)
-       pshufd  \$0x93, @x[2], @t[2]
-        pxor   @t[1], @x[1]
-       pshufd  \$0x93, @x[3], @t[3]
-        pxor   @t[2], @x[2]
-       pshufd  \$0x93, @x[4], @t[4]
-        pxor   @t[3], @x[3]
-       pshufd  \$0x93, @x[5], @t[5]
-        pxor   @t[4], @x[4]
-       pshufd  \$0x93, @x[6], @t[6]
-        pxor   @t[5], @x[5]
-       pshufd  \$0x93, @x[7], @t[7]
-        pxor   @t[6], @x[6]
-        pxor   @t[7], @x[7]
-
-       pxor    @x[0], @t[1]
-       pxor    @x[7], @t[0]
-       pxor    @x[7], @t[1]
-        pshufd \$0x4E, @x[0], @x[0]    # (x0 ^ (x0 <<< 32)) <<< 64)
-       pxor    @x[1], @t[2]
-        pshufd \$0x4E, @x[1], @x[1]
-       pxor    @x[4], @t[5]
-        pxor   @t[0], @x[0]
-       pxor    @x[5], @t[6]
-        pxor   @t[1], @x[1]
-       pxor    @x[3], @t[4]
-        pshufd \$0x4E, @x[4], @t[0]
-       pxor    @x[6], @t[7]
-        pshufd \$0x4E, @x[5], @t[1]
-       pxor    @x[2], @t[3]
-        pshufd \$0x4E, @x[3], @x[4]
-       pxor    @x[7], @t[3]
-        pshufd \$0x4E, @x[7], @x[5]
-       pxor    @x[7], @t[4]
-        pshufd \$0x4E, @x[6], @x[3]
-       pxor    @t[4], @t[0]
-        pshufd \$0x4E, @x[2], @x[6]
-       pxor    @t[5], @t[1]
-___
-# Forward tail: used when $inv is false or not supplied.
-$code.=<<___ if (!$inv);
-       pxor    @t[3], @x[4]
-       pxor    @t[7], @x[5]
-       pxor    @t[6], @x[3]
-        movdqa @t[0], @x[2]
-       pxor    @t[2], @x[6]
-        movdqa @t[1], @x[7]
-___
-# Alternative tail: used when $inv is true.
-$code.=<<___ if ($inv);
-       pxor    @x[4], @t[3]
-       pxor    @t[7], @x[5]
-       pxor    @x[3], @t[6]
-        movdqa @t[0], @x[3]
-       pxor    @t[2], @x[6]
-        movdqa @t[6], @x[2]
-        movdqa @t[1], @x[7]
-        movdqa @x[6], @x[4]
-        movdqa @t[3], @x[6]
-___
-}
-
-# InvMixColumns_orig: append to $code an inverse-MixColumns sequence built from
-# four explicit stages, labelled in the emitted text as multiplication by
-# 0x0e, 0x0b, 0x0d and 0x09.
-#
-# Arguments (positional register names):
-#   @_[0..7]   -> @x  state registers
-#   @_[8..15]  -> @t  scratch registers
-#
-# Nothing in the visible part of this file calls this sub; the decrypt path
-# calls InvMixColumns instead.
-#
-# NOTE(review): the final eight movdqa instructions copy @t[0..7] into the
-# file-level @XMM[0..7] rather than into the caller-supplied @x registers, so
-# the result location does not depend on the argument order.
-sub InvMixColumns_orig {
-my @x=@_[0..7];
-my @t=@_[8..15];
-
-$code.=<<___;
-       # multiplication by 0x0e
-       pshufd  \$0x93, @x[7], @t[7]
-       movdqa  @x[2], @t[2]
-       pxor    @x[5], @x[7]            # 7 5
-       pxor    @x[5], @x[2]            # 2 5
-       pshufd  \$0x93, @x[0], @t[0]
-       movdqa  @x[5], @t[5]
-       pxor    @x[0], @x[5]            # 5 0           [1]
-       pxor    @x[1], @x[0]            # 0 1
-       pshufd  \$0x93, @x[1], @t[1]
-       pxor    @x[2], @x[1]            # 1 25
-       pxor    @x[6], @x[0]            # 01 6          [2]
-       pxor    @x[3], @x[1]            # 125 3         [4]
-       pshufd  \$0x93, @x[3], @t[3]
-       pxor    @x[0], @x[2]            # 25 016        [3]
-       pxor    @x[7], @x[3]            # 3 75
-       pxor    @x[6], @x[7]            # 75 6          [0]
-       pshufd  \$0x93, @x[6], @t[6]
-       movdqa  @x[4], @t[4]
-       pxor    @x[4], @x[6]            # 6 4
-       pxor    @x[3], @x[4]            # 4 375         [6]
-       pxor    @x[7], @x[3]            # 375 756=36
-       pxor    @t[5], @x[6]            # 64 5          [7]
-       pxor    @t[2], @x[3]            # 36 2
-       pxor    @t[4], @x[3]            # 362 4         [5]
-       pshufd  \$0x93, @t[5], @t[5]
-___
-                                       # Re-label the @x registers for the
-                                       # remaining stages; the index list
-                                       # follows the bracketed [n] tags in
-                                       # the comments of the stage above.
-                                       my @y = @x[7,5,0,2,1,3,4,6];
-$code.=<<___;
-       # multiplication by 0x0b
-       pxor    @y[0], @y[1]
-       pxor    @t[0], @y[0]
-       pxor    @t[1], @y[1]
-       pshufd  \$0x93, @t[2], @t[2]
-       pxor    @t[5], @y[0]
-       pxor    @t[6], @y[1]
-       pxor    @t[7], @y[0]
-       pshufd  \$0x93, @t[4], @t[4]
-       pxor    @t[6], @t[7]            # clobber t[7]
-       pxor    @y[0], @y[1]
-
-       pxor    @t[0], @y[3]
-       pshufd  \$0x93, @t[0], @t[0]
-       pxor    @t[1], @y[2]
-       pxor    @t[1], @y[4]
-       pxor    @t[2], @y[2]
-       pshufd  \$0x93, @t[1], @t[1]
-       pxor    @t[2], @y[3]
-       pxor    @t[2], @y[5]
-       pxor    @t[7], @y[2]
-       pshufd  \$0x93, @t[2], @t[2]
-       pxor    @t[3], @y[3]
-       pxor    @t[3], @y[6]
-       pxor    @t[3], @y[4]
-       pshufd  \$0x93, @t[3], @t[3]
-       pxor    @t[4], @y[7]
-       pxor    @t[4], @y[5]
-       pxor    @t[7], @y[7]
-       pxor    @t[5], @y[3]
-       pxor    @t[4], @y[4]
-       pxor    @t[5], @t[7]            # clobber t[7] even more
-
-       pxor    @t[7], @y[5]
-       pshufd  \$0x93, @t[4], @t[4]
-       pxor    @t[7], @y[6]
-       pxor    @t[7], @y[4]
-
-       pxor    @t[5], @t[7]
-       pshufd  \$0x93, @t[5], @t[5]
-       pxor    @t[6], @t[7]            # restore t[7]
-
-       # multiplication by 0x0d
-       pxor    @y[7], @y[4]
-       pxor    @t[4], @y[7]
-       pshufd  \$0x93, @t[6], @t[6]
-       pxor    @t[0], @y[2]
-       pxor    @t[5], @y[7]
-       pxor    @t[2], @y[2]
-       pshufd  \$0x93, @t[7], @t[7]
-
-       pxor    @y[1], @y[3]
-       pxor    @t[1], @y[1]
-       pxor    @t[0], @y[0]
-       pxor    @t[0], @y[3]
-       pxor    @t[5], @y[1]
-       pxor    @t[5], @y[0]
-       pxor    @t[7], @y[1]
-       pshufd  \$0x93, @t[0], @t[0]
-       pxor    @t[6], @y[0]
-       pxor    @y[1], @y[3]
-       pxor    @t[1], @y[4]
-       pshufd  \$0x93, @t[1], @t[1]
-
-       pxor    @t[7], @y[7]
-       pxor    @t[2], @y[4]
-       pxor    @t[2], @y[5]
-       pshufd  \$0x93, @t[2], @t[2]
-       pxor    @t[6], @y[2]
-       pxor    @t[3], @t[6]            # clobber t[6]
-       pxor    @y[7], @y[4]
-       pxor    @t[6], @y[3]
-
-       pxor    @t[6], @y[6]
-       pxor    @t[5], @y[5]
-       pxor    @t[4], @y[6]
-       pshufd  \$0x93, @t[4], @t[4]
-       pxor    @t[6], @y[5]
-       pxor    @t[7], @y[6]
-       pxor    @t[3], @t[6]            # restore t[6]
-
-       pshufd  \$0x93, @t[5], @t[5]
-       pshufd  \$0x93, @t[6], @t[6]
-       pshufd  \$0x93, @t[7], @t[7]
-       pshufd  \$0x93, @t[3], @t[3]
-
-       # multiplication by 0x09
-       pxor    @y[1], @y[4]
-       pxor    @y[1], @t[1]            # t[1]=y[1]
-       pxor    @t[5], @t[0]            # clobber t[0]
-       pxor    @t[5], @t[1]
-       pxor    @t[0], @y[3]
-       pxor    @y[0], @t[0]            # t[0]=y[0]
-       pxor    @t[6], @t[1]
-       pxor    @t[7], @t[6]            # clobber t[6]
-       pxor    @t[1], @y[4]
-       pxor    @t[4], @y[7]
-       pxor    @y[4], @t[4]            # t[4]=y[4]
-       pxor    @t[3], @y[6]
-       pxor    @y[3], @t[3]            # t[3]=y[3]
-       pxor    @t[2], @y[5]
-       pxor    @y[2], @t[2]            # t[2]=y[2]
-       pxor    @t[7], @t[3]
-       pxor    @y[5], @t[5]            # t[5]=y[5]
-       pxor    @t[6], @t[2]
-       pxor    @t[6], @t[5]
-       pxor    @y[6], @t[6]            # t[6]=y[6]
-       pxor    @y[7], @t[7]            # t[7]=y[7]
-
-       movdqa  @t[0],@XMM[0]
-       movdqa  @t[1],@XMM[1]
-       movdqa  @t[2],@XMM[2]
-       movdqa  @t[3],@XMM[3]
-       movdqa  @t[4],@XMM[4]
-       movdqa  @t[5],@XMM[5]
-       movdqa  @t[6],@XMM[6]
-       movdqa  @t[7],@XMM[7]
-___
-}
-
-# InvMixColumns: append the inverse MixColumns sequence to $code.
-#
-# Arguments (positional register names):
-#   @_[0..7]   -> @x  state registers, updated in place
-#   @_[8..15]  -> @t  scratch registers
-#
-# Rather than implementing the inverse matrix directly, the sub relies on the
-# factorisation shown below: it first emits the multiplication by the
-# 05-00-04-00 circulant and then reuses MixColumns, passing 1 as the optional
-# third argument so that MixColumns emits its alternative tail.
-sub InvMixColumns {
-my @x=@_[0..7];
-my @t=@_[8..15];
-
-# Thanks to Jussi Kivilinna for providing pointer to
-#
-# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
-# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
-# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
-# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
-
-$code.=<<___;
-       # multiplication by 0x05-0x00-0x04-0x00
-       pshufd  \$0x4E, @x[0], @t[0]
-       pshufd  \$0x4E, @x[6], @t[6]
-       pxor    @x[0], @t[0]
-       pshufd  \$0x4E, @x[7], @t[7]
-       pxor    @x[6], @t[6]
-       pshufd  \$0x4E, @x[1], @t[1]
-       pxor    @x[7], @t[7]
-       pshufd  \$0x4E, @x[2], @t[2]
-       pxor    @x[1], @t[1]
-       pshufd  \$0x4E, @x[3], @t[3]
-       pxor    @x[2], @t[2]
-        pxor   @t[6], @x[0]
-        pxor   @t[6], @x[1]
-       pshufd  \$0x4E, @x[4], @t[4]
-       pxor    @x[3], @t[3]
-        pxor   @t[0], @x[2]
-        pxor   @t[1], @x[3]
-       pshufd  \$0x4E, @x[5], @t[5]
-       pxor    @x[4], @t[4]
-        pxor   @t[7], @x[1]
-        pxor   @t[2], @x[4]
-       pxor    @x[5], @t[5]
-
-        pxor   @t[7], @x[2]
-        pxor   @t[6], @x[3]
-        pxor   @t[6], @x[4]
-        pxor   @t[3], @x[5]
-        pxor   @t[4], @x[6]
-        pxor   @t[7], @x[4]
-        pxor   @t[7], @x[5]
-        pxor   @t[5], @x[7]
-___
-       &MixColumns     (@x,@t,1);      # flipped 2<->3 and 4<->6
-}
-
-# aesenc (marked "not used"): emit one full round on eight state registers.
-# Arguments: @_[0..7] are the state registers, @_[8..15] the scratch registers.
-# The shuffle mask at 0x30($const) (.LSR) is loaded into the first scratch
-# register, then ShiftRows, Sbox and MixColumns are emitted in that order;
-# MixColumns receives the state registers in the order 0,1,4,6,3,7,2,5.
-sub aesenc {                           # not used
-my @state=@_[0..7];
-my @tmp=@_[8..15];
-$code.=<<___;
-       movdqa  0x30($const),@tmp[0]      # .LSR
-___
-       &ShiftRows      (@state,@tmp[0]);
-       &Sbox           (@state,@tmp);
-       &MixColumns     (@state[0,1,4,6,3,7,2,5],@tmp);
-}
-
-# aesenclast (marked "not used"): emit the final round on eight state
-# registers. Arguments: @_[0..7] are the state registers, @_[8..15] the
-# scratch registers. The mask at 0x40($const) (.LSRM0) is loaded into the
-# first scratch register, ShiftRows and Sbox are emitted, and finally the
-# eight 16-byte values at 0x00..0x70($key) are XORed into the state registers
-# taken in the order 0,1,4,6,3,7,2,5.
-sub aesenclast {                       # not used
-my @state=@_[0..7];
-my @tmp=@_[8..15];
-$code.=<<___;
-       movdqa  0x40($const),@tmp[0]      # .LSRM0
-___
-       &ShiftRows      (@state,@tmp[0]);
-       &Sbox           (@state,@tmp);
-$code.=<<___;
-       pxor    0x00($key),@state[0]
-       pxor    0x10($key),@state[1]
-       pxor    0x20($key),@state[4]
-       pxor    0x30($key),@state[6]
-       pxor    0x40($key),@state[3]
-       pxor    0x50($key),@state[7]
-       pxor    0x60($key),@state[2]
-       pxor    0x70($key),@state[5]
-___
-}
-
-# swapmove: emit a masked bit exchange between two registers.
-# Arguments: ($lo, $hi, $shift, $sel, $tmp) -- two data registers, an
-# immediate shift count, a mask register and a scratch register.
-# With d = (($hi >> $shift) ^ $lo) & $sel, the emitted code leaves
-# $lo ^= d and $hi ^= (d << $shift); $tmp is overwritten.
-# (Locals are deliberately not named $a/$b, which Perl reserves for sort.)
-sub swapmove {
-my ($lo,$hi,$shift,$sel,$tmp)=@_;
-$code.=<<___;
-       movdqa  $hi,$tmp
-       psrlq   \$$shift,$hi
-       pxor    $lo,$hi
-       pand    $sel,$hi
-       pxor    $hi,$lo
-       psllq   \$$shift,$hi
-       pxor    $tmp,$hi
-___
-}
-# swapmove2x: emit two interleaved masked bit exchanges that share one shift
-# count and one mask register.
-# Arguments: ($lo0, $hi0, $lo1, $hi1, $shift, $sel, $tmp0, $tmp1).
-# Each ($loN, $hiN) pair undergoes the same transformation as in swapmove,
-# using $tmpN as its scratch register; the two instruction streams are
-# interleaved line by line.
-sub swapmove2x {
-my ($lo0,$hi0,$lo1,$hi1,$shift,$sel,$tmp0,$tmp1)=@_;
-$code.=<<___;
-       movdqa  $hi0,$tmp0
-       psrlq   \$$shift,$hi0
-        movdqa $hi1,$tmp1
-        psrlq  \$$shift,$hi1
-       pxor    $lo0,$hi0
-        pxor   $lo1,$hi1
-       pand    $sel,$hi0
-        pand   $sel,$hi1
-       pxor    $hi0,$lo0
-       psllq   \$$shift,$hi0
-        pxor   $hi1,$lo1
-        psllq  \$$shift,$hi1
-       pxor    $tmp0,$hi0
-        pxor   $tmp1,$hi1
-___
-}
-
-# bitslice: emit three layers of swapmove2x exchanges (shift counts 1, 2 and 4)
-# over eight registers.
-#
-# Arguments (positional register names):
-#   @_[0..7]  -> data registers; note they are REVERSED into @x
-#   @_[8..11] -> ($t0,$t1,$t2,$t3): $t0/$t1 hold the masks, $t2/$t3 are the
-#                scratch registers handed to swapmove2x
-#
-# The masks come from the table addressed by $const: .LBS0 and .LBS1 are
-# loaded up front, and .LBS2 later replaces .LBS0 in $t0 once the shift-by-1
-# layer has been emitted.
-sub bitslice {
-my @x=reverse(@_[0..7]);
-my ($t0,$t1,$t2,$t3)=@_[8..11];
-$code.=<<___;
-       movdqa  0x00($const),$t0        # .LBS0
-       movdqa  0x10($const),$t1        # .LBS1
-___
-       # Layer 1: shift count 1, mask .LBS0 (in $t0).
-       &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
-       &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-$code.=<<___;
-       movdqa  0x20($const),$t0        # .LBS2
-___
-       # Layer 2: shift count 2, mask .LBS1 (in $t1).
-       &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
-       &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-
-       # Layer 3: shift count 4, mask .LBS2 (now in $t0).
-       &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
-       &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
-}
-
-# Emit the two internal eight-block primitives, _bsaes_encrypt8 and
-# _bsaes_decrypt8. Both operate on the blocks held in @XMM[0..7], read the key
-# schedule through $key, and use $rounds as a down-counter; $const is pointed
-# at the .LBS0 constants table. The external symbols asm_AES_encrypt and
-# asm_AES_decrypt are declared here as well.
-#
-# Prologue of _bsaes_encrypt8: XOR the round-0 key at ($key) into all eight
-# blocks, advance $key by 0x10 and shuffle each block with the .LM0SR mask.
-$code.=<<___;
-.text
-
-.extern        asm_AES_encrypt
-.extern        asm_AES_decrypt
-
-.type  _bsaes_encrypt8,\@abi-omnipotent
-.align 64
-_bsaes_encrypt8:
-.cfi_startproc
-       lea     .LBS0(%rip), $const     # constants table
-
-       movdqa  ($key), @XMM[9]         # round 0 key
-       lea     0x10($key), $key
-       movdqa  0x50($const), @XMM[8]   # .LM0SR
-       pxor    @XMM[9], @XMM[0]        # xor with round0 key
-       pxor    @XMM[9], @XMM[1]
-       pxor    @XMM[9], @XMM[2]
-       pxor    @XMM[9], @XMM[3]
-        pshufb @XMM[8], @XMM[0]
-        pshufb @XMM[8], @XMM[1]
-       pxor    @XMM[9], @XMM[4]
-       pxor    @XMM[9], @XMM[5]
-        pshufb @XMM[8], @XMM[2]
-        pshufb @XMM[8], @XMM[3]
-       pxor    @XMM[9], @XMM[6]
-       pxor    @XMM[9], @XMM[7]
-        pshufb @XMM[8], @XMM[4]
-        pshufb @XMM[8], @XMM[5]
-        pshufb @XMM[8], @XMM[6]
-        pshufb @XMM[8], @XMM[7]
-_bsaes_encrypt8_bitslice:
-___
-       # Bit-slice the eight blocks; XMM[8..11] serve as mask/scratch registers.
-       &bitslice       (@XMM[0..7, 8..11]);
-# The first pass enters the loop at .Lenc_sbox, i.e. it skips the ShiftRows
-# emitted at .Lenc_loop.
-$code.=<<___;
-       dec     $rounds
-       jmp     .Lenc_sbox
-.align 16
-.Lenc_loop:
-___
-       &ShiftRows      (@XMM[0..7, 8]);
-$code.=".Lenc_sbox:\n";
-       &Sbox           (@XMM[0..7, 8..15]);
-# Leave the loop once the counter goes negative; MixColumns is therefore not
-# emitted on the path taken after the last Sbox.
-$code.=<<___;
-       dec     $rounds
-       jl      .Lenc_done
-___
-       &MixColumns     (@XMM[0,1,4,6,3,7,2,5, 8..15]);
-# Select the mask for the next ShiftRows: .LSR while the counter is non-zero,
-# .LSRM0 when it has reached zero. The jnz consumes the flags of the dec
-# above; the instructions emitted in between are SSE moves/logic, which
-# presumably leave EFLAGS untouched.
-$code.=<<___;
-       movdqa  0x30($const), @XMM[8]   # .LSR
-       jnz     .Lenc_loop
-       movdqa  0x40($const), @XMM[8]   # .LSRM0
-       jmp     .Lenc_loop
-.align 16
-.Lenc_done:
-___
-       # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
-       &bitslice       (@XMM[0,1,4,6,3,7,2,5, 8..11]);
-# Epilogue of _bsaes_encrypt8 (XOR the last round key into all eight blocks),
-# followed by the prologue of _bsaes_decrypt8, which mirrors the encrypt
-# prologue but uses the .LM0ISR mask at -0x30($const).
-$code.=<<___;
-       movdqa  ($key), @XMM[8]         # last round key
-       pxor    @XMM[8], @XMM[4]
-       pxor    @XMM[8], @XMM[6]
-       pxor    @XMM[8], @XMM[3]
-       pxor    @XMM[8], @XMM[7]
-       pxor    @XMM[8], @XMM[2]
-       pxor    @XMM[8], @XMM[5]
-       pxor    @XMM[8], @XMM[0]
-       pxor    @XMM[8], @XMM[1]
-       ret
-.cfi_endproc
-.size  _bsaes_encrypt8,.-_bsaes_encrypt8
-
-.type  _bsaes_decrypt8,\@abi-omnipotent
-.align 64
-_bsaes_decrypt8:
-.cfi_startproc
-       lea     .LBS0(%rip), $const     # constants table
-
-       movdqa  ($key), @XMM[9]         # round 0 key
-       lea     0x10($key), $key
-       movdqa  -0x30($const), @XMM[8]  # .LM0ISR
-       pxor    @XMM[9], @XMM[0]        # xor with round0 key
-       pxor    @XMM[9], @XMM[1]
-       pxor    @XMM[9], @XMM[2]
-       pxor    @XMM[9], @XMM[3]
-        pshufb @XMM[8], @XMM[0]
-        pshufb @XMM[8], @XMM[1]
-       pxor    @XMM[9], @XMM[4]
-       pxor    @XMM[9], @XMM[5]
-        pshufb @XMM[8], @XMM[2]
-        pshufb @XMM[8], @XMM[3]
-       pxor    @XMM[9], @XMM[6]
-       pxor    @XMM[9], @XMM[7]
-        pshufb @XMM[8], @XMM[4]
-        pshufb @XMM[8], @XMM[5]
-        pshufb @XMM[8], @XMM[6]
-        pshufb @XMM[8], @XMM[7]
-___
-       &bitslice       (@XMM[0..7, 8..11]);
-# Same loop shape as the encrypt side: enter at .Ldec_sbox, skipping the first
-# ShiftRows.
-$code.=<<___;
-       dec     $rounds
-       jmp     .Ldec_sbox
-.align 16
-.Ldec_loop:
-___
-       &ShiftRows      (@XMM[0..7, 8]);
-$code.=".Ldec_sbox:\n";
-       &InvSbox        (@XMM[0..7, 8..15]);
-$code.=<<___;
-       dec     $rounds
-       jl      .Ldec_done
-___
-       # Register order differs from the encrypt side (0,1,6,4,2,7,3,5).
-       &InvMixColumns  (@XMM[0,1,6,4,2,7,3,5, 8..15]);
-# Mask selection for the next pass: .LISR while the counter is non-zero,
-# .LISRM0 once it reaches zero.
-$code.=<<___;
-       movdqa  -0x10($const), @XMM[8]  # .LISR
-       jnz     .Ldec_loop
-       movdqa  -0x20($const), @XMM[8]  # .LISRM0
-       jmp     .Ldec_loop
-.align 16
-.Ldec_done:
-___
-       &bitslice       (@XMM[0,1,6,4,2,7,3,5, 8..11]);
-# Epilogue of _bsaes_decrypt8: XOR the last round key into all eight blocks.
-$code.=<<___;
-       movdqa  ($key), @XMM[8]         # last round key
-       pxor    @XMM[8], @XMM[6]
-       pxor    @XMM[8], @XMM[4]
-       pxor    @XMM[8], @XMM[2]
-       pxor    @XMM[8], @XMM[7]
-       pxor    @XMM[8], @XMM[3]
-       pxor    @XMM[8], @XMM[5]
-       pxor    @XMM[8], @XMM[0]
-       pxor    @XMM[8], @XMM[1]
-       ret
-.cfi_endproc
-.size  _bsaes_decrypt8,.-_bsaes_decrypt8
-___
-}
-{
-my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
-
-# bitslice_key: a reduced variant of bitslice in which several of the
-# exchanges are replaced by plain register copies (the replaced calls are
-# kept as comments next to the copies).
-#
-# Arguments (positional register names):
-#   @_[0..7]  -> data registers, REVERSED into @x
-#   @_[8..12] -> ($bs0,$bs1,$bs2,$t2,$t3): three mask registers and two
-#                scratch registers
-#
-# Nothing in the visible part of this file calls this sub.
-sub bitslice_key {
-my @x=reverse(@_[0..7]);
-my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
-
-       # NOTE(review): swapmove takes five parameters; six values are passed
-       # here, so the trailing $t3 is ignored and $t2 is the scratch register.
-       &swapmove       (@x[0,1],1,$bs0,$t2,$t3);
-$code.=<<___;
-       #&swapmove(@x[2,3],1,$t0,$t2,$t3);
-       movdqa  @x[0], @x[2]
-       movdqa  @x[1], @x[3]
-___
-       #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-
-       &swapmove2x     (@x[0,2,1,3],2,$bs1,$t2,$t3);
-$code.=<<___;
-       #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-       movdqa  @x[0], @x[4]
-       movdqa  @x[2], @x[6]
-       movdqa  @x[1], @x[5]
-       movdqa  @x[3], @x[7]
-___
-       &swapmove2x     (@x[0,4,1,5],4,$bs2,$t2,$t3);
-       &swapmove2x     (@x[2,6,3,7],4,$bs2,$t2,$t3);
-}
-
-# Emit _bsaes_key_convert, which turns a conventional key schedule into the
-# layout consumed by _bsaes_encrypt8/_bsaes_decrypt8.
-#
-# Register interface of the emitted routine (from the enclosing scope's
-# variables): $inp = source key schedule, $out = destination buffer,
-# $rounds = number of rounds.
-#
-# Behaviour of the emitted code:
-#   * the round-0 key is copied to ($out) unchanged (16 bytes);
-#   * the loop body then runs $rounds-1 times; each pass shuffles one round
-#     key with .LM0 and expands it into eight 16-byte masks, one per bit
-#     position (pand with a single-bit byte mask followed by pcmpeqb), and
-#     stores them at 0x00..0x70($out), i.e. 128 bytes per round key;
-#   * four of the eight masks (those in %xmm8, %xmm9, %xmm13 and %xmm14) are
-#     complemented by XOR with the all-ones register %xmm5 before the store;
-#   * the final round key is NOT stored: it is returned in %xmm6, with the
-#     .L63 constant in %xmm7 and $out pointing at the slot where the caller is
-#     expected to store the last round key.
-$code.=<<___;
-.type  _bsaes_key_convert,\@abi-omnipotent
-.align 16
-_bsaes_key_convert:
-.cfi_startproc
-       lea     .Lmasks(%rip), $const
-       movdqu  ($inp), %xmm7           # load round 0 key
-       lea     0x10($inp), $inp
-       movdqa  0x00($const), %xmm0     # 0x01...
-       movdqa  0x10($const), %xmm1     # 0x02...
-       movdqa  0x20($const), %xmm2     # 0x04...
-       movdqa  0x30($const), %xmm3     # 0x08...
-       movdqa  0x40($const), %xmm4     # .LM0
-       pcmpeqd %xmm5, %xmm5            # .LNOT
-
-       movdqu  ($inp), %xmm6           # load round 1 key
-       movdqa  %xmm7, ($out)           # save round 0 key
-       lea     0x10($out), $out
-       dec     $rounds
-       jmp     .Lkey_loop
-.align 16
-.Lkey_loop:
-       pshufb  %xmm4, %xmm6            # .LM0
-
-       movdqa  %xmm0,  %xmm8
-       movdqa  %xmm1,  %xmm9
-
-       pand    %xmm6,  %xmm8
-       pand    %xmm6,  %xmm9
-       movdqa  %xmm2,  %xmm10
-       pcmpeqb %xmm0,  %xmm8
-       psllq   \$4,    %xmm0           # 0x10...
-       movdqa  %xmm3,  %xmm11
-       pcmpeqb %xmm1,  %xmm9
-       psllq   \$4,    %xmm1           # 0x20...
-
-       pand    %xmm6,  %xmm10
-       pand    %xmm6,  %xmm11
-       movdqa  %xmm0,  %xmm12
-       pcmpeqb %xmm2,  %xmm10
-       psllq   \$4,    %xmm2           # 0x40...
-       movdqa  %xmm1,  %xmm13
-       pcmpeqb %xmm3,  %xmm11
-       psllq   \$4,    %xmm3           # 0x80...
-
-       movdqa  %xmm2,  %xmm14
-       movdqa  %xmm3,  %xmm15
-        pxor   %xmm5,  %xmm8           # "pnot"
-        pxor   %xmm5,  %xmm9
-
-       pand    %xmm6,  %xmm12
-       pand    %xmm6,  %xmm13
-        movdqa %xmm8, 0x00($out)       # write bit-sliced round key
-       pcmpeqb %xmm0,  %xmm12
-       psrlq   \$4,    %xmm0           # 0x01...
-        movdqa %xmm9, 0x10($out)
-       pcmpeqb %xmm1,  %xmm13
-       psrlq   \$4,    %xmm1           # 0x02...
-        lea    0x10($inp), $inp
-
-       pand    %xmm6,  %xmm14
-       pand    %xmm6,  %xmm15
-        movdqa %xmm10, 0x20($out)
-       pcmpeqb %xmm2,  %xmm14
-       psrlq   \$4,    %xmm2           # 0x04...
-        movdqa %xmm11, 0x30($out)
-       pcmpeqb %xmm3,  %xmm15
-       psrlq   \$4,    %xmm3           # 0x08...
-        movdqu ($inp), %xmm6           # load next round key
-
-       pxor    %xmm5, %xmm13           # "pnot"
-       pxor    %xmm5, %xmm14
-       movdqa  %xmm12, 0x40($out)
-       movdqa  %xmm13, 0x50($out)
-       movdqa  %xmm14, 0x60($out)
-       movdqa  %xmm15, 0x70($out)
-       lea     0x80($out),$out
-       dec     $rounds
-       jnz     .Lkey_loop
-
-       movdqa  0x50($const), %xmm7     # .L63
-       #movdqa %xmm6, ($out)           # don't save last round key
-       ret
-.cfi_endproc
-.size  _bsaes_key_convert,.-_bsaes_key_convert
-___
-}
-
-# Benchmark-only interface. The leading "0 &&" makes the condition constant
-# false, so none of the four routines below is ever appended to $code.
-#
-# For reference, the disabled code would emit:
-#   bsaes_enc_key_convert / bsaes_dec_key_convert - wrappers that read the
-#       round count from offset 240 of the key structure, call
-#       _bsaes_key_convert and then apply the encrypt- or decrypt-side fix-up;
-#   bsaes_encrypt_128 / bsaes_decrypt_128 - loops that process 0x80 bytes
-#       (eight blocks) per iteration with the round count hard-coded to 10.
-if (0 && !$win64) {    # following four functions are unsupported interface
-                       # used for benchmarking...
-$code.=<<___;
-.globl bsaes_enc_key_convert
-.type  bsaes_enc_key_convert,\@function,2
-.align 16
-bsaes_enc_key_convert:
-       mov     240($inp),%r10d         # pass rounds
-       mov     $inp,%rcx               # pass key
-       mov     $out,%rax               # pass key schedule
-       call    _bsaes_key_convert
-       pxor    %xmm6,%xmm7             # fix up last round key
-       movdqa  %xmm7,(%rax)            # save last round key
-       ret
-.size  bsaes_enc_key_convert,.-bsaes_enc_key_convert
-
-.globl bsaes_encrypt_128
-.type  bsaes_encrypt_128,\@function,4
-.align 16
-bsaes_encrypt_128:
-.Lenc128_loop:
-       movdqu  0x00($inp), @XMM[0]     # load input
-       movdqu  0x10($inp), @XMM[1]
-       movdqu  0x20($inp), @XMM[2]
-       movdqu  0x30($inp), @XMM[3]
-       movdqu  0x40($inp), @XMM[4]
-       movdqu  0x50($inp), @XMM[5]
-       movdqu  0x60($inp), @XMM[6]
-       movdqu  0x70($inp), @XMM[7]
-       mov     $key, %rax              # pass the $key
-       lea     0x80($inp), $inp
-       mov     \$10,%r10d
-
-       call    _bsaes_encrypt8
-
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[4], 0x20($out)
-       movdqu  @XMM[6], 0x30($out)
-       movdqu  @XMM[3], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       movdqu  @XMM[2], 0x60($out)
-       movdqu  @XMM[5], 0x70($out)
-       lea     0x80($out), $out
-       sub     \$0x80,$len
-       ja      .Lenc128_loop
-       ret
-.size  bsaes_encrypt_128,.-bsaes_encrypt_128
-
-.globl bsaes_dec_key_convert
-.type  bsaes_dec_key_convert,\@function,2
-.align 16
-bsaes_dec_key_convert:
-       mov     240($inp),%r10d         # pass rounds
-       mov     $inp,%rcx               # pass key
-       mov     $out,%rax               # pass key schedule
-       call    _bsaes_key_convert
-       pxor    ($out),%xmm7            # fix up round 0 key
-       movdqa  %xmm6,(%rax)            # save last round key
-       movdqa  %xmm7,($out)
-       ret
-.size  bsaes_dec_key_convert,.-bsaes_dec_key_convert
-
-.globl bsaes_decrypt_128
-.type  bsaes_decrypt_128,\@function,4
-.align 16
-bsaes_decrypt_128:
-.Ldec128_loop:
-       movdqu  0x00($inp), @XMM[0]     # load input
-       movdqu  0x10($inp), @XMM[1]
-       movdqu  0x20($inp), @XMM[2]
-       movdqu  0x30($inp), @XMM[3]
-       movdqu  0x40($inp), @XMM[4]
-       movdqu  0x50($inp), @XMM[5]
-       movdqu  0x60($inp), @XMM[6]
-       movdqu  0x70($inp), @XMM[7]
-       mov     $key, %rax              # pass the $key
-       lea     0x80($inp), $inp
-       mov     \$10,%r10d
-
-       call    _bsaes_decrypt8
-
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       movdqu  @XMM[2], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       movdqu  @XMM[3], 0x60($out)
-       movdqu  @XMM[5], 0x70($out)
-       lea     0x80($out), $out
-       sub     \$0x80,$len
-       ja      .Ldec128_loop
-       ret
-.size  bsaes_decrypt_128,.-bsaes_decrypt_128
-___
-}
-{
-######################################################################
-#
-# OpenSSL interface
-#
-my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64        ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
-                                               : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
-my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
-
-if ($ecb) {
-$code.=<<___;
-.globl bsaes_ecb_encrypt_blocks
-.type  bsaes_ecb_encrypt_blocks,\@abi-omnipotent
-.align 16
-bsaes_ecb_encrypt_blocks:
-.cfi_startproc
-       mov     %rsp, %rax
-.Lecb_enc_prologue:
-       push    %rbp
-.cfi_push      %rbp
-       push    %rbx
-.cfi_push      %rbx
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-       lea     -0x48(%rsp),%rsp
-.cfi_adjust_cfa_offset 0x48
-___
-$code.=<<___ if ($win64);
-       lea     -0xa0(%rsp), %rsp
-       movaps  %xmm6, 0x40(%rsp)
-       movaps  %xmm7, 0x50(%rsp)
-       movaps  %xmm8, 0x60(%rsp)
-       movaps  %xmm9, 0x70(%rsp)
-       movaps  %xmm10, 0x80(%rsp)
-       movaps  %xmm11, 0x90(%rsp)
-       movaps  %xmm12, 0xa0(%rsp)
-       movaps  %xmm13, 0xb0(%rsp)
-       movaps  %xmm14, 0xc0(%rsp)
-       movaps  %xmm15, 0xd0(%rsp)
-.Lecb_enc_body:
-___
-# Main body of bsaes_ecb_encrypt_blocks (appended to $code after the
-# prologue).
-#
-# Flow of the emitted code:
-#   * %rbp remembers %rsp; the arguments are copied to $inp/$out/$len/$key and
-#     the round count is read from offset 240 of the key structure;
-#   * fewer than 8 blocks: .Lecb_enc_short calls asm_AES_encrypt once per
-#     block. NOTE(review): a block count of 0 also takes this path and the
-#     dec/jnz loop would then not terminate at zero -- callers presumably
-#     never pass 0; confirm;
-#   * otherwise a bit-sliced key schedule is built on the stack below %rbp via
-#     _bsaes_key_convert, full groups of 8 blocks go through _bsaes_encrypt8,
-#     and a 1..7 block remainder is handled by the .Lecb_enc_one...six ladder
-#     (the blocks come back in register order 0,1,4,6,3,7,2,5);
-#   * .Lecb_enc_done zeroes the stack area between %rsp and %rbp, 32 bytes per
-#     iteration, before the epilogue restores the registers.
-#
-# Fix: the wipe loop's back-branch used to be "jb". In AT&T operand order
-# "cmp %rax, %rbp" sets the flags from %rbp - %rax, so "jb" loops only while
-# %rbp < %rax. On the bit-sliced path (%rax starts well below %rbp) that left
-# the loop after a single 32-byte store, so the key schedule stayed on the
-# stack; on the short path (%rsp == %rbp) it kept storing upwards past the
-# frame. The loop must continue while %rax is still below %rbp, i.e. "ja".
-$code.=<<___;
-       mov     %rsp,%rbp               # backup %rsp
-.cfi_def_cfa_register  %rbp
-       mov     240($arg4),%eax         # rounds
-       mov     $arg1,$inp              # backup arguments
-       mov     $arg2,$out
-       mov     $arg3,$len
-       mov     $arg4,$key
-       cmp     \$8,$arg3
-       jb      .Lecb_enc_short
-
-       mov     %eax,%ebx               # backup rounds
-       shl     \$7,%rax                # 128 bytes per inner round key
-       sub     \$`128-32`,%rax         # size of bit-sliced key schedule
-       sub     %rax,%rsp
-       mov     %rsp,%rax               # pass key schedule
-       mov     $key,%rcx               # pass key
-       mov     %ebx,%r10d              # pass rounds
-       call    _bsaes_key_convert
-       pxor    %xmm6,%xmm7             # fix up last round key
-       movdqa  %xmm7,(%rax)            # save last round key
-
-       sub     \$8,$len
-.Lecb_enc_loop:
-       movdqu  0x00($inp), @XMM[0]     # load input
-       movdqu  0x10($inp), @XMM[1]
-       movdqu  0x20($inp), @XMM[2]
-       movdqu  0x30($inp), @XMM[3]
-       movdqu  0x40($inp), @XMM[4]
-       movdqu  0x50($inp), @XMM[5]
-       mov     %rsp, %rax              # pass key schedule
-       movdqu  0x60($inp), @XMM[6]
-       mov     %ebx,%r10d              # pass rounds
-       movdqu  0x70($inp), @XMM[7]
-       lea     0x80($inp), $inp
-
-       call    _bsaes_encrypt8
-
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[4], 0x20($out)
-       movdqu  @XMM[6], 0x30($out)
-       movdqu  @XMM[3], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       movdqu  @XMM[2], 0x60($out)
-       movdqu  @XMM[5], 0x70($out)
-       lea     0x80($out), $out
-       sub     \$8,$len
-       jnc     .Lecb_enc_loop
-
-       add     \$8,$len
-       jz      .Lecb_enc_done
-
-       movdqu  0x00($inp), @XMM[0]     # load input
-       mov     %rsp, %rax              # pass key schedule
-       mov     %ebx,%r10d              # pass rounds
-       cmp     \$2,$len
-       jb      .Lecb_enc_one
-       movdqu  0x10($inp), @XMM[1]
-       je      .Lecb_enc_two
-       movdqu  0x20($inp), @XMM[2]
-       cmp     \$4,$len
-       jb      .Lecb_enc_three
-       movdqu  0x30($inp), @XMM[3]
-       je      .Lecb_enc_four
-       movdqu  0x40($inp), @XMM[4]
-       cmp     \$6,$len
-       jb      .Lecb_enc_five
-       movdqu  0x50($inp), @XMM[5]
-       je      .Lecb_enc_six
-       movdqu  0x60($inp), @XMM[6]
-       call    _bsaes_encrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[4], 0x20($out)
-       movdqu  @XMM[6], 0x30($out)
-       movdqu  @XMM[3], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       movdqu  @XMM[2], 0x60($out)
-       jmp     .Lecb_enc_done
-.align 16
-.Lecb_enc_six:
-       call    _bsaes_encrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[4], 0x20($out)
-       movdqu  @XMM[6], 0x30($out)
-       movdqu  @XMM[3], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       jmp     .Lecb_enc_done
-.align 16
-.Lecb_enc_five:
-       call    _bsaes_encrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[4], 0x20($out)
-       movdqu  @XMM[6], 0x30($out)
-       movdqu  @XMM[3], 0x40($out)
-       jmp     .Lecb_enc_done
-.align 16
-.Lecb_enc_four:
-       call    _bsaes_encrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[4], 0x20($out)
-       movdqu  @XMM[6], 0x30($out)
-       jmp     .Lecb_enc_done
-.align 16
-.Lecb_enc_three:
-       call    _bsaes_encrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[4], 0x20($out)
-       jmp     .Lecb_enc_done
-.align 16
-.Lecb_enc_two:
-       call    _bsaes_encrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       jmp     .Lecb_enc_done
-.align 16
-.Lecb_enc_one:
-       call    _bsaes_encrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       jmp     .Lecb_enc_done
-.align 16
-.Lecb_enc_short:
-       lea     ($inp), $arg1
-       lea     ($out), $arg2
-       lea     ($key), $arg3
-       call    asm_AES_encrypt
-       lea     16($inp), $inp
-       lea     16($out), $out
-       dec     $len
-       jnz     .Lecb_enc_short
-
-.Lecb_enc_done:
-       lea     (%rsp),%rax
-       pxor    %xmm0, %xmm0
-.Lecb_enc_bzero:                       # wipe key schedule [if any]
-       movdqa  %xmm0, 0x00(%rax)
-       movdqa  %xmm0, 0x10(%rax)
-       lea     0x20(%rax), %rax
-       cmp     %rax, %rbp
-       ja      .Lecb_enc_bzero
-
-       lea     0x78(%rbp),%rax
-.cfi_def_cfa   %rax,8
-___
-$code.=<<___ if ($win64);
-       movaps  0x40(%rbp), %xmm6
-       movaps  0x50(%rbp), %xmm7
-       movaps  0x60(%rbp), %xmm8
-       movaps  0x70(%rbp), %xmm9
-       movaps  0x80(%rbp), %xmm10
-       movaps  0x90(%rbp), %xmm11
-       movaps  0xa0(%rbp), %xmm12
-       movaps  0xb0(%rbp), %xmm13
-       movaps  0xc0(%rbp), %xmm14
-       movaps  0xd0(%rbp), %xmm15
-       lea     0xa0(%rax), %rax
-.Lecb_enc_tail:
-___
-$code.=<<___;
-       mov     -48(%rax), %r15
-.cfi_restore   %r15
-       mov     -40(%rax), %r14
-.cfi_restore   %r14
-       mov     -32(%rax), %r13
-.cfi_restore   %r13
-       mov     -24(%rax), %r12
-.cfi_restore   %r12
-       mov     -16(%rax), %rbx
-.cfi_restore   %rbx
-       mov     -8(%rax), %rbp
-.cfi_restore   %rbp
-       lea     (%rax), %rsp            # restore %rsp
-.cfi_def_cfa_register  %rsp
-.Lecb_enc_epilogue:
-       ret
-.cfi_endproc
-.size  bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
-
-# bsaes_ecb_decrypt_blocks -- ECB decryption, eight blocks per pass.
-#   arg1: input pointer
-#   arg2: output pointer
-#   arg3: length counted in 16-byte blocks (it is compared against 8 and
-#         decremented once per block on the short path)
-#   arg4: AES key; the round count is read from offset 240
-# With fewer than 8 blocks in total every block is handed to
-# asm_AES_decrypt.  Otherwise the key is converted into a bit-sliced
-# schedule placed on the stack below %rbp, full groups of eight blocks
-# are decrypted by _bsaes_decrypt8, a 1..7 block remainder is decrypted
-# with one more _bsaes_decrypt8 call, and the stack between %rsp and
-# %rbp is zeroed before returning.
-.globl bsaes_ecb_decrypt_blocks
-.type  bsaes_ecb_decrypt_blocks,\@abi-omnipotent
-.align 16
-bsaes_ecb_decrypt_blocks:
-.cfi_startproc
-       mov     %rsp, %rax
-.Lecb_dec_prologue:
-       push    %rbp
-.cfi_push      %rbp
-       push    %rbx
-.cfi_push      %rbx
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-       lea     -0x48(%rsp),%rsp
-.cfi_adjust_cfa_offset 0x48
-___
-$code.=<<___ if ($win64);
-       # Win64 build only: reserve 0xa0 more bytes and save %xmm6-%xmm15
-       lea     -0xa0(%rsp), %rsp
-       movaps  %xmm6, 0x40(%rsp)
-       movaps  %xmm7, 0x50(%rsp)
-       movaps  %xmm8, 0x60(%rsp)
-       movaps  %xmm9, 0x70(%rsp)
-       movaps  %xmm10, 0x80(%rsp)
-       movaps  %xmm11, 0x90(%rsp)
-       movaps  %xmm12, 0xa0(%rsp)
-       movaps  %xmm13, 0xb0(%rsp)
-       movaps  %xmm14, 0xc0(%rsp)
-       movaps  %xmm15, 0xd0(%rsp)
-.Lecb_dec_body:
-___
-$code.=<<___;
-       mov     %rsp,%rbp               # backup %rsp
-.cfi_def_cfa_register  %rbp
-       mov     240($arg4),%eax         # rounds
-       mov     $arg1,$inp              # backup arguments
-       mov     $arg2,$out
-       mov     $arg3,$len
-       mov     $arg4,$key
-       cmp     \$8,$arg3
-       jb      .Lecb_dec_short
-
-       mov     %eax,%ebx               # backup rounds
-       shl     \$7,%rax                # 128 bytes per inner round key
-       sub     \$`128-32`,%rax         # size of bit-sliced key schedule
-       sub     %rax,%rsp
-       mov     %rsp,%rax               # pass key schedule
-       mov     $key,%rcx               # pass key
-       mov     %ebx,%r10d              # pass rounds
-       call    _bsaes_key_convert
-       pxor    (%rsp),%xmm7            # fix up 0 round key
-       movdqa  %xmm6,(%rax)            # save last round key
-       movdqa  %xmm7,(%rsp)
-
-       sub     \$8,$len
-.Lecb_dec_loop:
-       movdqu  0x00($inp), @XMM[0]     # load input
-       movdqu  0x10($inp), @XMM[1]
-       movdqu  0x20($inp), @XMM[2]
-       movdqu  0x30($inp), @XMM[3]
-       movdqu  0x40($inp), @XMM[4]
-       movdqu  0x50($inp), @XMM[5]
-       mov     %rsp, %rax              # pass key schedule
-       movdqu  0x60($inp), @XMM[6]
-       mov     %ebx,%r10d              # pass rounds
-       movdqu  0x70($inp), @XMM[7]
-       lea     0x80($inp), $inp
-
-       call    _bsaes_decrypt8
-
-       # results are stored from registers 0,1,6,4,2,7,3,5 in that order
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       movdqu  @XMM[2], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       movdqu  @XMM[3], 0x60($out)
-       movdqu  @XMM[5], 0x70($out)
-       lea     0x80($out), $out
-       sub     \$8,$len
-       jnc     .Lecb_dec_loop
-
-       add     \$8,$len
-       jz      .Lecb_dec_done
-
-       # 1..7 blocks remain: load only what is there, then branch to the
-       # handler that stores the matching number of results.  Each je
-       # reuses the flags of the preceding cmp, since the movdqu in
-       # between does not modify them.
-       movdqu  0x00($inp), @XMM[0]     # load input
-       mov     %rsp, %rax              # pass key schedule
-       mov     %ebx,%r10d              # pass rounds
-       cmp     \$2,$len
-       jb      .Lecb_dec_one
-       movdqu  0x10($inp), @XMM[1]
-       je      .Lecb_dec_two
-       movdqu  0x20($inp), @XMM[2]
-       cmp     \$4,$len
-       jb      .Lecb_dec_three
-       movdqu  0x30($inp), @XMM[3]
-       je      .Lecb_dec_four
-       movdqu  0x40($inp), @XMM[4]
-       cmp     \$6,$len
-       jb      .Lecb_dec_five
-       movdqu  0x50($inp), @XMM[5]
-       je      .Lecb_dec_six
-       movdqu  0x60($inp), @XMM[6]
-       call    _bsaes_decrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       movdqu  @XMM[2], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       movdqu  @XMM[3], 0x60($out)
-       jmp     .Lecb_dec_done
-.align 16
-.Lecb_dec_six:
-       call    _bsaes_decrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       movdqu  @XMM[2], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       jmp     .Lecb_dec_done
-.align 16
-.Lecb_dec_five:
-       call    _bsaes_decrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       movdqu  @XMM[2], 0x40($out)
-       jmp     .Lecb_dec_done
-.align 16
-.Lecb_dec_four:
-       call    _bsaes_decrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       jmp     .Lecb_dec_done
-.align 16
-.Lecb_dec_three:
-       call    _bsaes_decrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       jmp     .Lecb_dec_done
-.align 16
-.Lecb_dec_two:
-       call    _bsaes_decrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       jmp     .Lecb_dec_done
-.align 16
-.Lecb_dec_one:
-       call    _bsaes_decrypt8
-       movdqu  @XMM[0], 0x00($out)     # write output
-       jmp     .Lecb_dec_done
-.align 16
-.Lecb_dec_short:
-       # Fewer than 8 blocks in total: one asm_AES_decrypt call per
-       # block.  No key schedule was placed on the stack on this path,
-       # so %rsp still equals %rbp.
-       lea     ($inp), $arg1
-       lea     ($out), $arg2
-       lea     ($key), $arg3
-       call    asm_AES_decrypt
-       lea     16($inp), $inp
-       lea     16($out), $out
-       dec     $len
-       jnz     .Lecb_dec_short
-
-.Lecb_dec_done:
-       lea     (%rsp),%rax
-       pxor    %xmm0, %xmm0
-.Lecb_dec_bzero:                       # wipe key schedule [if any]
-       movdqa  %xmm0, 0x00(%rax)
-       movdqa  %xmm0, 0x10(%rax)
-       lea     0x20(%rax), %rax
-       # cmp sets the flags of %rbp - %rax, so "above" means that %rax
-       # has not reached %rbp yet.  The previous jb tested the opposite
-       # condition: it stopped after 32 bytes when a schedule was
-       # present and never stopped when %rsp equalled %rbp.
-       cmp     %rax, %rbp
-       ja      .Lecb_dec_bzero
-
-       # 0x78 = 0x48 bytes of scratch + six 8-byte pushes
-       lea     0x78(%rbp),%rax
-.cfi_def_cfa   %rax,8
-___
-$code.=<<___ if ($win64);
-       # Win64 build only: reload %xmm6-%xmm15 and skip their save area
-       movaps  0x40(%rbp), %xmm6
-       movaps  0x50(%rbp), %xmm7
-       movaps  0x60(%rbp), %xmm8
-       movaps  0x70(%rbp), %xmm9
-       movaps  0x80(%rbp), %xmm10
-       movaps  0x90(%rbp), %xmm11
-       movaps  0xa0(%rbp), %xmm12
-       movaps  0xb0(%rbp), %xmm13
-       movaps  0xc0(%rbp), %xmm14
-       movaps  0xd0(%rbp), %xmm15
-       lea     0xa0(%rax), %rax
-.Lecb_dec_tail:
-___
-$code.=<<___;
-       mov     -48(%rax), %r15
-.cfi_restore   %r15
-       mov     -40(%rax), %r14
-.cfi_restore   %r14
-       mov     -32(%rax), %r13
-.cfi_restore   %r13
-       mov     -24(%rax), %r12
-.cfi_restore   %r12
-       mov     -16(%rax), %rbx
-.cfi_restore   %rbx
-       mov     -8(%rax), %rbp
-.cfi_restore   %rbp
-       lea     (%rax), %rsp            # restore %rsp
-.cfi_def_cfa_register  %rsp
-.Lecb_dec_epilogue:
-       ret
-.cfi_endproc
-.size  bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
-___
-}
-# bsaes_cbc_encrypt -- CBC entry point; only decryption of at least 128
-# bytes is done here with the bit-sliced code.
-#   arg1: input pointer, arg2: output pointer,
-#   arg3: length in bytes (shifted right by 4 to obtain blocks),
-#   arg4: AES key (round count read from offset 240),
-#   arg5: pointer to the IV, which is read on entry and written back,
-#   arg6: direction flag.
-# A non-zero direction flag or a length below 128 bytes is passed on
-# unchanged to asm_AES_cbc_encrypt by a tail jump taken before any
-# register is pushed.
-$code.=<<___;
-.extern        asm_AES_cbc_encrypt
-.globl bsaes_cbc_encrypt
-.type  bsaes_cbc_encrypt,\@abi-omnipotent
-.align 16
-bsaes_cbc_encrypt:
-.cfi_startproc
-___
-$code.=<<___ if ($win64);
-       mov     48(%rsp),$arg6          # pull direction flag
-___
-$code.=<<___;
-       cmp     \$0,$arg6
-       jne     asm_AES_cbc_encrypt
-       cmp     \$128,$arg3
-       jb      asm_AES_cbc_encrypt
-
-       mov     %rsp, %rax
-.Lcbc_dec_prologue:
-       push    %rbp
-.cfi_push      %rbp
-       push    %rbx
-.cfi_push      %rbx
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-       lea     -0x48(%rsp), %rsp
-.cfi_adjust_cfa_offset 0x48
-___
-$code.=<<___ if ($win64);
-       mov     0xa0(%rsp),$arg5        # pull ivp
-       # Win64 build only: reserve 0xa0 more bytes and save %xmm6-%xmm15
-       lea     -0xa0(%rsp), %rsp
-       movaps  %xmm6, 0x40(%rsp)
-       movaps  %xmm7, 0x50(%rsp)
-       movaps  %xmm8, 0x60(%rsp)
-       movaps  %xmm9, 0x70(%rsp)
-       movaps  %xmm10, 0x80(%rsp)
-       movaps  %xmm11, 0x90(%rsp)
-       movaps  %xmm12, 0xa0(%rsp)
-       movaps  %xmm13, 0xb0(%rsp)
-       movaps  %xmm14, 0xc0(%rsp)
-       movaps  %xmm15, 0xd0(%rsp)
-.Lcbc_dec_body:
-___
-$code.=<<___;
-       mov     %rsp, %rbp              # backup %rsp
-.cfi_def_cfa_register  %rbp
-       mov     240($arg4), %eax        # rounds
-       mov     $arg1, $inp             # backup arguments
-       mov     $arg2, $out
-       mov     $arg3, $len
-       mov     $arg4, $key
-       mov     $arg5, %rbx
-       shr     \$4, $len               # bytes to blocks
-
-       mov     %eax, %edx              # rounds
-       shl     \$7, %rax               # 128 bytes per inner round key
-       sub     \$`128-32`, %rax        # size of bit-sliced key schedule
-       sub     %rax, %rsp
-
-       mov     %rsp, %rax              # pass key schedule
-       mov     $key, %rcx              # pass key
-       mov     %edx, %r10d             # pass rounds
-       call    _bsaes_key_convert
-       pxor    (%rsp),%xmm7            # fix up 0 round key
-       movdqa  %xmm6,(%rax)            # save last round key
-       movdqa  %xmm7,(%rsp)
-
-       movdqu  (%rbx), @XMM[15]        # load IV
-       sub     \$8,$len
-.Lcbc_dec_loop:
-       movdqu  0x00($inp), @XMM[0]     # load input
-       movdqu  0x10($inp), @XMM[1]
-       movdqu  0x20($inp), @XMM[2]
-       movdqu  0x30($inp), @XMM[3]
-       movdqu  0x40($inp), @XMM[4]
-       movdqu  0x50($inp), @XMM[5]
-       mov     %rsp, %rax              # pass key schedule
-       movdqu  0x60($inp), @XMM[6]
-       mov     %edx,%r10d              # pass rounds
-       movdqu  0x70($inp), @XMM[7]
-       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
-
-       call    _bsaes_decrypt8
-
-       # Block 0 is xored with the saved IV and block k with ciphertext
-       # block k-1, which is re-read from the input.  Results are taken
-       # from registers 0,1,6,4,2,7,3,5 in that order.  The last
-       # ciphertext block of the group becomes the IV of the next one.
-       pxor    0x20(%rbp), @XMM[0]     # ^= IV
-       movdqu  0x00($inp), @XMM[8]     # re-load input
-       movdqu  0x10($inp), @XMM[9]
-       pxor    @XMM[8], @XMM[1]
-       movdqu  0x20($inp), @XMM[10]
-       pxor    @XMM[9], @XMM[6]
-       movdqu  0x30($inp), @XMM[11]
-       pxor    @XMM[10], @XMM[4]
-       movdqu  0x40($inp), @XMM[12]
-       pxor    @XMM[11], @XMM[2]
-       movdqu  0x50($inp), @XMM[13]
-       pxor    @XMM[12], @XMM[7]
-       movdqu  0x60($inp), @XMM[14]
-       pxor    @XMM[13], @XMM[3]
-       movdqu  0x70($inp), @XMM[15]    # IV
-       pxor    @XMM[14], @XMM[5]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       lea     0x80($inp), $inp
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       movdqu  @XMM[2], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       movdqu  @XMM[3], 0x60($out)
-       movdqu  @XMM[5], 0x70($out)
-       lea     0x80($out), $out
-       sub     \$8,$len
-       jnc     .Lcbc_dec_loop
-
-       add     \$8,$len
-       jz      .Lcbc_dec_done
-
-       # 1..7 blocks remain: load only what is there, then branch to the
-       # handler for that count.  Each je reuses the flags of the
-       # preceding cmp, since the movdqu in between does not modify them.
-       movdqu  0x00($inp), @XMM[0]     # load input
-       mov     %rsp, %rax              # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-       cmp     \$2,$len
-       jb      .Lcbc_dec_one
-       movdqu  0x10($inp), @XMM[1]
-       je      .Lcbc_dec_two
-       movdqu  0x20($inp), @XMM[2]
-       cmp     \$4,$len
-       jb      .Lcbc_dec_three
-       movdqu  0x30($inp), @XMM[3]
-       je      .Lcbc_dec_four
-       movdqu  0x40($inp), @XMM[4]
-       cmp     \$6,$len
-       jb      .Lcbc_dec_five
-       movdqu  0x50($inp), @XMM[5]
-       je      .Lcbc_dec_six
-       movdqu  0x60($inp), @XMM[6]
-       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
-       call    _bsaes_decrypt8
-       pxor    0x20(%rbp), @XMM[0]     # ^= IV
-       movdqu  0x00($inp), @XMM[8]     # re-load input
-       movdqu  0x10($inp), @XMM[9]
-       pxor    @XMM[8], @XMM[1]
-       movdqu  0x20($inp), @XMM[10]
-       pxor    @XMM[9], @XMM[6]
-       movdqu  0x30($inp), @XMM[11]
-       pxor    @XMM[10], @XMM[4]
-       movdqu  0x40($inp), @XMM[12]
-       pxor    @XMM[11], @XMM[2]
-       movdqu  0x50($inp), @XMM[13]
-       pxor    @XMM[12], @XMM[7]
-       movdqu  0x60($inp), @XMM[15]    # IV
-       pxor    @XMM[13], @XMM[3]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       movdqu  @XMM[2], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       movdqu  @XMM[3], 0x60($out)
-       jmp     .Lcbc_dec_done
-.align 16
-.Lcbc_dec_six:
-       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
-       call    _bsaes_decrypt8
-       pxor    0x20(%rbp), @XMM[0]     # ^= IV
-       movdqu  0x00($inp), @XMM[8]     # re-load input
-       movdqu  0x10($inp), @XMM[9]
-       pxor    @XMM[8], @XMM[1]
-       movdqu  0x20($inp), @XMM[10]
-       pxor    @XMM[9], @XMM[6]
-       movdqu  0x30($inp), @XMM[11]
-       pxor    @XMM[10], @XMM[4]
-       movdqu  0x40($inp), @XMM[12]
-       pxor    @XMM[11], @XMM[2]
-       movdqu  0x50($inp), @XMM[15]    # IV
-       pxor    @XMM[12], @XMM[7]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       movdqu  @XMM[2], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       jmp     .Lcbc_dec_done
-.align 16
-.Lcbc_dec_five:
-       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
-       call    _bsaes_decrypt8
-       pxor    0x20(%rbp), @XMM[0]     # ^= IV
-       movdqu  0x00($inp), @XMM[8]     # re-load input
-       movdqu  0x10($inp), @XMM[9]
-       pxor    @XMM[8], @XMM[1]
-       movdqu  0x20($inp), @XMM[10]
-       pxor    @XMM[9], @XMM[6]
-       movdqu  0x30($inp), @XMM[11]
-       pxor    @XMM[10], @XMM[4]
-       movdqu  0x40($inp), @XMM[15]    # IV
-       pxor    @XMM[11], @XMM[2]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       movdqu  @XMM[2], 0x40($out)
-       jmp     .Lcbc_dec_done
-.align 16
-.Lcbc_dec_four:
-       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
-       call    _bsaes_decrypt8
-       pxor    0x20(%rbp), @XMM[0]     # ^= IV
-       movdqu  0x00($inp), @XMM[8]     # re-load input
-       movdqu  0x10($inp), @XMM[9]
-       pxor    @XMM[8], @XMM[1]
-       movdqu  0x20($inp), @XMM[10]
-       pxor    @XMM[9], @XMM[6]
-       movdqu  0x30($inp), @XMM[15]    # IV
-       pxor    @XMM[10], @XMM[4]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       jmp     .Lcbc_dec_done
-.align 16
-.Lcbc_dec_three:
-       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
-       call    _bsaes_decrypt8
-       pxor    0x20(%rbp), @XMM[0]     # ^= IV
-       movdqu  0x00($inp), @XMM[8]     # re-load input
-       movdqu  0x10($inp), @XMM[9]
-       pxor    @XMM[8], @XMM[1]
-       movdqu  0x20($inp), @XMM[15]    # IV
-       pxor    @XMM[9], @XMM[6]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       jmp     .Lcbc_dec_done
-.align 16
-.Lcbc_dec_two:
-       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
-       call    _bsaes_decrypt8
-       pxor    0x20(%rbp), @XMM[0]     # ^= IV
-       movdqu  0x00($inp), @XMM[8]     # re-load input
-       movdqu  0x10($inp), @XMM[15]    # IV
-       pxor    @XMM[8], @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       jmp     .Lcbc_dec_done
-.align 16
-.Lcbc_dec_one:
-       # A single block goes through asm_AES_decrypt instead of the
-       # bit-sliced code.  Its result is written to the scratch slot at
-       # 0x20(%rbp), so in the pxor below the register operand is the
-       # IV and the memory operand is the decrypted block.  Register 0
-       # still holds the ciphertext block loaded above and becomes the
-       # IV that is returned.
-       lea     ($inp), $arg1
-       lea     0x20(%rbp), $arg2       # buffer output
-       lea     ($key), $arg3
-       call    asm_AES_decrypt         # doesn't touch %xmm
-       pxor    0x20(%rbp), @XMM[15]    # ^= IV
-       movdqu  @XMM[15], ($out)        # write output
-       movdqa  @XMM[0], @XMM[15]       # IV
-
-.Lcbc_dec_done:
-       movdqu  @XMM[15], (%rbx)        # return IV
-       lea     (%rsp), %rax
-       pxor    %xmm0, %xmm0
-.Lcbc_dec_bzero:                       # wipe key schedule [if any]
-       movdqa  %xmm0, 0x00(%rax)
-       movdqa  %xmm0, 0x10(%rax)
-       lea     0x20(%rax), %rax
-       # cmp sets the flags of %rbp - %rax: keep zeroing in 32-byte
-       # steps while %rax is still below %rbp
-       cmp     %rax, %rbp
-       ja      .Lcbc_dec_bzero
-
-       # 0x78 = 0x48 bytes of scratch + six 8-byte pushes
-       lea     0x78(%rbp),%rax
-.cfi_def_cfa   %rax,8
-___
-$code.=<<___ if ($win64);
-       # Win64 build only: reload %xmm6-%xmm15 and skip their save area
-       movaps  0x40(%rbp), %xmm6
-       movaps  0x50(%rbp), %xmm7
-       movaps  0x60(%rbp), %xmm8
-       movaps  0x70(%rbp), %xmm9
-       movaps  0x80(%rbp), %xmm10
-       movaps  0x90(%rbp), %xmm11
-       movaps  0xa0(%rbp), %xmm12
-       movaps  0xb0(%rbp), %xmm13
-       movaps  0xc0(%rbp), %xmm14
-       movaps  0xd0(%rbp), %xmm15
-       lea     0xa0(%rax), %rax
-.Lcbc_dec_tail:
-___
-$code.=<<___;
-       mov     -48(%rax), %r15
-.cfi_restore   %r15
-       mov     -40(%rax), %r14
-.cfi_restore   %r14
-       mov     -32(%rax), %r13
-.cfi_restore   %r13
-       mov     -24(%rax), %r12
-.cfi_restore   %r12
-       mov     -16(%rax), %rbx
-.cfi_restore   %rbx
-       mov     -8(%rax), %rbp
-.cfi_restore   %rbp
-       lea     (%rax), %rsp            # restore %rsp
-.cfi_def_cfa_register  %rsp
-.Lcbc_dec_epilogue:
-       ret
-.cfi_endproc
-.size  bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-
-# bsaes_ctr32_encrypt_blocks -- CTR mode with a 32-bit counter word.
-#   arg1: input pointer, arg2: output pointer,
-#   arg3: length counted in 16-byte blocks (compared against 8),
-#   arg4: AES key (round count read from offset 240),
-#   arg5: pointer to the 16-byte counter block, which is only read.
-# With fewer than 8 blocks in total every counter value is encrypted by
-# asm_AES_encrypt.  Otherwise eight counter values are encrypted per
-# pass by _bsaes_encrypt8_bitslice and xored with the input.  The stack
-# between %rsp and %rbp is zeroed before returning.
-.globl bsaes_ctr32_encrypt_blocks
-.type  bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
-.align 16
-bsaes_ctr32_encrypt_blocks:
-.cfi_startproc
-       mov     %rsp, %rax
-.Lctr_enc_prologue:
-       push    %rbp
-.cfi_push      %rbp
-       push    %rbx
-.cfi_push      %rbx
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-       lea     -0x48(%rsp), %rsp
-.cfi_adjust_cfa_offset 0x48
-___
-$code.=<<___ if ($win64);
-       mov     0xa0(%rsp),$arg5        # pull ivp
-       # Win64 build only: reserve 0xa0 more bytes and save %xmm6-%xmm15
-       lea     -0xa0(%rsp), %rsp
-       movaps  %xmm6, 0x40(%rsp)
-       movaps  %xmm7, 0x50(%rsp)
-       movaps  %xmm8, 0x60(%rsp)
-       movaps  %xmm9, 0x70(%rsp)
-       movaps  %xmm10, 0x80(%rsp)
-       movaps  %xmm11, 0x90(%rsp)
-       movaps  %xmm12, 0xa0(%rsp)
-       movaps  %xmm13, 0xb0(%rsp)
-       movaps  %xmm14, 0xc0(%rsp)
-       movaps  %xmm15, 0xd0(%rsp)
-.Lctr_enc_body:
-___
-$code.=<<___;
-       mov     %rsp, %rbp              # backup %rsp
-.cfi_def_cfa_register  %rbp
-       movdqu  ($arg5), %xmm0          # load counter
-       mov     240($arg4), %eax        # rounds
-       mov     $arg1, $inp             # backup arguments
-       mov     $arg2, $out
-       mov     $arg3, $len
-       mov     $arg4, $key
-       movdqa  %xmm0, 0x20(%rbp)       # copy counter
-       cmp     \$8, $arg3
-       jb      .Lctr_enc_short
-
-       mov     %eax, %ebx              # rounds
-       shl     \$7, %rax               # 128 bytes per inner round key
-       sub     \$`128-32`, %rax        # size of bit-sliced key schedule
-       sub     %rax, %rsp
-
-       mov     %rsp, %rax              # pass key schedule
-       mov     $key, %rcx              # pass key
-       mov     %ebx, %r10d             # pass rounds
-       call    _bsaes_key_convert
-       pxor    %xmm6,%xmm7             # fix up last round key
-       movdqa  %xmm7,(%rax)            # save last round key
-
-       movdqa  (%rsp), @XMM[9]         # load round0 key
-       lea     .LADD1(%rip), %r11
-       movdqa  0x20(%rbp), @XMM[0]     # counter copy
-       movdqa  -0x20(%r11), @XMM[8]    # .LSWPUP
-       pshufb  @XMM[8], @XMM[9]        # byte swap upper part
-       pshufb  @XMM[8], @XMM[0]
-       movdqa  @XMM[9], (%rsp)         # save adjusted round0 key
-       jmp     .Lctr_enc_loop
-.align 16
-.Lctr_enc_loop:
-       movdqa  @XMM[0], 0x20(%rbp)     # save counter
-       movdqa  @XMM[0], @XMM[1]        # prepare 8 counter values
-       movdqa  @XMM[0], @XMM[2]
-       paddd   0x00(%r11), @XMM[1]     # .LADD1
-       movdqa  @XMM[0], @XMM[3]
-       paddd   0x10(%r11), @XMM[2]     # .LADD2
-       movdqa  @XMM[0], @XMM[4]
-       paddd   0x20(%r11), @XMM[3]     # .LADD3
-       movdqa  @XMM[0], @XMM[5]
-       paddd   0x30(%r11), @XMM[4]     # .LADD4
-       movdqa  @XMM[0], @XMM[6]
-       paddd   0x40(%r11), @XMM[5]     # .LADD5
-       movdqa  @XMM[0], @XMM[7]
-       paddd   0x50(%r11), @XMM[6]     # .LADD6
-       paddd   0x60(%r11), @XMM[7]     # .LADD7
-
-       # Borrow prologue from _bsaes_encrypt8 to use the opportunity
-       # to flip byte order in 32-bit counter
-       movdqa  (%rsp), @XMM[9]         # round 0 key
-       lea     0x10(%rsp), %rax        # pass key schedule
-       movdqa  -0x10(%r11), @XMM[8]    # .LSWPUPM0SR
-       pxor    @XMM[9], @XMM[0]        # xor with round0 key
-       pxor    @XMM[9], @XMM[1]
-       pxor    @XMM[9], @XMM[2]
-       pxor    @XMM[9], @XMM[3]
-        pshufb @XMM[8], @XMM[0]
-        pshufb @XMM[8], @XMM[1]
-       pxor    @XMM[9], @XMM[4]
-       pxor    @XMM[9], @XMM[5]
-        pshufb @XMM[8], @XMM[2]
-        pshufb @XMM[8], @XMM[3]
-       pxor    @XMM[9], @XMM[6]
-       pxor    @XMM[9], @XMM[7]
-        pshufb @XMM[8], @XMM[4]
-        pshufb @XMM[8], @XMM[5]
-        pshufb @XMM[8], @XMM[6]
-        pshufb @XMM[8], @XMM[7]
-       lea     .LBS0(%rip), %r11       # constants table
-       mov     %ebx,%r10d              # pass rounds
-
-       call    _bsaes_encrypt8_bitslice
-
-       # a borrow here means that fewer than 8 blocks are left
-       sub     \$8,$len
-       jc      .Lctr_enc_loop_done
-
-       # The encrypted counters are taken from registers 0,1,4,6,3,7,2,5
-       # in that order and xored with eight input blocks.
-       movdqu  0x00($inp), @XMM[8]     # load input
-       movdqu  0x10($inp), @XMM[9]
-       movdqu  0x20($inp), @XMM[10]
-       movdqu  0x30($inp), @XMM[11]
-       movdqu  0x40($inp), @XMM[12]
-       movdqu  0x50($inp), @XMM[13]
-       movdqu  0x60($inp), @XMM[14]
-       movdqu  0x70($inp), @XMM[15]
-       lea     0x80($inp),$inp
-       pxor    @XMM[0], @XMM[8]
-       movdqa  0x20(%rbp), @XMM[0]     # load counter
-       pxor    @XMM[9], @XMM[1]
-       movdqu  @XMM[8], 0x00($out)     # write output
-       pxor    @XMM[10], @XMM[4]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    @XMM[11], @XMM[6]
-       movdqu  @XMM[4], 0x20($out)
-       pxor    @XMM[12], @XMM[3]
-       movdqu  @XMM[6], 0x30($out)
-       pxor    @XMM[13], @XMM[7]
-       movdqu  @XMM[3], 0x40($out)
-       pxor    @XMM[14], @XMM[2]
-       movdqu  @XMM[7], 0x50($out)
-       pxor    @XMM[15], @XMM[5]
-       movdqu  @XMM[2], 0x60($out)
-       lea     .LADD1(%rip), %r11
-       movdqu  @XMM[5], 0x70($out)
-       lea     0x80($out), $out
-       paddd   0x70(%r11), @XMM[0]     # .LADD8
-       # the zero flag tested here is still the one set by the sub above;
-       # the lea and SSE instructions in between do not modify flags
-       jnz     .Lctr_enc_loop
-
-       jmp     .Lctr_enc_done
-.align 16
-.Lctr_enc_loop_done:
-       # 1..7 blocks remain and their encrypted counters are already in
-       # registers: xor and store one block at a time, leaving as soon
-       # as the remaining count is reached.  Each je reuses the flags of
-       # the preceding cmp.
-       add     \$8, $len
-       movdqu  0x00($inp), @XMM[8]     # load input
-       pxor    @XMM[8], @XMM[0]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       cmp     \$2,$len
-       jb      .Lctr_enc_done
-       movdqu  0x10($inp), @XMM[9]
-       pxor    @XMM[9], @XMM[1]
-       movdqu  @XMM[1], 0x10($out)
-       je      .Lctr_enc_done
-       movdqu  0x20($inp), @XMM[10]
-       pxor    @XMM[10], @XMM[4]
-       movdqu  @XMM[4], 0x20($out)
-       cmp     \$4,$len
-       jb      .Lctr_enc_done
-       movdqu  0x30($inp), @XMM[11]
-       pxor    @XMM[11], @XMM[6]
-       movdqu  @XMM[6], 0x30($out)
-       je      .Lctr_enc_done
-       movdqu  0x40($inp), @XMM[12]
-       pxor    @XMM[12], @XMM[3]
-       movdqu  @XMM[3], 0x40($out)
-       cmp     \$6,$len
-       jb      .Lctr_enc_done
-       movdqu  0x50($inp), @XMM[13]
-       pxor    @XMM[13], @XMM[7]
-       movdqu  @XMM[7], 0x50($out)
-       je      .Lctr_enc_done
-       movdqu  0x60($inp), @XMM[14]
-       pxor    @XMM[14], @XMM[2]
-       movdqu  @XMM[2], 0x60($out)
-       jmp     .Lctr_enc_done
-
-.align 16
-.Lctr_enc_short:
-       # Fewer than 8 blocks in total.  Per block the counter block at
-       # 0x20(%rbp) is passed to asm_AES_encrypt with 0x30(%rbp) as the
-       # result buffer, the result is xored with one input block, and
-       # the last 32-bit word of the counter block (offset 0x2c) is
-       # incremented as a big-endian value.  No key schedule was placed
-       # on the stack on this path, so %rsp equals %rbp and the store
-       # through 0x2c(%rsp) hits the word loaded from 0x2c(%rbp).
-       lea     0x20(%rbp), $arg1
-       lea     0x30(%rbp), $arg2
-       lea     ($key), $arg3
-       call    asm_AES_encrypt
-       movdqu  ($inp), @XMM[1]
-       lea     16($inp), $inp
-       mov     0x2c(%rbp), %eax        # load 32-bit counter
-       bswap   %eax
-       pxor    0x30(%rbp), @XMM[1]
-       inc     %eax                    # increment
-       movdqu  @XMM[1], ($out)
-       bswap   %eax
-       lea     16($out), $out
-       mov     %eax, 0x2c(%rsp)        # save 32-bit counter
-       dec     $len
-       jnz     .Lctr_enc_short
-
-.Lctr_enc_done:
-       lea     (%rsp), %rax
-       pxor    %xmm0, %xmm0
-.Lctr_enc_bzero:                       # wipe key schedule [if any]
-       movdqa  %xmm0, 0x00(%rax)
-       movdqa  %xmm0, 0x10(%rax)
-       lea     0x20(%rax), %rax
-       # cmp sets the flags of %rbp - %rax: keep zeroing in 32-byte
-       # steps while %rax is still below %rbp
-       cmp     %rax, %rbp
-       ja      .Lctr_enc_bzero
-
-       # 0x78 = 0x48 bytes of scratch + six 8-byte pushes
-       lea     0x78(%rbp),%rax
-.cfi_def_cfa   %rax,8
-___
-$code.=<<___ if ($win64);
-       # Win64 build only: reload %xmm6-%xmm15 and skip their save area
-       movaps  0x40(%rbp), %xmm6
-       movaps  0x50(%rbp), %xmm7
-       movaps  0x60(%rbp), %xmm8
-       movaps  0x70(%rbp), %xmm9
-       movaps  0x80(%rbp), %xmm10
-       movaps  0x90(%rbp), %xmm11
-       movaps  0xa0(%rbp), %xmm12
-       movaps  0xb0(%rbp), %xmm13
-       movaps  0xc0(%rbp), %xmm14
-       movaps  0xd0(%rbp), %xmm15
-       lea     0xa0(%rax), %rax
-.Lctr_enc_tail:
-___
-$code.=<<___;
-       mov     -48(%rax), %r15
-.cfi_restore   %r15
-       mov     -40(%rax), %r14
-.cfi_restore   %r14
-       mov     -32(%rax), %r13
-.cfi_restore   %r13
-       mov     -24(%rax), %r12
-.cfi_restore   %r12
-       mov     -16(%rax), %rbx
-.cfi_restore   %rbx
-       mov     -8(%rax), %rbp
-.cfi_restore   %rbp
-       lea     (%rax), %rsp            # restore %rsp
-.cfi_def_cfa_register  %rsp
-.Lctr_enc_epilogue:
-       ret
-.cfi_endproc
-.size  bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-___
-######################################################################
-# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
-#      const AES_KEY *key1, const AES_KEY *key2,
-#      const unsigned char iv[16]);
-#
-my ($twmask,$twres,$twtmp)=@XMM[13..15];
-$arg6=~s/d$//;
-
-$code.=<<___;
-.globl bsaes_xts_encrypt
-.type  bsaes_xts_encrypt,\@abi-omnipotent
-.align 16
-bsaes_xts_encrypt:
-.cfi_startproc
-       mov     %rsp, %rax
-.Lxts_enc_prologue:
-       push    %rbp
-.cfi_push      %rbp
-       push    %rbx
-.cfi_push      %rbx
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-       lea     -0x48(%rsp), %rsp
-.cfi_adjust_cfa_offset 0x48
-___
-$code.=<<___ if ($win64);
-       mov     0xa0(%rsp),$arg5        # pull key2
-       mov     0xa8(%rsp),$arg6        # pull ivp
-       lea     -0xa0(%rsp), %rsp
-       movaps  %xmm6, 0x40(%rsp)
-       movaps  %xmm7, 0x50(%rsp)
-       movaps  %xmm8, 0x60(%rsp)
-       movaps  %xmm9, 0x70(%rsp)
-       movaps  %xmm10, 0x80(%rsp)
-       movaps  %xmm11, 0x90(%rsp)
-       movaps  %xmm12, 0xa0(%rsp)
-       movaps  %xmm13, 0xb0(%rsp)
-       movaps  %xmm14, 0xc0(%rsp)
-       movaps  %xmm15, 0xd0(%rsp)
-.Lxts_enc_body:
-___
-$code.=<<___;
-       mov     %rsp, %rbp              # backup %rsp
-.cfi_def_cfa_register  %rbp
-       mov     $arg1, $inp             # backup arguments
-       mov     $arg2, $out
-       mov     $arg3, $len
-       mov     $arg4, $key
-
-       lea     ($arg6), $arg1
-       lea     0x20(%rbp), $arg2
-       lea     ($arg5), $arg3
-       call    asm_AES_encrypt         # generate initial tweak
-
-       mov     240($key), %eax         # rounds
-       mov     $len, %rbx              # backup $len
-
-       mov     %eax, %edx              # rounds
-       shl     \$7, %rax               # 128 bytes per inner round key
-       sub     \$`128-32`, %rax        # size of bit-sliced key schedule
-       sub     %rax, %rsp
-
-       mov     %rsp, %rax              # pass key schedule
-       mov     $key, %rcx              # pass key
-       mov     %edx, %r10d             # pass rounds
-       call    _bsaes_key_convert
-       pxor    %xmm6, %xmm7            # fix up last round key
-       movdqa  %xmm7, (%rax)           # save last round key
-
-       and     \$-16, $len
-       sub     \$0x80, %rsp            # place for tweak[8]
-       movdqa  0x20(%rbp), @XMM[7]     # initial tweak
-
-       pxor    $twtmp, $twtmp
-       movdqa  .Lxts_magic(%rip), $twmask
-       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
-
-       sub     \$0x80, $len
-       jc      .Lxts_enc_short
-       jmp     .Lxts_enc_loop
-
-.align 16
-.Lxts_enc_loop:
-___
-    for ($i=0;$i<7;$i++) {
-    $code.=<<___;
-       pshufd  \$0x13, $twtmp, $twres
-       pxor    $twtmp, $twtmp
-       movdqa  @XMM[7], @XMM[$i]
-       movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
-       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
-       pand    $twmask, $twres         # isolate carry and residue
-       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
-       pxor    $twres, @XMM[7]
-___
-    $code.=<<___ if ($i>=1);
-       movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
-___
-    $code.=<<___ if ($i>=2);
-       pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
-___
-    }
-$code.=<<___;
-       movdqu  0x60($inp), @XMM[8+6]
-       pxor    @XMM[8+5], @XMM[5]
-       movdqu  0x70($inp), @XMM[8+7]
-       lea     0x80($inp), $inp
-       movdqa  @XMM[7], 0x70(%rsp)
-       pxor    @XMM[8+6], @XMM[6]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       pxor    @XMM[8+7], @XMM[7]
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_encrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[4]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    0x30(%rsp), @XMM[6]
-       movdqu  @XMM[4], 0x20($out)
-       pxor    0x40(%rsp), @XMM[3]
-       movdqu  @XMM[6], 0x30($out)
-       pxor    0x50(%rsp), @XMM[7]
-       movdqu  @XMM[3], 0x40($out)
-       pxor    0x60(%rsp), @XMM[2]
-       movdqu  @XMM[7], 0x50($out)
-       pxor    0x70(%rsp), @XMM[5]
-       movdqu  @XMM[2], 0x60($out)
-       movdqu  @XMM[5], 0x70($out)
-       lea     0x80($out), $out
-
-       movdqa  0x70(%rsp), @XMM[7]     # prepare next iteration tweak
-       pxor    $twtmp, $twtmp
-       movdqa  .Lxts_magic(%rip), $twmask
-       pcmpgtd @XMM[7], $twtmp
-       pshufd  \$0x13, $twtmp, $twres
-       pxor    $twtmp, $twtmp
-       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
-       pand    $twmask, $twres         # isolate carry and residue
-       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
-       pxor    $twres, @XMM[7]
-
-       sub     \$0x80,$len
-       jnc     .Lxts_enc_loop
-
-.Lxts_enc_short:
-       add     \$0x80, $len
-       jz      .Lxts_enc_done
-___
-    for ($i=0;$i<7;$i++) {
-    $code.=<<___;
-       pshufd  \$0x13, $twtmp, $twres
-       pxor    $twtmp, $twtmp
-       movdqa  @XMM[7], @XMM[$i]
-       movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
-       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
-       pand    $twmask, $twres         # isolate carry and residue
-       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
-       pxor    $twres, @XMM[7]
-___
-    $code.=<<___ if ($i>=1);
-       movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
-       cmp     \$`0x10*$i`,$len
-       je      .Lxts_enc_$i
-___
-    $code.=<<___ if ($i>=2);
-       pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
-___
-    }
-$code.=<<___;
-       movdqu  0x60($inp), @XMM[8+6]
-       pxor    @XMM[8+5], @XMM[5]
-       movdqa  @XMM[7], 0x70(%rsp)
-       lea     0x70($inp), $inp
-       pxor    @XMM[8+6], @XMM[6]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_encrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[4]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    0x30(%rsp), @XMM[6]
-       movdqu  @XMM[4], 0x20($out)
-       pxor    0x40(%rsp), @XMM[3]
-       movdqu  @XMM[6], 0x30($out)
-       pxor    0x50(%rsp), @XMM[7]
-       movdqu  @XMM[3], 0x40($out)
-       pxor    0x60(%rsp), @XMM[2]
-       movdqu  @XMM[7], 0x50($out)
-       movdqu  @XMM[2], 0x60($out)
-       lea     0x70($out), $out
-
-       movdqa  0x70(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_enc_done
-.align 16
-.Lxts_enc_6:
-       pxor    @XMM[8+4], @XMM[4]
-       lea     0x60($inp), $inp
-       pxor    @XMM[8+5], @XMM[5]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_encrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[4]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    0x30(%rsp), @XMM[6]
-       movdqu  @XMM[4], 0x20($out)
-       pxor    0x40(%rsp), @XMM[3]
-       movdqu  @XMM[6], 0x30($out)
-       pxor    0x50(%rsp), @XMM[7]
-       movdqu  @XMM[3], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       lea     0x60($out), $out
-
-       movdqa  0x60(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_enc_done
-.align 16
-.Lxts_enc_5:
-       pxor    @XMM[8+3], @XMM[3]
-       lea     0x50($inp), $inp
-       pxor    @XMM[8+4], @XMM[4]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_encrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[4]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    0x30(%rsp), @XMM[6]
-       movdqu  @XMM[4], 0x20($out)
-       pxor    0x40(%rsp), @XMM[3]
-       movdqu  @XMM[6], 0x30($out)
-       movdqu  @XMM[3], 0x40($out)
-       lea     0x50($out), $out
-
-       movdqa  0x50(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_enc_done
-.align 16
-.Lxts_enc_4:
-       pxor    @XMM[8+2], @XMM[2]
-       lea     0x40($inp), $inp
-       pxor    @XMM[8+3], @XMM[3]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_encrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[4]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    0x30(%rsp), @XMM[6]
-       movdqu  @XMM[4], 0x20($out)
-       movdqu  @XMM[6], 0x30($out)
-       lea     0x40($out), $out
-
-       movdqa  0x40(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_enc_done
-.align 16
-.Lxts_enc_3:
-       pxor    @XMM[8+1], @XMM[1]
-       lea     0x30($inp), $inp
-       pxor    @XMM[8+2], @XMM[2]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_encrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[4]
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[4], 0x20($out)
-       lea     0x30($out), $out
-
-       movdqa  0x30(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_enc_done
-.align 16
-.Lxts_enc_2:
-       pxor    @XMM[8+0], @XMM[0]
-       lea     0x20($inp), $inp
-       pxor    @XMM[8+1], @XMM[1]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_encrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       lea     0x20($out), $out
-
-       movdqa  0x20(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_enc_done
-.align 16
-.Lxts_enc_1:
-       pxor    @XMM[0], @XMM[8]
-       lea     0x10($inp), $inp
-       movdqa  @XMM[8], 0x20(%rbp)
-       lea     0x20(%rbp), $arg1
-       lea     0x20(%rbp), $arg2
-       lea     ($key), $arg3
-       call    asm_AES_encrypt         # doesn't touch %xmm
-       pxor    0x20(%rbp), @XMM[0]     # ^= tweak[]
-       #pxor   @XMM[8], @XMM[0]
-       #lea    0x80(%rsp), %rax        # pass key schedule
-       #mov    %edx, %r10d             # pass rounds
-       #call   _bsaes_encrypt8
-       #pxor   0x00(%rsp), @XMM[0]     # ^= tweak[]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       lea     0x10($out), $out
-
-       movdqa  0x10(%rsp), @XMM[7]     # next iteration tweak
-
-.Lxts_enc_done:
-       and     \$15, %ebx
-       jz      .Lxts_enc_ret
-       mov     $out, %rdx
-
-.Lxts_enc_steal:
-       movzb   ($inp), %eax
-       movzb   -16(%rdx), %ecx
-       lea     1($inp), $inp
-       mov     %al, -16(%rdx)
-       mov     %cl, 0(%rdx)
-       lea     1(%rdx), %rdx
-       sub     \$1,%ebx
-       jnz     .Lxts_enc_steal
-
-       movdqu  -16($out), @XMM[0]
-       lea     0x20(%rbp), $arg1
-       pxor    @XMM[7], @XMM[0]
-       lea     0x20(%rbp), $arg2
-       movdqa  @XMM[0], 0x20(%rbp)
-       lea     ($key), $arg3
-       call    asm_AES_encrypt         # doesn't touch %xmm
-       pxor    0x20(%rbp), @XMM[7]
-       movdqu  @XMM[7], -16($out)
-
-.Lxts_enc_ret:
-       lea     (%rsp), %rax
-       pxor    %xmm0, %xmm0
-.Lxts_enc_bzero:                       # wipe key schedule [if any]
-       movdqa  %xmm0, 0x00(%rax)
-       movdqa  %xmm0, 0x10(%rax)
-       lea     0x20(%rax), %rax
-       cmp     %rax, %rbp
-       ja      .Lxts_enc_bzero
-
-       lea     0x78(%rbp),%rax
-.cfi_def_cfa   %rax,8
-___
-$code.=<<___ if ($win64);
-       movaps  0x40(%rbp), %xmm6
-       movaps  0x50(%rbp), %xmm7
-       movaps  0x60(%rbp), %xmm8
-       movaps  0x70(%rbp), %xmm9
-       movaps  0x80(%rbp), %xmm10
-       movaps  0x90(%rbp), %xmm11
-       movaps  0xa0(%rbp), %xmm12
-       movaps  0xb0(%rbp), %xmm13
-       movaps  0xc0(%rbp), %xmm14
-       movaps  0xd0(%rbp), %xmm15
-       lea     0xa0(%rax), %rax
-.Lxts_enc_tail:
-___
-$code.=<<___;
-       mov     -48(%rax), %r15
-.cfi_restore   %r15
-       mov     -40(%rax), %r14
-.cfi_restore   %r14
-       mov     -32(%rax), %r13
-.cfi_restore   %r13
-       mov     -24(%rax), %r12
-.cfi_restore   %r12
-       mov     -16(%rax), %rbx
-.cfi_restore   %rbx
-       mov     -8(%rax), %rbp
-.cfi_restore   %rbp
-       lea     (%rax), %rsp            # restore %rsp
-.cfi_def_cfa_register  %rsp
-.Lxts_enc_epilogue:
-       ret
-.cfi_endproc
-.size  bsaes_xts_encrypt,.-bsaes_xts_encrypt
-
-.globl bsaes_xts_decrypt
-.type  bsaes_xts_decrypt,\@abi-omnipotent
-.align 16
-bsaes_xts_decrypt:
-.cfi_startproc
-       mov     %rsp, %rax
-.Lxts_dec_prologue:
-       push    %rbp
-.cfi_push      %rbp
-       push    %rbx
-.cfi_push      %rbx
-       push    %r12
-.cfi_push      %r12
-       push    %r13
-.cfi_push      %r13
-       push    %r14
-.cfi_push      %r14
-       push    %r15
-.cfi_push      %r15
-       lea     -0x48(%rsp), %rsp
-.cfi_adjust_cfa_offset 0x48
-___
-$code.=<<___ if ($win64);
-       mov     0xa0(%rsp),$arg5        # pull key2
-       mov     0xa8(%rsp),$arg6        # pull ivp
-       lea     -0xa0(%rsp), %rsp
-       movaps  %xmm6, 0x40(%rsp)
-       movaps  %xmm7, 0x50(%rsp)
-       movaps  %xmm8, 0x60(%rsp)
-       movaps  %xmm9, 0x70(%rsp)
-       movaps  %xmm10, 0x80(%rsp)
-       movaps  %xmm11, 0x90(%rsp)
-       movaps  %xmm12, 0xa0(%rsp)
-       movaps  %xmm13, 0xb0(%rsp)
-       movaps  %xmm14, 0xc0(%rsp)
-       movaps  %xmm15, 0xd0(%rsp)
-.Lxts_dec_body:
-___
-$code.=<<___;
-       mov     %rsp, %rbp              # backup %rsp
-       mov     $arg1, $inp             # backup arguments
-       mov     $arg2, $out
-       mov     $arg3, $len
-       mov     $arg4, $key
-
-       lea     ($arg6), $arg1
-       lea     0x20(%rbp), $arg2
-       lea     ($arg5), $arg3
-       call    asm_AES_encrypt         # generate initial tweak
-
-       mov     240($key), %eax         # rounds
-       mov     $len, %rbx              # backup $len
-
-       mov     %eax, %edx              # rounds
-       shl     \$7, %rax               # 128 bytes per inner round key
-       sub     \$`128-32`, %rax        # size of bit-sliced key schedule
-       sub     %rax, %rsp
-
-       mov     %rsp, %rax              # pass key schedule
-       mov     $key, %rcx              # pass key
-       mov     %edx, %r10d             # pass rounds
-       call    _bsaes_key_convert
-       pxor    (%rsp), %xmm7           # fix up round 0 key
-       movdqa  %xmm6, (%rax)           # save last round key
-       movdqa  %xmm7, (%rsp)
-
-       xor     %eax, %eax              # if ($len%16) len-=16;
-       and     \$-16, $len
-       test    \$15, %ebx
-       setnz   %al
-       shl     \$4, %rax
-       sub     %rax, $len
-
-       sub     \$0x80, %rsp            # place for tweak[8]
-       movdqa  0x20(%rbp), @XMM[7]     # initial tweak
-
-       pxor    $twtmp, $twtmp
-       movdqa  .Lxts_magic(%rip), $twmask
-       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
-
-       sub     \$0x80, $len
-       jc      .Lxts_dec_short
-       jmp     .Lxts_dec_loop
-
-.align 16
-.Lxts_dec_loop:
-___
-    for ($i=0;$i<7;$i++) {
-    $code.=<<___;
-       pshufd  \$0x13, $twtmp, $twres
-       pxor    $twtmp, $twtmp
-       movdqa  @XMM[7], @XMM[$i]
-       movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
-       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
-       pand    $twmask, $twres         # isolate carry and residue
-       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
-       pxor    $twres, @XMM[7]
-___
-    $code.=<<___ if ($i>=1);
-       movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
-___
-    $code.=<<___ if ($i>=2);
-       pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
-___
-    }
-$code.=<<___;
-       movdqu  0x60($inp), @XMM[8+6]
-       pxor    @XMM[8+5], @XMM[5]
-       movdqu  0x70($inp), @XMM[8+7]
-       lea     0x80($inp), $inp
-       movdqa  @XMM[7], 0x70(%rsp)
-       pxor    @XMM[8+6], @XMM[6]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       pxor    @XMM[8+7], @XMM[7]
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_decrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[6]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    0x30(%rsp), @XMM[4]
-       movdqu  @XMM[6], 0x20($out)
-       pxor    0x40(%rsp), @XMM[2]
-       movdqu  @XMM[4], 0x30($out)
-       pxor    0x50(%rsp), @XMM[7]
-       movdqu  @XMM[2], 0x40($out)
-       pxor    0x60(%rsp), @XMM[3]
-       movdqu  @XMM[7], 0x50($out)
-       pxor    0x70(%rsp), @XMM[5]
-       movdqu  @XMM[3], 0x60($out)
-       movdqu  @XMM[5], 0x70($out)
-       lea     0x80($out), $out
-
-       movdqa  0x70(%rsp), @XMM[7]     # prepare next iteration tweak
-       pxor    $twtmp, $twtmp
-       movdqa  .Lxts_magic(%rip), $twmask
-       pcmpgtd @XMM[7], $twtmp
-       pshufd  \$0x13, $twtmp, $twres
-       pxor    $twtmp, $twtmp
-       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
-       pand    $twmask, $twres         # isolate carry and residue
-       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
-       pxor    $twres, @XMM[7]
-
-       sub     \$0x80,$len
-       jnc     .Lxts_dec_loop
-
-.Lxts_dec_short:
-       add     \$0x80, $len
-       jz      .Lxts_dec_done
-___
-    for ($i=0;$i<7;$i++) {
-    $code.=<<___;
-       pshufd  \$0x13, $twtmp, $twres
-       pxor    $twtmp, $twtmp
-       movdqa  @XMM[7], @XMM[$i]
-       movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
-       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
-       pand    $twmask, $twres         # isolate carry and residue
-       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
-       pxor    $twres, @XMM[7]
-___
-    $code.=<<___ if ($i>=1);
-       movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
-       cmp     \$`0x10*$i`,$len
-       je      .Lxts_dec_$i
-___
-    $code.=<<___ if ($i>=2);
-       pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
-___
-    }
-$code.=<<___;
-       movdqu  0x60($inp), @XMM[8+6]
-       pxor    @XMM[8+5], @XMM[5]
-       movdqa  @XMM[7], 0x70(%rsp)
-       lea     0x70($inp), $inp
-       pxor    @XMM[8+6], @XMM[6]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_decrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[6]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    0x30(%rsp), @XMM[4]
-       movdqu  @XMM[6], 0x20($out)
-       pxor    0x40(%rsp), @XMM[2]
-       movdqu  @XMM[4], 0x30($out)
-       pxor    0x50(%rsp), @XMM[7]
-       movdqu  @XMM[2], 0x40($out)
-       pxor    0x60(%rsp), @XMM[3]
-       movdqu  @XMM[7], 0x50($out)
-       movdqu  @XMM[3], 0x60($out)
-       lea     0x70($out), $out
-
-       movdqa  0x70(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_dec_done
-.align 16
-.Lxts_dec_6:
-       pxor    @XMM[8+4], @XMM[4]
-       lea     0x60($inp), $inp
-       pxor    @XMM[8+5], @XMM[5]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_decrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[6]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    0x30(%rsp), @XMM[4]
-       movdqu  @XMM[6], 0x20($out)
-       pxor    0x40(%rsp), @XMM[2]
-       movdqu  @XMM[4], 0x30($out)
-       pxor    0x50(%rsp), @XMM[7]
-       movdqu  @XMM[2], 0x40($out)
-       movdqu  @XMM[7], 0x50($out)
-       lea     0x60($out), $out
-
-       movdqa  0x60(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_dec_done
-.align 16
-.Lxts_dec_5:
-       pxor    @XMM[8+3], @XMM[3]
-       lea     0x50($inp), $inp
-       pxor    @XMM[8+4], @XMM[4]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_decrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[6]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    0x30(%rsp), @XMM[4]
-       movdqu  @XMM[6], 0x20($out)
-       pxor    0x40(%rsp), @XMM[2]
-       movdqu  @XMM[4], 0x30($out)
-       movdqu  @XMM[2], 0x40($out)
-       lea     0x50($out), $out
-
-       movdqa  0x50(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_dec_done
-.align 16
-.Lxts_dec_4:
-       pxor    @XMM[8+2], @XMM[2]
-       lea     0x40($inp), $inp
-       pxor    @XMM[8+3], @XMM[3]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_decrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[6]
-       movdqu  @XMM[1], 0x10($out)
-       pxor    0x30(%rsp), @XMM[4]
-       movdqu  @XMM[6], 0x20($out)
-       movdqu  @XMM[4], 0x30($out)
-       lea     0x40($out), $out
-
-       movdqa  0x40(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_dec_done
-.align 16
-.Lxts_dec_3:
-       pxor    @XMM[8+1], @XMM[1]
-       lea     0x30($inp), $inp
-       pxor    @XMM[8+2], @XMM[2]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_decrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       pxor    0x20(%rsp), @XMM[6]
-       movdqu  @XMM[1], 0x10($out)
-       movdqu  @XMM[6], 0x20($out)
-       lea     0x30($out), $out
-
-       movdqa  0x30(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_dec_done
-.align 16
-.Lxts_dec_2:
-       pxor    @XMM[8+0], @XMM[0]
-       lea     0x20($inp), $inp
-       pxor    @XMM[8+1], @XMM[1]
-       lea     0x80(%rsp), %rax        # pass key schedule
-       mov     %edx, %r10d             # pass rounds
-
-       call    _bsaes_decrypt8
-
-       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
-       pxor    0x10(%rsp), @XMM[1]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       movdqu  @XMM[1], 0x10($out)
-       lea     0x20($out), $out
-
-       movdqa  0x20(%rsp), @XMM[7]     # next iteration tweak
-       jmp     .Lxts_dec_done
-.align 16
-.Lxts_dec_1:
-       pxor    @XMM[0], @XMM[8]
-       lea     0x10($inp), $inp
-       movdqa  @XMM[8], 0x20(%rbp)
-       lea     0x20(%rbp), $arg1
-       lea     0x20(%rbp), $arg2
-       lea     ($key), $arg3
-       call    asm_AES_decrypt         # doesn't touch %xmm
-       pxor    0x20(%rbp), @XMM[0]     # ^= tweak[]
-       #pxor   @XMM[8], @XMM[0]
-       #lea    0x80(%rsp), %rax        # pass key schedule
-       #mov    %edx, %r10d             # pass rounds
-       #call   _bsaes_decrypt8
-       #pxor   0x00(%rsp), @XMM[0]     # ^= tweak[]
-       movdqu  @XMM[0], 0x00($out)     # write output
-       lea     0x10($out), $out
-
-       movdqa  0x10(%rsp), @XMM[7]     # next iteration tweak
-
-.Lxts_dec_done:
-       and     \$15, %ebx
-       jz      .Lxts_dec_ret
-
-       pxor    $twtmp, $twtmp
-       movdqa  .Lxts_magic(%rip), $twmask
-       pcmpgtd @XMM[7], $twtmp
-       pshufd  \$0x13, $twtmp, $twres
-       movdqa  @XMM[7], @XMM[6]
-       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
-       pand    $twmask, $twres         # isolate carry and residue
-       movdqu  ($inp), @XMM[0]
-       pxor    $twres, @XMM[7]
-
-       lea     0x20(%rbp), $arg1
-       pxor    @XMM[7], @XMM[0]
-       lea     0x20(%rbp), $arg2
-       movdqa  @XMM[0], 0x20(%rbp)
-       lea     ($key), $arg3
-       call    asm_AES_decrypt         # doesn't touch %xmm
-       pxor    0x20(%rbp), @XMM[7]
-       mov     $out, %rdx
-       movdqu  @XMM[7], ($out)
-
-.Lxts_dec_steal:
-       movzb   16($inp), %eax
-       movzb   (%rdx), %ecx
-       lea     1($inp), $inp
-       mov     %al, (%rdx)
-       mov     %cl, 16(%rdx)
-       lea     1(%rdx), %rdx
-       sub     \$1,%ebx
-       jnz     .Lxts_dec_steal
-
-       movdqu  ($out), @XMM[0]
-       lea     0x20(%rbp), $arg1
-       pxor    @XMM[6], @XMM[0]
-       lea     0x20(%rbp), $arg2
-       movdqa  @XMM[0], 0x20(%rbp)
-       lea     ($key), $arg3
-       call    asm_AES_decrypt         # doesn't touch %xmm
-       pxor    0x20(%rbp), @XMM[6]
-       movdqu  @XMM[6], ($out)
-
-.Lxts_dec_ret:
-       lea     (%rsp), %rax
-       pxor    %xmm0, %xmm0
-.Lxts_dec_bzero:                       # wipe key schedule [if any]
-       movdqa  %xmm0, 0x00(%rax)
-       movdqa  %xmm0, 0x10(%rax)
-       lea     0x20(%rax), %rax
-       cmp     %rax, %rbp
-       ja      .Lxts_dec_bzero
-
-       lea     0x78(%rbp),%rax
-.cfi_def_cfa   %rax,8
-___
-$code.=<<___ if ($win64);
-       movaps  0x40(%rbp), %xmm6
-       movaps  0x50(%rbp), %xmm7
-       movaps  0x60(%rbp), %xmm8
-       movaps  0x70(%rbp), %xmm9
-       movaps  0x80(%rbp), %xmm10
-       movaps  0x90(%rbp), %xmm11
-       movaps  0xa0(%rbp), %xmm12
-       movaps  0xb0(%rbp), %xmm13
-       movaps  0xc0(%rbp), %xmm14
-       movaps  0xd0(%rbp), %xmm15
-       lea     0xa0(%rax), %rax
-.Lxts_dec_tail:
-___
-$code.=<<___;
-       mov     -48(%rax), %r15
-.cfi_restore   %r15
-       mov     -40(%rax), %r14
-.cfi_restore   %r14
-       mov     -32(%rax), %r13
-.cfi_restore   %r13
-       mov     -24(%rax), %r12
-.cfi_restore   %r12
-       mov     -16(%rax), %rbx
-.cfi_restore   %rbx
-       mov     -8(%rax), %rbp
-.cfi_restore   %rbp
-       lea     (%rax), %rsp            # restore %rsp
-.cfi_def_cfa_register  %rsp
-.Lxts_dec_epilogue:
-       ret
-.cfi_endproc
-.size  bsaes_xts_decrypt,.-bsaes_xts_decrypt
-___
-}
-$code.=<<___;
-.type  _bsaes_const,\@object
-.align 64
-_bsaes_const:
-.LM0ISR:       # InvShiftRows constants
-       .quad   0x0a0e0206070b0f03, 0x0004080c0d010509
-.LISRM0:
-       .quad   0x01040b0e0205080f, 0x0306090c00070a0d
-.LISR:
-       .quad   0x0504070602010003, 0x0f0e0d0c080b0a09
-.LBS0:         # bit-slice constants
-       .quad   0x5555555555555555, 0x5555555555555555
-.LBS1:
-       .quad   0x3333333333333333, 0x3333333333333333
-.LBS2:
-       .quad   0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
-.LSR:          # shiftrows constants
-       .quad   0x0504070600030201, 0x0f0e0d0c0a09080b
-.LSRM0:
-       .quad   0x0304090e00050a0f, 0x01060b0c0207080d
-.LM0SR:
-       .quad   0x0a0e02060f03070b, 0x0004080c05090d01
-.LSWPUP:       # byte-swap upper dword
-       .quad   0x0706050403020100, 0x0c0d0e0f0b0a0908
-.LSWPUPM0SR:
-       .quad   0x0a0d02060c03070b, 0x0004080f05090e01
-.LADD1:                # counter increment constants
-       .quad   0x0000000000000000, 0x0000000100000000
-.LADD2:
-       .quad   0x0000000000000000, 0x0000000200000000
-.LADD3:
-       .quad   0x0000000000000000, 0x0000000300000000
-.LADD4:
-       .quad   0x0000000000000000, 0x0000000400000000
-.LADD5:
-       .quad   0x0000000000000000, 0x0000000500000000
-.LADD6:
-       .quad   0x0000000000000000, 0x0000000600000000
-.LADD7:
-       .quad   0x0000000000000000, 0x0000000700000000
-.LADD8:
-       .quad   0x0000000000000000, 0x0000000800000000
-.Lxts_magic:
-       .long   0x87,0,1,0
-.Lmasks:
-       .quad   0x0101010101010101, 0x0101010101010101
-       .quad   0x0202020202020202, 0x0202020202020202
-       .quad   0x0404040404040404, 0x0404040404040404
-       .quad   0x0808080808080808, 0x0808080808080808
-.LM0:
-       .quad   0x02060a0e03070b0f, 0x0004080c0105090d
-.L63:
-       .quad   0x6363636363636363, 0x6363636363636363
-.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
-.align 64
-.size  _bsaes_const,.-_bsaes_const
-___
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern        __imp_RtlVirtualUnwind
-.type  se_handler,\@abi-omnipotent
-.align 16
-se_handler:
-       push    %rsi
-       push    %rdi
-       push    %rbx
-       push    %rbp
-       push    %r12
-       push    %r13
-       push    %r14
-       push    %r15
-       pushfq
-       sub     \$64,%rsp
-
-       mov     120($context),%rax      # pull context->Rax
-       mov     248($context),%rbx      # pull context->Rip
-
-       mov     8($disp),%rsi           # disp->ImageBase
-       mov     56($disp),%r11          # disp->HandlerData
-
-       mov     0(%r11),%r10d           # HandlerData[0]
-       lea     (%rsi,%r10),%r10        # prologue label
-       cmp     %r10,%rbx               # context->Rip<=prologue label
-       jbe     .Lin_prologue
-
-       mov     4(%r11),%r10d           # HandlerData[1]
-       lea     (%rsi,%r10),%r10        # epilogue label
-       cmp     %r10,%rbx               # context->Rip>=epilogue label
-       jae     .Lin_prologue
-
-       mov     8(%r11),%r10d           # HandlerData[2]
-       lea     (%rsi,%r10),%r10        # epilogue label
-       cmp     %r10,%rbx               # context->Rip>=tail label
-       jae     .Lin_tail
-
-       mov     160($context),%rax      # pull context->Rbp
-
-       lea     0x40(%rax),%rsi         # %xmm save area
-       lea     512($context),%rdi      # &context.Xmm6
-       mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
-       .long   0xa548f3fc              # cld; rep movsq
-       lea     0xa0+0x78(%rax),%rax    # adjust stack pointer
-
-.Lin_tail:
-       mov     -48(%rax),%rbp
-       mov     -40(%rax),%rbx
-       mov     -32(%rax),%r12
-       mov     -24(%rax),%r13
-       mov     -16(%rax),%r14
-       mov     -8(%rax),%r15
-       mov     %rbx,144($context)      # restore context->Rbx
-       mov     %rbp,160($context)      # restore context->Rbp
-       mov     %r12,216($context)      # restore context->R12
-       mov     %r13,224($context)      # restore context->R13
-       mov     %r14,232($context)      # restore context->R14
-       mov     %r15,240($context)      # restore context->R15
-
-.Lin_prologue:
-       mov     %rax,152($context)      # restore context->Rsp
-
-       mov     40($disp),%rdi          # disp->ContextRecord
-       mov     $context,%rsi           # context
-       mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
-       .long   0xa548f3fc              # cld; rep movsq
-
-       mov     $disp,%rsi
-       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
-       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
-       mov     0(%rsi),%r8             # arg3, disp->ControlPc
-       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
-       mov     40(%rsi),%r10           # disp->ContextRecord
-       lea     56(%rsi),%r11           # &disp->HandlerData
-       lea     24(%rsi),%r12           # &disp->EstablisherFrame
-       mov     %r10,32(%rsp)           # arg5
-       mov     %r11,40(%rsp)           # arg6
-       mov     %r12,48(%rsp)           # arg7
-       mov     %rcx,56(%rsp)           # arg8, (NULL)
-       call    *__imp_RtlVirtualUnwind(%rip)
-
-       mov     \$1,%eax                # ExceptionContinueSearch
-       add     \$64,%rsp
-       popfq
-       pop     %r15
-       pop     %r14
-       pop     %r13
-       pop     %r12
-       pop     %rbp
-       pop     %rbx
-       pop     %rdi
-       pop     %rsi
-       ret
-.size  se_handler,.-se_handler
-
-.section       .pdata
-.align 4
-___
-$code.=<<___ if ($ecb);
-       .rva    .Lecb_enc_prologue
-       .rva    .Lecb_enc_epilogue
-       .rva    .Lecb_enc_info
-
-       .rva    .Lecb_dec_prologue
-       .rva    .Lecb_dec_epilogue
-       .rva    .Lecb_dec_info
-___
-$code.=<<___;
-       .rva    .Lcbc_dec_prologue
-       .rva    .Lcbc_dec_epilogue
-       .rva    .Lcbc_dec_info
-
-       .rva    .Lctr_enc_prologue
-       .rva    .Lctr_enc_epilogue
-       .rva    .Lctr_enc_info
-
-       .rva    .Lxts_enc_prologue
-       .rva    .Lxts_enc_epilogue
-       .rva    .Lxts_enc_info
-
-       .rva    .Lxts_dec_prologue
-       .rva    .Lxts_dec_epilogue
-       .rva    .Lxts_dec_info
-
-.section       .xdata
-.align 8
-___
-$code.=<<___ if ($ecb);
-.Lecb_enc_info:
-       .byte   9,0,0,0
-       .rva    se_handler
-       .rva    .Lecb_enc_body,.Lecb_enc_epilogue       # HandlerData[]
-       .rva    .Lecb_enc_tail
-       .long   0
-.Lecb_dec_info:
-       .byte   9,0,0,0
-       .rva    se_handler
-       .rva    .Lecb_dec_body,.Lecb_dec_epilogue       # HandlerData[]
-       .rva    .Lecb_dec_tail
-       .long   0
-___
-$code.=<<___;
-.Lcbc_dec_info:
-       .byte   9,0,0,0
-       .rva    se_handler
-       .rva    .Lcbc_dec_body,.Lcbc_dec_epilogue       # HandlerData[]
-       .rva    .Lcbc_dec_tail
-       .long   0
-.Lctr_enc_info:
-       .byte   9,0,0,0
-       .rva    se_handler
-       .rva    .Lctr_enc_body,.Lctr_enc_epilogue       # HandlerData[]
-       .rva    .Lctr_enc_tail
-       .long   0
-.Lxts_enc_info:
-       .byte   9,0,0,0
-       .rva    se_handler
-       .rva    .Lxts_enc_body,.Lxts_enc_epilogue       # HandlerData[]
-       .rva    .Lxts_enc_tail
-       .long   0
-.Lxts_dec_info:
-       .byte   9,0,0,0
-       .rva    se_handler
-       .rva    .Lxts_dec_body,.Lxts_dec_epilogue       # HandlerData[]
-       .rva    .Lxts_dec_tail
-       .long   0
-___
-}
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-print $code;
-
-close STDOUT;
index 90a410b00db7fa4daa5803d60c2b9967339778e5..1db346fc864bb4b424d656ff9ef0a172192e2185 100644 (file)
@@ -176,7 +176,7 @@ static void ctr64_inc(unsigned char *counter)
 # define HWAES_xts_decrypt aes_p8_xts_decrypt
 #endif
 
-#if     defined(AES_ASM) && !defined(I386_ONLY) &&      (  \
+#if     !defined(OPENSSL_NO_ASM) &&                     (  \
         ((defined(__i386)       || defined(__i386__)    || \
           defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
         defined(__x86_64)       || defined(__x86_64__)  || \