-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2009-2019 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
-# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
#
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved wih CBC-MAC. This provides ~30% improvement over
-# "straghtforward" CCM implementation with CTR and CBC-MAC performed
+# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
-# instructions' interleave factor. Westmere can execute at most 3
+# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 come from. "Optimal
# interleave factor" means that increase of interleave factor does
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
-# additional instructions with AES ones, but even AES instuctions
+# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
-# utilizies 6x interleave because of limited register bank capacity.
+# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
+# chosen to be 6x.
+
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
-# CBC en-/decrypt CTR XTS ECB
+# CBC en-/decrypt CTR XTS ECB OCB
# Westmere 3.77/1.25 1.25 1.25 1.26
-# * Bridge 5.07/0.74 0.75 0.90 0.85
-# Haswell 4.44/0.63 0.63 0.73 0.63
-# Silvermont 5.75/3.54 3.56 4.12 3.87(*)
-# Bulldozer 5.77/0.70 0.72 0.90 0.70
+# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
+# Haswell 4.44/0.63 0.63 0.73 0.63 0.70
+# Skylake 2.62/0.63 0.63 0.63 0.63
+# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
+# Knights L 2.54/0.77 0.78 0.85 - 1.50
+# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
+# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
+# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49
#
# (*) Atom Silvermont ECB result is suboptimal because of penalties
# incurred by operations on %xmm8-15. As ECB is not considered
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
.type ${PREFIX}_encrypt,\@abi-omnipotent
.align 16
${PREFIX}_encrypt:
+.cfi_startproc
movups ($inp),$inout0 # load input
mov 240($key),$rounds # key->rounds
___
movups $inout0,($out) # output
pxor $inout0,$inout0
ret
+.cfi_endproc
.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
.globl ${PREFIX}_decrypt
.type ${PREFIX}_decrypt,\@abi-omnipotent
.align 16
${PREFIX}_decrypt:
+.cfi_startproc
movups ($inp),$inout0 # load input
mov 240($key),$rounds # key->rounds
___
movups $inout0,($out) # output
pxor $inout0,$inout0
ret
+.cfi_endproc
.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
-# * Bridge and "super-optimal" for other Intel CPUs...
+# * Bridge and "super-optimal" for other Intel CPUs...
sub aesni_generate2 {
my $dir=shift;
.type _aesni_${dir}rypt2,\@abi-omnipotent
.align 16
_aesni_${dir}rypt2:
+.cfi_startproc
$movkey ($key),$rndkey0
shl \$4,$rounds
$movkey 16($key),$rndkey1
aes${dir}last $rndkey0,$inout0
aes${dir}last $rndkey0,$inout1
ret
+.cfi_endproc
.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
.type _aesni_${dir}rypt3,\@abi-omnipotent
.align 16
_aesni_${dir}rypt3:
+.cfi_startproc
$movkey ($key),$rndkey0
shl \$4,$rounds
$movkey 16($key),$rndkey1
aes${dir}last $rndkey0,$inout1
aes${dir}last $rndkey0,$inout2
ret
+.cfi_endproc
.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
.type _aesni_${dir}rypt4,\@abi-omnipotent
.align 16
_aesni_${dir}rypt4:
+.cfi_startproc
$movkey ($key),$rndkey0
shl \$4,$rounds
$movkey 16($key),$rndkey1
aes${dir}last $rndkey0,$inout2
aes${dir}last $rndkey0,$inout3
ret
+.cfi_endproc
.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
.type _aesni_${dir}rypt6,\@abi-omnipotent
.align 16
_aesni_${dir}rypt6:
+.cfi_startproc
$movkey ($key),$rndkey0
shl \$4,$rounds
$movkey 16($key),$rndkey1
aes${dir}last $rndkey0,$inout4
aes${dir}last $rndkey0,$inout5
ret
+.cfi_endproc
.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
.type _aesni_${dir}rypt8,\@abi-omnipotent
.align 16
_aesni_${dir}rypt8:
+.cfi_startproc
$movkey ($key),$rndkey0
shl \$4,$rounds
$movkey 16($key),$rndkey1
aes${dir}last $rndkey0,$inout6
aes${dir}last $rndkey0,$inout7
ret
+.cfi_endproc
.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
.type aesni_ecb_encrypt,\@function,5
.align 16
aesni_ecb_encrypt:
+.cfi_startproc
___
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
___
$code.=<<___;
ret
+.cfi_endproc
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
___
\f
# with zero-round key xor.
{
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
-my ($key0,$ctr)=("${key_}d","${ivp}d");
+my ($key0,$ctr)=("%ebp","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);
$code.=<<___;
.type aesni_ctr32_encrypt_blocks,\@function,5
.align 16
aesni_ctr32_encrypt_blocks:
+.cfi_startproc
cmp \$1,$len
jne .Lctr32_bulk
.align 16
.Lctr32_bulk:
- lea (%rsp),%rax
+ lea (%rsp),$key_ # use $key_ as frame pointer
+.cfi_def_cfa_register $key_
push %rbp
+.cfi_push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%rax) # offload everything
- movaps %xmm7,-0x98(%rax)
- movaps %xmm8,-0x88(%rax)
- movaps %xmm9,-0x78(%rax)
- movaps %xmm10,-0x68(%rax)
- movaps %xmm11,-0x58(%rax)
- movaps %xmm12,-0x48(%rax)
- movaps %xmm13,-0x38(%rax)
- movaps %xmm14,-0x28(%rax)
- movaps %xmm15,-0x18(%rax)
+ movaps %xmm6,-0xa8($key_) # offload everything
+ movaps %xmm7,-0x98($key_)
+ movaps %xmm8,-0x88($key_)
+ movaps %xmm9,-0x78($key_)
+ movaps %xmm10,-0x68($key_)
+ movaps %xmm11,-0x58($key_)
+ movaps %xmm12,-0x48($key_)
+ movaps %xmm13,-0x38($key_)
+ movaps %xmm14,-0x28($key_)
+ movaps %xmm15,-0x18($key_)
.Lctr32_body:
___
$code.=<<___;
- lea -8(%rax),%rbp
# 8 16-byte words on top of stack are counter values
# xor-ed with zero-round key
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
- mov OPENSSL_ia32cap_P+4(%rip),%r10d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
xor $key0,%r9d
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
mov %r9d,0x70+12(%rsp)
sub \$8,$len
jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
- add \$8,$len # restore real remainig $len
+ add \$8,$len # restore real remaining $len
jz .Lctr32_done # done if ($len==0)
lea -0x80($key),$key
.Lctr32_tail:
# note that at this point $inout0..5 are populated with
- # counter values xor-ed with 0-round key
+ # counter values xor-ed with 0-round key
lea 16($key),$key
cmp \$4,$len
jb .Lctr32_loop3
movups $inout2,0x20($out) # $len was 3, stop store
.Lctr32_done:
- xorps %xmm0,%xmm0 # clear regiser bank
+ xorps %xmm0,%xmm0 # clear register bank
xor $key0,$key0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps -0xa0(%rbp),%xmm6
- movaps %xmm0,-0xa0(%rbp) # clear stack
- movaps -0x90(%rbp),%xmm7
- movaps %xmm0,-0x90(%rbp)
- movaps -0x80(%rbp),%xmm8
- movaps %xmm0,-0x80(%rbp)
- movaps -0x70(%rbp),%xmm9
- movaps %xmm0,-0x70(%rbp)
- movaps -0x60(%rbp),%xmm10
- movaps %xmm0,-0x60(%rbp)
- movaps -0x50(%rbp),%xmm11
- movaps %xmm0,-0x50(%rbp)
- movaps -0x40(%rbp),%xmm12
- movaps %xmm0,-0x40(%rbp)
- movaps -0x30(%rbp),%xmm13
- movaps %xmm0,-0x30(%rbp)
- movaps -0x20(%rbp),%xmm14
- movaps %xmm0,-0x20(%rbp)
- movaps -0x10(%rbp),%xmm15
- movaps %xmm0,-0x10(%rbp)
+ movaps -0xa8($key_),%xmm6
+ movaps %xmm0,-0xa8($key_) # clear stack
+ movaps -0x98($key_),%xmm7
+ movaps %xmm0,-0x98($key_)
+ movaps -0x88($key_),%xmm8
+ movaps %xmm0,-0x88($key_)
+ movaps -0x78($key_),%xmm9
+ movaps %xmm0,-0x78($key_)
+ movaps -0x68($key_),%xmm10
+ movaps %xmm0,-0x68($key_)
+ movaps -0x58($key_),%xmm11
+ movaps %xmm0,-0x58($key_)
+ movaps -0x48($key_),%xmm12
+ movaps %xmm0,-0x48($key_)
+ movaps -0x38($key_),%xmm13
+ movaps %xmm0,-0x38($key_)
+ movaps -0x28($key_),%xmm14
+ movaps %xmm0,-0x28($key_)
+ movaps -0x18($key_),%xmm15
+ movaps %xmm0,-0x18($key_)
movaps %xmm0,0x00(%rsp)
movaps %xmm0,0x10(%rsp)
movaps %xmm0,0x20(%rsp)
movaps %xmm0,0x70(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8($key_),%rbp
+.cfi_restore %rbp
+ lea ($key_),%rsp
+.cfi_def_cfa_register %rsp
.Lctr32_epilogue:
ret
+.cfi_endproc
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
+my $key_ = "%rbp"; # override so that we can use %r11 as FP
$code.=<<___;
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,\@function,6
.align 16
aesni_xts_encrypt:
- lea (%rsp),%rax
+.cfi_startproc
+ lea (%rsp),%r11 # frame pointer
+.cfi_def_cfa_register %r11
push %rbp
+.cfi_push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%rax) # offload everything
- movaps %xmm7,-0x98(%rax)
- movaps %xmm8,-0x88(%rax)
- movaps %xmm9,-0x78(%rax)
- movaps %xmm10,-0x68(%rax)
- movaps %xmm11,-0x58(%rax)
- movaps %xmm12,-0x48(%rax)
- movaps %xmm13,-0x38(%rax)
- movaps %xmm14,-0x28(%rax)
- movaps %xmm15,-0x18(%rax)
+ movaps %xmm6,-0xa8(%r11) # offload everything
+ movaps %xmm7,-0x98(%r11)
+ movaps %xmm8,-0x88(%r11)
+ movaps %xmm9,-0x78(%r11)
+ movaps %xmm10,-0x68(%r11)
+ movaps %xmm11,-0x58(%r11)
+ movaps %xmm12,-0x48(%r11)
+ movaps %xmm13,-0x38(%r11)
+ movaps %xmm14,-0x28(%r11)
+ movaps %xmm15,-0x18(%r11)
.Lxts_enc_body:
___
$code.=<<___;
- lea -8(%rax),%rbp
movups ($ivp),$inout0 # load clear-text tweak
mov 240(%r8),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
lea `16*6`($inp),$inp
pxor $twmask,$inout5
- pxor $twres,@tweak[0] # calclulate tweaks^round[last]
+ pxor $twres,@tweak[0] # calculate tweaks^round[last]
aesenc $rndkey1,$inout4
pxor $twres,@tweak[1]
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps -0xa0(%rbp),%xmm6
- movaps %xmm0,-0xa0(%rbp) # clear stack
- movaps -0x90(%rbp),%xmm7
- movaps %xmm0,-0x90(%rbp)
- movaps -0x80(%rbp),%xmm8
- movaps %xmm0,-0x80(%rbp)
- movaps -0x70(%rbp),%xmm9
- movaps %xmm0,-0x70(%rbp)
- movaps -0x60(%rbp),%xmm10
- movaps %xmm0,-0x60(%rbp)
- movaps -0x50(%rbp),%xmm11
- movaps %xmm0,-0x50(%rbp)
- movaps -0x40(%rbp),%xmm12
- movaps %xmm0,-0x40(%rbp)
- movaps -0x30(%rbp),%xmm13
- movaps %xmm0,-0x30(%rbp)
- movaps -0x20(%rbp),%xmm14
- movaps %xmm0,-0x20(%rbp)
- movaps -0x10(%rbp),%xmm15
- movaps %xmm0,-0x10(%rbp)
+ movaps -0xa8(%r11),%xmm6
+ movaps %xmm0,-0xa8(%r11) # clear stack
+ movaps -0x98(%r11),%xmm7
+ movaps %xmm0,-0x98(%r11)
+ movaps -0x88(%r11),%xmm8
+ movaps %xmm0,-0x88(%r11)
+ movaps -0x78(%r11),%xmm9
+ movaps %xmm0,-0x78(%r11)
+ movaps -0x68(%r11),%xmm10
+ movaps %xmm0,-0x68(%r11)
+ movaps -0x58(%r11),%xmm11
+ movaps %xmm0,-0x58(%r11)
+ movaps -0x48(%r11),%xmm12
+ movaps %xmm0,-0x48(%r11)
+ movaps -0x38(%r11),%xmm13
+ movaps %xmm0,-0x38(%r11)
+ movaps -0x28(%r11),%xmm14
+ movaps %xmm0,-0x28(%r11)
+ movaps -0x18(%r11),%xmm15
+ movaps %xmm0,-0x18(%r11)
movaps %xmm0,0x00(%rsp)
movaps %xmm0,0x10(%rsp)
movaps %xmm0,0x20(%rsp)
movaps %xmm0,0x60(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8(%r11),%rbp
+.cfi_restore %rbp
+ lea (%r11),%rsp
+.cfi_def_cfa_register %rsp
.Lxts_enc_epilogue:
ret
+.cfi_endproc
.size aesni_xts_encrypt,.-aesni_xts_encrypt
___
.type aesni_xts_decrypt,\@function,6
.align 16
aesni_xts_decrypt:
- lea (%rsp),%rax
+.cfi_startproc
+ lea (%rsp),%r11 # frame pointer
+.cfi_def_cfa_register %r11
push %rbp
+.cfi_push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%rax) # offload everything
- movaps %xmm7,-0x98(%rax)
- movaps %xmm8,-0x88(%rax)
- movaps %xmm9,-0x78(%rax)
- movaps %xmm10,-0x68(%rax)
- movaps %xmm11,-0x58(%rax)
- movaps %xmm12,-0x48(%rax)
- movaps %xmm13,-0x38(%rax)
- movaps %xmm14,-0x28(%rax)
- movaps %xmm15,-0x18(%rax)
+ movaps %xmm6,-0xa8(%r11) # offload everything
+ movaps %xmm7,-0x98(%r11)
+ movaps %xmm8,-0x88(%r11)
+ movaps %xmm9,-0x78(%r11)
+ movaps %xmm10,-0x68(%r11)
+ movaps %xmm11,-0x58(%r11)
+ movaps %xmm12,-0x48(%r11)
+ movaps %xmm13,-0x38(%r11)
+ movaps %xmm14,-0x28(%r11)
+ movaps %xmm15,-0x18(%r11)
.Lxts_dec_body:
___
$code.=<<___;
- lea -8(%rax),%rbp
movups ($ivp),$inout0 # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
lea `16*6`($inp),$inp
pxor $twmask,$inout5
- pxor $twres,@tweak[0] # calclulate tweaks^round[last]
+ pxor $twres,@tweak[0] # calculate tweaks^round[last]
aesdec $rndkey1,$inout4
pxor $twres,@tweak[1]
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
pxor %xmm15,%xmm15
___
$code.=<<___ if ($win64);
- movaps -0xa0(%rbp),%xmm6
- movaps %xmm0,-0xa0(%rbp) # clear stack
- movaps -0x90(%rbp),%xmm7
- movaps %xmm0,-0x90(%rbp)
- movaps -0x80(%rbp),%xmm8
- movaps %xmm0,-0x80(%rbp)
- movaps -0x70(%rbp),%xmm9
- movaps %xmm0,-0x70(%rbp)
- movaps -0x60(%rbp),%xmm10
- movaps %xmm0,-0x60(%rbp)
- movaps -0x50(%rbp),%xmm11
- movaps %xmm0,-0x50(%rbp)
- movaps -0x40(%rbp),%xmm12
- movaps %xmm0,-0x40(%rbp)
- movaps -0x30(%rbp),%xmm13
- movaps %xmm0,-0x30(%rbp)
- movaps -0x20(%rbp),%xmm14
- movaps %xmm0,-0x20(%rbp)
- movaps -0x10(%rbp),%xmm15
- movaps %xmm0,-0x10(%rbp)
+ movaps -0xa8(%r11),%xmm6
+ movaps %xmm0,-0xa8(%r11) # clear stack
+ movaps -0x98(%r11),%xmm7
+ movaps %xmm0,-0x98(%r11)
+ movaps -0x88(%r11),%xmm8
+ movaps %xmm0,-0x88(%r11)
+ movaps -0x78(%r11),%xmm9
+ movaps %xmm0,-0x78(%r11)
+ movaps -0x68(%r11),%xmm10
+ movaps %xmm0,-0x68(%r11)
+ movaps -0x58(%r11),%xmm11
+ movaps %xmm0,-0x58(%r11)
+ movaps -0x48(%r11),%xmm12
+ movaps %xmm0,-0x48(%r11)
+ movaps -0x38(%r11),%xmm13
+ movaps %xmm0,-0x38(%r11)
+ movaps -0x28(%r11),%xmm14
+ movaps %xmm0,-0x28(%r11)
+ movaps -0x18(%r11),%xmm15
+ movaps %xmm0,-0x18(%r11)
movaps %xmm0,0x00(%rsp)
movaps %xmm0,0x10(%rsp)
movaps %xmm0,0x20(%rsp)
movaps %xmm0,0x60(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8(%r11),%rbp
+.cfi_restore %rbp
+ lea (%r11),%rsp
+.cfi_def_cfa_register %rsp
.Lxts_dec_epilogue:
ret
+.cfi_endproc
.size aesni_xts_decrypt,.-aesni_xts_decrypt
___
+}
+\f
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+# const AES_KEY *key, unsigned int start_block_num,
+# unsigned char offset_i[16], const unsigned char L_[][16],
+# unsigned char checksum[16]);
+#
+{
+my @offset=map("%xmm$_",(10..15));
+my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
+my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
+my ($L_p,$checksum_p) = ("%rbx","%rbp");
+my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
+my $seventh_arg = $win64 ? 56 : 8;
+my $blocks = $len;
+
+$code.=<<___;
+.globl aesni_ocb_encrypt
+.type aesni_ocb_encrypt,\@function,6
+.align 32
+aesni_ocb_encrypt:
+.cfi_startproc
+ lea (%rsp),%rax
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_enc_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+ mov $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+ mov 240($key),$rnds_
+ mov $key,$key_
+ shl \$4,$rnds_
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds
+ lea 32($key_,$rnds_),$key
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_enc_odd
+
+ bsf $block_num,$i1
+ add \$1,$block_num
+ shl \$4,$i1
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,($out)
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_enc_done
+
+.Locb_enc_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_enc_short
+ jmp .Locb_enc_grandloop
+
+.align 32
+.Locb_enc_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_encrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+ movups $inout5,`16*5`($out)
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_enc_grandloop
+
+.Locb_enc_short:
+ add \$6,$blocks
+ jz .Locb_enc_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_enc_one
+ movdqu `16*1`($inp),$inout1
+ je .Locb_enc_two
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_enc_three
+ movdqu `16*3`($inp),$inout3
+ je .Locb_enc_four
+
+ movdqu `16*4`($inp),$inout4
+ pxor $inout5,$inout5
+
+ call __ocb_encrypt6
+
+ movdqa @offset[4],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+ movups $inout4,`16*4`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_encrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,`16*0`($out)
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_two:
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[1],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_three:
+ pxor $inout3,$inout3
+
+ call __ocb_encrypt4
+
+ movdqa @offset[2],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+
+ jmp .Locb_enc_done
+
+.align 16
+.Locb_enc_four:
+ call __ocb_encrypt4
+
+ movdqa @offset[3],@offset[5]
+ movups $inout0,`16*0`($out)
+ movups $inout1,`16*1`($out)
+ movups $inout2,`16*2`($out)
+ movups $inout3,`16*3`($out)
+
+.Locb_enc_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ lea 0x28(%rsp),%rax
+.cfi_def_cfa %rax,8
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax
+.Locb_enc_pop:
+___
+$code.=<<___;
+ mov -40(%rax),%r14
+.cfi_restore %r14
+ mov -32(%rax),%r13
+.cfi_restore %r13
+ mov -24(%rax),%r12
+.cfi_restore %r12
+ mov -16(%rax),%rbp
+.cfi_restore %rbp
+ mov -8(%rax),%rbx
+.cfi_restore %rbx
+ lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Locb_enc_epilogue:
+ ret
+.cfi_endproc
+.size aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type __ocb_encrypt6,\@abi-omnipotent
+.align 32
+__ocb_encrypt6:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ movdqa @offset[0],@offset[4]
+ pxor @offset[5],@offset[0]
+ movdqu ($L_p,$i5),@offset[5]
+ pxor @offset[0],@offset[1]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor $inout4,$checksum
+ pxor @offset[4],$inout4
+ pxor $inout5,$checksum
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ aesenc $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesenc $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,@offset[5]
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_enc_loop6
+
+.align 32
+.Locb_enc_loop6:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop6
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+ shl \$4,$i5
+
+ aesenclast @offset[0],$inout0
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ aesenclast @offset[4],$inout4
+ aesenclast @offset[5],$inout5
+ ret
+.size __ocb_encrypt6,.-__ocb_encrypt6
+
+.type __ocb_encrypt4,\@abi-omnipotent
+.align 32
+__ocb_encrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor $inout1,$checksum
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor $inout2,$checksum
+ pxor @offset[2],$inout2
+ pxor $inout3,$checksum
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_enc_loop4
+
+.align 32
+.Locb_enc_loop4:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop4
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast @offset[0],$inout0
+ aesenclast @offset[1],$inout1
+ aesenclast @offset[2],$inout2
+ aesenclast @offset[3],$inout3
+ ret
+.size __ocb_encrypt4,.-__ocb_encrypt4
+
+.type __ocb_encrypt1,\@abi-omnipotent
+.align 32
+__ocb_encrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout0,$checksum # accumulate checksum
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesenc $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesenc $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_enc_loop1
+
+.align 32
+.Locb_enc_loop1:
+ aesenc $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesenc $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_enc_loop1
+
+ aesenc $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesenclast $inout5,$inout0
+ ret
+.size __ocb_encrypt1,.-__ocb_encrypt1
+
+.globl aesni_ocb_decrypt
+.type aesni_ocb_decrypt,\@function,6
+.align 32
+aesni_ocb_decrypt:
+.cfi_startproc
+ lea (%rsp),%rax
+ push %rbx
+.cfi_push %rbx
+ push %rbp
+.cfi_push %rbp
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+___
+$code.=<<___ if ($win64);
+ lea -0xa0(%rsp),%rsp
+ movaps %xmm6,0x00(%rsp) # offload everything
+ movaps %xmm7,0x10(%rsp)
+ movaps %xmm8,0x20(%rsp)
+ movaps %xmm9,0x30(%rsp)
+ movaps %xmm10,0x40(%rsp)
+ movaps %xmm11,0x50(%rsp)
+ movaps %xmm12,0x60(%rsp)
+ movaps %xmm13,0x70(%rsp)
+ movaps %xmm14,0x80(%rsp)
+ movaps %xmm15,0x90(%rsp)
+.Locb_dec_body:
+___
+$code.=<<___;
+ mov $seventh_arg(%rax),$L_p # 7th argument
+ mov $seventh_arg+8(%rax),$checksum_p# 8th argument
+
+ mov 240($key),$rnds_
+ mov $key,$key_
+ shl \$4,$rnds_
+ $movkey ($key),$rndkey0l # round[0]
+ $movkey 16($key,$rnds_),$rndkey1 # round[last]
+
+ movdqu ($offset_p),@offset[5] # load last offset_i
+ pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
+ pxor $rndkey1,@offset[5] # offset_i ^ round[last]
+
+ mov \$16+32,$rounds
+ lea 32($key_,$rnds_),$key
+ $movkey 16($key_),$rndkey1 # round[1]
+ sub %r10,%rax # twisted $rounds
+ mov %rax,%r10 # backup twisted $rounds
+
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ movdqu ($checksum_p),$checksum # load checksum
+
+ test \$1,$block_num # is first block number odd?
+ jnz .Locb_dec_odd
+
+ bsf $block_num,$i1
+ add \$1,$block_num
+ shl \$4,$i1
+ movdqu ($L_p,$i1),$inout5 # borrow
+ movdqu ($inp),$inout0
+ lea 16($inp),$inp
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,($out)
+ xorps $inout0,$checksum # accumulate checksum
+ lea 16($out),$out
+ sub \$1,$blocks
+ jz .Locb_dec_done
+
+.Locb_dec_odd:
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ lea 6($block_num),$block_num
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ shl \$4,$i5
+
+ sub \$6,$blocks
+ jc .Locb_dec_short
+ jmp .Locb_dec_grandloop
+
+.align 32
+.Locb_dec_grandloop:
+ movdqu `16*0`($inp),$inout0 # load input
+ movdqu `16*1`($inp),$inout1
+ movdqu `16*2`($inp),$inout2
+ movdqu `16*3`($inp),$inout3
+ movdqu `16*4`($inp),$inout4
+ movdqu `16*5`($inp),$inout5
+ lea `16*6`($inp),$inp
+
+ call __ocb_decrypt6
+
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+ movups $inout5,`16*5`($out)
+ pxor $inout5,$checksum
+ lea `16*6`($out),$out
+ sub \$6,$blocks
+ jnc .Locb_dec_grandloop
+
+.Locb_dec_short:
+ add \$6,$blocks
+ jz .Locb_dec_done
+
+ movdqu `16*0`($inp),$inout0
+ cmp \$2,$blocks
+ jb .Locb_dec_one
+ movdqu `16*1`($inp),$inout1
+ je .Locb_dec_two
+
+ movdqu `16*2`($inp),$inout2
+ cmp \$4,$blocks
+ jb .Locb_dec_three
+ movdqu `16*3`($inp),$inout3
+ je .Locb_dec_four
+
+ movdqu `16*4`($inp),$inout4
+ pxor $inout5,$inout5
+
+ call __ocb_decrypt6
+
+ movdqa @offset[4],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+ movups $inout4,`16*4`($out)
+ pxor $inout4,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_one:
+ movdqa @offset[0],$inout5 # borrow
+
+ call __ocb_decrypt1
+
+ movdqa $inout5,@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_two:
+ pxor $inout2,$inout2
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[1],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_three:
+ pxor $inout3,$inout3
+
+ call __ocb_decrypt4
+
+ movdqa @offset[2],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ xorps $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ xorps $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ xorps $inout2,$checksum
+
+ jmp .Locb_dec_done
+
+.align 16
+.Locb_dec_four:
+ call __ocb_decrypt4
+
+ movdqa @offset[3],@offset[5]
+ movups $inout0,`16*0`($out) # store output
+ pxor $inout0,$checksum # accumulate checksum
+ movups $inout1,`16*1`($out)
+ pxor $inout1,$checksum
+ movups $inout2,`16*2`($out)
+ pxor $inout2,$checksum
+ movups $inout3,`16*3`($out)
+ pxor $inout3,$checksum
+
+.Locb_dec_done:
+ pxor $rndkey0,@offset[5] # "remove" round[last]
+ movdqu $checksum,($checksum_p) # store checksum
+ movdqu @offset[5],($offset_p) # store last offset_i
+
+ xorps %xmm0,%xmm0 # clear register bank
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+ pxor %xmm12,%xmm12
+ pxor %xmm13,%xmm13
+ pxor %xmm14,%xmm14
+ pxor %xmm15,%xmm15
+ lea 0x28(%rsp),%rax
+.cfi_def_cfa %rax,8
+___
+$code.=<<___ if ($win64);
+ movaps 0x00(%rsp),%xmm6
+ movaps %xmm0,0x00(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ movaps 0x40(%rsp),%xmm10
+ movaps %xmm0,0x40(%rsp)
+ movaps 0x50(%rsp),%xmm11
+ movaps %xmm0,0x50(%rsp)
+ movaps 0x60(%rsp),%xmm12
+ movaps %xmm0,0x60(%rsp)
+ movaps 0x70(%rsp),%xmm13
+ movaps %xmm0,0x70(%rsp)
+ movaps 0x80(%rsp),%xmm14
+ movaps %xmm0,0x80(%rsp)
+ movaps 0x90(%rsp),%xmm15
+ movaps %xmm0,0x90(%rsp)
+ lea 0xa0+0x28(%rsp),%rax
+.Locb_dec_pop:
+___
+$code.=<<___;
+ mov -40(%rax),%r14
+.cfi_restore %r14
+ mov -32(%rax),%r13
+.cfi_restore %r13
+ mov -24(%rax),%r12
+.cfi_restore %r12
+ mov -16(%rax),%rbp
+.cfi_restore %rbp
+ mov -8(%rax),%rbx
+.cfi_restore %rbx
+ lea (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Locb_dec_epilogue:
+ ret
+.cfi_endproc
+.size aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type __ocb_decrypt6,\@abi-omnipotent
+.align 32
+__ocb_decrypt6:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ movdqa @offset[0],@offset[4]
+ pxor @offset[5],@offset[0]
+ movdqu ($L_p,$i5),@offset[5]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],@offset[4]
+ pxor @offset[3],$inout3
+ pxor @offset[4],@offset[5]
+ pxor @offset[4],$inout4
+ pxor @offset[5],$inout5
+ $movkey 32($key_),$rndkey0
+
+ lea 1($block_num),$i1 # even-numbered blocks
+ lea 3($block_num),$i3
+ lea 5($block_num),$i5
+ add \$6,$block_num
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ bsf $i1,$i1 # ntz(block)
+ bsf $i3,$i3
+ bsf $i5,$i5
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ aesdec $rndkey1,$inout4
+ pxor $rndkey0l,@offset[3]
+ pxor $rndkey0l,@offset[4]
+ aesdec $rndkey1,$inout5
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,@offset[5]
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey 64($key_),$rndkey0
+ shl \$4,$i1 # ntz(block) -> table offset
+ shl \$4,$i3
+ jmp .Locb_dec_loop6
+
+.align 32
+.Locb_dec_loop6:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ aesdec $rndkey0,$inout4
+ aesdec $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop6
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ aesdec $rndkey1,$inout4
+ aesdec $rndkey1,$inout5
+ $movkey 16($key_),$rndkey1
+ shl \$4,$i5
+
+ aesdeclast @offset[0],$inout0
+ movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
+ mov %r10,%rax # restore twisted rounds
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ aesdeclast @offset[4],$inout4
+ aesdeclast @offset[5],$inout5
+ ret
+.size __ocb_decrypt6,.-__ocb_decrypt6
+
+.type __ocb_decrypt4,\@abi-omnipotent
+.align 32
+__ocb_decrypt4:
+ pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
+ movdqu ($L_p,$i1),@offset[1]
+ movdqa @offset[0],@offset[2]
+ movdqu ($L_p,$i3),@offset[3]
+ pxor @offset[5],@offset[0]
+ pxor @offset[0],@offset[1]
+ pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
+ pxor @offset[1],@offset[2]
+ pxor @offset[1],$inout1
+ pxor @offset[2],@offset[3]
+ pxor @offset[2],$inout2
+ pxor @offset[3],$inout3
+ $movkey 32($key_),$rndkey0
+
+ pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
+ pxor $rndkey0l,@offset[1]
+ pxor $rndkey0l,@offset[2]
+ pxor $rndkey0l,@offset[3]
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 48($key_),$rndkey1
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop4
+
+.align 32
+.Locb_dec_loop4:
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ aesdec $rndkey0,$inout1
+ aesdec $rndkey0,$inout2
+ aesdec $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop4
+
+ aesdec $rndkey1,$inout0
+ aesdec $rndkey1,$inout1
+ aesdec $rndkey1,$inout2
+ aesdec $rndkey1,$inout3
+ $movkey 16($key_),$rndkey1
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast @offset[0],$inout0
+ aesdeclast @offset[1],$inout1
+ aesdeclast @offset[2],$inout2
+ aesdeclast @offset[3],$inout3
+ ret
+.size __ocb_decrypt4,.-__ocb_decrypt4
+
+.type __ocb_decrypt1,\@abi-omnipotent
+.align 32
+__ocb_decrypt1:
+ pxor @offset[5],$inout5 # offset_i
+ pxor $rndkey0l,$inout5 # offset_i ^ round[0]
+ pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
+ $movkey 32($key_),$rndkey0
+
+ aesdec $rndkey1,$inout0
+ $movkey 48($key_),$rndkey1
+ pxor $rndkey0l,$inout5 # offset_i ^ round[last]
+
+ aesdec $rndkey0,$inout0
+ $movkey 64($key_),$rndkey0
+ jmp .Locb_dec_loop1
+
+.align 32
+.Locb_dec_loop1:
+ aesdec $rndkey1,$inout0
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+
+ aesdec $rndkey0,$inout0
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Locb_dec_loop1
+
+ aesdec $rndkey1,$inout0
+ $movkey 16($key_),$rndkey1 # redundant in tail
+ mov %r10,%rax # restore twisted rounds
+
+ aesdeclast $inout5,$inout0
+ ret
+.size __ocb_decrypt1,.-__ocb_decrypt1
+___
} }}
\f
########################################################################
{
my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
-my $inp_=$key_;
$code.=<<___;
.globl ${PREFIX}_cbc_encrypt
.type ${PREFIX}_cbc_encrypt,\@function,6
.align 16
${PREFIX}_cbc_encrypt:
+.cfi_startproc
test $len,$len # check length
jz .Lcbc_ret
jmp .Lcbc_ret
.align 16
.Lcbc_decrypt_bulk:
- lea (%rsp),%rax
+ lea (%rsp),%r11 # frame pointer
+.cfi_def_cfa_register %r11
push %rbp
+.cfi_push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
movaps %xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___
+
+my $inp_=$key_="%rbp"; # reassign $key_
+
$code.=<<___;
- lea -8(%rax),%rbp
+ mov $key,$key_ # [re-]backup $key [after reassignment]
movups ($ivp),$iv
mov $rnds_,$rounds
cmp \$0x50,$len
pxor $rndkey0,$inout1
$movkey 0x10-0x70($key),$rndkey1
pxor $rndkey0,$inout2
- xor $inp_,$inp_
+ mov \$-1,$inp_
cmp \$0x70,$len # is there at least 0x60 bytes ahead?
pxor $rndkey0,$inout3
pxor $rndkey0,$inout4
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
aesdec $rndkey1,$inout6
- setnc ${inp_}b
- shl \$7,$inp_
+ adc \$0,$inp_
+ and \$128,$inp_
aesdec $rndkey1,$inout7
add $inp,$inp_
$movkey 0x30-0x70($key),$rndkey1
movaps %xmm0,0xa0(%rsp)
___
$code.=<<___;
- lea (%rbp),%rsp
- pop %rbp
+ mov -8(%r11),%rbp
+.cfi_restore %rbp
+ lea (%r11),%rsp
+.cfi_def_cfa_register %rsp
.Lcbc_ret:
ret
+.cfi_endproc
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
} \f
.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align 16
${PREFIX}_set_decrypt_key:
+.cfi_startproc
.byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+.cfi_adjust_cfa_offset 8
call __aesni_set_encrypt_key
shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
test %eax,%eax
pxor %xmm0,%xmm0
.Ldec_key_ret:
add \$8,%rsp
+.cfi_adjust_cfa_offset -8
ret
+.cfi_endproc
.LSEH_end_set_decrypt_key:
.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___
\f
-# This is based on submission by
-#
-# Huang Ying <ying.huang@intel.com>
-# Vinodh Gopal <vinodh.gopal@intel.com>
+# This is based on submission from Intel by
+# Huang Ying
+# Vinodh Gopal
# Kahraman Akdemir
#
-# Agressively optimized in respect to aeskeygenassist's critical path
+# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
.align 16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
+.cfi_startproc
.byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+.cfi_adjust_cfa_offset 8
mov \$-1,%rax
test $inp,$inp
jz .Lenc_key_ret
.align 16
.L14rounds:
- movups 16($inp),%xmm2 # remaning half of *userKey
+ movups 16($inp),%xmm2 # remaining half of *userKey
mov \$13,$bits # 14 rounds for 256
lea 16(%rax),%rax
cmp \$`1<<28`,%r10d # AVX, but no XOP
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
add \$8,%rsp
+.cfi_adjust_cfa_offset -8
ret
+.cfi_endproc
.LSEH_end_set_encrypt_key:
\f
.align 16
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- mov 160($context),%rax # pull context->Rbp
- lea -0xa0(%rax),%rsi # %xmm save area
+ mov 208($context),%rax # pull context->R11
+
+ lea -0xa8(%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- jmp .Lcommon_rbp_tail
+ mov -8(%rax),%rbp # restore saved %rbp
+ mov %rbp,160($context) # restore context->Rbp
+ jmp .Lcommon_seh_tail
.size ctr_xts_se_handler,.-ctr_xts_se_handler
+
+.type ocb_se_handler,\@abi-omnipotent
+.align 16
+ocb_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue lable
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10
+ cmp %r10,%rbx # context->Rip>=pop label
+ jae .Locb_no_xmm
+
+ mov 152($context),%rax # pull context->Rsp
+
+ lea (%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # & context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0xa0+0x28(%rax),%rax
+
+.Locb_no_xmm:
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+
+ jmp .Lcommon_seh_tail
+.size ocb_se_handler,.-ocb_se_handler
___
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lcommon_seh_tail
+ mov 120($context),%rax # pull context->Rax
+
lea .Lcbc_decrypt_body(%rip),%r10
cmp %r10,%rbx # context->Rip<cbc_decrypt_body
- jb .Lrestore_cbc_rax
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
lea .Lcbc_ret(%rip),%r10
cmp %r10,%rbx # context->Rip>="epilogue" label
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
-.Lcommon_rbp_tail:
- mov 160($context),%rax # pull context->Rbp
- mov (%rax),%rbp # restore saved %rbp
- lea 8(%rax),%rax # adjust stack pointer
- mov %rbp,160($context) # restore context->Rbp
- jmp .Lcommon_seh_tail
+ mov 208($context),%rax # pull context->R11
-.Lrestore_cbc_rax:
- mov 120($context),%rax
+ mov -8(%rax),%rbp # restore saved %rbp
+ mov %rbp,160($context) # restore context->Rbp
.Lcommon_seh_tail:
mov 8(%rax),%rdi
.rva .LSEH_begin_aesni_xts_decrypt
.rva .LSEH_end_aesni_xts_decrypt
.rva .LSEH_info_xts_dec
+
+ .rva .LSEH_begin_aesni_ocb_encrypt
+ .rva .LSEH_end_aesni_ocb_encrypt
+ .rva .LSEH_info_ocb_enc
+
+ .rva .LSEH_begin_aesni_ocb_decrypt
+ .rva .LSEH_end_aesni_ocb_decrypt
+ .rva .LSEH_info_ocb_dec
___
$code.=<<___;
.rva .LSEH_begin_${PREFIX}_cbc_encrypt
.byte 9,0,0,0
.rva ctr_xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+.LSEH_info_ocb_enc:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
+ .rva .Locb_enc_pop
+ .long 0
+.LSEH_info_ocb_dec:
+ .byte 9,0,0,0
+ .rva ocb_se_handler
+ .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
+ .rva .Locb_dec_pop
+ .long 0
___
$code.=<<___;
.LSEH_info_cbc: