-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
###################################################################
### AES-128 [originally in CTR mode] ###
# Emilia's this(*) difference
#
# Core 2 9.30 8.69 +7%
-# Nehalem(**) 7.63 6.98 +9%
-# Atom 17.1 17.4 -2%(***)
+# Nehalem(**) 7.63 6.88 +11%
+# Atom 17.1 16.4 +4%
+# Silvermont - 12.9
+# Goldmont - 8.85
#
# (*) Comparison is not completely fair, because "this" is ECB,
# i.e. no extra processing such as counter value calculation
# or xor-ing of the input is performed, as is done in CTR mode;
# (**) Results were collected on Westmere, which is considered to
# be equivalent to Nehalem for this code.
#
-# (***) Slowdown on Atom is rather strange per se, because original
-# implementation has a number of 9+-bytes instructions, which
-# are bad for Atom front-end, and which I eliminated completely.
-# In attempt to address deterioration sbox() was tested in FP
-# SIMD "domain" (movaps instead of movdqa, xorps instead of
-# pxor, etc.). While it resulted in nominal 4% improvement on
-# Atom, it hurted Westmere by more than 2x factor.
-#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short
# inputs. Conversion time in CPU cycles and its ratio to CPU cycles
# spent in the 8x block function is:
#
# conversion conversion/8x block
# Core 2 240 0.22
# Nehalem 180 0.20
-# Atom 430 0.19
+# Atom 430 0.20
#
# The ratio values mean that 128-byte blocks will be processed 16-18%
# slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, etc.
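+#
+# A back-of-envelope check of these percentages (an illustrative
+# sketch, not part of the original script): with r denoting the
+# conversion/8x-block cycle ratio, an N-byte input costs roughly
+# conversion + (N/128)*conversion/r cycles, so conversion accounts
+# for r/(r + N/128) of the total. Flip the guard to 1 to print the
+# table; output goes to STDERR so the assembly pipe on STDOUT stays
+# clean.
+if (0) {
+    for my $r (0.19, 0.20, 0.22) {
+        for my $N (128, 256, 384) {
+            printf STDERR "r=%.2f N=%4d overhead=%4.1f%%\n",
+                          $r, $N, 100*$r/($r + $N/128);
+        }
+    }
+}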
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
-# Core 2 11.0
-# Nehalem 9.16
-# Atom 20.9
+# Core 2 9.98
+# Nehalem 7.80
+# Atom 17.9
+# Silvermont 14.0
+# Goldmont 10.2
#
# November 2011.
#
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
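+# (quoting "$^X" and "$xlate" above guards against spaces in the perl
+# interpreter or source paths, e.g. on Windows installations)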
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
pxor @b[6], @b[5]
pxor @s[0], @t[3]
pxor @s[1], @t[2]
pxor @s[2], @t[1]
- pxor @s[3], @t[0]
+ pxor @s[3], @t[0]
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
$code.=<<___;
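+# note: the round-key pxor and the pshufb byte shuffles below are
+# interleaved rather than strictly paired, presumably to aid
+# dual-issue on in-order Atom (cf. the @XMM register-order note)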
pxor 0x00($key),@x[0]
pxor 0x10($key),@x[1]
- pshufb $mask,@x[0]
pxor 0x20($key),@x[2]
- pshufb $mask,@x[1]
pxor 0x30($key),@x[3]
- pshufb $mask,@x[2]
+ pshufb $mask,@x[0]
+ pshufb $mask,@x[1]
pxor 0x40($key),@x[4]
- pshufb $mask,@x[3]
pxor 0x50($key),@x[5]
- pshufb $mask,@x[4]
+ pshufb $mask,@x[2]
+ pshufb $mask,@x[3]
pxor 0x60($key),@x[6]
- pshufb $mask,@x[5]
pxor 0x70($key),@x[7]
+ pshufb $mask,@x[4]
+ pshufb $mask,@x[5]
pshufb $mask,@x[6]
- lea 0x80($key),$key
pshufb $mask,@x[7]
+ lea 0x80($key),$key
___
}
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
+my $inv=@_[16]; # optional
$code.=<<___;
pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
pshufd \$0x93, @x[1], @t[1]
pxor @t[4], @t[0]
 pshufd \$0x4E, @x[2], @x[6] # x2 <<< 64
pxor @t[5], @t[1]
-
+___
+$code.=<<___ if (!$inv);
pxor @t[3], @x[4]
pxor @t[7], @x[5]
pxor @t[6], @x[3]
pxor @t[2], @x[6]
movdqa @t[1], @x[7]
___
+$code.=<<___ if ($inv);
+ pxor @x[4], @t[3]
+ pxor @t[7], @x[5]
+ pxor @x[3], @t[6]
+ movdqa @t[0], @x[3]
+ pxor @t[2], @x[6]
+ movdqa @t[6], @x[2]
+ movdqa @t[1], @x[7]
+ movdqa @x[6], @x[4]
+ movdqa @t[3], @x[6]
+___
}
-sub InvMixColumns {
+sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];
___
}
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
+# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
+
+$code.=<<___;
+ # multiplication by 0x05-0x00-0x04-0x00
+ pshufd \$0x4E, @x[0], @t[0]
+ pshufd \$0x4E, @x[6], @t[6]
+ pxor @x[0], @t[0]
+ pshufd \$0x4E, @x[7], @t[7]
+ pxor @x[6], @t[6]
+ pshufd \$0x4E, @x[1], @t[1]
+ pxor @x[7], @t[7]
+ pshufd \$0x4E, @x[2], @t[2]
+ pxor @x[1], @t[1]
+ pshufd \$0x4E, @x[3], @t[3]
+ pxor @x[2], @t[2]
+ pxor @t[6], @x[0]
+ pxor @t[6], @x[1]
+ pshufd \$0x4E, @x[4], @t[4]
+ pxor @x[3], @t[3]
+ pxor @t[0], @x[2]
+ pxor @t[1], @x[3]
+ pshufd \$0x4E, @x[5], @t[5]
+ pxor @x[4], @t[4]
+ pxor @t[7], @x[1]
+ pxor @t[2], @x[4]
+ pxor @x[5], @t[5]
+
+ pxor @t[7], @x[2]
+ pxor @t[6], @x[3]
+ pxor @t[6], @x[4]
+ pxor @t[3], @x[5]
+ pxor @t[4], @x[6]
+ pxor @t[7], @x[4]
+ pxor @t[7], @x[5]
+ pxor @t[5], @x[7]
+___
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
+}
+
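+# An illustrative sanity check of the circulant factorization quoted
+# in InvMixColumns above (an editorial sketch, not part of the
+# original script): multiply the two right-hand matrices over GF(2^8)
+# modulo the AES polynomial 0x11b and compare against the left-hand
+# one. Flip the guard to 1 to run it at generation time; diagnostics
+# go to STDERR.
+if (0) {
+    my $gmul = sub {			# GF(2^8) multiplication mod 0x11b
+        my ($a,$b) = @_;
+        my $p = 0;
+        for (0..7) {
+            $p ^= $a if ($b & 1);
+            $a = ($a<<1) ^ (($a & 0x80) ? 0x11b : 0);
+            $b >>= 1;
+        }
+        $p;
+    };
+    my $circ = sub {			# 4x4 circulant: M[i][j] = row[(j-i)%4]
+        my @r = @_;
+        [ map { my $i = $_; [ @r[map { ($_-$i)%4 } 0..3] ] } 0..3 ];
+    };
+    my $A = $circ->(0x02,0x03,0x01,0x01);	# MixColumns matrix
+    my $B = $circ->(0x05,0x00,0x04,0x00);
+    my $L = $circ->(0x0e,0x0b,0x0d,0x09);	# InvMixColumns matrix
+    for my $i (0..3) { for my $j (0..3) {
+        my $s = 0;
+        $s ^= $gmul->($A->[$i][$_], $B->[$_][$j]) for 0..3;
+        die "factorization mismatch at ($i,$j)" if $s != $L->[$i][$j];
+    }}
+    print STDERR "InvMixColumns factorization verified\n";
+}
+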
sub aesenc { # not used
my @b=@_[0..7];
my @t=@_[8..15];
movdqa 0x50($const), @XMM[8] # .LM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
- pshufb @XMM[8], @XMM[0]
pxor @XMM[9], @XMM[2]
- pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
- pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
- pshufb @XMM[8], @XMM[5]
pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
movdqa -0x30($const), @XMM[8] # .LM0ISR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
- pshufb @XMM[8], @XMM[0]
pxor @XMM[9], @XMM[2]
- pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
- pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
- pshufb @XMM[8], @XMM[5]
pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
___
.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align 16
bsaes_ecb_encrypt_blocks:
+.cfi_startproc
mov %rsp, %rax
.Lecb_enc_prologue:
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
lea -0x48(%rsp),%rsp
+.cfi_adjust_cfa_offset 0x48
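+# CFA bookkeeping: return address (8) plus six pushes (0x30) plus 0x48
+# of scratch put the CFA at %rsp+0x80; the epilogue re-expresses the
+# same location via "lea 0x78(%rbp),%rax" and ".cfi_def_cfa %rax,8"
+# once %rbp has snapshotted this %rsp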
___
$code.=<<___ if ($win64);
lea -0xa0(%rsp), %rsp
___
$code.=<<___;
mov %rsp,%rbp # backup %rsp
+.cfi_def_cfa_register %rbp
mov 240($arg4),%eax # rounds
mov $arg1,$inp # backup arguments
mov $arg2,$out
cmp %rax, %rbp
 ja .Lecb_enc_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
+.cfi_def_cfa %rax,8
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lecb_enc_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+.cfi_restore %r15
+ mov -40(%rax), %r14
+.cfi_restore %r14
+ mov -32(%rax), %r13
+.cfi_restore %r13
+ mov -24(%rax), %r12
+.cfi_restore %r12
+ mov -16(%rax), %rbx
+.cfi_restore %rbx
+ mov -8(%rax), %rbp
+.cfi_restore %rbp
+ lea (%rax), %rsp # restore %rsp
+.cfi_def_cfa_register %rsp
.Lecb_enc_epilogue:
ret
+.cfi_endproc
.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
.globl bsaes_ecb_decrypt_blocks
.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align 16
bsaes_ecb_decrypt_blocks:
+.cfi_startproc
mov %rsp, %rax
.Lecb_dec_prologue:
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
lea -0x48(%rsp),%rsp
+.cfi_adjust_cfa_offset 0x48
___
$code.=<<___ if ($win64);
lea -0xa0(%rsp), %rsp
___
$code.=<<___;
mov %rsp,%rbp # backup %rsp
+.cfi_def_cfa_register %rbp
mov 240($arg4),%eax # rounds
mov $arg1,$inp # backup arguments
mov $arg2,$out
cmp %rax, %rbp
 ja .Lecb_dec_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
+.cfi_def_cfa %rax,8
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lecb_dec_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+.cfi_restore %r15
+ mov -40(%rax), %r14
+.cfi_restore %r14
+ mov -32(%rax), %r13
+.cfi_restore %r13
+ mov -24(%rax), %r12
+.cfi_restore %r12
+ mov -16(%rax), %rbx
+.cfi_restore %rbx
+ mov -8(%rax), %rbp
+.cfi_restore %rbp
+ lea (%rax), %rsp # restore %rsp
+.cfi_def_cfa_register %rsp
.Lecb_dec_epilogue:
ret
+.cfi_endproc
.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
.type bsaes_cbc_encrypt,\@abi-omnipotent
.align 16
bsaes_cbc_encrypt:
+.cfi_startproc
___
$code.=<<___ if ($win64);
mov 48(%rsp),$arg6 # pull direction flag
mov %rsp, %rax
.Lcbc_dec_prologue:
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
lea -0x48(%rsp), %rsp
+.cfi_adjust_cfa_offset 0x48
___
$code.=<<___ if ($win64);
mov 0xa0(%rsp),$arg5 # pull ivp
___
$code.=<<___;
mov %rsp, %rbp # backup %rsp
+.cfi_def_cfa_register %rbp
mov 240($arg4), %eax # rounds
mov $arg1, $inp # backup arguments
mov $arg2, $out
cmp %rax, %rbp
ja .Lcbc_dec_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
+.cfi_def_cfa %rax,8
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lcbc_dec_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+.cfi_restore %r15
+ mov -40(%rax), %r14
+.cfi_restore %r14
+ mov -32(%rax), %r13
+.cfi_restore %r13
+ mov -24(%rax), %r12
+.cfi_restore %r12
+ mov -16(%rax), %rbx
+.cfi_restore %rbx
+ mov -8(%rax), %rbp
+.cfi_restore %rbp
+ lea (%rax), %rsp # restore %rsp
+.cfi_def_cfa_register %rsp
.Lcbc_dec_epilogue:
ret
+.cfi_endproc
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
.globl bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align 16
bsaes_ctr32_encrypt_blocks:
+.cfi_startproc
mov %rsp, %rax
.Lctr_enc_prologue:
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
lea -0x48(%rsp), %rsp
+.cfi_adjust_cfa_offset 0x48
___
$code.=<<___ if ($win64);
mov 0xa0(%rsp),$arg5 # pull ivp
___
$code.=<<___;
mov %rsp, %rbp # backup %rsp
+.cfi_def_cfa_register %rbp
movdqu ($arg5), %xmm0 # load counter
mov 240($arg4), %eax # rounds
mov $arg1, $inp # backup arguments
movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
- pshufb @XMM[8], @XMM[0]
pxor @XMM[9], @XMM[2]
- pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
- pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
- pshufb @XMM[8], @XMM[5]
pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
- lea .LBS0(%rip), %r11 # constants table
pshufb @XMM[8], @XMM[7]
+ lea .LBS0(%rip), %r11 # constants table
mov %ebx,%r10d # pass rounds
call _bsaes_encrypt8_bitslice
cmp %rax, %rbp
ja .Lctr_enc_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
+.cfi_def_cfa %rax,8
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lctr_enc_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+.cfi_restore %r15
+ mov -40(%rax), %r14
+.cfi_restore %r14
+ mov -32(%rax), %r13
+.cfi_restore %r13
+ mov -24(%rax), %r12
+.cfi_restore %r12
+ mov -16(%rax), %rbx
+.cfi_restore %rbx
+ mov -8(%rax), %rbp
+.cfi_restore %rbp
+ lea (%rax), %rsp # restore %rsp
+.cfi_def_cfa_register %rsp
.Lctr_enc_epilogue:
ret
+.cfi_endproc
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp, char *out, size_t len,
# const AES_KEY *key1, const AES_KEY *key2,
# const unsigned char iv[16]);
#
my ($twmask,$twres,$twtmp)=@XMM[13..15];
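+# $arg6 carries the iv pointer here rather than the 32-bit direction
+# flag it was declared for (cf. bsaes_cbc_encrypt), so drop the "d"
+# register suffix: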
+$arg6=~s/d$//;
+
$code.=<<___;
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,\@abi-omnipotent
.align 16
bsaes_xts_encrypt:
+.cfi_startproc
mov %rsp, %rax
.Lxts_enc_prologue:
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
lea -0x48(%rsp), %rsp
+.cfi_adjust_cfa_offset 0x48
___
$code.=<<___ if ($win64);
mov 0xa0(%rsp),$arg5 # pull key2
___
$code.=<<___;
mov %rsp, %rbp # backup %rsp
+.cfi_def_cfa_register %rbp
mov $arg1, $inp # backup arguments
mov $arg2, $out
mov $arg3, $len
cmp %rax, %rbp
ja .Lxts_enc_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
+.cfi_def_cfa %rax,8
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lxts_enc_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+.cfi_restore %r15
+ mov -40(%rax), %r14
+.cfi_restore %r14
+ mov -32(%rax), %r13
+.cfi_restore %r13
+ mov -24(%rax), %r12
+.cfi_restore %r12
+ mov -16(%rax), %rbx
+.cfi_restore %rbx
+ mov -8(%rax), %rbp
+.cfi_restore %rbp
+ lea (%rax), %rsp # restore %rsp
+.cfi_def_cfa_register %rsp
.Lxts_enc_epilogue:
ret
+.cfi_endproc
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,\@abi-omnipotent
.align 16
bsaes_xts_decrypt:
+.cfi_startproc
mov %rsp, %rax
.Lxts_dec_prologue:
push %rbp
+.cfi_push %rbp
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
lea -0x48(%rsp), %rsp
+.cfi_adjust_cfa_offset 0x48
___
$code.=<<___ if ($win64);
mov 0xa0(%rsp),$arg5 # pull key2
cmp %rax, %rbp
ja .Lxts_dec_bzero
- lea (%rbp),%rsp # restore %rsp
+ lea 0x78(%rbp),%rax
+.cfi_def_cfa %rax,8
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rbp), %rsp
+ lea 0xa0(%rax), %rax
+.Lxts_dec_tail:
___
$code.=<<___;
- mov 0x48(%rsp), %r15
- mov 0x50(%rsp), %r14
- mov 0x58(%rsp), %r13
- mov 0x60(%rsp), %r12
- mov 0x68(%rsp), %rbx
- mov 0x70(%rsp), %rax
- lea 0x78(%rsp), %rsp
- mov %rax, %rbp
+ mov -48(%rax), %r15
+.cfi_restore %r15
+ mov -40(%rax), %r14
+.cfi_restore %r14
+ mov -32(%rax), %r13
+.cfi_restore %r13
+ mov -24(%rax), %r12
+.cfi_restore %r12
+ mov -16(%rax), %rbx
+.cfi_restore %rbx
+ mov -8(%rax), %rbp
+.cfi_restore %rbp
+ lea (%rax), %rsp # restore %rsp
+.cfi_def_cfa_register %rsp
.Lxts_dec_epilogue:
ret
+.cfi_endproc
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lin_prologue
-
- mov 152($context),%rax # pull context->Rsp
+ cmp %r10,%rbx # context->Rip<=prologue label
+ jbe .Lin_prologue
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
+ mov 8(%r11),%r10d # HandlerData[2]
+ lea (%rsi,%r10),%r10 # tail label
+ cmp %r10,%rbx # context->Rip>=tail label
+ jae .Lin_tail
+
mov 160($context),%rax # pull context->Rbp
lea 0x40(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
- lea 0xa0(%rax),%rax # adjust stack pointer
-
- mov 0x70(%rax),%rbp
- mov 0x68(%rax),%rbx
- mov 0x60(%rax),%r12
- mov 0x58(%rax),%r13
- mov 0x50(%rax),%r14
- mov 0x48(%rax),%r15
- lea 0x78(%rax),%rax # adjust stack pointer
+ lea 0xa0+0x78(%rax),%rax # adjust stack pointer
+
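+ # at or past the tail label the function has already restored the
+ # xmm registers and holds the frame anchor in %rax, so only the GPR
+ # reloads below remain; RIPs in the body fall through to here after
+ # the xmm copy from context->Rbp above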
+.Lin_tail:
+ mov -48(%rax),%r15
+ mov -40(%rax),%r14
+ mov -32(%rax),%r13
+ mov -24(%rax),%r12
+ mov -16(%rax),%rbx
+ mov -8(%rax),%rbp
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
.byte 9,0,0,0
.rva se_handler
.rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
+ .rva .Lecb_enc_tail
+ .long 0
.Lecb_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
+ .rva .Lecb_dec_tail
+ .long 0
___
$code.=<<___;
.Lcbc_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
+ .rva .Lcbc_dec_tail
+ .long 0
.Lctr_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
+ .rva .Lctr_enc_tail
+ .long 0
.Lxts_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
+ .rva .Lxts_enc_tail
+ .long 0
.Lxts_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
+ .rva .Lxts_dec_tail
+ .long 0
___
}