#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
# subroutine:
#
-# AES-128/-192/-256+SHA256 this(**)gain
-# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
-# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
-# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
-# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
-# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
+# AES-128/-192/-256+SHA256 this(**) gain
+# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
+# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
+# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
+# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
+# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
+# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54%
+# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60%
#
-# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that
+# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
# Westmere is omitted from loop, this is because gain was not
# estimated high enough to justify the effort;
# (**) these are EVP-free results, results obtained with 'speed
# -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
+# (***) these are SHAEXT results;
$flavour = shift;
$output = shift;
$shaext=$avx; ### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$func="aesni_cbc_sha256_enc";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
-$_rsp="16*$SZ+7*8(%rsp)";
+$_rsp="`16*$SZ+7*8`(%rsp)";
$framesz=16*$SZ+8*8;
$code=<<___;
.type ${func}_xop,\@function,6
.align 64
${func}_xop:
+.cfi_startproc
.Lxop_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
+.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
mov $_ivp,$ivp
mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
vmovdqu $iv,($ivp) # output IV
vzeroall
___
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_xop:
ret
+.cfi_endproc
.size ${func}_xop,.-${func}_xop
___
######################################################################
.type ${func}_avx,\@function,6
.align 64
${func}_avx:
+.cfi_startproc
.Lavx_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
+.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
mov $_ivp,$ivp
mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
vmovdqu $iv,($ivp) # output IV
vzeroall
___
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx:
ret
+.cfi_endproc
.size ${func}_avx,.-${func}_avx
___
######################################################################
# AVX2+BMI code path
#
-my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;
.type ${func}_avx2,\@function,6
.align 64
${func}_avx2:
+.cfi_startproc
.Lavx2_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
and \$-256*$SZ,%rsp # align stack frame
add \$`2*$SZ*($rounds-8)`,%rsp
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
+.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
lea ($Tbl),%rsp
mov $_ivp,$ivp
mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
vmovdqu $iv,($ivp) # output IV
vzeroall
___
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx2:
ret
+.cfi_endproc
.size ${func}_avx2,.-${func}_avx2
___
}}
mov 240($key),$rounds
sub $in0,$out
movups ($key),$rndkey0 # $key[0]
+ movups ($ivp),$iv # load IV
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
- lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp