-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Below are performance numbers in cycles per processed byte
# (less is better) for standalone AESNI-CBC encrypt, standalone SHA256,
# and the stitched subroutine:
#
-# AES-128/-192/-256+SHA256 this(**)gain
-# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
-# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
-# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
-# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
+# AES-128/-192/-256+SHA256 this(**) gain
+# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
+# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
+# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
+# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
+# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
+# Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54%
+# Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60%
#
-# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that
+# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
# Westmere is omitted from the loop; the gain was not estimated to be
# high enough to justify the effort;
# (**) these are EVP-free results; results obtained with 'speed
# -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
+# (***) these are SHAEXT results;
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
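+# Examples: "elf foo.s" gives $flavour=elf and $output=foo.s; a single
+# "foo.s" argument leaves $flavour undef; with no arguments both stay undef.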
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx = ($1>=10) + ($1>=12);
}
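# $avx selects which code paths are emitted below: 0 = none of them,
# 1 = XOP/AVX1, 2 = XOP/AVX1 plus AVX2, depending on what the detected
# assembler/compiler can handle.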
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
$shaext=$avx; ### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
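# With $shaext disabled, $avx is capped at 1, i.e. only the XOP/AVX1
# paths (and not AVX2) are generated.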
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+ or die "can't call $xlate: $!";
*STDOUT=*OUT;
$func="aesni_cbc_sha256_enc";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
-$_rsp="16*$SZ+7*8(%rsp)";
+$_rsp="`16*$SZ+7*8`(%rsp)";
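+# The backticks make perl evaluate the arithmetic when $code is post-processed,
+# so $_rsp becomes a plain numeric offset; presumably that is what the
+# .cfi_cfa_expression directive used below requires.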
$framesz=16*$SZ+8*8;
$code=<<___;
.type $func,\@abi-omnipotent
.align 16
$func:
+.cfi_startproc
___
if ($avx) {
$code.=<<___;
je ${func}_avx2
___
$code.=<<___;
- and \$`1<<30`,%eax # mask "Intel CPU" bit
- and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
- or %eax,%r10d
- cmp \$`1<<28|1<<9|1<<30`,%r10d
- je ${func}_avx
+ and \$`1<<28`,%r10d # check for AVX
+ jnz ${func}_avx
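+	# only the AVX feature bit (bit 28) is required now; the SSSE3 and
+	# "Intel CPU" checks from the old sequence are gone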
ud2
___
}
ud2
.Lprobe:
ret
+.cfi_endproc
.size $func,.-$func
.align 64
.type ${func}_xop,\@function,6
.align 64
${func}_xop:
+.cfi_startproc
.Lxop_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
+.cfi_cfa_expression $_rsp,deref,+8
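+	# CFA = *($_rsp) + 8: the unwinder reloads the caller's %rsp from the
+	# frame and adds 8 for the return address still on the caller's stack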
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
mov $_ivp,$ivp
mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
vmovdqu $iv,($ivp) # output IV
vzeroall
___
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
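+	# %rsi holds the original %rsp (reloaded from $_rsp); the callee-saved
+	# registers were pushed just below it, hence the negative offsets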
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_xop:
ret
+.cfi_endproc
.size ${func}_xop,.-${func}_xop
___
######################################################################
.type ${func}_avx,\@function,6
.align 64
${func}_avx:
+.cfi_startproc
.Lavx_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
+.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
mov $_ivp,$ivp
mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
vmovdqu $iv,($ivp) # output IV
vzeroall
___
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx:
ret
+.cfi_endproc
.size ${func}_avx,.-${func}_avx
___
######################################################################
# AVX2+BMI code path
#
-my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;
.type ${func}_avx2,\@function,6
.align 64
${func}_avx2:
+.cfi_startproc
.Lavx2_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
+.cfi_push %r15
sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
and \$-256*$SZ,%rsp # align stack frame
add \$`2*$SZ*($rounds-8)`,%rsp
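+	# larger frame, aligned to 256*$SZ; presumably because the AVX2 path
+	# keeps a two-block message schedule on the stack (cf. $PUSH8 above)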
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
+.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
lea ($Tbl),%rsp
mov $_ivp,$ivp
mov $_rsp,%rsi
+.cfi_def_cfa %rsi,8
vmovdqu $iv,($ivp) # output IV
vzeroall
___
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx2:
ret
+.cfi_endproc
.size ${func}_avx2,.-${func}_avx2
___
}}
mov 240($key),$rounds
sub $in0,$out
movups ($key),$rndkey0 # $key[0]
+ movups ($ivp),$iv # load IV
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
+if ($win64 && $avx) {
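+# The Win64 SEH handler below is now emitted only when the AVX/XOP/AVX2
+# code paths exist; the per-heredoc "if ($avx)" guards are folded into
+# this single condition.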
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
-$code.=<<___ if ($avx);
+$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
- lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
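+	# matches the new prologue layout: registers sit just below the
+	# stack pointer value saved in $_rsp, so no "lea 48" adjustment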
.rva .LSEH_end_${func}_shaext
.rva .LSEH_info_${func}_shaext
___
-$code.=<<___ if ($avx);
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_${func}_xop: