-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
#
-# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that
+# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
# Westmere is omitted from loop, this is because gain was not
# estimated high enough to justify the effort;
# (**) these are EVP-free results, results obtained with 'speed
$avx = ($1>=10) + ($1>=12);
}
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
$shaext=$avx; ### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$func="aesni_cbc_sha256_enc";
${func}_xop:
.Lxop_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_xop:
ret
.size ${func}_xop,.-${func}_xop
${func}_avx:
.Lavx_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_avx:
ret
.size ${func}_avx,.-${func}_avx
######################################################################
# AVX2+BMI code path
#
-my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;
${func}_avx2:
.Lavx2_shortcut:
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
+ mov %rsp,%rax # copy %rsp
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
- mov %rsp,%r11 # copy %rsp
sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
and \$-256*$SZ,%rsp # align stack frame
add \$`2*$SZ*($rounds-8)`,%rsp
mov $ivp,$_ivp
mov $ctx,$_ctx
mov $in0,$_in0
- mov %r11,$_rsp
+ mov %rax,$_rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
movaps `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_avx2:
ret
.size ${func}_avx2,.-${func}_avx2
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
- lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp