Update copyright year

[openssl.git] / crypto / sha / asm / sha256-586.pl
diff --git a/crypto/sha/asm/sha256-586.pl b/crypto/sha/asm/sha256-586.pl

index d072dd3dfc46510a346a200ef53c0cc4214773a6..dccc771ad584b8515180d1900bbdd8ab18d57f73 100644 (file)
--- a/crypto/sha/asm/sha256-586.pl
+++ b/crypto/sha/asm/sha256-586.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  #
  # ====================================================================
  # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -10,8 +17,8 @@
  # SHA256 block transform for x86. September 2007.
  #
  # Performance improvement over compiler generated code varies from
-# 10% to 40% [see below]. Not very impressive on some µ-archs, but
-# it's 5 times smaller and optimizies amount of writes.
+# 10% to 40% [see below]. Not very impressive on some µ-archs, but
+# it's 5 times smaller and optimizes amount of writes.
  #
  # May 2012.
  #
@@ -40,7 +47,7 @@
  #
  # Performance in clock cycles per processed byte (less is better):
  #
-#              gcc     icc     x86 asm(*)      SIMD    x86_64 asm(**)  
+#              gcc     icc     x86 asm(*)      SIMD    x86_64 asm(**)
  # Pentium      46      57      40/38           -       -
  # PIII         36      33      27/24           -       -
  # P4           41      38      28              -       17.3
@@ -50,19 +57,26 @@
  # Sandy Bridge 25      -       15.9            12.4    11.6
  # Ivy Bridge   24      -       15.0            11.4    10.3
  # Haswell      22      -       13.9            9.46    7.80
+# Skylake      20      -       14.9            9.50    7.70
  # Bulldozer    36      -       27/22           17.0    13.6
  # VIA Nano     36      -       25/22           16.8    16.5
  # Atom         50      -       30/25           21.9    18.9
+# Silvermont   40      -       34/31           22.9    20.6
+# Goldmont     29      -       20              16.3(***)
  #
  # (*)  numbers after slash are for unrolled loop, where applicable;
  # (**) x86_64 assembly performance is presented for reference
  #      purposes, results are best-available;
+# (***)        SHAEXT result is 4.1, strangely enough better than 64-bit one;
  
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  push(@INC,"${dir}","${dir}../../perlasm");
  require "x86asm.pl";
  
-&asm_init($ARGV[0],"sha512-586.pl",$ARGV[$#ARGV] eq "386");
+$output=pop;
+open STDOUT,">$output";
+
+&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
  
  $xmm=$avx=0;
  for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
@@ -82,8 +96,8 @@ if ($xmm && !$avx && $ARGV[0] eq "win32" &&
         $avx = ($1>=10) + ($1>=11);
  }
  
-if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /LLVM ([3-9]\.[0-9]+)/) {
-       $avx = ($1>=3.0) + ($1>=3.1);
+if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
+       $avx = ($2>=3.0) + ($2>3.0);
  }
  
  $shaext=$xmm;  ### set to zero if compiling for 1.0.1
@@ -265,7 +279,7 @@ my $suffix=shift;
         &mov    ($Coff,"ecx");
         &mov    ($Doff,"edi");
         &mov    (&DWP(0,"esp"),"ebx");  # magic
-       &mov    ($E,&DWP(16,"esi"));    
+       &mov    ($E,&DWP(16,"esi"));
         &mov    ("ebx",&DWP(20,"esi"));
         &mov    ("ecx",&DWP(24,"esi"));
         &mov    ("edi",&DWP(28,"esi"));
@@ -374,7 +388,7 @@ my @AH=($A,$K256);
         &xor    ($AH[1],"ecx");         # magic
         &mov    (&DWP(8,"esp"),"ecx");
         &mov    (&DWP(12,"esp"),"ebx");
-       &mov    ($E,&DWP(16,"esi"));    
+       &mov    ($E,&DWP(16,"esi"));
         &mov    ("ebx",&DWP(20,"esi"));
         &mov    ("ecx",&DWP(24,"esi"));
         &mov    ("esi",&DWP(28,"esi"));
@@ -1278,3 +1292,5 @@ sub bodyx_00_15 () {                      # +10%
  &function_end_B("sha256_block_data_order");
  
  &asm_finish();
+
+close STDOUT;