Add an Apple privacy info file for OpenSSL

[openssl.git] / crypto / whrlpool / asm / wp-mmx.pl
diff --git a/crypto/whrlpool/asm/wp-mmx.pl b/crypto/whrlpool/asm/wp-mmx.pl

index 7f49c778e80a10badfe8f4aff04f0bf4de35d643..ad2528a9e28aaca471729aa580cbb76d1839c97f 100644 (file)
--- a/crypto/whrlpool/asm/wp-mmx.pl
+++ b/crypto/whrlpool/asm/wp-mmx.pl
@@ -1,9 +1,16 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  #
  # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  # project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# forms are granted according to the License.
  # ====================================================================
  #
  # whirlpool_block_mmx implementation.
@@ -16,7 +23,7 @@
  # table]. I stick to value of 2 for two reasons: 1. smaller table
  # minimizes cache trashing and thus mitigates the hazard of side-
  # channel leakage similar to AES cache-timing one; 2. performance
-# gap among different µ-archs is smaller.
+# gap among different µ-archs is smaller.
  #
  # Performance table lists rounded amounts of CPU cycles spent by
  # whirlpool_block_mmx routine on single 64 byte input block, i.e.
@@ -24,14 +31,14 @@
  # multiplying 64 by CPU clock frequency and dividing by relevant
  # value from the given table:
  #
-#              $SCALE=2/8      icc8    gcc3    
+#              $SCALE=2/8      icc8    gcc3
  # Intel P4     3200/4600       4600(*) 6400
  # Intel PIII   2900/3000       4900    5400
  # AMD K[78]    2500/1800       9900    8200(**)
  #
  # (*)  I've sketched even non-MMX assembler, but for the record
  #      I've failed to beat the Intel compiler on P4, without using
-*      MMX that is...
+#      MMX that is...
  # (**) ... on AMD on the other hand non-MMX assembler was observed
  #      to perform significantly better, but I figured this MMX
  #      implementation is even faster anyway, so why bother? As for
@@ -45,10 +52,13 @@
  #      non-MMX implementation would actually pay off, but till
  #      opposite is proved "unlikely" is assumed.
  
-push(@INC,"../CVS/HEAD/openssl/crypto/perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
  require "x86asm.pl";
  
-&asm_init($ARGV[0],"wp-mmx.pl");
+$output=pop and open STDOUT,">$output";
+
+&asm_init($ARGV[0]);
  
  sub L()  { &data_byte(@_); }
  sub LL()
@@ -58,19 +68,19 @@ sub LL()
                                         unshift(@_,pop(@_));
                                   }
                                 }
-       else                    { die "unvalid SCALE value"; }
+       else                    { die "invalid SCALE value"; }
  }
  
  sub scale()
  {      if      ($SCALE==2)     { &lea(@_[0],&DWP(0,@_[1],@_[1])); }
         elsif   ($SCALE==8)     { &lea(@_[0],&DWP(0,"",@_[1],8));  }
-       else                    { die "unvalid SCALE value";       }
+       else                    { die "invalid SCALE value";       }
  }
  
  sub row()
  {      if      ($SCALE==2)     { ((8-shift)&7); }
         elsif   ($SCALE==8)     { (8*shift);     }
-       else                    { die "unvalid SCALE value"; }
+       else                    { die "invalid SCALE value"; }
  }
  
  $tbl="ebp";
@@ -99,52 +109,54 @@ $tbl="ebp";
         &call   (&label("pic_point"));
  &set_label("pic_point");
         &blindpop($tbl);
-        &lea    ($tbl,&DWP(&label("table")."-".&label("pic_point"),$tbl));
+       &lea    ($tbl,&DWP(&label("table")."-".&label("pic_point"),$tbl));
  
         &xor    ("ecx","ecx");
         &xor    ("edx","edx");
  
         for($i=0;$i<8;$i++) { &movq(@mm[$i],&QWP($i*8,"esi")); }    # L=H
  &set_label("outerloop");
-       for($i=0;$i<8;$i++) { &movq(&QWP($i*8,"esp"),@mm[$i]); }    # K=H
+       for($i=0;$i<8;$i++) { &movq(&QWP($i*8,"esp"),@mm[$i]); }    # K=L
         for($i=0;$i<8;$i++) { &pxor(@mm[$i],&QWP($i*8,"edi")); }    # L^=inp
-       for($i=0;$i<8;$i++) { &movq(&QWP(64+$i*8,"esp"),@mm[$i]); } #S=L
+       for($i=0;$i<8;$i++) { &movq(&QWP(64+$i*8,"esp"),@mm[$i]); } # S=L
  
         &xor    ("esi","esi");
         &mov    (&DWP(12,"ebx"),"esi");         # zero round counter
  
  &set_label("round",16);
-       &movq   (@mm[0],&DWP(2048*$SCALE,$tbl,"esi",8));        # rc[r]
+       &movq   (@mm[0],&QWP(2048*$SCALE,$tbl,"esi",8));        # rc[r]
         &mov    ("eax",&DWP(0,"esp"));
         &mov    ("ebx",&DWP(4,"esp"));
+       &movz   ("ecx",&LB("eax"));
+       &movz   ("edx",&HB("eax"));
  for($i=0;$i<8;$i++) {
-    my $func = ($i==0)? movq : pxor;
-       &movb   (&LB("ecx"),&LB("eax"));
-       &movb   (&LB("edx"),&HB("eax"));
+    my $func = ($i==0)? \&movq : \&pxor;
+       &shr    ("eax",16);
         &scale  ("esi","ecx");
+       &movz   ("ecx",&LB("eax"));
         &scale  ("edi","edx");
-       &shr    ("eax",16);
+       &movz   ("edx",&HB("eax"));
         &pxor   (@mm[0],&QWP(&row(0),$tbl,"esi",8));
         &$func  (@mm[1],&QWP(&row(1),$tbl,"edi",8));
-       &movb   (&LB("ecx"),&LB("eax"));
-       &movb   (&LB("edx"),&HB("eax"));
         &mov    ("eax",&DWP(($i+1)*8,"esp"));
         &scale  ("esi","ecx");
+       &movz   ("ecx",&LB("ebx"));
         &scale  ("edi","edx");
+       &movz   ("edx",&HB("ebx"));
         &$func  (@mm[2],&QWP(&row(2),$tbl,"esi",8));
         &$func  (@mm[3],&QWP(&row(3),$tbl,"edi",8));
-       &movb   (&LB("ecx"),&LB("ebx"));
-       &movb   (&LB("edx"),&HB("ebx"));
+       &shr    ("ebx",16);
         &scale  ("esi","ecx");
+       &movz   ("ecx",&LB("ebx"));
         &scale  ("edi","edx");
-       &shr    ("ebx",16);
+       &movz   ("edx",&HB("ebx"));
         &$func  (@mm[4],&QWP(&row(4),$tbl,"esi",8));
         &$func  (@mm[5],&QWP(&row(5),$tbl,"edi",8));
-       &movb   (&LB("ecx"),&LB("ebx"));
-       &movb   (&LB("edx"),&HB("ebx"));
         &mov    ("ebx",&DWP(($i+1)*8+4,"esp"));
         &scale  ("esi","ecx");
+       &movz   ("ecx",&LB("eax"));
         &scale  ("edi","edx");
+       &movz   ("edx",&HB("eax"));
         &$func  (@mm[6],&QWP(&row(6),$tbl,"esi",8));
         &$func  (@mm[7],&QWP(&row(7),$tbl,"edi",8));
      push(@mm,shift(@mm));
@@ -153,32 +165,32 @@ for($i=0;$i<8;$i++) {
         for($i=0;$i<8;$i++) { &movq(&QWP($i*8,"esp"),@mm[$i]); }    # K=L
  
  for($i=0;$i<8;$i++) {
-       &movb   (&LB("ecx"),&LB("eax"));
-       &movb   (&LB("edx"),&HB("eax"));
+       &shr    ("eax",16);
         &scale  ("esi","ecx");
+       &movz   ("ecx",&LB("eax"));
         &scale  ("edi","edx");
-       &shr    ("eax",16);
+       &movz   ("edx",&HB("eax"));
         &pxor   (@mm[0],&QWP(&row(0),$tbl,"esi",8));
         &pxor   (@mm[1],&QWP(&row(1),$tbl,"edi",8));
-       &movb   (&LB("ecx"),&LB("eax"));
-       &movb   (&LB("edx"),&HB("eax"));
         &mov    ("eax",&DWP(64+($i+1)*8,"esp"))         if ($i<7);
         &scale  ("esi","ecx");
+       &movz   ("ecx",&LB("ebx"));
         &scale  ("edi","edx");
+       &movz   ("edx",&HB("ebx"));
         &pxor   (@mm[2],&QWP(&row(2),$tbl,"esi",8));
         &pxor   (@mm[3],&QWP(&row(3),$tbl,"edi",8));
-       &movb   (&LB("ecx"),&LB("ebx"));
-       &movb   (&LB("edx"),&HB("ebx"));
+       &shr    ("ebx",16);
         &scale  ("esi","ecx");
+       &movz   ("ecx",&LB("ebx"));
         &scale  ("edi","edx");
-       &shr    ("ebx",16);
+       &movz   ("edx",&HB("ebx"));
         &pxor   (@mm[4],&QWP(&row(4),$tbl,"esi",8));
         &pxor   (@mm[5],&QWP(&row(5),$tbl,"edi",8));
-       &movb   (&LB("ecx"),&LB("ebx"));
-       &movb   (&LB("edx"),&HB("ebx"));
         &mov    ("ebx",&DWP(64+($i+1)*8+4,"esp"))       if ($i<7);
         &scale  ("esi","ecx");
+       &movz   ("ecx",&LB("eax"));
         &scale  ("edi","edx");
+       &movz   ("edx",&HB("eax"));
         &pxor   (@mm[6],&QWP(&row(6),$tbl,"esi",8));
         &pxor   (@mm[7],&QWP(&row(7),$tbl,"edi",8));
      push(@mm,shift(@mm));
@@ -488,5 +500,7 @@ for($i=0;$i<8;$i++) {
         &L(0xfb,0xee,0x7c,0x66,0xdd,0x17,0x47,0x9e);
         &L(0xca,0x2d,0xbf,0x07,0xad,0x5a,0x83,0x33);
  
-&function_end_B("whrilpool_block_mmx");
-&asm_finish(); 
+&function_end_B("whirlpool_block_mmx");
+&asm_finish();
+
+close STDOUT or die "error closing STDOUT: $!";