ARM64 assembly pack: make it Windows-friendly.

[openssl.git] / crypto / sha / asm / sha512-armv8.pl
diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl

index c55efb30854468c41f00cb99fdcbc804ab9eb728..f7c67219ed0904ae83bc68a4412639e5564d920c 100644 (file)
--- a/crypto/sha/asm/sha512-armv8.pl
+++ b/crypto/sha/asm/sha512-armv8.pl
@@ -1,7 +1,7 @@
  #! /usr/bin/env perl
-# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
  #
-# Licensed under the OpenSSL license (the "License").  You may not use
+# Licensed under the Apache License 2.0 (the "License").  You may not use
  # this file except in compliance with the License.  You can obtain a copy
  # in the file LICENSE in the source distribution or at
  # https://www.openssl.org/source/license.html
@@ -27,6 +27,7 @@
  # Denver       2.01            10.5 (+26%)     6.70 (+8%)
  # X-Gene                       20.0 (+100%)    12.8 (+300%(***))
  # Mongoose     2.36            13.0 (+50%)     8.36 (+33%)
+# Kryo         1.92            17.4 (+30%)     11.2 (+8%)
  #
  # (*)  Software SHA256 results are of lesser relevance, presented
  #      mostly for informational purposes.
@@ -35,7 +36,7 @@
  #      on Cortex-A53 (or by 4 cycles per round).
  # (***)        Super-impressive coefficients over gcc-generated code are
  #      indication of some compiler "pathology", most notably code
-#      generated with -mgeneral-regs-only is significanty faster
+#      generated with -mgeneral-regs-only is significantly faster
  #      and the gap is only 40-90%.
  #
  # October 2016.
@@ -187,33 +188,32 @@ ___
  $code.=<<___;
  #ifndef        __KERNEL__
  # include "arm_arch.h"
+.extern        OPENSSL_armcap_P
  #endif
  
  .text
  
-.extern        OPENSSL_armcap_P
  .globl $func
  .type  $func,%function
  .align 6
  $func:
+#ifndef        __KERNEL__
+       adrp    x16,OPENSSL_armcap_P
+       ldr     w16,[x16,#:lo12:OPENSSL_armcap_P]
  ___
  $code.=<<___   if ($SZ==4);
-#ifndef        __KERNEL__
-# ifdef        __ILP32__
-       ldrsw   x16,.LOPENSSL_armcap_P
-# else
-       ldr     x16,.LOPENSSL_armcap_P
-# endif
-       adr     x17,.LOPENSSL_armcap_P
-       add     x16,x16,x17
-       ldr     w16,[x16]
         tst     w16,#ARMV8_SHA256
         b.ne    .Lv8_entry
         tst     w16,#ARMV7_NEON
         b.ne    .Lneon_entry
-#endif
+___
+$code.=<<___   if ($SZ==8);
+       tst     w16,#ARMV8_SHA512
+       b.ne    .Lv8_entry
  ___
  $code.=<<___;
+#endif
+       .inst   0xd503233f                              // paciasp
         stp     x29,x30,[sp,#-128]!
         add     x29,sp,#0
  
@@ -275,6 +275,7 @@ $code.=<<___;
         ldp     x25,x26,[x29,#64]
         ldp     x27,x28,[x29,#80]
         ldp     x29,x30,[sp],#128
+       .inst   0xd50323bf                              // autiasp
         ret
  .size  $func,.-$func
  
@@ -346,15 +347,6 @@ $code.=<<___ if ($SZ==4);
  ___
  $code.=<<___;
  .size  .LK$BITS,.-.LK$BITS
-#ifndef        __KERNEL__
-.align 3
-.LOPENSSL_armcap_P:
-# ifdef        __ILP32__
-       .long   OPENSSL_armcap_P-.
-# else
-       .quad   OPENSSL_armcap_P-.
-# endif
-#endif
  .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  .align 2
  ___
@@ -731,8 +723,110 @@ $code.=<<___;
  ___
  }
  
+if ($SZ==8) {
+my $Ktbl="x3";
+
+my @H = map("v$_.16b",(0..4));
+my ($fg,$de,$m9_10)=map("v$_.16b",(5..7));
+my @MSG=map("v$_.16b",(16..23));
+my ($W0,$W1)=("v24.2d","v25.2d");
+my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29));
+
  $code.=<<___;
  #ifndef        __KERNEL__
+.type  sha512_block_armv8,%function
+.align 6
+sha512_block_armv8:
+.Lv8_entry:
+       stp             x29,x30,[sp,#-16]!
+       add             x29,sp,#0
+
+       ld1             {@MSG[0]-@MSG[3]},[$inp],#64    // load input
+       ld1             {@MSG[4]-@MSG[7]},[$inp],#64
+
+       ld1.64          {@H[0]-@H[3]},[$ctx]            // load context
+       adr             $Ktbl,.LK512
+
+       rev64           @MSG[0],@MSG[0]
+       rev64           @MSG[1],@MSG[1]
+       rev64           @MSG[2],@MSG[2]
+       rev64           @MSG[3],@MSG[3]
+       rev64           @MSG[4],@MSG[4]
+       rev64           @MSG[5],@MSG[5]
+       rev64           @MSG[6],@MSG[6]
+       rev64           @MSG[7],@MSG[7]
+       b               .Loop_hw
+
+.align 4
+.Loop_hw:
+       ld1.64          {$W0},[$Ktbl],#16
+       subs            $num,$num,#1
+       sub             x4,$inp,#128
+       orr             $AB,@H[0],@H[0]                 // offload
+       orr             $CD,@H[1],@H[1]
+       orr             $EF,@H[2],@H[2]
+       orr             $GH,@H[3],@H[3]
+       csel            $inp,$inp,x4,ne                 // conditional rewind
+___
+for($i=0;$i<32;$i++) {
+$code.=<<___;
+       add.i64         $W0,$W0,@MSG[0]
+       ld1.64          {$W1},[$Ktbl],#16
+       ext             $W0,$W0,$W0,#8
+       ext             $fg,@H[2],@H[3],#8
+       ext             $de,@H[1],@H[2],#8
+       add.i64         @H[3],@H[3],$W0                 // "T1 + H + K512[i]"
+        sha512su0      @MSG[0],@MSG[1]
+        ext            $m9_10,@MSG[4],@MSG[5],#8
+       sha512h         @H[3],$fg,$de
+        sha512su1      @MSG[0],@MSG[7],$m9_10
+       add.i64         @H[4],@H[1],@H[3]               // "D + T1"
+       sha512h2        @H[3],$H[1],@H[0]
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+}
+for(;$i<40;$i++) {
+$code.=<<___   if ($i<39);
+       ld1.64          {$W1},[$Ktbl],#16
+___
+$code.=<<___   if ($i==39);
+       sub             $Ktbl,$Ktbl,#$rounds*$SZ        // rewind
+___
+$code.=<<___;
+       add.i64         $W0,$W0,@MSG[0]
+        ld1            {@MSG[0]},[$inp],#16            // load next input
+       ext             $W0,$W0,$W0,#8
+       ext             $fg,@H[2],@H[3],#8
+       ext             $de,@H[1],@H[2],#8
+       add.i64         @H[3],@H[3],$W0                 // "T1 + H + K512[i]"
+       sha512h         @H[3],$fg,$de
+        rev64          @MSG[0],@MSG[0]
+       add.i64         @H[4],@H[1],@H[3]               // "D + T1"
+       sha512h2        @H[3],$H[1],@H[0]
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+       @H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
+}
+$code.=<<___;
+       add.i64         @H[0],@H[0],$AB                 // accumulate
+       add.i64         @H[1],@H[1],$CD
+       add.i64         @H[2],@H[2],$EF
+       add.i64         @H[3],@H[3],$GH
+
+       cbnz            $num,.Loop_hw
+
+       st1.64          {@H[0]-@H[3]},[$ctx]            // store context
+
+       ldr             x29,[sp],#16
+       ret
+.size  sha512_block_armv8,.-sha512_block_armv8
+#endif
+___
+}
+
+$code.=<<___;
+#if !defined(__KERNEL__) && !defined(_WIN64)
  .comm  OPENSSL_armcap_P,4,4
  #endif
  ___
@@ -752,6 +846,21 @@ ___
      }
  }
  
+{   my  %opcode = (
+       "sha512h"       => 0xce608000,  "sha512h2"      => 0xce608400,
+       "sha512su0"     => 0xcec08000,  "sha512su1"     => 0xce608800   );
+
+    sub unsha512 {
+       my ($mnemonic,$arg)=@_;
+
+       $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+       &&
+       sprintf ".inst\t0x%08x\t//%s %s",
+                       $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+                       $mnemonic,$arg;
+    }
+}
+
  open SELF,$0;
  while(<SELF>) {
          next if (/^#!/);
@@ -764,12 +873,15 @@ foreach(split("\n",$code)) {
  
         s/\`([^\`]*)\`/eval($1)/ge;
  
+       s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge   or
         s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
  
         s/\bq([0-9]+)\b/v$1.16b/g;              # old->new registers
  
         s/\.[ui]?8(\s)/$1/;
+       s/\.\w?64\b//           and s/\.16b/\.2d/g      or
         s/\.\w?32\b//           and s/\.16b/\.4s/g;
+       m/\bext\b/              and s/\.2d/\.16b/g      or
         m/(ld|st)1[^\[]+\[0\]/  and s/\.4s/\.s/g;
  
         print $_,"\n";