Add sha/asm/keccak1600-x86_64.pl.

[openssl.git] / crypto / sha / asm / sha1-armv4-large.pl
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl

index c0e3b80f0ea30ea8a44ea02d770528e8c5673868..7ff5bfbba6cbf55c439b5dde2ff2b6c5372363bd 100644 (file)
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  
  # ====================================================================
  # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -60,14 +67,28 @@
  # is ~2.5x larger and there are some redundant instructions executed
  # when processing last block, improvement is not as big for smallest
  # blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
-# byte, which is also >80% faster than integer-only code.
+# byte, which is also >80% faster than integer-only code. Cortex-A15
+# is even faster, spending 5.6 cycles per byte and outperforming
+# integer-only code by a factor of 2.
  
  # May 2014.
  #
  # Add ARMv8 code path performing at 2.35 cpb on Apple A7.
  
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
  
  $ctx="r0";
  $inp="r1";
@@ -167,20 +188,29 @@ $code=<<___;
  #include "arm_arch.h"
  
  .text
+#if defined(__thumb2__)
+.syntax        unified
+.thumb
+#else
  .code  32
+#endif
  
  .global        sha1_block_data_order
  .type  sha1_block_data_order,%function
  
  .align 5
  sha1_block_data_order:
-#if __ARM_ARCH__>=7
-       sub     r3,pc,#8                @ sha1_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.Lsha1_block:
+       adr     r3,.Lsha1_block
         ldr     r12,.LOPENSSL_armcap
         ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
-       tst     r12,#8
+#ifdef __APPLE__
+       ldr     r12,[r12]
+#endif
+       tst     r12,#ARMV8_SHA1
         bne     .LARMv8
-       tst     r12,#1
+       tst     r12,#ARMV7_NEON
         bne     .LNEON
  #endif
         stmdb   sp!,{r4-r12,lr}
@@ -199,7 +229,12 @@ for($i=0;$i<5;$i++) {
         &BODY_00_15(@V);        unshift(@V,pop(@V));
  }
  $code.=<<___;
+#if defined(__thumb2__)
+       mov     $t3,sp
+       teq     $Xi,$t3
+#else
         teq     $Xi,sp
+#endif
         bne     .L_00_15                @ [((11+4)*5+2)*3]
         sub     sp,sp,#25*4
  ___
@@ -218,7 +253,12 @@ for($i=0;$i<5;$i++) {
         &BODY_20_39(@V);        unshift(@V,pop(@V));
  }
  $code.=<<___;
+#if defined(__thumb2__)
+       mov     $t3,sp
+       teq     $Xi,$t3
+#else
         teq     $Xi,sp                  @ preserve carry
+#endif
         bne     .L_20_39_or_60_79       @ [+((12+3)*5+2)*4]
         bcs     .L_done                 @ [+((12+3)*5+2)*4], spare 300 bytes
  
@@ -230,7 +270,12 @@ for($i=0;$i<5;$i++) {
         &BODY_40_59(@V);        unshift(@V,pop(@V));
  }
  $code.=<<___;
+#if defined(__thumb2__)
+       mov     $t3,sp
+       teq     $Xi,$t3
+#else
         teq     $Xi,sp
+#endif
         bne     .L_40_59                @ [+((12+5)*5+2)*4]
  
         ldr     $K,.LK_60_79
@@ -264,8 +309,10 @@ $code.=<<___;
  .LK_20_39:     .word   0x6ed9eba1
  .LK_40_59:     .word   0x8f1bbcdc
  .LK_60_79:     .word   0xca62c1d6
+#if __ARM_MAX_ARCH__>=7
  .LOPENSSL_armcap:
-.word  OPENSSL_armcap_P-sha1_block_data_order
+.word  OPENSSL_armcap_P-.Lsha1_block
+#endif
  .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  .align 5
  ___
@@ -439,6 +486,7 @@ sub Xuplast_80 ()
  
         &teq            ($inp,$len);
         &sub            ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
+       &it             ("eq");
         &subeq          ($inp,$inp,64);         # reload last block to avoid SEGV
         &vld1_8         ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
          eval(shift(@insns));
@@ -476,7 +524,8 @@ sub Xloop()
  }
  
  $code.=<<___;
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+.arch  armv7-a
  .fpu   neon
  
  .type  sha1_block_data_order_neon,%function
@@ -488,12 +537,12 @@ sha1_block_data_order_neon:
         @ dmb                           @ errata #451034 on early Cortex A8
         @ vstmdb        sp!,{d8-d15}    @ ABI specification says so
         mov     $saved_sp,sp
-       sub     sp,sp,#64               @ alloca
+       sub     $Xfer,sp,#64
         adr     $K_XX_XX,.LK_00_19
-       bic     sp,sp,#15               @ align for 128-bit stores
+       bic     $Xfer,$Xfer,#15         @ align for 128-bit stores
  
         ldmia   $ctx,{$a,$b,$c,$d,$e}   @ load context
-       mov     $Xfer,sp
+       mov     sp,$Xfer                @ alloca
  
         vld1.8          {@X[-4&7]-@X[-3&7]},[$inp]!     @ handles unaligned
         veor            $zero,$zero,$zero
@@ -540,10 +589,13 @@ $code.=<<___;
         add     $b,$b,$t0
         add     $c,$c,$t1
         add     $d,$d,$Xfer
+       it      eq
         moveq   sp,$saved_sp
         add     $e,$e,$Ki
+       it      ne
         ldrne   $Ki,[sp]
         stmia   $ctx,{$a,$b,$c,$d,$e}
+       itt     ne
         addne   $Xfer,sp,#3*16
         bne     .Loop_neon
  
@@ -563,7 +615,14 @@ my @Kxx=map("q$_",(8..11));
  my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
  
  $code.=<<___;
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__)
+#  define INST(a,b,c,d)        .byte   c,d|0xf,a,b
+# else
+#  define INST(a,b,c,d)        .byte   a,b,c,d|0x10
+# endif
+
  .type  sha1_block_data_order_armv8,%function
  .align 5
  sha1_block_data_order_armv8:
@@ -631,13 +690,15 @@ $code.=<<___;
         vst1.32         {$E\[0]},[$ctx]
  
         vldmia  sp!,{d8-d15}
-       bx      lr
+       ret                                     @ bx lr
  .size  sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
  #endif
  ___
  }}}
  $code.=<<___;
+#if __ARM_MAX_ARCH__>=7
  .comm  OPENSSL_armcap_P,4,4
+#endif
  ___
  
  {   my  %opcode = (
@@ -648,13 +709,21 @@ ___
      sub unsha1 {
         my ($mnemonic,$arg)=@_;
  
-       $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
-       &&
-       sprintf ".long\t0x%08x\t@ %s %s",
-                       $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
-                                         |(($2&7)<<17)|(($2&8)<<4)
-                                         |(($3&7)<<1) |(($3&8)<<2),
+       if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+           my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+                                        |(($2&7)<<17)|(($2&8)<<4)
+                                        |(($3&7)<<1) |(($3&8)<<2);
+           # Emit the opcode as individual bytes, low byte first, since ARMv7
+           # instructions are always encoded little-endian. The correct solution
+           # is to use the .inst directive, but older assemblers don't implement it:-(
+
+           # this fix-up provides Thumb encoding in conjunction with INST
+           $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
+           sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+                       $word&0xff,($word>>8)&0xff,
+                       ($word>>16)&0xff,($word>>24)&0xff,
                         $mnemonic,$arg;
+       }
      }
  }
  
@@ -664,6 +733,7 @@ foreach (split($/,$code)) {
  
         s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
  
+       s/\bret\b/bx    lr/o            or
         s/\bbx\s+lr\b/.word\t0xe12fff1e/o;      # make it possible to compile with -march=armv4
  
         print $_,$/;