-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
-# byte, which is also >80% faster than integer-only code.
+# byte, which is also >80% faster than integer-only code. Cortex-A15
+# is even faster spending 5.6 cycles per byte outperforming integer-
+# only code by factor of 2.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0";
$inp="r1";
#include "arm_arch.h"
.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
.code 32
+#endif
.global sha1_block_data_order
.type sha1_block_data_order,%function
.align 5
sha1_block_data_order:
-#if __ARM_ARCH__>=7
- sub r3,pc,#8 @ sha1_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.Lsha1_block:
+ adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#8
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
+ tst r12,#ARMV8_SHA1
bne .LARMv8
- tst r12,#1
+ tst r12,#ARMV7_NEON
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr}
&BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp
+#endif
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
___
&BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp @ preserve carry
+#endif
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
&BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp
+#endif
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr $K,.LK_60_79
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
+#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha1_block_data_order
+.word OPENSSL_armcap_P-.Lsha1_block
+#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
&teq ($inp,$len);
&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
+ &it ("eq");
&subeq ($inp,$inp,64); # reload last block to avoid SEGV
&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
eval(shift(@insns));
}
$code.=<<___;
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
.fpu neon
.type sha1_block_data_order_neon,%function
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
- sub sp,sp,#64 @ alloca
+ sub $Xfer,sp,#64
adr $K_XX_XX,.LK_00_19
- bic sp,sp,#15 @ align for 128-bit stores
+ bic $Xfer,$Xfer,#15 @ align for 128-bit stores
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
- mov $Xfer,sp
+ mov sp,$Xfer @ alloca
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
+ it eq
moveq sp,$saved_sp
add $e,$e,$Ki
+ it ne
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
+ itt ne
addne $Xfer,sp,#3*16
bne .Loop_neon
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
$code.=<<___;
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__)
+# define INST(a,b,c,d) .byte c,d|0xf,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d|0x10
+# endif
+
.type sha1_block_data_order_armv8,%function
.align 5
sha1_block_data_order_armv8:
vst1.32 {$E\[0]},[$ctx]
vldmia sp!,{d8-d15}
- bx lr
+ ret @ bx lr
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
+#endif
___
{ my %opcode = (
sub unsha1 {
my ($mnemonic,$arg)=@_;
- $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
- &&
- sprintf ".long\t0x%08x\t@ %s %s",
- $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
- |(($2&7)<<17)|(($2&8)<<4)
- |(($3&7)<<1) |(($3&8)<<2),
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+
+ # this fix-up provides Thumb encoding in conjunction with INST
+ $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
+ }
}
}
s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
+ s/\bret\b/bx lr/o or
s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
print $_,$/;