-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# Cortex-A53 2.69/+58% 1.47
# Cortex-A57 2.70/+7% 1.14
# Denver 1.64/+50% 1.18(*)
-# X-Gene 2.13/+68% 2.19
+# X-Gene 2.13/+68% 2.27
+# Mongoose 1.77/+75% 1.12
+# Kryo 2.70/+55% 1.13
+# ThunderX2 1.17/+95% 1.36
#
# (*) estimate based on resources availability is less than 1.0,
# i.e. measured result is worse than expected, presumably binary
# translator is not almighty;
-$flavour=shift;
-$output=shift;
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
*STDOUT=*OUT;
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
// forward "declarations" are required for Apple
.extern OPENSSL_armcap_P
+.hidden OPENSSL_armcap_P
+.globl poly1305_init
+.hidden poly1305_init
.globl poly1305_blocks
+.hidden poly1305_blocks
.globl poly1305_emit
+.hidden poly1305_emit
-.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
csel x0,xzr,x0,eq
b.eq .Lno_key
-#ifdef __ILP32__
- ldrsw $t1,.LOPENSSL_armcap_P
-#else
- ldr $t1,.LOPENSSL_armcap_P
-#endif
- adr $t0,.LOPENSSL_armcap_P
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
ldp $r0,$r1,[$inp] // load key
mov $s1,#0xfffffffc0fffffff
movk $s1,#0x0fff,lsl#48
- ldr w17,[$t0,$t1]
#ifdef __ARMEB__
rev $r0,$r0 // flip bytes
rev $r1,$r1
tst w17,#ARMV7_NEON
- adr $d0,poly1305_blocks
- adr $r0,poly1305_blocks_neon
- adr $d1,poly1305_emit
- adr $r1,poly1305_emit_neon
+ adr $d0,.Lpoly1305_blocks
+ adr $r0,.Lpoly1305_blocks_neon
+ adr $d1,.Lpoly1305_emit
+ adr $r1,.Lpoly1305_emit_neon
csel $d0,$d0,$r0,eq
csel $d1,$d1,$r1,eq
+#ifdef __ILP32__
+ stp w12,w13,[$len]
+#else
stp $d0,$d1,[$len]
+#endif
mov x0,#1
.Lno_key:
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
+.Lpoly1305_blocks:
ands $len,$len,#-16
b.eq .Lno_data
.type poly1305_emit,%function
.align 5
poly1305_emit:
+.Lpoly1305_emit:
ldp $h0,$h1,[$ctx] // load hash base 2^64
ldr $h2,[$ctx,#16]
ldp $t0,$t1,[$nonce] // load nonce
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
ldr $is_base2_26,[$ctx,#24]
cmp $len,#128
b.hs .Lblocks_neon
- cbz $is_base2_26,poly1305_blocks
+ cbz $is_base2_26,.Lpoly1305_blocks
.Lblocks_neon:
+ .inst 0xd503233f // paciasp
stp x29,x30,[sp,#-80]!
add x29,sp,#0
csel $in2,$zeros,$in2,lo
mov x4,#1
- str x4,[$ctx,#-24] // set is_base2_26
+ stur x4,[$ctx,#-24] // set is_base2_26
sub $ctx,$ctx,#48 // restore original $ctx
b .Ldo_neon
fmov $IN01_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi $MASK.2d,#-1
fmov $IN01_2,x8
fmov $IN01_3,x10
fmov $IN01_4,x12
+ ushr $MASK.2d,$MASK.2d,#38
b.ls .Lskip_loop
fmov $IN01_2,x8
umlal $ACC2,$IN01_4,${S3}[0]
fmov $IN01_3,x10
+ fmov $IN01_4,x12
/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- // and P. Schwabe
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
ushr $T0.2d,$ACC3,#26
- fmov $IN01_4,x12
xtn $H3,$ACC3
ushr $T1.2d,$ACC0,#26
- xtn $H0,$ACC0
+ and $ACC0,$ACC0,$MASK.2d
add $ACC4,$ACC4,$T0.2d // h3 -> h4
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
add $ACC1,$ACC1,$T1.2d // h0 -> h1
- bic $H0,#0xfc,lsl#24
- shrn $T0.2s,$ACC4,#26
+ ushr $T0.2d,$ACC4,#26
xtn $H4,$ACC4
ushr $T1.2d,$ACC1,#26
xtn $H1,$ACC1
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
bic $H4,#0xfc,lsl#24
- bic $H1,#0xfc,lsl#24
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
- add $H0,$H0,$T0.2s
- shl $T0.2s,$T0.2s,#2
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
shrn $T1.2s,$ACC2,#26
xtn $H2,$ACC2
- add $H0,$H0,$T0.2s // h4 -> h0
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ bic $H1,#0xfc,lsl#24
add $H3,$H3,$T1.2s // h2 -> h3
bic $H2,#0xfc,lsl#24
- ushr $T0.2s,$H0,#26
- bic $H0,#0xfc,lsl#24
+ shrn $T0.2s,$ACC0,#26
+ xtn $H0,$ACC0
ushr $T1.2s,$H3,#26
bic $H3,#0xfc,lsl#24
+ bic $H0,#0xfc,lsl#24
add $H1,$H1,$T0.2s // h0 -> h1
add $H4,$H4,$T1.2s // h3 -> h4
.Lskip_loop:
dup $IN23_2,${IN23_2}[0]
- movi $MASK.2d,#-1
add $IN01_2,$IN01_2,$H2
- ushr $MASK.2d,$MASK.2d,#38
////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
st1 {$ACC4}[0],[$ctx]
.Lno_data_neon:
+ .inst 0xd50323bf // autiasp
ldr x29,[sp],#80
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
+.Lpoly1305_emit_neon:
ldr $is_base2_26,[$ctx,#24]
cbz $is_base2_26,poly1305_emit
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-#else
-.quad OPENSSL_armcap_P-.
-#endif
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
print $_,"\n";
}
-close STDOUT;
+close STDOUT or die "error closing STDOUT: $!";