projects
/
openssl.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
ARMv8 assembly pack: add Qualcomm Kryo results.
[openssl.git]
/
crypto
/
poly1305
/
asm
/
poly1305-armv8.pl
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
index 2e1dae3df238d157a23215ec7d88885cf105c11b..ac06457b65301a4dd8b95e8aa30f12f5b02bd389 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -19,7 +26,9 @@
# Cortex-A53 2.69/+58% 1.47
# Cortex-A57 2.70/+7% 1.14
# Denver 1.64/+50% 1.18(*)
# Cortex-A53 2.69/+58% 1.47
# Cortex-A57 2.70/+7% 1.14
# Denver 1.64/+50% 1.18(*)
-# X-Gene 2.13/+68% 2.19
+# X-Gene 2.13/+68% 2.27
+# Mongoose 1.77/+75% 1.12
+# Kryo 2.70/+55% 1.13
#
# (*) estimate based on resources availability is less than 1.0,
# i.e. measured result is worse than expected, presumably binary
#
# (*) estimate based on resources availability is less than 1.0,
# i.e. measured result is worse than expected, presumably binary
@@ -92,7 +101,11 @@ poly1305_init:
csel $d0,$d0,$r0,eq
csel $d1,$d1,$r1,eq
csel $d0,$d0,$r0,eq
csel $d1,$d1,$r1,eq
+#ifdef __ILP32__
+ stp w12,w13,[$len]
+#else
stp $d0,$d1,[$len]
stp $d0,$d1,[$len]
+#endif
mov x0,#1
.Lno_key:
mov x0,#1
.Lno_key:
@@ -507,9 +520,11 @@ poly1305_blocks_neon:
fmov $IN01_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov $IN01_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi $MASK.2d,#-1
fmov $IN01_2,x8
fmov $IN01_3,x10
fmov $IN01_4,x12
fmov $IN01_2,x8
fmov $IN01_3,x10
fmov $IN01_4,x12
+ ushr $MASK.2d,$MASK.2d,#38
b.ls .Lskip_loop
b.ls .Lskip_loop
@@ -660,41 +675,43 @@ poly1305_blocks_neon:
fmov $IN01_2,x8
umlal $ACC2,$IN01_4,${S3}[0]
fmov $IN01_3,x10
fmov $IN01_2,x8
umlal $ACC2,$IN01_4,${S3}[0]
fmov $IN01_3,x10
+ fmov $IN01_4,x12
/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- // and P. Schwabe
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
ushr $T0.2d,$ACC3,#26
ushr $T0.2d,$ACC3,#26
- fmov $IN01_4,x12
xtn $H3,$ACC3
ushr $T1.2d,$ACC0,#26
xtn $H3,$ACC3
ushr $T1.2d,$ACC0,#26
- xtn $H0,$ACC0
+ and $ACC0,$ACC0,$MASK.2d
add $ACC4,$ACC4,$T0.2d // h3 -> h4
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
add $ACC1,$ACC1,$T1.2d // h0 -> h1
add $ACC4,$ACC4,$T0.2d // h3 -> h4
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
add $ACC1,$ACC1,$T1.2d // h0 -> h1
- bic $H0,#0xfc,lsl#24
- shrn $T0.2s,$ACC4,#26
+ ushr $T0.2d,$ACC4,#26
xtn $H4,$ACC4
ushr $T1.2d,$ACC1,#26
xtn $H1,$ACC1
xtn $H4,$ACC4
ushr $T1.2d,$ACC1,#26
xtn $H1,$ACC1
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
bic $H4,#0xfc,lsl#24
bic $H4,#0xfc,lsl#24
- bic $H1,#0xfc,lsl#24
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
- add $H0,$H0,$T0.2s
- shl $T0.2s,$T0.2s,#2
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
shrn $T1.2s,$ACC2,#26
xtn $H2,$ACC2
shrn $T1.2s,$ACC2,#26
xtn $H2,$ACC2
- add $H0,$H0,$T0.2s // h4 -> h0
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ bic $H1,#0xfc,lsl#24
add $H3,$H3,$T1.2s // h2 -> h3
bic $H2,#0xfc,lsl#24
add $H3,$H3,$T1.2s // h2 -> h3
bic $H2,#0xfc,lsl#24
- ushr $T0.2s,$H0,#26
- bic $H0,#0xfc,lsl#24
+ shrn $T0.2s,$ACC0,#26
+ xtn $H0,$ACC0
+ xtn $H0,$ACC0
ushr $T1.2s,$H3,#26
bic $H3,#0xfc,lsl#24
ushr $T1.2s,$H3,#26
bic $H3,#0xfc,lsl#24
+ bic $H0,#0xfc,lsl#24
add $H1,$H1,$T0.2s // h0 -> h1
add $H4,$H4,$T1.2s // h3 -> h4
add $H1,$H1,$T0.2s // h0 -> h1
add $H4,$H4,$T1.2s // h3 -> h4
@@ -702,9 +719,7 @@ poly1305_blocks_neon:
.Lskip_loop:
dup $IN23_2,${IN23_2}[0]
.Lskip_loop:
dup $IN23_2,${IN23_2}[0]
- movi $MASK.2d,#-1
add $IN01_2,$IN01_2,$H2
add $IN01_2,$IN01_2,$H2
- ushr $MASK.2d,$MASK.2d,#38
////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1