X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fpoly1305%2Fasm%2Fpoly1305-armv8.pl;h=ac06457b65301a4dd8b95e8aa30f12f5b02bd389;hp=2e1dae3df238d157a23215ec7d88885cf105c11b;hb=753316232243ccbf86b96c1c51ffcb41651d9ad5;hpb=4b8736a22e758c371bc2f8b3534dc0c274acf42c

diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
index 2e1dae3df2..ac06457b65 100755
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -19,7 +26,9 @@
 # Cortex-A53	2.69/+58%	1.47
 # Cortex-A57	2.70/+7%	1.14
 # Denver	1.64/+50%	1.18(*)
-# X-Gene	2.13/+68%	2.19
+# X-Gene	2.13/+68%	2.27
+# Mongoose	1.77/+75%	1.12
+# Kryo		2.70/+55%	1.13
 #
 # (*)	estimate based on resources availability is less than 1.0,
 #	i.e. measured result is worse than expected, presumably binary
@@ -92,7 +101,11 @@ poly1305_init:
 	csel	$d0,$d0,$r0,eq
 	csel	$d1,$d1,$r1,eq
 
+#ifdef	__ILP32__
+	stp	w12,w13,[$len]
+#else
 	stp	$d0,$d1,[$len]
+#endif
 
 	mov	x0,#1
 .Lno_key:
@@ -507,9 +520,11 @@ poly1305_blocks_neon:
 	fmov	$IN01_1,x6
 	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
 	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
+	movi	$MASK.2d,#-1
 	fmov	$IN01_2,x8
 	fmov	$IN01_3,x10
 	fmov	$IN01_4,x12
+	ushr	$MASK.2d,$MASK.2d,#38
 
 	b.ls	.Lskip_loop
 
@@ -660,41 +675,43 @@ poly1305_blocks_neon:
 	 fmov	$IN01_2,x8
 	umlal	$ACC2,$IN01_4,${S3}[0]
 	 fmov	$IN01_3,x10
+	 fmov	$IN01_4,x12
 
 	/////////////////////////////////////////////////////////////////
 	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-        // and P. Schwabe
+	// and P. Schwabe
+	//
+	// [see discussion in poly1305-armv4 module]
 
 	ushr	$T0.2d,$ACC3,#26
-	 fmov	$IN01_4,x12
 	xtn	$H3,$ACC3
 	 ushr	$T1.2d,$ACC0,#26
-	 xtn	$H0,$ACC0
+	 and	$ACC0,$ACC0,$MASK.2d
 	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
 	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
 	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1
-	 bic	$H0,#0xfc,lsl#24
 
-	shrn	$T0.2s,$ACC4,#26
+	ushr	$T0.2d,$ACC4,#26
 	xtn	$H4,$ACC4
 	 ushr	$T1.2d,$ACC1,#26
 	 xtn	$H1,$ACC1
-	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2
 	bic	$H4,#0xfc,lsl#24
-	 bic	$H1,#0xfc,lsl#24
+	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2
 
-	add	$H0,$H0,$T0.2s
-	shl	$T0.2s,$T0.2s,#2
+	add	$ACC0,$ACC0,$T0.2d
+	shl	$T0.2d,$T0.2d,#2
 	 shrn	$T1.2s,$ACC2,#26
 	 xtn	$H2,$ACC2
-	add	$H0,$H0,$T0.2s		// h4 -> h0
+	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
+	 bic	$H1,#0xfc,lsl#24
 	 add	$H3,$H3,$T1.2s		// h2 -> h3
 	 bic	$H2,#0xfc,lsl#24
 
-	ushr	$T0.2s,$H0,#26
-	bic	$H0,#0xfc,lsl#24
+	shrn	$T0.2s,$ACC0,#26
+	xtn	$H0,$ACC0
 	 ushr	$T1.2s,$H3,#26
 	 bic	$H3,#0xfc,lsl#24
+	 bic	$H0,#0xfc,lsl#24
 	add	$H1,$H1,$T0.2s		// h0 -> h1
 	 add	$H4,$H4,$T1.2s		// h3 -> h4
 
@@ -702,9 +719,7 @@ poly1305_blocks_neon:
 
 .Lskip_loop:
 	dup	$IN23_2,${IN23_2}[0]
-	movi	$MASK.2d,#-1
 	add	$IN01_2,$IN01_2,$H2
-	ushr	$MASK.2d,$MASK.2d,#38
 
 	////////////////////////////////////////////////////////////////
 	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1