From: Andy Polyakov <appro@openssl.org>
Date: Mon, 20 Apr 2015 12:30:50 +0000 (+0200)
Subject: aes/asm/vpaes-armv8.pl: make it compile on iOS.
X-Git-Tag: OpenSSL_1_1_0-pre1~1306
X-Git-Url: https://git.openssl.org/?p=openssl.git;a=commitdiff_plain;h=35141544e2994f0f3b87be7d7c9a43ea3cd9840a

aes/asm/vpaes-armv8.pl: make it compile on iOS.

Reviewed-by: Richard Levitte <levitte@openssl.org>
---

diff --git a/crypto/aes/asm/vpaes-armv8.pl b/crypto/aes/asm/vpaes-armv8.pl
index 1144536508..bc90b9f3bb 100755
--- a/crypto/aes/asm/vpaes-armv8.pl
+++ b/crypto/aes/asm/vpaes-armv8.pl
@@ -20,11 +20,14 @@
 # Cortex-A53        21.5        18.1/20.6        [17.5/19.8         ]
 # Cortex-A57        36.0(**)    20.4/24.9(**)    [14.4/16.6         ]
 # X-Gene            45.9(**)    45.8/57.7(**)    [33.1/37.6(**)     ]
+# Denver(***)       16.6(**)    15.1/17.8(**)    [8.80/9.93         ]
+# Apple A7(***)     22.7(**)    10.9/14.3        [8.45/10.0         ]
 #
 # (*)	ECB denotes approximate result for parallelizable modes
 #	such as CBC decrypt, CTR, etc.;
 # (**)	these results are worse than scalar compiler-generated
 #	code, but it's constant-time and therefore preferred;
+# (***)	presented for reference/comparison purposes;
 
 $flavour = shift;
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
@@ -206,7 +209,7 @@ _vpaes_encrypt_core:
 	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
 	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
 	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
-	bic	x11, x11, #1<<6			// and		\$0x30,	%r11		# ... mod 4
+	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
 	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
 	sub	w8, w8, #1			// nr--
 
@@ -309,7 +312,7 @@ _vpaes_encrypt_2x:
 	 tbl	v12.16b, {v11.16b},v1.16b
 	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
 	 eor	v8.16b,  v8.16b,  v11.16b
-	bic	x11, x11, #1<<6			// and		\$0x30,	%r11		# ... mod 4
+	and	x11, x11, #~(1<<6)		// and		\$0x30,	%r11		# ... mod 4
 	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
 	 eor	v8.16b,  v8.16b,  v12.16b
 	sub	w8, w8, #1			// nr--
@@ -683,8 +686,8 @@ _vpaes_schedule_core:
 
 .Lschedule_go:
 	cmp	$bits, #192			// cmp	\$192,	%esi
-	bhi	.Lschedule_256
-	beq	.Lschedule_192
+	b.hi	.Lschedule_256
+	b.eq	.Lschedule_192
 	// 128: fall through
 
 ##
@@ -1021,7 +1024,7 @@ _vpaes_schedule_mangle:
 .Lschedule_mangle_both:
 	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
 	add	x8, x8, #64-16			// add	\$-16,	%r8
-	bic	x8, x8, #1<<6			// and	\$0x30,	%r8
+	and	x8, x8, #~(1<<6)		// and	\$0x30,	%r8
 	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3,	(%rdx)
 	ret
 .size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle