From db42bb440e76399b89fc8ae04644441a2a5f6821 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Fri, 15 Feb 2019 22:16:41 +0100 Subject: [PATCH] ARM64 assembly pack: make it Windows-friendly. "Windows friendliness" means a) unified PIC-ification, unified across all platforms; b) unified commantary delimiter; c) explicit ldur/stur, as Visual Studio assembler can't automatically encode ldr/str as ldur/stur when needed. Reviewed-by: Paul Dale Reviewed-by: Richard Levitte (Merged from https://github.com/openssl/openssl/pull/8256) --- crypto/aes/asm/vpaes-armv8.pl | 276 +++++++++++++------------- crypto/bn/asm/armv8-mont.pl | 16 +- crypto/chacha/asm/chacha-armv8.pl | 19 +- crypto/ec/asm/ecp_nistz256-armv8.pl | 6 +- crypto/perlasm/arm-xlate.pl | 10 + crypto/poly1305/asm/poly1305-armv8.pl | 31 ++- crypto/sha/asm/sha1-armv8.pl | 30 +-- crypto/sha/asm/sha512-armv8.pl | 23 +-- 8 files changed, 191 insertions(+), 220 deletions(-) diff --git a/crypto/aes/asm/vpaes-armv8.pl b/crypto/aes/asm/vpaes-armv8.pl index ece9f20bec..f08ae58383 100755 --- a/crypto/aes/asm/vpaes-armv8.pl +++ b/crypto/aes/asm/vpaes-armv8.pl @@ -150,12 +150,12 @@ my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); $code.=<<___; -## -## _aes_preheat -## -## Fills register %r10 -> .aes_consts (so you can -fPIC) -## and %xmm9-%xmm15 as specified below. -## +// +// _aes_preheat +// +// Fills register %r10 -> .aes_consts (so you can -fPIC) +// and %xmm9-%xmm15 as specified below. +// .type _vpaes_encrypt_preheat,%function .align 4 _vpaes_encrypt_preheat: @@ -167,21 +167,21 @@ _vpaes_encrypt_preheat: ret .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat -## -## _aes_encrypt_core -## -## AES-encrypt %xmm0. -## -## Inputs: -## %xmm0 = input -## %xmm9-%xmm15 as in _vpaes_preheat -## (%rdx) = scheduled keys -## -## Output in %xmm0 -## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax -## Preserves %xmm6 - %xmm8 so you get some local vectors -## -## +// +// _aes_encrypt_core +// +// AES-encrypt %xmm0. +// +// Inputs: +// %xmm0 = input +// %xmm9-%xmm15 as in _vpaes_preheat +// (%rdx) = scheduled keys +// +// Output in %xmm0 +// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +// Preserves %xmm6 - %xmm8 so you get some local vectors +// +// .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: @@ -387,11 +387,11 @@ _vpaes_decrypt_preheat: ret .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat -## -## Decryption core -## -## Same API as encryption core. -## +// +// Decryption core +// +// Same API as encryption core. +// .type _vpaes_decrypt_core,%function .align 4 _vpaes_decrypt_core: @@ -643,11 +643,11 @@ my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); $code.=<<___; -######################################################## -## ## -## AES key schedule ## -## ## -######################################################## +//////////////////////////////////////////////////////// +// // +// AES key schedule // +// // +//////////////////////////////////////////////////////// .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: @@ -703,14 +703,14 @@ _vpaes_schedule_core: b.eq .Lschedule_192 // 128: fall though -## -## .schedule_128 -## -## 128-bit specific part of key schedule. -## -## This schedule is really simple, because all its parts -## are accomplished by the subroutines. -## +// +// .schedule_128 +// +// 128-bit specific part of key schedule. 
+// +// This schedule is really simple, because all its parts +// are accomplished by the subroutines. +// .Lschedule_128: mov $inp, #10 // mov \$10, %esi @@ -721,21 +721,21 @@ _vpaes_schedule_core: bl _vpaes_schedule_mangle // write output b .Loop_schedule_128 -## -## .aes_schedule_192 -## -## 192-bit specific part of key schedule. -## -## The main body of this schedule is the same as the 128-bit -## schedule, but with more smearing. The long, high side is -## stored in %xmm7 as before, and the short, low side is in -## the high bits of %xmm6. -## -## This schedule is somewhat nastier, however, because each -## round produces 192 bits of key material, or 1.5 round keys. -## Therefore, on each cycle we do 2 rounds and produce 3 round -## keys. -## +// +// .aes_schedule_192 +// +// 192-bit specific part of key schedule. +// +// The main body of this schedule is the same as the 128-bit +// schedule, but with more smearing. The long, high side is +// stored in %xmm7 as before, and the short, low side is in +// the high bits of %xmm6. +// +// This schedule is somewhat nastier, however, because each +// round produces 192 bits of key material, or 1.5 round keys. +// Therefore, on each cycle we do 2 rounds and produce 3 round +// keys. +// .align 4 .Lschedule_192: sub $inp, $inp, #8 @@ -759,16 +759,16 @@ _vpaes_schedule_core: bl _vpaes_schedule_192_smear b .Loop_schedule_192 -## -## .aes_schedule_256 -## -## 256-bit specific part of key schedule. -## -## The structure here is very similar to the 128-bit -## schedule, but with an additional "low side" in -## %xmm6. The low side's rounds are the same as the -## high side's, except no rcon and no rotation. -## +// +// .aes_schedule_256 +// +// 256-bit specific part of key schedule. +// +// The structure here is very similar to the 128-bit +// schedule, but with an additional "low side" in +// %xmm6. The low side's rounds are the same as the +// high side's, except no rcon and no rotation. +// .align 4 .Lschedule_256: ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) @@ -795,16 +795,16 @@ _vpaes_schedule_core: b .Loop_schedule_256 -## -## .aes_schedule_mangle_last -## -## Mangler for last round of key schedule -## Mangles %xmm0 -## when encrypting, outputs out(%xmm0) ^ 63 -## when decrypting, outputs unskew(%xmm0) -## -## Always called right before return... jumps to cleanup and exits -## +// +// .aes_schedule_mangle_last +// +// Mangler for last round of key schedule +// Mangles %xmm0 +// when encrypting, outputs out(%xmm0) ^ 63 +// when decrypting, outputs unskew(%xmm0) +// +// Always called right before return... jumps to cleanup and exits +// .align 4 .Lschedule_mangle_last: // schedule last round key from xmm0 @@ -838,20 +838,20 @@ _vpaes_schedule_core: ret .size _vpaes_schedule_core,.-_vpaes_schedule_core -## -## .aes_schedule_192_smear -## -## Smear the short, low side in the 192-bit key schedule. -## -## Inputs: -## %xmm7: high side, b a x y -## %xmm6: low side, d c 0 0 -## %xmm13: 0 -## -## Outputs: -## %xmm6: b+c+d b+c 0 0 -## %xmm0: b+c+d b+c b a -## +// +// .aes_schedule_192_smear +// +// Smear the short, low side in the 192-bit key schedule. 
+// +// Inputs: +// %xmm7: high side, b a x y +// %xmm6: low side, d c 0 0 +// %xmm13: 0 +// +// Outputs: +// %xmm6: b+c+d b+c 0 0 +// %xmm0: b+c+d b+c b a +// .type _vpaes_schedule_192_smear,%function .align 4 _vpaes_schedule_192_smear: @@ -867,24 +867,24 @@ _vpaes_schedule_192_smear: ret .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear -## -## .aes_schedule_round -## -## Runs one main round of the key schedule on %xmm0, %xmm7 -## -## Specifically, runs subbytes on the high dword of %xmm0 -## then rotates it by one byte and xors into the low dword of -## %xmm7. -## -## Adds rcon from low byte of %xmm8, then rotates %xmm8 for -## next rcon. -## -## Smears the dwords of %xmm7 by xoring the low into the -## second low, result into third, result into highest. -## -## Returns results in %xmm7 = %xmm0. -## Clobbers %xmm1-%xmm4, %r11. -## +// +// .aes_schedule_round +// +// Runs one main round of the key schedule on %xmm0, %xmm7 +// +// Specifically, runs subbytes on the high dword of %xmm0 +// then rotates it by one byte and xors into the low dword of +// %xmm7. +// +// Adds rcon from low byte of %xmm8, then rotates %xmm8 for +// next rcon. +// +// Smears the dwords of %xmm7 by xoring the low into the +// second low, result into third, result into highest. +// +// Returns results in %xmm7 = %xmm0. +// Clobbers %xmm1-%xmm4, %r11. +// .type _vpaes_schedule_round,%function .align 4 _vpaes_schedule_round: @@ -932,15 +932,15 @@ _vpaes_schedule_low_round: ret .size _vpaes_schedule_round,.-_vpaes_schedule_round -## -## .aes_schedule_transform -## -## Linear-transform %xmm0 according to tables at (%r11) -## -## Requires that %xmm9 = 0x0F0F... as in preheat -## Output in %xmm0 -## Clobbers %xmm1, %xmm2 -## +// +// .aes_schedule_transform +// +// Linear-transform %xmm0 according to tables at (%r11) +// +// Requires that %xmm9 = 0x0F0F... as in preheat +// Output in %xmm0 +// Clobbers %xmm1, %xmm2 +// .type _vpaes_schedule_transform,%function .align 4 _vpaes_schedule_transform: @@ -954,29 +954,29 @@ _vpaes_schedule_transform: ret .size _vpaes_schedule_transform,.-_vpaes_schedule_transform -## -## .aes_schedule_mangle -## -## Mangle xmm0 from (basis-transformed) standard version -## to our version. -## -## On encrypt, -## xor with 0x63 -## multiply by circulant 0,1,1,1 -## apply shiftrows transform -## -## On decrypt, -## xor with 0x63 -## multiply by "inverse mixcolumns" circulant E,B,D,9 -## deskew -## apply shiftrows transform -## -## -## Writes out to (%rdx), and increments or decrements it -## Keeps track of round number mod 4 in %r8 -## Preserves xmm0 -## Clobbers xmm1-xmm5 -## +// +// .aes_schedule_mangle +// +// Mangle xmm0 from (basis-transformed) standard version +// to our version. 
+// +// On encrypt, +// xor with 0x63 +// multiply by circulant 0,1,1,1 +// apply shiftrows transform +// +// On decrypt, +// xor with 0x63 +// multiply by "inverse mixcolumns" circulant E,B,D,9 +// deskew +// apply shiftrows transform +// +// +// Writes out to (%rdx), and increments or decrements it +// Keeps track of round number mod 4 in %r8 +// Preserves xmm0 +// Clobbers xmm1-xmm5 +// .type _vpaes_schedule_mangle,%function .align 4 _vpaes_schedule_mangle: diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl index c09c783e51..c755555d88 100755 --- a/crypto/bn/asm/armv8-mont.pl +++ b/crypto/bn/asm/armv8-mont.pl @@ -197,7 +197,7 @@ bn_mul_mont: mul $nlo,$nj,$m1 // np[j]*m1 adds $lo1,$lo1,$lo0 umulh $nhi,$nj,$m1 - str $lo1,[$tp,#-16] // tp[j-1] + stur $lo1,[$tp,#-16] // tp[j-1] cbnz $j,.Linner .Linner_skip: @@ -253,13 +253,13 @@ bn_mul_mont: csel $nj,$tj,$aj,lo // did it borrow? ldr $tj,[$tp],#8 ldr $aj,[$rp],#8 - str xzr,[$tp,#-16] // wipe tp - str $nj,[$rp,#-16] + stur xzr,[$tp,#-16] // wipe tp + stur $nj,[$rp,#-16] cbnz $num,.Lcond_copy csel $nj,$tj,$aj,lo - str xzr,[$tp,#-8] // wipe tp - str $nj,[$rp,#-8] + stur xzr,[$tp,#-8] // wipe tp + stur $nj,[$rp,#-8] ldp x19,x20,[x29,#16] mov sp,x29 @@ -596,7 +596,7 @@ __bn_sqr8x_mont: ldp $a4,$a5,[$tp,#8*4] ldp $a6,$a7,[$tp,#8*6] adds $acc0,$acc0,$a0 - ldr $n0,[$rp,#-8*8] + ldur $n0,[$rp,#-8*8] adcs $acc1,$acc1,$a1 ldp $a0,$a1,[$ap,#8*0] adcs $acc2,$acc2,$a2 @@ -794,7 +794,7 @@ $code.=<<___; //adc $carry,xzr,xzr // moved below cbz $cnt,.Lsqr8x8_post_condition - ldr $n0,[$tp,#-8*8] + ldur $n0,[$tp,#-8*8] ldp $a0,$a1,[$np,#8*0] ldp $a2,$a3,[$np,#8*2] ldp $a4,$a5,[$np,#8*4] @@ -852,7 +852,7 @@ $code.=<<___; ldp $a6,$a7,[$tp,#8*6] cbz $cnt,.Lsqr8x_tail_break - ldr $n0,[$rp,#-8*8] + ldur $n0,[$rp,#-8*8] adds $acc0,$acc0,$a0 adcs $acc1,$acc1,$a1 ldp $a0,$a1,[$np,#8*0] diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl index cee787d4d6..56ba1c36ba 100755 --- a/crypto/chacha/asm/chacha-armv8.pl +++ b/crypto/chacha/asm/chacha-armv8.pl @@ -131,12 +131,6 @@ $code.=<<___; .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: .long 1,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by " .globl ChaCha20_ctr32 @@ -144,17 +138,13 @@ $code.=<<___; .align 5 ChaCha20_ctr32: cbz $len,.Labort - adr @x[0],.LOPENSSL_armcap_P cmp $len,#192 b.lo .Lshort -#ifdef __ILP32__ - ldrsw @x[1],[@x[0]] -#else - ldr @x[1],[@x[0]] -#endif - ldr w17,[@x[1],@x[0]] + + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] tst w17,#ARMV7_NEON - b.ne ChaCha20_neon + b.ne .LChaCha20_neon .Lshort: .inst 0xd503233f // paciasp @@ -380,6 +370,7 @@ $code.=<<___; .type ChaCha20_neon,%function .align 5 ChaCha20_neon: +.LChaCha20_neon: .inst 0xd503233f // paciasp stp x29,x30,[sp,#-96]! 
add x29,sp,#0 diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl index d84f523cd1..8914f1a619 100644 --- a/crypto/ec/asm/ecp_nistz256-armv8.pl +++ b/crypto/ec/asm/ecp_nistz256-armv8.pl @@ -1654,7 +1654,7 @@ ecp_nistz256_scatter_w5: ldp x4,x5,[$inp] // X ldp x6,x7,[$inp,#16] - str w4,[$out,#64*0-4] + stur w4,[$out,#64*0-4] lsr x4,x4,#32 str w5,[$out,#64*1-4] lsr x5,x5,#32 @@ -1670,7 +1670,7 @@ ecp_nistz256_scatter_w5: ldp x4,x5,[$inp,#32] // Y ldp x6,x7,[$inp,#48] - str w4,[$out,#64*0-4] + stur w4,[$out,#64*0-4] lsr x4,x4,#32 str w5,[$out,#64*1-4] lsr x5,x5,#32 @@ -1686,7 +1686,7 @@ ecp_nistz256_scatter_w5: ldp x4,x5,[$inp,#64] // Z ldp x6,x7,[$inp,#80] - str w4,[$out,#64*0-4] + stur w4,[$out,#64*0-4] lsr x4,x4,#32 str w5,[$out,#64*1-4] lsr x5,x5,#32 diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl index b953f1fc19..af6f7d3188 100755 --- a/crypto/perlasm/arm-xlate.pl +++ b/crypto/perlasm/arm-xlate.pl @@ -103,6 +103,12 @@ my $asciz = sub { { ""; } }; +my $adrp = sub { + my ($args,$comment) = split(m|\s*//|,shift); + "\tadrp\t$args\@PAGE"; +} if ($flavour =~ /ios64/); + + sub range { my ($r,$sfx,$start,$end) = @_; @@ -132,6 +138,10 @@ sub expand_line { $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + if ($flavour =~ /ios64/) { + $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/; + } + return $line; } diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl index 1aded5a275..b7aa7dc90b 100755 --- a/crypto/poly1305/asm/poly1305-armv8.pl +++ b/crypto/poly1305/asm/poly1305-armv8.pl @@ -71,17 +71,12 @@ poly1305_init: csel x0,xzr,x0,eq b.eq .Lno_key -#ifdef __ILP32__ - ldrsw $t1,.LOPENSSL_armcap_P -#else - ldr $t1,.LOPENSSL_armcap_P -#endif - adr $t0,.LOPENSSL_armcap_P + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] ldp $r0,$r1,[$inp] // load key mov $s1,#0xfffffffc0fffffff movk $s1,#0x0fff,lsl#48 - ldr w17,[$t0,$t1] #ifdef __ARMEB__ rev $r0,$r0 // flip bytes rev $r1,$r1 @@ -93,10 +88,10 @@ poly1305_init: tst w17,#ARMV7_NEON - adr $d0,poly1305_blocks - adr $r0,poly1305_blocks_neon - adr $d1,poly1305_emit - adr $r1,poly1305_emit_neon + adr $d0,.Lpoly1305_blocks + adr $r0,.Lpoly1305_blocks_neon + adr $d1,.Lpoly1305_emit + adr $r1,.Lpoly1305_emit_neon csel $d0,$d0,$r0,eq csel $d1,$d1,$r1,eq @@ -115,6 +110,7 @@ poly1305_init: .type poly1305_blocks,%function .align 5 poly1305_blocks: +.Lpoly1305_blocks: ands $len,$len,#-16 b.eq .Lno_data @@ -179,6 +175,7 @@ poly1305_blocks: .type poly1305_emit,%function .align 5 poly1305_emit: +.Lpoly1305_emit: ldp $h0,$h1,[$ctx] // load hash base 2^64 ldr $h2,[$ctx,#16] ldp $t0,$t1,[$nonce] // load nonce @@ -285,10 +282,11 @@ poly1305_splat: .type poly1305_blocks_neon,%function .align 5 poly1305_blocks_neon: +.Lpoly1305_blocks_neon: ldr $is_base2_26,[$ctx,#24] cmp $len,#128 b.hs .Lblocks_neon - cbz $is_base2_26,poly1305_blocks + cbz $is_base2_26,.Lpoly1305_blocks .Lblocks_neon: .inst 0xd503233f // paciasp @@ -431,7 +429,7 @@ poly1305_blocks_neon: csel $in2,$zeros,$in2,lo mov x4,#1 - str x4,[$ctx,#-24] // set is_base2_26 + stur x4,[$ctx,#-24] // set is_base2_26 sub $ctx,$ctx,#48 // restore original $ctx b .Ldo_neon @@ -868,6 +866,7 @@ poly1305_blocks_neon: .type poly1305_emit_neon,%function .align 5 poly1305_emit_neon: +.Lpoly1305_emit_neon: ldr $is_base2_26,[$ctx,#24] cbz $is_base2_26,poly1305_emit @@ -920,12 +919,6 @@ poly1305_emit_neon: .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. 
-#endif .asciz "Poly1305 for ARMv8, CRYPTOGAMS by " .align 2 ___ diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl index a695b2c474..7a0cbf539b 100644 --- a/crypto/sha/asm/sha1-armv8.pl +++ b/crypto/sha/asm/sha1-armv8.pl @@ -58,10 +58,10 @@ $code.=<<___ if ($i<15 && !($i&1)); lsr @Xx[$i+1],@Xx[$i],#32 ___ $code.=<<___ if ($i<14 && !($i&1)); - ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`] + ldur @Xx[$i+2],[$inp,#`($i+2)*4-64`] ___ $code.=<<___ if ($i<14 && ($i&1)); -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror @Xx[$i+1],@Xx[$i+1],#32 #else rev32 @Xx[$i+1],@Xx[$i+1] @@ -171,23 +171,19 @@ ___ } $code.=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armcap_P +#endif .text -.extern OPENSSL_armcap_P .globl sha1_block_data_order .type sha1_block_data_order,%function .align 6 sha1_block_data_order: -#ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -#else - ldr x16,.LOPENSSL_armcap_P -#endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA1 b.ne .Lv8_entry @@ -208,7 +204,7 @@ sha1_block_data_order: movz $K,#0x7999 sub $num,$num,#1 movk $K,#0x5a82,lsl#16 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror $Xx[0],@Xx[0],#32 #else rev32 @Xx[0],@Xx[0] @@ -321,15 +317,11 @@ $code.=<<___; .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by " .align 2 +#if !defined(__KERNELL__) && !defined(_WIN64) .comm OPENSSL_armcap_P,4,4 +#endif ___ }}} diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl index 6540a9f3a6..f7c67219ed 100644 --- a/crypto/sha/asm/sha512-armv8.pl +++ b/crypto/sha/asm/sha512-armv8.pl @@ -188,24 +188,18 @@ ___ $code.=<<___; #ifndef __KERNEL__ # include "arm_arch.h" +.extern OPENSSL_armcap_P #endif .text -.extern OPENSSL_armcap_P .globl $func .type $func,%function .align 6 $func: #ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] ___ $code.=<<___ if ($SZ==4); tst w16,#ARMV8_SHA256 @@ -353,15 +347,6 @@ $code.=<<___ if ($SZ==4); ___ $code.=<<___; .size .LK$BITS,.-.LK$BITS -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ - .long OPENSSL_armcap_P-. -# else - .quad OPENSSL_armcap_P-. -# endif -#endif .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " .align 2 ___ @@ -841,7 +826,7 @@ ___ } $code.=<<___; -#ifndef __KERNEL__ +#if !defined(__KERNEL__) && !defined(_WIN64) .comm OPENSSL_armcap_P,4,4 #endif ___ -- 2.34.1
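The unified PIC-ification above drops the per-file .LOPENSSL_armcap_P offset words (and the adr/ldr/add arithmetic around them) in favour of the adrp/:lo12: idiom, and teaches arm-xlate.pl to rewrite that idiom into the @PAGE/@PAGEOFF syntax the iOS toolchain expects. Below is a minimal standalone Perl sketch of that mapping; the script name and command-line handling are illustrative only and do not mirror arm-xlate.pl's driver, just its two substitutions.

#!/usr/bin/env perl
# adrp-map.pl (hypothetical name): approximate the ios64 handling added to
# arm-xlate.pl for the unified "adrp reg,sym" / "#:lo12:sym" idiom.
# Input is assumed to be already-expanded assembly, one instruction per line.
use strict;
use warnings;

my $flavour = shift || "ios64";

while (my $line = <STDIN>) {
    if ($flavour =~ /ios64/) {
        # adrp x16,OPENSSL_armcap_P  ->  adrp x16,OPENSSL_armcap_P@PAGE
        $line =~ s/^(\s*adrp\s+x\d+\s*,\s*)([\w\.]+)/$1$2\@PAGE/;
        # [x16,#:lo12:OPENSSL_armcap_P]  ->  [x16,OPENSSL_armcap_P@PAGEOFF]
        $line =~ s/#:lo12:([\w\.]+)/$1\@PAGEOFF/;
    }
    print $line;
}

Fed the two instructions used throughout this patch with the ios64 flavour, it prints "adrp x16,OPENSSL_armcap_P@PAGE" and "ldr w16,[x16,OPENSSL_armcap_P@PAGEOFF]", roughly what the new arm-xlate.pl rules emit; other flavours pass the unified syntax through unchanged.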
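The ldr/str to ldur/stur substitutions, for their part, only touch accesses whose immediate cannot be encoded in the scaled, unsigned-offset form (negative offsets such as #-16 or #-8*8): GNU-style assemblers quietly fall back to the unscaled LDUR/STUR encoding there, whereas, as the commit message notes, the Visual Studio assembler does not, so the unscaled mnemonic has to be spelled out. A hypothetical perlasm-style fragment, not taken from any of the files above and using assumed register names, contrasts the two forms:

#!/usr/bin/env perl
# Hypothetical illustration: which store offsets force the explicit
# unscaled stur spelling for armasm64.
use strict;
use warnings;

my $tp = "x3";          # stand-in for a pointer register such as $tp above
my $code = "";

$code .= <<___;
	stur	x4,[$tp,#-16]	// negative offset: only the unscaled encoding fits
	str	x4,[$tp,#16]	// non-negative multiple of 8: scaled str encodes as is
___

print $code;

Offsets that already satisfy the scaled-form constraints, like the #16 above or the #64*1-4 stores left as plain str in ecp_nistz256, are deliberately left untouched by the patch.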