From db42bb440e76399b89fc8ae04644441a2a5f6821 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Fri, 15 Feb 2019 22:16:41 +0100 Subject: [PATCH] ARM64 assembly pack: make it Windows-friendly. "Windows friendliness" means a) unified PIC-ification, unified across all platforms; b) unified commantary delimiter; c) explicit ldur/stur, as Visual Studio assembler can't automatically encode ldr/str as ldur/stur when needed. Reviewed-by: Paul Dale Reviewed-by: Richard Levitte (Merged from https://github.com/openssl/openssl/pull/8256) --- crypto/aes/asm/vpaes-armv8.pl | 276 +++++++++++++------------- crypto/bn/asm/armv8-mont.pl | 16 +- crypto/chacha/asm/chacha-armv8.pl | 19 +- crypto/ec/asm/ecp_nistz256-armv8.pl | 6 +- crypto/perlasm/arm-xlate.pl | 10 + crypto/poly1305/asm/poly1305-armv8.pl | 31 ++- crypto/sha/asm/sha1-armv8.pl | 30 +-- crypto/sha/asm/sha512-armv8.pl | 23 +-- 8 files changed, 191 insertions(+), 220 deletions(-) diff --git a/crypto/aes/asm/vpaes-armv8.pl b/crypto/aes/asm/vpaes-armv8.pl index ece9f20bec..f08ae58383 100755 --- a/crypto/aes/asm/vpaes-armv8.pl +++ b/crypto/aes/asm/vpaes-armv8.pl @@ -150,12 +150,12 @@ my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); $code.=<<___; -## -## _aes_preheat -## -## Fills register %r10 -> .aes_consts (so you can -fPIC) -## and %xmm9-%xmm15 as specified below. -## +// +// _aes_preheat +// +// Fills register %r10 -> .aes_consts (so you can -fPIC) +// and %xmm9-%xmm15 as specified below. +// .type _vpaes_encrypt_preheat,%function .align 4 _vpaes_encrypt_preheat: @@ -167,21 +167,21 @@ _vpaes_encrypt_preheat: ret .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat -## -## _aes_encrypt_core -## -## AES-encrypt %xmm0. -## -## Inputs: -## %xmm0 = input -## %xmm9-%xmm15 as in _vpaes_preheat -## (%rdx) = scheduled keys -## -## Output in %xmm0 -## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax -## Preserves %xmm6 - %xmm8 so you get some local vectors -## -## +// +// _aes_encrypt_core +// +// AES-encrypt %xmm0. +// +// Inputs: +// %xmm0 = input +// %xmm9-%xmm15 as in _vpaes_preheat +// (%rdx) = scheduled keys +// +// Output in %xmm0 +// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +// Preserves %xmm6 - %xmm8 so you get some local vectors +// +// .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: @@ -387,11 +387,11 @@ _vpaes_decrypt_preheat: ret .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat -## -## Decryption core -## -## Same API as encryption core. -## +// +// Decryption core +// +// Same API as encryption core. +// .type _vpaes_decrypt_core,%function .align 4 _vpaes_decrypt_core: @@ -643,11 +643,11 @@ my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); $code.=<<___; -######################################################## -## ## -## AES key schedule ## -## ## -######################################################## +//////////////////////////////////////////////////////// +// // +// AES key schedule // +// // +//////////////////////////////////////////////////////// .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: @@ -703,14 +703,14 @@ _vpaes_schedule_core: b.eq .Lschedule_192 // 128: fall though -## -## .schedule_128 -## -## 128-bit specific part of key schedule. -## -## This schedule is really simple, because all its parts -## are accomplished by the subroutines. -## +// +// .schedule_128 +// +// 128-bit specific part of key schedule. 
+// +// This schedule is really simple, because all its parts +// are accomplished by the subroutines. +// .Lschedule_128: mov $inp, #10 // mov \$10, %esi @@ -721,21 +721,21 @@ _vpaes_schedule_core: bl _vpaes_schedule_mangle // write output b .Loop_schedule_128 -## -## .aes_schedule_192 -## -## 192-bit specific part of key schedule. -## -## The main body of this schedule is the same as the 128-bit -## schedule, but with more smearing. The long, high side is -## stored in %xmm7 as before, and the short, low side is in -## the high bits of %xmm6. -## -## This schedule is somewhat nastier, however, because each -## round produces 192 bits of key material, or 1.5 round keys. -## Therefore, on each cycle we do 2 rounds and produce 3 round -## keys. -## +// +// .aes_schedule_192 +// +// 192-bit specific part of key schedule. +// +// The main body of this schedule is the same as the 128-bit +// schedule, but with more smearing. The long, high side is +// stored in %xmm7 as before, and the short, low side is in +// the high bits of %xmm6. +// +// This schedule is somewhat nastier, however, because each +// round produces 192 bits of key material, or 1.5 round keys. +// Therefore, on each cycle we do 2 rounds and produce 3 round +// keys. +// .align 4 .Lschedule_192: sub $inp, $inp, #8 @@ -759,16 +759,16 @@ _vpaes_schedule_core: bl _vpaes_schedule_192_smear b .Loop_schedule_192 -## -## .aes_schedule_256 -## -## 256-bit specific part of key schedule. -## -## The structure here is very similar to the 128-bit -## schedule, but with an additional "low side" in -## %xmm6. The low side's rounds are the same as the -## high side's, except no rcon and no rotation. -## +// +// .aes_schedule_256 +// +// 256-bit specific part of key schedule. +// +// The structure here is very similar to the 128-bit +// schedule, but with an additional "low side" in +// %xmm6. The low side's rounds are the same as the +// high side's, except no rcon and no rotation. +// .align 4 .Lschedule_256: ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) @@ -795,16 +795,16 @@ _vpaes_schedule_core: b .Loop_schedule_256 -## -## .aes_schedule_mangle_last -## -## Mangler for last round of key schedule -## Mangles %xmm0 -## when encrypting, outputs out(%xmm0) ^ 63 -## when decrypting, outputs unskew(%xmm0) -## -## Always called right before return... jumps to cleanup and exits -## +// +// .aes_schedule_mangle_last +// +// Mangler for last round of key schedule +// Mangles %xmm0 +// when encrypting, outputs out(%xmm0) ^ 63 +// when decrypting, outputs unskew(%xmm0) +// +// Always called right before return... jumps to cleanup and exits +// .align 4 .Lschedule_mangle_last: // schedule last round key from xmm0 @@ -838,20 +838,20 @@ _vpaes_schedule_core: ret .size _vpaes_schedule_core,.-_vpaes_schedule_core -## -## .aes_schedule_192_smear -## -## Smear the short, low side in the 192-bit key schedule. -## -## Inputs: -## %xmm7: high side, b a x y -## %xmm6: low side, d c 0 0 -## %xmm13: 0 -## -## Outputs: -## %xmm6: b+c+d b+c 0 0 -## %xmm0: b+c+d b+c b a -## +// +// .aes_schedule_192_smear +// +// Smear the short, low side in the 192-bit key schedule. 
+// +// Inputs: +// %xmm7: high side, b a x y +// %xmm6: low side, d c 0 0 +// %xmm13: 0 +// +// Outputs: +// %xmm6: b+c+d b+c 0 0 +// %xmm0: b+c+d b+c b a +// .type _vpaes_schedule_192_smear,%function .align 4 _vpaes_schedule_192_smear: @@ -867,24 +867,24 @@ _vpaes_schedule_192_smear: ret .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear -## -## .aes_schedule_round -## -## Runs one main round of the key schedule on %xmm0, %xmm7 -## -## Specifically, runs subbytes on the high dword of %xmm0 -## then rotates it by one byte and xors into the low dword of -## %xmm7. -## -## Adds rcon from low byte of %xmm8, then rotates %xmm8 for -## next rcon. -## -## Smears the dwords of %xmm7 by xoring the low into the -## second low, result into third, result into highest. -## -## Returns results in %xmm7 = %xmm0. -## Clobbers %xmm1-%xmm4, %r11. -## +// +// .aes_schedule_round +// +// Runs one main round of the key schedule on %xmm0, %xmm7 +// +// Specifically, runs subbytes on the high dword of %xmm0 +// then rotates it by one byte and xors into the low dword of +// %xmm7. +// +// Adds rcon from low byte of %xmm8, then rotates %xmm8 for +// next rcon. +// +// Smears the dwords of %xmm7 by xoring the low into the +// second low, result into third, result into highest. +// +// Returns results in %xmm7 = %xmm0. +// Clobbers %xmm1-%xmm4, %r11. +// .type _vpaes_schedule_round,%function .align 4 _vpaes_schedule_round: @@ -932,15 +932,15 @@ _vpaes_schedule_low_round: ret .size _vpaes_schedule_round,.-_vpaes_schedule_round -## -## .aes_schedule_transform -## -## Linear-transform %xmm0 according to tables at (%r11) -## -## Requires that %xmm9 = 0x0F0F... as in preheat -## Output in %xmm0 -## Clobbers %xmm1, %xmm2 -## +// +// .aes_schedule_transform +// +// Linear-transform %xmm0 according to tables at (%r11) +// +// Requires that %xmm9 = 0x0F0F... as in preheat +// Output in %xmm0 +// Clobbers %xmm1, %xmm2 +// .type _vpaes_schedule_transform,%function .align 4 _vpaes_schedule_transform: @@ -954,29 +954,29 @@ _vpaes_schedule_transform: ret .size _vpaes_schedule_transform,.-_vpaes_schedule_transform -## -## .aes_schedule_mangle -## -## Mangle xmm0 from (basis-transformed) standard version -## to our version. -## -## On encrypt, -## xor with 0x63 -## multiply by circulant 0,1,1,1 -## apply shiftrows transform -## -## On decrypt, -## xor with 0x63 -## multiply by "inverse mixcolumns" circulant E,B,D,9 -## deskew -## apply shiftrows transform -## -## -## Writes out to (%rdx), and increments or decrements it -## Keeps track of round number mod 4 in %r8 -## Preserves xmm0 -## Clobbers xmm1-xmm5 -## +// +// .aes_schedule_mangle +// +// Mangle xmm0 from (basis-transformed) standard version +// to our version. 
+// +// On encrypt, +// xor with 0x63 +// multiply by circulant 0,1,1,1 +// apply shiftrows transform +// +// On decrypt, +// xor with 0x63 +// multiply by "inverse mixcolumns" circulant E,B,D,9 +// deskew +// apply shiftrows transform +// +// +// Writes out to (%rdx), and increments or decrements it +// Keeps track of round number mod 4 in %r8 +// Preserves xmm0 +// Clobbers xmm1-xmm5 +// .type _vpaes_schedule_mangle,%function .align 4 _vpaes_schedule_mangle: diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl index c09c783e51..c755555d88 100755 --- a/crypto/bn/asm/armv8-mont.pl +++ b/crypto/bn/asm/armv8-mont.pl @@ -197,7 +197,7 @@ bn_mul_mont: mul $nlo,$nj,$m1 // np[j]*m1 adds $lo1,$lo1,$lo0 umulh $nhi,$nj,$m1 - str $lo1,[$tp,#-16] // tp[j-1] + stur $lo1,[$tp,#-16] // tp[j-1] cbnz $j,.Linner .Linner_skip: @@ -253,13 +253,13 @@ bn_mul_mont: csel $nj,$tj,$aj,lo // did it borrow? ldr $tj,[$tp],#8 ldr $aj,[$rp],#8 - str xzr,[$tp,#-16] // wipe tp - str $nj,[$rp,#-16] + stur xzr,[$tp,#-16] // wipe tp + stur $nj,[$rp,#-16] cbnz $num,.Lcond_copy csel $nj,$tj,$aj,lo - str xzr,[$tp,#-8] // wipe tp - str $nj,[$rp,#-8] + stur xzr,[$tp,#-8] // wipe tp + stur $nj,[$rp,#-8] ldp x19,x20,[x29,#16] mov sp,x29 @@ -596,7 +596,7 @@ __bn_sqr8x_mont: ldp $a4,$a5,[$tp,#8*4] ldp $a6,$a7,[$tp,#8*6] adds $acc0,$acc0,$a0 - ldr $n0,[$rp,#-8*8] + ldur $n0,[$rp,#-8*8] adcs $acc1,$acc1,$a1 ldp $a0,$a1,[$ap,#8*0] adcs $acc2,$acc2,$a2 @@ -794,7 +794,7 @@ $code.=<<___; //adc $carry,xzr,xzr // moved below cbz $cnt,.Lsqr8x8_post_condition - ldr $n0,[$tp,#-8*8] + ldur $n0,[$tp,#-8*8] ldp $a0,$a1,[$np,#8*0] ldp $a2,$a3,[$np,#8*2] ldp $a4,$a5,[$np,#8*4] @@ -852,7 +852,7 @@ $code.=<<___; ldp $a6,$a7,[$tp,#8*6] cbz $cnt,.Lsqr8x_tail_break - ldr $n0,[$rp,#-8*8] + ldur $n0,[$rp,#-8*8] adds $acc0,$acc0,$a0 adcs $acc1,$acc1,$a1 ldp $a0,$a1,[$np,#8*0] diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl index cee787d4d6..56ba1c36ba 100755 --- a/crypto/chacha/asm/chacha-armv8.pl +++ b/crypto/chacha/asm/chacha-armv8.pl @@ -131,12 +131,6 @@ $code.=<<___; .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: .long 1,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by " .globl ChaCha20_ctr32 @@ -144,17 +138,13 @@ $code.=<<___; .align 5 ChaCha20_ctr32: cbz $len,.Labort - adr @x[0],.LOPENSSL_armcap_P cmp $len,#192 b.lo .Lshort -#ifdef __ILP32__ - ldrsw @x[1],[@x[0]] -#else - ldr @x[1],[@x[0]] -#endif - ldr w17,[@x[1],@x[0]] + + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] tst w17,#ARMV7_NEON - b.ne ChaCha20_neon + b.ne .LChaCha20_neon .Lshort: .inst 0xd503233f // paciasp @@ -380,6 +370,7 @@ $code.=<<___; .type ChaCha20_neon,%function .align 5 ChaCha20_neon: +.LChaCha20_neon: .inst 0xd503233f // paciasp stp x29,x30,[sp,#-96]! 
add x29,sp,#0 diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl index d84f523cd1..8914f1a619 100644 --- a/crypto/ec/asm/ecp_nistz256-armv8.pl +++ b/crypto/ec/asm/ecp_nistz256-armv8.pl @@ -1654,7 +1654,7 @@ ecp_nistz256_scatter_w5: ldp x4,x5,[$inp] // X ldp x6,x7,[$inp,#16] - str w4,[$out,#64*0-4] + stur w4,[$out,#64*0-4] lsr x4,x4,#32 str w5,[$out,#64*1-4] lsr x5,x5,#32 @@ -1670,7 +1670,7 @@ ecp_nistz256_scatter_w5: ldp x4,x5,[$inp,#32] // Y ldp x6,x7,[$inp,#48] - str w4,[$out,#64*0-4] + stur w4,[$out,#64*0-4] lsr x4,x4,#32 str w5,[$out,#64*1-4] lsr x5,x5,#32 @@ -1686,7 +1686,7 @@ ecp_nistz256_scatter_w5: ldp x4,x5,[$inp,#64] // Z ldp x6,x7,[$inp,#80] - str w4,[$out,#64*0-4] + stur w4,[$out,#64*0-4] lsr x4,x4,#32 str w5,[$out,#64*1-4] lsr x5,x5,#32 diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl index b953f1fc19..af6f7d3188 100755 --- a/crypto/perlasm/arm-xlate.pl +++ b/crypto/perlasm/arm-xlate.pl @@ -103,6 +103,12 @@ my $asciz = sub { { ""; } }; +my $adrp = sub { + my ($args,$comment) = split(m|\s*//|,shift); + "\tadrp\t$args\@PAGE"; +} if ($flavour =~ /ios64/); + + sub range { my ($r,$sfx,$start,$end) = @_; @@ -132,6 +138,10 @@ sub expand_line { $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + if ($flavour =~ /ios64/) { + $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/; + } + return $line; } diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl index 1aded5a275..b7aa7dc90b 100755 --- a/crypto/poly1305/asm/poly1305-armv8.pl +++ b/crypto/poly1305/asm/poly1305-armv8.pl @@ -71,17 +71,12 @@ poly1305_init: csel x0,xzr,x0,eq b.eq .Lno_key -#ifdef __ILP32__ - ldrsw $t1,.LOPENSSL_armcap_P -#else - ldr $t1,.LOPENSSL_armcap_P -#endif - adr $t0,.LOPENSSL_armcap_P + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] ldp $r0,$r1,[$inp] // load key mov $s1,#0xfffffffc0fffffff movk $s1,#0x0fff,lsl#48 - ldr w17,[$t0,$t1] #ifdef __ARMEB__ rev $r0,$r0 // flip bytes rev $r1,$r1 @@ -93,10 +88,10 @@ poly1305_init: tst w17,#ARMV7_NEON - adr $d0,poly1305_blocks - adr $r0,poly1305_blocks_neon - adr $d1,poly1305_emit - adr $r1,poly1305_emit_neon + adr $d0,.Lpoly1305_blocks + adr $r0,.Lpoly1305_blocks_neon + adr $d1,.Lpoly1305_emit + adr $r1,.Lpoly1305_emit_neon csel $d0,$d0,$r0,eq csel $d1,$d1,$r1,eq @@ -115,6 +110,7 @@ poly1305_init: .type poly1305_blocks,%function .align 5 poly1305_blocks: +.Lpoly1305_blocks: ands $len,$len,#-16 b.eq .Lno_data @@ -179,6 +175,7 @@ poly1305_blocks: .type poly1305_emit,%function .align 5 poly1305_emit: +.Lpoly1305_emit: ldp $h0,$h1,[$ctx] // load hash base 2^64 ldr $h2,[$ctx,#16] ldp $t0,$t1,[$nonce] // load nonce @@ -285,10 +282,11 @@ poly1305_splat: .type poly1305_blocks_neon,%function .align 5 poly1305_blocks_neon: +.Lpoly1305_blocks_neon: ldr $is_base2_26,[$ctx,#24] cmp $len,#128 b.hs .Lblocks_neon - cbz $is_base2_26,poly1305_blocks + cbz $is_base2_26,.Lpoly1305_blocks .Lblocks_neon: .inst 0xd503233f // paciasp @@ -431,7 +429,7 @@ poly1305_blocks_neon: csel $in2,$zeros,$in2,lo mov x4,#1 - str x4,[$ctx,#-24] // set is_base2_26 + stur x4,[$ctx,#-24] // set is_base2_26 sub $ctx,$ctx,#48 // restore original $ctx b .Ldo_neon @@ -868,6 +866,7 @@ poly1305_blocks_neon: .type poly1305_emit_neon,%function .align 5 poly1305_emit_neon: +.Lpoly1305_emit_neon: ldr $is_base2_26,[$ctx,#24] cbz $is_base2_26,poly1305_emit @@ -920,12 +919,6 @@ poly1305_emit_neon: .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. 
-#endif .asciz "Poly1305 for ARMv8, CRYPTOGAMS by " .align 2 ___ diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl index a695b2c474..7a0cbf539b 100644 --- a/crypto/sha/asm/sha1-armv8.pl +++ b/crypto/sha/asm/sha1-armv8.pl @@ -58,10 +58,10 @@ $code.=<<___ if ($i<15 && !($i&1)); lsr @Xx[$i+1],@Xx[$i],#32 ___ $code.=<<___ if ($i<14 && !($i&1)); - ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`] + ldur @Xx[$i+2],[$inp,#`($i+2)*4-64`] ___ $code.=<<___ if ($i<14 && ($i&1)); -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror @Xx[$i+1],@Xx[$i+1],#32 #else rev32 @Xx[$i+1],@Xx[$i+1] @@ -171,23 +171,19 @@ ___ } $code.=<<___; -#include "arm_arch.h" +#ifndef __KERNEL__ +# include "arm_arch.h" +.extern OPENSSL_armcap_P +#endif .text -.extern OPENSSL_armcap_P .globl sha1_block_data_order .type sha1_block_data_order,%function .align 6 sha1_block_data_order: -#ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -#else - ldr x16,.LOPENSSL_armcap_P -#endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA1 b.ne .Lv8_entry @@ -208,7 +204,7 @@ sha1_block_data_order: movz $K,#0x7999 sub $num,$num,#1 movk $K,#0x5a82,lsl#16 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror $Xx[0],@Xx[0],#32 #else rev32 @Xx[0],@Xx[0] @@ -321,15 +317,11 @@ $code.=<<___; .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by " .align 2 +#if !defined(__KERNELL__) && !defined(_WIN64) .comm OPENSSL_armcap_P,4,4 +#endif ___ }}} diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl index 6540a9f3a6..f7c67219ed 100644 --- a/crypto/sha/asm/sha512-armv8.pl +++ b/crypto/sha/asm/sha512-armv8.pl @@ -188,24 +188,18 @@ ___ $code.=<<___; #ifndef __KERNEL__ # include "arm_arch.h" +.extern OPENSSL_armcap_P #endif .text -.extern OPENSSL_armcap_P .globl $func .type $func,%function .align 6 $func: #ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] ___ $code.=<<___ if ($SZ==4); tst w16,#ARMV8_SHA256 @@ -353,15 +347,6 @@ $code.=<<___ if ($SZ==4); ___ $code.=<<___; .size .LK$BITS,.-.LK$BITS -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ - .long OPENSSL_armcap_P-. -# else - .quad OPENSSL_armcap_P-. -# endif -#endif .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " .align 2 ___ @@ -841,7 +826,7 @@ ___ } $code.=<<___; -#ifndef __KERNEL__ +#if !defined(__KERNEL__) && !defined(_WIN64) .comm OPENSSL_armcap_P,4,4 #endif ___ -- 2.34.1
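The unified PIC-ification above drops the per-file .LOPENSSL_armcap_P offset words (and the adr/ldr/add arithmetic around them) in favour of the adrp/:lo12: idiom, and teaches arm-xlate.pl to rewrite that idiom into the @PAGE/@PAGEOFF syntax the iOS toolchain expects. Below is a minimal standalone Perl sketch of that mapping; the script name and command-line handling are illustrative only and do not mirror arm-xlate.pl's driver, just its two substitutions.

#!/usr/bin/env perl
# adrp-map.pl (hypothetical name): approximate the ios64 handling added to
# arm-xlate.pl for the unified "adrp reg,sym" / "#:lo12:sym" idiom.
# Input is assumed to be already-expanded assembly, one instruction per line.
use strict;
use warnings;

my $flavour = shift || "ios64";

while (my $line = <STDIN>) {
    if ($flavour =~ /ios64/) {
        # adrp x16,OPENSSL_armcap_P  ->  adrp x16,OPENSSL_armcap_P@PAGE
        $line =~ s/^(\s*adrp\s+x\d+\s*,\s*)([\w\.]+)/$1$2\@PAGE/;
        # [x16,#:lo12:OPENSSL_armcap_P]  ->  [x16,OPENSSL_armcap_P@PAGEOFF]
        $line =~ s/#:lo12:([\w\.]+)/$1\@PAGEOFF/;
    }
    print $line;
}

Fed the two instructions used throughout this patch with the ios64 flavour, it prints "adrp x16,OPENSSL_armcap_P@PAGE" and "ldr w16,[x16,OPENSSL_armcap_P@PAGEOFF]", roughly what the new arm-xlate.pl rules emit; other flavours pass the unified syntax through unchanged.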
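The ldr/str to ldur/stur substitutions, for their part, only touch accesses whose immediate cannot be encoded in the scaled, unsigned-offset form (negative offsets such as #-16 or #-8*8): GNU-style assemblers quietly fall back to the unscaled LDUR/STUR encoding there, whereas, as the commit message notes, the Visual Studio assembler does not, so the unscaled mnemonic has to be spelled out. A hypothetical perlasm-style fragment, not taken from any of the files above and using assumed register names, contrasts the two forms:

#!/usr/bin/env perl
# Hypothetical illustration: which store offsets force the explicit
# unscaled stur spelling for armasm64.
use strict;
use warnings;

my $tp = "x3";          # stand-in for a pointer register such as $tp above
my $code = "";

$code .= <<___;
	stur	x4,[$tp,#-16]	// negative offset: only the unscaled encoding fits
	str	x4,[$tp,#16]	// non-negative multiple of 8: scaled str encodes as is
___

print $code;

Offsets that already satisfy the scaled-form constraints, like the #16 above or the #64*1-4 stores left as plain str in ecp_nistz256, are deliberately left untouched by the patch.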