ARM64 assembly pack: add ThunderX2 results.
[openssl.git] / crypto / chacha / asm / chacha-armv8.pl
index cee787d4d6514e7e56bf2a8ecda72f285556ca4f..dc38cbd42d18c48c35b5b9c23a91c06723c87b14 100755 (executable)
@@ -29,6 +29,7 @@
 # X-Gene               9.50/+46%       8.82            8.89(*)
 # Mongoose             8.00/+44%       3.64            3.25
 # Kryo                 8.17/+50%       4.83            4.65
+# ThunderX2            7.26/+48%       7.91            4.30
 #
 # (*)  it's expected that doubling interleave factor doesn't help
 #      all processors, only those with higher NEON latency and
@@ -131,12 +132,6 @@ $code.=<<___;
 .quad  0x3320646e61707865,0x6b20657479622d32           // endian-neutral
 .Lone:
 .long  1,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long  OPENSSL_armcap_P-.
-#else
-.quad  OPENSSL_armcap_P-.
-#endif
 .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 
 .globl ChaCha20_ctr32
@@ -144,17 +139,13 @@ $code.=<<___;
 .align 5
 ChaCha20_ctr32:
        cbz     $len,.Labort
-       adr     @x[0],.LOPENSSL_armcap_P
        cmp     $len,#192
        b.lo    .Lshort
-#ifdef __ILP32__
-       ldrsw   @x[1],[@x[0]]
-#else
-       ldr     @x[1],[@x[0]]
-#endif
-       ldr     w17,[@x[1],@x[0]]
+
+       adrp    x17,OPENSSL_armcap_P
+       ldr     w17,[x17,#:lo12:OPENSSL_armcap_P]
        tst     w17,#ARMV7_NEON
-       b.ne    ChaCha20_neon
+       b.ne    .LChaCha20_neon
 
 .Lshort:
        .inst   0xd503233f                      // paciasp
@@ -380,6 +371,7 @@ $code.=<<___;
 .type  ChaCha20_neon,%function
 .align 5
 ChaCha20_neon:
+.LChaCha20_neon:
        .inst   0xd503233f                      // paciasp
        stp     x29,x30,[sp,#-96]!
        add     x29,sp,#0