Acceleration of ChaCha20 on aarch64 by SVE
author    Daniel Hu <Daniel.Hu@arm.com>
Mon, 7 Feb 2022 10:17:06 +0000 (10:17 +0000)
committer Pauli <pauli@openssl.org>
Tue, 3 May 2022 04:37:46 +0000 (14:37 +1000)
This patch accelerates ChaCha20 on aarch64 when the Scalable Vector Extension
(SVE) is supported by the CPU. Tested on a modern micro-architecture with
256-bit SVE, it can improve performance by up to 20%.

The solution takes a hybrid approach: SVE handles the multi-block body that
fills the SVE vector length, while the existing Neon/scalar code processes any
tail data, as sketched below.
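
For illustration, the hand-off between the two paths is roughly the following
(a C-level sketch only; the helper names are hypothetical stand-ins for the
assembly entry points ChaCha20_ctr32_sve and ChaCha20_ctr32_dflt added below):

    #include <stddef.h>
    #include <string.h>

    /* hypothetical prototypes standing in for the assembly routines */
    size_t chacha20_sve_blocks(unsigned char *out, const unsigned char *in,
                               size_t len, const unsigned int key[8],
                               unsigned int ctr[4]);
    void chacha20_neon_or_scalar(unsigned char *out, const unsigned char *in,
                                 size_t len, const unsigned int key[8],
                                 unsigned int ctr[4]);

    void chacha20_hybrid(unsigned char *out, const unsigned char *in,
                         size_t len, const unsigned int key[8],
                         const unsigned int counter[4])
    {
        unsigned int ctr[4];
        size_t done;

        memcpy(ctr, counter, sizeof(ctr)); /* caller's counter is read-only */

        /* SVE consumes whole groups of vector-length blocks and bumps ctr */
        done = chacha20_sve_blocks(out, in, len, key, ctr);

        if (done < len)                    /* tail goes to the Neon/scalar path */
            chacha20_neon_or_scalar(out + done, in + done, len - done,
                                    key, ctr);
    }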

Test results:
With SVE
type            1024 bytes   8192 bytes  16384 bytes
ChaCha20        1596208.13k  1650010.79k  1653151.06k

Without SVE (by Neon/Scalar)
type            1024 bytes   8192 bytes  16384 bytes
chacha20        1355487.91k  1372678.83k  1372662.44k

The assembly code has been reviewed internally by
ARM engineer Fangming.Fang@arm.com

Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17916)

crypto/arm64cpuid.pl
crypto/arm_arch.h
crypto/armcap.c
crypto/chacha/asm/chacha-armv8-sve.pl [new file with mode: 0755]
crypto/chacha/asm/chacha-armv8.pl
crypto/chacha/build.info

index ad48c53402a9a467943756cf11146d0e1eb1577f..0a6cd01e30a735eb9b981dca7bdb2211cdeac681 100755 (executable)
@@ -104,6 +104,22 @@ _armv8_eor3_probe:
        ret
 .size  _armv8_eor3_probe,.-_armv8_eor3_probe
 
+.globl _armv8_sve_probe
+.type  _armv8_sve_probe,%function
+_armv8_sve_probe:
+       AARCH64_VALID_CALL_TARGET
+       .inst   0x04a03000      // eor z0.d,z0.d,z0.d
+       ret
+.size  _armv8_sve_probe,.-_armv8_sve_probe
+
+.globl _armv8_sve2_probe
+.type  _armv8_sve2_probe,%function
+_armv8_sve2_probe:
+       AARCH64_VALID_CALL_TARGET
+       .inst   0x04e03400      // xar z0.d,z0.d,z0.d
+       ret
+.size  _armv8_sve2_probe,.-_armv8_sve2_probe
+
 .globl _armv8_cpuid_probe
 .type  _armv8_cpuid_probe,%function
 _armv8_cpuid_probe:
index 33acbd99c0b34558b9ff596a86dcced6ec7f126f..5fc0905885a7448cc62d4eb538436cd4a885b9ab 100644 (file)
@@ -83,6 +83,8 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
 # define ARMV8_SM4       (1<<10)
 # define ARMV8_SHA3      (1<<11)
 # define ARMV8_UNROLL8_EOR3      (1<<12)
+# define ARMV8_SVE       (1<<13)
+# define ARMV8_SVE2      (1<<14)
 
 /*
  * MIDR_EL1 system register
index c50322f504bdff13c778f129cbe4fbe3f79b5916..91ba45c950333f4af227762c6ecb787767ebb957 100644 (file)
@@ -57,6 +57,8 @@ void _armv8_sm3_probe(void);
 void _armv8_sm4_probe(void);
 void _armv8_sha512_probe(void);
 unsigned int _armv8_cpuid_probe(void);
+void _armv8_sve_probe(void);
+void _armv8_sve2_probe(void);
 void _armv8_rng_probe(void);
 
 size_t OPENSSL_rndr_asm(unsigned char *buf, size_t len);
@@ -175,8 +177,10 @@ static unsigned long getauxval(unsigned long key)
 #  define HWCAP_CE_SM3           (1 << 18)
 #  define HWCAP_CE_SM4           (1 << 19)
 #  define HWCAP_CE_SHA512        (1 << 21)
+#  define HWCAP_SVE              (1 << 22)
                                   /* AT_HWCAP2 */
 #  define HWCAP2                 26
+#  define HWCAP2_SVE2            (1 << 1)
 #  define HWCAP2_RNG             (1 << 16)
 # endif
 
@@ -270,6 +274,12 @@ void OPENSSL_cpuid_setup(void)
 #  endif
     }
 #  ifdef __aarch64__
+        if (getauxval(HWCAP) & HWCAP_SVE)
+            OPENSSL_armcap_P |= ARMV8_SVE;
+
+        if (getauxval(HWCAP2) & HWCAP2_SVE2)
+            OPENSSL_armcap_P |= ARMV8_SVE2;
+
         if (getauxval(HWCAP2) & HWCAP2_RNG)
             OPENSSL_armcap_P |= ARMV8_RNG;
 #  endif
@@ -330,6 +340,16 @@ void OPENSSL_cpuid_setup(void)
 #  endif
     }
 #  ifdef __aarch64__
+    if (sigsetjmp(ill_jmp, 1) == 0) {
+        _armv8_sve_probe();
+        OPENSSL_armcap_P |= ARMV8_SVE;
+    }
+
+    if (sigsetjmp(ill_jmp, 1) == 0) {
+        _armv8_sve2_probe();
+        OPENSSL_armcap_P |= ARMV8_SVE2;
+    }
+
     if (sigsetjmp(ill_jmp, 1) == 0) {
         _armv8_rng_probe();
         OPENSSL_armcap_P |= ARMV8_RNG;
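
For context, the sigsetjmp-based probing above follows OpenSSL's usual SIGILL
pattern: execute one instruction from the extension and treat a trap as "not
supported". A minimal self-contained sketch of that technique (the handler
plumbing here is illustrative, not the actual armcap.c internals):

    #include <setjmp.h>
    #include <signal.h>
    #include <string.h>

    static sigjmp_buf probe_jmp;

    static void ill_handler(int sig)
    {
        siglongjmp(probe_jmp, sig);       /* unwind out of the faulting probe */
    }

    extern void _armv8_sve_probe(void);   /* executes ".inst 0x04a03000" */

    static int have_sve(void)
    {
        int ok = 0;
        struct sigaction sa, old;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = ill_handler;
        sigaction(SIGILL, &sa, &old);
        if (sigsetjmp(probe_jmp, 1) == 0) {
            _armv8_sve_probe();           /* raises SIGILL without SVE */
            ok = 1;
        }
        sigaction(SIGILL, &old, NULL);
        return ok;
    }
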
diff --git a/crypto/chacha/asm/chacha-armv8-sve.pl b/crypto/chacha/asm/chacha-armv8-sve.pl
new file mode 100755 (executable)
index 0000000..6080414
--- /dev/null
@@ -0,0 +1,843 @@
+#! /usr/bin/env perl
+# Copyright 2022  The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+#
+# ChaCha20 for ARMv8 via SVE
+#
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+sub AUTOLOAD()         # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
+my ($state) = ("x5");
+my ($veclen_w,$veclen,$blocks) = ("w6","x6","x7");
+my ($saved_outp) = ("x8");
+my ($wctr, $xctr) = ("w9", "x9");
+my @mx=map("z$_",(0..7,16..23));
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
+my @xt=map("z$_",(24..31,8..11));
+my ($rot8) = ("z12");
+my ($zctr) = ("z13");
+my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7,$xt8,$xt9,$xt10,$xt11)=@xt;
+my $debug_encoder=0;
+
+sub SVE_ADD() {
+       my $x = shift;
+       my $y = shift;
+
+$code.=<<___;
+       add     @mx[$x].s,@mx[$x].s,@mx[$y].s
+___
+       if (@_) {
+               &SVE_ADD(@_);
+       }
+}
+
+sub SVE_EOR() {
+       my $x = shift;
+       my $y = shift;
+
+$code.=<<___;
+       eor     @mx[$x].d,@mx[$x].d,@mx[$y].d
+___
+       if (@_) {
+               &SVE_EOR(@_);
+       }
+}
+
+sub SVE_LSL() {
+       my $bits = shift;
+       my $x = shift;
+       my $y = shift;
+       my $next = $x + 1;
+
+$code.=<<___;
+       lsl     @xt[$x].s,@mx[$y].s,$bits
+___
+       if (@_) {
+               &SVE_LSL($bits,$next,@_);
+       }
+}
+
+sub SVE_LSR() {
+       my $bits = shift;
+       my $x = shift;
+
+$code.=<<___;
+       lsr     @mx[$x].s,@mx[$x].s,$bits
+___
+       if (@_) {
+               &SVE_LSR($bits,@_);
+       }
+}
+
+sub SVE_ORR() {
+       my $x = shift;
+       my $y = shift;
+       my $next = $x + 1;
+
+$code.=<<___;
+       orr     @mx[$y].d,@mx[$y].d,@xt[$x].d
+___
+       if (@_) {
+               &SVE_ORR($next,@_);
+       }
+}
+
+sub SVE_REV16() {
+       my $x = shift;
+
+$code.=<<___;
+       revh    @mx[$x].s,p0/m,@mx[$x].s
+___
+       if (@_) {
+               &SVE_REV16(@_);
+       }
+}
+
+sub SVE_ROT8() {
+       my $x = shift;
+
+$code.=<<___;
+       tbl     @mx[$x].b,{@mx[$x].b},$rot8.b
+___
+       if (@_) {
+               &SVE_ROT8(@_);
+       }
+}
+
+sub SVE2_XAR() {
+       my $bits = shift;
+       my $x = shift;
+       my $y = shift;
+       my $rbits = 32-$bits;
+
+$code.=<<___;
+       xar     @mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
+___
+       if (@_) {
+               &SVE2_XAR($bits,@_);
+       }
+}
+
+sub SVE_QR_GROUP() {
+       my $have_sve2 = shift;
+       my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;
+
+       &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
+       &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+       &SVE_REV16($d0,$d1,$d2,$d3);
+
+       &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
+       if ($have_sve2 == 0) {
+               &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
+               &SVE_LSL(12,0,$b0,$b1,$b2,$b3);
+               &SVE_LSR(20,$b0,$b1,$b2,$b3);
+               &SVE_ORR(0,$b0,$b1,$b2,$b3,);
+       } else {
+               &SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
+       }
+
+       &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
+       &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+       &SVE_ROT8($d0,$d1,$d2,$d3);
+
+       &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
+       if ($have_sve2 == 0) {
+               &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
+               &SVE_LSL(7,0,$b0,$b1,$b2,$b3);
+               &SVE_LSR(25,$b0,$b1,$b2,$b3);
+               &SVE_ORR(0,$b0,$b1,$b2,$b3);
+       } else {
+               &SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
+       }
+}
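
For reference, each call to SVE_QR_GROUP interleaves four standard ChaCha20
quarter-rounds across the z registers. The scalar form of one quarter-round
(RFC 8439, shown here for orientation, not taken from this patch) is:

    #include <stdint.h>

    /* Reference scalar quarter-round.  SVE_QR_GROUP performs four of these,
     * one per column (or diagonal), with every 32-bit state word replicated
     * across an SVE vector so several blocks advance in lock-step; revh,
     * tbl and xar in the generated assembly implement the rotations. */
    static inline uint32_t rotl32(uint32_t v, int n)
    {
        return (v << n) | (v >> (32 - n));
    }

    static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        *a += *b; *d ^= *a; *d = rotl32(*d, 16);
        *c += *d; *b ^= *c; *b = rotl32(*b, 12);
        *a += *b; *d ^= *a; *d = rotl32(*d, 8);
        *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }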
+
+sub SVE_INNER_BLOCK() {
+$code.=<<___;
+       //cbnz $sve2flag, 10f
+___
+       &SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
+       &SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+$code.=<<___;
+       // SVE 2 not enabled until hardware available
+#if 0
+       b 11f
+10:
+___
+#      &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
+#      &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+$code.=<<___;
+11:
+#endif
+___
+}
+
+{{{
+my ($dlen,$rsize,$tmp) = ("x10","x11","x12");
+
+sub load() {
+       my $x0 = shift;
+       my $x1 = shift;
+       my $x2 = shift;
+       my $x3 = shift;
+       my $x4 = shift;
+       my $x5 = shift;
+       my $x6 = shift;
+       my $x7 = shift;
+
+$code.=<<___;
+       ld1w    {$x0.s},p0/z,[$inp]
+       ld1w    {$x1.s},p0/z,[$inp, #1, MUL VL]
+       ld1w    {$x2.s},p0/z,[$inp, #2, MUL VL]
+       ld1w    {$x3.s},p0/z,[$inp, #3, MUL VL]
+       ld1w    {$x4.s},p0/z,[$inp, #4, MUL VL]
+       ld1w    {$x5.s},p0/z,[$inp, #5, MUL VL]
+       ld1w    {$x6.s},p0/z,[$inp, #6, MUL VL]
+       ld1w    {$x7.s},p0/z,[$inp, #7, MUL VL]
+       addvl   $inp,$inp,#8
+___
+}
+
+sub store() {
+       my $x0 = shift;
+       my $x1 = shift;
+       my $x2 = shift;
+       my $x3 = shift;
+       my $x4 = shift;
+       my $x5 = shift;
+       my $x6 = shift;
+       my $x7 = shift;
+
+$code.=<<___;
+       st1w    {$x0.s},p0,[$outp]
+       st1w    {$x1.s},p0,[$outp, #1, MUL VL]
+       st1w    {$x2.s},p0,[$outp, #2, MUL VL]
+       st1w    {$x3.s},p0,[$outp, #3, MUL VL]
+       st1w    {$x4.s},p0,[$outp, #4, MUL VL]
+       st1w    {$x5.s},p0,[$outp, #5, MUL VL]
+       st1w    {$x6.s},p0,[$outp, #6, MUL VL]
+       st1w    {$x7.s},p0,[$outp, #7, MUL VL]
+       addvl   $outp,$outp,#8
+___
+}
+
+sub transpose() {
+       my $xa = shift;
+       my $xb = shift;
+       my $xc = shift;
+       my $xd = shift;
+
+$code.=<<___;
+       zip1    $xt8.s,$xa.s,$xb.s
+       zip2    $xt9.s,$xa.s,$xb.s
+       zip1    $xt10.s,$xc.s,$xd.s
+       zip2    $xt11.s,$xc.s,$xd.s
+       zip1    $xa.d,$xt8.d,$xt10.d
+       zip2    $xb.d,$xt8.d,$xt10.d
+       zip1    $xc.d,$xt9.d,$xt11.d
+       zip2    $xd.d,$xt9.d,$xt11.d
+___
+}
+
+sub add_states() {
+       my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
+
+$code.=<<___;
+       ldp     $tmpw0,$tmpw1,[$state]
+       ldp     $tmpw2,$tmpw3,[$state,#8]
+       dup     $xt0.s,$tmpw0
+       dup     $xt1.s,$tmpw1
+       dup     $xt2.s,$tmpw2
+       dup     $xt3.s,$tmpw3
+       ldp     $tmpw0,$tmpw1,[$state,#16]
+       ldp     $tmpw2,$tmpw3,[$state,#24]
+       add     @mx[0].s,@mx[0].s,$xt0.s
+       add     @mx[1].s,@mx[1].s,$xt1.s
+       add     @mx[2].s,@mx[2].s,$xt2.s
+       add     @mx[3].s,@mx[3].s,$xt3.s
+       dup     $xt4.s,$tmpw0
+       dup     $xt5.s,$tmpw1
+       dup     $xt6.s,$tmpw2
+       dup     $xt7.s,$tmpw3
+       ldp     $tmpw0,$tmpw1,[$state,#32]
+       ldp     $tmpw2,$tmpw3,[$state,#40]
+       add     @mx[4].s,@mx[4].s,$xt4.s
+       add     @mx[5].s,@mx[5].s,$xt5.s
+       add     @mx[6].s,@mx[6].s,$xt6.s
+       add     @mx[7].s,@mx[7].s,$xt7.s
+       dup     $xt0.s,$tmpw0
+       dup     $xt1.s,$tmpw1
+       dup     $xt2.s,$tmpw2
+       dup     $xt3.s,$tmpw3
+       ldp     $tmpw0,$tmpw1,[$state,#48]
+       ldp     $tmpw2,$tmpw3,[$state,#56]
+       add     @mx[8].s,@mx[8].s,$xt0.s
+       add     @mx[9].s,@mx[9].s,$xt1.s
+       add     @mx[10].s,@mx[10].s,$xt2.s
+       add     @mx[11].s,@mx[11].s,$xt3.s
+       dup     $xt5.s,$tmpw1
+       dup     $xt6.s,$tmpw2
+       dup     $xt7.s,$tmpw3
+       add     @mx[12].s,@mx[12].s,$zctr.s
+       add     @mx[13].s,@mx[13].s,$xt5.s
+       add     @mx[14].s,@mx[14].s,$xt6.s
+       add     @mx[15].s,@mx[15].s,$xt7.s
+___
+}
+
+sub SVE_TRANSFORMS() {
+       &add_states();
+       &transpose($xa0,$xb0,$xc0,$xd0);
+       &transpose($xa1,$xb1,$xc1,$xd1);
+       &transpose($xa2,$xb2,$xc2,$xd2);
+       &transpose($xa3,$xb3,$xc3,$xd3);
+       &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
+       &transpose($xa0,$xa1,$xa2,$xa3);
+       &transpose($xb0,$xb1,$xb2,$xb3);
+$code.=<<___;
+       eor     $xa0.d,$xa0.d,$xt0.d
+       eor     $xa1.d,$xa1.d,$xt1.d
+       eor     $xa2.d,$xa2.d,$xt2.d
+       eor     $xa3.d,$xa3.d,$xt3.d
+       eor     $xb0.d,$xb0.d,$xt4.d
+       eor     $xb1.d,$xb1.d,$xt5.d
+       eor     $xb2.d,$xb2.d,$xt6.d
+       eor     $xb3.d,$xb3.d,$xt7.d
+___
+       &transpose($xc0,$xc1,$xc2,$xc3);
+       &store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
+       &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
+       &transpose($xd0,$xd1,$xd2,$xd3);
+$code.=<<___;
+       eor     $xc0.d,$xc0.d,$xt0.d
+       eor     $xc1.d,$xc1.d,$xt1.d
+       eor     $xc2.d,$xc2.d,$xt2.d
+       eor     $xc3.d,$xc3.d,$xt3.d
+       eor     $xd0.d,$xd0.d,$xt4.d
+       eor     $xd1.d,$xd1.d,$xt5.d
+       eor     $xd2.d,$xd2.d,$xt6.d
+       eor     $xd3.d,$xd3.d,$xt7.d
+___
+       &store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
+$code.=<<___;
+       incw    $xctr, ALL, MUL #1
+       incw    $zctr.s, ALL, MUL #1
+___
+}
+}}}
+
+sub SVE_LOAD_STATES() {
+       my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
+
+$code.=<<___;
+       // FIXME: the following code is not functionally necessary
+       // but appears to enhance performance
+#if 1
+       ptrues  p2.s,ALL
+       ptrues  p2.s,ALL
+       ptrues  p2.s,ALL
+       ptrues  p2.s,ALL
+       ptrues  p2.s,ALL
+       ptrues  p2.s,ALL
+#endif
+___
+$code.=<<___;
+       ldp     $tmpw0,$tmpw1,[$state]
+       ldp     $tmpw2,$tmpw3,[$state,#8]
+       dup     @mx[0].s,$tmpw0
+       dup     @mx[1].s,$tmpw1
+       dup     @mx[2].s,$tmpw2
+       dup     @mx[3].s,$tmpw3
+       ldp     $tmpw0,$tmpw1,[$state,#16]
+       ldp     $tmpw2,$tmpw3,[$state,#24]
+       dup     @mx[4].s,$tmpw0
+       dup     @mx[5].s,$tmpw1
+       dup     @mx[6].s,$tmpw2
+       dup     @mx[7].s,$tmpw3
+       ldp     $tmpw0,$tmpw1,[$state,#32]
+       ldp     $tmpw2,$tmpw3,[$state,#40]
+       dup     @mx[8].s,$tmpw0
+       dup     @mx[9].s,$tmpw1
+       dup     @mx[10].s,$tmpw2
+       dup     @mx[11].s,$tmpw3
+       ldp     $tmpw0,$tmpw1,[$state, #48]
+       ldp     $tmpw2,$tmpw3,[$state,#56]
+       mov     @mx[12].s,p0/m,$zctr.s
+       dup     @mx[13].s,$tmpw1
+       dup     @mx[14].s,$tmpw2
+       dup     @mx[15].s,$tmpw3
+___
+}
+
+sub sve_handle_blocks() {
+       my ($counter) = ("x10");
+
+       &SVE_LOAD_STATES();
+$code.=<<___;
+       mov     $counter,#10
+.align 5
+1:
+___
+
+       &SVE_INNER_BLOCK();
+$code.=<<___;
+       subs    $counter,$counter,1
+       b.ne    1b
+___
+       &SVE_TRANSFORMS();
+}
+
+sub chacha20_process() {
+       my ($counter) = ("x10");
+       my ($tmpw) = ("w11");
+
+$code.=<<___;
+.align 5
+.Loop:
+       cmp     $blocks,$veclen
+       b.lt    .Lexit
+___
+       &sve_handle_blocks();
+$code.=<<___;
+       subs    $blocks,$blocks,$veclen
+       b.gt    .Loop
+.Lexit:
+___
+}
+
+{{{
+my ($tmp,$tmpw) = ("x10", "w10");
+my ($tmpw0,$tmpw1) = ("w11", "w12");
+my ($ptr) = ("x13");
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.arch   armv8-a
+
+#if 0
+.extern        OPENSSL_armcap_P
+.hidden        OPENSSL_armcap_P
+#endif
+
+.text
+.align 5
+.Lchacha20_consts:
+       .word 0x61707865
+       .word 0x3320646e
+       .word 0x79622d32
+       .word 0x6b206574
+.Lrot8:
+       .word 0x02010003,0x04040404,0x02010003,0x04040404
+.globl ChaCha20_ctr32_sve
+.type  ChaCha20_ctr32_sve,%function
+.align 5
+ChaCha20_ctr32_sve:
+       AARCH64_VALID_CALL_TARGET
+       mov     $tmp, #64
+       whilelo p0.s,xzr,$tmp
+       cntp    $veclen,p0,p0.s
+       // run Neon if we only have 128-bit SVE
+       // in the future, we need to check SVE2
+       cmp     $veclen,4
+       b.le    .Lreturn
+       lsr     $blocks,$len,#6
+       cmp     $blocks,$veclen
+       b.lt    .Lreturn
+       stp     d8,d9,[sp,-48]!
+       stp     d10,d11,[sp,16]
+       stp     d12,d13,[sp,32]
+       sub     sp,sp,#64
+       adr     $tmp,.Lchacha20_consts
+       ld1     {v0.4s},[$tmp]
+       adr     $tmp,.Lrot8
+       ldp     $tmpw0,$tmpw1,[$tmp]
+       ld1     {v1.4s,v2.4s},[$key]
+       ld1     {v3.4s},[$ctr]
+       ldr     $wctr,[$ctr]
+       index   $zctr.s,$wctr,1
+       index   $rot8.s,$tmpw0,$tmpw1
+       st1     {v0.4s,v1.4s,v2.4s,v3.4s},[sp]
+       mov     $state,sp
+#if 0
+       // SVE2 code not enabled until we have hardware
+       // for verification
+       mov     $sve2flag,0
+       adrp    $tmp,OPENSSL_armcap_P
+       ldr     $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
+       tst     $tmpw,#ARMV8_SVE2
+       b.eq    1f
+       mov     $sve2flag,1
+1:
+#endif
+___
+       &chacha20_process();
+$code.=<<___;
+       add     sp,sp,#64
+       ldp     d10,d11,[sp,16]
+       ldp     d12,d13,[sp,32]
+       ldp     d8,d9,[sp],48
+       str     $wctr,[$ctr]
+       and     $len,$len,#63
+       add     $len,$len,$blocks,lsl #6
+.Lreturn:
+       ret
+.size  ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
+___
+
+}}}
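
The epilogue above hands the unprocessed length back to the caller: the loop
exits with fewer than a vector's worth of whole 64-byte blocks left in
$blocks, and the sub-block tail (len & 63) was never touched by this routine.
A small sketch of that arithmetic (assumed semantics, illustrative only):

    #include <stddef.h>

    /* Bytes handed back for the Neon/scalar path after the SVE loop:
     * leftover whole blocks (fewer than the vector length) plus the
     * sub-block tail.  Mirrors "and len,len,#63 ; add len,len,blocks,lsl #6". */
    static size_t sve_tail_len(size_t len, size_t blocks_left)
    {
        return (len & 63) + (blocks_left << 6);
    }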
+
+########################################
+{
+my  %opcode_unpred = (
+       "eor"          => 0x04a03000,
+       "add"          => 0x04200000,
+       "orr"          => 0x04603000,
+       "lsl"          => 0x04209C00,
+       "lsr"          => 0x04209400,
+       "incw"         => 0x04B0C000,
+       "xar"          => 0x04203400,
+       "zip1"         => 0x05206000,
+       "zip2"         => 0x05206400,
+       "uzp1"         => 0x05206800,
+       "uzp2"         => 0x05206C00,
+       "index"        => 0x04204C00,
+       "mov"          => 0x05203800,
+       "dup"          => 0x05203800,
+       "tbl"          => 0x05203000);
+
+my  %opcode_imm_unpred = (
+       "dup"          => 0x2538C000,
+       "index"        => 0x04204400);
+
+my %opcode_scalar_pred = (
+       "mov"          => 0x0528A000,
+       "cpy"          => 0x0528A000,
+       "st4w"         => 0xE5606000,
+       "st1w"         => 0xE5004000,
+       "ld1w"         => 0xA5404000);
+
+my %opcode_gather_pred = (
+       "ld1w"         => 0x85204000);
+
+my  %opcode_pred = (
+       "eor"          => 0x04190000,
+       "add"          => 0x04000000,
+       "orr"          => 0x04180000,
+       "whilelo"      => 0x25200C00,
+       "whilelt"      => 0x25200400,
+       "cntp"         => 0x25208000,
+       "addvl"        => 0x04205000,
+       "lsl"          => 0x04038000,
+       "lsr"          => 0x04018000,
+       "sel"          => 0x0520C000,
+       "mov"          => 0x0520C000,
+       "ptrue"        => 0x2518E000,
+       "pfalse"       => 0x2518E400,
+       "ptrues"       => 0x2519E000,
+       "pnext"        => 0x2519C400,
+       "ld4w"         => 0xA560E000,
+       "st4w"         => 0xE570E000,
+       "st1w"         => 0xE500E000,
+       "ld1w"         => 0xA540A000,
+       "revh"         => 0x05258000);
+
+my  %tsize = (
+       'b'          => 0,
+       'h'          => 1,
+       's'          => 2,
+       'd'          => 3);
+
+my %sf = (
+       "w"          => 0,
+       "x"          => 1);
+
+my %pattern = (
+       "POW2"       => 0,
+       "VL1"        => 1,
+       "VL2"        => 2,
+       "VL3"        => 3,
+       "VL4"        => 4,
+       "VL5"        => 5,
+       "VL6"        => 6,
+       "VL7"        => 7,
+       "VL8"        => 8,
+       "VL16"       => 9,
+       "VL32"       => 10,
+       "VL64"       => 11,
+       "VL128"      => 12,
+       "VL256"      => 13,
+       "MUL4"       => 29,
+       "MUL3"       => 30,
+       "ALL"        => 31);
+
+sub create_verifier {
+       my $filename="./compile_sve.sh";
+
+$scripts = <<___;
+#! /bin/bash
+set -e
+CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}
+
+[ -z "\$1" ] && exit 1
+ARCH=`uname -p | xargs echo -n`
+
+# need gcc-10 and above to compile SVE code
+# change this according to your system during debugging
+if [ \$ARCH == 'aarch64' ]; then
+       CC=gcc-11
+       OBJDUMP=objdump
+else
+       CC=\${CROSS_COMPILE}gcc
+       OBJDUMP=\${CROSS_COMPILE}objdump
+fi
+TMPFILE=/tmp/\$\$
+cat > \$TMPFILE.c << EOF
+extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
+{
+       asm("\$@\\t\\n");
+}
+int main(int argc, char *argv[])
+{
+}
+EOF
+\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
+\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
+rm \$TMPFILE.c \$TMPFILE.out
+___
+       open(FH, '>', $filename) or die $!;
+       print FH $scripts;
+       close(FH);
+       system("chmod a+x ./compile_sve.sh");
+}
+
+sub compile_sve {
+       return `./compile_sve.sh '@_'`
+}
+
+sub verify_inst {
+       my ($code,$inst)=@_;
+       my $hexcode = (sprintf "%08x", $code);
+
+       if ($debug_encoder == 1) {
+               my $expect=&compile_sve($inst);
+               if ($expect ne $hexcode) {
+                       return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
+               }
+       }
+       return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
+}
+
+sub reg_code {
+       my $code = shift;
+
+       if ($code == "zr") {
+               return "31";
+       }
+       return $code;
+}
+
+sub encode_size_imm() {
+       my ($mnemonic, $isize, $const)=@_;
+       my $esize = (8<<$tsize{$isize});
+       my $tsize_imm = $esize + $const;
+
+       if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
+               $tsize_imm = 2*$esize - $const;
+       }
+       return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
+}
+
+sub encode_shift_pred() {
+       my ($mnemonic, $isize, $const)=@_;
+       my $esize = (8<<$tsize{$isize});
+       my $tsize_imm = $esize + $const;
+
+       if ($mnemonic eq "lsr") {
+               $tsize_imm = 2*$esize - $const;
+       }
+       return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
+}
+
+sub sve_unpred {
+       my ($mnemonic,$arg)=@_;
+       my $inst = (sprintf "%s %s", $mnemonic,$arg);
+
+       if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
+               return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
+                                       $inst)
+       } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
+                       my $regd = $1;
+               my $isize = $2;
+               my $regs=$3;
+
+               if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
+                       if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
+                               && ((8<<$tsize{$isize}) > $2)) {
+                               return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
+                                       $inst);
+                       }
+               } elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
+                       return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
+               } elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
+                       return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
+               } elsif ($regs =~ m/[wx]([0-9]+)/o) {
+                       return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
+               } else {
+                       my $encoded_size = 0;
+                       if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
+                               $encoded_size = ($tsize{$isize}<<22);
+                       }
+                       if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
+                               $1 == $regd) {
+                               return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
+                       } elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
+                               return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
+                       }
+               }
+       } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
+               return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
+                                       $inst)
+       }
+       sprintf "%s // fail to parse", $inst;
+}
+
+sub sve_pred {
+       my ($mnemonic,,$arg)=@_;
+       my $inst = (sprintf "%s %s", $mnemonic,$arg);
+
+       if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9])+(\/z)?,\s*\[(\s*[xs].*)\]/o) {
+               my $zt = $1;
+               my $size = $tsize{$2};
+               my $pg = $3;
+               my $addr = $5;
+               my $xn = 31;
+
+               if ($addr =~ m/x([0-9]+)\s*/o) {
+                       $xn = $1;
+               }
+               if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
+                       return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
+               } elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
+                       my $xs = ($2 eq "SXTW") ? 1 : 0;
+                       return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
+               } elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
+                       return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
+               } else {
+                       return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
+               }
+       } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
+               my $regd = $1;
+               my $isize = $2;
+               my $pg = $3;
+               my $mod = $4;
+               my $regs = $5;
+
+               if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
+                       if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
+                               && $regd == $1
+                               && $mod eq 'm'
+                               && ((8<<$tsize{$isize}) > $2)) {
+                               return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
+                       }
+               } elsif($regs =~ m/[wx]([0-9]+)/o) {
+                       return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
+               } elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
+                       if ($mnemonic eq "sel") {
+                               return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
+                       } elsif ($mnemonic eq "mov") {
+                               return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
+                       } elsif (length $2 > 0) {
+                               return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
+                       } else {
+                               return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
+                       }
+               }
+       } elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
+               my $pg = $1;
+               my $isize = $2;
+               my $regs = $3;
+
+               if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
+                       return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
+               } elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
+                       return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
+               } else {
+                       return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
+               }
+       } elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
+               return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
+       }
+
+       sprintf "%s // fail to parse", $inst;
+}
+
+sub sve_other {
+       my ($mnemonic,$arg)=@_;
+       my $inst = (sprintf "%s %s", $mnemonic,$arg);
+
+       if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
+               return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
+       } elsif ($mnemonic =~ /inc[bhdw]/) {
+               if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
+                       return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16), $inst);
+               } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
+                       return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
+               } elsif ($arg =~ m/x([0-9]+)/o) {
+                       return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
+               }
+       } elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
+               return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
+       }
+       sprintf "%s // fail to parse", $inst;
+}
+}
+
+open SELF,$0;
+while(<SELF>) {
+       next if (/^#!/);
+       last if (!s/^#/\/\// and !/^$/);
+       print;
+}
+close SELF;
+
+if ($debug_encoder == 1) {
+       &create_verifier();
+}
+
+foreach(split("\n",$code)) {
+       s/\`([^\`]*)\`/eval($1)/ge;
+       s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
+       s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
+       s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
+       s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
+       s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
+       s/\b(cntp|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
+       print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
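
The opcode tables and regex-driven subs above act as a tiny SVE assembler:
each mnemonic is rewritten into a raw .inst word so the output assembles even
with toolchains that predate SVE support. As an assumed illustration of the
field layout (derived from the "eor" => 0x04a03000 entry, not code from this
patch), the unpredicated EOR used by the capability probe encodes as:

    #include <stdint.h>

    /* Assumed bit layout for unpredicated SVE EOR (vectors):
     * Zd in bits [4:0], Zn in bits [9:5], Zm in bits [20:16]. */
    static uint32_t sve_eor_zzz(unsigned zd, unsigned zn, unsigned zm)
    {
        return 0x04a03000u | (zm << 16) | (zn << 5) | zd;
    }

    /* sve_eor_zzz(0, 0, 0) == 0x04a03000 -- the word emitted by
     * _armv8_sve_probe as ".inst 0x04a03000  // eor z0.d,z0.d,z0.d". */
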
index e1a8b8159421ae32b2b977b93f5efbfa4eb1e0d7..f6e0e2ef5423dc41e5384da0aa3fb4c92d29d438 100755 (executable)
@@ -136,6 +136,8 @@ $code.=<<___;
 #ifndef        __KERNEL__
 .extern        OPENSSL_armcap_P
 .hidden        OPENSSL_armcap_P
+
+.extern ChaCha20_ctr32_sve
 #endif
 
 .text
@@ -149,18 +151,17 @@ $code.=<<___;
 .long  0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
 .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm"
 
-.globl ChaCha20_ctr32
-.type  ChaCha20_ctr32,%function
+.globl ChaCha20_ctr32_dflt
+.type  ChaCha20_ctr32_dflt,%function
 .align 5
-ChaCha20_ctr32:
+ChaCha20_ctr32_dflt:
        AARCH64_SIGN_LINK_REGISTER
-       cbz     $len,.Labort
        cmp     $len,#192
        b.lo    .Lshort
-
 #ifndef        __KERNEL__
        adrp    x17,OPENSSL_armcap_P
        ldr     w17,[x17,#:lo12:OPENSSL_armcap_P]
+.Lcheck_neon:
        tst     w17,#ARMV7_NEON
        b.ne    .LChaCha20_neon
 #endif
@@ -344,6 +345,41 @@ $code.=<<___;
        ldp     x29,x30,[sp],#96
        AARCH64_VALIDATE_LINK_REGISTER
        ret
+.size  ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt
+
+.globl ChaCha20_ctr32
+.type  ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+       AARCH64_SIGN_LINK_REGISTER
+       cbz     $len,.Labort
+       cmp     $len,#192
+       b.lo    .Lshort
+#ifndef        __KERNEL__
+       adrp    x17,OPENSSL_armcap_P
+       ldr     w17,[x17,#:lo12:OPENSSL_armcap_P]
+       tst     w17,#ARMV8_SVE
+       b.eq    .Lcheck_neon
+       stp     x29,x30,[sp,#-16]!
+       sub     sp,sp,#16
+       // SVE handling will inevitably increment the counter.
+       // The Neon/scalar code that follows to process the tail data
+       // needs the updated counter; unfortunately the input counter
+       // buffer pointed to by ctr is meant to be read-only per the API
+       // contract, so we copy it to the stack where SVE can update it.
+       ldp     x5,x6,[$ctr]
+       stp     x5,x6,[sp]
+       mov     $ctr,sp
+       bl      ChaCha20_ctr32_sve
+       cbz     $len,1f
+       bl      ChaCha20_ctr32_dflt
+1:
+       add     sp,sp,#16
+       ldp     x29,x30,[sp],#16
+       AARCH64_VALIDATE_LINK_REGISTER
+       ret
+#endif
+       b       .Lshort
 .size  ChaCha20_ctr32,.-ChaCha20_ctr32
 ___
 
index 5fe74778733e4f38e8f2fb86e3ebf3a2d73f5d85..d02c571d02ddec8855c384b12c1e5dffefd73808 100644 (file)
@@ -10,7 +10,7 @@ IF[{- !$disabled{asm} -}]
   $CHACHAASM_s390x=chacha-s390x.S
 
   $CHACHAASM_armv4=chacha-armv4.S
-  $CHACHAASM_aarch64=chacha-armv8.S
+  $CHACHAASM_aarch64=chacha-armv8.S chacha-armv8-sve.S
 
   $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s chachap10-ppc.s
   $CHACHAASM_ppc64=$CHACHAASM_ppc32
@@ -33,7 +33,9 @@ GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl
 GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl
 INCLUDE[chacha-armv4.o]=..
 GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl
+GENERATE[chacha-armv8-sve.S]=asm/chacha-armv8-sve.pl
 INCLUDE[chacha-armv8.o]=..
+INCLUDE[chacha-armv8-sve.o]=..
 INCLUDE[chacha-s390x.o]=..
 GENERATE[chacha-c64xplus.S]=asm/chacha-c64xplus.pl
 GENERATE[chacha-s390x.S]=asm/chacha-s390x.pl