From: Daniel Hu
Date: Mon, 7 Feb 2022 10:17:06 +0000 (+0000)
Subject: Acceleration of chacha20 on aarch64 by SVE
X-Git-Tag: openssl-3.2.0-alpha1~2722
X-Git-Url: https://git.openssl.org/gitweb/?a=commitdiff_plain;h=b1b2146ded9ce5a84c62f30c6c4a922b449f6c90;hp=04904a0fff639c058d38b355d75485ca5dde0a89;p=openssl.git

Acceleration of chacha20 on aarch64 by SVE

This patch accelerates chacha20 on aarch64 when the Scalable Vector
Extension (SVE) is supported by the CPU. Tested on a modern
micro-architecture with 256-bit SVE, it can improve performance by up
to 20%.

The solution takes a hybrid approach: SVE handles the multi-block body
of the data that fits the SVE vector length, while Neon/Scalar code
processes any tail data.

Test result:

With SVE
type             1024 bytes   8192 bytes  16384 bytes
chacha20        1596208.13k  1650010.79k  1653151.06k

Without SVE (by Neon/Scalar)
type             1024 bytes   8192 bytes  16384 bytes
chacha20        1355487.91k  1372678.83k  1372662.44k

The assembly code has been reviewed internally by the Arm engineer
Fangming.Fang@arm.com.

Signed-off-by: Daniel Hu
Reviewed-by: Tomas Mraz
Reviewed-by: Paul Dale
(Merged from https://github.com/openssl/openssl/pull/17916)
---

diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
index ad48c53402..0a6cd01e30 100755
--- a/crypto/arm64cpuid.pl
+++ b/crypto/arm64cpuid.pl
@@ -104,6 +104,22 @@ _armv8_eor3_probe:
     ret
 .size _armv8_eor3_probe,.-_armv8_eor3_probe
 
+.globl _armv8_sve_probe
+.type _armv8_sve_probe,%function
+_armv8_sve_probe:
+    AARCH64_VALID_CALL_TARGET
+    .inst 0x04a03000 // eor z0.d,z0.d,z0.d
+    ret
+.size _armv8_sve_probe,.-_armv8_sve_probe
+
+.globl _armv8_sve2_probe
+.type _armv8_sve2_probe,%function
+_armv8_sve2_probe:
+    AARCH64_VALID_CALL_TARGET
+    .inst 0x04e03400 // xar z0.d,z0.d,z0.d
+    ret
+.size _armv8_sve2_probe,.-_armv8_sve2_probe
+
 .globl _armv8_cpuid_probe
 .type _armv8_cpuid_probe,%function
 _armv8_cpuid_probe:

diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index 33acbd99c0..5fc0905885 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -83,6 +83,8 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
 #  define ARMV8_SM4      (1<<10)
 #  define ARMV8_SHA3     (1<<11)
 #  define ARMV8_UNROLL8_EOR3 (1<<12)
+#  define ARMV8_SVE      (1<<13)
+#  define ARMV8_SVE2     (1<<14)
 
 /*
  * MIDR_EL1 system register

diff --git a/crypto/armcap.c b/crypto/armcap.c
index c50322f504..91ba45c950 100644
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@@ -57,6 +57,8 @@ void _armv8_sm3_probe(void);
 void _armv8_sm4_probe(void);
 void _armv8_sha512_probe(void);
 unsigned int _armv8_cpuid_probe(void);
+void _armv8_sve_probe(void);
+void _armv8_sve2_probe(void);
 void _armv8_rng_probe(void);
 
 size_t OPENSSL_rndr_asm(unsigned char *buf, size_t len);
@@ -175,8 +177,10 @@ static unsigned long getauxval(unsigned long key)
 #  define HWCAP_CE_SM3           (1 << 18)
 #  define HWCAP_CE_SM4           (1 << 19)
 #  define HWCAP_CE_SHA512        (1 << 21)
+#  define HWCAP_SVE              (1 << 22)
                                  /* AT_HWCAP2 */
 #  define HWCAP2                 26
+#  define HWCAP2_SVE2            (1 << 1)
 #  define HWCAP2_RNG             (1 << 16)
 # endif
@@ -270,6 +274,12 @@ void OPENSSL_cpuid_setup(void)
 # endif
     }
 # ifdef __aarch64__
+    if (getauxval(HWCAP) & HWCAP_SVE)
+        OPENSSL_armcap_P |= ARMV8_SVE;
+
+    if (getauxval(HWCAP2) & HWCAP2_SVE2)
+        OPENSSL_armcap_P |= ARMV8_SVE2;
+
     if (getauxval(HWCAP2) & HWCAP2_RNG)
         OPENSSL_armcap_P |= ARMV8_RNG;
 # endif
@@ -330,6 +340,16 @@ void OPENSSL_cpuid_setup(void)
 # endif
     }
 # ifdef __aarch64__
+    if (sigsetjmp(ill_jmp, 1) == 0) {
+        _armv8_sve_probe();
+        OPENSSL_armcap_P |= ARMV8_SVE;
+    }
+
+    if (sigsetjmp(ill_jmp, 1) == 0) {
+        _armv8_sve2_probe();
+        OPENSSL_armcap_P |= ARMV8_SVE2;
+    }
+
     if (sigsetjmp(ill_jmp, 1) == 0) {
         _armv8_rng_probe();
         OPENSSL_armcap_P |= ARMV8_RNG;
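[Editorial note: the HWCAP path above can be exercised from a standalone
program before anything is wired into OPENSSL_armcap_P. This is a minimal
sketch, not part of the patch, assuming a Linux/aarch64 target; the bit
positions mirror the Linux uapi values used in armcap.c.]

    #include <stdio.h>
    #include <sys/auxv.h>

    #ifndef HWCAP_SVE
    # define HWCAP_SVE   (1UL << 22)     /* AT_HWCAP bit 22, per Linux uapi */
    #endif
    #ifndef HWCAP2_SVE2
    # define HWCAP2_SVE2 (1UL << 1)      /* AT_HWCAP2 bit 1, per Linux uapi */
    #endif

    int main(void)
    {
        /* the same getauxval() queries OPENSSL_cpuid_setup() performs */
        unsigned long hw  = getauxval(AT_HWCAP);
        unsigned long hw2 = getauxval(AT_HWCAP2);

        printf("SVE:  %s\n", (hw & HWCAP_SVE) ? "yes" : "no");
        printf("SVE2: %s\n", (hw2 & HWCAP2_SVE2) ? "yes" : "no");
        return 0;
    }

On kernels too old to report these bits, the SIGILL-based probes above
remain the fallback.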
diff --git a/crypto/chacha/asm/chacha-armv8-sve.pl b/crypto/chacha/asm/chacha-armv8-sve.pl
new file mode 100755
index 0000000000..6080414e0d
--- /dev/null
+++ b/crypto/chacha/asm/chacha-armv8-sve.pl
@@ -0,0 +1,843 @@
+#! /usr/bin/env perl
+# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+#
+# ChaCha20 for ARMv8 via SVE
+#
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
+my ($state) = ("x5");
+my ($veclen_w,$veclen,$blocks) = ("w6","x6","x7");
+my ($saved_outp) = ("x8");
+my ($wctr, $xctr) = ("w9", "x9");
+my @mx=map("z$_",(0..7,16..23));
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
+my @xt=map("z$_",(24..31,8..11));
+my ($rot8) = ("z12");
+my ($zctr) = ("z13");
+my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7,$xt8,$xt9,$xt10,$xt11)=@xt;
+my $debug_encoder=0;
+
+sub SVE_ADD() {
+    my $x = shift;
+    my $y = shift;
+
+$code.=<<___;
+    add @mx[$x].s,@mx[$x].s,@mx[$y].s
+___
+    if (@_) {
+        &SVE_ADD(@_);
+    }
+}
+
+sub SVE_EOR() {
+    my $x = shift;
+    my $y = shift;
+
+$code.=<<___;
+    eor @mx[$x].d,@mx[$x].d,@mx[$y].d
+___
+    if (@_) {
+        &SVE_EOR(@_);
+    }
+}
+
+sub SVE_LSL() {
+    my $bits = shift;
+    my $x = shift;
+    my $y = shift;
+    my $next = $x + 1;
+
+$code.=<<___;
+    lsl @xt[$x].s,@mx[$y].s,$bits
+___
+    if (@_) {
+        &SVE_LSL($bits,$next,@_);
+    }
+}
+
+sub SVE_LSR() {
+    my $bits = shift;
+    my $x = shift;
+
+$code.=<<___;
+    lsr @mx[$x].s,@mx[$x].s,$bits
+___
+    if (@_) {
+        &SVE_LSR($bits,@_);
+    }
+}
+
+sub SVE_ORR() {
+    my $x = shift;
+    my $y = shift;
+    my $next = $x + 1;
+
+$code.=<<___;
+    orr @mx[$y].d,@mx[$y].d,@xt[$x].d
+___
+    if (@_) {
+        &SVE_ORR($next,@_);
+    }
+}
+
+sub SVE_REV16() {
+    my $x = shift;
+
+$code.=<<___;
+    revh @mx[$x].s,p0/m,@mx[$x].s
+___
+    if (@_) {
+        &SVE_REV16(@_);
+    }
+}
+
+sub SVE_ROT8() {
+    my $x = shift;
+
+$code.=<<___;
+    tbl @mx[$x].b,{@mx[$x].b},$rot8.b
+___
+    if (@_) {
+        &SVE_ROT8(@_);
+    }
+}
+
+sub SVE2_XAR() {
+    my $bits = shift;
+    my $x = shift;
+    my $y = shift;
+    my $rbits = 32-$bits;
+
+$code.=<<___;
+    xar @mx[$x].s,@mx[$x].s,@mx[$y].s,$rbits
+___
+    if (@_) {
+        &SVE2_XAR($bits,@_);
+    }
+}
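[Editorial note: base SVE has no vector rotate, so the quarter-round rotates
are synthesised from lsl+lsr+orr (and revh/tbl for the 16- and 8-bit cases),
while SVE2's xar fuses the preceding eor with a rotate. A plain-C model of
the per-lane arithmetic, for illustration only.]

    #include <stdint.h>

    /* lsl + lsr + orr: rotate left, as in SVE_LSL/SVE_LSR/SVE_ORR */
    static inline uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* SVE2 xar rotates right, hence $rbits = 32 - $bits in SVE2_XAR:
     * ror(x ^ y, 32 - n) == rotl32(x ^ y, n) */
    static inline uint32_t xar32(uint32_t x, uint32_t y, unsigned n)
    {
        return rotl32(x ^ y, n);
    }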
+sub SVE_QR_GROUP() {
+    my $have_sve2 = shift;
+    my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;
+
+    &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
+    &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+    &SVE_REV16($d0,$d1,$d2,$d3);
+
+    &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
+    if ($have_sve2 == 0) {
+        &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
+        &SVE_LSL(12,0,$b0,$b1,$b2,$b3);
+        &SVE_LSR(20,$b0,$b1,$b2,$b3);
+        &SVE_ORR(0,$b0,$b1,$b2,$b3);
+    } else {
+        &SVE2_XAR(12,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
+    }
+
+    &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
+    &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+    &SVE_ROT8($d0,$d1,$d2,$d3);
+
+    &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
+    if ($have_sve2 == 0) {
+        &SVE_EOR($b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
+        &SVE_LSL(7,0,$b0,$b1,$b2,$b3);
+        &SVE_LSR(25,$b0,$b1,$b2,$b3);
+        &SVE_ORR(0,$b0,$b1,$b2,$b3);
+    } else {
+        &SVE2_XAR(7,$b0,$c0,$b1,$c1,$b2,$c2,$b3,$c3);
+    }
+}
+
+sub SVE_INNER_BLOCK() {
+$code.=<<___;
+    //cbnz $sve2flag, 10f
+___
+    &SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
+    &SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+$code.=<<___;
+    // SVE 2 not enabled until hardware available
+#if 0
+    b 11f
+10:
+___
+#    &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
+#    &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+$code.=<<___;
+11:
+#endif
+___
+}
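[Editorial note: each z register holds one of the sixteen ChaCha state words,
with lane i carrying that word for block i, so one SVE_QR_GROUP call advances
four quarter-rounds of vector-length-many blocks at once. The argument lists
above are exactly the column and diagonal groups of the standard quarter-round
(RFC 8439); a single-block C reference of the round structure, for comparison
only.]

    #include <stdint.h>

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* one ChaCha quarter-round on state words a, b, c, d */
    #define QR(a, b, c, d)                          \
        do {                                        \
            a += b; d ^= a; d = ROTL32(d, 16);      \
            c += d; b ^= c; b = ROTL32(b, 12);      \
            a += b; d ^= a; d = ROTL32(d, 8);       \
            c += d; b ^= c; b = ROTL32(b, 7);       \
        } while (0)

    /* 20 rounds = 10 x (column round + diagonal round), matching the
     * "mov $counter,#10" loop in sve_handle_blocks() further down */
    static void chacha_rounds(uint32_t x[16])
    {
        for (int i = 0; i < 10; i++) {
            QR(x[0], x[4], x[8],  x[12]);   /* columns   */
            QR(x[1], x[5], x[9],  x[13]);
            QR(x[2], x[6], x[10], x[14]);
            QR(x[3], x[7], x[11], x[15]);
            QR(x[0], x[5], x[10], x[15]);   /* diagonals */
            QR(x[1], x[6], x[11], x[12]);
            QR(x[2], x[7], x[8],  x[13]);
            QR(x[3], x[4], x[9],  x[14]);
        }
    }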
+{{{
+my ($dlen,$rsize,$tmp) = ("x10","x11","x12");
+
+sub load() {
+    my $x0 = shift;
+    my $x1 = shift;
+    my $x2 = shift;
+    my $x3 = shift;
+    my $x4 = shift;
+    my $x5 = shift;
+    my $x6 = shift;
+    my $x7 = shift;
+
+$code.=<<___;
+    ld1w {$x0.s},p0/z,[$inp]
+    ld1w {$x1.s},p0/z,[$inp, #1, MUL VL]
+    ld1w {$x2.s},p0/z,[$inp, #2, MUL VL]
+    ld1w {$x3.s},p0/z,[$inp, #3, MUL VL]
+    ld1w {$x4.s},p0/z,[$inp, #4, MUL VL]
+    ld1w {$x5.s},p0/z,[$inp, #5, MUL VL]
+    ld1w {$x6.s},p0/z,[$inp, #6, MUL VL]
+    ld1w {$x7.s},p0/z,[$inp, #7, MUL VL]
+    addvl $inp,$inp,#8
+___
+}
+
+sub store() {
+    my $x0 = shift;
+    my $x1 = shift;
+    my $x2 = shift;
+    my $x3 = shift;
+    my $x4 = shift;
+    my $x5 = shift;
+    my $x6 = shift;
+    my $x7 = shift;
+
+$code.=<<___;
+    st1w {$x0.s},p0,[$outp]
+    st1w {$x1.s},p0,[$outp, #1, MUL VL]
+    st1w {$x2.s},p0,[$outp, #2, MUL VL]
+    st1w {$x3.s},p0,[$outp, #3, MUL VL]
+    st1w {$x4.s},p0,[$outp, #4, MUL VL]
+    st1w {$x5.s},p0,[$outp, #5, MUL VL]
+    st1w {$x6.s},p0,[$outp, #6, MUL VL]
+    st1w {$x7.s},p0,[$outp, #7, MUL VL]
+    addvl $outp,$outp,#8
+___
+}
+
+sub transpose() {
+    my $xa = shift;
+    my $xb = shift;
+    my $xc = shift;
+    my $xd = shift;
+
+$code.=<<___;
+    zip1 $xt8.s,$xa.s,$xb.s
+    zip2 $xt9.s,$xa.s,$xb.s
+    zip1 $xt10.s,$xc.s,$xd.s
+    zip2 $xt11.s,$xc.s,$xd.s
+    zip1 $xa.d,$xt8.d,$xt10.d
+    zip2 $xb.d,$xt8.d,$xt10.d
+    zip1 $xc.d,$xt9.d,$xt11.d
+    zip2 $xd.d,$xt9.d,$xt11.d
+___
+}
+
+sub add_states() {
+    my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
+
+$code.=<<___;
+    ldp $tmpw0,$tmpw1,[$state]
+    ldp $tmpw2,$tmpw3,[$state,#8]
+    dup $xt0.s,$tmpw0
+    dup $xt1.s,$tmpw1
+    dup $xt2.s,$tmpw2
+    dup $xt3.s,$tmpw3
+    ldp $tmpw0,$tmpw1,[$state,#16]
+    ldp $tmpw2,$tmpw3,[$state,#24]
+    add @mx[0].s,@mx[0].s,$xt0.s
+    add @mx[1].s,@mx[1].s,$xt1.s
+    add @mx[2].s,@mx[2].s,$xt2.s
+    add @mx[3].s,@mx[3].s,$xt3.s
+    dup $xt4.s,$tmpw0
+    dup $xt5.s,$tmpw1
+    dup $xt6.s,$tmpw2
+    dup $xt7.s,$tmpw3
+    ldp $tmpw0,$tmpw1,[$state,#32]
+    ldp $tmpw2,$tmpw3,[$state,#40]
+    add @mx[4].s,@mx[4].s,$xt4.s
+    add @mx[5].s,@mx[5].s,$xt5.s
+    add @mx[6].s,@mx[6].s,$xt6.s
+    add @mx[7].s,@mx[7].s,$xt7.s
+    dup $xt0.s,$tmpw0
+    dup $xt1.s,$tmpw1
+    dup $xt2.s,$tmpw2
+    dup $xt3.s,$tmpw3
+    ldp $tmpw0,$tmpw1,[$state,#48]
+    ldp $tmpw2,$tmpw3,[$state,#56]
+    add @mx[8].s,@mx[8].s,$xt0.s
+    add @mx[9].s,@mx[9].s,$xt1.s
+    add @mx[10].s,@mx[10].s,$xt2.s
+    add @mx[11].s,@mx[11].s,$xt3.s
+    dup $xt5.s,$tmpw1
+    dup $xt6.s,$tmpw2
+    dup $xt7.s,$tmpw3
+    add @mx[12].s,@mx[12].s,$zctr.s
+    add @mx[13].s,@mx[13].s,$xt5.s
+    add @mx[14].s,@mx[14].s,$xt6.s
+    add @mx[15].s,@mx[15].s,$xt7.s
+___
+}
+
+sub SVE_TRANSFORMS() {
+    &add_states();
+    &transpose($xa0,$xb0,$xc0,$xd0);
+    &transpose($xa1,$xb1,$xc1,$xd1);
+    &transpose($xa2,$xb2,$xc2,$xd2);
+    &transpose($xa3,$xb3,$xc3,$xd3);
+    &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
+    &transpose($xa0,$xa1,$xa2,$xa3);
+    &transpose($xb0,$xb1,$xb2,$xb3);
+$code.=<<___;
+    eor $xa0.d,$xa0.d,$xt0.d
+    eor $xa1.d,$xa1.d,$xt1.d
+    eor $xa2.d,$xa2.d,$xt2.d
+    eor $xa3.d,$xa3.d,$xt3.d
+    eor $xb0.d,$xb0.d,$xt4.d
+    eor $xb1.d,$xb1.d,$xt5.d
+    eor $xb2.d,$xb2.d,$xt6.d
+    eor $xb3.d,$xb3.d,$xt7.d
+___
+    &transpose($xc0,$xc1,$xc2,$xc3);
+    &store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
+    &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
+    &transpose($xd0,$xd1,$xd2,$xd3);
+$code.=<<___;
+    eor $xc0.d,$xc0.d,$xt0.d
+    eor $xc1.d,$xc1.d,$xt1.d
+    eor $xc2.d,$xc2.d,$xt2.d
+    eor $xc3.d,$xc3.d,$xt3.d
+    eor $xd0.d,$xd0.d,$xt4.d
+    eor $xd1.d,$xd1.d,$xt5.d
+    eor $xd2.d,$xd2.d,$xt6.d
+    eor $xd3.d,$xd3.d,$xt7.d
+___
+    &store($xc0,$xc1,$xc2,$xc3,$xd0,$xd1,$xd2,$xd3);
+$code.=<<___;
+    incw $xctr, ALL, MUL #1
+    incw $zctr.s, ALL, MUL #1
+___
+}
+}}}
+
+sub SVE_LOAD_STATES() {
+    my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
+
+$code.=<<___;
+    // FIXME: the following code is not functionally necessary,
+    // but appears to enhance performance
+#if 1
+    ptrues p2.s,ALL
+    ptrues p2.s,ALL
+    ptrues p2.s,ALL
+    ptrues p2.s,ALL
+    ptrues p2.s,ALL
+    ptrues p2.s,ALL
+#endif
+___
+$code.=<<___;
+    ldp $tmpw0,$tmpw1,[$state]
+    ldp $tmpw2,$tmpw3,[$state,#8]
+    dup @mx[0].s,$tmpw0
+    dup @mx[1].s,$tmpw1
+    dup @mx[2].s,$tmpw2
+    dup @mx[3].s,$tmpw3
+    ldp $tmpw0,$tmpw1,[$state,#16]
+    ldp $tmpw2,$tmpw3,[$state,#24]
+    dup @mx[4].s,$tmpw0
+    dup @mx[5].s,$tmpw1
+    dup @mx[6].s,$tmpw2
+    dup @mx[7].s,$tmpw3
+    ldp $tmpw0,$tmpw1,[$state,#32]
+    ldp $tmpw2,$tmpw3,[$state,#40]
+    dup @mx[8].s,$tmpw0
+    dup @mx[9].s,$tmpw1
+    dup @mx[10].s,$tmpw2
+    dup @mx[11].s,$tmpw3
+    ldp $tmpw0,$tmpw1,[$state, #48]
+    ldp $tmpw2,$tmpw3,[$state,#56]
+    mov @mx[12].s,p0/m,$zctr.s
+    dup @mx[13].s,$tmpw1
+    dup @mx[14].s,$tmpw2
+    dup @mx[15].s,$tmpw3
+___
+}
+
+sub sve_handle_blocks() {
+    my ($counter) = ("x10");
+
+    &SVE_LOAD_STATES();
+$code.=<<___;
+    mov $counter,#10
+.align 5
+1:
+___
+    &SVE_INNER_BLOCK();
+$code.=<<___;
+    subs $counter,$counter,1
+    b.ne 1b
+___
+    &SVE_TRANSFORMS();
+}
+
+sub chacha20_process() {
+    my ($counter) = ("x10");
+    my ($tmpw) = ("w11");
+
+$code.=<<___;
+.align 5
+.Loop:
+    cmp $blocks,$veclen
+    b.lt .Lexit
+___
+    &sve_handle_blocks();
+$code.=<<___;
+    subs $blocks,$blocks,$veclen
+    b.gt .Loop
+.Lexit:
+___
+}
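[Editorial note: the driver loop above (chacha20_process) only ever consumes
whole vector-lengths of 64-byte blocks; whatever is left is reported back so
the Neon/Scalar code can finish. A hypothetical C rendering of that control
flow, with names invented here.]

    #include <stddef.h>

    /* veclen = number of 32-bit lanes, i.e. blocks processed per pass */
    static size_t sve_process(size_t blocks, size_t veclen)
    {
        while (blocks >= veclen) {
            /* sve_handle_blocks(): load state, 10 double-rounds,
             * add-back, transpose, XOR with input, store */
            blocks -= veclen;
        }
        return blocks;      /* tail blocks for the Neon/Scalar path */
    }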
+{{{
+my ($tmp,$tmpw) = ("x10", "w10");
+my ($tmpw0,$tmpw1) = ("w11", "w12");
+my ($ptr) = ("x13");
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.arch armv8-a
+
+#if 0
+.extern OPENSSL_armcap_P
+.hidden OPENSSL_armcap_P
+#endif
+
+.text
+.align 5
+.Lchacha20_consts:
+    .word 0x61707865
+    .word 0x3320646e
+    .word 0x79622d32
+    .word 0x6b206574
+.Lrot8:
+    .word 0x02010003,0x04040404,0x02010003,0x04040404
+.globl ChaCha20_ctr32_sve
+.type ChaCha20_ctr32_sve,%function
+.align 5
+ChaCha20_ctr32_sve:
+    AARCH64_VALID_CALL_TARGET
+    mov $tmp, #64
+    whilelo p0.s,xzr,$tmp
+    cntp $veclen,p0,p0.s
+    // run Neon if we only have 128-bit SVE
+    // in the future, we need to check SVE2
+    cmp $veclen,4
+    b.le .Lreturn
+    lsr $blocks,$len,#6
+    cmp $blocks,$veclen
+    b.lt .Lreturn
+    stp d8,d9,[sp,-48]!
+    stp d10,d11,[sp,16]
+    stp d12,d13,[sp,32]
+    sub sp,sp,#64
+    adr $tmp,.Lchacha20_consts
+    ld1 {v0.4s},[$tmp]
+    adr $tmp,.Lrot8
+    ldp $tmpw0,$tmpw1,[$tmp]
+    ld1 {v1.4s,v2.4s},[$key]
+    ld1 {v3.4s},[$ctr]
+    ldr $wctr,[$ctr]
+    index $zctr.s,$wctr,1
+    index $rot8.s,$tmpw0,$tmpw1
+    st1 {v0.4s,v1.4s,v2.4s,v3.4s},[sp]
+    mov $state,sp
+#if 0
+    // SVE2 code not enabled until we have hardware
+    // for verification
+    mov $sve2flag,0
+    adrp $tmp,OPENSSL_armcap_P
+    ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
+    tst $tmpw,#ARMV8_SVE2
+    b.eq 1f
+    mov $sve2flag,1
+1:
+#endif
+___
+    &chacha20_process();
+$code.=<<___;
+    add sp,sp,#64
+    ldp d10,d11,[sp,16]
+    ldp d12,d13,[sp,32]
+    ldp d8,d9,[sp],48
+    str $wctr,[$ctr]
+    and $len,$len,#63
+    add $len,$len,$blocks,lsl #6
+.Lreturn:
+    ret
+.size ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve
+___
+
+}}}
+
+########################################
+{
+my %opcode_unpred = (
+    "eor"     => 0x04a03000,
+    "add"     => 0x04200000,
+    "orr"     => 0x04603000,
+    "lsl"     => 0x04209C00,
+    "lsr"     => 0x04209400,
+    "incw"    => 0x04B0C000,
+    "xar"     => 0x04203400,
+    "zip1"    => 0x05206000,
+    "zip2"    => 0x05206400,
+    "uzp1"    => 0x05206800,
+    "uzp2"    => 0x05206C00,
+    "index"   => 0x04204C00,
+    "mov"     => 0x05203800,
+    "dup"     => 0x05203800,
+    "tbl"     => 0x05203000);
+
+my %opcode_imm_unpred = (
+    "dup"     => 0x2538C000,
+    "index"   => 0x04204400);
+
+my %opcode_scalar_pred = (
+    "mov"     => 0x0528A000,
+    "cpy"     => 0x0528A000,
+    "st4w"    => 0xE5606000,
+    "st1w"    => 0xE5004000,
+    "ld1w"    => 0xA5404000);
+
+my %opcode_gather_pred = (
+    "ld1w"    => 0x85204000);
+
+my %opcode_pred = (
+    "eor"     => 0x04190000,
+    "add"     => 0x04000000,
+    "orr"     => 0x04180000,
+    "whilelo" => 0x25200C00,
+    "whilelt" => 0x25200400,
+    "cntp"    => 0x25208000,
+    "addvl"   => 0x04205000,
+    "lsl"     => 0x04038000,
+    "lsr"     => 0x04018000,
+    "sel"     => 0x0520C000,
+    "mov"     => 0x0520C000,
+    "ptrue"   => 0x2518E000,
+    "pfalse"  => 0x2518E400,
+    "ptrues"  => 0x2519E000,
+    "pnext"   => 0x2519C400,
+    "ld4w"    => 0xA560E000,
+    "st4w"    => 0xE570E000,
+    "st1w"    => 0xE500E000,
+    "ld1w"    => 0xA540A000,
+    "revh"    => 0x05258000);
+
+my %tsize = (
+    'b'       => 0,
+    'h'       => 1,
+    's'       => 2,
+    'd'       => 3);
+
+my %sf = (
+    "w"       => 0,
+    "x"       => 1);
+
+my %pattern = (
+    "POW2"    => 0,
+    "VL1"     => 1,
+    "VL2"     => 2,
+    "VL3"     => 3,
+    "VL4"     => 4,
+    "VL5"     => 5,
+    "VL6"     => 6,
+    "VL7"     => 7,
+    "VL8"     => 8,
+    "VL16"    => 9,
+    "VL32"    => 10,
+    "VL64"    => 11,
+    "VL128"   => 12,
+    "VL256"   => 13,
+    "MUL4"    => 29,
+    "MUL3"    => 30,
+    "ALL"     => 31);
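[Editorial note: the hashes above carry the base opcodes of a small homegrown
SVE assembler; the sve_unpred/sve_pred/sve_other subs that follow OR register
numbers and element-size fields into them. A hand-computed worked example,
shown only to illustrate the field layout: "add z1.s,z1.s,z2.s" starts from
the 0x04200000 base and encodes as 0x04a20021.]

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t base = 0x04200000;   /* "add" in %opcode_unpred           */
        uint32_t size = 2;            /* .s (32-bit elements), %tsize{'s'} */
        uint32_t zd = 1, zn = 1, zm = 2;

        /* field placement as in sve_unpred(): size<<22, Zm<<16, Zn<<5, Zd */
        uint32_t inst = base | (size << 22) | (zm << 16) | (zn << 5) | zd;
        printf(".inst 0x%08x // add z1.s,z1.s,z2.s\n", inst); /* 0x04a20021 */
        return 0;
    }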
+sub create_verifier {
+    my $filename="./compile_sve.sh";
+
+$scripts = <<___;
+#! /bin/bash
+set -e
+CROSS_COMPILE=\${CROSS_COMPILE:-'aarch64-none-linux-gnu-'}
+
+[ -z "\$1" ] && exit 1
+ARCH=`uname -p | xargs echo -n`
+
+# need gcc-10 and above to compile SVE code
+# change this according to your system during debugging
+if [ \$ARCH == 'aarch64' ]; then
+    CC=gcc-11
+    OBJDUMP=objdump
+else
+    CC=\${CROSS_COMPILE}gcc
+    OBJDUMP=\${CROSS_COMPILE}objdump
+fi
+TMPFILE=/tmp/\$\$
+cat > \$TMPFILE.c << EOF
+extern __attribute__((noinline, section("disasm_output"))) void dummy_func()
+{
+    asm("\$@\\t\\n");
+}
+int main(int argc, char *argv[])
+{
+}
+EOF
+\$CC -march=armv8.2-a+sve+sve2 -o \$TMPFILE.out \$TMPFILE.c
+\$OBJDUMP -d \$TMPFILE.out | awk -F"\\n" -v RS="\\n\\n" '\$1 ~ /dummy_func/' | awk 'FNR == 2 {printf "%s",\$2}'
+rm \$TMPFILE.c \$TMPFILE.out
+___
+    open(FH, '>', $filename) or die $!;
+    print FH $scripts;
+    close(FH);
+    system("chmod a+x ./compile_sve.sh");
+}
+
+sub compile_sve {
+    return `./compile_sve.sh '@_'`
+}
+
+sub verify_inst {
+    my ($code,$inst)=@_;
+    my $hexcode = (sprintf "%08x", $code);
+
+    if ($debug_encoder == 1) {
+        my $expect=&compile_sve($inst);
+        if ($expect ne $hexcode) {
+            return (sprintf "%s // Encode Error! expect [%s] actual [%s]", $inst, $expect, $hexcode);
+        }
+    }
+    return (sprintf ".inst\t0x%s\t//%s", $hexcode, $inst);
+}
+
+sub reg_code {
+    my $code = shift;
+
+    if ($code eq "zr") {
+        return "31";
+    }
+    return $code;
+}
+
+sub encode_size_imm() {
+    my ($mnemonic, $isize, $const)=@_;
+    my $esize = (8<<$tsize{$isize});
+    my $tsize_imm = $esize + $const;
+
+    if ($mnemonic eq "lsr" || $mnemonic eq "xar") {
+        $tsize_imm = 2*$esize - $const;
+    }
+    return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<16);
+}
+
+sub encode_shift_pred() {
+    my ($mnemonic, $isize, $const)=@_;
+    my $esize = (8<<$tsize{$isize});
+    my $tsize_imm = $esize + $const;
+
+    if ($mnemonic eq "lsr") {
+        $tsize_imm = 2*$esize - $const;
+    }
+    return (($tsize_imm>>5)<<22)|(($tsize_imm&0x1f)<<5);
+}
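[Editorial note: the two encode helpers above pack element size and shift
amount into the combined tszh:tszl:imm3 fields of SVE shift-by-immediate
forms: left shifts encode esize+shift, right shifts (and xar) encode
2*esize-shift. A hand-checkable C transcription, for illustration only.]

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t size_imm(unsigned esize, unsigned shift, int right)
    {
        unsigned tsize_imm = right ? 2 * esize - shift : esize + shift;
        /* field split as in encode_size_imm(): high bits at 22, low at 16 */
        return ((tsize_imm >> 5) << 22) | ((tsize_imm & 0x1f) << 16);
    }

    int main(void)
    {
        /* the 12/20 pair from the quarter-round: both give 0x004c0000 */
        printf("lsl #12 -> 0x%08x\n", size_imm(32, 12, 0));
        printf("lsr #20 -> 0x%08x\n", size_imm(32, 20, 1));
        return 0;
    }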
+sub sve_unpred {
+    my ($mnemonic,$arg)=@_;
+    my $inst = (sprintf "%s %s", $mnemonic,$arg);
+
+    if ($arg =~ m/z([0-9]+)\.([bhsd]),\s*\{\s*z([0-9]+)\.[bhsd].*\},\s*z([0-9]+)\.[bhsd].*/o) {
+        return &verify_inst($opcode_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22)|($4<<16),
+                            $inst)
+    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*([zwx][0-9]+.*)/o) {
+        my $regd = $1;
+        my $isize = $2;
+        my $regs = $3;
+
+        if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
+            if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
+                && ((8<<$tsize{$isize}) > $2)) {
+                return &verify_inst($opcode_unpred{$mnemonic}|$regd|($1<<5)|&encode_size_imm($mnemonic,$isize,$2),
+                                    $inst);
+            }
+        } elsif($regs =~ m/[wx]([0-9]+),\s*[wx]([0-9]+)/o) {
+            return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
+        } elsif ($regs =~ m/[wx]([0-9]+),\s*#?([0-9]+)/o) {
+            return &verify_inst($opcode_imm_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5)|($2<<16), $inst);
+        } elsif ($regs =~ m/[wx]([0-9]+)/o) {
+            return &verify_inst($opcode_unpred{$mnemonic}|$regd|($tsize{$isize}<<22)|($1<<5), $inst);
+        } else {
+            my $encoded_size = 0;
+            if (($mnemonic eq "add") || ($mnemonic =~ /zip./) || ($mnemonic =~ /uzp./) ) {
+                $encoded_size = ($tsize{$isize}<<22);
+            }
+            if ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd],\s*([0-9]+)/o &&
+                $1 == $regd) {
+                return &verify_inst($opcode_unpred{$mnemonic}|$regd|($2<<5)|&encode_size_imm($mnemonic,$isize,$3), $inst);
+            } elsif ($regs =~ m/z([0-9]+)\.[bhsd],\s*z([0-9]+)\.[bhsd]/o) {
+                return &verify_inst($opcode_unpred{$mnemonic}|$regd|$encoded_size|($1<<5)|($2<<16), $inst);
+            }
+        }
+    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*#?([0-9]+)/o) {
+        return &verify_inst($opcode_imm_unpred{$mnemonic}|$1|($3<<5)|($tsize{$2}<<22),
+                            $inst)
+    }
+    sprintf "%s // fail to parse", $inst;
+}
+
+sub sve_pred {
+    my ($mnemonic,$arg)=@_;
+    my $inst = (sprintf "%s %s", $mnemonic,$arg);
+
+    if ($arg =~ m/\{\s*z([0-9]+)\.([bhsd]).*\},\s*p([0-9])+(\/z)?,\s*\[(\s*[xs].*)\]/o) {
+        my $zt = $1;
+        my $size = $tsize{$2};
+        my $pg = $3;
+        my $addr = $5;
+        my $xn = 31;
+
+        if ($addr =~ m/x([0-9]+)\s*/o) {
+            $xn = $1;
+        }
+        if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
+            return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
+        } elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
+            my $xs = ($2 eq "SXTW") ? 1 : 0;
+            return &verify_inst($opcode_gather_pred{$mnemonic}|($xs<<22)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
+        } elsif($addr =~ m/\w+\s*,\s*#?([0-9]+)/o) {
+            return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
+        } else {
+            return &verify_inst($opcode_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($xn<<5),$inst);
+        }
+    } elsif ($arg =~ m/z([0-9]+)\.([bhsd]),\s*p([0-9]+)\/([mz]),\s*([zwx][0-9]+.*)/o) {
+        my $regd = $1;
+        my $isize = $2;
+        my $pg = $3;
+        my $mod = $4;
+        my $regs = $5;
+
+        if (($mnemonic eq "lsl") || ($mnemonic eq "lsr")) {
+            if ($regs =~ m/z([0-9]+)[^,]*(?:,\s*#?([0-9]+))?/o
+                && $regd == $1
+                && $mod eq 'm'
+                && ((8<<$tsize{$isize}) > $2)) {
+                return &verify_inst($opcode_pred{$mnemonic}|$regd|($pg<<10)|&encode_shift_pred($mnemonic,$isize,$2), $inst);
+            }
+        } elsif($regs =~ m/[wx]([0-9]+)/o) {
+            return &verify_inst($opcode_scalar_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
+        } elsif ($regs =~ m/z([0-9]+)[^,]*(?:,\s*z([0-9]+))?/o) {
+            if ($mnemonic eq "sel") {
+                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($2<<16), $inst);
+            } elsif ($mnemonic eq "mov") {
+                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5)|($regd<<16), $inst);
+            } elsif (length $2 > 0) {
+                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($2<<5), $inst);
+            } else {
+                return &verify_inst($opcode_pred{$mnemonic}|$regd|($tsize{$isize}<<22)|($pg<<10)|($1<<5), $inst);
+            }
+        }
+    } elsif ($arg =~ m/p([0-9]+)\.([bhsd]),\s*(\w+.*)/o) {
+        my $pg = $1;
+        my $isize = $2;
+        my $regs = $3;
+
+        if ($regs =~ m/([wx])(zr|[0-9]+),\s*[wx](zr|[0-9]+)/o) {
+            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($sf{$1}<<12)|(&reg_code($2)<<5)|(&reg_code($3)<<16), $inst);
+        } elsif ($regs =~ m/p([0-9]+),\s*p([0-9]+)\.[bhsd]/o) {
+            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($1<<5), $inst);
+        } else {
+            return &verify_inst($opcode_pred{$mnemonic}|($tsize{$isize}<<22)|$pg|($pattern{$regs}<<5), $inst);
+        }
+    } elsif ($arg =~ m/p([0-9]+)\.([bhsd])/o) {
+        return &verify_inst($opcode_pred{$mnemonic}|$1, $inst);
+    }
+
+    sprintf "%s // fail to parse", $inst;
+}
+sub sve_other {
+    my ($mnemonic,$arg)=@_;
+    my $inst = (sprintf "%s %s", $mnemonic,$arg);
+
+    if ($arg =~ m/x([0-9]+)[^,]*,\s*p([0-9]+)[^,]*,\s*p([0-9]+)\.([bhsd])/o) {
+        return &verify_inst($opcode_pred{$mnemonic}|($tsize{$4}<<22)|$1|($2<<10)|($3<<5), $inst);
+    } elsif ($mnemonic =~ /inc[bhdw]/) {
+        if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
+            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(2<<12)|(($3 - 1)<<16), $inst);
+        } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
+            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
+        } elsif ($arg =~ m/x([0-9]+)/o) {
+            return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
+        }
+    } elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
+        return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
+    }
+    sprintf "%s // fail to parse", $inst;
+}
+}
+
+open SELF,$0;
+while(<SELF>) {
+    next if (/^#!/);
+    last if (!s/^#/\/\// and !/^$/);
+    print;
+}
+close SELF;
+
+if ($debug_encoder == 1) {
+    &create_verifier();
+}
+
+foreach(split("\n",$code)) {
+    s/\`([^\`]*)\`/eval($1)/ge;
+
+    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
+    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
+    s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
+    s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
+    s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
+    s/\b(cntp|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
+
+    print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
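[Editorial note: the glue change below relies on a specific calling
convention: ChaCha20_ctr32_sve consumes as many whole vector-lengths of
blocks as it can, updates the counter, and leaves the residual length for
the caller. Sketched in C; chacha20_sve_bulk and chacha20_neon_scalar are
invented stand-ins for the two assembly routines.]

    #include <stddef.h>
    #include <string.h>

    size_t chacha20_sve_bulk(unsigned char *out, const unsigned char *in,
                             size_t len, const unsigned int key[8],
                             unsigned int counter[4]); /* returns bytes left */
    void chacha20_neon_scalar(unsigned char *out, const unsigned char *in,
                              size_t len, const unsigned int key[8],
                              const unsigned int counter[4]);

    void chacha20_hybrid(unsigned char *out, const unsigned char *in,
                         size_t len, const unsigned int key[8],
                         const unsigned int counter[4])
    {
        /* writable copy, like the stack copy made in ChaCha20_ctr32 below */
        unsigned int ctr[4];
        memcpy(ctr, counter, sizeof(ctr));

        size_t left = chacha20_sve_bulk(out, in, len, key, ctr);
        if (left > 0)
            chacha20_neon_scalar(out + (len - left), in + (len - left),
                                 left, key, ctr);
    }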
diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl
index e1a8b81594..f6e0e2ef54 100755
--- a/crypto/chacha/asm/chacha-armv8.pl
+++ b/crypto/chacha/asm/chacha-armv8.pl
@@ -136,6 +136,8 @@ $code.=<<___;
 #ifndef __KERNEL__
 .extern OPENSSL_armcap_P
 .hidden OPENSSL_armcap_P
+
+.extern ChaCha20_ctr32_sve
 #endif
 
 .text
@@ -149,18 +151,17 @@ $code.=<<___;
 .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
 .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm"
 
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
+.globl ChaCha20_ctr32_dflt
+.type ChaCha20_ctr32_dflt,%function
 .align 5
-ChaCha20_ctr32:
+ChaCha20_ctr32_dflt:
     AARCH64_SIGN_LINK_REGISTER
-    cbz $len,.Labort
     cmp $len,#192
     b.lo .Lshort
-
 #ifndef __KERNEL__
     adrp x17,OPENSSL_armcap_P
     ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+.Lcheck_neon:
     tst w17,#ARMV7_NEON
     b.ne .LChaCha20_neon
 #endif
@@ -344,6 +345,41 @@ $code.=<<___;
     ldp x29,x30,[sp],#96
     AARCH64_VALIDATE_LINK_REGISTER
     ret
+.size ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt
+
+.globl ChaCha20_ctr32
+.type ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+    AARCH64_SIGN_LINK_REGISTER
+    cbz $len,.Labort
+    cmp $len,#192
+    b.lo .Lshort
+#ifndef __KERNEL__
+    adrp x17,OPENSSL_armcap_P
+    ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+    tst w17,#ARMV8_SVE
+    b.eq .Lcheck_neon
+    stp x29,x30,[sp,#-16]!
+    sub sp,sp,#16
+    // SVE handling will inevitably increment the counter.
+    // The Neon/Scalar code that follows to process the tail data
+    // needs to use the new counter. Unfortunately the input counter
+    // buffer pointed to by ctr is meant to be read-only per the API
+    // contract, so we have to copy it to the stack to be writable by SVE.
+    ldp x5,x6,[$ctr]
+    stp x5,x6,[sp]
+    mov $ctr,sp
+    bl ChaCha20_ctr32_sve
+    cbz $len,1f
+    bl ChaCha20_ctr32_dflt
+1:
+    add sp,sp,#16
+    ldp x29,x30,[sp],#16
+    AARCH64_VALIDATE_LINK_REGISTER
+    ret
+#endif
+    b .Lshort
 .size ChaCha20_ctr32,.-ChaCha20_ctr32
 ___

diff --git a/crypto/chacha/build.info b/crypto/chacha/build.info
index 5fe7477873..d02c571d02 100644
--- a/crypto/chacha/build.info
+++ b/crypto/chacha/build.info
@@ -10,7 +10,7 @@ IF[{- !$disabled{asm} -}]
   $CHACHAASM_s390x=chacha-s390x.S
 
   $CHACHAASM_armv4=chacha-armv4.S
-  $CHACHAASM_aarch64=chacha-armv8.S
+  $CHACHAASM_aarch64=chacha-armv8.S chacha-armv8-sve.S
 
   $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s chachap10-ppc.s
   $CHACHAASM_ppc64=$CHACHAASM_ppc32
@@ -33,7 +33,9 @@ GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl
 GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl
 INCLUDE[chacha-armv4.o]=..
 GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl
+GENERATE[chacha-armv8-sve.S]=asm/chacha-armv8-sve.pl
 INCLUDE[chacha-armv8.o]=..
+INCLUDE[chacha-armv8-sve.o]=..
 INCLUDE[chacha-s390x.o]=..
 GENERATE[chacha-c64xplus.S]=asm/chacha-c64xplus.pl
 GENERATE[chacha-s390x.S]=asm/chacha-s390x.pl