+ lghi ("%r1",0);
+
+LABEL (".Loop_tail_4x");
+ llgc ("%r5","0(%r1,$inp)");
+ llgc ("%r6","$stdframe(%r1,$sp)");
+ xr ("%r6","%r5");
+ stc ("%r6","0(%r1,$out)");
+ la ("%r1","1(%r1)");
+ brct ($len,".Loop_tail_4x");
+
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+SIZE ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
+}
+
+########################################################################
+# The 6x"horizontal" layout is the optimal fit for the platform in its
+# current shape, more specifically for the given vector instructions'
+# latency. The computational part of an 8x"vertical" layout would be
+# faster, but it consumes all registers, and dealing with that would
+# diminish the return...
+#
+{
+my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1, # rows a,b,c,d of blocks 0..5, mapped
+ $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3, # onto %v0..%v23 in this order, so each
+ $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23)); # block's a..d are 4 consecutive registers
+my @K=map("%v$_",(27,24..26)); # K[0]=%v27 (sigma), K[1..2]=%v24,%v25 (key), K[3]=%v26 (counter)
+my ($t0,$t1,$t2,$t3)=map("%v$_",27..30); # temporaries; note $t0 is the same register as @K[0]
+my $beperm="%v31"; # selector operand for the vperm's below, loaded from .Lsigma
+my $FRAME=$stdframe + 4*16; # 64 bytes on top of $stdframe (FPR save area if $z, tail scratch)
+
+GLOBL ("ChaCha20_ctr32_vx"); # vector code path: XORs $len bytes of key stream from $inp into $out
+ALIGN (32);
+LABEL ("ChaCha20_ctr32_vx");
+LABEL (".LChaCha20_ctr32_vx"); # second, local label for the same entry point
+&{$z? \&clgfi:\&clfi} ($len,256); # $z picks the 64-bit instruction forms throughout
+ jle (".LChaCha20_ctr32_4x"); # len <= 256: use the 4x path (label defined outside this block)
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)"); # save %r6/%r7; reloaded in both epilogues
+if (!$z) { # non-$z build: save %f4/%f6, relative to the incoming $sp
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
+ lgr ("%r0",$sp); # keep the incoming $sp for the back-chain store
+ la ($sp,"0(%r1,$sp)"); # $sp -= $FRAME
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
+if ($z) { # $z build: save %f8..%f15 at $FRAME-64..$FRAME-8, i.e. from $stdframe up
+ std ("%f8","$FRAME-8*8($sp)");
+ std ("%f9","$FRAME-8*7($sp)");
+ std ("%f10","$FRAME-8*6($sp)");
+ std ("%f11","$FRAME-8*5($sp)");
+ std ("%f12","$FRAME-8*4($sp)");
+ std ("%f13","$FRAME-8*3($sp)");
+ std ("%f14","$FRAME-8*2($sp)");
+ std ("%f15","$FRAME-8*1($sp)");
+}
+ larl ("%r7",".Lsigma"); # %r7 = address of the constant table, kept for the reload below
+ lhi ("%r0",10); # .Loop_vx iteration count (two VX_ROUND calls per iteration)
+
+ vlm (@K[1],@K[2],"0($key)"); # load key
+ vl (@K[3],"0($counter)"); # load counter
+
+ vlm (@K[0],"$beperm","0(%r7)"); # load sigma, increments, ... (five vectors: %v27..%v31)
+
+LABEL (".Loop_outer_vx"); # one pass generates key stream for up to six 64-byte blocks
+ vlr ($a0,@K[0]); # rows a and b of every block start as copies of K[0] and K[1]
+ vlr ($b0,@K[1]);
+ vlr ($a1,@K[0]);
+ vlr ($b1,@K[1]);
+ vlr ($a2,@K[0]);
+ vlr ($b2,@K[1]);
+ vlr ($a3,@K[0]);
+ vlr ($b3,@K[1]);
+ vlr ($a4,@K[0]);
+ vlr ($b4,@K[1]);
+ vlr ($a5,@K[0]);
+ vlr ($b5,@K[1]);
+
+ vlr ($d0,@K[3]); # block 0 uses the counter row as is
+ vaf ($d1,@K[3],$t1); # K[3]+1
+ vaf ($d2,@K[3],$t2); # K[3]+2
+ vaf ($d3,@K[3],$t3); # K[3]+3
+ vaf ($d4,$d2,$t2); # K[3]+4
+ vaf ($d5,$d2,$t3); # K[3]+5
+
+ vlr ($c0,@K[2]); # row c of every block starts as a copy of K[2]
+ vlr ($c1,@K[2]);
+ vlr ($c2,@K[2]);
+ vlr ($c3,@K[2]);
+ vlr ($c4,@K[2]);
+ vlr ($c5,@K[2]);
+
+ vlr ($t1,$d1); # keep the initial d rows of blocks 1..3 in t1..t3 for the
+ vlr ($t2,$d2); # post-round additions; this overwrites the increments,
+ vlr ($t3,$d3); # which are re-loaded from 0(%r7) further down
+
+ALIGN (4);
+LABEL (".Loop_vx"); # executed %r0 = 10 times per outer pass
+
+ VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5, # VX_ROUND is defined outside this block
+ $b0,$b1,$b2,$b3,$b4,$b5,
+ $c0,$c1,$c2,$c3,$c4,$c5,
+ $d0,$d1,$d2,$d3,$d4,$d5,
+ 0); # last argument 0 here, 1 in the next call
+
+ VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
+ $b0,$b1,$b2,$b3,$b4,$b5,
+ $c0,$c1,$c2,$c3,$c4,$c5,
+ $d0,$d1,$d2,$d3,$d4,$d5,
+ 1); # presumably selects the second round flavour - confirm in VX_ROUND
+
+ brct ("%r0",".Loop_vx");
+
+ vaf ($a0,$a0,@K[0]); # block 0: add the initial rows back in
+ vaf ($b0,$b0,@K[1]);
+ vaf ($c0,$c0,@K[2]);
+ vaf ($d0,$d0,@K[3]);
+ vaf ($a1,$a1,@K[0]); # needs K[0], which the input load below overwrites (same reg as t0)
+ vaf ($d1,$d1,$t1); # +K[3]+1
+
+ vperm ($a0,$a0,$a0,$beperm); # byte-permute each row of block 0 with the $beperm selector
+ vperm ($b0,$b0,$b0,$beperm); # NOTE(review): presumably an endianness fix-up, going by
+ vperm ($c0,$c0,$c0,$beperm); # the name - confirm against the table at .Lsigma
+ vperm ($d0,$d0,$d0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx"); # fewer than 64 bytes left: .Ltail_vx consumes a0..d0
+
+ vaf ($d2,$d2,$t2); # +K[3]+2
+ vaf ($d3,$d3,$t3); # +K[3]+3
+ vlm ($t0,$t3,"0($inp)"); # 64 bytes of input; clobbers t0..t3 and thereby K[0]
+
+ vx ($a0,$a0,$t0); # XOR input with the key stream of block 0
+ vx ($b0,$b0,$t1);
+ vx ($c0,$c0,$t2);
+ vx ($d0,$d0,$t3);
+
+ vlm (@K[0],$t3,"0(%r7)"); # re-load sigma and increments
+
+ vstm ($a0,$d0,"0($out)"); # write 64 bytes of output
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx"); # nothing left
+
+ vaf ($b1,$b1,@K[1]); # block 1: a1 and d1 were already completed above
+ vaf ($c1,$c1,@K[2]);
+
+ vperm ($a0,$a1,$a1,$beperm); # permuted block 1 goes to a0..d0, where the XOR/store
+ vperm ($b0,$b1,$b1,$beperm); # code and .Ltail_vx expect the current block
+ vperm ($c0,$c1,$c1,$beperm);
+ vperm ($d0,$d1,$d1,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)"); # a1..d1 are free now and serve as the input buffer
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a2,$a2,@K[0]); # block 2: d2 was already completed before the first input load
+ vaf ($b2,$b2,@K[1]);
+ vaf ($c2,$c2,@K[2]);
+
+ vperm ($a0,$a2,$a2,$beperm);
+ vperm ($b0,$b2,$b2,$beperm);
+ vperm ($c0,$c2,$c2,$beperm);
+ vperm ($d0,$d2,$d2,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a3,$a3,@K[0]); # block 3: d3 was already completed before the first input load
+ vaf ($b3,$b3,@K[1]);
+ vaf ($c3,$c3,@K[2]);
+ vaf ($d2,@K[3],$t3); # K[3]+3 (d2 is free after block 2 and becomes a temporary)
+
+ vperm ($a0,$a3,$a3,$beperm);
+ vperm ($b0,$b3,$b3,$beperm);
+ vperm ($c0,$c3,$c3,$beperm);
+ vperm ($d0,$d3,$d3,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vaf ($d3,$d2,$t1); # K[3]+4 (d3 is free once permuted into d0)
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a4,$a4,@K[0]); # block 4
+ vaf ($b4,$b4,@K[1]);
+ vaf ($c4,$c4,@K[2]);
+ vaf ($d4,$d4,$d3); # +K[3]+4
+ vaf ($d3,$d3,$t1); # K[3]+5
+ vaf (@K[3],$d2,$t3); # K[3]+=6 (d2 holds K[3]+3), counter for the next outer pass
+
+ vperm ($a0,$a4,$a4,$beperm);
+ vperm ($b0,$b4,$b4,$beperm);
+ vperm ($c0,$c4,$c4,$beperm);
+ vperm ($d0,$d4,$d4,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a5,$a5,@K[0]); # block 5
+ vaf ($b5,$b5,@K[1]);
+ vaf ($c5,$c5,@K[2]);
+ vaf ($d5,$d5,$d3); # +K[3]+5
+
+ vperm ($a0,$a5,$a5,$beperm);
+ vperm ($b0,$b5,$b5,$beperm);
+ vperm ($c0,$c5,$c5,$beperm);
+ vperm ($d0,$d5,$d5,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+ lhi ("%r0",10); # re-arm the .Loop_vx counter for the next outer pass
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ jne (".Loop_outer_vx"); # more input left: generate another six blocks
+
+LABEL (".Ldone_vx"); # epilogue for input that ended on a 64-byte boundary
+if (!$z) { # same slots as in the prologue, now $FRAME further from $sp
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+ ld ("%f8","$FRAME-8*8($sp)");
+ ld ("%f9","$FRAME-8*7($sp)");
+ ld ("%f10","$FRAME-8*6($sp)");
+ ld ("%f11","$FRAME-8*5($sp)");
+ ld ("%f12","$FRAME-8*4($sp)");
+ ld ("%f13","$FRAME-8*3($sp)");
+ ld ("%f14","$FRAME-8*2($sp)");
+ ld ("%f15","$FRAME-8*1($sp)");
+}
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)"); # reload %r6/%r7 saved on entry
+ la ($sp,"$FRAME($sp)"); # release the frame
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail_vx"); # fewer than 64 bytes left; key stream block is in a0..d0
+if (!$z) { # FPRs are reloaded first: with $z their save slots start at
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)"); # $FRAME-8*8 = $stdframe, the area the
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)"); # vstm below overwrites
+} else {
+ ld ("%f8","$FRAME-8*8($sp)");
+ ld ("%f9","$FRAME-8*7($sp)");
+ ld ("%f10","$FRAME-8*6($sp)");
+ ld ("%f11","$FRAME-8*5($sp)");
+ ld ("%f12","$FRAME-8*4($sp)");
+ ld ("%f13","$FRAME-8*3($sp)");
+ ld ("%f14","$FRAME-8*2($sp)");
+ ld ("%f15","$FRAME-8*1($sp)");
+}
+ vstm ($a0,$d0,"$stdframe($sp)"); # spill the 64-byte key stream block to the stack scratch
+ lghi ("%r1",0); # byte index
+
+LABEL (".Loop_tail_vx"); # XOR the remaining $len bytes one at a time
+ llgc ("%r5","0(%r1,$inp)"); # input byte
+ llgc ("%r6","$stdframe(%r1,$sp)"); # key stream byte
+ xr ("%r6","%r5");
+ stc ("%r6","0(%r1,$out)"); # output byte
+ la ("%r1","1(%r1)"); # next index
+ brct ($len,".Loop_tail_vx"); # $len doubles as the loop counter
+
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)"); # %r6 was scratch above, %r7 held .Lsigma
+ la ($sp,"$FRAME($sp)"); # NOTE(review): spilled key stream is not wiped from the stack here
+ br ("%r14");
+SIZE ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");