s390x assembly pack: import chacha from cryptogams repo
[openssl.git] / crypto / chacha / asm / chacha-s390x.pl
index 1b13a41c8be37f07137af53b4d0aa05b1eeb6bc3..51efe64408bb04fc6e279d242930805ff21a8afa 100755 (executable)
@@ -1,5 +1,5 @@
 #! /usr/bin/env perl
-# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the Apache License 2.0 (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 #
 # 3 times faster than compiler-generated code.
 
-$flavour = shift;
+#
+# August 2018
+#
+# Add vx code path: 4x"vertical".
+#
+# Copyright IBM Corp. 2018
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+
+#
+# February 2019
+#
+# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
+# 4x"vertical" submission [on z13] and >3 faster than scalar code.
+# But to harness overheads revert to transliteration of VSX code path
+# from chacha-ppc module, which is also 4x"vertical", to handle inputs
+# not longer than 256 bytes.
+
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin/../..";
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);
 
+my $flavour = shift;
+
+my ($z,$SIZE_T);
 if ($flavour =~ /3[12]/) {
+       $z=0;   # S/390 ABI
        $SIZE_T=4;
-       $g="";
 } else {
+       $z=1;   # zSeries ABI
        $SIZE_T=8;
-       $g="g";
 }
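+
+# $z selects between 31- and 64-bit opcode flavours throughout, e.g.
+# &{$z? \&ltgr:\&ltr} ($len,$len) emits ltgr on 64-bit and ltr on 31-bit
+# builds, replacing the old lt${g}r suffix-substitution idiom.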
 
+my $output;
 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-sub AUTOLOAD()         # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
-    $code .= "\t$opcode\t".join(',',@_)."\n";
-}
 
 my $sp="%r15";
-
 my $stdframe=16*$SIZE_T+4*8;
-my $frame=$stdframe+4*20;
-
-my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
 
+sub ROUND {
 my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
 my @t=map("%r$_",(8,9));
-
-sub ROUND {
 my ($a0,$b0,$c0,$d0)=@_;
 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
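+# E.g. ROUND(0,4,8,12) covers the four columns: the map above rotates
+# each index within its row, so Q2 gets (1,5,9,13), Q3 (2,6,10,14) and
+# Q4 (3,7,11,15); ROUND(0,5,10,15) likewise yields the four diagonals.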
-my ($xc,$xc_)=map("\"$_\"",@t);
-my @x=map("\"$_\"",@x);
+my ($xc,$xc_)=map("$_",@t);
 
        # Consider order in which variables are addressed by their
        # index:
@@ -78,249 +90,967 @@ my @x=map("\"$_\"",@x);
        # 'c' stores and loads in the middle, but none in the beginning
        # or end.
 
-       (
-       "&alr   (@x[$a0],@x[$b0])",     # Q1
-        "&alr  (@x[$a1],@x[$b1])",     # Q2
-       "&xr    (@x[$d0],@x[$a0])",
-        "&xr   (@x[$d1],@x[$a1])",
-       "&rll   (@x[$d0],@x[$d0],16)",
-        "&rll  (@x[$d1],@x[$d1],16)",
-
-       "&alr   ($xc,@x[$d0])",
-        "&alr  ($xc_,@x[$d1])",
-       "&xr    (@x[$b0],$xc)",
-        "&xr   (@x[$b1],$xc_)",
-       "&rll   (@x[$b0],@x[$b0],12)",
-        "&rll  (@x[$b1],@x[$b1],12)",
-
-       "&alr   (@x[$a0],@x[$b0])",
-        "&alr  (@x[$a1],@x[$b1])",
-       "&xr    (@x[$d0],@x[$a0])",
-        "&xr   (@x[$d1],@x[$a1])",
-       "&rll   (@x[$d0],@x[$d0],8)",
-        "&rll  (@x[$d1],@x[$d1],8)",
-
-       "&alr   ($xc,@x[$d0])",
-        "&alr  ($xc_,@x[$d1])",
-       "&xr    (@x[$b0],$xc)",
-        "&xr   (@x[$b1],$xc_)",
-       "&rll   (@x[$b0],@x[$b0],7)",
-        "&rll  (@x[$b1],@x[$b1],7)",
-
-       "&stm   ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
-       "&lm    ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
-
-       "&alr   (@x[$a2],@x[$b2])",     # Q3
-        "&alr  (@x[$a3],@x[$b3])",     # Q4
-       "&xr    (@x[$d2],@x[$a2])",
-        "&xr   (@x[$d3],@x[$a3])",
-       "&rll   (@x[$d2],@x[$d2],16)",
-        "&rll  (@x[$d3],@x[$d3],16)",
-
-       "&alr   ($xc,@x[$d2])",
-        "&alr  ($xc_,@x[$d3])",
-       "&xr    (@x[$b2],$xc)",
-        "&xr   (@x[$b3],$xc_)",
-       "&rll   (@x[$b2],@x[$b2],12)",
-        "&rll  (@x[$b3],@x[$b3],12)",
-
-       "&alr   (@x[$a2],@x[$b2])",
-        "&alr  (@x[$a3],@x[$b3])",
-       "&xr    (@x[$d2],@x[$a2])",
-        "&xr   (@x[$d3],@x[$a3])",
-       "&rll   (@x[$d2],@x[$d2],8)",
-        "&rll  (@x[$d3],@x[$d3],8)",
-
-       "&alr   ($xc,@x[$d2])",
-        "&alr  ($xc_,@x[$d3])",
-       "&xr    (@x[$b2],$xc)",
-        "&xr   (@x[$b3],$xc_)",
-       "&rll   (@x[$b2],@x[$b2],7)",
-        "&rll  (@x[$b3],@x[$b3],7)"
-       );
+       alr     (@x[$a0],@x[$b0]);      # Q1
+        alr    (@x[$a1],@x[$b1]);      # Q2
+       xr      (@x[$d0],@x[$a0]);
+        xr     (@x[$d1],@x[$a1]);
+       rll     (@x[$d0],@x[$d0],16);
+        rll    (@x[$d1],@x[$d1],16);
+
+       alr     ($xc,@x[$d0]);
+        alr    ($xc_,@x[$d1]);
+       xr      (@x[$b0],$xc);
+        xr     (@x[$b1],$xc_);
+       rll     (@x[$b0],@x[$b0],12);
+        rll    (@x[$b1],@x[$b1],12);
+
+       alr     (@x[$a0],@x[$b0]);
+        alr    (@x[$a1],@x[$b1]);
+       xr      (@x[$d0],@x[$a0]);
+        xr     (@x[$d1],@x[$a1]);
+       rll     (@x[$d0],@x[$d0],8);
+        rll    (@x[$d1],@x[$d1],8);
+
+       alr     ($xc,@x[$d0]);
+        alr    ($xc_,@x[$d1]);
+       xr      (@x[$b0],$xc);
+        xr     (@x[$b1],$xc_);
+       rll     (@x[$b0],@x[$b0],7);
+        rll    (@x[$b1],@x[$b1],7);
+
+       stm     ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)");  # reload pair of 'c's
+       lm      ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");
+
+       alr     (@x[$a2],@x[$b2]);      # Q3
+        alr    (@x[$a3],@x[$b3]);      # Q4
+       xr      (@x[$d2],@x[$a2]);
+        xr     (@x[$d3],@x[$a3]);
+       rll     (@x[$d2],@x[$d2],16);
+        rll    (@x[$d3],@x[$d3],16);
+
+       alr     ($xc,@x[$d2]);
+        alr    ($xc_,@x[$d3]);
+       xr      (@x[$b2],$xc);
+        xr     (@x[$b3],$xc_);
+       rll     (@x[$b2],@x[$b2],12);
+        rll    (@x[$b3],@x[$b3],12);
+
+       alr     (@x[$a2],@x[$b2]);
+        alr    (@x[$a3],@x[$b3]);
+       xr      (@x[$d2],@x[$a2]);
+        xr     (@x[$d3],@x[$a3]);
+       rll     (@x[$d2],@x[$d2],8);
+        rll    (@x[$d3],@x[$d3],8);
+
+       alr     ($xc,@x[$d2]);
+        alr    ($xc_,@x[$d3]);
+       xr      (@x[$b2],$xc);
+        xr     (@x[$b3],$xc_);
+       rll     (@x[$b2],@x[$b2],7);
+        rll    (@x[$b3],@x[$b3],7);
+}
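+
+# For reference, each Qn group above is one ChaCha quarter-round. A
+# minimal plain-Perl model of it (illustrative only, never emitted):
+#
+#	sub quarter {
+#	my ($a,$b,$c,$d)=@_;
+#	my $rotl=sub { (($_[0]<<$_[1])|($_[0]>>(32-$_[1])))&0xffffffff };
+#	    $a=($a+$b)&0xffffffff; $d=&$rotl($d^$a,16);
+#	    $c=($c+$d)&0xffffffff; $b=&$rotl($b^$c,12);
+#	    $a=($a+$b)&0xffffffff; $d=&$rotl($d^$a,8);
+#	    $c=($c+$d)&0xffffffff; $b=&$rotl($b^$c,7);
+#	    ($a,$b,$c,$d);
+#	}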
+
+sub VX_lane_ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my @x=map("%v$_",(0..15));
+
+       vaf     (@x[$a0],@x[$a0],@x[$b0]);      # Q1
+       vx      (@x[$d0],@x[$d0],@x[$a0]);
+       verllf  (@x[$d0],@x[$d0],16);
+       vaf     (@x[$a1],@x[$a1],@x[$b1]);      # Q2
+       vx      (@x[$d1],@x[$d1],@x[$a1]);
+       verllf  (@x[$d1],@x[$d1],16);
+       vaf     (@x[$a2],@x[$a2],@x[$b2]);      # Q3
+       vx      (@x[$d2],@x[$d2],@x[$a2]);
+       verllf  (@x[$d2],@x[$d2],16);
+       vaf     (@x[$a3],@x[$a3],@x[$b3]);      # Q4
+       vx      (@x[$d3],@x[$d3],@x[$a3]);
+       verllf  (@x[$d3],@x[$d3],16);
+
+       vaf     (@x[$c0],@x[$c0],@x[$d0]);
+       vx      (@x[$b0],@x[$b0],@x[$c0]);
+       verllf  (@x[$b0],@x[$b0],12);
+       vaf     (@x[$c1],@x[$c1],@x[$d1]);
+       vx      (@x[$b1],@x[$b1],@x[$c1]);
+       verllf  (@x[$b1],@x[$b1],12);
+       vaf     (@x[$c2],@x[$c2],@x[$d2]);
+       vx      (@x[$b2],@x[$b2],@x[$c2]);
+       verllf  (@x[$b2],@x[$b2],12);
+       vaf     (@x[$c3],@x[$c3],@x[$d3]);
+       vx      (@x[$b3],@x[$b3],@x[$c3]);
+       verllf  (@x[$b3],@x[$b3],12);
+
+       vaf     (@x[$a0],@x[$a0],@x[$b0]);
+       vx      (@x[$d0],@x[$d0],@x[$a0]);
+       verllf  (@x[$d0],@x[$d0],8);
+       vaf     (@x[$a1],@x[$a1],@x[$b1]);
+       vx      (@x[$d1],@x[$d1],@x[$a1]);
+       verllf  (@x[$d1],@x[$d1],8);
+       vaf     (@x[$a2],@x[$a2],@x[$b2]);
+       vx      (@x[$d2],@x[$d2],@x[$a2]);
+       verllf  (@x[$d2],@x[$d2],8);
+       vaf     (@x[$a3],@x[$a3],@x[$b3]);
+       vx      (@x[$d3],@x[$d3],@x[$a3]);
+       verllf  (@x[$d3],@x[$d3],8);
+
+       vaf     (@x[$c0],@x[$c0],@x[$d0]);
+       vx      (@x[$b0],@x[$b0],@x[$c0]);
+       verllf  (@x[$b0],@x[$b0],7);
+       vaf     (@x[$c1],@x[$c1],@x[$d1]);
+       vx      (@x[$b1],@x[$b1],@x[$c1]);
+       verllf  (@x[$b1],@x[$b1],7);
+       vaf     (@x[$c2],@x[$c2],@x[$d2]);
+       vx      (@x[$b2],@x[$b2],@x[$c2]);
+       verllf  (@x[$b2],@x[$b2],7);
+       vaf     (@x[$c3],@x[$c3],@x[$d3]);
+       vx      (@x[$b3],@x[$b3],@x[$c3]);
+       verllf  (@x[$b3],@x[$b3],7);
+}
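+
+# In the 4x"vertical" layout above, %v0-%v15 mirror the scalar x[0]-x[15],
+# each register carrying the same state word for four independent blocks,
+# one per lane, so every vector instruction advances all four blocks; the
+# indices follow the same column/diagonal schedule as ROUND.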
+
+sub VX_ROUND {
+my @a=@_[0..5];
+my @b=@_[6..11];
+my @c=@_[12..17];
+my @d=@_[18..23];
+my $odd=@_[24];
+
+       vaf             (@a[$_],@a[$_],@b[$_]) for (0..5);
+       vx              (@d[$_],@d[$_],@a[$_]) for (0..5);
+       verllf          (@d[$_],@d[$_],16) for (0..5);
+
+       vaf             (@c[$_],@c[$_],@d[$_]) for (0..5);
+       vx              (@b[$_],@b[$_],@c[$_]) for (0..5);
+       verllf          (@b[$_],@b[$_],12) for (0..5);
+
+       vaf             (@a[$_],@a[$_],@b[$_]) for (0..5);
+       vx              (@d[$_],@d[$_],@a[$_]) for (0..5);
+       verllf          (@d[$_],@d[$_],8) for (0..5);
+
+       vaf             (@c[$_],@c[$_],@d[$_]) for (0..5);
+       vx              (@b[$_],@b[$_],@c[$_]) for (0..5);
+       verllf          (@b[$_],@b[$_],7) for (0..5);
+
+       vsldb           (@c[$_],@c[$_],@c[$_],8) for (0..5);
+       vsldb           (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
+       vsldb           (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
+}
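+
+# In the 6x"horizontal" layout each %v register holds one 4-word row of a
+# single block, six blocks side by side. The trailing vsldb rotations (by
+# one, two and three words) realign rows b, c and d so that the next
+# call's column arithmetic lands on the diagonals; $odd selects the
+# inverse rotation that restores the original alignment.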
+
+PERLASM_BEGIN($output);
+
+INCLUDE        ("s390x_arch.h");
+TEXT   ();
+
+################
+# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
+#                     const unsigned int key[8], const unsigned int counter[4])
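+#
+# len==0 returns immediately; lengths of 64 bytes or less take the scalar
+# code path below, longer inputs branch to the VX code path provided the
+# vector facility bit is set in OPENSSL_s390xcap_P.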
+my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
+{
+my $frame=$stdframe+4*20;
+my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
+my @t=map("%r$_",(8,9));
+
+GLOBL  ("ChaCha20_ctr32");
+TYPE   ("ChaCha20_ctr32","\@function");
+ALIGN  (32);
+LABEL  ("ChaCha20_ctr32");
+       larl    ("%r1","OPENSSL_s390xcap_P");
+
+       lghi    ("%r0",64);
+&{$z?  \&ltgr:\&ltr}   ($len,$len);            # len==0?
+       bzr     ("%r14");
+       lg      ("%r1","S390X_STFLE+16(%r1)");
+&{$z?  \&clgr:\&clr}   ($len,"%r0");
+       jle     (".Lshort");
+
+       tmhh    ("%r1",0x4000);                 # check for vx bit
+       jnz     (".LChaCha20_ctr32_vx");
+
+LABEL  (".Lshort");
+&{$z?  \&aghi:\&ahi}   ($len,-64);
+&{$z?  \&lghi:\&lhi}   ("%r1",-$frame);
+&{$z?  \&stmg:\&stm}   ("%r6","%r15","6*$SIZE_T($sp)");
+&{$z?  \&slgr:\&slr}   ($out,$inp);    # difference
+       la      ($len,"0($inp,$len)");  # end of input minus 64
+       larl    ("%r7",".Lsigma");
+       lgr     ("%r0",$sp);
+       la      ($sp,"0(%r1,$sp)");
+&{$z?  \&stg:\&st}     ("%r0","0($sp)");
+
+       lmg     ("%r8","%r11","0($key)");       # load key
+       lmg     ("%r12","%r13","0($counter)");  # load counter
+       lmg     ("%r6","%r7","0(%r7)"); # load sigma constant
+
+       la      ("%r14","0($inp)");
+&{$z?  \&stg:\&st}     ($out,"$frame+3*$SIZE_T($sp)");
+&{$z?  \&stg:\&st}     ($len,"$frame+4*$SIZE_T($sp)");
+       stmg    ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
+       srlg    (@x[12],"%r12",32);     # 32-bit counter value
+       j       (".Loop_outer");
+
+ALIGN  (16);
+LABEL  (".Loop_outer");
+       lm      (@x[0],@x[7],"$stdframe+4*0($sp)");     # load x[0]-x[7]
+       lm      (@t[0],@t[1],"$stdframe+4*10($sp)");    # load x[10]-x[11]
+       lm      (@x[13],@x[15],"$stdframe+4*13($sp)");  # load x[13]-x[15]
+       stm     (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
+       lm      (@t[0],@t[1],"$stdframe+4*8($sp)");     # load x[8]-x[9]
+       st      (@x[12],"$stdframe+4*12($sp)"); # save counter
+&{$z?  \&stg:\&st}     ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
+       lhi     ("%r14",10);
+       j       (".Loop");
+
+ALIGN  (4);
+LABEL  (".Loop");
+       ROUND   (0, 4, 8,12);
+       ROUND   (0, 5,10,15);
+       brct    ("%r14",".Loop");
+
+&{$z?  \&lg:\&l}       ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
+       stm     (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
+&{$z?  \&lmg:\&lm}     (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");
+
+       al      (@x[0],"$stdframe+4*0($sp)");   # accumulate key schedule
+       al      (@x[1],"$stdframe+4*1($sp)");
+       al      (@x[2],"$stdframe+4*2($sp)");
+       al      (@x[3],"$stdframe+4*3($sp)");
+       al      (@x[4],"$stdframe+4*4($sp)");
+       al      (@x[5],"$stdframe+4*5($sp)");
+       al      (@x[6],"$stdframe+4*6($sp)");
+       al      (@x[7],"$stdframe+4*7($sp)");
+       lrvr    (@x[0],@x[0]);
+       lrvr    (@x[1],@x[1]);
+       lrvr    (@x[2],@x[2]);
+       lrvr    (@x[3],@x[3]);
+       lrvr    (@x[4],@x[4]);
+       lrvr    (@x[5],@x[5]);
+       lrvr    (@x[6],@x[6]);
+       lrvr    (@x[7],@x[7]);
+       al      (@x[12],"$stdframe+4*12($sp)");
+       al      (@x[13],"$stdframe+4*13($sp)");
+       al      (@x[14],"$stdframe+4*14($sp)");
+       al      (@x[15],"$stdframe+4*15($sp)");
+       lrvr    (@x[12],@x[12]);
+       lrvr    (@x[13],@x[13]);
+       lrvr    (@x[14],@x[14]);
+       lrvr    (@x[15],@x[15]);
+
+       la      (@t[0],"0(@t[0],%r14)");        # reconstruct output pointer
+&{$z?  \&clgr:\&clr}   ("%r14",@t[1]);
+       jh      (".Ltail");
+
+       x       (@x[0],"4*0(%r14)");    # xor with input
+       x       (@x[1],"4*1(%r14)");
+       st      (@x[0],"4*0(@t[0])");   # store output
+       x       (@x[2],"4*2(%r14)");
+       st      (@x[1],"4*1(@t[0])");
+       x       (@x[3],"4*3(%r14)");
+       st      (@x[2],"4*2(@t[0])");
+       x       (@x[4],"4*4(%r14)");
+       st      (@x[3],"4*3(@t[0])");
+        lm     (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
+       x       (@x[5],"4*5(%r14)");
+       st      (@x[4],"4*4(@t[0])");
+       x       (@x[6],"4*6(%r14)");
+        al     (@x[0],"$stdframe+4*8($sp)");
+       st      (@x[5],"4*5(@t[0])");
+       x       (@x[7],"4*7(%r14)");
+        al     (@x[1],"$stdframe+4*9($sp)");
+       st      (@x[6],"4*6(@t[0])");
+       x       (@x[12],"4*12(%r14)");
+        al     (@x[2],"$stdframe+4*10($sp)");
+       st      (@x[7],"4*7(@t[0])");
+       x       (@x[13],"4*13(%r14)");
+        al     (@x[3],"$stdframe+4*11($sp)");
+       st      (@x[12],"4*12(@t[0])");
+       x       (@x[14],"4*14(%r14)");
+       st      (@x[13],"4*13(@t[0])");
+       x       (@x[15],"4*15(%r14)");
+       st      (@x[14],"4*14(@t[0])");
+        lrvr   (@x[0],@x[0]);
+       st      (@x[15],"4*15(@t[0])");
+        lrvr   (@x[1],@x[1]);
+        lrvr   (@x[2],@x[2]);
+        lrvr   (@x[3],@x[3]);
+       lhi     (@x[12],1);
+        x      (@x[0],"4*8(%r14)");
+       al      (@x[12],"$stdframe+4*12($sp)"); # increment counter
+        x      (@x[1],"4*9(%r14)");
+        st     (@x[0],"4*8(@t[0])");
+        x      (@x[2],"4*10(%r14)");
+        st     (@x[1],"4*9(@t[0])");
+        x      (@x[3],"4*11(%r14)");
+        st     (@x[2],"4*10(@t[0])");
+        st     (@x[3],"4*11(@t[0])");
+
+&{$z?  \&clgr:\&clr}   ("%r14",@t[1]); # done yet?
+       la      ("%r14","64(%r14)");
+       jl      (".Loop_outer");
+
+LABEL  (".Ldone");
+       xgr     ("%r0","%r0");
+       xgr     ("%r1","%r1");
+       xgr     ("%r2","%r2");
+       xgr     ("%r3","%r3");
+       stmg    ("%r0","%r3","$stdframe+4*4($sp)");     # wipe key copy
+       stmg    ("%r0","%r3","$stdframe+4*12($sp)");
+
+&{$z?  \&lmg:\&lm}     ("%r6","%r15","$frame+6*$SIZE_T($sp)");
+       br      ("%r14");
+
+ALIGN  (16);
+LABEL  (".Ltail");
+       la      (@t[1],"64($t[1])");
+       stm     (@x[0],@x[7],"$stdframe+4*0($sp)");
+&{$z?  \&slgr:\&slr}   (@t[1],"%r14");
+       lm      (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
+&{$z?  \&lghi:\&lhi}   (@x[6],0);
+       stm     (@x[12],@x[15],"$stdframe+4*12($sp)");
+       al      (@x[0],"$stdframe+4*8($sp)");
+       al      (@x[1],"$stdframe+4*9($sp)");
+       al      (@x[2],"$stdframe+4*10($sp)");
+       al      (@x[3],"$stdframe+4*11($sp)");
+       lrvr    (@x[0],@x[0]);
+       lrvr    (@x[1],@x[1]);
+       lrvr    (@x[2],@x[2]);
+       lrvr    (@x[3],@x[3]);
+       stm     (@x[0],@x[3],"$stdframe+4*8($sp)");
+
+LABEL  (".Loop_tail");
+       llgc    (@x[4],"0(@x[6],%r14)");
+       llgc    (@x[5],"$stdframe(@x[6],$sp)");
+       xr      (@x[5],@x[4]);
+       stc     (@x[5],"0(@x[6],@t[0])");
+       la      (@x[6],"1(@x[6])");
+       brct    (@t[1],".Loop_tail");
+
+       j       (".Ldone");
+SIZE   ("ChaCha20_ctr32",".-ChaCha20_ctr32");
+}
+
+########################################################################
+# 4x"vertical" layout minimizes amount of instructions, but pipeline
+# runs underutilized [because of vector instructions' high latency].
+# On the other hand minimum amount of data it takes to fully utilize
+# the pipeline is higher, so that effectively, short inputs would be
+# processed slower. Hence this code path targeting <=256 bytes lengths.
+#
+{
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
+my @K=map("%v$_",(16..19));
+my $CTR="%v26";
+my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
+my $beperm="%v31";
+my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
+my $FRAME=$stdframe+4*16;
+
+ALIGN  (32);
+LABEL  ("ChaCha20_ctr32_4x");
+LABEL  (".LChaCha20_ctr32_4x");
+&{$z?  \&stmg:\&stm}   ("%r6","%r7","6*$SIZE_T($sp)");
+if (!$z) {
+       std     ("%f4","16*$SIZE_T+2*8($sp)");
+       std     ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z?  \&lghi:\&lhi}   ("%r1",-$FRAME);
+       lgr     ("%r0",$sp);
+       la      ($sp,"0(%r1,$sp)");
+&{$z?  \&stg:\&st}     ("%r0","0($sp)");       # back-chain
+if ($z) {
+       std     ("%f8","$stdframe+8*0($sp)");
+       std     ("%f9","$stdframe+8*1($sp)");
+       std     ("%f10","$stdframe+8*2($sp)");
+       std     ("%f11","$stdframe+8*3($sp)");
+       std     ("%f12","$stdframe+8*4($sp)");
+       std     ("%f13","$stdframe+8*5($sp)");
+       std     ("%f14","$stdframe+8*6($sp)");
+       std     ("%f15","$stdframe+8*7($sp)");
+}
+       larl    ("%r7",".Lsigma");
+       lhi     ("%r0",10);
+       lhi     ("%r1",0);
+
+       vl      (@K[0],"0(%r7)");               # load sigma
+       vl      (@K[1],"0($key)");              # load key
+       vl      (@K[2],"16($key)");
+       vl      (@K[3],"0($counter)");          # load counter
+
+       vl      ($beperm,"0x40(%r7)");
+       vl      ($xt1,"0x50(%r7)");
+       vrepf   ($CTR,@K[3],0);
+       vlvgf   (@K[3],"%r1",0);                # clear @K[3].word[0]
+       vaf     ($CTR,$CTR,$xt1);
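+       # $CTR now holds block counters {n,n+1,n+2,n+3}, one per lane, and
+       # @K[3].word[0] is cleared so that the final key-schedule add does
+       # not double-count the counter already accumulated via $CTR.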
+
+#LABEL (".Loop_outer_4x");
+       vlm     ($xa0,$xa3,"0x60(%r7)");        # load [smashed] sigma
+
+       vrepf   ($xb0,@K[1],0);                 # smash the key
+       vrepf   ($xb1,@K[1],1);
+       vrepf   ($xb2,@K[1],2);
+       vrepf   ($xb3,@K[1],3);
+
+       vrepf   ($xc0,@K[2],0);
+       vrepf   ($xc1,@K[2],1);
+       vrepf   ($xc2,@K[2],2);
+       vrepf   ($xc3,@K[2],3);
+
+       vlr     ($xd0,$CTR);
+       vrepf   ($xd1,@K[3],1);
+       vrepf   ($xd2,@K[3],2);
+       vrepf   ($xd3,@K[3],3);
+
+LABEL  (".Loop_4x");
+       VX_lane_ROUND(0, 4, 8,12);
+       VX_lane_ROUND(0, 5,10,15);
+       brct    ("%r0",".Loop_4x");
+
+       vaf     ($xd0,$xd0,$CTR);
+
+       vmrhf   ($xt0,$xa0,$xa1);               # transpose data
+       vmrhf   ($xt1,$xa2,$xa3);
+       vmrlf   ($xt2,$xa0,$xa1);
+       vmrlf   ($xt3,$xa2,$xa3);
+       vpdi    ($xa0,$xt0,$xt1,0b0000);
+       vpdi    ($xa1,$xt0,$xt1,0b0101);
+       vpdi    ($xa2,$xt2,$xt3,0b0000);
+       vpdi    ($xa3,$xt2,$xt3,0b0101);
+
+       vmrhf   ($xt0,$xb0,$xb1);
+       vmrhf   ($xt1,$xb2,$xb3);
+       vmrlf   ($xt2,$xb0,$xb1);
+       vmrlf   ($xt3,$xb2,$xb3);
+       vpdi    ($xb0,$xt0,$xt1,0b0000);
+       vpdi    ($xb1,$xt0,$xt1,0b0101);
+       vpdi    ($xb2,$xt2,$xt3,0b0000);
+       vpdi    ($xb3,$xt2,$xt3,0b0101);
+
+       vmrhf   ($xt0,$xc0,$xc1);
+       vmrhf   ($xt1,$xc2,$xc3);
+       vmrlf   ($xt2,$xc0,$xc1);
+       vmrlf   ($xt3,$xc2,$xc3);
+       vpdi    ($xc0,$xt0,$xt1,0b0000);
+       vpdi    ($xc1,$xt0,$xt1,0b0101);
+       vpdi    ($xc2,$xt2,$xt3,0b0000);
+       vpdi    ($xc3,$xt2,$xt3,0b0101);
+
+       vmrhf   ($xt0,$xd0,$xd1);
+       vmrhf   ($xt1,$xd2,$xd3);
+       vmrlf   ($xt2,$xd0,$xd1);
+       vmrlf   ($xt3,$xd2,$xd3);
+       vpdi    ($xd0,$xt0,$xt1,0b0000);
+       vpdi    ($xd1,$xt0,$xt1,0b0101);
+       vpdi    ($xd2,$xt2,$xt3,0b0000);
+       vpdi    ($xd3,$xt2,$xt3,0b0101);
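+
+       # After the four 4x4 transposes above, $xa0/$xb0/$xc0/$xd0 hold the
+       # 16-byte rows of block 0, $xa1/$xb1/$xc1/$xd1 those of block 1,
+       # and so on.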
+
+       #vrepif ($xt0,4);
+       #vaf    ($CTR,$CTR,$xt0);               # next counter value
+
+       vaf     ($xa0,$xa0,@K[0]);
+       vaf     ($xb0,$xb0,@K[1]);
+       vaf     ($xc0,$xc0,@K[2]);
+       vaf     ($xd0,$xd0,@K[3]);
+
+       vperm   ($xa0,$xa0,$xa0,$beperm);
+       vperm   ($xb0,$xb0,$xb0,$beperm);
+       vperm   ($xc0,$xc0,$xc0,$beperm);
+       vperm   ($xd0,$xd0,$xd0,$beperm);
+
+       #&{$z?  \&clgfi:\&clfi} ($len,0x40);
+       #jl     (".Ltail_4x");
+
+       vlm     ($xt0,$xt3,"0($inp)");
+
+       vx      ($xt0,$xt0,$xa0);
+       vx      ($xt1,$xt1,$xb0);
+       vx      ($xt2,$xt2,$xc0);
+       vx      ($xt3,$xt3,$xd0);
+
+       vstm    ($xt0,$xt3,"0($out)");
+
+       la      ($inp,"0x40($inp)");
+       la      ($out,"0x40($out)");
+&{$z?  \&aghi:\&ahi}   ($len,-0x40);
+       #je     (".Ldone_4x");
+
+       vaf     ($xa0,$xa1,@K[0]);
+       vaf     ($xb0,$xb1,@K[1]);
+       vaf     ($xc0,$xc1,@K[2]);
+       vaf     ($xd0,$xd1,@K[3]);
+
+       vperm   ($xa0,$xa0,$xa0,$beperm);
+       vperm   ($xb0,$xb0,$xb0,$beperm);
+       vperm   ($xc0,$xc0,$xc0,$beperm);
+       vperm   ($xd0,$xd0,$xd0,$beperm);
+
+&{$z?  \&clgfi:\&clfi} ($len,0x40);
+       jl      (".Ltail_4x");
+
+       vlm     ($xt0,$xt3,"0($inp)");
+
+       vx      ($xt0,$xt0,$xa0);
+       vx      ($xt1,$xt1,$xb0);
+       vx      ($xt2,$xt2,$xc0);
+       vx      ($xt3,$xt3,$xd0);
+
+       vstm    ($xt0,$xt3,"0($out)");
+
+       la      ($inp,"0x40($inp)");
+       la      ($out,"0x40($out)");
+&{$z?  \&aghi:\&ahi}   ($len,-0x40);
+       je      (".Ldone_4x");
+
+       vaf     ($xa0,$xa2,@K[0]);
+       vaf     ($xb0,$xb2,@K[1]);
+       vaf     ($xc0,$xc2,@K[2]);
+       vaf     ($xd0,$xd2,@K[3]);
+
+       vperm   ($xa0,$xa0,$xa0,$beperm);
+       vperm   ($xb0,$xb0,$xb0,$beperm);
+       vperm   ($xc0,$xc0,$xc0,$beperm);
+       vperm   ($xd0,$xd0,$xd0,$beperm);
+
+&{$z?  \&clgfi:\&clfi} ($len,0x40);
+       jl      (".Ltail_4x");
+
+       vlm     ($xt0,$xt3,"0($inp)");
+
+       vx      ($xt0,$xt0,$xa0);
+       vx      ($xt1,$xt1,$xb0);
+       vx      ($xt2,$xt2,$xc0);
+       vx      ($xt3,$xt3,$xd0);
+
+       vstm    ($xt0,$xt3,"0($out)");
+
+       la      ($inp,"0x40($inp)");
+       la      ($out,"0x40($out)");
+&{$z?  \&aghi:\&ahi}   ($len,-0x40);
+       je      (".Ldone_4x");
+
+       vaf     ($xa0,$xa3,@K[0]);
+       vaf     ($xb0,$xb3,@K[1]);
+       vaf     ($xc0,$xc3,@K[2]);
+       vaf     ($xd0,$xd3,@K[3]);
+
+       vperm   ($xa0,$xa0,$xa0,$beperm);
+       vperm   ($xb0,$xb0,$xb0,$beperm);
+       vperm   ($xc0,$xc0,$xc0,$beperm);
+       vperm   ($xd0,$xd0,$xd0,$beperm);
+
+&{$z?  \&clgfi:\&clfi} ($len,0x40);
+       jl      (".Ltail_4x");
+
+       vlm     ($xt0,$xt3,"0($inp)");
+
+       vx      ($xt0,$xt0,$xa0);
+       vx      ($xt1,$xt1,$xb0);
+       vx      ($xt2,$xt2,$xc0);
+       vx      ($xt3,$xt3,$xd0);
+
+       vstm    ($xt0,$xt3,"0($out)");
+
+       #la     ($inp,"0x40($inp)");
+       #la     ($out,"0x40($out)");
+       #lhi    ("%r0",10);
+       #&{$z?  \&aghi:\&ahi}   ($len,-0x40);
+       #jne    (".Loop_outer_4x");
+
+LABEL  (".Ldone_4x");
+if (!$z) {
+       ld      ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+       ld      ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+       ld      ("%f8","$stdframe+8*0($sp)");
+       ld      ("%f9","$stdframe+8*1($sp)");
+       ld      ("%f10","$stdframe+8*2($sp)");
+       ld      ("%f11","$stdframe+8*3($sp)");
+       ld      ("%f12","$stdframe+8*4($sp)");
+       ld      ("%f13","$stdframe+8*5($sp)");
+       ld      ("%f14","$stdframe+8*6($sp)");
+       ld      ("%f15","$stdframe+8*7($sp)");
 }
+&{$z?  \&lmg:\&lm}     ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+       la      ($sp,"$FRAME($sp)");
+       br      ("%r14");
+
+ALIGN  (16);
+LABEL  (".Ltail_4x");
+if (!$z) {
+       vlr     ($xt0,$xb0);
+       ld      ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+       ld      ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+
+       vst     ($xa0,"$stdframe+0x00($sp)");
+       vst     ($xt0,"$stdframe+0x10($sp)");
+       vst     ($xc0,"$stdframe+0x20($sp)");
+       vst     ($xd0,"$stdframe+0x30($sp)");
+} else {
+       vlr     ($xt0,$xc0);
+       ld      ("%f8","$stdframe+8*0($sp)");
+       ld      ("%f9","$stdframe+8*1($sp)");
+       ld      ("%f10","$stdframe+8*2($sp)");
+       ld      ("%f11","$stdframe+8*3($sp)");
+       vlr     ($xt1,$xd0);
+       ld      ("%f12","$stdframe+8*4($sp)");
+       ld      ("%f13","$stdframe+8*5($sp)");
+       ld      ("%f14","$stdframe+8*6($sp)");
+       ld      ("%f15","$stdframe+8*7($sp)");
 
-$code.=<<___;
-.text
-
-.globl ChaCha20_ctr32
-.type  ChaCha20_ctr32,\@function
-.align 32
-ChaCha20_ctr32:
-       lt${g}r $len,$len                       # $len==0?
-       bzr     %r14
-       a${g}hi $len,-64
-       l${g}hi %r1,-$frame
-       stm${g} %r6,%r15,`6*$SIZE_T`($sp)
-       sl${g}r $out,$inp                       # difference
-       la      $len,0($inp,$len)               # end of input minus 64
-       larl    %r7,.Lsigma
-       lgr     %r0,$sp
-       la      $sp,0(%r1,$sp)
-       st${g}  %r0,0($sp)
-
-       lmg     %r8,%r11,0($key)                # load key
-       lmg     %r12,%r13,0($counter)           # load counter
-       lmg     %r6,%r7,0(%r7)                  # load sigma constant
-
-       la      %r14,0($inp)
-       st${g}  $out,$frame+3*$SIZE_T($sp)
-       st${g}  $len,$frame+4*$SIZE_T($sp)
-       stmg    %r6,%r13,$stdframe($sp)         # copy key schedule to stack
-       srlg    @x[12],%r12,32                  # 32-bit counter value
-       j       .Loop_outer
-
-.align 16
-.Loop_outer:
-       lm      @x[0],@x[7],$stdframe+4*0($sp)          # load x[0]-x[7]
-       lm      @t[0],@t[1],$stdframe+4*10($sp)         # load x[10]-x[11]
-       lm      @x[13],@x[15],$stdframe+4*13($sp)       # load x[13]-x[15]
-       stm     @t[0],@t[1],$stdframe+4*8+4*10($sp)     # offload x[10]-x[11]
-       lm      @t[0],@t[1],$stdframe+4*8($sp)          # load x[8]-x[9]
-       st      @x[12],$stdframe+4*12($sp)              # save counter
-       st${g}  %r14,$frame+2*$SIZE_T($sp)              # save input pointer
-       lhi     %r14,10
-       j       .Loop
-
-.align 4
-.Loop:
-___
-       foreach (&ROUND(0, 4, 8,12)) { eval; }
-       foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
-       brct    %r14,.Loop
-
-       l${g}   %r14,$frame+2*$SIZE_T($sp)              # pull input pointer
-       stm     @t[0],@t[1],$stdframe+4*8+4*8($sp)      # offload x[8]-x[9]
-       lm${g}  @t[0],@t[1],$frame+3*$SIZE_T($sp)
-
-       al      @x[0],$stdframe+4*0($sp)        # accumulate key schedule
-       al      @x[1],$stdframe+4*1($sp)
-       al      @x[2],$stdframe+4*2($sp)
-       al      @x[3],$stdframe+4*3($sp)
-       al      @x[4],$stdframe+4*4($sp)
-       al      @x[5],$stdframe+4*5($sp)
-       al      @x[6],$stdframe+4*6($sp)
-       al      @x[7],$stdframe+4*7($sp)
-       lrvr    @x[0],@x[0]
-       lrvr    @x[1],@x[1]
-       lrvr    @x[2],@x[2]
-       lrvr    @x[3],@x[3]
-       lrvr    @x[4],@x[4]
-       lrvr    @x[5],@x[5]
-       lrvr    @x[6],@x[6]
-       lrvr    @x[7],@x[7]
-       al      @x[12],$stdframe+4*12($sp)
-       al      @x[13],$stdframe+4*13($sp)
-       al      @x[14],$stdframe+4*14($sp)
-       al      @x[15],$stdframe+4*15($sp)
-       lrvr    @x[12],@x[12]
-       lrvr    @x[13],@x[13]
-       lrvr    @x[14],@x[14]
-       lrvr    @x[15],@x[15]
-
-       la      @t[0],0(@t[0],%r14)             # reconstruct output pointer
-       cl${g}r %r14,@t[1]
-       jh      .Ltail
-
-       x       @x[0],4*0(%r14)                 # xor with input
-       x       @x[1],4*1(%r14)
-       st      @x[0],4*0(@t[0])                # store output
-       x       @x[2],4*2(%r14)
-       st      @x[1],4*1(@t[0])
-       x       @x[3],4*3(%r14)
-       st      @x[2],4*2(@t[0])
-       x       @x[4],4*4(%r14)
-       st      @x[3],4*3(@t[0])
-        lm     @x[0],@x[3],$stdframe+4*8+4*8($sp)      # load x[8]-x[11]
-       x       @x[5],4*5(%r14)
-       st      @x[4],4*4(@t[0])
-       x       @x[6],4*6(%r14)
-        al     @x[0],$stdframe+4*8($sp)
-       st      @x[5],4*5(@t[0])
-       x       @x[7],4*7(%r14)
-        al     @x[1],$stdframe+4*9($sp)
-       st      @x[6],4*6(@t[0])
-       x       @x[12],4*12(%r14)
-        al     @x[2],$stdframe+4*10($sp)
-       st      @x[7],4*7(@t[0])
-       x       @x[13],4*13(%r14)
-        al     @x[3],$stdframe+4*11($sp)
-       st      @x[12],4*12(@t[0])
-       x       @x[14],4*14(%r14)
-       st      @x[13],4*13(@t[0])
-       x       @x[15],4*15(%r14)
-       st      @x[14],4*14(@t[0])
-        lrvr   @x[0],@x[0]
-       st      @x[15],4*15(@t[0])
-        lrvr   @x[1],@x[1]
-        lrvr   @x[2],@x[2]
-        lrvr   @x[3],@x[3]
-       lhi     @x[12],1
-        x      @x[0],4*8(%r14)
-       al      @x[12],$stdframe+4*12($sp)      # increment counter
-        x      @x[1],4*9(%r14)
-        st     @x[0],4*8(@t[0])
-        x      @x[2],4*10(%r14)
-        st     @x[1],4*9(@t[0])
-        x      @x[3],4*11(%r14)
-        st     @x[2],4*10(@t[0])
-        st     @x[3],4*11(@t[0])
-
-       cl${g}r %r14,@t[1]                      # done yet?
-       la      %r14,64(%r14)
-       jl      .Loop_outer
-
-.Ldone:
-       xgr     %r0,%r0
-       xgr     %r1,%r1
-       xgr     %r2,%r2
-       xgr     %r3,%r3
-       stmg    %r0,%r3,$stdframe+4*4($sp)      # wipe key copy
-       stmg    %r0,%r3,$stdframe+4*12($sp)
-
-       lm${g}  %r6,%r15,`$frame+6*$SIZE_T`($sp)
-       br      %r14
-
-.align 16
-.Ltail:
-       la      @t[1],64($t[1])
-       stm     @x[0],@x[7],$stdframe+4*0($sp)
-       sl${g}r @t[1],%r14
-       lm      @x[0],@x[3],$stdframe+4*8+4*8($sp)
-       l${g}hi @x[6],0
-       stm     @x[12],@x[15],$stdframe+4*12($sp)
-       al      @x[0],$stdframe+4*8($sp)
-       al      @x[1],$stdframe+4*9($sp)
-       al      @x[2],$stdframe+4*10($sp)
-       al      @x[3],$stdframe+4*11($sp)
-       lrvr    @x[0],@x[0]
-       lrvr    @x[1],@x[1]
-       lrvr    @x[2],@x[2]
-       lrvr    @x[3],@x[3]
-       stm     @x[0],@x[3],$stdframe+4*8($sp)
-
-.Loop_tail:
-       llgc    @x[4],0(@x[6],%r14)
-       llgc    @x[5],$stdframe(@x[6],$sp)
-       xr      @x[5],@x[4]
-       stc     @x[5],0(@x[6],@t[0])
-       la      @x[6],1(@x[6])
-       brct    @t[1],.Loop_tail
-
-       j       .Ldone
-.size  ChaCha20_ctr32,.-ChaCha20_ctr32
-
-.align 32
-.Lsigma:
-.long  0x61707865,0x3320646e,0x79622d32,0x6b206574     # endian-neutral
-.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.align 4
-___
-
-foreach (split("\n",$code)) {
-       s/\`([^\`]*)\`/eval $1/ge;
-
-       print $_,"\n";
+       vst     ($xa0,"$stdframe+0x00($sp)");
+       vst     ($xb0,"$stdframe+0x10($sp)");
+       vst     ($xt0,"$stdframe+0x20($sp)");
+       vst     ($xt1,"$stdframe+0x30($sp)");
 }
-close STDOUT;
+       lghi    ("%r1",0);
+
+LABEL  (".Loop_tail_4x");
+       llgc    ("%r5","0(%r1,$inp)");
+       llgc    ("%r6","$stdframe(%r1,$sp)");
+       xr      ("%r6","%r5");
+       stc     ("%r6","0(%r1,$out)");
+       la      ("%r1","1(%r1)");
+       brct    ($len,".Loop_tail_4x");
+
+&{$z?  \&lmg:\&lm}     ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+       la      ($sp,"$FRAME($sp)");
+       br      ("%r14");
+SIZE   ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
+}
+
+########################################################################
+# 6x"horizontal" layout is optimal fit for the platform in its current
+# shape, more specifically for given vector instructions' latency. Well,
+# computational part of 8x"vertical" would be faster, but it consumes
+# all registers and dealing with that will diminish the return...
+#
+{
+my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
+    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
+    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
+my @K=map("%v$_",(27,24..26));
+my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
+my $beperm="%v31";
+my $FRAME=$stdframe + 4*16;
+
+GLOBL  ("ChaCha20_ctr32_vx");
+ALIGN  (32);
+LABEL  ("ChaCha20_ctr32_vx");
+LABEL  (".LChaCha20_ctr32_vx");
+&{$z?  \&clgfi:\&clfi} ($len,256);
+       jle     (".LChaCha20_ctr32_4x");
+&{$z?  \&stmg:\&stm}   ("%r6","%r7","6*$SIZE_T($sp)");
+if (!$z) {
+       std     ("%f4","16*$SIZE_T+2*8($sp)");
+       std     ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z?  \&lghi:\&lhi}   ("%r1",-$FRAME);
+       lgr     ("%r0",$sp);
+       la      ($sp,"0(%r1,$sp)");
+&{$z?  \&stg:\&st}     ("%r0","0($sp)");       # back-chain
+if ($z) {
+       std     ("%f8","$FRAME-8*8($sp)");
+       std     ("%f9","$FRAME-8*7($sp)");
+       std     ("%f10","$FRAME-8*6($sp)");
+       std     ("%f11","$FRAME-8*5($sp)");
+       std     ("%f12","$FRAME-8*4($sp)");
+       std     ("%f13","$FRAME-8*3($sp)");
+       std     ("%f14","$FRAME-8*2($sp)");
+       std     ("%f15","$FRAME-8*1($sp)");
+}
+       larl    ("%r7",".Lsigma");
+       lhi     ("%r0",10);
+
+       vlm     (@K[1],@K[2],"0($key)");        # load key
+       vl      (@K[3],"0($counter)");          # load counter
+
+       vlm     (@K[0],"$beperm","0(%r7)");     # load sigma, increments, ...
+
+LABEL  (".Loop_outer_vx");
+       vlr     ($a0,@K[0]);
+       vlr     ($b0,@K[1]);
+       vlr     ($a1,@K[0]);
+       vlr     ($b1,@K[1]);
+       vlr     ($a2,@K[0]);
+       vlr     ($b2,@K[1]);
+       vlr     ($a3,@K[0]);
+       vlr     ($b3,@K[1]);
+       vlr     ($a4,@K[0]);
+       vlr     ($b4,@K[1]);
+       vlr     ($a5,@K[0]);
+       vlr     ($b5,@K[1]);
+
+       vlr     ($d0,@K[3]);
+       vaf     ($d1,@K[3],$t1);                # K[3]+1
+       vaf     ($d2,@K[3],$t2);                # K[3]+2
+       vaf     ($d3,@K[3],$t3);                # K[3]+3
+       vaf     ($d4,$d2,$t2);                  # K[3]+4
+       vaf     ($d5,$d2,$t3);                  # K[3]+5
+
+       vlr     ($c0,@K[2]);
+       vlr     ($c1,@K[2]);
+       vlr     ($c2,@K[2]);
+       vlr     ($c3,@K[2]);
+       vlr     ($c4,@K[2]);
+       vlr     ($c5,@K[2]);
+
+       vlr     ($t1,$d1);
+       vlr     ($t2,$d2);
+       vlr     ($t3,$d3);
+
+ALIGN  (4);
+LABEL  (".Loop_vx");
+
+       VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
+                $b0,$b1,$b2,$b3,$b4,$b5,
+                $c0,$c1,$c2,$c3,$c4,$c5,
+                $d0,$d1,$d2,$d3,$d4,$d5,
+                0);
+
+       VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
+                $b0,$b1,$b2,$b3,$b4,$b5,
+                $c0,$c1,$c2,$c3,$c4,$c5,
+                $d0,$d1,$d2,$d3,$d4,$d5,
+                1);
+
+       brct    ("%r0",".Loop_vx");
+
+       vaf     ($a0,$a0,@K[0]);
+       vaf     ($b0,$b0,@K[1]);
+       vaf     ($c0,$c0,@K[2]);
+       vaf     ($d0,$d0,@K[3]);
+       vaf     ($a1,$a1,@K[0]);
+       vaf     ($d1,$d1,$t1);                  # +K[3]+1
+
+       vperm   ($a0,$a0,$a0,$beperm);
+       vperm   ($b0,$b0,$b0,$beperm);
+       vperm   ($c0,$c0,$c0,$beperm);
+       vperm   ($d0,$d0,$d0,$beperm);
+
+&{$z?  \&clgfi:\&clfi} ($len,0x40);
+       jl      (".Ltail_vx");
+
+       vaf     ($d2,$d2,$t2);                  # +K[3]+2
+       vaf     ($d3,$d3,$t3);                  # +K[3]+3
+       vlm     ($t0,$t3,"0($inp)");
+
+       vx      ($a0,$a0,$t0);
+       vx      ($b0,$b0,$t1);
+       vx      ($c0,$c0,$t2);
+       vx      ($d0,$d0,$t3);
+
+       vlm     (@K[0],$t3,"0(%r7)");           # re-load sigma and increments
+
+       vstm    ($a0,$d0,"0($out)");
+
+       la      ($inp,"0x40($inp)");
+       la      ($out,"0x40($out)");
+&{$z?  \&aghi:\&ahi}   ($len,-0x40);
+       je      (".Ldone_vx");
+
+       vaf     ($b1,$b1,@K[1]);
+       vaf     ($c1,$c1,@K[2]);
+
+       vperm   ($a0,$a1,$a1,$beperm);
+       vperm   ($b0,$b1,$b1,$beperm);
+       vperm   ($c0,$c1,$c1,$beperm);
+       vperm   ($d0,$d1,$d1,$beperm);
+
+&{$z?  \&clgfi:\&clfi} ($len,0x40);
+       jl      (".Ltail_vx");
+
+       vlm     ($a1,$d1,"0($inp)");
+
+       vx      ($a0,$a0,$a1);
+       vx      ($b0,$b0,$b1);
+       vx      ($c0,$c0,$c1);
+       vx      ($d0,$d0,$d1);
+
+       vstm    ($a0,$d0,"0($out)");
+
+       la      ($inp,"0x40($inp)");
+       la      ($out,"0x40($out)");
+&{$z?  \&aghi:\&ahi}   ($len,-0x40);
+       je      (".Ldone_vx");
+
+       vaf     ($a2,$a2,@K[0]);
+       vaf     ($b2,$b2,@K[1]);
+       vaf     ($c2,$c2,@K[2]);
+
+       vperm   ($a0,$a2,$a2,$beperm);
+       vperm   ($b0,$b2,$b2,$beperm);
+       vperm   ($c0,$c2,$c2,$beperm);
+       vperm   ($d0,$d2,$d2,$beperm);
+
+&{$z?  \&clgfi:\&clfi} ($len,0x40);
+       jl      (".Ltail_vx");
+
+       vlm     ($a1,$d1,"0($inp)");
+
+       vx      ($a0,$a0,$a1);
+       vx      ($b0,$b0,$b1);
+       vx      ($c0,$c0,$c1);
+       vx      ($d0,$d0,$d1);
+
+       vstm    ($a0,$d0,"0($out)");
+
+       la      ($inp,"0x40($inp)");
+       la      ($out,"0x40($out)");
+&{$z?  \&aghi:\&ahi}   ($len,-0x40);
+       je      (".Ldone_vx");
+
+       vaf     ($a3,$a3,@K[0]);
+       vaf     ($b3,$b3,@K[1]);
+       vaf     ($c3,$c3,@K[2]);
+       vaf     ($d2,@K[3],$t3);                # K[3]+3
+
+       vperm   ($a0,$a3,$a3,$beperm);
+       vperm   ($b0,$b3,$b3,$beperm);
+       vperm   ($c0,$c3,$c3,$beperm);
+       vperm   ($d0,$d3,$d3,$beperm);
+
+&{$z?  \&clgfi:\&clfi} ($len,0x40);
+       jl      (".Ltail_vx");
+
+       vaf     ($d3,$d2,$t1);                  # K[3]+4
+       vlm     ($a1,$d1,"0($inp)");
+
+       vx      ($a0,$a0,$a1);
+       vx      ($b0,$b0,$b1);
+       vx      ($c0,$c0,$c1);
+       vx      ($d0,$d0,$d1);
+
+       vstm    ($a0,$d0,"0($out)");
+
+       la      ($inp,"0x40($inp)");
+       la      ($out,"0x40($out)");
+&{$z?  \&aghi:\&ahi}   ($len,-0x40);
+       je      (".Ldone_vx");
+
+       vaf     ($a4,$a4,@K[0]);
+       vaf     ($b4,$b4,@K[1]);
+       vaf     ($c4,$c4,@K[2]);
+       vaf     ($d4,$d4,$d3);                  # +K[3]+4
+       vaf     ($d3,$d3,$t1);                  # K[3]+5
+       vaf     (@K[3],$d2,$t3);                # K[3]+=6
+
+       vperm   ($a0,$a4,$a4,$beperm);
+       vperm   ($b0,$b4,$b4,$beperm);
+       vperm   ($c0,$c4,$c4,$beperm);
+       vperm   ($d0,$d4,$d4,$beperm);
+
+&{$z?  \&clgfi:\&clfi} ($len,0x40);
+       jl      (".Ltail_vx");
+
+       vlm     ($a1,$d1,"0($inp)");
+
+       vx      ($a0,$a0,$a1);
+       vx      ($b0,$b0,$b1);
+       vx      ($c0,$c0,$c1);
+       vx      ($d0,$d0,$d1);
+
+       vstm    ($a0,$d0,"0($out)");
+
+       la      ($inp,"0x40($inp)");
+       la      ($out,"0x40($out)");
+&{$z?  \&aghi:\&ahi}   ($len,-0x40);
+       je      (".Ldone_vx");
+
+       vaf     ($a5,$a5,@K[0]);
+       vaf     ($b5,$b5,@K[1]);
+       vaf     ($c5,$c5,@K[2]);
+       vaf     ($d5,$d5,$d3);                  # +K[3]+5
+
+       vperm   ($a0,$a5,$a5,$beperm);
+       vperm   ($b0,$b5,$b5,$beperm);
+       vperm   ($c0,$c5,$c5,$beperm);
+       vperm   ($d0,$d5,$d5,$beperm);
+
+&{$z?  \&clgfi:\&clfi} ($len,0x40);
+       jl      (".Ltail_vx");
+
+       vlm     ($a1,$d1,"0($inp)");
+
+       vx      ($a0,$a0,$a1);
+       vx      ($b0,$b0,$b1);
+       vx      ($c0,$c0,$c1);
+       vx      ($d0,$d0,$d1);
+
+       vstm    ($a0,$d0,"0($out)");
+
+       la      ($inp,"0x40($inp)");
+       la      ($out,"0x40($out)");
+       lhi     ("%r0",10);
+&{$z?  \&aghi:\&ahi}   ($len,-0x40);
+       jne     (".Loop_outer_vx");
+
+LABEL  (".Ldone_vx");
+if (!$z) {
+       ld      ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+       ld      ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+       ld      ("%f8","$FRAME-8*8($sp)");
+       ld      ("%f9","$FRAME-8*7($sp)");
+       ld      ("%f10","$FRAME-8*6($sp)");
+       ld      ("%f11","$FRAME-8*5($sp)");
+       ld      ("%f12","$FRAME-8*4($sp)");
+       ld      ("%f13","$FRAME-8*3($sp)");
+       ld      ("%f14","$FRAME-8*2($sp)");
+       ld      ("%f15","$FRAME-8*1($sp)");
+}
+&{$z?  \&lmg:\&lm}     ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+       la      ($sp,"$FRAME($sp)");
+       br      ("%r14");
+
+ALIGN  (16);
+LABEL  (".Ltail_vx");
+if (!$z) {
+       ld      ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+       ld      ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+       ld      ("%f8","$FRAME-8*8($sp)");
+       ld      ("%f9","$FRAME-8*7($sp)");
+       ld      ("%f10","$FRAME-8*6($sp)");
+       ld      ("%f11","$FRAME-8*5($sp)");
+       ld      ("%f12","$FRAME-8*4($sp)");
+       ld      ("%f13","$FRAME-8*3($sp)");
+       ld      ("%f14","$FRAME-8*2($sp)");
+       ld      ("%f15","$FRAME-8*1($sp)");
+}
+       vstm    ($a0,$d0,"$stdframe($sp)");
+       lghi    ("%r1",0);
+
+LABEL  (".Loop_tail_vx");
+       llgc    ("%r5","0(%r1,$inp)");
+       llgc    ("%r6","$stdframe(%r1,$sp)");
+       xr      ("%r6","%r5");
+       stc     ("%r6","0(%r1,$out)");
+       la      ("%r1","1(%r1)");
+       brct    ($len,".Loop_tail_vx");
+
+&{$z?  \&lmg:\&lm}     ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+       la      ($sp,"$FRAME($sp)");
+       br      ("%r14");
+SIZE   ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
+}
+################
+
+ALIGN  (32);
+LABEL  (".Lsigma");
+LONG   (0x61707865,0x3320646e,0x79622d32,0x6b206574);  # endian-neutral sigma
+LONG   (1,0,0,0);
+LONG   (2,0,0,0);
+LONG   (3,0,0,0);
+LONG   (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c);  # byte swap
+
+LONG   (0,1,2,3);
+LONG   (0x61707865,0x61707865,0x61707865,0x61707865);  # smashed sigma
+LONG   (0x3320646e,0x3320646e,0x3320646e,0x3320646e);
+LONG   (0x79622d32,0x79622d32,0x79622d32,0x79622d32);
+LONG   (0x6b206574,0x6b206574,0x6b206574,0x6b206574);
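+# layout: +0x00 sigma, +0x10..+0x30 word-0 counter increments {1,2,3},
+# +0x40 byte-swap permutation, +0x50 per-lane increments {0,1,2,3},
+# +0x60..+0x90 smashed sigma.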
+
+ASCIZ  ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
+ALIGN  (4);
+
+PERLASM_END();