+################
+# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
+# size_t len, u32 padbit)
+{
+my ($ctx,$inp,$len) = map("%r$_",(2..4)); # ctx=%r2, inp=%r3, len=%r4
+my $padbit="%r0"; # padbit is copied here from %r5 below
+# Emit poly1305_blocks_vx: a 4-block loop, then optional 2-block and 1-block tails.
+GLOBL ("poly1305_blocks_vx");
+TYPE ("poly1305_blocks_vx","\@function");
+ALIGN (16);
+LABEL ("poly1305_blocks_vx");
+if ($z) {
+ aghi ($sp,-$frame); # allocate $frame bytes of stack
+ vstm ("%v8","%v15","0($sp)"); # save %v8-%v15 (reloaded before return)
+} else {
+ std ("%f4","16*$SIZE_T+2*8($sp)"); # save %f4 (reloaded before return)
+ std ("%f6","16*$SIZE_T+3*8($sp)"); # save %f6 (reloaded before return)
+ llgfr ($len,$len); # zero-extend the low 32 bits of len
+}
+ llgfr ($padbit,"%r5"); # padbit: %r5 -> %r0, zero-extended
+ vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1 (word element 1)
+ larl ("%r5",".Lconst"); # %r5 is free now: address of .Lconst
+ vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2 (word element 3)
+ sllg ($padbit,$padbit,24); # padbit << 24: just above limb 4's 24 bits
+ vlm (@vperm[0],$mask,"0(%r5)"); # load vperm ops, mask
+ vgbm ($mask4,0x0707); # low 3 bytes of each doubleword = 0xff
+ vlvgp ($padvec,$padbit,$padbit); # pad value in both doublewords
+ # %r1 = len >> 6 = number of 64-byte iterations; skip the loop if zero
+ srlg ("%r1",$len,6);
+ ltgr ("%r1","%r1");
+ jz (".Lvx_4x_done");
+
+ALIGN (16);
+LABEL (".Lvx_4x");
+ vlm ("%v20","%v23","0($inp)"); # load m0,m1,m2,m3
+
+ # m01,m23 -> base 2^26
+
+ vperm (@m01[0],"%v20","%v21",@vperm[0]);
+ vperm (@m23[0],"%v22","%v23",@vperm[0]);
+ vperm (@m01[2],"%v20","%v21",@vperm[1]);
+ vperm (@m23[2],"%v22","%v23",@vperm[1]);
+ vperm (@m01[4],"%v20","%v21",@vperm[2]);
+ vperm (@m23[4],"%v22","%v23",@vperm[2]);
+ # [1],[3] are shifted copies of [0],[2]; [3] is taken before [2] is shifted in place
+ vesrlg (@m01[1],@m01[0],26);
+ vesrlg (@m23[1],@m23[0],26);
+ vesrlg (@m01[3],@m01[2],30);
+ vesrlg (@m23[3],@m23[2],30);
+ vesrlg (@m01[2],@m01[2],4);
+ vesrlg (@m23[2],@m23[2],4);
+ # mask limbs: $mask4 for limb 4, $mask (loaded from .Lconst) for limbs 0..3
+ vn (@m01[4],@m01[4],$mask4);
+ vn (@m23[4],@m23[4],$mask4);
+for (0..3) {
+ vn (@m01[$_],@m01[$_],$mask);
+ vn (@m23[$_],@m23[$_],$mask);
+}
+ vaf (@m01[4],@m01[4],$padvec); # pad m01
+ vaf (@m23[4],@m23[4],$padvec); # pad m23
+
+ # acc = acc * r^4 + m01 * r^2 + m23
+ # tmp[j] starts at m23[j] and accumulates m01[i]*r[j-i] (5*r[j-i+5] when j<i)
+ vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
+ vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
+
+ vmalof (@tmp[0],@m01[4],@r5[1],@m23[0]);
+ vmalof (@tmp[1],@m01[4],@r5[2],@m23[1]);
+ vmalof (@tmp[2],@m01[4],@r5[3],@m23[2]);
+ vmalof (@tmp[3],@m01[4],@r5[4],@m23[3]);
+ vmalof (@tmp[4],@m01[4],@r[0],@m23[4]);
+
+ vmalof (@tmp[0],@m01[3],@r5[2],@tmp[0]);
+ vmalof (@tmp[1],@m01[3],@r5[3],@tmp[1]);
+ vmalof (@tmp[2],@m01[3],@r5[4],@tmp[2]);
+ vmalof (@tmp[3],@m01[3],@r[0],@tmp[3]);
+ vmalof (@tmp[4],@m01[3],@r[1],@tmp[4]);
+
+ vmalof (@tmp[0],@m01[2],@r5[3],@tmp[0]);
+ vmalof (@tmp[1],@m01[2],@r5[4],@tmp[1]);
+ vmalof (@tmp[2],@m01[2],@r[0],@tmp[2]);
+ vmalof (@tmp[3],@m01[2],@r[1],@tmp[3]);
+ vmalof (@tmp[4],@m01[2],@r[2],@tmp[4]);
+
+ vmalof (@tmp[0],@m01[1],@r5[4],@tmp[0]);
+ vmalof (@tmp[1],@m01[1],@r[0],@tmp[1]);
+ vmalof (@tmp[2],@m01[1],@r[1],@tmp[2]);
+ vmalof (@tmp[3],@m01[1],@r[2],@tmp[3]);
+ vmalof (@tmp[4],@m01[1],@r[3],@tmp[4]);
+
+ vmalof (@tmp[0],@m01[0],@r[0],@tmp[0]);
+ vmalof (@tmp[1],@m01[0],@r[1],@tmp[1]);
+ vmalof (@tmp[2],@m01[0],@r[2],@tmp[2]);
+ vmalof (@tmp[3],@m01[0],@r[3],@tmp[3]);
+ vmalof (@tmp[4],@m01[0],@r[4],@tmp[4]);
+
+ vlrepf (@r5[$_],"4*$_+148($ctx)") for (0..4); # load 5*r^4
+ vlrepf (@r[$_],"4*$_+128($ctx)") for (0..4); # load r^4
+ # same index pattern, now acc[i] times r^4, accumulated on top of tmp[]
+ vmalof (@tmp[0],@acc[4],@r5[1],@tmp[0]);
+ vmalof (@tmp[1],@acc[4],@r5[2],@tmp[1]);
+ vmalof (@tmp[2],@acc[4],@r5[3],@tmp[2]);
+ vmalof (@tmp[3],@acc[4],@r5[4],@tmp[3]);
+ vmalof (@tmp[4],@acc[4],@r[0],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
+ # final (acc[0]) term writes acc[]; acc[0] goes last, it is a source for all five
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
+ # NOTE(review): REDUCE is defined outside this block; presumably limb carry handling
+ REDUCE ();
+
+ la ($inp,"64($inp)"); # inp += 64
+ brctg ("%r1",".Lvx_4x"); # decrement %r1, loop while non-zero
+
+ALIGN (16);
+LABEL (".Lvx_4x_done");
+ tml ($len,32); # 32-byte tail present (len & 32)?
+ jz (".Lvx_2x_done");
+
+ vlm ("%v20","%v21","0($inp)"); # load m0,m1
+
+ # m01 -> base 2^26
+
+ vperm (@m01[0],"%v20","%v21",@vperm[0]);
+ vperm (@m01[2],"%v20","%v21",@vperm[1]);
+ vperm (@m01[4],"%v20","%v21",@vperm[2]);
+ # [3] must be derived from [2] before [2] is shifted in place
+ vesrlg (@m01[1],@m01[0],26);
+ vesrlg (@m01[3],@m01[2],30);
+ vesrlg (@m01[2],@m01[2],4);
+
+ vn (@m01[4],@m01[4],$mask4);
+ vn (@m01[$_],@m01[$_],$mask) for (0..3);
+
+ vaf (@m01[4],@m01[4],$padvec); # pad m01
+
+ # acc = acc * r^2 + m01
+
+ vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
+ vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
+ # tmp[j] starts at m01[j] and accumulates acc[i]*r[j-i] (5*r[j-i+5] when j<i)
+ vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
+ vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
+ vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
+ vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
+ vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
+
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
+ # acc[0] term writes acc[]; acc[0] last because it is still a source operand
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
+
+ REDUCE ();
+
+ la ($inp,"32($inp)"); # inp += 32
+
+ALIGN (16);
+LABEL (".Lvx_2x_done");
+ tml ($len,16); # 16-byte tail present (len & 16)?
+ jz (".Lvx_done");
+
+ vleig ($padvec,0,0); # zero doubleword 0 of the pad vector
+ # only %v21 holds message data; %v20 is zero. NOTE(review): lane use depends on .Lconst
+ vzero ("%v20");
+ vl ("%v21","0($inp)"); # load m0
+
+ # m0 -> base 2^26
+
+ vperm (@m01[0],"%v20","%v21",@vperm[0]);
+ vperm (@m01[2],"%v20","%v21",@vperm[1]);
+ vperm (@m01[4],"%v20","%v21",@vperm[2]);
+
+ vesrlg (@m01[1],@m01[0],26);
+ vesrlg (@m01[3],@m01[2],30);
+ vesrlg (@m01[2],@m01[2],4);
+
+ vn (@m01[4],@m01[4],$mask4);
+ vn (@m01[$_],@m01[$_],$mask) for (0..3);
+
+ vaf (@m01[4],@m01[4],$padvec); # pad m0
+
+ # acc = acc * r + m01
+
+ vlrepf (@r5[$_],"4*$_+68($ctx)") for (0..4); # load 5*r
+ vlrepf (@r[$_],"4*$_+48($ctx)") for (0..4); # load r
+ # same accumulation pattern as the 2-block tail, with r and 5*r
+ vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
+ vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
+ vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
+ vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
+ vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
+
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
+
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
+ # acc[0] is overwritten last: it is still needed as a multiplicand
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
+
+ REDUCE ();
+
+ALIGN (16);
+LABEL (".Lvx_done");
+ vstef (@acc[$_],"4*$_($ctx)",1) for (0..4); # store acc1 (word element 1)
+ vstef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # store acc2 (word element 3)
+
+if ($z) {
+ vlm ("%v8","%v15","0($sp)"); # restore %v8-%v15
+ la ($sp,"$frame($sp)"); # release the stack frame
+} else {
+ ld ("%f4","16*$SIZE_T+2*8($sp)"); # restore %f4
+ ld ("%f6","16*$SIZE_T+3*8($sp)"); # restore %f6
+}
+ br ("%r14"); # return
+SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx");
+}