2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements Poly1305 hash for s390x.
21 # ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
22 # code. For older compilers the improvement coefficient is >3x, because
23 # then base 2^64 and base 2^32 implementations are compared.
25 # On a side note, z13 enables vector base 2^26 implementation...
30 # Add vx code path (base 2^26).
32 # Copyright IBM Corp. 2019
33 # Author: Patrick Steuer <patrick.steuer@de.ibm.com>
# Import the s390x perlasm helpers: the default export set, the :VX group,
# and the AUTOLOAD and LABEL entries.
38 use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL);
# Flavours containing "31" or "32" take this branch.
43 if ($flavour =~ /3[12]/) {
# Shift arguments until one looks like a file name with an extension (or the
# argument list is exhausted); that argument is left in $output.
52 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
56 # novx code path ctx layout
57 # ---------------------------------
59 # ---------------------------------
60 # u64 h[3] hash 2^64 0
62 # u64 r[2] key 2^64 32
64 # vx code path ctx layout
65 # ---------------------------------
67 # ---------------------------------
68 # u32 acc1[5] r^2-acc 2^26 0
70 # u32 acc2[5] r-acc 2^26 24
73 # u32 r15[5] 5*r 2^26 68
74 # u32 r2[5] r^2 2^26 88
75 # u32 r25[5] 5*r^2 2^26 108
76 # u32 r4[5] r^4 2^26 128
77 # u32 r45[5] 5*r^4 2^26 148
# Hand the output file name selected above to the perlasm framework.
79 PERLASM_BEGIN($output);
84 # static void poly1305_init(void *ctx, const unsigned char key[16])
# Arguments: ctx is named %r2 and key is named %r3.
86 my ($ctx,$key)=map("%r$_",(2..3));
# Working registers named %r9, %r11 and %r13.
87 my ($r0,$r1,$r2)=map("%r$_",(9,11,13));
# MUL_RKEY: emits the instruction sequence annotated "r*=key".
89 sub MUL_RKEY { # r*=key
# Register names %r4-%r7 for the two high/low pairs.
90 my ($d0hi,$d0lo,$d1hi,$d1lo)=map("%r$_",(4..7));
# Temporaries named %r8, %r10 and %r12.
91 my ($t0,$t1,$s1)=map("%r$_",(8,10,12));
# Load the two 64-bit words at ctx offsets 32 and 40, which the novx ctx
# layout lists as the key r[2].
93 lg ("%r0","32($ctx)");
94 lg ("%r1","40($ctx)");
# ST_R5R: emits code that writes a value and five times that value to ctx as
# five 32-bit limbs each. Callers pass two ctx offsets, e.g. ST_R5R(48,68).
# NOTE(review): @off presumably holds those two arguments - confirm where it
# is assigned.
138 sub ST_R5R { # store r,5*r -> base 2^26
# Five limb registers, %r4-%r8.
139 my @d=map("%r$_",(4..8));
# Right shifts that form part of the conversion to base 2^26.
147 srlg ("%r1",@d[2],52);
149 srlg ("%r0",@d[2],26);
156 srlg ("%r1",@d[3],40);
159 srlg (@d[3],@d[3],14);
# Store the five limb registers at the first offset.
163 stm (@d[0],@d[4],"@off[0]($ctx)");
# Multiply every limb by 5 in place, then store the result at the second offset.
164 mhi (@d[$_],5) for (0..4);
165 stm (@d[0],@d[4],"@off[1]($ctx)");
# poly1305_init: clears the accumulator area of ctx, stores the key, and, when
# the vector facility bit is set, records the vx entry points in ctx and stores
# r, r^2 and r^4 (each with its 5x multiple) in base 2^26.
168 GLOBL ("poly1305_init");
169 TYPE ("poly1305_init","\@function");
171 LABEL ("poly1305_init");
174 stg ("%r0","0($ctx)"); # zero hash value / acc1
175 stg ("%r0","8($ctx)");
176 stg ("%r0","16($ctx)");
# Compare the key pointer with %r0 (64-bit compare when $z is set, 32-bit
# otherwise); %r0 is the value just stored as the zero hash.
178 &{$z? \&clgr:\&clr} ($key,"%r0");
181 lrvg ("%r4","0($key)"); # load little-endian key
182 lrvg ("%r5","8($key)");
# Derive the two mask constants shown in the per-line comments from %r1.
184 nihl ("%r1",0xffc0); # 0xffffffc0ffffffff
185 srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff
186 srlg ("%r1","%r1",4);
187 nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc
# Store the key words at ctx offsets 32 and 40 (r[2] in the novx layout).
192 stg ("%r4","32($ctx)");
193 stg ("%r5","40($ctx)");
# Fetch the capability doubleword at offset 16 of OPENSSL_s390xcap_P.
195 larl ("%r1","OPENSSL_s390xcap_P");
196 lg ("%r0","16(%r1)");
197 tmhh ("%r0",0x4000); # check for vector facility
# Addresses of the vx implementations, to be written into ctx below.
200 larl ("%r4","poly1305_blocks_vx");
201 larl ("%r5","poly1305_emit_vx");
# Save %r6-%r13 on the stack; they are reloaded before the end of the function.
203 &{$z? \&stmg:\&stm} ("%r6","%r13","6*$SIZE_T($sp)");
# Store the two function addresses held in %r4/%r5 at ctx offset 4*$z+228.
204 &{$z? \&stmg:\&stm} ("%r4","%r5","4*$z+228($ctx)");
# Offsets below match the vx ctx layout table: r1/r15, r2/r25, r4/r45.
210 ST_R5R (48,68); # store r,5*r
213 ST_R5R (88,108); # store r^2,5*r^2
217 ST_R5R (128,148); # store r^4,5*r^4
# Clearing 24..47 also overwrites the base 2^64 key stored at 32/40 above; the
# vx layout keeps r in base 2^26 from offset 48 onwards.
220 stg ("%r0","24($ctx)"); # zero acc2
221 stg ("%r0","32($ctx)");
222 stg ("%r0","40($ctx)");
# Reload the registers saved before the ST_R5R calls.
224 &{$z? \&lmg:\&lm} ("%r6","%r13","6*$SIZE_T($sp)");
231 SIZE ("poly1305_init",".-poly1305_init");
# Vector register allocation for the vx code path.
# @m01 (%v0-%v4) and @m23 (%v5-%v9): message blocks converted to base 2^26.
237 my @m01=map("%v$_",(0..4));
238 my @m23=map("%v$_",(5..9));
# @acc (%v10-%v14): the five accumulator limbs.
240 my @acc=map("%v$_",(10..14));
# @r (%v15-%v19) and @r5 (%v20-%v24): limbs of a key power and of 5x that power.
241 my @r=map("%v$_",(15..19));
242 my @r5=map("%v$_",(20..24));
# @vperm (%v28-%v30): the permute patterns used to unpack message bytes.
245 my @vperm=map("%v$_",(28..30));
# Carry propagation over the five accumulator limbs. Each step shifts a limb
# right by 26 to obtain its carry, masks the limb with $mask, and adds the
# carry to the next limb. The carry out of limb 4 is multiplied by 5
# (x + (x << 2)) before being added to limb 0. Two independent chains
# (starting at limbs 0 and 3) are interleaved.
249 vesrlg (@tmp[0],@acc[0],26);
250 vesrlg (@tmp[3],@acc[3],26);
251 vn (@acc[0],@acc[0],$mask);
252 vn (@acc[3],@acc[3],$mask);
253 vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1
254 vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
256 vesrlg (@tmp[1],@acc[1],26);
257 vesrlg (@tmp[4],@acc[4],26);
258 vn (@acc[1],@acc[1],$mask);
259 vn (@acc[4],@acc[4],$mask);
# tmp[0] = 4 * carry, so tmp[4] + tmp[0] = 5 * carry.
260 veslg (@tmp[0],@tmp[4],2);
261 vag (@tmp[4],@tmp[4],@tmp[0]); # h[4]*=5
262 vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2
263 vag (@acc[0],@acc[0],@tmp[4]); # carry 4->0
265 vesrlg (@tmp[2],@acc[2],26);
266 vesrlg (@tmp[0],@acc[0],26);
267 vn (@acc[2],@acc[2],$mask);
268 vn (@acc[0],@acc[0],$mask);
269 vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3
270 vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1
272 vesrlg (@tmp[3],@acc[3],26);
273 vn (@acc[3],@acc[3],$mask);
274 vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
278 # static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
279 # size_t len, u32 padbit)
# ctx, inp and len are named %r2-%r4; padbit is taken from %r5 below.
281 my ($ctx,$inp,$len) = map("%r$_",(2..4));
284 GLOBL ("poly1305_blocks_vx");
285 TYPE ("poly1305_blocks_vx","\@function");
287 LABEL ("poly1305_blocks_vx");
# Save %v8-%v15 and %f4/%f6 on the stack; they are reloaded before returning.
290 vstm ("%v8","%v15","0($sp)");
292 std ("%f4","16*$SIZE_T+2*8($sp)");
293 std ("%f6","16*$SIZE_T+3*8($sp)");
# Copy the 32-bit padbit argument out of %r5, zero-extended, so that %r5 can be
# reused as the constant-table pointer.
296 llgfr ($padbit,"%r5");
# acc1 limbs go to word element 1 and acc2 limbs to word element 3 of each
# accumulator vector.
297 vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1
298 larl ("%r5",".Lconst");
299 vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2
# Move the pad bit to bit 24, its position inside limb 4 (4*26 + 24 = 128).
300 sllg ($padbit,$padbit,24);
301 vlm (@vperm[0],$mask,"0(%r5)"); # load vperm ops, mask
# Byte mask with the low three bytes of each doubleword set, i.e. 24 bits.
302 vgbm ($mask4,0x0707);
# Place the shifted pad bit in both doublewords of the pad vector.
303 vlvgp ($padvec,$padbit,$padbit);
# Four-block step: 64 bytes of input are consumed per pass.
311 vlm ("%v20","%v23","0($inp)"); # load m0,m1,m2,m3
313 # m01,m23 -> base 2^26
# Each permute pattern gathers the bytes feeding limbs 0/1, 2/3 and 4;
# m01 is built from %v20/%v21 and m23 from %v22/%v23.
315 vperm (@m01[0],"%v20","%v21",@vperm[0]);
316 vperm (@m23[0],"%v22","%v23",@vperm[0]);
317 vperm (@m01[2],"%v20","%v21",@vperm[1]);
318 vperm (@m23[2],"%v22","%v23",@vperm[1]);
319 vperm (@m01[4],"%v20","%v21",@vperm[2]);
320 vperm (@m23[4],"%v22","%v23",@vperm[2]);
# Derive limbs 1 and 3 by shifting, and align limb 2.
322 vesrlg (@m01[1],@m01[0],26);
323 vesrlg (@m23[1],@m23[0],26);
324 vesrlg (@m01[3],@m01[2],30);
325 vesrlg (@m23[3],@m23[2],30);
326 vesrlg (@m01[2],@m01[2],4);
327 vesrlg (@m23[2],@m23[2],4);
# Limb 4 is masked with $mask4, the other limbs with $mask.
329 vn (@m01[4],@m01[4],$mask4);
330 vn (@m23[4],@m23[4],$mask4);
332 vn (@m01[$_],@m01[$_],$mask);
333 vn (@m23[$_],@m23[$_],$mask);
335 vaf (@m01[4],@m01[4],$padvec); # pad m01
336 vaf (@m23[4],@m23[4],$padvec); # pad m23
338 # acc = acc * r^4 + m01 * r^2 + m23
340 vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
341 vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
# tmp[k] = m23[k] + sum of m01[i] * r^2[j] with i + j == k (mod 5); products
# whose index wraps past limb 4 use the 5*r^2 limbs instead.
343 vmalof (@tmp[0],@m01[4],@r5[1],@m23[0]);
344 vmalof (@tmp[1],@m01[4],@r5[2],@m23[1]);
345 vmalof (@tmp[2],@m01[4],@r5[3],@m23[2]);
346 vmalof (@tmp[3],@m01[4],@r5[4],@m23[3]);
347 vmalof (@tmp[4],@m01[4],@r[0],@m23[4]);
349 vmalof (@tmp[0],@m01[3],@r5[2],@tmp[0]);
350 vmalof (@tmp[1],@m01[3],@r5[3],@tmp[1]);
351 vmalof (@tmp[2],@m01[3],@r5[4],@tmp[2]);
352 vmalof (@tmp[3],@m01[3],@r[0],@tmp[3]);
353 vmalof (@tmp[4],@m01[3],@r[1],@tmp[4]);
355 vmalof (@tmp[0],@m01[2],@r5[3],@tmp[0]);
356 vmalof (@tmp[1],@m01[2],@r5[4],@tmp[1]);
357 vmalof (@tmp[2],@m01[2],@r[0],@tmp[2]);
358 vmalof (@tmp[3],@m01[2],@r[1],@tmp[3]);
359 vmalof (@tmp[4],@m01[2],@r[2],@tmp[4]);
361 vmalof (@tmp[0],@m01[1],@r5[4],@tmp[0]);
362 vmalof (@tmp[1],@m01[1],@r[0],@tmp[1]);
363 vmalof (@tmp[2],@m01[1],@r[1],@tmp[2]);
364 vmalof (@tmp[3],@m01[1],@r[2],@tmp[3]);
365 vmalof (@tmp[4],@m01[1],@r[3],@tmp[4]);
367 vmalof (@tmp[0],@m01[0],@r[0],@tmp[0]);
368 vmalof (@tmp[1],@m01[0],@r[1],@tmp[1]);
369 vmalof (@tmp[2],@m01[0],@r[2],@tmp[2]);
370 vmalof (@tmp[3],@m01[0],@r[3],@tmp[3]);
371 vmalof (@tmp[4],@m01[0],@r[4],@tmp[4]);
# Switch the multiplier to r^4 and accumulate acc * r^4 onto tmp.
373 vlrepf (@r5[$_],"4*$_+148($ctx)") for (0..4); # load 5*r^4
374 vlrepf (@r[$_],"4*$_+128($ctx)") for (0..4); # load r^4
376 vmalof (@tmp[0],@acc[4],@r5[1],@tmp[0]);
377 vmalof (@tmp[1],@acc[4],@r5[2],@tmp[1]);
378 vmalof (@tmp[2],@acc[4],@r5[3],@tmp[2]);
379 vmalof (@tmp[3],@acc[4],@r5[4],@tmp[3]);
380 vmalof (@tmp[4],@acc[4],@r[0],@tmp[4]);
382 vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
383 vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
384 vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
385 vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
386 vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
388 vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
389 vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
390 vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
391 vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
392 vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
394 vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
395 vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
396 vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
397 vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
398 vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
# The last round writes straight into @acc; acc[0] is written last because it
# is still a source operand for the other four results.
400 vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
401 vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
402 vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
403 vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
404 vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
# Advance the input pointer by 64 bytes and loop while the count in %r1 lasts.
408 la ($inp,"64($inp)");
409 brctg ("%r1",".Lvx_4x");
# Two-block step: handles 32 bytes of input.
412 LABEL (".Lvx_4x_done");
416 vlm ("%v20","%v21","0($inp)"); # load m0,m1
# Convert the two blocks to base 2^26 limbs in @m01 (same unpacking as the
# four-block step, without the m23 half).
420 vperm (@m01[0],"%v20","%v21",@vperm[0]);
421 vperm (@m01[2],"%v20","%v21",@vperm[1]);
422 vperm (@m01[4],"%v20","%v21",@vperm[2]);
424 vesrlg (@m01[1],@m01[0],26);
425 vesrlg (@m01[3],@m01[2],30);
426 vesrlg (@m01[2],@m01[2],4);
# Limb 4 is masked with $mask4, limbs 0-3 with $mask.
428 vn (@m01[4],@m01[4],$mask4);
429 vn (@m01[$_],@m01[$_],$mask) for (0..3);
431 vaf (@m01[4],@m01[4],$padvec); # pad m01
433 # acc = acc * r^2+ m01
435 vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
436 vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
# tmp[k] = m01[k] + sum of acc[i] * r^2[j] with i + j == k (mod 5); products
# whose index wraps past limb 4 use the 5*r^2 limbs instead.
438 vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
439 vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
440 vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
441 vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
442 vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
444 vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
445 vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
446 vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
447 vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
448 vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
450 vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
451 vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
452 vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
453 vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
454 vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
456 vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
457 vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
458 vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
459 vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
460 vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
# acc[0] is overwritten last because it is a source for the other four.
462 vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
463 vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
464 vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
465 vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
466 vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
# Advance the input pointer past the two blocks.
470 la ($inp,"32($inp)");
# Single-block step: handles one 16-byte block.
473 LABEL (".Lvx_2x_done");
480 vl ("%v21","0($inp)"); # load m0
# Convert to base 2^26 limbs; the permutes still take %v20 as first source.
# NOTE(review): only %v21 is loaded here - confirm what %v20 holds at this point.
484 vperm (@m01[0],"%v20","%v21",@vperm[0]);
485 vperm (@m01[2],"%v20","%v21",@vperm[1]);
486 vperm (@m01[4],"%v20","%v21",@vperm[2]);
488 vesrlg (@m01[1],@m01[0],26);
489 vesrlg (@m01[3],@m01[2],30);
490 vesrlg (@m01[2],@m01[2],4);
# Limb 4 is masked with $mask4, limbs 0-3 with $mask.
492 vn (@m01[4],@m01[4],$mask4);
493 vn (@m01[$_],@m01[$_],$mask) for (0..3);
495 vaf (@m01[4],@m01[4],$padvec); # pad m0
497 # acc = acc * r + m01
499 vlrepf (@r5[$_],"4*$_+68($ctx)") for (0..4); # load 5*r
500 vlrepf (@r[$_],"4*$_+48($ctx)") for (0..4); # load r
# tmp[k] = m01[k] + sum of acc[i] * r[j] with i + j == k (mod 5); products
# whose index wraps past limb 4 use the 5*r limbs instead.
502 vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
503 vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
504 vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
505 vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
506 vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
508 vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
509 vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
510 vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
511 vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
512 vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
514 vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
515 vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
516 vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
517 vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
518 vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
520 vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
521 vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
522 vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
523 vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
524 vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
# acc[0] is overwritten last because it is a source for the other four.
526 vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
527 vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
528 vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
529 vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
530 vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
# Write the accumulators back to ctx: word element 1 of each vector goes to
# acc1 (offset 0) and word element 3 to acc2 (offset 24), mirroring the loads
# at function entry.
536 vstef (@acc[$_],"4*$_($ctx)",1) for (0..4); # store acc
537 vstef (@acc[$_],"24+4*$_($ctx)",3) for (0..4);
# Reload %v8-%v15 from the stack and step the stack pointer by $frame.
540 vlm ("%v8","%v15","0($sp)");
541 la ($sp,"$frame($sp)");
# Reload %f4 and %f6 from the slots they were saved to at entry.
543 ld ("%f4","16*$SIZE_T+2*8($sp)");
544 ld ("%f6","16*$SIZE_T+3*8($sp)");
547 SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx");
551 # static void poly1305_emit_vx(void *ctx, unsigned char mac[16],
552 # const u32 nonce[4])
# ctx, mac and nonce are named %r2-%r4.
554 my ($ctx,$mac,$nonce) = map("%r$_",(2..4));
556 GLOBL ("poly1305_emit_vx");
557 TYPE ("poly1305_emit_vx","\@function");
559 LABEL ("poly1305_emit_vx");
# Save %v8-%v15 and %f4/%f6 on the stack; they are reloaded before returning.
562 vstm ("%v8","%v15","0($sp)");
564 std ("%f4","16*$SIZE_T+2*8($sp)");
565 std ("%f6","16*$SIZE_T+3*8($sp)");
567 larl ("%r5",".Lconst");
# Word element 1 pairs acc1 with r^2 / 5*r^2 and word element 3 pairs acc2 with
# r / 5*r, so one multiply pass forms both products side by side.
569 vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1
570 vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2
571 vlef (@r5[$_],"108+4*$_($ctx)",1) for (0..4); # load 5*r^2
572 vlef (@r[$_],"88+4*$_($ctx)",1) for (0..4); # load r^2
573 vlef (@r5[$_],"68+4*$_($ctx)",3) for (0..4); # load 5*r
574 vlef (@r[$_],"48+4*$_($ctx)",3) for (0..4); # load r
# Offset 48 of the constant table is the row annotated [0,2^26-1,0,2^26-1].
575 vl ($mask,"48(%r5)"); # load mask
577 # acc = acc1 * r^2 + acc2 * r
# The first round uses a plain multiply (no addend) to initialise tmp[0..4];
# later rounds accumulate. Products whose limb index wraps past 4 use @r5.
579 vmlof (@tmp[0],@acc[4],@r5[1]);
580 vmlof (@tmp[1],@acc[4],@r5[2]);
581 vmlof (@tmp[2],@acc[4],@r5[3]);
582 vmlof (@tmp[3],@acc[4],@r5[4]);
583 vmlof (@tmp[4],@acc[4],@r[0]);
585 vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
586 vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
587 vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
588 vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
589 vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
591 vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
592 vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
593 vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
594 vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
595 vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
597 vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
598 vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
599 vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
600 vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
601 vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
# acc[0] is overwritten last because it is a source for the other four.
603 vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
604 vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
605 vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
606 vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
607 vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
# Sum across each accumulator vector so the two per-lane products are combined.
# NOTE(review): %v27 is presumably zero here - confirm where it is initialised.
610 vsumqg (@acc[$_],@acc[$_],"%v27") for (0..4);
# Propagate carries upwards through limbs 1..4, masking each limb to 26 bits.
614 vesrlg (@tmp[1],@acc[1],26);
615 vn (@acc[1],@acc[1],$mask);
616 vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2
618 vesrlg (@tmp[2],@acc[2],26);
619 vn (@acc[2],@acc[2],$mask);
620 vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3
622 vesrlg (@tmp[3],@acc[3],26);
623 vn (@acc[3],@acc[3],$mask);
624 vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
# Shift-count operands for the byte shifts below: 48, 104 and 24 bits, placed
# in byte element 7 of %v30, %v29 and %v28 respectively.
627 vleib ("%v30",6*8,7);
628 vleib ("%v29",13*8,7);
629 vleib ("%v28",3*8,7);
# Pack the 26-bit limbs back together: limb1<<26 | limb0 and limb3<<26 | limb2.
631 veslg (@acc[1],@acc[1],26);
632 veslg (@acc[3],@acc[3],26);
633 vo (@acc[0],@acc[0],@acc[1]);
634 vo (@acc[2],@acc[2],@acc[3]);
# The upper pair belongs at bit 52: shift by 4 bits, then by 48 bits.
636 veslg (@acc[2],@acc[2],4);
637 vslb (@acc[2],@acc[2],"%v30"); # <<52
638 vo (@acc[0],@acc[0],@acc[2]);
# Limb 4 starts at bit 104 (4*26).
640 vslb (@tmp[4],@acc[4],"%v29"); # <<104
641 vo (@acc[0],@acc[0],@tmp[4]);
# Keep the part of limb 4 that lies above bit 128 (104 + 24) in acc[1].
643 vsrlb (@acc[1],@acc[4],"%v28"); # >>24
# 128-bit add with carry-out into a trial value (tmp[0]) and a derived mask
# (tmp[1]).
# NOTE(review): %v26, %v27 and %v29 are used as constants here; their setup is
# not adjacent to this sequence - confirm their values before editing.
651 vaq (@tmp[0],@acc[0],"%v27");
652 vaccq (@tmp[1],@acc[0],"%v27");
654 vaq (@tmp[1],@tmp[1],"%v26");
655 vaccq (@tmp[1],@tmp[1],@acc[1]);
657 vaq (@tmp[1],@tmp[1],"%v29");
# Bitwise select: acc[0] = (acc[0] & tmp[1]) | (tmp[0] & ~tmp[1]).
659 vn (@tmp[2],@tmp[1],@acc[0]);
660 vnc (@tmp[3],@tmp[0],@tmp[1]);
661 vo (@acc[0],@tmp[2],@tmp[3]);
# Offset 64 of the constant table is the row annotated "vperm op endian".
664 vl (@vperm[0],"64(%r5)");
# Load the four nonce words into word elements 3, 2, 1, 0 (reversed order).
665 vlef (@tmp[0],"4*$_($nonce)",3-$_) for (0..3);
# 128-bit addition of the nonce.
667 vaq (@acc[0],@acc[0],@tmp[0]);
# Reorder the bytes with the pattern loaded above before storing the 16-byte MAC.
669 vperm (@acc[0],@acc[0],@acc[0],@vperm[0]);
670 vst (@acc[0],"0($mac)"); # store mac
# Reload %v8-%v15 from the stack and step the stack pointer by $frame.
673 vlm ("%v8","%v15","0($sp)");
674 la ($sp,"$frame($sp)");
# Reload %f4 and %f6 from the slots they were saved to at entry.
676 ld ("%f4","16*$SIZE_T+2*8($sp)");
677 ld ("%f6","16*$SIZE_T+3*8($sp)");
680 SIZE ("poly1305_emit_vx",".-poly1305_emit_vx");
687 # static void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
# The four arguments (ctx, inp, len, padbit) are named %r2-%r5.
690 my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
# Product halves, temporaries and the three hash words are named %r6-%r14.
692 my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
# Key registers are named %r0-%r2. $s1 shares %r2 with $ctx, which is why $ctx
# is parked on the stack below and reloaded after the loop.
693 my ($r0,$r1,$s1) = map("%r$_",(0..2));
694 GLOBL ("poly1305_blocks");
695 TYPE ("poly1305_blocks","\@function");
697 LABEL ("poly1305_blocks");
# Convert the byte length into a count of 16-byte blocks.
698 $z? srlg ($len,$len,4) :srl ($len,4);
# Compare the block count with %r0 (64- or 32-bit compare depending on $z).
700 &{$z? \&clgr:\&clr} ($len,"%r0");
# Save %r6-%r14; they are reloaded before the end of the function.
703 &{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
705 llgfr ($padbit,$padbit); # clear upper half, much needed with
707 lg ($r0,"32($ctx)"); # load key
710 lg ($h0,"0($ctx)"); # load hash value
714 &{$z? \&stg:\&st} ($ctx,"2*$SIZE_T($sp)"); # off-load $ctx
716 algr ($s1,$r1); # s1 = r1 + r1>>2
# Per-block work: read 16 input bytes byte-reversed and advance the pointer.
721 lrvg ($d0lo,"0($inp)"); # load little-endian input
722 lrvg ($d1lo,"8($inp)");
723 la ($inp,"16($inp)");
725 algr ($d0lo,$h0); # accumulate input
# 64x64 -> 128-bit multiplications of the hash words by the key words.
729 mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo
731 mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo
733 mlgr ($t0,$r1); # h0*r1 -> $t0:$h0
734 mlgr ($t1,$r0); # h1*r0 -> $t1:$h1
# Single-width multiplications for the h2 terms.
745 msgr ($d1lo,$s1); # h2*s1
746 msgr ($h2,$r0); # h2*r0
749 alcgr ($t1,$d1hi); # $d1hi is zero
754 lghi ($h0,-4); # final reduction step
762 alcgr ($h1,$d1hi); # $d1hi is still zero
763 alcgr ($h2,$d1hi); # $d1hi is still zero
# Decrement the block count and branch back to .Loop while it is non-zero.
765 &{$z? \&brctg:\&brct} ($len,".Loop");
767 &{$z? \&lg:\&l} ($ctx,"2*$SIZE_T($sp)");# restore $ctx
769 stg ($h0,"0($ctx)"); # store hash value
771 stg ($h2,"16($ctx)");
# Reload the registers saved at entry.
773 &{$z? \&lmg:\&lm} ("%r6","%r14","6*$SIZE_T($sp)");
776 SIZE ("poly1305_blocks",".-poly1305_blocks");
780 # static void poly1305_emit(void *ctx, unsigned char mac[16],
781 # const u32 nonce[4])
# ctx, mac and nonce are named %r2-%r4.
783 my ($ctx,$mac,$nonce) = map("%r$_",(2..4));
# Hash words and two temporaries are named %r5-%r9.
784 my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
786 GLOBL ("poly1305_emit");
787 TYPE ("poly1305_emit","\@function");
789 LABEL ("poly1305_emit");
# Save %r6-%r9; they are reloaded before the end of the function.
790 &{$z? \&stmg:\&stm} ("%r6","%r9","6*$SIZE_T($sp)");
801 algr ($h0,"%r0"); # compare to modulus
805 srlg ($h2,$h2,2); # did it borrow/carry?
806 slgr ("%r1",$h2); # 0-$h2>>2
# $h2 is reused from here on to hold the first nonce doubleword.
807 lg ($h2,"0($nonce)"); # load nonce
# The $ctx register is reused as scratch for the second nonce doubleword.
809 lg ($ctx,"8($nonce)");
810 xgr ("%r0","%r1"); # ~%r1
# Rotating by 32 swaps the two 32-bit halves of the nonce doubleword.
817 rllg ($d0,$h2,32); # flip nonce words
821 algr ($h0,$d0); # accumulate nonce
824 strvg ($h0,"0($mac)"); # write little-endian result
825 strvg ($h1,"8($mac)");
# Reload the registers saved at entry.
827 &{$z? \&lmg:\&lm} ("%r6","%r9","6*$SIZE_T($sp)");
829 SIZE ("poly1305_emit",".-poly1305_emit");
# Constant table: five rows of four 32-bit words (16 bytes each), followed by
# an identification string.
# NOTE(review): presumably this is the table labelled .Lconst that the vx code
# addresses through %r5 at offsets 0, 48 and 64 - confirm the label placement.
# Rows at offsets 0, 16 and 32: permute patterns for unpacking message bytes.
836 LONG (0x00060504,0x03020100,0x00161514,0x13121110); # vperm op[m[1],m[0]]
837 LONG (0x000c0b0a,0x09080706,0x001c1b1a,0x19181716); # vperm op[m[3],m[2]]
838 LONG (0x00000000,0x000f0e0d,0x00000000,0x001f1e1d); # vperm op[ - ,m[4]]
# Row at offset 48: 26-bit limb mask in each doubleword.
839 LONG (0x00000000,0x03ffffff,0x00000000,0x03ffffff); # [0,2^26-1,0,2^26-1]
# Row at offset 64: pattern that reverses the 16 bytes of a vector.
840 LONG (0x0f0e0d0c,0x0b0a0908,0x07060504,0x03020100); # vperm op endian
841 STRING ("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");