poly1305/asm/poly1305-ppc.pl: add vector base 2^26 implementation.
#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
# project. The module is dual licensed under OpenSSL and CRYPTOGAMS
# licenses depending on where you obtain it. For further details see
# https://github.com/dot-asm/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
#			-m32		-m64
#
# Freescale e300	14.8/+80%	-
# PPC74x0		7.60/+60%	-
# PPC970		7.00/+114%	3.51/+205%
# POWER7		3.75/+260%	1.93/+100%
# POWER8		-		2.03/+200%
# POWER9		-		2.00/+150%
#
# Do we need a floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
# compiler-generated code. On the other hand it's known that floating-
# point performance can be dominated by FPU latency, which means that
# there is a limit even for ideally optimized (and even vectorized) code.
# And this limit is estimated to be higher than the above -m64 results.
# In other words, a floating-point implementation is worth considering
# only in a 32-bit application context. We probably have to recognize
# that 32-bit builds are getting less popular on high-end systems and
# therefore tend to target embedded ones, which might not even have an
# FPU...
#
# On a side note, Power ISA 2.07 enables a vector base 2^26
# implementation, and POWER8 might have the capacity to break the
# 1.0 cycle per byte barrier...
#
# January 2019
#
# ... Unfortunately not:-( The estimate was a projection of the ARM
# result, but ARM has a vector multiply-and-add instruction, while
# PowerISA does not, at least not one usable in this context. The
# improvement is ~40% over the -m64 result above, i.e. ~1.43 cycles
# per processed byte on little-endian systems.
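#
# For illustration only (this sketch is not used by the module): in
# base 2^26 a 128-bit quantity is split into five 26-bit digits, so
# that a 26x26-bit product plus a handful of additions still fits in
# 64 bits, and carries can be deferred until after the whole
# multiplication, which is what makes a vector implementation viable:
#
#	sub base2_26 {
#	my ($lo,$hi) = @_;	# 128-bit value as two 64-bit halves
#	    ( $lo & 0x3ffffff,
#	      ($lo>>26) & 0x3ffffff,
#	      (($lo>>52) | ($hi<<12)) & 0x3ffffff,
#	      ($hi>>14) & 0x3ffffff,
#	      $hi>>40 );	# top digit also accommodates the pad bit
#	}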

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Define endianness based on flavour, e.g. linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=24*$SIZE_T;

$sp="r1";
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($mac,$nonce)=($inp,$len);
my $mask = "r0";

$code=<<___;
.machine	"any"
.text
___
							if ($flavour =~ /64/) {
###############################################################################
# base 2^64 implementation

my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));

$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	std	r0,0($ctx)		# zero hash value
	std	r0,8($ctx)
	std	r0,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	$UCMP	$inp,r0
	beq-	Lno_key
___
$code.=<<___	if ($LITTLE_ENDIAN);
	ld	$d0,0($inp)		# load key material
	ld	$d1,8($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$h0,4
	lwbrx	$d0,0,$inp		# load key material
	li	$d1,8
	lwbrx	$h0,$h0,$inp
	li	$h1,12
	lwbrx	$d1,$d1,$inp
	lwbrx	$h1,$h1,$inp
	insrdi	$d0,$h0,32,0
	insrdi	$d1,$h1,32,0
___
$code.=<<___;
	lis	$h1,0xfff		# 0x0fff0000
	ori	$h1,$h1,0xfffc		# 0x0ffffffc
	insrdi	$h1,$h1,32,0		# 0x0ffffffc0ffffffc
	ori	$h0,$h1,3		# 0x0ffffffc0fffffff

	and	$d0,$d0,$h0
	and	$d1,$d1,$h1

	std	$d0,32($ctx)		# store key
	std	$d1,40($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
Lpoly1305_blocks:
	srdi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	ld	$r0,32($ctx)		# load key
	ld	$r1,40($ctx)

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	srdi	$s1,$r1,2
	mtctr	$len
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2
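	# ($s1 works because the h1*r1 term carries weight 2^128 and
	# r1*2^128 = (r1/4)*2^130 = 5*(r1/4) mod 2^130-5; key clamping
	# clears the two low bits of r1, so 5*r1/4 = r1 + (r1>>2) exactly)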
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___	if ($LITTLE_ENDIAN);
	ld	$t0,0($inp)		# load input
	ld	$t1,8($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d0,4
	lwbrx	$t0,0,$inp		# load input
	li	$t1,8
	lwbrx	$d0,$d0,$inp
	li	$d1,12
	lwbrx	$t1,$t1,$inp
	lwbrx	$d1,$d1,$inp
	insrdi	$t0,$d0,32,0
	insrdi	$t1,$d1,32,0
___
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$t0		# accumulate input
	adde	$h1,$h1,$t1

	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0
	adde	$h2,$h2,$padbit

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	bdnz	Loop

	std	$h0,0($ctx)		# store hash value
	std	$h1,8($ctx)
	std	$h2,16($ctx)

	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,5,4,0
.size	.poly1305_blocks,.-.poly1305_blocks
___
{
my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12));

$code.=<<___;
.globl	.poly1305_emit
.align	5
.poly1305_emit:
	lwz	$h0,0($ctx)	# load hash value base 2^26
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)
	lwz	r0,24($ctx)	# is_base2_26

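	# h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104, repacked
	# below into three 64-bit words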
	sldi	$h1,$h1,26	# base 2^26 -> base 2^64
	sldi	$t0,$h2,52
	srdi	$h2,$h2,12
	sldi	$h3,$h3,14
	add	$h0,$h0,$h1
	addc	$h0,$h0,$t0
	sldi	$t0,$h4,40
	srdi	$h4,$h4,24
	adde	$h1,$h2,$h3
	addc	$h1,$h1,$t0
	addze	$h2,$h4

	ld	$h3,0($ctx)	# load hash value base 2^64
	ld	$h4,8($ctx)
	ld	$t0,16($ctx)

	neg	r0,r0
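	# constant-time select: r0 is all-ones if is_base2_26 was set
	# and zero otherwise, so h = b64 ^ ((b26 ^ b64) & r0)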
	xor	$h0,$h0,$h3	# choose between radixes
	xor	$h1,$h1,$h4
	xor	$h2,$h2,$t0
	and	$h0,$h0,r0
	and	$h1,$h1,r0
	and	$h2,$h2,r0
	xor	$h0,$h0,$h3
	xor	$h1,$h1,$h4
	xor	$h2,$h2,$t0

	addic	$h3,$h0,5	# compare to modulus
	addze	$h4,$h1
	addze	$t0,$h2

	srdi	$t0,$t0,2	# see if it carried/borrowed
	neg	$t0,$t0
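	# $t0 is now all-ones if h+5 carried into bit 130, i.e. if
	# h >= 2^130-5, in which case the reduced value is the low
	# 128 bits of h+5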

	andc	$h0,$h0,$t0
	and	$h3,$h3,$t0
	andc	$h1,$h1,$t0
	and	$h4,$h4,$t0
	or	$h0,$h0,$h3
	or	$h1,$h1,$h4

	lwz	$t0,4($nonce)
	lwz	$h2,12($nonce)
	lwz	$h3,0($nonce)
	lwz	$h4,8($nonce)

	insrdi	$h3,$t0,32,0
	insrdi	$h4,$h2,32,0

	addc	$h0,$h0,$h3	# accumulate nonce
	adde	$h1,$h1,$h4

	addi	$ctx,$mac,-1
	addi	$mac,$mac,7

	stbu	$h0,1($ctx)	# write [little-endian] result
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	srdi	$h0,$h0,8
	stbu	$h1,1($mac)
	srdi	$h1,$h1,8

	stbu	$h0,1($ctx)
	stbu	$h1,1($mac)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
}							} else {
###############################################################################
# base 2^32 implementation

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
    $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..12,14..31));

$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	stw	r0,0($ctx)		# zero hash value
	stw	r0,4($ctx)
	stw	r0,8($ctx)
	stw	r0,12($ctx)
	stw	r0,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	$UCMP	$inp,r0
	beq-	Lno_key
___
$code.=<<___	if ($LITTLE_ENDIAN);
	lwz	$h0,0($inp)		# load key material
	lwz	$h1,4($inp)
	lwz	$h2,8($inp)
	lwz	$h3,12($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$h1,4
	lwbrx	$h0,0,$inp		# load key material
	li	$h2,8
	lwbrx	$h1,$h1,$inp
	li	$h3,12
	lwbrx	$h2,$h2,$inp
	lwbrx	$h3,$h3,$inp
___
$code.=<<___;
	lis	$mask,0xf000		# 0xf0000000
	li	$r0,-4
	andc	$r0,$r0,$mask		# 0x0ffffffc

	andc	$h0,$h0,$mask
	and	$h1,$h1,$r0
	and	$h2,$h2,$r0
	and	$h3,$h3,$r0

	stw	$h0,32($ctx)		# store key
	stw	$h1,36($ctx)
	stw	$h2,40($ctx)
	stw	$h3,44($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
Lpoly1305_blocks:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$r0,32($ctx)		# load key
	lwz	$r1,36($ctx)
	lwz	$r2,40($ctx)
	lwz	$r3,44($ctx)

	lwz	$h0,0($ctx)		# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	srwi	$s1,$r1,2
	srwi	$s2,$r2,2
	srwi	$s3,$r3,2
	add	$s1,$s1,$r1		# si = ri + ri>>2
	add	$s2,$s2,$r2
	add	$s3,$s3,$r3
	mtctr	$len
	li	$mask,3
	b	Loop

.align	4
Loop:
___
$code.=<<___	if ($LITTLE_ENDIAN);
	lwz	$d0,0($inp)		# load input
	lwz	$d1,4($inp)
	lwz	$d2,8($inp)
	lwz	$d3,12($inp)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d1,4
	lwbrx	$d0,0,$inp		# load input
	li	$d2,8
	lwbrx	$d1,$d1,$inp
	li	$d3,12
	lwbrx	$d2,$d2,$inp
	lwbrx	$d3,$d3,$inp
___
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$d0		# accumulate input
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2

	mullw	$d0,$h0,$r0		# h0*r0
	mulhwu	$D0,$h0,$r0

	mullw	$d1,$h0,$r1		# h0*r1
	mulhwu	$D1,$h0,$r1

	mullw	$d2,$h0,$r2		# h0*r2
	mulhwu	$D2,$h0,$r2

	 adde	$h3,$h3,$d3
	 adde	$h4,$h4,$padbit

	mullw	$d3,$h0,$r3		# h0*r3
	mulhwu	$D3,$h0,$r3

	mullw	$t0,$h1,$s3		# h1*s3
	mulhwu	$t1,$h1,$s3

	mullw	$t2,$h1,$r0		# h1*r0
	mulhwu	$t3,$h1,$r0
	 addc	$d0,$d0,$t0
	 adde	$D0,$D0,$t1

	mullw	$t0,$h1,$r1		# h1*r1
	mulhwu	$t1,$h1,$r1
	 addc	$d1,$d1,$t2
	 adde	$D1,$D1,$t3

	mullw	$t2,$h1,$r2		# h1*r2
	mulhwu	$t3,$h1,$r2
	 addc	$d2,$d2,$t0
	 adde	$D2,$D2,$t1

	mullw	$t0,$h2,$s2		# h2*s2
	mulhwu	$t1,$h2,$s2
	 addc	$d3,$d3,$t2
	 adde	$D3,$D3,$t3

	mullw	$t2,$h2,$s3		# h2*s3
	mulhwu	$t3,$h2,$s3
	 addc	$d0,$d0,$t0
	 adde	$D0,$D0,$t1

	mullw	$t0,$h2,$r0		# h2*r0
	mulhwu	$t1,$h2,$r0
	 addc	$d1,$d1,$t2
	 adde	$D1,$D1,$t3

	mullw	$t2,$h2,$r1		# h2*r1
	mulhwu	$t3,$h2,$r1
	 addc	$d2,$d2,$t0
	 adde	$D2,$D2,$t1

	mullw	$t0,$h3,$s1		# h3*s1
	mulhwu	$t1,$h3,$s1
	 addc	$d3,$d3,$t2
	 adde	$D3,$D3,$t3

	mullw	$t2,$h3,$s2		# h3*s2
	mulhwu	$t3,$h3,$s2
	 addc	$d0,$d0,$t0
	 adde	$D0,$D0,$t1

	mullw	$t0,$h3,$s3		# h3*s3
	mulhwu	$t1,$h3,$s3
	 addc	$d1,$d1,$t2
	 adde	$D1,$D1,$t3

	mullw	$t2,$h3,$r0		# h3*r0
	mulhwu	$t3,$h3,$r0
	 addc	$d2,$d2,$t0
	 adde	$D2,$D2,$t1

	mullw	$t0,$h4,$s1		# h4*s1
	 addc	$d3,$d3,$t2
	 adde	$D3,$D3,$t3
	addc	$d1,$d1,$t0

	mullw	$t1,$h4,$s2		# h4*s2
	 addze	$D1,$D1
	addc	$d2,$d2,$t1
	addze	$D2,$D2

	mullw	$t2,$h4,$s3		# h4*s3
	addc	$d3,$d3,$t2
	addze	$D3,$D3

	mullw	$h4,$h4,$r0		# h4*r0

	addc	$h1,$d1,$D0
	adde	$h2,$d2,$D1
	adde	$h3,$d3,$D2
	adde	$h4,$h4,$D3

	andc	$D0,$h4,$mask		# final reduction step
	and	$h4,$h4,$mask
	srwi	$D1,$D0,2
	add	$D0,$D0,$D1
	addc	$h0,$d0,$D0
	addze	$h1,$h1
	addze	$h2,$h2
	addze	$h3,$h3
	addze	$h4,$h4

	bdnz	Loop

	stw	$h0,0($ctx)		# store hash value
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)

	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
.size	.poly1305_blocks,.-.poly1305_blocks
___
{
my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12));

$code.=<<___;
.globl	.poly1305_emit
.align	5
.poly1305_emit:
	lwz	r0,24($ctx)	# is_base2_26
	lwz	$h0,0($ctx)	# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)
	cmplwi	r0,0
	beq	Lemit_base2_32

	slwi	$t0,$h1,26	# base 2^26 -> base 2^32
	srwi	$h1,$h1,6
	slwi	$t1,$h2,20
	srwi	$h2,$h2,12
	addc	$h0,$h0,$t0
	slwi	$t0,$h3,14
	srwi	$h3,$h3,18
	adde	$h1,$h1,$t1
	slwi	$t1,$h4,8
	srwi	$h4,$h4,24
	adde	$h2,$h2,$t0
	adde	$h3,$h3,$t1
	addze	$h4,$h4

Lemit_base2_32:
	addic	r0,$h0,5	# compare to modulus
	addze	r0,$h1
	addze	r0,$h2
	addze	r0,$h3
	addze	r0,$h4

	srwi	r0,r0,2		# see if it carried/borrowed
	neg	r0,r0
	andi.	r0,r0,5
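	# r0 is now 5 if h >= 2^130-5 and 0 otherwise; adding it and
	# keeping the low 128 bits is the final reduction mod 2^130-5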

	addc	$h0,$h0,r0
	lwz	r0,0($nonce)
	addze	$h1,$h1
	lwz	$t0,4($nonce)
	addze	$h2,$h2
	lwz	$t1,8($nonce)
	addze	$h3,$h3
	lwz	$h4,12($nonce)

	addc	$h0,$h0,r0	# accumulate nonce
	adde	$h1,$h1,$t0
	adde	$h2,$h2,$t1
	adde	$h3,$h3,$h4

	addi	$ctx,$mac,-1
	addi	$mac,$mac,7

	stbu	$h0,1($ctx)	# write [little-endian] result
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	srwi	$h0,$h0,8
	stbu	$h2,1($mac)
	srwi	$h2,$h2,8

	stbu	$h0,1($ctx)
	stbu	$h2,1($mac)

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	srwi	$h1,$h1,8
	stbu	$h3,1($mac)
	srwi	$h3,$h3,8

	stbu	$h1,1($ctx)
	stbu	$h3,1($mac)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
}							}
{{{
########################################################################
# PowerISA 2.07/VSX section                                            #
########################################################################

my $LOCALS= 6*$SIZE_T;
my $VSXFRAME = $LOCALS + 6*$SIZE_T;
   $VSXFRAME += 128;	# local variables
   $VSXFRAME += 13*16;	# v20-v31 offload

my $BIG_ENDIAN = ($flavour !~ /le/) ? 4 : 0;

########################################################################
# The layout of the opaque area is as follows:
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 pad;
#	unsigned __int32 is_base2_26, pad;
#	unsigned __int64 r[2];		# key value base 2^64
#	struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9];
#
# where r^n are the base 2^26 digits of the powers of the multiplier
# key. There are 5 digits, but the last four are interleaved with
# their multiples of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2,
# r3, 5*r3, r4, 5*r4. The order of the powers is as they appear in a
# register, not in memory.
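#
# For reference, a sketch of how the nine elements of one power r^n
# relate to its five base 2^26 digits (illustration only, reusing the
# hypothetical base2_26 helper sketched at the top of this file):
#
#	my @d = base2_26($rn_lo,$rn_hi);	# five 26-bit digits
#	my @elem = ($d[0], map { ($_, 5*$_) } @d[1..4]);
#	# i.e. r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4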

my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4));
my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9));
my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14));
my	($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2);
my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19));
my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24));
my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31));
my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31)));
my ($ctx_,$_ctx,$const) = map("r$_",(10..12));

							if ($flavour =~ /64/) {
###############################################################################
# the setup phase of poly1305_blocks_vsx differs on 32- and 64-bit platforms,
# but the base 2^26 computational part is the same...

my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31));
my $mask = "r0";

$code.=<<___;
.globl	.poly1305_blocks_vsx
.align	5
.poly1305_blocks_vsx:
	lwz	r7,24($ctx)		# is_base2_26
	cmpldi	$len,128
	bge	__poly1305_blocks_vsx

	neg	r0,r7			# is_base2_26 as mask
	lwz	r7,0($ctx)		# load hash base 2^26
	lwz	r8,4($ctx)
	lwz	r9,8($ctx)
	lwz	r10,12($ctx)
	lwz	r11,16($ctx)

	sldi	r8,r8,26		# base 2^26 -> base 2^64
	sldi	r12,r9,52
	add	r7,r7,r8
	srdi	r9,r9,12
	sldi	r10,r10,14
	addc	r7,r7,r12
	sldi	r8,r11,40
	adde	r9,r9,r10
	srdi	r11,r11,24
	addc	r9,r9,r8
	addze	r11,r11

	ld	r8,0($ctx)		# load hash base 2^64
	ld	r10,8($ctx)
	ld	r12,16($ctx)

	xor	r7,r7,r8		# select between radixes
	xor	r9,r9,r10
	xor	r11,r11,r12
	and	r7,r7,r0
	and	r9,r9,r0
	and	r11,r11,r0
	xor	r7,r7,r8
	xor	r9,r9,r10
	xor	r11,r11,r12

	li	r0,0
	std	r7,0($ctx)		# store hash base 2^64
	std	r9,8($ctx)
	std	r11,16($ctx)
	stw	r0,24($ctx)		# clear is_base2_26

	b	Lpoly1305_blocks
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
.size	.poly1305_blocks_vsx,.-.poly1305_blocks_vsx

.align	5
__poly1305_mul:
	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	__poly1305_mul,.-__poly1305_mul

.align	5
__poly1305_splat:
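	# Split the 130-bit value $h2:$h1:$h0 into five base 2^26 digits
	# and store them, with digits 1..4 also multiplied by 5, as
	# 32-bit words at 0x10-byte strides starting at $t1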
	extrdi	$d0,$h0,26,38
	extrdi	$d1,$h0,26,12
	stw	$d0,0x00($t1)

	extrdi	$d2,$h0,12,0
	slwi	$d0,$d1,2
	stw	$d1,0x10($t1)
	add	$d0,$d0,$d1		# * 5
	stw	$d0,0x20($t1)

	insrdi	$d2,$h1,14,38
	slwi	$d0,$d2,2
	stw	$d2,0x30($t1)
	add	$d0,$d0,$d2		# * 5
	stw	$d0,0x40($t1)

	extrdi	$d1,$h1,26,24
	extrdi	$d2,$h1,24,0
	slwi	$d0,$d1,2
	stw	$d1,0x50($t1)
	add	$d0,$d0,$d1		# * 5
	stw	$d0,0x60($t1)

	insrdi	$d2,$h2,3,37
	slwi	$d0,$d2,2
	stw	$d2,0x70($t1)
	add	$d0,$d0,$d2		# * 5
	stw	$d0,0x80($t1)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	__poly1305_splat,.-__poly1305_splat

.align	5
__poly1305_blocks_vsx:
	$STU	$sp,-$VSXFRAME($sp)
	mflr	r0
	li	r10,`15+$LOCALS+128`
	li	r11,`31+$LOCALS+128`
	mfspr	r12,256
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r10,$sp
	addi	r10,r10,32
	stvx	v24,r11,$sp
	addi	r11,r11,32
	stvx	v25,r10,$sp
	addi	r10,r10,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)	# save vrsave
	li	r12,-1
	mtspr	256,r12			# preserve all AltiVec registers
	$PUSH	r27,`$VSXFRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$VSXFRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$VSXFRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$VSXFRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$VSXFRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$VSXFRAME+$LRSAVE`($sp)

	bl	LPICmeup

	li	$x10,0x10
	li	$x20,0x20
	li	$x30,0x30
	li	$x40,0x40
	li	$x50,0x50
	lvx_u	$mask26,$x00,$const
	lvx_u	$_26,$x10,$const
	lvx_u	$_40,$x20,$const
	lvx_u	$I2perm,$x30,$const
	lvx_u	$padbits,$x40,$const

	cmplwi	r7,0			# is_base2_26?
	bne	Lskip_init_vsx

	ld	$r0,32($ctx)		# load key base 2^64
	ld	$r1,40($ctx)
	srdi	$s1,$r1,2
	li	$mask,3
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2

	mr	$h0,$r0			# "calculate" r^1
	mr	$h1,$r1
	li	$h2,0
	addi	$t1,$ctx,`48+(12^$BIG_ENDIAN)`
	bl	__poly1305_splat

	bl	__poly1305_mul		# calculate r^2
	addi	$t1,$ctx,`48+(4^$BIG_ENDIAN)`
	bl	__poly1305_splat

	bl	__poly1305_mul		# calculate r^3
	addi	$t1,$ctx,`48+(8^$BIG_ENDIAN)`
	bl	__poly1305_splat

	bl	__poly1305_mul		# calculate r^4
	addi	$t1,$ctx,`48+(0^$BIG_ENDIAN)`
	bl	__poly1305_splat
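	# r^1, r^2, r^3 and r^4 now sit interleaved in ctx->r[], one
	# 32-bit lane per power, ready for the 4-way vector loop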

	ld	$h0,0($ctx)		# load hash
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	extrdi	$d0,$h0,26,38		# base 2^64 -> base 2^26
	extrdi	$d1,$h0,26,12
	extrdi	$d2,$h0,12,0
	mtvrwz	$H0,$d0
	insrdi	$d2,$h1,14,38
	mtvrwz	$H1,$d1
	extrdi	$d1,$h1,26,24
	mtvrwz	$H2,$d2
	extrdi	$d2,$h1,24,0
	mtvrwz	$H3,$d1
	insrdi	$d2,$h2,3,37
	mtvrwz	$H4,$d2
___
							} else {
###############################################################################
# 32-bit initialization

my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12));
my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4);

$code.=<<___;
.globl	.poly1305_blocks_vsx
.align	5
.poly1305_blocks_vsx:
	lwz	r7,24($ctx)		# is_base2_26
	cmplwi	$len,128
	bge	__poly1305_blocks_vsx
	cmplwi	r7,0
	beq	Lpoly1305_blocks

	lwz	$h0,0($ctx)		# load hash
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	slwi	$t0,$h1,26		# base 2^26 -> base 2^32
	srwi	$h1,$h1,6
	slwi	$t1,$h2,20
	srwi	$h2,$h2,12
	addc	$h0,$h0,$t0
	slwi	$t0,$h3,14
	srwi	$h3,$h3,18
	adde	$h1,$h1,$t1
	slwi	$t1,$h4,8
	srwi	$h4,$h4,24
	adde	$h2,$h2,$t0
	li	$t0,0
	adde	$h3,$h3,$t1
	addze	$h4,$h4

	stw	$h0,0($ctx)		# store hash base 2^32
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)
	stw	$t0,24($ctx)		# clear is_base2_26

	b	Lpoly1305_blocks
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
.size	.poly1305_blocks_vsx,.-.poly1305_blocks_vsx

.align	5
__poly1305_mul:
	vmulouw		$ACC0,$H0,$R0
	vmulouw		$ACC1,$H1,$R0
	vmulouw		$ACC2,$H2,$R0
	vmulouw		$ACC3,$H3,$R0
	vmulouw		$ACC4,$H4,$R0

	vmulouw		$T0,$H4,$S1
	vaddudm		$ACC0,$ACC0,$T0
	vmulouw		$T0,$H0,$R1
	vaddudm		$ACC1,$ACC1,$T0
	vmulouw		$T0,$H1,$R1
	vaddudm		$ACC2,$ACC2,$T0
	vmulouw		$T0,$H2,$R1
	vaddudm		$ACC3,$ACC3,$T0
	vmulouw		$T0,$H3,$R1
	vaddudm		$ACC4,$ACC4,$T0

	vmulouw		$T0,$H3,$S2
	vaddudm		$ACC0,$ACC0,$T0
	vmulouw		$T0,$H4,$S2
	vaddudm		$ACC1,$ACC1,$T0
	vmulouw		$T0,$H0,$R2
	vaddudm		$ACC2,$ACC2,$T0
	vmulouw		$T0,$H1,$R2
	vaddudm		$ACC3,$ACC3,$T0
	vmulouw		$T0,$H2,$R2
	vaddudm		$ACC4,$ACC4,$T0

	vmulouw		$T0,$H2,$S3
	vaddudm		$ACC0,$ACC0,$T0
	vmulouw		$T0,$H3,$S3
	vaddudm		$ACC1,$ACC1,$T0
	vmulouw		$T0,$H4,$S3
	vaddudm		$ACC2,$ACC2,$T0
	vmulouw		$T0,$H0,$R3
	vaddudm		$ACC3,$ACC3,$T0
	vmulouw		$T0,$H1,$R3
	vaddudm		$ACC4,$ACC4,$T0

	vmulouw		$T0,$H1,$S4
	vaddudm		$ACC0,$ACC0,$T0
	vmulouw		$T0,$H2,$S4
	vaddudm		$ACC1,$ACC1,$T0
	vmulouw		$T0,$H3,$S4
	vaddudm		$ACC2,$ACC2,$T0
	vmulouw		$T0,$H4,$S4
	vaddudm		$ACC3,$ACC3,$T0
	vmulouw		$T0,$H0,$R4
	vaddudm		$ACC4,$ACC4,$T0

	################################################################
	# lazy reduction
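	# (the carry out of h4 wraps into h0 multiplied by 5 = 4+1:
	# it is added once as-is and once shifted left by 2, see the
	# vsld below)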

	vspltisb	$T0,2
	vsrd		$H4,$ACC3,$_26
	vsrd		$H1,$ACC0,$_26
	vand		$H3,$ACC3,$mask26
	vand		$H0,$ACC0,$mask26
	vaddudm		$H4,$H4,$ACC4		# h3 -> h4
	vaddudm		$H1,$H1,$ACC1		# h0 -> h1

	vsrd		$ACC4,$H4,$_26
	vsrd		$ACC1,$H1,$_26
	vand		$H4,$H4,$mask26
	vand		$H1,$H1,$mask26
	vaddudm		$H0,$H0,$ACC4
	vaddudm		$H2,$ACC2,$ACC1		# h1 -> h2

	vsld		$ACC4,$ACC4,$T0		# <<2
	vsrd		$ACC2,$H2,$_26
	vand		$H2,$H2,$mask26
	vaddudm		$H0,$H0,$ACC4		# h4 -> h0
	vaddudm		$H3,$H3,$ACC2		# h2 -> h3

	vsrd		$ACC0,$H0,$_26
	vsrd		$ACC3,$H3,$_26
	vand		$H0,$H0,$mask26
	vand		$H3,$H3,$mask26
	vaddudm		$H1,$H1,$ACC0		# h0 -> h1
	vaddudm		$H4,$H4,$ACC3		# h3 -> h4

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	__poly1305_mul,.-__poly1305_mul

.align	5
__poly1305_blocks_vsx:
	$STU	$sp,-$VSXFRAME($sp)
	mflr	r0
	li	r10,`15+$LOCALS+128`
	li	r11,`31+$LOCALS+128`
	mfspr	r12,256
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r10,$sp
	addi	r10,r10,32
	stvx	v24,r11,$sp
	addi	r11,r11,32
	stvx	v25,r10,$sp
	addi	r10,r10,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)	# save vrsave
	li	r12,-1
	mtspr	256,r12			# preserve all AltiVec registers
	$PUSH	r27,`$VSXFRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$VSXFRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$VSXFRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$VSXFRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$VSXFRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$VSXFRAME+$LRSAVE`($sp)

	bl	LPICmeup

	li	$x10,0x10
	li	$x20,0x20
	li	$x30,0x30
	li	$x40,0x40
	li	$x50,0x50
	lvx_u	$mask26,$x00,$const
	lvx_u	$_26,$x10,$const
	lvx_u	$_40,$x20,$const
	lvx_u	$I2perm,$x30,$const
	lvx_u	$padbits,$x40,$const

	cmplwi	r7,0			# is_base2_26?
	bne	Lskip_init_vsx

	lwz	$h1,32($ctx)		# load key base 2^32
	lwz	$h2,36($ctx)
	lwz	$h3,40($ctx)
	lwz	$h4,44($ctx)

	extrwi	$h0,$h1,26,6		# base 2^32 -> base 2^26
	extrwi	$h1,$h1,6,0
	insrwi	$h1,$h2,20,6
	extrwi	$h2,$h2,12,0
	insrwi	$h2,$h3,14,6
	extrwi	$h3,$h3,18,0
	insrwi	$h3,$h4,8,6
	extrwi	$h4,$h4,24,0

	mtvrwz	$R0,$h0
	slwi	$h0,$h1,2
	mtvrwz	$R1,$h1
	add	$h1,$h1,$h0
	mtvrwz	$S1,$h1
	slwi	$h1,$h2,2
	mtvrwz	$R2,$h2
	add	$h2,$h2,$h1
	mtvrwz	$S2,$h2
	slwi	$h2,$h3,2
	mtvrwz	$R3,$h3
	add	$h3,$h3,$h2
	mtvrwz	$S3,$h3
	slwi	$h3,$h4,2
	mtvrwz	$R4,$h4
	add	$h4,$h4,$h3
	mtvrwz	$S4,$h4

	vmr	$H0,$R0
	vmr	$H1,$R1
	vmr	$H2,$R2
	vmr	$H3,$R3
	vmr	$H4,$R4

	bl	__poly1305_mul		# r^1:- * r^1:-

	vpermdi	$R0,$H0,$R0,0b00
	vpermdi	$R1,$H1,$R1,0b00
	vpermdi	$R2,$H2,$R2,0b00
	vpermdi	$R3,$H3,$R3,0b00
	vpermdi	$R4,$H4,$R4,0b00
	vpermdi	$H0,$H0,$H0,0b00
	vpermdi	$H1,$H1,$H1,0b00
	vpermdi	$H2,$H2,$H2,0b00
	vpermdi	$H3,$H3,$H3,0b00
	vpermdi	$H4,$H4,$H4,0b00
	vsld	$S1,$R1,$T0		# <<2
	vsld	$S2,$R2,$T0
	vsld	$S3,$R3,$T0
	vsld	$S4,$R4,$T0
	vaddudm	$S1,$S1,$R1
	vaddudm	$S2,$S2,$R2
	vaddudm	$S3,$S3,$R3
	vaddudm	$S4,$S4,$R4

	bl	__poly1305_mul		# r^2:r^2 * r^2:r^1

	addi	$h0,$ctx,0x60
	lwz	$h1,0($ctx)		# load hash
	lwz	$h2,4($ctx)
	lwz	$h3,8($ctx)
	lwz	$h4,12($ctx)
	lwz	$t0,16($ctx)

	vmrgow	$R0,$R0,$H0		# r^2:r^4:r^1:r^3
	vmrgow	$R1,$R1,$H1
	vmrgow	$R2,$R2,$H2
	vmrgow	$R3,$R3,$H3
	vmrgow	$R4,$R4,$H4
	vslw	$S1,$R1,$T0		# <<2
	vslw	$S2,$R2,$T0
	vslw	$S3,$R3,$T0
	vslw	$S4,$R4,$T0
	vadduwm	$S1,$S1,$R1
	vadduwm	$S2,$S2,$R2
	vadduwm	$S3,$S3,$R3
	vadduwm	$S4,$S4,$R4

	stvx_u	$R0,$x30,$ctx
	stvx_u	$R1,$x40,$ctx
	stvx_u	$S1,$x50,$ctx
	stvx_u	$R2,$x00,$h0
	stvx_u	$S2,$x10,$h0
	stvx_u	$R3,$x20,$h0
	stvx_u	$S3,$x30,$h0
	stvx_u	$R4,$x40,$h0
	stvx_u	$S4,$x50,$h0

	extrwi	$h0,$h1,26,6		# base 2^32 -> base 2^26
	extrwi	$h1,$h1,6,0
	mtvrwz	$H0,$h0
	insrwi	$h1,$h2,20,6
	extrwi	$h2,$h2,12,0
	mtvrwz	$H1,$h1
	insrwi	$h2,$h3,14,6
	extrwi	$h3,$h3,18,0
	mtvrwz	$H2,$h2
	insrwi	$h3,$h4,8,6
	extrwi	$h4,$h4,24,0
	mtvrwz	$H3,$h3
	insrwi	$h4,$t0,3,5
	mtvrwz	$H4,$h4
___
							}
$code.=<<___;
	li	r0,1
	stw	r0,24($ctx)		# set is_base2_26
	b	Loaded_vsx

.align	4
Lskip_init_vsx:
	li		$x10,4
	li		$x20,8
	li		$x30,12
	li		$x40,16
	lvwzx_u		$H0,$x00,$ctx
	lvwzx_u		$H1,$x10,$ctx
	lvwzx_u		$H2,$x20,$ctx
	lvwzx_u		$H3,$x30,$ctx
	lvwzx_u		$H4,$x40,$ctx

Loaded_vsx:
	li		$x10,0x10
	li		$x20,0x20
	li		$x30,0x30
	li		$x40,0x40
	li		$x50,0x50
	li		$x60,0x60
	li		$x70,0x70
	addi		$ctx_,$ctx,64		# &ctx->r[1]
	addi		$_ctx,$sp,`$LOCALS+15`	# &ctx->r[1], r^2:r^4 shadow

	vxor		$T0,$T0,$T0		# ensure second half is zero
	vpermdi		$H0,$H0,$T0,0b00
	vpermdi		$H1,$H1,$T0,0b00
	vpermdi		$H2,$H2,$T0,0b00
	vpermdi		$H3,$H3,$T0,0b00
	vpermdi		$H4,$H4,$T0,0b00

	be?lvx_u	$_4,$x50,$const		# byte swap mask
	lvx_u		$T1,$x00,$inp		# load first input block
	lvx_u		$T2,$x10,$inp
	lvx_u		$T3,$x20,$inp
	lvx_u		$T4,$x30,$inp
	be?vperm	$T1,$T1,$T1,$_4
	be?vperm	$T2,$T2,$T2,$_4
	be?vperm	$T3,$T3,$T3,$_4
	be?vperm	$T4,$T4,$T4,$_4

	vpermdi		$I0,$T1,$T2,0b00	# smash input to base 2^26
	vspltisb	$_4,4
	vperm		$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
	vspltisb	$_14,14
	vpermdi		$I3,$T1,$T2,0b11

	vsrd		$I1,$I0,$_26
	vsrd		$I2,$I2,$_4
	vsrd		$I4,$I3,$_40
	vsrd		$I3,$I3,$_14
	vand		$I0,$I0,$mask26
	vand		$I1,$I1,$mask26
	vand		$I2,$I2,$mask26
	vand		$I3,$I3,$mask26

	vpermdi		$T1,$T3,$T4,0b00
	vperm		$T2,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
	vpermdi		$T3,$T3,$T4,0b11

	vsrd		$T0,$T1,$_26
	vsrd		$T2,$T2,$_4
	vsrd		$T4,$T3,$_40
	vsrd		$T3,$T3,$_14
	vand		$T1,$T1,$mask26
	vand		$T0,$T0,$mask26
	vand		$T2,$T2,$mask26
	vand		$T3,$T3,$mask26

	# inp[2]:inp[0]:inp[3]:inp[1]
	vmrgow		$I4,$T4,$I4
	vmrgow		$I0,$T1,$I0
	vmrgow		$I1,$T0,$I1
	vmrgow		$I2,$T2,$I2
	vmrgow		$I3,$T3,$I3
	vor		$I4,$I4,$padbits

	lvx_splt	$R0,$x30,$ctx		# taking lvx_splt out of loop
	lvx_splt	$R1,$x00,$ctx_		# gives ~8% improvement
	lvx_splt	$S1,$x10,$ctx_
	lvx_splt	$R2,$x20,$ctx_
	lvx_splt	$S2,$x30,$ctx_
	lvx_splt	$T1,$x40,$ctx_
	lvx_splt	$T2,$x50,$ctx_
	lvx_splt	$T3,$x60,$ctx_
	lvx_splt	$T4,$x70,$ctx_
	stvx		$R1,$x00,$_ctx
	stvx		$S1,$x10,$_ctx
	stvx		$R2,$x20,$_ctx
	stvx		$S2,$x30,$_ctx
	stvx		$T1,$x40,$_ctx
	stvx		$T2,$x50,$_ctx
	stvx		$T3,$x60,$_ctx
	stvx		$T4,$x70,$_ctx

	addi	$inp,$inp,0x40
	addi	$const,$const,0x50
	addi	r0,$len,-64
	srdi	r0,r0,6
	mtctr	r0
	b	Loop_vsx

.align	4
Loop_vsx:
	################################################################
	## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	##   \___________________/
	##
	## Note that we start with inp[2:3]*r^2. This is because it
	## doesn't depend on the reduction in the previous iteration.
	################################################################
	## d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	## d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	## d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	## d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
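	## (the 5*r multiples appear because a digit's overflow past
	## 2^130 wraps around multiplied by 5, as 2^130 = 5 mod 2^130-5)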

	vmuleuw		$ACC0,$I0,$R0
	vmuleuw		$ACC1,$I0,$R1
	vmuleuw		$ACC2,$I0,$R2
	vmuleuw		$ACC3,$I1,$R2

	vmuleuw		$T0,$I1,$R0
	vaddudm		$ACC1,$ACC1,$T0
	vmuleuw		$T0,$I1,$R1
	vaddudm		$ACC2,$ACC2,$T0
	 vmuleuw	$ACC4,$I2,$R2
	vmuleuw		$T0,$I4,$S1
	vaddudm		$ACC0,$ACC0,$T0
	vmuleuw		$T0,$I2,$R1
	vaddudm		$ACC3,$ACC3,$T0
	lvx		$S3,$x50,$_ctx
	vmuleuw		$T0,$I3,$R1
	vaddudm		$ACC4,$ACC4,$T0
	lvx		$R3,$x40,$_ctx

	 vaddudm	$H2,$H2,$I2
	 vaddudm	$H0,$H0,$I0
	 vaddudm	$H3,$H3,$I3
	 vaddudm	$H1,$H1,$I1
	 vaddudm	$H4,$H4,$I4

	vmuleuw		$T0,$I3,$S2
	vaddudm		$ACC0,$ACC0,$T0
	vmuleuw		$T0,$I4,$S2
	vaddudm		$ACC1,$ACC1,$T0
	vmuleuw		$T0,$I2,$R0
	vaddudm		$ACC2,$ACC2,$T0
	vmuleuw		$T0,$I3,$R0
	vaddudm		$ACC3,$ACC3,$T0
	lvx		$S4,$x70,$_ctx
	vmuleuw		$T0,$I4,$R0
	vaddudm		$ACC4,$ACC4,$T0
	lvx		$R4,$x60,$_ctx

	vmuleuw		$T0,$I2,$S3
	vaddudm		$ACC0,$ACC0,$T0
	vmuleuw		$T0,$I3,$S3
	vaddudm		$ACC1,$ACC1,$T0
	vmuleuw		$T0,$I4,$S3
	vaddudm		$ACC2,$ACC2,$T0
	vmuleuw		$T0,$I0,$R3
	vaddudm		$ACC3,$ACC3,$T0
	vmuleuw		$T0,$I1,$R3
	vaddudm		$ACC4,$ACC4,$T0

	 be?lvx_u	$_4,$x00,$const		# byte swap mask
	 lvx_u		$T1,$x00,$inp		# load next input block
	 lvx_u		$T2,$x10,$inp
	 lvx_u		$T3,$x20,$inp
	 lvx_u		$T4,$x30,$inp
	 be?vperm	$T1,$T1,$T1,$_4
	 be?vperm	$T2,$T2,$T2,$_4
	 be?vperm	$T3,$T3,$T3,$_4
	 be?vperm	$T4,$T4,$T4,$_4

	vmuleuw		$T0,$I1,$S4
	vaddudm		$ACC0,$ACC0,$T0
	vmuleuw		$T0,$I2,$S4
	vaddudm		$ACC1,$ACC1,$T0
	vmuleuw		$T0,$I3,$S4
	vaddudm		$ACC2,$ACC2,$T0
	vmuleuw		$T0,$I4,$S4
	vaddudm		$ACC3,$ACC3,$T0
	vmuleuw		$T0,$I0,$R4
	vaddudm		$ACC4,$ACC4,$T0

	 vpermdi	$I0,$T1,$T2,0b00	# smash input to base 2^26
	 vspltisb	$_4,4
	 vperm		$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
	 vpermdi	$I3,$T1,$T2,0b11

	# (hash + inp[0:1]) * r^4
	vmulouw		$T0,$H0,$R0
	vaddudm		$ACC0,$ACC0,$T0
	vmulouw		$T0,$H1,$R0
	vaddudm		$ACC1,$ACC1,$T0
	vmulouw		$T0,$H2,$R0
	vaddudm		$ACC2,$ACC2,$T0
	vmulouw		$T0,$H3,$R0
	vaddudm		$ACC3,$ACC3,$T0
	vmulouw		$T0,$H4,$R0
	vaddudm		$ACC4,$ACC4,$T0

	 vpermdi	$T1,$T3,$T4,0b00
	 vperm		$T2,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
	 vpermdi	$T3,$T3,$T4,0b11

	vmulouw		$T0,$H2,$S3
	vaddudm		$ACC0,$ACC0,$T0
	vmulouw		$T0,$H3,$S3
	vaddudm		$ACC1,$ACC1,$T0
	vmulouw		$T0,$H4,$S3
	vaddudm		$ACC2,$ACC2,$T0
	vmulouw		$T0,$H0,$R3
	vaddudm		$ACC3,$ACC3,$T0
	lvx		$S1,$x10,$_ctx
	vmulouw		$T0,$H1,$R3
	vaddudm		$ACC4,$ACC4,$T0
	lvx		$R1,$x00,$_ctx

	 vsrd		$I1,$I0,$_26
	 vsrd		$I2,$I2,$_4
	 vsrd		$I4,$I3,$_40
	 vsrd		$I3,$I3,$_14

	vmulouw		$T0,$H1,$S4
	vaddudm		$ACC0,$ACC0,$T0
	vmulouw		$T0,$H2,$S4
	vaddudm		$ACC1,$ACC1,$T0
	vmulouw		$T0,$H3,$S4
	vaddudm		$ACC2,$ACC2,$T0
	vmulouw		$T0,$H4,$S4
	vaddudm		$ACC3,$ACC3,$T0
	lvx		$S2,$x30,$_ctx
	vmulouw		$T0,$H0,$R4
	vaddudm		$ACC4,$ACC4,$T0
	lvx		$R2,$x20,$_ctx

	 vand		$I0,$I0,$mask26
	 vand		$I1,$I1,$mask26
	 vand		$I2,$I2,$mask26
	 vand		$I3,$I3,$mask26

	vmulouw		$T0,$H4,$S1
	vaddudm		$ACC0,$ACC0,$T0
	vmulouw		$T0,$H0,$R1
	vaddudm		$ACC1,$ACC1,$T0
	vmulouw		$T0,$H1,$R1
	vaddudm		$ACC2,$ACC2,$T0
	vmulouw		$T0,$H2,$R1
	vaddudm		$ACC3,$ACC3,$T0
	vmulouw		$T0,$H3,$R1
	vaddudm		$ACC4,$ACC4,$T0

	 vsrd		$T2,$T2,$_4
	 vsrd		$_4,$T1,$_26
	 vsrd		$T4,$T3,$_40
	 vsrd		$T3,$T3,$_14

	vmulouw		$T0,$H3,$S2
	vaddudm		$ACC0,$ACC0,$T0
	vmulouw		$T0,$H4,$S2
	vaddudm		$ACC1,$ACC1,$T0
	vmulouw		$T0,$H0,$R2
	vaddudm		$ACC2,$ACC2,$T0
	vmulouw		$T0,$H1,$R2
	vaddudm		$ACC3,$ACC3,$T0
	vmulouw		$T0,$H2,$R2
	vaddudm		$ACC4,$ACC4,$T0

	 vand		$T1,$T1,$mask26
	 vand		$_4,$_4,$mask26
	 vand		$T2,$T2,$mask26
	 vand		$T3,$T3,$mask26

	################################################################
	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	# and P. Schwabe
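	# (carries are propagated along two interleaved chains, one
	# starting at h0 and one at h3, so that their latencies overlap)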
1586
1587         vspltisb        $T0,2
1588         vsrd            $H4,$ACC3,$_26
1589         vsrd            $H1,$ACC0,$_26
1590         vand            $H3,$ACC3,$mask26
1591         vand            $H0,$ACC0,$mask26
1592         vaddudm         $H4,$H4,$ACC4           # h3 -> h4
1593         vaddudm         $H1,$H1,$ACC1           # h0 -> h1
1594
1595          vmrgow         $I4,$T4,$I4
1596          vmrgow         $I0,$T1,$I0
1597          vmrgow         $I1,$_4,$I1
1598          vmrgow         $I2,$T2,$I2
1599          vmrgow         $I3,$T3,$I3
1600          vor            $I4,$I4,$padbits
1601
1602         vsrd            $ACC4,$H4,$_26
1603         vsrd            $ACC1,$H1,$_26
1604         vand            $H4,$H4,$mask26
1605         vand            $H1,$H1,$mask26
1606         vaddudm         $H0,$H0,$ACC4
1607         vaddudm         $H2,$ACC2,$ACC1         # h1 -> h2
1608
1609         vsld            $ACC4,$ACC4,$T0         # <<2
1610         vsrd            $ACC2,$H2,$_26
1611         vand            $H2,$H2,$mask26
1612         vaddudm         $H0,$H0,$ACC4           # h4 -> h0
1613         vaddudm         $H3,$H3,$ACC2           # h2 -> h3
1614
1615         vsrd            $ACC0,$H0,$_26
1616         vsrd            $ACC3,$H3,$_26
1617         vand            $H0,$H0,$mask26
1618         vand            $H3,$H3,$mask26
1619         vaddudm         $H1,$H1,$ACC0           # h0 -> h1
1620         vaddudm         $H4,$H4,$ACC3           # h3 -> h4
1621
1622         addi            $inp,$inp,0x40
1623         bdnz            Loop_vsx
1624
1625         neg             $len,$len
1626         andi.           $len,$len,0x30
1627         sub             $inp,$inp,$len

        lvx_u           $R0,$x30,$ctx           # load all powers
        lvx_u           $R1,$x00,$ctx_
        lvx_u           $S1,$x10,$ctx_
        lvx_u           $R2,$x20,$ctx_
        lvx_u           $S2,$x30,$ctx_

Last_vsx:
        vmuleuw         $ACC0,$I0,$R0
        vmuleuw         $ACC1,$I1,$R0
        vmuleuw         $ACC2,$I2,$R0
        vmuleuw         $ACC3,$I3,$R0
        vmuleuw         $ACC4,$I4,$R0

        vmuleuw         $T0,$I4,$S1
        vaddudm         $ACC0,$ACC0,$T0
        vmuleuw         $T0,$I0,$R1
        vaddudm         $ACC1,$ACC1,$T0
        vmuleuw         $T0,$I1,$R1
        vaddudm         $ACC2,$ACC2,$T0
        vmuleuw         $T0,$I2,$R1
        vaddudm         $ACC3,$ACC3,$T0
        lvx_u           $S3,$x50,$ctx_
        vmuleuw         $T0,$I3,$R1
        vaddudm         $ACC4,$ACC4,$T0
        lvx_u           $R3,$x40,$ctx_

         vaddudm        $H2,$H2,$I2
         vaddudm        $H0,$H0,$I0
         vaddudm        $H3,$H3,$I3
         vaddudm        $H1,$H1,$I1
         vaddudm        $H4,$H4,$I4

        vmuleuw         $T0,$I3,$S2
        vaddudm         $ACC0,$ACC0,$T0
        vmuleuw         $T0,$I4,$S2
        vaddudm         $ACC1,$ACC1,$T0
        vmuleuw         $T0,$I0,$R2
        vaddudm         $ACC2,$ACC2,$T0
        vmuleuw         $T0,$I1,$R2
        vaddudm         $ACC3,$ACC3,$T0
        lvx_u           $S4,$x70,$ctx_
        vmuleuw         $T0,$I2,$R2
        vaddudm         $ACC4,$ACC4,$T0
        lvx_u           $R4,$x60,$ctx_

        vmuleuw         $T0,$I2,$S3
        vaddudm         $ACC0,$ACC0,$T0
        vmuleuw         $T0,$I3,$S3
        vaddudm         $ACC1,$ACC1,$T0
        vmuleuw         $T0,$I4,$S3
        vaddudm         $ACC2,$ACC2,$T0
        vmuleuw         $T0,$I0,$R3
        vaddudm         $ACC3,$ACC3,$T0
        vmuleuw         $T0,$I1,$R3
        vaddudm         $ACC4,$ACC4,$T0

        vmuleuw         $T0,$I1,$S4
        vaddudm         $ACC0,$ACC0,$T0
        vmuleuw         $T0,$I2,$S4
        vaddudm         $ACC1,$ACC1,$T0
        vmuleuw         $T0,$I3,$S4
        vaddudm         $ACC2,$ACC2,$T0
        vmuleuw         $T0,$I4,$S4
        vaddudm         $ACC3,$ACC3,$T0
        vmuleuw         $T0,$I0,$R4
        vaddudm         $ACC4,$ACC4,$T0

        # (hash + inp[0:1]) * r^4
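        # vmuleuw/vmulouw multiply the even/odd 32-bit words of each
        # doubleword lane into full 64-bit products, so the updated
        # hash (odd words, scaled by r^4 below) shares the same
        # accumulators with the trailing input blocks handled above.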
        vmulouw         $T0,$H0,$R0
        vaddudm         $ACC0,$ACC0,$T0
        vmulouw         $T0,$H1,$R0
        vaddudm         $ACC1,$ACC1,$T0
        vmulouw         $T0,$H2,$R0
        vaddudm         $ACC2,$ACC2,$T0
        vmulouw         $T0,$H3,$R0
        vaddudm         $ACC3,$ACC3,$T0
        vmulouw         $T0,$H4,$R0
        vaddudm         $ACC4,$ACC4,$T0

        vmulouw         $T0,$H2,$S3
        vaddudm         $ACC0,$ACC0,$T0
        vmulouw         $T0,$H3,$S3
        vaddudm         $ACC1,$ACC1,$T0
        vmulouw         $T0,$H4,$S3
        vaddudm         $ACC2,$ACC2,$T0
        vmulouw         $T0,$H0,$R3
        vaddudm         $ACC3,$ACC3,$T0
        lvx_u           $S1,$x10,$ctx_
        vmulouw         $T0,$H1,$R3
        vaddudm         $ACC4,$ACC4,$T0
        lvx_u           $R1,$x00,$ctx_

        vmulouw         $T0,$H1,$S4
        vaddudm         $ACC0,$ACC0,$T0
        vmulouw         $T0,$H2,$S4
        vaddudm         $ACC1,$ACC1,$T0
        vmulouw         $T0,$H3,$S4
        vaddudm         $ACC2,$ACC2,$T0
        vmulouw         $T0,$H4,$S4
        vaddudm         $ACC3,$ACC3,$T0
        lvx_u           $S2,$x30,$ctx_
        vmulouw         $T0,$H0,$R4
        vaddudm         $ACC4,$ACC4,$T0
        lvx_u           $R2,$x20,$ctx_

        vmulouw         $T0,$H4,$S1
        vaddudm         $ACC0,$ACC0,$T0
        vmulouw         $T0,$H0,$R1
        vaddudm         $ACC1,$ACC1,$T0
        vmulouw         $T0,$H1,$R1
        vaddudm         $ACC2,$ACC2,$T0
        vmulouw         $T0,$H2,$R1
        vaddudm         $ACC3,$ACC3,$T0
        vmulouw         $T0,$H3,$R1
        vaddudm         $ACC4,$ACC4,$T0

        vmulouw         $T0,$H3,$S2
        vaddudm         $ACC0,$ACC0,$T0
        vmulouw         $T0,$H4,$S2
        vaddudm         $ACC1,$ACC1,$T0
        vmulouw         $T0,$H0,$R2
        vaddudm         $ACC2,$ACC2,$T0
        vmulouw         $T0,$H1,$R2
        vaddudm         $ACC3,$ACC3,$T0
        vmulouw         $T0,$H2,$R2
        vaddudm         $ACC4,$ACC4,$T0

        ################################################################
        # horizontal addition

        vpermdi         $H0,$ACC0,$ACC0,0b10
        vpermdi         $H1,$ACC1,$ACC1,0b10
        vpermdi         $H2,$ACC2,$ACC2,0b10
        vpermdi         $H3,$ACC3,$ACC3,0b10
        vpermdi         $H4,$ACC4,$ACC4,0b10
        vaddudm         $ACC0,$ACC0,$H0
        vaddudm         $ACC1,$ACC1,$H1
        vaddudm         $ACC2,$ACC2,$H2
        vaddudm         $ACC3,$ACC3,$H3
        vaddudm         $ACC4,$ACC4,$H4
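        # vpermdi with selector 0b10 swapped the two doublewords of
        # each accumulator, so the additions above leave the sum over
        # all processed blocks in both 64-bit lanes of $ACC0-$ACC4.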

        ################################################################
        # lazy reduction

        vspltisb        $T0,2
        vsrd            $H4,$ACC3,$_26
        vsrd            $H1,$ACC0,$_26
        vand            $H3,$ACC3,$mask26
        vand            $H0,$ACC0,$mask26
        vaddudm         $H4,$H4,$ACC4           # h3 -> h4
        vaddudm         $H1,$H1,$ACC1           # h0 -> h1

        vsrd            $ACC4,$H4,$_26
        vsrd            $ACC1,$H1,$_26
        vand            $H4,$H4,$mask26
        vand            $H1,$H1,$mask26
        vaddudm         $H0,$H0,$ACC4
        vaddudm         $H2,$ACC2,$ACC1         # h1 -> h2

        vsld            $ACC4,$ACC4,$T0         # <<2
        vsrd            $ACC2,$H2,$_26
        vand            $H2,$H2,$mask26
        vaddudm         $H0,$H0,$ACC4           # h4 -> h0
        vaddudm         $H3,$H3,$ACC2           # h2 -> h3

        vsrd            $ACC0,$H0,$_26
        vsrd            $ACC3,$H3,$_26
        vand            $H0,$H0,$mask26
        vand            $H3,$H3,$mask26
        vaddudm         $H1,$H1,$ACC0           # h0 -> h1
        vaddudm         $H4,$H4,$ACC3           # h3 -> h4

        beq             Ldone_vsx

        add             r6,$const,$len

        be?lvx_u        $_4,$x00,$const         # byte swap mask
        lvx_u           $T1,$x00,$inp           # load last partial input block
        lvx_u           $T2,$x10,$inp
        lvx_u           $T3,$x20,$inp
        lvx_u           $T4,$x30,$inp
        be?vperm        $T1,$T1,$T1,$_4
        be?vperm        $T2,$T2,$T2,$_4
        be?vperm        $T3,$T3,$T3,$_4
        be?vperm        $T4,$T4,$T4,$_4

        vpermdi         $I0,$T1,$T2,0b00        # smash input to base 2^26
        vspltisb        $_4,4
        vperm           $I2,$T1,$T2,$I2perm     # 0x...0e0f0001...1e1f1011
        vpermdi         $I3,$T1,$T2,0b11

        vsrd            $I1,$I0,$_26
        vsrd            $I2,$I2,$_4
        vsrd            $I4,$I3,$_40
        vsrd            $I3,$I3,$_14
        vand            $I0,$I0,$mask26
        vand            $I1,$I1,$mask26
        vand            $I2,$I2,$mask26
        vand            $I3,$I3,$mask26
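        # each 128-bit block is now five 26-bit limbs: bits 0-25 and
        # 26-51 come straight from the low doubleword, bits 52-77 are
        # gathered by the byte permute plus the 4-bit shift, and bits
        # 78-103 and 104-129 are the high doubleword shifted right by
        # 14 and 40 respectively.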

        vpermdi         $T0,$T3,$T4,0b00
        vperm           $T1,$T3,$T4,$I2perm     # 0x...0e0f0001...1e1f1011
        vpermdi         $T2,$T3,$T4,0b11

        lvx_u           $ACC0,$x00,r6
        lvx_u           $ACC1,$x30,r6

        vsrd            $T3,$T0,$_26
        vsrd            $T1,$T1,$_4
        vsrd            $T4,$T2,$_40
        vsrd            $T2,$T2,$_14
        vand            $T0,$T0,$mask26
        vand            $T3,$T3,$mask26
        vand            $T1,$T1,$mask26
        vand            $T2,$T2,$mask26

        # inp[2]:inp[0]:inp[3]:inp[1]
        vmrgow          $I4,$T4,$I4
        vmrgow          $I0,$T0,$I0
        vmrgow          $I1,$T3,$I1
        vmrgow          $I2,$T1,$I2
        vmrgow          $I3,$T2,$I3
        vor             $I4,$I4,$padbits
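        # vmrgow pairs the odd words of the two limb sets, yielding
        # the inp[2]:inp[0]:inp[3]:inp[1] lane order noted above; the
        # 2^128 pad bit (bit 24 of the top limb) is OR-ed into every
        # lane.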

        vperm           $H0,$H0,$H0,$ACC0       # move hash to right lane
        vand            $I0,$I0,    $ACC1       # mask redundant input lane[s]
        vperm           $H1,$H1,$H1,$ACC0
        vand            $I1,$I1,    $ACC1
        vperm           $H2,$H2,$H2,$ACC0
        vand            $I2,$I2,    $ACC1
        vperm           $H3,$H3,$H3,$ACC0
        vand            $I3,$I3,    $ACC1
        vperm           $H4,$H4,$H4,$ACC0
        vand            $I4,$I4,    $ACC1
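        # $ACC0/$ACC1 hold the "magic tail masks" loaded above: the
        # vperm pattern steers the running hash into the lane that is
        # still multiplied by the appropriate power of r, while the
        # vand mask clears input lanes lying past the end of the
        # message.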

        vaddudm         $I0,$I0,$H0             # accumulate hash
        vxor            $H0,$H0,$H0             # wipe hash value
        vaddudm         $I1,$I1,$H1
        vxor            $H1,$H1,$H1
        vaddudm         $I2,$I2,$H2
        vxor            $H2,$H2,$H2
        vaddudm         $I3,$I3,$H3
        vxor            $H3,$H3,$H3
        vaddudm         $I4,$I4,$H4
        vxor            $H4,$H4,$H4

        xor.            $len,$len,$len          # zero $len and set CR0.eq,
                                                # so that Last_vsx exits
                                                # through Ldone_vsx this time
        b               Last_vsx

.align  4
Ldone_vsx:
        $POP    r0,`$VSXFRAME+$LRSAVE`($sp)
        li      $x10,4
        li      $x20,8
        li      $x30,12
        li      $x40,16
        stvwx_u $H0,$x00,$ctx                   # store hash
        stvwx_u $H1,$x10,$ctx
        stvwx_u $H2,$x20,$ctx
        stvwx_u $H3,$x30,$ctx
        stvwx_u $H4,$x40,$ctx
        lwz     r12,`$VSXFRAME-$SIZE_T*5-4`($sp) # pull vrsave
        mtlr    r0
        li      r10,`15+$LOCALS+128`
        li      r11,`31+$LOCALS+128`
        mtspr   256,r12                         # restore vrsave
        lvx     v20,r10,$sp
        addi    r10,r10,32
        lvx     v21,r10,$sp
        addi    r10,r10,32
        lvx     v22,r11,$sp
        addi    r11,r11,32
        lvx     v23,r10,$sp
        addi    r10,r10,32
        lvx     v24,r11,$sp
        addi    r11,r11,32
        lvx     v25,r10,$sp
        addi    r10,r10,32
        lvx     v26,r11,$sp
        addi    r11,r11,32
        lvx     v27,r10,$sp
        addi    r10,r10,32
        lvx     v28,r11,$sp
        addi    r11,r11,32
        lvx     v29,r10,$sp
        addi    r10,r10,32
        lvx     v30,r11,$sp
        lvx     v31,r10,$sp
        $POP    r27,`$VSXFRAME-$SIZE_T*5`($sp)
        $POP    r28,`$VSXFRAME-$SIZE_T*4`($sp)
        $POP    r29,`$VSXFRAME-$SIZE_T*3`($sp)
        $POP    r30,`$VSXFRAME-$SIZE_T*2`($sp)
        $POP    r31,`$VSXFRAME-$SIZE_T*1`($sp)
        addi    $sp,$sp,$VSXFRAME
        blr
        .long   0
        .byte   0,12,0x04,1,0x80,5,4,0
        .long   0
.size   __poly1305_blocks_vsx,.-__poly1305_blocks_vsx

.align  6
LPICmeup:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $const      # vvvvvv "distance" between . and 1st data entry
        addi    $const,$const,`64-8`
        mtlr    r0
        blr
        .long   0
        .byte   0,12,0x14,0,0,0,0,0
        .space  `64-9*4`
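# bcl 20,31,\$+4 is the conventional position-independent way to read
# the current instruction address on PowerPC: it branches to the next
# instruction while depositing that address in LR, and this particular
# form is special-cased by processors so the return-address predictor
# is not disturbed; $const is then advanced to the constants below.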

.quad   0x0000000003ffffff,0x0000000003ffffff   # mask26
.quad   0x000000000000001a,0x000000000000001a   # _26
.quad   0x0000000000000028,0x0000000000000028   # _40
.quad   0x000000000e0f0001,0x000000001e1f1011   # I2perm
.quad   0x0100000001000000,0x0100000001000000   # padbits
.quad   0x0706050403020100,0x0f0e0d0c0b0a0908   # byte swap for big-endian

.quad   0x0000000000000000,0x0000000004050607   # magic tail masks
.quad   0x0405060700000000,0x0000000000000000
.quad   0x0000000000000000,0x0405060700000000

.quad   0xffffffff00000000,0xffffffffffffffff
.quad   0xffffffff00000000,0xffffffff00000000
.quad   0x0000000000000000,0xffffffff00000000
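# the three "magic" quads are vperm patterns and the following three
# are lane masks; which row of each trio is used depends on the
# residual length in $len, i.e. on whether 1, 2 or 3 blocks remain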
___
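
# For reference, a minimal scalar sketch of the carry pass that the
# lazy reduction above performs on the five base 2^26 hash limbs.
# The sub is purely illustrative (it is never called, its name is
# made up here, and 64-bit Perl is assumed):
sub __lazy_reduce_sketch {
    my @h = @_;                         # five limbs, h[0] least significant
    my $mask26 = (1<<26)-1;
    my $c;
    $c = $h[3]>>26; $h[3] &= $mask26; $h[4] += $c;      # h3 -> h4
    $c = $h[0]>>26; $h[0] &= $mask26; $h[1] += $c;      # h0 -> h1
    $c = $h[4]>>26; $h[4] &= $mask26;
    $h[0] += $c + ($c<<2);              # h4 -> h0, carry*5, 2^130 = 5 mod p
    $c = $h[1]>>26; $h[1] &= $mask26; $h[2] += $c;      # h1 -> h2
    $c = $h[2]>>26; $h[2] &= $mask26; $h[3] += $c;      # h2 -> h3
    $c = $h[0]>>26; $h[0] &= $mask26; $h[1] += $c;      # h0 -> h1
    $c = $h[3]>>26; $h[3] &= $mask26; $h[4] += $c;      # h3 -> h4
    return @h;
}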
}}}
$code.=<<___;
.asciz  "Poly1305 for PPC, CRYPTOGAMS by \@dot-asm"
___

foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/ge;

        # instructions prefixed with 'be?' or 'le?' are endian-specific
        # and are kept or commented out according to the target
        # endianness...
        if ($flavour !~ /le$/) {        # big-endian
            s/be\?//            or
            s/le\?/#le#/
        } else {                        # little-endian
            s/le\?//            or
            s/be\?/#be#/
        }
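        # e.g. "be?lvx_u" is assembled as-is for big-endian flavours,
        # but becomes the comment "#be#lvx_u" for little-endian ones,
        # and vice versa for "le?"-prefixed instructions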

        print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";