c8636a46edf08c228bf1098a59f26ab69e6a8afc
[openssl.git] / crypto / poly1305 / asm / poly1305-ppcfp.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements Poly1305 hash for PowerPC FPU.
11 #
12 # June 2015
13 #
14 # Numbers are cycles per processed byte with poly1305_blocks alone,
15 # and improvement coefficients relative to gcc-generated code.
16 #
17 # Freescale e300        9.78/+30%
18 # PPC74x0               6.92/+50%
19 # PPC970                6.03/+80%
20 # POWER7                3.50/+30%
21 # POWER8                3.75/+10%
22
23 $flavour = shift;               # 1st argument, matched below against /64/, /32/ and /le$/
24 # Pointer size, LR save offset and size-specific mnemonics used by the asm templates below.
25 if ($flavour =~ /64/) {
26         $SIZE_T =8;             # bytes per saved GPR
27         $LRSAVE =2*$SIZE_T;     # LR save offset, used as $FRAME+$LRSAVE after the frame is pushed
28         $UCMP   ="cmpld";       # unsigned compare
29         $STU    ="stdu";        # store with update, pushes the stack frame
30         $POP    ="ld";          # GPR load
31         $PUSH   ="std";         # GPR store
32 } elsif ($flavour =~ /32/) {
33         $SIZE_T =4;
34         $LRSAVE =$SIZE_T;
35         $UCMP   ="cmplw";
36         $STU    ="stwu";
37         $POP    ="lwz";
38         $PUSH   ="stw";
39 } else { die "nonsense $flavour"; }
40 # 4 for flavours ending in "le", otherwise 0; XOR-ed into word offsets to pick one half of an 8-byte slot.
41 $LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
42 # Indexed word load that yields the little-endian reading of memory: plain on LE, byte-reversed on BE.
43 $LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";
44
45 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;       # directory part of this script's own path
46 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
47 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
48 die "can't locate ppc-xlate.pl";
49 # Pipe all output through ppc-xlate.pl; "or" (not "||", which would bind to shift) so a failed open dies.
50 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
51
52 $LOCALS=6*$SIZE_T;              # size of init's minimal frame; base offset of the scratch slots in blocks
53 $FRAME=$LOCALS+6*8+18*8;        # blocks' frame: 8-byte scratch slots plus the save area for f14-f31
54
55 my $sp="r1";
56 # Integer registers: r3-r6 carry the arguments; note that $i3 shares r6 with $padbit.
57 my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
58 my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));
59 # All 32 FPRs, f0-f31, are named here in order.
60 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
61     $two0,$two32,$two64,$two96,$two130,$five_two130,
62     $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
63     $s2lo,$s2hi,$s3lo,$s3hi,
64     $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));
65 # borrowings: the names below alias the carry registers c0-c3, i.e. each pair shares one FPR
66 my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
67 my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
68 my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);
69
70 $code.=<<___;
71 .machine        "any"
72 .text
73         # poly1305_init_fpu(r3 = context, r4 = key or 0): store biased zero hash, split key and constants. Returns 0.
74 .globl  .poly1305_init_fpu
75 .align  6
76 .poly1305_init_fpu:
77         $STU    $sp,-$LOCALS($sp)               # minimal frame
78         mflr    $padbit
79         $PUSH   $padbit,`$LOCALS+$LRSAVE`($sp)
80         # the call below overwrites lr and leaves the constant table address in r5
81         bl      LPICmeup
82
83         xor     r0,r0,r0
84         mtlr    $padbit                         # restore lr
85
86         lfd     $two0,8*0($len)                 # load constants
87         lfd     $two32,8*1($len)
88         lfd     $two64,8*2($len)
89         lfd     $two96,8*3($len)
90         lfd     $two130,8*4($len)
91         lfd     $five_two130,8*5($len)
92
93         stfd    $two0,8*0($ctx)                 # initial hash value, biased 0
94         stfd    $two32,8*1($ctx)
95         stfd    $two64,8*2($ctx)
96         stfd    $two96,8*3($ctx)
97         # a zero key pointer means only the hash value is initialised
98         $UCMP   $inp,r0
99         beq-    Lno_key
100
101         lfd     $h3lo,8*13($len)                # new fpscr
102         mffs    $h3hi                           # old fpscr
103
104         stfd    $two0,8*4($ctx)                 # key "template"
105         stfd    $two32,8*5($ctx)
106         stfd    $two64,8*6($ctx)
107         stfd    $two96,8*7($ctx)
108
109         li      $in1,4
110         li      $in2,8
111         li      $in3,12
112         $LWXLE  $in0,0,$inp                     # load key
113         $LWXLE  $in1,$in1,$inp
114         $LWXLE  $in2,$in2,$inp
115         $LWXLE  $in3,$in3,$inp
116         # clear the key bits selected by the two masks below
117         lis     $i1,0xf000                      #   0xf0000000
118         ori     $i2,$i1,3                       #   0xf0000003
119         andc    $in0,$in0,$i1                   # &=0x0fffffff
120         andc    $in1,$in1,$i2                   # &=0x0ffffffc
121         andc    $in2,$in2,$i2
122         andc    $in3,$in3,$i2
123         # each key word overwrites the low-order word of its template double
124         stw     $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx)     # fill "template"
125         stw     $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
126         stw     $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
127         stw     $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
128
129         mtfsf   255,$h3lo                       # install the new fpscr loaded above
130         stfd    $two0,8*18($ctx)                # copy constants to context
131         stfd    $two32,8*19($ctx)
132         stfd    $two64,8*20($ctx)
133         stfd    $two96,8*21($ctx)
134         stfd    $two130,8*22($ctx)
135         stfd    $five_two130,8*23($ctx)
136
137         lfd     $h0lo,8*4($ctx)                 # load [biased] key
138         lfd     $h1lo,8*5($ctx)
139         lfd     $h2lo,8*6($ctx)
140         lfd     $h3lo,8*7($ctx)
141         # subtracting the template constant removes the bias
142         fsub    $h0lo,$h0lo,$two0               # r0
143         fsub    $h1lo,$h1lo,$two32              # r1
144         fsub    $h2lo,$h2lo,$two64              # r2
145         fsub    $h3lo,$h3lo,$two96              # r3
146
147         lfd     $two0,8*6($len)                 # more constants
148         lfd     $two32,8*7($len)
149         lfd     $two64,8*8($len)
150         lfd     $two96,8*9($len)
151
152         fmul    $h1hi,$h1lo,$five_two130        # s1
153         fmul    $h2hi,$h2lo,$five_two130        # s2
154          stfd   $h3hi,8*15($ctx)                # borrow slot for original fpscr
155         fmul    $h3hi,$h3lo,$five_two130        # s3
156         # split r0-r3: add then subtract a large constant to get the high part, low part is the rest
157         fadd    $h0hi,$h0lo,$two0
158          stfd   $h1hi,8*12($ctx)                # put aside for now
159         fadd    $h1hi,$h1lo,$two32
160          stfd   $h2hi,8*13($ctx)
161         fadd    $h2hi,$h2lo,$two64
162          stfd   $h3hi,8*14($ctx)
163         fadd    $h3hi,$h3lo,$two96
164
165         fsub    $h0hi,$h0hi,$two0
166         fsub    $h1hi,$h1hi,$two32
167         fsub    $h2hi,$h2hi,$two64
168         fsub    $h3hi,$h3hi,$two96
169
170         lfd     $two0,8*10($len)                # more constants
171         lfd     $two32,8*11($len)
172         lfd     $two64,8*12($len)
173
174         fsub    $h0lo,$h0lo,$h0hi
175         fsub    $h1lo,$h1lo,$h1hi
176         fsub    $h2lo,$h2lo,$h2hi
177         fsub    $h3lo,$h3lo,$h3hi
178
179         stfd    $h0hi,8*5($ctx)                 # r0hi
180         stfd    $h1hi,8*7($ctx)                 # r1hi
181         stfd    $h2hi,8*9($ctx)                 # r2hi
182         stfd    $h3hi,8*11($ctx)                # r3hi
183
184         stfd    $h0lo,8*4($ctx)                 # r0lo
185         stfd    $h1lo,8*6($ctx)                 # r1lo
186         stfd    $h2lo,8*8($ctx)                 # r2lo
187         stfd    $h3lo,8*10($ctx)                # r3lo
188
189         lfd     $h1lo,8*12($ctx)                # s1
190         lfd     $h2lo,8*13($ctx)                # s2
191         lfd     $h3lo,8*14($ctx)                # s3
192         lfd     $h0lo,8*15($ctx)                # pull original fpscr
193         # split s1-s3 the same way, using the three constants loaded last
194         fadd    $h1hi,$h1lo,$two0
195         fadd    $h2hi,$h2lo,$two32
196         fadd    $h3hi,$h3lo,$two64
197
198         fsub    $h1hi,$h1hi,$two0
199         fsub    $h2hi,$h2hi,$two32
200         fsub    $h3hi,$h3hi,$two64
201
202         fsub    $h1lo,$h1lo,$h1hi
203         fsub    $h2lo,$h2lo,$h2hi
204         fsub    $h3lo,$h3lo,$h3hi
205
206         stfd    $h1hi,8*13($ctx)                # s1hi
207         stfd    $h2hi,8*15($ctx)                # s2hi
208         stfd    $h3hi,8*17($ctx)                # s3hi
209
210         stfd    $h1lo,8*12($ctx)                # s1lo
211         stfd    $h2lo,8*14($ctx)                # s2lo
212         stfd    $h3lo,8*16($ctx)                # s3lo
213
214         mtfsf   255,$h0lo                       # restore fpscr
215 Lno_key:
216         xor     r3,r3,r3                        # return value 0
217         addi    $sp,$sp,$LOCALS
218         blr
219         .long   0
220         .byte   0,12,4,1,0x80,0,2,0
221 .size   .poly1305_init_fpu,.-.poly1305_init_fpu
222         # poly1305_blocks_fpu(r3 = context, r4 = input, r5 = length in bytes, r6 = padbit): absorb whole 16-byte blocks.
223 .globl  .poly1305_blocks_fpu
224 .align  4
225 .poly1305_blocks_fpu:
226         srwi.   $len,$len,4                     # number of 16-byte blocks
227         beq-    Labort                          # less than one block: nothing to do
228
229         $STU    $sp,-$FRAME($sp)
230         mflr    r0
231         stfd    f14,`$FRAME-8*18`($sp)
232         stfd    f15,`$FRAME-8*17`($sp)
233         stfd    f16,`$FRAME-8*16`($sp)
234         stfd    f17,`$FRAME-8*15`($sp)
235         stfd    f18,`$FRAME-8*14`($sp)
236         stfd    f19,`$FRAME-8*13`($sp)
237         stfd    f20,`$FRAME-8*12`($sp)
238         stfd    f21,`$FRAME-8*11`($sp)
239         stfd    f22,`$FRAME-8*10`($sp)
240         stfd    f23,`$FRAME-8*9`($sp)
241         stfd    f24,`$FRAME-8*8`($sp)
242         stfd    f25,`$FRAME-8*7`($sp)
243         stfd    f26,`$FRAME-8*6`($sp)
244         stfd    f27,`$FRAME-8*5`($sp)
245         stfd    f28,`$FRAME-8*4`($sp)
246         stfd    f29,`$FRAME-8*3`($sp)
247         stfd    f30,`$FRAME-8*2`($sp)
248         stfd    f31,`$FRAME-8*1`($sp)
249         $PUSH   r0,`$FRAME+$LRSAVE`($sp)
250         # ctr = block count; scratch slot 4 receives the new fpscr image, words 0 and 1
251         xor     r0,r0,r0
252         li      $in3,1
253         mtctr   $len
254         neg     $len,$len                       # negative count, stepped towards zero by addic below
255         stw     r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
256         stw     $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
257
258         lfd     $two0,8*18($ctx)                # load constants
259         lfd     $two32,8*19($ctx)
260         lfd     $two64,8*20($ctx)
261         lfd     $two96,8*21($ctx)
262         lfd     $two130,8*22($ctx)
263         lfd     $five_two130,8*23($ctx)
264
265         lfd     $h0lo,8*0($ctx)                 # load [biased] hash value
266         lfd     $h1lo,8*1($ctx)
267         lfd     $h2lo,8*2($ctx)
268         lfd     $h3lo,8*3($ctx)
269
270         stfd    $two0,`$LOCALS+8*0`($sp)        # input "template"
271         oris    $in3,$padbit,`(1023+52+96)<<4`  # 4th template high word: exponent of 2^(52+96) with padbit merged in
272         stfd    $two32,`$LOCALS+8*1`($sp)
273         stfd    $two64,`$LOCALS+8*2`($sp)
274         stw     $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
275         # r6 is free from here on and becomes the third load index
276         li      $i1,4
277         li      $i2,8
278         li      $i3,12
279         $LWXLE  $in0,0,$inp                     # load input
280         $LWXLE  $in1,$i1,$inp
281         $LWXLE  $in2,$i2,$inp
282         $LWXLE  $in3,$i3,$inp
283         addi    $inp,$inp,16
284
285         stw     $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)      # fill "template"
286         stw     $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
287         stw     $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
288         stw     $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
289
290         mffs    $x0                             # original fpscr
291         lfd     $x1,`$LOCALS+8*4`($sp)          # new fpscr
292         lfd     $r0lo,8*4($ctx)                 # load key
293         lfd     $r0hi,8*5($ctx)
294         lfd     $r1lo,8*6($ctx)
295         lfd     $r1hi,8*7($ctx)
296         lfd     $r2lo,8*8($ctx)
297         lfd     $r2hi,8*9($ctx)
298         lfd     $r3lo,8*10($ctx)
299         lfd     $r3hi,8*11($ctx)
300         lfd     $s1lo,8*12($ctx)
301         lfd     $s1hi,8*13($ctx)
302         lfd     $s2lo,8*14($ctx)
303         lfd     $s2hi,8*15($ctx)
304         lfd     $s3lo,8*16($ctx)
305         lfd     $s3hi,8*17($ctx)
306
307         stfd    $x0,`$LOCALS+8*4`($sp)          # save original fpscr
308         mtfsf   255,$x1
309         # r0 becomes 16 once the counter reaches zero, so the look-ahead load below re-reads the last block
310         addic   $len,$len,1
311         addze   r0,r0
312         slwi.   r0,r0,4
313         sub     $inp,$inp,r0                    # conditional rewind
314
315         lfd     $x0,`$LOCALS+8*0`($sp)
316         lfd     $x1,`$LOCALS+8*1`($sp)
317         lfd     $x2,`$LOCALS+8*2`($sp)
318         lfd     $x3,`$LOCALS+8*3`($sp)
319
320         fsub    $h0lo,$h0lo,$two0               # de-bias hash value
321          $LWXLE $in0,0,$inp                     # modulo-scheduled input load
322         fsub    $h1lo,$h1lo,$two32
323          $LWXLE $in1,$i1,$inp
324         fsub    $h2lo,$h2lo,$two64
325          $LWXLE $in2,$i2,$inp
326         fsub    $h3lo,$h3lo,$two96
327          $LWXLE $in3,$i3,$inp
328
329         fsub    $x0,$x0,$two0                   # de-bias input
330          addi   $inp,$inp,16
331         fsub    $x1,$x1,$two32
332         fsub    $x2,$x2,$two64
333         fsub    $x3,$x3,$two96
334
335         fadd    $x0,$x0,$h0lo                   # accumulate input
336          stw    $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
337         fadd    $x1,$x1,$h1lo
338          stw    $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
339         fadd    $x2,$x2,$h2lo
340          stw    $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
341         fadd    $x3,$x3,$h3lo
342          stw    $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
343
344         b       Lentry
345         # Loop: add the next block and propagate carries; Lentry: multiply by the key
346 .align  4
347 Loop:
348         fsub    $y0,$y0,$two0                   # de-bias input
349          addic  $len,$len,1
350         fsub    $y1,$y1,$two32
351          addze  r0,r0
352         fsub    $y2,$y2,$two64
353          slwi.  r0,r0,4
354         fsub    $y3,$y3,$two96
355          sub    $inp,$inp,r0                    # conditional rewind
356
357         fadd    $h0lo,$h0lo,$y0                 # accumulate input
358         fadd    $h0hi,$h0hi,$y1
359         fadd    $h2lo,$h2lo,$y2
360         fadd    $h2hi,$h2hi,$y3
361
362         ######################################### base 2^48 -> base 2^32
363         fadd    $c1lo,$h1lo,$two64
364          $LWXLE $in0,0,$inp                     # modulo-scheduled input load
365         fadd    $c1hi,$h1hi,$two64
366          $LWXLE $in1,$i1,$inp
367         fadd    $c3lo,$h3lo,$two130
368          $LWXLE $in2,$i2,$inp
369         fadd    $c3hi,$h3hi,$two130
370          $LWXLE $in3,$i3,$inp
371         fadd    $c0lo,$h0lo,$two32
372          addi   $inp,$inp,16
373         fadd    $c0hi,$h0hi,$two32
374         fadd    $c2lo,$h2lo,$two96
375         fadd    $c2hi,$h2hi,$two96
376
377         fsub    $c1lo,$c1lo,$two64
378          stw    $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)      # fill "template"
379         fsub    $c1hi,$c1hi,$two64
380          stw    $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
381         fsub    $c3lo,$c3lo,$two130
382          stw    $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
383         fsub    $c3hi,$c3hi,$two130
384          stw    $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
385         fsub    $c0lo,$c0lo,$two32
386         fsub    $c0hi,$c0hi,$two32
387         fsub    $c2lo,$c2lo,$two96
388         fsub    $c2hi,$c2hi,$two96
389         # take each carry out of its own limb ...
390         fsub    $h1lo,$h1lo,$c1lo
391         fsub    $h1hi,$h1hi,$c1hi
392         fsub    $h3lo,$h3lo,$c3lo
393         fsub    $h3hi,$h3hi,$c3hi
394         fsub    $h2lo,$h2lo,$c2lo
395         fsub    $h2hi,$h2hi,$c2hi
396         fsub    $h0lo,$h0lo,$c0lo
397         fsub    $h0hi,$h0hi,$c0hi
398         # ... and add it to the next one; the top carry wraps to limb 0 scaled by 5/2^130
399         fadd    $h1lo,$h1lo,$c0lo
400         fadd    $h1hi,$h1hi,$c0hi
401         fadd    $h3lo,$h3lo,$c2lo
402         fadd    $h3hi,$h3hi,$c2hi
403         fadd    $h2lo,$h2lo,$c1lo
404         fadd    $h2hi,$h2hi,$c1hi
405         fmadd   $h0lo,$c3lo,$five_two130,$h0lo
406         fmadd   $h0hi,$c3hi,$five_two130,$h0hi
407         # merge the lo and hi halves; the carries are dead now, so their registers get the constants back
408         fadd    $x1,$h1lo,$h1hi
409          lfd    $s1lo,8*12($ctx)                # reload constants
410         fadd    $x3,$h3lo,$h3hi
411          lfd    $s1hi,8*13($ctx)
412         fadd    $x2,$h2lo,$h2hi
413          lfd    $r3lo,8*10($ctx)
414         fadd    $x0,$h0lo,$h0hi
415          lfd    $r3hi,8*11($ctx)
416 Lentry:
417         fmul    $h0lo,$s3lo,$x1
418         fmul    $h0hi,$s3hi,$x1
419         fmul    $h2lo,$r1lo,$x1
420         fmul    $h2hi,$r1hi,$x1
421         fmul    $h1lo,$r0lo,$x1
422         fmul    $h1hi,$r0hi,$x1
423         fmul    $h3lo,$r2lo,$x1
424         fmul    $h3hi,$r2hi,$x1
425
426         fmadd   $h0lo,$s1lo,$x3,$h0lo
427         fmadd   $h0hi,$s1hi,$x3,$h0hi
428         fmadd   $h2lo,$s3lo,$x3,$h2lo
429         fmadd   $h2hi,$s3hi,$x3,$h2hi
430         fmadd   $h1lo,$s2lo,$x3,$h1lo
431         fmadd   $h1hi,$s2hi,$x3,$h1hi
432         fmadd   $h3lo,$r0lo,$x3,$h3lo
433         fmadd   $h3hi,$r0hi,$x3,$h3hi
434
435         fmadd   $h0lo,$s2lo,$x2,$h0lo
436         fmadd   $h0hi,$s2hi,$x2,$h0hi
437         fmadd   $h2lo,$r0lo,$x2,$h2lo
438         fmadd   $h2hi,$r0hi,$x2,$h2hi
439         fmadd   $h1lo,$s3lo,$x2,$h1lo
440         fmadd   $h1hi,$s3hi,$x2,$h1hi
441         fmadd   $h3lo,$r1lo,$x2,$h3lo
442         fmadd   $h3hi,$r1hi,$x2,$h3hi
443
444         fmadd   $h0lo,$r0lo,$x0,$h0lo
445          lfd    $y0,`$LOCALS+8*0`($sp)          # load [biased] input
446         fmadd   $h0hi,$r0hi,$x0,$h0hi
447          lfd    $y1,`$LOCALS+8*1`($sp)
448         fmadd   $h2lo,$r2lo,$x0,$h2lo
449          lfd    $y2,`$LOCALS+8*2`($sp)
450         fmadd   $h2hi,$r2hi,$x0,$h2hi
451          lfd    $y3,`$LOCALS+8*3`($sp)
452         fmadd   $h1lo,$r1lo,$x0,$h1lo
453         fmadd   $h1hi,$r1hi,$x0,$h1hi
454         fmadd   $h3lo,$r3lo,$x0,$h3lo
455         fmadd   $h3hi,$r3hi,$x0,$h3hi
456
457         bdnz    Loop
458         # all blocks consumed: one more carry pass, same steps as in the loop
459         ######################################### base 2^48 -> base 2^32
460         fadd    $c0lo,$h0lo,$two32
461         fadd    $c0hi,$h0hi,$two32
462         fadd    $c2lo,$h2lo,$two96
463         fadd    $c2hi,$h2hi,$two96
464         fadd    $c1lo,$h1lo,$two64
465         fadd    $c1hi,$h1hi,$two64
466         fadd    $c3lo,$h3lo,$two130
467         fadd    $c3hi,$h3hi,$two130
468
469         fsub    $c0lo,$c0lo,$two32
470         fsub    $c0hi,$c0hi,$two32
471         fsub    $c2lo,$c2lo,$two96
472         fsub    $c2hi,$c2hi,$two96
473         fsub    $c1lo,$c1lo,$two64
474         fsub    $c1hi,$c1hi,$two64
475         fsub    $c3lo,$c3lo,$two130
476         fsub    $c3hi,$c3hi,$two130
477
478         fsub    $h1lo,$h1lo,$c1lo
479         fsub    $h1hi,$h1hi,$c1hi
480         fsub    $h3lo,$h3lo,$c3lo
481         fsub    $h3hi,$h3hi,$c3hi
482         fsub    $h2lo,$h2lo,$c2lo
483         fsub    $h2hi,$h2hi,$c2hi
484         fsub    $h0lo,$h0lo,$c0lo
485         fsub    $h0hi,$h0hi,$c0hi
486
487         fadd    $h1lo,$h1lo,$c0lo
488         fadd    $h1hi,$h1hi,$c0hi
489         fadd    $h3lo,$h3lo,$c2lo
490         fadd    $h3hi,$h3hi,$c2hi
491         fadd    $h2lo,$h2lo,$c1lo
492         fadd    $h2hi,$h2hi,$c1hi
493         fmadd   $h0lo,$c3lo,$five_two130,$h0lo
494         fmadd   $h0hi,$c3hi,$five_two130,$h0hi
495
496         fadd    $x1,$h1lo,$h1hi
497         fadd    $x3,$h3lo,$h3hi
498         fadd    $x2,$h2lo,$h2hi
499         fadd    $x0,$h0lo,$h0hi
500
501         lfd     $h0lo,`$LOCALS+8*4`($sp)        # pull saved fpscr
502         fadd    $x1,$x1,$two32                  # bias
503         fadd    $x3,$x3,$two96
504         fadd    $x2,$x2,$two64
505         fadd    $x0,$x0,$two0
506
507         stfd    $x1,8*1($ctx)                   # store [biased] hash value
508         stfd    $x3,8*3($ctx)
509         stfd    $x2,8*2($ctx)
510         stfd    $x0,8*0($ctx)
511
512         mtfsf   255,$h0lo                       # restore original fpscr
513         lfd     f14,`$FRAME-8*18`($sp)
514         lfd     f15,`$FRAME-8*17`($sp)
515         lfd     f16,`$FRAME-8*16`($sp)
516         lfd     f17,`$FRAME-8*15`($sp)
517         lfd     f18,`$FRAME-8*14`($sp)
518         lfd     f19,`$FRAME-8*13`($sp)
519         lfd     f20,`$FRAME-8*12`($sp)
520         lfd     f21,`$FRAME-8*11`($sp)
521         lfd     f22,`$FRAME-8*10`($sp)
522         lfd     f23,`$FRAME-8*9`($sp)
523         lfd     f24,`$FRAME-8*8`($sp)
524         lfd     f25,`$FRAME-8*7`($sp)
525         lfd     f26,`$FRAME-8*6`($sp)
526         lfd     f27,`$FRAME-8*5`($sp)
527         lfd     f28,`$FRAME-8*4`($sp)
528         lfd     f29,`$FRAME-8*3`($sp)
529         lfd     f30,`$FRAME-8*2`($sp)
530         lfd     f31,`$FRAME-8*1`($sp)
531         addi    $sp,$sp,$FRAME
532 Labort:
533         blr
534         .long   0
535         .byte   0,12,4,1,0x80,0,4,0
536 .size   .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
537 ___
538 {
539 my ($mac,$nonce)=($inp,$len);   # emit's 2nd and 3rd arguments arrive in r4 and r5
540
541 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
542    ) = map("r$_",(7..11,28..31));
543 my $mask = "r0";
544 my $FRAME = (6+4)*$SIZE_T;      # local to this block: includes four slots for r28-r31
545 # poly1305_emit_fpu(ctx=r3, mac=r4, nonce=r5): reduce the stored hash, add the nonce, write 16 bytes to mac.
546 $code.=<<___;
547 .globl  .poly1305_emit_fpu
548 .align  4
549 .poly1305_emit_fpu:
550         $STU    $sp,-$FRAME($sp)
551         mflr    r0
552         $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
553         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
554         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
555         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
556         $PUSH   r0,`$FRAME+$LRSAVE`($sp)
557         # every biased hash double is read as two words: dN = high-order word, hN = low-order word
558         lwz     $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx)      # load hash
559         lwz     $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
560         lwz     $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
561         lwz     $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
562         lwz     $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
563         lwz     $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
564         lwz     $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
565         lwz     $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
566
567         lis     $mask,0xfff0                    # 0xfff00000, the top 12 bits of the high word
568         andc    $d0,$d0,$mask                   # mask exponent
569         andc    $d1,$d1,$mask
570         andc    $d2,$d2,$mask
571         andc    $d3,$d3,$mask                   # can be partially reduced...
572         li      $mask,3
573         # h4 = low two bits of d3; the remaining bits come back as 4*(d3/4) + d3/4, i.e. times 5
574         srwi    $padbit,$d3,2                   # ... so reduce
575         and     $h4,$d3,$mask
576         andc    $d3,$d3,$mask
577         add     $d3,$d3,$padbit
578 ___
579                                                 if ($SIZE_T==4) {
580 $code.=<<___;
581         addc    $h0,$h0,$d3
582         adde    $h1,$h1,$d0                     # the high word of limb N is added one limb up
583         adde    $h2,$h2,$d1
584         adde    $h3,$h3,$d2
585         addze   $h4,$h4
586
587         addic   $d0,$h0,5                       # compare to modulus
588         addze   $d1,$h1
589         addze   $d2,$h2
590         addze   $d3,$h3
591         addze   $mask,$h4
592
593         srwi    $mask,$mask,2                   # did it carry/borrow?
594         neg     $mask,$mask
595         srawi   $mask,$mask,31                  # mask
596         # keep h where the mask is clear, take h+5 where it is set
597         andc    $h0,$h0,$mask
598         and     $d0,$d0,$mask
599         andc    $h1,$h1,$mask
600         and     $d1,$d1,$mask
601         or      $h0,$h0,$d0
602         lwz     $d0,0($nonce)                   # load nonce
603         andc    $h2,$h2,$mask
604         and     $d2,$d2,$mask
605         or      $h1,$h1,$d1
606         lwz     $d1,4($nonce)
607         andc    $h3,$h3,$mask
608         and     $d3,$d3,$mask
609         or      $h2,$h2,$d2
610         lwz     $d2,8($nonce)
611         or      $h3,$h3,$d3
612         lwz     $d3,12($nonce)
613
614         addc    $h0,$h0,$d0                     # accumulate nonce
615         adde    $h1,$h1,$d1
616         adde    $h2,$h2,$d2
617         adde    $h3,$h3,$d3
618 ___
619                                                 } else {
620 $code.=<<___;
621         add     $h0,$h0,$d3
622         add     $h1,$h1,$d0
623         add     $h2,$h2,$d1
624         add     $h3,$h3,$d2
625         # ripple what spilled into the upper 32 bits of each register into the next limb
626         srdi    $d0,$h0,32
627         add     $h1,$h1,$d0
628         srdi    $d1,$h1,32
629         add     $h2,$h2,$d1
630         srdi    $d2,$h2,32
631         add     $h3,$h3,$d2
632         srdi    $d3,$h3,32
633         add     $h4,$h4,$d3
634         # pack the four words into two 64-bit values: h1 above h0, h3 above h2
635         insrdi  $h0,$h1,32,0
636         insrdi  $h2,$h3,32,0
637
638         addic   $d0,$h0,5                       # compare to modulus
639         addze   $d1,$h2
640         addze   $d2,$h4
641
642         srdi    $mask,$d2,2                     # did it carry/borrow?
643         neg     $mask,$mask
644         sradi   $mask,$mask,63                  # mask
645         ld      $d2,0($nonce)                   # load nonce
646         ld      $d3,8($nonce)
647         # keep h where the mask is clear, take h+5 where it is set
648         andc    $h0,$h0,$mask
649         and     $d0,$d0,$mask
650         andc    $h2,$h2,$mask
651         and     $d1,$d1,$mask
652         or      $h0,$h0,$d0
653         or      $h2,$h2,$d1
654 ___
655 $code.=<<___    if (!$LITTLE_ENDIAN);
656         rotldi  $d2,$d2,32                      # flip nonce words
657         rotldi  $d3,$d3,32
658 ___
659 $code.=<<___;
660         addc    $h0,$h0,$d2                     # accumulate nonce
661         adde    $h2,$h2,$d3
662         # split back into 32-bit words for the word stores that follow
663         srdi    $h1,$h0,32
664         srdi    $h3,$h2,32
665 ___
666                                                 }
667 $code.=<<___    if ($LITTLE_ENDIAN);    # little-endian flavour: plain word stores
668         stw     $h0,0($mac)                     # write result
669         stw     $h1,4($mac)
670         stw     $h2,8($mac)
671         stw     $h3,12($mac)
672 ___
673 $code.=<<___    if (!$LITTLE_ENDIAN);   # big-endian flavour: byte-reversed word stores
674         li      $d1,4
675         stwbrx  $h0,0,$mac                      # write result
676         li      $d2,8
677         stwbrx  $h1,$d1,$mac
678         li      $d3,12
679         stwbrx  $h2,$d2,$mac
680         stwbrx  $h3,$d3,$mac
681 ___
682 $code.=<<___;
683         $POP    r28,`$FRAME-$SIZE_T*4`($sp)
684         $POP    r29,`$FRAME-$SIZE_T*3`($sp)
685         $POP    r30,`$FRAME-$SIZE_T*2`($sp)
686         $POP    r31,`$FRAME-$SIZE_T*1`($sp)
687         addi    $sp,$sp,$FRAME
688         blr
689         .long   0
690         .byte   0,12,4,1,0x80,4,3,0
691 .size   .poly1305_emit_fpu,.-.poly1305_emit_fpu
692 ___
693 }
694 # Ugly hack here, because PPC assembler syntax seems to vary too
695 # much from platform to platform...
696 $code.=<<___;
697 .align  6
698 LPICmeup:
699         mflr    r0                              # keep the return address
700         bcl     20,31,\$+4                      # lr = address of the next instruction
701         mflr    $len    # vvvvvv "distance" between . and 1st data entry
702         addi    $len,$len,`64-8`        # borrow $len
703         mtlr    r0                              # put the return address back
704         blr
705         .long   0
706         .byte   0,12,0x14,0,0,0,0,0
707         .space  `64-9*4`                        # pad the 9 words above to 64 bytes, where the table starts
708         # table of 8-byte entries, read by poly1305_init_fpu at offsets 8*0 to 8*13
709 .quad   0x4330000000000000              # 2^(52+0)
710 .quad   0x4530000000000000              # 2^(52+32)
711 .quad   0x4730000000000000              # 2^(52+64)
712 .quad   0x4930000000000000              # 2^(52+96)
713 .quad   0x4b50000000000000              # 2^(52+130)
714
715 .quad   0x37f4000000000000              # 5/2^130
716
717 .quad   0x4430000000000000              # 2^(52+16+0)
718 .quad   0x4630000000000000              # 2^(52+16+32)
719 .quad   0x4830000000000000              # 2^(52+16+64)
720 .quad   0x4a30000000000000              # 2^(52+16+96)
721 .quad   0x3e30000000000000              # 2^(52+16+0-96)
722 .quad   0x4030000000000000              # 2^(52+16+32-96)
723 .quad   0x4230000000000000              # 2^(52+16+64-96)
724
725 .quad   0x0000000000000001              # fpscr: truncate, no exceptions
726 .asciz  "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
727 .align  4
728 ___
729
730 $code =~ s/\`([^\`]*)\`/eval $1/gem;    # replace every `...` expression in the templates with its value
731 print $code;                            # goes to ppc-xlate.pl through the STDOUT pipe
732 close STDOUT or die "error closing STDOUT: $!"; # closing the pipe waits for the child; its failure must not pass silently