x86 assembly pack: update performance results.
[openssl.git] / crypto / poly1305 / asm / poly1305-ppcfp.pl
1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements Poly1305 hash for PowerPC FPU.
18 #
19 # June 2015
20 #
21 # Numbers are cycles per processed byte with poly1305_blocks alone,
22 # and improvement coefficients relative to gcc-generated code.
23 #
24 # Freescale e300        9.78/+30%
25 # PPC74x0               6.92/+50%
26 # PPC970                6.03/+80%
27 # POWER7                3.50/+30%
28 # POWER8                3.75/+10%
29
30 $flavour = shift;                       # first argument: target flavour; must contain 32 or 64, "le" suffix selects little-endian
31
32 if ($flavour =~ /64/) {
33         $SIZE_T =8;
34         $LRSAVE =2*$SIZE_T;             # offset (above the frame) where the link register is saved
35         $UCMP   ="cmpld";
36         $STU    ="stdu";
37         $POP    ="ld";
38         $PUSH   ="std";
39 } elsif ($flavour =~ /32/) {
40         $SIZE_T =4;
41         $LRSAVE =$SIZE_T;               # offset (above the frame) where the link register is saved
42         $UCMP   ="cmplw";
43         $STU    ="stwu";
44         $POP    ="lwz";
45         $PUSH   ="stw";
46 } else { die "nonsense $flavour"; }
47
48 $LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;     # 4 or 0: XOR-ed into byte offsets to pick a 32-bit half of an 8-byte slot
49
50 $LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";     # indexed word load that yields little-endian data on either endianness
51
52 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
53 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
54 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
55 die "can't locate ppc-xlate.pl";
56 # Pipe everything printed below through ppc-xlate.pl; open's arguments are parenthesized so a failed open reaches die.
57 open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";
58
59 $LOCALS=6*$SIZE_T;                      # size of init's minimal frame; also the offset of the scratch area in blocks' frame
60 $FRAME=$LOCALS+6*8+18*8;                # blocks' frame: $LOCALS + 6*8 bytes of scratch (input template, fpscr) + save area for f14-f31
61
62 my $sp="r1";
63
64 my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));                # r3..r6
65 my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));    # r7..r12; note $i3 shares r6 with $padbit
66
67 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,            # f0-f7
68     $two0,$two32,$two64,$two96,$two130,$five_two130,            # f8-f13
69     $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,                        # f14-f19
70     $s2lo,$s2hi,$s3lo,$s3hi,                                    # f20-f23
71     $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));      # f24-f31
72 # borrowings: all 32 FPRs are named above, so the names below alias the $c* registers
73 my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);       # f24-f27
74 my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);               # f28-f31
75 my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);               # f30,f31,f26,f27: overlaps $x2/$x3 and $s1lo/$s1hi
76
77 $code.=<<___;
78 .machine        "any"
79 .text
80
81 .globl  .poly1305_init_fpu
82 .align  6
83 .poly1305_init_fpu:
84         $STU    $sp,-$LOCALS($sp)               # minimal frame
85         mflr    $padbit
86         $PUSH   $padbit,`$LOCALS+$LRSAVE`($sp)
87
88         bl      LPICmeup
89
90         xor     r0,r0,r0
91         mtlr    $padbit                         # restore lr
92
93         lfd     $two0,8*0($len)                 # load constants
94         lfd     $two32,8*1($len)
95         lfd     $two64,8*2($len)
96         lfd     $two96,8*3($len)
97         lfd     $two130,8*4($len)
98         lfd     $five_two130,8*5($len)
99
100         stfd    $two0,8*0($ctx)                 # initial hash value, biased 0
101         stfd    $two32,8*1($ctx)
102         stfd    $two64,8*2($ctx)
103         stfd    $two96,8*3($ctx)
104
105         $UCMP   $inp,r0
106         beq-    Lno_key
107
108         lfd     $h3lo,8*13($len)                # new fpscr
109         mffs    $h3hi                           # old fpscr
110
111         stfd    $two0,8*4($ctx)                 # key "template"
112         stfd    $two32,8*5($ctx)
113         stfd    $two64,8*6($ctx)
114         stfd    $two96,8*7($ctx)
115
116         li      $in1,4
117         li      $in2,8
118         li      $in3,12
119         $LWXLE  $in0,0,$inp                     # load key
120         $LWXLE  $in1,$in1,$inp
121         $LWXLE  $in2,$in2,$inp
122         $LWXLE  $in3,$in3,$inp
123
124         lis     $i1,0xf000                      #   0xf0000000
125         ori     $i2,$i1,3                       #   0xf0000003
126         andc    $in0,$in0,$i1                   # &=0x0fffffff
127         andc    $in1,$in1,$i2                   # &=0x0ffffffc
128         andc    $in2,$in2,$i2
129         andc    $in3,$in3,$i2
130
131         stw     $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx)     # fill "template"
132         stw     $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
133         stw     $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
134         stw     $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
135
136         mtfsf   255,$h3lo                       # fpscr
137         stfd    $two0,8*18($ctx)                # copy constants to context
138         stfd    $two32,8*19($ctx)
139         stfd    $two64,8*20($ctx)
140         stfd    $two96,8*21($ctx)
141         stfd    $two130,8*22($ctx)
142         stfd    $five_two130,8*23($ctx)
143
144         lfd     $h0lo,8*4($ctx)                 # load [biased] key
145         lfd     $h1lo,8*5($ctx)
146         lfd     $h2lo,8*6($ctx)
147         lfd     $h3lo,8*7($ctx)
148
149         fsub    $h0lo,$h0lo,$two0               # r0
150         fsub    $h1lo,$h1lo,$two32              # r1
151         fsub    $h2lo,$h2lo,$two64              # r2
152         fsub    $h3lo,$h3lo,$two96              # r3
153
154         lfd     $two0,8*6($len)                 # more constants
155         lfd     $two32,8*7($len)
156         lfd     $two64,8*8($len)
157         lfd     $two96,8*9($len)
158
159         fmul    $h1hi,$h1lo,$five_two130        # s1
160         fmul    $h2hi,$h2lo,$five_two130        # s2
161          stfd   $h3hi,8*15($ctx)                # borrow slot for original fpscr
162         fmul    $h3hi,$h3lo,$five_two130        # s3
163
164         fadd    $h0hi,$h0lo,$two0
165          stfd   $h1hi,8*12($ctx)                # put aside for now
166         fadd    $h1hi,$h1lo,$two32
167          stfd   $h2hi,8*13($ctx)
168         fadd    $h2hi,$h2lo,$two64
169          stfd   $h3hi,8*14($ctx)
170         fadd    $h3hi,$h3lo,$two96
171
172         fsub    $h0hi,$h0hi,$two0
173         fsub    $h1hi,$h1hi,$two32
174         fsub    $h2hi,$h2hi,$two64
175         fsub    $h3hi,$h3hi,$two96
176
177         lfd     $two0,8*10($len)                # more constants
178         lfd     $two32,8*11($len)
179         lfd     $two64,8*12($len)
180
181         fsub    $h0lo,$h0lo,$h0hi
182         fsub    $h1lo,$h1lo,$h1hi
183         fsub    $h2lo,$h2lo,$h2hi
184         fsub    $h3lo,$h3lo,$h3hi
185
186         stfd    $h0hi,8*5($ctx)                 # r0hi
187         stfd    $h1hi,8*7($ctx)                 # r1hi
188         stfd    $h2hi,8*9($ctx)                 # r2hi
189         stfd    $h3hi,8*11($ctx)                # r3hi
190
191         stfd    $h0lo,8*4($ctx)                 # r0lo
192         stfd    $h1lo,8*6($ctx)                 # r1lo
193         stfd    $h2lo,8*8($ctx)                 # r2lo
194         stfd    $h3lo,8*10($ctx)                # r3lo
195
196         lfd     $h1lo,8*12($ctx)                # s1
197         lfd     $h2lo,8*13($ctx)                # s2
198         lfd     $h3lo,8*14($ctx)                # s3
199         lfd     $h0lo,8*15($ctx)                # pull original fpscr
200
201         fadd    $h1hi,$h1lo,$two0
202         fadd    $h2hi,$h2lo,$two32
203         fadd    $h3hi,$h3lo,$two64
204
205         fsub    $h1hi,$h1hi,$two0
206         fsub    $h2hi,$h2hi,$two32
207         fsub    $h3hi,$h3hi,$two64
208
209         fsub    $h1lo,$h1lo,$h1hi
210         fsub    $h2lo,$h2lo,$h2hi
211         fsub    $h3lo,$h3lo,$h3hi
212
213         stfd    $h1hi,8*13($ctx)                # s1hi
214         stfd    $h2hi,8*15($ctx)                # s2hi
215         stfd    $h3hi,8*17($ctx)                # s3hi
216
217         stfd    $h1lo,8*12($ctx)                # s1lo
218         stfd    $h2lo,8*14($ctx)                # s2lo
219         stfd    $h3lo,8*16($ctx)                # s3lo
220
221         mtfsf   255,$h0lo                       # restore fpscr
222 Lno_key:
223         xor     r3,r3,r3
224         addi    $sp,$sp,$LOCALS
225         blr
226         .long   0
227         .byte   0,12,4,1,0x80,0,2,0
228 .size   .poly1305_init_fpu,.-.poly1305_init_fpu
229
230 .globl  .poly1305_blocks_fpu
231 .align  4
232 .poly1305_blocks_fpu:
233         srwi.   $len,$len,4
234         beq-    Labort
235
236         $STU    $sp,-$FRAME($sp)
237         mflr    r0
238         stfd    f14,`$FRAME-8*18`($sp)
239         stfd    f15,`$FRAME-8*17`($sp)
240         stfd    f16,`$FRAME-8*16`($sp)
241         stfd    f17,`$FRAME-8*15`($sp)
242         stfd    f18,`$FRAME-8*14`($sp)
243         stfd    f19,`$FRAME-8*13`($sp)
244         stfd    f20,`$FRAME-8*12`($sp)
245         stfd    f21,`$FRAME-8*11`($sp)
246         stfd    f22,`$FRAME-8*10`($sp)
247         stfd    f23,`$FRAME-8*9`($sp)
248         stfd    f24,`$FRAME-8*8`($sp)
249         stfd    f25,`$FRAME-8*7`($sp)
250         stfd    f26,`$FRAME-8*6`($sp)
251         stfd    f27,`$FRAME-8*5`($sp)
252         stfd    f28,`$FRAME-8*4`($sp)
253         stfd    f29,`$FRAME-8*3`($sp)
254         stfd    f30,`$FRAME-8*2`($sp)
255         stfd    f31,`$FRAME-8*1`($sp)
256         $PUSH   r0,`$FRAME+$LRSAVE`($sp)
257
258         xor     r0,r0,r0
259         li      $in3,1
260         mtctr   $len
261         neg     $len,$len
262         stw     r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
263         stw     $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
264
265         lfd     $two0,8*18($ctx)                # load constants
266         lfd     $two32,8*19($ctx)
267         lfd     $two64,8*20($ctx)
268         lfd     $two96,8*21($ctx)
269         lfd     $two130,8*22($ctx)
270         lfd     $five_two130,8*23($ctx)
271
272         lfd     $h0lo,8*0($ctx)                 # load [biased] hash value
273         lfd     $h1lo,8*1($ctx)
274         lfd     $h2lo,8*2($ctx)
275         lfd     $h3lo,8*3($ctx)
276
277         stfd    $two0,`$LOCALS+8*0`($sp)        # input "template"
278         oris    $in3,$padbit,`(1023+52+96)<<4`
279         stfd    $two32,`$LOCALS+8*1`($sp)
280         stfd    $two64,`$LOCALS+8*2`($sp)
281         stw     $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
282
283         li      $i1,4
284         li      $i2,8
285         li      $i3,12
286         $LWXLE  $in0,0,$inp                     # load input
287         $LWXLE  $in1,$i1,$inp
288         $LWXLE  $in2,$i2,$inp
289         $LWXLE  $in3,$i3,$inp
290         addi    $inp,$inp,16
291
292         stw     $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)      # fill "template"
293         stw     $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
294         stw     $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
295         stw     $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
296
297         mffs    $x0                             # original fpscr
298         lfd     $x1,`$LOCALS+8*4`($sp)          # new fpscr
299         lfd     $r0lo,8*4($ctx)                 # load key
300         lfd     $r0hi,8*5($ctx)
301         lfd     $r1lo,8*6($ctx)
302         lfd     $r1hi,8*7($ctx)
303         lfd     $r2lo,8*8($ctx)
304         lfd     $r2hi,8*9($ctx)
305         lfd     $r3lo,8*10($ctx)
306         lfd     $r3hi,8*11($ctx)
307         lfd     $s1lo,8*12($ctx)
308         lfd     $s1hi,8*13($ctx)
309         lfd     $s2lo,8*14($ctx)
310         lfd     $s2hi,8*15($ctx)
311         lfd     $s3lo,8*16($ctx)
312         lfd     $s3hi,8*17($ctx)
313
314         stfd    $x0,`$LOCALS+8*4`($sp)          # save original fpscr
315         mtfsf   255,$x1
316
317         addic   $len,$len,1
318         addze   r0,r0
319         slwi.   r0,r0,4
320         sub     $inp,$inp,r0                    # conditional rewind
321
322         lfd     $x0,`$LOCALS+8*0`($sp)
323         lfd     $x1,`$LOCALS+8*1`($sp)
324         lfd     $x2,`$LOCALS+8*2`($sp)
325         lfd     $x3,`$LOCALS+8*3`($sp)
326
327         fsub    $h0lo,$h0lo,$two0               # de-bias hash value
328          $LWXLE $in0,0,$inp                     # modulo-scheduled input load
329         fsub    $h1lo,$h1lo,$two32
330          $LWXLE $in1,$i1,$inp
331         fsub    $h2lo,$h2lo,$two64
332          $LWXLE $in2,$i2,$inp
333         fsub    $h3lo,$h3lo,$two96
334          $LWXLE $in3,$i3,$inp
335
336         fsub    $x0,$x0,$two0                   # de-bias input
337          addi   $inp,$inp,16
338         fsub    $x1,$x1,$two32
339         fsub    $x2,$x2,$two64
340         fsub    $x3,$x3,$two96
341
342         fadd    $x0,$x0,$h0lo                   # accumulate input
343          stw    $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
344         fadd    $x1,$x1,$h1lo
345          stw    $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
346         fadd    $x2,$x2,$h2lo
347          stw    $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
348         fadd    $x3,$x3,$h3lo
349          stw    $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
350
351         b       Lentry
352
353 .align  4
354 Loop:
355         fsub    $y0,$y0,$two0                   # de-bias input
356          addic  $len,$len,1
357         fsub    $y1,$y1,$two32
358          addze  r0,r0
359         fsub    $y2,$y2,$two64
360          slwi.  r0,r0,4
361         fsub    $y3,$y3,$two96
362          sub    $inp,$inp,r0                    # conditional rewind
363
364         fadd    $h0lo,$h0lo,$y0                 # accumulate input
365         fadd    $h0hi,$h0hi,$y1
366         fadd    $h2lo,$h2lo,$y2
367         fadd    $h2hi,$h2hi,$y3
368
369         ######################################### base 2^48 -> base 2^32
370         fadd    $c1lo,$h1lo,$two64
371          $LWXLE $in0,0,$inp                     # modulo-scheduled input load
372         fadd    $c1hi,$h1hi,$two64
373          $LWXLE $in1,$i1,$inp
374         fadd    $c3lo,$h3lo,$two130
375          $LWXLE $in2,$i2,$inp
376         fadd    $c3hi,$h3hi,$two130
377          $LWXLE $in3,$i3,$inp
378         fadd    $c0lo,$h0lo,$two32
379          addi   $inp,$inp,16
380         fadd    $c0hi,$h0hi,$two32
381         fadd    $c2lo,$h2lo,$two96
382         fadd    $c2hi,$h2hi,$two96
383
384         fsub    $c1lo,$c1lo,$two64
385          stw    $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)      # fill "template"
386         fsub    $c1hi,$c1hi,$two64
387          stw    $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
388         fsub    $c3lo,$c3lo,$two130
389          stw    $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
390         fsub    $c3hi,$c3hi,$two130
391          stw    $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
392         fsub    $c0lo,$c0lo,$two32
393         fsub    $c0hi,$c0hi,$two32
394         fsub    $c2lo,$c2lo,$two96
395         fsub    $c2hi,$c2hi,$two96
396
397         fsub    $h1lo,$h1lo,$c1lo
398         fsub    $h1hi,$h1hi,$c1hi
399         fsub    $h3lo,$h3lo,$c3lo
400         fsub    $h3hi,$h3hi,$c3hi
401         fsub    $h2lo,$h2lo,$c2lo
402         fsub    $h2hi,$h2hi,$c2hi
403         fsub    $h0lo,$h0lo,$c0lo
404         fsub    $h0hi,$h0hi,$c0hi
405
406         fadd    $h1lo,$h1lo,$c0lo
407         fadd    $h1hi,$h1hi,$c0hi
408         fadd    $h3lo,$h3lo,$c2lo
409         fadd    $h3hi,$h3hi,$c2hi
410         fadd    $h2lo,$h2lo,$c1lo
411         fadd    $h2hi,$h2hi,$c1hi
412         fmadd   $h0lo,$c3lo,$five_two130,$h0lo
413         fmadd   $h0hi,$c3hi,$five_two130,$h0hi
414
415         fadd    $x1,$h1lo,$h1hi
416          lfd    $s1lo,8*12($ctx)                # reload constants
417         fadd    $x3,$h3lo,$h3hi
418          lfd    $s1hi,8*13($ctx)
419         fadd    $x2,$h2lo,$h2hi
420          lfd    $r3lo,8*10($ctx)
421         fadd    $x0,$h0lo,$h0hi
422          lfd    $r3hi,8*11($ctx)
423 Lentry:
424         fmul    $h0lo,$s3lo,$x1
425         fmul    $h0hi,$s3hi,$x1
426         fmul    $h2lo,$r1lo,$x1
427         fmul    $h2hi,$r1hi,$x1
428         fmul    $h1lo,$r0lo,$x1
429         fmul    $h1hi,$r0hi,$x1
430         fmul    $h3lo,$r2lo,$x1
431         fmul    $h3hi,$r2hi,$x1
432
433         fmadd   $h0lo,$s1lo,$x3,$h0lo
434         fmadd   $h0hi,$s1hi,$x3,$h0hi
435         fmadd   $h2lo,$s3lo,$x3,$h2lo
436         fmadd   $h2hi,$s3hi,$x3,$h2hi
437         fmadd   $h1lo,$s2lo,$x3,$h1lo
438         fmadd   $h1hi,$s2hi,$x3,$h1hi
439         fmadd   $h3lo,$r0lo,$x3,$h3lo
440         fmadd   $h3hi,$r0hi,$x3,$h3hi
441
442         fmadd   $h0lo,$s2lo,$x2,$h0lo
443         fmadd   $h0hi,$s2hi,$x2,$h0hi
444         fmadd   $h2lo,$r0lo,$x2,$h2lo
445         fmadd   $h2hi,$r0hi,$x2,$h2hi
446         fmadd   $h1lo,$s3lo,$x2,$h1lo
447         fmadd   $h1hi,$s3hi,$x2,$h1hi
448         fmadd   $h3lo,$r1lo,$x2,$h3lo
449         fmadd   $h3hi,$r1hi,$x2,$h3hi
450
451         fmadd   $h0lo,$r0lo,$x0,$h0lo
452          lfd    $y0,`$LOCALS+8*0`($sp)          # load [biased] input
453         fmadd   $h0hi,$r0hi,$x0,$h0hi
454          lfd    $y1,`$LOCALS+8*1`($sp)
455         fmadd   $h2lo,$r2lo,$x0,$h2lo
456          lfd    $y2,`$LOCALS+8*2`($sp)
457         fmadd   $h2hi,$r2hi,$x0,$h2hi
458          lfd    $y3,`$LOCALS+8*3`($sp)
459         fmadd   $h1lo,$r1lo,$x0,$h1lo
460         fmadd   $h1hi,$r1hi,$x0,$h1hi
461         fmadd   $h3lo,$r3lo,$x0,$h3lo
462         fmadd   $h3hi,$r3hi,$x0,$h3hi
463
464         bdnz    Loop
465
466         ######################################### base 2^48 -> base 2^32
467         fadd    $c0lo,$h0lo,$two32
468         fadd    $c0hi,$h0hi,$two32
469         fadd    $c2lo,$h2lo,$two96
470         fadd    $c2hi,$h2hi,$two96
471         fadd    $c1lo,$h1lo,$two64
472         fadd    $c1hi,$h1hi,$two64
473         fadd    $c3lo,$h3lo,$two130
474         fadd    $c3hi,$h3hi,$two130
475
476         fsub    $c0lo,$c0lo,$two32
477         fsub    $c0hi,$c0hi,$two32
478         fsub    $c2lo,$c2lo,$two96
479         fsub    $c2hi,$c2hi,$two96
480         fsub    $c1lo,$c1lo,$two64
481         fsub    $c1hi,$c1hi,$two64
482         fsub    $c3lo,$c3lo,$two130
483         fsub    $c3hi,$c3hi,$two130
484
485         fsub    $h1lo,$h1lo,$c1lo
486         fsub    $h1hi,$h1hi,$c1hi
487         fsub    $h3lo,$h3lo,$c3lo
488         fsub    $h3hi,$h3hi,$c3hi
489         fsub    $h2lo,$h2lo,$c2lo
490         fsub    $h2hi,$h2hi,$c2hi
491         fsub    $h0lo,$h0lo,$c0lo
492         fsub    $h0hi,$h0hi,$c0hi
493
494         fadd    $h1lo,$h1lo,$c0lo
495         fadd    $h1hi,$h1hi,$c0hi
496         fadd    $h3lo,$h3lo,$c2lo
497         fadd    $h3hi,$h3hi,$c2hi
498         fadd    $h2lo,$h2lo,$c1lo
499         fadd    $h2hi,$h2hi,$c1hi
500         fmadd   $h0lo,$c3lo,$five_two130,$h0lo
501         fmadd   $h0hi,$c3hi,$five_two130,$h0hi
502
503         fadd    $x1,$h1lo,$h1hi
504         fadd    $x3,$h3lo,$h3hi
505         fadd    $x2,$h2lo,$h2hi
506         fadd    $x0,$h0lo,$h0hi
507
508         lfd     $h0lo,`$LOCALS+8*4`($sp)        # pull saved fpscr
509         fadd    $x1,$x1,$two32                  # bias
510         fadd    $x3,$x3,$two96
511         fadd    $x2,$x2,$two64
512         fadd    $x0,$x0,$two0
513
514         stfd    $x1,8*1($ctx)                   # store [biased] hash value
515         stfd    $x3,8*3($ctx)
516         stfd    $x2,8*2($ctx)
517         stfd    $x0,8*0($ctx)
518
519         mtfsf   255,$h0lo                       # restore original fpscr
520         lfd     f14,`$FRAME-8*18`($sp)
521         lfd     f15,`$FRAME-8*17`($sp)
522         lfd     f16,`$FRAME-8*16`($sp)
523         lfd     f17,`$FRAME-8*15`($sp)
524         lfd     f18,`$FRAME-8*14`($sp)
525         lfd     f19,`$FRAME-8*13`($sp)
526         lfd     f20,`$FRAME-8*12`($sp)
527         lfd     f21,`$FRAME-8*11`($sp)
528         lfd     f22,`$FRAME-8*10`($sp)
529         lfd     f23,`$FRAME-8*9`($sp)
530         lfd     f24,`$FRAME-8*8`($sp)
531         lfd     f25,`$FRAME-8*7`($sp)
532         lfd     f26,`$FRAME-8*6`($sp)
533         lfd     f27,`$FRAME-8*5`($sp)
534         lfd     f28,`$FRAME-8*4`($sp)
535         lfd     f29,`$FRAME-8*3`($sp)
536         lfd     f30,`$FRAME-8*2`($sp)
537         lfd     f31,`$FRAME-8*1`($sp)
538         addi    $sp,$sp,$FRAME
539 Labort:
540         blr
541         .long   0
542         .byte   0,12,4,1,0x80,0,4,0
543 .size   .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
544 ___
545 {
546 my ($mac,$nonce)=($inp,$len);           # emit reuses r4/r5 under these names
547
548 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
549    ) = map("r$_",(7..11,28..31));       # h0-h4 in r7-r11; d0-d3 in r28-r31, which the emitted code saves and restores
550 my $mask = "r0";
551 my $FRAME = (6+4)*$SIZE_T;              # emit-only frame size (shadows the file-level $FRAME); the 4 extra slots hold r28-r31
552
553 $code.=<<___;
554 .globl  .poly1305_emit_fpu
555 .align  4
556 .poly1305_emit_fpu:
557         $STU    $sp,-$FRAME($sp)
558         mflr    r0
559         $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
560         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
561         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
562         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
563         $PUSH   r0,`$FRAME+$LRSAVE`($sp)
564
565         lwz     $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx)      # load hash
566         lwz     $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
567         lwz     $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
568         lwz     $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
569         lwz     $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
570         lwz     $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
571         lwz     $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
572         lwz     $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
573
574         lis     $mask,0xfff0
575         andc    $d0,$d0,$mask                   # mask exponent
576         andc    $d1,$d1,$mask
577         andc    $d2,$d2,$mask
578         andc    $d3,$d3,$mask                   # can be partially reduced...
579         li      $mask,3
580
581         srwi    $padbit,$d3,2                   # ... so reduce
582         and     $h4,$d3,$mask
583         andc    $d3,$d3,$mask
584         add     $d3,$d3,$padbit
585 ___
586                                                 if ($SIZE_T==4) {
587 $code.=<<___;
588         addc    $h0,$h0,$d3
589         adde    $h1,$h1,$d0
590         adde    $h2,$h2,$d1
591         adde    $h3,$h3,$d2
592         addze   $h4,$h4
593
594         addic   $d0,$h0,5                       # compare to modulus
595         addze   $d1,$h1
596         addze   $d2,$h2
597         addze   $d3,$h3
598         addze   $mask,$h4
599
600         srwi    $mask,$mask,2                   # did it carry/borrow?
601         neg     $mask,$mask
602         srawi   $mask,$mask,31                  # mask
603
604         andc    $h0,$h0,$mask
605         and     $d0,$d0,$mask
606         andc    $h1,$h1,$mask
607         and     $d1,$d1,$mask
608         or      $h0,$h0,$d0
609         lwz     $d0,0($nonce)                   # load nonce
610         andc    $h2,$h2,$mask
611         and     $d2,$d2,$mask
612         or      $h1,$h1,$d1
613         lwz     $d1,4($nonce)
614         andc    $h3,$h3,$mask
615         and     $d3,$d3,$mask
616         or      $h2,$h2,$d2
617         lwz     $d2,8($nonce)
618         or      $h3,$h3,$d3
619         lwz     $d3,12($nonce)
620
621         addc    $h0,$h0,$d0                     # accumulate nonce
622         adde    $h1,$h1,$d1
623         adde    $h2,$h2,$d2
624         adde    $h3,$h3,$d3
625 ___
626                                                 } else {
627 $code.=<<___;
628         add     $h0,$h0,$d3
629         add     $h1,$h1,$d0
630         add     $h2,$h2,$d1
631         add     $h3,$h3,$d2
632
633         srdi    $d0,$h0,32
634         add     $h1,$h1,$d0
635         srdi    $d1,$h1,32
636         add     $h2,$h2,$d1
637         srdi    $d2,$h2,32
638         add     $h3,$h3,$d2
639         srdi    $d3,$h3,32
640         add     $h4,$h4,$d3
641
642         insrdi  $h0,$h1,32,0
643         insrdi  $h2,$h3,32,0
644
645         addic   $d0,$h0,5                       # compare to modulus
646         addze   $d1,$h2
647         addze   $d2,$h4
648
649         srdi    $mask,$d2,2                     # did it carry/borrow?
650         neg     $mask,$mask
651         sradi   $mask,$mask,63                  # mask
652         ld      $d2,0($nonce)                   # load nonce
653         ld      $d3,8($nonce)
654
655         andc    $h0,$h0,$mask
656         and     $d0,$d0,$mask
657         andc    $h2,$h2,$mask
658         and     $d1,$d1,$mask
659         or      $h0,$h0,$d0
660         or      $h2,$h2,$d1
661 ___
662 $code.=<<___    if (!$LITTLE_ENDIAN);
663         rotldi  $d2,$d2,32                      # flip nonce words
664         rotldi  $d3,$d3,32
665 ___
666 $code.=<<___;
667         addc    $h0,$h0,$d2                     # accumulate nonce
668         adde    $h2,$h2,$d3
669
670         srdi    $h1,$h0,32
671         srdi    $h3,$h2,32
672 ___
673                                                 }
674 $code.=<<___    if ($LITTLE_ENDIAN);
675         stw     $h0,0($mac)                     # write result
676         stw     $h1,4($mac)
677         stw     $h2,8($mac)
678         stw     $h3,12($mac)
679 ___
680 $code.=<<___    if (!$LITTLE_ENDIAN);
681         li      $d1,4
682         stwbrx  $h0,0,$mac                      # write result
683         li      $d2,8
684         stwbrx  $h1,$d1,$mac
685         li      $d3,12
686         stwbrx  $h2,$d2,$mac
687         stwbrx  $h3,$d3,$mac
688 ___
689 $code.=<<___;
690         $POP    r28,`$FRAME-$SIZE_T*4`($sp)
691         $POP    r29,`$FRAME-$SIZE_T*3`($sp)
692         $POP    r30,`$FRAME-$SIZE_T*2`($sp)
693         $POP    r31,`$FRAME-$SIZE_T*1`($sp)
694         addi    $sp,$sp,$FRAME
695         blr
696         .long   0
697         .byte   0,12,4,1,0x80,4,3,0
698 .size   .poly1305_emit_fpu,.-.poly1305_emit_fpu
699 ___
700 }
701 # Ugly hack here, because PPC assembler syntax seems to vary too
702 # much from platform to platform...
703 $code.=<<___;
704 .align  6
705 LPICmeup:
706         mflr    r0
707         bcl     20,31,\$+4
708         mflr    $len    # vvvvvv "distance" between . and 1st data entry
709         addi    $len,$len,`64-8`        # borrow $len
710         mtlr    r0
711         blr
712         .long   0
713         .byte   0,12,0x14,0,0,0,0,0
714         .space  `64-9*4`
715
716 .quad   0x4330000000000000              # 2^(52+0)
717 .quad   0x4530000000000000              # 2^(52+32)
718 .quad   0x4730000000000000              # 2^(52+64)
719 .quad   0x4930000000000000              # 2^(52+96)
720 .quad   0x4b50000000000000              # 2^(52+130)
721
722 .quad   0x37f4000000000000              # 5/2^130
723
724 .quad   0x4430000000000000              # 2^(52+16+0)
725 .quad   0x4630000000000000              # 2^(52+16+32)
726 .quad   0x4830000000000000              # 2^(52+16+64)
727 .quad   0x4a30000000000000              # 2^(52+16+96)
728 .quad   0x3e30000000000000              # 2^(52+16+0-96)
729 .quad   0x4030000000000000              # 2^(52+16+32-96)
730 .quad   0x4230000000000000              # 2^(52+16+64-96)
731
732 .quad   0x0000000000000001              # fpscr: truncate, no exceptions
733 .asciz  "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
734 .align  4
735 ___
736
737 $code =~ s/\`([^\`]*)\`/eval $1/gem;    # replace each back-quoted expression in the assembly text with its evaluated value
738 print $code;
739 close STDOUT or die "error closing STDOUT: $!";     # STDOUT is a pipe to ppc-xlate.pl: close reports write errors and a failing child