Also check for errors in x86_64-xlate.pl.
[openssl.git] / crypto / poly1305 / asm / poly1305-ppcfp.pl
1 #! /usr/bin/env perl
2 # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements Poly1305 hash for PowerPC FPU.
18 #
19 # June 2015
20 #
21 # Numbers are cycles per processed byte with poly1305_blocks alone,
22 # and improvement coefficients relative to gcc-generated code.
23 #
24 # Freescale e300        9.78/+30%
25 # PPC74x0               6.92/+50%
26 # PPC970                6.03/+80%
27 # POWER7                3.50/+30%
28 # POWER8                3.75/+10%
29
# Command line is "[flavour] [output]".  The trailing argument is consumed
# as the output file when it ends in an extension; the leading argument is
# consumed as the flavour when it contains no dot at all.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# Word-size dependent sizes and mnemonics.  "64" is tested before "32".
if ($flavour =~ /64/) {
    ($SIZE_T, $LRSAVE)         = (8, 16);       # LR save slot at 2*SIZE_T
    ($UCMP, $STU, $POP, $PUSH) = ("cmpld", "stdu", "ld", "std");
}
elsif ($flavour =~ /32/) {
    ($SIZE_T, $LRSAVE)         = (4, 4);        # LR save slot at SIZE_T
    ($UCMP, $STU, $POP, $PUSH) = ("cmplw", "stwu", "lwz", "stw");
}
else {
    die "nonsense $flavour";
}

# 4 for flavours ending in "le", 0 otherwise.  The generated code XORs this
# into 0/4 offsets to pick one 32-bit half of an 8-byte slot.
$LITTLE_ENDIAN = 0;
$LITTLE_ENDIAN = 4 if $flavour =~ /le$/;

# Indexed load that yields a little-endian 32-bit word on either endianness.
$LWXLE = "lwbrx";
$LWXLE = "lwzx" if $LITTLE_ENDIAN;

# Look for ppc-xlate.pl next to this script first, then in ../../perlasm/.
($dir) = ($0 =~ m/(.*[\/\\])[^\/\\]+$/);
$xlate = "${dir}ppc-xlate.pl";
$xlate = "${dir}../../perlasm/ppc-xlate.pl" unless -f $xlate;
die "can't locate ppc-xlate.pl" unless -f $xlate;

# Everything printed from here on is piped through the translator.
open(STDOUT, "| $^X $xlate $flavour \"$output\"")
    or die "can't call $xlate: $!";
62
# Stack frame used by poly1305_blocks_fpu: $LOCALS bytes at the bottom,
# then 6 doublewords of scratch (addressed as $LOCALS+8*n) and 18
# doublewords in which f14-f31 are saved.
$LOCALS = 6 * $SIZE_T;
$FRAME  = $LOCALS + 8 * (6 + 18);

my $sp = "r1";

# Integer registers.  Note that $i3 shares r6 with $padbit.
my ($ctx, $inp, $len, $padbit) = qw(r3 r4 r5 r6);
my ($in0, $in1, $in2, $in3)    = qw(r7 r8 r9 r10);
my ($i1, $i2, $i3)             = qw(r11 r12 r6);

# Floating-point registers f0-f31, handed out in order.
my @fpr = map { "f$_" } 0 .. 31;
my ($h0lo, $h0hi, $h1lo, $h1hi,
    $h2lo, $h2hi, $h3lo, $h3hi)          = @fpr[0 .. 7];
my ($two0, $two32, $two64, $two96,
    $two130, $five_two130)               = @fpr[8 .. 13];
my ($r0lo, $r0hi, $r1lo, $r1hi,
    $r2lo, $r2hi)                        = @fpr[14 .. 19];
my ($s2lo, $s2hi, $s3lo, $s3hi)          = @fpr[20 .. 23];
my ($c0lo, $c0hi, $c1lo, $c1hi,
    $c2lo, $c2hi, $c3lo, $c3hi)          = @fpr[24 .. 31];

# Names that reuse the carry registers above.
my ($r3lo, $r3hi, $s1lo, $s1hi) = ($c0lo, $c0hi, $c1lo, $c1hi);
my ($x0, $x1, $x2, $x3)         = ($c2lo, $c2hi, $c3lo, $c3hi);
my ($y0, $y1, $y2, $y3)         = ($c3lo, $c3hi, $c1lo, $c1hi);

80
# Append the assembly for .poly1305_init_fpu and .poly1305_blocks_fpu to
# $code.  Backticked expressions in the text are evaluated at the end of
# the script, before the text is printed.
#
# .poly1305_init_fpu -- $ctx=r3, $inp=r4 (key pointer)
#   Calls LPICmeup to get the address of the constant table in $len and
#   stores the biased zero hash value into context slots 0-3.  If the key
#   pointer is zero it branches straight to Lno_key.  Otherwise it loads
#   four 32-bit key words with $LWXLE, clamps them (&0x0fffffff for the
#   first, &0x0ffffffc for the other three), turns them into doubles by
#   planting each word into a biased 2^(52+n) "template" and subtracting
#   the bias, derives s1-s3 by multiplying r1-r3 with 5/2^130, and splits
#   every r and s value into hi and lo parts.  FPSCR is replaced with the
#   table's "truncate, no exceptions" value while this is done and is
#   restored before returning.  r3 is zeroed on exit.
#
#   Context layout written by this routine, in 8-byte slots:
#      0- 3  biased hash value h0..h3
#      4-11  r0lo,r0hi,r1lo,r1hi,r2lo,r2hi,r3lo,r3hi
#     12-17  s1lo,s1hi,s2lo,s2hi,s3lo,s3hi
#     18-23  2^52, 2^84, 2^116, 2^148, 2^182, 5/2^130
#
# .poly1305_blocks_fpu -- $ctx=r3, $inp=r4, $len=r5, $padbit=r6
#   Processes $len>>4 16-byte blocks and returns at once through Labort
#   when that count is zero.  f14-f31 are saved in the frame and restored
#   on exit; FPSCR is switched to the value 1 built in the frame and the
#   original is restored on exit.  LR is stored but never reloaded, as the
#   routine makes no calls.  $padbit is ORed into the high word of the
#   fourth input "template" slot together with the exponent of 2^(52+96).
#   Input words for the next block are loaded while the current block is
#   being multiplied.  $len is negated and incremented once per block; the
#   carry produced when it reaches zero is shifted into a 16-byte rewind
#   of $inp ("conditional rewind").  The biased hash value is written back
#   to context slots 0-3 at the end.
81 $code.=<<___;
82 .machine        "any"
83 .text
84
85 .globl  .poly1305_init_fpu
86 .align  6
87 .poly1305_init_fpu:
88         $STU    $sp,-$LOCALS($sp)               # minimal frame
89         mflr    $padbit
90         $PUSH   $padbit,`$LOCALS+$LRSAVE`($sp)
91
92         bl      LPICmeup
93
94         xor     r0,r0,r0
95         mtlr    $padbit                         # restore lr
96
97         lfd     $two0,8*0($len)                 # load constants
98         lfd     $two32,8*1($len)
99         lfd     $two64,8*2($len)
100         lfd     $two96,8*3($len)
101         lfd     $two130,8*4($len)
102         lfd     $five_two130,8*5($len)
103
104         stfd    $two0,8*0($ctx)                 # initial hash value, biased 0
105         stfd    $two32,8*1($ctx)
106         stfd    $two64,8*2($ctx)
107         stfd    $two96,8*3($ctx)
108
109         $UCMP   $inp,r0
110         beq-    Lno_key
111
112         lfd     $h3lo,8*13($len)                # new fpscr
113         mffs    $h3hi                           # old fpscr
114
115         stfd    $two0,8*4($ctx)                 # key "template"
116         stfd    $two32,8*5($ctx)
117         stfd    $two64,8*6($ctx)
118         stfd    $two96,8*7($ctx)
119
120         li      $in1,4
121         li      $in2,8
122         li      $in3,12
123         $LWXLE  $in0,0,$inp                     # load key
124         $LWXLE  $in1,$in1,$inp
125         $LWXLE  $in2,$in2,$inp
126         $LWXLE  $in3,$in3,$inp
127
128         lis     $i1,0xf000                      #   0xf0000000
129         ori     $i2,$i1,3                       #   0xf0000003
130         andc    $in0,$in0,$i1                   # &=0x0fffffff
131         andc    $in1,$in1,$i2                   # &=0x0ffffffc
132         andc    $in2,$in2,$i2
133         andc    $in3,$in3,$i2
134
135         stw     $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx)     # fill "template"
136         stw     $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
137         stw     $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
138         stw     $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
139
140         mtfsf   255,$h3lo                       # fpscr
141         stfd    $two0,8*18($ctx)                # copy constants to context
142         stfd    $two32,8*19($ctx)
143         stfd    $two64,8*20($ctx)
144         stfd    $two96,8*21($ctx)
145         stfd    $two130,8*22($ctx)
146         stfd    $five_two130,8*23($ctx)
147
148         lfd     $h0lo,8*4($ctx)                 # load [biased] key
149         lfd     $h1lo,8*5($ctx)
150         lfd     $h2lo,8*6($ctx)
151         lfd     $h3lo,8*7($ctx)
152
153         fsub    $h0lo,$h0lo,$two0               # r0
154         fsub    $h1lo,$h1lo,$two32              # r1
155         fsub    $h2lo,$h2lo,$two64              # r2
156         fsub    $h3lo,$h3lo,$two96              # r3
157
158         lfd     $two0,8*6($len)                 # more constants
159         lfd     $two32,8*7($len)
160         lfd     $two64,8*8($len)
161         lfd     $two96,8*9($len)
162
163         fmul    $h1hi,$h1lo,$five_two130        # s1
164         fmul    $h2hi,$h2lo,$five_two130        # s2
165          stfd   $h3hi,8*15($ctx)                # borrow slot for original fpscr
166         fmul    $h3hi,$h3lo,$five_two130        # s3
167
168         fadd    $h0hi,$h0lo,$two0
169          stfd   $h1hi,8*12($ctx)                # put aside for now
170         fadd    $h1hi,$h1lo,$two32
171          stfd   $h2hi,8*13($ctx)
172         fadd    $h2hi,$h2lo,$two64
173          stfd   $h3hi,8*14($ctx)
174         fadd    $h3hi,$h3lo,$two96
175
176         fsub    $h0hi,$h0hi,$two0
177         fsub    $h1hi,$h1hi,$two32
178         fsub    $h2hi,$h2hi,$two64
179         fsub    $h3hi,$h3hi,$two96
180
181         lfd     $two0,8*10($len)                # more constants
182         lfd     $two32,8*11($len)
183         lfd     $two64,8*12($len)
184
185         fsub    $h0lo,$h0lo,$h0hi
186         fsub    $h1lo,$h1lo,$h1hi
187         fsub    $h2lo,$h2lo,$h2hi
188         fsub    $h3lo,$h3lo,$h3hi
189
190         stfd    $h0hi,8*5($ctx)                 # r0hi
191         stfd    $h1hi,8*7($ctx)                 # r1hi
192         stfd    $h2hi,8*9($ctx)                 # r2hi
193         stfd    $h3hi,8*11($ctx)                # r3hi
194
195         stfd    $h0lo,8*4($ctx)                 # r0lo
196         stfd    $h1lo,8*6($ctx)                 # r1lo
197         stfd    $h2lo,8*8($ctx)                 # r2lo
198         stfd    $h3lo,8*10($ctx)                # r3lo
199
200         lfd     $h1lo,8*12($ctx)                # s1
201         lfd     $h2lo,8*13($ctx)                # s2
202         lfd     $h3lo,8*14($ctx)                # s3
203         lfd     $h0lo,8*15($ctx)                # pull original fpscr
204
205         fadd    $h1hi,$h1lo,$two0
206         fadd    $h2hi,$h2lo,$two32
207         fadd    $h3hi,$h3lo,$two64
208
209         fsub    $h1hi,$h1hi,$two0
210         fsub    $h2hi,$h2hi,$two32
211         fsub    $h3hi,$h3hi,$two64
212
213         fsub    $h1lo,$h1lo,$h1hi
214         fsub    $h2lo,$h2lo,$h2hi
215         fsub    $h3lo,$h3lo,$h3hi
216
217         stfd    $h1hi,8*13($ctx)                # s1hi
218         stfd    $h2hi,8*15($ctx)                # s2hi
219         stfd    $h3hi,8*17($ctx)                # s3hi
220
221         stfd    $h1lo,8*12($ctx)                # s1lo
222         stfd    $h2lo,8*14($ctx)                # s2lo
223         stfd    $h3lo,8*16($ctx)                # s3lo
224
225         mtfsf   255,$h0lo                       # restore fpscr
226 Lno_key:
227         xor     r3,r3,r3
228         addi    $sp,$sp,$LOCALS
229         blr
230         .long   0
231         .byte   0,12,4,1,0x80,0,2,0
232 .size   .poly1305_init_fpu,.-.poly1305_init_fpu
233
234 .globl  .poly1305_blocks_fpu
235 .align  4
236 .poly1305_blocks_fpu:
237         srwi.   $len,$len,4
238         beq-    Labort
239
240         $STU    $sp,-$FRAME($sp)
241         mflr    r0
242         stfd    f14,`$FRAME-8*18`($sp)
243         stfd    f15,`$FRAME-8*17`($sp)
244         stfd    f16,`$FRAME-8*16`($sp)
245         stfd    f17,`$FRAME-8*15`($sp)
246         stfd    f18,`$FRAME-8*14`($sp)
247         stfd    f19,`$FRAME-8*13`($sp)
248         stfd    f20,`$FRAME-8*12`($sp)
249         stfd    f21,`$FRAME-8*11`($sp)
250         stfd    f22,`$FRAME-8*10`($sp)
251         stfd    f23,`$FRAME-8*9`($sp)
252         stfd    f24,`$FRAME-8*8`($sp)
253         stfd    f25,`$FRAME-8*7`($sp)
254         stfd    f26,`$FRAME-8*6`($sp)
255         stfd    f27,`$FRAME-8*5`($sp)
256         stfd    f28,`$FRAME-8*4`($sp)
257         stfd    f29,`$FRAME-8*3`($sp)
258         stfd    f30,`$FRAME-8*2`($sp)
259         stfd    f31,`$FRAME-8*1`($sp)
260         $PUSH   r0,`$FRAME+$LRSAVE`($sp)
261
262         xor     r0,r0,r0
263         li      $in3,1
264         mtctr   $len
265         neg     $len,$len
266         stw     r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
267         stw     $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
268
269         lfd     $two0,8*18($ctx)                # load constants
270         lfd     $two32,8*19($ctx)
271         lfd     $two64,8*20($ctx)
272         lfd     $two96,8*21($ctx)
273         lfd     $two130,8*22($ctx)
274         lfd     $five_two130,8*23($ctx)
275
276         lfd     $h0lo,8*0($ctx)                 # load [biased] hash value
277         lfd     $h1lo,8*1($ctx)
278         lfd     $h2lo,8*2($ctx)
279         lfd     $h3lo,8*3($ctx)
280
281         stfd    $two0,`$LOCALS+8*0`($sp)        # input "template"
282         oris    $in3,$padbit,`(1023+52+96)<<4`
283         stfd    $two32,`$LOCALS+8*1`($sp)
284         stfd    $two64,`$LOCALS+8*2`($sp)
285         stw     $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
286
287         li      $i1,4
288         li      $i2,8
289         li      $i3,12
290         $LWXLE  $in0,0,$inp                     # load input
291         $LWXLE  $in1,$i1,$inp
292         $LWXLE  $in2,$i2,$inp
293         $LWXLE  $in3,$i3,$inp
294         addi    $inp,$inp,16
295
296         stw     $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)      # fill "template"
297         stw     $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
298         stw     $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
299         stw     $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
300
301         mffs    $x0                             # original fpscr
302         lfd     $x1,`$LOCALS+8*4`($sp)          # new fpscr
303         lfd     $r0lo,8*4($ctx)                 # load key
304         lfd     $r0hi,8*5($ctx)
305         lfd     $r1lo,8*6($ctx)
306         lfd     $r1hi,8*7($ctx)
307         lfd     $r2lo,8*8($ctx)
308         lfd     $r2hi,8*9($ctx)
309         lfd     $r3lo,8*10($ctx)
310         lfd     $r3hi,8*11($ctx)
311         lfd     $s1lo,8*12($ctx)
312         lfd     $s1hi,8*13($ctx)
313         lfd     $s2lo,8*14($ctx)
314         lfd     $s2hi,8*15($ctx)
315         lfd     $s3lo,8*16($ctx)
316         lfd     $s3hi,8*17($ctx)
317
318         stfd    $x0,`$LOCALS+8*4`($sp)          # save original fpscr
319         mtfsf   255,$x1
320
321         addic   $len,$len,1
322         addze   r0,r0
323         slwi.   r0,r0,4
324         sub     $inp,$inp,r0                    # conditional rewind
325
326         lfd     $x0,`$LOCALS+8*0`($sp)
327         lfd     $x1,`$LOCALS+8*1`($sp)
328         lfd     $x2,`$LOCALS+8*2`($sp)
329         lfd     $x3,`$LOCALS+8*3`($sp)
330
331         fsub    $h0lo,$h0lo,$two0               # de-bias hash value
332          $LWXLE $in0,0,$inp                     # modulo-scheduled input load
333         fsub    $h1lo,$h1lo,$two32
334          $LWXLE $in1,$i1,$inp
335         fsub    $h2lo,$h2lo,$two64
336          $LWXLE $in2,$i2,$inp
337         fsub    $h3lo,$h3lo,$two96
338          $LWXLE $in3,$i3,$inp
339
340         fsub    $x0,$x0,$two0                   # de-bias input
341          addi   $inp,$inp,16
342         fsub    $x1,$x1,$two32
343         fsub    $x2,$x2,$two64
344         fsub    $x3,$x3,$two96
345
346         fadd    $x0,$x0,$h0lo                   # accumulate input
347          stw    $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
348         fadd    $x1,$x1,$h1lo
349          stw    $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
350         fadd    $x2,$x2,$h2lo
351          stw    $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
352         fadd    $x3,$x3,$h3lo
353          stw    $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
354
355         b       Lentry
356
357 .align  4
358 Loop:
359         fsub    $y0,$y0,$two0                   # de-bias input
360          addic  $len,$len,1
361         fsub    $y1,$y1,$two32
362          addze  r0,r0
363         fsub    $y2,$y2,$two64
364          slwi.  r0,r0,4
365         fsub    $y3,$y3,$two96
366          sub    $inp,$inp,r0                    # conditional rewind
367
368         fadd    $h0lo,$h0lo,$y0                 # accumulate input
369         fadd    $h0hi,$h0hi,$y1
370         fadd    $h2lo,$h2lo,$y2
371         fadd    $h2hi,$h2hi,$y3
372
373         ######################################### base 2^48 -> base 2^32
374         fadd    $c1lo,$h1lo,$two64
375          $LWXLE $in0,0,$inp                     # modulo-scheduled input load
376         fadd    $c1hi,$h1hi,$two64
377          $LWXLE $in1,$i1,$inp
378         fadd    $c3lo,$h3lo,$two130
379          $LWXLE $in2,$i2,$inp
380         fadd    $c3hi,$h3hi,$two130
381          $LWXLE $in3,$i3,$inp
382         fadd    $c0lo,$h0lo,$two32
383          addi   $inp,$inp,16
384         fadd    $c0hi,$h0hi,$two32
385         fadd    $c2lo,$h2lo,$two96
386         fadd    $c2hi,$h2hi,$two96
387
388         fsub    $c1lo,$c1lo,$two64
389          stw    $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)      # fill "template"
390         fsub    $c1hi,$c1hi,$two64
391          stw    $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
392         fsub    $c3lo,$c3lo,$two130
393          stw    $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
394         fsub    $c3hi,$c3hi,$two130
395          stw    $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
396         fsub    $c0lo,$c0lo,$two32
397         fsub    $c0hi,$c0hi,$two32
398         fsub    $c2lo,$c2lo,$two96
399         fsub    $c2hi,$c2hi,$two96
400
401         fsub    $h1lo,$h1lo,$c1lo
402         fsub    $h1hi,$h1hi,$c1hi
403         fsub    $h3lo,$h3lo,$c3lo
404         fsub    $h3hi,$h3hi,$c3hi
405         fsub    $h2lo,$h2lo,$c2lo
406         fsub    $h2hi,$h2hi,$c2hi
407         fsub    $h0lo,$h0lo,$c0lo
408         fsub    $h0hi,$h0hi,$c0hi
409
410         fadd    $h1lo,$h1lo,$c0lo
411         fadd    $h1hi,$h1hi,$c0hi
412         fadd    $h3lo,$h3lo,$c2lo
413         fadd    $h3hi,$h3hi,$c2hi
414         fadd    $h2lo,$h2lo,$c1lo
415         fadd    $h2hi,$h2hi,$c1hi
416         fmadd   $h0lo,$c3lo,$five_two130,$h0lo
417         fmadd   $h0hi,$c3hi,$five_two130,$h0hi
418
419         fadd    $x1,$h1lo,$h1hi
420          lfd    $s1lo,8*12($ctx)                # reload constants
421         fadd    $x3,$h3lo,$h3hi
422          lfd    $s1hi,8*13($ctx)
423         fadd    $x2,$h2lo,$h2hi
424          lfd    $r3lo,8*10($ctx)
425         fadd    $x0,$h0lo,$h0hi
426          lfd    $r3hi,8*11($ctx)
427 Lentry:
428         fmul    $h0lo,$s3lo,$x1
429         fmul    $h0hi,$s3hi,$x1
430         fmul    $h2lo,$r1lo,$x1
431         fmul    $h2hi,$r1hi,$x1
432         fmul    $h1lo,$r0lo,$x1
433         fmul    $h1hi,$r0hi,$x1
434         fmul    $h3lo,$r2lo,$x1
435         fmul    $h3hi,$r2hi,$x1
436
437         fmadd   $h0lo,$s1lo,$x3,$h0lo
438         fmadd   $h0hi,$s1hi,$x3,$h0hi
439         fmadd   $h2lo,$s3lo,$x3,$h2lo
440         fmadd   $h2hi,$s3hi,$x3,$h2hi
441         fmadd   $h1lo,$s2lo,$x3,$h1lo
442         fmadd   $h1hi,$s2hi,$x3,$h1hi
443         fmadd   $h3lo,$r0lo,$x3,$h3lo
444         fmadd   $h3hi,$r0hi,$x3,$h3hi
445
446         fmadd   $h0lo,$s2lo,$x2,$h0lo
447         fmadd   $h0hi,$s2hi,$x2,$h0hi
448         fmadd   $h2lo,$r0lo,$x2,$h2lo
449         fmadd   $h2hi,$r0hi,$x2,$h2hi
450         fmadd   $h1lo,$s3lo,$x2,$h1lo
451         fmadd   $h1hi,$s3hi,$x2,$h1hi
452         fmadd   $h3lo,$r1lo,$x2,$h3lo
453         fmadd   $h3hi,$r1hi,$x2,$h3hi
454
455         fmadd   $h0lo,$r0lo,$x0,$h0lo
456          lfd    $y0,`$LOCALS+8*0`($sp)          # load [biased] input
457         fmadd   $h0hi,$r0hi,$x0,$h0hi
458          lfd    $y1,`$LOCALS+8*1`($sp)
459         fmadd   $h2lo,$r2lo,$x0,$h2lo
460          lfd    $y2,`$LOCALS+8*2`($sp)
461         fmadd   $h2hi,$r2hi,$x0,$h2hi
462          lfd    $y3,`$LOCALS+8*3`($sp)
463         fmadd   $h1lo,$r1lo,$x0,$h1lo
464         fmadd   $h1hi,$r1hi,$x0,$h1hi
465         fmadd   $h3lo,$r3lo,$x0,$h3lo
466         fmadd   $h3hi,$r3hi,$x0,$h3hi
467
468         bdnz    Loop
469
470         ######################################### base 2^48 -> base 2^32
471         fadd    $c0lo,$h0lo,$two32
472         fadd    $c0hi,$h0hi,$two32
473         fadd    $c2lo,$h2lo,$two96
474         fadd    $c2hi,$h2hi,$two96
475         fadd    $c1lo,$h1lo,$two64
476         fadd    $c1hi,$h1hi,$two64
477         fadd    $c3lo,$h3lo,$two130
478         fadd    $c3hi,$h3hi,$two130
479
480         fsub    $c0lo,$c0lo,$two32
481         fsub    $c0hi,$c0hi,$two32
482         fsub    $c2lo,$c2lo,$two96
483         fsub    $c2hi,$c2hi,$two96
484         fsub    $c1lo,$c1lo,$two64
485         fsub    $c1hi,$c1hi,$two64
486         fsub    $c3lo,$c3lo,$two130
487         fsub    $c3hi,$c3hi,$two130
488
489         fsub    $h1lo,$h1lo,$c1lo
490         fsub    $h1hi,$h1hi,$c1hi
491         fsub    $h3lo,$h3lo,$c3lo
492         fsub    $h3hi,$h3hi,$c3hi
493         fsub    $h2lo,$h2lo,$c2lo
494         fsub    $h2hi,$h2hi,$c2hi
495         fsub    $h0lo,$h0lo,$c0lo
496         fsub    $h0hi,$h0hi,$c0hi
497
498         fadd    $h1lo,$h1lo,$c0lo
499         fadd    $h1hi,$h1hi,$c0hi
500         fadd    $h3lo,$h3lo,$c2lo
501         fadd    $h3hi,$h3hi,$c2hi
502         fadd    $h2lo,$h2lo,$c1lo
503         fadd    $h2hi,$h2hi,$c1hi
504         fmadd   $h0lo,$c3lo,$five_two130,$h0lo
505         fmadd   $h0hi,$c3hi,$five_two130,$h0hi
506
507         fadd    $x1,$h1lo,$h1hi
508         fadd    $x3,$h3lo,$h3hi
509         fadd    $x2,$h2lo,$h2hi
510         fadd    $x0,$h0lo,$h0hi
511
512         lfd     $h0lo,`$LOCALS+8*4`($sp)        # pull saved fpscr
513         fadd    $x1,$x1,$two32                  # bias
514         fadd    $x3,$x3,$two96
515         fadd    $x2,$x2,$two64
516         fadd    $x0,$x0,$two0
517
518         stfd    $x1,8*1($ctx)                   # store [biased] hash value
519         stfd    $x3,8*3($ctx)
520         stfd    $x2,8*2($ctx)
521         stfd    $x0,8*0($ctx)
522
523         mtfsf   255,$h0lo                       # restore original fpscr
524         lfd     f14,`$FRAME-8*18`($sp)
525         lfd     f15,`$FRAME-8*17`($sp)
526         lfd     f16,`$FRAME-8*16`($sp)
527         lfd     f17,`$FRAME-8*15`($sp)
528         lfd     f18,`$FRAME-8*14`($sp)
529         lfd     f19,`$FRAME-8*13`($sp)
530         lfd     f20,`$FRAME-8*12`($sp)
531         lfd     f21,`$FRAME-8*11`($sp)
532         lfd     f22,`$FRAME-8*10`($sp)
533         lfd     f23,`$FRAME-8*9`($sp)
534         lfd     f24,`$FRAME-8*8`($sp)
535         lfd     f25,`$FRAME-8*7`($sp)
536         lfd     f26,`$FRAME-8*6`($sp)
537         lfd     f27,`$FRAME-8*5`($sp)
538         lfd     f28,`$FRAME-8*4`($sp)
539         lfd     f29,`$FRAME-8*3`($sp)
540         lfd     f30,`$FRAME-8*2`($sp)
541         lfd     f31,`$FRAME-8*1`($sp)
542         addi    $sp,$sp,$FRAME
543 Labort:
544         blr
545         .long   0
546         .byte   0,12,4,1,0x80,0,4,0
547 .size   .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
548 ___
# Append the assembly for .poly1305_emit_fpu to $code.
#
# .poly1305_emit_fpu -- $ctx=r3, $mac=r4, $nonce=r5
#   Reads the four biased hash doubles from context slots 0-3 as pairs of
#   32-bit words, clears the top 12 bits of each high word ("mask
#   exponent"), folds the bits of the fourth high word above bit 1 back
#   into the sum ("can be partially reduced... so reduce"), and adds 5 to
#   compare against the modulus.  A mask derived from the resulting carry
#   selects either the original or the +5 value without branching.  The
#   16-byte nonce is then added and the result is stored to $mac as four
#   little-endian 32-bit words (stw on little-endian flavours, stwbrx
#   otherwise).  r28-r31 are saved on entry and restored on exit.
#
# The scope below gives $inp/$len the names $mac/$nonce and declares its
# own $FRAME, which shadows the file-level one only inside these braces.
549 {
550 my ($mac,$nonce)=($inp,$len);
551
552 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
553    ) = map("r$_",(7..11,28..31));
554 my $mask = "r0";
555 my $FRAME = (6+4)*$SIZE_T;
556
557 $code.=<<___;
558 .globl  .poly1305_emit_fpu
559 .align  4
560 .poly1305_emit_fpu:
561         $STU    $sp,-$FRAME($sp)
562         mflr    r0
563         $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
564         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
565         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
566         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
567         $PUSH   r0,`$FRAME+$LRSAVE`($sp)
568
569         lwz     $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx)      # load hash
570         lwz     $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
571         lwz     $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
572         lwz     $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
573         lwz     $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
574         lwz     $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
575         lwz     $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
576         lwz     $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
577
578         lis     $mask,0xfff0
579         andc    $d0,$d0,$mask                   # mask exponent
580         andc    $d1,$d1,$mask
581         andc    $d2,$d2,$mask
582         andc    $d3,$d3,$mask                   # can be partially reduced...
583         li      $mask,3
584
585         srwi    $padbit,$d3,2                   # ... so reduce
586         and     $h4,$d3,$mask
587         andc    $d3,$d3,$mask
588         add     $d3,$d3,$padbit
589 ___
# 32-bit flavours: the value is kept in four 32-bit registers plus $h4 and
# combined with add-with-carry chains.
590                                                 if ($SIZE_T==4) {
591 $code.=<<___;
592         addc    $h0,$h0,$d3
593         adde    $h1,$h1,$d0
594         adde    $h2,$h2,$d1
595         adde    $h3,$h3,$d2
596         addze   $h4,$h4
597
598         addic   $d0,$h0,5                       # compare to modulus
599         addze   $d1,$h1
600         addze   $d2,$h2
601         addze   $d3,$h3
602         addze   $mask,$h4
603
604         srwi    $mask,$mask,2                   # did it carry/borrow?
605         neg     $mask,$mask
606         srawi   $mask,$mask,31                  # mask
607
608         andc    $h0,$h0,$mask
609         and     $d0,$d0,$mask
610         andc    $h1,$h1,$mask
611         and     $d1,$d1,$mask
612         or      $h0,$h0,$d0
613         lwz     $d0,0($nonce)                   # load nonce
614         andc    $h2,$h2,$mask
615         and     $d2,$d2,$mask
616         or      $h1,$h1,$d1
617         lwz     $d1,4($nonce)
618         andc    $h3,$h3,$mask
619         and     $d3,$d3,$mask
620         or      $h2,$h2,$d2
621         lwz     $d2,8($nonce)
622         or      $h3,$h3,$d3
623         lwz     $d3,12($nonce)
624
625         addc    $h0,$h0,$d0                     # accumulate nonce
626         adde    $h1,$h1,$d1
627         adde    $h2,$h2,$d2
628         adde    $h3,$h3,$d3
629 ___
# 64-bit flavours: carries are propagated with 32-bit right shifts, then the
# limbs are packed pairwise into two 64-bit registers ($h0 and $h2) before
# the comparison and the nonce addition.
630                                                 } else {
631 $code.=<<___;
632         add     $h0,$h0,$d3
633         add     $h1,$h1,$d0
634         add     $h2,$h2,$d1
635         add     $h3,$h3,$d2
636
637         srdi    $d0,$h0,32
638         add     $h1,$h1,$d0
639         srdi    $d1,$h1,32
640         add     $h2,$h2,$d1
641         srdi    $d2,$h2,32
642         add     $h3,$h3,$d2
643         srdi    $d3,$h3,32
644         add     $h4,$h4,$d3
645
646         insrdi  $h0,$h1,32,0
647         insrdi  $h2,$h3,32,0
648
649         addic   $d0,$h0,5                       # compare to modulus
650         addze   $d1,$h2
651         addze   $d2,$h4
652
653         srdi    $mask,$d2,2                     # did it carry/borrow?
654         neg     $mask,$mask
655         sradi   $mask,$mask,63                  # mask
656         ld      $d2,0($nonce)                   # load nonce
657         ld      $d3,8($nonce)
658
659         andc    $h0,$h0,$mask
660         and     $d0,$d0,$mask
661         andc    $h2,$h2,$mask
662         and     $d1,$d1,$mask
663         or      $h0,$h0,$d0
664         or      $h2,$h2,$d1
665 ___
# Only on big-endian flavours: swap the 32-bit halves of each nonce
# doubleword loaded by ld.
666 $code.=<<___    if (!$LITTLE_ENDIAN);
667         rotldi  $d2,$d2,32                      # flip nonce words
668         rotldi  $d3,$d3,32
669 ___
670 $code.=<<___;
671         addc    $h0,$h0,$d2                     # accumulate nonce
672         adde    $h2,$h2,$d3
673
674         srdi    $h1,$h0,32
675         srdi    $h3,$h2,32
676 ___
677                                                 }
# Store the result: plain word stores on little-endian flavours,
# byte-reversed stores otherwise.
678 $code.=<<___    if ($LITTLE_ENDIAN);
679         stw     $h0,0($mac)                     # write result
680         stw     $h1,4($mac)
681         stw     $h2,8($mac)
682         stw     $h3,12($mac)
683 ___
684 $code.=<<___    if (!$LITTLE_ENDIAN);
685         li      $d1,4
686         stwbrx  $h0,0,$mac                      # write result
687         li      $d2,8
688         stwbrx  $h1,$d1,$mac
689         li      $d3,12
690         stwbrx  $h2,$d2,$mac
691         stwbrx  $h3,$d3,$mac
692 ___
693 $code.=<<___;
694         $POP    r28,`$FRAME-$SIZE_T*4`($sp)
695         $POP    r29,`$FRAME-$SIZE_T*3`($sp)
696         $POP    r30,`$FRAME-$SIZE_T*2`($sp)
697         $POP    r31,`$FRAME-$SIZE_T*1`($sp)
698         addi    $sp,$sp,$FRAME
699         blr
700         .long   0
701         .byte   0,12,4,1,0x80,4,3,0
702 .size   .poly1305_emit_fpu,.-.poly1305_emit_fpu
703 ___
704 }
705 # Ugly hack here, because PPC assembler syntax seems to vary too
706 # much from platform to platform...
# Append LPICmeup and the constant table to $code.
#
# LPICmeup returns, in $len, the address of the first table entry without
# using any absolute address: "bcl 20,31,$+4" puts the address of the
# following instruction into LR, and 64-8 is added to it.  The caller's LR
# is kept in r0 and restored before blr.  The routine and its trailer take
# 9 words; .space pads that to 64 bytes so the table starts exactly 64-8
# bytes after the address captured by bcl.
#
# Table entries, as indexed by poly1305_init_fpu in 8-byte units:
#    0- 4  2^52, 2^84, 2^116, 2^148, 2^182
#    5     5/2^130
#    6- 9  2^(52+16+n) for n = 0, 32, 64, 96
#   10-12  2^(52+16+n-96) for n = 0, 32, 64
#   13     FPSCR image loaded with mtfsf ("truncate, no exceptions")
707 $code.=<<___;
708 .align  6
709 LPICmeup:
710         mflr    r0
711         bcl     20,31,\$+4
712         mflr    $len    # vvvvvv "distance" between . and 1st data entry
713         addi    $len,$len,`64-8`        # borrow $len
714         mtlr    r0
715         blr
716         .long   0
717         .byte   0,12,0x14,0,0,0,0,0
718         .space  `64-9*4`
719
720 .quad   0x4330000000000000              # 2^(52+0)
721 .quad   0x4530000000000000              # 2^(52+32)
722 .quad   0x4730000000000000              # 2^(52+64)
723 .quad   0x4930000000000000              # 2^(52+96)
724 .quad   0x4b50000000000000              # 2^(52+130)
725
726 .quad   0x37f4000000000000              # 5/2^130
727
728 .quad   0x4430000000000000              # 2^(52+16+0)
729 .quad   0x4630000000000000              # 2^(52+16+32)
730 .quad   0x4830000000000000              # 2^(52+16+64)
731 .quad   0x4a30000000000000              # 2^(52+16+96)
732 .quad   0x3e30000000000000              # 2^(52+16+0-96)
733 .quad   0x4030000000000000              # 2^(52+16+32-96)
734 .quad   0x4230000000000000              # 2^(52+16+64-96)
735
736 .quad   0x0000000000000001              # fpscr: truncate, no exceptions
737 .asciz  "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
738 .align  4
739 ___
740
# Replace every `...` span in the generated text with the value of the Perl
# expression it contains, then send the text to STDOUT, which at this point
# is the pipe into the xlate script.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print STDOUT $code;

# Closing the pipe is checked so that a failure is not silently ignored.
close(STDOUT) or die "error closing STDOUT: $!";