#! /usr/bin/env perl
# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction by 170%, i.e. 2.7x (resulting in 0.55 cpb).
# POWER9 delivers 0.51 cpb.

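# N-fold aggregation works by having gcm_init_p8 precompute H^2 (and
# H^3, H^4 for the 4x code path below), so that N input blocks can be
# multiplied by descending powers of H and summed before performing a
# single reduction, instead of reducing after every block.
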
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

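# $flavour is e.g. linux32, linux64 or linux64le; select matching
# load/store/compare/shift mnemonics and ABI type sizes.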
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
	$UCMP="cmpld";
	$SHRI="srdi";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
	$UCMP="cmplw";
	$SHRI="srwi";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";

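# Htbl layout, in 16-byte slots as written out by gcm_init_p8 below:
# 0x00 the 0xc2... reduction constant; 0x10/0x20/0x30 H.lo/H/H.hi;
# 0x40-0x60 the same for H^2; 0x70-0x90 for H^3; 0xa0-0xc0 for H^4.
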
$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
.align	5
.gcm_init_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$IN,$H,$t1		# twisted H
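	# "twisted H" is H shifted left by one bit with the 0xc2....01
	# reduction constant xored in if the carry bit was set, i.e. H
	# multiplied by x in the field; this representation lets every
	# vpmsumd product below be reduced with the single $xC2 constant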

	vsldoi		$H,$IN,$IN,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	li		r8,0x40
	stvx_u		$H, r9,r3
	li		r9,0x50
	stvx_u		$Hh,r10,r3
	li		r10,0x60

	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$IN1,$Xl,$t1
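	# $Xl:$Xm:$Xh is the 256-bit product of the two 128-bit field
	# elements ($Xm holds the cross terms and is folded into
	# $Xl/$Xh first).  Each reduction phase multiplies one 64-bit
	# half by the $xC2 constant (whose other doubleword is zero)
	# and folds the result back in, retiring 64 bits per phase
	# until a 128-bit remainder is left.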

	vsldoi		$H2,$IN1,$IN1,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$H2l,r8,r3		# save H^2
	li		r8,0x70
	stvx_u		$H2,r9,r3
	li		r9,0x80
	stvx_u		$H2h,r10,r3
	li		r10,0x90
___
{
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vsldoi		$t4,$Xm1,$zero,8
	 vsldoi		$t5,$zero,$Xm1,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1
	 vxor		$Xl1,$Xl1,$t4
	 vxor		$Xh1,$Xh1,$t5

	vsldoi		$Xl,$Xl,$Xl,8
	 vsldoi		$Xl1,$Xl1,$Xl1,8
	vxor		$Xl,$Xl,$t2
	 vxor		$Xl1,$Xl1,$t6

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	 vpmsumd	$Xl1,$Xl1,$xC2
	vxor		$t1,$t1,$Xh
	 vxor		$t5,$t5,$Xh1
	vxor		$Xl,$Xl,$t1
	 vxor		$Xl1,$Xl1,$t5

	vsldoi		$H,$Xl,$Xl,8
	 vsldoi		$H2,$Xl1,$Xl1,8
	vsldoi		$Hl,$zero,$H,8
	vsldoi		$Hh,$H,$zero,8
	 vsldoi		$H2l,$zero,$H2,8
	 vsldoi		$H2h,$H2,$zero,8

	stvx_u		$Hl,r8,r3		# save H^3
	li		r8,0xa0
	stvx_u		$H,r9,r3
	li		r9,0xb0
	stvx_u		$Hh,r10,r3
	li		r10,0xc0
	 stvx_u		$H2l,r8,r3		# save H^4
	 stvx_u		$H2,r9,r3
	 stvx_u		$H2h,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8
___
}
$code.=<<___;
.globl	.gcm_gmult_p8
.align	5
.gcm_gmult_p8:
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
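	# on little-endian, $lemask is 0x0706..0908 and the le?vperm
	# instructions byte-swap each doubleword of loaded data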
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
.align	5
.gcm_ghash_p8:
	li		r0,-4096
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	li		r8,0x40
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	li		r9,0x50
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	li		r10,0x60
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

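	# 64 bytes (four blocks) or more are handled by the 4x code path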
	${UCMP}i	$len,64
	bge		Lgcm_ghash_p8_4x

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subic.		$len,$len,16
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	beq		Lshort

	lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,16
	lvx_u		$H2, r9,$Htbl
	add		r9,$inp,$len		# end of input
	lvx_u		$H2h,r10,$Htbl
	be?b		Loop_2x

.align	5
Loop_2x:
	lvx_u		$IN1,0,$inp
	le?vperm	$IN1,$IN1,$IN1,$lemask

	 subic		$len,$len,32
	vpmsumd		$Xl,$IN,$H2l		# H^2.lo·Xi.lo
	 vpmsumd	$Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
	 subfe		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
	 vpmsumd	$Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
	 and		r0,r0,$len
	vpmsumd		$Xh,$IN,$H2h		# H^2.hi·Xi.hi
	 vpmsumd	$Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
	 add		$inp,$inp,r0
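	# if $len underflowed (only one block remained), the masked add
	# rewinds $inp so that the trailing lvx_u below stays within
	# the input buffer; the loop then exits on the next comparison
	# and the extra load is ignored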

	vxor		$Xl,$Xl,$Xl1
	vxor		$Xm,$Xm,$Xm1

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vxor		$Xh,$Xh,$Xh1
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	 lvx_u		$IN,r8,$inp
	 addi		$inp,$inp,32

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	$UCMP		r9,$inp
	bgt		Loop_2x			# done yet?

	cmplwi		$len,0
	bne		Leven

Lshort:
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh

Leven:
	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
___
{
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
my $IN0=$IN;
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);

$code.=<<___;
.align	5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
	$STU		$sp,-$FRAME($sp)
	li		r10,`15+6*$SIZE_T`
	li		r11,`31+6*$SIZE_T`
	stvx		v20,r10,$sp
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	li		r10,0x60
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME-4`($sp)	# save vrsave
	mtspr		256,r0			# preserve all AltiVec registers

	lvsl		$t0,0,r8		# 0x0001..0e0f
	#lvx_u		$H2l,r8,$Htbl		# load H^2
	li		r8,0x70
	lvx_u		$H2, r9,$Htbl
	li		r9,0x80
	vspltisb	$t1,8			# 0x0808..0808
	#lvx_u		$H2h,r10,$Htbl
	li		r10,0x90
	lvx_u		$H3l,r8,$Htbl		# load H^3
	li		r8,0xa0
	lvx_u		$H3, r9,$Htbl
	li		r9,0xb0
	lvx_u		$H3h,r10,$Htbl
	li		r10,0xc0
	lvx_u		$H4l,r8,$Htbl		# load H^4
	li		r8,0x10
	lvx_u		$H4, r9,$Htbl
	li		r9,0x20
	lvx_u		$H4h,r10,$Htbl
	li		r10,0x30

	vsldoi		$t2,$zero,$t1,8		# 0x0000..0808
	vaddubm		$hiperm,$t0,$t2		# 0x0001..1617
	vaddubm		$loperm,$t1,$hiperm	# 0x0809..1e1f
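	# $hiperm/$loperm each pack one doubleword from two different
	# vectors into a single register, so that one vpmsumd below
	# multiplies two input blocks by two different powers of H and
	# xors the products in a single instruction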

	$SHRI		$len,$len,4		# this allows the sign bit
						# to be used as carry
	lvx_u		$IN0,0,$inp		# load input
	lvx_u		$IN1,r8,$inp
	subic.		$len,$len,8
	lvx_u		$IN2,r9,$inp
	lvx_u		$IN3,r10,$inp
	addi		$inp,$inp,0x40
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask
	le?vperm	$IN3,$IN3,$IN3,$lemask

	vxor		$Xh,$IN0,$Xl

	 vpmsumd	$Xl1,$IN1,$H3l
	 vpmsumd	$Xm1,$IN1,$H3
	 vpmsumd	$Xh1,$IN1,$H3h

	 vperm		$H21l,$H2,$H,$hiperm
	 vperm		$t0,$IN2,$IN3,$loperm
	 vperm		$H21h,$H2,$H,$loperm
	 vperm		$t1,$IN2,$IN3,$hiperm
	 vpmsumd	$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	 vpmsumd	$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
	 vpmsumd	$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	 vxor		$Xm2,$Xm2,$Xm1
	 vxor		$Xl3,$Xl3,$Xl1
	 vxor		$Xm3,$Xm3,$Xm2
	 vxor		$Xh3,$Xh3,$Xh1

	blt		Ltail_4x

Loop_4x:
	lvx_u		$IN0,0,$inp
	lvx_u		$IN1,r8,$inp
	subic.		$len,$len,4
	lvx_u		$IN2,r9,$inp
	lvx_u		$IN3,r10,$inp
	addi		$inp,$inp,0x40
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask
	le?vperm	$IN3,$IN3,$IN3,$lemask
	le?vperm	$IN0,$IN0,$IN0,$lemask

	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
	 vpmsumd	$Xl1,$IN1,$H3l
	 vpmsumd	$Xm1,$IN1,$H3
	 vpmsumd	$Xh1,$IN1,$H3h

	vxor		$Xl,$Xl,$Xl3
	vxor		$Xm,$Xm,$Xm3
	vxor		$Xh,$Xh,$Xh3
	 vperm		$t0,$IN2,$IN3,$loperm
	 vperm		$t1,$IN2,$IN3,$hiperm

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	 vpmsumd	$Xl3,$t0,$H21l		# H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
	 vpmsumd	$Xh3,$t1,$H21h		# H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	 vpmsumd	$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
	vpmsumd		$Xl,$Xl,$xC2

	 vxor		$Xl3,$Xl3,$Xl1
	 vxor		$Xh3,$Xh3,$Xh1
	vxor		$Xh,$Xh,$IN0
	 vxor		$Xm2,$Xm2,$Xm1
	vxor		$Xh,$Xh,$t1
	 vxor		$Xm3,$Xm3,$Xm2
	vxor		$Xh,$Xh,$Xl
	bge		Loop_4x

Ltail_4x:
	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi

	vxor		$Xl,$Xl,$Xl3
	vxor		$Xm,$Xm,$Xm3

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vxor		$Xh,$Xh,$Xh3
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	addic.		$len,$len,4
	beq		Ldone_4x

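	# one to three blocks remain; dispatch to Lone/Ltwo/Lthree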
	lvx_u		$IN0,0,$inp
	${UCMP}i	$len,2
	li		$len,-4
	blt		Lone
	lvx_u		$IN1,r8,$inp
	beq		Ltwo

Lthree:
	lvx_u		$IN2,r9,$inp
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask
	le?vperm	$IN2,$IN2,$IN2,$lemask

	vxor		$Xh,$IN0,$Xl
	vmr		$H4l,$H3l
	vmr		$H4, $H3
	vmr		$H4h,$H3h

	vperm		$t0,$IN1,$IN2,$loperm
	vperm		$t1,$IN1,$IN2,$hiperm
	vpmsumd		$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	vpmsumd		$Xm3,$IN2,$H		# H.hi·Xi+2.lo  +H.lo·Xi+2.hi
	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	vxor		$Xm3,$Xm3,$Xm2
	b		Ltail_4x

.align	4
Ltwo:
	le?vperm	$IN0,$IN0,$IN0,$lemask
	le?vperm	$IN1,$IN1,$IN1,$lemask

	vxor		$Xh,$IN0,$Xl
	vperm		$t0,$zero,$IN1,$loperm
	vperm		$t1,$zero,$IN1,$hiperm

	vsldoi		$H4l,$zero,$H2,8
	vmr		$H4, $H2
	vsldoi		$H4h,$H2,$zero,8

	vpmsumd		$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
	vpmsumd		$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+2.hi
	vpmsumd		$Xh3,$t1, $H21h		# H.hi·Xi+1.hi

	b		Ltail_4x

.align	4
Lone:
	le?vperm	$IN0,$IN0,$IN0,$lemask

	vsldoi		$H4l,$zero,$H,8
	vmr		$H4, $H
	vsldoi		$H4h,$H,$zero,8

	vxor		$Xh,$IN0,$Xl
	vxor		$Xl3,$Xl3,$Xl3
	vxor		$Xm3,$Xm3,$Xm3
	vxor		$Xh3,$Xh3,$Xh3

	b		Ltail_4x

Ldone_4x:
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	li		r10,`15+6*$SIZE_T`
	li		r11,`31+6*$SIZE_T`
	mtspr		256,$vrsave
	lvx		v20,r10,$sp
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,0,4,0
	.long		0
___
}
$code.=<<___;
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz	"GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

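# Post-process $code: evaluate the `...` expressions, then keep the
# le?- or be?-prefixed instructions matching the target endianness and
# comment out the other set.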
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o;
	} else {
	    s/le\?/#le#/o	or
	    s/be\?//o;
	}
	print $_,"\n";
}

close STDOUT; # enforce flush