PPC assembly pack: add POWER9 results.
[openssl.git] / crypto / modes / asm / ghashp8-ppc.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # GHASH for PowerISA v2.07.
18 #
19 # July 2014
20 #
21 # Accurate performance measurements are problematic, because it's
22 # always virtualized setup with possibly throttled processor.
23 # Relative comparison is therefore more informative. This initial
24 # version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
25 # faster than "4-bit" integer-only compiler-generated 64-bit code.
26 # "Initial version" means that there is room for further improvement.
27
28 # May 2016
29 #
30 # 2x aggregated reduction improves performance by 50% (resulting
31 # performance on POWER8 is 1 cycle per processed byte), and 4x
32 # aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
33 # POWER9 delivers 0.40 cpb.
34
35 $flavour=shift;         # first argument; matched below against /64/, /32/ and (later) /le$/
36 $output =shift;         # second argument; passed through to ppc-xlate.pl on its command line
37 # Pick word size plus the mnemonics interpolated into the assembly text for that word size.
38 if ($flavour =~ /64/) {
39         $SIZE_T=8;
40         $LRSAVE=2*$SIZE_T;
41         $STU="stdu";    # store-with-update form; allocates the stack frame in the 4x path
42         $POP="ld";
43         $PUSH="std";
44         $UCMP="cmpld";  # unsigned compare used on $len and on the end-of-input pointer
45         $SHRI="srdi";   # shift-right-immediate used to turn the byte count into a block count
46 } elsif ($flavour =~ /32/) {
47         $SIZE_T=4;
48         $LRSAVE=$SIZE_T;
49         $STU="stwu";
50         $POP="lwz";
51         $PUSH="stw";
52         $UCMP="cmplw";
53         $SHRI="srwi";
54 } else { die "nonsense $flavour"; }
55 # NOTE(review): $LRSAVE, $POP and $PUSH do not appear in the assembly text visible in this file.
56 $sp="r1";
57 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
58
59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;      # directory part of this script's own path
60 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
61 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
62 die "can't locate ppc-xlate.pl";
63 # Pipe everything printed below through ppc-xlate.pl; "or" (not "||") so a failed open actually dies.
64 open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
65
66 my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));    # argument block
67 # Vector register names v0-v19; v20-v31 are assigned further down for the 4x code path.
68 my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
69 my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
70 my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
71 my $vrsave="r12";       # GPR that keeps the entry value of SPR 256 so each routine can restore it
72 # First assembly chunk: gcm_init_p8 loads H via r4 and stores the 0xc2 constant, H and H^2 (lo/full/hi vectors) at 0x00-0x60 from r3.
73 $code=<<___;
74 .machine        "any"
75
76 .text
77
78 .globl  .gcm_init_p8
79 .align  5
80 .gcm_init_p8:
81         li              r0,-4096
82         li              r8,0x10
83         mfspr           $vrsave,256
84         li              r9,0x20
85         mtspr           256,r0
86         li              r10,0x30
87         lvx_u           $H,0,r4                 # load H
88
89         vspltisb        $xC2,-16                # 0xf0
90         vspltisb        $t0,1                   # one
91         vaddubm         $xC2,$xC2,$xC2          # 0xe0
92         vxor            $zero,$zero,$zero
93         vor             $xC2,$xC2,$t0           # 0xe1
94         vsldoi          $xC2,$xC2,$zero,15      # 0xe1...
95         vsldoi          $t1,$zero,$t0,1         # ...1
96         vaddubm         $xC2,$xC2,$xC2          # 0xc2...
97         vspltisb        $t2,7
98         vor             $xC2,$xC2,$t1           # 0xc2....01
99         vspltb          $t1,$H,0                # most significant byte
100         vsl             $H,$H,$t0               # H<<=1
101         vsrab           $t1,$t1,$t2             # broadcast carry bit
102         vand            $t1,$t1,$xC2
103         vxor            $IN,$H,$t1              # twisted H
104
105         vsldoi          $H,$IN,$IN,8            # twist even more ...
106         vsldoi          $xC2,$zero,$xC2,8       # 0xc2.0
107         vsldoi          $Hl,$zero,$H,8          # ... and split
108         vsldoi          $Hh,$H,$zero,8
109
110         stvx_u          $xC2,0,r3               # save pre-computed table
111         stvx_u          $Hl,r8,r3
112         li              r8,0x40
113         stvx_u          $H, r9,r3
114         li              r9,0x50
115         stvx_u          $Hh,r10,r3
116         li              r10,0x60
117
118         vpmsumd         $Xl,$IN,$Hl             # H.lo·H.lo
119         vpmsumd         $Xm,$IN,$H              # H.hi·H.lo+H.lo·H.hi
120         vpmsumd         $Xh,$IN,$Hh             # H.hi·H.hi
121
122         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
123
124         vsldoi          $t0,$Xm,$zero,8
125         vsldoi          $t1,$zero,$Xm,8
126         vxor            $Xl,$Xl,$t0
127         vxor            $Xh,$Xh,$t1
128
129         vsldoi          $Xl,$Xl,$Xl,8
130         vxor            $Xl,$Xl,$t2
131
132         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
133         vpmsumd         $Xl,$Xl,$xC2
134         vxor            $t1,$t1,$Xh
135         vxor            $IN1,$Xl,$t1
136
137         vsldoi          $H2,$IN1,$IN1,8
138         vsldoi          $H2l,$zero,$H2,8
139         vsldoi          $H2h,$H2,$zero,8
140
141         stvx_u          $H2l,r8,r3              # save H^2
142         li              r8,0x70
143         stvx_u          $H2,r9,r3
144         li              r9,0x80
145         stvx_u          $H2h,r10,r3
146         li              r10,0x90
147 ___
148 {
149 my ($t4,$t5,$t6) = ($Hl,$H,$Hh);   # extra temporaries reuse H's registers; H was already stored to the table above
150 $code.=<<___;                      # rest of gcm_init_p8: compute H^3 and H^4 and store them at 0x70-0xc0
151         vpmsumd         $Xl,$IN,$H2l            # H.lo·H^2.lo
152          vpmsumd        $Xl1,$IN1,$H2l          # H^2.lo·H^2.lo
153         vpmsumd         $Xm,$IN,$H2             # H.hi·H^2.lo+H.lo·H^2.hi
154          vpmsumd        $Xm1,$IN1,$H2           # H^2.hi·H^2.lo+H^2.lo·H^2.hi
155         vpmsumd         $Xh,$IN,$H2h            # H.hi·H^2.hi
156          vpmsumd        $Xh1,$IN1,$H2h          # H^2.hi·H^2.hi
157
158         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
159          vpmsumd        $t6,$Xl1,$xC2           # 1st reduction phase
160
161         vsldoi          $t0,$Xm,$zero,8
162         vsldoi          $t1,$zero,$Xm,8
163          vsldoi         $t4,$Xm1,$zero,8
164          vsldoi         $t5,$zero,$Xm1,8
165         vxor            $Xl,$Xl,$t0
166         vxor            $Xh,$Xh,$t1
167          vxor           $Xl1,$Xl1,$t4
168          vxor           $Xh1,$Xh1,$t5
169
170         vsldoi          $Xl,$Xl,$Xl,8
171          vsldoi         $Xl1,$Xl1,$Xl1,8
172         vxor            $Xl,$Xl,$t2
173          vxor           $Xl1,$Xl1,$t6
174
175         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
176          vsldoi         $t5,$Xl1,$Xl1,8         # 2nd reduction phase
177         vpmsumd         $Xl,$Xl,$xC2
178          vpmsumd        $Xl1,$Xl1,$xC2
179         vxor            $t1,$t1,$Xh
180          vxor           $t5,$t5,$Xh1
181         vxor            $Xl,$Xl,$t1
182          vxor           $Xl1,$Xl1,$t5
183
184         vsldoi          $H,$Xl,$Xl,8
185          vsldoi         $H2,$Xl1,$Xl1,8
186         vsldoi          $Hl,$zero,$H,8
187         vsldoi          $Hh,$H,$zero,8
188          vsldoi         $H2l,$zero,$H2,8
189          vsldoi         $H2h,$H2,$zero,8
190
191         stvx_u          $Hl,r8,r3               # save H^3
192         li              r8,0xa0
193         stvx_u          $H,r9,r3
194         li              r9,0xb0
195         stvx_u          $Hh,r10,r3
196         li              r10,0xc0
197          stvx_u         $H2l,r8,r3              # save H^4
198          stvx_u         $H2,r9,r3
199          stvx_u         $H2h,r10,r3
200
201         mtspr           256,$vrsave
202         blr
203         .long           0
204         .byte           0,12,0x14,0,0,0,2,0
205         .long           0
206 .size   .gcm_init_p8,.-.gcm_init_p8
207 ___
208 }
209 $code.=<<___;  # gcm_gmult_p8, then gcm_ghash_p8's single-block and 2x paths (len >= 64 branches to the 4x path)
210 .globl  .gcm_gmult_p8
211 .align  5
212 .gcm_gmult_p8:
213         lis             r0,0xfff8
214         li              r8,0x10
215         mfspr           $vrsave,256
216         li              r9,0x20
217         mtspr           256,r0
218         li              r10,0x30
219         lvx_u           $IN,0,$Xip              # load Xi
220
221         lvx_u           $Hl,r8,$Htbl            # load pre-computed table
222          le?lvsl        $lemask,r0,r0
223         lvx_u           $H, r9,$Htbl
224          le?vspltisb    $t0,0x07
225         lvx_u           $Hh,r10,$Htbl
226          le?vxor        $lemask,$lemask,$t0
227         lvx_u           $xC2,0,$Htbl
228          le?vperm       $IN,$IN,$IN,$lemask
229         vxor            $zero,$zero,$zero
230
231         vpmsumd         $Xl,$IN,$Hl             # H.lo·Xi.lo
232         vpmsumd         $Xm,$IN,$H              # H.hi·Xi.lo+H.lo·Xi.hi
233         vpmsumd         $Xh,$IN,$Hh             # H.hi·Xi.hi
234
235         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
236
237         vsldoi          $t0,$Xm,$zero,8
238         vsldoi          $t1,$zero,$Xm,8
239         vxor            $Xl,$Xl,$t0
240         vxor            $Xh,$Xh,$t1
241
242         vsldoi          $Xl,$Xl,$Xl,8
243         vxor            $Xl,$Xl,$t2
244
245         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
246         vpmsumd         $Xl,$Xl,$xC2
247         vxor            $t1,$t1,$Xh
248         vxor            $Xl,$Xl,$t1
249
250         le?vperm        $Xl,$Xl,$Xl,$lemask
251         stvx_u          $Xl,0,$Xip              # write out Xi
252
253         mtspr           256,$vrsave
254         blr
255         .long           0
256         .byte           0,12,0x14,0,0,0,2,0
257         .long           0
258 .size   .gcm_gmult_p8,.-.gcm_gmult_p8
259
260 .globl  .gcm_ghash_p8
261 .align  5
262 .gcm_ghash_p8:
263         li              r0,-4096
264         li              r8,0x10
265         mfspr           $vrsave,256
266         li              r9,0x20
267         mtspr           256,r0
268         li              r10,0x30
269         lvx_u           $Xl,0,$Xip              # load Xi
270
271         lvx_u           $Hl,r8,$Htbl            # load pre-computed table
272         li              r8,0x40
273          le?lvsl        $lemask,r0,r0
274         lvx_u           $H, r9,$Htbl
275         li              r9,0x50
276          le?vspltisb    $t0,0x07
277         lvx_u           $Hh,r10,$Htbl
278         li              r10,0x60
279          le?vxor        $lemask,$lemask,$t0
280         lvx_u           $xC2,0,$Htbl
281          le?vperm       $Xl,$Xl,$Xl,$lemask
282         vxor            $zero,$zero,$zero
283
284         ${UCMP}i        $len,64
285         bge             Lgcm_ghash_p8_4x
286
287         lvx_u           $IN,0,$inp
288         addi            $inp,$inp,16
289         subic.          $len,$len,16
290          le?vperm       $IN,$IN,$IN,$lemask
291         vxor            $IN,$IN,$Xl
292         beq             Lshort
293
294         lvx_u           $H2l,r8,$Htbl           # load H^2
295         li              r8,16
296         lvx_u           $H2, r9,$Htbl
297         add             r9,$inp,$len            # end of input
298         lvx_u           $H2h,r10,$Htbl
299         be?b            Loop_2x
300
301 .align  5
302 Loop_2x:
303         lvx_u           $IN1,0,$inp
304         le?vperm        $IN1,$IN1,$IN1,$lemask
305
306          subic          $len,$len,32
307         vpmsumd         $Xl,$IN,$H2l            # H^2.lo·Xi.lo
308          vpmsumd        $Xl1,$IN1,$Hl           # H.lo·Xi+1.lo
309          subfe          r0,r0,r0                # borrow?-1:0
310         vpmsumd         $Xm,$IN,$H2             # H^2.hi·Xi.lo+H^2.lo·Xi.hi
311          vpmsumd        $Xm1,$IN1,$H            # H.hi·Xi+1.lo+H.lo·Xi+1.hi
312          and            r0,r0,$len
313         vpmsumd         $Xh,$IN,$H2h            # H^2.hi·Xi.hi
314          vpmsumd        $Xh1,$IN1,$Hh           # H.hi·Xi+1.hi
315          add            $inp,$inp,r0
316
317         vxor            $Xl,$Xl,$Xl1
318         vxor            $Xm,$Xm,$Xm1
319
320         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
321
322         vsldoi          $t0,$Xm,$zero,8
323         vsldoi          $t1,$zero,$Xm,8
324          vxor           $Xh,$Xh,$Xh1
325         vxor            $Xl,$Xl,$t0
326         vxor            $Xh,$Xh,$t1
327
328         vsldoi          $Xl,$Xl,$Xl,8
329         vxor            $Xl,$Xl,$t2
330          lvx_u          $IN,r8,$inp
331          addi           $inp,$inp,32
332
333         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
334         vpmsumd         $Xl,$Xl,$xC2
335          le?vperm       $IN,$IN,$IN,$lemask
336         vxor            $t1,$t1,$Xh
337         vxor            $IN,$IN,$t1
338         vxor            $IN,$IN,$Xl
339         $UCMP           r9,$inp
340         bgt             Loop_2x                 # done yet?
341
342         cmplwi          $len,0
343         bne             Leven
344
345 Lshort:
346         vpmsumd         $Xl,$IN,$Hl             # H.lo·Xi.lo
347         vpmsumd         $Xm,$IN,$H              # H.hi·Xi.lo+H.lo·Xi.hi
348         vpmsumd         $Xh,$IN,$Hh             # H.hi·Xi.hi
349
350         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
351
352         vsldoi          $t0,$Xm,$zero,8
353         vsldoi          $t1,$zero,$Xm,8
354         vxor            $Xl,$Xl,$t0
355         vxor            $Xh,$Xh,$t1
356
357         vsldoi          $Xl,$Xl,$Xl,8
358         vxor            $Xl,$Xl,$t2
359
360         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
361         vpmsumd         $Xl,$Xl,$xC2
362         vxor            $t1,$t1,$Xh
363
364 Leven:
365         vxor            $Xl,$Xl,$t1
366         le?vperm        $Xl,$Xl,$Xl,$lemask
367         stvx_u          $Xl,0,$Xip              # write out Xi
368
369         mtspr           256,$vrsave
370         blr
371         .long           0
372         .byte           0,12,0x14,0,0,0,4,0
373         .long           0
374 ___
375 {
376 my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,       # v20-v31: saved to the stack frame on entry to the 4x path, reloaded at Ldone_4x
377     $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
378 my $IN0=$IN;   # alias: same register as $IN
379 my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);    # aliases: vperm-merged H^2/H halves reuse $Hl/$Hh, permute masks reuse $H2l/$H2h
380 # 4x-aggregated path of gcm_ghash_p8, reached from the "bge Lgcm_ghash_p8_4x" taken when len >= 64.
381 $code.=<<___;
382 .align  5
383 .gcm_ghash_p8_4x:
384 Lgcm_ghash_p8_4x:
385         $STU            $sp,-$FRAME($sp)
386         li              r10,`15+6*$SIZE_T`
387         li              r11,`31+6*$SIZE_T`
388         stvx            v20,r10,$sp
389         addi            r10,r10,32
390         stvx            v21,r11,$sp
391         addi            r11,r11,32
392         stvx            v22,r10,$sp
393         addi            r10,r10,32
394         stvx            v23,r11,$sp
395         addi            r11,r11,32
396         stvx            v24,r10,$sp
397         addi            r10,r10,32
398         stvx            v25,r11,$sp
399         addi            r11,r11,32
400         stvx            v26,r10,$sp
401         addi            r10,r10,32
402         stvx            v27,r11,$sp
403         addi            r11,r11,32
404         stvx            v28,r10,$sp
405         addi            r10,r10,32
406         stvx            v29,r11,$sp
407         addi            r11,r11,32
408         stvx            v30,r10,$sp
409         li              r10,0x60
410         stvx            v31,r11,$sp
411         li              r0,-1
412         stw             $vrsave,`$FRAME-4`($sp) # save vrsave
413         mtspr           256,r0                  # preserve all AltiVec registers
414
415         lvsl            $t0,0,r8                # 0x0001..0e0f
416         #lvx_u          $H2l,r8,$Htbl           # load H^2
417         li              r8,0x70
418         lvx_u           $H2, r9,$Htbl
419         li              r9,0x80
420         vspltisb        $t1,8                   # 0x0808..0808
421         #lvx_u          $H2h,r10,$Htbl
422         li              r10,0x90
423         lvx_u           $H3l,r8,$Htbl           # load H^3
424         li              r8,0xa0
425         lvx_u           $H3, r9,$Htbl
426         li              r9,0xb0
427         lvx_u           $H3h,r10,$Htbl
428         li              r10,0xc0
429         lvx_u           $H4l,r8,$Htbl           # load H^4
430         li              r8,0x10
431         lvx_u           $H4, r9,$Htbl
432         li              r9,0x20
433         lvx_u           $H4h,r10,$Htbl
434         li              r10,0x30
435
436         vsldoi          $t2,$zero,$t1,8         # 0x0000..0808
437         vaddubm         $hiperm,$t0,$t2         # 0x0001..1617
438         vaddubm         $loperm,$t1,$hiperm     # 0x0809..1e1f
439
440         $SHRI           $len,$len,4             # this allows to use sign bit
441                                                 # as carry
442         lvx_u           $IN0,0,$inp             # load input
443         lvx_u           $IN1,r8,$inp
444         subic.          $len,$len,8
445         lvx_u           $IN2,r9,$inp
446         lvx_u           $IN3,r10,$inp
447         addi            $inp,$inp,0x40
448         le?vperm        $IN0,$IN0,$IN0,$lemask
449         le?vperm        $IN1,$IN1,$IN1,$lemask
450         le?vperm        $IN2,$IN2,$IN2,$lemask
451         le?vperm        $IN3,$IN3,$IN3,$lemask
452
453         vxor            $Xh,$IN0,$Xl
454
455          vpmsumd        $Xl1,$IN1,$H3l
456          vpmsumd        $Xm1,$IN1,$H3
457          vpmsumd        $Xh1,$IN1,$H3h
458
459          vperm          $H21l,$H2,$H,$hiperm
460          vperm          $t0,$IN2,$IN3,$loperm
461          vperm          $H21h,$H2,$H,$loperm
462          vperm          $t1,$IN2,$IN3,$hiperm
463          vpmsumd        $Xm2,$IN2,$H2           # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
464          vpmsumd        $Xl3,$t0,$H21l          # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
465          vpmsumd        $Xm3,$IN3,$H            # H.hi·Xi+3.lo  +H.lo·Xi+3.hi
466          vpmsumd        $Xh3,$t1,$H21h          # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
467
468          vxor           $Xm2,$Xm2,$Xm1
469          vxor           $Xl3,$Xl3,$Xl1
470          vxor           $Xm3,$Xm3,$Xm2
471          vxor           $Xh3,$Xh3,$Xh1
472
473         blt             Ltail_4x
474
475 Loop_4x:
476         lvx_u           $IN0,0,$inp
477         lvx_u           $IN1,r8,$inp
478         subic.          $len,$len,4
479         lvx_u           $IN2,r9,$inp
480         lvx_u           $IN3,r10,$inp
481         addi            $inp,$inp,0x40
482         le?vperm        $IN1,$IN1,$IN1,$lemask
483         le?vperm        $IN2,$IN2,$IN2,$lemask
484         le?vperm        $IN3,$IN3,$IN3,$lemask
485         le?vperm        $IN0,$IN0,$IN0,$lemask
486
487         vpmsumd         $Xl,$Xh,$H4l            # H^4.lo·Xi.lo
488         vpmsumd         $Xm,$Xh,$H4             # H^4.hi·Xi.lo+H^4.lo·Xi.hi
489         vpmsumd         $Xh,$Xh,$H4h            # H^4.hi·Xi.hi
490          vpmsumd        $Xl1,$IN1,$H3l
491          vpmsumd        $Xm1,$IN1,$H3
492          vpmsumd        $Xh1,$IN1,$H3h
493
494         vxor            $Xl,$Xl,$Xl3
495         vxor            $Xm,$Xm,$Xm3
496         vxor            $Xh,$Xh,$Xh3
497          vperm          $t0,$IN2,$IN3,$loperm
498          vperm          $t1,$IN2,$IN3,$hiperm
499
500         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
501          vpmsumd        $Xl3,$t0,$H21l          # H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
502          vpmsumd        $Xh3,$t1,$H21h          # H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
503
504         vsldoi          $t0,$Xm,$zero,8
505         vsldoi          $t1,$zero,$Xm,8
506         vxor            $Xl,$Xl,$t0
507         vxor            $Xh,$Xh,$t1
508
509         vsldoi          $Xl,$Xl,$Xl,8
510         vxor            $Xl,$Xl,$t2
511
512         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
513          vpmsumd        $Xm2,$IN2,$H2           # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
514          vpmsumd        $Xm3,$IN3,$H            # H.hi·Xi+3.lo  +H.lo·Xi+3.hi
515         vpmsumd         $Xl,$Xl,$xC2
516
517          vxor           $Xl3,$Xl3,$Xl1
518          vxor           $Xh3,$Xh3,$Xh1
519         vxor            $Xh,$Xh,$IN0
520          vxor           $Xm2,$Xm2,$Xm1
521         vxor            $Xh,$Xh,$t1
522          vxor           $Xm3,$Xm3,$Xm2
523         vxor            $Xh,$Xh,$Xl
524         bge             Loop_4x
525
526 Ltail_4x:
527         vpmsumd         $Xl,$Xh,$H4l            # H^4.lo·Xi.lo
528         vpmsumd         $Xm,$Xh,$H4             # H^4.hi·Xi.lo+H^4.lo·Xi.hi
529         vpmsumd         $Xh,$Xh,$H4h            # H^4.hi·Xi.hi
530
531         vxor            $Xl,$Xl,$Xl3
532         vxor            $Xm,$Xm,$Xm3
533
534         vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
535
536         vsldoi          $t0,$Xm,$zero,8
537         vsldoi          $t1,$zero,$Xm,8
538          vxor           $Xh,$Xh,$Xh3
539         vxor            $Xl,$Xl,$t0
540         vxor            $Xh,$Xh,$t1
541
542         vsldoi          $Xl,$Xl,$Xl,8
543         vxor            $Xl,$Xl,$t2
544
545         vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
546         vpmsumd         $Xl,$Xl,$xC2
547         vxor            $t1,$t1,$Xh
548         vxor            $Xl,$Xl,$t1
549
550         addic.          $len,$len,4
551         beq             Ldone_4x
552
553         lvx_u           $IN0,0,$inp
554         ${UCMP}i        $len,2
555         li              $len,-4
556         blt             Lone
557         lvx_u           $IN1,r8,$inp
558         beq             Ltwo
559
560 Lthree:
561         lvx_u           $IN2,r9,$inp
562         le?vperm        $IN0,$IN0,$IN0,$lemask
563         le?vperm        $IN1,$IN1,$IN1,$lemask
564         le?vperm        $IN2,$IN2,$IN2,$lemask
565
566         vxor            $Xh,$IN0,$Xl
567         vmr             $H4l,$H3l
568         vmr             $H4, $H3
569         vmr             $H4h,$H3h
570
571         vperm           $t0,$IN1,$IN2,$loperm
572         vperm           $t1,$IN1,$IN2,$hiperm
573         vpmsumd         $Xm2,$IN1,$H2           # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
574         vpmsumd         $Xm3,$IN2,$H            # H.hi·Xi+2.lo  +H.lo·Xi+2.hi
575         vpmsumd         $Xl3,$t0,$H21l          # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
576         vpmsumd         $Xh3,$t1,$H21h          # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
577
578         vxor            $Xm3,$Xm3,$Xm2
579         b               Ltail_4x
580
581 .align  4
582 Ltwo:
583         le?vperm        $IN0,$IN0,$IN0,$lemask
584         le?vperm        $IN1,$IN1,$IN1,$lemask
585
586         vxor            $Xh,$IN0,$Xl
587         vperm           $t0,$zero,$IN1,$loperm
588         vperm           $t1,$zero,$IN1,$hiperm
589
590         vsldoi          $H4l,$zero,$H2,8
591         vmr             $H4, $H2
592         vsldoi          $H4h,$H2,$zero,8
593
594         vpmsumd         $Xl3,$t0, $H21l         # H.lo·Xi+1.lo
595         vpmsumd         $Xm3,$IN1,$H            # H.hi·Xi+1.lo+H.lo·Xi+2.hi
596         vpmsumd         $Xh3,$t1, $H21h         # H.hi·Xi+1.hi
597
598         b               Ltail_4x
599
600 .align  4
601 Lone:
602         le?vperm        $IN0,$IN0,$IN0,$lemask
603
604         vsldoi          $H4l,$zero,$H,8
605         vmr             $H4, $H
606         vsldoi          $H4h,$H,$zero,8
607
608         vxor            $Xh,$IN0,$Xl
609         vxor            $Xl3,$Xl3,$Xl3
610         vxor            $Xm3,$Xm3,$Xm3
611         vxor            $Xh3,$Xh3,$Xh3
612
613         b               Ltail_4x
614
615 Ldone_4x:
616         le?vperm        $Xl,$Xl,$Xl,$lemask
617         stvx_u          $Xl,0,$Xip              # write out Xi
618
619         li              r10,`15+6*$SIZE_T`
620         li              r11,`31+6*$SIZE_T`
621         mtspr           256,$vrsave
622         lvx             v20,r10,$sp
623         addi            r10,r10,32
624         lvx             v21,r11,$sp
625         addi            r11,r11,32
626         lvx             v22,r10,$sp
627         addi            r10,r10,32
628         lvx             v23,r11,$sp
629         addi            r11,r11,32
630         lvx             v24,r10,$sp
631         addi            r10,r10,32
632         lvx             v25,r11,$sp
633         addi            r11,r11,32
634         lvx             v26,r10,$sp
635         addi            r10,r10,32
636         lvx             v27,r11,$sp
637         addi            r11,r11,32
638         lvx             v28,r10,$sp
639         addi            r10,r10,32
640         lvx             v29,r11,$sp
641         addi            r11,r11,32
642         lvx             v30,r10,$sp
643         lvx             v31,r11,$sp
644         addi            $sp,$sp,$FRAME
645         blr
646         .long           0
647         .byte           0,12,0x04,0,0x80,0,4,0
648         .long           0
649 ___
650 }
651 $code.=<<___;  # trailer: .size for .gcm_ghash_p8 and the embedded identification string
652 .size   .gcm_ghash_p8,.-.gcm_ghash_p8
653
654 .asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
655 .align  2
656 ___
657
658 foreach (split("\n",$code)) {
659         s/\`([^\`]*)\`/eval $1/geo;     # replace each `expr` with its evaluated value (e.g. frame offsets)
660         # Strip the tag from lines meant for this endianness; turn the other tag into a "#..#" comment prefix.
661         if ($flavour =~ /le$/o) {       # little-endian
662             s/le\?//o           or
663             s/be\?/#be#/o;
664         } else {
665             s/le\?/#le#/o       or
666             s/be\?//o;
667         }
668         print $_,"\n";
669 }
670 # STDOUT is the pipe to ppc-xlate.pl: closing flushes it, and a false return means the write or the child failed.
671 close STDOUT or die "error closing STDOUT: $!"; # enforce flush