#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it is
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
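#
# Aggregated reduction defers the modular reduction across several
# blocks by using precomputed powers of H, e.g. for the 4x case:
#
#	Xi+4 = ((Xi+in0)·H^4 + in1·H^3 + in2·H^2 + in3·H) mod P(x)
#
# so one reduction is amortized over four processed blocks.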

$flavour=shift;
$output =shift;

if ($flavour =~ /64/) {
        $SIZE_T=8;
        $LRSAVE=2*$SIZE_T;
        $STU="stdu";
        $POP="ld";
        $PUSH="std";
        $UCMP="cmpld";
        $SHRI="srdi";
} elsif ($flavour =~ /32/) {
        $SIZE_T=4;
        $LRSAVE=$SIZE_T;
        $STU="stwu";
        $POP="lwz";
        $PUSH="stw";
        $UCMP="cmplw";
        $SHRI="srwi";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));    # argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";

$code=<<___;
.machine        "any"

.text

.globl  .gcm_init_p8
.align  5
.gcm_init_p8:
        li              r0,-4096
        li              r8,0x10
        mfspr           $vrsave,256
        li              r9,0x20
        mtspr           256,r0
        li              r10,0x30
        lvx_u           $H,0,r4                 # load H

        vspltisb        $xC2,-16                # 0xf0
        vspltisb        $t0,1                   # one
        vaddubm         $xC2,$xC2,$xC2          # 0xe0
        vxor            $zero,$zero,$zero
        vor             $xC2,$xC2,$t0           # 0xe1
        vsldoi          $xC2,$xC2,$zero,15      # 0xe1...
        vsldoi          $t1,$zero,$t0,1         # ...1
        vaddubm         $xC2,$xC2,$xC2          # 0xc2...
        vspltisb        $t2,7
        vor             $xC2,$xC2,$t1           # 0xc2....01
        vspltb          $t1,$H,0                # most significant byte
        vsl             $H,$H,$t0               # H<<=1
        vsrab           $t1,$t1,$t2             # broadcast carry bit
        vand            $t1,$t1,$xC2
        vxor            $IN,$H,$t1              # twisted H
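        # The above computes the "twisted" hash key, H·x mod P(x): H is
        # shifted left by one bit and, if the dropped carry bit was set,
        # the byte-reflected reduction constant 0xc2...01 (P(x) without
        # its x^128 term) is xored back in.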

        vsldoi          $H,$IN,$IN,8            # twist even more ...
        vsldoi          $xC2,$zero,$xC2,8       # 0xc2.0
        vsldoi          $Hl,$zero,$H,8          # ... and split
        vsldoi          $Hh,$H,$zero,8

        stvx_u          $xC2,0,r3               # save pre-computed table
        stvx_u          $Hl,r8,r3
        li              r8,0x40
        stvx_u          $H, r9,r3
        li              r9,0x50
        stvx_u          $Hh,r10,r3
        li              r10,0x60

        vpmsumd         $Xl,$IN,$Hl             # H.lo·H.lo
        vpmsumd         $Xm,$IN,$H              # H.hi·H.lo+H.lo·H.hi
        vpmsumd         $Xh,$IN,$Hh             # H.hi·H.hi

        vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase

        vsldoi          $t0,$Xm,$zero,8
        vsldoi          $t1,$zero,$Xm,8
        vxor            $Xl,$Xl,$t0
        vxor            $Xh,$Xh,$t1

        vsldoi          $Xl,$Xl,$Xl,8
        vxor            $Xl,$Xl,$t2

        vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
        vpmsumd         $Xl,$Xl,$xC2
        vxor            $t1,$t1,$Xh
        vxor            $IN1,$Xl,$t1
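        # The two vpmsumd instructions with $xC2 above implement the
        # two-phase reduction modulo P(x) = x^128+x^7+x^2+x+1: each
        # phase folds 64 bits of the 256-bit product back into the low
        # half using the pre-shifted constant 0xc2...0.  $IN1 now holds
        # the twisted H^2.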

        vsldoi          $H2,$IN1,$IN1,8
        vsldoi          $H2l,$zero,$H2,8
        vsldoi          $H2h,$H2,$zero,8

        stvx_u          $H2l,r8,r3              # save H^2
        li              r8,0x70
        stvx_u          $H2,r9,r3
        li              r9,0x80
        stvx_u          $H2h,r10,r3
        li              r10,0x90
___
{
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
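        # Compute H^3 = H·H^2 and H^4 = H^2·H^2 as two interleaved
        # streams; instructions indented by one extra space belong to
        # the second, software-pipelined stream.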
        vpmsumd         $Xl,$IN,$H2l            # H.lo·H^2.lo
         vpmsumd        $Xl1,$IN1,$H2l          # H^2.lo·H^2.lo
        vpmsumd         $Xm,$IN,$H2             # H.hi·H^2.lo+H.lo·H^2.hi
         vpmsumd        $Xm1,$IN1,$H2           # H^2.hi·H^2.lo+H^2.lo·H^2.hi
        vpmsumd         $Xh,$IN,$H2h            # H.hi·H^2.hi
         vpmsumd        $Xh1,$IN1,$H2h          # H^2.hi·H^2.hi

        vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
         vpmsumd        $t6,$Xl1,$xC2           # 1st reduction phase

        vsldoi          $t0,$Xm,$zero,8
        vsldoi          $t1,$zero,$Xm,8
         vsldoi         $t4,$Xm1,$zero,8
         vsldoi         $t5,$zero,$Xm1,8
        vxor            $Xl,$Xl,$t0
        vxor            $Xh,$Xh,$t1
         vxor           $Xl1,$Xl1,$t4
         vxor           $Xh1,$Xh1,$t5

        vsldoi          $Xl,$Xl,$Xl,8
         vsldoi         $Xl1,$Xl1,$Xl1,8
        vxor            $Xl,$Xl,$t2
         vxor           $Xl1,$Xl1,$t6

        vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
         vsldoi         $t5,$Xl1,$Xl1,8         # 2nd reduction phase
        vpmsumd         $Xl,$Xl,$xC2
         vpmsumd        $Xl1,$Xl1,$xC2
        vxor            $t1,$t1,$Xh
         vxor           $t5,$t5,$Xh1
        vxor            $Xl,$Xl,$t1
         vxor           $Xl1,$Xl1,$t5

        vsldoi          $H,$Xl,$Xl,8
         vsldoi         $H2,$Xl1,$Xl1,8
        vsldoi          $Hl,$zero,$H,8
        vsldoi          $Hh,$H,$zero,8
         vsldoi         $H2l,$zero,$H2,8
         vsldoi         $H2h,$H2,$zero,8

        stvx_u          $Hl,r8,r3               # save H^3
        li              r8,0xa0
        stvx_u          $H,r9,r3
        li              r9,0xb0
        stvx_u          $Hh,r10,r3
        li              r10,0xc0
         stvx_u         $H2l,r8,r3              # save H^4
         stvx_u         $H2,r9,r3
         stvx_u         $H2h,r10,r3

        mtspr           256,$vrsave
        blr
        .long           0
        .byte           0,12,0x14,0,0,0,2,0
        .long           0
.size   .gcm_init_p8,.-.gcm_init_p8
___
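# A sketch of the resulting per-key table layout (offsets from r3), as
# consumed by gcm_gmult_p8/gcm_ghash_p8 below: 0x00 the 0xc2 reduction
# constant, 0x10-0x30 H (lo|mid|hi), 0x40-0x60 H^2, 0x70-0x90 H^3,
# 0xa0-0xc0 H^4, all in twisted form.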
}
$code.=<<___;
.globl  .gcm_gmult_p8
.align  5
.gcm_gmult_p8:
        lis             r0,0xfff8
        li              r8,0x10
        mfspr           $vrsave,256
        li              r9,0x20
        mtspr           256,r0
        li              r10,0x30
        lvx_u           $IN,0,$Xip              # load Xi

        lvx_u           $Hl,r8,$Htbl            # load pre-computed table
         le?lvsl        $lemask,r0,r0
        lvx_u           $H, r9,$Htbl
         le?vspltisb    $t0,0x07
        lvx_u           $Hh,r10,$Htbl
         le?vxor        $lemask,$lemask,$t0
        lvx_u           $xC2,0,$Htbl
         le?vperm       $IN,$IN,$IN,$lemask
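        # On little-endian flavours $lemask is lvsl's 0x00..0x0f xored
        # with 0x07, i.e. byte index i becomes i^7; the le?vperm thus
        # swaps the byte order within each doubleword so loaded data
        # matches the big-endian layout the multiplication assumes.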
        vxor            $zero,$zero,$zero

        vpmsumd         $Xl,$IN,$Hl             # H.lo·Xi.lo
        vpmsumd         $Xm,$IN,$H              # H.hi·Xi.lo+H.lo·Xi.hi
        vpmsumd         $Xh,$IN,$Hh             # H.hi·Xi.hi

        vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase

        vsldoi          $t0,$Xm,$zero,8
        vsldoi          $t1,$zero,$Xm,8
        vxor            $Xl,$Xl,$t0
        vxor            $Xh,$Xh,$t1

        vsldoi          $Xl,$Xl,$Xl,8
        vxor            $Xl,$Xl,$t2

        vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
        vpmsumd         $Xl,$Xl,$xC2
        vxor            $t1,$t1,$Xh
        vxor            $Xl,$Xl,$t1

        le?vperm        $Xl,$Xl,$Xl,$lemask
        stvx_u          $Xl,0,$Xip              # write out Xi

        mtspr           256,$vrsave
        blr
        .long           0
        .byte           0,12,0x14,0,0,0,2,0
        .long           0
.size   .gcm_gmult_p8,.-.gcm_gmult_p8

.globl  .gcm_ghash_p8
.align  5
.gcm_ghash_p8:
        li              r0,-4096
        li              r8,0x10
        mfspr           $vrsave,256
        li              r9,0x20
        mtspr           256,r0
        li              r10,0x30
        lvx_u           $Xl,0,$Xip              # load Xi

        lvx_u           $Hl,r8,$Htbl            # load pre-computed table
        li              r8,0x40
         le?lvsl        $lemask,r0,r0
        lvx_u           $H, r9,$Htbl
        li              r9,0x50
         le?vspltisb    $t0,0x07
        lvx_u           $Hh,r10,$Htbl
        li              r10,0x60
         le?vxor        $lemask,$lemask,$t0
        lvx_u           $xC2,0,$Htbl
         le?vperm       $Xl,$Xl,$Xl,$lemask
        vxor            $zero,$zero,$zero

        ${UCMP}i        $len,64
        bge             Lgcm_ghash_p8_4x

        lvx_u           $IN,0,$inp
        addi            $inp,$inp,16
        subic.          $len,$len,16
         le?vperm       $IN,$IN,$IN,$lemask
        vxor            $IN,$IN,$Xl
        beq             Lshort

        lvx_u           $H2l,r8,$Htbl           # load H^2
        li              r8,16
        lvx_u           $H2, r9,$Htbl
        add             r9,$inp,$len            # end of input
        lvx_u           $H2h,r10,$Htbl
        be?b            Loop_2x

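        # Loop_2x applies 2x aggregated reduction: with $IN holding
        # Xi^in[i], each iteration computes (Xi^in[i])·H^2+in[i+1]·H
        # mod P(x), i.e. one reduction per two blocks.  The subfe/and/
        # add sequence backs $inp off when fewer than 32 bytes remain,
        # so the look-ahead load never reads past the end of the input.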
.align  5
Loop_2x:
        lvx_u           $IN1,0,$inp
        le?vperm        $IN1,$IN1,$IN1,$lemask

         subic          $len,$len,32
        vpmsumd         $Xl,$IN,$H2l            # H^2.lo·Xi.lo
         vpmsumd        $Xl1,$IN1,$Hl           # H.lo·Xi+1.lo
         subfe          r0,r0,r0                # borrow?-1:0
        vpmsumd         $Xm,$IN,$H2             # H^2.hi·Xi.lo+H^2.lo·Xi.hi
         vpmsumd        $Xm1,$IN1,$H            # H.hi·Xi+1.lo+H.lo·Xi+1.hi
         and            r0,r0,$len
        vpmsumd         $Xh,$IN,$H2h            # H^2.hi·Xi.hi
         vpmsumd        $Xh1,$IN1,$Hh           # H.hi·Xi+1.hi
         add            $inp,$inp,r0

        vxor            $Xl,$Xl,$Xl1
        vxor            $Xm,$Xm,$Xm1

        vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase

        vsldoi          $t0,$Xm,$zero,8
        vsldoi          $t1,$zero,$Xm,8
         vxor           $Xh,$Xh,$Xh1
        vxor            $Xl,$Xl,$t0
        vxor            $Xh,$Xh,$t1

        vsldoi          $Xl,$Xl,$Xl,8
        vxor            $Xl,$Xl,$t2
         lvx_u          $IN,r8,$inp
         addi           $inp,$inp,32

        vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
        vpmsumd         $Xl,$Xl,$xC2
         le?vperm       $IN,$IN,$IN,$lemask
        vxor            $t1,$t1,$Xh
        vxor            $IN,$IN,$t1
        vxor            $IN,$IN,$Xl
        $UCMP           r9,$inp
        bgt             Loop_2x                 # done yet?

        cmplwi          $len,0
        bne             Leven

Lshort:
        vpmsumd         $Xl,$IN,$Hl             # H.lo·Xi.lo
        vpmsumd         $Xm,$IN,$H              # H.hi·Xi.lo+H.lo·Xi.hi
        vpmsumd         $Xh,$IN,$Hh             # H.hi·Xi.hi

        vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase

        vsldoi          $t0,$Xm,$zero,8
        vsldoi          $t1,$zero,$Xm,8
        vxor            $Xl,$Xl,$t0
        vxor            $Xh,$Xh,$t1

        vsldoi          $Xl,$Xl,$Xl,8
        vxor            $Xl,$Xl,$t2

        vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
        vpmsumd         $Xl,$Xl,$xC2
        vxor            $t1,$t1,$Xh

Leven:
        vxor            $Xl,$Xl,$t1
        le?vperm        $Xl,$Xl,$Xl,$lemask
        stvx_u          $Xl,0,$Xip              # write out Xi

        mtspr           256,$vrsave
        blr
        .long           0
        .byte           0,12,0x14,0,0,0,4,0
        .long           0
___
{
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
my $IN0=$IN;
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);

$code.=<<___;
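# The 4x code path processes four blocks per reduction:
#
#	Xi+4 = ((Xi+in0)·H^4 + in1·H^3 + in2·H^2 + in3·H) mod P(x)
#
# with the {H^2,H} doubleword pairs packed so that one vpmsumd covers
# two of the four partial products.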
.align  5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
        $STU            $sp,-$FRAME($sp)
        li              r10,`15+6*$SIZE_T`
        li              r11,`31+6*$SIZE_T`
        stvx            v20,r10,$sp
        addi            r10,r10,32
        stvx            v21,r11,$sp
        addi            r11,r11,32
        stvx            v22,r10,$sp
        addi            r10,r10,32
        stvx            v23,r11,$sp
        addi            r11,r11,32
        stvx            v24,r10,$sp
        addi            r10,r10,32
        stvx            v25,r11,$sp
        addi            r11,r11,32
        stvx            v26,r10,$sp
        addi            r10,r10,32
        stvx            v27,r11,$sp
        addi            r11,r11,32
        stvx            v28,r10,$sp
        addi            r10,r10,32
        stvx            v29,r11,$sp
        addi            r11,r11,32
        stvx            v30,r10,$sp
        li              r10,0x60
        stvx            v31,r11,$sp
        li              r0,-1
        stw             $vrsave,`$FRAME-4`($sp) # save vrsave
        mtspr           256,r0                  # preserve all AltiVec registers

        lvsl            $t0,0,r8                # 0x0001..0e0f
        #lvx_u          $H2l,r8,$Htbl           # load H^2
        li              r8,0x70
        lvx_u           $H2, r9,$Htbl
        li              r9,0x80
        vspltisb        $t1,8                   # 0x0808..0808
        #lvx_u          $H2h,r10,$Htbl
        li              r10,0x90
        lvx_u           $H3l,r8,$Htbl           # load H^3
        li              r8,0xa0
        lvx_u           $H3, r9,$Htbl
        li              r9,0xb0
        lvx_u           $H3h,r10,$Htbl
        li              r10,0xc0
        lvx_u           $H4l,r8,$Htbl           # load H^4
        li              r8,0x10
        lvx_u           $H4, r9,$Htbl
        li              r9,0x20
        lvx_u           $H4h,r10,$Htbl
        li              r10,0x30

        vsldoi          $t2,$zero,$t1,8         # 0x0000..0808
        vaddubm         $hiperm,$t0,$t2         # 0x0001..1617
        vaddubm         $loperm,$t1,$hiperm     # 0x0809..1e1f
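        # $hiperm gathers the high doublewords of two source vectors
        # and $loperm the low ones, so a single vperm packs {H^2,H}
        # (or {Xi+2,Xi+3}) halves into one register for the shared
        # vpmsumd multiplications below.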

        $SHRI           $len,$len,4             # this allows using the
                                                # sign bit as carry
        lvx_u           $IN0,0,$inp             # load input
        lvx_u           $IN1,r8,$inp
        subic.          $len,$len,8
        lvx_u           $IN2,r9,$inp
        lvx_u           $IN3,r10,$inp
        addi            $inp,$inp,0x40
        le?vperm        $IN0,$IN0,$IN0,$lemask
        le?vperm        $IN1,$IN1,$IN1,$lemask
        le?vperm        $IN2,$IN2,$IN2,$lemask
        le?vperm        $IN3,$IN3,$IN3,$lemask

        vxor            $Xh,$IN0,$Xl

         vpmsumd        $Xl1,$IN1,$H3l
         vpmsumd        $Xm1,$IN1,$H3
         vpmsumd        $Xh1,$IN1,$H3h

         vperm          $H21l,$H2,$H,$hiperm
         vperm          $t0,$IN2,$IN3,$loperm
         vperm          $H21h,$H2,$H,$loperm
         vperm          $t1,$IN2,$IN3,$hiperm
         vpmsumd        $Xm2,$IN2,$H2           # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
         vpmsumd        $Xl3,$t0,$H21l          # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
         vpmsumd        $Xm3,$IN3,$H            # H.hi·Xi+3.lo  +H.lo·Xi+3.hi
         vpmsumd        $Xh3,$t1,$H21h          # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

         vxor           $Xm2,$Xm2,$Xm1
         vxor           $Xl3,$Xl3,$Xl1
         vxor           $Xm3,$Xm3,$Xm2
         vxor           $Xh3,$Xh3,$Xh1

        blt             Ltail_4x

Loop_4x:
        lvx_u           $IN0,0,$inp
        lvx_u           $IN1,r8,$inp
        subic.          $len,$len,4
        lvx_u           $IN2,r9,$inp
        lvx_u           $IN3,r10,$inp
        addi            $inp,$inp,0x40
        le?vperm        $IN1,$IN1,$IN1,$lemask
        le?vperm        $IN2,$IN2,$IN2,$lemask
        le?vperm        $IN3,$IN3,$IN3,$lemask
        le?vperm        $IN0,$IN0,$IN0,$lemask

        vpmsumd         $Xl,$Xh,$H4l            # H^4.lo·Xi.lo
        vpmsumd         $Xm,$Xh,$H4             # H^4.hi·Xi.lo+H^4.lo·Xi.hi
        vpmsumd         $Xh,$Xh,$H4h            # H^4.hi·Xi.hi
         vpmsumd        $Xl1,$IN1,$H3l
         vpmsumd        $Xm1,$IN1,$H3
         vpmsumd        $Xh1,$IN1,$H3h

        vxor            $Xl,$Xl,$Xl3
        vxor            $Xm,$Xm,$Xm3
        vxor            $Xh,$Xh,$Xh3
         vperm          $t0,$IN2,$IN3,$loperm
         vperm          $t1,$IN2,$IN3,$hiperm

        vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase
         vpmsumd        $Xl3,$t0,$H21l          # H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
         vpmsumd        $Xh3,$t1,$H21h          # H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi

        vsldoi          $t0,$Xm,$zero,8
        vsldoi          $t1,$zero,$Xm,8
        vxor            $Xl,$Xl,$t0
        vxor            $Xh,$Xh,$t1

        vsldoi          $Xl,$Xl,$Xl,8
        vxor            $Xl,$Xl,$t2

        vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
         vpmsumd        $Xm2,$IN2,$H2           # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
         vpmsumd        $Xm3,$IN3,$H            # H.hi·Xi+3.lo  +H.lo·Xi+3.hi
        vpmsumd         $Xl,$Xl,$xC2

         vxor           $Xl3,$Xl3,$Xl1
         vxor           $Xh3,$Xh3,$Xh1
        vxor            $Xh,$Xh,$IN0
         vxor           $Xm2,$Xm2,$Xm1
        vxor            $Xh,$Xh,$t1
         vxor           $Xm3,$Xm3,$Xm2
        vxor            $Xh,$Xh,$Xl
        bge             Loop_4x

Ltail_4x:
        vpmsumd         $Xl,$Xh,$H4l            # H^4.lo·Xi.lo
        vpmsumd         $Xm,$Xh,$H4             # H^4.hi·Xi.lo+H^4.lo·Xi.hi
        vpmsumd         $Xh,$Xh,$H4h            # H^4.hi·Xi.hi

        vxor            $Xl,$Xl,$Xl3
        vxor            $Xm,$Xm,$Xm3

        vpmsumd         $t2,$Xl,$xC2            # 1st reduction phase

        vsldoi          $t0,$Xm,$zero,8
        vsldoi          $t1,$zero,$Xm,8
         vxor           $Xh,$Xh,$Xh3
        vxor            $Xl,$Xl,$t0
        vxor            $Xh,$Xh,$t1

        vsldoi          $Xl,$Xl,$Xl,8
        vxor            $Xl,$Xl,$t2

        vsldoi          $t1,$Xl,$Xl,8           # 2nd reduction phase
        vpmsumd         $Xl,$Xl,$xC2
        vxor            $t1,$t1,$Xh
        vxor            $Xl,$Xl,$t1

        addic.          $len,$len,4
        beq             Ldone_4x

        lvx_u           $IN0,0,$inp
        ${UCMP}i        $len,2
        li              $len,-4
        blt             Lone
        lvx_u           $IN1,r8,$inp
        beq             Ltwo

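        # One to three blocks remain; Lthree/Ltwo/Lone substitute
        # H^3/H^2/H for H^4 (zeroing unused partial products) so the
        # shared Ltail_4x epilogue computes the correct polynomial.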
Lthree:
        lvx_u           $IN2,r9,$inp
        le?vperm        $IN0,$IN0,$IN0,$lemask
        le?vperm        $IN1,$IN1,$IN1,$lemask
        le?vperm        $IN2,$IN2,$IN2,$lemask

        vxor            $Xh,$IN0,$Xl
        vmr             $H4l,$H3l
        vmr             $H4, $H3
        vmr             $H4h,$H3h

        vperm           $t0,$IN1,$IN2,$loperm
        vperm           $t1,$IN1,$IN2,$hiperm
        vpmsumd         $Xm2,$IN1,$H2           # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
        vpmsumd         $Xm3,$IN2,$H            # H.hi·Xi+2.lo  +H.lo·Xi+2.hi
        vpmsumd         $Xl3,$t0,$H21l          # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
        vpmsumd         $Xh3,$t1,$H21h          # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

        vxor            $Xm3,$Xm3,$Xm2
        b               Ltail_4x

.align  4
Ltwo:
        le?vperm        $IN0,$IN0,$IN0,$lemask
        le?vperm        $IN1,$IN1,$IN1,$lemask

        vxor            $Xh,$IN0,$Xl
        vperm           $t0,$zero,$IN1,$loperm
        vperm           $t1,$zero,$IN1,$hiperm

        vsldoi          $H4l,$zero,$H2,8
        vmr             $H4, $H2
        vsldoi          $H4h,$H2,$zero,8

        vpmsumd         $Xl3,$t0, $H21l         # H.lo·Xi+1.lo
        vpmsumd         $Xm3,$IN1,$H            # H.hi·Xi+1.lo+H.lo·Xi+1.hi
        vpmsumd         $Xh3,$t1, $H21h         # H.hi·Xi+1.hi

        b               Ltail_4x

.align  4
Lone:
        le?vperm        $IN0,$IN0,$IN0,$lemask

        vsldoi          $H4l,$zero,$H,8
        vmr             $H4, $H
        vsldoi          $H4h,$H,$zero,8

        vxor            $Xh,$IN0,$Xl
        vxor            $Xl3,$Xl3,$Xl3
        vxor            $Xm3,$Xm3,$Xm3
        vxor            $Xh3,$Xh3,$Xh3

        b               Ltail_4x

Ldone_4x:
        le?vperm        $Xl,$Xl,$Xl,$lemask
        stvx_u          $Xl,0,$Xip              # write out Xi

        li              r10,`15+6*$SIZE_T`
        li              r11,`31+6*$SIZE_T`
        mtspr           256,$vrsave
        lvx             v20,r10,$sp
        addi            r10,r10,32
        lvx             v21,r11,$sp
        addi            r11,r11,32
        lvx             v22,r10,$sp
        addi            r10,r10,32
        lvx             v23,r11,$sp
        addi            r11,r11,32
        lvx             v24,r10,$sp
        addi            r10,r10,32
        lvx             v25,r11,$sp
        addi            r11,r11,32
        lvx             v26,r10,$sp
        addi            r10,r10,32
        lvx             v27,r11,$sp
        addi            r11,r11,32
        lvx             v28,r10,$sp
        addi            r10,r10,32
        lvx             v29,r11,$sp
        addi            r11,r11,32
        lvx             v30,r10,$sp
        lvx             v31,r11,$sp
        addi            $sp,$sp,$FRAME
        blr
        .long           0
        .byte           0,12,0x04,0,0x80,0,4,0
        .long           0
___
}
$code.=<<___;
.size   .gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

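# Post-process the generated code: evaluate `...` expressions and
# resolve the le?/be? prefixes, keeping an instruction for the matching
# endianness and commenting it out (#le#/#be#) otherwise.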
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

        if ($flavour =~ /le$/o) {       # little-endian
            s/le\?//o           or
            s/be\?/#be#/o;
        } else {
            s/le\?/#le#/o       or
            s/be\?//o;
        }
        print $_,"\n";
}

close STDOUT; # enforce flush