1d6254543bae9b72719c84cd4fde0ea23d3e9b98
[openssl.git] / crypto / modes / asm / ghash-parisc.pl
1 #! /usr/bin/env perl
2 # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # April 2010
18 #
19 # The module implements "4-bit" GCM GHASH function and underlying
20 # single multiplication operation in GF(2^128). "4-bit" means that it
21 # uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
22 # it processes one byte in 19.6 cycles, which is more than twice as
23 # fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
24 # 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
25 # processed byte. This is ~2.2x faster than 64-bit code generated by
26 # vendor compiler (which used to be very hard to beat:-).
27 #
28 # Special thanks to polarhome.com for providing HP-UX account.
29
# Command line: <flavour> [<output file>].
#   flavour - build flavour string; a value containing "64" selects the
#             64-bit (PA-RISC 2.0W) variant further down.
#   output  - file that receives the generated assembly.
$flavour = shift;
$output = shift;

# Redirect STDOUT only when an output file was actually named; without one
# the generated code simply goes to the inherited STDOUT.  The three-argument
# form keeps characters in the file name from being read as an open mode, and
# a failure is reported instead of silently discarding all output.
if (defined($output) && length($output)) {
        open STDOUT,">",$output or die "can't open $output: $!";
}
33
# ABI-dependent parameters.  A flavour containing "64" selects the 64-bit
# variant (PA-RISC 2.0W, 8-byte size_t); anything else selects the 32-bit
# variant, which is assembled at .LEVEL 1.0.
($LEVEL,$SIZE_T,$FRAME_MARKER,$SAVED_RP,$NREGS) =
        ($flavour =~ /64/) ? ("2.0W",8,80,16,6)
                           : ("1.0", 4,48,20,11);       #"\n\t.ALLOW\t2.0";

# Mnemonics used by the prologue/epilogue to save and restore registers.
($PUSH,$PUSHMA,$POP,$POPMB) =
        ($SIZE_T==8) ? ("std","std,ma","ldd","ldd,mb")
                     : ("stw","stwm",  "ldw","ldwm");

# Stack frame: ten register-sized slots for saved registers on top of the
# frame marker [+ argument transfer].
$FRAME=$FRAME_MARKER+10*$SIZE_T;
58
################# volatile registers
($Xi,$Htbl,$inp,$len)=("%r26","%r25","%r24","%r23");    # argument block
$Hhh=$Htbl;                                             # variables
($Hll,$Zhh,$Zll,$cnt)=("%r22","%r21","%r20","%r19");
($rem_4bit,$rem,$mask0xf0)=("%r28","%r29","%r31");

################# preserved registers
($Thh,$Tll,$nlo,$nhi,$byte)=map("%r$_",1..5);
# The 32-bit flavour keeps each 128-bit quantity in four 32-bit registers
# and therefore needs names for the two middle words as well.
($Zhl,$Zlh,$Hhl,$Hlh,$Thl,$Tlh)=map("%r$_",6..11)       if ($SIZE_T==4);
$rem2="%r6";    # used in PA-RISC 2.0 code (same register as $Zhl above)
88
# gcm_gmult_4bit(Xi=%r26, Htbl=%r25): assembler header, export record and
# standard prologue.  %r2 is stored at -$SAVED_RP(%sp), %r3 is stored while
# the stack pointer is advanced by $FRAME, then %r4-%r6 are saved relative
# to the new stack pointer.  Everything between the heredoc markers is
# template text: `...` spans are evaluated by the post-processing loop at the
# end of the file, and ';' starts an assembler comment.
89 $code.=<<___;
90         .LEVEL  $LEVEL
91         .SPACE  \$TEXT\$
92         .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
93
94         .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
95         .ALIGN  64
96 gcm_gmult_4bit
97         .PROC
98         .CALLINFO       FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
99         .ENTRY
100         $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
101         $PUSHMA %r3,$FRAME(%sp)
102         $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
103         $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
104         $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
105 ___
# 32-bit flavour only: also save %r7-%r11, the registers that the register
# map assigns to $Zlh/$Hhl/$Hlh/$Thl/$Tlh when $SIZE_T==4.
106 $code.=<<___ if ($SIZE_T==4);
107         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
108         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
109         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
110         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
111         $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
112 ___
# Position-independent address of the L$rem_4bit table: blr leaves a code
# address in $rem_4bit, andcm clears its two low bits (presumably the
# privilege-level bits - confirm against the ISA manual), and ldo adds the
# distance from L$pic_gmult to the table.  $mask0xf0 is loaded with 0xf0.
# The addl turns $len into $inp+$len; that sum is not referenced again by
# the gcm_gmult_4bit code below.
113 $code.=<<___;
114         blr     %r0,$rem_4bit
115         ldi     3,$rem
116 L\$pic_gmult
117         andcm   $rem_4bit,$rem,$rem_4bit
118         addl    $inp,$len,$len
119         ldo     L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
120         ldi     0xf0,$mask0xf0
121 ___
# 32-bit flavour only: run-time choice between the PA-RISC 2.0 code that
# follows and the L$parisc1_gmult code.  %cr11 is loaded with 31 and a
# 2.0-only extrd is issued; per the inline remark it "executes on PA-RISC
# 1.0".  NOTE(review): presumably on a 2.0 CPU the ,*= condition nullifies the
# following branch so execution falls through into the 2.0 path - confirm.
122 $code.=<<___ if ($SIZE_T==4);
123         ldi     31,$rem
124         mtctl   $rem,%cr11
125         extrd,u,*= $rem,%sar,1,$rem     ; executes on PA-RISC 1.0
126         b       L\$parisc1_gmult
127         nop
128 ___
129 \f
# PA-RISC 2.0 path of gcm_gmult_4bit, using 64-bit loads, stores and shifts.
# It is appended for both flavours; in the 32-bit flavour the post-processing
# loop at the end of the file replaces the 2.0-only mnemonics with .WORD
# encodings.
#
# Xi is consumed one byte at a time from offset 15 downwards: bytes 15 and 14
# are loaded explicitly, the rest via ldbx $cnt($Xi) with $cnt starting at 13
# and decremented by addib.  Each byte is split into $nlo (low nibble moved up
# by four bits) and $nhi (byte & 0xf0); both serve as byte offsets into Htbl,
# read as two 64-bit halves through $Hhh (= Htbl) and $Hll (= Htbl+8).
# For every nibble Z ($Zhh:$Zll) is shifted right by four bits, the table
# entry is xored in, and the four bits shifted out (scaled by 8 into $rem)
# select an L$rem_4bit entry that is xored into $Zhh.  The result is stored
# back to Xi at offsets 8 and 0.
130 $code.=<<___;
131         ldb     15($Xi),$nlo
132         ldo     8($Htbl),$Hll
133
134         and     $mask0xf0,$nlo,$nhi
135         depd,z  $nlo,59,4,$nlo
136
137         ldd     $nlo($Hll),$Zll
138         ldd     $nlo($Hhh),$Zhh
139
140         depd,z  $Zll,60,4,$rem
141         shrpd   $Zhh,$Zll,4,$Zll
142         extrd,u $Zhh,59,60,$Zhh
143         ldb     14($Xi),$nlo
144
145         ldd     $nhi($Hll),$Tll
146         ldd     $nhi($Hhh),$Thh
147         and     $mask0xf0,$nlo,$nhi
148         depd,z  $nlo,59,4,$nlo
149
150         xor     $Tll,$Zll,$Zll
151         xor     $Thh,$Zhh,$Zhh
152         ldd     $rem($rem_4bit),$rem
153         b       L\$oop_gmult_pa2
154         ldi     13,$cnt
155
156         .ALIGN  8
157 L\$oop_gmult_pa2
158         xor     $rem,$Zhh,$Zhh          ; moved here to work around gas bug
159         depd,z  $Zll,60,4,$rem
160
161         shrpd   $Zhh,$Zll,4,$Zll
162         extrd,u $Zhh,59,60,$Zhh
163         ldd     $nlo($Hll),$Tll
164         ldd     $nlo($Hhh),$Thh
165
166         xor     $Tll,$Zll,$Zll
167         xor     $Thh,$Zhh,$Zhh
168         ldd     $rem($rem_4bit),$rem
169
170         xor     $rem,$Zhh,$Zhh
171         depd,z  $Zll,60,4,$rem
172         ldbx    $cnt($Xi),$nlo
173
174         shrpd   $Zhh,$Zll,4,$Zll
175         extrd,u $Zhh,59,60,$Zhh
176         ldd     $nhi($Hll),$Tll
177         ldd     $nhi($Hhh),$Thh
178
179         and     $mask0xf0,$nlo,$nhi
180         depd,z  $nlo,59,4,$nlo
181         ldd     $rem($rem_4bit),$rem
182
183         xor     $Tll,$Zll,$Zll
184         addib,uv -1,$cnt,L\$oop_gmult_pa2
185         xor     $Thh,$Zhh,$Zhh
186
187         xor     $rem,$Zhh,$Zhh
188         depd,z  $Zll,60,4,$rem
189
190         shrpd   $Zhh,$Zll,4,$Zll
191         extrd,u $Zhh,59,60,$Zhh
192         ldd     $nlo($Hll),$Tll
193         ldd     $nlo($Hhh),$Thh
194
195         xor     $Tll,$Zll,$Zll
196         xor     $Thh,$Zhh,$Zhh
197         ldd     $rem($rem_4bit),$rem
198
199         xor     $rem,$Zhh,$Zhh
200         depd,z  $Zll,60,4,$rem
201
202         shrpd   $Zhh,$Zll,4,$Zll
203         extrd,u $Zhh,59,60,$Zhh
204         ldd     $nhi($Hll),$Tll
205         ldd     $nhi($Hhh),$Thh
206
207         xor     $Tll,$Zll,$Zll
208         xor     $Thh,$Zhh,$Zhh
209         ldd     $rem($rem_4bit),$rem
210
211         xor     $rem,$Zhh,$Zhh
212         std     $Zll,8($Xi)
213         std     $Zhh,0($Xi)
214 ___
215 \f
# 32-bit flavour only: PA-RISC 1.x path of gcm_gmult_4bit.  The leading
# "b L$done_gmult" ends the 2.0 path emitted just before it.
#
# Same nibble-by-nibble scheme as the 2.0 path, but with 32-bit instructions:
# Z is held in four words ($Zhh,$Zhl,$Zlh,$Zll), the Htbl words are addressed
# through $Hhh (= Htbl), $Hhl (+4), $Hlh (+8) and $Hll (+12), the four-bit
# right shift is a chain of shrpw plus a final extru, and the L$rem_4bit
# entry is fetched with a 32-bit ldwx.  The result is stored back to Xi as
# four words at offsets 12, 8, 4 and 0.
216 $code.=<<___ if ($SIZE_T==4);
217         b       L\$done_gmult
218         nop
219
220 L\$parisc1_gmult
221         ldb     15($Xi),$nlo
222         ldo     12($Htbl),$Hll
223         ldo     8($Htbl),$Hlh
224         ldo     4($Htbl),$Hhl
225
226         and     $mask0xf0,$nlo,$nhi
227         zdep    $nlo,27,4,$nlo
228
229         ldwx    $nlo($Hll),$Zll
230         ldwx    $nlo($Hlh),$Zlh
231         ldwx    $nlo($Hhl),$Zhl
232         ldwx    $nlo($Hhh),$Zhh
233         zdep    $Zll,28,4,$rem
234         ldb     14($Xi),$nlo
235         ldwx    $rem($rem_4bit),$rem
236         shrpw   $Zlh,$Zll,4,$Zll
237         ldwx    $nhi($Hll),$Tll
238         shrpw   $Zhl,$Zlh,4,$Zlh
239         ldwx    $nhi($Hlh),$Tlh
240         shrpw   $Zhh,$Zhl,4,$Zhl
241         ldwx    $nhi($Hhl),$Thl
242         extru   $Zhh,27,28,$Zhh
243         ldwx    $nhi($Hhh),$Thh
244         xor     $rem,$Zhh,$Zhh
245         and     $mask0xf0,$nlo,$nhi
246         zdep    $nlo,27,4,$nlo
247
248         xor     $Tll,$Zll,$Zll
249         ldwx    $nlo($Hll),$Tll
250         xor     $Tlh,$Zlh,$Zlh
251         ldwx    $nlo($Hlh),$Tlh
252         xor     $Thl,$Zhl,$Zhl
253         b       L\$oop_gmult_pa1
254         ldi     13,$cnt
255
256         .ALIGN  8
257 L\$oop_gmult_pa1
258         zdep    $Zll,28,4,$rem
259         ldwx    $nlo($Hhl),$Thl
260         xor     $Thh,$Zhh,$Zhh
261         ldwx    $rem($rem_4bit),$rem
262         shrpw   $Zlh,$Zll,4,$Zll
263         ldwx    $nlo($Hhh),$Thh
264         shrpw   $Zhl,$Zlh,4,$Zlh
265         ldbx    $cnt($Xi),$nlo
266         xor     $Tll,$Zll,$Zll
267         ldwx    $nhi($Hll),$Tll
268         shrpw   $Zhh,$Zhl,4,$Zhl
269         xor     $Tlh,$Zlh,$Zlh
270         ldwx    $nhi($Hlh),$Tlh
271         extru   $Zhh,27,28,$Zhh
272         xor     $Thl,$Zhl,$Zhl
273         ldwx    $nhi($Hhl),$Thl
274         xor     $rem,$Zhh,$Zhh
275         zdep    $Zll,28,4,$rem
276         xor     $Thh,$Zhh,$Zhh
277         ldwx    $nhi($Hhh),$Thh
278         shrpw   $Zlh,$Zll,4,$Zll
279         ldwx    $rem($rem_4bit),$rem
280         shrpw   $Zhl,$Zlh,4,$Zlh
281         shrpw   $Zhh,$Zhl,4,$Zhl
282         and     $mask0xf0,$nlo,$nhi
283         extru   $Zhh,27,28,$Zhh
284         zdep    $nlo,27,4,$nlo
285         xor     $Tll,$Zll,$Zll
286         ldwx    $nlo($Hll),$Tll
287         xor     $Tlh,$Zlh,$Zlh
288         ldwx    $nlo($Hlh),$Tlh
289         xor     $rem,$Zhh,$Zhh
290         addib,uv -1,$cnt,L\$oop_gmult_pa1
291         xor     $Thl,$Zhl,$Zhl
292
293         zdep    $Zll,28,4,$rem
294         ldwx    $nlo($Hhl),$Thl
295         xor     $Thh,$Zhh,$Zhh
296         ldwx    $rem($rem_4bit),$rem
297         shrpw   $Zlh,$Zll,4,$Zll
298         ldwx    $nlo($Hhh),$Thh
299         shrpw   $Zhl,$Zlh,4,$Zlh
300         xor     $Tll,$Zll,$Zll
301         ldwx    $nhi($Hll),$Tll
302         shrpw   $Zhh,$Zhl,4,$Zhl
303         xor     $Tlh,$Zlh,$Zlh
304         ldwx    $nhi($Hlh),$Tlh
305         extru   $Zhh,27,28,$Zhh
306         xor     $rem,$Zhh,$Zhh
307         xor     $Thl,$Zhl,$Zhl
308         ldwx    $nhi($Hhl),$Thl
309         xor     $Thh,$Zhh,$Zhh
310         ldwx    $nhi($Hhh),$Thh
311         zdep    $Zll,28,4,$rem
312         ldwx    $rem($rem_4bit),$rem
313         shrpw   $Zlh,$Zll,4,$Zll
314         shrpw   $Zhl,$Zlh,4,$Zlh
315         shrpw   $Zhh,$Zhl,4,$Zhl
316         extru   $Zhh,27,28,$Zhh
317         xor     $Tll,$Zll,$Zll
318         xor     $Tlh,$Zlh,$Zlh
319         xor     $rem,$Zhh,$Zhh
320         stw     $Zll,12($Xi)
321         xor     $Thl,$Zhl,$Zhl
322         stw     $Zlh,8($Xi)
323         xor     $Thh,$Zhh,$Zhh
324         stw     $Zhl,4($Xi)
325         stw     $Zhh,0($Xi)
326 ___
# Common exit of gcm_gmult_4bit: reload the return pointer %r2 and %r4-%r6
# from the slots written by the prologue (%r3 is reloaded by the statement
# that follows this one in the file).
327 $code.=<<___;
328 L\$done_gmult
329         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2            ; standard epilogue
330         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
331         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
332         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
333 ___
# 32-bit flavour only: reload %r7-%r11, mirroring the extra saves in the
# prologue.
334 $code.=<<___ if ($SIZE_T==4);
335         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
336         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
337         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
338         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
339         $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
340 ___
# Return from gcm_gmult_4bit (the $POPMB after the bv reloads %r3 and releases
# the frame), followed by the export record and standard prologue of
# gcm_ghash_4bit(Xi=%r26, Htbl=%r25, inp=%r24, len=%r23).
#
# ENTRY_GR names the highest callee-saved general register stored by the
# entry code.  This prologue stores exactly the same registers as the one of
# gcm_gmult_4bit (%r3-%r6 here, plus %r7-%r11 in the 32-bit flavour), so it
# uses $NREGS as well: 11 for the 32-bit flavour, 6 for the 64-bit flavour,
# where a fixed 11 would describe saves that never happen.
$code.=<<___;
        bv      (%r2)
        .EXIT
        $POPMB  -$FRAME(%sp),%r3
        .PROCEND

        .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
        .ALIGN  64
gcm_ghash_4bit
        .PROC
        .CALLINFO       FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
        .ENTRY
        $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
        $PUSHMA %r3,$FRAME(%sp)
        $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
        $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
        $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
___
# 32-bit flavour only: gcm_ghash_4bit also saves %r7-%r11, the registers the
# register map assigns to $Zlh/$Hhl/$Hlh/$Thl/$Tlh when $SIZE_T==4.
359 $code.=<<___ if ($SIZE_T==4);
360         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
361         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
362         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
363         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
364         $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
365 ___
# Position-independent address of the L$rem_4bit table, computed relative to
# L$pic_ghash in the same way as in gcm_gmult_4bit.  The addl turns $len into
# $inp+$len, the end-of-input pointer that the outer loops compare $inp
# against.  $mask0xf0 is loaded with 0xf0.
366 $code.=<<___;
367         blr     %r0,$rem_4bit
368         ldi     3,$rem
369 L\$pic_ghash
370         andcm   $rem_4bit,$rem,$rem_4bit
371         addl    $inp,$len,$len
372         ldo     L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
373         ldi     0xf0,$mask0xf0
374 ___
# 32-bit flavour only: run-time choice between the PA-RISC 2.0 code that
# follows and L$parisc1_ghash, using the same mtctl/extrd probe as
# gcm_gmult_4bit.  NOTE(review): presumably the extrd condition nullifies the
# branch on a 2.0 CPU - confirm against the ISA manual.
375 $code.=<<___ if ($SIZE_T==4);
376         ldi     31,$rem
377         mtctl   $rem,%cr11
378         extrd,u,*= $rem,%sar,1,$rem     ; executes on PA-RISC 1.0
379         b       L\$parisc1_ghash
380         nop
381 ___
382 \f\f
# PA-RISC 2.0 path of gcm_ghash_4bit.  Appended for both flavours; in the
# 32-bit flavour the 2.0-only mnemonics are later replaced by .WORD encodings.
#
# L$outer_ghash_pa2 runs once per 16-byte input block: every byte of Xi is
# xored with the input byte at the same offset ($nhi for byte 15, $byte for
# the others) before it is split into the $nlo/$nhi table offsets, then the
# same shift / table-xor / L$rem_4bit reduction as in gcm_gmult_4bit is
# applied.  The inner loop uses $rem and $rem2 alternately for the reduction
# index.  After the block Z is stored to Xi, $inp advances by 16, and
# cmpb,*<> repeats the outer loop until $inp equals $len (set to the end of
# input during setup).  The "copy $Zll,$nlo" after the branch carries the low
# part of the new Xi into $nlo for the next block, where the first block used
# "ldb 15($Xi)".
383 $code.=<<___;
384         ldb     15($Xi),$nlo
385         ldo     8($Htbl),$Hll
386
387 L\$outer_ghash_pa2
388         ldb     15($inp),$nhi
389         xor     $nhi,$nlo,$nlo
390         and     $mask0xf0,$nlo,$nhi
391         depd,z  $nlo,59,4,$nlo
392
393         ldd     $nlo($Hll),$Zll
394         ldd     $nlo($Hhh),$Zhh
395
396         depd,z  $Zll,60,4,$rem
397         shrpd   $Zhh,$Zll,4,$Zll
398         extrd,u $Zhh,59,60,$Zhh
399         ldb     14($Xi),$nlo
400         ldb     14($inp),$byte
401
402         ldd     $nhi($Hll),$Tll
403         ldd     $nhi($Hhh),$Thh
404         xor     $byte,$nlo,$nlo
405         and     $mask0xf0,$nlo,$nhi
406         depd,z  $nlo,59,4,$nlo
407
408         xor     $Tll,$Zll,$Zll
409         xor     $Thh,$Zhh,$Zhh
410         ldd     $rem($rem_4bit),$rem
411         b       L\$oop_ghash_pa2
412         ldi     13,$cnt
413
414         .ALIGN  8
415 L\$oop_ghash_pa2
416         xor     $rem,$Zhh,$Zhh          ; moved here to work around gas bug
417         depd,z  $Zll,60,4,$rem2
418
419         shrpd   $Zhh,$Zll,4,$Zll
420         extrd,u $Zhh,59,60,$Zhh
421         ldd     $nlo($Hll),$Tll
422         ldd     $nlo($Hhh),$Thh
423
424         xor     $Tll,$Zll,$Zll
425         xor     $Thh,$Zhh,$Zhh
426         ldbx    $cnt($Xi),$nlo
427         ldbx    $cnt($inp),$byte
428
429         depd,z  $Zll,60,4,$rem
430         shrpd   $Zhh,$Zll,4,$Zll
431         ldd     $rem2($rem_4bit),$rem2
432
433         xor     $rem2,$Zhh,$Zhh
434         xor     $byte,$nlo,$nlo
435         ldd     $nhi($Hll),$Tll
436         ldd     $nhi($Hhh),$Thh
437
438         and     $mask0xf0,$nlo,$nhi
439         depd,z  $nlo,59,4,$nlo
440
441         extrd,u $Zhh,59,60,$Zhh
442         xor     $Tll,$Zll,$Zll
443
444         ldd     $rem($rem_4bit),$rem
445         addib,uv -1,$cnt,L\$oop_ghash_pa2
446         xor     $Thh,$Zhh,$Zhh
447
448         xor     $rem,$Zhh,$Zhh
449         depd,z  $Zll,60,4,$rem2
450
451         shrpd   $Zhh,$Zll,4,$Zll
452         extrd,u $Zhh,59,60,$Zhh
453         ldd     $nlo($Hll),$Tll
454         ldd     $nlo($Hhh),$Thh
455
456         xor     $Tll,$Zll,$Zll
457         xor     $Thh,$Zhh,$Zhh
458
459         depd,z  $Zll,60,4,$rem
460         shrpd   $Zhh,$Zll,4,$Zll
461         ldd     $rem2($rem_4bit),$rem2
462
463         xor     $rem2,$Zhh,$Zhh
464         ldd     $nhi($Hll),$Tll
465         ldd     $nhi($Hhh),$Thh
466
467         extrd,u $Zhh,59,60,$Zhh
468         xor     $Tll,$Zll,$Zll
469         xor     $Thh,$Zhh,$Zhh
470         ldd     $rem($rem_4bit),$rem
471
472         xor     $rem,$Zhh,$Zhh
473         std     $Zll,8($Xi)
474         ldo     16($inp),$inp
475         std     $Zhh,0($Xi)
476         cmpb,*<> $inp,$len,L\$outer_ghash_pa2
477         copy    $Zll,$nlo
478 ___
479 \f
# 32-bit flavour only: PA-RISC 1.x path of gcm_ghash_4bit.  The leading
# "b L$done_ghash" ends the 2.0 path emitted just before it.
#
# L$outer_ghash_pa1 runs once per 16-byte input block.  Each byte of Xi is
# xored with the input byte at the same offset ($byte) before being split
# into the $nlo/$nhi table offsets.  Z lives in four 32-bit words
# ($Zhh,$Zhl,$Zlh,$Zll), Htbl is addressed through $Hhh (= Htbl), $Hhl (+4),
# $Hlh (+8) and $Hll (+12), and the four-bit shift is a chain of shrpw plus
# extru.  After the block the four words are stored to Xi, $inp advances by
# 16, and comb,<> repeats the outer loop until $inp equals $len; the
# "copy $Zll,$nlo" after the branch carries the low word of the new Xi into
# $nlo for the next block.
480 $code.=<<___ if ($SIZE_T==4);
481         b       L\$done_ghash
482         nop
483
484 L\$parisc1_ghash
485         ldb     15($Xi),$nlo
486         ldo     12($Htbl),$Hll
487         ldo     8($Htbl),$Hlh
488         ldo     4($Htbl),$Hhl
489
490 L\$outer_ghash_pa1
491         ldb     15($inp),$byte
492         xor     $byte,$nlo,$nlo
493         and     $mask0xf0,$nlo,$nhi
494         zdep    $nlo,27,4,$nlo
495
496         ldwx    $nlo($Hll),$Zll
497         ldwx    $nlo($Hlh),$Zlh
498         ldwx    $nlo($Hhl),$Zhl
499         ldwx    $nlo($Hhh),$Zhh
500         zdep    $Zll,28,4,$rem
501         ldb     14($Xi),$nlo
502         ldb     14($inp),$byte
503         ldwx    $rem($rem_4bit),$rem
504         shrpw   $Zlh,$Zll,4,$Zll
505         ldwx    $nhi($Hll),$Tll
506         shrpw   $Zhl,$Zlh,4,$Zlh
507         ldwx    $nhi($Hlh),$Tlh
508         shrpw   $Zhh,$Zhl,4,$Zhl
509         ldwx    $nhi($Hhl),$Thl
510         extru   $Zhh,27,28,$Zhh
511         ldwx    $nhi($Hhh),$Thh
512         xor     $byte,$nlo,$nlo
513         xor     $rem,$Zhh,$Zhh
514         and     $mask0xf0,$nlo,$nhi
515         zdep    $nlo,27,4,$nlo
516
517         xor     $Tll,$Zll,$Zll
518         ldwx    $nlo($Hll),$Tll
519         xor     $Tlh,$Zlh,$Zlh
520         ldwx    $nlo($Hlh),$Tlh
521         xor     $Thl,$Zhl,$Zhl
522         b       L\$oop_ghash_pa1
523         ldi     13,$cnt
524
525         .ALIGN  8
526 L\$oop_ghash_pa1
527         zdep    $Zll,28,4,$rem
528         ldwx    $nlo($Hhl),$Thl
529         xor     $Thh,$Zhh,$Zhh
530         ldwx    $rem($rem_4bit),$rem
531         shrpw   $Zlh,$Zll,4,$Zll
532         ldwx    $nlo($Hhh),$Thh
533         shrpw   $Zhl,$Zlh,4,$Zlh
534         ldbx    $cnt($Xi),$nlo
535         xor     $Tll,$Zll,$Zll
536         ldwx    $nhi($Hll),$Tll
537         shrpw   $Zhh,$Zhl,4,$Zhl
538         ldbx    $cnt($inp),$byte
539         xor     $Tlh,$Zlh,$Zlh
540         ldwx    $nhi($Hlh),$Tlh
541         extru   $Zhh,27,28,$Zhh
542         xor     $Thl,$Zhl,$Zhl
543         ldwx    $nhi($Hhl),$Thl
544         xor     $rem,$Zhh,$Zhh
545         zdep    $Zll,28,4,$rem
546         xor     $Thh,$Zhh,$Zhh
547         ldwx    $nhi($Hhh),$Thh
548         shrpw   $Zlh,$Zll,4,$Zll
549         ldwx    $rem($rem_4bit),$rem
550         shrpw   $Zhl,$Zlh,4,$Zlh
551         xor     $byte,$nlo,$nlo
552         shrpw   $Zhh,$Zhl,4,$Zhl
553         and     $mask0xf0,$nlo,$nhi
554         extru   $Zhh,27,28,$Zhh
555         zdep    $nlo,27,4,$nlo
556         xor     $Tll,$Zll,$Zll
557         ldwx    $nlo($Hll),$Tll
558         xor     $Tlh,$Zlh,$Zlh
559         ldwx    $nlo($Hlh),$Tlh
560         xor     $rem,$Zhh,$Zhh
561         addib,uv -1,$cnt,L\$oop_ghash_pa1
562         xor     $Thl,$Zhl,$Zhl
563
564         zdep    $Zll,28,4,$rem
565         ldwx    $nlo($Hhl),$Thl
566         xor     $Thh,$Zhh,$Zhh
567         ldwx    $rem($rem_4bit),$rem
568         shrpw   $Zlh,$Zll,4,$Zll
569         ldwx    $nlo($Hhh),$Thh
570         shrpw   $Zhl,$Zlh,4,$Zlh
571         xor     $Tll,$Zll,$Zll
572         ldwx    $nhi($Hll),$Tll
573         shrpw   $Zhh,$Zhl,4,$Zhl
574         xor     $Tlh,$Zlh,$Zlh
575         ldwx    $nhi($Hlh),$Tlh
576         extru   $Zhh,27,28,$Zhh
577         xor     $rem,$Zhh,$Zhh
578         xor     $Thl,$Zhl,$Zhl
579         ldwx    $nhi($Hhl),$Thl
580         xor     $Thh,$Zhh,$Zhh
581         ldwx    $nhi($Hhh),$Thh
582         zdep    $Zll,28,4,$rem
583         ldwx    $rem($rem_4bit),$rem
584         shrpw   $Zlh,$Zll,4,$Zll
585         shrpw   $Zhl,$Zlh,4,$Zlh
586         shrpw   $Zhh,$Zhl,4,$Zhl
587         extru   $Zhh,27,28,$Zhh
588         xor     $Tll,$Zll,$Zll
589         xor     $Tlh,$Zlh,$Zlh
590         xor     $rem,$Zhh,$Zhh
591         stw     $Zll,12($Xi)
592         xor     $Thl,$Zhl,$Zhl
593         stw     $Zlh,8($Xi)
594         xor     $Thh,$Zhh,$Zhh
595         stw     $Zhl,4($Xi)
596         ldo     16($inp),$inp
597         stw     $Zhh,0($Xi)
598         comb,<> $inp,$len,L\$outer_ghash_pa1
599         copy    $Zll,$nlo
600 ___
# Common exit of gcm_ghash_4bit: reload the return pointer %r2 and %r4-%r6
# from the slots written by the prologue (%r3 is reloaded by the statement
# that follows this one in the file).
601 $code.=<<___;
602 L\$done_ghash
603         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2            ; standard epilogue
604         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
605         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
606         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
607 ___
# 32-bit flavour only: reload %r7-%r11, mirroring the extra saves in the
# prologue.
608 $code.=<<___ if ($SIZE_T==4);
609         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
610         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
611         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
612         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
613         $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
614 ___
# Return from gcm_ghash_4bit (the $POPMB after the bv reloads %r3 and releases
# the frame), followed by the shared L$rem_4bit table and an identification
# string.
#
# The table has 16 entries of two 32-bit words each (8 bytes per entry): a
# 16-bit constant shifted into the upper half of the first word, and a zero
# second word.  The code above indexes it with the four bits shifted out of
# Z, scaled by 8.  The `...` expressions are evaluated by the post-processing
# loop at the end of the file.
# NOTE(review): "GRYPTOGAMS" in the .STRINGZ looks like a typo for
# "CRYPTOGAMS"; it is left as is because the string is emitted into the
# generated object.
615 $code.=<<___;
616         bv      (%r2)
617         .EXIT
618         $POPMB  -$FRAME(%sp),%r3
619         .PROCEND
620
621         .ALIGN  64
622 L\$rem_4bit
623         .WORD   `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
624         .WORD   `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
625         .WORD   `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
626         .WORD   `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
627         .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
628         .ALIGN  64
629 ___
630
631 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
632 # that it can be compiled with .LEVEL 1.0. It should be noted that I
633 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
634 # directive...
635
# Hand-encode the PA-RISC 2.0 "ldd" instruction as a raw .WORD.
#   $mod  - completer text that followed the mnemonic ("" or ",m..."/",mb...")
#   $args - operand text, either "%rX(%rB),%rT" or "disp(%rB),%rT"
# Returns the replacement line, with the original instruction kept as a
# trailing assembler comment; operands in any other form are passed through
# as "\tldd<mod>\t<args>".
my $ldd = sub {
    my ($mod,$args) = @_;
    my $orig = "ldd$mod\t$args";
    my $word;

    if (my ($index,$base,$target) =
                $args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {       # format 4
        $word = (0x03<<26) | ($base<<21) | ($index<<16) | (3<<6) | $target;
    }
    elsif (my ($disp,$breg,$treg) =
                $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {      # format 5
        $word  = (0x03<<26) | ($breg<<21) | (1<<12) | (3<<6) | $treg;
        # offset: low four bits go to bits 17-20, bit 4 goes to bit 16
        $word |= (($disp&0xF)<<17) | (($disp&0x10)<<12);
        $word |= (1<<5)  if ($mod =~ /^,m/);
        $word |= (1<<13) if ($mod =~ /^,mb/);
    }

    return defined($word) ? sprintf("\t.WORD\t0x%08x\t; %s",$word,$orig)
                          : "\t".$orig;
};
653
# Hand-encode the PA-RISC 2.0 "std" instruction as a raw .WORD.
#   $mod  - completer text that followed the mnemonic (only echoed back)
#   $args - operand text of the form "%rS,disp(%rB)"
# Returns the replacement line with the original instruction as a trailing
# assembler comment, or "\tstd<mod>\t<args>" when the operands do not match.
my $std = sub {
    my ($mod,$args) = @_;
    my $orig = "std$mod\t$args";

    # format 3 suffices
    my ($src,$disp,$base) =
        $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/
            or return "\t".$orig;

    my $word = (0x1c<<26) | ($base<<21) | ($src<<16);
    $word |= ($disp&0x1FF8)<<1;         # displacement bits 3-12
    $word |= ($disp>>13)&1;             # displacement bit 13 lands in bit 0
    return sprintf("\t.WORD\t0x%08x\t; %s",$word,$orig);
};
664
# Hand-encode the PA-RISC 2.0 "extrd" instruction as a raw .WORD.
#   $mod  - completer text that followed the mnemonic
#   $args - operand text, "%rS,pos,len,%rT" or "%rS,%sar,len,%rT"
# Returns the replacement line with the original instruction as a trailing
# assembler comment, or "\textrd<mod>\t<args>" when the operands do not match.
my $extrd = sub {
    my ($mod,$args) = @_;
    my $orig = "extrd$mod\t$args";
    my $word;

    # Only the ",u" completer is ever used, so it is implied by the encoding.
    if (my ($src,$pos,$width,$dst) =
                $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {   # format 15
        my $clen = 32-$width;
        $word  = (0x36<<26) | ($src<<21) | ($dst<<16);
        $word |= (($pos&0x20)<<6)  | (($pos&0x1f)<<5);          # encode pos
        $word |= (($clen&0x20)<<7) | ($clen&0x1f);              # encode len
    }
    elsif (my ($sreg,$bits,$dreg) =
                $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {       # format 12
        my $clen = 32-$bits;
        $word  = (0x34<<26) | ($sreg<<21) | ($dreg<<16) | (2<<11) | (1<<9);
        $word |= (($clen&0x20)<<3) | ($clen&0x1f);              # encode len
        $word |= (1<<13) if ($mod =~ /,\**=/);                  # "=" condition
    }

    return defined($word) ? sprintf("\t.WORD\t0x%08x\t; %s",$word,$orig)
                          : "\t".$orig;
};
686
# Hand-encode the PA-RISC 2.0 "shrpd" instruction as a raw .WORD.
#   $mod  - completer text that followed the mnemonic (only echoed back)
#   $args - operand text, "%rA,%rB,shift,%rT" or "%rA,%rB,%sar,%rT"
# Returns the replacement line with the original instruction as a trailing
# assembler comment, or "\tshrpd<mod>\t<args>" when the operands do not match.
my $shrpd = sub {
    my ($mod,$args) = @_;
    my $orig = "shrpd$mod\t$args";
    my $word;

    if (my ($r1,$r2,$shift,$dst) =
                $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) { # format 14
        my $cpos = 63-$shift;
        $word  = (0x34<<26) | ($r2<<21) | ($r1<<16) | (1<<10) | $dst;
        $word |= (($cpos&0x20)<<6) | (($cpos&0x1f)<<5);         # encode sa
    }
    elsif (my ($ra,$rb,$rt) =
                $args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {     # format 11
        $word = (0x34<<26) | ($rb<<21) | ($ra<<16) | (1<<9) | $rt;
    }

    return defined($word) ? sprintf("\t.WORD\t0x%08x\t; %s",$word,$orig)
                          : "\t".$orig;
};
703
# Hand-encode the PA-RISC 2.0 "depd" instruction as a raw .WORD.
#   $mod  - completer text that followed the mnemonic
#   $args - operand text of the form "%rS,pos,len,%rT"
# Returns the replacement line with the original instruction as a trailing
# assembler comment, or "\tdepd<mod>\t<args>" when the operands do not match.
my $depd = sub {
    my ($mod,$args) = @_;
    my $orig = "depd$mod\t$args";

    # Only the ",z" completer is ever used, so it is implied by the encoding.
    my ($src,$pos,$width,$dst) =
        $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/              # format 16
            or return "\t".$orig;

    my $cpos = 63-$pos;
    my $clen = 32-$width;
    my $word = (0x3c<<26) | ($dst<<21) | ($src<<16);
    $word |= (($cpos&0x20)<<6) | (($cpos&0x1f)<<5);             # encode pos
    $word |= (($clen&0x20)<<7) | ($clen&0x1f);                  # encode len
    return sprintf("\t.WORD\t0x%08x\t; %s",$word,$orig);
};
719
# Translate one parsed instruction for the 32-bit flavour.
#   $mnemonic - instruction name (lower-case letters only)
#   $mod      - completer text glued to the mnemonic, possibly empty
#   $args     - first whitespace-delimited operand field
# If a hand encoder exists for the mnemonic its result is returned (a .WORD
# line, or the instruction unchanged when the operands are not recognised);
# every other mnemonic is returned as "\t<mnemonic><mod>\t<args>".
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  # Explicit dispatch table: only the PA-RISC 2.0 instructions encoded by hand
  # in this file are ever looked up, instead of string-eval'ing whatever
  # variable happens to share the mnemonic's name.
  my %encoder = (ldd=>$ldd, std=>$std, extrd=>$extrd, shrpd=>$shrpd, depd=>$depd);
  my $opcode = $encoder{$mnemonic};

    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
726
# Post-process the accumulated template line by line and print the result.
foreach (split("\n",$code)) {
        # Replace every `...` span with the value of the enclosed expression.
        s/\`([^\`]*)\`/eval $1/ge;
        if ($SIZE_T==4) {
                # 32-bit flavour is assembled at .LEVEL 1.0: hand-encode the
                # 2.0-only instructions and strip the 64-bit ",*" condition
                # prefix (cmpb,* becomes the 32-bit comb).
                s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
                s/cmpb,\*/comb,/;
                s/,\*/,/;
        }
        # 64-bit flavour returns with bve instead of bv.
        s/\bbv\b/bve/   if ($SIZE_T==8);
        print $_,"\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time, so a
# failure here must abort the build rather than leave a truncated file.
close STDOUT or die "error closing STDOUT: $!";