crypto/modes/asm/aesni-gcm-x86_64.pl: minor optimization.
[openssl.git] / crypto / modes / asm / ghash-parisc.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # April 2010
11 #
12 # The module implements "4-bit" GCM GHASH function and underlying
13 # single multiplication operation in GF(2^128). "4-bit" means that it
14 # uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15 # it processes one byte in 19.6 cycles, which is more than twice as
16 # fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
17 # 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
18 # processed byte. This is ~2.2x faster than 64-bit code generated by
19 # vendor compiler (which used to be very hard to beat:-).
20 #
21 # Special thanks to polarhome.com for providing HP-UX account.
22
# Command line: <flavour> [<output file>].
#
# $flavour selects the ABI (anything containing "64" means the 64-bit
# one), $output names the assembly file to generate. When no output
# file is given the generated code goes to the STDOUT we inherited.
# When a file is given, failing to open it is fatal: carrying on would
# dump the assembly to the wrong place while still reporting success.
$flavour = shift;
$output = shift;
if (defined($output) && $output ne "") {
        # 3-arg open: the file name is never parsed for mode characters
        open STDOUT,">",$output or die "can't open $output: $!";
}
26
# ABI-dependent parameters, picked in one go from the flavour string:
#   $LEVEL          argument of the .LEVEL directive
#   $SIZE_T         size of a saved register slot in bytes
#   $FRAME_MARKER   size of the frame marker area
#   $SAVED_RP       offset (below %sp) of the return pointer slot
#   $PUSH/$PUSHMA   store / store-with-modify mnemonics
#   $POP/$POPMB     load / load-with-modify mnemonics
#   $NREGS          value handed to ENTRY_GR in .CALLINFO
($LEVEL,$SIZE_T,$FRAME_MARKER,$SAVED_RP,
 $PUSH,$PUSHMA,$POP,$POPMB,$NREGS) =
        ($flavour =~ /64/)
                ? ("2.0W",8,80,16,"std","std,ma","ldd","ldd,mb",6)
                : ("1.0", 4,48,20,"stw","stwm",  "ldw","ldwm",  11);
                # 32-bit .LEVEL could carry "\n\t.ALLOW\t2.0" if gas
                # understood it, see the encoder subs further down

# ten register-sized slots on top of the frame marker
$FRAME=$FRAME_MARKER+10*$SIZE_T;
51
################# register allocation
# Incoming arguments, in order: Xi, Htbl, inp, len.
($Xi,$Htbl,$inp,$len)           =map("%r$_",26,25,24,23);
# Working variables kept in registers the file labels "volatile";
# $Hhh simply aliases the Htbl argument register.
$Hhh                            =$Htbl;
($Hll,$Zhh,$Zll,$cnt)           =map("%r$_",22,21,20,19);
($rem_4bit,$rem,$mask0xf0)      =("%r28","%r29","%r31");

# Registers the file labels "preserved".
($Thh,$Tll,$nlo,$nhi,$byte)     =map("%r$_",1..5);
# The 32-bit code keeps Z, H and T in four words each and therefore
# needs six more names; they stay undefined for the 64-bit flavour.
($Zhl,$Zlh,$Hhl,$Hlh,$Thl,$Tlh) =map("%r$_",6..11) if ($SIZE_T==4);
# Second remainder register of the PA-RISC 2.0 code; shares %r6 with
# $Zhl, which only the PA-RISC 1.0 code uses.
$rem2                           ="%r6";
81
# gcm_gmult_4bit: assembler directives and standard prologue.
# Every heredoc in this file is appended to $code. The `...` expressions
# inside the text are not evaluated here; the output loop at the end of
# the script evaluates them line by line. $PUSHMA is the
# store-with-modify form, which is why the saves after it are addressed
# relative to -$FRAME.
82 $code.=<<___;
83         .LEVEL  $LEVEL
84         .SPACE  \$TEXT\$
85         .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
86
87         .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
88         .ALIGN  64
89 gcm_gmult_4bit
90         .PROC
91         .CALLINFO       FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
92         .ENTRY
93         $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
94         $PUSHMA %r3,$FRAME(%sp)
95         $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
96         $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
97         $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
98 ___
# 32-bit flavour only: %r7-%r11 are used by the PA-RISC 1.0 code and
# are saved as well.
99 $code.=<<___ if ($SIZE_T==4);
100         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
101         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
102         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
103         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
104         $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
105 ___
# Position-independent address of the L$rem_4bit table: blr leaves a
# code address in $rem_4bit, andcm clears its two low bits (mask 3 in
# $rem), and ldo adds the assembly-time distance L$rem_4bit-L$pic_gmult.
# $len becomes $inp+$len and $mask0xf0 is loaded with the constant 0xf0.
106 $code.=<<___;
107         blr     %r0,$rem_4bit
108         ldi     3,$rem
109 L\$pic_gmult
110         andcm   $rem_4bit,$rem,$rem_4bit
111         addl    $inp,$len,$len
112         ldo     L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
113         ldi     0xf0,$mask0xf0
114 ___
# 32-bit flavour only: run-time choice between the two code paths. The
# branch to L$parisc1_gmult follows an extrd,u,*= that the inline remark
# marks as executable on PA-RISC 1.0; presumably on a 2.0 CPU it
# nullifies the branch so that execution falls through into the 2.0
# code below -- NOTE(review): confirm against the architecture manual.
115 $code.=<<___ if ($SIZE_T==4);
116         ldi     31,$rem
117         mtctl   $rem,%cr11
118         extrd,u,*= $rem,%sar,1,$rem     ; executes on PA-RISC 1.0
119         b       L\$parisc1_gmult
120         nop
121 ___
122 \f
# gcm_gmult_4bit, PA-RISC 2.0 code path (64-bit loads and stores),
# emitted for both flavours.
# Xi is consumed from byte 15 downwards; $cnt starts at 13 and indexes
# the remaining bytes. Each byte yields two table offsets: $nhi is the
# byte masked with 0xf0 and $nlo is its low nibble deposited four bits
# up, so both are multiples of 16 into the table halves at $Hhh (Htbl)
# and $Hll (Htbl+8). Around each lookup Z ($Zhh:$Zll) is shifted right
# by four bits and the nibble shifted out, scaled to an 8-byte entry,
# selects the L$rem_4bit value that is xor-ed into $Zhh. The final Z is
# stored back to Xi.
123 $code.=<<___;
124         ldb     15($Xi),$nlo
125         ldo     8($Htbl),$Hll
126
127         and     $mask0xf0,$nlo,$nhi
128         depd,z  $nlo,59,4,$nlo
129
130         ldd     $nlo($Hll),$Zll
131         ldd     $nlo($Hhh),$Zhh
132
133         depd,z  $Zll,60,4,$rem
134         shrpd   $Zhh,$Zll,4,$Zll
135         extrd,u $Zhh,59,60,$Zhh
136         ldb     14($Xi),$nlo
137
138         ldd     $nhi($Hll),$Tll
139         ldd     $nhi($Hhh),$Thh
140         and     $mask0xf0,$nlo,$nhi
141         depd,z  $nlo,59,4,$nlo
142
143         xor     $Tll,$Zll,$Zll
144         xor     $Thh,$Zhh,$Zhh
145         ldd     $rem($rem_4bit),$rem
146         b       L\$oop_gmult_pa2
147         ldi     13,$cnt
148
149         .ALIGN  8
150 L\$oop_gmult_pa2
151         xor     $rem,$Zhh,$Zhh          ; moved here to work around gas bug
152         depd,z  $Zll,60,4,$rem
153
154         shrpd   $Zhh,$Zll,4,$Zll
155         extrd,u $Zhh,59,60,$Zhh
156         ldd     $nlo($Hll),$Tll
157         ldd     $nlo($Hhh),$Thh
158
159         xor     $Tll,$Zll,$Zll
160         xor     $Thh,$Zhh,$Zhh
161         ldd     $rem($rem_4bit),$rem
162
163         xor     $rem,$Zhh,$Zhh
164         depd,z  $Zll,60,4,$rem
165         ldbx    $cnt($Xi),$nlo
166
167         shrpd   $Zhh,$Zll,4,$Zll
168         extrd,u $Zhh,59,60,$Zhh
169         ldd     $nhi($Hll),$Tll
170         ldd     $nhi($Hhh),$Thh
171
172         and     $mask0xf0,$nlo,$nhi
173         depd,z  $nlo,59,4,$nlo
174         ldd     $rem($rem_4bit),$rem
175
176         xor     $Tll,$Zll,$Zll
177         addib,uv -1,$cnt,L\$oop_gmult_pa2
178         xor     $Thh,$Zhh,$Zhh
179
180         xor     $rem,$Zhh,$Zhh
181         depd,z  $Zll,60,4,$rem
182
183         shrpd   $Zhh,$Zll,4,$Zll
184         extrd,u $Zhh,59,60,$Zhh
185         ldd     $nlo($Hll),$Tll
186         ldd     $nlo($Hhh),$Thh
187
188         xor     $Tll,$Zll,$Zll
189         xor     $Thh,$Zhh,$Zhh
190         ldd     $rem($rem_4bit),$rem
191
192         xor     $rem,$Zhh,$Zhh
193         depd,z  $Zll,60,4,$rem
194
195         shrpd   $Zhh,$Zll,4,$Zll
196         extrd,u $Zhh,59,60,$Zhh
197         ldd     $nhi($Hll),$Tll
198         ldd     $nhi($Hhh),$Thh
199
200         xor     $Tll,$Zll,$Zll
201         xor     $Thh,$Zhh,$Zhh
202         ldd     $rem($rem_4bit),$rem
203
204         xor     $rem,$Zhh,$Zhh
205         std     $Zll,8($Xi)
206         std     $Zhh,0($Xi)
207 ___
208 \f
# 32-bit flavour only. First a branch that takes the PA-RISC 2.0 code
# above straight to L$done_gmult, then gcm_gmult_4bit once more using
# 32-bit operations only (label L$parisc1_gmult).
# Z and T are kept as four words each (hh, hl, lh, ll) and the table is
# addressed through four pointers: $Hhh (Htbl), $Hhl (Htbl+4),
# $Hlh (Htbl+8) and $Hll (Htbl+12). The nibble handling mirrors the 2.0
# code: $nhi/$nlo are 16-byte-scaled table offsets and the four bits
# shifted out of $Zll index L$rem_4bit, here with a word load.
209 $code.=<<___ if ($SIZE_T==4);
210         b       L\$done_gmult
211         nop
212
213 L\$parisc1_gmult
214         ldb     15($Xi),$nlo
215         ldo     12($Htbl),$Hll
216         ldo     8($Htbl),$Hlh
217         ldo     4($Htbl),$Hhl
218
219         and     $mask0xf0,$nlo,$nhi
220         zdep    $nlo,27,4,$nlo
221
222         ldwx    $nlo($Hll),$Zll
223         ldwx    $nlo($Hlh),$Zlh
224         ldwx    $nlo($Hhl),$Zhl
225         ldwx    $nlo($Hhh),$Zhh
226         zdep    $Zll,28,4,$rem
227         ldb     14($Xi),$nlo
228         ldwx    $rem($rem_4bit),$rem
229         shrpw   $Zlh,$Zll,4,$Zll
230         ldwx    $nhi($Hll),$Tll
231         shrpw   $Zhl,$Zlh,4,$Zlh
232         ldwx    $nhi($Hlh),$Tlh
233         shrpw   $Zhh,$Zhl,4,$Zhl
234         ldwx    $nhi($Hhl),$Thl
235         extru   $Zhh,27,28,$Zhh
236         ldwx    $nhi($Hhh),$Thh
237         xor     $rem,$Zhh,$Zhh
238         and     $mask0xf0,$nlo,$nhi
239         zdep    $nlo,27,4,$nlo
240
241         xor     $Tll,$Zll,$Zll
242         ldwx    $nlo($Hll),$Tll
243         xor     $Tlh,$Zlh,$Zlh
244         ldwx    $nlo($Hlh),$Tlh
245         xor     $Thl,$Zhl,$Zhl
246         b       L\$oop_gmult_pa1
247         ldi     13,$cnt
248
249         .ALIGN  8
250 L\$oop_gmult_pa1
251         zdep    $Zll,28,4,$rem
252         ldwx    $nlo($Hhl),$Thl
253         xor     $Thh,$Zhh,$Zhh
254         ldwx    $rem($rem_4bit),$rem
255         shrpw   $Zlh,$Zll,4,$Zll
256         ldwx    $nlo($Hhh),$Thh
257         shrpw   $Zhl,$Zlh,4,$Zlh
258         ldbx    $cnt($Xi),$nlo
259         xor     $Tll,$Zll,$Zll
260         ldwx    $nhi($Hll),$Tll
261         shrpw   $Zhh,$Zhl,4,$Zhl
262         xor     $Tlh,$Zlh,$Zlh
263         ldwx    $nhi($Hlh),$Tlh
264         extru   $Zhh,27,28,$Zhh
265         xor     $Thl,$Zhl,$Zhl
266         ldwx    $nhi($Hhl),$Thl
267         xor     $rem,$Zhh,$Zhh
268         zdep    $Zll,28,4,$rem
269         xor     $Thh,$Zhh,$Zhh
270         ldwx    $nhi($Hhh),$Thh
271         shrpw   $Zlh,$Zll,4,$Zll
272         ldwx    $rem($rem_4bit),$rem
273         shrpw   $Zhl,$Zlh,4,$Zlh
274         shrpw   $Zhh,$Zhl,4,$Zhl
275         and     $mask0xf0,$nlo,$nhi
276         extru   $Zhh,27,28,$Zhh
277         zdep    $nlo,27,4,$nlo
278         xor     $Tll,$Zll,$Zll
279         ldwx    $nlo($Hll),$Tll
280         xor     $Tlh,$Zlh,$Zlh
281         ldwx    $nlo($Hlh),$Tlh
282         xor     $rem,$Zhh,$Zhh
283         addib,uv -1,$cnt,L\$oop_gmult_pa1
284         xor     $Thl,$Zhl,$Zhl
285
286         zdep    $Zll,28,4,$rem
287         ldwx    $nlo($Hhl),$Thl
288         xor     $Thh,$Zhh,$Zhh
289         ldwx    $rem($rem_4bit),$rem
290         shrpw   $Zlh,$Zll,4,$Zll
291         ldwx    $nlo($Hhh),$Thh
292         shrpw   $Zhl,$Zlh,4,$Zlh
293         xor     $Tll,$Zll,$Zll
294         ldwx    $nhi($Hll),$Tll
295         shrpw   $Zhh,$Zhl,4,$Zhl
296         xor     $Tlh,$Zlh,$Zlh
297         ldwx    $nhi($Hlh),$Tlh
298         extru   $Zhh,27,28,$Zhh
299         xor     $rem,$Zhh,$Zhh
300         xor     $Thl,$Zhl,$Zhl
301         ldwx    $nhi($Hhl),$Thl
302         xor     $Thh,$Zhh,$Zhh
303         ldwx    $nhi($Hhh),$Thh
304         zdep    $Zll,28,4,$rem
305         ldwx    $rem($rem_4bit),$rem
306         shrpw   $Zlh,$Zll,4,$Zll
307         shrpw   $Zhl,$Zlh,4,$Zlh
308         shrpw   $Zhh,$Zhl,4,$Zhl
309         extru   $Zhh,27,28,$Zhh
310         xor     $Tll,$Zll,$Zll
311         xor     $Tlh,$Zlh,$Zlh
312         xor     $rem,$Zhh,$Zhh
313         stw     $Zll,12($Xi)
314         xor     $Thl,$Zhl,$Zhl
315         stw     $Zlh,8($Xi)
316         xor     $Thh,$Zhh,$Zhh
317         stw     $Zhl,4($Xi)
318         stw     $Zhh,0($Xi)
319 ___
# Common exit of gcm_gmult_4bit (label L$done_gmult): reload the return
# pointer into %r2 and restore %r4-%r6 from the frame.
320 $code.=<<___;
321 L\$done_gmult
322         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2            ; standard epilogue
323         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
324         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
325         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
326 ___
# 32-bit flavour only: restore %r7-%r11 as well.
327 $code.=<<___ if ($SIZE_T==4);
328         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
329         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
330         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
331         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
332         $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
333 ___
# Return from gcm_gmult_4bit; the $POPMB after the branch restores %r3
# with the load-with-modify form, the counterpart of the $PUSHMA in the
# prologue. The same heredoc then opens gcm_ghash_4bit, which declares
# four register arguments, and emits its standard prologue.
# NOTE(review): ENTRY_GR is hard-coded to 11 here whereas gcm_gmult_4bit
# passes $NREGS -- confirm whether that is intended for the 64-bit flavour.
334 $code.=<<___;
335         bv      (%r2)
336         .EXIT
337         $POPMB  -$FRAME(%sp),%r3
338         .PROCEND
339
340         .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
341         .ALIGN  64
342 gcm_ghash_4bit
343         .PROC
344         .CALLINFO       FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
345         .ENTRY
346         $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
347         $PUSHMA %r3,$FRAME(%sp)
348         $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
349         $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
350         $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
351 ___
# 32-bit flavour only: save %r7-%r11 as well.
352 $code.=<<___ if ($SIZE_T==4);
353         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
354         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
355         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
356         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
357         $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
358 ___
# Same position-independent L$rem_4bit lookup as in gcm_gmult_4bit,
# anchored at L$pic_ghash. $len is turned into the end-of-input address
# $inp+$len, which the outer loops compare $inp against.
359 $code.=<<___;
360         blr     %r0,$rem_4bit
361         ldi     3,$rem
362 L\$pic_ghash
363         andcm   $rem_4bit,$rem,$rem_4bit
364         addl    $inp,$len,$len
365         ldo     L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
366         ldi     0xf0,$mask0xf0
367 ___
# 32-bit flavour only: same run-time dispatch sequence as in
# gcm_gmult_4bit, here branching to L$parisc1_ghash.
368 $code.=<<___ if ($SIZE_T==4);
369         ldi     31,$rem
370         mtctl   $rem,%cr11
371         extrd,u,*= $rem,%sar,1,$rem     ; executes on PA-RISC 1.0
372         b       L\$parisc1_ghash
373         nop
374 ___
375 \f\f
# gcm_ghash_4bit, PA-RISC 2.0 code path, emitted for both flavours.
# The per-byte table walk is the one used by gcm_gmult_4bit, except that
# every Xi byte is xor-ed with the input byte at the same offset before
# it is split into the $nlo/$nhi table offsets. $rem2 is a second
# remainder register next to $rem.
# Outer loop (L$outer_ghash_pa2): one 16-byte input block per pass. Z is
# stored to Xi, $inp advances by 16 and the loop repeats while $inp
# differs from $len (the end address computed earlier). The copy of $Zll
# into $nlo after the compare hands the freshly computed low end of Z to
# the next pass, where it stands in for byte 15 of Xi (only its low
# eight bits survive the 0xf0 mask and the 4-bit deposit).
376 $code.=<<___;
377         ldb     15($Xi),$nlo
378         ldo     8($Htbl),$Hll
379
380 L\$outer_ghash_pa2
381         ldb     15($inp),$nhi
382         xor     $nhi,$nlo,$nlo
383         and     $mask0xf0,$nlo,$nhi
384         depd,z  $nlo,59,4,$nlo
385
386         ldd     $nlo($Hll),$Zll
387         ldd     $nlo($Hhh),$Zhh
388
389         depd,z  $Zll,60,4,$rem
390         shrpd   $Zhh,$Zll,4,$Zll
391         extrd,u $Zhh,59,60,$Zhh
392         ldb     14($Xi),$nlo
393         ldb     14($inp),$byte
394
395         ldd     $nhi($Hll),$Tll
396         ldd     $nhi($Hhh),$Thh
397         xor     $byte,$nlo,$nlo
398         and     $mask0xf0,$nlo,$nhi
399         depd,z  $nlo,59,4,$nlo
400
401         xor     $Tll,$Zll,$Zll
402         xor     $Thh,$Zhh,$Zhh
403         ldd     $rem($rem_4bit),$rem
404         b       L\$oop_ghash_pa2
405         ldi     13,$cnt
406
407         .ALIGN  8
408 L\$oop_ghash_pa2
409         xor     $rem,$Zhh,$Zhh          ; moved here to work around gas bug
410         depd,z  $Zll,60,4,$rem2
411
412         shrpd   $Zhh,$Zll,4,$Zll
413         extrd,u $Zhh,59,60,$Zhh
414         ldd     $nlo($Hll),$Tll
415         ldd     $nlo($Hhh),$Thh
416
417         xor     $Tll,$Zll,$Zll
418         xor     $Thh,$Zhh,$Zhh
419         ldbx    $cnt($Xi),$nlo
420         ldbx    $cnt($inp),$byte
421
422         depd,z  $Zll,60,4,$rem
423         shrpd   $Zhh,$Zll,4,$Zll
424         ldd     $rem2($rem_4bit),$rem2
425
426         xor     $rem2,$Zhh,$Zhh
427         xor     $byte,$nlo,$nlo
428         ldd     $nhi($Hll),$Tll
429         ldd     $nhi($Hhh),$Thh
430
431         and     $mask0xf0,$nlo,$nhi
432         depd,z  $nlo,59,4,$nlo
433
434         extrd,u $Zhh,59,60,$Zhh
435         xor     $Tll,$Zll,$Zll
436
437         ldd     $rem($rem_4bit),$rem
438         addib,uv -1,$cnt,L\$oop_ghash_pa2
439         xor     $Thh,$Zhh,$Zhh
440
441         xor     $rem,$Zhh,$Zhh
442         depd,z  $Zll,60,4,$rem2
443
444         shrpd   $Zhh,$Zll,4,$Zll
445         extrd,u $Zhh,59,60,$Zhh
446         ldd     $nlo($Hll),$Tll
447         ldd     $nlo($Hhh),$Thh
448
449         xor     $Tll,$Zll,$Zll
450         xor     $Thh,$Zhh,$Zhh
451
452         depd,z  $Zll,60,4,$rem
453         shrpd   $Zhh,$Zll,4,$Zll
454         ldd     $rem2($rem_4bit),$rem2
455
456         xor     $rem2,$Zhh,$Zhh
457         ldd     $nhi($Hll),$Tll
458         ldd     $nhi($Hhh),$Thh
459
460         extrd,u $Zhh,59,60,$Zhh
461         xor     $Tll,$Zll,$Zll
462         xor     $Thh,$Zhh,$Zhh
463         ldd     $rem($rem_4bit),$rem
464
465         xor     $rem,$Zhh,$Zhh
466         std     $Zll,8($Xi)
467         ldo     16($inp),$inp
468         std     $Zhh,0($Xi)
469         cmpb,*<> $inp,$len,L\$outer_ghash_pa2
470         copy    $Zll,$nlo
471 ___
472 \f
# 32-bit flavour only. First a branch that takes the PA-RISC 2.0 code
# above to L$done_ghash, then gcm_ghash_4bit using 32-bit operations
# only (label L$parisc1_ghash).
# Register layout matches the 1.0 gmult code: Z and T as four words
# each, table pointers $Hhh/$Hhl/$Hlh/$Hll at Htbl+0/4/8/12. As in the
# 2.0 ghash path, each Xi byte is xor-ed with the matching input byte
# ($byte) before the nibble split, and the outer loop
# (L$outer_ghash_pa1) advances $inp by 16 per block until it equals $len.
473 $code.=<<___ if ($SIZE_T==4);
474         b       L\$done_ghash
475         nop
476
477 L\$parisc1_ghash
478         ldb     15($Xi),$nlo
479         ldo     12($Htbl),$Hll
480         ldo     8($Htbl),$Hlh
481         ldo     4($Htbl),$Hhl
482
483 L\$outer_ghash_pa1
484         ldb     15($inp),$byte
485         xor     $byte,$nlo,$nlo
486         and     $mask0xf0,$nlo,$nhi
487         zdep    $nlo,27,4,$nlo
488
489         ldwx    $nlo($Hll),$Zll
490         ldwx    $nlo($Hlh),$Zlh
491         ldwx    $nlo($Hhl),$Zhl
492         ldwx    $nlo($Hhh),$Zhh
493         zdep    $Zll,28,4,$rem
494         ldb     14($Xi),$nlo
495         ldb     14($inp),$byte
496         ldwx    $rem($rem_4bit),$rem
497         shrpw   $Zlh,$Zll,4,$Zll
498         ldwx    $nhi($Hll),$Tll
499         shrpw   $Zhl,$Zlh,4,$Zlh
500         ldwx    $nhi($Hlh),$Tlh
501         shrpw   $Zhh,$Zhl,4,$Zhl
502         ldwx    $nhi($Hhl),$Thl
503         extru   $Zhh,27,28,$Zhh
504         ldwx    $nhi($Hhh),$Thh
505         xor     $byte,$nlo,$nlo
506         xor     $rem,$Zhh,$Zhh
507         and     $mask0xf0,$nlo,$nhi
508         zdep    $nlo,27,4,$nlo
509
510         xor     $Tll,$Zll,$Zll
511         ldwx    $nlo($Hll),$Tll
512         xor     $Tlh,$Zlh,$Zlh
513         ldwx    $nlo($Hlh),$Tlh
514         xor     $Thl,$Zhl,$Zhl
515         b       L\$oop_ghash_pa1
516         ldi     13,$cnt
517
518         .ALIGN  8
519 L\$oop_ghash_pa1
520         zdep    $Zll,28,4,$rem
521         ldwx    $nlo($Hhl),$Thl
522         xor     $Thh,$Zhh,$Zhh
523         ldwx    $rem($rem_4bit),$rem
524         shrpw   $Zlh,$Zll,4,$Zll
525         ldwx    $nlo($Hhh),$Thh
526         shrpw   $Zhl,$Zlh,4,$Zlh
527         ldbx    $cnt($Xi),$nlo
528         xor     $Tll,$Zll,$Zll
529         ldwx    $nhi($Hll),$Tll
530         shrpw   $Zhh,$Zhl,4,$Zhl
531         ldbx    $cnt($inp),$byte
532         xor     $Tlh,$Zlh,$Zlh
533         ldwx    $nhi($Hlh),$Tlh
534         extru   $Zhh,27,28,$Zhh
535         xor     $Thl,$Zhl,$Zhl
536         ldwx    $nhi($Hhl),$Thl
537         xor     $rem,$Zhh,$Zhh
538         zdep    $Zll,28,4,$rem
539         xor     $Thh,$Zhh,$Zhh
540         ldwx    $nhi($Hhh),$Thh
541         shrpw   $Zlh,$Zll,4,$Zll
542         ldwx    $rem($rem_4bit),$rem
543         shrpw   $Zhl,$Zlh,4,$Zlh
544         xor     $byte,$nlo,$nlo
545         shrpw   $Zhh,$Zhl,4,$Zhl
546         and     $mask0xf0,$nlo,$nhi
547         extru   $Zhh,27,28,$Zhh
548         zdep    $nlo,27,4,$nlo
549         xor     $Tll,$Zll,$Zll
550         ldwx    $nlo($Hll),$Tll
551         xor     $Tlh,$Zlh,$Zlh
552         ldwx    $nlo($Hlh),$Tlh
553         xor     $rem,$Zhh,$Zhh
554         addib,uv -1,$cnt,L\$oop_ghash_pa1
555         xor     $Thl,$Zhl,$Zhl
556
557         zdep    $Zll,28,4,$rem
558         ldwx    $nlo($Hhl),$Thl
559         xor     $Thh,$Zhh,$Zhh
560         ldwx    $rem($rem_4bit),$rem
561         shrpw   $Zlh,$Zll,4,$Zll
562         ldwx    $nlo($Hhh),$Thh
563         shrpw   $Zhl,$Zlh,4,$Zlh
564         xor     $Tll,$Zll,$Zll
565         ldwx    $nhi($Hll),$Tll
566         shrpw   $Zhh,$Zhl,4,$Zhl
567         xor     $Tlh,$Zlh,$Zlh
568         ldwx    $nhi($Hlh),$Tlh
569         extru   $Zhh,27,28,$Zhh
570         xor     $rem,$Zhh,$Zhh
571         xor     $Thl,$Zhl,$Zhl
572         ldwx    $nhi($Hhl),$Thl
573         xor     $Thh,$Zhh,$Zhh
574         ldwx    $nhi($Hhh),$Thh
575         zdep    $Zll,28,4,$rem
576         ldwx    $rem($rem_4bit),$rem
577         shrpw   $Zlh,$Zll,4,$Zll
578         shrpw   $Zhl,$Zlh,4,$Zlh
579         shrpw   $Zhh,$Zhl,4,$Zhl
580         extru   $Zhh,27,28,$Zhh
581         xor     $Tll,$Zll,$Zll
582         xor     $Tlh,$Zlh,$Zlh
583         xor     $rem,$Zhh,$Zhh
584         stw     $Zll,12($Xi)
585         xor     $Thl,$Zhl,$Zhl
586         stw     $Zlh,8($Xi)
587         xor     $Thh,$Zhh,$Zhh
588         stw     $Zhl,4($Xi)
589         ldo     16($inp),$inp
590         stw     $Zhh,0($Xi)
591         comb,<> $inp,$len,L\$outer_ghash_pa1
592         copy    $Zll,$nlo
593 ___
# Common exit of gcm_ghash_4bit (label L$done_ghash): reload the return
# pointer into %r2 and restore %r4-%r6 from the frame.
594 $code.=<<___;
595 L\$done_ghash
596         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2            ; standard epilogue
597         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
598         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
599         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
600 ___
# 32-bit flavour only: restore %r7-%r11 as well.
601 $code.=<<___ if ($SIZE_T==4);
602         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
603         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
604         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
605         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
606         $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
607 ___
# Return from gcm_ghash_4bit (the $POPMB restores %r3), followed by the
# L$rem_4bit table shared by both routines: sixteen entries of two
# .WORDs each, i.e. 8 bytes per entry and 128 bytes in total. The
# 16-bit constant is shifted into the upper half of the first word (the
# `...<<16` is evaluated by the output loop) and the second word is
# zero. The table is followed by an identification string.
608 $code.=<<___;
609         bv      (%r2)
610         .EXIT
611         $POPMB  -$FRAME(%sp),%r3
612         .PROCEND
613
614         .ALIGN  64
615 L\$rem_4bit
616         .WORD   `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
617         .WORD   `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
618         .WORD   `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
619         .WORD   `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
620         .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
621         .ALIGN  64
622 ___
623
624 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
625 # that it can be compiled with .LEVEL 1.0. It should be noted that I
626 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
627 # directive...
628
# Encoder for the PA-RISC 2.0 "ldd" instruction.
#   $mod   completer text that followed the mnemonic ("" or ",m...")
#   $args  operand string with register names already substituted
# Returns a ".WORD" line carrying the hand-assembled opcode, with the
# original instruction appended as a comment, or the instruction text
# itself when the operands match neither supported form.
my $ldd = sub {
    my ($mod,$args) = @_;
    my $text = "ldd$mod\t$args";
    my $word;

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)         # format 4
    {   my ($index,$base,$target) = ($1,$2,$3);
        $word = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
    }
    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)     # format 5
    {   my ($disp,$base,$target) = ($1,$2,$3);
        $word  = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
        # 5-bit displacement: low four bits first, sign bit below them
        $word |= (($disp&0xF)<<17)|(($disp&0x10)<<12);
        $word |= (1<<5)  if ($mod =~ /^,m/);    # base modification
        $word |= (1<<13) if ($mod =~ /^,mb/);   # ...applied before access
    }

    defined($word) ? sprintf("\t.WORD\t0x%08x\t; %s",$word,$text)
                   : "\t".$text;
};
646
# Encoder for the PA-RISC 2.0 "std" instruction.
#   $mod   completer text that followed the mnemonic (only echoed)
#   $args  operand string, expected as "%rS,disp(%rB)"
# Returns a ".WORD" line with the opcode and the original instruction
# as a trailing comment, or the instruction text itself when the
# operands have a different shape.
my $std = sub {
    my ($mod,$args) = @_;
    my $text = "std$mod\t$args";

    # anything but register,displacement(base) is passed through
    return "\t".$text
        unless ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/);

    # format 3 suffices
    my ($source,$disp,$base) = ($1,$2,$3);
    my $word = (0x1c<<26)|($base<<21)|($source<<16)
             | (($disp&0x1FF8)<<1)|(($disp>>13)&1);

    return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
};
657
# Encoder for the PA-RISC 2.0 "extrd" instruction.
#   $mod   completer text that followed the mnemonic
#   $args  operand string, either "%rS,pos,len,%rT" (fixed position)
#          or "%rS,%sar,len,%rT" (position taken from %sar)
# Only the ",u" completer occurs in this module and it is implied by
# the opcode bits below. Returns a ".WORD" line with the original
# instruction as a comment, or the instruction text itself when the
# operands match neither form.
my $extrd = sub {
    my ($mod,$args) = @_;
    my $text = "extrd$mod\t$args";
    my $word;

    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)     # format 15
    {   my ($source,$pos,$width,$target) = ($1,$2,$3,$4);
        my $len = 32-$width;
        $word  = (0x36<<26)|($source<<21)|($target<<16);
        $word |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);             # encode pos
        $word |= (($len&0x20)<<7)|($len&0x1f);                  # encode len
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)      # format 12
    {   my ($source,$width,$target) = ($1,$2,$3);
        my $len = 32-$width;
        $word  = (0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
        $word |= (($len&0x20)<<3)|($len&0x1f);                  # encode len
        $word |= (1<<13) if ($mod =~ /,\**=/);                  # "=" condition
    }

    defined($word) ? sprintf("\t.WORD\t0x%08x\t; %s",$word,$text)
                   : "\t".$text;
};
679
# Encoder for the PA-RISC 2.0 "shrpd" instruction.
#   $mod   completer text that followed the mnemonic (only echoed)
#   $args  operand string, either "%rA,%rB,count,%rT" (fixed shift)
#          or "%rA,%rB,%sar,%rT" (shift amount taken from %sar)
# Returns a ".WORD" line with the original instruction as a comment,
# or the instruction text itself when the operands match neither form.
my $shrpd = sub {
    my ($mod,$args) = @_;
    my $text = "shrpd$mod\t$args";
    my $word;

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)   # format 14
    {   my ($first,$second,$count,$target) = ($1,$2,$3,$4);
        my $cpos = 63-$count;
        $word  = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
        $word |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);           # encode sa
    }
    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)    # format 11
    {   my ($first,$second,$target) = ($1,$2,$3);
        $word = (0x34<<26)|($second<<21)|($first<<16)|(1<<9)|$target;
    }

    defined($word) ? sprintf("\t.WORD\t0x%08x\t; %s",$word,$text)
                   : "\t".$text;
};
696
# Encoder for the PA-RISC 2.0 "depd" instruction.
#   $mod   completer text that followed the mnemonic
#   $args  operand string, expected as "%rS,pos,len,%rT"
# Only the ",z" completer occurs in this module and it is implied by
# the opcode bits below. Returns a ".WORD" line with the original
# instruction as a comment, or the instruction text itself when the
# operands have a different shape.
my $depd = sub {
    my ($mod,$args) = @_;
    my $text = "depd$mod\t$args";

    # anything but register,pos,len,register is passed through
    return "\t".$text
        unless ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/);

    # format 16
    my ($source,$pos,$width,$target) = ($1,$2,$3,$4);
    my $cpos = 63-$pos;
    my $len  = 32-$width;
    my $word = (0x3c<<26)|($target<<21)|($source<<16);
    $word |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);               # encode pos
    $word |= (($len&0x20)<<7)|($len&0x1f);                      # encode len

    return sprintf("\t.WORD\t0x%08x\t; %s",$word,$text);
};
712
# Translate one instruction for the 32-bit flavour.
#   $mnemonic  instruction name (lower-case letters only, as matched by
#              the output loop)
#   $mod       completer text glued to the mnemonic, possibly empty
#   $args      operand string
# Returns the line to emit: the result of the matching PA-RISC 2.0
# encoder when there is one, otherwise "\t<mnemonic><mod>\t<args>".
#
# The encoders are looked up in an explicit table instead of evaluating
# "\$$mnemonic" as a string: with eval any scalar that happened to share
# its name with a mnemonic and to hold a code reference would have been
# called. A new encoder has to be added to the table below.
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my %encoder = (
        ldd     => $ldd,
        std     => $std,
        extrd   => $extrd,
        shrpd   => $shrpd,
        depd    => $depd,
  );
  my $opcode = $encoder{$mnemonic};

    ref($opcode) eq 'CODE' ? $opcode->($mod,$args) : "\t$mnemonic$mod\t$args";
}
719
# Post-process the accumulated template line by line and print it.
foreach (split("\n",$code)) {
        # evaluate the `...` arithmetic embedded in the template
        s/\`([^\`]*)\`/eval $1/ge;
        if ($SIZE_T==4) {
                # 32-bit flavour: hand-encode PA-RISC 2.0 instructions
                # via assemble() and fall back to the 32-bit spelling of
                # compare-and-branch and of the "*" condition prefix
                s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
                s/cmpb,\*/comb,/;
                s/,\*/,/;
        }
        # 64-bit flavour: return with bve instead of bv
        s/\bbv\b/bve/   if ($SIZE_T==8);
        print $_,"\n";
}

# Buffered write errors (full disk, I/O error) only surface here; an
# unchecked close would leave a truncated assembly file behind while
# the script still exits successfully.
close STDOUT or die "error closing STDOUT: $!";