sha512-ppc.pl: add PPC32 code, >2x improvement on in-order cores.
[openssl.git] / crypto / sha / asm / sha512-parisc.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # SHA256/512 block procedure for PA-RISC.
11
12 # June 2009.
13 #
14 # SHA256 performance is >75% better than gcc 3.2 generated code on
15 # PA-7100LC. Compared to code generated by vendor compiler this
16 # implementation is almost 70% faster in 64-bit build, but delivers
17 # virtually same performance in 32-bit build on PA-8600.
18 #
19 # SHA512 performance is >2.9x better than gcc 3.2 generated code on
20 # PA-7100LC, PA-RISC 1.1 processor. The implementation detects if the
21 # code is executed on PA-RISC 2.0 processor and switches to 64-bit
22 # code path delivering adequate performance even in "blended" 32-bit
23 # build. Though 64-bit code is not any faster than code generated by
24 # vendor compiler on PA-8600...
25 #
26 # Special thanks to polarhome.com for providing HP-UX account.
27
# Command line: <flavour> <output file>.  The flavour string selects the
# ABI parameters and the output file name selects the hash variant (both
# are pattern-matched further down in this script).
$flavour = shift;
$output = shift;

# Redirect STDOUT to the output file, because the generated code is
# emitted with plain print.  Three-argument open keeps the file name from
# being interpreted as part of the mode, and a failure is reported instead
# of being silently ignored.  When no output file is named, the generated
# code goes to the inherited STDOUT.
if (defined($output) && $output ne "") {
        open STDOUT,">",$output or die "can't open $output: $!";
}
31
# ABI-dependent parameters.  A flavour containing "64" selects assembler
# level 2.0W, 8-byte size_t and std/ldd for saving and restoring registers;
# any other flavour selects level 1.0, 4-byte size_t and stw/ldw.
32 if ($flavour =~ /64/) {
33         $LEVEL          ="2.0W";
34         $SIZE_T         =8;
35         $FRAME_MARKER   =80;
36         $SAVED_RP       =16;
37         $PUSH           ="std";
38         $PUSHMA         ="std,ma";
39         $POP            ="ldd";
40         $POPMB          ="ldd,mb";
41 } else {
42         $LEVEL          ="1.0";
43         $SIZE_T         =4;
44         $FRAME_MARKER   =48;
45         $SAVED_RP       =20;
46         $PUSH           ="stw";
47         $PUSHMA         ="stwm";
48         $POP            ="ldw";
49         $POPMB          ="ldwm";
50 }
51
# Hash-variant parameters, chosen by whether the output file name contains
# "512": exported function name, word size in bytes ($SZ), rotate/shift
# amounts for the four sigma functions, number of rounds, the value that
# the low 10 bits of the last table entry are compared against
# ($LAST10BITS), and the load/store mnemonics for one hash word.
52 if ($output =~ /512/) {
53         $func="sha512_block_data_order";
54         $SZ=8;
55         @Sigma0=(28,34,39);
56         @Sigma1=(14,18,41);
57         @sigma0=(1,  8, 7);
58         @sigma1=(19,61, 6);
59         $rounds=80;
60         $LAST10BITS=0x017;
61         $LD="ldd";
62         $LDM="ldd,ma";
63         $ST="std";
64 } else {
65         $func="sha256_block_data_order";
66         $SZ=4;
67         @Sigma0=( 2,13,22);
68         @Sigma1=( 6,11,25);
69         @sigma0=( 7,18, 3);
70         @sigma1=(17,19,10);
71         $rounds=64;
72         $LAST10BITS=0x0f2;
73         $LD="ldw";
74         $LDM="ldwm";
75         $ST="stw";
76 }
77
# Stack frame size.  After these statements $FRAME is the total frame size
# and $XOFF is the offset (below %sp) at which the local variable area is
# addressed by the code emitted later.
78 $FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
79                                 #                 [+ argument transfer]
80 $XOFF=16*$SZ+32;                # local variables
81 $FRAME+=$XOFF;
82 $XOFF+=$FRAME_MARKER;           # distance between %sp and local variables
83
# Argument registers.  They are the same physical registers as the
# temporaries $a0, $a1 and $t0 assigned just below, which is why the
# emitted code saves the arguments on the stack before using temporaries.
84 $ctx="%r26";    # zapped by $a0
85 $inp="%r25";    # zapped by $a1
86 $num="%r24";    # zapped by $t0
87
# Temporaries and the pointer into the round-constant table.
88 $a0 ="%r26";
89 $a1 ="%r25";
90 $t0 ="%r24";
91 $t1 ="%r29";
92 $Tbl="%r31";
93
# Registers for the eight working variables; @V is rotated by the round
# loops so that the same round template can be reused for every round.
94 @V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
95
# Registers for the 16-word message block.  The 17th entry reuses $inp and
# is only referenced when the input has to be realigned.
96 @X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
97     "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
98
# Append the assembly for one round to $code.
#
# Arguments: the round index $i followed by the eight registers currently
# playing the roles a..h.
#
# The emitted sequence adds to h: the value held in $t1 (loaded from the
# table by an earlier $LDM), X[i%16], Ch(e,f,g) and Sigma1(e); then adds h
# to d; then adds Sigma0(a) and Maj(a,b,c) to h.  _ror is a made-up
# mnemonic that is translated when $code is post-processed at the end of
# the script, and the backquoted fragments are evaluated at that point too.
# Only for $i<15 does the template also emit a $LDM load of the next table
# entry into $t1.
99 sub ROUND_00_15 {
100 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
101 $code.=<<___;
102         _ror    $e,$Sigma1[0],$a0
103         and     $f,$e,$t0
104         _ror    $e,$Sigma1[1],$a1
105         addl    $t1,$h,$h
106         andcm   $g,$e,$t1
107         xor     $a1,$a0,$a0
108         _ror    $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
109         or      $t0,$t1,$t1             ; Ch(e,f,g)
110         addl    @X[$i%16],$h,$h
111         xor     $a0,$a1,$a1             ; Sigma1(e)
112         addl    $t1,$h,$h
113         _ror    $a,$Sigma0[0],$a0
114         addl    $a1,$h,$h
115
116         _ror    $a,$Sigma0[1],$a1
117         and     $a,$b,$t0
118         and     $a,$c,$t1
119         xor     $a1,$a0,$a0
120         _ror    $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
121         xor     $t1,$t0,$t0
122         and     $b,$c,$t1
123         xor     $a0,$a1,$a1             ; Sigma0(a)
124         addl    $h,$d,$d
125         xor     $t1,$t0,$t0             ; Maj(a,b,c)
126         `"$LDM  $SZ($Tbl),$t1" if ($i<15)`
127         addl    $a1,$h,$h
128         addl    $t0,$h,$h
129
130 ___
131 }
132
# Append the assembly for one round of the schedule-expanding loop.
#
# Arguments: the round index $i (16..31 as called from this script)
# followed by the eight registers currently playing the roles a..h.
#
# After reducing $i by 16, the emitted code updates X[i] in place with
# X[i+9], sigma0(X[i+1]) and sigma1(X[i+14]) (indices taken modulo 16) and
# loads the next table entry into $t1 with $LDM.  For the last slot
# ($i==15) it also emits a comparison of the low 10 bits of that entry with
# $LAST10BITS and an increment of $Tbl by 1 that, per the emitted comment,
# signals the end of the table.  The round itself is then emitted by
# ROUND_00_15, called with $i+16 so that it does not emit a table load of
# its own.
133 sub ROUND_16_xx {
134 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
135 $i-=16;
136 $code.=<<___;
137         _ror    @X[($i+1)%16],$sigma0[0],$a0
138         _ror    @X[($i+1)%16],$sigma0[1],$a1
139         addl    @X[($i+9)%16],@X[$i],@X[$i]
140         _ror    @X[($i+14)%16],$sigma1[0],$t0
141         _ror    @X[($i+14)%16],$sigma1[1],$t1
142         xor     $a1,$a0,$a0
143         _shr    @X[($i+1)%16],$sigma0[2],$a1
144         xor     $t1,$t0,$t0
145         _shr    @X[($i+14)%16],$sigma1[2],$t1
146         xor     $a1,$a0,$a0             ; sigma0(X[(i+1)&0x0f])
147         xor     $t1,$t0,$t0             ; sigma1(X[(i+14)&0x0f])
148         $LDM    $SZ($Tbl),$t1
149         addl    $a0,@X[$i],@X[$i]
150         addl    $t0,@X[$i],@X[$i]
151 ___
152 $code.=<<___ if ($i==15);
153         extru   $t1,31,10,$a1
154         comiclr,<> $LAST10BITS,$a1,%r0
155         ldo     1($Tbl),$Tbl            ; signal end of $Tbl
156 ___
157 &ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
158 }
159
# Start of the generated output: assembler directives followed by the
# round-constant table at label L$table.
160 $code=<<___;
161         .LEVEL  $LEVEL
162         .SPACE  \$TEXT\$
163         .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
164
165         .ALIGN  64
166 L\$table
167 ___
# Table for $SZ==8: 160 32-bit words, i.e. 80 entries of two words each.
# The low 10 bits of the final word are 0x017, the value $LAST10BITS is
# set to for this variant.
168 $code.=<<___ if ($SZ==8);
169         .WORD   0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
170         .WORD   0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
171         .WORD   0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
172         .WORD   0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
173         .WORD   0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
174         .WORD   0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
175         .WORD   0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
176         .WORD   0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
177         .WORD   0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
178         .WORD   0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
179         .WORD   0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
180         .WORD   0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
181         .WORD   0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
182         .WORD   0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
183         .WORD   0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
184         .WORD   0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
185         .WORD   0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
186         .WORD   0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
187         .WORD   0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
188         .WORD   0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
189         .WORD   0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
190         .WORD   0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
191         .WORD   0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
192         .WORD   0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
193         .WORD   0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
194         .WORD   0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
195         .WORD   0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
196         .WORD   0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
197         .WORD   0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
198         .WORD   0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
199         .WORD   0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
200         .WORD   0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
201         .WORD   0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
202         .WORD   0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
203         .WORD   0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
204         .WORD   0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
205         .WORD   0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
206         .WORD   0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
207         .WORD   0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
208         .WORD   0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
209 ___
# Table for $SZ==4: 64 one-word entries.  The low 10 bits of the final
# word are 0x0f2, the value $LAST10BITS is set to for this variant.
210 $code.=<<___ if ($SZ==4);
211         .WORD   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
212         .WORD   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
213         .WORD   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
214         .WORD   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
215         .WORD   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
216         .WORD   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
217         .WORD   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
218         .WORD   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
219         .WORD   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
220         .WORD   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
221         .WORD   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
222         .WORD   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
223         .WORD   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
224         .WORD   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
225         .WORD   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
226         .WORD   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
227 ___
# Function entry: export directive, prologue saving %r2..%r18 in the new
# frame, conversion of $num from a block count into a pointer to the end
# of the input, saving of the three arguments on the stack, and
# position-independent computation of the table address into $Tbl.
228 $code.=<<___;
229
230         .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
231         .ALIGN  64
232 $func
233         .PROC
234         .CALLINFO       FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
235         .ENTRY
236         $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
237         $PUSHMA %r3,$FRAME(%sp)
238         $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
239         $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
240         $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
241         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
242         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
243         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
244         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
245         $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
246         $PUSH   %r12,`-$FRAME+9*$SIZE_T`(%sp)
247         $PUSH   %r13,`-$FRAME+10*$SIZE_T`(%sp)
248         $PUSH   %r14,`-$FRAME+11*$SIZE_T`(%sp)
249         $PUSH   %r15,`-$FRAME+12*$SIZE_T`(%sp)
250         $PUSH   %r16,`-$FRAME+13*$SIZE_T`(%sp)
251         $PUSH   %r17,`-$FRAME+14*$SIZE_T`(%sp)
252         $PUSH   %r18,`-$FRAME+15*$SIZE_T`(%sp)
253
254         _shl    $num,`log(16*$SZ)/log(2)`,$num
255         addl    $inp,$num,$num          ; $num to point at the end of $inp
256
257         $PUSH   $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)    ; save arguments
258         $PUSH   $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
259         $PUSH   $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
260
261         blr     %r0,$Tbl
262         ldi     3,$t1
263 L\$pic
264         andcm   $Tbl,$t1,$Tbl           ; wipe privilege level
265         ldo     L\$table-L\$pic($Tbl),$Tbl
266 ___
# Only for the 8-byte-word variant in a 32-bit build: emit a short probe
# followed by a branch to L$parisc1, the separate code path generated
# further down.  Per the comment on the extrd line, that instruction is one
# that still "executes on PA-RISC 1.0".
267 $code.=<<___ if ($SZ==8 && $SIZE_T==4);
268         ldi     31,$t1
269         mtctl   $t1,%cr11
270         extrd,u,*= $t1,%sar,1,$t1       ; executes on PA-RISC 1.0
271         b       L\$parisc1
272         nop
273 ___
# Load the eight hash words from the context, derive the shift amount used
# for realigning the input from the low bits of $inp, and open the
# per-block loop at L$oop.
274 $code.=<<___;
275         $LD     `0*$SZ`($ctx),$A        ; load context
276         $LD     `1*$SZ`($ctx),$B
277         $LD     `2*$SZ`($ctx),$C
278         $LD     `3*$SZ`($ctx),$D
279         $LD     `4*$SZ`($ctx),$E
280         $LD     `5*$SZ`($ctx),$F
281         $LD     `6*$SZ`($ctx),$G
282         $LD     `7*$SZ`($ctx),$H
283
284         extru   $inp,31,`log($SZ)/log(2)`,$t0
285         sh3addl $t0,%r0,$t0
286         subi    `8*$SZ`,$t0,$t0
287         mtctl   $t0,%cr11               ; load %sar with align factor
288
289 L\$oop
290         ldi     `$SZ-1`,$t0
291         $LDM    $SZ($Tbl),$t1
292         andcm   $inp,$t0,$t0            ; align $inp
293 ___
# Emit loads of X[0]..X[14] from the aligned pointer in $t0.
294         for ($i=0;$i<15;$i++) {         # load input block
295         $code.="\t$LD   `$SZ*$i`($t0),@X[$i]\n";                }
# Emit the branch to L$aligned (taken when $inp equals the aligned
# pointer), followed by the load of X[15] and of the extra 17th word.
296 $code.=<<___;
297         cmpb,*= $inp,$t0,L\$aligned
298         $LD     `$SZ*15`($t0),@X[15]
299         $LD     `$SZ*16`($t0),@X[16]
300 ___
# Emit 16 _align pseudo-instructions, each combining X[i] and X[i+1] into
# X[i]; _align is translated when $code is post-processed.
301         for ($i=0;$i<16;$i++) {         # align data
302         $code.="\t_align        @X[$i],@X[$i+1],@X[$i]\n";      }
303 $code.=<<___;
304 L\$aligned
305         nop     ; otherwise /usr/ccs/bin/as is confused by below .WORD
306 ___
307
# Rounds 0..15.  @V is rotated right by one after every round so that each
# register takes over the role of the next working variable.
308 for($i=0;$i<16;$i++)    { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
309 $code.=<<___;
310 L\$rounds
311         nop     ; otherwise /usr/ccs/bin/as is confused by below .WORD
312 ___
# Sixteen rounds of the schedule-expanding loop body ($i continues from 16
# to 31).  The bb instruction emitted right after them branches back to
# L$rounds until the end-of-table flag in $Tbl is seen.
313 for(;$i<32;$i++)        { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
# After the rounds: reload the saved arguments, rewind $Tbl, add the
# previous context values to the working variables, store the result back,
# advance $inp by one block and loop while $inp differs from $num.
314 $code.=<<___;
315         bb,>=   $Tbl,31,L\$rounds       ; end of $Tbl signalled?
316         nop
317
318         $POP    `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx    ; restore arguments
319         $POP    `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
320         $POP    `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
321         ldo     `-$rounds*$SZ-1`($Tbl),$Tbl             ; rewind $Tbl
322
323         $LD     `0*$SZ`($ctx),@X[0]     ; load context
324         $LD     `1*$SZ`($ctx),@X[1]
325         $LD     `2*$SZ`($ctx),@X[2]
326         $LD     `3*$SZ`($ctx),@X[3]
327         $LD     `4*$SZ`($ctx),@X[4]
328         $LD     `5*$SZ`($ctx),@X[5]
329         addl    @X[0],$A,$A
330         $LD     `6*$SZ`($ctx),@X[6]
331         addl    @X[1],$B,$B
332         $LD     `7*$SZ`($ctx),@X[7]
333         ldo     `16*$SZ`($inp),$inp     ; advance $inp
334
335         $ST     $A,`0*$SZ`($ctx)        ; save context
336         addl    @X[2],$C,$C
337         $ST     $B,`1*$SZ`($ctx)
338         addl    @X[3],$D,$D
339         $ST     $C,`2*$SZ`($ctx)
340         addl    @X[4],$E,$E
341         $ST     $D,`3*$SZ`($ctx)
342         addl    @X[5],$F,$F
343         $ST     $E,`4*$SZ`($ctx)
344         addl    @X[6],$G,$G
345         $ST     $F,`5*$SZ`($ctx)
346         addl    @X[7],$H,$H
347         $ST     $G,`6*$SZ`($ctx)
348         $ST     $H,`7*$SZ`($ctx)
349
350         cmpb,*<>,n $inp,$num,L\$oop
351         $PUSH   $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)    ; save $inp
352 ___
# Second code path, generated only for the 8-byte-word variant in a 32-bit
# build.  The path above is closed with a branch to L$done, and the code
# from L$parisc1 on handles every 64-bit quantity as a hi/lo pair of 32-bit
# registers.
353 if ($SZ==8 && $SIZE_T==4)       # SHA512 for 32-bit PA-RISC 1.0
354 {{
355 $code.=<<___;
356         b       L\$done
357         nop
358
359         .ALIGN  64
360 L\$parisc1
361 ___
362
# Register assignment for this path: sixteen registers hold the eight
# working variables as hi/lo halves, eight more serve as temporaries, and
# @X names two hi/lo pairs used for the current and the next message word
# (these are the registers that held the arguments, as the comment notes).
363 @V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
364       $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) = 
365    ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
366      "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
367 $a0 ="%r17";
368 $a1 ="%r18";
369 $a2 ="%r19";
370 $a3 ="%r20";
371 $t0 ="%r21";
372 $t1 ="%r22";
373 $t2 ="%r28";
374 $t3 ="%r29";
375 $Tbl="%r31";
376
377 @X=("%r23","%r24","%r25","%r26");       # zaps $num,$inp,$ctx
378
# Append one round of the hi/lo-pair code path to $code.
#
# Arguments: round index $i, the sixteen registers holding a..h as hi/lo
# pairs in their current rotation, and an optional $flag.
#
# Without $flag the template first emits loads of X[i+1] from the stack
# area at -$XOFF into the second register pair of @X.  The round then adds
# X[i] and the table entry K[i] (loaded through $Tbl into the registers
# that held X[i]) to h with add/addc, followed by Sigma1(e) and Ch(e,f,g),
# adds h to d, and finally adds Sigma0(a) and Maj(a,b,c) to h.  Rotations
# are written as shd with the 64-bit rotate amount; they are rewritten for
# amounts above 31 when $code is post-processed.
#
# With $flag set and $i==15 the template also emits the end-of-table test:
# the low 10 bits of the low word of K[i] are compared with $LAST10BITS and
# control returns to L$rounds_pa1 unless they match.
#
# As a side effect @X is rotated by one pair, so the "next" pair becomes
# the current one for the following call.
379 sub ROUND_00_15_pa1 {
380 my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
381        $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
382 my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
383
384 $code.=<<___ if (!$flag);
385         ldw     `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
386         ldw     `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo     ; load X[i+1]
387 ___
388 $code.=<<___;
389         shd     $ehi,$elo,$Sigma1[0],$t0
390          add    $Xlo,$hlo,$hlo
391         shd     $elo,$ehi,$Sigma1[0],$t1
392          addc   $Xhi,$hhi,$hhi          ; h += X[i]
393         shd     $ehi,$elo,$Sigma1[1],$t2
394          ldwm   8($Tbl),$Xhi
395         shd     $elo,$ehi,$Sigma1[1],$t3
396          ldw    -4($Tbl),$Xlo           ; load K[i]
397         xor     $t2,$t0,$t0
398         xor     $t3,$t1,$t1
399          and    $flo,$elo,$a0
400          and    $fhi,$ehi,$a1
401         shd     $ehi,$elo,$Sigma1[2],$t2
402          andcm  $glo,$elo,$a2
403         shd     $elo,$ehi,$Sigma1[2],$t3
404          andcm  $ghi,$ehi,$a3
405         xor     $t2,$t0,$t0
406         xor     $t3,$t1,$t1             ; Sigma1(e)
407         add     $Xlo,$hlo,$hlo
408          xor    $a2,$a0,$a0
409         addc    $Xhi,$hhi,$hhi          ; h += K[i]
410          xor    $a3,$a1,$a1             ; Ch(e,f,g)
411
412          add    $t0,$hlo,$hlo
413         shd     $ahi,$alo,$Sigma0[0],$t0
414          addc   $t1,$hhi,$hhi           ; h += Sigma1(e)
415         shd     $alo,$ahi,$Sigma0[0],$t1        
416          add    $a0,$hlo,$hlo
417         shd     $ahi,$alo,$Sigma0[1],$t2
418          addc   $a1,$hhi,$hhi           ; h += Ch(e,f,g)
419         shd     $alo,$ahi,$Sigma0[1],$t3
420
421         xor     $t2,$t0,$t0
422         xor     $t3,$t1,$t1
423         shd     $ahi,$alo,$Sigma0[2],$t2
424         and     $alo,$blo,$a0
425         shd     $alo,$ahi,$Sigma0[2],$t3
426         and     $ahi,$bhi,$a1
427         xor     $t2,$t0,$t0
428         xor     $t3,$t1,$t1             ; Sigma0(a)
429
430         and     $alo,$clo,$a2
431         and     $ahi,$chi,$a3
432         xor     $a2,$a0,$a0
433          add    $hlo,$dlo,$dlo
434         xor     $a3,$a1,$a1
435          addc   $hhi,$dhi,$dhi          ; d += h
436         and     $blo,$clo,$a2
437          add    $t0,$hlo,$hlo
438         and     $bhi,$chi,$a3
439          addc   $t1,$hhi,$hhi           ; h += Sigma0(a)
440         xor     $a2,$a0,$a0
441          add    $a0,$hlo,$hlo
442         xor     $a3,$a1,$a1             ; Maj(a,b,c)
443          addc   $a1,$hhi,$hhi           ; h += Maj(a,b,c)
444
445 ___
446 $code.=<<___ if ($i==15 && $flag);
447         extru   $Xlo,31,10,$Xlo
448         comiclr,= $LAST10BITS,$Xlo,%r0
449         b       L\$rounds_pa1
450         nop
451 ___
452 push(@X,shift(@X)); push(@X,shift(@X));
453 }
454
# Append one round of the schedule-expanding loop for the hi/lo-pair path.
#
# Arguments: round index $i (16..31 as called from this script) followed by
# the sixteen working-variable registers.  $i is shifted off @_, so the @_
# passed on to ROUND_00_15_pa1 holds only the registers.
#
# The emitted code loads X[i+1], X[i+9] and X[i+14] from the stack area,
# adds X[i+9] and the two small-sigma values to X[i] with add/addc, stores
# the new X[i] back, and then emits the round through ROUND_00_15_pa1 with
# $flag set.
#
# NOTE(review): the emitted assembler comment on the last xor of the second
# small-sigma computation reads "sigma0(X[i+14)..." although the rotate
# amounts used there come from @sigma1; the comment text is part of the
# generated output and is left untouched here.
455 sub ROUND_16_xx_pa1 {
456 my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
457 my ($i)=shift;
458 $i-=16;
459 $code.=<<___;
460         ldw     `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
461         ldw     `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo     ; load X[i+1]
462         ldw     `-$XOFF+8*(($i+9)%16)`(%sp),$a1
463         ldw     `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0       ; load X[i+9]
464         ldw     `-$XOFF+8*(($i+14)%16)`(%sp),$a3
465         ldw     `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2      ; load X[i+14]
466         shd     $Xnhi,$Xnlo,$sigma0[0],$t0
467         shd     $Xnlo,$Xnhi,$sigma0[0],$t1
468          add    $a0,$Xlo,$Xlo
469         shd     $Xnhi,$Xnlo,$sigma0[1],$t2
470          addc   $a1,$Xhi,$Xhi
471         shd     $Xnlo,$Xnhi,$sigma0[1],$t3
472         xor     $t2,$t0,$t0
473         shd     $Xnhi,$Xnlo,$sigma0[2],$t2
474         xor     $t3,$t1,$t1
475         extru   $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
476         xor     $t2,$t0,$t0
477          shd    $a3,$a2,$sigma1[0],$a0
478         xor     $t3,$t1,$t1             ; sigma0(X[i+1)&0x0f])
479          shd    $a2,$a3,$sigma1[0],$a1
480         add     $t0,$Xlo,$Xlo
481          shd    $a3,$a2,$sigma1[1],$t2
482         addc    $t1,$Xhi,$Xhi
483          shd    $a2,$a3,$sigma1[1],$t3
484         xor     $t2,$a0,$a0
485         shd     $a3,$a2,$sigma1[2],$t2
486         xor     $t3,$a1,$a1
487         extru   $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
488         xor     $t2,$a0,$a0
489         xor     $t3,$a1,$a1             ; sigma0(X[i+14)&0x0f])
490         add     $a0,$Xlo,$Xlo
491         addc    $a1,$Xhi,$Xhi
492
493         stw     $Xhi,`-$XOFF+8*($i%16)`(%sp)
494         stw     $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
495 ___
496 &ROUND_00_15_pa1($i,@_,1);
497 }
# Entry of the hi/lo-pair path: load the context as sixteen 32-bit words,
# set up the shift amount for realigning the input, and start the per-block
# loop at L$oop_pa1.  The first part of the unaligned case is written out
# here; the rest is generated by the loops that follow.
498 $code.=<<___;
499         ldw     `0*4`($ctx),$Ahi                ; load context
500         ldw     `1*4`($ctx),$Alo
501         ldw     `2*4`($ctx),$Bhi
502         ldw     `3*4`($ctx),$Blo
503         ldw     `4*4`($ctx),$Chi
504         ldw     `5*4`($ctx),$Clo
505         ldw     `6*4`($ctx),$Dhi
506         ldw     `7*4`($ctx),$Dlo
507         ldw     `8*4`($ctx),$Ehi
508         ldw     `9*4`($ctx),$Elo
509         ldw     `10*4`($ctx),$Fhi
510         ldw     `11*4`($ctx),$Flo
511         ldw     `12*4`($ctx),$Ghi
512         ldw     `13*4`($ctx),$Glo
513         ldw     `14*4`($ctx),$Hhi
514         ldw     `15*4`($ctx),$Hlo
515
516         extru   $inp,31,2,$t0
517         sh3addl $t0,%r0,$t0
518         subi    32,$t0,$t0
519         mtctl   $t0,%cr11               ; load %sar with align factor
520
521 L\$oop_pa1
522         extru   $inp,31,2,$a3
523         comib,= 0,$a3,L\$aligned_pa1
524         sub     $inp,$a3,$inp
525
526         ldw     `0*4`($inp),$X[0]
527         ldw     `1*4`($inp),$X[1]
528         ldw     `2*4`($inp),$t2
529         ldw     `3*4`($inp),$t3
530         ldw     `4*4`($inp),$a0
531         ldw     `5*4`($inp),$a1
532         ldw     `6*4`($inp),$a2
533         ldw     `7*4`($inp),$a3
534         vshd    $X[0],$X[1],$X[0]
535         vshd    $X[1],$t2,$X[1]
536         stw     $X[0],`-$XOFF+0*4`(%sp)
537         ldw     `8*4`($inp),$t0
538         vshd    $t2,$t3,$t2
539         stw     $X[1],`-$XOFF+1*4`(%sp)
540         ldw     `9*4`($inp),$t1
541         vshd    $t3,$a0,$t3
542 ___
# Remainder of the unaligned copy.  @t is used as a rotating queue of the
# eight temporaries: each step stores the word at the head of the queue
# into the stack area and merges the following two with vshd.  The first
# loop also reloads the head register with a further input word; the second
# loop emits the tail, where nothing is left to load.  The final store is
# emitted after the branch to L$collected_pa1.
543 {
544 my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
545 for ($i=2;$i<=(128/4-8);$i++) {
546 $code.=<<___;
547         stw     $t[0],`-$XOFF+$i*4`(%sp)
548         ldw     `(8+$i)*4`($inp),$t[0]
549         vshd    $t[1],$t[2],$t[1]
550 ___
551 push(@t,shift(@t));
552 }
553 for (;$i<(128/4-1);$i++) {
554 $code.=<<___;
555         stw     $t[0],`-$XOFF+$i*4`(%sp)
556         vshd    $t[1],$t[2],$t[1]
557 ___
558 push(@t,shift(@t));
559 }
560 $code.=<<___;
561         b       L\$collected_pa1
562         stw     $t[0],`-$XOFF+$i*4`(%sp)
563
564 ___
565 }
# Aligned case: the same copy of the input block into the stack area, but
# without any vshd merging.
566 $code.=<<___;
567 L\$aligned_pa1
568         ldw     `0*4`($inp),$X[0]
569         ldw     `1*4`($inp),$X[1]
570         ldw     `2*4`($inp),$t2
571         ldw     `3*4`($inp),$t3
572         ldw     `4*4`($inp),$a0
573         ldw     `5*4`($inp),$a1
574         ldw     `6*4`($inp),$a2
575         ldw     `7*4`($inp),$a3
576         stw     $X[0],`-$XOFF+0*4`(%sp)
577         ldw     `8*4`($inp),$t0
578         stw     $X[1],`-$XOFF+1*4`(%sp)
579         ldw     `9*4`($inp),$t1
580 ___
# Same rotating queue of temporaries as in the unaligned case: store/load
# pairs while input words remain to be loaded, then plain stores for the
# rest of the 32 words, ending at the label L$collected_pa1.
581 {
582 my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
583 for ($i=2;$i<(128/4-8);$i++) {
584 $code.=<<___;
585         stw     $t[0],`-$XOFF+$i*4`(%sp)
586         ldw     `(8+$i)*4`($inp),$t[0]
587 ___
588 push(@t,shift(@t));
589 }
590 for (;$i<128/4;$i++) {
591 $code.=<<___;
592         stw     $t[0],`-$XOFF+$i*4`(%sp)
593 ___
594 push(@t,shift(@t));
595 }
596 $code.="L\$collected_pa1\n";
597 }
598
# Rounds 0..15 followed by the sixteen-round schedule-expanding loop body
# at L$rounds_pa1.  @V is rotated right by two entries (one hi/lo pair)
# after every round.
599 for($i=0;$i<16;$i++)    { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
600 $code.="L\$rounds_pa1\n";
601 for(;$i<32;$i++)        { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
602
# After the rounds: reload the saved arguments, rewind $Tbl, add the
# previous context to the working variables with add/addc pairs, store the
# result, advance $inp by one block, and either fall out to L$done or save
# $inp and branch back to L$oop_pa1.
603 $code.=<<___;
604         $POP    `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx    ; restore arguments
605         $POP    `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
606         $POP    `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
607         ldo     `-$rounds*$SZ`($Tbl),$Tbl               ; rewind $Tbl
608
609         ldw     `0*4`($ctx),$t1         ; update context
610         ldw     `1*4`($ctx),$t0
611         ldw     `2*4`($ctx),$t3
612         ldw     `3*4`($ctx),$t2
613         ldw     `4*4`($ctx),$a1
614         ldw     `5*4`($ctx),$a0
615         ldw     `6*4`($ctx),$a3
616         add     $t0,$Alo,$Alo
617         ldw     `7*4`($ctx),$a2
618         addc    $t1,$Ahi,$Ahi
619         ldw     `8*4`($ctx),$t1
620         add     $t2,$Blo,$Blo
621         ldw     `9*4`($ctx),$t0
622         addc    $t3,$Bhi,$Bhi
623         ldw     `10*4`($ctx),$t3
624         add     $a0,$Clo,$Clo
625         ldw     `11*4`($ctx),$t2
626         addc    $a1,$Chi,$Chi
627         ldw     `12*4`($ctx),$a1
628         add     $a2,$Dlo,$Dlo
629         ldw     `13*4`($ctx),$a0
630         addc    $a3,$Dhi,$Dhi
631         ldw     `14*4`($ctx),$a3
632         add     $t0,$Elo,$Elo
633         ldw     `15*4`($ctx),$a2
634         addc    $t1,$Ehi,$Ehi
635         stw     $Ahi,`0*4`($ctx)
636         add     $t2,$Flo,$Flo
637         stw     $Alo,`1*4`($ctx)
638         addc    $t3,$Fhi,$Fhi
639         stw     $Bhi,`2*4`($ctx)
640         add     $a0,$Glo,$Glo
641         stw     $Blo,`3*4`($ctx)
642         addc    $a1,$Ghi,$Ghi
643         stw     $Chi,`4*4`($ctx)
644         add     $a2,$Hlo,$Hlo
645         stw     $Clo,`5*4`($ctx)
646         addc    $a3,$Hhi,$Hhi
647         stw     $Dhi,`6*4`($ctx)
648         ldo     `16*$SZ`($inp),$inp     ; advance $inp
649         stw     $Dlo,`7*4`($ctx)
650         stw     $Ehi,`8*4`($ctx)
651         stw     $Elo,`9*4`($ctx)
652         stw     $Fhi,`10*4`($ctx)
653         stw     $Flo,`11*4`($ctx)
654         stw     $Ghi,`12*4`($ctx)
655         stw     $Glo,`13*4`($ctx)
656         stw     $Hhi,`14*4`($ctx)
657         comb,=  $inp,$num,L\$done
658         stw     $Hlo,`15*4`($ctx)
659         b       L\$oop_pa1
660         $PUSH   $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)    ; save $inp
661 L\$done
662 ___
663 }}
# Common function exit: reload %r2 and %r4..%r18 from the slots the
# prologue stored them in, return through %r2, and reload %r3 with a
# base-modifying load that uses displacement -$FRAME.  The identification
# string is emitted after the procedure end.
664 $code.=<<___;
665         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2            ; standard epilogue
666         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
667         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
668         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
669         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
670         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
671         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
672         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
673         $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
674         $POP    `-$FRAME+9*$SIZE_T`(%sp),%r12
675         $POP    `-$FRAME+10*$SIZE_T`(%sp),%r13
676         $POP    `-$FRAME+11*$SIZE_T`(%sp),%r14
677         $POP    `-$FRAME+12*$SIZE_T`(%sp),%r15
678         $POP    `-$FRAME+13*$SIZE_T`(%sp),%r16
679         $POP    `-$FRAME+14*$SIZE_T`(%sp),%r17
680         $POP    `-$FRAME+15*$SIZE_T`(%sp),%r18
681         bv      (%r2)
682         .EXIT
683         $POPMB  -$FRAME(%sp),%r3
684         .PROCEND
685         .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
686 ___
687
688 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
689 # that it can be compiled with .LEVEL 1.0. It should be noted that I
690 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
691 # directive...
692
# Encoder for "ldd".
#
# Arguments: $mod is the completer text that followed the mnemonic (may be
# empty) and $args is the operand text.
# Returns: one line of assembler.  When the operands have the form
# "disp(%rB),%rT" the instruction is returned as a .WORD directive with the
# original text appended as a comment; a completer starting with ",m" sets
# bit 3 of the word and one starting with ",mb" additionally sets bit 2.
# Any other operand form is returned as the plain textual instruction.
693 my $ldd = sub {
694   my ($mod,$args) = @_;
695   my $orig = "ldd$mod\t$args";
696
697     if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
698     {   my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
699         $opcode|=(1<<3) if ($mod =~ /^,m/);
700         $opcode|=(1<<2) if ($mod =~ /^,mb/);
701         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
702     }
703     else { "\t".$orig; }
704 };
705
# Encoder for "std".
#
# Arguments: $mod is the completer text that followed the mnemonic (may be
# empty) and $args is the operand text.
# Returns: one line of assembler.  When the operands have the form
# "%rR,disp(%rB)" the instruction is returned as a .WORD directive with the
# original text appended as a comment; any other operand form is returned
# as the plain textual instruction.
#
# The base-modification completers are encoded exactly as the ldd encoder
# in this file encodes them (",m..." sets bit 3, ",mb" additionally sets
# bit 2), so that "std,ma"/"std,mb" are no longer silently emitted as a
# plain "std" word.
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
    {   my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
        $opcode|=(1<<3) if ($mod =~ /^,m/);     # base register is modified
        $opcode|=(1<<2) if ($mod =~ /^,mb/);    # ...before the access
        sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};
716
# Encoder for "extrd".
#
# Arguments: $mod is the completer text that followed the mnemonic and
# $args is the operand text.
# Returns: one line of assembler.  Two operand forms are turned into a
# .WORD directive (with the original text appended as a comment):
#   "%rS,pos,len,%rT"   - fixed position ("format 15" below);
#   "%rS,%sar,len,%rT"  - position taken from %sar ("format 12" below); for
#                         this form a completer matching /,\**=/ sets
#                         bit 13 of the word.
# The length operand is stored as 32 minus its value in both forms.  Any
# other operand form is returned as the plain textual instruction.
717 my $extrd = sub {
718   my ($mod,$args) = @_;
719   my $orig = "extrd$mod\t$args";
720
721     # I only have ",u" completer, it's implicitly encoded...
722     if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)     # format 15
723     {   my $opcode=(0x36<<26)|($1<<21)|($4<<16);
724         my $len=32-$3;
725         $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);               # encode pos
726         $opcode |= (($len&0x20)<<7)|($len&0x1f);                # encode len
727         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
728     }
729     elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)      # format 12
730     {   my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
731         my $len=32-$2;
732         $opcode |= (($len&0x20)<<3)|($len&0x1f);                # encode len
733         $opcode |= (1<<13) if ($mod =~ /,\**=/);
734         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
735     }
736     else { "\t".$orig; }
737 };
738
# Encoder for "shrpd".
#
# Arguments: $mod is the completer text that followed the mnemonic and
# $args is the operand text.
# Returns: one line of assembler.  Two operand forms are turned into a
# .WORD directive (with the original text appended as a comment):
#   "%rA,%rB,sa,%rT"    - fixed shift amount, stored as 63 minus its value
#                         ("format 14" below);
#   "%rA,%rB,%sar,%rT"  - shift amount taken from %sar ("format 11" below).
# Any other operand form is returned as the plain textual instruction.
# $mod only appears in the comment text; it does not affect the word.
739 my $shrpd = sub {
740   my ($mod,$args) = @_;
741   my $orig = "shrpd$mod\t$args";
742
743     if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)   # format 14
744     {   my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
745         my $cpos=63-$3;
746         $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);         # encode sa
747         sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
748     }
749     elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)    # format 11
750     {   sprintf "\t.WORD\t0x%08x\t; %s",
751                 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
752     }
753     else { "\t".$orig; }
754 };
755
# Dispatch one instruction to its encoder, if there is one.
#
# Arguments: the mnemonic, the completer text that followed it (may be
# empty) and the operand text.
# Returns: one line of assembler.  The variable named after the mnemonic is
# looked up with a string eval; when it holds a code reference (such as the
# ldd/std/extrd/shrpd encoders defined above) that encoder is called with
# ($mod,$args), otherwise the instruction is returned as text with the
# pieces joined by tabs.
756 sub assemble {
757   my ($mnemonic,$mod,$args)=@_;
758   my $opcode = eval("\$$mnemonic");
759
760     ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
761 }
762
# Post-process the accumulated $code line by line and print the result.
#
# For every line:
#   1. each `...` fragment is evaluated as a Perl expression and replaced
#      by its value;
#   2. at most one of the rewrites chained with "or" is applied: "shd" with
#      a rotate amount above 31 gets its register operands swapped and the
#      amount reduced by 32, and the made-up mnemonics _ror, _shr, _align
#      and _shl are replaced by real instructions chosen by $SZ or $SIZE_T;
#   3. when $SIZE_T is 4, the instruction is passed through assemble(),
#      which replaces the ones that have an encoder by a .WORD directive;
#   4. when $SIZE_T is 4, "cmpb,*" is rewritten to "comb,".
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
                $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)        # rotation for >=32
                :       sprintf("shd\t%$1,%$2,%d",$3)/e                 or
        # translate made up instructions: _ror, _shr, _align, _shl
        s/_ror(\s+)(%r[0-9]+),/
                ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e                 or

        s/_shr(\s+%r[0-9]+),([0-9]+),/
                $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
                :        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e   or

        s/_align(\s+%r[0-9]+,%r[0-9]+),/
                ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e                or

        s/_shl(\s+%r[0-9]+),([0-9]+),/
                $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
                :            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;

        s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);

        s/cmpb,\*/comb,/ if ($SIZE_T==4);

        print $_,"\n";
}

# Output is buffered, so a write error (a full disk, for instance) may only
# be reported by close.  Fail loudly rather than leave a truncated assembly
# file behind with a successful exit status.
close STDOUT or die "error closing STDOUT: $!";
790
791 close STDOUT;