PA-RISC assembly pack: make it work with GNU assembler for HP-UX.
[openssl.git] / crypto / sha / asm / sha512-parisc.pl
1 #! /usr/bin/env perl
2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # SHA256/512 block procedure for PA-RISC.
18
19 # June 2009.
20 #
21 # SHA256 performance is >75% better than gcc 3.2 generated code on
22 # PA-7100LC. Compared to code generated by vendor compiler this
23 # implementation is almost 70% faster in 64-bit build, but delivers
24 # virtually same performance in 32-bit build on PA-8600.
25 #
26 # SHA512 performance is >2.9x better than gcc 3.2 generated code on
27 # PA-7100LC, a PA-RISC 1.1 processor. The implementation also detects
28 # whether the code is executed on a PA-RISC 2.0 processor and, if so,
29 # switches to the 64-bit code path, delivering adequate performance even
30 # in a "blended" 32-bit build. The 64-bit code is, however, not any
31 # faster than code generated by the vendor compiler on PA-8600...
32 #
33 # Special thanks to polarhome.com for providing HP-UX account.
34
# Command line: <flavour> <output-file>.
# $flavour selects the ABI: anything matching /64/ yields the 64-bit
# build, everything else the 32-bit one.  $output is the file the
# generated assembly is written to; its name also selects the SHA-512
# vs. SHA-256 variant further down (see the /512/ match).
$flavour = shift;
$output = shift;

# Three-argument open keeps special characters in the file name from
# being interpreted as part of the mode, and a failed open is fatal
# instead of silently discarding the generated code (a failed re-open
# leaves STDOUT closed).  Without an output argument STDOUT is left
# untouched so that the assembly can simply be piped.
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
38
# ABI-dependent parameters: assembler level, size of a pointer/saved
# register, size of the frame marker, offset of the saved return pointer
# below %sp, and the store/load mnemonics used by prologue and epilogue.
($LEVEL, $SIZE_T, $FRAME_MARKER, $SAVED_RP) =
    $flavour =~ /64/ ? ("2.0W", 8, 80, 16)
                     : ("1.0",  4, 48, 20);

($PUSH, $PUSHMA, $POP, $POPMB) =
    $SIZE_T == 8 ? ("std", "std,ma", "ldd", "ldd,mb")
                 : ("stw", "stwm",   "ldw", "ldwm");
58
# Hash-dependent parameters, selected by the output file name: a name
# containing "512" produces the SHA-512 module, anything else SHA-256.
# $SZ is the word size in bytes.
$SZ = ($output =~ /512/) ? 8 : 4;

$func   = $SZ == 8 ? "sha512_block_data_order" : "sha256_block_data_order";
$rounds = $SZ == 8 ? 80 : 64;

# Rotate/shift amounts of the Sigma and sigma functions.
@Sigma0 = $SZ == 8 ? (28,34,39) : ( 2,13,22);
@Sigma1 = $SZ == 8 ? (14,18,41) : ( 6,11,25);
@sigma0 = $SZ == 8 ? ( 1, 8, 7) : ( 7,18, 3);
@sigma1 = $SZ == 8 ? (19,61, 6) : (17,19,10);

# Low 10 bits of the last entry of the constant table; the generated
# code compares against this value to detect the end of the table.
$LAST10BITS = $SZ == 8 ? 0x017 : 0x0f2;

# Load, load-and-advance and store mnemonics for one hash word.
($LD, $LDM, $ST) = $SZ == 8 ? ("ldd", "ldd,ma", "std")
                            : ("ldw", "ldwm",   "stw");
84
# Stack frame layout.  The frame holds 16 saved registers, the frame
# marker [+ argument transfer] and 16*$SZ+32 bytes of local variables.
# $XOFF ends up as the distance between %sp and the local variables.
$XOFF  = 16*$SZ + 32;
$FRAME = 16*$SIZE_T + $FRAME_MARKER + $XOFF;
$XOFF += $FRAME_MARKER;

# Incoming arguments.  Their registers are shared with the scratch
# registers below: $ctx is zapped by $a0, $inp by $a1 and $num by $t0.
($ctx, $inp, $num) = ("%r26", "%r25", "%r24");

# Scratch registers and the pointer into the constant table.
($a0, $a1, $t0, $t1, $Tbl) = ("%r26", "%r25", "%r24", "%r29", "%r31");

# Working variables a..h, also kept as the list @V.
($A,$B,$C,$D,$E,$F,$G,$H) = map { "%r$_" } (17..23, 28);
@V = ($A,$B,$C,$D,$E,$F,$G,$H);

# Input words X[0..15] live in %r1..%r16; the 17th slot reuses $inp and
# receives the extra word that is loaded for aligning the input.
@X = ((map { "%r$_" } (1..16)), $inp);
105
# Append to $code the instruction sequence for one SHA round.
#
# Arguments: the round index $i, followed by the eight registers that
# currently play the roles of a..h (the callers rotate @V between rounds).
#
# The emitted code expects $t1 to have been loaded from $Tbl beforehand.
# It adds $t1, X[$i%16], Ch(e,f,g) and Sigma1(e) to h, adds h to d, and
# then adds Sigma0(a) and Maj(a,b,c) to h.  For $i<15 it also loads the
# next table entry into $t1.
#
# "_ror" is a made-up mnemonic and the back-quoted fragments are Perl
# expressions; both are resolved by the post-processing loop at the end
# of this file.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
        _ror    $e,$Sigma1[0],$a0
        and     $f,$e,$t0
        _ror    $e,$Sigma1[1],$a1
        addl    $t1,$h,$h
        andcm   $g,$e,$t1
        xor     $a1,$a0,$a0
        _ror    $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
        or      $t0,$t1,$t1             ; Ch(e,f,g)
        addl    @X[$i%16],$h,$h
        xor     $a0,$a1,$a1             ; Sigma1(e)
        addl    $t1,$h,$h
        _ror    $a,$Sigma0[0],$a0
        addl    $a1,$h,$h

        _ror    $a,$Sigma0[1],$a1
        and     $a,$b,$t0
        and     $a,$c,$t1
        xor     $a1,$a0,$a0
        _ror    $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
        xor     $t1,$t0,$t0
        and     $b,$c,$t1
        xor     $a0,$a1,$a1             ; Sigma0(a)
        addl    $h,$d,$d
        xor     $t1,$t0,$t0             ; Maj(a,b,c)
        `"$LDM  $SZ($Tbl),$t1" if ($i<15)`
        addl    $a1,$h,$h
        addl    $t0,$h,$h

___
}
139
# Append to $code one round that first updates the message schedule.
#
# Arguments are the same as for ROUND_00_15, with $i in the range 16..31
# (it is reduced by 16 to index @X).
#
# The emitted code adds X[(i+9)%16], sigma0(X[(i+1)%16]) and
# sigma1(X[(i+14)%16]) to X[i], and loads the next table entry into $t1.
# For the last round of a batch ($i==15 after the adjustment) it also
# compares the low 10 bits of the value just loaded with $LAST10BITS;
# the conditional "ldo 1($Tbl),$Tbl" marks the end of the table, which
# the main loop tests with "bb" on bit 31 of $Tbl.
# NOTE(review): presumably comiclr,<> nullifies that ldo unless the two
# values are equal -- confirm against the PA-RISC instruction reference.
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
        _ror    @X[($i+1)%16],$sigma0[0],$a0
        _ror    @X[($i+1)%16],$sigma0[1],$a1
        addl    @X[($i+9)%16],@X[$i],@X[$i]
        _ror    @X[($i+14)%16],$sigma1[0],$t0
        _ror    @X[($i+14)%16],$sigma1[1],$t1
        xor     $a1,$a0,$a0
        _shr    @X[($i+1)%16],$sigma0[2],$a1
        xor     $t1,$t0,$t0
        _shr    @X[($i+14)%16],$sigma1[2],$t1
        xor     $a1,$a0,$a0             ; sigma0(X[(i+1)&0x0f])
        xor     $t1,$t0,$t0             ; sigma1(X[(i+14)&0x0f])
        $LDM    $SZ($Tbl),$t1
        addl    $a0,@X[$i],@X[$i]
        addl    $t0,@X[$i],@X[$i]
___
$code.=<<___ if ($i==15);
        extru   $t1,31,10,$a1
        comiclr,<> $LAST10BITS,$a1,%r0
        ldo     1($Tbl),$Tbl            ; signal end of $Tbl
___
# Passing $i+16 (>=16) suppresses the table load inside ROUND_00_15,
# because the next entry has already been loaded above.
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}
166
167 $code=<<___;
168         .LEVEL  $LEVEL
169         .SPACE  \$TEXT\$
170         .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
171
172         .ALIGN  64
173 L\$table
174 ___
175 $code.=<<___ if ($SZ==8);
176         .WORD   0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
177         .WORD   0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
178         .WORD   0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
179         .WORD   0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
180         .WORD   0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
181         .WORD   0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
182         .WORD   0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
183         .WORD   0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
184         .WORD   0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
185         .WORD   0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
186         .WORD   0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
187         .WORD   0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
188         .WORD   0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
189         .WORD   0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
190         .WORD   0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
191         .WORD   0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
192         .WORD   0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
193         .WORD   0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
194         .WORD   0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
195         .WORD   0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
196         .WORD   0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
197         .WORD   0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
198         .WORD   0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
199         .WORD   0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
200         .WORD   0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
201         .WORD   0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
202         .WORD   0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
203         .WORD   0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
204         .WORD   0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
205         .WORD   0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
206         .WORD   0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
207         .WORD   0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
208         .WORD   0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
209         .WORD   0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
210         .WORD   0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
211         .WORD   0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
212         .WORD   0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
213         .WORD   0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
214         .WORD   0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
215         .WORD   0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
216 ___
217 $code.=<<___ if ($SZ==4);
218         .WORD   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
219         .WORD   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
220         .WORD   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
221         .WORD   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
222         .WORD   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
223         .WORD   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
224         .WORD   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
225         .WORD   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
226         .WORD   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
227         .WORD   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
228         .WORD   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
229         .WORD   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
230         .WORD   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
231         .WORD   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
232         .WORD   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
233         .WORD   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
234 ___
235 $code.=<<___;
236
237         .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
238         .ALIGN  64
239 $func
240         .PROC
241         .CALLINFO       FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
242         .ENTRY
243         $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
244         $PUSHMA %r3,$FRAME(%sp)
245         $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
246         $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
247         $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
248         $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
249         $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
250         $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
251         $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
252         $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
253         $PUSH   %r12,`-$FRAME+9*$SIZE_T`(%sp)
254         $PUSH   %r13,`-$FRAME+10*$SIZE_T`(%sp)
255         $PUSH   %r14,`-$FRAME+11*$SIZE_T`(%sp)
256         $PUSH   %r15,`-$FRAME+12*$SIZE_T`(%sp)
257         $PUSH   %r16,`-$FRAME+13*$SIZE_T`(%sp)
258         $PUSH   %r17,`-$FRAME+14*$SIZE_T`(%sp)
259         $PUSH   %r18,`-$FRAME+15*$SIZE_T`(%sp)
260
261         _shl    $num,`log(16*$SZ)/log(2)`,$num
262         addl    $inp,$num,$num          ; $num to point at the end of $inp
263
264         $PUSH   $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)    ; save arguments
265         $PUSH   $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
266         $PUSH   $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
267
268         blr     %r0,$Tbl
269         ldi     3,$t1
270 L\$pic
271         andcm   $Tbl,$t1,$Tbl           ; wipe privilege level
272         ldo     L\$table-L\$pic($Tbl),$Tbl
273 ___
274 $code.=<<___ if ($SZ==8 && $SIZE_T==4);
275         ldi     31,$t1
276         mtctl   $t1,%cr11
277         extrd,u,*= $t1,%sar,1,$t1       ; executes on PA-RISC 1.0
278         b       L\$parisc1
279         nop
280 ___
281 $code.=<<___;
282         $LD     `0*$SZ`($ctx),$A        ; load context
283         $LD     `1*$SZ`($ctx),$B
284         $LD     `2*$SZ`($ctx),$C
285         $LD     `3*$SZ`($ctx),$D
286         $LD     `4*$SZ`($ctx),$E
287         $LD     `5*$SZ`($ctx),$F
288         $LD     `6*$SZ`($ctx),$G
289         $LD     `7*$SZ`($ctx),$H
290
291         extru   $inp,31,`log($SZ)/log(2)`,$t0
292         sh3addl $t0,%r0,$t0
293         subi    `8*$SZ`,$t0,$t0
294         mtctl   $t0,%cr11               ; load %sar with align factor
295
296 L\$oop
297         ldi     `$SZ-1`,$t0
298         $LDM    $SZ($Tbl),$t1
299         andcm   $inp,$t0,$t0            ; align $inp
300 ___
301         for ($i=0;$i<15;$i++) {         # load input block
302         $code.="\t$LD   `$SZ*$i`($t0),@X[$i]\n";                }
303 $code.=<<___;
304         cmpb,*= $inp,$t0,L\$aligned
305         $LD     `$SZ*15`($t0),@X[15]
306         $LD     `$SZ*16`($t0),@X[16]
307 ___
308         for ($i=0;$i<16;$i++) {         # align data
309         $code.="\t_align        @X[$i],@X[$i+1],@X[$i]\n";      }
310 $code.=<<___;
311 L\$aligned
312         nop     ; otherwise /usr/ccs/bin/as is confused by below .WORD
313 ___
314
315 for($i=0;$i<16;$i++)    { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
316 $code.=<<___;
317 L\$rounds
318         nop     ; otherwise /usr/ccs/bin/as is confused by below .WORD
319 ___
320 for(;$i<32;$i++)        { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
321 $code.=<<___;
322         bb,>=   $Tbl,31,L\$rounds       ; end of $Tbl signalled?
323         nop
324
325         $POP    `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx    ; restore arguments
326         $POP    `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
327         $POP    `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
328         ldo     `-$rounds*$SZ-1`($Tbl),$Tbl             ; rewind $Tbl
329
330         $LD     `0*$SZ`($ctx),@X[0]     ; load context
331         $LD     `1*$SZ`($ctx),@X[1]
332         $LD     `2*$SZ`($ctx),@X[2]
333         $LD     `3*$SZ`($ctx),@X[3]
334         $LD     `4*$SZ`($ctx),@X[4]
335         $LD     `5*$SZ`($ctx),@X[5]
336         addl    @X[0],$A,$A
337         $LD     `6*$SZ`($ctx),@X[6]
338         addl    @X[1],$B,$B
339         $LD     `7*$SZ`($ctx),@X[7]
340         ldo     `16*$SZ`($inp),$inp     ; advance $inp
341
342         $ST     $A,`0*$SZ`($ctx)        ; save context
343         addl    @X[2],$C,$C
344         $ST     $B,`1*$SZ`($ctx)
345         addl    @X[3],$D,$D
346         $ST     $C,`2*$SZ`($ctx)
347         addl    @X[4],$E,$E
348         $ST     $D,`3*$SZ`($ctx)
349         addl    @X[5],$F,$F
350         $ST     $E,`4*$SZ`($ctx)
351         addl    @X[6],$G,$G
352         $ST     $F,`5*$SZ`($ctx)
353         addl    @X[7],$H,$H
354         $ST     $G,`6*$SZ`($ctx)
355         $ST     $H,`7*$SZ`($ctx)
356
357         cmpb,*<>,n $inp,$num,L\$oop
358         $PUSH   $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)    ; save $inp
359 ___
360 if ($SZ==8 && $SIZE_T==4)       # SHA512 for 32-bit PA-RISC 1.0
361 {{
362 $code.=<<___;
363         b       L\$done
364         nop
365
366         .ALIGN  64
367 L\$parisc1
368 ___
369
370 @V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
371       $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) =
372    ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
373      "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
374 $a0 ="%r17";
375 $a1 ="%r18";
376 $a2 ="%r19";
377 $a3 ="%r20";
378 $t0 ="%r21";
379 $t1 ="%r22";
380 $t2 ="%r28";
381 $t3 ="%r29";
382 $Tbl="%r31";
383
384 @X=("%r23","%r24","%r25","%r26");       # zaps $num,$inp,$ctx
385
# Append to $code one SHA-512 round for the 32-bit-only code path, where
# every 64-bit quantity is held in a hi/lo pair of 32-bit registers.
#
# Arguments: the round index $i, the sixteen registers holding the hi/lo
# halves of a..h, and an optional $flag.
#   - Without $flag the code first loads X[i+1] from the stack area at
#     -$XOFF(%sp) into the "next" register pair.
#   - With $flag (as passed by ROUND_16_xx_pa1) and $i==15, the code ends
#     with a check of the low 10 bits of the table word just loaded
#     against $LAST10BITS and a branch back to L$rounds_pa1.
#
# 64-bit additions are emitted as add/addc pairs and rotations as pairs
# of shd instructions; shift counts above 31 are fixed up by the
# post-processing loop at the end of this file.  The X register pair is
# reused for K[i] once X[i] has been added to h.
sub ROUND_00_15_pa1 {
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;

$code.=<<___ if (!$flag);
        ldw     `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
        ldw     `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo     ; load X[i+1]
___
$code.=<<___;
        shd     $ehi,$elo,$Sigma1[0],$t0
         add    $Xlo,$hlo,$hlo
        shd     $elo,$ehi,$Sigma1[0],$t1
         addc   $Xhi,$hhi,$hhi          ; h += X[i]
        shd     $ehi,$elo,$Sigma1[1],$t2
         ldwm   8($Tbl),$Xhi
        shd     $elo,$ehi,$Sigma1[1],$t3
         ldw    -4($Tbl),$Xlo           ; load K[i]
        xor     $t2,$t0,$t0
        xor     $t3,$t1,$t1
         and    $flo,$elo,$a0
         and    $fhi,$ehi,$a1
        shd     $ehi,$elo,$Sigma1[2],$t2
         andcm  $glo,$elo,$a2
        shd     $elo,$ehi,$Sigma1[2],$t3
         andcm  $ghi,$ehi,$a3
        xor     $t2,$t0,$t0
        xor     $t3,$t1,$t1             ; Sigma1(e)
        add     $Xlo,$hlo,$hlo
         xor    $a2,$a0,$a0
        addc    $Xhi,$hhi,$hhi          ; h += K[i]
         xor    $a3,$a1,$a1             ; Ch(e,f,g)

         add    $t0,$hlo,$hlo
        shd     $ahi,$alo,$Sigma0[0],$t0
         addc   $t1,$hhi,$hhi           ; h += Sigma1(e)
        shd     $alo,$ahi,$Sigma0[0],$t1
         add    $a0,$hlo,$hlo
        shd     $ahi,$alo,$Sigma0[1],$t2
         addc   $a1,$hhi,$hhi           ; h += Ch(e,f,g)
        shd     $alo,$ahi,$Sigma0[1],$t3

        xor     $t2,$t0,$t0
        xor     $t3,$t1,$t1
        shd     $ahi,$alo,$Sigma0[2],$t2
        and     $alo,$blo,$a0
        shd     $alo,$ahi,$Sigma0[2],$t3
        and     $ahi,$bhi,$a1
        xor     $t2,$t0,$t0
        xor     $t3,$t1,$t1             ; Sigma0(a)

        and     $alo,$clo,$a2
        and     $ahi,$chi,$a3
        xor     $a2,$a0,$a0
         add    $hlo,$dlo,$dlo
        xor     $a3,$a1,$a1
         addc   $hhi,$dhi,$dhi          ; d += h
        and     $blo,$clo,$a2
         add    $t0,$hlo,$hlo
        and     $bhi,$chi,$a3
         addc   $t1,$hhi,$hhi           ; h += Sigma0(a)
        xor     $a2,$a0,$a0
         add    $a0,$hlo,$hlo
        xor     $a3,$a1,$a1             ; Maj(a,b,c)
         addc   $a1,$hhi,$hhi           ; h += Maj(a,b,c)

___
$code.=<<___ if ($i==15 && $flag);
        extru   $Xlo,31,10,$Xlo
        comiclr,= $LAST10BITS,$Xlo,%r0
        b       L\$rounds_pa1
        nop
___
# Rotate @X by one register pair, so that the pair loaded as "next" in
# this round is the current X pair of the following round.
push(@X,shift(@X)); push(@X,shift(@X));
}
461
# Append to $code one SHA-512 round of the 32-bit-only code path that
# first updates the message schedule kept on the stack.
#
# Arguments: the round index (16..31; reduced by 16 below to address the
# 16-entry schedule), followed by the sixteen hi/lo registers of a..h,
# which are handed on unchanged to ROUND_00_15_pa1.
#
# The emitted code loads X[i+1], X[i+9] and X[i+14] from -$XOFF(%sp),
# adds X[i+9], sigma0(X[i+1]) and sigma1(X[i+14]) to the current X[i]
# register pair and stores the result back to its stack slot.  The
# trailing call emits the round proper; its flag argument of 1 tells
# ROUND_00_15_pa1 not to reload X[i+1].
#
# The two sigma comments in the emitted assembly were corrected (the
# second one used to say sigma0, and both had unbalanced brackets) to
# match the corresponding comments emitted by ROUND_16_xx.
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;
$i-=16;
$code.=<<___;
        ldw     `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
        ldw     `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo     ; load X[i+1]
        ldw     `-$XOFF+8*(($i+9)%16)`(%sp),$a1
        ldw     `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0       ; load X[i+9]
        ldw     `-$XOFF+8*(($i+14)%16)`(%sp),$a3
        ldw     `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2      ; load X[i+14]
        shd     $Xnhi,$Xnlo,$sigma0[0],$t0
        shd     $Xnlo,$Xnhi,$sigma0[0],$t1
         add    $a0,$Xlo,$Xlo
        shd     $Xnhi,$Xnlo,$sigma0[1],$t2
         addc   $a1,$Xhi,$Xhi
        shd     $Xnlo,$Xnhi,$sigma0[1],$t3
        xor     $t2,$t0,$t0
        shd     $Xnhi,$Xnlo,$sigma0[2],$t2
        xor     $t3,$t1,$t1
        extru   $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
        xor     $t2,$t0,$t0
         shd    $a3,$a2,$sigma1[0],$a0
        xor     $t3,$t1,$t1             ; sigma0(X[(i+1)&0x0f])
         shd    $a2,$a3,$sigma1[0],$a1
        add     $t0,$Xlo,$Xlo
         shd    $a3,$a2,$sigma1[1],$t2
        addc    $t1,$Xhi,$Xhi
         shd    $a2,$a3,$sigma1[1],$t3
        xor     $t2,$a0,$a0
        shd     $a3,$a2,$sigma1[2],$t2
        xor     $t3,$a1,$a1
        extru   $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
        xor     $t2,$a0,$a0
        xor     $t3,$a1,$a1             ; sigma1(X[(i+14)&0x0f])
        add     $a0,$Xlo,$Xlo
        addc    $a1,$Xhi,$Xhi

        stw     $Xhi,`-$XOFF+8*($i%16)`(%sp)
        stw     $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
&ROUND_00_15_pa1($i,@_,1);
}
505 $code.=<<___;
506         ldw     `0*4`($ctx),$Ahi                ; load context
507         ldw     `1*4`($ctx),$Alo
508         ldw     `2*4`($ctx),$Bhi
509         ldw     `3*4`($ctx),$Blo
510         ldw     `4*4`($ctx),$Chi
511         ldw     `5*4`($ctx),$Clo
512         ldw     `6*4`($ctx),$Dhi
513         ldw     `7*4`($ctx),$Dlo
514         ldw     `8*4`($ctx),$Ehi
515         ldw     `9*4`($ctx),$Elo
516         ldw     `10*4`($ctx),$Fhi
517         ldw     `11*4`($ctx),$Flo
518         ldw     `12*4`($ctx),$Ghi
519         ldw     `13*4`($ctx),$Glo
520         ldw     `14*4`($ctx),$Hhi
521         ldw     `15*4`($ctx),$Hlo
522
523         extru   $inp,31,2,$t0
524         sh3addl $t0,%r0,$t0
525         subi    32,$t0,$t0
526         mtctl   $t0,%cr11               ; load %sar with align factor
527
528 L\$oop_pa1
529         extru   $inp,31,2,$a3
530         comib,= 0,$a3,L\$aligned_pa1
531         sub     $inp,$a3,$inp
532
533         ldw     `0*4`($inp),$X[0]
534         ldw     `1*4`($inp),$X[1]
535         ldw     `2*4`($inp),$t2
536         ldw     `3*4`($inp),$t3
537         ldw     `4*4`($inp),$a0
538         ldw     `5*4`($inp),$a1
539         ldw     `6*4`($inp),$a2
540         ldw     `7*4`($inp),$a3
541         vshd    $X[0],$X[1],$X[0]
542         vshd    $X[1],$t2,$X[1]
543         stw     $X[0],`-$XOFF+0*4`(%sp)
544         ldw     `8*4`($inp),$t0
545         vshd    $t2,$t3,$t2
546         stw     $X[1],`-$XOFF+1*4`(%sp)
547         ldw     `9*4`($inp),$t1
548         vshd    $t3,$a0,$t3
549 ___
550 {
551 my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
552 for ($i=2;$i<=(128/4-8);$i++) {
553 $code.=<<___;
554         stw     $t[0],`-$XOFF+$i*4`(%sp)
555         ldw     `(8+$i)*4`($inp),$t[0]
556         vshd    $t[1],$t[2],$t[1]
557 ___
558 push(@t,shift(@t));
559 }
560 for (;$i<(128/4-1);$i++) {
561 $code.=<<___;
562         stw     $t[0],`-$XOFF+$i*4`(%sp)
563         vshd    $t[1],$t[2],$t[1]
564 ___
565 push(@t,shift(@t));
566 }
567 $code.=<<___;
568         b       L\$collected_pa1
569         stw     $t[0],`-$XOFF+$i*4`(%sp)
570
571 ___
572 }
573 $code.=<<___;
574 L\$aligned_pa1
575         ldw     `0*4`($inp),$X[0]
576         ldw     `1*4`($inp),$X[1]
577         ldw     `2*4`($inp),$t2
578         ldw     `3*4`($inp),$t3
579         ldw     `4*4`($inp),$a0
580         ldw     `5*4`($inp),$a1
581         ldw     `6*4`($inp),$a2
582         ldw     `7*4`($inp),$a3
583         stw     $X[0],`-$XOFF+0*4`(%sp)
584         ldw     `8*4`($inp),$t0
585         stw     $X[1],`-$XOFF+1*4`(%sp)
586         ldw     `9*4`($inp),$t1
587 ___
588 {
589 my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
590 for ($i=2;$i<(128/4-8);$i++) {
591 $code.=<<___;
592         stw     $t[0],`-$XOFF+$i*4`(%sp)
593         ldw     `(8+$i)*4`($inp),$t[0]
594 ___
595 push(@t,shift(@t));
596 }
597 for (;$i<128/4;$i++) {
598 $code.=<<___;
599         stw     $t[0],`-$XOFF+$i*4`(%sp)
600 ___
601 push(@t,shift(@t));
602 }
603 $code.="L\$collected_pa1\n";
604 }
605
606 for($i=0;$i<16;$i++)    { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
607 $code.="L\$rounds_pa1\n";
608 for(;$i<32;$i++)        { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
609
610 $code.=<<___;
611         $POP    `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx    ; restore arguments
612         $POP    `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
613         $POP    `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
614         ldo     `-$rounds*$SZ`($Tbl),$Tbl               ; rewind $Tbl
615
616         ldw     `0*4`($ctx),$t1         ; update context
617         ldw     `1*4`($ctx),$t0
618         ldw     `2*4`($ctx),$t3
619         ldw     `3*4`($ctx),$t2
620         ldw     `4*4`($ctx),$a1
621         ldw     `5*4`($ctx),$a0
622         ldw     `6*4`($ctx),$a3
623         add     $t0,$Alo,$Alo
624         ldw     `7*4`($ctx),$a2
625         addc    $t1,$Ahi,$Ahi
626         ldw     `8*4`($ctx),$t1
627         add     $t2,$Blo,$Blo
628         ldw     `9*4`($ctx),$t0
629         addc    $t3,$Bhi,$Bhi
630         ldw     `10*4`($ctx),$t3
631         add     $a0,$Clo,$Clo
632         ldw     `11*4`($ctx),$t2
633         addc    $a1,$Chi,$Chi
634         ldw     `12*4`($ctx),$a1
635         add     $a2,$Dlo,$Dlo
636         ldw     `13*4`($ctx),$a0
637         addc    $a3,$Dhi,$Dhi
638         ldw     `14*4`($ctx),$a3
639         add     $t0,$Elo,$Elo
640         ldw     `15*4`($ctx),$a2
641         addc    $t1,$Ehi,$Ehi
642         stw     $Ahi,`0*4`($ctx)
643         add     $t2,$Flo,$Flo
644         stw     $Alo,`1*4`($ctx)
645         addc    $t3,$Fhi,$Fhi
646         stw     $Bhi,`2*4`($ctx)
647         add     $a0,$Glo,$Glo
648         stw     $Blo,`3*4`($ctx)
649         addc    $a1,$Ghi,$Ghi
650         stw     $Chi,`4*4`($ctx)
651         add     $a2,$Hlo,$Hlo
652         stw     $Clo,`5*4`($ctx)
653         addc    $a3,$Hhi,$Hhi
654         stw     $Dhi,`6*4`($ctx)
655         ldo     `16*$SZ`($inp),$inp     ; advance $inp
656         stw     $Dlo,`7*4`($ctx)
657         stw     $Ehi,`8*4`($ctx)
658         stw     $Elo,`9*4`($ctx)
659         stw     $Fhi,`10*4`($ctx)
660         stw     $Flo,`11*4`($ctx)
661         stw     $Ghi,`12*4`($ctx)
662         stw     $Glo,`13*4`($ctx)
663         stw     $Hhi,`14*4`($ctx)
664         comb,=  $inp,$num,L\$done
665         stw     $Hlo,`15*4`($ctx)
666         b       L\$oop_pa1
667         $PUSH   $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)    ; save $inp
668 L\$done
669 ___
670 }}
671 $code.=<<___;
672         $POP    `-$FRAME-$SAVED_RP`(%sp),%r2            ; standard epilogue
673         $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
674         $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
675         $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
676         $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
677         $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
678         $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
679         $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
680         $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
681         $POP    `-$FRAME+9*$SIZE_T`(%sp),%r12
682         $POP    `-$FRAME+10*$SIZE_T`(%sp),%r13
683         $POP    `-$FRAME+11*$SIZE_T`(%sp),%r14
684         $POP    `-$FRAME+12*$SIZE_T`(%sp),%r15
685         $POP    `-$FRAME+13*$SIZE_T`(%sp),%r16
686         $POP    `-$FRAME+14*$SIZE_T`(%sp),%r17
687         $POP    `-$FRAME+15*$SIZE_T`(%sp),%r18
688         bv      (%r2)
689         .EXIT
690         $POPMB  -$FRAME(%sp),%r3
691         .PROCEND
692         .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
693 ___
694
695 # Explicitly encode PA-RISC 2.0 instructions used in this module, so
696 # that it can be compiled with .LEVEL 1.0. It should be noted that I
697 # wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
698 # directive...
699
# Encoder for "ldd".  Takes the completer string ($mod, e.g. ",ma") and
# the operand field ($args).  Operands of the form disp(%rB),%rT are
# turned into a .WORD directive holding the instruction word, with the
# original text kept as a trailing comment; anything else is returned
# as plain assembler text.
my $ldd = sub {
    my ($mod, $args) = @_;
    my $text = "ldd$mod\t$args";

    # format 3 suffices
    my ($disp, $base, $target) =
        $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/
        or return "\t" . $text;

    my $word = 0x14 << 26;
    $word |= $base   << 21;
    $word |= $target << 16;
    $word |= ($disp & 0x1FF8) << 1;     # displacement bits 3..12
    $word |= ($disp >> 13) & 1;         # displacement bit 13 goes last
    $word |= 1 << 3 if $mod =~ /^,m/;   # any modify completer
    $word |= 1 << 2 if $mod =~ /^,mb/;  # ",mb" additionally sets this bit

    return sprintf("\t.WORD\t0x%08x\t; %s", $word, $text);
};
712
# Encoder for "std".  Takes the completer string ($mod) and the operand
# field ($args).  Operands of the form %rS,disp(%rB) are turned into a
# .WORD directive holding the instruction word, with the original text
# kept as a trailing comment; anything else is returned as plain
# assembler text.  $mod only appears in that text, not in the encoding.
my $std = sub {
    my ($mod, $args) = @_;
    my $text = "std$mod\t$args";

    # format 3 suffices
    my ($source, $disp, $base) =
        $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/
        or return "\t" . $text;

    my $word = 0x1c << 26;
    $word |= $base   << 21;
    $word |= $source << 16;
    $word |= ($disp & 0x1FF8) << 1;     # displacement bits 3..12
    $word |= ($disp >> 13) & 1;         # displacement bit 13 goes last

    return sprintf("\t.WORD\t0x%08x\t; %s", $word, $text);
};
723
# Encoder for "extrd".  Takes the completer string ($mod) and the operand
# field ($args) and handles two operand shapes:
#   %rS,pos,len,%rT   - fixed position (format 15)
#   %rS,%sar,len,%rT  - position taken from %sar (format 12)
# Either is returned as a .WORD directive with the original text kept as
# a trailing comment; any other operand shape is returned as plain
# assembler text.
my $extrd = sub {
    my ($mod, $args) = @_;
    my $text = "extrd$mod\t$args";
    my $word;

    # Only the ",u" completer is in use; it is implicitly encoded.
    if (my ($src, $pos, $width, $dst) =
            $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
        # format 15
        my $clen = 32 - $width;
        $word  = (0x36 << 26) | ($src << 21) | ($dst << 16);
        $word |= (($pos  & 0x20) << 6) | (($pos & 0x1f) << 5);  # pos
        $word |= (($clen & 0x20) << 7) | ($clen & 0x1f);        # len
    }
    elsif (my ($vsrc, $vwidth, $vdst) =
               $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
        # format 12
        my $clen = 32 - $vwidth;
        $word  = (0x34 << 26) | ($vsrc << 21) | ($vdst << 16);
        $word |= (2 << 11) | (1 << 9);
        $word |= (($clen & 0x20) << 3) | ($clen & 0x1f);        # len
        $word |= 1 << 13 if $mod =~ /,\**=/;                    # "=" condition
    }
    else {
        return "\t" . $text;
    }

    return sprintf("\t.WORD\t0x%08x\t; %s", $word, $text);
};
745
# Encoder for "shrpd".  Takes the completer string ($mod) and the operand
# field ($args) and handles two operand shapes:
#   %rA,%rB,count,%rT - literal shift count (format 14)
#   %rA,%rB,%sar,%rT  - shift count taken from %sar (format 11)
# Either is returned as a .WORD directive with the original text kept as
# a trailing comment; any other operand shape is returned as plain
# assembler text.
my $shrpd = sub {
    my ($mod, $args) = @_;
    my $text = "shrpd$mod\t$args";
    my $word;

    if (my ($first, $second, $count, $target) =
            $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {
        # format 14
        my $cpos = 63 - $count;
        $word  = (0x34 << 26) | ($second << 21) | ($first << 16);
        $word |= (1 << 10) | $target;
        $word |= (($cpos & 0x20) << 6) | (($cpos & 0x1f) << 5); # sa
    }
    elsif (my ($vfirst, $vsecond, $vtarget) =
               $args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {
        # format 11
        $word  = (0x34 << 26) | ($vsecond << 21) | ($vfirst << 16);
        $word |= (1 << 9) | $vtarget;
    }
    else {
        return "\t" . $text;
    }

    return sprintf("\t.WORD\t0x%08x\t; %s", $word, $text);
};
762
# Translate one instruction into its final textual form.
#
#   $mnemonic - instruction name (the caller matches lower-case letters)
#   $mod      - completer string directly following the mnemonic, may be ""
#   $args     - operand field
#
# Returns the .WORD-encoded line produced by the matching encoder closure
# defined above ($ldd, $std, $extrd or $shrpd).  Any other mnemonic is
# passed through unchanged as "\t$mnemonic$mod\t$args".
#
# The encoder is picked from an explicit table instead of string-eval'ing
# "\$$mnemonic": only the four intended closures can ever be selected and
# no Perl source is compiled for every line of output.
sub assemble {
    my ($mnemonic,$mod,$args)=@_;
    my %encoder_for = (
        ldd   => $ldd,
        std   => $std,
        extrd => $extrd,
        shrpd => $shrpd,
    );
    my $encoder = $encoder_for{$mnemonic};

    return ref($encoder) eq 'CODE' ? $encoder->($mod,$args)
                                   : "\t$mnemonic$mod\t$args";
}
769
# Ask the compiler driver named by $ENV{CC} to run its assembler verbosely
# on an empty input and remember whether it identifies itself as the GNU
# assembler; the post-processing loop below adapts some directives then.
my $as_banner = `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`;
$gnuas = 1 if ($as_banner =~ /GNU assembler/);
774
# Post-process the accumulated $code line by line and print the result.
foreach (split("\n",$code)) {
        # Replace every back-quoted Perl expression with its value.
        s/\`([^\`]*)\`/eval $1/ge;

        # The following substitutions are chained with "or", so at most
        # one of them is applied to a line.  The first one handles shd
        # with a literal shift count: for counts above 31 the two source
        # registers are swapped and the count is reduced by 32.
        s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
                $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)        # rotation for >=32
                :       sprintf("shd\t%$1,%$2,%d",$3)/e                 or
        # translate made up instructions: _ror, _shr, _align, _shl
        s/_ror(\s+)(%r[0-9]+),/
                ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e                 or

        s/_shr(\s+%r[0-9]+),([0-9]+),/
                $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
                :        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e   or

        s/_align(\s+%r[0-9]+,%r[0-9]+),/
                ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e                or

        s/_shl(\s+%r[0-9]+),([0-9]+),/
                $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
                :            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;

        # In 32-bit builds every instruction line goes through assemble(),
        # which emits the PA-RISC 2.0 instructions as .WORD directives.
        s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);

        # Directive adjustments for the GNU assembler in 64-bit builds.
        s/(\.LEVEL\s+2\.0)W/$1w/        if ($gnuas && $SIZE_T==8);
        s/\.SPACE\s+\$TEXT\$/.text/     if ($gnuas && $SIZE_T==8);
        s/\.SUBSPA.*//                  if ($gnuas && $SIZE_T==8);
        # "cmpb,*" becomes "comb," in 32-bit builds, "bv" becomes "bve"
        # in 64-bit builds.
        s/cmpb,\*/comb,/                if ($SIZE_T==4);
        s/\bbv\b/bve/                   if ($SIZE_T==8);

        print $_,"\n";
}
806
# Buffered write errors (e.g. a full disk) only surface when the handle
# is closed, so a failing close must abort instead of leaving a truncated
# assembly file behind.
close STDOUT or die "error closing STDOUT: $!";