#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
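#
# For reference, a minimal sketch of an ISR prologue that satisfies
# this requirement (assuming the ISR is free to clobber B0; this is an
# illustration, not code emitted by this module):
#
#	MVK	0,B0
#	MVC	B0,AMR		; disable all circular addressing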

while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output" or die "can't open $output: $!";

($CTXA,$INP,$NUM) = ("A4","B4","A6");		# arguments
 $K256="A3";

($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
	=map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
	=map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");			# circular/ring buffer
 $CTXB=$t2e;

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");

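# Note: the round kernels below compute Maj(a,b,c) as ((a|b)&c)|(a&b),
# which is equivalent to the canonical (a&b)^(a&c)^(b&c) of FIPS 180-4,
# and Ch(e,f,g) as (e&f)^(~e&g).
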
$code.=<<___;
	.text
	.if	__TI_EABI__
	.nocmp
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

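	; On big-endian targets the input is already in SHA-256 byte order,
	; so the SWAP2/SWAP4 byte-reversal opcodes used below are aliased to
	; plain moves; on little-endian, SWAP4 (byte swap within each
	; half-word) followed by SWAP2 (half-word swap) amounts to a full
	; 32-bit byte reversal.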
	.if	.BIG_ENDIAN
	.asg	SWAP2,MV
	.asg	SWAP4,MV
	.endif

	.global	_sha256_block_data_order
_sha256_block_data_order:
	.asmfunc stack_usage(64)
	MV	$NUM,A0				; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA				; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]			; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	ADDKPC	_sha256_block_data_order,B2
|| [A0]	AND	B0,SP,SP			; align stack at 64 bytes
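	; note: in the packet above the parallel MV reads SP before the
	; post-decrement commits, so FP ends up addressing the word that
	; holds the caller's FP; the epilogue relies on this when it
	; restores SP from FP and reloads FP from *FP[0].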
	.if	__TI_EABI__
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	\$PCR_OFFSET(K256,_sha256_block_data_order),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	\$PCR_OFFSET(K256,_sha256_block_data_order),$K256
	.else
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	(K256-_sha256_block_data_order),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	(K256-_sha256_block_data_order),$K256
	.endif
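	; AMR = 0x00050404: assuming the standard C64x+ AMR field layout,
	; this selects circular addressing for A5 and B5 (the Xia/Xib ring
	; pointers) with block size BK0 = 2^(5+1) = 64 bytes, turning the
	; stack area below into the 16-word ring holding the schedule X[].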
   [A0]	MVC	B1,AMR				; setup circular addressing
|| [A0]	MV	SP,$Xia
   [A0]	MV	SP,$Xib
|| [A0]	ADD	B2,$K256,$K256
|| [A0]	MV	$CTXA,$CTXB
|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
	LDW	*${CTXA}[0],$A			; load ctx
||	LDW	*${CTXB}[4],$E
	LDW	*${CTXA}[1],$B
||	LDW	*${CTXB}[5],$F
	LDW	*${CTXA}[2],$C
||	LDW	*${CTXB}[6],$G
	LDW	*${CTXA}[3],$D
||	LDW	*${CTXB}[7],$H

	LDNW	*$INP++,$Xn			; pre-fetch input
	LDW	*$K256++,$K			; pre-fetch K256[0]
	MVK	14,B0				; loop counters
	MVK	47,B1
||	ADDAW	$Xia,9,$Xia
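	; rounds 0-14 and 16-63 run in the two software-pipelined loops
	; below; round 15 is peeled out as BODY_15 so that the modulo-
	; scheduled message-schedule loads can be primed before
	; BODY_16_63 starts.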
outerloop?:
	SUB	A0,1,A0
||	MV	$A,$Actx
||	MV	$E,$Ectx
||	MVD	$B,$Bctx
||	MVD	$F,$Fctx
	MV	$C,$Cctx
||	MV	$G,$Gctx
||	MVD	$D,$Dctx
||	MVD	$H,$Hctx
||	SWAP4	$Xn,$X0

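	; SPLOOPD/SPKERNEL delimit one iteration of a loop executed from
	; the C64x+ software-pipelined loop buffer; the SPLOOPD operand is
	; the initiation interval, i.e. 8 cycles per round here and 10
	; cycles per round in BODY_16_63.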
	SPLOOPD	8				; BODY_00_14
||	MVC	B0,ILC
||	SWAP2	$X0,$X0

	LDNW	*$INP++,$Xn
||	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
	ADD	$X0,$T1,$T1			; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H				; h = g
||	MV	$F,$G				; g = f
||	MV	$X0,$X14
||	SWAP4	$Xn,$X0
	SWAP2	$X0,$X0
||	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
	MV	$B,$C				; c = b
||	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2
	SPKERNEL

	ROTL	$A,30,$S0			; BODY_15
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	LDW	*${Xib}[1],$Xn			; modulo-scheduled
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
	ADD	$X0,$T1,$T1			; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H				; h = g
||	MV	$F,$G				; g = f
||	MV	$X0,$X15
	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
||	MV	$Xn,$X0				; modulo-scheduled
||	LDW	*$Xia,$X9			; modulo-scheduled
||	ROTL	$X1,25,$t0e			; modulo-scheduled
||	ROTL	$X14,15,$t0a			; modulo-scheduled
	SHRU	$X1,3,$s0			; modulo-scheduled
||	SHRU	$X14,10,$s1			; modulo-scheduled
||	ROTL	$B,0,$C				; c = b
||	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2

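	; BODY_16_63 extends the message schedule in the 64-byte ring. In
	; FIPS 180-4 terms, W[t] = W[t-16] + sigma0(W[t-15]) + W[t-7] +
	; sigma1(W[t-2]); relative to the oldest live word X[i], these
	; appear below as the ring offsets X[i+1], X[i+9] and X[i+14].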
	SPLOOPD	10				; BODY_16_63
||	MVC	B1,ILC
||	ROTL	$X1,14,$t1e			; modulo-scheduled
||	ROTL	$X14,13,$t1a			; modulo-scheduled

	XOR	$t0e,$s0,$s0
||	XOR	$t0a,$s1,$s1
||	MV	$X15,$X14
||	MV	$X1,$Xn
	XOR	$t1e,$s0,$s0			; sigma0(X[i+1])
||	XOR	$t1a,$s1,$s1			; sigma1(X[i+14])
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	ADD	$X9,$X0,$X0			; X[i] += X[i+9]
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
||	ADD	$s0,$X0,$X0			; X[i] += sigma0(X[i+1])
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$H,$K,$T1			; T1 = h + K256[i]
||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
||	ADD	$X0,$T1,$T1			; T1 += X[i]
||	STW	$X0,*$Xib++
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
||	MV	$X0,$X15
||	ROTL	$G,0,$H				; h = g
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	MV	$F,$G				; g = f
||	MV	$Xn,$X0				; modulo-scheduled
||	LDW	*++$Xia,$X9			; modulo-scheduled
||	ROTL	$X1,25,$t0e			; modulo-scheduled
||	ROTL	$X14,15,$t0a			; modulo-scheduled
	ROTL	$X1,14,$t1e			; modulo-scheduled
||	ROTL	$X14,13,$t1a			; modulo-scheduled
||	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
||	MV	$B,$C				; c = b
	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2
||	SHRU	$X1,3,$s0			; modulo-scheduled
||	SHRU	$X14,10,$s1			; modulo-scheduled
	SPKERNEL

   [A0]	B	outerloop?
|| [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
|| [A0]	ADDK	-260,$K256			; rewind K256
||	ADD	$Actx,$A,$A			; accumulate ctx
||	ADD	$Ectx,$E,$E
||	ADD	$Bctx,$B,$B
	ADD	$Fctx,$F,$F
||	ADD	$Cctx,$C,$C
||	ADD	$Gctx,$G,$G
||	ADD	$Dctx,$D,$D
||	ADD	$Hctx,$H,$H
|| [A0]	LDW	*$K256++,$K			; pre-fetch K256[0]

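	; the -260 rewind above compensates for 65 words of post-increment
	; per block: the K256[0] pre-fetch plus 64 in-round pre-fetches,
	; the last of which reads one word past the table but is discarded
	; when the outer loop re-fetches K256[0].
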
  [!A0]	BNOP	RA
||[!A0]	MV	$CTXA,$CTXB
  [!A0]	MV	FP,SP				; restore stack pointer
||[!A0]	LDW	*FP[0],FP			; restore frame pointer
  [!A0]	STW	$A,*${CTXA}[0]			; save ctx
||[!A0]	STW	$E,*${CTXB}[4]
||[!A0]	MVK	0,B0
  [!A0]	STW	$B,*${CTXA}[1]
||[!A0]	STW	$F,*${CTXB}[5]
||[!A0]	MVC	B0,AMR				; clear AMR
	STW	$C,*${CTXA}[2]
||	STW	$G,*${CTXB}[6]
	STW	$D,*${CTXA}[3]
||	STW	$H,*${CTXB}[7]
	.endasmfunc

	.sect	".const:sha_asm"
	.align	128
K256:
	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4

___

print $code;