3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Performance is 19 cycles per processed byte. Compared to block
15 # transform function from sha512.c compiled with cl6x with -mv6400+
16 # -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
17 # Loop unrolling won't make this implementation any faster, because
18 # it's effectively dominated by SHRU||SHL pairs and you can't schedule
21 # !!! Note that this module uses AMR, which means that all interrupt
22 # service routines are expected to preserve it and for own well-being
# Pick the output file name: consume command-line arguments until one looks
# like "name.ext" (or until the arguments run out / a false value is hit).
25 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT only when a name was actually found. The three-argument
# open keeps the name from being parsed for mode characters, and a failure
# to create the file is reported instead of being silently ignored.
26 if (defined($output) && length($output)) { open(STDOUT,">",$output) or die "can't open $output: $!"; }
# Register allocation. Each Perl variable below holds the NAME of a C64x+
# register and is interpolated into the assembly text further down.
# Names ending in "hi" are placed in the A register file and names ending
# in "lo" in the B register file, with one exception: $Klo is A8.
28 ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
# Working variables a..h interleaved with the "ctx" names that are added
# back in at "accumulate ctx": A16,A18,... are working high words,
# A17,A19,... the matching ctx high words.
31 ($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,
32 $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));
# Same interleaved layout for the low words, in B16..B31.
33 ($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,
34 $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
# Scratch registers for Sigma1, Ch, Sigma0 and a shift temporary (t0).
36 ($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));
37 ($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));
# T1/T2 accumulators. $T1hi is A6, the same register $NUM arrives in,
# which is why the entry code first copies $NUM to A0.
38 ($T1hi, $T2hi)= ("A6","A7");
# The carry names are the odd registers paired with the low accumulators
# as ADDU destinations (B7:B6 and B9:B8).
39 ($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9");
40 ($Khi,$Klo)=("A9","A8"); # A9:A8 pair is the LDDW destination for K512[i]
41 ($MAJhi,$MAJlo)=($T2hi,$T2lo); # Maj shares registers with T2
42 ($t1hi,$t1lo)=($Khi,"B2"); # t1hi shares A9 with $Khi
45 ($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer
;; Function entry. Returns at once when the block count is zero; otherwise
;; pushes a 40-byte frame, saves register pairs, programs AMR for circular
;; addressing, carves a 128-byte aligned buffer out of the stack and forms
;; the offset of the K512 table. Everything after the first MV is
;; predicated on A0 (the block count).
;; NOTE(review): B0 and FP are read below, but the instructions that set
;; them are not part of this excerpt - presumably B0 = -128 and FP = SP
;; after the push; confirm against the full file.
51 .asg sha512_block_data_order,_sha512_block_data_order
66 .global _sha512_block_data_order
67 _sha512_block_data_order:
68 .asmfunc stack_usage(40+128)
69 MV $NUM,A0 ; reassign $NUM
71 [!A0] BNOP RA ; if ($NUM==0) return;
72 || [A0] STW FP,*SP--(40) ; save frame pointer
74 [A0] STDW B13:B12,*SP[4] ; save B13:B12 in the new frame
75 || [A0] MVK 0x00404,B1 ; B1 = AMR value, low half
76 [A0] STDW B11:B10,*SP[3]
77 || [A0] STDW A13:A12,*FP[-3]
78 || [A0] MVKH 0x60000,B1 ; B1 = 0x00060404; per the C64x+ AMR layout this should select circular mode for A5/B5 with a 128-byte block - verify
79 [A0] STDW A11:A10,*SP[1]
80 || [A0] MVC B1,AMR ; setup circular addressing
81 || [A0] ADD B0,SP,SP ; alloca(128)
83 [A0] AND B0,SP,SP ; align stack at 128 bytes
84 || [A0] ADDKPC _sha512_block_data_order,B1 ; B1 = PC-relative address of the entry label
85 || [A0] MVKL \$PCR_OFFSET(K512,_sha512_block_data_order),$K512
86 [A0] MVKH \$PCR_OFFSET(K512,_sha512_block_data_order),$K512
87 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
;; NOTE(review): the next five instructions repeat the previous five with a
;; plain label difference in place of PCR_OFFSET. They look like the two
;; arms of an assembler conditional whose .if/.else/.endif lines are not in
;; this excerpt - confirm before assuming both sequences execute.
89 [A0] AND B0,SP,SP ; align stack at 128 bytes
90 || [A0] ADDKPC _sha512_block_data_order,B1
91 || [A0] MVKL (K512-_sha512_block_data_order),$K512
92 [A0] MVKH (K512-_sha512_block_data_order),$K512
93 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
;; Load the eight 64-bit state words a..h as pairs of 32-bit loads, then
;; start the first input and K512 fetches. Each word index is XORed with
;; .LITTLE_ENDIAN (the assembler's endianness flag), so the hi and lo
;; halves of every pair swap slots between big- and little-endian builds.
;; NOTE(review): CTXB is not defined in this excerpt; presumably it is a
;; B-side copy of the ctx pointer - confirm.
99 LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx
100 || LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo
101 || ADD B1,$K512,$K512 ; entry address from ADDKPC + offset = address of K512
102 LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi
103 || LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo
104 LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi
105 || LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo
106 LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi
107 || LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo
108 LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi
109 || LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo
110 LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi
111 || LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo
112 LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi
113 || LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo
114 LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi
115 || LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo
117 LDNDW *$INP++,B11:B10 ; pre-fetch input
118 LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
120 MVK 15,B0 ; loop counters
;; Round body. Stores T1 into the ring buffer as X[i], then computes
;;   T1 += h + K512[i] + Ch(e,f,g) + Sigma1(e)
;;   T2  = Maj(a,b,c) + Sigma0(a)      (accumulated on top of T1)
;; while rotating the working variables (h=g, g=f, ... b=a, e=d+T1, a=T2).
;; Every 64-bit value is a hi/lo register pair. A 64-bit rotate is built
;; from SHRU/SHL pairs applied to both halves and XORed together; for
;; rotate counts of 32 or more the halves cross over. ROTL by 0 is used as
;; a register move. Additions are ADDU on the low word with the carry
;; kept in a separate register and folded into the high word at the end.
;; NOTE(review): KHI/KLO are symbols defined outside this excerpt,
;; presumably endian-dependent aliases of the K register pair - confirm.
;; NOTE(review): this excerpt skips source lines; the loop labels, the
;; hi-half register moves and the first instruction of several execute
;; packets are not shown, so do not treat this listing as complete.
150 STW $T1hi,*$Xihi++[2]
151 || STW $T1lo,*$Xilo++[2] ; X[i] = T1
152 || ADD $Hhi,$T1hi,$T1hi
153 || ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h
154 || SHRU $Ehi,14,$S1hi ; Sigma1: begin e>>>14
155 || SHL $Ehi,32-14,$S1lo
157 || XOR $Flo,$Glo,$CHlo
158 || ADD KHI,$T1hi,$T1hi
159 || ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i]
160 || SHRU $Elo,14,$t0lo
161 || SHL $Elo,32-14,$t0hi
162 XOR $t0hi,$S1hi,$S1hi
163 || XOR $t0lo,$S1lo,$S1lo
164 || AND $Ehi,$CHhi,$CHhi
165 || AND $Elo,$CHlo,$CHlo
167 || ROTL $Glo,0,$Hlo ; h = g
168 || SHRU $Ehi,18,$t0hi ; e>>>18
169 || SHL $Ehi,32-18,$t0lo
170 XOR $t0hi,$S1hi,$S1hi
171 || XOR $t0lo,$S1lo,$S1lo
172 || XOR $Ghi,$CHhi,$CHhi
173 || XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g
175 || ROTL $Flo,0,$Glo ; g = f
176 || SHRU $Elo,18,$t0lo
177 || SHL $Elo,32-18,$t0hi
178 XOR $t0hi,$S1hi,$S1hi
179 || XOR $t0lo,$S1lo,$S1lo
180 || OR $Ahi,$Bhi,$MAJhi
181 || OR $Alo,$Blo,$MAJlo
183 || ROTL $Elo,0,$Flo ; f = e
184 || SHRU $Ehi,41-32,$t0lo ; e>>>41: halves cross over since 41>=32
185 || SHL $Ehi,64-41,$t0hi
186 XOR $t0hi,$S1hi,$S1hi
187 || XOR $t0lo,$S1lo,$S1lo
188 || AND $Chi,$MAJhi,$MAJhi
189 || AND $Clo,$MAJlo,$MAJlo
191 || ROTL $Dlo,0,$Elo ; e = d
192 || SHRU $Elo,41-32,$t0hi
193 || SHL $Elo,64-41,$t0lo
194 XOR $t0hi,$S1hi,$S1hi
195 || XOR $t0lo,$S1lo,$S1lo ; Sigma1(e)
196 || AND $Ahi,$Bhi,$t1hi
197 || AND $Alo,$Blo,$t1lo
199 || ROTL $Clo,0,$Dlo ; d = c
200 || SHRU $Ahi,28,$S0hi ; Sigma0: begin a>>>28
201 || SHL $Ahi,32-28,$S0lo
202 OR $t1hi,$MAJhi,$MAJhi
203 || OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b)
204 || ADD $CHhi,$T1hi,$T1hi
205 || ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g)
207 || ROTL $Blo,0,$Clo ; c = b
208 || SHRU $Alo,28,$t0lo
209 || SHL $Alo,32-28,$t0hi
210 XOR $t0hi,$S0hi,$S0hi
211 || XOR $t0lo,$S0lo,$S0lo
212 || ADD $S1hi,$T1hi,$T1hi
213 || ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e)
215 || ROTL $Alo,0,$Blo ; b = a
216 || SHRU $Ahi,34-32,$t0lo ; a>>>34: halves cross over
217 || SHL $Ahi,64-34,$t0hi
218 XOR $t0hi,$S0hi,$S0hi
219 || XOR $t0lo,$S0lo,$S0lo
220 || ADD $MAJhi,$T1hi,$T2hi
221 || ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c)
222 || SHRU $Alo,34-32,$t0hi
223 || SHL $Alo,64-34,$t0lo
224 XOR $t0hi,$S0hi,$S0hi
225 || XOR $t0lo,$S0lo,$S0lo
226 || ADD $Ehi,$T1hi,$T1hi
227 || ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e
228 || [B0] BNOP loop0_15? ; loop back while B0 is non-zero
229 || SHRU $Ahi,39-32,$t0lo ; a>>>39: halves cross over
230 || SHL $Ahi,64-39,$t0hi
231 XOR $t0hi,$S0hi,$S0hi
232 || XOR $t0lo,$S0lo,$S0lo
233 || [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input
235 || SHRU $Alo,39-32,$t0hi
236 || SHL $Alo,64-39,$t0lo
237 XOR $t0hi,$S0hi,$S0hi
238 || XOR $t0lo,$S0lo,$S0lo ; Sigma0(a)
239 || ADD $T1carry,$T1hi,$Ehi
240 || MV $T1lo,$Elo ; e = T1
241 ||[!B0] LDW *${Xihi}[28],$T1hi
242 ||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14]
243 ADD $S0hi,$T2hi,$T2hi
244 || ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a)
245 || [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i]
246 NOP ; avoid cross-path stall
247 ADD $T2carry,$T2hi,$Ahi
248 || MV $T2lo,$Alo ; a = T2
250 ;;===== branch to loop0_15? is taken here
252 ;;===== branch to break? is taken here
;; Message-schedule update for the later rounds. Builds the next T1 as
;;   T1 = X[i+9] + sigma1(X[i+14]) + X[i] + sigma0(X[i+1])
;; from words held in the ring buffer, then branches back to loop16_79?.
;; sigma1 combines rotate 19, rotate 61 and shift 6; sigma0 combines
;; rotate 1, rotate 8 and shift 7. The plain shifts are recognisable by the
;; missing SHL partner on the low word: nothing wraps into the high word.
;; NOTE(review): this excerpt skips source lines, so the first instruction
;; of some execute packets is not shown.
253 LDW *${Xihi}[2],$T2hi
254 || LDW *${Xilo}[2],$T2lo ; X[i+1]
255 || SHRU $T1hi,19,$S1hi ; sigma1: begin x>>>19
256 || SHL $T1hi,32-19,$S1lo
258 || SHL $T1lo,32-19,$t0hi
259 XOR $t0hi,$S1hi,$S1hi
260 || XOR $t0lo,$S1lo,$S1lo
261 || SHRU $T1hi,61-32,$t0lo ; x>>>61: halves cross over
262 || SHL $T1hi,64-61,$t0hi
263 XOR $t0hi,$S1hi,$S1hi
264 || XOR $t0lo,$S1lo,$S1lo
265 || SHRU $T1lo,61-32,$t0hi
266 || SHL $T1lo,64-61,$t0lo
267 XOR $t0hi,$S1hi,$S1hi
268 || XOR $t0lo,$S1lo,$S1lo
269 || SHRU $T1hi,6,$t0hi ; x>>6 (plain shift)
270 || SHL $T1hi,32-6,$t0lo
271 XOR $t0hi,$S1hi,$S1hi
272 || XOR $t0lo,$S1lo,$S1lo
273 || SHRU $T1lo,6,$t0lo ; no SHL partner: low bits are dropped, not rotated
274 || LDW *${Xihi}[18],$T1hi
275 || LDW *${Xilo}[18],$T1lo ; X[i+9]
276 XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14])
278 || LDW *${Xihi}[0],$CHhi
279 || LDW *${Xilo}[0],$CHlo ; X[i]
280 || SHRU $T2hi,1,$S0hi ; sigma0: begin x>>>1
281 || SHL $T2hi,32-1,$S0lo
283 || SHL $T2lo,32-1,$t0hi
284 XOR $t0hi,$S0hi,$S0hi
285 || XOR $t0lo,$S0lo,$S0lo
286 || SHRU $T2hi,8,$t0hi ; x>>>8
287 || SHL $T2hi,32-8,$t0lo
288 XOR $t0hi,$S0hi,$S0hi
289 || XOR $t0lo,$S0lo,$S0lo
290 || SHRU $T2lo,8,$t0lo
291 || SHL $T2lo,32-8,$t0hi
292 XOR $t0hi,$S0hi,$S0hi
293 || XOR $t0lo,$S0lo,$S0lo
294 || ADD $S1hi,$T1hi,$T1hi
295 || ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1()
296 || [B1] BNOP loop16_79? ; loop back while B1 is non-zero
297 || SHRU $T2hi,7,$t0hi ; x>>7 (plain shift)
298 || SHL $T2hi,32-7,$t0lo
299 XOR $t0hi,$S0hi,$S0hi
300 || XOR $t0lo,$S0lo,$S0lo
301 || ADD $CHhi,$T1hi,$T1hi
302 || ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i]
303 || SHRU $T2lo,7,$t0lo ; no SHL partner: low bits are dropped, not rotated
304 XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1])
306 ADD $S0hi,$T1hi,$T1hi
307 || ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0()
309 NOP ; avoid cross-path stall
310 ADD $T1carry,$T1hi,$T1hi ; fold the accumulated carry into the high word
311 ;;===== branch to loop16_79? is taken here
;; End of one input block: add the ctx values back into the working
;; variables a..h. For each word the low halves are added with ADDU, whose
;; carry lands in the ctx-lo register, and that carry is folded into the
;; high word one packet later.
;; Instructions predicated on A0 run when the block counter A0 is non-zero
;; (refill the prefetch registers, rewind K512, branch to outerloop?);
;; instructions predicated on !A0 run when it is zero and store the
;; result back to the context.
;; NOTE(review): the instruction that updates A0 and the outerloop? label
;; are not in this excerpt - confirm.
314 ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx
315 || ADDU $Alo,$Actxlo,$Actxlo:$Alo
316 || [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input
317 || [A0] ADDK -640,$K512 ; rewind pointer to K512
318 ADD $Bhi,$Bctxhi,$Bhi
319 || ADDU $Blo,$Bctxlo,$Bctxlo:$Blo
320 || [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
321 ADD $Chi,$Cctxhi,$Chi
322 || ADDU $Clo,$Cctxlo,$Cctxlo:$Clo
323 || ADD $Actxlo,$Ahi,$Ahi ; propagate carry of a into its high word
324 ||[!A0] MV $CTXA,$CTXB
325 ADD $Dhi,$Dctxhi,$Dhi
326 || ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo
327 || ADD $Bctxlo,$Bhi,$Bhi
328 ||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx
329 ||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN]
330 ADD $Ehi,$Ectxhi,$Ehi
331 || ADDU $Elo,$Ectxlo,$Ectxlo:$Elo
332 || ADD $Cctxlo,$Chi,$Chi
333 || [A0] BNOP outerloop? ; more blocks to process
334 ||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
335 ||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN]
336 ADD $Fhi,$Fctxhi,$Fhi
337 || ADDU $Flo,$Fctxlo,$Fctxlo:$Flo
338 || ADD $Dctxlo,$Dhi,$Dhi
339 ||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN]
340 ||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN]
341 ADD $Ghi,$Gctxhi,$Ghi
342 || ADDU $Glo,$Gctxlo,$Gctxlo:$Glo
343 || ADD $Ectxlo,$Ehi,$Ehi
344 ||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
345 ||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
346 ADD $Hhi,$Hctxhi,$Hhi
347 || ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo
348 || ADD $Fctxlo,$Fhi,$Fhi
349 ||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
350 ||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN]
351 ADD $Gctxlo,$Ghi,$Ghi
352 ||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
353 ||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN]
354 ADD $Hctxlo,$Hhi,$Hhi
355 ||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
356 ||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN]
357 ;;===== branch to outerloop? is taken here
;; Epilogue, reached by falling through when the outerloop? branch is not
;; taken: store the last state word h, release the ring buffer, reload
;; saved registers, pop the 40-byte frame and clear AMR.
;; NOTE(review): B0 is used both as the SP adjustment and as the value
;; written to AMR, so it is presumably zero here; the return branch and
;; the remaining register reloads are not in this excerpt - confirm.
359 STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN]
360 || STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
362 ADD FP,B0,SP ; destroy circular buffer
363 || LDDW *FP[-4],A11:A10
365 || LDDW *FP[-2],B11:B10
368 LDW *++SP(40),FP ; restore frame pointer
370 MVC B0,AMR ; clear AMR
371 NOP 2 ; wait till FP is committed
;; K512 round-constant table: 40 lines of four 32-bit words, i.e. eighty
;; 64-bit constants (640 bytes, matching the ADDK -640 rewind in the code).
;; Each constant is written as two words, high word first.
;; NOTE(review): two .sect directives appear back to back; they are
;; presumably the two arms of an assembler conditional not shown here. The
;; K512 label and any alignment directive are also outside this excerpt -
;; confirm.
375 .sect ".text:sha_asm.const"
377 .sect ".const:sha_asm"
381 .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
382 .uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
383 .uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
384 .uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
385 .uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
386 .uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
387 .uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
388 .uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
389 .uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
390 .uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
391 .uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
392 .uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
393 .uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
394 .uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
395 .uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
396 .uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
397 .uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
398 .uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
399 .uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
400 .uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
401 .uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
402 .uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
403 .uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
404 .uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
405 .uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
406 .uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
407 .uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
408 .uword 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
409 .uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
410 .uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
411 .uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
412 .uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
413 .uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
414 .uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
415 .uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
416 .uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
417 .uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
418 .uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
419 .uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
420 .uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
421 .cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"