3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Performance is 19 cycles per processed byte. Compared to block
15 # transform function from sha512.c compiled with cl6x with -mv6400+
16 # -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
17 # Unrolling the loop won't make this implementation any faster, because
18 # it's effectively dominated by SHRU||SHL pairs and you can't schedule
21 # !!! Note that this module uses AMR, which means that all interrupt
22 # service routines are expected to preserve it and, for their own well-being,
# Consume command-line arguments until one looks like a file name
# (word characters, a dot, then an extension) or until none are left.
25 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to that name; the result of open is not checked here.
26 open STDOUT,">$output";
# Register allocation for the assembly template below. 64-bit quantities
# are split into 32-bit halves: the *hi names map to A-file registers and
# the *lo names to B-file registers, except $Klo, which is A8.
28 ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
# Working variables a..h interleaved with their "ctx" counterparts:
# high words in A16..A31 ...
31 ($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,
32 $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));
# ... and the matching low words in B16..B31.
33 ($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,
34 $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
# Temporaries for Sigma1, Ch, Sigma0 and a scratch pair t0 (A10..A13 / B10..B13).
36 ($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));
37 ($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));
# T1/T2 accumulators. $T1hi is A6, the same register as $NUM.
38 ($T1hi, $T2hi)= ("A6","A7");
# Low words of T1/T2, each followed by the register that receives its carry.
39 ($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9");
# Pair that receives the current K512 entry.
40 ($Khi,$Klo)=("A9","A8");
# Maj shares the T2 registers.
41 ($MAJhi,$MAJlo)=($T2hi,$T2lo);
# Second scratch pair; its high half shares the register of $Khi.
42 ($t1hi,$t1lo)=($Khi,"B2");
45 ($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer
; NOTE(review): no .endif is visible for the .if below, and the two K512
; offset sequences further down are not separated by any visible
; .if/.else/.endif - confirm that the conditional directives are intact.
50 .if .ASSEMBLER_VERSION<7000000
; Substitution symbol: _sha512_block_data_order is replaced by the name
; without the leading underscore.
55 .asg sha512_block_data_order,_sha512_block_data_order
70 .global _sha512_block_data_order
; Entry point. Copies the block count to A0 and returns at once when it is
; zero; every later instruction in this prologue is predicated on [A0].
; The prologue saves FP and four register pairs in a 40-byte frame, loads
; AMR, makes room for a 128-byte aligned buffer on the stack and loads the
; offset of K512 relative to __sha512_block.
71 _sha512_block_data_order:
73 .asmfunc stack_usage(40+128)
74 MV $NUM,A0 ; reassign $NUM
76 [!A0] BNOP RA ; if ($NUM==0) return;
77 || [A0] STW FP,*SP--(40) ; save frame pointer
; Register pairs are stored into the frame while B1 is assembled from
; MVK (low half 0x0404) and MVKH (high half 0x0006), i.e. 0x00060404.
; NOTE(review): by the C64x+ AMR layout that value should select circular
; addressing through BK0 for A5 and B5 with a 128-byte block - confirm
; against the CPU reference guide.
79 [A0] STDW B13:B12,*SP[4]
80 || [A0] MVK 0x00404,B1
81 [A0] STDW B11:B10,*SP[3]
82 || [A0] STDW A13:A12,*FP[-3]
83 || [A0] MVKH 0x60000,B1
84 [A0] STDW A11:A10,*SP[1]
85 || [A0] MVC B1,AMR ; setup circular addressing
; NOTE(review): B0 is used as the allocation size and alignment mask here
; (presumably -128); the instruction that sets it is not visible.
86 || [A0] ADD B0,SP,SP ; alloca(128)
; First of two textually parallel sequences: align SP, take the address of
; __sha512_block with ADDKPC and load the K512 offset through the
; PCR_OFFSET operator.
88 [A0] AND B0,SP,SP ; align stack at 128 bytes
89 || [A0] ADDKPC __sha512_block,B1
90 || [A0] MVKL \$PCR_OFFSET(K512,__sha512_block),$K512
91 [A0] MVKH \$PCR_OFFSET(K512,__sha512_block),$K512
92 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
; Second sequence: identical except that the offset is written as the
; plain label difference K512-__sha512_block.
94 [A0] AND B0,SP,SP ; align stack at 128 bytes
95 || [A0] ADDKPC __sha512_block,B1
96 || [A0] MVKL (K512-__sha512_block),$K512
97 [A0] MVKH (K512-__sha512_block),$K512
98 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
; Load the sixteen 32-bit words of ctx into the working registers: each
; pair of words fills one hi (A-side) and one lo (B-side) register.
; The word index is XORed with .LITTLE_ENDIAN, so the two words of every
; pair swap places when that symbol is 1.
; NOTE(review): the second base pointer used below is not assigned in the
; visible part of the Perl preamble - confirm where it is defined.
104 LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx
105 || LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo
106 || ADD B1,$K512,$K512 ; offset loaded by MVKL/MVKH + address from ADDKPC
107 LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi
108 || LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo
109 LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi
110 || LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo
111 LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi
112 || LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo
113 LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi
114 || LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo
115 LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi
116 || LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo
117 LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi
118 || LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo
119 LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi
120 || LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo
; Start the first input doubleword and the first constant early so they
; are available when the round code needs them.
122 LDNDW *$INP++,B11:B10 ; pre-fetch input
123 LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
125 MVK 15,B0 ; loop counters
; One SHA-512 round on 64-bit values held as hi/lo 32-bit register pairs:
;   X[i] = T1;  T1 += h + K512[i] + Ch(e,f,g) + Sigma1(e)
;   T2 = T1 + Maj(a,b,c) + Sigma0(a);  e = d + T1;  a = T2
; Every 64-bit rotate is assembled from SHRU/SHL pairs applied to both
; halves and XORed into the S1/S0 accumulators. 64-bit additions use ADD on
; the high words and ADDU on the low words; the carry word is folded into
; the high word when e and a are written.
; NOTE(review): several execute packets below appear to open with a ||
; line, as if their first instruction were missing - confirm that no
; instructions have been lost from this block.
155 STW $T1hi,*$Xihi++[2]
156 || STW $T1lo,*$Xilo++[2] ; X[i] = T1
157 || ADD $Hhi,$T1hi,$T1hi
158 || ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h
159 || SHRU $Ehi,14,$S1hi ; Sigma1: rotate right by 14, from the high half
160 || SHL $Ehi,32-14,$S1lo
162 || XOR $Flo,$Glo,$CHlo
163 || ADD KHI,$T1hi,$T1hi
164 || ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i]
165 || SHRU $Elo,14,$t0lo ; same rotate, from the low half
166 || SHL $Elo,32-14,$t0hi
167 XOR $t0hi,$S1hi,$S1hi
168 || XOR $t0lo,$S1lo,$S1lo
169 || AND $Ehi,$CHhi,$CHhi
170 || AND $Elo,$CHlo,$CHlo
172 || ROTL $Glo,0,$Hlo ; h = g
173 || SHRU $Ehi,18,$t0hi ; Sigma1: rotate right by 18
174 || SHL $Ehi,32-18,$t0lo
175 XOR $t0hi,$S1hi,$S1hi
176 || XOR $t0lo,$S1lo,$S1lo
177 || XOR $Ghi,$CHhi,$CHhi
178 || XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g
180 || ROTL $Flo,0,$Glo ; g = f
181 || SHRU $Elo,18,$t0lo
182 || SHL $Elo,32-18,$t0hi
183 XOR $t0hi,$S1hi,$S1hi
184 || XOR $t0lo,$S1lo,$S1lo
185 || OR $Ahi,$Bhi,$MAJhi
186 || OR $Alo,$Blo,$MAJlo
188 || ROTL $Elo,0,$Flo ; f = e
189 || SHRU $Ehi,41-32,$t0lo ; Sigma1: rotate right by 41 (halves swap roles)
190 || SHL $Ehi,64-41,$t0hi
191 XOR $t0hi,$S1hi,$S1hi
192 || XOR $t0lo,$S1lo,$S1lo
193 || AND $Chi,$MAJhi,$MAJhi
194 || AND $Clo,$MAJlo,$MAJlo
196 || ROTL $Dlo,0,$Elo ; e = d
197 || SHRU $Elo,41-32,$t0hi
198 || SHL $Elo,64-41,$t0lo
199 XOR $t0hi,$S1hi,$S1hi
200 || XOR $t0lo,$S1lo,$S1lo ; Sigma1(e)
201 || AND $Ahi,$Bhi,$t1hi
202 || AND $Alo,$Blo,$t1lo
204 || ROTL $Clo,0,$Dlo ; d = c
205 || SHRU $Ahi,28,$S0hi ; Sigma0: rotate right by 28
206 || SHL $Ahi,32-28,$S0lo
207 OR $t1hi,$MAJhi,$MAJhi
208 || OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b)
209 || ADD $CHhi,$T1hi,$T1hi
210 || ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g)
212 || ROTL $Blo,0,$Clo ; c = b
213 || SHRU $Alo,28,$t0lo
214 || SHL $Alo,32-28,$t0hi
215 XOR $t0hi,$S0hi,$S0hi
216 || XOR $t0lo,$S0lo,$S0lo
217 || ADD $S1hi,$T1hi,$T1hi
218 || ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e)
220 || ROTL $Alo,0,$Blo ; b = a
221 || SHRU $Ahi,34-32,$t0lo ; Sigma0: rotate right by 34
222 || SHL $Ahi,64-34,$t0hi
223 XOR $t0hi,$S0hi,$S0hi
224 || XOR $t0lo,$S0lo,$S0lo
225 || ADD $MAJhi,$T1hi,$T2hi
226 || ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c)
227 || SHRU $Alo,34-32,$t0hi
228 || SHL $Alo,64-34,$t0lo
229 XOR $t0hi,$S0hi,$S0hi
230 || XOR $t0lo,$S0lo,$S0lo
231 || ADD $Ehi,$T1hi,$T1hi
232 || ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e
233 || [B0] BNOP loop0_15? ; issued early, the branch takes effect further down
234 || SHRU $Ahi,39-32,$t0lo ; Sigma0: rotate right by 39
235 || SHL $Ahi,64-39,$t0hi
236 XOR $t0hi,$S0hi,$S0hi
237 || XOR $t0lo,$S0lo,$S0lo
238 || [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input
240 || SHRU $Alo,39-32,$t0hi
241 || SHL $Alo,64-39,$t0lo
242 XOR $t0hi,$S0hi,$S0hi
243 || XOR $t0lo,$S0lo,$S0lo ; Sigma0(a)
244 || ADD $T1carry,$T1hi,$Ehi
245 || MV $T1lo,$Elo ; e = T1
246 ||[!B0] LDW *${Xihi}[28],$T1hi
247 ||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14]
248 ADD $S0hi,$T2hi,$T2hi
249 || ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a)
250 [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i]
251 NOP ; avoid cross-path stall
252 ADD $T2carry,$T2hi,$Ahi
253 || MV $T2lo,$Alo ; a = T2
255 ;;===== branch to loop0_15? is taken here
257 ;;===== branch to break? is taken here
; Message schedule feeding the next round:
;   T1 = sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1]) + X[i]
; X[i+14] is already in T1 on entry (loaded by the round code); X[i+1],
; X[i+9] and X[i] are fetched from the ring buffer here. Word offsets are
; twice the element index because the hi and lo pointers each step by two
; words per element. The block ends by branching to loop16_79? while B1 is
; non-zero.
; NOTE(review): some packets below appear to open with a || line, as if
; their first instruction were missing - confirm nothing has been lost.
258 LDW *${Xihi}[2],$T2hi
259 || LDW *${Xilo}[2],$T2lo ; X[i+1]
260 || SHRU $T1hi,19,$S1hi ; sigma1: rotate right by 19
261 || SHL $T1hi,32-19,$S1lo
263 || SHL $T1lo,32-19,$t0hi
264 XOR $t0hi,$S1hi,$S1hi
265 || XOR $t0lo,$S1lo,$S1lo
266 || SHRU $T1hi,61-32,$t0lo ; sigma1: rotate right by 61
267 || SHL $T1hi,64-61,$t0hi
268 XOR $t0hi,$S1hi,$S1hi
269 || XOR $t0lo,$S1lo,$S1lo
270 || SHRU $T1lo,61-32,$t0hi
271 || SHL $T1lo,64-61,$t0lo
272 XOR $t0hi,$S1hi,$S1hi
273 || XOR $t0lo,$S1lo,$S1lo
274 || SHRU $T1hi,6,$t0hi ; sigma1: plain shift right by 6
275 || SHL $T1hi,32-6,$t0lo
276 XOR $t0hi,$S1hi,$S1hi
277 || XOR $t0lo,$S1lo,$S1lo
278 || SHRU $T1lo,6,$t0lo ; low half feeds the low word only, so this is a shift, not a rotate
279 || LDW *${Xihi}[18],$T1hi
280 || LDW *${Xilo}[18],$T1lo ; X[i+9]
281 XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14])
283 || LDW *${Xihi}[0],$CHhi
284 || LDW *${Xilo}[0],$CHlo ; X[i]
285 || SHRU $T2hi,1,$S0hi ; sigma0: rotate right by 1
286 || SHL $T2hi,32-1,$S0lo
288 || SHL $T2lo,32-1,$t0hi
289 XOR $t0hi,$S0hi,$S0hi
290 || XOR $t0lo,$S0lo,$S0lo
291 || SHRU $T2hi,8,$t0hi ; sigma0: rotate right by 8
292 || SHL $T2hi,32-8,$t0lo
293 XOR $t0hi,$S0hi,$S0hi
294 || XOR $t0lo,$S0lo,$S0lo
295 || SHRU $T2lo,8,$t0lo
296 || SHL $T2lo,32-8,$t0hi
297 XOR $t0hi,$S0hi,$S0hi
298 || XOR $t0lo,$S0lo,$S0lo
299 || ADD $S1hi,$T1hi,$T1hi
300 || ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1()
301 || [B1] BNOP loop16_79? ; issued early, the branch takes effect further down
302 || SHRU $T2hi,7,$t0hi ; sigma0: plain shift right by 7
303 || SHL $T2hi,32-7,$t0lo
304 XOR $t0hi,$S0hi,$S0hi
305 || XOR $t0lo,$S0lo,$S0lo
306 || ADD $CHhi,$T1hi,$T1hi
307 || ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i]
308 || SHRU $T2lo,7,$t0lo
309 XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1])
311 ADD $S0hi,$T1hi,$T1hi
312 || ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0()
314 NOP ; avoid cross-path stall
315 ADD $T1carry,$T1hi,$T1hi ; fold the accumulated carry into the high word
316 ;;===== branch to loop16_79? is taken here
; End of one input block: add the ctx copies to the working variables.
; Each 64-bit sum is an ADD of the high words plus an ADDU of the low
; words whose carry lands in the ctx-lo register and is added to the high
; word a packet or two later.
; While A0 is non-zero the code pre-fetches the next input, rewinds the
; K512 pointer by 640 bytes (80 entries of 8 bytes) and branches to
; outerloop?. When A0 is zero the [!A0] stores write the result back to
; ctx instead, and execution falls through to the epilogue.
319 ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx
320 || ADDU $Alo,$Actxlo,$Actxlo:$Alo
321 || [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input
322 || [A0] ADDK -640,$K512 ; rewind pointer to K512
323 ADD $Bhi,$Bctxhi,$Bhi
324 || ADDU $Blo,$Bctxlo,$Bctxlo:$Blo
325 || [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
326 ADD $Chi,$Cctxhi,$Chi
327 || ADDU $Clo,$Cctxlo,$Cctxlo:$Clo
328 || ADD $Actxlo,$Ahi,$Ahi ; carry of the low-word sum
329 ||[!A0] MV $CTXA,$CTXB ; second base pointer for the paired stores
330 ADD $Dhi,$Dctxhi,$Dhi
331 || ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo
332 || ADD $Bctxlo,$Bhi,$Bhi
333 ||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx
334 ||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN]
335 ADD $Ehi,$Ectxhi,$Ehi
336 || ADDU $Elo,$Ectxlo,$Ectxlo:$Elo
337 || ADD $Cctxlo,$Chi,$Chi
338 || [A0] BNOP outerloop? ; issued early, the branch takes effect further down
339 ||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
340 ||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN]
341 ADD $Fhi,$Fctxhi,$Fhi
342 || ADDU $Flo,$Fctxlo,$Fctxlo:$Flo
343 || ADD $Dctxlo,$Dhi,$Dhi
344 ||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN]
345 ||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN]
346 ADD $Ghi,$Gctxhi,$Ghi
347 || ADDU $Glo,$Gctxlo,$Gctxlo:$Glo
348 || ADD $Ectxlo,$Ehi,$Ehi
349 ||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
350 ||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
351 ADD $Hhi,$Hctxhi,$Hhi
352 || ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo
353 || ADD $Fctxlo,$Fhi,$Fhi
354 ||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
355 ||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN]
356 ADD $Gctxlo,$Ghi,$Ghi
357 ||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
358 ||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN]
359 ADD $Hctxlo,$Hhi,$Hhi
360 ||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
361 ||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN]
362 ;;===== branch to outerloop? is taken here
; Only reached when the branch above was not taken, so the last pair of
; stores needs no predicate.
364 STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN]
365 || STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
; Epilogue: release the buffer, reload saved registers, restore FP and
; clear AMR.
; NOTE(review): B0 is used here both as an addend for SP and as the value
; written to AMR, so it is presumably zero at this point - confirm.
; NOTE(review): only two of the four register pairs saved on entry are
; reloaded in the visible lines, and no return branch is visible - confirm
; that the rest of the epilogue is intact.
367 ADD FP,B0,SP ; destroy circular buffer
368 || LDDW *FP[-4],A11:A10
370 || LDDW *FP[-2],B11:B10
373 LDW *++SP(40),FP ; restore frame pointer
375 MVC B0,AMR ; clear AMR
376 NOP 2 ; wait till FP is committed
; Constant data. Two section directives follow back to back.
; NOTE(review): they look like alternatives, but no conditional directive
; choosing between them is visible, and neither is the K512 label that the
; code refers to - confirm both are intact.
380 .sect ".text:sha_asm.const"
382 .sect ".const:sha_asm"
; SHA-512 round constants: 80 64-bit values, two per line, each written as
; two 32-bit words with the most significant word first (the first entry
; is 0x428a2f98d728ae22). 80 entries of 8 bytes give the 640 bytes by which
; the code rewinds its K512 pointer.
386 .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
387 .uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
388 .uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
389 .uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
390 .uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
391 .uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
392 .uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
393 .uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
394 .uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
395 .uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
396 .uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
397 .uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
398 .uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
399 .uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
400 .uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
401 .uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
402 .uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
403 .uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
404 .uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
405 .uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
406 .uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
407 .uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
408 .uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
409 .uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
410 .uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
411 .uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
412 .uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
413 .uword 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
414 .uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
415 .uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
416 .uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
417 .uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
418 .uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
419 .uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
420 .uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
421 .uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
422 .uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
423 .uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
424 .uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
425 .uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
; Identification string placed after the table.
426 .cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"