3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Performance is 19 cycles per processed byte. Compared to block
15 # transform function from sha512.c compiled with cl6x with -mv6400+
16 # -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
17 # Unrolling the loop won't make this implementation any faster, because
18 # it's effectively dominated by SHRU||SHL pairs and you can't schedule
21 # !!! Note that this module uses AMR, which means that all interrupt
22 # service routines are expected to preserve it and for own well-being
# Consume command-line arguments until one looks like a file name
# (word characters followed by a dot and an extension); that one becomes
# the output path. $output is left false when no such argument exists.
25 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT only when a path was found. The three-argument form keeps
# the path from being parsed as part of the open mode, and a failure to
# create the file is reported instead of being silently ignored.
26 if ($output) { open(STDOUT,">",$output) or die "can't open $output: $!"; }
# Register allocation for the generated code: every name assigned below is
# interpolated into the assembly text that follows.
28 ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
# Working variables a..h interleaved with their context copies: the high
# 32-bit halves take A16..A31, the low halves the matching B16..B31.
31 ($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,
32 $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));
33 ($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,
34 $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
# Sigma1, Ch, Sigma0 accumulators and a shift temporary in A10..A13/B10..B13.
36 ($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));
37 ($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));
# T1/T2 accumulators; the B-side "carry" registers receive ADDU carry-outs.
38 ($T1hi, $T2hi)= ("A6","A7");
39 ($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9");
40 ($Khi,$Klo)=("A9","A8"); # register pair loaded with K512[i]
41 ($MAJhi,$MAJlo)=($T2hi,$T2lo); # Maj shares registers with T2
42 ($t1hi,$t1lo)=($Khi,"B2"); # second temporary; high half reuses $Khi
45 ($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer
;; Entry and prologue: return at once when the block count is zero, otherwise
;; build a 40-byte save area, switch on circular addressing and carve out an
;; aligned 128-byte buffer on the stack. Everything after the first packet is
;; predicated on A0 (the block count).
65 .global _sha512_block_data_order
66 _sha512_block_data_order:
67 .asmfunc stack_usage(40+128) ; 40-byte save area + 128-byte buffer
68 MV $NUM,A0 ; reassign $NUM
70 [!A0] BNOP RA ; if ($NUM==0) return;
71 || [A0] STW FP,*SP--(40) ; save frame pointer
73 [A0] STDW B13:B12,*SP[4] ; spill B13:B12 into the save area
74 || [A0] MVK 0x00404,B1 ; low half of the AMR value
75 [A0] STDW B11:B10,*SP[3] ; spill B11:B10
76 || [A0] STDW A13:A12,*FP[-3] ; spill A13:A12 (FP-relative)
77 || [A0] MVKH 0x60000,B1 ; high half: B1 = 0x00060404
78 [A0] STDW A11:A10,*SP[1] ; spill A11:A10
79 || [A0] MVC B1,AMR ; setup circular addressing
80 || [A0] ADD B0,SP,SP ; alloca(128)
;; NOTE(review): the two five-line groups below differ only in how the K512
;; offset is expressed; they look like the two arms of a conditional-assembly
;; directive that is not visible in this excerpt -- confirm.
82 [A0] AND B0,SP,SP ; align stack at 128 bytes
83 || [A0] ADDKPC _sha512_block_data_order,B1 ; B1 = address of the entry label
84 || [A0] MVKL \$PCR_OFFSET(K512,_sha512_block_data_order),$K512 ; low half of offset to K512
85 [A0] MVKH \$PCR_OFFSET(K512,_sha512_block_data_order),$K512 ; high half of offset to K512
86 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
88 [A0] AND B0,SP,SP ; align stack at 128 bytes
89 || [A0] ADDKPC _sha512_block_data_order,B1 ; B1 = address of the entry label
90 || [A0] MVKL (K512-_sha512_block_data_order),$K512 ; low half of label difference
91 [A0] MVKH (K512-_sha512_block_data_order),$K512 ; high half of label difference
92 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
;; Load the eight 64-bit context words as hi/lo register halves. The word
;; index is XORed with .LITTLE_ENDIAN, which swaps the two words of each
;; pair on little-endian builds.
98 LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx
99 || LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo
100 || ADD B1,$K512,$K512 ; entry address + offset = pointer to K512
101 LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi ; b
102 || LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo
103 LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi ; c
104 || LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo
105 LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi ; d
106 || LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo
107 LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi ; e
108 || LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo
109 LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi ; f
110 || LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo
111 LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi ; g
112 || LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo
113 LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi ; h
114 || LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo
116 LDNDW *$INP++,B11:B10 ; pre-fetch input
117 LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
119 MVK 15,B0 ; loop counters
;; Round body. T1 holds the current message word on entry; it is stored into
;; the ring buffer as X[i] and then accumulates h, K512[i], Ch and Sigma1.
;; Each 64-bit rotate is assembled from SHRU/SHL pieces of the hi and lo
;; halves that are XORed together. The a..h rotation is interleaved as
;; ROTL-by-0 register copies.
149 STW $T1hi,*$Xihi++[2] ; ring pointers step two words per entry
150 || STW $T1lo,*$Xilo++[2] ; X[i] = T1
151 || ADD $Hhi,$T1hi,$T1hi
152 || ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h
153 || SHRU $Ehi,14,$S1hi ; e>>>14, pieces taken from the hi half
154 || SHL $Ehi,32-14,$S1lo
156 || XOR $Flo,$Glo,$CHlo ; f^g
157 || ADD KHI,$T1hi,$T1hi
158 || ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i]
159 || SHRU $Elo,14,$t0lo ; e>>>14, pieces taken from the lo half
160 || SHL $Elo,32-14,$t0hi
161 XOR $t0hi,$S1hi,$S1hi
162 || XOR $t0lo,$S1lo,$S1lo
163 || AND $Ehi,$CHhi,$CHhi ; (f^g)&e
164 || AND $Elo,$CHlo,$CHlo
166 || ROTL $Glo,0,$Hlo ; h = g
167 || SHRU $Ehi,18,$t0hi ; e>>>18, hi-half pieces
168 || SHL $Ehi,32-18,$t0lo
169 XOR $t0hi,$S1hi,$S1hi
170 || XOR $t0lo,$S1lo,$S1lo
171 || XOR $Ghi,$CHhi,$CHhi
172 || XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g
174 || ROTL $Flo,0,$Glo ; g = f
175 || SHRU $Elo,18,$t0lo ; e>>>18, lo-half pieces
176 || SHL $Elo,32-18,$t0hi
177 XOR $t0hi,$S1hi,$S1hi
178 || XOR $t0lo,$S1lo,$S1lo
179 || OR $Ahi,$Bhi,$MAJhi ; a|b
180 || OR $Alo,$Blo,$MAJlo
182 || ROTL $Elo,0,$Flo ; f = e
183 || SHRU $Ehi,41-32,$t0lo ; e>>>41: halves swap roles, shift by 41-32
184 || SHL $Ehi,64-41,$t0hi
185 XOR $t0hi,$S1hi,$S1hi
186 || XOR $t0lo,$S1lo,$S1lo
187 || AND $Chi,$MAJhi,$MAJhi ; (a|b)&c
188 || AND $Clo,$MAJlo,$MAJlo
190 || ROTL $Dlo,0,$Elo ; e = d
191 || SHRU $Elo,41-32,$t0hi
192 || SHL $Elo,64-41,$t0lo
193 XOR $t0hi,$S1hi,$S1hi
194 || XOR $t0lo,$S1lo,$S1lo ; Sigma1(e)
195 || AND $Ahi,$Bhi,$t1hi ; a&b
196 || AND $Alo,$Blo,$t1lo
198 || ROTL $Clo,0,$Dlo ; d = c
199 || SHRU $Ahi,28,$S0hi ; a>>>28, hi-half pieces
200 || SHL $Ahi,32-28,$S0lo
201 OR $t1hi,$MAJhi,$MAJhi
202 || OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b)
203 || ADD $CHhi,$T1hi,$T1hi
204 || ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g)
206 || ROTL $Blo,0,$Clo ; c = b
207 || SHRU $Alo,28,$t0lo ; a>>>28, lo-half pieces
208 || SHL $Alo,32-28,$t0hi
209 XOR $t0hi,$S0hi,$S0hi
210 || XOR $t0lo,$S0lo,$S0lo
211 || ADD $S1hi,$T1hi,$T1hi
212 || ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e)
214 || ROTL $Alo,0,$Blo ; b = a
215 || SHRU $Ahi,34-32,$t0lo ; a>>>34: halves swap roles, shift by 34-32
216 || SHL $Ahi,64-34,$t0hi
217 XOR $t0hi,$S0hi,$S0hi
218 || XOR $t0lo,$S0lo,$S0lo
219 || ADD $MAJhi,$T1hi,$T2hi
220 || ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c)
221 || SHRU $Alo,34-32,$t0hi
222 || SHL $Alo,64-34,$t0lo
223 XOR $t0hi,$S0hi,$S0hi
224 || XOR $t0lo,$S0lo,$S0lo
225 || ADD $Ehi,$T1hi,$T1hi
226 || ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e
227 || [B0] BNOP loop0_15? ; issued early; takes effect at the marker after this round
228 || SHRU $Ahi,39-32,$t0lo ; a>>>39: halves swap roles, shift by 39-32
229 || SHL $Ahi,64-39,$t0hi
230 XOR $t0hi,$S0hi,$S0hi
231 || XOR $t0lo,$S0lo,$S0lo
232 || [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input
234 || SHRU $Alo,39-32,$t0hi
235 || SHL $Alo,64-39,$t0lo
236 XOR $t0hi,$S0hi,$S0hi
237 || XOR $t0lo,$S0lo,$S0lo ; Sigma0(a)
238 || ADD $T1carry,$T1hi,$Ehi ; fold the carry into the high half
239 || MV $T1lo,$Elo ; e = T1
240 ||[!B0] LDW *${Xihi}[28],$T1hi ; only once B0 has reached zero
241 ||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14]
242 ADD $S0hi,$T2hi,$T2hi
243 || ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a)
244 || [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i]
245 NOP ; avoid cross-path stall
246 ADD $T2carry,$T2hi,$Ahi ; fold the carry into the high half
247 || MV $T2lo,$Alo ; a = T2
249 ;;===== branch to loop0_15? is taken here
251 ;;===== branch to break? is taken here
;; Message schedule: T1 = sigma1(X[i+14]) + X[i+9] + X[i] + sigma0(X[i+1]).
;; On entry T1 holds X[i+14]. sigma1 is (x>>>19)^(x>>>61)^(x>>6) and sigma0
;; is (x>>>1)^(x>>>8)^(x>>7), so the last term of each is a plain shift and
;; has no SHL piece feeding the high half.
252 LDW *${Xihi}[2],$T2hi
253 || LDW *${Xilo}[2],$T2lo ; X[i+1]
254 || SHRU $T1hi,19,$S1hi ; x>>>19, hi-half pieces
255 || SHL $T1hi,32-19,$S1lo
257 || SHL $T1lo,32-19,$t0hi
258 XOR $t0hi,$S1hi,$S1hi
259 || XOR $t0lo,$S1lo,$S1lo
260 || SHRU $T1hi,61-32,$t0lo ; x>>>61: halves swap roles, shift by 61-32
261 || SHL $T1hi,64-61,$t0hi
262 XOR $t0hi,$S1hi,$S1hi
263 || XOR $t0lo,$S1lo,$S1lo
264 || SHRU $T1lo,61-32,$t0hi
265 || SHL $T1lo,64-61,$t0lo
266 XOR $t0hi,$S1hi,$S1hi
267 || XOR $t0lo,$S1lo,$S1lo
268 || SHRU $T1hi,6,$t0hi ; x>>6, hi-half pieces
269 || SHL $T1hi,32-6,$t0lo
270 XOR $t0hi,$S1hi,$S1hi
271 || XOR $t0lo,$S1lo,$S1lo
272 || SHRU $T1lo,6,$t0lo ; x>>6, lo half (nothing shifts up into hi)
273 || LDW *${Xihi}[18],$T1hi
274 || LDW *${Xilo}[18],$T1lo ; X[i+9]
275 XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14])
277 || LDW *${Xihi}[0],$CHhi
278 || LDW *${Xilo}[0],$CHlo ; X[i]
279 || SHRU $T2hi,1,$S0hi ; x>>>1, hi-half pieces
280 || SHL $T2hi,32-1,$S0lo
282 || SHL $T2lo,32-1,$t0hi
283 XOR $t0hi,$S0hi,$S0hi
284 || XOR $t0lo,$S0lo,$S0lo
285 || SHRU $T2hi,8,$t0hi ; x>>>8, hi-half pieces
286 || SHL $T2hi,32-8,$t0lo
287 XOR $t0hi,$S0hi,$S0hi
288 || XOR $t0lo,$S0lo,$S0lo
289 || SHRU $T2lo,8,$t0lo ; x>>>8, lo-half pieces
290 || SHL $T2lo,32-8,$t0hi
291 XOR $t0hi,$S0hi,$S0hi
292 || XOR $t0lo,$S0lo,$S0lo
293 || ADD $S1hi,$T1hi,$T1hi
294 || ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1()
295 || [B1] BNOP loop16_79? ; issued early; takes effect at the marker below
296 || SHRU $T2hi,7,$t0hi ; x>>7, hi-half pieces
297 || SHL $T2hi,32-7,$t0lo
298 XOR $t0hi,$S0hi,$S0hi
299 || XOR $t0lo,$S0lo,$S0lo
300 || ADD $CHhi,$T1hi,$T1hi
301 || ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i]
302 || SHRU $T2lo,7,$t0lo ; x>>7, lo half (nothing shifts up into hi)
303 XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1])
305 ADD $S0hi,$T1hi,$T1hi
306 || ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0()
308 NOP ; avoid cross-path stall
309 ADD $T1carry,$T1hi,$T1hi ; fold the accumulated carry into the high half
310 ;;===== branch to loop16_79? is taken here
;; Add the context copies to the working variables a..h. ADDU writes the
;; 32-bit sum to the lo register and the carry-out to the ctx-lo register
;; of the same pair; a later ADD folds that carry into the hi half.
;; While A0 is non-zero the K512 pointer is rewound, the next input and
;; K512[0] are pre-fetched and control returns to outerloop?; once A0 is
;; zero the [!A0] stores write the result back to the context instead.
313 ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx
314 || ADDU $Alo,$Actxlo,$Actxlo:$Alo
315 || [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input
316 || [A0] ADDK -640,$K512 ; rewind pointer to K512
317 ADD $Bhi,$Bctxhi,$Bhi
318 || ADDU $Blo,$Bctxlo,$Bctxlo:$Blo
319 || [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
320 ADD $Chi,$Cctxhi,$Chi
321 || ADDU $Clo,$Cctxlo,$Cctxlo:$Clo
322 || ADD $Actxlo,$Ahi,$Ahi ; carry from a.lo into a.hi
323 ||[!A0] MV $CTXA,$CTXB ; B-side copy of the ctx pointer for the stores
324 ADD $Dhi,$Dctxhi,$Dhi
325 || ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo
326 || ADD $Bctxlo,$Bhi,$Bhi ; carry into b.hi
327 ||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx
328 ||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN]
329 ADD $Ehi,$Ectxhi,$Ehi
330 || ADDU $Elo,$Ectxlo,$Ectxlo:$Elo
331 || ADD $Cctxlo,$Chi,$Chi ; carry into c.hi
332 || [A0] BNOP outerloop? ; more blocks: takes effect at the marker below
333 ||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
334 ||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN]
335 ADD $Fhi,$Fctxhi,$Fhi
336 || ADDU $Flo,$Fctxlo,$Fctxlo:$Flo
337 || ADD $Dctxlo,$Dhi,$Dhi ; carry into d.hi
338 ||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN]
339 ||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN]
340 ADD $Ghi,$Gctxhi,$Ghi
341 || ADDU $Glo,$Gctxlo,$Gctxlo:$Glo
342 || ADD $Ectxlo,$Ehi,$Ehi ; carry into e.hi
343 ||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
344 ||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
345 ADD $Hhi,$Hctxhi,$Hhi
346 || ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo
347 || ADD $Fctxlo,$Fhi,$Fhi ; carry into f.hi
348 ||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
349 ||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN]
350 ADD $Gctxlo,$Ghi,$Ghi ; carry into g.hi
351 ||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
352 ||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN]
353 ADD $Hctxlo,$Hhi,$Hhi ; carry into h.hi
354 ||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
355 ||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN]
356 ;;===== branch to outerloop? is taken here
;; Epilogue: store the last context word, release the stack buffer, reload
;; saved registers, restore FP and clear AMR.
;; NOTE(review): the instructions that set B0 before the ADD and the MVC are
;; not visible in this excerpt, nor are the remaining reloads and the return
;; branch -- confirm against the full file.
358 STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN] ; h was not stored before the loop branch
359 || STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
361 ADD FP,B0,SP ; destroy circular buffer
362 || LDDW *FP[-4],A11:A10 ; reload A11:A10
364 || LDDW *FP[-2],B11:B10 ; reload B11:B10
367 LDW *++SP(40),FP ; restore frame pointer
369 MVC B0,AMR ; clear AMR
370 NOP 2 ; wait till FP is committed
;; K512 round-constant table: 80 64-bit constants, each emitted as two 32-bit
;; words with the high word first (two constants per line, 640 bytes total).
373 .sect ".const:sha_asm"
376 .uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
377 .uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
378 .uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
379 .uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
380 .uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
381 .uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
382 .uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
383 .uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
384 .uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
385 .uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
386 .uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
387 .uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
388 .uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
389 .uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
390 .uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
391 .uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
392 .uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
393 .uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
394 .uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
395 .uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
396 .uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
397 .uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
398 .uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
399 .uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
400 .uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
401 .uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
402 .uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
403 .uword 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
404 .uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
405 .uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
406 .uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
407 .uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
408 .uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
409 .uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
410 .uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
411 .uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
412 .uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
413 .uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
414 .uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
415 .uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
416 .cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"