3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Performance is just below 10 cycles per processed byte, which is
15 # almost 40% faster than compiler-generated code. Unroll is unlikely
16 # to give more than ~8% improvement...
18 # !!! Note that this module uses AMR, which means that all interrupt
19 # service routines are expected to preserve it and for own well-being
# zero it upon entry.
# Scan the argument list for the output file name: the last argument that
# looks like "name.ext" wins; anything else (compiler-style flags) is skipped.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to the output file so all generated assembly lands there.
# Fail loudly instead of silently generating nothing if it can't be opened.
open STDOUT,">$output" or die "can't open $output: $!";
# Register assignments. Arguments arrive in A4/B4/A6; the SHA-256 state is
# split between the A- and B-side register files of the C64x+.
25 ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
# A-side half of the working state (a..d), its ctx copies, temporaries and
# two message-schedule words.  NOTE(review): the right-hand side of this
# assignment is not visible in this chunk.
28 ($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
# B-side half of the working state (e..h), its ctx copies, temporaries and
# two message-schedule words.  NOTE(review): right-hand side not visible here.
30 ($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
33 ($Xia,$Xib)=("A5","B5"); # circular/ring buffer
# Xn = pre-fetched next input word, X0 = current X[i], K = current round constant.
36 ($Xn,$X0,$K)=("B7","B8","B9");
37 ($Maj,$Ch)=($T2,"B6");
; void sha256_block_data_order(SHA256_CTX *ctx, const void *inp, size_t num);
; Entry: A4 = ctx, B4 = inp, A6 = num (count of 64-byte blocks).
; All prologue instructions are predicated on A0 (= num), so num==0 falls
; straight through the BNOP RA early return without touching any state.
54 .global _sha256_block_data_order
55 _sha256_block_data_order:
56 .asmfunc stack_usage(64)
57 MV $NUM,A0 ; reassign $NUM
59 [!A0] BNOP RA ; if ($NUM==0) return;
60 || [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
62 [A0] ADDKPC _sha256_block_data_order,B2
63 || [A0] AND B0,SP,SP ; align stack at 64 bytes
; Two alternative ways to materialize the address of the K256 table:
; PC-relative PCR_OFFSET form vs. plain label difference added to the
; ADDKPC result.  NOTE(review): the conditional (ifdef) selecting between
; them is not visible in this chunk.
66 || [A0] MVKL \$PCR_OFFSET(K256,_sha256_block_data_order),$K256
68 || [A0] MVKH \$PCR_OFFSET(K256,_sha256_block_data_order),$K256
71 || [A0] MVKL (K256-_sha256_block_data_order),$K256
73 || [A0] MVKH (K256-_sha256_block_data_order),$K256
; Program AMR for circular addressing of the on-stack ring buffer; per the
; header note, interrupt service routines must preserve AMR.
75 [A0] MVC B1,AMR ; setup circular addressing
78 || [A0] ADD B2,$K256,$K256
79 || [A0] MV $CTXA,$CTXB
80 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
81 LDW *${CTXA}[0],$A ; load ctx
90 LDNW *$INP++,$Xn ; pre-fetch input
91 LDW *$K256++,$K ; pre-fetch K256[0]
92 MVK 14,B0 ; loop counters
; Rounds 0..14: software-pipelined loop (SPLOOPD, 8-cycle initiation
; interval).  Plain SHA-256 round with no message expansion yet; a leading
; "||" issues the instruction in parallel with the preceding one.
107 SPLOOPD 8 ; BODY_00_14
121 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
123 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
125 || ADD $K,$H,$T1 ; T1 = h + K256[i]
126 ADD $X0,$T1,$T1 ; T1 += X[i];
130 XOR $t1a,$S0,$S0 ; Sigma0(a)
131 || XOR $t1e,$S1,$S1 ; Sigma1(e)
132 || LDW *$K256++,$K ; pre-fetch K256[i+1]
133 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
134 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
135 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
136 || ROTL $G,0,$H ; h = g
142 || ADD $D,$T1,$E ; e = d + T1
146 || ADD $T1,$T2,$A ; a = T1 + T2
; Round 15: same round function as rounds 0..14, but additionally primes the
; modulo-scheduled message expansion (sigma0/sigma1 inputs for round 16) from
; the circular buffer addressed through Xia/Xib.
149 ROTL $A,30,$S0 ; BODY_15
155 || LDW *${Xib}[1],$Xn ; modulo-scheduled
159 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
160 || LDW *${Xib}[2],$X1 ; modulo-scheduled
162 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
164 || ADD $K,$H,$T1 ; T1 = h + K256[i]
165 ADD $X0,$T1,$T1 ; T1 += X[i];
169 XOR $t1a,$S0,$S0 ; Sigma0(a)
170 || XOR $t1e,$S1,$S1 ; Sigma1(e)
171 || LDW *$K256++,$K ; pre-fetch K256[i+1]
172 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
173 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
174 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
175 || ROTL $G,0,$H ; h = g
179 || ADD $D,$T1,$E ; e = d + T1
181 || MV $Xn,$X0 ; modulo-scheduled
182 || LDW *$Xia,$X9 ; modulo-scheduled
183 || ROTL $X1,25,$t0e ; modulo-scheduled
184 || ROTL $X14,15,$t0a ; modulo-scheduled
185 SHRU $X1,3,$s0 ; modulo-scheduled
186 || SHRU $X14,10,$s1 ; modulo-scheduled
187 || ROTL $B,0,$C ; c = b
189 || ADD $T1,$T2,$A ; a = T1 + T2
; Rounds 16..63: software-pipelined loop (SPLOOPD, 10-cycle initiation
; interval) interleaving the round function with the message schedule
; X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
; (expressed below relative to the ring buffer as X[i+1]/X[i+9]/X[i+14]).
191 SPLOOPD 10 ; BODY_16_63
193 || ROTL $X1,14,$t1e ; modulo-scheduled
194 || ROTL $X14,13,$t1a ; modulo-scheduled
200 XOR $t1e,$s0,$s0 ; sigma0(X[i+1])
201 || XOR $t1a,$s1,$s1 ; sigma1(X[i+14])
202 || LDW *${Xib}[2],$X1 ; modulo-scheduled
209 || ADD $X9,$X0,$X0 ; X[i] += X[i+9]
213 || XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
214 || ADD $s0,$X0,$X0 ; X[i] += sigma0(X[i+1])
216 || OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
218 || ADD $H,$K,$T1 ; T1 = h + K256[i]
219 || ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
222 || ADD $X0,$T1,$T1 ; T1 += X[i]
224 XOR $t1a,$S0,$S0 ; Sigma0(a)
225 || XOR $t1e,$S1,$S1 ; Sigma1(e)
226 || ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
228 || ROTL $G,0,$H ; h = g
229 || LDW *$K256++,$K ; pre-fetch K256[i+1]
230 ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
231 || ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
233 || MV $Xn,$X0 ; modulo-scheduled
234 || LDW *++$Xia,$X9 ; modulo-scheduled
235 || ROTL $X1,25,$t0e ; modulo-scheduled
236 || ROTL $X14,15,$t0a ; modulo-scheduled
237 ROTL $X1,14,$t1e ; modulo-scheduled
238 || ROTL $X14,13,$t1a ; modulo-scheduled
240 || ADD $D,$T1,$E ; e = d + T1
244 || ADD $T1,$T2,$A ; a = T1 + T2
245 || SHRU $X1,3,$s0 ; modulo-scheduled
246 || SHRU $X14,10,$s1 ; modulo-scheduled
; End of 64-round block: fold the working variables back into ctx.  If more
; blocks remain ([A0] true), pre-fetch the next input word and rewind the
; constant pointer; -260 = 65 words back (64 constants plus the one-past
; pre-fetch done in the last round).  Otherwise ([!A0]) restore SP/FP,
; clear AMR (mandatory before returning to C code) and store the state out.
250 || [A0] LDNW *$INP++,$Xn ; pre-fetch input
251 || [A0] ADDK -260,$K256 ; rewind K256
252 || ADD $Actx,$A,$A ; accumulate ctx
260 || [A0] LDW *$K256++,$K ; pre-fetch K256[0]
263 ||[!A0] MV $CTXA,$CTXB
264 [!A0] MV FP,SP ; restore stack pointer
265 ||[!A0] LDW *FP[0],FP ; restore frame pointer
; State words are stored through both A- and B-side pointers (ctx[0..3] via
; CTXA, ctx[4..7] via CTXB) so the two datapaths can store in parallel.
266 [!A0] STW $A,*${CTXA}[0] ; save ctx
267 ||[!A0] STW $E,*${CTXB}[4]
269 [!A0] STW $B,*${CTXA}[1]
270 ||[!A0] STW $F,*${CTXB}[5]
271 ||[!A0] MVC B0,AMR ; clear AMR
273 || STW $G,*${CTXB}[6]
275 || STW $H,*${CTXB}[7]
278 .sect ".const:sha_asm"
; K256: the 64 SHA-256 round constants -- first 32 bits of the fractional
; parts of the cube roots of the first 64 primes (FIPS 180-4, sec. 4.2.2).
; NOTE(review): the K256 label/alignment lines are not visible in this chunk.
281 .uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
282 .uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
283 .uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
284 .uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
285 .uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
286 .uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
287 .uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
288 .uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
289 .uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
290 .uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
291 .uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
292 .uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
293 .uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
294 .uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
295 .uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
296 .uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
297 .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"