2 # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
21 # Performance is just below 10 cycles per processed byte, which is
22 # almost 40% faster than compiler-generated code. Unroll is unlikely
23 # to give more than ~8% improvement...
25 # !!! Note that this module uses AMR, which means that all interrupt
26 # service routines are expected to preserve it and for own well-being
# Last command-line argument is the output file; redirect STDOUT to it.
29 $output = pop and open STDOUT,">$output";

# void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num);
31 ($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments

# SHA-256 working state and temporaries are split between the A and B
# register files: a/b/c/d plus the "a-side" temporaries (S0, t0a..t2a)
# live in A16-A31, e/f/g/h plus the "e-side" temporaries (S1, t0e..t2e)
# in B16-B31, so the two round halves can issue in parallel (note the
# pervasive || parallel bars in the code below).
34 ($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)

36 ($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)

# Pointers into the message-schedule buffer, addressed circularly via AMR.
39 ($Xia,$Xib)=("A5","B5"); # circular/ring buffer

# $Xn/$X0: next/current schedule word; $K: current round constant.
42 ($Xn,$X0,$K)=("B7","B8","B9");
43 ($Maj,$Ch)=($T2,"B6");
; Older TI assemblers expect a leading-underscore symbol name; alias it.
48	.if	.ASSEMBLER_VERSION<7000000
53	.asg	sha256_block_data_order,_sha256_block_data_order
65	.global	_sha256_block_data_order
66_sha256_block_data_order:
68	.asmfunc stack_usage(64)
	; Entire prologue is predicated on A0 = $NUM so that a zero block
	; count falls through to the BNOP RA early return with no side effects.
69	MV	$NUM,A0			; reassign $NUM
71  [!A0]	BNOP	RA			; if ($NUM==0) return;
72||| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
	; ADDKPC captures the PC-relative anchor __sha256_block in B2;
	; it is later added to the K256 displacement to form an absolute pointer.
74|| [A0]	ADDKPC	__sha256_block,B2
75||| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
	; Two MVKL/MVKH address-materialization variants appear below
	; (PCR_OFFSET vs. plain difference); the conditional selection lines
	; are omitted from this excerpt -- presumably only one pair is emitted
	; per build configuration. TODO(review): confirm against full source.
78|| [A0]	MVKL	\$PCR_OFFSET(K256,__sha256_block),$K256
80|| [A0]	MVKH	\$PCR_OFFSET(K256,__sha256_block),$K256
83|| [A0]	MVKL	(K256-__sha256_block),$K256
85|| [A0]	MVKH	(K256-__sha256_block),$K256
	; AMR (addressing mode register) turns $Xia/$Xib into circular
	; pointers over the schedule ring buffer; per the header note, ISRs
	; are expected to preserve AMR.
87| [A0]	MVC	B1,AMR			; setup circular addressing
90|| [A0]	ADD	B2,$K256,$K256
91|| [A0]	MV	$CTXA,$CTXB		; ctx pointer copy for B-side loads
92|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
	; Load the eight 32-bit state words; intermediate loads ([1]..[6])
	; are omitted from this excerpt.
93	LDW	*${CTXA}[0],$A		; load ctx
100||	LDW	*${CTXB}[7],$H
	; LDNW: input may be unaligned (non-aligned word load).
102	LDNW	*$INP++,$Xn		; pre-fetch input
103	LDW	*$K256++,$K		; pre-fetch K256[0]
104	MVK	14,B0			; loop counters
	; Rounds 0..14: hardware software-pipelined loop (SPLOOP buffer),
	; initiation interval 8 cycles. No message-schedule expansion yet --
	; X[i] comes straight from the input block. Many scheduled
	; instructions between the visible lines are omitted in this excerpt.
119	SPLOOPD	8			; BODY_00_14
133||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
135||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
137||	ADD	$K,$H,$T1		; T1 = h + K256[i]
138	ADD	$X0,$T1,$T1		; T1 += X[i];
142	XOR	$t1a,$S0,$S0		; Sigma0(a)
143||	XOR	$t1e,$S1,$S1		; Sigma1(e)
144||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
145||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
146	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
147||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
148||	ROTL	$G,0,$H			; h = g
	; State rotation a..h -> b..a; the b/c/f/g moves are omitted here.
154||	ADD	$D,$T1,$E		; e = d + T1
158||	ADD	$T1,$T2,$A		; a = T1 + T2
	; Round 15: same round function as rounds 0..14, but additionally
	; primes the modulo-scheduled pipeline for the message-schedule
	; expansion used by rounds 16..63 (loads X[i+1]/X[i+9]/X[i+14] and
	; starts the sigma0/sigma1 rotate/shift chains).
161	ROTL	$A,30,$S0		; BODY_15
167||	LDW	*${Xib}[1],$Xn		; modulo-scheduled
171||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
172||	LDW	*${Xib}[2],$X1		; modulo-scheduled
174||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
176||	ADD	$K,$H,$T1		; T1 = h + K256[i]
177	ADD	$X0,$T1,$T1		; T1 += X[i];
181	XOR	$t1a,$S0,$S0		; Sigma0(a)
182||	XOR	$t1e,$S1,$S1		; Sigma1(e)
183||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
184||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
185	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
186||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
187||	ROTL	$G,0,$H			; h = g
191||	ADD	$D,$T1,$E		; e = d + T1
193||	MV	$Xn,$X0			; modulo-scheduled
194||	LDW	*$Xia,$X9		; modulo-scheduled
	; sigma0(x) = ROTR(x,7)^ROTR(x,18)^SHR(x,3): ROTL 25 == ROTR 7.
195||	ROTL	$X1,25,$t0e		; modulo-scheduled
	; sigma1(x) = ROTR(x,17)^ROTR(x,19)^SHR(x,10): ROTL 15 == ROTR 17.
196||	ROTL	$X14,15,$t0a		; modulo-scheduled
197	SHRU	$X1,3,$s0		; modulo-scheduled
198||	SHRU	$X14,10,$s1		; modulo-scheduled
199||	ROTL	$B,0,$C			; c = b
201||	ADD	$T1,$T2,$A		; a = T1 + T2
	; Rounds 16..63: pipelined loop (ii=10) combining the round function
	; with on-the-fly message-schedule expansion:
	;   X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
	; (written below in look-ahead form: X[i] += X[i+9], sigma0(X[i+1]),
	; sigma1(X[i+14]) relative to the circular buffer pointers).
203	SPLOOPD	10			; BODY_16_63
205||	ROTL	$X1,14,$t1e		; modulo-scheduled
206||	ROTL	$X14,13,$t1a		; modulo-scheduled
212	XOR	$t1e,$s0,$s0		; sigma0(X[i+1])
213||	XOR	$t1a,$s1,$s1		; sigma1(X[i+14])
214||	LDW	*${Xib}[2],$X1		; modulo-scheduled
221||	ADD	$X9,$X0,$X0		; X[i] += X[i+9]
225||	XOR	$t2e,$Ch,$Ch		; Ch(e,f,g) = (e&f)^(~e&g)
226||	ADD	$s0,$X0,$X0		; X[i] += sigma0(X[i+1])
228||	OR	$t2a,$Maj,$Maj		; Maj(a,b,c) = ((a|b)&c)|(a&b)
230||	ADD	$H,$K,$T1		; T1 = h + K256[i]
231||	ADD	$s1,$X0,$X0		; X[i] += sigma1(X[i+14])
234||	ADD	$X0,$T1,$T1		; T1 += X[i]
236	XOR	$t1a,$S0,$S0		; Sigma0(a)
237||	XOR	$t1e,$S1,$S1		; Sigma1(e)
238||	ADD	$Ch,$T1,$T1		; T1 += Ch(e,f,g)
240||	ROTL	$G,0,$H			; h = g
241||	LDW	*$K256++,$K		; pre-fetch K256[i+1]
242	ADD	$S1,$T1,$T1		; T1 += Sigma1(e)
243||	ADD	$S0,$Maj,$T2		; T2 = Sigma0(a) + Maj(a,b,c)
245||	MV	$Xn,$X0			; modulo-scheduled
246||	LDW	*++$Xia,$X9		; modulo-scheduled
247||	ROTL	$X1,25,$t0e		; modulo-scheduled
248||	ROTL	$X14,15,$t0a		; modulo-scheduled
249	ROTL	$X1,14,$t1e		; modulo-scheduled
250||	ROTL	$X14,13,$t1a		; modulo-scheduled
252||	ADD	$D,$T1,$E		; e = d + T1
256||	ADD	$T1,$T2,$A		; a = T1 + T2
257||	SHRU	$X1,3,$s0		; modulo-scheduled
258||	SHRU	$X14,10,$s1		; modulo-scheduled
	; Outer-loop tail: predicated on A0 (remaining block count).
	; [A0] path: accumulate state into ctx copies and restart the next
	; block (rewind K256, pre-fetch input and K256[0]).
	; [!A0] path: final block done -- store state, restore SP/FP/AMR.
262|| [A0]	LDNW	*$INP++,$Xn		; pre-fetch input
	; -260 = 64 constants * 4 bytes + the extra pre-fetch increment --
	; TODO(review): confirm against the omitted pre-fetch scheduling.
263|| [A0]	ADDK	-260,$K256		; rewind K256
264||	ADD	$Actx,$A,$A		; accumulate ctx
272|| [A0]	LDW	*$K256++,$K		; pre-fetch K256[0]
275|||[!A0]	MV	$CTXA,$CTXB		; B-side ctx pointer for stores
276 [!A0]	MV	FP,SP			; restore stack pointer
277||[!A0]	LDW	*FP[0],FP		; restore frame pointer
278 [!A0]	STW	$A,*${CTXA}[0]		; save ctx
279||[!A0]	STW	$E,*${CTXB}[4]
281 [!A0]	STW	$B,*${CTXA}[1]
282||[!A0]	STW	$F,*${CTXB}[5]
	; Restore default (linear) addressing before returning, since the
	; function changed AMR for circular buffer addressing.
283||[!A0]	MVC	B0,AMR			; clear AMR
285||	STW	$G,*${CTXB}[6]
287||	STW	$H,*${CTXB}[7]
291	.sect	".text:sha_asm.const"
293	.sect	".const:sha_asm"
	; K256[0..63]: the SHA-256 round constants (FIPS 180-4) -- first 32
	; bits of the fractional parts of the cube roots of the first 64 primes.
297	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
298	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
299	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
300	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
301	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
302	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
303	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
304	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
305	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
306	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
307	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
308	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
309	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
310	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
311	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
312	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
313	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"