3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # If compared to compiler-generated code with similar characteristics,
15 # i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
16 # this implementation is 25% smaller and >2x faster. In absolute terms
17 # performance is (quite impressive) ~6.5 cycles per processed byte.
18 # Fully unrolled assembler would be ~5x larger and is likely to be
19 # ~15% faster. It would be free from references to intermediate ring
20 # buffer, but put more pressure on L1P [both because the code would be
21 # larger and won't be using SPLOOP buffer]. There are no plans to
22 # realize fully unrolled variant though...
24 # !!! Note that this module uses AMR, which means that all interrupt
25 # service routines are expected to preserve it and, for their own well-being,
# Skip leading flag-style command-line arguments until the first one that
# looks like an output filename (word chars, then a dot-extension); all
# generated assembly is then written to that file via redirected STDOUT.
28 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
29 open STDOUT,">$output";
# Register allocation for the generated C64x+ code. Names are interpolated
# into the assembly heredocs below; A-side vs B-side registers are paired so
# parallel (||) instruction packets can issue on both datapaths.
# NOTE(review): A4/B4/A6 as the first three arguments matches the comment
# below — presumably the TI C calling convention; confirm against the ABI.
31 ($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments
# Working SHA-1 state (A..E), rotated A, round-function temporaries and the
# round constant K.
33 ($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
# Selected message-schedule words Xi fetched ahead from the ring buffer.
34 ($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
# Byte-swap / Xupdate pipeline temporaries.
35 ($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
36 ($XPA,$XPB) = ("A5","B5"); # X circular buffer
# Saved input chaining values; reuses A6 so this overwrites $NUM, as noted.
37 ($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
42 .if .ASSEMBLER_VERSION<7000000
46 .asg sha1_block_data_order,_sha1_block_data_order
58 .global _sha1_block_data_order
59 _sha1_block_data_order:
60 .asmfunc stack_usage(64)
61 MV $NUM,A0 ; reassign $NUM
63 [!A0] BNOP RA ; if ($NUM==0) return;
64 || [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
66 [A0] LDW *${CTX}[0],$A ; load A-E...
67 || [A0] AND B0,SP,SP ; align stack at 64 bytes
68 [A0] LDW *${CTX}[1],$B
69 || [A0] SUBAW SP,2,SP ; reserve two words above buffer
70 [A0] LDW *${CTX}[2],$C
71 || [A0] MVK 0x00404,B0
72 [A0] LDW *${CTX}[3],$D
73 || [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB]
74 [A0] LDW *${CTX}[4],$E
75 || [A0] MVC B0,AMR ; setup circular addressing
76 LDNW *${INP}++,$TX1 ; pre-fetch input
84 MVKH 0x5a820000,$K ; K_00_19
88 ;;==================================================
89 SPLOOPD 5 ; BODY_00_13
98 || ADD $K,$E,$T ; T=E+K
100 XOR $F0,$F,$F ; F_00_19(B,C,D)
104 || LDNW *${INP}++,$TX1
106 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
107 || ROTL $B,30,$C ; C=ROL(B,30)
108 || SWAP4 $TX2,$TX3 ; byte swap
110 ADD $Arot,$T,$T ; T+=ROL(A,5)
113 ADD $TX3,$T,$A ; A=T+Xi
114 || STW $TX3,*${XPB}++
116 ;;==================================================
117 ROTL $A,5,$Arot ; BODY_14
120 || ADD $K,$E,$T ; T=E+K
122 XOR $F0,$F,$F ; F_00_19(B,C,D)
126 || LDNW *${INP}++,$TX1
128 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
129 || ROTL $B,30,$C ; C=ROL(B,30)
130 || SWAP4 $TX2,$TX2 ; byte swap
131 || LDW *${XPA}++,$X0 ; fetches from X ring buffer are
132 || LDW *${XPB}[4],$X2 ; 2 iterations ahead
134 ADD $Arot,$T,$T ; T+=ROL(A,5)
136 || LDW *${XPA}[7],$X8
137 || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
140 ADD $TX2,$T,$A ; A=T+Xi
141 || STW $TX2,*${XPB}++
142 ;;==================================================
143 ROTL $A,5,$Arot ; BODY_15
146 || ADD $K,$E,$T ; T=E+K
148 XOR $F0,$F,$F ; F_00_19(B,C,D)
153 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
154 || ROTL $B,30,$C ; C=ROL(B,30)
155 || SWAP4 $TX2,$TX2 ; byte swap
156 || XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead
158 || LDW *${XPB}[4],$X2
160 ADD $Arot,$T,$T ; T+=ROL(A,5)
163 || LDW *${XPA}[7],$X8
164 || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
167 ADD $TX2,$T,$A ; A=T+Xi
168 || STW $TX2,*${XPB}++
169 || XOR $TX0,$TX1,$TX1
171 ;;==================================================
172 SPLOOPD 5 ; BODY_16_19
178 || ADD $K,$E,$T ; T=E+K
179 || ROTL $TX1,1,$TX2 ; Xupdate output
181 XOR $F0,$F,$F ; F_00_19(B,C,D)
185 ADD $F,$T,$T ; T+=F_00_19(B,C,D)
186 || ROTL $B,30,$C ; C=ROL(B,30)
189 || LDW *${XPB}[4],$X2
191 ADD $Arot,$T,$T ; T+=ROL(A,5)
194 || LDW *${XPA}[7],$X8
195 || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
198 ADD $TX2,$T,$A ; A=T+Xi
199 || STW $TX2,*${XPB}++
200 || XOR $TX0,$TX1,$TX1
205 MVKH 0x6ed90000,$K ; K_20_39
209 ;;==================================================
210 SPLOOPD 5 ; BODY_20_39
215 || ADD $K,$E,$T ; T=E+K
216 || ROTL $TX1,1,$TX2 ; Xupdate output
218 XOR $D,$F,$F ; F_20_39(B,C,D)
222 ADD $F,$T,$T ; T+=F_20_39(B,C,D)
223 || ROTL $B,30,$C ; C=ROL(B,30)
226 || LDW *${XPB}[4],$X2
228 ADD $Arot,$T,$T ; T+=ROL(A,5)
231 || LDW *${XPA}[7],$X8
232 || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
235 ADD $TX2,$T,$A ; A=T+Xi
236 || STW $TX2,*${XPB}++ ; last one is redundant
237 || XOR $TX0,$TX1,$TX1
240 $code.=<<___ if (!shift);
242 MVKH 0x8f1b0000,$K ; K_40_59
246 ;;==================================================
247 SPLOOPD 5 ; BODY_40_59
255 || ADD $K,$E,$T ; T=E+K
256 || ROTL $TX1,1,$TX2 ; Xupdate output
258 XOR $F0,$F,$F ; F_40_59(B,C,D)
262 ADD $F,$T,$T ; T+=F_40_59(B,C,D)
263 || ROTL $B,30,$C ; C=ROL(B,30)
266 || LDW *${XPB}[4],$X2
268 ADD $Arot,$T,$T ; T+=ROL(A,5)
271 || LDW *${XPA}[7],$X8
272 || MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
275 ADD $TX2,$T,$A ; A=T+Xi
276 || STW $TX2,*${XPB}++
277 || XOR $TX0,$TX1,$TX1
284 MVKH 0xca620000,$K ; K_60_79
286 &BODY_20_39(-1); # BODY_60_78
288 ;;==================================================
290 || ROTL $A,5,$Arot ; BODY_79
292 || ROTL $TX1,1,$TX2 ; Xupdate output
294 [A0] LDNW *${INP}++,$TX1 ; pre-fetch input
295 || ADD $K,$E,$T ; T=E+K
296 || XOR $D,$F,$F ; F_20_39(B,C,D)
298 ADD $F,$T,$T ; T+=F_20_39(B,C,D)
299 || ADD $Ectx,$D,$E ; E=D,E+=Ectx
300 || ADD $Dctx,$C,$D ; D=C,D+=Dctx
301 || ROTL $B,30,$C ; C=ROL(B,30)
303 ADD $Arot,$T,$T ; T+=ROL(A,5)
304 || ADD $Bctx,$A,$B ; B=A,B+=Bctx
306 ADD $TX2,$T,$A ; A=T+Xi
308 ADD $Actx,$A,$A ; A+=Actx
309 || ADD $Cctx,$C,$C ; C+=Cctx
313 || MV FP,SP ; restore stack pointer
314 || LDW *FP[0],FP ; restore frame pointer
315 STW $A,*${CTX}[0] ; emit A-E...
318 || MVC B0,AMR ; clear AMR
325 .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"