#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
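#
# A minimal sketch (illustration only, not part of this module) of how an
# interrupt service routine could honour that contract: save AMR, run the
# ISR body with linear addressing, and restore AMR before returning. The
# choice of B0/B1 as scratch registers here is arbitrary.
#
#       MVC     AMR,B0          ; save caller's AMR
#       MVK     0,B1
#       MVC     B1,AMR          ; zero AMR for the ISR body
#       ...                     ; ISR body
#       MVC     B0,AMR          ; restore caller's AMR on exit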

while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
 $K256="A3";

($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
        =map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
        =map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");                        # circular/ring buffer
 $CTXB=$t2e;

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");

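# The SHA-256 Sigma/sigma functions are expressed below with left rotates,
# i.e. ROTR(x,n) is coded as ROTL(x,32-n), which gives the rotate counts
# seen in the assembly:
#
#   Sigma0(a) = ROTL(a,30) ^ ROTL(a,19) ^ ROTL(a,10)
#   Sigma1(e) = ROTL(e,26) ^ ROTL(e,21) ^ ROTL(e,7)
#   sigma0(x) = ROTL(x,25) ^ ROTL(x,14) ^ (x >> 3)
#   sigma1(x) = ROTL(x,15) ^ ROTL(x,13) ^ (x >> 10)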
$code.=<<___;
        .text

        .if     .ASSEMBLER_VERSION<7000000
        .asg    0,__TI_EABI__
        .endif
        .if     __TI_EABI__
        .nocmp
        .asg    sha256_block_data_order,_sha256_block_data_order
        .endif

        .asg    B3,RA
        .asg    A15,FP
        .asg    B15,SP

        .if     .BIG_ENDIAN
        .asg    SWAP2,MV
        .asg    SWAP4,MV
        .endif

        .global _sha256_block_data_order
_sha256_block_data_order:
__sha256_block:
        .asmfunc stack_usage(64)
        MV      $NUM,A0                         ; reassign $NUM
||      MVK     -64,B0
  [!A0] BNOP    RA                              ; if ($NUM==0) return;
|| [A0] STW     FP,*SP--[16]                    ; save frame pointer and alloca(64)
|| [A0] MV      SP,FP
   [A0] ADDKPC  __sha256_block,B2
|| [A0] AND     B0,SP,SP                        ; align stack at 64 bytes
        .if     __TI_EABI__
   [A0] MVK     0x00404,B1
|| [A0] MVKL    \$PCR_OFFSET(K256,__sha256_block),$K256
   [A0] MVKH    0x50000,B1
|| [A0] MVKH    \$PCR_OFFSET(K256,__sha256_block),$K256
        .else
   [A0] MVK     0x00404,B1
|| [A0] MVKL    (K256-__sha256_block),$K256
   [A0] MVKH    0x50000,B1
|| [A0] MVKH    (K256-__sha256_block),$K256
        .endif
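;       B1 = 0x00050404: A5 and B5 (the ring-buffer pointers) are put in
;       circular addressing mode through BK0, block size 2^(5+1) = 64 bytes.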
   [A0] MVC     B1,AMR                          ; setup circular addressing
|| [A0] MV      SP,$Xia
   [A0] MV      SP,$Xib
|| [A0] ADD     B2,$K256,$K256
|| [A0] MV      $CTXA,$CTXB
|| [A0] SUBAW   SP,2,SP                         ; reserve two words above buffer
        LDW     *${CTXA}[0],$A                  ; load ctx
||      LDW     *${CTXB}[4],$E
        LDW     *${CTXA}[1],$B
||      LDW     *${CTXB}[5],$F
        LDW     *${CTXA}[2],$C
||      LDW     *${CTXB}[6],$G
        LDW     *${CTXA}[3],$D
||      LDW     *${CTXB}[7],$H

        LDNW    *$INP++,$Xn                     ; pre-fetch input
        LDW     *$K256++,$K                     ; pre-fetch K256[0]
        MVK     14,B0                           ; loop counters
        MVK     47,B1
||      ADDAW   $Xia,9,$Xia
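;       rounds 0-14 run in the first software-pipelined loop (counter B0),
;       round 15 is peeled below as BODY_15, rounds 16-63 run in the
;       second software-pipelined loop (counter B1)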
outerloop?:
        SUB     A0,1,A0
||      MV      $A,$Actx
||      MV      $E,$Ectx
||      MVD     $B,$Bctx
||      MVD     $F,$Fctx
        MV      $C,$Cctx
||      MV      $G,$Gctx
||      MVD     $D,$Dctx
||      MVD     $H,$Hctx
||      SWAP4   $Xn,$X0

        SPLOOPD 8                               ; BODY_00_14
||      MVC     B0,ILC
||      SWAP2   $X0,$X0

        LDNW    *$INP++,$Xn
||      ROTL    $A,30,$S0
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $K,$H,$T1                       ; T1 = h + K256[i]
        ADD     $X0,$T1,$T1                     ; T1 += X[i];
||      STW     $X0,*$Xib++
||      XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
        XOR     $t1a,$S0,$S0                    ; Sigma0(a)
||      XOR     $t1e,$S1,$S1                    ; Sigma1(e)
||      LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
||      ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
        ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
||      ROTL    $G,0,$H                         ; h = g
||      MV      $F,$G                           ; g = f
||      MV      $X0,$X14
||      SWAP4   $Xn,$X0
        SWAP2   $X0,$X0
||      MV      $E,$F                           ; f = e
||      ADD     $D,$T1,$E                       ; e = d + T1
||      MV      $C,$D                           ; d = c
        MV      $B,$C                           ; c = b
||      MV      $A,$B                           ; b = a
||      ADD     $T1,$T2,$A                      ; a = T1 + T2
        SPKERNEL

        ROTL    $A,30,$S0                       ; BODY_15
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
||      LDW     *${Xib}[1],$Xn                  ; modulo-scheduled
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
||      LDW     *${Xib}[2],$X1                  ; modulo-scheduled
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $K,$H,$T1                       ; T1 = h + K256[i]
        ADD     $X0,$T1,$T1                     ; T1 += X[i];
||      STW     $X0,*$Xib++
||      XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
        XOR     $t1a,$S0,$S0                    ; Sigma0(a)
||      XOR     $t1e,$S1,$S1                    ; Sigma1(e)
||      LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
||      ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
        ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
||      ROTL    $G,0,$H                         ; h = g
||      MV      $F,$G                           ; g = f
||      MV      $X0,$X15
        MV      $E,$F                           ; f = e
||      ADD     $D,$T1,$E                       ; e = d + T1
||      MV      $C,$D                           ; d = c
||      MV      $Xn,$X0                         ; modulo-scheduled
||      LDW     *$Xia,$X9                       ; modulo-scheduled
||      ROTL    $X1,25,$t0e                     ; modulo-scheduled
||      ROTL    $X14,15,$t0a                    ; modulo-scheduled
        SHRU    $X1,3,$s0                       ; modulo-scheduled
||      SHRU    $X14,10,$s1                     ; modulo-scheduled
||      ROTL    $B,0,$C                         ; c = b
||      MV      $A,$B                           ; b = a
||      ADD     $T1,$T2,$A                      ; a = T1 + T2

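;       Message schedule, relative to the 16-word ring buffer: with i the
;       slot being recomputed,
;       X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]),
;       i.e. the standard recurrence
;       W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16].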
        SPLOOPD 10                              ; BODY_16_63
||      MVC     B1,ILC
||      ROTL    $X1,14,$t1e                     ; modulo-scheduled
||      ROTL    $X14,13,$t1a                    ; modulo-scheduled

        XOR     $t0e,$s0,$s0
||      XOR     $t0a,$s1,$s1
||      MV      $X15,$X14
||      MV      $X1,$Xn
        XOR     $t1e,$s0,$s0                    ; sigma0(X[i+1])
||      XOR     $t1a,$s1,$s1                    ; sigma1(X[i+14])
||      LDW     *${Xib}[2],$X1                  ; modulo-scheduled
        ROTL    $A,30,$S0
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
||      ADD     $X9,$X0,$X0                     ; X[i] += X[i+9]
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
||      ADD     $s0,$X0,$X0                     ; X[i] += sigma0(X[i+1])
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $H,$K,$T1                       ; T1 = h + K256[i]
||      ADD     $s1,$X0,$X0                     ; X[i] += sigma1(X[i+14])
        XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
||      ADD     $X0,$T1,$T1                     ; T1 += X[i]
||      STW     $X0,*$Xib++
        XOR     $t1a,$S0,$S0                    ; Sigma0(a)
||      XOR     $t1e,$S1,$S1                    ; Sigma1(e)
||      ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
||      MV      $X0,$X15
||      ROTL    $G,0,$H                         ; h = g
||      LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
        ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
||      MV      $F,$G                           ; g = f
||      MV      $Xn,$X0                         ; modulo-scheduled
||      LDW     *++$Xia,$X9                     ; modulo-scheduled
||      ROTL    $X1,25,$t0e                     ; modulo-scheduled
||      ROTL    $X14,15,$t0a                    ; modulo-scheduled
        ROTL    $X1,14,$t1e                     ; modulo-scheduled
||      ROTL    $X14,13,$t1a                    ; modulo-scheduled
||      MV      $E,$F                           ; f = e
||      ADD     $D,$T1,$E                       ; e = d + T1
||      MV      $C,$D                           ; d = c
||      MV      $B,$C                           ; c = b
        MV      $A,$B                           ; b = a
||      ADD     $T1,$T2,$A                      ; a = T1 + T2
||      SHRU    $X1,3,$s0                       ; modulo-scheduled
||      SHRU    $X14,10,$s1                     ; modulo-scheduled
        SPKERNEL

   [A0] B       outerloop?
|| [A0] LDNW    *$INP++,$Xn                     ; pre-fetch input
|| [A0] ADDK    -260,$K256                      ; rewind K256
||      ADD     $Actx,$A,$A                     ; accumulate ctx
||      ADD     $Ectx,$E,$E
||      ADD     $Bctx,$B,$B
        ADD     $Fctx,$F,$F
||      ADD     $Cctx,$C,$C
||      ADD     $Gctx,$G,$G
||      ADD     $Dctx,$D,$D
||      ADD     $Hctx,$H,$H
|| [A0] LDW     *$K256++,$K                     ; pre-fetch K256[0]

  [!A0] BNOP    RA
||[!A0] MV      $CTXA,$CTXB
  [!A0] MV      FP,SP                           ; restore stack pointer
||[!A0] LDW     *FP[0],FP                       ; restore frame pointer
  [!A0] STW     $A,*${CTXA}[0]                  ; save ctx
||[!A0] STW     $E,*${CTXB}[4]
||[!A0] MVK     0,B0
  [!A0] STW     $B,*${CTXA}[1]
||[!A0] STW     $F,*${CTXB}[5]
||[!A0] MVC     B0,AMR                          ; clear AMR
        STW     $C,*${CTXA}[2]
||      STW     $G,*${CTXB}[6]
        STW     $D,*${CTXA}[3]
||      STW     $H,*${CTXB}[7]
        .endasmfunc

        .if     __TI_EABI__
        .sect   ".text:sha_asm.const"
        .else
        .sect   ".const:sha_asm"
        .endif
        .align  128
K256:
        .uword  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
        .uword  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
        .uword  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
        .uword  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
        .uword  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
        .uword  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
        .uword  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
        .uword  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
        .uword  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
        .uword  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
        .uword  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
        .uword  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
        .uword  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
        .uword  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
        .uword  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
        .uword  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
        .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
        .align  4

___

print $code;
close STDOUT;