#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
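#
# A minimal ISR prologue/epilogue sketch (the use of B0/B1 as scratch
# registers is an illustrative assumption, not part of this module):
#
#	MVC	AMR,B1		; save caller's AMR
#	MVK	0,B0
#	MVC	B0,AMR		; zero AMR, i.e. plain linear addressing
#	...			; ISR body
#	MVC	B1,AMR		; restore AMR before returning
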
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

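# Usage sketch (inferred from the argument filter above; the file name
# is illustrative): arguments are consumed until one that looks like a
# file name is found, and that argument becomes the output, e.g.
#
#	perl sha256-c64xplus.pl sha256-c64xplus.S
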
($CTXA,$INP,$NUM) = ("A4","B4","A6");		# arguments
 $K256="A3";

($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
	=map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
	=map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");			# circular/ring buffer
 $CTXB=$t2e;

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");

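# Boolean identities used in the loop bodies below (standard algebra,
# restated here for reference):
#
#	Ch(e,f,g)  = (e&f)^(~e&g)
#	Maj(a,b,c) = (a&b)^(a&c)^(b&c) = ((a|b)&c)|(a&b)
#
# The second form of Maj saves an instruction: (a&b) covers the case
# a=b=1, while ((a|b)&c) covers every case where c and at least one of
# a,b are set.
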
$code.=<<___;
	.text
	.if	__TI_EABI__
	.nocmp
	.asg	sha256_block_data_order,_sha256_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	SWAP2,MV
	.asg	SWAP4,MV
	.endif

	.global	_sha256_block_data_order
_sha256_block_data_order:
	.asmfunc stack_usage(64)
	MV	$NUM,A0				; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA				; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]			; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	ADDKPC	_sha256_block_data_order,B2
|| [A0]	AND	B0,SP,SP			; align stack at 64 bytes
	.if	__TI_EABI__
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	\$PCR_OFFSET(K256,_sha256_block_data_order),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	\$PCR_OFFSET(K256,_sha256_block_data_order),$K256
	.else
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	(K256-_sha256_block_data_order),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	(K256-_sha256_block_data_order),$K256
	.endif
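;; B1=0x00050404 annotated (an explanatory note added per TI's AMR
;; layout): A5 and B5 are selected for circular addressing via BK0,
;; and BK0=5 sets the block size to 2^(5+1)=64 bytes, matching the
;; 16-word X[] ring buffer carved out on the stack below.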
   [A0]	MVC	B1,AMR				; setup circular addressing
|| [A0]	MV	SP,$Xia
   [A0]	MV	SP,$Xib
|| [A0]	ADD	B2,$K256,$K256
|| [A0]	MV	$CTXA,$CTXB
|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
	LDW	*${CTXA}[0],$A			; load ctx
||	LDW	*${CTXB}[4],$E
	LDW	*${CTXA}[1],$B
||	LDW	*${CTXB}[5],$F
	LDW	*${CTXA}[2],$C
||	LDW	*${CTXB}[6],$G
	LDW	*${CTXA}[3],$D
||	LDW	*${CTXB}[7],$H

	LDNW	*$INP++,$Xn			; pre-fetch input
	LDW	*$K256++,$K			; pre-fetch K256[0]
	MVK	14,B0				; loop counters
	MVK	47,B1
||	ADDAW	$Xia,9,$Xia
outerloop?:
	SUB	A0,1,A0
||	MV	$A,$Actx
||	MV	$E,$Ectx
||	MVD	$B,$Bctx
||	MVD	$F,$Fctx
	MV	$C,$Cctx
||	MV	$G,$Gctx
||	MVD	$D,$Dctx
||	MVD	$H,$Hctx
||	SWAP4	$Xn,$X0

	SPLOOPD	8				; BODY_00_14
||	MVC	B0,ILC
||	SWAP2	$X0,$X0

	LDNW	*$INP++,$Xn
||	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
	ADD	$X0,$T1,$T1			; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H				; h = g
||	MV	$F,$G				; g = f
||	MV	$X0,$X14
||	SWAP4	$Xn,$X0
	SWAP2	$X0,$X0
||	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
	MV	$B,$C				; c = b
||	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2
	SPKERNEL

	ROTL	$A,30,$S0			; BODY_15
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	LDW	*${Xib}[1],$Xn			; modulo-scheduled
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
	ADD	$X0,$T1,$T1			; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H				; h = g
||	MV	$F,$G				; g = f
||	MV	$X0,$X15
	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
||	MV	$Xn,$X0				; modulo-scheduled
||	LDW	*$Xia,$X9			; modulo-scheduled
||	ROTL	$X1,25,$t0e			; modulo-scheduled
||	ROTL	$X14,15,$t0a			; modulo-scheduled
	SHRU	$X1,3,$s0			; modulo-scheduled
||	SHRU	$X14,10,$s1			; modulo-scheduled
||	ROTL	$B,0,$C				; c = b
||	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2

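;; Message-schedule note (restating the offsets used below): in the
;; 16-word ring buffer, W[i-15], W[i-7] and W[i-2] live at X[i+1],
;; X[i+9] and X[i+14], so the recurrence being computed is
;; W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] + sigma1(W[i-2]).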
	SPLOOPD	10				; BODY_16_63
||	MVC	B1,ILC
||	ROTL	$X1,14,$t1e			; modulo-scheduled
||	ROTL	$X14,13,$t1a			; modulo-scheduled

	XOR	$t0e,$s0,$s0
||	XOR	$t0a,$s1,$s1
||	MV	$X15,$X14
||	MV	$X1,$Xn
	XOR	$t1e,$s0,$s0			; sigma0(X[i+1])
||	XOR	$t1a,$s1,$s1			; sigma1(X[i+14])
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	ADD	$X9,$X0,$X0			; X[i] += X[i+9]
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
||	ADD	$s0,$X0,$X0			; X[i] += sigma0(X[i+1])
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$H,$K,$T1			; T1 = h + K256[i]
||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
||	ADD	$X0,$T1,$T1			; T1 += X[i]
||	STW	$X0,*$Xib++
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
||	MV	$X0,$X15
||	ROTL	$G,0,$H				; h = g
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	MV	$F,$G				; g = f
||	MV	$Xn,$X0				; modulo-scheduled
||	LDW	*++$Xia,$X9			; modulo-scheduled
||	ROTL	$X1,25,$t0e			; modulo-scheduled
||	ROTL	$X14,15,$t0a			; modulo-scheduled
	ROTL	$X1,14,$t1e			; modulo-scheduled
||	ROTL	$X14,13,$t1a			; modulo-scheduled
||	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
||	MV	$B,$C				; c = b
	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2
||	SHRU	$X1,3,$s0			; modulo-scheduled
||	SHRU	$X14,10,$s1			; modulo-scheduled
	SPKERNEL

   [A0]	B	outerloop?
|| [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
|| [A0]	ADDK	-260,$K256			; rewind K256
||	ADD	$Actx,$A,$A			; accumulate ctx
||	ADD	$Ectx,$E,$E
||	ADD	$Bctx,$B,$B
	ADD	$Fctx,$F,$F
||	ADD	$Cctx,$C,$C
||	ADD	$Gctx,$G,$G
||	ADD	$Dctx,$D,$D
||	ADD	$Hctx,$H,$H
|| [A0]	LDW	*$K256++,$K			; pre-fetch K256[0]

  [!A0]	BNOP	RA
||[!A0]	MV	$CTXA,$CTXB
  [!A0]	MV	FP,SP				; restore stack pointer
||[!A0]	LDW	*FP[0],FP			; restore frame pointer
  [!A0]	STW	$A,*${CTXA}[0]			; save ctx
||[!A0]	STW	$E,*${CTXB}[4]
||[!A0]	MVK	0,B0
  [!A0]	STW	$B,*${CTXA}[1]
||[!A0]	STW	$F,*${CTXB}[5]
||[!A0]	MVC	B0,AMR				; clear AMR
	STW	$C,*${CTXA}[2]
||	STW	$G,*${CTXB}[6]
	STW	$D,*${CTXA}[3]
||	STW	$H,*${CTXB}[7]
	.endasmfunc

	.if	__TI_EABI__
	.sect	".text:sha_asm.const"
	.else
	.sect	".const:sha_asm"
	.endif
	.align	128
K256:
	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4

___

print $code;
close STDOUT;