#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.

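# For reference, the loops below compute the FIPS 180-4 round
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
#
# with Ch(e,f,g) = (e&f)^(~e&g), Maj(a,b,c) = ((a|b)&c)|(a&b) (an
# equivalent form of the standard majority function), and the message
# schedule X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# for i>=16, kept in a 16-word ring buffer addressed through AMR.
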
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output" or die "can't open $output: $!";

($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
 $K256="A3";

($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
        =map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
        =map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");                        # circular/ring buffer
 $CTXB=$t2e;

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");

$code.=<<___;
        .text

        .if     .ASSEMBLER_VERSION<7000000
        .asg    0,__TI_EABI__
        .endif
        .if     __TI_EABI__
        .nocmp
        .asg    sha256_block_data_order,_sha256_block_data_order
        .endif

        .asg    B3,RA
        .asg    A15,FP
        .asg    B15,SP

        .if     .BIG_ENDIAN
        .asg    MV,SWAP2
        .asg    MV,SWAP4
        .endif

        .global _sha256_block_data_order
_sha256_block_data_order:
__sha256_block:
        .asmfunc stack_usage(64)
        MV      $NUM,A0                         ; reassign $NUM
||      MVK     -64,B0
  [!A0] BNOP    RA                              ; if ($NUM==0) return;
|| [A0] STW     FP,*SP--[16]                    ; save frame pointer and alloca(64)
|| [A0] MV      SP,FP
   [A0] ADDKPC  __sha256_block,B2
|| [A0] AND     B0,SP,SP                        ; align stack at 64 bytes
        .if     __TI_EABI__
   [A0] MVK     0x00404,B1
|| [A0] MVKL    \$PCR_OFFSET(K256,__sha256_block),$K256
   [A0] MVKH    0x50000,B1
|| [A0] MVKH    \$PCR_OFFSET(K256,__sha256_block),$K256
        .else
   [A0] MVK     0x00404,B1
|| [A0] MVKL    (K256-__sha256_block),$K256
   [A0] MVKH    0x50000,B1
|| [A0] MVKH    (K256-__sha256_block),$K256
        .endif
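; The value built in B1 above, 0x00050404, configures AMR below: per the
; C64x+ AMR field layout, the 2-bit mode fields for A5 and B5 (Xia/Xib)
; select circular addressing through BK0, and BK0 = 5 gives a 2^(5+1) =
; 64-byte block, i.e. the 16-word X[] ring buffer on the 64-byte-aligned
; stack.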
   [A0] MVC     B1,AMR                          ; setup circular addressing
|| [A0] MV      SP,$Xia
   [A0] MV      SP,$Xib
|| [A0] ADD     B2,$K256,$K256
|| [A0] MV      $CTXA,$CTXB
|| [A0] SUBAW   SP,2,SP                         ; reserve two words above buffer
        LDW     *${CTXA}[0],$A                  ; load ctx
||      LDW     *${CTXB}[4],$E
        LDW     *${CTXA}[1],$B
||      LDW     *${CTXB}[5],$F
        LDW     *${CTXA}[2],$C
||      LDW     *${CTXB}[6],$G
        LDW     *${CTXA}[3],$D
||      LDW     *${CTXB}[7],$H

        LDNW    *$INP++,$Xn                     ; pre-fetch input
        LDW     *$K256++,$K                     ; pre-fetch K256[0]
        MVK     14,B0                           ; loop counters
        MVK     47,B1
||      ADDAW   $Xia,9,$Xia
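; B0 and B1 hold the ILC values for the two software-pipelined loops below
; (BODY_00_14 and BODY_16_63); Xia is advanced so that it reads the X[i+9]
; slot of the ring buffer once the message schedule kicks in.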
outerloop?:
        SUB     A0,1,A0
||      MV      $A,$Actx
||      MV      $E,$Ectx
||      MVD     $B,$Bctx
||      MVD     $F,$Fctx
        MV      $C,$Cctx
||      MV      $G,$Gctx
||      MVD     $D,$Dctx
||      MVD     $H,$Hctx
||      SWAP4   $Xn,$X0

        SPLOOPD 8                               ; BODY_00_14
||      MVC     B0,ILC
||      SWAP2   $X0,$X0

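; ROTL by 30/19/10 yields Sigma0(a) = ROR(a,2)^ROR(a,13)^ROR(a,22) and
; ROTL by 26/21/7 yields Sigma1(e) = ROR(e,6)^ROR(e,11)^ROR(e,25), since
; a left rotate by (32-n) is a right rotate by n.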
        LDNW    *$INP++,$Xn
||      ROTL    $A,30,$S0
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $K,$H,$T1                       ; T1 = h + K256[i]
        ADD     $X0,$T1,$T1                     ; T1 += X[i];
||      STW     $X0,*$Xib++
||      XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
        XOR     $t1a,$S0,$S0                    ; Sigma0(a)
||      XOR     $t1e,$S1,$S1                    ; Sigma1(e)
||      LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
||      ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
        ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
||      ROTL    $G,0,$H                         ; h = g
||      MV      $F,$G                           ; g = f
||      MV      $X0,$X14
||      SWAP4   $Xn,$X0
        SWAP2   $X0,$X0
||      MV      $E,$F                           ; f = e
||      ADD     $D,$T1,$E                       ; e = d + T1
||      MV      $C,$D                           ; d = c
        MV      $B,$C                           ; c = b
||      MV      $A,$B                           ; b = a
||      ADD     $T1,$T2,$A                      ; a = T1 + T2
        SPKERNEL

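; Round 15 is handled outside the loop above so that the "modulo-scheduled"
; message-schedule loads and rotates can be primed before the BODY_16_63
; software pipeline below starts.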
        ROTL    $A,30,$S0                       ; BODY_15
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
||      LDW     *${Xib}[1],$Xn                  ; modulo-scheduled
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
||      LDW     *${Xib}[2],$X1                  ; modulo-scheduled
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $K,$H,$T1                       ; T1 = h + K256[i]
        ADD     $X0,$T1,$T1                     ; T1 += X[i];
||      STW     $X0,*$Xib++
||      XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
        XOR     $t1a,$S0,$S0                    ; Sigma0(a)
||      XOR     $t1e,$S1,$S1                    ; Sigma1(e)
||      LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
||      ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
        ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
||      ROTL    $G,0,$H                         ; h = g
||      MV      $F,$G                           ; g = f
||      MV      $X0,$X15
        MV      $E,$F                           ; f = e
||      ADD     $D,$T1,$E                       ; e = d + T1
||      MV      $C,$D                           ; d = c
||      MV      $Xn,$X0                         ; modulo-scheduled
||      LDW     *$Xia,$X9                       ; modulo-scheduled
||      ROTL    $X1,25,$t0e                     ; modulo-scheduled
||      ROTL    $X14,15,$t0a                    ; modulo-scheduled
        SHRU    $X1,3,$s0                       ; modulo-scheduled
||      SHRU    $X14,10,$s1                     ; modulo-scheduled
||      ROTL    $B,0,$C                         ; c = b
||      MV      $A,$B                           ; b = a
||      ADD     $T1,$T2,$A                      ; a = T1 + T2

        SPLOOPD 10                              ; BODY_16_63
||      MVC     B1,ILC
||      ROTL    $X1,14,$t1e                     ; modulo-scheduled
||      ROTL    $X14,13,$t1a                    ; modulo-scheduled

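; sigma0(x) = ROR(x,7)^ROR(x,18)^SHR(x,3) is assembled from ROTL 25, ROTL 14
; and SHRU 3; sigma1(x) = ROR(x,17)^ROR(x,19)^SHR(x,10) from ROTL 15, ROTL 13
; and SHRU 10.  The "X[i] += ..." comments index relative to the oldest word
; of the 16-entry window, so X[i+9], X[i+1] and X[i+14] correspond to the
; standard W[t-7], W[t-15] and W[t-2] schedule terms.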
        XOR     $t0e,$s0,$s0
||      XOR     $t0a,$s1,$s1
||      MV      $X15,$X14
||      MV      $X1,$Xn
        XOR     $t1e,$s0,$s0                    ; sigma0(X[i+1])
||      XOR     $t1a,$s1,$s1                    ; sigma1(X[i+14])
||      LDW     *${Xib}[2],$X1                  ; modulo-scheduled
        ROTL    $A,30,$S0
||      OR      $A,$B,$Maj
||      AND     $A,$B,$t2a
||      ROTL    $E,26,$S1
||      AND     $F,$E,$Ch
||      ANDN    $G,$E,$t2e
||      ADD     $X9,$X0,$X0                     ; X[i] += X[i+9]
        ROTL    $A,19,$t0a
||      AND     $C,$Maj,$Maj
||      ROTL    $E,21,$t0e
||      XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
||      ADD     $s0,$X0,$X0                     ; X[i] += sigma0(X[i+1])
        ROTL    $A,10,$t1a
||      OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
||      ROTL    $E,7,$t1e
||      ADD     $H,$K,$T1                       ; T1 = h + K256[i]
||      ADD     $s1,$X0,$X0                     ; X[i] += sigma1(X[i+14])
        XOR     $t0a,$S0,$S0
||      XOR     $t0e,$S1,$S1
||      ADD     $X0,$T1,$T1                     ; T1 += X[i]
||      STW     $X0,*$Xib++
        XOR     $t1a,$S0,$S0                    ; Sigma0(a)
||      XOR     $t1e,$S1,$S1                    ; Sigma1(e)
||      ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
||      MV      $X0,$X15
||      ROTL    $G,0,$H                         ; h = g
||      LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
        ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
||      ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
||      MV      $F,$G                           ; g = f
||      MV      $Xn,$X0                         ; modulo-scheduled
||      LDW     *++$Xia,$X9                     ; modulo-scheduled
||      ROTL    $X1,25,$t0e                     ; modulo-scheduled
||      ROTL    $X14,15,$t0a                    ; modulo-scheduled
        ROTL    $X1,14,$t1e                     ; modulo-scheduled
||      ROTL    $X14,13,$t1a                    ; modulo-scheduled
||      MV      $E,$F                           ; f = e
||      ADD     $D,$T1,$E                       ; e = d + T1
||      MV      $C,$D                           ; d = c
||      MV      $B,$C                           ; c = b
        MV      $A,$B                           ; b = a
||      ADD     $T1,$T2,$A                      ; a = T1 + T2
||      SHRU    $X1,3,$s0                       ; modulo-scheduled
||      SHRU    $X14,10,$s1                     ; modulo-scheduled
        SPKERNEL

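; Davies-Meyer feed-forward: the saved ctx values are added back into the
; working variables before the next block is processed (or before the final
; state is stored below).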
   [A0] B       outerloop?
|| [A0] LDNW    *$INP++,$Xn                     ; pre-fetch input
|| [A0] ADDK    -260,$K256                      ; rewind K256
||      ADD     $Actx,$A,$A                     ; accumulate ctx
||      ADD     $Ectx,$E,$E
||      ADD     $Bctx,$B,$B
        ADD     $Fctx,$F,$F
||      ADD     $Cctx,$C,$C
||      ADD     $Gctx,$G,$G
||      ADD     $Dctx,$D,$D
||      ADD     $Hctx,$H,$H
|| [A0] LDW     *$K256++,$K                     ; pre-fetch K256[0]

  [!A0] BNOP    RA
||[!A0] MV      $CTXA,$CTXB
  [!A0] MV      FP,SP                           ; restore stack pointer
||[!A0] LDW     *FP[0],FP                       ; restore frame pointer
  [!A0] STW     $A,*${CTXA}[0]                  ; save ctx
||[!A0] STW     $E,*${CTXB}[4]
||[!A0] MVK     0,B0
  [!A0] STW     $B,*${CTXA}[1]
||[!A0] STW     $F,*${CTXB}[5]
||[!A0] MVC     B0,AMR                          ; clear AMR
        STW     $C,*${CTXA}[2]
||      STW     $G,*${CTXB}[6]
        STW     $D,*${CTXA}[3]
||      STW     $H,*${CTXB}[7]
        .endasmfunc

        .if     __TI_EABI__
        .sect   ".text:sha_asm.const"
        .else
        .sect   ".const:sha_asm"
        .endif
        .align  128
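; First 32 bits of the fractional parts of the cube roots of the first
; sixty-four primes, i.e. the SHA-256 K constants from FIPS 180-4.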
K256:
        .uword  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
        .uword  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
        .uword  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
        .uword  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
        .uword  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
        .uword  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
        .uword  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
        .uword  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
        .uword  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
        .uword  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
        .uword  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
        .uword  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
        .uword  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
        .uword  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
        .uword  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
        .uword  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
        .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
        .align  4

___

print $code;
close STDOUT;