2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv4.
20 # Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21 # interleaving. How does it compare to Keccak Code Package? It's as
22 # fast, but several times smaller, and is endian- and ISA-neutral. ISA
23 # neutrality means that minimum ISA requirement is ARMv4, yet it can
24 # be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25 # register layout taken from Keccak Code Package. It's also as fast,
26 # in fact faster by 10-15% on some processors, and endian-neutral.
30 # Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31 # of rotate instructions with logical ones. This resulted in ~10%
32 # improvement on most processors. Switch to KECCAK_2X effectively
33 # minimizes re-loads from temporary storage, and merged rotates just
34 # eliminate the corresponding instructions. As for the latter: when examining
35 # code you'll notice commented ror instructions. These are eliminated
36 # ones, and you should trace destination register below to see what's
37 # going on. In case you wonder why not all rotates are eliminated: the trouble
38 # is that you have operations that require both inputs to be rotated,
39 # e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40 # 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
41 # that takes 'a' as input. And thing is that this next operation can
42 # be in next round. It's totally possible to "carry" rotate "factors"
43 # to the next round, but it makes code more complex. And the last word
44 # is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
47 ########################################################################
48 # Numbers are cycles per processed byte. Non-NEON results account even
49 # for input bit interleaving.
54 # Cortex-A5 88/+160%, 36
55 # Cortex-A7 78/+160%, 34
56 # Cortex-A8 51/+230%, 30
57 # Cortex-A9 53/+210%, 26
58 # Cortex-A15 42/+160%, 18
59 # Snapdragon S4 43/+210%, 24
61 # (*) Corresponds to SHA3-256. Percentage after slash is improvement
62 # over compiler-generated KECCAK_2X reference code.
# Register map for the integer (non-NEON) code path: each 64-bit
# bit-interleaved lane is kept as a pair of 32-bit registers (lo word
# at offset +0, hi word at offset +4 in the stores below).
# @C = r0-r9: the five working lanes C[0..4], one register pair per lane.
64 my @C = map("r$_",(0..9));
# @E = r10-r12 and r14: scratch registers, used for ldmia bulk loads and
# for intermediate eor results.
65 my @E = map("r$_",(10..12,14));
67 ########################################################################
69 # ----->+-----------------------+
70 # | uint64_t A[5][5] |
72 # +200->+-----------------------+
75 # +240->+-----------------------+
76 # | uint64_t T[5][5] |
78 # +440->+-----------------------+
80 # +444->+-----------------------+
82 # +448->+-----------------------+
# Byte offsets (relative to sp) of the 64-bit lanes in the stack frame
# sketched above: A[5][5] occupies 0..199, D[5] 200..239, T[5][5] 240..439.
85 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
# @D[i]: the five theta D values, one 8-byte slot each, directly after A.
86 my @D = map(8*$_, (25..29));
# @T[i][j]: a second 5x5 scratch copy of the state ("uint64_t T[5][5]" above).
87 my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
92 #if defined(__thumb2__)
@ Keccak-f[1600] round constants for the 32-bit path, stored in
@ bit-interleaved form: for each round the first word carries the
@ even-numbered bits of the 64-bit iota value (loaded below as
@ iotas[i].lo) and the second word the odd-numbered bits (iotas[i].hi).
@ One pair per round, 24 rounds.
99 .type iotas32, %object
102 .long 0x00000001, 0x00000000
103 .long 0x00000000, 0x00000089
104 .long 0x00000000, 0x8000008b
105 .long 0x00000000, 0x80008080
106 .long 0x00000001, 0x0000008b
107 .long 0x00000001, 0x00008000
108 .long 0x00000001, 0x80008088
109 .long 0x00000001, 0x80000082
110 .long 0x00000000, 0x0000000b
111 .long 0x00000000, 0x0000000a
112 .long 0x00000001, 0x00008082
113 .long 0x00000000, 0x00008003
114 .long 0x00000001, 0x0000808b
115 .long 0x00000001, 0x8000000b
116 .long 0x00000001, 0x8000008a
117 .long 0x00000001, 0x80000081
118 .long 0x00000000, 0x80000081
119 .long 0x00000000, 0x80000008
120 .long 0x00000000, 0x00000083
121 .long 0x00000000, 0x80008003
122 .long 0x00000001, 0x80008088
123 .long 0x00000000, 0x80000088
124 .long 0x00000001, 0x00008000
125 .long 0x00000000, 0x80008082
126 .size iotas32,.-iotas32
128 .type KeccakF1600_int, %function
131 add @C[9],sp,#$A[4][2]
132 add @E[2],sp,#$A[0][0]
133 add @E[0],sp,#$A[1][0]
134 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4]
137 eor @E[1],@E[1],@E[1]
145 my (@A,@R); (@A[0..4],@R) = @_;
148 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1]
149 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
150 eor @C[0],@C[0],@E[0]
151 add @E[0],sp,#$A[1][2]
152 eor @C[1],@C[1],@E[1]
153 eor @C[2],@C[2],@E[2]
154 eor @C[3],@C[3],@E[3]
155 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
156 eor @C[4],@C[4],@E[0]
157 add @E[0],sp,#$A[1][4]
158 eor @C[5],@C[5],@E[1]
159 eor @C[6],@C[6],@E[2]
160 eor @C[7],@C[7],@E[3]
161 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
162 eor @C[8],@C[8],@E[0]
163 add @E[0],sp,#$A[2][1]
164 eor @C[9],@C[9],@E[1]
165 eor @C[0],@C[0],@E[2]
166 eor @C[1],@C[1],@E[3]
167 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
168 eor @C[2],@C[2],@E[0]
169 add @E[0],sp,#$A[2][3]
170 eor @C[3],@C[3],@E[1]
171 eor @C[4],@C[4],@E[2]
172 eor @C[5],@C[5],@E[3]
173 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
174 eor @C[6],@C[6],@E[0]
175 add @E[0],sp,#$A[3][0]
176 eor @C[7],@C[7],@E[1]
177 eor @C[8],@C[8],@E[2]
178 eor @C[9],@C[9],@E[3]
179 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
180 eor @C[0],@C[0],@E[0]
181 add @E[0],sp,#$A[3][2]
182 eor @C[1],@C[1],@E[1]
183 eor @C[2],@C[2],@E[2]
184 eor @C[3],@C[3],@E[3]
185 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
186 eor @C[4],@C[4],@E[0]
187 add @E[0],sp,#$A[3][4]
188 eor @C[5],@C[5],@E[1]
189 eor @C[6],@C[6],@E[2]
190 eor @C[7],@C[7],@E[3]
191 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
192 eor @C[8],@C[8],@E[0]
193 ldr @E[0],[sp,#$A[4][1]] @ A[4][1]
194 eor @C[9],@C[9],@E[1]
195 ldr @E[1],[sp,#$A[4][1]+4]
196 eor @C[0],@C[0],@E[2]
197 ldr @E[2],[sp,#$A[0][2]] @ A[0][2]
198 eor @C[1],@C[1],@E[3]
199 ldr @E[3],[sp,#$A[0][2]+4]
200 eor @C[2],@C[2],@E[0]
201 add @E[0],sp,#$A[0][3]
202 eor @C[3],@C[3],@E[1]
203 eor @C[4],@C[4],@E[2]
204 eor @C[5],@C[5],@E[3]
205 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4]
206 eor @C[6],@C[6],@E[0]
207 eor @C[7],@C[7],@E[1]
208 eor @C[8],@C[8],@E[2]
209 eor @C[9],@C[9],@E[3]
211 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
212 eor @E[1],@C[1],@C[4]
213 str @E[0],[sp,#$D[1]] @ D[1] = E[0]
214 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
215 str @E[1],[sp,#$D[1]+4]
216 eor @E[3],@C[7],@C[0]
217 str @E[2],[sp,#$D[4]] @ D[4] = E[1]
218 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
219 str @E[3],[sp,#$D[4]+4]
220 eor @C[1],@C[9],@C[2]
221 str @C[0],[sp,#$D[0]] @ D[0] = C[0]
222 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
223 str @C[1],[sp,#$D[0]+4]
224 eor @C[3],@C[3],@C[6]
225 ldr @C[7],[sp,#$A[3][3]]
226 str @C[2],[sp,#$D[2]] @ D[2] = C[1]
227 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
228 ldr @C[6],[sp,#$A[3][3]+4]
229 str @C[3],[sp,#$D[2]+4]
230 eor @C[5],@C[5],@C[8]
232 ldr @C[8],[sp,#$A[4][4]]
233 ldr @C[9],[sp,#$A[4][4]+4]
234 str @C[4],[sp,#$D[3]] @ D[3] = C[2]
235 eor @C[7],@C[7],@C[4]
236 str @C[5],[sp,#$D[3]+4]
237 eor @C[6],@C[6],@C[5]
238 ldr @C[4],[sp,#$A[0][0]]
239 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
240 @ ror @C[6],@C[6],#32-11
241 eor @C[8],@C[8],@E[2]
242 ldr @C[5],[sp,#$A[0][0]+4]
243 eor @C[9],@C[9],@E[3]
244 ldr @E[2],[sp,#$A[2][2]]
245 eor @C[0],@C[0],@C[4]
246 ldr @E[3],[sp,#$A[2][2]+4]
247 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
248 @ ror @C[9],@C[9],#32-7
249 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
250 eor @E[2],@E[2],@C[2]
251 ldr @C[2],[sp,#$A[1][1]]
252 eor @E[3],@E[3],@C[3]
253 ldr @C[3],[sp,#$A[1][1]+4]
254 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
255 ldr @E[2],[sp,#444] @ load counter
256 eor @C[2],@C[2],@E[0]
258 ror @C[4],@E[3],#32-22
259 add @E[3],@E[0],@E[2]
260 eor @C[3],@C[3],@E[1]
262 $code.=<<___ if ($A[0][0] != $T[0][0]);
263 ldmia @E[3],{@E[0],@E[1]} @ iotas[i]
265 $code.=<<___ if ($A[0][0] == $T[0][0]);
266 ldr @E[0],[@E[3],#8] @ iotas[i].lo
268 ldr @E[1],[@E[3],#12] @ iotas[i].hi
270 str @E[2],[sp,#444] @ store counter
273 bic @E[2],@C[4],@C[2],ror#32-22
274 bic @E[3],@C[5],@C[3],ror#32-22
275 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
276 ror @C[3],@C[3],#32-22
277 eor @E[2],@E[2],@C[0]
278 eor @E[3],@E[3],@C[1]
279 eor @E[0],@E[0],@E[2]
280 eor @E[1],@E[1],@E[3]
281 str @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
282 bic @E[2],@C[6],@C[4],ror#11
283 str @E[1],[sp,#$R[0][0]+4]
284 bic @E[3],@C[7],@C[5],ror#10
285 bic @E[0],@C[8],@C[6],ror#32-(11-7)
286 bic @E[1],@C[9],@C[7],ror#32-(10-7)
287 eor @E[2],@C[2],@E[2],ror#32-11
288 eor @E[3],@C[3],@E[3],ror#32-10
289 str @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]);
290 eor @E[0],@C[4],@E[0],ror#32-7
291 str @E[3],[sp,#$R[0][1]+4]
292 eor @E[1],@C[5],@E[1],ror#32-7
293 str @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]);
294 bic @E[2],@C[0],@C[8],ror#32-7
295 str @E[1],[sp,#$R[0][2]+4]
296 bic @E[3],@C[1],@C[9],ror#32-7
297 eor @E[2],@E[2],@C[6],ror#32-11
298 eor @E[3],@E[3],@C[7],ror#32-10
299 str @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]);
300 bic @E[0],@C[2],@C[0]
301 str @E[3],[sp,#$R[0][3]+4]
303 bic @E[1],@C[3],@C[1]
304 ldr @C[0],[sp,#$A[0][3]] @ A[0][3]
305 eor @E[0],@E[0],@C[8],ror#32-7
306 ldr @C[1],[sp,#$A[0][3]+4]
307 eor @E[1],@E[1],@C[9],ror#32-7
308 str @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]);
310 str @E[1],[sp,#$R[0][4]+4]
312 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
313 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1]
315 ldr @C[2],[sp,#$A[1][4]] @ A[1][4]
316 eor @C[0],@C[0],@E[0]
317 ldr @C[3],[sp,#$A[1][4]+4]
318 eor @C[1],@C[1],@E[1]
319 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
320 ldr @E[0],[sp,#$A[3][1]] @ A[3][1]
321 @ ror @C[1],@C[1],#32-14
322 ldr @E[1],[sp,#$A[3][1]+4]
324 eor @C[2],@C[2],@E[2]
325 ldr @C[4],[sp,#$A[2][0]] @ A[2][0]
326 eor @C[3],@C[3],@E[3]
327 ldr @C[5],[sp,#$A[2][0]+4]
328 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
329 @ ror @C[3],@C[3],#32-10
331 eor @C[6],@C[6],@C[4]
332 ldr @E[2],[sp,#$D[2]] @ D[2]
333 eor @C[7],@C[7],@C[5]
334 ldr @E[3],[sp,#$D[2]+4]
335 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
336 ror @C[4],@C[7],#32-2
338 eor @E[0],@E[0],@C[8]
339 ldr @C[8],[sp,#$A[4][2]] @ A[4][2]
340 eor @E[1],@E[1],@C[9]
341 ldr @C[9],[sp,#$A[4][2]+4]
342 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
343 ror @C[6],@E[1],#32-23
345 bic @E[0],@C[4],@C[2],ror#32-10
346 bic @E[1],@C[5],@C[3],ror#32-10
347 eor @E[2],@E[2],@C[8]
348 eor @E[3],@E[3],@C[9]
349 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
350 ror @C[8],@E[3],#32-31
351 eor @E[0],@E[0],@C[0],ror#32-14
352 eor @E[1],@E[1],@C[1],ror#32-14
353 str @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2])
354 bic @E[2],@C[6],@C[4]
355 str @E[1],[sp,#$R[1][0]+4]
356 bic @E[3],@C[7],@C[5]
357 eor @E[2],@E[2],@C[2],ror#32-10
358 eor @E[3],@E[3],@C[3],ror#32-10
359 str @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]);
360 bic @E[0],@C[8],@C[6]
361 str @E[3],[sp,#$R[1][1]+4]
362 bic @E[1],@C[9],@C[7]
363 bic @E[2],@C[0],@C[8],ror#14
364 bic @E[3],@C[1],@C[9],ror#14
365 eor @E[0],@E[0],@C[4]
366 eor @E[1],@E[1],@C[5]
367 str @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]);
368 bic @E[0],@C[2],@C[0],ror#32-(14-10)
369 eor @E[2],@C[6],@E[2],ror#32-14
370 str @E[1],[sp,#$R[1][2]+4]
371 bic @E[1],@C[3],@C[1],ror#32-(14-10)
372 eor @E[3],@C[7],@E[3],ror#32-14
373 str @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]);
375 str @E[3],[sp,#$R[1][3]+4]
376 ldr @C[1],[sp,#$A[0][1]] @ A[0][1]
377 eor @E[0],@C[8],@E[0],ror#32-10
378 ldr @C[0],[sp,#$A[0][1]+4]
379 eor @E[1],@C[9],@E[1],ror#32-10
380 str @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]);
382 str @E[1],[sp,#$R[1][4]+4]
384 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2]
385 ldr @C[2],[sp,#$A[1][2]] @ A[1][2]
386 ldr @C[3],[sp,#$A[1][2]+4]
387 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4]
389 eor @C[1],@C[1],@E[0]
390 ldr @C[4],[sp,#$A[2][3]] @ A[2][3]
391 eor @C[0],@C[0],@E[1]
392 ldr @C[5],[sp,#$A[2][3]+4]
393 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
395 eor @C[2],@C[2],@E[2]
396 ldr @E[0],[sp,#$A[3][4]] @ A[3][4]
397 eor @C[3],@C[3],@E[3]
398 ldr @E[1],[sp,#$A[3][4]+4]
399 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
400 ldr @E[2],[sp,#$D[0]] @ D[0]
401 @ ror @C[3],@C[3],#32-3
402 ldr @E[3],[sp,#$D[0]+4]
404 eor @C[4],@C[4],@C[6]
405 eor @C[5],@C[5],@C[7]
406 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
407 @ ror @C[4],@C[7],#32-13 @ [track reverse order below]
409 eor @E[0],@E[0],@C[8]
410 ldr @C[8],[sp,#$A[4][0]] @ A[4][0]
411 eor @E[1],@E[1],@C[9]
412 ldr @C[9],[sp,#$A[4][0]+4]
413 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
414 ror @C[7],@E[1],#32-4
416 eor @E[2],@E[2],@C[8]
417 eor @E[3],@E[3],@C[9]
418 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
419 ror @C[9],@E[3],#32-9
421 bic @E[0],@C[5],@C[2],ror#13-3
422 bic @E[1],@C[4],@C[3],ror#12-3
423 bic @E[2],@C[6],@C[5],ror#32-13
424 bic @E[3],@C[7],@C[4],ror#32-12
425 eor @E[0],@C[0],@E[0],ror#32-13
426 eor @E[1],@C[1],@E[1],ror#32-12
427 str @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2])
428 eor @E[2],@E[2],@C[2],ror#32-3
429 str @E[1],[sp,#$R[2][0]+4]
430 eor @E[3],@E[3],@C[3],ror#32-3
431 str @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]);
432 bic @E[0],@C[8],@C[6]
433 str @E[3],[sp,#$R[2][1]+4]
434 bic @E[1],@C[9],@C[7]
435 eor @E[0],@E[0],@C[5],ror#32-13
436 eor @E[1],@E[1],@C[4],ror#32-12
437 str @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]);
438 bic @E[2],@C[0],@C[8]
439 str @E[1],[sp,#$R[2][2]+4]
440 bic @E[3],@C[1],@C[9]
441 eor @E[2],@E[2],@C[6]
442 eor @E[3],@E[3],@C[7]
443 str @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]);
444 bic @E[0],@C[2],@C[0],ror#3
445 str @E[3],[sp,#$R[2][3]+4]
446 bic @E[1],@C[3],@C[1],ror#3
447 ldr @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order]
448 eor @E[0],@C[8],@E[0],ror#32-3
449 ldr @C[0],[sp,#$A[0][4]+4]
450 eor @E[1],@C[9],@E[1],ror#32-3
451 str @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]);
453 str @E[1],[sp,#$R[2][4]+4]
455 ldr @E[0],[sp,#$D[4]] @ D[4]
456 ldr @E[1],[sp,#$D[4]+4]
457 ldr @E[2],[sp,#$D[0]] @ D[0]
458 ldr @E[3],[sp,#$D[0]+4]
460 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2]
462 eor @C[1],@C[1],@E[0]
463 ldr @C[2],[sp,#$A[1][0]] @ A[1][0]
464 eor @C[0],@C[0],@E[1]
465 ldr @C[3],[sp,#$A[1][0]+4]
466 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
467 ldr @C[4],[sp,#$A[2][1]] @ A[2][1]
468 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order]
469 ldr @C[5],[sp,#$A[2][1]+4]
471 eor @C[2],@C[2],@E[2]
472 ldr @E[0],[sp,#$A[3][2]] @ A[3][2]
473 eor @C[3],@C[3],@E[3]
474 ldr @E[1],[sp,#$A[3][2]+4]
475 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
476 ldr @E[2],[sp,#$D[3]] @ D[3]
477 @ ror @C[3],@C[3],#32-18
478 ldr @E[3],[sp,#$D[3]+4]
480 eor @C[6],@C[6],@C[4]
481 eor @C[7],@C[7],@C[5]
482 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
483 ror @C[5],@C[7],#32-5
485 eor @E[0],@E[0],@C[8]
486 ldr @C[8],[sp,#$A[4][3]] @ A[4][3]
487 eor @E[1],@E[1],@C[9]
488 ldr @C[9],[sp,#$A[4][3]+4]
489 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
490 ror @C[6],@E[1],#32-8
492 eor @E[2],@E[2],@C[8]
493 eor @E[3],@E[3],@C[9]
494 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
495 ror @C[9],@E[3],#32-28
497 bic @E[0],@C[4],@C[2],ror#32-18
498 bic @E[1],@C[5],@C[3],ror#32-18
499 eor @E[0],@E[0],@C[0],ror#32-14
500 eor @E[1],@E[1],@C[1],ror#32-13
501 str @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2])
502 bic @E[2],@C[6],@C[4]
503 str @E[1],[sp,#$R[3][0]+4]
504 bic @E[3],@C[7],@C[5]
505 eor @E[2],@E[2],@C[2],ror#32-18
506 eor @E[3],@E[3],@C[3],ror#32-18
507 str @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]);
508 bic @E[0],@C[8],@C[6]
509 str @E[3],[sp,#$R[3][1]+4]
510 bic @E[1],@C[9],@C[7]
511 bic @E[2],@C[0],@C[8],ror#14
512 bic @E[3],@C[1],@C[9],ror#13
513 eor @E[0],@E[0],@C[4]
514 eor @E[1],@E[1],@C[5]
515 str @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]);
516 bic @E[0],@C[2],@C[0],ror#18-14
517 eor @E[2],@C[6],@E[2],ror#32-14
518 str @E[1],[sp,#$R[3][2]+4]
519 bic @E[1],@C[3],@C[1],ror#18-13
520 eor @E[3],@C[7],@E[3],ror#32-13
521 str @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]);
522 str @E[3],[sp,#$R[3][3]+4]
524 ldr @C[0],[sp,#$A[0][2]] @ A[0][2]
525 eor @E[0],@C[8],@E[0],ror#32-18
526 ldr @C[1],[sp,#$A[0][2]+4]
527 eor @E[1],@C[9],@E[1],ror#32-18
528 str @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]);
529 str @E[1],[sp,#$R[3][4]+4]
531 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
532 ldr @C[2],[sp,#$A[1][3]] @ A[1][3]
533 ldr @C[3],[sp,#$A[1][3]+4]
534 ldr @C[6],[sp,#$D[4]] @ D[4]
535 ldr @C[7],[sp,#$D[4]+4]
537 eor @C[0],@C[0],@E[0]
538 ldr @C[4],[sp,#$A[2][4]] @ A[2][4]
539 eor @C[1],@C[1],@E[1]
540 ldr @C[5],[sp,#$A[2][4]+4]
541 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
542 ldr @C[8],[sp,#$D[0]] @ D[0]
543 @ ror @C[1],@C[1],#32-31
544 ldr @C[9],[sp,#$D[0]+4]
546 eor @E[2],@E[2],@C[2]
547 ldr @E[0],[sp,#$A[3][0]] @ A[3][0]
548 eor @E[3],@E[3],@C[3]
549 ldr @E[1],[sp,#$A[3][0]+4]
550 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
551 ldr @E[2],[sp,#$D[1]] @ D[1]
552 ror @C[2],@E[3],#32-28
553 ldr @E[3],[sp,#$D[1]+4]
555 eor @C[6],@C[6],@C[4]
556 eor @C[7],@C[7],@C[5]
557 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
558 ror @C[4],@C[7],#32-20
560 eor @E[0],@E[0],@C[8]
561 ldr @C[8],[sp,#$A[4][1]] @ A[4][1]
562 eor @E[1],@E[1],@C[9]
563 ldr @C[9],[sp,#$A[4][1]+4]
564 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
565 ror @C[6],@E[1],#32-21
567 eor @C[8],@C[8],@E[2]
568 eor @C[9],@C[9],@E[3]
569 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
570 @ ror @C[9],@C[3],#32-1
572 bic @E[0],@C[4],@C[2]
573 bic @E[1],@C[5],@C[3]
574 eor @E[0],@E[0],@C[0],ror#32-31
575 eor @E[1],@E[1],@C[1],ror#32-31
576 str @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2])
577 bic @E[2],@C[6],@C[4]
578 str @E[1],[sp,#$R[4][0]+4]
579 bic @E[3],@C[7],@C[5]
580 eor @E[2],@E[2],@C[2]
581 eor @E[3],@E[3],@C[3]
582 str @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]);
583 bic @E[0],@C[8],@C[6],ror#1
584 str @E[3],[sp,#$R[4][1]+4]
585 bic @E[1],@C[9],@C[7],ror#1
586 bic @E[2],@C[0],@C[8],ror#31-1
587 bic @E[3],@C[1],@C[9],ror#31-1
588 eor @C[4],@C[4],@E[0],ror#32-1
589 eor @C[5],@C[5],@E[1],ror#32-1
590 str @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]);
591 eor @C[6],@C[6],@E[2],ror#32-31
592 str @C[5],[sp,#$R[4][2]+4]
593 eor @C[7],@C[7],@E[3],ror#32-31
594 str @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]);
595 bic @E[0],@C[2],@C[0],ror#32-31
596 str @C[7],[sp,#$R[4][3]+4]
597 bic @E[1],@C[3],@C[1],ror#32-31
598 add @E[2],sp,#$R[0][0]
599 eor @C[8],@E[0],@C[8],ror#32-1
600 add @E[0],sp,#$R[1][0]
601 eor @C[9],@E[1],@C[9],ror#32-1
602 str @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]);
603 str @C[9],[sp,#$R[4][4]+4]
612 .size KeccakF1600_int,.-KeccakF1600_int
614 .type KeccakF1600, %function
617 stmdb sp!,{r0,r4-r11,lr}
618 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],...
620 add @E[0],r0,#$A[1][0]
621 add @E[1],sp,#$A[1][0]
622 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
623 stmia sp, {@C[0]-@C[9]}
624 ldmia @E[0]!,{@C[0]-@C[9]}
625 stmia @E[1]!,{@C[0]-@C[9]}
626 ldmia @E[0]!,{@C[0]-@C[9]}
627 stmia @E[1]!,{@C[0]-@C[9]}
628 ldmia @E[0]!,{@C[0]-@C[9]}
629 stmia @E[1]!,{@C[0]-@C[9]}
630 ldmia @E[0], {@C[0]-@C[9]}
631 add @E[2],sp,#$A[0][0]
632 add @E[0],sp,#$A[1][0]
633 stmia @E[1], {@C[0]-@C[9]}
637 ldr @E[1], [sp,#440+16] @ restore pointer to A
638 ldmia sp, {@C[0]-@C[9]}
639 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
640 ldmia @E[0]!,{@C[0]-@C[9]}
641 stmia @E[1]!,{@C[0]-@C[9]}
642 ldmia @E[0]!,{@C[0]-@C[9]}
643 stmia @E[1]!,{@C[0]-@C[9]}
644 ldmia @E[0]!,{@C[0]-@C[9]}
645 stmia @E[1]!,{@C[0]-@C[9]}
646 ldmia @E[0], {@C[0]-@C[9]}
647 stmia @E[1], {@C[0]-@C[9]}
650 ldmia sp!,{r4-r11,pc}
651 .size KeccakF1600,.-KeccakF1600
653 { my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
655 ########################################################################
657 # ----->+-----------------------+
658 # | uint64_t A[5][5] |
661 # +456->+-----------------------+
663 # +460->+-----------------------+
665 # +464->+-----------------------+
667 # +468->+-----------------------+
669 # +472->+-----------------------+
671 # +476->+-----------------------+
672 # | const void *inp |
673 # +480->+-----------------------+
675 # +484->+-----------------------+
677 # +488->+-----------------------+
682 .type SHA3_absorb,%function
685 stmdb sp!,{r0-r12,lr}
688 add $A_flat,r0,#$A[1][0]
696 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
697 stmia $inp!, {@C[0]-@C[9]}
698 ldmia $A_flat!,{@C[0]-@C[9]}
699 stmia $inp!, {@C[0]-@C[9]}
700 ldmia $A_flat!,{@C[0]-@C[9]}
701 stmia $inp!, {@C[0]-@C[9]}
702 ldmia $A_flat!,{@C[0]-@C[9]}
703 stmia $inp!, {@C[0]-@C[9]}
704 ldmia $A_flat!,{@C[0]-@C[9]}
705 stmia $inp, {@C[0]-@C[9]}
707 ldr $inp,[sp,#476] @ restore $inp
714 mov r6,#0x11 @ compose constants
719 orr r6,r6,r6,lsl#16 @ 0x11111111
720 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
721 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
722 orr r7,r6,r6,lsl#1 @ 0x33333333
723 orr r6,r6,r6,lsl#2 @ 0x55555555
736 str r0,[sp,#480] @ save len - bsz
749 orr r0,r0,r3,lsl#24 @ lo
753 orr r1,r1,r3,lsl#24 @ hi
755 and r2,r0,r6 @ &=0x55555555
756 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
757 and r3,r1,r6 @ &=0x55555555
758 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
763 and r2,r2,r7 @ &=0x33333333
764 and r0,r0,r7,lsl#2 @ &=0xcccccccc
765 and r3,r3,r7 @ &=0x33333333
766 and r1,r1,r7,lsl#2 @ &=0xcccccccc
771 and r2,r2,r8 @ &=0x0f0f0f0f
772 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
773 and r3,r3,r8 @ &=0x0f0f0f0f
774 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
775 ldmia $A_flat,{r4-r5} @ A_flat[i]
780 and r2,r2,r9 @ &=0x00ff00ff
781 and r0,r0,r9,lsl#8 @ &=0xff00ff00
782 and r3,r3,r9 @ &=0x00ff00ff
783 and r1,r1,r9,lsl#8 @ &=0xff00ff00
795 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
805 ldmia r14,{r6-r12,r14} @ restore constants and variables
810 add $inp,sp,#$A[1][0]
811 ldmia sp, {@C[0]-@C[9]}
812 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
813 ldmia $inp!, {@C[0]-@C[9]}
814 stmia $A_flat!,{@C[0]-@C[9]}
815 ldmia $inp!, {@C[0]-@C[9]}
816 stmia $A_flat!,{@C[0]-@C[9]}
817 ldmia $inp!, {@C[0]-@C[9]}
818 stmia $A_flat!,{@C[0]-@C[9]}
819 ldmia $inp, {@C[0]-@C[9]}
820 stmia $A_flat, {@C[0]-@C[9]}
824 mov r0,$len @ return value
825 ldmia sp!,{r4-r12,pc}
826 .size SHA3_absorb,.-SHA3_absorb
829 { my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
833 .type SHA3_squeeze,%function
836 stmdb sp!,{r0,r3-r10,lr}
849 mov r6,#0x11 @ compose constants
854 orr r6,r6,r6,lsl#16 @ 0x11111111
855 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
856 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
857 orr r7,r6,r6,lsl#1 @ 0x33333333
858 orr r6,r6,r6,lsl#2 @ 0x55555555
867 ldmia $A_flat!,{r0,r1} @ A_flat[i++]
870 lsl r3,r1,#16 @ r3 = r1 << 16
871 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
873 lsr r0,r0,#16 @ r0 = r0 >> 16
874 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
880 and r2,r2,r9 @ &=0x00ff00ff
881 and r3,r3,r9,lsl#8 @ &=0xff00ff00
882 and r0,r0,r9 @ &=0x00ff00ff
883 and r1,r1,r9,lsl#8 @ &=0xff00ff00
888 and r2,r2,r8 @ &=0x0f0f0f0f
889 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
890 and r0,r0,r8 @ &=0x0f0f0f0f
891 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
896 and r2,r2,r7 @ &=0x33333333
897 and r3,r3,r7,lsl#2 @ &=0xcccccccc
898 and r0,r0,r7 @ &=0x33333333
899 and r1,r1,r7,lsl#2 @ &=0xcccccccc
904 and r2,r2,r6 @ &=0x55555555
905 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
906 and r0,r0,r6 @ &=0x55555555
907 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
932 subs $bsz,$bsz,#8 @ bsz -= 8
935 mov r0,r14 @ original $A_flat
939 ldmia sp,{r6-r10,r12} @ restore constants and variables
975 ldmia sp!,{r4-r10,pc}
976 .size SHA3_squeeze,.-SHA3_squeeze
@ Canonical Keccak-f[1600] round constants (FIPS 202), one 64-bit value
@ per round for 24 rounds; consumed by the NEON path, which operates on
@ whole 64-bit lanes and therefore needs no bit interleaving.
983 .type iotas64, %object
986 .quad 0x0000000000000001
987 .quad 0x0000000000008082
988 .quad 0x800000000000808a
989 .quad 0x8000000080008000
990 .quad 0x000000000000808b
991 .quad 0x0000000080000001
992 .quad 0x8000000080008081
993 .quad 0x8000000000008009
994 .quad 0x000000000000008a
995 .quad 0x0000000000000088
996 .quad 0x0000000080008009
997 .quad 0x000000008000000a
998 .quad 0x000000008000808b
999 .quad 0x800000000000008b
1000 .quad 0x8000000000008089
1001 .quad 0x8000000000008003
1002 .quad 0x8000000000008002
1003 .quad 0x8000000000000080
1004 .quad 0x000000000000800a
1005 .quad 0x800000008000000a
1006 .quad 0x8000000080008081
1007 .quad 0x8000000000008080
1008 .quad 0x0000000080000001
1009 .quad 0x8000000080008008
1010 .size iotas64,.-iotas64
1012 .type KeccakF1600_neon, %function
1017 mov r3, #24 @ loop counter
1023 vst1.64 {q4}, [r0:64] @ offload A[0..1][4]
1024 veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
1025 vst1.64 {d18}, [r1:64] @ offload A[2][4]
1026 veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
1027 veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
1028 veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
1029 veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
1030 veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
1031 veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
1032 veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
1033 veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
1034 veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
1035 veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
1036 veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
1037 veor d25, d25, d24 @ C[4]^=A[4][4]
1039 vadd.u64 q4, q13, q13 @ C[0..1]<<1
1040 vadd.u64 q15, q14, q14 @ C[2..3]<<1
1041 vadd.u64 d18, d25, d25 @ C[4]<<1
1042 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
1043 vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
1044 vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
1045 veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
1046 veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
1047 veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
1048 veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
1050 veor d0, d0, d25 @ A[0][0] ^= C[4]
1051 veor d1, d1, d25 @ A[1][0] ^= C[4]
1052 veor d10, d10, d25 @ A[2][0] ^= C[4]
1053 veor d11, d11, d25 @ A[3][0] ^= C[4]
1054 veor d20, d20, d25 @ A[4][0] ^= C[4]
1056 veor d2, d2, d26 @ A[0][1] ^= D[1]
1057 veor d3, d3, d26 @ A[1][1] ^= D[1]
1058 veor d12, d12, d26 @ A[2][1] ^= D[1]
1059 veor d13, d13, d26 @ A[3][1] ^= D[1]
1060 veor d21, d21, d26 @ A[4][1] ^= D[1]
1063 veor d6, d6, d28 @ A[0][3] ^= C[2]
1064 veor d7, d7, d28 @ A[1][3] ^= C[2]
1065 veor d16, d16, d28 @ A[2][3] ^= C[2]
1066 veor d17, d17, d28 @ A[3][3] ^= C[2]
1067 veor d23, d23, d28 @ A[4][3] ^= C[2]
1068 vld1.64 {q4}, [r0:64] @ restore A[0..1][4]
1071 vld1.64 {d18}, [r1:64] @ restore A[2][4]
1072 veor q2, q2, q13 @ A[0..1][2] ^= D[2]
1073 veor q7, q7, q13 @ A[2..3][2] ^= D[2]
1074 veor d22, d22, d27 @ A[4][2] ^= D[2]
1076 veor q4, q4, q14 @ A[0..1][4] ^= C[3]
1077 veor q9, q9, q14 @ A[2..3][4] ^= C[3]
1078 veor d24, d24, d29 @ A[4][4] ^= C[3]
1081 vmov d26, d2 @ C[1] = A[0][1]
1082 vshl.u64 d2, d3, #44
1083 vmov d27, d4 @ C[2] = A[0][2]
1084 vshl.u64 d4, d14, #43
1085 vmov d28, d6 @ C[3] = A[0][3]
1086 vshl.u64 d6, d17, #21
1087 vmov d29, d8 @ C[4] = A[0][4]
1088 vshl.u64 d8, d24, #14
1089 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
1090 vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
1091 vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
1092 vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
1094 vshl.u64 d3, d9, #20
1095 vshl.u64 d14, d16, #25
1096 vshl.u64 d17, d15, #15
1097 vshl.u64 d24, d21, #2
1098 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
1099 vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
1100 vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
1101 vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
1103 vshl.u64 d9, d22, #61
1104 @ vshl.u64 d16, d19, #8
1105 vshl.u64 d15, d12, #10
1106 vshl.u64 d21, d7, #55
1107 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
1108 vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
1109 vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
1110 vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
1112 vshl.u64 d22, d18, #39
1113 @ vshl.u64 d19, d23, #56
1114 vshl.u64 d12, d5, #6
1115 vshl.u64 d7, d13, #45
1116 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
1117 vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
1118 vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
1119 vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
1121 vshl.u64 d18, d20, #18
1122 vshl.u64 d23, d11, #41
1123 vshl.u64 d5, d10, #3
1124 vshl.u64 d13, d1, #36
1125 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
1126 vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
1127 vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
1128 vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
1130 vshl.u64 d1, d28, #28
1131 vshl.u64 d10, d26, #1
1132 vshl.u64 d11, d29, #27
1133 vshl.u64 d20, d27, #62
1134 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
1135 vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
1136 vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
1137 vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
1143 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
1144 veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
1145 veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
1146 vst1.64 {q13}, [r0:64] @ offload A[0..1][0]
1149 vmov q1, q14 @ A[0..1][1]
1150 veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
1151 veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
1154 vmov q0, q5 @ A[2..3][0]
1156 vmov q15, q6 @ A[2..3][1]
1157 veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
1159 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
1161 veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
1163 veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
1164 vmov q14, q10 @ A[4][0..1]
1165 veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
1167 vld1.64 d25, [r2:64]! @ Iota[i++]
1170 vld1.64 {q0}, [r0:64] @ restore A[0..1][0]
1171 veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
1173 veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
1175 veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
1177 veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
1178 veor d0, d0, d25 @ A[0][0] ^= Iota[i]
1179 veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
1185 .size KeccakF1600_neon,.-KeccakF1600_neon
@ SHA3_absorb_neon -- absorb input into the Keccak-1600 state (NEON path).
@ On entry r0 points at the 25-lane state. NOTE(review): based on the
@ comments below, by the time the loop runs r4 = input pointer,
@ r5 = remaining length, r6 = block size (rate in bytes); the shuffle
@ from the AAPCS argument registers is presumably in lines not shown
@ here -- confirm against the full source.
1187 .global SHA3_absorb_neon
1188 .type SHA3_absorb_neon, %function
@ Save callee-saved core registers and NEON d8-d15 as AAPCS requires.
1191 stmdb sp!, {r4-r6,lr}
1192 vstmdb sp!, {d8-d15}
@ Load all 25 lanes. Register layout pairs rows: even d-regs d0..d8
@ carry row 0, odd d-regs d1..d9 carry row 1, and likewise d10..d19
@ for rows 2..3, so each q register spans the same column of two rows.
1198 vld1.32 {d0}, [r0:64]! @ A[0][0]
1199 vld1.32 {d2}, [r0:64]! @ A[0][1]
1200 vld1.32 {d4}, [r0:64]! @ A[0][2]
1201 vld1.32 {d6}, [r0:64]! @ A[0][3]
1202 vld1.32 {d8}, [r0:64]! @ A[0][4]
1204 vld1.32 {d1}, [r0:64]! @ A[1][0]
1205 vld1.32 {d3}, [r0:64]! @ A[1][1]
1206 vld1.32 {d5}, [r0:64]! @ A[1][2]
1207 vld1.32 {d7}, [r0:64]! @ A[1][3]
1208 vld1.32 {d9}, [r0:64]! @ A[1][4]
1210 vld1.32 {d10}, [r0:64]! @ A[2][0]
1211 vld1.32 {d12}, [r0:64]! @ A[2][1]
1212 vld1.32 {d14}, [r0:64]! @ A[2][2]
1213 vld1.32 {d16}, [r0:64]! @ A[2][3]
1214 vld1.32 {d18}, [r0:64]! @ A[2][4]
1216 vld1.32 {d11}, [r0:64]! @ A[3][0]
1217 vld1.32 {d13}, [r0:64]! @ A[3][1]
1218 vld1.32 {d15}, [r0:64]! @ A[3][2]
1219 vld1.32 {d17}, [r0:64]! @ A[3][3]
1220 vld1.32 {d19}, [r0:64]! @ A[3][4]
1222 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3]
1223 vld1.32 {d24}, [r0:64] @ A[4][4]
@ r0 was advanced across all 25 lanes (24 post-increments); point it
@ back at the start of the state.
1224 sub r0, r0, #24*8 @ rewind
@ Block loop: r12 = len - bsz sets flags for the exit branch (the
@ branch itself is in lines not shown here); absorption proceeds only
@ while at least one full block remains.
1229 subs r12, r5, r6 @ len - bsz
@ XOR one rate-sized block of input into the state. vld1.8 reads the
@ input bytewise, so the result is the same on either endianness.
@ NOTE(review): the comments below suggest conditional execution keyed
@ on block size selects how many lanes are XORed -- the predicates are
@ in lines not visible here.
1233 vld1.8 {d31}, [r4]! @ endian-neutral loads...
1235 veor d0, d0, d31 @ A[0][0] ^= *inp++
1238 veor d2, d2, d31 @ A[0][1] ^= *inp++
1242 veor d4, d4, d31 @ A[0][2] ^= *inp++
1245 veor d6, d6, d31 @ A[0][3] ^= *inp++
1249 veor d8, d8, d31 @ A[0][4] ^= *inp++
1253 veor d1, d1, d31 @ A[1][0] ^= *inp++
1257 veor d3, d3, d31 @ A[1][1] ^= *inp++
1260 veor d5, d5, d31 @ A[1][2] ^= *inp++
1264 veor d7, d7, d31 @ A[1][3] ^= *inp++
1267 veor d9, d9, d31 @ A[1][4] ^= *inp++
1272 veor d10, d10, d31 @ A[2][0] ^= *inp++
1275 veor d12, d12, d31 @ A[2][1] ^= *inp++
1279 veor d14, d14, d31 @ A[2][2] ^= *inp++
1282 veor d16, d16, d31 @ A[2][3] ^= *inp++
1286 veor d18, d18, d31 @ A[2][4] ^= *inp++
1290 veor d11, d11, d31 @ A[3][0] ^= *inp++
1294 veor d13, d13, d31 @ A[3][1] ^= *inp++
1297 veor d15, d15, d31 @ A[3][2] ^= *inp++
1301 veor d17, d17, d31 @ A[3][3] ^= *inp++
1304 veor d19, d19, d31 @ A[3][4] ^= *inp++
1309 veor d20, d20, d31 @ A[4][0] ^= *inp++
1312 veor d21, d21, d31 @ A[4][1] ^= *inp++
1316 veor d22, d22, d31 @ A[4][2] ^= *inp++
1319 veor d23, d23, d31 @ A[4][3] ^= *inp++
1322 veor d24, d24, d31 @ A[4][4] ^= *inp++
@ Loop exit path: flush the register-resident state back to memory at
@ [r0]. NOTE(review): the call to KeccakF1600_neon and the branch back
@ to the block loop are presumably in the elided lines above -- verify.
1330 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1331 vst1.32 {d2}, [r0:64]!
1332 vst1.32 {d4}, [r0:64]!
1333 vst1.32 {d6}, [r0:64]!
1334 vst1.32 {d8}, [r0:64]!
1336 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1337 vst1.32 {d3}, [r0:64]!
1338 vst1.32 {d5}, [r0:64]!
1339 vst1.32 {d7}, [r0:64]!
1340 vst1.32 {d9}, [r0:64]!
1342 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1343 vst1.32 {d12}, [r0:64]!
1344 vst1.32 {d14}, [r0:64]!
1345 vst1.32 {d16}, [r0:64]!
1346 vst1.32 {d18}, [r0:64]!
1348 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1349 vst1.32 {d13}, [r0:64]!
1350 vst1.32 {d15}, [r0:64]!
1351 vst1.32 {d17}, [r0:64]!
1352 vst1.32 {d19}, [r0:64]!
1354 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1355 vst1.32 {d24}, [r0:64]
@ Return the leftover byte count (r5, less than one block) and restore
@ the saved registers; ldmia into pc performs the return.
1357 mov r0, r5 @ return value
1358 vldmia sp!, {d8-d15}
1359 ldmia sp!, {r4-r6,pc}
1360 .size SHA3_absorb_neon,.-SHA3_absorb_neon
@ SHA3_squeeze_neon -- emit output from the Keccak-1600 state (NEON path).
@ On entry r0 points at the 25-lane state (kept in r12 as A_flat).
@ NOTE(review): from the comments below, r4 = output pointer, r5 =
@ requested output length, and r14 counts bytes left in the current
@ block (bsz); the initial register shuffle is in lines not shown here.
1362 .global SHA3_squeeze_neon
1363 .type SHA3_squeeze_neon, %function
1366 stmdb sp!, {r4-r6,lr}
1371 mov r12, r0 @ A_flat
1373 b .Loop_squeeze_neon
@ Main loop body: fewer than 8 bytes wanted -> finish byte by byte.
1378 blo .Lsqueeze_neon_tail
@ Copy one whole 8-byte lane from the state to the output. vst1.8
@ stores bytewise, so output layout is independent of CPU endianness.
1379 vld1.32 {d0}, [r12]!
1380 vst1.8 {d0}, [r4]! @ endian-neutral store
1382 subs r5, r5, #8 @ len -= 8
1383 beq .Lsqueeze_neon_done
@ Still bytes left in the current block? Then keep draining lanes.
1385 subs r14, r14, #8 @ bsz -= 8
1386 bhi .Loop_squeeze_neon
@ Block exhausted: run the permutation again. Load the state into the
@ NEON layout (rows 0..3 interleaved across even/odd d-regs, row 4 in
@ d20-d24); d8-d15 are callee-saved, hence the vstmdb. NOTE(review):
@ the actual call to KeccakF1600_neon is presumably in an elided line
@ between the loads and stores below -- verify against the full source.
1388 vstmdb sp!, {d8-d15}
1390 vld1.32 {d0}, [r0:64]! @ A[0][0..4]
1391 vld1.32 {d2}, [r0:64]!
1392 vld1.32 {d4}, [r0:64]!
1393 vld1.32 {d6}, [r0:64]!
1394 vld1.32 {d8}, [r0:64]!
1396 vld1.32 {d1}, [r0:64]! @ A[1][0..4]
1397 vld1.32 {d3}, [r0:64]!
1398 vld1.32 {d5}, [r0:64]!
1399 vld1.32 {d7}, [r0:64]!
1400 vld1.32 {d9}, [r0:64]!
1402 vld1.32 {d10}, [r0:64]! @ A[2][0..4]
1403 vld1.32 {d12}, [r0:64]!
1404 vld1.32 {d14}, [r0:64]!
1405 vld1.32 {d16}, [r0:64]!
1406 vld1.32 {d18}, [r0:64]!
1408 vld1.32 {d11}, [r0:64]! @ A[3][0..4]
1409 vld1.32 {d13}, [r0:64]!
1410 vld1.32 {d15}, [r0:64]!
1411 vld1.32 {d17}, [r0:64]!
1412 vld1.32 {d19}, [r0:64]!
1414 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3]
1415 vld1.32 {d24}, [r0:64]
@ r0 was advanced across all 25 lanes; point it back at the state base.
1416 sub r0, r0, #24*8 @ rewind
@ Write the permuted state back and restart the read cursor (r12) at
@ the beginning of the freshly-permuted state.
1420 mov r12, r0 @ A_flat
1421 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1422 vst1.32 {d2}, [r0:64]!
1423 vst1.32 {d4}, [r0:64]!
1424 vst1.32 {d6}, [r0:64]!
1425 vst1.32 {d8}, [r0:64]!
1427 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1428 vst1.32 {d3}, [r0:64]!
1429 vst1.32 {d5}, [r0:64]!
1430 vst1.32 {d7}, [r0:64]!
1431 vst1.32 {d9}, [r0:64]!
1433 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1434 vst1.32 {d12}, [r0:64]!
1435 vst1.32 {d14}, [r0:64]!
1436 vst1.32 {d16}, [r0:64]!
1437 vst1.32 {d18}, [r0:64]!
1439 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1440 vst1.32 {d13}, [r0:64]!
1441 vst1.32 {d15}, [r0:64]!
1442 vst1.32 {d17}, [r0:64]!
1443 vst1.32 {d19}, [r0:64]!
1445 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1447 vst1.32 {d24}, [r0:64]
1448 mov r0, r12 @ rewind
1450 vldmia sp!, {d8-d15}
1451 b .Loop_squeeze_neon
@ Tail: fewer than 8 bytes remain. Bytes of the current lane are
@ emitted one at a time; after each store the remaining count is
@ checked so exactly r5 bytes are written in total. NOTE(review): the
@ shifts/loads feeding r2 and the counters are in elided lines.
1454 .Lsqueeze_neon_tail:
1457 strb r2, [r4],#1 @ endian-neutral store
1459 blo .Lsqueeze_neon_done
1462 beq .Lsqueeze_neon_done
1466 blo .Lsqueeze_neon_done
1468 beq .Lsqueeze_neon_done
1473 blo .Lsqueeze_neon_done
1476 beq .Lsqueeze_neon_done
1479 .Lsqueeze_neon_done:
@ Restore callee-saved registers; ldmia into pc returns to the caller.
1480 ldmia sp!, {r4-r6,pc}
1481 .size SHA3_squeeze_neon,.-SHA3_squeeze_neon
1482 .asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
# Flush the generated assembly and fail loudly if the write did not
# complete: buffered output errors (e.g. full disk, closed pipe) only
# surface at close, and silently emitting a truncated .S file would
# produce a broken build rather than an obvious error here.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush