2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv4.
20 # The non-NEON code is the KECCAK_1X variant (see sha/keccak1600.c) with
21 # bit interleaving. How does it compare to the Keccak Code Package? It's
22 # as fast, but several times smaller, and is endian- and ISA-neutral. ISA
23 # neutrality means that the minimum ISA requirement is ARMv4, yet the code
24 # can be assembled even as Thumb-2. The NEON code path is KECCAK_1X_ALT
25 # with the register layout taken from the Keccak Code Package. It's also
26 # as fast, in fact faster by 10-15% on some processors, and endian-neutral.
28 ########################################################################
29 # Numbers are cycles per processed byte. Non-NEON results include the
30 # cost of input bit interleaving [which takes ~1/4-1/3 of the time].
32 # r=1600(*),NEON r=1088(**),NEON
34 # Cortex-A5 80/+220%, 24 110, 36
35 # Cortex-A7 71/+180%, 23 99, 34
36 # Cortex-A8 48/+290%, 20 67, 30
37 # Cortex-A9 48/+290%, 17 66, 26
38 # Cortex-A15 34/+210%, 12 47, 18
39 # Snapdragon S4 44/+230%, 16 59, 24
41 # (*) Not used in real life; meaningful as an estimate of single absorb
42 # operation performance. The percentage after the slash is the improvement
43 # over the compiler-generated KECCAK_1X reference code.
44 # (**) Corresponds to SHA3-256, 8KB message size.
# Register-name lists substituted into the assembly text wherever it
# writes @C[n] or @E[n].
# @C holds "r0".."r9": ten 32-bit registers, enough for five 64-bit lanes
# kept as two 32-bit words each (see the "A[0][0..4]" loads of @C[0]-@C[9]).
# @E holds "r10", "r11", "r12" and "r14"; r13 is left out (on ARM it is the
# stack pointer, which the assembly addresses as sp).
46 my @C = map("r$_",(0..9));
47 my @E = map("r$_",(10..12,14));
49 ########################################################################
51 # ----->+-----------------------+
52 # | uint64_t A[5][5] |
54 # +200->+-----------------------+
57 # +240->+-----------------------+
58 # | uint64_t T[2][5] |
60 # +320->+-----------------------+
62 # +324->+-----------------------+
64 # +328->+-----------------------+
# Byte offsets, relative to sp, of the 64-bit slots in the stack frame
# sketched in the layout comment above.
#   $A[i][j] = 8*(5*i + j)        5x5 table, offsets 0..192
#   $D[j]    = 8*(25 + j)         five slots, offsets 200..232
#   $T[i][j] = 8*(30 + 5*i + j)   2x5 table, offsets 240..312
# The assembly uses them as load/store displacements, e.g. [sp,#$A[1][0]];
# the second 32-bit word of a slot is reached by appending +4.
67 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
68 my @D = map(8*$_, (25..29));
69 my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35));
74 #if defined(__thumb2__)
81 .type iotas32, %object
84 .long 0x00000001, 0x00000000
85 .long 0x00000000, 0x00000089
86 .long 0x00000000, 0x8000008b
87 .long 0x00000000, 0x80008080
88 .long 0x00000001, 0x0000008b
89 .long 0x00000001, 0x00008000
90 .long 0x00000001, 0x80008088
91 .long 0x00000001, 0x80000082
92 .long 0x00000000, 0x0000000b
93 .long 0x00000000, 0x0000000a
94 .long 0x00000001, 0x00008082
95 .long 0x00000000, 0x00008003
96 .long 0x00000001, 0x0000808b
97 .long 0x00000001, 0x8000000b
98 .long 0x00000001, 0x8000008a
99 .long 0x00000001, 0x80000081
100 .long 0x00000000, 0x80000081
101 .long 0x00000000, 0x80000008
102 .long 0x00000000, 0x00000083
103 .long 0x00000000, 0x80008003
104 .long 0x00000001, 0x80008088
105 .long 0x00000000, 0x80000088
106 .long 0x00000001, 0x00008000
107 .long 0x00000000, 0x80008082
108 .size iotas32,.-iotas32
110 .type KeccakF1600_int, %function
113 ldmia sp,{@C[0]-@C[9]} @ A[0][0..4]
114 add @E[0],sp,#$A[1][0]
117 eor @E[1],@E[1],@E[1]
123 ldmia sp,{@C[0]-@C[9]} @ A[0][0..4]
125 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
126 eor @C[0],@C[0],@E[0]
127 add @E[0],sp,#$A[1][2]
128 eor @C[1],@C[1],@E[1]
129 eor @C[2],@C[2],@E[2]
130 eor @C[3],@C[3],@E[3]
131 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
132 eor @C[4],@C[4],@E[0]
133 add @E[0],sp,#$A[1][4]
134 eor @C[5],@C[5],@E[1]
135 eor @C[6],@C[6],@E[2]
136 eor @C[7],@C[7],@E[3]
137 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
138 eor @C[8],@C[8],@E[0]
139 add @E[0],sp,#$A[2][1]
140 eor @C[9],@C[9],@E[1]
141 eor @C[0],@C[0],@E[2]
142 eor @C[1],@C[1],@E[3]
143 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
144 eor @C[2],@C[2],@E[0]
145 add @E[0],sp,#$A[2][3]
146 eor @C[3],@C[3],@E[1]
147 eor @C[4],@C[4],@E[2]
148 eor @C[5],@C[5],@E[3]
149 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
150 eor @C[6],@C[6],@E[0]
151 add @E[0],sp,#$A[3][0]
152 eor @C[7],@C[7],@E[1]
153 eor @C[8],@C[8],@E[2]
154 eor @C[9],@C[9],@E[3]
155 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
156 eor @C[0],@C[0],@E[0]
157 add @E[0],sp,#$A[3][2]
158 eor @C[1],@C[1],@E[1]
159 eor @C[2],@C[2],@E[2]
160 eor @C[3],@C[3],@E[3]
161 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
162 eor @C[4],@C[4],@E[0]
163 add @E[0],sp,#$A[3][4]
164 eor @C[5],@C[5],@E[1]
165 eor @C[6],@C[6],@E[2]
166 eor @C[7],@C[7],@E[3]
167 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
168 eor @C[8],@C[8],@E[0]
169 add @E[0],sp,#$A[4][1]
170 eor @C[9],@C[9],@E[1]
171 eor @C[0],@C[0],@E[2]
172 eor @C[1],@C[1],@E[3]
173 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[4][1..2]
174 eor @C[2],@C[2],@E[0]
175 add @E[0],sp,#$A[4][3]
176 eor @C[3],@C[3],@E[1]
177 eor @C[4],@C[4],@E[2]
178 eor @C[5],@C[5],@E[3]
179 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[4][3..4]
180 eor @C[6],@C[6],@E[0]
181 eor @C[7],@C[7],@E[1]
182 eor @C[8],@C[8],@E[2]
183 eor @C[9],@C[9],@E[3]
185 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
186 eor @E[1],@C[1],@C[4]
187 str @E[0],[sp,#$D[1]] @ D[1] = E[0]
188 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
189 str @E[1],[sp,#$D[1]+4]
190 eor @E[3],@C[7],@C[0]
191 str @E[2],[sp,#$D[4]] @ D[4] = E[1]
192 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
193 str @E[3],[sp,#$D[4]+4]
194 eor @C[1],@C[9],@C[2]
195 str @C[0],[sp,#$D[0]] @ D[0] = C[0]
196 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
197 str @C[1],[sp,#$D[0]+4]
198 eor @C[3],@C[3],@C[6]
199 str @C[2],[sp,#$D[2]] @ D[2] = C[1]
200 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
201 str @C[3],[sp,#$D[2]+4]
202 eor @C[5],@C[5],@C[8]
203 ldr @C[8],[sp,#$A[3][0]]
204 ldr @C[9],[sp,#$A[3][0]+4]
205 str @C[4],[sp,#$D[3]] @ D[3] = C[2]
206 str @C[5],[sp,#$D[3]+4]
208 ldr @C[6],[sp,#$A[0][1]]
209 eor @C[8],@C[8],@C[0]
210 ldr @C[7],[sp,#$A[0][1]+4]
211 eor @C[9],@C[9],@C[1]
212 str @C[8],[sp,#$T[0][0]] @ T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
213 ldr @C[8],[sp,#$A[0][2]]
214 str @C[9],[sp,#$T[0][0]+4]
215 ldr @C[9],[sp,#$A[0][2]+4]
216 eor @C[6],@C[6],@E[0]
217 eor @C[7],@C[7],@E[1]
218 str @C[6],[sp,#$T[0][1]] @ T[0][1] = A[0][1] ^ E[0]; /* D[1] */
219 ldr @C[6],[sp,#$A[0][3]]
220 str @C[7],[sp,#$T[0][1]+4]
221 ldr @C[7],[sp,#$A[0][3]+4]
222 eor @C[8],@C[8],@C[2]
223 eor @C[9],@C[9],@C[3]
224 str @C[8],[sp,#$T[0][2]] @ T[0][2] = A[0][2] ^ C[1]; /* D[2] */
225 ldr @C[8],[sp,#$A[0][4]]
226 str @C[9],[sp,#$T[0][2]+4]
227 ldr @C[9],[sp,#$A[0][4]+4]
228 eor @C[6],@C[6],@C[4]
229 eor @C[7],@C[7],@C[5]
230 str @C[6],[sp,#$T[0][3]] @ T[0][3] = A[0][3] ^ C[2]; /* D[3] */
231 eor @C[8],@C[8],@E[2]
232 str @C[7],[sp,#$T[0][3]+4]
233 eor @C[9],@C[9],@E[3]
234 ldr @C[6],[sp,#$A[3][3]]
235 ldr @C[7],[sp,#$A[3][3]+4]
236 str @C[8],[sp,#$T[0][4]] @ T[0][4] = A[0][4] ^ E[1]; /* D[4] */
237 str @C[9],[sp,#$T[0][4]+4]
239 ldr @C[8],[sp,#$A[4][4]]
240 eor @C[4],@C[4],@C[6]
241 ldr @C[9],[sp,#$A[4][4]+4]
242 eor @C[5],@C[5],@C[7]
243 ror @C[7],@C[4],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
244 ldr @C[4],[sp,#$A[0][0]]
245 ror @C[6],@C[5],#32-11
246 ldr @C[5],[sp,#$A[0][0]+4]
247 eor @C[8],@C[8],@E[2]
248 eor @C[9],@C[9],@E[3]
249 ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
250 ldr @E[2],[sp,#$A[2][2]]
251 ror @C[9],@C[9],#32-7
252 ldr @E[3],[sp,#$A[2][2]+4]
253 eor @C[0],@C[0],@C[4]
254 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
255 eor @E[2],@E[2],@C[2]
256 ldr @C[2],[sp,#$A[1][1]]
257 eor @E[3],@E[3],@C[3]
258 ldr @C[3],[sp,#$A[1][1]+4]
259 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
260 ldr @E[2],[sp,#324] @ load counter
261 eor @C[2],@C[2],@E[0]
262 ror @C[4],@E[3],#32-22
264 eor @C[3],@C[3],@E[1]
265 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
266 add @E[3],@E[3],@E[2]
267 ror @C[3],@C[3],#32-22
269 ldr @E[0],[@E[3],#0] @ iotas[i].lo
271 ldr @E[1],[@E[3],#4] @ iotas[i].hi
273 str @E[2],[sp,#324] @ store counter
275 bic @E[2],@C[4],@C[2]
276 bic @E[3],@C[5],@C[3]
277 eor @E[2],@E[2],@C[0]
278 eor @E[3],@E[3],@C[1]
279 eor @E[0],@E[0],@E[2]
280 eor @E[1],@E[1],@E[3]
281 str @E[0],[sp,#$A[0][0]] @ A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
282 bic @E[2],@C[6],@C[4]
283 str @E[1],[sp,#$A[0][0]+4]
284 bic @E[3],@C[7],@C[5]
285 eor @E[2],@E[2],@C[2]
286 eor @E[3],@E[3],@C[3]
287 str @E[2],[sp,#$A[0][1]] @ A[0][1] = C[1] ^ (~C[2] & C[3]);
288 bic @E[0],@C[8],@C[6]
289 str @E[3],[sp,#$A[0][1]+4]
290 bic @E[1],@C[9],@C[7]
291 eor @E[0],@E[0],@C[4]
292 eor @E[1],@E[1],@C[5]
293 str @E[0],[sp,#$A[0][2]] @ A[0][2] = C[2] ^ (~C[3] & C[4]);
294 bic @E[2],@C[0],@C[8]
295 str @E[1],[sp,#$A[0][2]+4]
296 bic @E[3],@C[1],@C[9]
297 eor @E[2],@E[2],@C[6]
298 eor @E[3],@E[3],@C[7]
299 str @E[2],[sp,#$A[0][3]] @ A[0][3] = C[3] ^ (~C[4] & C[0]);
300 bic @E[0],@C[2],@C[0]
301 str @E[3],[sp,#$A[0][3]+4]
303 bic @E[1],@C[3],@C[1]
304 eor @E[0],@E[0],@C[8]
305 eor @E[1],@E[1],@C[9]
306 str @E[0],[sp,#$A[0][4]] @ A[0][4] = C[4] ^ (~C[0] & C[1]);
307 str @E[1],[sp,#$A[0][4]+4]
309 ldmia @E[3],{@C[6]-@C[9],@E[0],@E[1],@E[2],@E[3]} @ D[0..3]
310 ldr @C[0],[sp,#$A[1][0]]
311 ldr @C[1],[sp,#$A[1][0]+4]
312 ldr @C[2],[sp,#$A[2][1]]
313 ldr @C[3],[sp,#$A[2][1]+4]
314 ldr @C[4],[sp,#$D[4]]
315 eor @C[0],@C[0],@C[6]
316 ldr @C[5],[sp,#$D[4]+4]
317 eor @C[1],@C[1],@C[7]
318 str @C[0],[sp,#$T[1][0]] @ T[1][0] = A[1][0] ^ (C[3] = D[0]);
319 add @C[0],sp,#$A[1][2]
320 str @C[1],[sp,#$T[1][0]+4]
321 eor @C[2],@C[2],@C[8]
322 eor @C[3],@C[3],@C[9]
323 str @C[2],[sp,#$T[1][1]] @ T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
324 str @C[3],[sp,#$T[1][1]+4]
325 ldmia @C[0],{@C[0]-@C[3]} @ A[1][2..3]
326 eor @C[0],@C[0],@E[0]
327 eor @C[1],@C[1],@E[1]
328 str @C[0],[sp,#$T[1][2]] @ T[1][2] = A[1][2] ^ (E[0] = D[2]);
329 ldr @C[0],[sp,#$A[2][4]]
330 str @C[1],[sp,#$T[1][2]+4]
331 ldr @C[1],[sp,#$A[2][4]+4]
332 eor @C[2],@C[2],@E[2]
333 eor @C[3],@C[3],@E[3]
334 str @C[2],[sp,#$T[1][3]] @ T[1][3] = A[1][3] ^ (E[1] = D[3]);
335 ldr @C[2],[sp,#$T[0][3]]
336 str @C[3],[sp,#$T[1][3]+4]
337 ldr @C[3],[sp,#$T[0][3]+4]
338 eor @C[0],@C[0],@C[4]
339 ldr @E[2],[sp,#$A[1][4]]
340 eor @C[1],@C[1],@C[5]
341 ldr @E[3],[sp,#$A[1][4]+4]
342 str @C[0],[sp,#$T[1][4]] @ T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
344 ror @C[0],@C[2],#32-14 @ C[0] = ROL64(T[0][3], rhotates[0][3]);
345 str @C[1],[sp,#$T[1][4]+4]
346 ror @C[1],@C[3],#32-14
347 eor @C[2],@E[2],@C[4]
348 ldr @C[4],[sp,#$A[2][0]]
349 eor @C[3],@E[3],@C[5]
350 ldr @C[5],[sp,#$A[2][0]+4]
351 ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
352 ldr @E[2],[sp,#$A[3][1]]
353 ror @C[3],@C[3],#32-10
354 ldr @E[3],[sp,#$A[3][1]+4]
355 eor @C[6],@C[6],@C[4]
356 eor @C[7],@C[7],@C[5]
357 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
358 eor @E[2],@E[2],@C[8]
359 ror @C[4],@C[7],#32-2
360 ldr @C[8],[sp,#$A[4][2]]
361 eor @E[3],@E[3],@C[9]
362 ldr @C[9],[sp,#$A[4][2]+4]
363 ror @C[7],@E[2],#32-22 @ C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
364 eor @E[0],@E[0],@C[8]
365 ror @C[6],@E[3],#32-23
366 eor @E[1],@E[1],@C[9]
367 ror @C[9],@E[0],#32-30 @ C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
369 bic @E[0],@C[4],@C[2]
370 ror @C[8],@E[1],#32-31
371 bic @E[1],@C[5],@C[3]
372 eor @E[0],@E[0],@C[0]
373 eor @E[1],@E[1],@C[1]
374 str @E[0],[sp,#$A[1][0]] @ A[1][0] = C[0] ^ (~C[1] & C[2])
375 bic @E[2],@C[6],@C[4]
376 str @E[1],[sp,#$A[1][0]+4]
377 bic @E[3],@C[7],@C[5]
378 eor @E[2],@E[2],@C[2]
379 eor @E[3],@E[3],@C[3]
380 str @E[2],[sp,#$A[1][1]] @ A[1][1] = C[1] ^ (~C[2] & C[3]);
381 bic @E[0],@C[8],@C[6]
382 str @E[3],[sp,#$A[1][1]+4]
383 bic @E[1],@C[9],@C[7]
384 eor @E[0],@E[0],@C[4]
385 eor @E[1],@E[1],@C[5]
386 str @E[0],[sp,#$A[1][2]] @ A[1][2] = C[2] ^ (~C[3] & C[4]);
387 bic @E[2],@C[0],@C[8]
388 str @E[1],[sp,#$A[1][2]+4]
389 bic @E[3],@C[1],@C[9]
390 eor @E[2],@E[2],@C[6]
391 eor @E[3],@E[3],@C[7]
392 str @E[2],[sp,#$A[1][3]] @ A[1][3] = C[3] ^ (~C[4] & C[0]);
393 bic @E[0],@C[2],@C[0]
394 str @E[3],[sp,#$A[1][3]+4]
396 bic @E[1],@C[3],@C[1]
397 ldr @C[1],[sp,#$T[0][1]]
398 eor @E[0],@E[0],@C[8]
399 ldr @C[0],[sp,#$T[0][1]+4]
400 eor @E[1],@E[1],@C[9]
401 str @E[0],[sp,#$A[1][4]] @ A[1][4] = C[4] ^ (~C[0] & C[1]);
402 str @E[1],[sp,#$A[1][4]+4]
404 ldr @C[2],[sp,#$T[1][2]]
405 ldr @C[3],[sp,#$T[1][2]+4]
406 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
407 ldr @C[4],[sp,#$A[2][3]]
408 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(T[0][1], rhotates[0][1]);
409 ldr @C[5],[sp,#$A[2][3]+4]
410 ror @C[2],@C[2],#32-3 @ C[1] = ROL64(T[1][2], rhotates[1][2]);
411 ldr @C[6],[sp,#$A[3][4]]
412 ror @C[3],@C[3],#32-3
413 ldr @C[7],[sp,#$A[3][4]+4]
414 eor @E[0],@E[0],@C[4]
415 ldr @C[8],[sp,#$A[4][0]]
416 eor @E[1],@E[1],@C[5]
417 ldr @C[9],[sp,#$A[4][0]+4]
418 ror @C[5],@E[0],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
419 ldr @E[0],[sp,#$D[0]]
420 ror @C[4],@E[1],#32-13
421 ldr @E[1],[sp,#$D[0]+4]
422 eor @C[6],@C[6],@E[2]
423 eor @C[7],@C[7],@E[3]
424 ror @C[6],@C[6],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
425 eor @C[8],@C[8],@E[0]
426 ror @C[7],@C[7],#32-4
427 eor @C[9],@C[9],@E[1]
428 ror @C[8],@C[8],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
430 bic @E[0],@C[4],@C[2]
431 ror @C[9],@C[9],#32-9
432 bic @E[1],@C[5],@C[3]
433 eor @E[0],@E[0],@C[0]
434 eor @E[1],@E[1],@C[1]
435 str @E[0],[sp,#$A[2][0]] @ A[2][0] = C[0] ^ (~C[1] & C[2])
436 bic @E[2],@C[6],@C[4]
437 str @E[1],[sp,#$A[2][0]+4]
438 bic @E[3],@C[7],@C[5]
439 eor @E[2],@E[2],@C[2]
440 eor @E[3],@E[3],@C[3]
441 str @E[2],[sp,#$A[2][1]] @ A[2][1] = C[1] ^ (~C[2] & C[3]);
442 bic @E[0],@C[8],@C[6]
443 str @E[3],[sp,#$A[2][1]+4]
444 bic @E[1],@C[9],@C[7]
445 eor @E[0],@E[0],@C[4]
446 eor @E[1],@E[1],@C[5]
447 str @E[0],[sp,#$A[2][2]] @ A[2][2] = C[2] ^ (~C[3] & C[4]);
448 bic @E[2],@C[0],@C[8]
449 str @E[1],[sp,#$A[2][2]+4]
450 bic @E[3],@C[1],@C[9]
451 eor @E[2],@E[2],@C[6]
452 eor @E[3],@E[3],@C[7]
453 str @E[2],[sp,#$A[2][3]] @ A[2][3] = C[3] ^ (~C[4] & C[0]);
454 bic @E[0],@C[2],@C[0]
455 str @E[3],[sp,#$A[2][3]+4]
456 bic @E[1],@C[3],@C[1]
457 eor @E[0],@E[0],@C[8]
458 eor @E[1],@E[1],@C[9]
459 str @E[0],[sp,#$A[2][4]] @ A[2][4] = C[4] ^ (~C[0] & C[1]);
460 add @C[2],sp,#$T[1][0]
461 str @E[1],[sp,#$A[2][4]+4]
464 ldr @C[1],[sp,#$T[0][4]]
465 ldr @C[0],[sp,#$T[0][4]+4]
466 ldmia @C[2],{@C[2]-@C[5]} @ T[1][0..1]
467 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
468 ror @C[1],@C[1],#32-13 @ C[0] = ROL64(T[0][4], rhotates[0][4]);
469 ldr @C[6],[sp,#$A[3][2]]
470 ror @C[0],@C[0],#32-14
471 ldr @C[7],[sp,#$A[3][2]+4]
472 ror @C[2],@C[2],#32-18 @ C[1] = ROL64(T[1][0], rhotates[1][0]);
473 ldr @C[8],[sp,#$A[4][3]]
474 ror @C[3],@C[3],#32-18
475 ldr @C[9],[sp,#$A[4][3]+4]
476 ror @C[4],@C[4],#32-5 @ C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
477 eor @E[0],@E[0],@C[6]
478 ror @C[5],@C[5],#32-5
479 eor @E[1],@E[1],@C[7]
480 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
481 eor @C[8],@C[8],@E[2]
482 ror @C[6],@E[1],#32-8
483 eor @C[9],@C[9],@E[3]
484 ror @C[8],@C[8],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
486 bic @E[0],@C[4],@C[2]
487 ror @C[9],@C[9],#32-28
488 bic @E[1],@C[5],@C[3]
489 eor @E[0],@E[0],@C[0]
490 eor @E[1],@E[1],@C[1]
491 str @E[0],[sp,#$A[3][0]] @ A[3][0] = C[0] ^ (~C[1] & C[2])
492 bic @E[2],@C[6],@C[4]
493 str @E[1],[sp,#$A[3][0]+4]
494 bic @E[3],@C[7],@C[5]
495 eor @E[2],@E[2],@C[2]
496 eor @E[3],@E[3],@C[3]
497 str @E[2],[sp,#$A[3][1]] @ A[3][1] = C[1] ^ (~C[2] & C[3]);
498 bic @E[0],@C[8],@C[6]
499 str @E[3],[sp,#$A[3][1]+4]
500 bic @E[1],@C[9],@C[7]
501 eor @E[0],@E[0],@C[4]
502 eor @E[1],@E[1],@C[5]
503 str @E[0],[sp,#$A[3][2]] @ A[3][2] = C[2] ^ (~C[3] & C[4]);
504 bic @E[2],@C[0],@C[8]
505 str @E[1],[sp,#$A[3][2]+4]
506 bic @E[3],@C[1],@C[9]
507 eor @E[2],@E[2],@C[6]
508 eor @E[3],@E[3],@C[7]
509 str @E[2],[sp,#$A[3][3]] @ A[3][3] = C[3] ^ (~C[4] & C[0]);
510 bic @E[0],@C[2],@C[0]
511 str @E[3],[sp,#$A[3][3]+4]
512 bic @E[1],@C[3],@C[1]
513 eor @E[0],@E[0],@C[8]
514 eor @E[1],@E[1],@C[9]
515 str @E[0],[sp,#$A[3][4]] @ A[3][4] = C[4] ^ (~C[0] & C[1]);
516 add @E[3],sp,#$T[1][3]
517 str @E[1],[sp,#$A[3][4]+4]
519 ldr @C[0],[sp,#$T[0][2]]
520 ldr @C[1],[sp,#$T[0][2]+4]
521 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ T[1][3..4]
522 ldr @C[7],[sp,#$T[0][0]]
523 ror @C[0],@C[0],#32-31 @ C[0] = ROL64(T[0][2], rhotates[0][2]);
524 ldr @C[6],[sp,#$T[0][0]+4]
525 ror @C[1],@C[1],#32-31
526 ldr @C[8],[sp,#$A[4][1]]
527 ror @C[3],@E[0],#32-27 @ C[1] = ROL64(T[1][3], rhotates[1][3]);
528 ldr @E[0],[sp,#$D[1]]
529 ror @C[2],@E[1],#32-28
530 ldr @C[9],[sp,#$A[4][1]+4]
531 ror @C[5],@E[2],#32-19 @ C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
532 ldr @E[1],[sp,#$D[1]+4]
533 ror @C[4],@E[3],#32-20
534 eor @C[8],@C[8],@E[0]
535 ror @C[7],@C[7],#32-20 @ C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
536 eor @C[9],@C[9],@E[1]
537 ror @C[6],@C[6],#32-21
539 bic @E[0],@C[4],@C[2]
540 ror @C[8],@C[8],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
541 bic @E[1],@C[5],@C[3]
542 ror @C[9],@C[9],#32-1
543 eor @E[0],@E[0],@C[0]
544 eor @E[1],@E[1],@C[1]
545 str @E[0],[sp,#$A[4][0]] @ A[4][0] = C[0] ^ (~C[1] & C[2])
546 bic @E[2],@C[6],@C[4]
547 str @E[1],[sp,#$A[4][0]+4]
548 bic @E[3],@C[7],@C[5]
549 eor @E[2],@E[2],@C[2]
550 eor @E[3],@E[3],@C[3]
551 str @E[2],[sp,#$A[4][1]] @ A[4][1] = C[1] ^ (~C[2] & C[3]);
552 bic @E[0],@C[8],@C[6]
553 str @E[3],[sp,#$A[4][1]+4]
554 bic @E[1],@C[9],@C[7]
555 eor @E[0],@E[0],@C[4]
556 eor @E[1],@E[1],@C[5]
557 str @E[0],[sp,#$A[4][2]] @ A[4][2] = C[2] ^ (~C[3] & C[4]);
558 bic @E[2],@C[0],@C[8]
559 str @E[1],[sp,#$A[4][2]+4]
560 bic @E[3],@C[1],@C[9]
561 eor @E[2],@E[2],@C[6]
562 eor @E[3],@E[3],@C[7]
563 str @E[2],[sp,#$A[4][3]] @ A[4][3] = C[3] ^ (~C[4] & C[0]);
564 bic @E[0],@C[2],@C[0]
565 str @E[3],[sp,#$A[4][3]+4]
566 bic @E[1],@C[3],@C[1]
567 eor @E[2],@E[0],@C[8]
568 eor @E[3],@E[1],@C[9]
569 str @E[2],[sp,#$A[4][4]] @ A[4][4] = C[4] ^ (~C[0] & C[1]);
570 add @E[0],sp,#$A[1][0]
571 str @E[3],[sp,#$A[4][4]+4]
576 .size KeccakF1600_int,.-KeccakF1600_int
578 .type KeccakF1600, %function
581 stmdb sp!,{r0,r4-r11,lr}
582 sub sp,sp,#320+16 @ space for A[5][5],D[5],T[2][5],...
584 add @E[0],r0,#$A[1][0]
585 add @E[1],sp,#$A[1][0]
587 ldmia @E[0]!,{@C[0]-@C[9]} @ copy A[5][5] to stack
588 stmia @E[1]!,{@C[0]-@C[9]}
589 ldmia @E[0]!,{@C[0]-@C[9]}
590 stmia @E[1]!,{@C[0]-@C[9]}
591 ldmia @E[0]!,{@C[0]-@C[9]}
592 stmia @E[1]!,{@C[0]-@C[9]}
593 ldmia @E[0], {@C[0]-@C[9]}
594 stmia @E[1], {@C[0]-@C[9]}
595 ldmia @E[2], {@C[0]-@C[9]} @ A[0][0..4]
596 add @E[0],sp,#$A[1][0]
597 stmia sp, {@C[0]-@C[9]}
601 ldr @E[1], [sp,#320+16] @ restore pointer to A
602 ldmia sp, {@C[0]-@C[9]}
603 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
604 ldmia @E[0]!,{@C[0]-@C[9]}
605 stmia @E[1]!,{@C[0]-@C[9]}
606 ldmia @E[0]!,{@C[0]-@C[9]}
607 stmia @E[1]!,{@C[0]-@C[9]}
608 ldmia @E[0]!,{@C[0]-@C[9]}
609 stmia @E[1]!,{@C[0]-@C[9]}
610 ldmia @E[0], {@C[0]-@C[9]}
611 stmia @E[1], {@C[0]-@C[9]}
614 ldmia sp!,{r4-r11,pc}
615 .size KeccakF1600,.-KeccakF1600
617 { my ($hi,$lo,$i,$A_flat, $len,$bsz,$inp) = map("r$_",(5..8, 10..12));
619 ########################################################################
621 # ----->+-----------------------+
622 # | uint64_t A[5][5] |
625 # +336->+-----------------------+
627 # +340->+-----------------------+
628 # | const void *inp |
629 # +344->+-----------------------+
631 # +348->+-----------------------+
633 # +352->+-----------------------+
638 .type SHA3_absorb,%function
641 stmdb sp!,{r0-r12,lr}
649 ldmia r12!,{@C[0]-@C[9]} @ copy A[5][5] to stack
650 stmia r14!,{@C[0]-@C[9]}
651 ldmia r12!,{@C[0]-@C[9]}
652 stmia r14!,{@C[0]-@C[9]}
653 ldmia r12!,{@C[0]-@C[9]}
654 stmia r14!,{@C[0]-@C[9]}
655 ldmia r12!,{@C[0]-@C[9]}
656 stmia r14!,{@C[0]-@C[9]}
657 ldmia r12, {@C[0]-@C[9]}
658 stmia r14, {@C[0]-@C[9]}
666 str r0,[sp,#344] @ save len - bsz
669 ldmia $A_flat,{r2-r3} @ A_flat[i]
670 ldrb r0,[$inp,#7]! @ inp[7]
679 ldrbne r0,[$inp,#-1]!
681 ldrneb r0,[$inp,#-1]!
683 adds r1,r1,r1 @ sip through carry flag
705 stmia $A_flat!,{r2-r3} @ A_flat[i++] ^= BitInterleave(inp[0..7])
721 ldr r14, [sp,#336] @ pull pointer to A[5][5]
722 ldmia sp, {@C[0]-@C[9]}
723 stmia r14!,{@C[0]-@C[9]} @ return A[5][5]
724 ldmia r12!,{@C[0]-@C[9]}
725 stmia r14!,{@C[0]-@C[9]}
726 ldmia r12!,{@C[0]-@C[9]}
727 stmia r14!,{@C[0]-@C[9]}
728 ldmia r12!,{@C[0]-@C[9]}
729 stmia r14!,{@C[0]-@C[9]}
730 ldmia r12, {@C[0]-@C[9]}
731 stmia r14, {@C[0]-@C[9]}
734 mov r0,$len @ return value
735 ldmia sp!,{r4-r12,pc}
736 .size SHA3_absorb,.-SHA3_absorb
739 { my ($A_flat,$out,$len,$bsz, $byte,$shl) = map("r$_", (4..9));
743 .type SHA3_squeeze,%function
746 stmdb sp!,{r4-r10,lr}
757 ldmia r12!,{r0,r1} @ A_flat[i++]
763 eor $byte,$byte,$byte
764 adds r3,r3,r3 @ sip through carry flag
765 adc $byte,$byte,$byte
767 adc $byte,$byte,$byte
769 adc $byte,$byte,$byte
771 adc $byte,$byte,$byte
773 adc $byte,$byte,$byte
775 adc $byte,$byte,$byte
777 adc $byte,$byte,$byte
779 adc $byte,$byte,$byte
780 subs $len,$len,#1 @ len -= 1
786 subs r14,r14,#8 @ bsz -= 8
798 ldmia sp!,{r4-r10,pc}
799 .size SHA3_squeeze,.-SHA3_squeeze
806 .type iotas64, %object
809 .quad 0x0000000000000001
810 .quad 0x0000000000008082
811 .quad 0x800000000000808a
812 .quad 0x8000000080008000
813 .quad 0x000000000000808b
814 .quad 0x0000000080000001
815 .quad 0x8000000080008081
816 .quad 0x8000000000008009
817 .quad 0x000000000000008a
818 .quad 0x0000000000000088
819 .quad 0x0000000080008009
820 .quad 0x000000008000000a
821 .quad 0x000000008000808b
822 .quad 0x800000000000008b
823 .quad 0x8000000000008089
824 .quad 0x8000000000008003
825 .quad 0x8000000000008002
826 .quad 0x8000000000000080
827 .quad 0x000000000000800a
828 .quad 0x800000008000000a
829 .quad 0x8000000080008081
830 .quad 0x8000000000008080
831 .quad 0x0000000080000001
832 .quad 0x8000000080008008
833 .size iotas64,.-iotas64
835 .type KeccakF1600_neon, %function
840 mov r3, #24 @ loop counter
846 vst1.64 {q4}, [r0:64] @ offload A[0..1][4]
847 veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
848 vst1.64 {d18}, [r1:64] @ offload A[2][4]
849 veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
850 veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
851 veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
852 veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
853 veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
854 veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
855 veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
856 veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
857 veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
858 veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
859 veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
860 veor d25, d25, d24 @ C[4]^=A[4][4]
862 vadd.u64 q4, q13, q13 @ C[0..1]<<1
863 vadd.u64 q15, q14, q14 @ C[2..3]<<1
864 vadd.u64 d18, d25, d25 @ C[4]<<1
865 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
866 vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
867 vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
868 veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
869 veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
870 veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
871 veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
873 veor d0, d0, d25 @ A[0][0] ^= C[4]
874 veor d1, d1, d25 @ A[1][0] ^= C[4]
875 veor d10, d10, d25 @ A[2][0] ^= C[4]
876 veor d11, d11, d25 @ A[3][0] ^= C[4]
877 veor d20, d20, d25 @ A[4][0] ^= C[4]
879 veor d2, d2, d26 @ A[0][1] ^= D[1]
880 veor d3, d3, d26 @ A[1][1] ^= D[1]
881 veor d12, d12, d26 @ A[2][1] ^= D[1]
882 veor d13, d13, d26 @ A[3][1] ^= D[1]
883 veor d21, d21, d26 @ A[4][1] ^= D[1]
886 veor d6, d6, d28 @ A[0][3] ^= C[2]
887 veor d7, d7, d28 @ A[1][3] ^= C[2]
888 veor d16, d16, d28 @ A[2][3] ^= C[2]
889 veor d17, d17, d28 @ A[3][3] ^= C[2]
890 veor d23, d23, d28 @ A[4][3] ^= C[2]
891 vld1.64 {q4}, [r0:64] @ restore A[0..1][4]
894 vld1.64 {d18}, [r1:64] @ restore A[2][4]
895 veor q2, q2, q13 @ A[0..1][2] ^= D[2]
896 veor q7, q7, q13 @ A[2..3][2] ^= D[2]
897 veor d22, d22, d27 @ A[4][2] ^= D[2]
899 veor q4, q4, q14 @ A[0..1][4] ^= C[3]
900 veor q9, q9, q14 @ A[2..3][4] ^= C[3]
901 veor d24, d24, d29 @ A[4][4] ^= C[3]
904 vmov d26, d2 @ C[1] = A[0][1]
906 vmov d27, d4 @ C[2] = A[0][2]
907 vshl.u64 d4, d14, #43
908 vmov d28, d6 @ C[3] = A[0][3]
909 vshl.u64 d6, d17, #21
910 vmov d29, d8 @ C[4] = A[0][4]
911 vshl.u64 d8, d24, #14
912 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
913 vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
914 vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
915 vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
918 vshl.u64 d14, d16, #25
919 vshl.u64 d17, d15, #15
920 vshl.u64 d24, d21, #2
921 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
922 vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
923 vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
924 vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
926 vshl.u64 d9, d22, #61
927 @ vshl.u64 d16, d19, #8
928 vshl.u64 d15, d12, #10
929 vshl.u64 d21, d7, #55
930 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
931 vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
932 vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
933 vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
935 vshl.u64 d22, d18, #39
936 @ vshl.u64 d19, d23, #56
938 vshl.u64 d7, d13, #45
939 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
940 vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
941 vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
942 vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
944 vshl.u64 d18, d20, #18
945 vshl.u64 d23, d11, #41
947 vshl.u64 d13, d1, #36
948 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
949 vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
950 vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
951 vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
953 vshl.u64 d1, d28, #28
954 vshl.u64 d10, d26, #1
955 vshl.u64 d11, d29, #27
956 vshl.u64 d20, d27, #62
957 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
958 vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
959 vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
960 vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
966 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
967 veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
968 veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
969 vst1.64 {q13}, [r0:64] @ offload A[0..1][0]
972 vmov q1, q14 @ A[0..1][1]
973 veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
974 veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
977 vmov q0, q5 @ A[2..3][0]
979 vmov q15, q6 @ A[2..3][1]
980 veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
982 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
984 veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
986 veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
987 vmov q14, q10 @ A[4][0..1]
988 veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
990 vld1.64 d25, [r2:64]! @ Iota[i++]
993 vld1.64 {q0}, [r0:64] @ restore A[0..1][0]
994 veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
996 veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
998 veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
1000 veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
1001 veor d0, d0, d25 @ A[0][0] ^= Iota[i]
1002 veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
1008 .size KeccakF1600_neon,.-KeccakF1600_neon
1010 .global SHA3_absorb_neon
1011 .type SHA3_absorb_neon, %function
1014 stmdb sp!, {r4-r6,lr}
1015 vstmdb sp!, {d8-d15}
1021 vld1.32 {d0}, [r0:64]! @ A[0][0]
1022 vld1.32 {d2}, [r0:64]! @ A[0][1]
1023 vld1.32 {d4}, [r0:64]! @ A[0][2]
1024 vld1.32 {d6}, [r0:64]! @ A[0][3]
1025 vld1.32 {d8}, [r0:64]! @ A[0][4]
1027 vld1.32 {d1}, [r0:64]! @ A[1][0]
1028 vld1.32 {d3}, [r0:64]! @ A[1][1]
1029 vld1.32 {d5}, [r0:64]! @ A[1][2]
1030 vld1.32 {d7}, [r0:64]! @ A[1][3]
1031 vld1.32 {d9}, [r0:64]! @ A[1][4]
1033 vld1.32 {d10}, [r0:64]! @ A[2][0]
1034 vld1.32 {d12}, [r0:64]! @ A[2][1]
1035 vld1.32 {d14}, [r0:64]! @ A[2][2]
1036 vld1.32 {d16}, [r0:64]! @ A[2][3]
1037 vld1.32 {d18}, [r0:64]! @ A[2][4]
1039 vld1.32 {d11}, [r0:64]! @ A[3][0]
1040 vld1.32 {d13}, [r0:64]! @ A[3][1]
1041 vld1.32 {d15}, [r0:64]! @ A[3][2]
1042 vld1.32 {d17}, [r0:64]! @ A[3][3]
1043 vld1.32 {d19}, [r0:64]! @ A[3][4]
1045 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..3]
1046 vld1.32 {d24}, [r0:64] @ A[4][4]
1047 sub r0, r0, #24*8 @ rewind
1052 subs r12, r5, r6 @ len - bsz
1056 vld1.8 {d31}, [r4]! @ endian-neutral loads...
1058 veor d0, d0, d31 @ A[0][0] ^= *inp++
1061 veor d2, d2, d31 @ A[0][1] ^= *inp++
1065 veor d4, d4, d31 @ A[0][2] ^= *inp++
1068 veor d6, d6, d31 @ A[0][3] ^= *inp++
1072 veor d8, d8, d31 @ A[0][4] ^= *inp++
1076 veor d1, d1, d31 @ A[1][0] ^= *inp++
1080 veor d3, d3, d31 @ A[1][1] ^= *inp++
1083 veor d5, d5, d31 @ A[1][2] ^= *inp++
1087 veor d7, d7, d31 @ A[1][3] ^= *inp++
1090 veor d9, d9, d31 @ A[1][4] ^= *inp++
1095 veor d10, d10, d31 @ A[2][0] ^= *inp++
1098 veor d12, d12, d31 @ A[2][1] ^= *inp++
1102 veor d14, d14, d31 @ A[2][2] ^= *inp++
1105 veor d16, d16, d31 @ A[2][3] ^= *inp++
1109 veor d18, d18, d31 @ A[2][4] ^= *inp++
1113 veor d11, d11, d31 @ A[3][0] ^= *inp++
1117 veor d13, d13, d31 @ A[3][1] ^= *inp++
1120 veor d15, d15, d31 @ A[3][2] ^= *inp++
1124 veor d17, d17, d31 @ A[3][3] ^= *inp++
1127 veor d19, d19, d31 @ A[3][4] ^= *inp++
1132 veor d20, d20, d31 @ A[4][0] ^= *inp++
1135 veor d21, d21, d31 @ A[4][1] ^= *inp++
1139 veor d22, d22, d31 @ A[4][2] ^= *inp++
1142 veor d23, d23, d31 @ A[4][3] ^= *inp++
1145 veor d24, d24, d31 @ A[4][4] ^= *inp++
1153 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1154 vst1.32 {d2}, [r0:64]!
1155 vst1.32 {d4}, [r0:64]!
1156 vst1.32 {d6}, [r0:64]!
1157 vst1.32 {d8}, [r0:64]!
1159 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1160 vst1.32 {d3}, [r0:64]!
1161 vst1.32 {d5}, [r0:64]!
1162 vst1.32 {d7}, [r0:64]!
1163 vst1.32 {d9}, [r0:64]!
1165 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1166 vst1.32 {d12}, [r0:64]!
1167 vst1.32 {d14}, [r0:64]!
1168 vst1.32 {d16}, [r0:64]!
1169 vst1.32 {d18}, [r0:64]!
1171 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1172 vst1.32 {d13}, [r0:64]!
1173 vst1.32 {d15}, [r0:64]!
1174 vst1.32 {d17}, [r0:64]!
1175 vst1.32 {d19}, [r0:64]!
1177 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1178 vst1.32 {d24}, [r0:64]
1180 mov r0, r5 @ return value
1181 vldmia sp!, {d8-d15}
1182 ldmia sp!, {r4-r6,pc}
1183 .size SHA3_absorb_neon,.-SHA3_absorb_neon
1185 .global SHA3_squeeze_neon
1186 .type SHA3_squeeze_neon, %function
1189 stmdb sp!, {r4-r6,lr}
1194 mov r12, r0 @ A_flat
1196 b .Loop_squeeze_neon
1201 blo .Lsqueeze_neon_tail
1202 vld1.32 {d0}, [r12]!
1203 vst1.8 {d0}, [r4]! @ endian-neutral store
1205 subs r5, r5, #8 @ len -= 8
1206 beq .Lsqueeze_neon_done
1208 subs r14, r14, #8 @ bsz -= 8
1209 bhi .Loop_squeeze_neon
1211 vstmdb sp!, {d8-d15}
1213 vld1.32 {d0}, [r0:64]! @ A[0][0..4]
1214 vld1.32 {d2}, [r0:64]!
1215 vld1.32 {d4}, [r0:64]!
1216 vld1.32 {d6}, [r0:64]!
1217 vld1.32 {d8}, [r0:64]!
1219 vld1.32 {d1}, [r0:64]! @ A[1][0..4]
1220 vld1.32 {d3}, [r0:64]!
1221 vld1.32 {d5}, [r0:64]!
1222 vld1.32 {d7}, [r0:64]!
1223 vld1.32 {d9}, [r0:64]!
1225 vld1.32 {d10}, [r0:64]! @ A[2][0..4]
1226 vld1.32 {d12}, [r0:64]!
1227 vld1.32 {d14}, [r0:64]!
1228 vld1.32 {d16}, [r0:64]!
1229 vld1.32 {d18}, [r0:64]!
1231 vld1.32 {d11}, [r0:64]! @ A[3][0..4]
1232 vld1.32 {d13}, [r0:64]!
1233 vld1.32 {d15}, [r0:64]!
1234 vld1.32 {d17}, [r0:64]!
1235 vld1.32 {d19}, [r0:64]!
1237 vld1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1238 vld1.32 {d24}, [r0:64]
1239 sub r0, r0, #24*8 @ rewind
1243 mov r12, r0 @ A_flat
1244 vst1.32 {d0}, [r0:64]! @ A[0][0..4]
1245 vst1.32 {d2}, [r0:64]!
1246 vst1.32 {d4}, [r0:64]!
1247 vst1.32 {d6}, [r0:64]!
1248 vst1.32 {d8}, [r0:64]!
1250 vst1.32 {d1}, [r0:64]! @ A[1][0..4]
1251 vst1.32 {d3}, [r0:64]!
1252 vst1.32 {d5}, [r0:64]!
1253 vst1.32 {d7}, [r0:64]!
1254 vst1.32 {d9}, [r0:64]!
1256 vst1.32 {d10}, [r0:64]! @ A[2][0..4]
1257 vst1.32 {d12}, [r0:64]!
1258 vst1.32 {d14}, [r0:64]!
1259 vst1.32 {d16}, [r0:64]!
1260 vst1.32 {d18}, [r0:64]!
1262 vst1.32 {d11}, [r0:64]! @ A[3][0..4]
1263 vst1.32 {d13}, [r0:64]!
1264 vst1.32 {d15}, [r0:64]!
1265 vst1.32 {d17}, [r0:64]!
1266 vst1.32 {d19}, [r0:64]!
1268 vst1.32 {d20-d23}, [r0:64]! @ A[4][0..4]
1270 vst1.32 {d24}, [r0:64]
1271 mov r0, r12 @ rewind
1273 vldmia sp!, {d8-d15}
1274 b .Loop_squeeze_neon
1277 .Lsqueeze_neon_tail:
1280 strb r2, [r4],#1 @ endian-neutral store
1282 blo .Lsqueeze_neon_done
1285 beq .Lsqueeze_neon_done
1289 blo .Lsqueeze_neon_done
1291 beq .Lsqueeze_neon_done
1296 blo .Lsqueeze_neon_done
1299 beq .Lsqueeze_neon_done
1302 .Lsqueeze_neon_done:
1303 ldmia sp!, {r4-r6,pc}
1304 .size SHA3_squeeze_neon,.-SHA3_squeeze_neon
1305 .asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
# Close STDOUT explicitly so that buffered output is flushed. Errors on
# buffered writes (full disk, closed pipe) are only reported when the handle
# is closed, so a failed close must abort with a diagnostic instead of
# letting the script exit successfully with truncated output.
1311 close STDOUT or die "error closing STDOUT: $!"; # enforce flush