2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv4.
20 # Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
21 # interleaving. How does it compare to Keccak Code Package? It's as
22 # fast, but several times smaller, and is endian- and ISA-neutral. ISA
23 # neutrality means that minimum ISA requirement is ARMv4, yet it can
24 # be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
25 # register layout taken from Keccak Code Package. It's also as fast,
26 # in fact faster by 10-15% on some processors, and endian-neutral.
30 # Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
31 # of rotate instructions with logical ones. This resulted in ~10%
32 # improvement on most processors. Switch to KECCAK_2X effectively
33 # minimizes re-loads from temporary storage, and merged rotates just
# eliminate corresponding instructions. As for the latter: when examining
35 # code you'll notice commented ror instructions. These are eliminated
36 # ones, and you should trace destination register below to see what's
# going on. In case you wonder why not all rotates are eliminated: trouble
# is that you have operations that require both inputs to be rotated,
39 # e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using
40 # 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation
41 # that takes 'a' as input. And thing is that this next operation can
42 # be in next round. It's totally possible to "carry" rotate "factors"
43 # to the next round, but it makes code more complex. And the last word
44 # is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the
47 # Reduce per-round instruction count in Thumb-2 case by 16%. This is
48 # achieved by folding ldr/str pairs to their double-word counterparts.
49 # Theoretically this should have improved performance on single-issue
50 # cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
53 ########################################################################
54 # Numbers are cycles per processed byte. Non-NEON results account even
55 # for input bit interleaving.
57 # r=1088(*) Thumb-2(**) NEON
60 # Cortex-A5 88/+160%, 86, 36
61 # Cortex-A7 78/+160%, 68, 34
62 # Cortex-A8 51/+230%, 57, 30
63 # Cortex-A9 53/+210%, 51, 26
64 # Cortex-A15 42/+160%, 38, 18
65 # Snapdragon S4 43/+210%, 38, 24
67 # (*) Corresponds to SHA3-256. Percentage after slash is improvement
68 # over compiler-generated KECCAK_2X reference code.
69 # (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to
70 # Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
71 # processors are presented mostly for reference purposes.
74 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
75 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
77 if ($flavour && $flavour ne "void") {
78 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
79 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
80 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
81 die "can't locate arm-xlate.pl";
83 open STDOUT,"| \"$^X\" $xlate $flavour $output";
85 open STDOUT,">$output";
88 my @C = map("r$_",(0..9));
89 my @E = map("r$_",(10..12,14));
91 ########################################################################
93 # ----->+-----------------------+
94 # | uint64_t A[5][5] |
96 # +200->+-----------------------+
99 # +240->+-----------------------+
100 # | uint64_t T[5][5] |
102 # +440->+-----------------------+
104 # +444->+-----------------------+
106 # +448->+-----------------------+
109 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
110 my @D = map(8*$_, (25..29));
111 my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
114 #include "arm_arch.h"
116 #if defined(__thumb2__)
125 .type iotas32, %object
128 .long 0x00000001, 0x00000000
129 .long 0x00000000, 0x00000089
130 .long 0x00000000, 0x8000008b
131 .long 0x00000000, 0x80008080
132 .long 0x00000001, 0x0000008b
133 .long 0x00000001, 0x00008000
134 .long 0x00000001, 0x80008088
135 .long 0x00000001, 0x80000082
136 .long 0x00000000, 0x0000000b
137 .long 0x00000000, 0x0000000a
138 .long 0x00000001, 0x00008082
139 .long 0x00000000, 0x00008003
140 .long 0x00000001, 0x0000808b
141 .long 0x00000001, 0x8000000b
142 .long 0x00000001, 0x8000008a
143 .long 0x00000001, 0x80000081
144 .long 0x00000000, 0x80000081
145 .long 0x00000000, 0x80000008
146 .long 0x00000000, 0x00000083
147 .long 0x00000000, 0x80008003
148 .long 0x00000001, 0x80008088
149 .long 0x00000000, 0x80000088
150 .long 0x00000001, 0x00008000
151 .long 0x00000000, 0x80008082
152 .size iotas32,.-iotas32
154 .type KeccakF1600_int, %function
157 add @C[9],sp,#$A[4][2]
158 add @E[2],sp,#$A[0][0]
159 add @E[0],sp,#$A[1][0]
160 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4]
163 eor @E[1],@E[1],@E[1]
171 my (@A,@R); (@A[0..4],@R) = @_;
174 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1]
175 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1]
177 eor @C[0],@C[0],@E[0]
178 eor @C[1],@C[1],@E[1]
179 eor @C[2],@C[2],@E[2]
180 ldrd @E[0],@E[1],[sp,#$A[1][2]]
181 eor @C[3],@C[3],@E[3]
182 ldrd @E[2],@E[3],[sp,#$A[1][3]]
183 eor @C[4],@C[4],@E[0]
184 eor @C[5],@C[5],@E[1]
185 eor @C[6],@C[6],@E[2]
186 ldrd @E[0],@E[1],[sp,#$A[1][4]]
187 eor @C[7],@C[7],@E[3]
188 ldrd @E[2],@E[3],[sp,#$A[2][0]]
189 eor @C[8],@C[8],@E[0]
190 eor @C[9],@C[9],@E[1]
191 eor @C[0],@C[0],@E[2]
192 ldrd @E[0],@E[1],[sp,#$A[2][1]]
193 eor @C[1],@C[1],@E[3]
194 ldrd @E[2],@E[3],[sp,#$A[2][2]]
195 eor @C[2],@C[2],@E[0]
196 eor @C[3],@C[3],@E[1]
197 eor @C[4],@C[4],@E[2]
198 ldrd @E[0],@E[1],[sp,#$A[2][3]]
199 eor @C[5],@C[5],@E[3]
200 ldrd @E[2],@E[3],[sp,#$A[2][4]]
201 eor @C[6],@C[6],@E[0]
202 eor @C[7],@C[7],@E[1]
203 eor @C[8],@C[8],@E[2]
204 ldrd @E[0],@E[1],[sp,#$A[3][0]]
205 eor @C[9],@C[9],@E[3]
206 ldrd @E[2],@E[3],[sp,#$A[3][1]]
207 eor @C[0],@C[0],@E[0]
208 eor @C[1],@C[1],@E[1]
209 eor @C[2],@C[2],@E[2]
210 ldrd @E[0],@E[1],[sp,#$A[3][2]]
211 eor @C[3],@C[3],@E[3]
212 ldrd @E[2],@E[3],[sp,#$A[3][3]]
213 eor @C[4],@C[4],@E[0]
214 eor @C[5],@C[5],@E[1]
215 eor @C[6],@C[6],@E[2]
216 ldrd @E[0],@E[1],[sp,#$A[3][4]]
217 eor @C[7],@C[7],@E[3]
218 ldrd @E[2],@E[3],[sp,#$A[4][0]]
219 eor @C[8],@C[8],@E[0]
220 eor @C[9],@C[9],@E[1]
221 eor @C[0],@C[0],@E[2]
222 ldrd @E[0],@E[1],[sp,#$A[4][1]]
223 eor @C[1],@C[1],@E[3]
224 ldrd @E[2],@E[3],[sp,#$A[0][2]]
225 eor @C[2],@C[2],@E[0]
226 eor @C[3],@C[3],@E[1]
227 eor @C[4],@C[4],@E[2]
228 ldrd @E[0],@E[1],[sp,#$A[0][3]]
229 eor @C[5],@C[5],@E[3]
230 ldrd @E[2],@E[3],[sp,#$A[0][4]]
232 eor @C[0],@C[0],@E[0]
233 add @E[0],sp,#$A[1][2]
234 eor @C[1],@C[1],@E[1]
235 eor @C[2],@C[2],@E[2]
236 eor @C[3],@C[3],@E[3]
237 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3]
238 eor @C[4],@C[4],@E[0]
239 add @E[0],sp,#$A[1][4]
240 eor @C[5],@C[5],@E[1]
241 eor @C[6],@C[6],@E[2]
242 eor @C[7],@C[7],@E[3]
243 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0]
244 eor @C[8],@C[8],@E[0]
245 add @E[0],sp,#$A[2][1]
246 eor @C[9],@C[9],@E[1]
247 eor @C[0],@C[0],@E[2]
248 eor @C[1],@C[1],@E[3]
249 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2]
250 eor @C[2],@C[2],@E[0]
251 add @E[0],sp,#$A[2][3]
252 eor @C[3],@C[3],@E[1]
253 eor @C[4],@C[4],@E[2]
254 eor @C[5],@C[5],@E[3]
255 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4]
256 eor @C[6],@C[6],@E[0]
257 add @E[0],sp,#$A[3][0]
258 eor @C[7],@C[7],@E[1]
259 eor @C[8],@C[8],@E[2]
260 eor @C[9],@C[9],@E[3]
261 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1]
262 eor @C[0],@C[0],@E[0]
263 add @E[0],sp,#$A[3][2]
264 eor @C[1],@C[1],@E[1]
265 eor @C[2],@C[2],@E[2]
266 eor @C[3],@C[3],@E[3]
267 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3]
268 eor @C[4],@C[4],@E[0]
269 add @E[0],sp,#$A[3][4]
270 eor @C[5],@C[5],@E[1]
271 eor @C[6],@C[6],@E[2]
272 eor @C[7],@C[7],@E[3]
273 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0]
274 eor @C[8],@C[8],@E[0]
275 ldr @E[0],[sp,#$A[4][1]] @ A[4][1]
276 eor @C[9],@C[9],@E[1]
277 ldr @E[1],[sp,#$A[4][1]+4]
278 eor @C[0],@C[0],@E[2]
279 ldr @E[2],[sp,#$A[0][2]] @ A[0][2]
280 eor @C[1],@C[1],@E[3]
281 ldr @E[3],[sp,#$A[0][2]+4]
282 eor @C[2],@C[2],@E[0]
283 add @E[0],sp,#$A[0][3]
284 eor @C[3],@C[3],@E[1]
285 eor @C[4],@C[4],@E[2]
286 eor @C[5],@C[5],@E[3]
287 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4]
289 eor @C[6],@C[6],@E[0]
290 eor @C[7],@C[7],@E[1]
291 eor @C[8],@C[8],@E[2]
292 eor @C[9],@C[9],@E[3]
294 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0];
295 str.l @E[0],[sp,#$D[1]] @ D[1] = E[0]
296 eor @E[1],@C[1],@C[4]
297 str.h @E[1],[sp,#$D[1]+4]
298 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3];
299 eor @E[3],@C[7],@C[0]
300 str.l @E[2],[sp,#$D[4]] @ D[4] = E[1]
301 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4];
302 str.h @E[3],[sp,#$D[4]+4]
303 eor @C[1],@C[9],@C[2]
304 str.l @C[0],[sp,#$D[0]] @ D[0] = C[0]
305 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1];
306 ldr.l @C[7],[sp,#$A[3][3]]
307 eor @C[3],@C[3],@C[6]
308 str.h @C[1],[sp,#$D[0]+4]
309 ldr.h @C[6],[sp,#$A[3][3]+4]
310 str.l @C[2],[sp,#$D[2]] @ D[2] = C[1]
311 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2];
312 str.h @C[3],[sp,#$D[2]+4]
313 eor @C[5],@C[5],@C[8]
315 ldr.l @C[8],[sp,#$A[4][4]]
316 ldr.h @C[9],[sp,#$A[4][4]+4]
317 str.l @C[4],[sp,#$D[3]] @ D[3] = C[2]
318 eor @C[7],@C[7],@C[4]
319 str.h @C[5],[sp,#$D[3]+4]
320 eor @C[6],@C[6],@C[5]
321 ldr.l @C[4],[sp,#$A[0][0]]
322 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
323 @ ror @C[6],@C[6],#32-11
324 ldr.h @C[5],[sp,#$A[0][0]+4]
325 eor @C[8],@C[8],@E[2]
326 eor @C[9],@C[9],@E[3]
327 ldr.l @E[2],[sp,#$A[2][2]]
328 eor @C[0],@C[0],@C[4]
329 ldr.h @E[3],[sp,#$A[2][2]+4]
330 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
331 @ ror @C[9],@C[9],#32-7
332 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
333 eor @E[2],@E[2],@C[2]
334 ldr.l @C[2],[sp,#$A[1][1]]
335 eor @E[3],@E[3],@C[3]
336 ldr.h @C[3],[sp,#$A[1][1]+4]
337 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
338 ldr @E[2],[sp,#444] @ load counter
339 eor @C[2],@C[2],@E[0]
341 ror @C[4],@E[3],#32-22
342 add @E[3],@E[0],@E[2]
343 eor @C[3],@C[3],@E[1]
345 $code.=<<___ if ($A[0][0] != $T[0][0]);
346 ldmia @E[3],{@E[0],@E[1]} @ iotas[i]
348 $code.=<<___ if ($A[0][0] == $T[0][0]);
349 ldr.l @E[0],[@E[3],#8] @ iotas[i].lo
351 ldr.h @E[1],[@E[3],#12] @ iotas[i].hi
353 str @E[2],[sp,#444] @ store counter
356 bic @E[2],@C[4],@C[2],ror#32-22
357 bic @E[3],@C[5],@C[3],ror#32-22
358 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
359 ror @C[3],@C[3],#32-22
360 eor @E[2],@E[2],@C[0]
361 eor @E[3],@E[3],@C[1]
362 eor @E[0],@E[0],@E[2]
363 eor @E[1],@E[1],@E[3]
364 str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
365 bic @E[2],@C[6],@C[4],ror#11
366 str.h @E[1],[sp,#$R[0][0]+4]
367 bic @E[3],@C[7],@C[5],ror#10
368 bic @E[0],@C[8],@C[6],ror#32-(11-7)
369 bic @E[1],@C[9],@C[7],ror#32-(10-7)
370 eor @E[2],@C[2],@E[2],ror#32-11
371 str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]);
372 eor @E[3],@C[3],@E[3],ror#32-10
373 str.h @E[3],[sp,#$R[0][1]+4]
374 eor @E[0],@C[4],@E[0],ror#32-7
375 eor @E[1],@C[5],@E[1],ror#32-7
376 str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]);
377 bic @E[2],@C[0],@C[8],ror#32-7
378 str.h @E[1],[sp,#$R[0][2]+4]
379 bic @E[3],@C[1],@C[9],ror#32-7
380 eor @E[2],@E[2],@C[6],ror#32-11
381 str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]);
382 eor @E[3],@E[3],@C[7],ror#32-10
383 str.h @E[3],[sp,#$R[0][3]+4]
384 bic @E[0],@C[2],@C[0]
386 ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3]
387 bic @E[1],@C[3],@C[1]
388 ldr.h @C[1],[sp,#$A[0][3]+4]
389 eor @E[0],@E[0],@C[8],ror#32-7
390 eor @E[1],@E[1],@C[9],ror#32-7
391 str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]);
393 str.h @E[1],[sp,#$R[0][4]+4]
395 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4]
396 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1]
398 ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4]
399 eor @C[0],@C[0],@E[0]
400 ldr.h @C[3],[sp,#$A[1][4]+4]
401 eor @C[1],@C[1],@E[1]
402 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
403 ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1]
404 @ ror @C[1],@C[1],#32-14
405 ldr.h @E[1],[sp,#$A[3][1]+4]
407 eor @C[2],@C[2],@E[2]
408 ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0]
409 eor @C[3],@C[3],@E[3]
410 ldr.h @C[5],[sp,#$A[2][0]+4]
411 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
412 @ ror @C[3],@C[3],#32-10
414 eor @C[6],@C[6],@C[4]
415 ldr.l @E[2],[sp,#$D[2]] @ D[2]
416 eor @C[7],@C[7],@C[5]
417 ldr.h @E[3],[sp,#$D[2]+4]
418 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
419 ror @C[4],@C[7],#32-2
421 eor @E[0],@E[0],@C[8]
422 ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2]
423 eor @E[1],@E[1],@C[9]
424 ldr.h @C[9],[sp,#$A[4][2]+4]
425 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
426 ror @C[6],@E[1],#32-23
428 bic @E[0],@C[4],@C[2],ror#32-10
429 bic @E[1],@C[5],@C[3],ror#32-10
430 eor @E[2],@E[2],@C[8]
431 eor @E[3],@E[3],@C[9]
432 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
433 ror @C[8],@E[3],#32-31
434 eor @E[0],@E[0],@C[0],ror#32-14
435 eor @E[1],@E[1],@C[1],ror#32-14
436 str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2])
437 bic @E[2],@C[6],@C[4]
438 str.h @E[1],[sp,#$R[1][0]+4]
439 bic @E[3],@C[7],@C[5]
440 eor @E[2],@E[2],@C[2],ror#32-10
441 str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]);
442 eor @E[3],@E[3],@C[3],ror#32-10
443 str.h @E[3],[sp,#$R[1][1]+4]
444 bic @E[0],@C[8],@C[6]
445 bic @E[1],@C[9],@C[7]
446 bic @E[2],@C[0],@C[8],ror#14
447 bic @E[3],@C[1],@C[9],ror#14
448 eor @E[0],@E[0],@C[4]
449 eor @E[1],@E[1],@C[5]
450 str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]);
451 bic @C[2],@C[2],@C[0],ror#32-(14-10)
452 str.h @E[1],[sp,#$R[1][2]+4]
453 eor @E[2],@C[6],@E[2],ror#32-14
454 bic @E[1],@C[3],@C[1],ror#32-(14-10)
455 str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]);
456 eor @E[3],@C[7],@E[3],ror#32-14
457 str.h @E[3],[sp,#$R[1][3]+4]
459 ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1]
460 eor @E[0],@C[8],@C[2],ror#32-10
461 ldr.h @C[0],[sp,#$A[0][1]+4]
462 eor @E[1],@C[9],@E[1],ror#32-10
463 str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]);
464 str.h @E[1],[sp,#$R[1][4]+4]
467 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2]
468 ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2]
469 ldr.h @C[3],[sp,#$A[1][2]+4]
470 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4]
472 eor @C[1],@C[1],@E[0]
473 ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3]
474 eor @C[0],@C[0],@E[1]
475 ldr.h @C[5],[sp,#$A[2][3]+4]
476 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
478 eor @C[2],@C[2],@E[2]
479 ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4]
480 eor @C[3],@C[3],@E[3]
481 ldr.h @E[1],[sp,#$A[3][4]+4]
482 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
483 ldr.l @E[2],[sp,#$D[0]] @ D[0]
484 @ ror @C[3],@C[3],#32-3
485 ldr.h @E[3],[sp,#$D[0]+4]
487 eor @C[4],@C[4],@C[6]
488 eor @C[5],@C[5],@C[7]
489 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
490 @ ror @C[4],@C[7],#32-13 @ [track reverse order below]
492 eor @E[0],@E[0],@C[8]
493 ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0]
494 eor @E[1],@E[1],@C[9]
495 ldr.h @C[9],[sp,#$A[4][0]+4]
496 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
497 ror @C[7],@E[1],#32-4
499 eor @E[2],@E[2],@C[8]
500 eor @E[3],@E[3],@C[9]
501 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
502 ror @C[9],@E[3],#32-9
504 bic @E[0],@C[5],@C[2],ror#13-3
505 bic @E[1],@C[4],@C[3],ror#12-3
506 bic @E[2],@C[6],@C[5],ror#32-13
507 bic @E[3],@C[7],@C[4],ror#32-12
508 eor @E[0],@C[0],@E[0],ror#32-13
509 eor @E[1],@C[1],@E[1],ror#32-12
510 str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2])
511 eor @E[2],@E[2],@C[2],ror#32-3
512 str.h @E[1],[sp,#$R[2][0]+4]
513 eor @E[3],@E[3],@C[3],ror#32-3
514 str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]);
515 bic @E[0],@C[8],@C[6]
516 bic @E[1],@C[9],@C[7]
517 str.h @E[3],[sp,#$R[2][1]+4]
518 eor @E[0],@E[0],@C[5],ror#32-13
519 eor @E[1],@E[1],@C[4],ror#32-12
520 str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]);
521 bic @E[2],@C[0],@C[8]
522 str.h @E[1],[sp,#$R[2][2]+4]
523 bic @E[3],@C[1],@C[9]
524 eor @E[2],@E[2],@C[6]
525 eor @E[3],@E[3],@C[7]
526 str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]);
527 bic @E[0],@C[2],@C[0],ror#3
528 str.h @E[3],[sp,#$R[2][3]+4]
529 bic @E[1],@C[3],@C[1],ror#3
530 ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order]
531 eor @E[0],@C[8],@E[0],ror#32-3
532 ldr.h @C[0],[sp,#$A[0][4]+4]
533 eor @E[1],@C[9],@E[1],ror#32-3
534 str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]);
536 str.h @E[1],[sp,#$R[2][4]+4]
538 ldr.l @E[0],[sp,#$D[4]] @ D[4]
539 ldr.h @E[1],[sp,#$D[4]+4]
540 ldr.l @E[2],[sp,#$D[0]] @ D[0]
541 ldr.h @E[3],[sp,#$D[0]+4]
543 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2]
545 eor @C[1],@C[1],@E[0]
546 ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0]
547 eor @C[0],@C[0],@E[1]
548 ldr.h @C[3],[sp,#$A[1][0]+4]
549 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
550 ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1]
551 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order]
552 ldr.h @C[5],[sp,#$A[2][1]+4]
554 eor @C[2],@C[2],@E[2]
555 ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2]
556 eor @C[3],@C[3],@E[3]
557 ldr.h @E[1],[sp,#$A[3][2]+4]
558 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
559 ldr.l @E[2],[sp,#$D[3]] @ D[3]
560 @ ror @C[3],@C[3],#32-18
561 ldr.h @E[3],[sp,#$D[3]+4]
563 eor @C[6],@C[6],@C[4]
564 eor @C[7],@C[7],@C[5]
565 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
566 ror @C[5],@C[7],#32-5
568 eor @E[0],@E[0],@C[8]
569 ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3]
570 eor @E[1],@E[1],@C[9]
571 ldr.h @C[9],[sp,#$A[4][3]+4]
572 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
573 ror @C[6],@E[1],#32-8
575 eor @E[2],@E[2],@C[8]
576 eor @E[3],@E[3],@C[9]
577 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
578 ror @C[9],@E[3],#32-28
580 bic @E[0],@C[4],@C[2],ror#32-18
581 bic @E[1],@C[5],@C[3],ror#32-18
582 eor @E[0],@E[0],@C[0],ror#32-14
583 eor @E[1],@E[1],@C[1],ror#32-13
584 str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2])
585 bic @E[2],@C[6],@C[4]
586 str.h @E[1],[sp,#$R[3][0]+4]
587 bic @E[3],@C[7],@C[5]
588 eor @E[2],@E[2],@C[2],ror#32-18
589 str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]);
590 eor @E[3],@E[3],@C[3],ror#32-18
591 str.h @E[3],[sp,#$R[3][1]+4]
592 bic @E[0],@C[8],@C[6]
593 bic @E[1],@C[9],@C[7]
594 bic @E[2],@C[0],@C[8],ror#14
595 bic @E[3],@C[1],@C[9],ror#13
596 eor @E[0],@E[0],@C[4]
597 eor @E[1],@E[1],@C[5]
598 str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]);
599 bic @C[2],@C[2],@C[0],ror#18-14
600 str.h @E[1],[sp,#$R[3][2]+4]
601 eor @E[2],@C[6],@E[2],ror#32-14
602 bic @E[1],@C[3],@C[1],ror#18-13
603 eor @E[3],@C[7],@E[3],ror#32-13
604 str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]);
605 str.h @E[3],[sp,#$R[3][3]+4]
607 ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2]
608 eor @E[0],@C[8],@C[2],ror#32-18
609 ldr.h @C[1],[sp,#$A[0][2]+4]
610 eor @E[1],@C[9],@E[1],ror#32-18
611 str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]);
612 str.h @E[1],[sp,#$R[3][4]+4]
614 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3]
615 ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3]
616 ldr.h @C[3],[sp,#$A[1][3]+4]
617 ldr.l @C[6],[sp,#$D[4]] @ D[4]
618 ldr.h @C[7],[sp,#$D[4]+4]
620 eor @C[0],@C[0],@E[0]
621 ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4]
622 eor @C[1],@C[1],@E[1]
623 ldr.h @C[5],[sp,#$A[2][4]+4]
624 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
625 ldr.l @C[8],[sp,#$D[0]] @ D[0]
626 @ ror @C[1],@C[1],#32-31
627 ldr.h @C[9],[sp,#$D[0]+4]
629 eor @E[2],@E[2],@C[2]
630 ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0]
631 eor @E[3],@E[3],@C[3]
632 ldr.h @E[1],[sp,#$A[3][0]+4]
633 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
634 ldr.l @E[2],[sp,#$D[1]] @ D[1]
635 ror @C[2],@E[3],#32-28
636 ldr.h @E[3],[sp,#$D[1]+4]
638 eor @C[6],@C[6],@C[4]
639 eor @C[7],@C[7],@C[5]
640 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
641 ror @C[4],@C[7],#32-20
643 eor @E[0],@E[0],@C[8]
644 ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1]
645 eor @E[1],@E[1],@C[9]
646 ldr.h @C[9],[sp,#$A[4][1]+4]
647 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
648 ror @C[6],@E[1],#32-21
650 eor @C[8],@C[8],@E[2]
651 eor @C[9],@C[9],@E[3]
652 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
653 @ ror @C[9],@C[3],#32-1
655 bic @E[0],@C[4],@C[2]
656 bic @E[1],@C[5],@C[3]
657 eor @E[0],@E[0],@C[0],ror#32-31
658 str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2])
659 eor @E[1],@E[1],@C[1],ror#32-31
660 str.h @E[1],[sp,#$R[4][0]+4]
661 bic @E[2],@C[6],@C[4]
662 bic @E[3],@C[7],@C[5]
663 eor @E[2],@E[2],@C[2]
664 eor @E[3],@E[3],@C[3]
665 str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]);
666 bic @E[0],@C[8],@C[6],ror#1
667 str.h @E[3],[sp,#$R[4][1]+4]
668 bic @E[1],@C[9],@C[7],ror#1
669 bic @E[2],@C[0],@C[8],ror#31-1
670 bic @E[3],@C[1],@C[9],ror#31-1
671 eor @C[4],@C[4],@E[0],ror#32-1
672 str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]);
673 eor @C[5],@C[5],@E[1],ror#32-1
674 str.h @C[5],[sp,#$R[4][2]+4]
675 eor @C[6],@C[6],@E[2],ror#32-31
676 eor @C[7],@C[7],@E[3],ror#32-31
677 str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]);
678 bic @E[0],@C[2],@C[0],ror#32-31
679 str.h @C[7],[sp,#$R[4][3]+4]
680 bic @E[1],@C[3],@C[1],ror#32-31
681 add @E[2],sp,#$R[0][0]
682 eor @C[8],@E[0],@C[8],ror#32-1
683 add @E[0],sp,#$R[1][0]
684 eor @C[9],@E[1],@C[9],ror#32-1
685 str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]);
686 str.h @C[9],[sp,#$R[4][4]+4]
699 moveq pc,lr @ be binary compatible with V4, yet
700 bx lr @ interoperable with Thumb ISA:-)
702 .size KeccakF1600_int,.-KeccakF1600_int
704 .type KeccakF1600, %function
707 stmdb sp!,{r0,r4-r11,lr}
708 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],...
710 add @E[0],r0,#$A[1][0]
711 add @E[1],sp,#$A[1][0]
712 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
713 stmia sp, {@C[0]-@C[9]}
714 ldmia @E[0]!,{@C[0]-@C[9]}
715 stmia @E[1]!,{@C[0]-@C[9]}
716 ldmia @E[0]!,{@C[0]-@C[9]}
717 stmia @E[1]!,{@C[0]-@C[9]}
718 ldmia @E[0]!,{@C[0]-@C[9]}
719 stmia @E[1]!,{@C[0]-@C[9]}
720 ldmia @E[0], {@C[0]-@C[9]}
721 add @E[2],sp,#$A[0][0]
722 add @E[0],sp,#$A[1][0]
723 stmia @E[1], {@C[0]-@C[9]}
727 ldr @E[1], [sp,#440+16] @ restore pointer to A
728 ldmia sp, {@C[0]-@C[9]}
729 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5]
730 ldmia @E[0]!,{@C[0]-@C[9]}
731 stmia @E[1]!,{@C[0]-@C[9]}
732 ldmia @E[0]!,{@C[0]-@C[9]}
733 stmia @E[1]!,{@C[0]-@C[9]}
734 ldmia @E[0]!,{@C[0]-@C[9]}
735 stmia @E[1]!,{@C[0]-@C[9]}
736 ldmia @E[0], {@C[0]-@C[9]}
737 stmia @E[1], {@C[0]-@C[9]}
741 ldmia sp!,{r4-r11,pc}
743 ldmia sp!,{r4-r11,lr}
745 moveq pc,lr @ be binary compatible with V4, yet
746 bx lr @ interoperable with Thumb ISA:-)
748 .size KeccakF1600,.-KeccakF1600
750 { my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
752 ########################################################################
754 # ----->+-----------------------+
755 # | uint64_t A[5][5] |
758 # +456->+-----------------------+
760 # +460->+-----------------------+
762 # +464->+-----------------------+
764 # +468->+-----------------------+
766 # +472->+-----------------------+
768 # +476->+-----------------------+
769 # | const void *inp |
770 # +480->+-----------------------+
772 # +484->+-----------------------+
774 # +488->+-----------------------+
779 .type SHA3_absorb,%function
782 stmdb sp!,{r0-r12,lr}
785 add $A_flat,r0,#$A[1][0]
793 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack
794 stmia $inp!, {@C[0]-@C[9]}
795 ldmia $A_flat!,{@C[0]-@C[9]}
796 stmia $inp!, {@C[0]-@C[9]}
797 ldmia $A_flat!,{@C[0]-@C[9]}
798 stmia $inp!, {@C[0]-@C[9]}
799 ldmia $A_flat!,{@C[0]-@C[9]}
800 stmia $inp!, {@C[0]-@C[9]}
801 ldmia $A_flat!,{@C[0]-@C[9]}
802 stmia $inp, {@C[0]-@C[9]}
804 ldr $inp,[sp,#476] @ restore $inp
811 mov r6,#0x11 @ compose constants
816 orr r6,r6,r6,lsl#16 @ 0x11111111
817 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
818 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
819 orr r7,r6,r6,lsl#1 @ 0x33333333
820 orr r6,r6,r6,lsl#2 @ 0x55555555
833 str r0,[sp,#480] @ save len - bsz
846 orr r0,r0,r3,lsl#24 @ lo
850 orr r1,r1,r3,lsl#24 @ hi
852 and r2,r0,r6 @ &=0x55555555
853 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa
854 and r3,r1,r6 @ &=0x55555555
855 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
860 and r2,r2,r7 @ &=0x33333333
861 and r0,r0,r7,lsl#2 @ &=0xcccccccc
862 and r3,r3,r7 @ &=0x33333333
863 and r1,r1,r7,lsl#2 @ &=0xcccccccc
868 and r2,r2,r8 @ &=0x0f0f0f0f
869 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0
870 and r3,r3,r8 @ &=0x0f0f0f0f
871 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
872 ldmia $A_flat,{r4-r5} @ A_flat[i]
877 and r2,r2,r9 @ &=0x00ff00ff
878 and r0,r0,r9,lsl#8 @ &=0xff00ff00
879 and r3,r3,r9 @ &=0x00ff00ff
880 and r1,r1,r9,lsl#8 @ &=0xff00ff00
892 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7])
902 ldmia r14,{r6-r12,r14} @ restore constants and variables
907 add $inp,sp,#$A[1][0]
908 ldmia sp, {@C[0]-@C[9]}
909 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5]
910 ldmia $inp!, {@C[0]-@C[9]}
911 stmia $A_flat!,{@C[0]-@C[9]}
912 ldmia $inp!, {@C[0]-@C[9]}
913 stmia $A_flat!,{@C[0]-@C[9]}
914 ldmia $inp!, {@C[0]-@C[9]}
915 stmia $A_flat!,{@C[0]-@C[9]}
916 ldmia $inp, {@C[0]-@C[9]}
917 stmia $A_flat, {@C[0]-@C[9]}
921 mov r0,$len @ return value
923 ldmia sp!,{r4-r12,pc}
925 ldmia sp!,{r4-r12,lr}
927 moveq pc,lr @ be binary compatible with V4, yet
928 bx lr @ interoperable with Thumb ISA:-)
930 .size SHA3_absorb,.-SHA3_absorb
933 { my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12));
937 .type SHA3_squeeze,%function
940 stmdb sp!,{r0,r3-r10,lr}
953 mov r6,#0x11 @ compose constants
958 orr r6,r6,r6,lsl#16 @ 0x11111111
959 orr r9,r9,r9,lsl#16 @ 0x00ff00ff
960 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f
961 orr r7,r6,r6,lsl#1 @ 0x33333333
962 orr r6,r6,r6,lsl#2 @ 0x55555555
971 ldmia $A_flat!,{r0,r1} @ A_flat[i++]
974 lsl r3,r1,#16 @ r3 = r1 << 16
975 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff
977 lsr r0,r0,#16 @ r0 = r0 >> 16
978 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000
984 and r2,r2,r9 @ &=0x00ff00ff
985 and r3,r3,r9,lsl#8 @ &=0xff00ff00
986 and r0,r0,r9 @ &=0x00ff00ff
987 and r1,r1,r9,lsl#8 @ &=0xff00ff00
992 and r2,r2,r8 @ &=0x0f0f0f0f
993 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0
994 and r0,r0,r8 @ &=0x0f0f0f0f
995 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0
1000 and r2,r2,r7 @ &=0x33333333
1001 and r3,r3,r7,lsl#2 @ &=0xcccccccc
1002 and r0,r0,r7 @ &=0x33333333
1003 and r1,r1,r7,lsl#2 @ &=0xcccccccc
1008 and r2,r2,r6 @ &=0x55555555
1009 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa
1010 and r0,r0,r6 @ &=0x55555555
1011 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa
1036 subs $bsz,$bsz,#8 @ bsz -= 8
1039 mov r0,r14 @ original $A_flat
1043 ldmia sp,{r6-r10,r12} @ restore constants and variables
1080 ldmia sp!,{r4-r10,pc}
1082 ldmia sp!,{r4-r10,lr}
1084 moveq pc,lr @ be binary compatible with V4, yet
1085 bx lr @ interoperable with Thumb ISA:-)
1087 .size SHA3_squeeze,.-SHA3_squeeze
1092 #if __ARM_MAX_ARCH__>=7
1095 .type iotas64, %object
1098 .quad 0x0000000000000001
1099 .quad 0x0000000000008082
1100 .quad 0x800000000000808a
1101 .quad 0x8000000080008000
1102 .quad 0x000000000000808b
1103 .quad 0x0000000080000001
1104 .quad 0x8000000080008081
1105 .quad 0x8000000000008009
1106 .quad 0x000000000000008a
1107 .quad 0x0000000000000088
1108 .quad 0x0000000080008009
1109 .quad 0x000000008000000a
1110 .quad 0x000000008000808b
1111 .quad 0x800000000000008b
1112 .quad 0x8000000000008089
1113 .quad 0x8000000000008003
1114 .quad 0x8000000000008002
1115 .quad 0x8000000000000080
1116 .quad 0x000000000000800a
1117 .quad 0x800000008000000a
1118 .quad 0x8000000080008081
1119 .quad 0x8000000000008080
1120 .quad 0x0000000080000001
1121 .quad 0x8000000080008008
1122 .size iotas64,.-iotas64
1124 .type KeccakF1600_neon, %function
1129 mov r3, #24 @ loop counter
1135 vst1.64 {q4}, [r0,:64] @ offload A[0..1][4]
1136 veor q13, q0, q5 @ A[0..1][0]^A[2..3][0]
1137 vst1.64 {d18}, [r1,:64] @ offload A[2][4]
1138 veor q14, q1, q6 @ A[0..1][1]^A[2..3][1]
1139 veor q15, q2, q7 @ A[0..1][2]^A[2..3][2]
1140 veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0]
1141 veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1]
1142 veor q14, q3, q8 @ A[0..1][3]^A[2..3][3]
1143 veor q4, q4, q9 @ A[0..1][4]^A[2..3][4]
1144 veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2]
1145 veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3]
1146 veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4]
1147 veor q13, q13, q10 @ C[0..1]^=A[4][0..1]
1148 veor q14, q15, q11 @ C[2..3]^=A[4][2..3]
1149 veor d25, d25, d24 @ C[4]^=A[4][4]
1151 vadd.u64 q4, q13, q13 @ C[0..1]<<1
1152 vadd.u64 q15, q14, q14 @ C[2..3]<<1
1153 vadd.u64 d18, d25, d25 @ C[4]<<1
1154 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1)
1155 vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1)
1156 vsri.u64 d18, d25, #63 @ ROL64(C[4],1)
1157 veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1)
1158 veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1)
1159 veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1)
@ Tail of KeccakF1600_neon (start of the round loop precedes this view).
@ Register layout (per the KECCAK_1X_ALT / Keccak Code Package scheme, see
@ file header): lanes A[0..4][0..4] live in d0-d24, with row pairs sharing
@ q registers; d25-d29 hold the Theta parities/deltas; r2 walks the Iota
@ round-constant table.
1160 veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1)
@ Theta, second half: xor each column's delta into all five lanes of that
@ column.  NOTE(review): the comments mix "C[n]" and "D[n]" names for the
@ same d25-d29 values - they denote the already-combined Theta deltas.
1162 veor d0, d0, d25 @ A[0][0] ^= C[4]
1163 veor d1, d1, d25 @ A[1][0] ^= C[4]
1164 veor d10, d10, d25 @ A[2][0] ^= C[4]
1165 veor d11, d11, d25 @ A[3][0] ^= C[4]
1166 veor d20, d20, d25 @ A[4][0] ^= C[4]
1168 veor d2, d2, d26 @ A[0][1] ^= D[1]
1169 veor d3, d3, d26 @ A[1][1] ^= D[1]
1170 veor d12, d12, d26 @ A[2][1] ^= D[1]
1171 veor d13, d13, d26 @ A[3][1] ^= D[1]
1172 veor d21, d21, d26 @ A[4][1] ^= D[1]
1175 veor d6, d6, d28 @ A[0][3] ^= C[2]
1176 veor d7, d7, d28 @ A[1][3] ^= C[2]
1177 veor d16, d16, d28 @ A[2][3] ^= C[2]
1178 veor d17, d17, d28 @ A[3][3] ^= C[2]
1179 veor d23, d23, d28 @ A[4][3] ^= C[2]
@ q4 and d18 were spilled to scratch memory earlier (outside this view) to
@ free registers; reload them before finishing Theta on columns 2 and 4.
1180 vld1.64 {q4}, [r0,:64] @ restore A[0..1][4]
1183 vld1.64 {d18}, [r1,:64] @ restore A[2][4]
1184 veor q2, q2, q13 @ A[0..1][2] ^= D[2]
1185 veor q7, q7, q13 @ A[2..3][2] ^= D[2]
1186 veor d22, d22, d27 @ A[4][2] ^= D[2]
1188 veor q4, q4, q14 @ A[0..1][4] ^= C[3]
1189 veor q9, q9, q14 @ A[2..3][4] ^= C[3]
1190 veor d24, d24, d29 @ A[4][4] ^= C[3]
@ Rho + Pi combined: every destination lane receives a rotated source
@ lane.  ROL64 is synthesized as a vshl.u64/vsri.u64 pair; rotates whose
@ amount is a multiple of 8 use a single vext.8 byte-rotate instead (the
@ corresponding vshl lines are commented out below).  Row 0 of the old
@ state is parked in d26-d29 first because it is overwritten early.
1193 vmov d26, d2 @ C[1] = A[0][1]
1194 vshl.u64 d2, d3, #44
1195 vmov d27, d4 @ C[2] = A[0][2]
1196 vshl.u64 d4, d14, #43
1197 vmov d28, d6 @ C[3] = A[0][3]
1198 vshl.u64 d6, d17, #21
1199 vmov d29, d8 @ C[4] = A[0][4]
1200 vshl.u64 d8, d24, #14
1201 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1])
1202 vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2])
1203 vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3])
1204 vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4])
1206 vshl.u64 d3, d9, #20
1207 vshl.u64 d14, d16, #25
1208 vshl.u64 d17, d15, #15
1209 vshl.u64 d24, d21, #2
1210 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4])
1211 vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3])
1212 vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2])
1213 vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1])
1215 vshl.u64 d9, d22, #61
@ rotate by 8 = one-byte vext below, no shift pair needed
1216 @ vshl.u64 d16, d19, #8
1217 vshl.u64 d15, d12, #10
1218 vshl.u64 d21, d7, #55
1219 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2])
1220 vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4])
1221 vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1])
1222 vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3])
1224 vshl.u64 d22, d18, #39
@ rotate by 56 = seven-byte vext below, no shift pair needed
1225 @ vshl.u64 d19, d23, #56
1226 vshl.u64 d12, d5, #6
1227 vshl.u64 d7, d13, #45
1228 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4])
1229 vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3])
1230 vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2])
1231 vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1])
1233 vshl.u64 d18, d20, #18
1234 vshl.u64 d23, d11, #41
1235 vshl.u64 d5, d10, #3
1236 vshl.u64 d13, d1, #36
1237 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0])
1238 vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0])
1239 vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0])
1240 vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0])
@ Column 0 destinations come from the parked row-0 copies (C[1..4] above).
1242 vshl.u64 d1, d28, #28
1243 vshl.u64 d10, d26, #1
1244 vshl.u64 d11, d29, #27
1245 vshl.u64 d20, d27, #62
1246 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3])
1247 vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1])
1248 vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4])
1249 vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2])
@ Chi: A[y][x] ^= ~A[y][x+1] & A[y][x+2], two rows at a time in q
@ registers.  NOTE(review): the vbic/vand instructions that form the
@ "~b & c" terms in q13/q14/q15 are elided from this view - confirm
@ against the full source before touching the register flow here.
1255 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2])
1256 veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3])
1257 veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
@ New A[0..1][0] is spilled to scratch so q13 can be reused; it is
@ reloaded just before Iota below.
1258 vst1.64 {q13}, [r0,:64] @ offload A[0..1][0]
1261 vmov q1, q14 @ A[0..1][1]
1262 veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
1263 veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
1266 vmov q0, q5 @ A[2..3][0]
1268 vmov q15, q6 @ A[2..3][1]
1269 veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
1271 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
1273 veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
1275 veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
1276 vmov q14, q10 @ A[4][0..1]
1277 veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
@ Iota: fetch the next round constant (r2 advances through the table)
@ and fold it into A[0][0]; Chi for row 4 is interleaved with it.
1279 vld1.64 d25, [r2,:64]! @ Iota[i++]
1282 vld1.64 {q0}, [r0,:64] @ restore A[0..1][0]
1283 veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2])
1285 veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3])
1287 veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4])
1289 veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0])
1290 veor d0, d0, d25 @ A[0][0] ^= Iota[i]
1291 veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1])
1297 .size KeccakF1600_neon,.-KeccakF1600_neon
@ SHA3_absorb_neon: absorb input blocks into the Keccak state using the
@ NEON permutation.  From the instructions below: r0 points at the flat
@ 25x64-bit state, r5 holds len, r6 holds bsz (see "len - bsz"), r4 walks
@ the input, and the leftover byte count is returned in r0.
@ NOTE(review): the prologue code that copies inp/len/bsz from r1-r3 into
@ r4-r6 is elided from this view - confirm against the full source.
1299 .global SHA3_absorb_neon
1300 .type SHA3_absorb_neon, %function
1303 stmdb sp!, {r4-r6,lr}
1304 vstmdb sp!, {d8-d15}
@ Load all 25 lanes into d0-d24.  The assignment interleaves rows two at
@ a time (A[0][1]->d2, A[1][1]->d3, ...) so row pairs share q registers,
@ matching the layout KeccakF1600_neon expects.
1310 vld1.32 {d0}, [r0,:64]! @ A[0][0]
1311 vld1.32 {d2}, [r0,:64]! @ A[0][1]
1312 vld1.32 {d4}, [r0,:64]! @ A[0][2]
1313 vld1.32 {d6}, [r0,:64]! @ A[0][3]
1314 vld1.32 {d8}, [r0,:64]! @ A[0][4]
1316 vld1.32 {d1}, [r0,:64]! @ A[1][0]
1317 vld1.32 {d3}, [r0,:64]! @ A[1][1]
1318 vld1.32 {d5}, [r0,:64]! @ A[1][2]
1319 vld1.32 {d7}, [r0,:64]! @ A[1][3]
1320 vld1.32 {d9}, [r0,:64]! @ A[1][4]
1322 vld1.32 {d10}, [r0,:64]! @ A[2][0]
1323 vld1.32 {d12}, [r0,:64]! @ A[2][1]
1324 vld1.32 {d14}, [r0,:64]! @ A[2][2]
1325 vld1.32 {d16}, [r0,:64]! @ A[2][3]
1326 vld1.32 {d18}, [r0,:64]! @ A[2][4]
1328 vld1.32 {d11}, [r0,:64]! @ A[3][0]
1329 vld1.32 {d13}, [r0,:64]! @ A[3][1]
1330 vld1.32 {d15}, [r0,:64]! @ A[3][2]
1331 vld1.32 {d17}, [r0,:64]! @ A[3][3]
1332 vld1.32 {d19}, [r0,:64]! @ A[3][4]
1334 vld1.32 {d20-d23}, [r0,:64]! @ A[4][0..3]
1335 vld1.32 {d24}, [r0,:64] @ A[4][4]
1336 sub r0, r0, #24*8 @ rewind
@ Top of the absorb loop: if fewer than bsz bytes remain, stop and return
@ the leftover count.  NOTE(review): the conditional branch consuming
@ this subs, and the loop label, are elided from this view.
1341 subs r12, r5, r6 @ len - bsz
@ XOR one block of input into the state lane by lane; vld1.8 loads bytes,
@ so the result is independent of host endianness.  NOTE(review): the
@ per-lane bsz checks/branches between these veors are elided here.
1345 vld1.8 {d31}, [r4]! @ endian-neutral loads...
1347 veor d0, d0, d31 @ A[0][0] ^= *inp++
1350 veor d2, d2, d31 @ A[0][1] ^= *inp++
1354 veor d4, d4, d31 @ A[0][2] ^= *inp++
1357 veor d6, d6, d31 @ A[0][3] ^= *inp++
1361 veor d8, d8, d31 @ A[0][4] ^= *inp++
1365 veor d1, d1, d31 @ A[1][0] ^= *inp++
1369 veor d3, d3, d31 @ A[1][1] ^= *inp++
1372 veor d5, d5, d31 @ A[1][2] ^= *inp++
1376 veor d7, d7, d31 @ A[1][3] ^= *inp++
1379 veor d9, d9, d31 @ A[1][4] ^= *inp++
1384 veor d10, d10, d31 @ A[2][0] ^= *inp++
1387 veor d12, d12, d31 @ A[2][1] ^= *inp++
1391 veor d14, d14, d31 @ A[2][2] ^= *inp++
1394 veor d16, d16, d31 @ A[2][3] ^= *inp++
1398 veor d18, d18, d31 @ A[2][4] ^= *inp++
1402 veor d11, d11, d31 @ A[3][0] ^= *inp++
1406 veor d13, d13, d31 @ A[3][1] ^= *inp++
1409 veor d15, d15, d31 @ A[3][2] ^= *inp++
1413 veor d17, d17, d31 @ A[3][3] ^= *inp++
1416 veor d19, d19, d31 @ A[3][4] ^= *inp++
1421 veor d20, d20, d31 @ A[4][0] ^= *inp++
1424 veor d21, d21, d31 @ A[4][1] ^= *inp++
1428 veor d22, d22, d31 @ A[4][2] ^= *inp++
1431 veor d23, d23, d31 @ A[4][3] ^= *inp++
1434 veor d24, d24, d31 @ A[4][4] ^= *inp++
@ Epilogue: write the state back to memory in exactly the order it was
@ loaded above, then return.  NOTE(review): the call to KeccakF1600_neon
@ and the branch back to the absorb loop are elided from this view.
1442 vst1.32 {d0}, [r0,:64]! @ A[0][0..4]
1443 vst1.32 {d2}, [r0,:64]!
1444 vst1.32 {d4}, [r0,:64]!
1445 vst1.32 {d6}, [r0,:64]!
1446 vst1.32 {d8}, [r0,:64]!
1448 vst1.32 {d1}, [r0,:64]! @ A[1][0..4]
1449 vst1.32 {d3}, [r0,:64]!
1450 vst1.32 {d5}, [r0,:64]!
1451 vst1.32 {d7}, [r0,:64]!
1452 vst1.32 {d9}, [r0,:64]!
1454 vst1.32 {d10}, [r0,:64]! @ A[2][0..4]
1455 vst1.32 {d12}, [r0,:64]!
1456 vst1.32 {d14}, [r0,:64]!
1457 vst1.32 {d16}, [r0,:64]!
1458 vst1.32 {d18}, [r0,:64]!
1460 vst1.32 {d11}, [r0,:64]! @ A[3][0..4]
1461 vst1.32 {d13}, [r0,:64]!
1462 vst1.32 {d15}, [r0,:64]!
1463 vst1.32 {d17}, [r0,:64]!
1464 vst1.32 {d19}, [r0,:64]!
1466 vst1.32 {d20-d23}, [r0,:64]! @ A[4][0..4]
1467 vst1.32 {d24}, [r0,:64]
@ Return the number of bytes left over (r5 = remaining len).
1469 mov r0, r5 @ return value
1470 vldmia sp!, {d8-d15}
1471 ldmia sp!, {r4-r6,pc}
1472 .size SHA3_absorb_neon,.-SHA3_absorb_neon
@ SHA3_squeeze_neon: emit output bytes from the Keccak state, running the
@ permutation each time a full block (bsz bytes) has been squeezed.  From
@ the instructions below: r0 is the flat state, r12 a cursor into it,
@ r4 the output pointer, r5 the remaining len, r14 the bytes left in the
@ current block.  NOTE(review): the prologue that loads out/len/bsz from
@ the argument registers into r4/r5/r14 is elided from this view.
1474 .global SHA3_squeeze_neon
1475 .type SHA3_squeeze_neon, %function
1478 stmdb sp!, {r4-r6,lr}
1483 mov r12, r0 @ A_flat
1485 b .Loop_squeeze_neon
@ Main loop: copy one 64-bit lane per iteration.  NOTE(review): the
@ loop label and the comparison feeding this blo (len < 8 check) are
@ elided from this view.
1490 blo .Lsqueeze_neon_tail
1491 vld1.32 {d0}, [r12]!
1492 vst1.8 {d0}, [r4]! @ endian-neutral store
1494 subs r5, r5, #8 @ len -= 8
1495 beq .Lsqueeze_neon_done
@ Block not yet exhausted: keep squeezing lanes from the current state.
1497 subs r14, r14, #8 @ bsz -= 8
1498 bhi .Loop_squeeze_neon
@ Block exhausted: load the state into d0-d24 (same interleaved layout as
@ SHA3_absorb_neon), permute, and write it back.  NOTE(review): the call
@ to KeccakF1600_neon between the rewind and the stores is elided here.
1500 vstmdb sp!, {d8-d15}
1502 vld1.32 {d0}, [r0,:64]! @ A[0][0..4]
1503 vld1.32 {d2}, [r0,:64]!
1504 vld1.32 {d4}, [r0,:64]!
1505 vld1.32 {d6}, [r0,:64]!
1506 vld1.32 {d8}, [r0,:64]!
1508 vld1.32 {d1}, [r0,:64]! @ A[1][0..4]
1509 vld1.32 {d3}, [r0,:64]!
1510 vld1.32 {d5}, [r0,:64]!
1511 vld1.32 {d7}, [r0,:64]!
1512 vld1.32 {d9}, [r0,:64]!
1514 vld1.32 {d10}, [r0,:64]! @ A[2][0..4]
1515 vld1.32 {d12}, [r0,:64]!
1516 vld1.32 {d14}, [r0,:64]!
1517 vld1.32 {d16}, [r0,:64]!
1518 vld1.32 {d18}, [r0,:64]!
1520 vld1.32 {d11}, [r0,:64]! @ A[3][0..4]
1521 vld1.32 {d13}, [r0,:64]!
1522 vld1.32 {d15}, [r0,:64]!
1523 vld1.32 {d17}, [r0,:64]!
1524 vld1.32 {d19}, [r0,:64]!
1526 vld1.32 {d20-d23}, [r0,:64]! @ A[4][0..4]
1527 vld1.32 {d24}, [r0,:64]
1528 sub r0, r0, #24*8 @ rewind
@ Reset the squeeze cursor to the start of the freshly permuted state
@ and store the state back in load order.
1532 mov r12, r0 @ A_flat
1533 vst1.32 {d0}, [r0,:64]! @ A[0][0..4]
1534 vst1.32 {d2}, [r0,:64]!
1535 vst1.32 {d4}, [r0,:64]!
1536 vst1.32 {d6}, [r0,:64]!
1537 vst1.32 {d8}, [r0,:64]!
1539 vst1.32 {d1}, [r0,:64]! @ A[1][0..4]
1540 vst1.32 {d3}, [r0,:64]!
1541 vst1.32 {d5}, [r0,:64]!
1542 vst1.32 {d7}, [r0,:64]!
1543 vst1.32 {d9}, [r0,:64]!
1545 vst1.32 {d10}, [r0,:64]! @ A[2][0..4]
1546 vst1.32 {d12}, [r0,:64]!
1547 vst1.32 {d14}, [r0,:64]!
1548 vst1.32 {d16}, [r0,:64]!
1549 vst1.32 {d18}, [r0,:64]!
1551 vst1.32 {d11}, [r0,:64]! @ A[3][0..4]
1552 vst1.32 {d13}, [r0,:64]!
1553 vst1.32 {d15}, [r0,:64]!
1554 vst1.32 {d17}, [r0,:64]!
1555 vst1.32 {d19}, [r0,:64]!
1557 vst1.32 {d20-d23}, [r0,:64]! @ A[4][0..4]
1559 vst1.32 {d24}, [r0,:64]
1560 mov r0, r12 @ rewind
1562 vldmia sp!, {d8-d15}
1563 b .Loop_squeeze_neon
@ Tail: fewer than 8 bytes of output remain; emit them one byte at a
@ time so no partial-lane store depends on endianness.  NOTE(review):
@ the lane load into r2/r3 and the shifts between these branches are
@ elided from this view.
1566 .Lsqueeze_neon_tail:
1569 strb r2, [r4],#1 @ endian-neutral store
1571 blo .Lsqueeze_neon_done
1574 beq .Lsqueeze_neon_done
1578 blo .Lsqueeze_neon_done
1580 beq .Lsqueeze_neon_done
1585 blo .Lsqueeze_neon_done
1588 beq .Lsqueeze_neon_done
1591 .Lsqueeze_neon_done:
1592 ldmia sp!, {r4-r6,pc}
1593 .size SHA3_squeeze_neon,.-SHA3_squeeze_neon
1595 .asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
1603 my ($mnemonic,$half,$reg,$ea) = @_;
1604 my $op = $mnemonic eq "ldr" ? \%ldr : \%str;
1609 sprintf "#ifndef __thumb2__\n" .
1611 "#endif", $mnemonic,$reg,$ea;
1613 sprintf "#ifndef __thumb2__\n" .
1616 " %sd\t%s,%s,%s\n" .
1617 "#endif", $mnemonic,$reg,$ea,
1618 $mnemonic,$$op{reg},$reg,$$op{ea};
1623 foreach (split($/,$code)) {
1624 s/\`([^\`]*)\`/eval $1/ge;
1626 s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
1627 s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or
1628 s/\bret\b/bx lr/g or
1629 s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
# Flush generated assembly and fail loudly if the final write to STDOUT
# errors (e.g. disk full when output is redirected); an unchecked close
# would silently truncate the emitted .S file.
1634 close STDOUT or die "error closing STDOUT: $!";