2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for ARMv8.
20 # This is a straightforward KECCAK_1X_ALT implementation. It makes no
21 # sense to attempt a SIMD/NEON implementation for the following reason.
22 # 64-bit lanes of vector registers can't be addressed as easily as in
23 # 32-bit mode. This means that 64-bit NEON is bound to be slower than
24 # 32-bit NEON, and this implementation is faster than 32-bit NEON on
25 # the same processor. Even though it takes more scalar xor's and andn's,
26 # it gets compensated by the availability of rotate. Not to forget that
27 # most processors achieve a higher issue rate with scalar instructions.
29 ######################################################################
30 # Numbers are cycles per processed byte.
42 # (*) Corresponds to SHA3-256. No improvement coefficients are listed
43 # because they vary too much from compiler to compiler. A newer
44 # compiler does much better, and the improvement varies from 5% on
45 # Cortex-A57 to 25% on Cortex-A53, while in comparison to an older
46 # compiler this code is at least 2x faster...
# Locate the arm-xlate.pl translator: first in the directory this script
# was invoked from, then under ../../perlasm relative to it.  Give up if
# neither candidate is a plain file.
51 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
53 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
54 die "can't locate arm-xlate.pl";
# Pipe the generated text through the translator, run with the current
# interpreter ($^X).  $flavour and $output are not assigned in the visible
# lines (presumably command-line arguments -- confirm).  The open is checked
# so that a failure to start the child is reported instead of being ignored.
56 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
# Register names for the 5x5 state: each row is five consecutive names
# x<n>..x<n+4>, one row per element of the list passed to map (that list
# is not among the visible lines).
59 my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
61 $A[3][3] = "x25"; # x18 is reserved
# Scratch register names referenced as $C[0]..$C[3] in the assembly below.
63 my @C = map("x$_", (26,27,28,30));
# Rotation amounts, indexed [row][column] in the same way as @A.  The
# Rho+Pi step uses them as "#64-$rhotates[i][j]" operands of ror.
65 my @rhotates = ([ 0, 1, 62, 28, 27 ],
66 [ 36, 44, 6, 55, 20 ],
67 [ 3, 10, 43, 25, 39 ],
68 [ 41, 45, 15, 21, 8 ],
69 [ 18, 2, 61, 56, 14 ]);
74 .align 8 // strategic alignment and padding that allows to use
75 // address value as loop termination condition...
// Table of 24 64-bit constants.  Presumably this is the table read by the
// post-incremented "Iota[i++]" load in the Chi+Iota step, whose pointer is
// then tested with "tst ...,#255"; the table label, the padding mentioned
// above and the address setup are not among the visible lines -- confirm.
79 .quad 0x0000000000000001
80 .quad 0x0000000000008082
81 .quad 0x800000000000808a
82 .quad 0x8000000080008000
83 .quad 0x000000000000808b
84 .quad 0x0000000080000001
85 .quad 0x8000000080008081
86 .quad 0x8000000000008009
87 .quad 0x000000000000008a
88 .quad 0x0000000000000088
89 .quad 0x0000000080008009
90 .quad 0x000000008000000a
91 .quad 0x000000008000808b
92 .quad 0x800000000000008b
93 .quad 0x8000000000008089
94 .quad 0x8000000000008003
95 .quad 0x8000000000008002
96 .quad 0x8000000000000080
97 .quad 0x000000000000800a
98 .quad 0x800000008000000a
99 .quad 0x8000000080008081
100 .quad 0x8000000000008080
101 .quad 0x0000000080000001
102 .quad 0x8000000080008008
// KeccakF1600_int: internal routine working on the Keccak state held in the
// registers named by @A.  The visible lines cover one round (Theta, Rho+Pi,
// Chi+Iota); the entry label, loop branch and return are not visible here.
105 .type KeccakF1600_int,%function
109 stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
// x30 is one of the names in @C, so it is stored above before the code
// below starts using it as a scratch register.
113 ////////////////////////////////////////// Theta
// Column parities: $C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x].
114 eor $C[0],$A[0][0],$A[1][0]
115 stp $A[0][4],$A[1][4],[sp,#0] // offload pair...
116 eor $C[1],$A[0][1],$A[1][1]
117 eor $C[2],$A[0][2],$A[1][2]
118 eor $C[3],$A[0][3],$A[1][3]
123 eor $C[4],$A[0][4],$A[1][4]
// NOTE(review): @C is declared with four names, so $C[4] and $C[5] must be
// assigned somewhere outside the visible lines; the stp of $A[0][4]/$A[1][4]
// to [sp,#0] above suggests those two registers are borrowed -- confirm.
124 eor $C[0],$C[0],$A[2][0]
125 eor $C[1],$C[1],$A[2][1]
126 eor $C[2],$C[2],$A[2][2]
127 eor $C[3],$C[3],$A[2][3]
128 eor $C[4],$C[4],$A[2][4]
129 eor $C[0],$C[0],$A[3][0]
130 eor $C[1],$C[1],$A[3][1]
131 eor $C[2],$C[2],$A[3][2]
132 eor $C[3],$C[3],$A[3][3]
133 eor $C[4],$C[4],$A[3][4]
134 eor $C[0],$C[0],$A[4][0]
135 eor $C[2],$C[2],$A[4][2]
136 eor $C[1],$C[1],$A[4][1]
137 eor $C[3],$C[3],$A[4][3]
138 eor $C[4],$C[4],$A[4][4]
// Value XORed into column 1: C[0] ^ rol(C[2],1) ("ror#63" is a left-rotate
// by one bit).
140 eor $C[5],$C[0],$C[2],ror#63
142 eor $A[0][1],$A[0][1],$C[5]
143 eor $A[1][1],$A[1][1],$C[5]
144 eor $A[2][1],$A[2][1],$C[5]
145 eor $A[3][1],$A[3][1],$C[5]
146 eor $A[4][1],$A[4][1],$C[5]
// Remaining per-column values, computed in place over the parities:
// $C[5] is applied to column 2, $C[2] to column 3, $C[3] to column 4 and
// $C[4] to column 0 by the groups that follow.
148 eor $C[5],$C[1],$C[3],ror#63
149 eor $C[2],$C[2],$C[4],ror#63
150 eor $C[3],$C[3],$C[0],ror#63
151 eor $C[4],$C[4],$C[1],ror#63
// The row-0 results for columns 2, 3 and 4 are written to $C[1], $C[0] and
// $C[2] rather than back to A[0][x]; Rho+Pi reads them from there.
153 eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2]
154 eor $A[1][2],$A[1][2],$C[5]
155 eor $A[2][2],$A[2][2],$C[5]
156 eor $A[3][2],$A[3][2],$C[5]
157 eor $A[4][2],$A[4][2],$C[5]
159 eor $A[0][0],$A[0][0],$C[4]
160 eor $A[1][0],$A[1][0],$C[4]
161 eor $A[2][0],$A[2][0],$C[4]
162 eor $A[3][0],$A[3][0],$C[4]
163 eor $A[4][0],$A[4][0],$C[4]
168 ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data
169 eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3]
170 eor $A[1][3],$A[1][3],$C[2]
171 eor $A[2][3],$A[2][3],$C[2]
172 eor $A[3][3],$A[3][3],$C[2]
173 eor $A[4][3],$A[4][3],$C[2]
175 eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4]
176 eor $A[1][4],$A[1][4],$C[3]
177 eor $A[2][4],$A[2][4],$C[3]
178 eor $A[3][4],$A[3][4],$C[3]
179 eor $A[4][4],$A[4][4],$C[3]
181 ////////////////////////////////////////// Rho+Pi
// Each "ror #64-n" is a left-rotate by n.  The moves form chains in which
// every register is read as a source before a later line overwrites it.
// A[0][0] has rotation amount 0 and is not touched in this step.
183 ror $A[0][1],$A[1][1],#64-$rhotates[1][1]
185 ror $A[0][2],$A[2][2],#64-$rhotates[2][2]
187 ror $A[0][3],$A[3][3],#64-$rhotates[3][3]
189 ror $A[0][4],$A[4][4],#64-$rhotates[4][4]
191 ror $A[1][1],$A[1][4],#64-$rhotates[1][4]
192 ror $A[2][2],$A[2][3],#64-$rhotates[2][3]
193 ror $A[3][3],$A[3][2],#64-$rhotates[3][2]
194 ror $A[4][4],$A[4][1],#64-$rhotates[4][1]
196 ror $A[1][4],$A[4][2],#64-$rhotates[4][2]
197 ror $A[2][3],$A[3][4],#64-$rhotates[3][4]
198 ror $A[3][2],$A[2][1],#64-$rhotates[2][1]
199 ror $A[4][1],$A[1][3],#64-$rhotates[1][3]
201 ror $A[4][2],$A[2][4],#64-$rhotates[2][4]
202 ror $A[3][4],$A[4][3],#64-$rhotates[4][3]
203 ror $A[2][1],$A[1][2],#64-$rhotates[1][2]
204 ror $A[1][3],$A[3][1],#64-$rhotates[3][1]
206 ror $A[2][4],$A[4][0],#64-$rhotates[4][0]
207 ror $A[4][3],$A[3][0],#64-$rhotates[3][0]
208 ror $A[1][2],$A[2][0],#64-$rhotates[2][0]
209 ror $A[3][1],$A[1][0],#64-$rhotates[1][0]
// The chains end with the row-0 values parked in the C registers.
// NOTE(review): $C[3] is rotated by $rhotates[0][1] below, i.e. it stands in
// for A[0][1]; the instruction that copies A[0][1] into $C[3] is not among
// the visible lines -- confirm.
211 ror $A[1][0],$C[0],#64-$rhotates[0][3]
212 ror $A[2][0],$C[3],#64-$rhotates[0][1]
213 ror $A[3][0],$C[2],#64-$rhotates[0][4]
214 ror $A[4][0],$C[1],#64-$rhotates[0][2]
216 ////////////////////////////////////////// Chi+Iota
// Per row: A[y][x] ^= ~A[y][x+1] & A[y][x+2] (indices mod 5).  "bic d,n,m"
// computes n & ~m, and every bic reads its operands before the eor that
// updates them.
217 bic $C[0],$A[0][2],$A[0][1]
218 bic $C[1],$A[0][3],$A[0][2]
219 bic $C[2],$A[0][0],$A[0][4]
220 bic $C[3],$A[0][1],$A[0][0]
221 eor $A[0][0],$A[0][0],$C[0]
222 bic $C[0],$A[0][4],$A[0][3]
223 eor $A[0][1],$A[0][1],$C[1]
225 eor $A[0][3],$A[0][3],$C[2]
226 eor $A[0][4],$A[0][4],$C[3]
227 eor $A[0][2],$A[0][2],$C[0]
// NOTE(review): the load below uses $C[1] as a post-incremented pointer; the
// instruction that puts the round-constant address into $C[1] is not among
// the visible lines -- confirm.
228 ldr $C[3],[$C[1]],#8 // Iota[i++]
230 bic $C[0],$A[1][2],$A[1][1]
231 tst $C[1],#255 // are we done?
// The flags set by tst are presumably consumed by a loop branch that is not
// visible here; the bic/eor forms used below do not set flags.
233 bic $C[1],$A[1][3],$A[1][2]
234 bic $C[2],$A[1][0],$A[1][4]
235 eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota
236 bic $C[3],$A[1][1],$A[1][0]
237 eor $A[1][0],$A[1][0],$C[0]
238 bic $C[0],$A[1][4],$A[1][3]
239 eor $A[1][1],$A[1][1],$C[1]
240 eor $A[1][3],$A[1][3],$C[2]
241 eor $A[1][4],$A[1][4],$C[3]
242 eor $A[1][2],$A[1][2],$C[0]
244 bic $C[0],$A[2][2],$A[2][1]
245 bic $C[1],$A[2][3],$A[2][2]
246 bic $C[2],$A[2][0],$A[2][4]
247 bic $C[3],$A[2][1],$A[2][0]
248 eor $A[2][0],$A[2][0],$C[0]
249 bic $C[0],$A[2][4],$A[2][3]
250 eor $A[2][1],$A[2][1],$C[1]
251 eor $A[2][3],$A[2][3],$C[2]
252 eor $A[2][4],$A[2][4],$C[3]
253 eor $A[2][2],$A[2][2],$C[0]
255 bic $C[0],$A[3][2],$A[3][1]
256 bic $C[1],$A[3][3],$A[3][2]
257 bic $C[2],$A[3][0],$A[3][4]
258 bic $C[3],$A[3][1],$A[3][0]
259 eor $A[3][0],$A[3][0],$C[0]
260 bic $C[0],$A[3][4],$A[3][3]
261 eor $A[3][1],$A[3][1],$C[1]
262 eor $A[3][3],$A[3][3],$C[2]
263 eor $A[3][4],$A[3][4],$C[3]
264 eor $A[3][2],$A[3][2],$C[0]
266 bic $C[0],$A[4][2],$A[4][1]
267 bic $C[1],$A[4][3],$A[4][2]
268 bic $C[2],$A[4][0],$A[4][4]
269 bic $C[3],$A[4][1],$A[4][0]
270 eor $A[4][0],$A[4][0],$C[0]
271 bic $C[0],$A[4][4],$A[4][3]
272 eor $A[4][1],$A[4][1],$C[1]
273 eor $A[4][3],$A[4][3],$C[2]
274 eor $A[4][4],$A[4][4],$C[3]
275 eor $A[4][2],$A[4][2],$C[0]
281 .size KeccakF1600_int,.-KeccakF1600_int
// KeccakF1600: loads the 25 state words from the buffer passed in x0 into
// the registers named by @A and stores them back afterwards.  Whatever runs
// between the load and store groups (presumably a call to the permutation)
// is not among the visible lines.
283 .type KeccakF1600,%function
286 stp x29,x30,[sp,#-128]!
295 str x0,[sp,#32] // offload argument
// Load the state: 12 register pairs plus one single word.  The first load
// addresses through x0, the rest through $C[0]; the copy of x0 into $C[0]
// is not visible here -- confirm.
297 ldp $A[0][0],$A[0][1],[x0,#16*0]
298 ldp $A[0][2],$A[0][3],[$C[0],#16*1]
299 ldp $A[0][4],$A[1][0],[$C[0],#16*2]
300 ldp $A[1][1],$A[1][2],[$C[0],#16*3]
301 ldp $A[1][3],$A[1][4],[$C[0],#16*4]
302 ldp $A[2][0],$A[2][1],[$C[0],#16*5]
303 ldp $A[2][2],$A[2][3],[$C[0],#16*6]
304 ldp $A[2][4],$A[3][0],[$C[0],#16*7]
305 ldp $A[3][1],$A[3][2],[$C[0],#16*8]
306 ldp $A[3][3],$A[3][4],[$C[0],#16*9]
307 ldp $A[4][0],$A[4][1],[$C[0],#16*10]
308 ldp $A[4][2],$A[4][3],[$C[0],#16*11]
309 ldr $A[4][4],[$C[0],#16*12]
// Store the state back in the same layout.  $C[0] must hold the buffer
// address again at this point; presumably it is reloaded from the copy
// saved at [sp,#32] by a line that is not visible -- confirm.
314 stp $A[0][0],$A[0][1],[$C[0],#16*0]
315 stp $A[0][2],$A[0][3],[$C[0],#16*1]
316 stp $A[0][4],$A[1][0],[$C[0],#16*2]
317 stp $A[1][1],$A[1][2],[$C[0],#16*3]
318 stp $A[1][3],$A[1][4],[$C[0],#16*4]
319 stp $A[2][0],$A[2][1],[$C[0],#16*5]
320 stp $A[2][2],$A[2][3],[$C[0],#16*6]
321 stp $A[2][4],$A[3][0],[$C[0],#16*7]
322 stp $A[3][1],$A[3][2],[$C[0],#16*8]
323 stp $A[3][3],$A[3][4],[$C[0],#16*9]
324 stp $A[4][0],$A[4][1],[$C[0],#16*10]
325 stp $A[4][2],$A[4][3],[$C[0],#16*11]
326 str $A[4][4],[$C[0],#16*12]
// Reload x19..x28 from the frame addressed through x29, then pop the
// 128-byte frame together with x29/x30.  The matching stores and the setup
// of x29 are not among the visible lines.
328 ldp x19,x20,[x29,#16]
330 ldp x21,x22,[x29,#32]
331 ldp x23,x24,[x29,#48]
332 ldp x25,x26,[x29,#64]
333 ldp x27,x28,[x29,#80]
334 ldp x29,x30,[sp],#128
336 .size KeccakF1600,.-KeccakF1600
// SHA3_absorb: per the argument comments below, x0 = uint64_t A[5][5],
// x1 = const void *inp, x2 = size_t len, x3 = size_t bsz.  Input words are
// XORed into the state registers and the value left in $C[2] is returned
// in x0.  Labels, branches and the call that processes a block are not
// among the visible lines.
// The "for" line further down is Perl: it repeats the load/XOR pair for
// state words 0..23, using $A[$i/5][$i%5] (Perl truncates the fractional
// subscript); $j is not assigned in the visible lines (presumably $i+1 --
// confirm).  Word 24, $A[4][4], is handled by the pair after the loop.
339 .type SHA3_absorb,%function
342 stp x29,x30,[sp,#-128]!
351 stp x0,x1,[sp,#32] // offload arguments
354 mov $C[0],x0 // uint64_t A[5][5]
355 mov $C[1],x1 // const void *inp
356 mov $C[2],x2 // size_t len
357 mov $C[3],x3 // size_t bsz
358 ldp $A[0][0],$A[0][1],[$C[0],#16*0]
359 ldp $A[0][2],$A[0][3],[$C[0],#16*1]
360 ldp $A[0][4],$A[1][0],[$C[0],#16*2]
361 ldp $A[1][1],$A[1][2],[$C[0],#16*3]
362 ldp $A[1][3],$A[1][4],[$C[0],#16*4]
363 ldp $A[2][0],$A[2][1],[$C[0],#16*5]
364 ldp $A[2][2],$A[2][3],[$C[0],#16*6]
365 ldp $A[2][4],$A[3][0],[$C[0],#16*7]
366 ldp $A[3][1],$A[3][2],[$C[0],#16*8]
367 ldp $A[3][3],$A[3][4],[$C[0],#16*9]
368 ldp $A[4][0],$A[4][1],[$C[0],#16*10]
369 ldp $A[4][2],$A[4][3],[$C[0],#16*11]
370 ldr $A[4][4],[$C[0],#16*12]
375 subs $C[0],$C[2],$C[3] // len - bsz
378 str $C[0],[sp,#48] // save len - bsz
380 for (my $i=0; $i<24; $i+=2) {
383 ldr $C[0],[$C[1]],#8 // *inp++
387 eor $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
390 ldr $C[0],[$C[1]],#8 // *inp++
394 eor $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
399 ldr $C[0],[$C[1]],#8 // *inp++
403 eor $A[4][4],$A[4][4],$C[0]
406 str $C[1],[sp,#40] // save inp
410 ldr $C[1],[sp,#40] // restore arguments
411 ldp $C[2],$C[3],[sp,#48]
// The ldp above reads [sp,#48] (the saved len - bsz) and [sp,#56]; the
// store that fills [sp,#56] is not visible (presumably bsz -- confirm).
// NOTE(review): the stores below address through $C[1]; the instruction
// that loads the state address into it is not among the visible lines.
417 stp $A[0][0],$A[0][1],[$C[1],#16*0]
418 stp $A[0][2],$A[0][3],[$C[1],#16*1]
419 stp $A[0][4],$A[1][0],[$C[1],#16*2]
420 stp $A[1][1],$A[1][2],[$C[1],#16*3]
421 stp $A[1][3],$A[1][4],[$C[1],#16*4]
422 stp $A[2][0],$A[2][1],[$C[1],#16*5]
423 stp $A[2][2],$A[2][3],[$C[1],#16*6]
424 stp $A[2][4],$A[3][0],[$C[1],#16*7]
425 stp $A[3][1],$A[3][2],[$C[1],#16*8]
426 stp $A[3][3],$A[3][4],[$C[1],#16*9]
427 stp $A[4][0],$A[4][1],[$C[1],#16*10]
428 stp $A[4][2],$A[4][3],[$C[1],#16*11]
429 str $A[4][4],[$C[1],#16*12]
431 mov x0,$C[2] // return value
432 ldp x19,x20,[x29,#16]
434 ldp x21,x22,[x29,#32]
435 ldp x23,x24,[x29,#48]
436 ldp x25,x26,[x29,#64]
437 ldp x27,x28,[x29,#80]
438 ldp x29,x30,[sp],#128
440 .size SHA3_absorb,.-SHA3_absorb
443 my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22)); # names x19..x22; $A_flat receives x0 in SHA3_squeeze below
446 .type SHA3_squeeze,%function
// SHA3_squeeze: only fragments are visible here -- a 48-byte frame is pushed
// with x29/x30 and x0 is copied into $A_flat; the rest of the body lies
// outside this view.
449 stp x29,x30,[sp,#-48]!
454 mov $A_flat,x0 // put aside arguments
512 .size SHA3_squeeze,.-SHA3_squeeze
513 .asciz "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"