2 * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 * Licensed under the OpenSSL license (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
10 #include <openssl/e_os2.h>
14 #ifndef KECCAK1600_ASM
16 #if defined(__x86_64__) || defined(__aarch64__) || \
17 defined(__mips64) || defined(__ia64) || \
18 (defined(__VMS) && !defined(__vax))
20 * These are available even in ILP32 flavours, but even then they are
21 * capable of performing 64-bit operations as efficiently as in *P64.
22 * Since it's not given that we can use sizeof(void *), just shunt it.
24 # define BIT_INTERLEAVE (0)
26 # define BIT_INTERLEAVE (sizeof(void *) < 8)
29 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
/*
 * ROL64 - rotate a 64-bit Keccak lane left by |offset| bits.
 *
 * When BIT_INTERLEAVE is set, a 64-bit lane is stored as two 32-bit
 * halves and the rotation is performed with 32-bit ROL32 operations
 * instead of a native 64-bit shift.
 * NOTE(review): this extract omits several original lines (opening
 * brace, the offset==0 path, the odd/even-offset branch structure,
 * closing braces); the fragment is not compilable as shown.
 */
31 static uint64_t ROL64(uint64_t val, int offset)
35 } else if (!BIT_INTERLEAVE) {
/* non-interleaved: plain 64-bit rotate-left */
36 return (val << offset) | (val >> (64-offset));
/* interleaved representation: split lane into two 32-bit halves */
38 uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
/* one branch swaps the halves and rotates one of them one bit
 * further (offset + 1) -- presumably the odd-offset case; the
 * surrounding condition is omitted from this extract, verify */
44 hi = ROL32(lo, offset);
45 lo = ROL32(tmp, offset + 1);
/* other branch rotates both halves by the same amount */
48 lo = ROL32(lo, offset);
49 hi = ROL32(hi, offset);
52 return ((uint64_t)hi << 32) | lo;
/*
 * rhotates[y][x] -- per-lane left-rotation amounts used by the rho
 * step of Keccak-f[1600].
 * NOTE(review): the first row (y == 0) and the closing brace are
 * missing from this extract.
 */
56 static const unsigned char rhotates[5][5] = {
58 { 36, 44, 6, 55, 20 },
59 { 3, 10, 43, 25, 39 },
60 { 41, 45, 15, 21, 8 },
/*
 * iotas[] -- the 24 Keccak-f[1600] round constants XORed into A[0][0]
 * by the iota step. Each entry provides two forms selected at compile
 * time by BIT_INTERLEAVE: the left value is the constant in the
 * bit-interleaved lane representation, the right value is the plain
 * 64-bit constant. (For the first round both forms are 1, since
 * interleaving the value 1 yields 1.)
 * NOTE(review): the closing "};" is missing from this extract.
 */
64 static const uint64_t iotas[] = {
65 BIT_INTERLEAVE ? 0x0000000000000001U : 0x0000000000000001U,
66 BIT_INTERLEAVE ? 0x0000008900000000U : 0x0000000000008082U,
67 BIT_INTERLEAVE ? 0x8000008b00000000U : 0x800000000000808aU,
68 BIT_INTERLEAVE ? 0x8000808000000000U : 0x8000000080008000U,
69 BIT_INTERLEAVE ? 0x0000008b00000001U : 0x000000000000808bU,
70 BIT_INTERLEAVE ? 0x0000800000000001U : 0x0000000080000001U,
71 BIT_INTERLEAVE ? 0x8000808800000001U : 0x8000000080008081U,
72 BIT_INTERLEAVE ? 0x8000008200000001U : 0x8000000000008009U,
73 BIT_INTERLEAVE ? 0x0000000b00000000U : 0x000000000000008aU,
74 BIT_INTERLEAVE ? 0x0000000a00000000U : 0x0000000000000088U,
75 BIT_INTERLEAVE ? 0x0000808200000001U : 0x0000000080008009U,
76 BIT_INTERLEAVE ? 0x0000800300000000U : 0x000000008000000aU,
77 BIT_INTERLEAVE ? 0x0000808b00000001U : 0x000000008000808bU,
78 BIT_INTERLEAVE ? 0x8000000b00000001U : 0x800000000000008bU,
79 BIT_INTERLEAVE ? 0x8000008a00000001U : 0x8000000000008089U,
80 BIT_INTERLEAVE ? 0x8000008100000001U : 0x8000000000008003U,
81 BIT_INTERLEAVE ? 0x8000008100000000U : 0x8000000000008002U,
82 BIT_INTERLEAVE ? 0x8000000800000000U : 0x8000000000000080U,
83 BIT_INTERLEAVE ? 0x0000008300000000U : 0x000000000000800aU,
84 BIT_INTERLEAVE ? 0x8000800300000000U : 0x800000008000000aU,
85 BIT_INTERLEAVE ? 0x8000808800000001U : 0x8000000080008081U,
86 BIT_INTERLEAVE ? 0x8000008800000000U : 0x8000000000008080U,
87 BIT_INTERLEAVE ? 0x0000800000000001U : 0x0000000080000001U,
88 BIT_INTERLEAVE ? 0x8000808200000000U : 0x8000000080008008U
91 #if defined(KECCAK_REF)
93 * This is a straightforward or "maximum clarity" implementation aiming
94 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
95 * Permutation-Based Hash and Extendible-Output Functions" as much as
96 * possible. With one caveat. Because of the way C stores matrices,
97 * references to A[x,y] in the specification are presented as A[y][x].
98 * Implementation unrolls inner x-loops so that modulo 5 operations are
99 * explicitly pre-computed.
/*
 * Theta step (KECCAK_REF): each lane is XORed with the parities of
 * two neighbouring columns. D[x] = ROL64(C[x+1], 1) ^ C[x-1] with
 * indices mod 5, then every row is updated with D.
 * NOTE(review): the C[] column-parity computation and both loop
 * bodies are omitted from this extract.
 */
101 static void Theta(uint64_t A[5][5])
112 for (y = 1; y < 5; y++) {
/* D[x] combines the rotated parity of column x+1 with column x-1 */
120 D[0] = ROL64(C[1], 1) ^ C[4];
121 D[1] = ROL64(C[2], 1) ^ C[0];
122 D[2] = ROL64(C[3], 1) ^ C[1];
123 D[3] = ROL64(C[4], 1) ^ C[2];
124 D[4] = ROL64(C[0], 1) ^ C[3];
/* apply D[x] to every lane in column x */
126 for (y = 0; y < 5; y++) {
/*
 * Rho step (KECCAK_REF): rotate each lane left by its fixed,
 * lane-specific amount from the rhotates[][] table.
 * NOTE(review): the closing braces are omitted from this extract.
 */
135 static void Rho(uint64_t A[5][5])
139 for (y = 0; y < 5; y++) {
140 A[y][0] = ROL64(A[y][0], rhotates[y][0]);
141 A[y][1] = ROL64(A[y][1], rhotates[y][1]);
142 A[y][2] = ROL64(A[y][2], rhotates[y][2]);
143 A[y][3] = ROL64(A[y][3], rhotates[y][3]);
144 A[y][4] = ROL64(A[y][4], rhotates[y][4]);
/*
 * Pi step (KECCAK_REF): lane permutation. Copies the state to a
 * temporary T and rearranges lanes per A[y][x] = T[x][(3*y+x)%5].
 * NOTE(review): the declaration of T and the unrolled assignments
 * are omitted from this extract.
 */
148 static void Pi(uint64_t A[5][5])
154 * A[y][x] = T[x][(3*y+x)%5]
156 memcpy(T, A, sizeof(T));
/*
 * Chi step (KECCAK_REF): the only non-linear step. Each lane is
 * XORed with the AND of the complement of its right neighbour and
 * the lane two to the right, per row. Results are staged in C[] so
 * the row is read before it is overwritten.
 * NOTE(review): the C[] declaration and the write-back of C into
 * A[y][*] are omitted from this extract.
 */
189 static void Chi(uint64_t A[5][5])
194 for (y = 0; y < 5; y++) {
195 C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
196 C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
197 C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
198 C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
199 C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
/*
 * Iota step (KECCAK_REF): XORs round constant iotas[i] into lane
 * A[0][0] (the body line is omitted from this extract). The assert
 * bounds the round index against the iotas[] table.
 */
209 static void Iota(uint64_t A[5][5], size_t i)
211 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
/*
 * KeccakF1600 (KECCAK_REF variant): applies the five step mappings
 * for all 24 rounds. The per-round calls (Theta/Rho/Pi/Chi/Iota)
 * are omitted from this extract.
 */
215 void KeccakF1600(uint64_t A[5][5])
219 for (i = 0; i < 24; i++) {
228 #elif defined(KECCAK_1X)
230 * This implementation is optimization of above code featuring unroll
231 * of even y-loops, their fusion and code motion. It also minimizes
232 * temporary storage. Compiler would normally do all these things for
233 * you, purpose of manual optimization is to provide "unobscured"
234 * reference for assembly implementation [in case this approach is
235 * chosen for implementation on some platform]. In the nutshell it's
236 * equivalent of "plane-per-plane processing" approach discussed in
237 * section 2.4 of "Keccak implementation overview".
/*
 * Round (KECCAK_1X): one full Keccak-f[1600] round with the y-loops
 * unrolled and fused ("plane-per-plane" processing). Theta's D[]
 * values are computed once, then each output plane is produced by
 * applying rho, pi and chi in a single pass; iota is folded into
 * the A[0][0] assignment.
 * NOTE(review): this extract shows TWO alternative computations of
 * D/T (original lines 253-269 vs 271-287); in the full source they
 * are presumably separated by preprocessor conditionals that are
 * omitted here -- verify against the original file.
 */
239 static void Round(uint64_t A[5][5], size_t i)
241 uint64_t C[5], E[2]; /* registers */
242 uint64_t D[5], T[2][5]; /* memory */
244 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
/* theta: column parities */
246 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
247 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
248 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
249 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
250 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
/* alternative 1: D staged through C/E registers */
253 D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
254 D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
255 D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
256 D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
257 D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
259 T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
260 T[0][1] = A[0][1] ^ E[0]; /* D[1] */
261 T[0][2] = A[0][2] ^ C[1]; /* D[2] */
262 T[0][3] = A[0][3] ^ C[2]; /* D[3] */
263 T[0][4] = A[0][4] ^ E[1]; /* D[4] */
265 C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
266 C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
267 C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
268 C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
269 C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
/* alternative 2: straightforward D computation */
271 D[0] = ROL64(C[1], 1) ^ C[4];
272 D[1] = ROL64(C[2], 1) ^ C[0];
273 D[2] = ROL64(C[3], 1) ^ C[1];
274 D[3] = ROL64(C[4], 1) ^ C[2];
275 D[4] = ROL64(C[0], 1) ^ C[3];
277 T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
278 T[0][1] = A[0][1] ^ D[1];
279 T[0][2] = A[0][2] ^ D[2];
280 T[0][3] = A[0][3] ^ D[3];
281 T[0][4] = A[0][4] ^ D[4];
283 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
284 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
285 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
286 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
287 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
/* chi + iota for plane 0 */
289 A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
290 A[0][1] = C[1] ^ (~C[2] & C[3]);
291 A[0][2] = C[2] ^ (~C[3] & C[4]);
292 A[0][3] = C[3] ^ (~C[4] & C[0]);
293 A[0][4] = C[4] ^ (~C[0] & C[1]);
/* stash theta-ed lanes still needed by later planes */
295 T[1][0] = A[1][0] ^ (C[3] = D[0]);
296 T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
297 T[1][2] = A[1][2] ^ (E[0] = D[2]);
298 T[1][3] = A[1][3] ^ (E[1] = D[3]);
299 T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
/* plane 1: rho+pi then chi */
301 C[0] = ROL64(T[0][3], rhotates[0][3]);
302 C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
303 C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
304 C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
305 C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
307 A[1][0] = C[0] ^ (~C[1] & C[2]);
308 A[1][1] = C[1] ^ (~C[2] & C[3]);
309 A[1][2] = C[2] ^ (~C[3] & C[4]);
310 A[1][3] = C[3] ^ (~C[4] & C[0]);
311 A[1][4] = C[4] ^ (~C[0] & C[1]);
/* plane 2 */
313 C[0] = ROL64(T[0][1], rhotates[0][1]);
314 C[1] = ROL64(T[1][2], rhotates[1][2]);
315 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
316 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
317 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
319 A[2][0] = C[0] ^ (~C[1] & C[2]);
320 A[2][1] = C[1] ^ (~C[2] & C[3]);
321 A[2][2] = C[2] ^ (~C[3] & C[4]);
322 A[2][3] = C[3] ^ (~C[4] & C[0]);
323 A[2][4] = C[4] ^ (~C[0] & C[1]);
/* plane 3 */
325 C[0] = ROL64(T[0][4], rhotates[0][4]);
326 C[1] = ROL64(T[1][0], rhotates[1][0]);
327 C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
328 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
329 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
331 A[3][0] = C[0] ^ (~C[1] & C[2]);
332 A[3][1] = C[1] ^ (~C[2] & C[3]);
333 A[3][2] = C[2] ^ (~C[3] & C[4]);
334 A[3][3] = C[3] ^ (~C[4] & C[0]);
335 A[3][4] = C[4] ^ (~C[0] & C[1]);
/* plane 4 */
337 C[0] = ROL64(T[0][2], rhotates[0][2]);
338 C[1] = ROL64(T[1][3], rhotates[1][3]);
339 C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
340 C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
341 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
343 A[4][0] = C[0] ^ (~C[1] & C[2]);
344 A[4][1] = C[1] ^ (~C[2] & C[3]);
345 A[4][2] = C[2] ^ (~C[3] & C[4]);
346 A[4][3] = C[3] ^ (~C[4] & C[0]);
347 A[4][4] = C[4] ^ (~C[0] & C[1]);
/*
 * KeccakF1600 (KECCAK_1X variant): 24 calls to Round() (the call
 * inside the loop is omitted from this extract).
 */
350 void KeccakF1600(uint64_t A[5][5])
354 for (i = 0; i < 24; i++) {
359 #elif defined(KECCAK_1X_ALT)
361 * This is variant of above KECCAK_1X that reduces requirement for
362 * temporary storage even further, but at cost of more updates to A[][].
363 * It's less suitable if A[][] is memory bound, but better if it's
/*
 * Round (KECCAK_1X_ALT): variant of KECCAK_1X that minimizes
 * temporary storage by updating A[][] in place: rho+pi are realized
 * as a cycle of rotated moves through the lane permutation, then chi
 * is applied in place with only C[] / D[] carrying the wrapped lanes.
 * NOTE(review): original lines 384-419 (presumably the theta
 * application to A[][] and related setup) and several brace lines are
 * omitted from this extract -- verify against the full source.
 */
367 static void Round(uint64_t A[5][5], size_t i)
371 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
/* theta: column parities */
373 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
374 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
375 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
376 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
377 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
/* D[x]; note the ^= forms also repurpose C[] as scratch */
379 D[1] = C[0] ^ ROL64(C[2], 1);
380 D[2] = C[1] ^ ROL64(C[3], 1);
381 D[3] = C[2] ^= ROL64(C[4], 1);
382 D[4] = C[3] ^= ROL64(C[0], 1);
383 D[0] = C[4] ^= ROL64(C[1], 1);
/* rho + pi performed as in-place cycles of rotated lane moves */
420 A[0][1] = ROL64(A[1][1], rhotates[1][1]);
421 A[0][2] = ROL64(A[2][2], rhotates[2][2]);
422 A[0][3] = ROL64(A[3][3], rhotates[3][3]);
423 A[0][4] = ROL64(A[4][4], rhotates[4][4]);
425 A[1][1] = ROL64(A[1][4], rhotates[1][4]);
426 A[2][2] = ROL64(A[2][3], rhotates[2][3]);
427 A[3][3] = ROL64(A[3][2], rhotates[3][2]);
428 A[4][4] = ROL64(A[4][1], rhotates[4][1]);
430 A[1][4] = ROL64(A[4][2], rhotates[4][2]);
431 A[2][3] = ROL64(A[3][4], rhotates[3][4]);
432 A[3][2] = ROL64(A[2][1], rhotates[2][1]);
433 A[4][1] = ROL64(A[1][3], rhotates[1][3]);
435 A[4][2] = ROL64(A[2][4], rhotates[2][4]);
436 A[3][4] = ROL64(A[4][3], rhotates[4][3]);
437 A[2][1] = ROL64(A[1][2], rhotates[1][2]);
438 A[1][3] = ROL64(A[3][1], rhotates[3][1]);
440 A[2][4] = ROL64(A[4][0], rhotates[4][0]);
441 A[4][3] = ROL64(A[3][0], rhotates[3][0]);
442 A[1][2] = ROL64(A[2][0], rhotates[2][0]);
443 A[3][1] = ROL64(A[1][0], rhotates[1][0]);
/* the cycle closes using values saved in C[] */
445 A[1][0] = ROL64(C[3], rhotates[0][3]);
446 A[2][0] = ROL64(C[1], rhotates[0][1]);
447 A[3][0] = ROL64(C[4], rhotates[0][4]);
448 A[4][0] = ROL64(C[2], rhotates[0][2]);
/* chi in place, rows 0 and 1 interleaved; C[]/D[] hold the
 * original values of the lanes already overwritten */
455 A[0][0] ^= (~A[0][1] & A[0][2]);
456 A[1][0] ^= (~A[1][1] & A[1][2]);
457 A[0][1] ^= (~A[0][2] & A[0][3]);
458 A[1][1] ^= (~A[1][2] & A[1][3]);
459 A[0][2] ^= (~A[0][3] & A[0][4]);
460 A[1][2] ^= (~A[1][3] & A[1][4]);
461 A[0][3] ^= (~A[0][4] & C[0]);
462 A[1][3] ^= (~A[1][4] & C[1]);
463 A[0][4] ^= (~C[0] & D[0]);
464 A[1][4] ^= (~C[1] & D[1]);
/* chi, rows 2 and 3 */
471 A[2][0] ^= (~A[2][1] & A[2][2]);
472 A[3][0] ^= (~A[3][1] & A[3][2]);
473 A[2][1] ^= (~A[2][2] & A[2][3]);
474 A[3][1] ^= (~A[3][2] & A[3][3]);
475 A[2][2] ^= (~A[2][3] & A[2][4]);
476 A[3][2] ^= (~A[3][3] & A[3][4]);
477 A[2][3] ^= (~A[2][4] & C[2]);
478 A[3][3] ^= (~A[3][4] & C[3]);
479 A[2][4] ^= (~C[2] & D[2]);
480 A[3][4] ^= (~C[3] & D[3]);
/* chi, row 4 */
485 A[4][0] ^= (~A[4][1] & A[4][2]);
486 A[4][1] ^= (~A[4][2] & A[4][3]);
487 A[4][2] ^= (~A[4][3] & A[4][4]);
488 A[4][3] ^= (~A[4][4] & C[4]);
489 A[4][4] ^= (~C[4] & D[4]);
/*
 * KeccakF1600 (KECCAK_1X_ALT variant): 24 calls to Round() (call
 * inside the loop omitted from this extract).
 */
493 void KeccakF1600(uint64_t A[5][5])
497 for (i = 0; i < 24; i++) {
502 #elif defined(KECCAK_2X)
504 * This implementation is variant of KECCAK_1X above with outer-most
505 * round loop unrolled twice. This allows to take temporary storage
506 * out of round procedure and simplify references to it by alternating
507 * it with actual data (see round loop below). Just like original, it's
508 * rather meant as reference for an assembly implementation. It's likely
509 * to provide best instruction per processed byte ratio at minimal
510 * round unroll factor...
/*
 * Round (KECCAK_2X): writes the round output into a separate state R
 * rather than in place, so the caller can alternate R and A between
 * two unrolled rounds without temporary storage. For each output
 * plane there are two chi forms: the KECCAK_COMPLEMENTING_TRANSFORM
 * version (mixed AND/OR on selectively complemented lanes, per the
 * Keccak team's "lane complementing" technique) and the plain
 * ~x & y form.
 * NOTE(review): the #else / #endif lines pairing with each #ifdef
 * are omitted from this extract; both chi variants appear back to
 * back as a result.
 */
512 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
516 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
/* theta: column parities and D[] */
518 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
519 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
520 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
521 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
522 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
524 D[0] = ROL64(C[1], 1) ^ C[4];
525 D[1] = ROL64(C[2], 1) ^ C[0];
526 D[2] = ROL64(C[3], 1) ^ C[1];
527 D[3] = ROL64(C[4], 1) ^ C[2];
528 D[4] = ROL64(C[0], 1) ^ C[3];
/* plane 0: theta + rho (+pi via lane selection), then chi + iota */
530 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
531 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
532 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
533 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
534 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
536 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
537 R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
538 R[0][1] = C[1] ^ (~C[2] | C[3]);
539 R[0][2] = C[2] ^ ( C[3] & C[4]);
540 R[0][3] = C[3] ^ ( C[4] | C[0]);
541 R[0][4] = C[4] ^ ( C[0] & C[1]);
543 R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
544 R[0][1] = C[1] ^ (~C[2] & C[3]);
545 R[0][2] = C[2] ^ (~C[3] & C[4]);
546 R[0][3] = C[3] ^ (~C[4] & C[0]);
547 R[0][4] = C[4] ^ (~C[0] & C[1]);
/* plane 1 */
550 C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
551 C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
552 C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
553 C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
554 C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
556 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
557 R[1][0] = C[0] ^ (C[1] | C[2]);
558 R[1][1] = C[1] ^ (C[2] & C[3]);
559 R[1][2] = C[2] ^ (C[3] | ~C[4]);
560 R[1][3] = C[3] ^ (C[4] | C[0]);
561 R[1][4] = C[4] ^ (C[0] & C[1]);
563 R[1][0] = C[0] ^ (~C[1] & C[2]);
564 R[1][1] = C[1] ^ (~C[2] & C[3]);
565 R[1][2] = C[2] ^ (~C[3] & C[4]);
566 R[1][3] = C[3] ^ (~C[4] & C[0]);
567 R[1][4] = C[4] ^ (~C[0] & C[1]);
/* plane 2 */
570 C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
571 C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
572 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
573 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
574 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
576 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
577 R[2][0] = C[0] ^ ( C[1] | C[2]);
578 R[2][1] = C[1] ^ ( C[2] & C[3]);
579 R[2][2] = C[2] ^ (~C[3] & C[4]);
580 R[2][3] = ~C[3] ^ ( C[4] | C[0]);
581 R[2][4] = C[4] ^ ( C[0] & C[1]);
583 R[2][0] = C[0] ^ (~C[1] & C[2]);
584 R[2][1] = C[1] ^ (~C[2] & C[3]);
585 R[2][2] = C[2] ^ (~C[3] & C[4]);
586 R[2][3] = C[3] ^ (~C[4] & C[0]);
587 R[2][4] = C[4] ^ (~C[0] & C[1]);
/* plane 3 */
590 C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
591 C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
592 C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
593 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
594 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
596 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
597 R[3][0] = C[0] ^ ( C[1] & C[2]);
598 R[3][1] = C[1] ^ ( C[2] | C[3]);
599 R[3][2] = C[2] ^ (~C[3] | C[4]);
600 R[3][3] = ~C[3] ^ ( C[4] & C[0]);
601 R[3][4] = C[4] ^ ( C[0] | C[1]);
603 R[3][0] = C[0] ^ (~C[1] & C[2]);
604 R[3][1] = C[1] ^ (~C[2] & C[3]);
605 R[3][2] = C[2] ^ (~C[3] & C[4]);
606 R[3][3] = C[3] ^ (~C[4] & C[0]);
607 R[3][4] = C[4] ^ (~C[0] & C[1]);
/* plane 4 */
610 C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
611 C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
612 C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
613 C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
614 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
616 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
617 R[4][0] = C[0] ^ (~C[1] & C[2]);
618 R[4][1] = ~C[1] ^ ( C[2] | C[3]);
619 R[4][2] = C[2] ^ ( C[3] & C[4]);
620 R[4][3] = C[3] ^ ( C[4] | C[0]);
621 R[4][4] = C[4] ^ ( C[0] & C[1]);
623 R[4][0] = C[0] ^ (~C[1] & C[2]);
624 R[4][1] = C[1] ^ (~C[2] & C[3]);
625 R[4][2] = C[2] ^ (~C[3] & C[4]);
626 R[4][3] = C[3] ^ (~C[4] & C[0]);
627 R[4][4] = C[4] ^ (~C[0] & C[1]);
/*
 * KeccakF1600 (KECCAK_2X variant): processes two rounds per loop
 * iteration, alternating the two states passed to Round(). When
 * KECCAK_COMPLEMENTING_TRANSFORM is defined, selected lanes are
 * complemented before and after the loop (those lines are omitted
 * from this extract).
 */
631 void KeccakF1600(uint64_t A[5][5])
636 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
645 for (i = 0; i < 24; i += 2) {
650 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
662 * This implementation is KECCAK_1X from above combined 4 times with
663 * a twist that allows to omit temporary storage and perform in-place
664 * processing. It's discussed in section 2.5 of "Keccak implementation
665 * overview". It's likely to be best suited for processors with large
/*
 * FourRounds: four consecutive Keccak-f[1600] rounds combined for
 * in-place processing without a second state (section 2.5 of the
 * "Keccak implementation overview"). The pi permutation is absorbed
 * into the A[][] indices, which therefore differ from round to
 * round; after four rounds the lanes are back in their canonical
 * positions. C[] accumulates the column parities of each round's
 * output so the next round's theta needs no separate pass.
 * NOTE(review): round-separating comments and some brace lines are
 * omitted from this extract.
 */
668 static void FourRounds(uint64_t A[5][5], size_t i)
670 uint64_t B[5], C[5], D[5];
672 assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4));
/* ---- round i: theta parities ---- */
675 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
676 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
677 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
678 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
679 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
681 D[0] = ROL64(C[1], 1) ^ C[4];
682 D[1] = ROL64(C[2], 1) ^ C[0];
683 D[2] = ROL64(C[3], 1) ^ C[1];
684 D[3] = ROL64(C[4], 1) ^ C[2];
685 D[4] = ROL64(C[0], 1) ^ C[3];
687 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
688 B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
689 B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
690 B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
691 B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
/* chi + iota; C[] starts accumulating next round's parities */
693 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
694 C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
695 C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
696 C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
697 C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
699 B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
700 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
701 B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
702 B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
703 B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
705 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
706 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
707 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
708 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
709 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
711 B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
712 B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
713 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
714 B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
715 B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
717 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
718 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
719 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
720 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
721 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
723 B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
724 B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
725 B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
726 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
727 B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
729 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
730 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
731 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
732 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
733 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
735 B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
736 B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
737 B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
738 B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
739 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
741 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
742 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
743 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
744 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
745 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
/* ---- round i+1: lanes now sit at pi-permuted indices ---- */
748 D[0] = ROL64(C[1], 1) ^ C[4];
749 D[1] = ROL64(C[2], 1) ^ C[0];
750 D[2] = ROL64(C[3], 1) ^ C[1];
751 D[3] = ROL64(C[4], 1) ^ C[2];
752 D[4] = ROL64(C[0], 1) ^ C[3];
754 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
755 B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
756 B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
757 B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
758 B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
760 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
761 C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
762 C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
763 C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
764 C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
766 B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
767 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
768 B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
769 B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
770 B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
772 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
773 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
774 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
775 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
776 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
778 B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
779 B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
780 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
781 B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
782 B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
784 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
785 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
786 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
787 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
788 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
790 B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
791 B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
792 B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
793 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
794 B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
796 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
797 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
798 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
799 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
800 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
802 B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
803 B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
804 B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
805 B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
806 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
808 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
809 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
810 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
811 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
812 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
/* ---- round i+2 ---- */
815 D[0] = ROL64(C[1], 1) ^ C[4];
816 D[1] = ROL64(C[2], 1) ^ C[0];
817 D[2] = ROL64(C[3], 1) ^ C[1];
818 D[3] = ROL64(C[4], 1) ^ C[2];
819 D[4] = ROL64(C[0], 1) ^ C[3];
821 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
822 B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
823 B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
824 B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
825 B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
827 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
828 C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
829 C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
830 C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
831 C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
833 B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
834 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
835 B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
836 B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
837 B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
839 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
840 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
841 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
842 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
843 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
845 B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
846 B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
847 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
848 B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
849 B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
851 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
852 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
853 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
854 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
855 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
857 B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
858 B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
859 B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
860 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
861 B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
863 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
864 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
865 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
866 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
867 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
869 B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
870 B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
871 B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
872 B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
873 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
875 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
876 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
877 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
878 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
879 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
/* ---- round i+3: indices return to canonical positions, so the
 * C[] parity accumulation is no longer needed ---- */
882 D[0] = ROL64(C[1], 1) ^ C[4];
883 D[1] = ROL64(C[2], 1) ^ C[0];
884 D[2] = ROL64(C[3], 1) ^ C[1];
885 D[3] = ROL64(C[4], 1) ^ C[2];
886 D[4] = ROL64(C[0], 1) ^ C[3];
888 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
889 B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
890 B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
891 B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
892 B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
894 /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
895 /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
896 /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
897 /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
898 /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
900 B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
901 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
902 B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
903 B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
904 B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
906 /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
907 /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
908 /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
909 /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
910 /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
912 B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
913 B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
914 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
915 B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
916 B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
918 /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
919 /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
920 /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
921 /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
922 /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
924 B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
925 B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
926 B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
927 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
928 B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
930 /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
931 /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
932 /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
933 /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
934 /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
936 B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
937 B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
938 B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
939 B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
940 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
942 /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
943 /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
944 /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
945 /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
946 /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
/*
 * KeccakF1600 (in-place 4x variant): 24 rounds as six calls to
 * FourRounds() (the call inside the loop is omitted from this
 * extract).
 */
949 void KeccakF1600(uint64_t A[5][5])
953 for (i = 0; i < 24; i += 4) {
/*
 * BitInterleave: convert a 64-bit lane to the bit-interleaved
 * representation. The even-indexed bits are gathered into the low
 * 32-bit half and the odd-indexed bits into the high half, using the
 * classic shift-and-mask bit-compaction sequences. No-op when
 * BIT_INTERLEAVE is 0 (the passthrough return line is omitted from
 * this extract, as are some masking lines).
 */
960 static uint64_t BitInterleave(uint64_t Ai)
962 if (BIT_INTERLEAVE) {
963 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
/* compact the even bits of the low word into t0's low 16 bits */
966 t0 = lo & 0x55555555;
967 t0 |= t0 >> 1; t0 &= 0x33333333;
968 t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
969 t0 |= t0 >> 4; t0 &= 0x00ff00ff;
970 t0 |= t0 >> 8; t0 &= 0x0000ffff;
/* compact the even bits of the high word into t1's high 16 bits */
972 t1 = hi & 0x55555555;
973 t1 |= t1 >> 1; t1 &= 0x33333333;
974 t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
975 t1 |= t1 >> 4; t1 &= 0x00ff00ff;
976 t1 |= t1 >> 8; t1 <<= 16;
/* compact the odd bits of the low word (initial mask line omitted
 * from this extract -- presumably lo &= 0xaaaaaaaa, verify) */
979 lo |= lo << 1; lo &= 0xcccccccc;
980 lo |= lo << 2; lo &= 0xf0f0f0f0;
981 lo |= lo << 4; lo &= 0xff00ff00;
982 lo |= lo << 8; lo >>= 16;
/* compact the odd bits of the high word */
985 hi |= hi << 1; hi &= 0xcccccccc;
986 hi |= hi << 2; hi &= 0xf0f0f0f0;
987 hi |= hi << 4; hi &= 0xff00ff00;
988 hi |= hi << 8; hi &= 0xffff0000;
/* reassemble: odd bits in the high 32, even bits in the low 32 */
990 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
/*
 * BitDeinterleave: inverse of BitInterleave -- spread the packed
 * even/odd bit halves back to their original positions using the
 * reverse shift-and-mask sequences. No-op when BIT_INTERLEAVE is 0
 * (the passthrough return line is omitted from this extract, as are
 * some initial-mask lines).
 */
996 static uint64_t BitDeinterleave(uint64_t Ai)
998 if (BIT_INTERLEAVE) {
999 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
/* spread the low 16 bits of lo back to even bit positions */
1002 t0 = lo & 0x0000ffff;
1003 t0 |= t0 << 8; t0 &= 0x00ff00ff;
1004 t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
1005 t0 |= t0 << 2; t0 &= 0x33333333;
1006 t0 |= t0 << 1; t0 &= 0x55555555;
/* spread the high 16 bits of lo back to odd positions (the initial
 * t1 assignment is omitted from this extract) */
1009 t1 |= t1 >> 8; t1 &= 0xff00ff00;
1010 t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
1011 t1 |= t1 >> 2; t1 &= 0xcccccccc;
1012 t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;
/* same two expansions for the high word's halves */
1015 lo |= lo << 8; lo &= 0x00ff00ff;
1016 lo |= lo << 4; lo &= 0x0f0f0f0f;
1017 lo |= lo << 2; lo &= 0x33333333;
1018 lo |= lo << 1; lo &= 0x55555555;
1021 hi |= hi >> 8; hi &= 0xff00ff00;
1022 hi |= hi >> 4; hi &= 0xf0f0f0f0;
1023 hi |= hi >> 2; hi &= 0xcccccccc;
1024 hi |= hi >> 1; hi &= 0xaaaaaaaa;
1026 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1033 * SHA3_absorb can be called multiple times, but at each invocation
1034 * largest multiple of |r| out of |len| bytes are processed. Then
1035 * remaining amount of bytes is returned. This is done to spare caller
1036 * trouble of calculating the largest multiple of |r|. |r| can be viewed
1037 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1038 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1039 * padding and intermediate sub-block buffering, byte- or bitwise, is
1040 * caller's responsibility.
/*
 * SHA3_absorb: XOR message blocks into the state and permute.
 * Processes the largest multiple of |r| bytes out of |len| and
 * returns the remainder (per the comment block preceding this
 * function); padding and sub-block buffering are the caller's job.
 * Input bytes are assembled little-endian into 64-bit lanes and
 * bit-interleaved before XORing into the flat state view.
 * NOTE(review): the signature's trailing "size_t r)", the outer
 * while-loop, the inp advance and the KeccakF1600 call are omitted
 * from this extract.
 */
1042 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
/* flat 25-lane view of the 5x5 state */
1045 uint64_t *A_flat = (uint64_t *)A;
1046 size_t i, w = r / 8;
/* rate must fit in the state and be lane-aligned */
1048 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1051 for (i = 0; i < w; i++) {
/* little-endian load of one 8-byte lane */
1052 uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 |
1053 (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
1054 (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
1055 (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
1058 A_flat[i] ^= BitInterleave(Ai);
1068 * SHA3_squeeze is called once at the end to generate |out| hash value
/*
 * SHA3_squeeze: emit |len| bytes of output. Lanes are
 * de-interleaved and stored little-endian, 8 bytes at a time; the
 * final partial lane (rem bytes) is emitted bytewise. The state is
 * re-permuted between full |r|-byte blocks (the KeccakF1600 call and
 * surrounding loop control are omitted from this extract).
 */
1071 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r)
1073 uint64_t *A_flat = (uint64_t *)A;
1074 size_t i, rem, w = r / 8;
1076 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
/* full-rate block: w whole lanes */
1079 for (i = 0; i < w; i++) {
1080 uint64_t Ai = BitDeinterleave(A_flat[i]);
1082 out[0] = (unsigned char)(Ai);
1083 out[1] = (unsigned char)(Ai >> 8);
1084 out[2] = (unsigned char)(Ai >> 16);
1085 out[3] = (unsigned char)(Ai >> 24);
1086 out[4] = (unsigned char)(Ai >> 32);
1087 out[5] = (unsigned char)(Ai >> 40);
1088 out[6] = (unsigned char)(Ai >> 48);
1089 out[7] = (unsigned char)(Ai >> 56);
/* tail: whole lanes remaining in the last partial block
 * (NOTE(review): the computation of len/rem for this path is
 * omitted from this extract) */
1100 for (i = 0; i < len; i++) {
1101 uint64_t Ai = BitDeinterleave(A_flat[i]);
1103 out[0] = (unsigned char)(Ai);
1104 out[1] = (unsigned char)(Ai >> 8);
1105 out[2] = (unsigned char)(Ai >> 16);
1106 out[3] = (unsigned char)(Ai >> 24);
1107 out[4] = (unsigned char)(Ai >> 32);
1108 out[5] = (unsigned char)(Ai >> 40);
1109 out[6] = (unsigned char)(Ai >> 48);
1110 out[7] = (unsigned char)(Ai >> 56);
/* last partial lane: emit rem bytes, least-significant first */
1115 uint64_t Ai = BitDeinterleave(A_flat[i]);
1117 for (i = 0; i < rem; i++) {
1118 *out++ = (unsigned char)Ai;
1124 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1126 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r);
1131 * Post-padding one-shot implementations would look as following:
1133 * SHA3_224 SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1134 * SHA3_256 SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1135 * SHA3_384 SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1136 * SHA3_512 SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1137 * SHAKE_128 SHA3_sponge(inp, len, out, d, (1600-256)/8);
1138 * SHAKE_256 SHA3_sponge(inp, len, out, d, (1600-512)/8);
/*
 * One-shot sponge: zero the 25-lane state, absorb the (pre-padded)
 * |len|-byte input |inp|, then squeeze |d| bytes of digest into |out|,
 * all at rate |r|.  The table in the comment above lists the (d, r)
 * pairs for SHA3-224/256/384/512 and SHAKE128/256.
 * NOTE(review): the opening brace and the declaration of the state
 * array A (uint64_t A[5][5]) are on elided lines of this excerpt.
 */
1141 void SHA3_sponge(const unsigned char *inp, size_t len,
1142 unsigned char *out, size_t d, size_t r)
1146 memset(A, 0, sizeof(A));
1147 SHA3_absorb(A, inp, len, r);
1148 SHA3_squeeze(A, out, d, r);
1156 * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
1158 unsigned char test[168] = { '\xf3', '\x3' };
1159 unsigned char out[512];
1161 static const unsigned char result[512] = {
1162 0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
1163 0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
1164 0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
1165 0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
1166 0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
1167 0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
1168 0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
1169 0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
1170 0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
1171 0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
1172 0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
1173 0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
1174 0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
1175 0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
1176 0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
1177 0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
1178 0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
1179 0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
1180 0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
1181 0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
1182 0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
1183 0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
1184 0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
1185 0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
1186 0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
1187 0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
1188 0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
1189 0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
1190 0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
1191 0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
1192 0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
1193 0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
1194 0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
1195 0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
1196 0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
1197 0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
1198 0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
1199 0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
1200 0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
1201 0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
1202 0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
1203 0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
1204 0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
1205 0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
1206 0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
1207 0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
1208 0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
1209 0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
1210 0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
1211 0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
1212 0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
1213 0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
1214 0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
1215 0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
1216 0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
1217 0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
1218 0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
1219 0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
1220 0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
1221 0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
1222 0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
1223 0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
1224 0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
1225 0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
1229 SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
1232 * Rationale behind keeping output [formatted as below] is that
1233 * one should be able to redirect it to a file, then copy-n-paste
1234 * final "output val" from official example to another file, and
1235 * compare the two with diff(1).
1237 for (i = 0; i < sizeof(out);) {
1238 printf("%02X", out[i]);
1239 printf(++i % 16 && i != sizeof(out) ? " " : "\n");
1242 if (memcmp(out,result,sizeof(out))) {
1243 fprintf(stderr,"failure\n");
1246 fprintf(stderr,"success\n");