2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for s390x.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # In the nutshell it's transliteration of x86_64 module, because both
26 # architectures have similar capabilities/limitations. Performance
27 # measurement is problematic as I don't have access to an idle system.
28 # It looks like z13 processes one byte [out of long message] in ~14
29 # cycles. At least the result is consistent with estimate based on
30 # amount of instruction and assumed instruction issue rate. It's ~2.5x
31 # faster than compiler-generated code.
35 if ($flavour =~ /3[12]/) {
43 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
44 open STDOUT,">$output";
46 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
48 my @C = map("%r$_",(0,1,5..7));
49 my @D = map("%r$_",(8..12));
50 my @T = map("%r$_",(13..14));
51 my ($src,$dst,$iotas) = map("%r$_",(2..4));
54 $stdframe=16*$SIZE_T+4*8;
55 $frame=$stdframe+25*8;
57 my @rhotates = ([ 0, 1, 62, 28, 27 ],
58 [ 36, 44, 6, 55, 20 ],
59 [ 3, 10, 43, 25, 39 ],
60 [ 41, 45, 15, 21, 8 ],
61 [ 18, 2, 61, 56, 14 ]);
63 { my @C = @C; # copy, because we mess them up...
69 .type __KeccakF1600,\@function
72 st${g} %r14,$SIZE_T*14($sp)
73 lg @C[0],$A[4][0]($src)
74 lg @C[1],$A[4][1]($src)
75 lg @C[2],$A[4][2]($src)
76 lg @C[3],$A[4][3]($src)
77 lg @C[4],$A[4][4]($src)
83 lg @D[0],$A[0][0]($src)
84 lg @D[1],$A[1][1]($src)
85 lg @D[2],$A[2][2]($src)
86 lg @D[3],$A[3][3]($src)
89 xg @C[1],$A[0][1]($src)
90 xg @C[2],$A[0][2]($src)
91 xg @C[3],$A[0][3]($src)
93 xg @C[4],$A[0][4]($src)
95 xg @C[0],$A[1][0]($src)
97 xg @C[2],$A[1][2]($src)
98 xg @C[3],$A[1][3]($src)
99 xg @C[4],$A[1][4]($src)
101 xg @C[0],$A[2][0]($src)
102 xg @C[1],$A[2][1]($src)
104 xg @C[3],$A[2][3]($src)
105 xg @C[4],$A[2][4]($src)
107 xg @C[0],$A[3][0]($src)
108 xg @C[1],$A[3][1]($src)
109 xg @C[2],$A[3][2]($src)
111 xg @C[4],$A[3][4]($src)
115 xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0]
118 xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3]
121 xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1]
124 xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4]
127 xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2]
129 (@D[0..4], @C) = (@C[1..4,0], @D);
134 rllg @C[1],@C[1],$rhotates[1][1]
136 rllg @C[2],@C[2],$rhotates[2][2]
141 rllg @C[3],@C[3],$rhotates[3][3]
142 xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2])
143 rllg @C[4],@C[4],$rhotates[4][4]
146 stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
150 lghi @C[1],-1 # no 'not' instruction :-(
151 xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3])
152 xgr @C[2],@C[1] # not @C[2]
153 stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3])
155 xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3])
158 stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3])
159 xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0])
161 stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0])
162 xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0])
163 stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0])
166 lg @C[0],$A[0][3]($src)
167 lg @C[4],$A[4][2]($src)
168 lg @C[3],$A[3][1]($src)
169 lg @C[1],$A[1][4]($src)
170 lg @C[2],$A[2][0]($src)
174 rllg @C[0],@C[0],$rhotates[0][3]
176 rllg @C[4],@C[4],$rhotates[4][2]
178 rllg @C[3],@C[3],$rhotates[3][1]
183 rllg @C[1],@C[1],$rhotates[1][4]
184 xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4])
185 rllg @C[2],@C[2],$rhotates[2][0]
186 stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4])
190 lghi @C[0],-1 # no 'not' instruction :-(
191 xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0])
192 xgr @C[4],@C[0] # not @C[4]
193 stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0])
196 xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3])
199 stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3])
200 xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2])
202 stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2])
203 xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2])
204 stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2])
207 lg @C[2],$A[2][3]($src)
208 lg @C[3],$A[3][4]($src)
209 lg @C[1],$A[1][2]($src)
210 lg @C[4],$A[4][0]($src)
211 lg @C[0],$A[0][1]($src)
215 rllg @C[2],@C[2],$rhotates[2][3]
217 rllg @C[3],@C[3],$rhotates[3][4]
219 rllg @C[1],@C[1],$rhotates[1][2]
224 rllg @C[4],@C[4],$rhotates[4][0]
225 xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3])
226 lghi @T[1],-1 # no 'not' instruction :-(
227 stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3])
229 xgr @C[3],@T[1] # not @C[3]
232 rllg @C[0],@C[0],$rhotates[0][1]
233 xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3])
235 stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
236 xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1])
239 stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1])
240 xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0])
242 stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0])
243 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4])
244 stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
247 lg @C[2],$A[2][1]($src)
248 lg @C[3],$A[3][2]($src)
249 lg @C[1],$A[1][0]($src)
250 lg @C[4],$A[4][3]($src)
251 lg @C[0],$A[0][4]($src)
255 rllg @C[2],@C[2],$rhotates[2][1]
257 rllg @C[3],@C[3],$rhotates[3][2]
259 rllg @C[1],@C[1],$rhotates[1][0]
261 rllg @C[4],@C[4],$rhotates[4][3]
265 lghi @T[1],-1 # no 'not' instruction :-(
266 xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3])
267 xgr @C[3],@T[1] # not @C[3]
268 stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3])
272 rllg @C[0],@C[0],$rhotates[0][4]
273 xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3])
275 stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
276 xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1])
279 stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1])
280 xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0])
282 stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0])
283 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4])
284 stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
287 xg @D[2],$A[0][2]($src)
288 xg @D[3],$A[1][3]($src)
289 xg @D[1],$A[4][1]($src)
290 xg @D[4],$A[2][4]($src)
291 xgr $dst,$src # xchg $dst,$src
292 rllg @D[2],@D[2],$rhotates[0][2]
293 xg @D[0],$A[3][0]($src)
294 rllg @D[3],@D[3],$rhotates[1][3]
296 rllg @D[1],@D[1],$rhotates[4][1]
298 rllg @D[4],@D[4],$rhotates[2][4]
304 lghi @T[1],-1 # no 'not' instruction :-(
305 xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1])
306 xgr @C[1],@T[1] # not @C[1]
307 stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1])
311 rllg @D[0],@D[0],$rhotates[3][0]
312 xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1])
314 stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
315 xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4])
318 stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4])
319 xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3])
321 stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3])
322 xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3])
324 lgr @C[1],@C[0] # harmonize with the loop top
326 stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
331 l${g} %r14,$SIZE_T*14($sp)
333 .size __KeccakF1600,.-__KeccakF1600
338 .type KeccakF1600,\@function
343 stm${g} %r6,%r15,$SIZE_T*6($sp)
348 lghi @D[0],-1 # no 'not' instruction :-(
354 xg @D[0],$A[0][1]($src)
355 xg @D[1],$A[0][2]($src)
356 xg @D[2],$A[1][3]($src)
357 xg @D[3],$A[2][2]($src)
358 xg @D[4],$A[3][2]($src)
359 xg @T[0],$A[4][0]($src)
360 stmg @D[0],@D[1],$A[0][1]($src)
361 stg @D[2],$A[1][3]($src)
362 stg @D[3],$A[2][2]($src)
363 stg @D[4],$A[3][2]($src)
364 stg @T[0],$A[4][0]($src)
366 la $dst,$stdframe($sp)
368 bras %r14,__KeccakF1600
370 lghi @D[0],-1 # no 'not' instruction :-(
376 xg @D[0],$A[0][1]($src)
377 xg @D[1],$A[0][2]($src)
378 xg @D[2],$A[1][3]($src)
379 xg @D[3],$A[2][2]($src)
380 xg @D[4],$A[3][2]($src)
381 xg @T[0],$A[4][0]($src)
382 stmg @D[0],@D[1],$A[0][1]($src)
383 stg @D[2],$A[1][3]($src)
384 stg @D[3],$A[2][2]($src)
385 stg @D[4],$A[3][2]($src)
386 stg @T[0],$A[4][0]($src)
388 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
390 .size KeccakF1600,.-KeccakF1600
393 { my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));
397 .type SHA3_absorb,\@function
401 stm${g} %r5,%r15,$SIZE_T*5($sp)
406 lghi @D[0],-1 # no 'not' instruction :-(
412 xg @D[0],$A[0][1]($src)
413 xg @D[1],$A[0][2]($src)
414 xg @D[2],$A[1][3]($src)
415 xg @D[3],$A[2][2]($src)
416 xg @D[4],$A[3][2]($src)
417 xg @T[0],$A[4][0]($src)
418 stmg @D[0],@D[1],$A[0][1]($src)
419 stg @D[2],$A[1][3]($src)
420 stg @D[3],$A[2][2]($src)
421 stg @D[4],$A[3][2]($src)
422 stg @T[0],$A[4][0]($src)
438 brct $bsz,.Lblock_absorb
440 stm${g} $inp,$len,$frame+3*$SIZE_T($sp)
441 la $dst,$stdframe($sp)
442 bras %r14,__KeccakF1600
443 lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp)
448 lghi @D[0],-1 # no 'not' instruction :-(
454 xg @D[0],$A[0][1]($src)
455 xg @D[1],$A[0][2]($src)
456 xg @D[2],$A[1][3]($src)
457 xg @D[3],$A[2][2]($src)
458 xg @D[4],$A[3][2]($src)
459 xg @T[0],$A[4][0]($src)
460 stmg @D[0],@D[1],$A[0][1]($src)
461 stg @D[2],$A[1][3]($src)
462 stg @D[3],$A[2][2]($src)
463 stg @D[4],$A[3][2]($src)
464 stg @T[0],$A[4][0]($src)
466 lgr %r2,$len # return value
468 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
470 .size SHA3_absorb,.-SHA3_absorb
473 { my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));
477 .type SHA3_squeeze,\@function
481 st${g} %r14,2*$SIZE_T($sp)
483 st${g} $bsz,5*$SIZE_T($sp)
497 a${g}hi $len,-8 # len -= 8
500 brct $bsz,.Loop_squeeze # bsz--
502 stm${g} $out,$len,3*$SIZE_T($sp)
503 bras %r14,.LKeccakF1600
504 lm${g} $out,$bsz,3*$SIZE_T($sp)
515 brct $len,.Loop_tail_squeeze
518 l${g} %r14,2*$SIZE_T($sp)
520 .size SHA3_squeeze,.-SHA3_squeeze
525 .quad 0,0,0,0,0,0,0,0
528 .quad 0x0000000000000001
529 .quad 0x0000000000008082
530 .quad 0x800000000000808a
531 .quad 0x8000000080008000
532 .quad 0x000000000000808b
533 .quad 0x0000000080000001
534 .quad 0x8000000080008081
535 .quad 0x8000000000008009
536 .quad 0x000000000000008a
537 .quad 0x0000000000000088
538 .quad 0x0000000080008009
539 .quad 0x000000008000000a
540 .quad 0x000000008000808b
541 .quad 0x800000000000008b
542 .quad 0x8000000000008089
543 .quad 0x8000000000008003
544 .quad 0x8000000000008002
545 .quad 0x8000000000000080
546 .quad 0x000000000000800a
547 .quad 0x800000008000000a
548 .quad 0x8000000080008081
549 .quad 0x8000000000008080
550 .quad 0x0000000080000001
551 .quad 0x8000000080008008
553 .asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
556 # unlike 32-bit shift 64-bit one takes three arguments
557 $code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;