2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for s390x.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # In the nutshell it's transliteration of x86_64 module, because both
26 # architectures have similar capabilities/limitations. Performance
27 # measurement is problematic as I don't have access to an idle system.
28 # It looks like z13 processes one byte [out of long message] in ~14
29 # cycles. At least the result is consistent with estimate based on
30 # amount of instruction and assumed instruction issue rate. It's ~2.5x
31 # faster than compiler-generated code.
35 if ($flavour =~ /3[12]/) {
43 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
44 open STDOUT,">$output";
46 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
48 my @C = map("%r$_",(0,1,5..7));
49 my @D = map("%r$_",(8..12));
50 my @T = map("%r$_",(13..14));
51 my ($src,$dst,$iotas) = map("%r$_",(2..4));
54 $stdframe=16*$SIZE_T+4*8;
55 $frame=$stdframe+25*8;
57 my @rhotates = ([ 0, 1, 62, 28, 27 ],
58 [ 36, 44, 6, 55, 20 ],
59 [ 3, 10, 43, 25, 39 ],
60 [ 41, 45, 15, 21, 8 ],
61 [ 18, 2, 61, 56, 14 ]);
63 { my @C = @C; # copy, because we mess the up...
69 .type __KeccakF1600,\@function
72 st${g} %r14,$SIZE_T*14($sp)
73 lg @C[0],$A[4][0]($src)
74 lg @C[1],$A[4][1]($src)
75 lg @C[2],$A[4][2]($src)
76 lg @C[3],$A[4][3]($src)
77 lg @C[4],$A[4][4]($src)
82 lg @D[0],$A[0][0]($src)
83 lg @D[1],$A[1][1]($src)
84 lg @D[2],$A[2][2]($src)
85 lg @D[3],$A[3][3]($src)
88 xg @C[1],$A[0][1]($src)
89 xg @C[2],$A[0][2]($src)
90 xg @C[3],$A[0][3]($src)
92 xg @C[4],$A[0][4]($src)
94 xg @C[0],$A[1][0]($src)
96 xg @C[2],$A[1][2]($src)
97 xg @C[3],$A[1][3]($src)
98 xg @C[4],$A[1][4]($src)
100 xg @C[0],$A[2][0]($src)
101 xg @C[1],$A[2][1]($src)
103 xg @C[3],$A[2][3]($src)
104 xg @C[4],$A[2][4]($src)
106 xg @C[0],$A[3][0]($src)
107 xg @C[1],$A[3][1]($src)
108 xg @C[2],$A[3][2]($src)
110 xg @C[4],$A[3][4]($src)
114 xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0]
117 xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3]
120 xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1]
123 xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4]
126 xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2]
129 @D = (@C[1],@C[2],@C[3],@C[4],@C[0]);
135 rllg @C[1],@C[1],$rhotates[1][1]
137 rllg @C[2],@C[2],$rhotates[2][2]
142 rllg @C[3],@C[3],$rhotates[3][3]
143 xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2])
144 rllg @C[4],@C[4],$rhotates[4][4]
147 stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
151 lghi @C[1],-1 # no 'not' instruction :-(
152 xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3])
153 xgr @C[2],@C[1] # not @C[2]
154 stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3])
156 xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3])
159 stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3])
160 xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0])
162 stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0])
163 xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0])
164 stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0])
167 lg @C[0],$A[0][3]($src)
168 lg @C[4],$A[4][2]($src)
169 lg @C[3],$A[3][1]($src)
170 lg @C[1],$A[1][4]($src)
171 lg @C[2],$A[2][0]($src)
175 rllg @C[0],@C[0],$rhotates[0][3]
177 rllg @C[4],@C[4],$rhotates[4][2]
179 rllg @C[3],@C[3],$rhotates[3][1]
184 rllg @C[1],@C[1],$rhotates[1][4]
185 xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4])
186 rllg @C[2],@C[2],$rhotates[2][0]
187 stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4])
191 lghi @C[0],-1 # no 'not' instruction :-(
192 xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0])
193 xgr @C[4],@C[0] # not @C[4]
194 stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0])
197 xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3])
200 stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3])
201 xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2])
203 stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2])
204 xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2])
205 stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2])
208 lg @C[2],$A[2][3]($src)
209 lg @C[3],$A[3][4]($src)
210 lg @C[1],$A[1][2]($src)
211 lg @C[4],$A[4][0]($src)
212 lg @C[0],$A[0][1]($src)
216 rllg @C[2],@C[2],$rhotates[2][3]
218 rllg @C[3],@C[3],$rhotates[3][4]
220 rllg @C[1],@C[1],$rhotates[1][2]
225 rllg @C[4],@C[4],$rhotates[4][0]
226 xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3])
227 lghi @T[1],-1 # no 'not' instruction :-(
228 stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3])
230 xgr @C[3],@T[1] # not @C[3]
233 rllg @C[0],@C[0],$rhotates[0][1]
234 xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3])
236 stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
237 xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1])
240 stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1])
241 xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0])
243 stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0])
244 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4])
245 stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
248 lg @C[2],$A[2][1]($src)
249 lg @C[3],$A[3][2]($src)
250 lg @C[1],$A[1][0]($src)
251 lg @C[4],$A[4][3]($src)
252 lg @C[0],$A[0][4]($src)
256 rllg @C[2],@C[2],$rhotates[2][1]
258 rllg @C[3],@C[3],$rhotates[3][2]
260 rllg @C[1],@C[1],$rhotates[1][0]
262 rllg @C[4],@C[4],$rhotates[4][3]
266 lghi @T[1],-1 # no 'not' instruction :-(
267 xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3])
268 xgr @C[3],@T[1] # not @C[3]
269 stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3])
273 rllg @C[0],@C[0],$rhotates[0][4]
274 xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3])
276 stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
277 xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1])
280 stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1])
281 xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0])
283 stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0])
284 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4])
285 stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
288 xg @D[2],$A[0][2]($src)
289 xg @D[3],$A[1][3]($src)
290 xg @D[1],$A[4][1]($src)
291 xg @D[4],$A[2][4]($src)
292 xgr $dst,$src # xchg $dst,$src
293 rllg @D[2],@D[2],$rhotates[0][2]
294 xg @D[0],$A[3][0]($src)
295 rllg @D[3],@D[3],$rhotates[1][3]
297 rllg @D[1],@D[1],$rhotates[4][1]
299 rllg @D[4],@D[4],$rhotates[2][4]
301 @C = (@D[2],@D[3],@D[4],@D[0],@D[1]);
305 lghi @T[1],-1 # no 'not' instruction :-(
306 xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1])
307 xgr @C[1],@T[1] # not @C[1]
308 stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1])
312 rllg @D[0],@D[0],$rhotates[3][0]
313 xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1])
315 stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
316 xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4])
319 stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4])
320 xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3])
322 stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3])
323 xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3])
325 lgr @C[1],@C[0] # harmonize with the loop top
327 stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
332 l${g} %r14,$SIZE_T*14($sp)
334 .size __KeccakF1600,.-__KeccakF1600
340 .type KeccakF1600,\@function
345 stm${g} %r6,%r15,$SIZE_T*6($sp)
350 lghi @D[0],-1 # no 'not' instruction :-(
356 xg @D[0],$A[0][1]($src)
357 xg @D[1],$A[0][2]($src)
358 xg @D[2],$A[1][3]($src)
359 xg @D[3],$A[2][2]($src)
360 xg @D[4],$A[3][2]($src)
361 xg @T[0],$A[4][0]($src)
362 stg @D[0],$A[0][1]($src)
363 stg @D[1],$A[0][2]($src)
364 stg @D[2],$A[1][3]($src)
365 stg @D[3],$A[2][2]($src)
366 stg @D[4],$A[3][2]($src)
367 stg @T[0],$A[4][0]($src)
369 la $dst,$stdframe($sp)
372 bras %r14,__KeccakF1600
374 lghi @D[0],-1 # no 'not' instruction :-(
380 xg @D[0],$A[0][1]($src)
381 xg @D[1],$A[0][2]($src)
382 xg @D[2],$A[1][3]($src)
383 xg @D[3],$A[2][2]($src)
384 xg @D[4],$A[3][2]($src)
385 xg @T[0],$A[4][0]($src)
386 stg @D[0],$A[0][1]($src)
387 stg @D[1],$A[0][2]($src)
388 stg @D[2],$A[1][3]($src)
389 stg @D[3],$A[2][2]($src)
390 stg @D[4],$A[3][2]($src)
391 stg @T[0],$A[4][0]($src)
393 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
395 .size KeccakF1600,.-KeccakF1600
398 { my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));
402 .type SHA3_absorb,\@function
406 stm${g} %r5,%r15,$SIZE_T*5($sp)
411 lghi @D[0],-1 # no 'not' instruction :-(
417 xg @D[0],$A[0][1]($src)
418 xg @D[1],$A[0][2]($src)
419 xg @D[2],$A[1][3]($src)
420 xg @D[3],$A[2][2]($src)
421 xg @D[4],$A[3][2]($src)
422 xg @T[0],$A[4][0]($src)
423 stg @D[0],$A[0][1]($src)
424 stg @D[1],$A[0][2]($src)
425 stg @D[2],$A[1][3]($src)
426 stg @D[3],$A[2][2]($src)
427 stg @D[4],$A[3][2]($src)
428 stg @T[0],$A[4][0]($src)
444 brct $bsz,.Lblock_absorb
446 stm${g} $inp,$len,$frame+3*$SIZE_T($sp)
447 la $dst,$stdframe($sp)
449 bras %r14,__KeccakF1600
450 lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp)
455 lghi @D[0],-1 # no 'not' instruction :-(
461 xg @D[0],$A[0][1]($src)
462 xg @D[1],$A[0][2]($src)
463 xg @D[2],$A[1][3]($src)
464 xg @D[3],$A[2][2]($src)
465 xg @D[4],$A[3][2]($src)
466 xg @T[0],$A[4][0]($src)
467 stg @D[0],$A[0][1]($src)
468 stg @D[1],$A[0][2]($src)
469 stg @D[2],$A[1][3]($src)
470 stg @D[3],$A[2][2]($src)
471 stg @D[4],$A[3][2]($src)
472 stg @T[0],$A[4][0]($src)
474 lgr %r2,$len # return value
476 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
478 .size SHA3_absorb,.-SHA3_absorb
481 { my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));
485 .type SHA3_squeeze,\@function
489 st${g} %r14,2*$SIZE_T($sp)
491 st${g} $bsz,5*$SIZE_T($sp)
505 a${g}hi $len,-8 # len -= 8
508 brct $bsz,.Loop_squeeze # bsz--
510 stm${g} $out,$len,3*$SIZE_T($sp)
511 bras %r14,.LKeccakF1600
512 lm${g} $out,$bsz,3*$SIZE_T($sp)
523 brct $len,.Loop_tail_squeeze
526 l${g} %r14,2*$SIZE_T($sp)
528 .size SHA3_squeeze,.-SHA3_squeeze
533 .quad 0,0,0,0,0,0,0,0
536 .quad 0x0000000000000001
537 .quad 0x0000000000008082
538 .quad 0x800000000000808a
539 .quad 0x8000000080008000
540 .quad 0x000000000000808b
541 .quad 0x0000000080000001
542 .quad 0x8000000080008081
543 .quad 0x8000000000008009
544 .quad 0x000000000000008a
545 .quad 0x0000000000000088
546 .quad 0x0000000080008009
547 .quad 0x000000008000000a
548 .quad 0x000000008000808b
549 .quad 0x800000000000008b
550 .quad 0x8000000000008089
551 .quad 0x8000000000008003
552 .quad 0x8000000000008002
553 .quad 0x8000000000000080
554 .quad 0x000000000000800a
555 .quad 0x800000008000000a
556 .quad 0x8000000080008081
557 .quad 0x8000000000008080
558 .quad 0x0000000080000001
559 .quad 0x8000000080008008
561 .asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
564 # unlike 32-bit shift 64-bit one takes three arguments
565 $code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;