2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for s390x.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # In the nutshell it's transliteration of x86_64 module, because both
26 # architectures have similar capabilities/limitations. Performance
27 # measurement is problematic as I don't have access to an idle system.
28 # It looks like z13 processes one byte [out of long message] in ~14
29 # cycles. At least the result is consistent with estimate based on
30 # amount of instruction and assumed instruction issue rate. It's ~2.5x
31 # faster than compiler-generated code.
33 # $output is the last argument if it looks like a file (it has an extension)
34 # $flavour is the first argument if it doesn't look like a file
35 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
36 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
38 if ($flavour =~ /3[12]/) {
46 $output and open STDOUT,">$output";
48 my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
50 my @C = map("%r$_",(0,1,5..7));
51 my @D = map("%r$_",(8..12));
52 my @T = map("%r$_",(13..14));
53 my ($src,$dst,$iotas) = map("%r$_",(2..4));
56 $stdframe=16*$SIZE_T+4*8;
57 $frame=$stdframe+25*8;
59 my @rhotates = ([ 0, 1, 62, 28, 27 ],
60 [ 36, 44, 6, 55, 20 ],
61 [ 3, 10, 43, 25, 39 ],
62 [ 41, 45, 15, 21, 8 ],
63 [ 18, 2, 61, 56, 14 ]);
65 { my @C = @C; # copy, because we mess them up...
71 .type __KeccakF1600,\@function
74 st${g} %r14,$SIZE_T*14($sp)
75 lg @C[0],$A[4][0]($src)
76 lg @C[1],$A[4][1]($src)
77 lg @C[2],$A[4][2]($src)
78 lg @C[3],$A[4][3]($src)
79 lg @C[4],$A[4][4]($src)
85 lg @D[0],$A[0][0]($src)
86 lg @D[1],$A[1][1]($src)
87 lg @D[2],$A[2][2]($src)
88 lg @D[3],$A[3][3]($src)
91 xg @C[1],$A[0][1]($src)
92 xg @C[2],$A[0][2]($src)
93 xg @C[3],$A[0][3]($src)
95 xg @C[4],$A[0][4]($src)
97 xg @C[0],$A[1][0]($src)
99 xg @C[2],$A[1][2]($src)
100 xg @C[3],$A[1][3]($src)
101 xg @C[4],$A[1][4]($src)
103 xg @C[0],$A[2][0]($src)
104 xg @C[1],$A[2][1]($src)
106 xg @C[3],$A[2][3]($src)
107 xg @C[4],$A[2][4]($src)
109 xg @C[0],$A[3][0]($src)
110 xg @C[1],$A[3][1]($src)
111 xg @C[2],$A[3][2]($src)
113 xg @C[4],$A[3][4]($src)
117 xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0]
120 xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3]
123 xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1]
126 xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4]
129 xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2]
131 (@D[0..4], @C) = (@C[1..4,0], @D);
136 rllg @C[1],@C[1],$rhotates[1][1]
138 rllg @C[2],@C[2],$rhotates[2][2]
143 rllg @C[3],@C[3],$rhotates[3][3]
144 xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2])
145 rllg @C[4],@C[4],$rhotates[4][4]
148 stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
152 lghi @C[1],-1 # no 'not' instruction :-(
153 xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3])
154 xgr @C[2],@C[1] # not @C[2]
155 stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3])
157 xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3])
160 stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3])
161 xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0])
163 stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0])
164 xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0])
165 stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0])
168 lg @C[0],$A[0][3]($src)
169 lg @C[4],$A[4][2]($src)
170 lg @C[3],$A[3][1]($src)
171 lg @C[1],$A[1][4]($src)
172 lg @C[2],$A[2][0]($src)
176 rllg @C[0],@C[0],$rhotates[0][3]
178 rllg @C[4],@C[4],$rhotates[4][2]
180 rllg @C[3],@C[3],$rhotates[3][1]
185 rllg @C[1],@C[1],$rhotates[1][4]
186 xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4])
187 rllg @C[2],@C[2],$rhotates[2][0]
188 stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4])
192 lghi @C[0],-1 # no 'not' instruction :-(
193 xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0])
194 xgr @C[4],@C[0] # not @C[4]
195 stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0])
198 xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3])
201 stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3])
202 xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2])
204 stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2])
205 xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2])
206 stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2])
209 lg @C[2],$A[2][3]($src)
210 lg @C[3],$A[3][4]($src)
211 lg @C[1],$A[1][2]($src)
212 lg @C[4],$A[4][0]($src)
213 lg @C[0],$A[0][1]($src)
217 rllg @C[2],@C[2],$rhotates[2][3]
219 rllg @C[3],@C[3],$rhotates[3][4]
221 rllg @C[1],@C[1],$rhotates[1][2]
226 rllg @C[4],@C[4],$rhotates[4][0]
227 xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3])
228 lghi @T[1],-1 # no 'not' instruction :-(
229 stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3])
231 xgr @C[3],@T[1] # not @C[3]
234 rllg @C[0],@C[0],$rhotates[0][1]
235 xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3])
237 stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
238 xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1])
241 stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1])
242 xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0])
244 stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0])
245 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4])
246 stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
249 lg @C[2],$A[2][1]($src)
250 lg @C[3],$A[3][2]($src)
251 lg @C[1],$A[1][0]($src)
252 lg @C[4],$A[4][3]($src)
253 lg @C[0],$A[0][4]($src)
257 rllg @C[2],@C[2],$rhotates[2][1]
259 rllg @C[3],@C[3],$rhotates[3][2]
261 rllg @C[1],@C[1],$rhotates[1][0]
263 rllg @C[4],@C[4],$rhotates[4][3]
267 lghi @T[1],-1 # no 'not' instruction :-(
268 xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3])
269 xgr @C[3],@T[1] # not @C[3]
270 stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3])
274 rllg @C[0],@C[0],$rhotates[0][4]
275 xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3])
277 stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
278 xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1])
281 stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1])
282 xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0])
284 stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0])
285 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4])
286 stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
289 xg @D[2],$A[0][2]($src)
290 xg @D[3],$A[1][3]($src)
291 xg @D[1],$A[4][1]($src)
292 xg @D[4],$A[2][4]($src)
293 xgr $dst,$src # xchg $dst,$src
294 rllg @D[2],@D[2],$rhotates[0][2]
295 xg @D[0],$A[3][0]($src)
296 rllg @D[3],@D[3],$rhotates[1][3]
298 rllg @D[1],@D[1],$rhotates[4][1]
300 rllg @D[4],@D[4],$rhotates[2][4]
306 lghi @T[1],-1 # no 'not' instruction :-(
307 xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1])
308 xgr @C[1],@T[1] # not @C[1]
309 stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1])
313 rllg @D[0],@D[0],$rhotates[3][0]
314 xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1])
316 stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
317 xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4])
320 stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4])
321 xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3])
323 stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3])
324 xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3])
326 lgr @C[1],@C[0] # harmonize with the loop top
328 stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
333 l${g} %r14,$SIZE_T*14($sp)
335 .size __KeccakF1600,.-__KeccakF1600
340 .type KeccakF1600,\@function
345 stm${g} %r6,%r15,$SIZE_T*6($sp)
350 lghi @D[0],-1 # no 'not' instruction :-(
356 xg @D[0],$A[0][1]($src)
357 xg @D[1],$A[0][2]($src)
358 xg @D[2],$A[1][3]($src)
359 xg @D[3],$A[2][2]($src)
360 xg @D[4],$A[3][2]($src)
361 xg @T[0],$A[4][0]($src)
362 stmg @D[0],@D[1],$A[0][1]($src)
363 stg @D[2],$A[1][3]($src)
364 stg @D[3],$A[2][2]($src)
365 stg @D[4],$A[3][2]($src)
366 stg @T[0],$A[4][0]($src)
368 la $dst,$stdframe($sp)
370 bras %r14,__KeccakF1600
372 lghi @D[0],-1 # no 'not' instruction :-(
378 xg @D[0],$A[0][1]($src)
379 xg @D[1],$A[0][2]($src)
380 xg @D[2],$A[1][3]($src)
381 xg @D[3],$A[2][2]($src)
382 xg @D[4],$A[3][2]($src)
383 xg @T[0],$A[4][0]($src)
384 stmg @D[0],@D[1],$A[0][1]($src)
385 stg @D[2],$A[1][3]($src)
386 stg @D[3],$A[2][2]($src)
387 stg @D[4],$A[3][2]($src)
388 stg @T[0],$A[4][0]($src)
390 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
392 .size KeccakF1600,.-KeccakF1600
395 { my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));
399 .type SHA3_absorb,\@function
403 stm${g} %r5,%r15,$SIZE_T*5($sp)
408 lghi @D[0],-1 # no 'not' instruction :-(
414 xg @D[0],$A[0][1]($src)
415 xg @D[1],$A[0][2]($src)
416 xg @D[2],$A[1][3]($src)
417 xg @D[3],$A[2][2]($src)
418 xg @D[4],$A[3][2]($src)
419 xg @T[0],$A[4][0]($src)
420 stmg @D[0],@D[1],$A[0][1]($src)
421 stg @D[2],$A[1][3]($src)
422 stg @D[3],$A[2][2]($src)
423 stg @D[4],$A[3][2]($src)
424 stg @T[0],$A[4][0]($src)
440 brct $bsz,.Lblock_absorb
442 stm${g} $inp,$len,$frame+3*$SIZE_T($sp)
443 la $dst,$stdframe($sp)
444 bras %r14,__KeccakF1600
445 lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp)
450 lghi @D[0],-1 # no 'not' instruction :-(
456 xg @D[0],$A[0][1]($src)
457 xg @D[1],$A[0][2]($src)
458 xg @D[2],$A[1][3]($src)
459 xg @D[3],$A[2][2]($src)
460 xg @D[4],$A[3][2]($src)
461 xg @T[0],$A[4][0]($src)
462 stmg @D[0],@D[1],$A[0][1]($src)
463 stg @D[2],$A[1][3]($src)
464 stg @D[3],$A[2][2]($src)
465 stg @D[4],$A[3][2]($src)
466 stg @T[0],$A[4][0]($src)
468 lgr %r2,$len # return value
470 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
472 .size SHA3_absorb,.-SHA3_absorb
475 { my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));
479 .type SHA3_squeeze,\@function
483 st${g} %r14,2*$SIZE_T($sp)
485 st${g} $bsz,5*$SIZE_T($sp)
499 a${g}hi $len,-8 # len -= 8
502 brct $bsz,.Loop_squeeze # bsz--
504 stm${g} $out,$len,3*$SIZE_T($sp)
505 bras %r14,.LKeccakF1600
506 lm${g} $out,$bsz,3*$SIZE_T($sp)
517 brct $len,.Loop_tail_squeeze
520 l${g} %r14,2*$SIZE_T($sp)
522 .size SHA3_squeeze,.-SHA3_squeeze
527 .quad 0,0,0,0,0,0,0,0
530 .quad 0x0000000000000001
531 .quad 0x0000000000008082
532 .quad 0x800000000000808a
533 .quad 0x8000000080008000
534 .quad 0x000000000000808b
535 .quad 0x0000000080000001
536 .quad 0x8000000080008081
537 .quad 0x8000000000008009
538 .quad 0x000000000000008a
539 .quad 0x0000000000000088
540 .quad 0x0000000080008009
541 .quad 0x000000008000000a
542 .quad 0x000000008000808b
543 .quad 0x800000000000008b
544 .quad 0x8000000000008089
545 .quad 0x8000000000008003
546 .quad 0x8000000000008002
547 .quad 0x8000000000000080
548 .quad 0x000000000000800a
549 .quad 0x800000008000000a
550 .quad 0x8000000080008081
551 .quad 0x8000000000008080
552 .quad 0x0000000080000001
553 .quad 0x8000000080008008
555 .asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
558 # unlike 32-bit shift 64-bit one takes three arguments
559 $code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;