2 # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3 # Copyright 2021- IBM Inc. All rights reserved
5 # Licensed under the Apache License 2.0 (the "License"). You may not use
6 # this file except in compliance with the License. You can obtain a copy
7 # in the file LICENSE in the source distribution or at
8 # https://www.openssl.org/source/license.html
10 #===================================================================================
11 # Written by Danny Tsen <dtsen@us.ibm.com> for OpenSSL Project,
13 # GHASH is based on the Karatsuba multiplication method.
17 # X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
18 # (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
19 # (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
20 # (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
21 # (X4.h * H.h + X4.l * H.l + X4 * H)
25 # Hash keys = v3 - v14
27 # ( H^2.l, H^2, H^2.h)
28 # ( H^3.l, H^3, H^3.h)
29 # ( H^4.l, H^4, H^4.h)
35 # vs0 - vs14 for round keys
36 # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
38 # This implementation uses a stitched AES-GCM approach to improve overall performance.
39 # AES is implemented with 8x blocks and GHASH uses two 4x blocks.
41 # Current large block (16384 bytes) performance per second with 128 bit key --
44 # Power10[le] (3.5GHz) 5.32G 5.26G
46 # ===================================================================================
48 # $output is the last argument if it looks like a file (it has an extension)
49 # $flavour is the first argument if it doesn't look like a file
50 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
51 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
53 if ($flavour =~ /64/) {
61 } elsif ($flavour =~ /32/) {
69 } else { die "nonsense $flavour"; }
72 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
74 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
75 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
76 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
77 die "can't locate ppc-xlate.pl";
79 open STDOUT,"| $^X $xlate $flavour \"$output\""
80 or die "can't call $xlate: $!";
87 # v15 - v18 - input states
88 # vs1 - vs9 - round keys
90 .macro Loop_aes_middle4x
149 # v15 - v22 - input states
150 # vs1 - vs9 - round keys
152 .macro Loop_aes_middle8x
247 # Compute 4x hash values based on Karatsuba method.
254 vpmsumd 23, 12, 15 # H4.L * X.L
263 vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
264 vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
272 # sum hash and reduction with H Poly
273 vpmsumd 28, 23, 2 # reduction
276 vsldoi 26, 24, 29, 8 # mL
277 vsldoi 29, 29, 24, 8 # mH
278 vxor 23, 23, 26 # mL + L
280 vsldoi 23, 23, 23, 8 # swap
283 vpmsumd 24, 14, 15 # H4.H * X.H
294 # sum hash and reduction with H Poly
295 vsldoi 27, 23, 23, 8 # swap
300 xxlor 32, 23+32, 23+32 # update hash
305 # Combine two 4x ghash
306 # v15 - v22 - input blocks
308 .macro ppc_aes_gcm_ghash2_4x
310 vxor 15, 15, 0 # Xi + X
314 vpmsumd 23, 12, 15 # H4.L * X.L
323 vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
324 vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
331 # sum hash and reduction with H Poly
332 vpmsumd 28, 23, 2 # reduction
337 vsldoi 26, 24, 29, 8 # mL
338 vsldoi 29, 29, 24, 8 # mH
339 vxor 23, 23, 26 # mL + L
341 vsldoi 23, 23, 23, 8 # swap
344 vpmsumd 24, 14, 15 # H4.H * X.H
353 vxor 24, 24, 29 # H + mH
355 # sum hash and reduction with H Poly
356 vsldoi 27, 23, 23, 8 # swap
359 vxor 27, 23, 27 # 1st Xi
365 vxor 19, 19, 27 # Xi + X
366 vpmsumd 23, 12, 19 # H4.L * X.L
372 vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
373 vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
380 # sum hash and reduction with H Poly
381 vpmsumd 28, 23, 2 # reduction
386 vsldoi 26, 24, 29, 8 # mL
387 vsldoi 29, 29, 24, 8 # mH
388 vxor 23, 23, 26 # mL + L
390 vsldoi 23, 23, 23, 8 # swap
393 vpmsumd 24, 14, 19 # H4.H * X.H
402 vxor 24, 24, 29 # H + mH
404 # sum hash and reduction with H Poly
405 vsldoi 27, 23, 23, 8 # swap
410 xxlor 32, 23+32, 23+32 # update hash
415 # Compute update single hash
417 .macro ppc_update_hash_1x
422 vpmsumd 22, 3, 28 # L
423 vpmsumd 23, 4, 28 # M
424 vpmsumd 24, 5, 28 # H
426 vpmsumd 27, 22, 2 # reduction
428 vsldoi 25, 23, 19, 8 # mL
429 vsldoi 26, 19, 23, 8 # mH
430 vxor 22, 22, 25 # LL + LL
431 vxor 24, 24, 26 # HH + HH
433 vsldoi 22, 22, 22, 8 # swap
436 vsldoi 20, 22, 22, 8 # swap
437 vpmsumd 22, 22, 2 # reduction
441 vmr 0, 22 # update hash
446 # ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
447 # const AES_KEY *key, unsigned char iv[16],
453 # r6 - AES round keys
455 # r8 - Xi, HPoli, hash keys
457 .global ppc_aes_gcm_encrypt
460 _ppc_aes_gcm_encrypt:
500 lxvb16x 32, 0, 8 # load Xi
502 # load Hash - h^4, h^3, h^2, h
504 lxvd2x 2+32, 10, 8 # H Poli
506 lxvd2x 3+32, 10, 8 # Hl
508 lxvd2x 4+32, 10, 8 # H
510 lxvd2x 5+32, 10, 8 # Hh
513 lxvd2x 6+32, 10, 8 # H^2l
515 lxvd2x 7+32, 10, 8 # H^2
517 lxvd2x 8+32, 10, 8 # H^2h
520 lxvd2x 9+32, 10, 8 # H^3l
522 lxvd2x 10+32, 10, 8 # H^3
524 lxvd2x 11+32, 10, 8 # H^3h
527 lxvd2x 12+32, 10, 8 # H^4l
529 lxvd2x 13+32, 10, 8 # H^4
531 lxvd2x 14+32, 10, 8 # H^4h
533 # initialize ICB: GHASH( IV ), IV - r7
534 lxvb16x 30+32, 0, 7 # load IV - v30
537 li 11, 0 # block index
542 vsldoi 31, 31, 22,1 # counter 1
544 # load round key to VSR
557 # load rounds - 10 (128), 12 (192), 14 (256)
561 # vxor state, state, w # addroundkey
563 vxor 15, 30, 29 # IV + round key - add round key 0
568 # load 2 more round keys (v11, v12)
575 # load 2 more round keys (v11, v12, v13, v14)
590 divdu 10, 5, 10 # n 128 bytes-blocks
594 vaddudm 30, 30, 31 # IV + counter
623 lxvb16x 15, 0, 14 # load block
624 lxvb16x 16, 15, 14 # load block
625 lxvb16x 17, 16, 14 # load block
626 lxvb16x 18, 17, 14 # load block
627 lxvb16x 19, 18, 14 # load block
628 lxvb16x 20, 19, 14 # load block
629 lxvb16x 21, 20, 14 # load block
630 lxvb16x 22, 21, 14 # load block
697 vcipherlast 15, 15, 23
698 vcipherlast 16, 16, 23
701 stxvb16x 47, 0, 9 # store output
703 stxvb16x 48, 15, 9 # store output
705 vcipherlast 17, 17, 23
706 vcipherlast 18, 18, 23
709 stxvb16x 49, 16, 9 # store output
711 stxvb16x 50, 17, 9 # store output
713 vcipherlast 19, 19, 23
714 vcipherlast 20, 20, 23
717 stxvb16x 51, 18, 9 # store output
719 stxvb16x 52, 19, 9 # store output
721 vcipherlast 21, 21, 23
722 vcipherlast 22, 22, 23
725 stxvb16x 53, 20, 9 # store output
727 stxvb16x 54, 21, 9 # store output
732 ppc_aes_gcm_ghash2_4x
735 vaddudm 30, 30, 31 # IV + counter
737 vxor 15, 30, 27 # add round key
764 # loop last few blocks
775 .macro Loop_aes_middle_1x
801 lxvb16x 15, 0, 14 # load block
833 vcipherlast 15, 15, 23
836 stxvb16x 47, 0, 9 # store output
846 vaddudm 30, 30, 31 # IV + counter
847 vxor 15, 30, 19 # add round key
885 vcipherlast 15, 15, 23
887 lxvb16x 15, 0, 14 # load last block
890 # create partial block mask
892 sub 15, 15, 12 # index to the mask
894 vspltisb 16, -1 # first 16 bytes - 0xffff...ff
895 vspltisb 17, 0 # second 16 bytes - 0x0000...00
902 lxvb16x 16, 15, 10 # load partial block mask
908 # * should store only the remaining bytes.
909 bl Write_partial_block
914 # Write partial block
916 # r12 - remaining bytes
917 # v15 - partial input data
921 stxvb16x 15+32, 10, 1 # last block
923 #add 10, 9, 11 # Output
927 mtctr 12 # remaining bytes
938 stxvb16x 32, 0, 8 # write out Xi
939 add 3, 11, 12 # return count
983 .global ppc_aes_gcm_decrypt
986 _ppc_aes_gcm_decrypt:
1026 lxvb16x 32, 0, 8 # load Xi
1028 # load Hash - h^4, h^3, h^2, h
1030 lxvd2x 2+32, 10, 8 # H Poli
1032 lxvd2x 3+32, 10, 8 # Hl
1034 lxvd2x 4+32, 10, 8 # H
1036 lxvd2x 5+32, 10, 8 # Hh
1039 lxvd2x 6+32, 10, 8 # H^2l
1041 lxvd2x 7+32, 10, 8 # H^2
1043 lxvd2x 8+32, 10, 8 # H^2h
1046 lxvd2x 9+32, 10, 8 # H^3l
1048 lxvd2x 10+32, 10, 8 # H^3
1050 lxvd2x 11+32, 10, 8 # H^3h
1053 lxvd2x 12+32, 10, 8 # H^4l
1055 lxvd2x 13+32, 10, 8 # H^4
1057 lxvd2x 14+32, 10, 8 # H^4h
1059 # initialize ICB: GHASH( IV ), IV - r7
1060 lxvb16x 30+32, 0, 7 # load IV - v30
1063 li 11, 0 # block index
1068 vsldoi 31, 31, 22,1 # counter 1
1070 # load round key to VSR
1083 # load rounds - 10 (128), 12 (192), 14 (256)
1087 # vxor state, state, w # addroundkey
1089 vxor 15, 30, 29 # IV + round key - add round key 0
1092 beq Loop_aes_gcm_8x_dec
1094 # load 2 more round keys (v11, v12)
1099 beq Loop_aes_gcm_8x_dec
1101 # load 2 more round keys (v11, v12, v13, v14)
1105 beq Loop_aes_gcm_8x_dec
1110 Loop_aes_gcm_8x_dec:
1116 divdu 10, 5, 10 # n 128 bytes-blocks
1118 beq Loop_last_block_dec
1120 vaddudm 30, 30, 31 # IV + counter
1149 lxvb16x 15, 0, 14 # load block
1150 lxvb16x 16, 15, 14 # load block
1151 lxvb16x 17, 16, 14 # load block
1152 lxvb16x 18, 17, 14 # load block
1153 lxvb16x 19, 18, 14 # load block
1154 lxvb16x 20, 19, 14 # load block
1155 lxvb16x 21, 20, 14 # load block
1156 lxvb16x 22, 21, 14 # load block
1223 vcipherlast 15, 15, 23
1224 vcipherlast 16, 16, 23
1227 stxvb16x 47, 0, 9 # store output
1229 stxvb16x 48, 15, 9 # store output
1231 vcipherlast 17, 17, 23
1232 vcipherlast 18, 18, 23
1235 stxvb16x 49, 16, 9 # store output
1237 stxvb16x 50, 17, 9 # store output
1239 vcipherlast 19, 19, 23
1240 vcipherlast 20, 20, 23
1243 stxvb16x 51, 18, 9 # store output
1245 stxvb16x 52, 19, 9 # store output
1247 vcipherlast 21, 21, 23
1248 vcipherlast 22, 22, 23
1251 stxvb16x 53, 20, 9 # store output
1253 stxvb16x 54, 21, 9 # store output
1267 ppc_aes_gcm_ghash2_4x
1270 vaddudm 30, 30, 31 # IV + counter
1272 vxor 15, 30, 27 # add round key
1290 bdnz Loop_8x_block_dec
1294 Loop_last_block_dec:
1298 # loop last few blocks
1310 lxvb16x 15, 0, 14 # load block
1342 vcipherlast 15, 15, 23
1345 stxvb16x 47, 0, 9 # store output
1355 vaddudm 30, 30, 31 # IV + counter
1356 vxor 15, 30, 19 # add round key
1358 bdnz Next_rem_block_dec
1394 vcipherlast 15, 15, 23
1396 lxvb16x 15, 0, 14 # load block
1399 # create partial block mask
1401 sub 15, 15, 12 # index to the mask
1403 vspltisb 16, -1 # first 16 bytes - 0xffff...ff
1404 vspltisb 17, 0 # second 16 bytes - 0x0000...00
1411 lxvb16x 16, 15, 10 # load block mask
1417 # * should store only the remaining bytes.
1418 bl Write_partial_block
1425 foreach (split("\n",$code)) {
1426 s/\`([^\`]*)\`/eval $1/geo;
1428 if ($flavour =~ /le$/o) { # little-endian
1438 close STDOUT or die "error closing STDOUT: $!"; # enforce flush