2 # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # ECP_NISTZ256 module for PPC64.
21 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22 # http://eprint.iacr.org/2013/816.
24 # with/without -DECP_NISTZ256_ASM
28 # $output is the last argument if it looks like a file (it has an extension)
29 # $flavour is the first argument if it doesn't look like a file
# Pull the perlasm "flavour" (e.g. linux64, linux64le) and the output file
# name off @ARGV; either may be absent, in which case it stays undef.
30 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
31 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Locate the ppc-xlate.pl translator either next to this script or in the
# shared perlasm directory; it rewrites the generated code into the syntax
# required by the selected flavour.
33 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
34 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
35 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
36 die "can't locate ppc-xlate.pl";
# Everything printed to OUT is piped through the translator; $^X is the
# perl interpreter currently running this script.
38 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
39 or die "can't call $xlate: $!";
# Register aliases: r3..r12 carry the ABI argument registers (out/in
# pointers, b[i] word, accumulators), r22..r31 hold the pre-loaded
# operand words a[0-3] and scratch temporaries t0-t3.  $poly1 and $poly3
# hold the two non-trivial words of the P-256 modulus
# (0x00000000ffffffff and 0xffffffff00000001, see the srdi/orc setup in
# the entry points below).
45 my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
46 $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
47 map("r$_",(3..12,22..31));
# Squaring needs eight accumulator words but no b pointer/word, so $bp
# and $bi are reused as the two extra accumulators there.
49 my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont
55 ########################################################################
56 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
58 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Read the C table of precomputed generator multiples, trying the current
# directory first and then the parent of this script's directory.
59 open TABLE,"<ecp_nistz256_table.c" or
60 open TABLE,"<${dir}../ecp_nistz256_table.c" or
61 die "failed to open ecp_nistz256_table.c:",$!;
66 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
70 # See ecp_nistz256_table.c for explanation for why it's 64*16*37.
71 # 64*16*37-1 is because $#arr returns last valid index of @arr, not
73 die "insane number of elements" if ($#arr != 64*16*37-1);
76 .type ecp_nistz256_precomputed,\@object
77 .globl ecp_nistz256_precomputed
79 ecp_nistz256_precomputed:
81 ########################################################################
82 # this conversion smashes P256_POINT_AFFINE by individual bytes with
83 # 64 byte interval, similar to
87 @tbl = splice(@arr,0,64*16);
88 for($i=0;$i<64;$i++) {
90 for($j=0;$j<64;$j++) {
91 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
94 $code.=join(',',map { sprintf "0x%02x",$_} @line);
100 .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
101 .asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
103 # void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
104 # const BN_ULONG x2[4]);
105 .globl ecp_nistz256_mul_mont
107 ecp_nistz256_mul_mont:
128 srdi $poly1,$poly1,32 # 0x00000000ffffffff
130 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
132 bl __ecp_nistz256_mul_mont
148 .byte 0,12,4,0,0x80,10,3,0
150 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
152 # void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
153 .globl ecp_nistz256_sqr_mont
155 ecp_nistz256_sqr_mont:
175 srdi $poly1,$poly1,32 # 0x00000000ffffffff
177 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
179 bl __ecp_nistz256_sqr_mont
195 .byte 0,12,4,0,0x80,10,2,0
197 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
199 # void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
200 # const BN_ULONG x2[4]);
201 .globl ecp_nistz256_add
221 srdi $poly1,$poly1,32 # 0x00000000ffffffff
223 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
225 bl __ecp_nistz256_add
235 .byte 0,12,4,0,0x80,4,3,0
237 .size ecp_nistz256_add,.-ecp_nistz256_add
239 # void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
240 .globl ecp_nistz256_div_by_2
242 ecp_nistz256_div_by_2:
256 srdi $poly1,$poly1,32 # 0x00000000ffffffff
258 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
260 bl __ecp_nistz256_div_by_2
270 .byte 0,12,4,0,0x80,4,2,0
272 .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
274 # void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
275 .globl ecp_nistz256_mul_by_2
277 ecp_nistz256_mul_by_2:
296 srdi $poly1,$poly1,32 # 0x00000000ffffffff
298 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
300 bl __ecp_nistz256_add # ret = a+a // 2*a
310 .byte 0,12,4,0,0x80,4,3,0
312 .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
314 # void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
315 .globl ecp_nistz256_mul_by_3
317 ecp_nistz256_mul_by_3:
340 srdi $poly1,$poly1,32 # 0x00000000ffffffff
342 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
344 bl __ecp_nistz256_add # ret = a+a // 2*a
351 bl __ecp_nistz256_add # ret += a // 2*a+a=3*a
361 .byte 0,12,4,0,0x80,4,2,0
363 .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
365 # void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
366 # const BN_ULONG x2[4]);
367 .globl ecp_nistz256_sub
383 srdi $poly1,$poly1,32 # 0x00000000ffffffff
385 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
387 bl __ecp_nistz256_sub_from
397 .byte 0,12,4,0,0x80,4,3,0
399 .size ecp_nistz256_sub,.-ecp_nistz256_sub
401 # void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
402 .globl ecp_nistz256_neg
419 srdi $poly1,$poly1,32 # 0x00000000ffffffff
421 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
423 bl __ecp_nistz256_sub_from
433 .byte 0,12,4,0,0x80,4,2,0
435 .size ecp_nistz256_neg,.-ecp_nistz256_neg
437 # note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
438 # to $a0-$a3 and b[0] - to $bi
439 .type __ecp_nistz256_mul_mont,\@function
441 __ecp_nistz256_mul_mont:
442 mulld $acc0,$a0,$bi # a[0]*b[0]
445 mulld $acc1,$a1,$bi # a[1]*b[0]
448 mulld $acc2,$a2,$bi # a[2]*b[0]
451 mulld $acc3,$a3,$bi # a[3]*b[0]
455 addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
463 for($i=1;$i<4;$i++) {
464 ################################################################
465 # Reduction iteration is normally performed by accumulating
466 # result of multiplication of modulus by "magic" digit [and
467 # omitting least significant word, which is guaranteed to
468 # be 0], but thanks to special form of modulus and "magic"
469 # digit being equal to least significant word, it can be
470 # performed with additions and subtractions alone. Indeed:
472 # ffff0001.00000000.0000ffff.ffffffff
474 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
476 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
479 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
480 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
481 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
483 # or marking redundant operations:
485 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
486 # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
487 # - 0000abcd.efgh0000.--------.--------.--------
490 subfc $t2,$t0,$acc0 # "*0xffff0001"
492 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
494 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
498 mulld $t0,$a0,$bi # lo(a[0]*b[i])
499 mulld $t1,$a1,$bi # lo(a[1]*b[i])
500 mulld $t2,$a2,$bi # lo(a[2]*b[i])
501 mulld $t3,$a3,$bi # lo(a[3]*b[i])
502 addc $acc0,$acc0,$t0 # accumulate low parts of multiplication
503 mulhdu $t0,$a0,$bi # hi(a[0]*b[i])
505 mulhdu $t1,$a1,$bi # hi(a[1]*b[i])
507 mulhdu $t2,$a2,$bi # hi(a[2]*b[i])
509 mulhdu $t3,$a3,$bi # hi(a[3]*b[i])
512 $code.=<<___ if ($i<3);
513 ld $bi,8*($i+1)($bp) # b[$i+1]
516 addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
528 subfc $t2,$t0,$acc0 # "*0xffff0001"
530 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
532 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
537 addic $acc0,$acc0,1 # ret -= modulus
538 subfe $acc1,$poly1,$acc1
539 subfe $acc2,$t2,$acc2
540 subfe $acc3,$poly3,$acc3
541 subfe $acc4,$t2,$acc4
543 addc $acc0,$acc0,$acc4 # ret += modulus if borrow
557 .byte 0,12,0x14,0,0,0,1,0
559 .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
561 # note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
563 .type __ecp_nistz256_sqr_mont,\@function
565 __ecp_nistz256_sqr_mont:
566 ################################################################
567 # | | | | | |a1*a0| |
568 # | | | | |a2*a0| | |
569 # | |a3*a2|a3*a0| | | |
570 # | | | |a2*a1| | | |
571 # | | |a3*a1| | | | |
572 # *| | | | | | | | 2|
573 # +|a3*a3|a2*a2|a1*a1|a0*a0|
574 # |--+--+--+--+--+--+--+--|
575 # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
577 # "can't overflow" below mark carrying into high part of
578 # multiplication result, which can't overflow, because it
579 # can never be all ones.
581 mulld $acc1,$a1,$a0 # a[1]*a[0]
583 mulld $acc2,$a2,$a0 # a[2]*a[0]
585 mulld $acc3,$a3,$a0 # a[3]*a[0]
588 addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
589 mulld $t0,$a2,$a1 # a[2]*a[1]
592 mulld $t2,$a3,$a1 # a[3]*a[1]
594 addze $acc4,$acc4 # can't overflow
596 mulld $acc5,$a3,$a2 # a[3]*a[2]
599 addc $t1,$t1,$t2 # accumulate high parts of multiplication
600 addze $t2,$t3 # can't overflow
602 addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
605 addze $acc6,$acc6 # can't overflow
607 addc $acc1,$acc1,$acc1 # acc[1-6]*=2
608 adde $acc2,$acc2,$acc2
609 adde $acc3,$acc3,$acc3
610 adde $acc4,$acc4,$acc4
611 adde $acc5,$acc5,$acc5
612 adde $acc6,$acc6,$acc6
616 mulld $acc0,$a0,$a0 # a[0]*a[0]
618 mulld $t1,$a1,$a1 # a[1]*a[1]
620 mulld $t2,$a2,$a2 # a[2]*a[2]
622 mulld $t3,$a3,$a3 # a[3]*a[3]
624 addc $acc1,$acc1,$a0 # +a[i]*a[i]
634 for($i=0;$i<3;$i++) { # reductions, see commentary in
635 # multiplication for details
637 subfc $t2,$t0,$acc0 # "*0xffff0001"
639 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
643 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
644 addze $acc3,$t3 # can't overflow
648 subfc $t2,$t0,$acc0 # "*0xffff0001"
650 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
652 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
653 addze $acc3,$t3 # can't overflow
655 addc $acc0,$acc0,$acc4 # accumulate upper half
656 adde $acc1,$acc1,$acc5
657 adde $acc2,$acc2,$acc6
658 adde $acc3,$acc3,$acc7
662 addic $acc0,$acc0,1 # ret -= modulus
663 subfe $acc1,$poly1,$acc1
664 subfe $acc2,$t2,$acc2
665 subfe $acc3,$poly3,$acc3
666 subfe $acc4,$t2,$acc4
668 addc $acc0,$acc0,$acc4 # ret += modulus if borrow
682 .byte 0,12,0x14,0,0,0,1,0
684 .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
686 # Note that __ecp_nistz256_add expects both input vectors pre-loaded to
687 # $a0-$a3 and $t0-$t3. This is done because it's used in multiple
688 # contexts, e.g. in multiplication by 2 and 3...
689 .type __ecp_nistz256_add,\@function
692 addc $acc0,$acc0,$t0 # ret = a+b
699 # if a+b >= modulus, subtract modulus
701 # But since comparison implies subtraction, we subtract
702 # modulus and then add it back if subtraction borrowed.
705 subfe $acc1,$poly1,$acc1
706 subfe $acc2,$t2,$acc2
707 subfe $acc3,$poly3,$acc3
724 .byte 0,12,0x14,0,0,0,3,0
726 .size __ecp_nistz256_add,.-__ecp_nistz256_add
728 .type __ecp_nistz256_sub_from,\@function
730 __ecp_nistz256_sub_from:
735 subfc $acc0,$t0,$acc0 # ret = a-b
736 subfe $acc1,$t1,$acc1
737 subfe $acc2,$t2,$acc2
738 subfe $acc3,$t3,$acc3
739 subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
741 # if a-b borrowed, add modulus
743 addc $acc0,$acc0,$t0 # ret -= modulus & t0
757 .byte 0,12,0x14,0,0,0,3,0
759 .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
761 .type __ecp_nistz256_sub_morf,\@function
763 __ecp_nistz256_sub_morf:
768 subfc $acc0,$acc0,$t0 # ret = b-a
769 subfe $acc1,$acc1,$t1
770 subfe $acc2,$acc2,$t2
771 subfe $acc3,$acc3,$t3
772 subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
774 # if b-a borrowed, add modulus
776 addc $acc0,$acc0,$t0 # ret -= modulus & t0
790 .byte 0,12,0x14,0,0,0,3,0
792 .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
794 .type __ecp_nistz256_div_by_2,\@function
796 __ecp_nistz256_div_by_2:
798 addic $acc0,$acc0,-1 # a += modulus
800 adde $acc1,$acc1,$poly1
804 adde $acc3,$acc3,$poly3
806 addze $ap,$t2 # ap = carry
809 subfc $acc0,$t0,$acc0 # a -= modulus if a was even
810 subfe $acc1,$t1,$acc1
811 subfe $acc2,$t2,$acc2
812 subfe $acc3,$t3,$acc3
835 .byte 0,12,0x14,0,0,0,1,0
837 .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
839 ########################################################################
840 # following subroutines are "literal" implementation of those found in
843 ########################################################################
844 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
# Stack frame: 64-byte linkage area, four 32-byte temporaries, then the
# 12 saved non-volatile GPRs (r20-r31, stored at the top of the frame).
847 my $FRAME=64+32*4+12*8;
848 my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
849 # above map() describes stack layout with 4 temporary
850 # 256-bit vectors on top.
# Preserve the caller's out/in pointers across the internal subroutine
# calls, which reuse/clobber $rp and $ap.
851 my ($rp_real,$ap_real) = map("r$_",(20,21));
854 .globl ecp_nistz256_point_double
856 ecp_nistz256_point_double:
857 stdu $sp,-$FRAME($sp)
859 std r20,$FRAME-8*12($sp)
860 std r21,$FRAME-8*11($sp)
861 std r22,$FRAME-8*10($sp)
862 std r23,$FRAME-8*9($sp)
863 std r24,$FRAME-8*8($sp)
864 std r25,$FRAME-8*7($sp)
865 std r26,$FRAME-8*6($sp)
866 std r27,$FRAME-8*5($sp)
867 std r28,$FRAME-8*4($sp)
868 std r29,$FRAME-8*3($sp)
869 std r30,$FRAME-8*2($sp)
870 std r31,$FRAME-8*1($sp)
873 srdi $poly1,$poly1,32 # 0x00000000ffffffff
875 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
885 ld $a0,64($ap) # forward load for p256_sqr_mont
892 bl __ecp_nistz256_add # p256_mul_by_2(S, in_y);
895 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z);
901 mr $a0,$acc0 # put Zsqr aside for p256_sub
906 bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x);
909 mr $acc0,$a0 # restore Zsqr
913 ld $a0,$S+0($sp) # forward load for p256_sqr_mont
918 bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr);
921 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S);
930 bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y);
936 ld $a0,$S+0($sp) # forward load for p256_sqr_mont
941 bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0);
944 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S);
946 ld $bi,$Zsqr($sp) # forward load for p256_mul_mont
952 bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0);
956 bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr);
958 mr $t0,$acc0 # duplicate M
962 mr $a0,$acc0 # put M aside
967 bl __ecp_nistz256_add
968 mr $t0,$a0 # restore M
972 ld $bi,0($ap_real) # forward load for p256_mul_mont
977 bl __ecp_nistz256_add # p256_mul_by_3(M, M);
981 bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x);
987 ld $a0,$M+0($sp) # forward load for p256_sqr_mont
992 bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S);
995 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M);
998 bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0);
1002 bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x);
1005 mr $a0,$acc0 # copy S
1010 bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M);
1012 addi $bp,$rp_real,32
1013 addi $rp,$rp_real,32
1014 bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y);
1017 ld r20,$FRAME-8*12($sp)
1018 ld r21,$FRAME-8*11($sp)
1019 ld r22,$FRAME-8*10($sp)
1020 ld r23,$FRAME-8*9($sp)
1021 ld r24,$FRAME-8*8($sp)
1022 ld r25,$FRAME-8*7($sp)
1023 ld r26,$FRAME-8*6($sp)
1024 ld r27,$FRAME-8*5($sp)
1025 ld r28,$FRAME-8*4($sp)
1026 ld r29,$FRAME-8*3($sp)
1027 ld r30,$FRAME-8*2($sp)
1028 ld r31,$FRAME-8*1($sp)
1032 .byte 0,12,4,0,0x80,12,2,0
1034 .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
1038 ########################################################################
1039 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1040 # const P256_POINT *in2);
# Stack frame: 64-byte linkage area, twelve 32-byte temporaries, then
# the 16 saved non-volatile GPRs (r16-r31).
1042 my $FRAME = 64 + 32*12 + 16*8;
1043 my ($res_x,$res_y,$res_z,
1044 $H,$Hsqr,$R,$Rsqr,$Hcub,
1045 $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
# Z1sqr/Z2sqr are computed first and consumed early, so they share the
# stack slots of Hsqr/Rsqr, which are produced later.
1046 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1047 # above map() describes stack layout with 12 temporary
1048 # 256-bit vectors on top.
# Saved pointers plus the all-zeros/all-ones masks ($in1infty/$in2infty,
# built with sradi ...,63) used for the constant-time result selection.
1049 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
1052 .globl ecp_nistz256_point_add
1054 ecp_nistz256_point_add:
1055 stdu $sp,-$FRAME($sp)
1057 std r16,$FRAME-8*16($sp)
1058 std r17,$FRAME-8*15($sp)
1059 std r18,$FRAME-8*14($sp)
1060 std r19,$FRAME-8*13($sp)
1061 std r20,$FRAME-8*12($sp)
1062 std r21,$FRAME-8*11($sp)
1063 std r22,$FRAME-8*10($sp)
1064 std r23,$FRAME-8*9($sp)
1065 std r24,$FRAME-8*8($sp)
1066 std r25,$FRAME-8*7($sp)
1067 std r26,$FRAME-8*6($sp)
1068 std r27,$FRAME-8*5($sp)
1069 std r28,$FRAME-8*4($sp)
1070 std r29,$FRAME-8*3($sp)
1071 std r30,$FRAME-8*2($sp)
1072 std r31,$FRAME-8*1($sp)
1075 srdi $poly1,$poly1,32 # 0x00000000ffffffff
1077 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
1079 ld $a0,64($bp) # in2_z
1088 or $in2infty,$t0,$t2
1090 or $in2infty,$in2infty,$t0
1091 sradi $in2infty,$in2infty,63 # !in2infty
1093 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z);
1095 ld $a0,64($ap_real) # in1_z
1101 or $in1infty,$t0,$t2
1103 or $in1infty,$in1infty,$t0
1104 sradi $in1infty,$in1infty,63 # !in1infty
1106 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
1109 ld $a0,$Z2sqr+0($sp)
1110 ld $a1,$Z2sqr+8($sp)
1111 ld $a2,$Z2sqr+16($sp)
1112 ld $a3,$Z2sqr+24($sp)
1113 addi $bp,$bp_real,64
1115 bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z);
1118 ld $a0,$Z1sqr+0($sp)
1119 ld $a1,$Z1sqr+8($sp)
1120 ld $a2,$Z1sqr+16($sp)
1121 ld $a3,$Z1sqr+24($sp)
1122 addi $bp,$ap_real,64
1124 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
1131 addi $bp,$ap_real,32
1133 bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y);
1140 addi $bp,$bp_real,32
1142 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
1145 ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont
1151 bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1);
1153 or $acc0,$acc0,$acc1 # see if result is zero
1154 or $acc2,$acc2,$acc3
1155 or $temp,$acc0,$acc2
1159 bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr);
1168 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr);
1171 ld $a0,$R+0($sp) # forward load for p256_sqr_mont
1176 bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1);
1178 or $acc0,$acc0,$acc1 # see if result is zero
1179 or $acc2,$acc2,$acc3
1180 or. $acc0,$acc0,$acc2
1181 bne .Ladd_proceed # is_equal(U1,U2)?
1183 and. $t0,$in1infty,$in2infty
1184 beq .Ladd_proceed # (in1infty || in2infty)?
1187 beq .Ladd_double # is_equal(S1,S2)?
1192 std $a0,16($rp_real)
1193 std $a0,24($rp_real)
1194 std $a0,32($rp_real)
1195 std $a0,40($rp_real)
1196 std $a0,48($rp_real)
1197 std $a0,56($rp_real)
1198 std $a0,64($rp_real)
1199 std $a0,72($rp_real)
1200 std $a0,80($rp_real)
1201 std $a0,88($rp_real)
1206 ld $bp,0($sp) # back-link
1209 ld r16,$FRAME-8*16($sp)
1210 ld r17,$FRAME-8*15($sp)
1211 ld r18,$FRAME-8*14($sp)
1212 ld r19,$FRAME-8*13($sp)
1213 stdu $bp,$FRAME-288($sp) # difference in stack frame sizes
1219 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
1226 addi $bp,$ap_real,64
1228 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
1235 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
1238 ld $a0,$res_z+0($sp)
1239 ld $a1,$res_z+8($sp)
1240 ld $a2,$res_z+16($sp)
1241 ld $a3,$res_z+24($sp)
1242 addi $bp,$bp_real,64
1244 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z);
1249 ld $a2,$Hsqr+16($sp)
1250 ld $a3,$Hsqr+24($sp)
1253 bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
1262 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr);
1269 bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
1273 bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
1276 bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
1279 ld $bi,$Hcub($sp) # forward load for p256_mul_mont
1285 bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
1289 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub);
1292 ld $a0,$res_y+0($sp)
1293 ld $a1,$res_y+8($sp)
1294 ld $a2,$res_y+16($sp)
1295 ld $a3,$res_y+24($sp)
1298 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
1301 bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
1303 ld $t0,0($bp_real) # in2
1307 ld $a0,$res_x+0($sp) # res
1308 ld $a1,$res_x+8($sp)
1309 ld $a2,$res_x+16($sp)
1310 ld $a3,$res_x+24($sp)
1312 for($i=0;$i<64;$i+=32) { # conditional moves
1314 ld $acc0,$i+0($ap_real) # in1
1315 ld $acc1,$i+8($ap_real)
1316 ld $acc2,$i+16($ap_real)
1317 ld $acc3,$i+24($ap_real)
1318 andc $t0,$t0,$in1infty
1319 andc $t1,$t1,$in1infty
1320 andc $t2,$t2,$in1infty
1321 andc $t3,$t3,$in1infty
1322 and $a0,$a0,$in1infty
1323 and $a1,$a1,$in1infty
1324 and $a2,$a2,$in1infty
1325 and $a3,$a3,$in1infty
1330 andc $acc0,$acc0,$in2infty
1331 andc $acc1,$acc1,$in2infty
1332 andc $acc2,$acc2,$in2infty
1333 andc $acc3,$acc3,$in2infty
1334 and $t0,$t0,$in2infty
1335 and $t1,$t1,$in2infty
1336 and $t2,$t2,$in2infty
1337 and $t3,$t3,$in2infty
1343 ld $t0,$i+32($bp_real) # in2
1344 ld $t1,$i+40($bp_real)
1345 ld $t2,$i+48($bp_real)
1346 ld $t3,$i+56($bp_real)
1347 ld $a0,$res_x+$i+32($sp)
1348 ld $a1,$res_x+$i+40($sp)
1349 ld $a2,$res_x+$i+48($sp)
1350 ld $a3,$res_x+$i+56($sp)
1351 std $acc0,$i+0($rp_real)
1352 std $acc1,$i+8($rp_real)
1353 std $acc2,$i+16($rp_real)
1354 std $acc3,$i+24($rp_real)
1358 ld $acc0,$i+0($ap_real) # in1
1359 ld $acc1,$i+8($ap_real)
1360 ld $acc2,$i+16($ap_real)
1361 ld $acc3,$i+24($ap_real)
1362 andc $t0,$t0,$in1infty
1363 andc $t1,$t1,$in1infty
1364 andc $t2,$t2,$in1infty
1365 andc $t3,$t3,$in1infty
1366 and $a0,$a0,$in1infty
1367 and $a1,$a1,$in1infty
1368 and $a2,$a2,$in1infty
1369 and $a3,$a3,$in1infty
1374 andc $acc0,$acc0,$in2infty
1375 andc $acc1,$acc1,$in2infty
1376 andc $acc2,$acc2,$in2infty
1377 andc $acc3,$acc3,$in2infty
1378 and $t0,$t0,$in2infty
1379 and $t1,$t1,$in2infty
1380 and $t2,$t2,$in2infty
1381 and $t3,$t3,$in2infty
1386 std $acc0,$i+0($rp_real)
1387 std $acc1,$i+8($rp_real)
1388 std $acc2,$i+16($rp_real)
1389 std $acc3,$i+24($rp_real)
1393 ld r16,$FRAME-8*16($sp)
1394 ld r17,$FRAME-8*15($sp)
1395 ld r18,$FRAME-8*14($sp)
1396 ld r19,$FRAME-8*13($sp)
1397 ld r20,$FRAME-8*12($sp)
1398 ld r21,$FRAME-8*11($sp)
1399 ld r22,$FRAME-8*10($sp)
1400 ld r23,$FRAME-8*9($sp)
1401 ld r24,$FRAME-8*8($sp)
1402 ld r25,$FRAME-8*7($sp)
1403 ld r26,$FRAME-8*6($sp)
1404 ld r27,$FRAME-8*5($sp)
1405 ld r28,$FRAME-8*4($sp)
1406 ld r29,$FRAME-8*3($sp)
1407 ld r30,$FRAME-8*2($sp)
1408 ld r31,$FRAME-8*1($sp)
1412 .byte 0,12,4,0,0x80,16,3,0
1414 .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1418 ########################################################################
1419 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1420 # const P256_POINT_AFFINE *in2);
# Stack frame: 64-byte linkage area, ten 32-byte temporaries, then the
# 16 saved non-volatile GPRs (r16-r31).
1422 my $FRAME = 64 + 32*10 + 16*8;
1423 my ($res_x,$res_y,$res_z,
1424 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
1426 # above map() describes stack layout with 10 temporary
1427 # 256-bit vectors on top.
# Saved pointers plus the all-zeros/all-ones infinity masks used for the
# constant-time result selection at the end of the routine.
1428 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
1431 .globl ecp_nistz256_point_add_affine
1433 ecp_nistz256_point_add_affine:
1434 stdu $sp,-$FRAME($sp)
1436 std r16,$FRAME-8*16($sp)
1437 std r17,$FRAME-8*15($sp)
1438 std r18,$FRAME-8*14($sp)
1439 std r19,$FRAME-8*13($sp)
1440 std r20,$FRAME-8*12($sp)
1441 std r21,$FRAME-8*11($sp)
1442 std r22,$FRAME-8*10($sp)
1443 std r23,$FRAME-8*9($sp)
1444 std r24,$FRAME-8*8($sp)
1445 std r25,$FRAME-8*7($sp)
1446 std r26,$FRAME-8*6($sp)
1447 std r27,$FRAME-8*5($sp)
1448 std r28,$FRAME-8*4($sp)
1449 std r29,$FRAME-8*3($sp)
1450 std r30,$FRAME-8*2($sp)
1451 std r31,$FRAME-8*1($sp)
1454 srdi $poly1,$poly1,32 # 0x00000000ffffffff
1456 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
1462 ld $a0,64($ap) # in1_z
1468 or $in1infty,$t0,$t2
1470 or $in1infty,$in1infty,$t0
1471 sradi $in1infty,$in1infty,63 # !in1infty
1473 ld $acc0,0($bp) # in2_x
1477 ld $t0,32($bp) # in2_y
1481 or $acc0,$acc0,$acc1
1482 or $acc2,$acc2,$acc3
1483 or $acc0,$acc0,$acc2
1487 or $in2infty,$acc0,$t0
1489 or $in2infty,$in2infty,$t0
1490 sradi $in2infty,$in2infty,63 # !in2infty
1493 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
1502 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, Z1sqr, in2_x);
1505 ld $bi,64($ap_real) # forward load for p256_mul_mont
1506 ld $a0,$Z1sqr+0($sp)
1507 ld $a1,$Z1sqr+8($sp)
1508 ld $a2,$Z1sqr+16($sp)
1509 ld $a3,$Z1sqr+24($sp)
1511 bl __ecp_nistz256_sub_from # p256_sub(H, U2, in1_x);
1513 addi $bp,$ap_real,64
1515 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
1522 addi $bp,$ap_real,64
1524 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
1531 addi $bp,$bp_real,32
1533 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
1535 addi $bp,$ap_real,32
1536 ld $a0,$H+0($sp) # forward load for p256_sqr_mont
1541 bl __ecp_nistz256_sub_from # p256_sub(R, S2, in1_y);
1544 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
1551 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
1556 ld $a2,$Hsqr+16($sp)
1557 ld $a3,$Hsqr+24($sp)
1560 bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
1565 ld $a2,$Hsqr+16($sp)
1566 ld $a3,$Hsqr+24($sp)
1569 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in1_x, Hsqr);
1576 bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
1580 bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
1583 bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
1586 ld $bi,32($ap_real) # forward load for p256_mul_mont
1589 ld $a2,$Hcub+16($sp)
1590 ld $a3,$Hcub+24($sp)
1592 bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
1594 addi $bp,$ap_real,32
1596 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, in1_y, Hcub);
1599 ld $a0,$res_y+0($sp)
1600 ld $a1,$res_y+8($sp)
1601 ld $a2,$res_y+16($sp)
1602 ld $a3,$res_y+24($sp)
1605 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
1608 bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
1610 ld $t0,0($bp_real) # in2
1614 ld $a0,$res_x+0($sp) # res
1615 ld $a1,$res_x+8($sp)
1616 ld $a2,$res_x+16($sp)
1617 ld $a3,$res_x+24($sp)
1619 for($i=0;$i<64;$i+=32) { # conditional moves
1621 ld $acc0,$i+0($ap_real) # in1
1622 ld $acc1,$i+8($ap_real)
1623 ld $acc2,$i+16($ap_real)
1624 ld $acc3,$i+24($ap_real)
1625 andc $t0,$t0,$in1infty
1626 andc $t1,$t1,$in1infty
1627 andc $t2,$t2,$in1infty
1628 andc $t3,$t3,$in1infty
1629 and $a0,$a0,$in1infty
1630 and $a1,$a1,$in1infty
1631 and $a2,$a2,$in1infty
1632 and $a3,$a3,$in1infty
1637 andc $acc0,$acc0,$in2infty
1638 andc $acc1,$acc1,$in2infty
1639 andc $acc2,$acc2,$in2infty
1640 andc $acc3,$acc3,$in2infty
1641 and $t0,$t0,$in2infty
1642 and $t1,$t1,$in2infty
1643 and $t2,$t2,$in2infty
1644 and $t3,$t3,$in2infty
1650 $code.=<<___ if ($i==0);
1651 ld $t0,32($bp_real) # in2
1656 $code.=<<___ if ($i==32);
1657 li $t0,1 # Lone_mont
1663 ld $a0,$res_x+$i+32($sp)
1664 ld $a1,$res_x+$i+40($sp)
1665 ld $a2,$res_x+$i+48($sp)
1666 ld $a3,$res_x+$i+56($sp)
1667 std $acc0,$i+0($rp_real)
1668 std $acc1,$i+8($rp_real)
1669 std $acc2,$i+16($rp_real)
1670 std $acc3,$i+24($rp_real)
1674 ld $acc0,$i+0($ap_real) # in1
1675 ld $acc1,$i+8($ap_real)
1676 ld $acc2,$i+16($ap_real)
1677 ld $acc3,$i+24($ap_real)
1678 andc $t0,$t0,$in1infty
1679 andc $t1,$t1,$in1infty
1680 andc $t2,$t2,$in1infty
1681 andc $t3,$t3,$in1infty
1682 and $a0,$a0,$in1infty
1683 and $a1,$a1,$in1infty
1684 and $a2,$a2,$in1infty
1685 and $a3,$a3,$in1infty
1690 andc $acc0,$acc0,$in2infty
1691 andc $acc1,$acc1,$in2infty
1692 andc $acc2,$acc2,$in2infty
1693 andc $acc3,$acc3,$in2infty
1694 and $t0,$t0,$in2infty
1695 and $t1,$t1,$in2infty
1696 and $t2,$t2,$in2infty
1697 and $t3,$t3,$in2infty
1702 std $acc0,$i+0($rp_real)
1703 std $acc1,$i+8($rp_real)
1704 std $acc2,$i+16($rp_real)
1705 std $acc3,$i+24($rp_real)
1708 ld r16,$FRAME-8*16($sp)
1709 ld r17,$FRAME-8*15($sp)
1710 ld r18,$FRAME-8*14($sp)
1711 ld r19,$FRAME-8*13($sp)
1712 ld r20,$FRAME-8*12($sp)
1713 ld r21,$FRAME-8*11($sp)
1714 ld r22,$FRAME-8*10($sp)
1715 ld r23,$FRAME-8*9($sp)
1716 ld r24,$FRAME-8*8($sp)
1717 ld r25,$FRAME-8*7($sp)
1718 ld r26,$FRAME-8*6($sp)
1719 ld r27,$FRAME-8*5($sp)
1720 ld r28,$FRAME-8*4($sp)
1721 ld r29,$FRAME-8*3($sp)
1722 ld r30,$FRAME-8*2($sp)
1723 ld r31,$FRAME-8*1($sp)
1727 .byte 0,12,4,0,0x80,16,3,0
1729 .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
# Extra aliases for the scalar-order ("ord") routines.  Per the constant
# setup below: $ordk = 0xccd1c8aaee00bc4f (the Montgomery "magic"
# multiplier used via mulld $t4,$acc0,$ordk), $ord0-$ord3 hold the words
# of the group order (0xf3b9cac2fc632551, 0xbce6faada7179e84,
# 0xffffffffffffffff, 0xffffffff00000000).
1733 my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
# $ord2/$ord3 reuse the poly registers; $zr names r0 and is used as a
# zero source (e.g. subfe $acc5,$zr,$acc5) — presumably cleared in the
# elided prologue; TODO confirm at the definition site.
1734 my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");
1737 ########################################################################
1738 # void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
1740 .globl ecp_nistz256_ord_mul_mont
1742 ecp_nistz256_ord_mul_mont:
1768 ori $ordk,$ordk,0xc8aa
1769 ori $ord0,$ord0,0xcac2
1770 ori $ord1,$ord1,0xfaad
1774 oris $ordk,$ordk,0xee00
1775 oris $ord0,$ord0,0xfc63
1776 oris $ord1,$ord1,0xa717
1777 ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
1778 ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
1779 ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
1780 li $ord2,-1 # 0xffffffffffffffff
1781 sldi $ord3,$ord2,32 # 0xffffffff00000000
1784 mulld $acc0,$a0,$bi # a[0]*b[0]
1787 mulld $acc1,$a1,$bi # a[1]*b[0]
1790 mulld $acc2,$a2,$bi # a[2]*b[0]
1793 mulld $acc3,$a3,$bi # a[3]*b[0]
1794 mulhdu $acc4,$a3,$bi
1796 mulld $t4,$acc0,$ordk
1798 addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
1799 adde $acc2,$acc2,$t1
1800 adde $acc3,$acc3,$t2
1804 for ($i=1;$i<4;$i++) {
1805 ################################################################
1806 # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
1808 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1810 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1813 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1814 # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
1815 # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
1817 ld $bi,8*$i($bp) # b[i]
1820 subfc $acc2,$t4,$acc2
1822 subfe $acc3,$t0,$acc3
1823 subfe $acc4,$t1,$acc4
1824 subfe $acc5,$zr,$acc5
1826 addic $t0,$acc0,-1 # discarded
1827 mulhdu $t1,$ord0,$t4
1829 mulhdu $t3,$ord1,$t4
1836 addc $acc0,$acc1,$t2
1838 adde $acc1,$acc2,$t3
1840 adde $acc2,$acc3,$t4
1841 adde $acc3,$acc4,$t4
1844 addc $acc0,$acc0,$t0 # accumulate low parts
1846 adde $acc1,$acc1,$t1
1848 adde $acc2,$acc2,$t2
1850 adde $acc3,$acc3,$t3
1853 mulld $t4,$acc0,$ordk
1854 addc $acc1,$acc1,$t0 # accumulate high parts
1855 adde $acc2,$acc2,$t1
1856 adde $acc3,$acc3,$t2
1857 adde $acc4,$acc4,$t3
1862 sldi $t0,$t4,32 # last reduction
1863 subfc $acc2,$t4,$acc2
1865 subfe $acc3,$t0,$acc3
1866 subfe $acc4,$t1,$acc4
1867 subfe $acc5,$zr,$acc5
1869 addic $t0,$acc0,-1 # discarded
1870 mulhdu $t1,$ord0,$t4
1872 mulhdu $t3,$ord1,$t4
1877 addc $acc0,$acc1,$t2
1878 adde $acc1,$acc2,$t3
1879 adde $acc2,$acc3,$t4
1880 adde $acc3,$acc4,$t4
1883 subfc $acc0,$ord0,$acc0 # ret -= modulus
1884 subfe $acc1,$ord1,$acc1
1885 subfe $acc2,$ord2,$acc2
1886 subfe $acc3,$ord3,$acc3
1887 subfe $acc4,$zr,$acc4
1891 addc $acc0,$acc0,$t0 # ret += modulus if borrow
1893 adde $acc1,$acc1,$t1
1894 adde $acc2,$acc2,$acc4
1895 adde $acc3,$acc3,$t3
1919 .byte 0,12,4,0,0x80,14,3,0
1921 .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
1923 ################################################################################
1924 # void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
# Montgomery squaring modulo the P-256 group order, repeated `rep` times
# (repeat count handled in elided prologue -- confirm against full source).
1926 .globl ecp_nistz256_ord_sqr_mont
1928 ecp_nistz256_ord_sqr_mont:
# Build the 64-bit constants 16 bits at a time (initial li/lis pairs are
# elided); final values are given in the trailing comments below.
# $ord0..$ord3 hold the group-order limbs, $ordk the Montgomery constant
# (presumably -1/ord mod 2^64 -- TODO confirm).
1955 ori $ordk,$ordk,0xc8aa
1956 ori $ord0,$ord0,0xcac2
1957 ori $ord1,$ord1,0xfaad
1961 oris $ordk,$ordk,0xee00
1962 oris $ord0,$ord0,0xfc63
1963 oris $ord1,$ord1,0xa717
1964 ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
1965 ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
1966 ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
1967 li $ord2,-1 # 0xffffffffffffffff
1968 sldi $ord3,$ord2,32 # 0xffffffff00000000
1974 ################################################################
1975 # | | | | | |a1*a0| |
1976 # | | | | |a2*a0| | |
1977 # | |a3*a2|a3*a0| | | |
1978 # | | | |a2*a1| | | |
1979 # | | |a3*a1| | | | |
1980 # *| | | | | | | | 2|
1981 # +|a3*a3|a2*a2|a1*a1|a0*a0|
1982 # |--+--+--+--+--+--+--+--|
1983 # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1985 # "can't overflow" below marks carrying into high part of
1986 # multiplication result, which can't overflow, because it
1987 # can never be all ones.
# Schoolbook squaring: off-diagonal products a[i]*a[j] (i<j) are summed,
# doubled, then the diagonal squares a[i]^2 are folded in (chart above).
1989 mulld $acc1,$a1,$a0 # a[1]*a[0]
1991 mulld $acc2,$a2,$a0 # a[2]*a[0]
1993 mulld $acc3,$a3,$a0 # a[3]*a[0]
1994 mulhdu $acc4,$a3,$a0
1996 addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
1997 mulld $t0,$a2,$a1 # a[2]*a[1]
1999 adde $acc3,$acc3,$t2
2000 mulld $t2,$a3,$a1 # a[3]*a[1]
2002 addze $acc4,$acc4 # can't overflow
2004 mulld $acc5,$a3,$a2 # a[3]*a[2]
2005 mulhdu $acc6,$a3,$a2
2007 addc $t1,$t1,$t2 # accumulate high parts of multiplication
2008 mulld $acc0,$a0,$a0 # a[0]*a[0]
2009 addze $t2,$t3 # can't overflow
2011 addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
2013 adde $acc4,$acc4,$t1
2014 mulld $t1,$a1,$a1 # a[1]*a[1]
2015 adde $acc5,$acc5,$t2
2017 addze $acc6,$acc6 # can't overflow
2019 addc $acc1,$acc1,$acc1 # acc[1-6]*=2
2020 mulld $t2,$a2,$a2 # a[2]*a[2]
2021 adde $acc2,$acc2,$acc2
2023 adde $acc3,$acc3,$acc3
2024 mulld $t3,$a3,$a3 # a[3]*a[3]
2025 adde $acc4,$acc4,$acc4
2027 adde $acc5,$acc5,$acc5
2028 adde $acc6,$acc6,$acc6
2031 addc $acc1,$acc1,$a0 # +a[i]*a[i]
2032 mulld $t4,$acc0,$ordk
2033 adde $acc2,$acc2,$t1
2034 adde $acc3,$acc3,$a1
2035 adde $acc4,$acc4,$t2
2036 adde $acc5,$acc5,$a2
2037 adde $acc6,$acc6,$t3
2038 adde $acc7,$acc7,$a3
# Emit four Montgomery reduction rounds.  The quotient digit for the
# *next* round (mulld by $ordk) is only generated for the first three
# iterations ($i<3); the Perl-level ($t3,$t4) swap rotates the quotient
# registers between rounds.
2040 for($i=0; $i<4; $i++) { # reductions
2042 addic $t0,$acc0,-1 # discarded
2043 mulhdu $t1,$ord0,$t4
2045 mulhdu $t3,$ord1,$t4
2050 addc $acc0,$acc1,$t2
2051 adde $acc1,$acc2,$t3
2052 adde $acc2,$acc3,$t4
2053 adde $acc3,$zr,$t4 # can't overflow
2055 $code.=<<___ if ($i<3);
2056 mulld $t3,$acc0,$ordk
2060 subfc $acc1,$t4,$acc1
2062 subfe $acc2,$t0,$acc2
2063 subfe $acc3,$t1,$acc3 # can't borrow
2065 ($t3,$t4) = ($t4,$t3);
2068 addc $acc0,$acc0,$acc4 # accumulate upper half
2069 adde $acc1,$acc1,$acc5
2070 adde $acc2,$acc2,$acc6
2071 adde $acc3,$acc3,$acc7
# Constant-time final reduction: subtract the order, record the borrow
# in $acc4, then (below) add the order back masked by that borrow.
2074 subfc $acc0,$ord0,$acc0 # ret -= modulus
2075 subfe $acc1,$ord1,$acc1
2076 subfe $acc2,$ord2,$acc2
2077 subfe $acc3,$ord3,$acc3
2078 subfe $acc4,$zr,$acc4
2082 addc $a0,$acc0,$t0 # ret += modulus if borrow
2085 adde $a2,$acc2,$acc4
2112 .byte 0,12,4,0,0x80,14,3,0
2114 .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
2118 ########################################################################
2119 # scatter-gather subroutines
# Register aliases for the scatter/gather ABI arguments (r3..r6; the
# fifth value from the range is unused by this 4-name list).
2121 my ($out,$inp,$index,$mask)=map("r$_",(3..7));
2123 ########################################################################
2124 # void ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
2126 .globl ecp_nistz256_scatter_w5
2128 ecp_nistz256_scatter_w5:
# Scale the table index by 4 and bias $out so each stw below lands in
# the selected column; entries are interleaved at a 64-byte stride.
2129 slwi $index,$index,2
2130 add $out,$out,$index
# Three groups of eight 32-bit stores follow; the 64-bit loads and the
# shifts that split each limb into word halves are elided from this
# listing (NOTE(review): verify against the full source).
2137 stw r8, 64*0-4($out)
2139 stw r9, 64*1-4($out)
2141 stw r10,64*2-4($out)
2143 stw r11,64*3-4($out)
2145 stw r8, 64*4-4($out)
2146 stw r9, 64*5-4($out)
2147 stw r10,64*6-4($out)
2148 stw r11,64*7-4($out)
2156 stw r8, 64*0-4($out)
2158 stw r9, 64*1-4($out)
2160 stw r10,64*2-4($out)
2162 stw r11,64*3-4($out)
2164 stw r8, 64*4-4($out)
2165 stw r9, 64*5-4($out)
2166 stw r10,64*6-4($out)
2167 stw r11,64*7-4($out)
2175 stw r8, 64*0-4($out)
2177 stw r9, 64*1-4($out)
2179 stw r10,64*2-4($out)
2181 stw r11,64*3-4($out)
2183 stw r8, 64*4-4($out)
2184 stw r9, 64*5-4($out)
2185 stw r10,64*6-4($out)
2186 stw r11,64*7-4($out)
2190 .byte 0,12,0x14,0,0,0,3,0
2192 .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2194 ########################################################################
2195 # void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
2196 .globl ecp_nistz256_gather_w5
2199 ecp_nistz256_gather_w5:
# Index adjustment before scaling; r0 presumably holds a value derived
# from $index in the elided setup (constant-time selection mask logic --
# NOTE(review): confirm against the full source).  The selection loop
# itself is elided from this listing.
2203 add $index,$index,r0
2204 slwi $index,$index,2
2205 add $inp,$inp,$index
2286 .byte 0,12,0x14,0,0,0,3,0
2288 .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2290 ########################################################################
2291 # void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
2293 .globl ecp_nistz256_scatter_w7
2295 ecp_nistz256_scatter_w7:
# Bias $out by the (pre-scaled, in elided setup) index, then a counted
# store loop (.Loop_scatter_w7, body elided) copies the affine point.
2298 add $out,$out,$index
2319 bdnz .Loop_scatter_w7
2323 .byte 0,12,0x14,0,0,0,3,0
2325 .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2327 ########################################################################
2328 # void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
2330 .globl ecp_nistz256_gather_w7
2332 ecp_nistz256_gather_w7:
# r0 presumably carries an index bias computed in the elided setup
# (constant-time selection -- NOTE(review): confirm against full source);
# the gather loop (.Loop_gather_w7) body is elided from this listing.
2338 add $index,$index,r0
2339 add $inp,$inp,$index
2370 bdnz .Loop_gather_w7
2374 .byte 0,12,0x14,0,0,0,3,0
2376 .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
# Post-process the accumulated $code: evaluate every backquoted
# `...` construct (perlasm compile-time arithmetic) before emission.
# The per-line print statement and the loop's closing brace are elided
# from this listing; the final close enforces a flush so buffered write
# errors are not silently lost.
2380 foreach (split("\n",$code)) {
2381 s/\`([^\`]*)\`/eval $1/ge;
2385 close STDOUT or die "error closing STDOUT"; # enforce flush