3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # ECP_NISTZ256 module for PPC64.
14 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
15 # http://eprint.iacr.org/2013/816.
17 # with/without -DECP_NISTZ256_ASM
# Scan argv until the output file name (a word with an extension) is
# found; earlier argv words are consumed as flavour selectors by
# surrounding code.  NOTE(review): this is a sampled listing -- interior
# lines of the original file are elided; comments added to visible
# lines only.
22 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Locate the ppc-xlate.pl perlasm translator either next to this script
# or in the shared perlasm directory, then pipe all generated code
# through it.
24 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
25 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
26 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
27 die "can't locate ppc-xlate.pl";
# NOTE(review): 2-arg bareword open of a pipe is a perlasm convention;
# $xlate/$output come from the build system, and $flavour is assigned
# on an elided line -- confirm against the full source.
29 open OUT,"| \"$^X\" $xlate $flavour $output";
# Global register allocation: result/argument pointers ($rp,$ap,$bp),
# the current multiplicand word ($bi), accumulator limbs ($acc0-$acc5),
# the two non-trivial modulus words ($poly1 = 0x00000000ffffffff,
# $poly3 = 0xffffffff00000001, built at each entry point below),
# operand limbs ($a0-$a3) and scratch ($t0-$t3), mapped onto r3-r12
# and the callee-saved r22-r31.
35 my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
36 $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
37 map("r$_",(3..12,22..31));
# $bp/$bi carry no live values during squaring, so reuse them as the
# two extra accumulator limbs needed there.
39 my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont
45 ########################################################################
46 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
# Re-derive this script's directory and open the C source of the
# precomputed point table, looking in the CWD first and one level up
# as fallback.
48 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49 open TABLE,"<ecp_nistz256_table.c" or
50 open TABLE,"<${dir}../ecp_nistz256_table.c" or
51 die "failed to open ecp_nistz256_table.c:",$!;
# Each TOBN(hi,lo) macro is captured and pushed low half first, i.e.
# values are collected in little-endian limb order.
56 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
60 # See ecp_nistz256_table.c for explanation for why it's 64*16*37.
61 # 64*16*37-1 is because $#arr returns last valid index of @arr, not
63 die "insane number of elements" if ($#arr != 64*16*37-1);
66 .type ecp_nistz256_precomputed,\@object
67 .globl ecp_nistz256_precomputed
69 ecp_nistz256_precomputed:
71 ########################################################################
72 # this conversion smashes P256_POINT_AFFINE by individual bytes with
73 # 64 byte interval, similar to
# Take the next chunk of 64 points and emit it byte-sliced: byte $i of
# every point is written before byte $i+1, so the w7 gather routine can
# read one byte per point at a fixed 64-byte stride (see the layout
# comment above).
77 @tbl = splice(@arr,0,64*16);
78 for($i=0;$i<64;$i++) {
# NOTE(review): the ($i%4)*8 shift with $i/4 indexing implies @tbl
# holds 32-bit-granular values at this point; the repacking from the
# 64-bit TOBN limbs happens on elided lines -- confirm against the
# full source.
80 for($j=0;$j<64;$j++) {
81 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
84 $code.=join(',',map { sprintf "0x%02x",$_} @line);
90 .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
91 .asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
93 # void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
94 # const BN_ULONG x2[4]);
95 .globl ecp_nistz256_mul_mont
97 ecp_nistz256_mul_mont:
118 srdi $poly1,$poly1,32 # 0x00000000ffffffff
120 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
122 bl __ecp_nistz256_mul_mont
138 .byte 0,12,4,0,0x80,10,3,0
140 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
142 # void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
143 .globl ecp_nistz256_sqr_mont
145 ecp_nistz256_sqr_mont:
165 srdi $poly1,$poly1,32 # 0x00000000ffffffff
167 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
169 bl __ecp_nistz256_sqr_mont
185 .byte 0,12,4,0,0x80,10,2,0
187 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
189 # void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
190 # const BN_ULONG x2[4]);
191 .globl ecp_nistz256_add
211 srdi $poly1,$poly1,32 # 0x00000000ffffffff
213 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
215 bl __ecp_nistz256_add
225 .byte 0,12,4,0,0x80,4,3,0
227 .size ecp_nistz256_add,.-ecp_nistz256_add
229 # void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
230 .globl ecp_nistz256_div_by_2
232 ecp_nistz256_div_by_2:
246 srdi $poly1,$poly1,32 # 0x00000000ffffffff
248 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
250 bl __ecp_nistz256_div_by_2
260 .byte 0,12,4,0,0x80,4,2,0
262 .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
264 # void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
265 .globl ecp_nistz256_mul_by_2
267 ecp_nistz256_mul_by_2:
286 srdi $poly1,$poly1,32 # 0x00000000ffffffff
288 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
290 bl __ecp_nistz256_add # ret = a+a // 2*a
300 .byte 0,12,4,0,0x80,4,3,0
302 .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
304 # void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
305 .globl ecp_nistz256_mul_by_3
307 ecp_nistz256_mul_by_3:
330 srdi $poly1,$poly1,32 # 0x00000000ffffffff
332 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
334 bl __ecp_nistz256_add # ret = a+a // 2*a
341 bl __ecp_nistz256_add # ret += a // 2*a+a=3*a
351 .byte 0,12,4,0,0x80,4,2,0
353 .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
355 # void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
356 # const BN_ULONG x2[4]);
357 .globl ecp_nistz256_sub
373 srdi $poly1,$poly1,32 # 0x00000000ffffffff
375 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
377 bl __ecp_nistz256_sub_from
387 .byte 0,12,4,0,0x80,4,3,0
389 .size ecp_nistz256_sub,.-ecp_nistz256_sub
391 # void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
392 .globl ecp_nistz256_neg
409 srdi $poly1,$poly1,32 # 0x00000000ffffffff
411 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
413 bl __ecp_nistz256_sub_from
423 .byte 0,12,4,0,0x80,4,2,0
425 .size ecp_nistz256_neg,.-ecp_nistz256_neg
427 # note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
428 # to $a0-$a3 and b[0] - to $bi
429 .type __ecp_nistz256_mul_mont,\@function
431 __ecp_nistz256_mul_mont:
432 mulld $acc0,$a0,$bi # a[0]*b[0]
435 mulld $acc1,$a1,$bi # a[1]*b[0]
438 mulld $acc2,$a2,$bi # a[2]*b[0]
441 mulld $acc3,$a3,$bi # a[3]*b[0]
445 addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
453 for($i=1;$i<4;$i++) {
454 ################################################################
455 # Reduction iteration is normally performed by accumulating
456 # result of multiplication of modulus by "magic" digit [and
457 # omitting least significant word, which is guaranteed to
458 # be 0], but thanks to special form of modulus and "magic"
459 # digit being equal to least significant word, it can be
460 # performed with additions and subtractions alone. Indeed:
462 # ffff0001.00000000.0000ffff.ffffffff
464 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
466 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
469 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
470 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
471 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
473 # or marking redundant operations:
475 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
476 # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
477 # - 0000abcd.efgh0000.--------.--------.--------
480 subfc $t2,$t0,$acc0 # "*0xffff0001"
482 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
484 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
488 mulld $t0,$a0,$bi # lo(a[0]*b[i])
489 mulld $t1,$a1,$bi # lo(a[1]*b[i])
490 mulld $t2,$a2,$bi # lo(a[2]*b[i])
491 mulld $t3,$a3,$bi # lo(a[3]*b[i])
492 addc $acc0,$acc0,$t0 # accumulate low parts of multiplication
493 mulhdu $t0,$a0,$bi # hi(a[0]*b[i])
495 mulhdu $t1,$a1,$bi # hi(a[1]*b[i])
497 mulhdu $t2,$a2,$bi # hi(a[2]*b[i])
499 mulhdu $t3,$a3,$bi # hi(a[3]*b[i])
502 $code.=<<___ if ($i<3);
503 ld $bi,8*($i+1)($bp) # b[$i+1]
506 addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
518 subfc $t2,$t0,$acc0 # "*0xffff0001"
520 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
522 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
527 addic $acc0,$acc0,1 # ret -= modulus
528 subfe $acc1,$poly1,$acc1
529 subfe $acc2,$t2,$acc2
530 subfe $acc3,$poly3,$acc3
531 subfe $acc4,$t2,$acc4
533 addc $acc0,$acc0,$acc4 # ret += modulus if borrow
547 .byte 0,12,0x14,0,0,0,1,0
549 .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
551 # note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
553 .type __ecp_nistz256_sqr_mont,\@function
555 __ecp_nistz256_sqr_mont:
556 ################################################################
557 # | | | | | |a1*a0| |
558 # | | | | |a2*a0| | |
559 # | |a3*a2|a3*a0| | | |
560 # | | | |a2*a1| | | |
561 # | | |a3*a1| | | | |
562 # *| | | | | | | | 2|
563 # +|a3*a3|a2*a2|a1*a1|a0*a0|
564 # |--+--+--+--+--+--+--+--|
565 # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
567 # "can't overflow" below mark carrying into high part of
568 # multiplication result, which can't overflow, because it
569 # can never be all ones.
571 mulld $acc1,$a1,$a0 # a[1]*a[0]
573 mulld $acc2,$a2,$a0 # a[2]*a[0]
575 mulld $acc3,$a3,$a0 # a[3]*a[0]
578 addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
579 mulld $t0,$a2,$a1 # a[2]*a[1]
582 mulld $t2,$a3,$a1 # a[3]*a[1]
584 addze $acc4,$acc4 # can't overflow
586 mulld $acc5,$a3,$a2 # a[3]*a[2]
589 addc $t1,$t1,$t2 # accumulate high parts of multiplication
590 addze $t2,$t3 # can't overflow
592 addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
595 addze $acc6,$acc6 # can't overflow
597 addc $acc1,$acc1,$acc1 # acc[1-6]*=2
598 adde $acc2,$acc2,$acc2
599 adde $acc3,$acc3,$acc3
600 adde $acc4,$acc4,$acc4
601 adde $acc5,$acc5,$acc5
602 adde $acc6,$acc6,$acc6
606 mulld $acc0,$a0,$a0 # a[0]*a[0]
608 mulld $t1,$a1,$a1 # a[1]*a[1]
610 mulld $t2,$a2,$a2 # a[2]*a[2]
612 mulld $t3,$a3,$a3 # a[3]*a[3]
614 addc $acc1,$acc1,$a0 # +a[i]*a[i]
624 for($i=0;$i<3;$i++) { # reductions, see commentary in
625 # multiplication for details
627 subfc $t2,$t0,$acc0 # "*0xffff0001"
629 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
633 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
634 addze $acc3,$t3 # can't overflow
638 subfc $t2,$t0,$acc0 # "*0xffff0001"
640 addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
642 adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
643 addze $acc3,$t3 # can't overflow
645 addc $acc0,$acc0,$acc4 # accumulate upper half
646 adde $acc1,$acc1,$acc5
647 adde $acc2,$acc2,$acc6
648 adde $acc3,$acc3,$acc7
652 addic $acc0,$acc0,1 # ret -= modulus
653 subfe $acc1,$poly1,$acc1
654 subfe $acc2,$t2,$acc2
655 subfe $acc3,$poly3,$acc3
656 subfe $acc4,$t2,$acc4
658 addc $acc0,$acc0,$acc4 # ret += modulus if borrow
672 .byte 0,12,0x14,0,0,0,1,0
674 .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
676 # Note that __ecp_nistz256_add expects both input vectors pre-loaded to
677 # $a0-$a3 and $t0-$t3. This is done because it's used in multiple
678 # contexts, e.g. in multiplication by 2 and 3...
679 .type __ecp_nistz256_add,\@function
682 addc $acc0,$acc0,$t0 # ret = a+b
689 # if a+b >= modulus, subtract modulus
691 # But since comparison implies subtraction, we subtract
692 # modulus and then add it back if subtraction borrowed.
695 subfe $acc1,$poly1,$acc1
696 subfe $acc2,$t2,$acc2
697 subfe $acc3,$poly3,$acc3
714 .byte 0,12,0x14,0,0,0,3,0
716 .size __ecp_nistz256_add,.-__ecp_nistz256_add
718 .type __ecp_nistz256_sub_from,\@function
720 __ecp_nistz256_sub_from:
725 subfc $acc0,$t0,$acc0 # ret = a-b
726 subfe $acc1,$t1,$acc1
727 subfe $acc2,$t2,$acc2
728 subfe $acc3,$t3,$acc3
729 subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
731 # if a-b borrowed, add modulus
733 addc $acc0,$acc0,$t0 # ret -= modulus & t0
747 .byte 0,12,0x14,0,0,0,3,0
749 .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
751 .type __ecp_nistz256_sub_morf,\@function
753 __ecp_nistz256_sub_morf:
758 subfc $acc0,$acc0,$t0 # ret = b-a
759 subfe $acc1,$acc1,$t1
760 subfe $acc2,$acc2,$t2
761 subfe $acc3,$acc3,$t3
762 subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
764 # if b-a borrowed, add modulus
766 addc $acc0,$acc0,$t0 # ret -= modulus & t0
780 .byte 0,12,0x14,0,0,0,3,0
782 .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
784 .type __ecp_nistz256_div_by_2,\@function
786 __ecp_nistz256_div_by_2:
788 addic $acc0,$acc0,-1 # a += modulus
790 adde $acc1,$acc1,$poly1
794 adde $acc3,$acc3,$poly3
796 addze $ap,$t2 # ap = carry
799 subfc $acc0,$t0,$acc0 # a -= modulus if a was even
800 subfe $acc1,$t1,$acc1
801 subfe $acc2,$t2,$acc2
802 subfe $acc3,$t3,$acc3
825 .byte 0,12,0x14,0,0,0,1,0
827 .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
829 ########################################################################
830 # following subroutines are "literal" implementation of those found in
833 ########################################################################
834 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
# Stack frame for point_double: 64-byte header, 4 x 32-byte scratch
# vectors, plus save area for the 12 callee-saved GPRs r20-r31
# (12*8 bytes; matches the std r20..r31 stores in the prologue below).
837 my $FRAME=64+32*4+12*8;
838 my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
839 # above map() describes stack layout with 4 temporary
840 # 256-bit vectors on top.
# Copies of the rp/ap arguments that must survive the helper calls.
841 my ($rp_real,$ap_real) = map("r$_",(20,21));
844 .globl ecp_nistz256_point_double
846 ecp_nistz256_point_double:
847 stdu $sp,-$FRAME($sp)
849 std r20,$FRAME-8*12($sp)
850 std r21,$FRAME-8*11($sp)
851 std r22,$FRAME-8*10($sp)
852 std r23,$FRAME-8*9($sp)
853 std r24,$FRAME-8*8($sp)
854 std r25,$FRAME-8*7($sp)
855 std r26,$FRAME-8*6($sp)
856 std r27,$FRAME-8*5($sp)
857 std r28,$FRAME-8*4($sp)
858 std r29,$FRAME-8*3($sp)
859 std r30,$FRAME-8*2($sp)
860 std r31,$FRAME-8*1($sp)
863 srdi $poly1,$poly1,32 # 0x00000000ffffffff
865 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
875 ld $a0,64($ap) # forward load for p256_sqr_mont
882 bl __ecp_nistz256_add # p256_mul_by_2(S, in_y);
885 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z);
891 mr $a0,$acc0 # put Zsqr aside for p256_sub
896 bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x);
899 mr $acc0,$a0 # restore Zsqr
903 ld $a0,$S+0($sp) # forward load for p256_sqr_mont
908 bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr);
911 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S);
920 bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y);
926 ld $a0,$S+0($sp) # forward load for p256_sqr_mont
931 bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0);
934 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S);
936 ld $bi,$Zsqr($sp) # forward load for p256_mul_mont
942 bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0);
946 bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr);
948 mr $t0,$acc0 # duplicate M
952 mr $a0,$acc0 # put M aside
957 bl __ecp_nistz256_add
958 mr $t0,$a0 # restore M
962 ld $bi,0($ap_real) # forward load for p256_mul_mont
967 bl __ecp_nistz256_add # p256_mul_by_3(M, M);
971 bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x);
977 ld $a0,$M+0($sp) # forward load for p256_sqr_mont
982 bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S);
985 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M);
988 bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0);
992 bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x);
995 mr $a0,$acc0 # copy S
1000 bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M);
1002 addi $bp,$rp_real,32
1003 addi $rp,$rp_real,32
1004 bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y);
1007 ld r20,$FRAME-8*12($sp)
1008 ld r21,$FRAME-8*11($sp)
1009 ld r22,$FRAME-8*10($sp)
1010 ld r23,$FRAME-8*9($sp)
1011 ld r24,$FRAME-8*8($sp)
1012 ld r25,$FRAME-8*7($sp)
1013 ld r26,$FRAME-8*6($sp)
1014 ld r27,$FRAME-8*5($sp)
1015 ld r28,$FRAME-8*4($sp)
1016 ld r29,$FRAME-8*3($sp)
1017 ld r30,$FRAME-8*2($sp)
1018 ld r31,$FRAME-8*1($sp)
1022 .byte 0,12,4,0,0x80,12,2,0
1024 .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
1028 ########################################################################
1029 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1030 # const P256_POINT *in2);
# Stack frame for point_add: 64-byte header, 12 x 32-byte scratch
# vectors, plus save area for the 16 callee-saved GPRs r16-r31
# (16*8 bytes; matches the std r16..r31 stores in the prologue below).
1032 my $FRAME = 64 + 32*12 + 16*8;
1033 my ($res_x,$res_y,$res_z,
1034 $H,$Hsqr,$R,$Rsqr,$Hcub,
1035 $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
# Z1sqr/Z2sqr are only needed before Hsqr/Rsqr are computed, so they
# alias the same stack slots.
1036 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1037 # above map() describes stack layout with 12 temporary
1038 # 256-bit vectors on top.
# Saved argument pointers plus the point-at-infinity masks; the masks
# are all-zeros/all-ones words (sradi ...,63 below) later used with
# and/andc in the conditional-move tail.
1039 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
1042 .globl ecp_nistz256_point_add
1044 ecp_nistz256_point_add:
1045 stdu $sp,-$FRAME($sp)
1047 std r16,$FRAME-8*16($sp)
1048 std r17,$FRAME-8*15($sp)
1049 std r18,$FRAME-8*14($sp)
1050 std r19,$FRAME-8*13($sp)
1051 std r20,$FRAME-8*12($sp)
1052 std r21,$FRAME-8*11($sp)
1053 std r22,$FRAME-8*10($sp)
1054 std r23,$FRAME-8*9($sp)
1055 std r24,$FRAME-8*8($sp)
1056 std r25,$FRAME-8*7($sp)
1057 std r26,$FRAME-8*6($sp)
1058 std r27,$FRAME-8*5($sp)
1059 std r28,$FRAME-8*4($sp)
1060 std r29,$FRAME-8*3($sp)
1061 std r30,$FRAME-8*2($sp)
1062 std r31,$FRAME-8*1($sp)
1065 srdi $poly1,$poly1,32 # 0x00000000ffffffff
1067 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
1069 ld $a0,64($bp) # in2_z
1078 or $in2infty,$t0,$t2
1080 or $in2infty,$in2infty,$t0
1081 sradi $in2infty,$in2infty,63 # !in2infty
1083 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z);
1085 ld $a0,64($ap_real) # in1_z
1091 or $in1infty,$t0,$t2
1093 or $in1infty,$in1infty,$t0
1094 sradi $in1infty,$in1infty,63 # !in1infty
1096 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
1099 ld $a0,$Z2sqr+0($sp)
1100 ld $a1,$Z2sqr+8($sp)
1101 ld $a2,$Z2sqr+16($sp)
1102 ld $a3,$Z2sqr+24($sp)
1103 addi $bp,$bp_real,64
1105 bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z);
1108 ld $a0,$Z1sqr+0($sp)
1109 ld $a1,$Z1sqr+8($sp)
1110 ld $a2,$Z1sqr+16($sp)
1111 ld $a3,$Z1sqr+24($sp)
1112 addi $bp,$ap_real,64
1114 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
1121 addi $bp,$ap_real,32
1123 bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y);
1130 addi $bp,$bp_real,32
1132 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
1135 ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont
1141 bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1);
1143 or $acc0,$acc0,$acc1 # see if result is zero
1144 or $acc2,$acc2,$acc3
1145 or $temp,$acc0,$acc2
1149 bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr);
1158 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr);
1161 ld $a0,$R+0($sp) # forward load for p256_sqr_mont
1166 bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1);
1168 or $acc0,$acc0,$acc1 # see if result is zero
1169 or $acc2,$acc2,$acc3
1170 or. $acc0,$acc0,$acc2
1171 bne .Ladd_proceed # is_equal(U1,U2)?
1173 and. $t0,$in1infty,$in2infty
1174 beq .Ladd_proceed # (in1infty || in2infty)?
1177 beq .Ladd_double # is_equal(S1,S2)?
1182 std $a0,16($rp_real)
1183 std $a0,24($rp_real)
1184 std $a0,32($rp_real)
1185 std $a0,40($rp_real)
1186 std $a0,48($rp_real)
1187 std $a0,56($rp_real)
1188 std $a0,64($rp_real)
1189 std $a0,72($rp_real)
1190 std $a0,80($rp_real)
1191 std $a0,88($rp_real)
1196 ld $bp,0($sp) # back-link
1199 ld r16,$FRAME-8*16($sp)
1200 ld r17,$FRAME-8*15($sp)
1201 ld r18,$FRAME-8*14($sp)
1202 ld r19,$FRAME-8*13($sp)
1203 stdu $bp,$FRAME-288($sp) # difference in stack frame sizes
1209 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
1216 addi $bp,$ap_real,64
1218 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
1225 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
1228 ld $a0,$res_z+0($sp)
1229 ld $a1,$res_z+8($sp)
1230 ld $a2,$res_z+16($sp)
1231 ld $a3,$res_z+24($sp)
1232 addi $bp,$bp_real,64
1234 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z);
1239 ld $a2,$Hsqr+16($sp)
1240 ld $a3,$Hsqr+24($sp)
1243 bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
1252 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr);
1259 bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
1263 bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
1266 bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
1269 ld $bi,$Hcub($sp) # forward load for p256_mul_mont
1275 bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
1279 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub);
1282 ld $a0,$res_y+0($sp)
1283 ld $a1,$res_y+8($sp)
1284 ld $a2,$res_y+16($sp)
1285 ld $a3,$res_y+24($sp)
1288 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
1291 bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
1293 ld $t0,0($bp_real) # in2
1297 ld $a0,$res_x+0($sp) # res
1298 ld $a1,$res_x+8($sp)
1299 ld $a2,$res_x+16($sp)
1300 ld $a3,$res_x+24($sp)
1302 for($i=0;$i<64;$i+=32) { # conditional moves
1304 ld $acc0,$i+0($ap_real) # in1
1305 ld $acc1,$i+8($ap_real)
1306 ld $acc2,$i+16($ap_real)
1307 ld $acc3,$i+24($ap_real)
1308 andc $t0,$t0,$in1infty
1309 andc $t1,$t1,$in1infty
1310 andc $t2,$t2,$in1infty
1311 andc $t3,$t3,$in1infty
1312 and $a0,$a0,$in1infty
1313 and $a1,$a1,$in1infty
1314 and $a2,$a2,$in1infty
1315 and $a3,$a3,$in1infty
1320 andc $acc0,$acc0,$in2infty
1321 andc $acc1,$acc1,$in2infty
1322 andc $acc2,$acc2,$in2infty
1323 andc $acc3,$acc3,$in2infty
1324 and $t0,$t0,$in2infty
1325 and $t1,$t1,$in2infty
1326 and $t2,$t2,$in2infty
1327 and $t3,$t3,$in2infty
1333 ld $t0,$i+32($bp_real) # in2
1334 ld $t1,$i+40($bp_real)
1335 ld $t2,$i+48($bp_real)
1336 ld $t3,$i+56($bp_real)
1337 ld $a0,$res_x+$i+32($sp)
1338 ld $a1,$res_x+$i+40($sp)
1339 ld $a2,$res_x+$i+48($sp)
1340 ld $a3,$res_x+$i+56($sp)
1341 std $acc0,$i+0($rp_real)
1342 std $acc1,$i+8($rp_real)
1343 std $acc2,$i+16($rp_real)
1344 std $acc3,$i+24($rp_real)
1348 ld $acc0,$i+0($ap_real) # in1
1349 ld $acc1,$i+8($ap_real)
1350 ld $acc2,$i+16($ap_real)
1351 ld $acc3,$i+24($ap_real)
1352 andc $t0,$t0,$in1infty
1353 andc $t1,$t1,$in1infty
1354 andc $t2,$t2,$in1infty
1355 andc $t3,$t3,$in1infty
1356 and $a0,$a0,$in1infty
1357 and $a1,$a1,$in1infty
1358 and $a2,$a2,$in1infty
1359 and $a3,$a3,$in1infty
1364 andc $acc0,$acc0,$in2infty
1365 andc $acc1,$acc1,$in2infty
1366 andc $acc2,$acc2,$in2infty
1367 andc $acc3,$acc3,$in2infty
1368 and $t0,$t0,$in2infty
1369 and $t1,$t1,$in2infty
1370 and $t2,$t2,$in2infty
1371 and $t3,$t3,$in2infty
1376 std $acc0,$i+0($rp_real)
1377 std $acc1,$i+8($rp_real)
1378 std $acc2,$i+16($rp_real)
1379 std $acc3,$i+24($rp_real)
1383 ld r16,$FRAME-8*16($sp)
1384 ld r17,$FRAME-8*15($sp)
1385 ld r18,$FRAME-8*14($sp)
1386 ld r19,$FRAME-8*13($sp)
1387 ld r20,$FRAME-8*12($sp)
1388 ld r21,$FRAME-8*11($sp)
1389 ld r22,$FRAME-8*10($sp)
1390 ld r23,$FRAME-8*9($sp)
1391 ld r24,$FRAME-8*8($sp)
1392 ld r25,$FRAME-8*7($sp)
1393 ld r26,$FRAME-8*6($sp)
1394 ld r27,$FRAME-8*5($sp)
1395 ld r28,$FRAME-8*4($sp)
1396 ld r29,$FRAME-8*3($sp)
1397 ld r30,$FRAME-8*2($sp)
1398 ld r31,$FRAME-8*1($sp)
1402 .byte 0,12,4,0,0x80,16,3,0
1404 .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1408 ########################################################################
1409 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1410 # const P256_POINT_AFFINE *in2);
# Stack frame for point_add_affine: 64-byte header, 10 x 32-byte
# scratch vectors, plus save area for the 16 callee-saved GPRs r16-r31
# (16*8 bytes; matches the std r16..r31 stores in the prologue below).
1412 my $FRAME = 64 + 32*10 + 16*8;
# NOTE(review): $Z1sqr is referenced later but declared on an elided
# line (presumably aliasing one of these slots) -- confirm against the
# full source.
1413 my ($res_x,$res_y,$res_z,
1414 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
1416 # above map() describes stack layout with 10 temporary
1417 # 256-bit vectors on top.
# Saved argument pointers plus the infinity AND-masks, as in point_add.
1418 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
1421 .globl ecp_nistz256_point_add_affine
1423 ecp_nistz256_point_add_affine:
1424 stdu $sp,-$FRAME($sp)
1426 std r16,$FRAME-8*16($sp)
1427 std r17,$FRAME-8*15($sp)
1428 std r18,$FRAME-8*14($sp)
1429 std r19,$FRAME-8*13($sp)
1430 std r20,$FRAME-8*12($sp)
1431 std r21,$FRAME-8*11($sp)
1432 std r22,$FRAME-8*10($sp)
1433 std r23,$FRAME-8*9($sp)
1434 std r24,$FRAME-8*8($sp)
1435 std r25,$FRAME-8*7($sp)
1436 std r26,$FRAME-8*6($sp)
1437 std r27,$FRAME-8*5($sp)
1438 std r28,$FRAME-8*4($sp)
1439 std r29,$FRAME-8*3($sp)
1440 std r30,$FRAME-8*2($sp)
1441 std r31,$FRAME-8*1($sp)
1444 srdi $poly1,$poly1,32 # 0x00000000ffffffff
1446 orc $poly3,$poly3,$poly1 # 0xffffffff00000001
1452 ld $a0,64($ap) # in1_z
1458 or $in1infty,$t0,$t2
1460 or $in1infty,$in1infty,$t0
1461 sradi $in1infty,$in1infty,63 # !in1infty
1463 ld $acc0,0($bp) # in2_x
1467 ld $t0,32($bp) # in2_y
1471 or $acc0,$acc0,$acc1
1472 or $acc2,$acc2,$acc3
1473 or $acc0,$acc0,$acc2
1477 or $in2infty,$acc0,$t0
1479 or $in2infty,$in2infty,$t0
1480 sradi $in2infty,$in2infty,63 # !in2infty
1483 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
1492 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, Z1sqr, in2_x);
1495 ld $bi,64($ap_real) # forward load for p256_mul_mont
1496 ld $a0,$Z1sqr+0($sp)
1497 ld $a1,$Z1sqr+8($sp)
1498 ld $a2,$Z1sqr+16($sp)
1499 ld $a3,$Z1sqr+24($sp)
1501 bl __ecp_nistz256_sub_from # p256_sub(H, U2, in1_x);
1503 addi $bp,$ap_real,64
1505 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
1512 addi $bp,$ap_real,64
1514 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
1521 addi $bp,$bp_real,32
1523 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
1525 addi $bp,$ap_real,32
1526 ld $a0,$H+0($sp) # forward load for p256_sqr_mont
1531 bl __ecp_nistz256_sub_from # p256_sub(R, S2, in1_y);
1534 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
1541 bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
1546 ld $a2,$Hsqr+16($sp)
1547 ld $a3,$Hsqr+24($sp)
1550 bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
1555 ld $a2,$Hsqr+16($sp)
1556 ld $a3,$Hsqr+24($sp)
1559 bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in1_x, Hsqr);
1566 bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
1570 bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
1573 bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
1576 ld $bi,32($ap_real) # forward load for p256_mul_mont
1579 ld $a2,$Hcub+16($sp)
1580 ld $a3,$Hcub+24($sp)
1582 bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
1584 addi $bp,$ap_real,32
1586 bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, in1_y, Hcub);
1589 ld $a0,$res_y+0($sp)
1590 ld $a1,$res_y+8($sp)
1591 ld $a2,$res_y+16($sp)
1592 ld $a3,$res_y+24($sp)
1595 bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
1598 bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
1600 ld $t0,0($bp_real) # in2
1604 ld $a0,$res_x+0($sp) # res
1605 ld $a1,$res_x+8($sp)
1606 ld $a2,$res_x+16($sp)
1607 ld $a3,$res_x+24($sp)
1609 for($i=0;$i<64;$i+=32) { # conditional moves
1611 ld $acc0,$i+0($ap_real) # in1
1612 ld $acc1,$i+8($ap_real)
1613 ld $acc2,$i+16($ap_real)
1614 ld $acc3,$i+24($ap_real)
1615 andc $t0,$t0,$in1infty
1616 andc $t1,$t1,$in1infty
1617 andc $t2,$t2,$in1infty
1618 andc $t3,$t3,$in1infty
1619 and $a0,$a0,$in1infty
1620 and $a1,$a1,$in1infty
1621 and $a2,$a2,$in1infty
1622 and $a3,$a3,$in1infty
1627 andc $acc0,$acc0,$in2infty
1628 andc $acc1,$acc1,$in2infty
1629 andc $acc2,$acc2,$in2infty
1630 andc $acc3,$acc3,$in2infty
1631 and $t0,$t0,$in2infty
1632 and $t1,$t1,$in2infty
1633 and $t2,$t2,$in2infty
1634 and $t3,$t3,$in2infty
1640 $code.=<<___ if ($i==0);
1641 ld $t0,32($bp_real) # in2
1646 $code.=<<___ if ($i==32);
1647 li $t0,1 # Lone_mont
1653 ld $a0,$res_x+$i+32($sp)
1654 ld $a1,$res_x+$i+40($sp)
1655 ld $a2,$res_x+$i+48($sp)
1656 ld $a3,$res_x+$i+56($sp)
1657 std $acc0,$i+0($rp_real)
1658 std $acc1,$i+8($rp_real)
1659 std $acc2,$i+16($rp_real)
1660 std $acc3,$i+24($rp_real)
1664 ld $acc0,$i+0($ap_real) # in1
1665 ld $acc1,$i+8($ap_real)
1666 ld $acc2,$i+16($ap_real)
1667 ld $acc3,$i+24($ap_real)
1668 andc $t0,$t0,$in1infty
1669 andc $t1,$t1,$in1infty
1670 andc $t2,$t2,$in1infty
1671 andc $t3,$t3,$in1infty
1672 and $a0,$a0,$in1infty
1673 and $a1,$a1,$in1infty
1674 and $a2,$a2,$in1infty
1675 and $a3,$a3,$in1infty
1680 andc $acc0,$acc0,$in2infty
1681 andc $acc1,$acc1,$in2infty
1682 andc $acc2,$acc2,$in2infty
1683 andc $acc3,$acc3,$in2infty
1684 and $t0,$t0,$in2infty
1685 and $t1,$t1,$in2infty
1686 and $t2,$t2,$in2infty
1687 and $t3,$t3,$in2infty
1692 std $acc0,$i+0($rp_real)
1693 std $acc1,$i+8($rp_real)
1694 std $acc2,$i+16($rp_real)
1695 std $acc3,$i+24($rp_real)
1698 ld r16,$FRAME-8*16($sp)
1699 ld r17,$FRAME-8*15($sp)
1700 ld r18,$FRAME-8*14($sp)
1701 ld r19,$FRAME-8*13($sp)
1702 ld r20,$FRAME-8*12($sp)
1703 ld r21,$FRAME-8*11($sp)
1704 ld r22,$FRAME-8*10($sp)
1705 ld r23,$FRAME-8*9($sp)
1706 ld r24,$FRAME-8*8($sp)
1707 ld r25,$FRAME-8*7($sp)
1708 ld r26,$FRAME-8*6($sp)
1709 ld r27,$FRAME-8*5($sp)
1710 ld r28,$FRAME-8*4($sp)
1711 ld r29,$FRAME-8*3($sp)
1712 ld r30,$FRAME-8*2($sp)
1713 ld r31,$FRAME-8*1($sp)
1717 .byte 0,12,4,0,0x80,16,3,0
1719 .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
# Registers for the ord_* (group-order arithmetic) routines: $ord0-$ord3
# hold the four order limbs (0xf3b9cac2fc632551, 0xbce6faada7179e84,
# 0xffffffffffffffff, 0xffffffff00000000, built at each entry below)
# and $ordk the reduction multiplier 0xccd1c8aaee00bc4f used as
# mulld $t4,$acc0,$ordk per iteration.  NOTE(review): presumably
# $ordk = -1/n mod 2^64 (Montgomery n0') -- confirm.
1723 my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
# $poly1/$poly3 are unused here, so reuse them for the top order limbs;
# $zr names r0, used as a zero source in the carry chains below.
1724 my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");
1727 ########################################################################
1728 # void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
1730 .globl ecp_nistz256_ord_mul_mont
1732 ecp_nistz256_ord_mul_mont:
1758 ori $ordk,$ordk,0xc8aa
1759 ori $ord0,$ord0,0xcac2
1760 ori $ord1,$ord1,0xfaad
1764 oris $ordk,$ordk,0xee00
1765 oris $ord0,$ord0,0xfc63
1766 oris $ord1,$ord1,0xa717
1767 ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
1768 ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
1769 ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
1770 li $ord2,-1 # 0xffffffffffffffff
1771 sldi $ord3,$ord2,32 # 0xffffffff00000000
1774 mulld $acc0,$a0,$bi # a[0]*b[0]
1777 mulld $acc1,$a1,$bi # a[1]*b[0]
1780 mulld $acc2,$a2,$bi # a[2]*b[0]
1783 mulld $acc3,$a3,$bi # a[3]*b[0]
1784 mulhdu $acc4,$a3,$bi
1786 mulld $t4,$acc0,$ordk
1788 addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
1789 adde $acc2,$acc2,$t1
1790 adde $acc3,$acc3,$t2
1794 for ($i=1;$i<4;$i++) {
1795 ################################################################
1796 # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
1798 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1800 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1803 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1804 # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
1805 # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
1807 ld $bi,8*$i($bp) # b[i]
1810 subfc $acc2,$t4,$acc2
1812 subfe $acc3,$t0,$acc3
1813 subfe $acc4,$t1,$acc4
1814 subfe $acc5,$zr,$acc5
1816 addic $t0,$acc0,-1 # discarded
1817 mulhdu $t1,$ord0,$t4
1819 mulhdu $t3,$ord1,$t4
1826 addc $acc0,$acc1,$t2
1828 adde $acc1,$acc2,$t3
1830 adde $acc2,$acc3,$t4
1831 adde $acc3,$acc4,$t4
1834 addc $acc0,$acc0,$t0 # accumulate low parts
1836 adde $acc1,$acc1,$t1
1838 adde $acc2,$acc2,$t2
1840 adde $acc3,$acc3,$t3
1843 mulld $t4,$acc0,$ordk
1844 addc $acc1,$acc1,$t0 # accumulate high parts
1845 adde $acc2,$acc2,$t1
1846 adde $acc3,$acc3,$t2
1847 adde $acc4,$acc4,$t3
1852 sldi $t0,$t4,32 # last reduction
1853 subfc $acc2,$t4,$acc2
1855 subfe $acc3,$t0,$acc3
1856 subfe $acc4,$t1,$acc4
1857 subfe $acc5,$zr,$acc5
1859 addic $t0,$acc0,-1 # discarded
1860 mulhdu $t1,$ord0,$t4
1862 mulhdu $t3,$ord1,$t4
1867 addc $acc0,$acc1,$t2
1868 adde $acc1,$acc2,$t3
1869 adde $acc2,$acc3,$t4
1870 adde $acc3,$acc4,$t4
1873 subfc $acc0,$ord0,$acc0 # ret -= modulus
1874 subfe $acc1,$ord1,$acc1
1875 subfe $acc2,$ord2,$acc2
1876 subfe $acc3,$ord3,$acc3
1877 subfe $acc4,$zr,$acc4
1881 addc $acc0,$acc0,$t0 # ret += modulus if borrow
1883 adde $acc1,$acc1,$t1
1884 adde $acc2,$acc2,$acc4
1885 adde $acc3,$acc3,$t3
1909 .byte 0,12,4,0,0x80,14,3,0
1911 .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
1913 ################################################################################
1914 # void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
1916 .globl ecp_nistz256_ord_sqr_mont
1918 ecp_nistz256_ord_sqr_mont:
1945 ori $ordk,$ordk,0xc8aa
1946 ori $ord0,$ord0,0xcac2
1947 ori $ord1,$ord1,0xfaad
1951 oris $ordk,$ordk,0xee00
1952 oris $ord0,$ord0,0xfc63
1953 oris $ord1,$ord1,0xa717
1954 ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
1955 ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
1956 ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
1957 li $ord2,-1 # 0xffffffffffffffff
1958 sldi $ord3,$ord2,32 # 0xffffffff00000000
1964 ################################################################
1965 # | | | | | |a1*a0| |
1966 # | | | | |a2*a0| | |
1967 # | |a3*a2|a3*a0| | | |
1968 # | | | |a2*a1| | | |
1969 # | | |a3*a1| | | | |
1970 # *| | | | | | | | 2|
1971 # +|a3*a3|a2*a2|a1*a1|a0*a0|
1972 # |--+--+--+--+--+--+--+--|
1973 # |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1975 # "can't overflow" below mark carrying into high part of
1976 # multiplication result, which can't overflow, because it
1977 # can never be all ones.
# Off-diagonal products a[i]*a[j] (i>j): low halves go straight into
# $acc1..$acc5, high halves are folded in via the carry chains below.
1979 mulld $acc1,$a1,$a0 # a[1]*a[0]
1981 mulld $acc2,$a2,$a0 # a[2]*a[0]
1983 mulld $acc3,$a3,$a0 # a[3]*a[0]
1984 mulhdu $acc4,$a3,$a0
1986 addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
1987 mulld $t0,$a2,$a1 # a[2]*a[1]
1989 adde $acc3,$acc3,$t2
1990 mulld $t2,$a3,$a1 # a[3]*a[1]
1992 addze $acc4,$acc4 # can't overflow
1994 mulld $acc5,$a3,$a2 # a[3]*a[2]
1995 mulhdu $acc6,$a3,$a2
1997 addc $t1,$t1,$t2 # accumulate high parts of multiplication
1998 mulld $acc0,$a0,$a0 # a[0]*a[0]
1999 addze $t2,$t3 # can't overflow
2001 addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
2003 adde $acc4,$acc4,$t1
2004 mulld $t1,$a1,$a1 # a[1]*a[1]
2005 adde $acc5,$acc5,$t2
2007 addze $acc6,$acc6 # can't overflow
# Double the off-diagonal sum (the "*2" row of the diagram above);
# diagonal squares a[i]*a[i] are interleaved to hide multiply latency.
2009 addc $acc1,$acc1,$acc1 # acc[1-6]*=2
2010 mulld $t2,$a2,$a2 # a[2]*a[2]
2011 adde $acc2,$acc2,$acc2
2013 adde $acc3,$acc3,$acc3
2014 mulld $t3,$a3,$a3 # a[3]*a[3]
2015 adde $acc4,$acc4,$acc4
2017 adde $acc5,$acc5,$acc5
2018 adde $acc6,$acc6,$acc6
# Fold in the diagonal terms; $t4 = acc0*$ordk starts the Montgomery
# reduction one step ahead of the loop that follows.
2021 addc $acc1,$acc1,$a0 # +a[i]*a[i]
2022 mulld $t4,$acc0,$ordk
2023 adde $acc2,$acc2,$t1
2024 adde $acc3,$acc3,$a1
2025 adde $acc4,$acc4,$t2
2026 adde $acc5,$acc5,$a2
2027 adde $acc6,$acc6,$t3
2028 adde $acc7,$acc7,$a3
# Four word-by-word Montgomery reduction steps, one per limb; $t4 holds
# the current step's reduction multiplier acc0*$ordk (computed a step
# ahead), and ($t3,$t4) are rotated so the next multiplier lands in $t4.
2030 for($i=0; $i<4; $i++) { # reductions
2032 addic $t0,$acc0,-1 # discarded
2033 mulhdu $t1,$ord0,$t4
2035 mulhdu $t3,$ord1,$t4
2040 addc $acc0,$acc1,$t2
2041 adde $acc1,$acc2,$t3
2042 adde $acc2,$acc3,$t4
2043 adde $acc3,$zr,$t4 # can't overflow
2045 $code.=<<___ if ($i<3);
2046 mulld $t3,$acc0,$ordk
# Exploit the special form of the top order words (all-ones / hi-32-ones):
# subtract shifted copies of $t4 instead of doing two more multiplies.
2050 subfc $acc1,$t4,$acc1
2052 subfe $acc2,$t0,$acc2
2053 subfe $acc3,$t1,$acc3 # can't borrow
# Swap scratch registers so the multiplier computed into $t3 above is
# named $t4 on the next loop iteration.
2055 ($t3,$t4) = ($t4,$t3);
2058 addc $acc0,$acc0,$acc4 # accumulate upper half
2059 adde $acc1,$acc1,$acc5
2060 adde $acc2,$acc2,$acc6
2061 adde $acc3,$acc3,$acc7
# Final conditional subtraction of the group order; the masked add-back
# words $t0..$t3 are prepared from the borrow (setup lines not shown).
2064 subfc $acc0,$ord0,$acc0 # ret -= modulus
2065 subfe $acc1,$ord1,$acc1
2066 subfe $acc2,$ord2,$acc2
2067 subfe $acc3,$ord3,$acc3
2068 subfe $acc4,$zr,$acc4
2072 addc $a0,$acc0,$t0 # ret += modulus if borrow
# FIX: third limb of the add-back must use the masked modulus word $t2
# (parallel to $t0 on the addc above and to the identical sequence in
# ecp_nistz256_ord_mul_mont), not the borrow register $acc4.
2075 adde $a2,$acc2,$t2
2102 .byte 0,12,4,0,0x80,14,3,0
2104 .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
2108 ########################################################################
2109 # scatter-gather subroutines
# Argument registers for the scatter/gather helpers (PPC ABI: r3..).
2111 my ($out,$inp,$index,$mask)=map("r$_",(3..7));
2113 ########################################################################
2114 # void ecp_nistz256_scatter_w7(void *out, const P256_POINT *inp,
2116 .globl ecp_nistz256_scatter_w5
2118 ecp_nistz256_scatter_w5:
# out += index*4: each stw below stores one 32-bit half per 64-byte
# stride, so consecutive indices interleave at 4-byte granularity.
2119 slwi $index,$index,2
2120 add $out,$out,$index
# First coordinate: each 64-bit word is split into two stw's 4*64 bytes
# apart (the srdi r*,r*,32 lines between stores are not shown here).
2127 stw r8, 64*0-4($out)
2129 stw r9, 64*1-4($out)
2131 stw r10,64*2-4($out)
2133 stw r11,64*3-4($out)
2135 stw r8, 64*4-4($out)
2136 stw r9, 64*5-4($out)
2137 stw r10,64*6-4($out)
2138 stw r11,64*7-4($out)
# Second coordinate, same split-store pattern.
2146 stw r8, 64*0-4($out)
2148 stw r9, 64*1-4($out)
2150 stw r10,64*2-4($out)
2152 stw r11,64*3-4($out)
2154 stw r8, 64*4-4($out)
2155 stw r9, 64*5-4($out)
2156 stw r10,64*6-4($out)
2157 stw r11,64*7-4($out)
# Third coordinate, same split-store pattern.
2165 stw r8, 64*0-4($out)
2167 stw r9, 64*1-4($out)
2169 stw r10,64*2-4($out)
2171 stw r11,64*3-4($out)
2173 stw r8, 64*4-4($out)
2174 stw r9, 64*5-4($out)
2175 stw r10,64*6-4($out)
2176 stw r11,64*7-4($out)
2180 .byte 0,12,0x14,0,0,0,3,0
2182 .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2184 ########################################################################
2185 # void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
2187 .globl ecp_nistz256_gather_w5
2189 ecp_nistz256_gather_w5:
# r0 is presumably the sign-derived mask from $index (neg/sradi setup
# lines not shown) — index is biased so that index==0 selects nothing.
2193 add $index,$index,r0
2194 slwi $index,$index,2
2195 add $inp,$inp,$index
2276 .byte 0,12,0x14,0,0,0,3,0
2278 .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2280 ########################################################################
2281 # void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
2283 .globl ecp_nistz256_scatter_w7
2285 ecp_nistz256_scatter_w7:
# w7 table entries are addressed at byte granularity: out += index,
# then a counted loop (body not shown) stores the point byte-wise.
2288 add $out,$out,$index
2309 bdnz .Loop_scatter_w7
2313 .byte 0,12,0x14,0,0,0,3,0
2315 .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2317 ########################################################################
2318 # void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
2320 .globl ecp_nistz256_gather_w7
2322 ecp_nistz256_gather_w7:
# r0 is presumably the sign-derived mask from $index (setup lines not
# shown); no slwi here — w7 entries are laid out at 1-byte stride.
2328 add $index,$index,r0
2329 add $inp,$inp,$index
2360 bdnz .Loop_gather_w7
2364 .byte 0,12,0x14,0,0,0,3,0
2366 .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
# Post-process the accumulated assembly: evaluate any `...` expressions
# embedded in $code before printing it through the xlate pipe.
2370 foreach (split("\n",$code)) {
2371 s/\`([^\`]*)\`/eval $1/ge;
# STDOUT is a pipe to the xlate filter (see open OUT above); buffered
# write and pipe errors only surface at close, so the status must be
# checked or a truncated .s file would go unnoticed.
2375 close STDOUT or die "error closing STDOUT: $!"; # enforce flush