2 # Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
3 # Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4 # Copyright (c) 2015 CloudFlare, Inc.
6 # Licensed under the OpenSSL license (the "License"). You may not use
7 # this file except in compliance with the License. You can obtain a copy
8 # in the file LICENSE in the source distribution or at
9 # https://www.openssl.org/source/license.html
11 # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
12 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
13 # (2) University of Haifa, Israel
14 # (3) CloudFlare, Inc.
17 # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
20 # Further optimization by <appro@openssl.org>:
22 # this/original with/without -DECP_NISTZ256_ASM(*)
23 # Opteron +15-49% +150-195%
24 # Bulldozer +18-45% +175-240%
25 # P4 +24-46% +100-150%
26 # Westmere +18-34% +87-160%
27 # Sandy Bridge +14-35% +120-185%
28 # Ivy Bridge +11-35% +125-180%
29 # Haswell +10-37% +160-200%
30 # Broadwell +24-58% +210-270%
31 # Atom +20-50% +180-240%
32 # VIA Nano +50-160% +480-480%
34 # (*) "without -DECP_NISTZ256_ASM" refers to build with
35 # "enable-ec_nistp_64_gcc_128";
37 # Ranges denote minimum and maximum improvement coefficients depending
38 # on benchmark. In "this/original" column lower coefficient is for
39 # ECDSA sign, while in "with/without" - for ECDH key agreement, and
40 # higher - for ECDSA sign, relatively fastest server-side operation.
41 # Keep in mind that +100% means 2x improvement.
45 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
47 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52 die "can't locate x86_64-xlate.pl";
54 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
57 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
58 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
59 $avx = ($1>=2.19) + ($1>=2.22);
63 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65 $avx = ($1>=2.09) + ($1>=2.10);
69 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
71 $avx = ($1>=10) + ($1>=11);
75 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
76 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
77 $avx = ($ver>=3.0) + ($ver>=3.01);
83 .extern OPENSSL_ia32cap_P
88 .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
90 # 2^512 mod P precomputed for NIST P256 polynomial
92 .quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
101 .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
103 # Constants for computations modulo ord(p256)
105 .quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
107 .quad 0xccd1c8aaee00bc4f
111 ################################################################################
112 # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
114 my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
115 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
116 my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
120 .globl ecp_nistz256_mul_by_2
121 .type ecp_nistz256_mul_by_2,\@function,2
123 ecp_nistz256_mul_by_2:
134 add $a0, $a0 # a0:a3+a0:a3
138 lea .Lpoly(%rip), $a_ptr
167 .cfi_adjust_cfa_offset -16
171 .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
173 ################################################################################
174 # void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
175 .globl ecp_nistz256_div_by_2
176 .type ecp_nistz256_div_by_2,\@function,2
178 ecp_nistz256_div_by_2:
191 lea .Lpoly(%rip), $a_ptr
202 xor $a_ptr, $a_ptr # borrow $a_ptr
211 mov $a1, $t0 # a0:a3>>1
237 .cfi_adjust_cfa_offset -16
241 .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
243 ################################################################################
244 # void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
245 .globl ecp_nistz256_mul_by_3
246 .type ecp_nistz256_mul_by_3,\@function,2
248 ecp_nistz256_mul_by_3:
259 add $a0, $a0 # a0:a3+a0:a3
271 sbb .Lpoly+8*1(%rip), $a1
274 sbb .Lpoly+8*3(%rip), $a3
283 add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3]
293 sbb .Lpoly+8*1(%rip), $a1
296 sbb .Lpoly+8*3(%rip), $a3
313 .cfi_adjust_cfa_offset -16
317 .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
319 ################################################################################
320 # void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
321 .globl ecp_nistz256_add
322 .type ecp_nistz256_add,\@function,3
337 lea .Lpoly(%rip), $a_ptr
369 .cfi_adjust_cfa_offset -16
373 .size ecp_nistz256_add,.-ecp_nistz256_add
375 ################################################################################
376 # void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
377 .globl ecp_nistz256_sub
378 .type ecp_nistz256_sub,\@function,3
393 lea .Lpoly(%rip), $a_ptr
425 .cfi_adjust_cfa_offset -16
429 .size ecp_nistz256_sub,.-ecp_nistz256_sub
431 ################################################################################
432 # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
433 .globl ecp_nistz256_neg
434 .type ecp_nistz256_neg,\@function,2
455 lea .Lpoly(%rip), $a_ptr
481 .cfi_adjust_cfa_offset -16
485 .size ecp_nistz256_neg,.-ecp_nistz256_neg
489 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
490 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
491 my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
492 my ($poly1,$poly3)=($acc6,$acc7);
495 ################################################################################
496 # void ecp_nistz256_ord_mul_mont(
501 .globl ecp_nistz256_ord_mul_mont
502 .type ecp_nistz256_ord_mul_mont,\@function,3
504 ecp_nistz256_ord_mul_mont:
507 $code.=<<___ if ($addx);
509 and OPENSSL_ia32cap_P+8(%rip), %ecx
511 je .Lecp_nistz256_ord_mul_montx
528 mov 8*0($b_org), %rax
530 lea .Lord(%rip), %r14
531 mov .LordK(%rip), %r15
533 ################################# * b[0]
561 ################################# First reduction step
564 add %rax, $acc5 # guaranteed to be zero
570 sbb \$0, $acc0 # can't borrow
579 adc \$0, $acc0 # can't overflow
584 mov 8*1($b_ptr), %rax
585 sbb %rdx, $t1 # can't borrow
591 ################################# * b[1]
627 ################################# Second reduction step
630 add %rax, $t0 # guaranteed to be zero
635 sbb \$0, $acc1 # can't borrow
644 adc \$0, $acc1 # can't overflow
649 mov 8*2($b_ptr), %rax
650 sbb %rdx, $t1 # can't borrow
656 ################################## * b[2]
692 ################################# Third reduction step
695 add %rax, $t0 # guaranteed to be zero
700 sbb \$0, $acc2 # can't borrow
709 adc \$0, $acc2 # can't overflow
714 mov 8*3($b_ptr), %rax
715 sbb %rdx, $t1 # can't borrow
721 ################################# * b[3]
757 ################################# Last reduction step
760 add %rax, $t0 # guaranteed to be zero
765 sbb \$0, $acc3 # can't borrow
774 adc \$0, $acc3 # can't overflow
779 sbb %rdx, $t1 # can't borrow
785 ################################# Subtract ord
801 mov $acc4, 8*0($r_ptr)
802 mov $acc5, 8*1($r_ptr)
803 mov $acc0, 8*2($r_ptr)
804 mov $acc1, 8*3($r_ptr)
819 .cfi_adjust_cfa_offset -48
823 .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
825 ################################################################################
826 # void ecp_nistz256_ord_sqr_mont(
831 .globl ecp_nistz256_ord_sqr_mont
832 .type ecp_nistz256_ord_sqr_mont,\@function,3
834 ecp_nistz256_ord_sqr_mont:
837 $code.=<<___ if ($addx);
839 and OPENSSL_ia32cap_P+8(%rip), %ecx
841 je .Lecp_nistz256_ord_sqr_montx
858 mov 8*0($a_ptr), $acc0
859 mov 8*1($a_ptr), %rax
860 mov 8*2($a_ptr), $acc6
861 mov 8*3($a_ptr), $acc7
862 lea .Lord(%rip), $a_ptr # pointer to modulus
868 ################################# a[1:] * a[0]
869 mov %rax, $t1 # put aside a[1]
870 mul $acc0 # a[1] * a[0]
872 movq $t1, %xmm1 # offload a[1]
876 mul $acc0 # a[2] * a[0]
879 movq $acc6, %xmm2 # offload a[2]
883 mul $acc0 # a[3] * a[0]
886 movq $acc7, %xmm3 # offload a[3]
890 ################################# a[3] * a[2]
891 mul $acc6 # a[3] * a[2]
896 ################################# a[2:] * a[1]
897 mul $t1 # a[2] * a[1]
903 mul $t1 # a[3] * a[1]
909 adc \$0, $acc6 # can't overflow
911 ################################# *2
922 ################################# Missing products
923 mul %rax # a[0] * a[0]
928 mul %rax # a[1] * a[1]
935 mul %rax # a[2] * a[2]
943 imulq 8*4($a_ptr), $acc0 # *= .LordK
945 mul %rax # a[3] * a[3]
948 mov 8*0($a_ptr), %rax # modulus[0]
949 adc %rdx, $acc7 # can't overflow
951 ################################# First reduction step
954 add %rax, $t0 # guaranteed to be zero
955 mov 8*1($a_ptr), %rax # modulus[1]
959 sbb \$0, $t1 # can't borrow
968 adc \$0, $t1 # can't overflow
971 imulq 8*4($a_ptr), $acc1 # *= .LordK
976 mov 8*0($a_ptr), %rax
977 sbb %rdx, $acc0 # can't borrow
980 adc \$0, $acc0 # can't overflow
982 ################################# Second reduction step
985 add %rax, $t0 # guaranteed to be zero
986 mov 8*1($a_ptr), %rax
990 sbb \$0, $t1 # can't borrow
999 adc \$0, $t1 # can't overflow
1002 imulq 8*4($a_ptr), $acc2 # *= .LordK
1007 mov 8*0($a_ptr), %rax
1008 sbb %rdx, $acc1 # can't borrow
1011 adc \$0, $acc1 # can't overflow
1013 ################################# Third reduction step
1016 add %rax, $t0 # guaranteed to be zero
1017 mov 8*1($a_ptr), %rax
1021 sbb \$0, $t1 # can't borrow
1030 adc \$0, $t1 # can't overflow
1033 imulq 8*4($a_ptr), $acc3 # *= .LordK
1038 mov 8*0($a_ptr), %rax
1039 sbb %rdx, $acc2 # can't borrow
1042 adc \$0, $acc2 # can't overflow
1044 ################################# Last reduction step
1047 add %rax, $t0 # guaranteed to be zero
1048 mov 8*1($a_ptr), %rax
1052 sbb \$0, $t1 # can't borrow
1061 adc \$0, $t1 # can't overflow
1066 sbb %rdx, $acc3 # can't borrow
1069 adc \$0, $acc3 # can't overflow
1071 ################################# Add bits [511:256] of the sqr result
1081 ################################# Compare to modulus
1082 sub 8*0($a_ptr), $acc0
1084 sbb 8*1($a_ptr), $acc1
1085 sbb 8*2($a_ptr), $acc2
1087 sbb 8*3($a_ptr), $acc3
1098 mov $acc0, 8*0($r_ptr)
1099 mov %rax, 8*1($r_ptr)
1101 mov $acc6, 8*2($r_ptr)
1103 mov $acc7, 8*3($r_ptr)
1119 .cfi_adjust_cfa_offset -48
1123 .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1126 $code.=<<___ if ($addx);
1127 ################################################################################
1128 .type ecp_nistz256_ord_mul_montx,\@function,3
1130 ecp_nistz256_ord_mul_montx:
1132 .Lecp_nistz256_ord_mul_montx:
1148 mov 8*0($b_org), %rdx
1149 mov 8*0($a_ptr), $acc1
1150 mov 8*1($a_ptr), $acc2
1151 mov 8*2($a_ptr), $acc3
1152 mov 8*3($a_ptr), $acc4
1153 lea -128($a_ptr), $a_ptr # control u-op density
1154 lea .Lord-128(%rip), %r14
1155 mov .LordK(%rip), %r15
1157 ################################# Multiply by b[0]
1158 mulx $acc1, $acc0, $acc1
1159 mulx $acc2, $t0, $acc2
1160 mulx $acc3, $t1, $acc3
1162 mulx $acc4, $t0, $acc4
1164 mulx %r15, %rdx, %rax
1169 ################################# reduction
1170 xor $acc5, $acc5 # $acc5=0, cf=0, of=0
1171 mulx 8*0+128(%r14), $t0, $t1
1172 adcx $t0, $acc0 # guaranteed to be zero
1175 mulx 8*1+128(%r14), $t0, $t1
1179 mulx 8*2+128(%r14), $t0, $t1
1183 mulx 8*3+128(%r14), $t0, $t1
1184 mov 8*1($b_ptr), %rdx
1189 adc \$0, $acc5 # cf=0, of=0
1191 ################################# Multiply by b[1]
1192 mulx 8*0+128($a_ptr), $t0, $t1
1196 mulx 8*1+128($a_ptr), $t0, $t1
1200 mulx 8*2+128($a_ptr), $t0, $t1
1204 mulx 8*3+128($a_ptr), $t0, $t1
1206 mulx %r15, %rdx, %rax
1212 adc \$0, $acc0 # cf=0, of=0
1214 ################################# reduction
1215 mulx 8*0+128(%r14), $t0, $t1
1216 adcx $t0, $acc1 # guaranteed to be zero
1219 mulx 8*1+128(%r14), $t0, $t1
1223 mulx 8*2+128(%r14), $t0, $t1
1227 mulx 8*3+128(%r14), $t0, $t1
1228 mov 8*2($b_ptr), %rdx
1233 adc \$0, $acc0 # cf=0, of=0
1235 ################################# Multiply by b[2]
1236 mulx 8*0+128($a_ptr), $t0, $t1
1240 mulx 8*1+128($a_ptr), $t0, $t1
1244 mulx 8*2+128($a_ptr), $t0, $t1
1248 mulx 8*3+128($a_ptr), $t0, $t1
1250 mulx %r15, %rdx, %rax
1256 adc \$0, $acc1 # cf=0, of=0
1258 ################################# reduction
1259 mulx 8*0+128(%r14), $t0, $t1
1260 adcx $t0, $acc2 # guaranteed to be zero
1263 mulx 8*1+128(%r14), $t0, $t1
1267 mulx 8*2+128(%r14), $t0, $t1
1271 mulx 8*3+128(%r14), $t0, $t1
1272 mov 8*3($b_ptr), %rdx
1277 adc \$0, $acc1 # cf=0, of=0
1279 ################################# Multiply by b[3]
1280 mulx 8*0+128($a_ptr), $t0, $t1
1284 mulx 8*1+128($a_ptr), $t0, $t1
1288 mulx 8*2+128($a_ptr), $t0, $t1
1292 mulx 8*3+128($a_ptr), $t0, $t1
1294 mulx %r15, %rdx, %rax
1300 adc \$0, $acc2 # cf=0, of=0
1302 ################################# reduction
1303 mulx 8*0+128(%r14), $t0, $t1
1304 adcx $t0, $acc3 # guaranteed to be zero
1307 mulx 8*1+128(%r14), $t0, $t1
1311 mulx 8*2+128(%r14), $t0, $t1
1315 mulx 8*3+128(%r14), $t0, $t1
1325 #################################
1326 # Branch-less conditional subtraction of P
1328 sub 8*0(%r14), $acc4
1329 sbb 8*1(%r14), $acc5
1330 sbb 8*2(%r14), $acc0
1332 sbb 8*3(%r14), $acc1
1340 mov $acc4, 8*0($r_ptr)
1341 mov $acc5, 8*1($r_ptr)
1342 mov $acc0, 8*2($r_ptr)
1343 mov $acc1, 8*3($r_ptr)
1358 .cfi_adjust_cfa_offset -48
1359 .Lord_mulx_epilogue:
1362 .size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
1364 .type ecp_nistz256_ord_sqr_montx,\@function,3
1366 ecp_nistz256_ord_sqr_montx:
1368 .Lecp_nistz256_ord_sqr_montx:
1384 mov 8*0($a_ptr), %rdx
1385 mov 8*1($a_ptr), $acc6
1386 mov 8*2($a_ptr), $acc7
1387 mov 8*3($a_ptr), $acc0
1388 lea .Lord(%rip), $a_ptr
1393 mulx $acc6, $acc1, $acc2 # a[0]*a[1]
1394 mulx $acc7, $t0, $acc3 # a[0]*a[2]
1395 mov %rdx, %rax # offload a[0]
1396 movq $acc6, %xmm1 # offload a[1]
1397 mulx $acc0, $t1, $acc4 # a[0]*a[3]
1400 movq $acc7, %xmm2 # offload a[2]
1403 xor $acc5, $acc5 # $acc5=0,cf=0,of=0
1404 #################################
1405 mulx $acc7, $t0, $t1 # a[1]*a[2]
1409 mulx $acc0, $t0, $t1 # a[1]*a[3]
1414 #################################
1415 mulx $acc0, $t0, $acc6 # a[2]*a[3]
1417 movq $acc0, %xmm3 # offload a[3]
1418 xor $acc7, $acc7 # $acc7=0,cf=0,of=0
1419 adcx $acc1, $acc1 # acc1:6<<1
1422 adox $acc7, $acc6 # of=0
1424 ################################# a[i]*a[i]
1425 mulx %rdx, $acc0, $t1
1446 ################################# reduction
1448 mulx 8*4($a_ptr), %rdx, $t0
1450 xor %rax, %rax # cf=0, of=0
1451 mulx 8*0($a_ptr), $t0, $t1
1452 adcx $t0, $acc0 # guaranteed to be zero
1454 mulx 8*1($a_ptr), $t0, $t1
1457 mulx 8*2($a_ptr), $t0, $t1
1460 mulx 8*3($a_ptr), $t0, $t1
1462 adox $t1, $acc0 # of=0
1463 adcx %rax, $acc0 # cf=0
1465 #################################
1467 mulx 8*4($a_ptr), %rdx, $t0
1469 mulx 8*0($a_ptr), $t0, $t1
1470 adox $t0, $acc1 # guaranteed to be zero
1472 mulx 8*1($a_ptr), $t0, $t1
1475 mulx 8*2($a_ptr), $t0, $t1
1478 mulx 8*3($a_ptr), $t0, $t1
1480 adcx $t1, $acc1 # cf=0
1481 adox %rax, $acc1 # of=0
1483 #################################
1485 mulx 8*4($a_ptr), %rdx, $t0
1487 mulx 8*0($a_ptr), $t0, $t1
1488 adcx $t0, $acc2 # guaranteed to be zero
1490 mulx 8*1($a_ptr), $t0, $t1
1493 mulx 8*2($a_ptr), $t0, $t1
1496 mulx 8*3($a_ptr), $t0, $t1
1498 adox $t1, $acc2 # of=0
1499 adcx %rax, $acc2 # cf=0
1501 #################################
1503 mulx 8*4($a_ptr), %rdx, $t0
1505 mulx 8*0($a_ptr), $t0, $t1
1506 adox $t0, $acc3 # guaranteed to be zero
1508 mulx 8*1($a_ptr), $t0, $t1
1511 mulx 8*2($a_ptr), $t0, $t1
1514 mulx 8*3($a_ptr), $t0, $t1
1519 ################################# accumulate upper half
1520 add $acc0, $acc4 # add $acc4, $acc0
1528 ################################# compare to modulus
1529 sub 8*0($a_ptr), $acc4
1531 sbb 8*1($a_ptr), $acc1
1532 sbb 8*2($a_ptr), $acc2
1534 sbb 8*3($a_ptr), $acc3
1545 mov %rdx, 8*0($r_ptr)
1546 mov $acc6, 8*1($r_ptr)
1548 mov $acc7, 8*2($r_ptr)
1550 mov $acc0, 8*3($r_ptr)
1566 .cfi_adjust_cfa_offset -48
1567 .Lord_sqrx_epilogue:
1570 .size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
1574 ################################################################################
1575 # void ecp_nistz256_to_mont(
1578 .globl ecp_nistz256_to_mont
1579 .type ecp_nistz256_to_mont,\@function,2
1581 ecp_nistz256_to_mont:
1583 $code.=<<___ if ($addx);
1585 and OPENSSL_ia32cap_P+8(%rip), %ecx
1588 lea .LRR(%rip), $b_org
1590 .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
1592 ################################################################################
1593 # void ecp_nistz256_mul_mont(
1598 .globl ecp_nistz256_mul_mont
1599 .type ecp_nistz256_mul_mont,\@function,3
1601 ecp_nistz256_mul_mont:
1604 $code.=<<___ if ($addx);
1606 and OPENSSL_ia32cap_P+8(%rip), %ecx
1624 $code.=<<___ if ($addx);
1630 mov 8*0($b_org), %rax
1631 mov 8*0($a_ptr), $acc1
1632 mov 8*1($a_ptr), $acc2
1633 mov 8*2($a_ptr), $acc3
1634 mov 8*3($a_ptr), $acc4
1636 call __ecp_nistz256_mul_montq
1638 $code.=<<___ if ($addx);
1644 mov 8*0($b_org), %rdx
1645 mov 8*0($a_ptr), $acc1
1646 mov 8*1($a_ptr), $acc2
1647 mov 8*2($a_ptr), $acc3
1648 mov 8*3($a_ptr), $acc4
1649 lea -128($a_ptr), $a_ptr # control u-op density
1651 call __ecp_nistz256_mul_montx
1668 .cfi_adjust_cfa_offset -48
1672 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
1674 .type __ecp_nistz256_mul_montq,\@abi-omnipotent
1676 __ecp_nistz256_mul_montq:
1677 ########################################################################
1678 # Multiply a by b[0]
1681 mov .Lpoly+8*1(%rip),$poly1
1687 mov .Lpoly+8*3(%rip),$poly3
1706 ########################################################################
1707 # First reduction step
1708 # Basically now we want to multiply acc[0] by p256,
1709 # and add the result to the acc.
1710 # Due to the special form of p256 we do some optimizations
1712 # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
1713 # then we add acc[0] and get acc[0] x 2^96
1719 add $acc0, $acc1 # +=acc[0]<<96
1722 mov 8*1($b_ptr), %rax
1727 ########################################################################
1760 ########################################################################
1761 # Second reduction step
1769 mov 8*2($b_ptr), %rax
1774 ########################################################################
1807 ########################################################################
1808 # Third reduction step
1816 mov 8*3($b_ptr), %rax
1821 ########################################################################
1854 ########################################################################
1855 # Final reduction step
1868 ########################################################################
1869 # Branch-less conditional subtraction of P
1870 sub \$-1, $acc4 # .Lpoly[0]
1872 sbb $poly1, $acc5 # .Lpoly[1]
1873 sbb \$0, $acc0 # .Lpoly[2]
1875 sbb $poly3, $acc1 # .Lpoly[3]
1880 mov $acc4, 8*0($r_ptr)
1882 mov $acc5, 8*1($r_ptr)
1884 mov $acc0, 8*2($r_ptr)
1885 mov $acc1, 8*3($r_ptr)
1888 .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
1890 ################################################################################
1891 # void ecp_nistz256_sqr_mont(
1895 # we optimize the square according to S.Gueron and V.Krasnov,
1896 # "Speeding up Big-Number Squaring"
1897 .globl ecp_nistz256_sqr_mont
1898 .type ecp_nistz256_sqr_mont,\@function,2
1900 ecp_nistz256_sqr_mont:
1903 $code.=<<___ if ($addx);
1905 and OPENSSL_ia32cap_P+8(%rip), %ecx
1922 $code.=<<___ if ($addx);
1927 mov 8*0($a_ptr), %rax
1928 mov 8*1($a_ptr), $acc6
1929 mov 8*2($a_ptr), $acc7
1930 mov 8*3($a_ptr), $acc0
1932 call __ecp_nistz256_sqr_montq
1934 $code.=<<___ if ($addx);
1939 mov 8*0($a_ptr), %rdx
1940 mov 8*1($a_ptr), $acc6
1941 mov 8*2($a_ptr), $acc7
1942 mov 8*3($a_ptr), $acc0
1943 lea -128($a_ptr), $a_ptr # control u-op density
1945 call __ecp_nistz256_sqr_montx
1962 .cfi_adjust_cfa_offset -48
1966 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
1968 .type __ecp_nistz256_sqr_montq,\@abi-omnipotent
1970 __ecp_nistz256_sqr_montq:
1972 mulq $acc6 # a[1]*a[0]
1977 mulq $acc5 # a[0]*a[2]
1983 mulq $acc5 # a[0]*a[3]
1989 #################################
1990 mulq $acc6 # a[1]*a[2]
1996 mulq $acc6 # a[1]*a[3]
2004 #################################
2005 mulq $acc7 # a[2]*a[3]
2008 mov 8*0($a_ptr), %rax
2012 add $acc1, $acc1 # acc1:6<<1
2022 mov 8*1($a_ptr), %rax
2028 mov 8*2($a_ptr), %rax
2035 mov 8*3($a_ptr), %rax
2045 mov .Lpoly+8*1(%rip), $a_ptr
2046 mov .Lpoly+8*3(%rip), $t1
2048 ##########################################
2055 add $acc0, $acc1 # +=acc[0]<<96
2061 ##########################################
2074 ##########################################
2087 ###########################################
2100 ############################################
2101 # Add the rest of the acc
2110 sub \$-1, $acc4 # .Lpoly[0]
2112 sbb $a_ptr, $acc5 # .Lpoly[1]
2113 sbb \$0, $acc6 # .Lpoly[2]
2115 sbb $t1, $acc7 # .Lpoly[3]
2120 mov $acc4, 8*0($r_ptr)
2122 mov $acc5, 8*1($r_ptr)
2124 mov $acc6, 8*2($r_ptr)
2125 mov $acc7, 8*3($r_ptr)
2128 .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
2133 .type __ecp_nistz256_mul_montx,\@abi-omnipotent
2135 __ecp_nistz256_mul_montx:
2136 ########################################################################
2138 mulx $acc1, $acc0, $acc1
2139 mulx $acc2, $t0, $acc2
2141 xor $acc5, $acc5 # cf=0
2142 mulx $acc3, $t1, $acc3
2143 mov .Lpoly+8*3(%rip), $poly3
2145 mulx $acc4, $t0, $acc4
2148 shlx $poly1,$acc0,$t1
2150 shrx $poly1,$acc0,$t0
2153 ########################################################################
2154 # First reduction step
2158 mulx $poly3, $t0, $t1
2159 mov 8*1($b_ptr), %rdx
2163 xor $acc0, $acc0 # $acc0=0,cf=0,of=0
2165 ########################################################################
2167 mulx 8*0+128($a_ptr), $t0, $t1
2171 mulx 8*1+128($a_ptr), $t0, $t1
2175 mulx 8*2+128($a_ptr), $t0, $t1
2179 mulx 8*3+128($a_ptr), $t0, $t1
2182 shlx $poly1, $acc1, $t0
2184 shrx $poly1, $acc1, $t1
2190 ########################################################################
2191 # Second reduction step
2195 mulx $poly3, $t0, $t1
2196 mov 8*2($b_ptr), %rdx
2200 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
2202 ########################################################################
2204 mulx 8*0+128($a_ptr), $t0, $t1
2208 mulx 8*1+128($a_ptr), $t0, $t1
2212 mulx 8*2+128($a_ptr), $t0, $t1
2216 mulx 8*3+128($a_ptr), $t0, $t1
2219 shlx $poly1, $acc2, $t0
2221 shrx $poly1, $acc2, $t1
2227 ########################################################################
2228 # Third reduction step
2232 mulx $poly3, $t0, $t1
2233 mov 8*3($b_ptr), %rdx
2237 xor $acc2, $acc2 # $acc2=0,cf=0,of=0
2239 ########################################################################
2241 mulx 8*0+128($a_ptr), $t0, $t1
2245 mulx 8*1+128($a_ptr), $t0, $t1
2249 mulx 8*2+128($a_ptr), $t0, $t1
2253 mulx 8*3+128($a_ptr), $t0, $t1
2256 shlx $poly1, $acc3, $t0
2258 shrx $poly1, $acc3, $t1
2264 ########################################################################
2265 # Fourth reduction step
2269 mulx $poly3, $t0, $t1
2271 mov .Lpoly+8*1(%rip), $poly1
2277 ########################################################################
2278 # Branch-less conditional subtraction of P
2281 sbb \$-1, $acc4 # .Lpoly[0]
2282 sbb $poly1, $acc5 # .Lpoly[1]
2283 sbb \$0, $acc0 # .Lpoly[2]
2285 sbb $poly3, $acc1 # .Lpoly[3]
2290 mov $acc4, 8*0($r_ptr)
2292 mov $acc5, 8*1($r_ptr)
2294 mov $acc0, 8*2($r_ptr)
2295 mov $acc1, 8*3($r_ptr)
2298 .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
2300 .type __ecp_nistz256_sqr_montx,\@abi-omnipotent
2302 __ecp_nistz256_sqr_montx:
2303 mulx $acc6, $acc1, $acc2 # a[0]*a[1]
2304 mulx $acc7, $t0, $acc3 # a[0]*a[2]
2307 mulx $acc0, $t1, $acc4 # a[0]*a[3]
2311 xor $acc5, $acc5 # $acc5=0,cf=0,of=0
2313 #################################
2314 mulx $acc7, $t0, $t1 # a[1]*a[2]
2318 mulx $acc0, $t0, $t1 # a[1]*a[3]
2324 #################################
2325 mulx $acc0, $t0, $acc6 # a[2]*a[3]
2326 mov 8*0+128($a_ptr), %rdx
2327 xor $acc7, $acc7 # $acc7=0,cf=0,of=0
2328 adcx $acc1, $acc1 # acc1:6<<1
2331 adox $acc7, $acc6 # of=0
2333 mulx %rdx, $acc0, $t1
2334 mov 8*1+128($a_ptr), %rdx
2339 mov 8*2+128($a_ptr), %rdx
2345 mov 8*3+128($a_ptr), %rdx
2353 mov .Lpoly+8*3(%rip), %rdx
2355 shlx $a_ptr, $acc0, $t0
2357 shrx $a_ptr, $acc0, $t4
2364 mulx $acc0, $t0, $acc0
2366 shlx $a_ptr, $acc1, $t0
2368 shrx $a_ptr, $acc1, $t4
2374 mulx $acc1, $t0, $acc1
2376 shlx $a_ptr, $acc2, $t0
2378 shrx $a_ptr, $acc2, $t4
2384 mulx $acc2, $t0, $acc2
2386 shlx $a_ptr, $acc3, $t0
2388 shrx $a_ptr, $acc3, $t4
2394 mulx $acc3, $t0, $acc3
2399 add $acc0, $acc4 # accumulate upper half
2400 mov .Lpoly+8*1(%rip), $a_ptr
2408 sub \$-1, $acc4 # .Lpoly[0]
2410 sbb $a_ptr, $acc5 # .Lpoly[1]
2411 sbb \$0, $acc6 # .Lpoly[2]
2413 sbb $t1, $acc7 # .Lpoly[3]
2418 mov $acc4, 8*0($r_ptr)
2420 mov $acc5, 8*1($r_ptr)
2422 mov $acc6, 8*2($r_ptr)
2423 mov $acc7, 8*3($r_ptr)
2426 .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
2431 my ($r_ptr,$in_ptr)=("%rdi","%rsi");
2432 my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
2433 my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
2436 ################################################################################
2437 # void ecp_nistz256_from_mont(
2440 # This one performs Montgomery multiplication by 1, so we only need the reduction
2442 .globl ecp_nistz256_from_mont
2443 .type ecp_nistz256_from_mont,\@function,2
2445 ecp_nistz256_from_mont:
2453 mov 8*0($in_ptr), %rax
2454 mov .Lpoly+8*3(%rip), $t2
2455 mov 8*1($in_ptr), $acc1
2456 mov 8*2($in_ptr), $acc2
2457 mov 8*3($in_ptr), $acc3
2459 mov .Lpoly+8*1(%rip), $t1
2461 #########################################
2473 #########################################
2486 ##########################################
2499 ###########################################
2513 ###########################################
2514 # Branch-less conditional subtraction
2524 cmovnz $in_ptr, $acc1
2525 mov $acc0, 8*0($r_ptr)
2527 mov $acc1, 8*1($r_ptr)
2529 mov $acc2, 8*2($r_ptr)
2530 mov $acc3, 8*3($r_ptr)
2537 .cfi_adjust_cfa_offset -16
2541 .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
2545 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2546 my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
2547 my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
2548 my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
2551 ################################################################################
2552 # void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
2553 .globl ecp_nistz256_scatter_w5
2554 .type ecp_nistz256_scatter_w5,\@abi-omnipotent
2556 ecp_nistz256_scatter_w5:
2557 lea -3($index,$index,2), $index
2558 movdqa 0x00($in_t), %xmm0
2560 movdqa 0x10($in_t), %xmm1
2561 movdqa 0x20($in_t), %xmm2
2562 movdqa 0x30($in_t), %xmm3
2563 movdqa 0x40($in_t), %xmm4
2564 movdqa 0x50($in_t), %xmm5
2565 movdqa %xmm0, 0x00($val,$index)
2566 movdqa %xmm1, 0x10($val,$index)
2567 movdqa %xmm2, 0x20($val,$index)
2568 movdqa %xmm3, 0x30($val,$index)
2569 movdqa %xmm4, 0x40($val,$index)
2570 movdqa %xmm5, 0x50($val,$index)
2573 .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
2575 ################################################################################
2576 # void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
2577 .globl ecp_nistz256_gather_w5
2578 .type ecp_nistz256_gather_w5,\@abi-omnipotent
2580 ecp_nistz256_gather_w5:
2582 $code.=<<___ if ($avx>1);
2583 mov OPENSSL_ia32cap_P+8(%rip), %eax
2585 jnz .Lavx2_gather_w5
2587 $code.=<<___ if ($win64);
2588 lea -0x88(%rsp), %rax
2589 .LSEH_begin_ecp_nistz256_gather_w5:
2590 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
2591 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
2592 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
2593 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
2594 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
2595 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
2596 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
2597 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
2598 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
2599 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
2600 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
2603 movdqa .LOne(%rip), $ONE
2614 pshufd \$0, $INDEX, $INDEX
2617 .Lselect_loop_sse_w5:
2621 pcmpeqd $INDEX, $TMP0
2623 movdqa 16*0($in_t), $T0a
2624 movdqa 16*1($in_t), $T0b
2625 movdqa 16*2($in_t), $T0c
2626 movdqa 16*3($in_t), $T0d
2627 movdqa 16*4($in_t), $T0e
2628 movdqa 16*5($in_t), $T0f
2629 lea 16*6($in_t), $in_t
2645 jnz .Lselect_loop_sse_w5
2647 movdqu $Ra, 16*0($val)
2648 movdqu $Rb, 16*1($val)
2649 movdqu $Rc, 16*2($val)
2650 movdqu $Rd, 16*3($val)
2651 movdqu $Re, 16*4($val)
2652 movdqu $Rf, 16*5($val)
2654 $code.=<<___ if ($win64);
2655 movaps (%rsp), %xmm6
2656 movaps 0x10(%rsp), %xmm7
2657 movaps 0x20(%rsp), %xmm8
2658 movaps 0x30(%rsp), %xmm9
2659 movaps 0x40(%rsp), %xmm10
2660 movaps 0x50(%rsp), %xmm11
2661 movaps 0x60(%rsp), %xmm12
2662 movaps 0x70(%rsp), %xmm13
2663 movaps 0x80(%rsp), %xmm14
2664 movaps 0x90(%rsp), %xmm15
2665 lea 0xa8(%rsp), %rsp
2669 .LSEH_end_ecp_nistz256_gather_w5:
2670 .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
2672 ################################################################################
2673 # void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
2674 .globl ecp_nistz256_scatter_w7
2675 .type ecp_nistz256_scatter_w7,\@abi-omnipotent
2677 ecp_nistz256_scatter_w7:
2678 movdqu 0x00($in_t), %xmm0
2680 movdqu 0x10($in_t), %xmm1
2681 movdqu 0x20($in_t), %xmm2
2682 movdqu 0x30($in_t), %xmm3
2683 movdqa %xmm0, 0x00($val,$index)
2684 movdqa %xmm1, 0x10($val,$index)
2685 movdqa %xmm2, 0x20($val,$index)
2686 movdqa %xmm3, 0x30($val,$index)
2689 .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
2691 ################################################################################
2692 # void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
# Constant-time table lookup for the 7-bit window: the loop touches every
# table entry and uses pcmpeqd-generated masks to accumulate only the one
# whose running counter equals $index, so the memory access pattern does
# not depend on the secret index.  (Several accumulation lines are elided
# in this excerpt.)
2693 .globl ecp_nistz256_gather_w7
2694 .type ecp_nistz256_gather_w7,\@abi-omnipotent
2696 ecp_nistz256_gather_w7:
# When AVX2 is available (the capability-bit test is elided here), branch
# to the AVX2 implementation instead.
2698 $code.=<<___ if ($avx>1);
2699 mov OPENSSL_ia32cap_P+8(%rip), %eax
2701 jnz .Lavx2_gather_w7
# Win64 ABI: %xmm6-%xmm15 are callee-saved; spill them below the stack
# pointer.  The .byte sequences keep the prologue encoding byte-exact for
# the SEH unwind data.
2703 $code.=<<___ if ($win64);
2704 lea -0x88(%rsp), %rax
2705 .LSEH_begin_ecp_nistz256_gather_w7:
2706 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
2707 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
2708 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
2709 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
2710 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
2711 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
2712 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
2713 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
2714 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
2715 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
2716 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
# Broadcast the requested index into all dword lanes of $INDEX; $M0 is
# initialized from .LOne and (in elided code) counts table entries.
2719 movdqa .LOne(%rip), $M0
2728 pshufd \$0, $INDEX, $INDEX
2731 .Lselect_loop_sse_w7:
# Load a 64-byte table entry; pcmpeqd produces an all-ones mask only when
# the counter matches $INDEX (masking/accumulation lines elided).
2734 movdqa 16*0($in_t), $T0a
2735 movdqa 16*1($in_t), $T0b
2736 pcmpeqd $INDEX, $TMP0
2737 movdqa 16*2($in_t), $T0c
2738 movdqa 16*3($in_t), $T0d
2739 lea 16*4($in_t), $in_t
2748 prefetcht0 255($in_t)
2752 jnz .Lselect_loop_sse_w7
# Write out the 64-byte entry that survived the masking.
2754 movdqu $Ra, 16*0($val)
2755 movdqu $Rb, 16*1($val)
2756 movdqu $Rc, 16*2($val)
2757 movdqu $Rd, 16*3($val)
# Win64 epilogue: restore callee-saved xmm registers and the stack pointer.
2759 $code.=<<___ if ($win64);
2760 movaps (%rsp), %xmm6
2761 movaps 0x10(%rsp), %xmm7
2762 movaps 0x20(%rsp), %xmm8
2763 movaps 0x30(%rsp), %xmm9
2764 movaps 0x40(%rsp), %xmm10
2765 movaps 0x50(%rsp), %xmm11
2766 movaps 0x60(%rsp), %xmm12
2767 movaps 0x70(%rsp), %xmm13
2768 movaps 0x80(%rsp), %xmm14
2769 movaps 0x90(%rsp), %xmm15
2770 lea 0xa8(%rsp), %rsp
2774 .LSEH_end_ecp_nistz256_gather_w7:
2775 .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
# Register assignments for the AVX2 5-bit-window gather: arguments follow
# the platform calling convention, working set lives in %ymm0-%ymm14.
2779 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2780 my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
2781 my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
2782 my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
2785 ################################################################################
2786 # void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
# Constant-time gather: processes two 96-byte table entries per iteration
# ($M0/$M1 counters advance by .LTwo), accumulating via compare-masks so
# the access pattern is independent of the secret index.
2787 .type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
2789 ecp_nistz256_avx2_gather_w5:
# Win64 ABI: spill callee-saved %xmm6-%xmm15; .byte keeps the SEH prologue
# encoding exact.  NOTE(review): original comment on the %xmm8 store said
# "8(%rax)", but the encoded disp8 (0x40,0x00) is 0 — comment fixed below.
2793 $code.=<<___ if ($win64);
2794 lea -0x88(%rsp), %rax
2796 .LSEH_begin_ecp_nistz256_avx2_gather_w5:
2797 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp
2798 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax)
2799 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax)
2800 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 0(%rax)
2801 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax)
2802 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax)
2803 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax)
2804 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax)
2805 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax)
2806 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax)
2807 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax)
# Counters: $M0 starts at 1, $M1 at 2; both advance by $TWO per iteration.
2810 vmovdqa .LTwo(%rip), $TWO
2816 vmovdqa .LOne(%rip), $M0
2817 vmovdqa .LTwo(%rip), $M1
# Broadcast the requested index across all lanes (permute control elided).
2820 vpermd $INDEX, $Ra, $INDEX
2823 .Lselect_loop_avx2_w5:
# Two 96-byte entries per iteration; mask each against its own counter.
2825 vmovdqa 32*0($in_t), $T0a
2826 vmovdqa 32*1($in_t), $T0b
2827 vmovdqa 32*2($in_t), $T0c
2829 vmovdqa 32*3($in_t), $T1a
2830 vmovdqa 32*4($in_t), $T1b
2831 vmovdqa 32*5($in_t), $T1c
2833 vpcmpeqd $INDEX, $M0, $TMP0
2834 vpcmpeqd $INDEX, $M1, $TMP1
2836 vpaddd $TWO, $M0, $M0
2837 vpaddd $TWO, $M1, $M1
2838 lea 32*6($in_t), $in_t
# Keep only the entry whose counter matched, then fold into accumulators.
2840 vpand $TMP0, $T0a, $T0a
2841 vpand $TMP0, $T0b, $T0b
2842 vpand $TMP0, $T0c, $T0c
2843 vpand $TMP1, $T1a, $T1a
2844 vpand $TMP1, $T1b, $T1b
2845 vpand $TMP1, $T1c, $T1c
2847 vpxor $T0a, $Ra, $Ra
2848 vpxor $T0b, $Rb, $Rb
2849 vpxor $T0c, $Rc, $Rc
2850 vpxor $T1a, $Ra, $Ra
2851 vpxor $T1b, $Rb, $Rb
2852 vpxor $T1c, $Rc, $Rc
2855 jnz .Lselect_loop_avx2_w5
# Store the selected 96-byte (x, y, z) entry.
2857 vmovdqu $Ra, 32*0($val)
2858 vmovdqu $Rb, 32*1($val)
2859 vmovdqu $Rc, 32*2($val)
# Win64 epilogue: restore callee-saved xmm registers.
2862 $code.=<<___ if ($win64);
2863 movaps (%rsp), %xmm6
2864 movaps 0x10(%rsp), %xmm7
2865 movaps 0x20(%rsp), %xmm8
2866 movaps 0x30(%rsp), %xmm9
2867 movaps 0x40(%rsp), %xmm10
2868 movaps 0x50(%rsp), %xmm11
2869 movaps 0x60(%rsp), %xmm12
2870 movaps 0x70(%rsp), %xmm13
2871 movaps 0x80(%rsp), %xmm14
2872 movaps 0x90(%rsp), %xmm15
2877 .LSEH_end_ecp_nistz256_avx2_gather_w5:
2878 .size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
# Register assignments for the AVX2 7-bit-window gather: three counter/mask
# groups, advancing by .LThree, process three 64-byte entries per iteration.
2882 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2883 my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
2884 my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
2885 my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
2886 my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
2890 ################################################################################
2891 # void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
# Constant-time gather over the w=7 precomputed table; same masking scheme
# as the w5 variant but with 64-byte (x, y) entries, three at a time, plus
# one trailing entry handled after the loop.
2892 .globl ecp_nistz256_avx2_gather_w7
2893 .type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
2895 ecp_nistz256_avx2_gather_w7:
# Win64 ABI: spill callee-saved %xmm6-%xmm15 (byte-exact SEH prologue).
# NOTE(review): original comment on the %xmm8 store said "8(%rax)", but the
# encoded disp8 (0x40,0x00) is 0 — comment fixed below.
2899 $code.=<<___ if ($win64);
2901 lea -0x88(%rsp), %rax
2902 .LSEH_begin_ecp_nistz256_avx2_gather_w7:
2903 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp
2904 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax)
2905 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax)
2906 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 0(%rax)
2907 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax)
2908 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax)
2909 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax)
2910 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax)
2911 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax)
2912 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax)
2913 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax)
# Counters start at 1, 2, 3 and advance by $THREE each iteration.
2916 vmovdqa .LThree(%rip), $THREE
2921 vmovdqa .LOne(%rip), $M0
2922 vmovdqa .LTwo(%rip), $M1
2923 vmovdqa .LThree(%rip), $M2
# Broadcast the requested index across all lanes.
2926 vpermd $INDEX, $Ra, $INDEX
2927 # Skip index = 0, because it is implicitly the point at infinity
2930 .Lselect_loop_avx2_w7:
# Three 64-byte entries per iteration, each masked by its own comparison.
2932 vmovdqa 32*0($in_t), $T0a
2933 vmovdqa 32*1($in_t), $T0b
2935 vmovdqa 32*2($in_t), $T1a
2936 vmovdqa 32*3($in_t), $T1b
2938 vmovdqa 32*4($in_t), $T2a
2939 vmovdqa 32*5($in_t), $T2b
2941 vpcmpeqd $INDEX, $M0, $TMP0
2942 vpcmpeqd $INDEX, $M1, $TMP1
2943 vpcmpeqd $INDEX, $M2, $TMP2
2945 vpaddd $THREE, $M0, $M0
2946 vpaddd $THREE, $M1, $M1
2947 vpaddd $THREE, $M2, $M2
2948 lea 32*6($in_t), $in_t
2950 vpand $TMP0, $T0a, $T0a
2951 vpand $TMP0, $T0b, $T0b
2952 vpand $TMP1, $T1a, $T1a
2953 vpand $TMP1, $T1b, $T1b
2954 vpand $TMP2, $T2a, $T2a
2955 vpand $TMP2, $T2b, $T2b
2957 vpxor $T0a, $Ra, $Ra
2958 vpxor $T0b, $Rb, $Rb
2959 vpxor $T1a, $Ra, $Ra
2960 vpxor $T1b, $Rb, $Rb
2961 vpxor $T2a, $Ra, $Ra
2962 vpxor $T2b, $Rb, $Rb
2965 jnz .Lselect_loop_avx2_w7
# 64 entries are not a multiple of 3: handle the one leftover table entry.
2968 vmovdqa 32*0($in_t), $T0a
2969 vmovdqa 32*1($in_t), $T0b
2971 vpcmpeqd $INDEX, $M0, $TMP0
2973 vpand $TMP0, $T0a, $T0a
2974 vpand $TMP0, $T0b, $T0b
2976 vpxor $T0a, $Ra, $Ra
2977 vpxor $T0b, $Rb, $Rb
# Store the selected 64-byte (x, y) entry.
2979 vmovdqu $Ra, 32*0($val)
2980 vmovdqu $Rb, 32*1($val)
# Win64 epilogue: restore callee-saved xmm registers.
2983 $code.=<<___ if ($win64);
2984 movaps (%rsp), %xmm6
2985 movaps 0x10(%rsp), %xmm7
2986 movaps 0x20(%rsp), %xmm8
2987 movaps 0x30(%rsp), %xmm9
2988 movaps 0x40(%rsp), %xmm10
2989 movaps 0x50(%rsp), %xmm11
2990 movaps 0x60(%rsp), %xmm12
2991 movaps 0x70(%rsp), %xmm13
2992 movaps 0x80(%rsp), %xmm14
2993 movaps 0x90(%rsp), %xmm15
2998 .LSEH_end_ecp_nistz256_avx2_gather_w7:
2999 .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
# Fallback stub emitted when the AVX2 code path is not assembled: the symbol
# must still exist for linking, but it must never be reached at runtime, so
# its single instruction is ud2 (trap if called).
3003 .globl ecp_nistz256_avx2_gather_w7
3004 .type ecp_nistz256_avx2_gather_w7,\@function,3
3006 ecp_nistz256_avx2_gather_w7:
3007 .byte 0x0f,0x0b # ud2
3009 .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
3013 ########################################################################
3014 # This block implements higher level point_double, point_add and
3015 # point_add_affine. The key to performance in this case is to allow
3016 # out-of-order execution logic to overlap computations from next step
3017 # with tail processing from current step. By using tailored calling
3018 # sequence we minimize inter-step overhead to give processor better
3019 # shot at overlapping operations...
3021 # You will notice that input data is copied to stack. Trouble is that
3022 # there are no registers to spare for holding original pointers and
3023 # reloading them, pointers, would create undesired dependencies on
3024 # effective addresses calculation paths. In other words it's done this
3025 # way to favour out-of-order execution logic.
3026 # <appro@openssl.org>
# Register name-space for the high-level point routines: pointer registers,
# eight accumulators in %r8-%r15, and scratch registers.
3028 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
3029 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
3030 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
3031 my ($poly1,$poly3)=($acc6,$acc7);
# Emit code that loads the operands for a Montgomery multiplication.
# $bias is -128 when $src0 is not %rax — presumably to offset $a_ptr for the
# mulx-based code path's addressing; TODO confirm against the elided body.
3033 sub load_for_mul () {
3034 my ($a,$b,$src0) = @_;
3035 my $bias = $src0 eq "%rax" ? 0 : -128;
3041 lea $bias+$a, $a_ptr
# Emit code that loads the operand for a Montgomery squaring; same $bias
# convention as load_for_mul above (body partially elided in this excerpt).
3046 sub load_for_sqr () {
3048 my $bias = $src0 eq "%rax" ? 0 : -128;
3052 lea $bias+$a, $a_ptr
3058 ########################################################################
3059 # operate in 4-5-0-1 "name space" that matches multiplication output
3061 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
# Modular addition: adds the 256-bit value at $b_ptr to $a0..$a3 and writes
# the result to $r_ptr.  The conditional subtraction of the P-256 modulus
# (reduction) is in lines elided from this excerpt.
3064 .type __ecp_nistz256_add_toq,\@abi-omnipotent
3066 __ecp_nistz256_add_toq:
3068 add 8*0($b_ptr), $a0
3069 adc 8*1($b_ptr), $a1
3071 adc 8*2($b_ptr), $a2
3072 adc 8*3($b_ptr), $a3
3086 mov $a0, 8*0($r_ptr)
3088 mov $a1, 8*1($r_ptr)
3090 mov $a2, 8*2($r_ptr)
3091 mov $a3, 8*3($r_ptr)
3094 .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
# Modular subtraction: subtracts the 256-bit value at $b_ptr from $a0..$a3
# and writes the result to $r_ptr.  The conditional add-back of the modulus
# on borrow is in lines elided from this excerpt.
3096 .type __ecp_nistz256_sub_fromq,\@abi-omnipotent
3098 __ecp_nistz256_sub_fromq:
3099 sub 8*0($b_ptr), $a0
3100 sbb 8*1($b_ptr), $a1
3102 sbb 8*2($b_ptr), $a2
3103 sbb 8*3($b_ptr), $a3
3117 mov $a0, 8*0($r_ptr)
3119 mov $a1, 8*1($r_ptr)
3121 mov $a2, 8*2($r_ptr)
3122 mov $a3, 8*3($r_ptr)
3125 .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
# Modular subtraction variant used by the point routines; its body (elided
# in this excerpt) leaves the result in registers rather than storing it —
# callers store explicitly when needed (see the "save the result" sites).
3127 .type __ecp_nistz256_subq,\@abi-omnipotent
3129 __ecp_nistz256_subq:
3152 .size __ecp_nistz256_subq,.-__ecp_nistz256_subq
# Modular doubling: computes 2*a mod p and stores to $r_ptr (the carry
# propagation and reduction lines are elided in this excerpt).
3154 .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
3156 __ecp_nistz256_mul_by_2q:
3158 add $a0, $a0 # a0:a3+a0:a3
3176 mov $a0, 8*0($r_ptr)
3178 mov $a1, 8*1($r_ptr)
3180 mov $a2, 8*2($r_ptr)
3181 mov $a3, 8*3($r_ptr)
3184 .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
# Body of the point-doubling generator (the enclosing sub's header is elided
# in this excerpt; $x/$sfx/$bias select the plain vs. ADX/mulx variants).
# Stack frame: five 32-byte temporaries S, M, Zsqr, in_x, tmp0.
3189 my ($src0,$sfx,$bias);
3190 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
3198 .globl ecp_nistz256_point_double
3199 .type ecp_nistz256_point_double,\@function,2
3201 ecp_nistz256_point_double:
# Runtime dispatch to the ADX variant when available (test elided).
3204 $code.=<<___ if ($addx);
3206 and OPENSSL_ia32cap_P+8(%rip), %ecx
3216 .type ecp_nistz256_point_doublex,\@function,2
3218 ecp_nistz256_point_doublex:
3237 .cfi_adjust_cfa_offset 32*5+8
3238 .Lpoint_double${x}_body:
# Shortcut entry used by point_add when it detects P == Q.
3240 .Lpoint_double_shortcut$x:
3241 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
3242 mov $a_ptr, $b_ptr # backup copy
3243 movdqu 0x10($a_ptr), %xmm1
3244 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order
3245 mov 0x20+8*1($a_ptr), $acc5
3246 mov 0x20+8*2($a_ptr), $acc0
3247 mov 0x20+8*3($a_ptr), $acc1
3248 mov .Lpoly+8*1(%rip), $poly1
3249 mov .Lpoly+8*3(%rip), $poly3
3250 movdqa %xmm0, $in_x(%rsp)
3251 movdqa %xmm1, $in_x+0x10(%rsp)
3252 lea 0x20($r_ptr), $acc2
3253 lea 0x40($r_ptr), $acc3
3258 lea $S(%rsp), $r_ptr
3259 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
3261 mov 0x40+8*0($a_ptr), $src0
3262 mov 0x40+8*1($a_ptr), $acc6
3263 mov 0x40+8*2($a_ptr), $acc7
3264 mov 0x40+8*3($a_ptr), $acc0
3265 lea 0x40-$bias($a_ptr), $a_ptr
3266 lea $Zsqr(%rsp), $r_ptr
3267 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
3269 `&load_for_sqr("$S(%rsp)", "$src0")`
3270 lea $S(%rsp), $r_ptr
3271 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
3273 mov 0x20($b_ptr), $src0 # $b_ptr is still valid
3274 mov 0x40+8*0($b_ptr), $acc1
3275 mov 0x40+8*1($b_ptr), $acc2
3276 mov 0x40+8*2($b_ptr), $acc3
3277 mov 0x40+8*3($b_ptr), $acc4
3278 lea 0x40-$bias($b_ptr), $a_ptr
3279 lea 0x20($b_ptr), $b_ptr
3281 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
3282 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
3284 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
3285 mov $in_x+8*1(%rsp), $acc5
3286 lea $Zsqr(%rsp), $b_ptr
3287 mov $in_x+8*2(%rsp), $acc0
3288 mov $in_x+8*3(%rsp), $acc1
3289 lea $M(%rsp), $r_ptr
3290 call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
3292 mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
3293 mov $in_x+8*1(%rsp), $acc5
3294 lea $Zsqr(%rsp), $b_ptr
3295 mov $in_x+8*2(%rsp), $acc0
3296 mov $in_x+8*3(%rsp), $acc1
3297 lea $Zsqr(%rsp), $r_ptr
3298 call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
3300 `&load_for_sqr("$S(%rsp)", "$src0")`
3302 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
# Inlined division by 2 of res_y; temporarily rebinds the register
# name-space to the 4-5-6-7 squaring-output layout (some lines elided).
3305 ######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
3306 # operate in 4-5-6-7 "name space" that matches squaring output
3308 my ($poly1,$poly3)=($a_ptr,$t1);
3309 my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
3322 xor $a_ptr, $a_ptr # borrow $a_ptr
3331 mov $a1, $t0 # a0:a3>>1
3342 mov $a0, 8*0($r_ptr)
3344 mov $a1, 8*1($r_ptr)
3348 mov $a2, 8*2($r_ptr)
3349 mov $a3, 8*3($r_ptr)
3353 `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
3354 lea $M(%rsp), $r_ptr
3355 call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
3357 lea $tmp0(%rsp), $r_ptr
3358 call __ecp_nistz256_mul_by_2$x
3360 lea $M(%rsp), $b_ptr
3361 lea $M(%rsp), $r_ptr
3362 call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
3364 `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
3365 lea $S(%rsp), $r_ptr
3366 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
3368 lea $tmp0(%rsp), $r_ptr
3369 call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
3371 `&load_for_sqr("$M(%rsp)", "$src0")`
3373 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
3375 lea $tmp0(%rsp), $b_ptr
3376 mov $acc6, $acc0 # harmonize sqr output and sub input
3380 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
3382 mov $S+8*0(%rsp), $t0
3383 mov $S+8*1(%rsp), $t1
3384 mov $S+8*2(%rsp), $t2
3385 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
3386 lea $S(%rsp), $r_ptr
3387 call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
3390 lea $M(%rsp), $b_ptr
3391 mov $acc4, $acc6 # harmonize sub output and mul input
3393 mov $acc4, $S+8*0(%rsp) # have to save:-(
3395 mov $acc5, $S+8*1(%rsp)
3397 mov $acc0, $S+8*2(%rsp)
3398 lea $S-$bias(%rsp), $a_ptr
3400 mov $acc1, $S+8*3(%rsp)
3402 lea $S(%rsp), $r_ptr
3403 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
3407 call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
# Epilogue: pop the 32*5 frame and saved registers (restores elided).
3409 lea 32*5+56(%rsp), %rsi
3424 .cfi_def_cfa_register %rsp
3425 .Lpoint_double${x}_epilogue:
3428 .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
# Body of the general point-addition generator (the enclosing sub's header
# is elided).  Stack frame: eighteen 32-byte temporaries; Z1sqr/Z2sqr alias
# Hsqr/Rsqr since their live ranges do not overlap.
3435 my ($src0,$sfx,$bias);
3436 my ($H,$Hsqr,$R,$Rsqr,$Hcub,
3438 $res_x,$res_y,$res_z,
3439 $in1_x,$in1_y,$in1_z,
3440 $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
3441 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
3449 .globl ecp_nistz256_point_add
3450 .type ecp_nistz256_point_add,\@function,3
3452 ecp_nistz256_point_add:
# Runtime dispatch to the ADX variant when available (test elided).
3455 $code.=<<___ if ($addx);
3457 and OPENSSL_ia32cap_P+8(%rip), %ecx
3467 .type ecp_nistz256_point_addx,\@function,3
3469 ecp_nistz256_point_addx:
3488 .cfi_adjust_cfa_offset 32*18+8
3489 .Lpoint_add${x}_body:
# Copy both input points to the stack (no registers to spare for pointers).
3491 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
3492 movdqu 0x10($a_ptr), %xmm1
3493 movdqu 0x20($a_ptr), %xmm2
3494 movdqu 0x30($a_ptr), %xmm3
3495 movdqu 0x40($a_ptr), %xmm4
3496 movdqu 0x50($a_ptr), %xmm5
3497 mov $a_ptr, $b_ptr # reassign
3498 mov $b_org, $a_ptr # reassign
3499 movdqa %xmm0, $in1_x(%rsp)
3500 movdqa %xmm1, $in1_x+0x10(%rsp)
3501 movdqa %xmm2, $in1_y(%rsp)
3502 movdqa %xmm3, $in1_y+0x10(%rsp)
3503 movdqa %xmm4, $in1_z(%rsp)
3504 movdqa %xmm5, $in1_z+0x10(%rsp)
3507 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
3508 pshufd \$0xb1, %xmm5, %xmm3
3509 movdqu 0x10($a_ptr), %xmm1
3510 movdqu 0x20($a_ptr), %xmm2
3512 movdqu 0x30($a_ptr), %xmm3
3513 mov 0x40+8*0($a_ptr), $src0 # load original in2_z
3514 mov 0x40+8*1($a_ptr), $acc6
3515 mov 0x40+8*2($a_ptr), $acc7
3516 mov 0x40+8*3($a_ptr), $acc0
3517 movdqa %xmm0, $in2_x(%rsp)
3518 pshufd \$0x1e, %xmm5, %xmm4
3519 movdqa %xmm1, $in2_x+0x10(%rsp)
3520 movdqu 0x40($a_ptr),%xmm0 # in2_z again
3521 movdqu 0x50($a_ptr),%xmm1
3522 movdqa %xmm2, $in2_y(%rsp)
3523 movdqa %xmm3, $in2_y+0x10(%rsp)
3527 movq $r_ptr, %xmm0 # save $r_ptr
3529 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
3530 mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
3531 mov $acc6, $in2_z+8*1(%rsp)
3532 mov $acc7, $in2_z+8*2(%rsp)
3533 mov $acc0, $in2_z+8*3(%rsp)
3534 lea $Z2sqr(%rsp), $r_ptr # Z2^2
3535 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
# Build the in1infty/in2infty all-ones-or-zero masks from Z == 0 tests,
# interleaved with the field arithmetic (some lines elided).
3537 pcmpeqd %xmm4, %xmm5
3538 pshufd \$0xb1, %xmm1, %xmm4
3540 pshufd \$0, %xmm5, %xmm5 # in1infty
3541 pshufd \$0x1e, %xmm4, %xmm3
3544 pcmpeqd %xmm3, %xmm4
3545 pshufd \$0, %xmm4, %xmm4 # in2infty
3546 mov 0x40+8*0($b_ptr), $src0 # load original in1_z
3547 mov 0x40+8*1($b_ptr), $acc6
3548 mov 0x40+8*2($b_ptr), $acc7
3549 mov 0x40+8*3($b_ptr), $acc0
3552 lea 0x40-$bias($b_ptr), $a_ptr
3553 lea $Z1sqr(%rsp), $r_ptr # Z1^2
3554 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
3556 `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
3557 lea $S1(%rsp), $r_ptr # S1 = Z2^3
3558 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
3560 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3561 lea $S2(%rsp), $r_ptr # S2 = Z1^3
3562 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
3564 `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
3565 lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
3566 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
3568 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3569 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
3570 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
3572 lea $S1(%rsp), $b_ptr
3573 lea $R(%rsp), $r_ptr # R = S2 - S1
3574 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
3576 or $acc5, $acc4 # see if result is zero
3580 por %xmm5, %xmm2 # in1infty || in2infty
3583 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3584 lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
3585 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
3587 `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
3588 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
3589 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
3591 lea $U1(%rsp), $b_ptr
3592 lea $H(%rsp), $r_ptr # H = U2 - U1
3593 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
# Degenerate-case triage: U1==U2 with S1!=S2 is P + (-P) = infinity;
# U1==U2 with S1==S2 is P + P, which tail-calls the doubling path.
3595 or $acc5, $acc4 # see if result is zero
3599 .byte 0x3e # predict taken
3600 jnz .Ladd_proceed$x # is_equal(U1,U2)?
3604 jnz .Ladd_proceed$x # (in1infty || in2infty)?
3606 jz .Ladd_double$x # is_equal(S1,S2)?
# P + (-P): store the point at infinity (all zeros).
3608 movq %xmm0, $r_ptr # restore $r_ptr
3610 movdqu %xmm0, 0x00($r_ptr)
3611 movdqu %xmm0, 0x10($r_ptr)
3612 movdqu %xmm0, 0x20($r_ptr)
3613 movdqu %xmm0, 0x30($r_ptr)
3614 movdqu %xmm0, 0x40($r_ptr)
3615 movdqu %xmm0, 0x50($r_ptr)
# P + P: shrink the frame to point_double's layout and jump into it.
3620 movq %xmm1, $a_ptr # restore $a_ptr
3621 movq %xmm0, $r_ptr # restore $r_ptr
3622 add \$`32*(18-5)`, %rsp # difference in frame sizes
3623 jmp .Lpoint_double_shortcut$x
3627 `&load_for_sqr("$R(%rsp)", "$src0")`
3628 lea $Rsqr(%rsp), $r_ptr # R^2
3629 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
3631 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3632 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
3633 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
3635 `&load_for_sqr("$H(%rsp)", "$src0")`
3636 lea $Hsqr(%rsp), $r_ptr # H^2
3637 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
3639 `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
3640 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
3641 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
3643 `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
3644 lea $Hcub(%rsp), $r_ptr # H^3
3645 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
3647 `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
3648 lea $U2(%rsp), $r_ptr # U1*H^2
3649 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
3652 #######################################################################
3653 # operate in 4-5-0-1 "name space" that matches multiplication output
3655 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3656 my ($poly1, $poly3)=($acc6,$acc7);
# The doubling of U1*H^2 is inlined below instead of calling mul_by_2.
3659 #lea $U2(%rsp), $a_ptr
3660 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
3661 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
3664 add $acc0, $acc0 # a0:a3+a0:a3
3665 lea $Rsqr(%rsp), $a_ptr
3682 mov 8*0($a_ptr), $t0
3684 mov 8*1($a_ptr), $t1
3686 mov 8*2($a_ptr), $t2
3688 mov 8*3($a_ptr), $t3
3690 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
3692 lea $Hcub(%rsp), $b_ptr
3693 lea $res_x(%rsp), $r_ptr
3694 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
3696 mov $U2+8*0(%rsp), $t0
3697 mov $U2+8*1(%rsp), $t1
3698 mov $U2+8*2(%rsp), $t2
3699 mov $U2+8*3(%rsp), $t3
3700 lea $res_y(%rsp), $r_ptr
3702 call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
3704 mov $acc0, 8*0($r_ptr) # save the result, as
3705 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
3706 mov $acc2, 8*2($r_ptr)
3707 mov $acc3, 8*3($r_ptr)
3711 `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
3712 lea $S2(%rsp), $r_ptr
3713 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
3715 `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
3716 lea $res_y(%rsp), $r_ptr
3717 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
3719 lea $S2(%rsp), $b_ptr
3720 lea $res_y(%rsp), $r_ptr
3721 call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
# Constant-time result selection: use the in1infty/in2infty masks to pick
# the computed result, the other input, or this input, per coordinate.
3723 movq %xmm0, $r_ptr # restore $r_ptr
3725 movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
3727 pandn $res_z(%rsp), %xmm0
3729 pandn $res_z+0x10(%rsp), %xmm1
3731 pand $in2_z(%rsp), %xmm2
3732 pand $in2_z+0x10(%rsp), %xmm3
3736 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
3742 pand $in1_z(%rsp), %xmm2
3743 pand $in1_z+0x10(%rsp), %xmm3
3746 movdqu %xmm2, 0x40($r_ptr)
3747 movdqu %xmm3, 0x50($r_ptr)
3749 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
3751 pandn $res_x(%rsp), %xmm0
3753 pandn $res_x+0x10(%rsp), %xmm1
3755 pand $in2_x(%rsp), %xmm2
3756 pand $in2_x+0x10(%rsp), %xmm3
3760 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
3766 pand $in1_x(%rsp), %xmm2
3767 pand $in1_x+0x10(%rsp), %xmm3
3770 movdqu %xmm2, 0x00($r_ptr)
3771 movdqu %xmm3, 0x10($r_ptr)
3773 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
3775 pandn $res_y(%rsp), %xmm0
3777 pandn $res_y+0x10(%rsp), %xmm1
3779 pand $in2_y(%rsp), %xmm2
3780 pand $in2_y+0x10(%rsp), %xmm3
3784 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
3790 pand $in1_y(%rsp), %xmm2
3791 pand $in1_y+0x10(%rsp), %xmm3
3794 movdqu %xmm2, 0x20($r_ptr)
3795 movdqu %xmm3, 0x30($r_ptr)
# Epilogue: pop the 32*18 frame and saved registers (restores elided).
3798 lea 32*18+56(%rsp), %rsi
3813 .cfi_def_cfa_register %rsp
3814 .Lpoint_add${x}_epilogue:
3817 .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
# Generator for ecp_nistz256_point_add_affine$sfx: mixed Jacobian + affine
# addition (second point has implicit Z2 == 1, so the Z2 terms drop out).
# Called once with "q" (generic) and once with "x" (ADX/mulx) below.
3822 sub gen_add_affine () {
3824 my ($src0,$sfx,$bias);
# Stack frame: fifteen 32-byte temporaries (no in2_z — point 2 is affine).
3825 my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
3826 $res_x,$res_y,$res_z,
3827 $in1_x,$in1_y,$in1_z,
3828 $in2_x,$in2_y)=map(32*$_,(0..14));
3837 .globl ecp_nistz256_point_add_affine
3838 .type ecp_nistz256_point_add_affine,\@function,3
3840 ecp_nistz256_point_add_affine:
# Runtime dispatch to the ADX variant when available (test elided).
3843 $code.=<<___ if ($addx);
3845 and OPENSSL_ia32cap_P+8(%rip), %ecx
3847 je .Lpoint_add_affinex
3855 .type ecp_nistz256_point_add_affinex,\@function,3
3857 ecp_nistz256_point_add_affinex:
3859 .Lpoint_add_affinex:
3876 .cfi_adjust_cfa_offset 32*15+8
3877 .Ladd_affine${x}_body:
# Copy the Jacobian input point to the stack.
3879 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
3880 mov $b_org, $b_ptr # reassign
3881 movdqu 0x10($a_ptr), %xmm1
3882 movdqu 0x20($a_ptr), %xmm2
3883 movdqu 0x30($a_ptr), %xmm3
3884 movdqu 0x40($a_ptr), %xmm4
3885 movdqu 0x50($a_ptr), %xmm5
3886 mov 0x40+8*0($a_ptr), $src0 # load original in1_z
3887 mov 0x40+8*1($a_ptr), $acc6
3888 mov 0x40+8*2($a_ptr), $acc7
3889 mov 0x40+8*3($a_ptr), $acc0
3890 movdqa %xmm0, $in1_x(%rsp)
3891 movdqa %xmm1, $in1_x+0x10(%rsp)
3892 movdqa %xmm2, $in1_y(%rsp)
3893 movdqa %xmm3, $in1_y+0x10(%rsp)
3894 movdqa %xmm4, $in1_z(%rsp)
3895 movdqa %xmm5, $in1_z+0x10(%rsp)
# Copy the affine input point (x, y only) to the stack.
3898 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
3899 pshufd \$0xb1, %xmm5, %xmm3
3900 movdqu 0x10($b_ptr), %xmm1
3901 movdqu 0x20($b_ptr), %xmm2
3903 movdqu 0x30($b_ptr), %xmm3
3904 movdqa %xmm0, $in2_x(%rsp)
3905 pshufd \$0x1e, %xmm5, %xmm4
3906 movdqa %xmm1, $in2_x+0x10(%rsp)
3908 movq $r_ptr, %xmm0 # save $r_ptr
3909 movdqa %xmm2, $in2_y(%rsp)
3910 movdqa %xmm3, $in2_y+0x10(%rsp)
3916 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
3917 lea $Z1sqr(%rsp), $r_ptr # Z1^2
3918 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
# Build the in1infty/in2infty masks, interleaved with arithmetic (elided).
3920 pcmpeqd %xmm4, %xmm5
3921 pshufd \$0xb1, %xmm3, %xmm4
3922 mov 0x00($b_ptr), $src0 # $b_ptr is still valid
3923 #lea 0x00($b_ptr), $b_ptr
3924 mov $acc4, $acc1 # harmonize sqr output and mul input
3926 pshufd \$0, %xmm5, %xmm5 # in1infty
3927 pshufd \$0x1e, %xmm4, %xmm3
3932 pcmpeqd %xmm3, %xmm4
3933 pshufd \$0, %xmm4, %xmm4 # in2infty
# With Z2 == 1: U1 = X1 and S1 = Y1, so only U2/S2/H/R need computing.
3935 lea $Z1sqr-$bias(%rsp), $a_ptr
3937 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
3938 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
3940 lea $in1_x(%rsp), $b_ptr
3941 lea $H(%rsp), $r_ptr # H = U2 - U1
3942 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
3944 `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3945 lea $S2(%rsp), $r_ptr # S2 = Z1^3
3946 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
3948 `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3949 lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
3950 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
3952 `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3953 lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
3954 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
3956 lea $in1_y(%rsp), $b_ptr
3957 lea $R(%rsp), $r_ptr # R = S2 - S1
3958 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
3960 `&load_for_sqr("$H(%rsp)", "$src0")`
3961 lea $Hsqr(%rsp), $r_ptr # H^2
3962 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
3964 `&load_for_sqr("$R(%rsp)", "$src0")`
3965 lea $Rsqr(%rsp), $r_ptr # R^2
3966 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
3968 `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
3969 lea $Hcub(%rsp), $r_ptr # H^3
3970 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
3972 `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3973 lea $U2(%rsp), $r_ptr # U1*H^2
3974 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
3977 #######################################################################
3978 # operate in 4-5-0-1 "name space" that matches multiplication output
3980 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3981 my ($poly1, $poly3)=($acc6,$acc7);
# The doubling of U1*H^2 is inlined below instead of calling mul_by_2.
3984 #lea $U2(%rsp), $a_ptr
3985 #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
3986 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
3989 add $acc0, $acc0 # a0:a3+a0:a3
3990 lea $Rsqr(%rsp), $a_ptr
4007 mov 8*0($a_ptr), $t0
4009 mov 8*1($a_ptr), $t1
4011 mov 8*2($a_ptr), $t2
4013 mov 8*3($a_ptr), $t3
4015 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
4017 lea $Hcub(%rsp), $b_ptr
4018 lea $res_x(%rsp), $r_ptr
4019 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
4021 mov $U2+8*0(%rsp), $t0
4022 mov $U2+8*1(%rsp), $t1
4023 mov $U2+8*2(%rsp), $t2
4024 mov $U2+8*3(%rsp), $t3
4025 lea $H(%rsp), $r_ptr
4027 call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
4029 mov $acc0, 8*0($r_ptr) # save the result, as
4030 mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
4031 mov $acc2, 8*2($r_ptr)
4032 mov $acc3, 8*3($r_ptr)
4036 `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
4037 lea $S2(%rsp), $r_ptr
4038 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
4040 `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
4041 lea $H(%rsp), $r_ptr
4042 call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
4044 lea $S2(%rsp), $b_ptr
4045 lea $res_y(%rsp), $r_ptr
4046 call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
# Constant-time selection; when in1 is infinity the result Z is the
# Montgomery representation of 1 (.LONE_mont), since point 2 is affine.
4048 movq %xmm0, $r_ptr # restore $r_ptr
4050 movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
4052 pandn $res_z(%rsp), %xmm0
4054 pandn $res_z+0x10(%rsp), %xmm1
4056 pand .LONE_mont(%rip), %xmm2
4057 pand .LONE_mont+0x10(%rip), %xmm3
4061 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
4067 pand $in1_z(%rsp), %xmm2
4068 pand $in1_z+0x10(%rsp), %xmm3
4071 movdqu %xmm2, 0x40($r_ptr)
4072 movdqu %xmm3, 0x50($r_ptr)
4074 movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
4076 pandn $res_x(%rsp), %xmm0
4078 pandn $res_x+0x10(%rsp), %xmm1
4080 pand $in2_x(%rsp), %xmm2
4081 pand $in2_x+0x10(%rsp), %xmm3
4085 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
4091 pand $in1_x(%rsp), %xmm2
4092 pand $in1_x+0x10(%rsp), %xmm3
4095 movdqu %xmm2, 0x00($r_ptr)
4096 movdqu %xmm3, 0x10($r_ptr)
4098 movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
4100 pandn $res_y(%rsp), %xmm0
4102 pandn $res_y+0x10(%rsp), %xmm1
4104 pand $in2_y(%rsp), %xmm2
4105 pand $in2_y+0x10(%rsp), %xmm3
4109 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
4115 pand $in1_y(%rsp), %xmm2
4116 pand $in1_y+0x10(%rsp), %xmm3
4119 movdqu %xmm2, 0x20($r_ptr)
4120 movdqu %xmm3, 0x30($r_ptr)
# Epilogue: pop the 32*15 frame and saved registers (restores elided).
4122 lea 32*15+56(%rsp), %rsi
4137 .cfi_def_cfa_register %rsp
4138 .Ladd_affine${x}_epilogue:
4141 .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
# Instantiate the generic (non-ADX) mixed-addition variant.
4144 &gen_add_affine("q");
4146 ########################################################################
# ADX (adcx/adox) variants of the field helpers follow; same contracts as
# their "q" counterparts above.
4150 ########################################################################
4151 # operate in 4-5-0-1 "name space" that matches multiplication output
4153 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
# ADX modular addition; uses adc throughout (carry chain set up in elided
# lines).  Reduction lines are elided in this excerpt.
4156 .type __ecp_nistz256_add_tox,\@abi-omnipotent
4158 __ecp_nistz256_add_tox:
4160 adc 8*0($b_ptr), $a0
4161 adc 8*1($b_ptr), $a1
4163 adc 8*2($b_ptr), $a2
4164 adc 8*3($b_ptr), $a3
4179 mov $a0, 8*0($r_ptr)
4181 mov $a1, 8*1($r_ptr)
4183 mov $a2, 8*2($r_ptr)
4184 mov $a3, 8*3($r_ptr)
4187 .size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
# ADX modular subtraction (counterpart of __ecp_nistz256_sub_fromq); borrow
# handling and modulus add-back are in elided lines.
4189 .type __ecp_nistz256_sub_fromx,\@abi-omnipotent
4191 __ecp_nistz256_sub_fromx:
4193 sbb 8*0($b_ptr), $a0
4194 sbb 8*1($b_ptr), $a1
4196 sbb 8*2($b_ptr), $a2
4197 sbb 8*3($b_ptr), $a3
4212 mov $a0, 8*0($r_ptr)
4214 mov $a1, 8*1($r_ptr)
4216 mov $a2, 8*2($r_ptr)
4217 mov $a3, 8*3($r_ptr)
4220 .size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
# ADX counterpart of __ecp_nistz256_subq (body elided in this excerpt).
4222 .type __ecp_nistz256_subx,\@abi-omnipotent
4224 __ecp_nistz256_subx:
4249 .size __ecp_nistz256_subx,.-__ecp_nistz256_subx
# ADX modular doubling (counterpart of __ecp_nistz256_mul_by_2q); note adc
# rather than add — the carry flag is prepared in elided lines.
4251 .type __ecp_nistz256_mul_by_2x,\@abi-omnipotent
4253 __ecp_nistz256_mul_by_2x:
4255 adc $a0, $a0 # a0:a3+a0:a3
4274 mov $a0, 8*0($r_ptr)
4276 mov $a1, 8*1($r_ptr)
4278 mov $a2, 8*2($r_ptr)
4279 mov $a3, 8*3($r_ptr)
4282 .size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
# Instantiate the ADX mixed-addition variant.
4287 &gen_add_affine("x");
# Win64 SEH handlers.  Prototype (per Windows x64 exception dispatching):
4291 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4292 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
4300 .extern __imp_RtlVirtualUnwind
# Unwinder for the small leaf-style functions: only %r12/%r13 need to be
# recovered (stack pops elided in this excerpt).
4302 .type short_handler,\@abi-omnipotent
# If RIP is before the end-of-prologue label, nothing has been pushed yet.
4316 mov 120($context),%rax # pull context->Rax
4317 mov 248($context),%rbx # pull context->Rip
4319 mov 8($disp),%rsi # disp->ImageBase
4320 mov 56($disp),%r11 # disp->HandlerData
4322 mov 0(%r11),%r10d # HandlerData[0]
4323 lea (%rsi,%r10),%r10 # end of prologue label
4324 cmp %r10,%rbx # context->Rip<end of prologue label
4325 jb .Lcommon_seh_tail
4327 mov 152($context),%rax # pull context->Rsp
# If RIP is at/after the epilogue label, the frame is already torn down.
4329 mov 4(%r11),%r10d # HandlerData[1]
4330 lea (%rsi,%r10),%r10 # epilogue label
4331 cmp %r10,%rbx # context->Rip>=epilogue label
4332 jae .Lcommon_seh_tail
4338 mov %r12,216($context) # restore context->R12
4339 mov %r13,224($context) # restore context->R13
4341 jmp .Lcommon_seh_tail
4342 .size short_handler,.-short_handler
# Unwinder for the big stack-frame functions (point_double/add/add_affine):
# recovers the frame size from HandlerData[2] and restores all six
# callee-saved GPRs, then performs the generic virtual-unwind tail.
4344 .type full_handler,\@abi-omnipotent
4358 mov 120($context),%rax # pull context->Rax
4359 mov 248($context),%rbx # pull context->Rip
4361 mov 8($disp),%rsi # disp->ImageBase
4362 mov 56($disp),%r11 # disp->HandlerData
4364 mov 0(%r11),%r10d # HandlerData[0]
4365 lea (%rsi,%r10),%r10 # end of prologue label
4366 cmp %r10,%rbx # context->Rip<end of prologue label
4367 jb .Lcommon_seh_tail
4369 mov 152($context),%rax # pull context->Rsp
4371 mov 4(%r11),%r10d # HandlerData[1]
4372 lea (%rsi,%r10),%r10 # epilogue label
4373 cmp %r10,%rbx # context->Rip>=epilogue label
4374 jae .Lcommon_seh_tail
# HandlerData[2] holds the frame allocation to skip over.
4376 mov 8(%r11),%r10d # HandlerData[2]
4377 lea (%rax,%r10),%rax
4385 mov %rbx,144($context) # restore context->Rbx
4386 mov %rbp,160($context) # restore context->Rbp
4387 mov %r12,216($context) # restore context->R12
4388 mov %r13,224($context) # restore context->R13
4389 mov %r14,232($context) # restore context->R14
4390 mov %r15,240($context) # restore context->R15
# .Lcommon_seh_tail (label elided): hand the adjusted context to
# RtlVirtualUnwind and report ExceptionContinueSearch.
4395 mov %rax,152($context) # restore context->Rsp
4396 mov %rsi,168($context) # restore context->Rsi
4397 mov %rdi,176($context) # restore context->Rdi
4399 mov 40($disp),%rdi # disp->ContextRecord
4400 mov $context,%rsi # context
4401 mov \$154,%ecx # sizeof(CONTEXT) in qwords (1232/8)
4402 .long 0xa548f3fc # cld; rep movsq
4405 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4406 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4407 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4408 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4409 mov 40(%rsi),%r10 # disp->ContextRecord
4410 lea 56(%rsi),%r11 # &disp->HandlerData
4411 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4412 mov %r10,32(%rsp) # arg5
4413 mov %r11,40(%rsp) # arg6
4414 mov %r12,48(%rsp) # arg7
4415 mov %rcx,56(%rsp) # arg8, (NULL)
4416 call *__imp_RtlVirtualUnwind(%rip)
4418 mov \$1,%eax # ExceptionContinueSearch
4430 .size full_handler,.-full_handler
4434 .rva .LSEH_begin_ecp_nistz256_mul_by_2
4435 .rva .LSEH_end_ecp_nistz256_mul_by_2
4436 .rva .LSEH_info_ecp_nistz256_mul_by_2
4438 .rva .LSEH_begin_ecp_nistz256_div_by_2
4439 .rva .LSEH_end_ecp_nistz256_div_by_2
4440 .rva .LSEH_info_ecp_nistz256_div_by_2
4442 .rva .LSEH_begin_ecp_nistz256_mul_by_3
4443 .rva .LSEH_end_ecp_nistz256_mul_by_3
4444 .rva .LSEH_info_ecp_nistz256_mul_by_3
4446 .rva .LSEH_begin_ecp_nistz256_add
4447 .rva .LSEH_end_ecp_nistz256_add
4448 .rva .LSEH_info_ecp_nistz256_add
4450 .rva .LSEH_begin_ecp_nistz256_sub
4451 .rva .LSEH_end_ecp_nistz256_sub
4452 .rva .LSEH_info_ecp_nistz256_sub
4454 .rva .LSEH_begin_ecp_nistz256_neg
4455 .rva .LSEH_end_ecp_nistz256_neg
4456 .rva .LSEH_info_ecp_nistz256_neg
4458 .rva .LSEH_begin_ecp_nistz256_ord_mul_mont
4459 .rva .LSEH_end_ecp_nistz256_ord_mul_mont
4460 .rva .LSEH_info_ecp_nistz256_ord_mul_mont
4462 .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont
4463 .rva .LSEH_end_ecp_nistz256_ord_sqr_mont
4464 .rva .LSEH_info_ecp_nistz256_ord_sqr_mont
4466 $code.=<<___ if ($addx);
4467 .rva .LSEH_begin_ecp_nistz256_ord_mul_montx
4468 .rva .LSEH_end_ecp_nistz256_ord_mul_montx
4469 .rva .LSEH_info_ecp_nistz256_ord_mul_montx
4471 .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx
4472 .rva .LSEH_end_ecp_nistz256_ord_sqr_montx
4473 .rva .LSEH_info_ecp_nistz256_ord_sqr_montx
4476 .rva .LSEH_begin_ecp_nistz256_to_mont
4477 .rva .LSEH_end_ecp_nistz256_to_mont
4478 .rva .LSEH_info_ecp_nistz256_to_mont
4480 .rva .LSEH_begin_ecp_nistz256_mul_mont
4481 .rva .LSEH_end_ecp_nistz256_mul_mont
4482 .rva .LSEH_info_ecp_nistz256_mul_mont
4484 .rva .LSEH_begin_ecp_nistz256_sqr_mont
4485 .rva .LSEH_end_ecp_nistz256_sqr_mont
4486 .rva .LSEH_info_ecp_nistz256_sqr_mont
4488 .rva .LSEH_begin_ecp_nistz256_from_mont
4489 .rva .LSEH_end_ecp_nistz256_from_mont
4490 .rva .LSEH_info_ecp_nistz256_from_mont
4492 .rva .LSEH_begin_ecp_nistz256_gather_w5
4493 .rva .LSEH_end_ecp_nistz256_gather_w5
4494 .rva .LSEH_info_ecp_nistz256_gather_wX
4496 .rva .LSEH_begin_ecp_nistz256_gather_w7
4497 .rva .LSEH_end_ecp_nistz256_gather_w7
4498 .rva .LSEH_info_ecp_nistz256_gather_wX
4500 $code.=<<___ if ($avx>1);
4501 .rva .LSEH_begin_ecp_nistz256_avx2_gather_w5
4502 .rva .LSEH_end_ecp_nistz256_avx2_gather_w5
4503 .rva .LSEH_info_ecp_nistz256_avx2_gather_wX
4505 .rva .LSEH_begin_ecp_nistz256_avx2_gather_w7
4506 .rva .LSEH_end_ecp_nistz256_avx2_gather_w7
4507 .rva .LSEH_info_ecp_nistz256_avx2_gather_wX
4510 .rva .LSEH_begin_ecp_nistz256_point_double
4511 .rva .LSEH_end_ecp_nistz256_point_double
4512 .rva .LSEH_info_ecp_nistz256_point_double
4514 .rva .LSEH_begin_ecp_nistz256_point_add
4515 .rva .LSEH_end_ecp_nistz256_point_add
4516 .rva .LSEH_info_ecp_nistz256_point_add
4518 .rva .LSEH_begin_ecp_nistz256_point_add_affine
4519 .rva .LSEH_end_ecp_nistz256_point_add_affine
4520 .rva .LSEH_info_ecp_nistz256_point_add_affine
4522 $code.=<<___ if ($addx);
4523 .rva .LSEH_begin_ecp_nistz256_point_doublex
4524 .rva .LSEH_end_ecp_nistz256_point_doublex
4525 .rva .LSEH_info_ecp_nistz256_point_doublex
4527 .rva .LSEH_begin_ecp_nistz256_point_addx
4528 .rva .LSEH_end_ecp_nistz256_point_addx
4529 .rva .LSEH_info_ecp_nistz256_point_addx
4531 .rva .LSEH_begin_ecp_nistz256_point_add_affinex
4532 .rva .LSEH_end_ecp_nistz256_point_add_affinex
4533 .rva .LSEH_info_ecp_nistz256_point_add_affinex
4539 .LSEH_info_ecp_nistz256_mul_by_2:
4542 .rva .Lmul_by_2_body,.Lmul_by_2_epilogue # HandlerData[]
4543 .LSEH_info_ecp_nistz256_div_by_2:
4546 .rva .Ldiv_by_2_body,.Ldiv_by_2_epilogue # HandlerData[]
4547 .LSEH_info_ecp_nistz256_mul_by_3:
4550 .rva .Lmul_by_3_body,.Lmul_by_3_epilogue # HandlerData[]
4551 .LSEH_info_ecp_nistz256_add:
4554 .rva .Ladd_body,.Ladd_epilogue # HandlerData[]
4555 .LSEH_info_ecp_nistz256_sub:
4558 .rva .Lsub_body,.Lsub_epilogue # HandlerData[]
4559 .LSEH_info_ecp_nistz256_neg:
4562 .rva .Lneg_body,.Lneg_epilogue # HandlerData[]
4563 .LSEH_info_ecp_nistz256_ord_mul_mont:
4566 .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[]
4568 .LSEH_info_ecp_nistz256_ord_sqr_mont:
4571 .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[]
4574 $code.=<<___ if ($addx);
4575 .LSEH_info_ecp_nistz256_ord_mul_montx:
4578 .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[]
4580 .LSEH_info_ecp_nistz256_ord_sqr_montx:
4583 .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[]
4587 .LSEH_info_ecp_nistz256_to_mont:
4590 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
4592 .LSEH_info_ecp_nistz256_mul_mont:
4595 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
4597 .LSEH_info_ecp_nistz256_sqr_mont:
4600 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
4602 .LSEH_info_ecp_nistz256_from_mont:
4605 .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
4606 .LSEH_info_ecp_nistz256_gather_wX:
4607 .byte 0x01,0x33,0x16,0x00
4608 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
4609 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
4610 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
4611 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
4612 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
4613 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
4614 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
4615 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
4616 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
4617 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
4618 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
4621 $code.=<<___ if ($avx>1);
4622 .LSEH_info_ecp_nistz256_avx2_gather_wX:
4623 .byte 0x01,0x36,0x17,0x0b
4624 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
4625 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
4626 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
4627 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
4628 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
4629 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
4630 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
4631 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
4632 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
4633 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
4634 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
4635 .byte 0x00,0xb3,0x00,0x00 # set_frame r11
4639 .LSEH_info_ecp_nistz256_point_double:
4642 .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[]
4644 .LSEH_info_ecp_nistz256_point_add:
4647 .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[]
4649 .LSEH_info_ecp_nistz256_point_add_affine:
4652 .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[]
4655 $code.=<<___ if ($addx);
4657 .LSEH_info_ecp_nistz256_point_doublex:
4660 .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[]
4662 .LSEH_info_ecp_nistz256_point_addx:
4665 .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[]
4667 .LSEH_info_ecp_nistz256_point_add_affinex:
4670 .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[]
4675 ########################################################################
4676 # Convert ecp_nistz256_table.c to the layout expected by ecp_nistz256_gather_w7
# Locate the precomputed-table source: try the current working directory
# first, then fall back to the sibling crypto/ec directory relative to this
# script. Bareword 2-arg open is kept as-is for consistency with the rest
# of this generator script (the handle is read elsewhere in the file).
4678 open TABLE,"<ecp_nistz256_table.c" or
4679 open TABLE,"<${dir}../ecp_nistz256_table.c" or
4680 die "failed to open ecp_nistz256_table.c:",$!;
# Split each TOBN(hi,lo) 64-bit constant into two 32-bit halves and push
# the low word first, i.e. little-endian dword order in @arr.
4685 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
# Sanity-check the parsed table size: 64*16*37 32-bit words in total
# (presumably 37 sub-tables of 64 points, 16 dwords per point — confirm
# against ecp_nistz256_table.c if this ever trips).
4689 die "insane number of elements" if ($#arr != 64*16*37-1);
4693 .globl ecp_nistz256_precomputed
4694 .type ecp_nistz256_precomputed,\@object
4696 ecp_nistz256_precomputed:
# Emit the table 16 dwords (64 bytes) per ".long" directive line until
# @arr is exhausted; splice consumes @arr destructively.
4698 while (@line=splice(@arr,0,16)) {
4699 print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
4702 .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
# perlasm convention: expand every `...`-quoted Perl expression embedded
# in the generated text by eval'ing it (used for computed constants).
4705 $code =~ s/\`([^\`]*)\`/eval $1/gem;