3 #******************************************************************************#
4 #* Copyright(c) 2012, Intel Corp. *#
5 #* Developers and authors: *#
6 #* Shay Gueron (1, 2), and Vlad Krasnov (1) *#
7 #* (1) Intel Architecture Group, Microprocessor and Chipset Development, *#
8 #* Israel Development Center, Haifa, Israel *#
9 #* (2) University of Haifa *#
10 #******************************************************************************#
11 #* This submission to OpenSSL is to be made available under the OpenSSL *#
12 #* license, and only to the OpenSSL project, in order to allow integration *#
#* into the publicly distributed code. *#
14 #* The use of this code, or portions of this code, or concepts embedded in *#
15 #* this code, or modification of this code and/or algorithm(s) in it, or the *#
#* use of this code for any other purpose than stated above, requires special *#
#* license. *#
#******************************************************************************#
19 #******************************************************************************#
21 #* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS *#
22 #* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *#
23 #* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *#
24 #* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT*#
25 #* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, *#
26 #* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF *#
27 #* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS *#
28 #* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN *#
29 #* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) *#
30 #* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE *#
31 #* POSSIBILITY OF SUCH DAMAGE. *#
32 #******************************************************************************#
34 #* [1] S. Gueron, "Efficient Software Implementations of Modular *#
35 #* Exponentiation", http://eprint.iacr.org/2011/239 *#
36 #* [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". *#
37 #* IEEE Proceedings of 9th International Conference on Information *#
38 #* Technology: New Generations (ITNG 2012), 821-823 (2012). *#
39 #* [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation*#
40 #* Journal of Cryptographic Engineering 2:31-43 (2012). *#
41 #* [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis *#
42 #* resistant 512-bit and 1024-bit modular exponentiation for optimizing *#
43 #* RSA1024 and RSA2048 on x86_64 platforms", *#
44 #* http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest*#
45 ################################################################################
47 # While original submission covers 512- and 1024-bit exponentiation,
48 # this module is limited to 512-bit version only (and as such
49 # accelerates RSA1024 sign). This is because improvement for longer
50 # keys is not high enough to justify the effort, highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, not yet
# released at the time of this writing!] Nor does this module implement
53 # "monolithic" complete exponentiation jumbo-subroutine, but adheres
54 # to more modular mixture of C and assembly. And it's optimized even
55 # for processors other than Intel Core family (see table below for
56 # improvement coefficients).
59 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
60 # ----------------+---------------------------
61 # Opteron +13% |+5% +20%
62 # Bulldozer -0% |-1% +10%
64 # Westmere +5% |+14% +17%
65 # Sandy Bridge +2% |+12% +29%
66 # Ivy Bridge +1% |+11% +35%
67 # Haswell(**) -0% |+12% +39%
69 # VIA Nano +70% |+9% +25%
71 # (*) rsax engine and fips numbers are presented for reference
73 # (**) MULX was attempted, but found to give only marginal improvement;
77 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
79 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm translator ("x86_64-xlate.pl") relative to this
# script's own directory, falling back to the shared perlasm directory
# two levels up; it converts the perlasm source into the requested
# assembler flavour.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
86 open OUT,"| $^X $xlate $flavour $output";
89 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
90 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
94 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
95 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
99 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
100 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
104 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
106 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
111 .extern OPENSSL_ia32cap_P
114 .type rsaz_512_sqr,\@function,4
116 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
126 movq $mod, %rbp # common argument
131 $code.=<<___ if ($addx);
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX (BMI2) and ADCX/ADOX (ADX) support
142 movl $times,128+8(%rsp)
186 addq %r8, %r8 #shlq \$1, %r8
188 adcq %r9, %r9 #shld \$1, %r8, %r9
249 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
251 adcq %r11, %r11 #shld \$1, %r10, %r11
289 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
307 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
337 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
355 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
380 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
395 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
419 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
427 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
448 adcq %r12, %r12 #shld \$1, %rbx, %r12
449 adcq %r13, %r13 #shld \$1, %r12, %r13
450 adcq %r14, %r14 #shld \$1, %r13, %r14
480 call __rsaz_512_reduce
492 call __rsaz_512_subtract
496 movl 128+8(%rsp), $times
508 movl $times,128+8(%rsp)
509 movq $out, %xmm0 # off-load
510 movq %rbp, %xmm1 # off-load
514 mulx 16($inp), %rcx, %r10
515 xor %rbp, %rbp # cf=0, of=0
517 mulx 24($inp), %rax, %r11
520 mulx 32($inp), %rcx, %r12
523 mulx 40($inp), %rax, %r13
526 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
530 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
532 adcx %rbp, %r15 # %rbp is 0
539 mulx %rdx, %rax, %rdx
548 mulx 16($inp), %rax, %rbx
552 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
556 mulx 32($inp), %rax, %rbx
560 mulx 40($inp), $out, %r8
564 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
568 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
578 mulx %rdx, %rax, %rcx
585 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
588 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
592 mulx 32($inp), %rax, %rcx
596 mulx 40($inp), $out, %r9
600 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
604 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
614 mulx %rdx, %rax, %rdx
621 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
624 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
628 mulx 40($inp), $out, %r10
632 mulx 48($inp), %rax, %rbx
636 mulx 56($inp), $out, %r10
647 mulx %rdx, %rax, %rdx
657 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
661 mulx 48($inp), %rax, %rcx
665 mulx 56($inp), $out, %r11
675 mulx %rdx, %rax, %rdx
685 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
689 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
699 mulx %rdx, %rax, %rdx
709 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
719 mulx %rdx, %rax, %rdx
725 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
726 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
729 mulx %rdx, %rax, %rdx
741 movq 128(%rsp), %rdx # pull $n0
751 call __rsaz_512_reducex
763 call __rsaz_512_subtract
767 movl 128+8(%rsp), $times
778 leaq 128+24+48(%rsp), %rax
788 .size rsaz_512_sqr,.-rsaz_512_sqr
792 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
795 .type rsaz_512_mul,\@function,5
807 movq $out, %xmm0 # off-load arguments
811 $code.=<<___ if ($addx);
813 andl OPENSSL_ia32cap_P+8(%rip),%r11d
814 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
818 movq ($bp), %rbx # pass b[0]
819 movq $bp, %rbp # pass argument
834 call __rsaz_512_reduce
836 $code.=<<___ if ($addx);
841 movq $bp, %rbp # pass argument
842 movq ($bp), %rdx # pass b[0]
848 movq 128(%rsp), %rdx # pull $n0
858 call __rsaz_512_reducex
872 call __rsaz_512_subtract
874 leaq 128+24+48(%rsp), %rax
884 .size rsaz_512_mul,.-rsaz_512_mul
888 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
890 .globl rsaz_512_mul_gather4
891 .type rsaz_512_mul_gather4,\@function,6
893 rsaz_512_mul_gather4:
904 $code.=<<___ if ($addx);
906 andl OPENSSL_ia32cap_P+8(%rip),%r11d
907 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
911 movl 64($bp,$pwr,4), %eax
912 movq $out, %xmm0 # off-load arguments
913 movl ($bp,$pwr,4), %ebx
921 leaq 128($bp,$pwr,4), %rbp
922 mulq %rbx # 0 iteration
1049 leaq 128(%rbp), %rbp
1053 jnz .Loop_mul_gather
1076 call __rsaz_512_reduce
1078 $code.=<<___ if ($addx);
1079 jmp .Lmul_gather_tail
1083 mov 64($bp,$pwr,4), %eax
1084 movq $out, %xmm0 # off-load arguments
1085 lea 128($bp,$pwr,4), %rbp
1086 mov ($bp,$pwr,4), %edx
1092 mulx ($ap), %rbx, %r8 # 0 iteration
1094 xor %edi, %edi # cf=0, of=0
1096 mulx 8($ap), %rax, %r9
1099 mulx 16($ap), %rbx, %r10
1100 movd 64(%rbp), %xmm5
1103 mulx 24($ap), %rax, %r11
1107 mulx 32($ap), %rbx, %r12
1111 mulx 40($ap), %rax, %r13
1114 mulx 48($ap), %rbx, %r14
1118 mulx 56($ap), %rax, %r15
1123 adcx %rdi, %r15 # %rdi is 0
1126 jmp .Loop_mulx_gather
1130 mulx ($ap), %rax, %r8
1134 mulx 8($ap), %rax, %r9
1135 .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
1139 mulx 16($ap), %rax, %r10
1140 movd 64(%rbp), %xmm5
1145 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1151 mulx 32($ap), %rax, %r12
1155 mulx 40($ap), %rax, %r13
1159 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1163 mulx 56($ap), %rax, %r15
1165 mov %rbx, 64(%rsp,%rcx,8)
1169 adcx %rdi, %r15 # cf=0
1172 jnz .Loop_mulx_gather
1176 mov %r10, 64+16(%rsp)
1177 mov %r11, 64+24(%rsp)
1178 mov %r12, 64+32(%rsp)
1179 mov %r13, 64+40(%rsp)
1180 mov %r14, 64+48(%rsp)
1181 mov %r15, 64+56(%rsp)
1186 mov 128(%rsp), %rdx # pull $n0
1196 call __rsaz_512_reducex
1206 adcq 104(%rsp), %r13
1207 adcq 112(%rsp), %r14
1208 adcq 120(%rsp), %r15
1211 call __rsaz_512_subtract
1213 leaq 128+24+48(%rsp), %rax
1214 movq -48(%rax), %r15
1215 movq -40(%rax), %r14
1216 movq -32(%rax), %r13
1217 movq -24(%rax), %r12
1218 movq -16(%rax), %rbp
1221 .Lmul_gather4_epilogue:
1223 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1227 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1229 .globl rsaz_512_mul_scatter4
1230 .type rsaz_512_mul_scatter4,\@function,6
1232 rsaz_512_mul_scatter4:
1241 .Lmul_scatter4_body:
1242 leaq ($tbl,$pwr,4), $tbl
1243 movq $out, %xmm0 # off-load arguments
1250 $code.=<<___ if ($addx);
1251 movl \$0x80100,%r11d
1252 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1253 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1257 movq ($out),%rbx # pass b[0]
1272 call __rsaz_512_reduce
1274 $code.=<<___ if ($addx);
1275 jmp .Lmul_scatter_tail
1279 movq ($out), %rdx # pass b[0]
1280 call __rsaz_512_mulx
1285 movq 128(%rsp), %rdx # pull $n0
1295 call __rsaz_512_reducex
1305 adcq 104(%rsp), %r13
1306 adcq 112(%rsp), %r14
1307 adcq 120(%rsp), %r15
1311 call __rsaz_512_subtract
1313 movl %r8d, 64*0($inp) # scatter
1315 movl %r9d, 64*2($inp)
1317 movl %r10d, 64*4($inp)
1319 movl %r11d, 64*6($inp)
1321 movl %r12d, 64*8($inp)
1323 movl %r13d, 64*10($inp)
1325 movl %r14d, 64*12($inp)
1327 movl %r15d, 64*14($inp)
1329 movl %r8d, 64*1($inp)
1330 movl %r9d, 64*3($inp)
1331 movl %r10d, 64*5($inp)
1332 movl %r11d, 64*7($inp)
1333 movl %r12d, 64*9($inp)
1334 movl %r13d, 64*11($inp)
1335 movl %r14d, 64*13($inp)
1336 movl %r15d, 64*15($inp)
1338 leaq 128+24+48(%rsp), %rax
1339 movq -48(%rax), %r15
1340 movq -40(%rax), %r14
1341 movq -32(%rax), %r13
1342 movq -24(%rax), %r12
1343 movq -16(%rax), %rbp
1346 .Lmul_scatter4_epilogue:
1348 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1352 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1354 .globl rsaz_512_mul_by_one
1355 .type rsaz_512_mul_by_one,\@function,4
1357 rsaz_512_mul_by_one:
1368 $code.=<<___ if ($addx);
1369 movl OPENSSL_ia32cap_P+8(%rip),%eax
1372 movq $mod, %rbp # reassign argument
1385 movdqa %xmm0, (%rsp)
1386 movdqa %xmm0, 16(%rsp)
1387 movdqa %xmm0, 32(%rsp)
1388 movdqa %xmm0, 48(%rsp)
1389 movdqa %xmm0, 64(%rsp)
1390 movdqa %xmm0, 80(%rsp)
1391 movdqa %xmm0, 96(%rsp)
1393 $code.=<<___ if ($addx);
1395 cmpl \$0x80100,%eax # check for MULX and ADO/CX
1399 call __rsaz_512_reduce
1401 $code.=<<___ if ($addx);
1405 movq 128(%rsp), %rdx # pull $n0
1406 call __rsaz_512_reducex
1419 leaq 128+24+48(%rsp), %rax
1420 movq -48(%rax), %r15
1421 movq -40(%rax), %r14
1422 movq -32(%rax), %r13
1423 movq -24(%rax), %r12
1424 movq -16(%rax), %rbp
1427 .Lmul_by_one_epilogue:
1429 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1432 { # __rsaz_512_reduce
1434 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1436 # clobbers: everything except %rbp and %rdi
1438 .type __rsaz_512_reduce,\@abi-omnipotent
1442 imulq 128+8(%rsp), %rbx
1445 jmp .Lreduction_loop
1476 movq 128+8(%rsp), %rsi
1517 jne .Lreduction_loop
1520 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1524 # __rsaz_512_reducex
1526 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1528 # clobbers: everything except %rbp and %rdi
1530 .type __rsaz_512_reducex,\@abi-omnipotent
1533 #movq 128+8(%rsp), %rdx # pull $n0
1535 xorq %rsi, %rsi # cf=0,of=0
1537 jmp .Lreduction_loopx
1542 mulx 0(%rbp), %rax, %r8
1546 mulx 8(%rbp), %rax, %r9
1550 mulx 16(%rbp), %rbx, %r10
1554 mulx 24(%rbp), %rbx, %r11
1558 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1564 mulx 128+8(%rsp), %rbx, %rdx
1567 mulx 40(%rbp), %rax, %r13
1571 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1575 mulx 56(%rbp), %rax, %r15
1578 adox %rsi, %r15 # %rsi is 0
1579 adcx %rsi, %r15 # cf=0
1582 jne .Lreduction_loopx
1585 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1588 { # __rsaz_512_subtract
1589 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1591 # clobbers: everything but %rdi, %rsi and %rbp
1593 .type __rsaz_512_subtract,\@abi-omnipotent
1595 __rsaz_512_subtract:
1649 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1654 # input: %rsi - ap, %rbp - bp
1656 # clobbers: everything
1657 my ($ap,$bp) = ("%rsi","%rbp");
1659 .type __rsaz_512_mul,\@abi-omnipotent
1800 .size __rsaz_512_mul,.-__rsaz_512_mul
1806 # input: %rsi - ap, %rbp - bp
1808 # clobbers: everything
1809 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1811 .type __rsaz_512_mulx,\@abi-omnipotent
1814 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
1815 xor $zero, $zero # cf=0,of=0
1817 mulx 8($ap), %rax, %r9
1820 mulx 16($ap), %rbx, %r10
1823 mulx 24($ap), %rax, %r11
1826 .byte 0xc4,0x62,0xe3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rbx, %r12
1829 mulx 40($ap), %rax, %r13
1832 mulx 48($ap), %rbx, %r14
1835 mulx 56($ap), %rax, %r15
1839 adcx $zero, %r15 # cf=0
1847 mulx ($ap), %rax, %r8
1851 mulx 8($ap), %rax, %r9
1855 mulx 16($ap), %rax, %r10
1859 mulx 24($ap), %rax, %r11
1863 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1867 mulx 40($ap), %rax, %r13
1871 mulx 48($ap), %rax, %r14
1875 mulx 56($ap), %rax, %r15
1876 movq 64($bp,%rcx,8), %rdx
1877 movq %rbx, 8+64-8(%rsp,%rcx,8)
1880 adcx $zero, %r15 # cf=0
1886 mulx ($ap), %rax, %r8
1890 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
1894 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
1898 mulx 24($ap), %rax, %r11
1902 mulx 32($ap), %rax, %r12
1906 mulx 40($ap), %rax, %r13
1910 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1914 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
1919 mov %rbx, 8+64-8(%rsp)
1921 mov %r9, 8+64+8(%rsp)
1922 mov %r10, 8+64+16(%rsp)
1923 mov %r11, 8+64+24(%rsp)
1924 mov %r12, 8+64+32(%rsp)
1925 mov %r13, 8+64+40(%rsp)
1926 mov %r14, 8+64+48(%rsp)
1927 mov %r15, 8+64+56(%rsp)
1930 .size __rsaz_512_mulx,.-__rsaz_512_mulx
1934 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1936 .globl rsaz_512_scatter4
1937 .type rsaz_512_scatter4,\@abi-omnipotent
1940 leaq ($out,$power,4), $out
1950 leaq 128($out), $out
1954 .size rsaz_512_scatter4,.-rsaz_512_scatter4
1956 .globl rsaz_512_gather4
1957 .type rsaz_512_gather4,\@abi-omnipotent
1960 leaq ($inp,$power,4), $inp
1967 leaq 128($inp), $inp
1975 .size rsaz_512_gather4,.-rsaz_512_gather4
1979 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1980 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1988 .extern __imp_RtlVirtualUnwind
1989 .type se_handler,\@abi-omnipotent
2003 mov 120($context),%rax # pull context->Rax
2004 mov 248($context),%rbx # pull context->Rip
2006 mov 8($disp),%rsi # disp->ImageBase
2007 mov 56($disp),%r11 # disp->HandlerData
2009 mov 0(%r11),%r10d # HandlerData[0]
2010 lea (%rsi,%r10),%r10 # end of prologue label
2011 cmp %r10,%rbx # context->Rip<end of prologue label
2012 jb .Lcommon_seh_tail
2014 mov 152($context),%rax # pull context->Rsp
2016 mov 4(%r11),%r10d # HandlerData[1]
2017 lea (%rsi,%r10),%r10 # epilogue label
2018 cmp %r10,%rbx # context->Rip>=epilogue label
2019 jae .Lcommon_seh_tail
2021 lea 128+24+48(%rax),%rax
2029 mov %rbx,144($context) # restore context->Rbx
2030 mov %rbp,160($context) # restore context->Rbp
2031 mov %r12,216($context) # restore context->R12
2032 mov %r13,224($context) # restore context->R13
2033 mov %r14,232($context) # restore context->R14
2034 mov %r15,240($context) # restore context->R15
2039 mov %rax,152($context) # restore context->Rsp
2040 mov %rsi,168($context) # restore context->Rsi
2041 mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT) in 8-byte units — rep movsq copies qwords
.long 0xa548f3fc # cld; rep movsq
2049 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2050 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2051 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2052 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2053 mov 40(%rsi),%r10 # disp->ContextRecord
2054 lea 56(%rsi),%r11 # &disp->HandlerData
2055 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2056 mov %r10,32(%rsp) # arg5
2057 mov %r11,40(%rsp) # arg6
2058 mov %r12,48(%rsp) # arg7
2059 mov %rcx,56(%rsp) # arg8, (NULL)
2060 call *__imp_RtlVirtualUnwind(%rip)
2062 mov \$1,%eax # ExceptionContinueSearch
2074 .size sqr_handler,.-sqr_handler
2078 .rva .LSEH_begin_rsaz_512_sqr
2079 .rva .LSEH_end_rsaz_512_sqr
2080 .rva .LSEH_info_rsaz_512_sqr
2082 .rva .LSEH_begin_rsaz_512_mul
2083 .rva .LSEH_end_rsaz_512_mul
2084 .rva .LSEH_info_rsaz_512_mul
2086 .rva .LSEH_begin_rsaz_512_mul_gather4
2087 .rva .LSEH_end_rsaz_512_mul_gather4
2088 .rva .LSEH_info_rsaz_512_mul_gather4
2090 .rva .LSEH_begin_rsaz_512_mul_scatter4
2091 .rva .LSEH_end_rsaz_512_mul_scatter4
2092 .rva .LSEH_info_rsaz_512_mul_scatter4
2094 .rva .LSEH_begin_rsaz_512_mul_by_one
2095 .rva .LSEH_end_rsaz_512_mul_by_one
2096 .rva .LSEH_info_rsaz_512_mul_by_one
2100 .LSEH_info_rsaz_512_sqr:
2103 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2104 .LSEH_info_rsaz_512_mul:
2107 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2108 .LSEH_info_rsaz_512_mul_gather4:
2111 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2112 .LSEH_info_rsaz_512_mul_scatter4:
2115 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2116 .LSEH_info_rsaz_512_mul_by_one:
2119 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2123 $code =~ s/\`([^\`]*)\`/eval $1/gem;