2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ##############################################################################
12 # Copyright (c) 2012, Intel Corporation #
14 # All rights reserved. #
16 # Redistribution and use in source and binary forms, with or without #
17 # modification, are permitted provided that the following conditions are #
20 # * Redistributions of source code must retain the above copyright #
21 # notice, this list of conditions and the following disclaimer. #
23 # * Redistributions in binary form must reproduce the above copyright #
24 # notice, this list of conditions and the following disclaimer in the #
25 # documentation and/or other materials provided with the #
28 # * Neither the name of the Intel Corporation nor the names of its #
29 # contributors may be used to endorse or promote products derived from #
30 # this software without specific prior written permission. #
33 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
34 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
35 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
36 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
37 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
38 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
39 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
40 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
41 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
42 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
43 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
45 ##############################################################################
46 # Developers and authors: #
47 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
48 # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
49 # Israel Development Center, Haifa, Israel #
50 # (2) University of Haifa #
51 ##############################################################################
53 # [1] S. Gueron, "Efficient Software Implementations of Modular #
54 # Exponentiation", http://eprint.iacr.org/2011/239 #
55 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
56 # IEEE Proceedings of 9th International Conference on Information #
57 # Technology: New Generations (ITNG 2012), 821-823 (2012). #
58 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
59 # Journal of Cryptographic Engineering 2:31-43 (2012). #
60 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
61 # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
62 # RSA1024 and RSA2048 on x86_64 platforms", #
63 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
64 ##############################################################################
66 # While original submission covers 512- and 1024-bit exponentiation,
67 # this module is limited to 512-bit version only (and as such
68 # accelerates RSA1024 sign). This is because improvement for longer
69 # keys is not high enough to justify the effort, highest measured
70 # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
71 # at the time of this writing!] Nor does this module implement a
72 # "monolithic" complete exponentiation jumbo-subroutine, but adheres
73 # to a more modular mixture of C and assembly. And it's optimized even
74 # for processors other than Intel Core family (see table below for
75 # improvement coefficients).
78 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
79 # ----------------+---------------------------
80 # Opteron +13% |+5% +20%
81 # Bulldozer -0% |-1% +10%
83 # Westmere +5% |+14% +17%
84 # Sandy Bridge +2% |+12% +29%
85 # Ivy Bridge +1% |+11% +35%
86 # Haswell(**) -0% |+12% +39%
88 # VIA Nano +70% |+9% +25%
90 # (*) rsax engine and fips numbers are presented for reference
92 # (**) MULX was attempted, but found to give only marginal improvement;
96 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
98 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
100 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
101 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
102 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
103 die "can't locate x86_64-xlate.pl";
105 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
108 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
109 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
113 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
114 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
118 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
119 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
# Probe the compiler driver for clang/LLVM when no earlier probe set $addx.
# $addx gates emission of the MULX/ADCX/ADOX code paths; it is turned on for
# version 3.3 and later. The major version is matched as [0-9]+ (not a single
# digit in 3-9) so that clang 10 and newer are recognised as well.
123 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
124 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
125 $addx = ($ver>=3.03);
128 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
130 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
135 .extern OPENSSL_ia32cap_P
138 .type rsaz_512_sqr,\@function,5
140 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
156 .cfi_adjust_cfa_offset 128+24
158 movq $mod, %rbp # common argument
163 $code.=<<___ if ($addx);
165 andl OPENSSL_ia32cap_P+8(%rip),%r11d
166 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
174 movl $times,128+8(%rsp)
218 addq %r8, %r8 #shlq \$1, %r8
220 adcq %r9, %r9 #shld \$1, %r8, %r9
281 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
283 adcq %r11, %r11 #shld \$1, %r10, %r11
321 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
339 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
369 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
387 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
412 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
427 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
451 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
459 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
480 adcq %r12, %r12 #shld \$1, %rbx, %r12
481 adcq %r13, %r13 #shld \$1, %r12, %r13
482 adcq %r14, %r14 #shld \$1, %r13, %r14
512 call __rsaz_512_reduce
524 call __rsaz_512_subtract
528 movl 128+8(%rsp), $times
540 movl $times,128+8(%rsp)
541 movq $out, %xmm0 # off-load
542 movq %rbp, %xmm1 # off-load
546 mulx 16($inp), %rcx, %r10
547 xor %rbp, %rbp # cf=0, of=0
549 mulx 24($inp), %rax, %r11
552 mulx 32($inp), %rcx, %r12
555 mulx 40($inp), %rax, %r13
558 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
562 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
564 adcx %rbp, %r15 # %rbp is 0
571 mulx %rdx, %rax, %rdx
580 mulx 16($inp), %rax, %rbx
584 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
588 mulx 32($inp), %rax, %rbx
592 mulx 40($inp), $out, %r8
596 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
600 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
610 mulx %rdx, %rax, %rcx
617 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
620 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
624 mulx 32($inp), %rax, %rcx
628 mulx 40($inp), $out, %r9
632 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
636 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
646 mulx %rdx, %rax, %rdx
653 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
656 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
660 mulx 40($inp), $out, %r10
664 mulx 48($inp), %rax, %rbx
668 mulx 56($inp), $out, %r10
679 mulx %rdx, %rax, %rdx
689 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
693 mulx 48($inp), %rax, %rcx
697 mulx 56($inp), $out, %r11
707 mulx %rdx, %rax, %rdx
717 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
721 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
731 mulx %rdx, %rax, %rdx
741 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
751 mulx %rdx, %rax, %rdx
757 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
758 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
761 mulx %rdx, %rax, %rdx
773 movq 128(%rsp), %rdx # pull $n0
783 call __rsaz_512_reducex
795 call __rsaz_512_subtract
799 movl 128+8(%rsp), $times
810 leaq 128+24+48(%rsp), %rax
825 .cfi_def_cfa_register %rsp
829 .size rsaz_512_sqr,.-rsaz_512_sqr
833 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
836 .type rsaz_512_mul,\@function,5
854 .cfi_adjust_cfa_offset 128+24
856 movq $out, %xmm0 # off-load arguments
860 $code.=<<___ if ($addx);
862 andl OPENSSL_ia32cap_P+8(%rip),%r11d
863 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
867 movq ($bp), %rbx # pass b[0]
868 movq $bp, %rbp # pass argument
883 call __rsaz_512_reduce
885 $code.=<<___ if ($addx);
890 movq $bp, %rbp # pass argument
891 movq ($bp), %rdx # pass b[0]
897 movq 128(%rsp), %rdx # pull $n0
907 call __rsaz_512_reducex
921 call __rsaz_512_subtract
923 leaq 128+24+48(%rsp), %rax
938 .cfi_def_cfa_register %rsp
942 .size rsaz_512_mul,.-rsaz_512_mul
946 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
948 .globl rsaz_512_mul_gather4
949 .type rsaz_512_mul_gather4,\@function,6
951 rsaz_512_mul_gather4:
966 subq \$`128+24+($win64?0xb0:0)`, %rsp
967 .cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)`
969 $code.=<<___ if ($win64);
970 movaps %xmm6,0xa0(%rsp)
971 movaps %xmm7,0xb0(%rsp)
972 movaps %xmm8,0xc0(%rsp)
973 movaps %xmm9,0xd0(%rsp)
974 movaps %xmm10,0xe0(%rsp)
975 movaps %xmm11,0xf0(%rsp)
976 movaps %xmm12,0x100(%rsp)
977 movaps %xmm13,0x110(%rsp)
978 movaps %xmm14,0x120(%rsp)
979 movaps %xmm15,0x130(%rsp)
984 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
985 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
987 pshufd \$0,%xmm8,%xmm8 # broadcast $power
991 ########################################################################
992 # calculate mask by comparing 0..15 to $power
994 for($i=0;$i<4;$i++) {
996 paddd %xmm`$i`,%xmm`$i+1`
997 pcmpeqd %xmm8,%xmm`$i`
998 movdqa %xmm7,%xmm`$i+3`
1003 paddd %xmm`$i`,%xmm`$i+1`
1004 pcmpeqd %xmm8,%xmm`$i`
1010 movdqa 16*0($bp),%xmm8
1011 movdqa 16*1($bp),%xmm9
1012 movdqa 16*2($bp),%xmm10
1013 movdqa 16*3($bp),%xmm11
1015 movdqa 16*4($bp),%xmm12
1017 movdqa 16*5($bp),%xmm13
1019 movdqa 16*6($bp),%xmm14
1021 movdqa 16*7($bp),%xmm15
1035 pshufd \$0x4e,%xmm8,%xmm9
1038 $code.=<<___ if ($addx);
1039 movl \$0x80100,%r11d
1040 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1041 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1047 movq $n0, 128(%rsp) # off-load arguments
1048 movq $out, 128+8(%rsp)
1049 movq $mod, 128+16(%rsp)
1053 mulq %rbx # 0 iteration
1102 jmp .Loop_mul_gather
1106 movdqa 16*0(%rbp),%xmm8
1107 movdqa 16*1(%rbp),%xmm9
1108 movdqa 16*2(%rbp),%xmm10
1109 movdqa 16*3(%rbp),%xmm11
1111 movdqa 16*4(%rbp),%xmm12
1113 movdqa 16*5(%rbp),%xmm13
1115 movdqa 16*6(%rbp),%xmm14
1117 movdqa 16*7(%rbp),%xmm15
1118 leaq 128(%rbp), %rbp
1131 pshufd \$0x4e,%xmm8,%xmm9
1201 jnz .Loop_mul_gather
1212 movq 128+8(%rsp), $out
1213 movq 128+16(%rsp), %rbp
1224 call __rsaz_512_reduce
1226 $code.=<<___ if ($addx);
1227 jmp .Lmul_gather_tail
1233 mov $n0, 128(%rsp) # off-load arguments
1234 mov $out, 128+8(%rsp)
1235 mov $mod, 128+16(%rsp)
1237 mulx ($ap), %rbx, %r8 # 0 iteration
1239 xor %edi, %edi # cf=0, of=0
1241 mulx 8($ap), %rax, %r9
1243 mulx 16($ap), %rbx, %r10
1246 mulx 24($ap), %rax, %r11
1249 mulx 32($ap), %rbx, %r12
1252 mulx 40($ap), %rax, %r13
1255 mulx 48($ap), %rbx, %r14
1258 mulx 56($ap), %rax, %r15
1263 adcx %rdi, %r15 # %rdi is 0
1266 jmp .Loop_mulx_gather
1270 movdqa 16*0(%rbp),%xmm8
1271 movdqa 16*1(%rbp),%xmm9
1272 movdqa 16*2(%rbp),%xmm10
1273 movdqa 16*3(%rbp),%xmm11
1275 movdqa 16*4(%rbp),%xmm12
1277 movdqa 16*5(%rbp),%xmm13
1279 movdqa 16*6(%rbp),%xmm14
1281 movdqa 16*7(%rbp),%xmm15
1282 leaq 128(%rbp), %rbp
1295 pshufd \$0x4e,%xmm8,%xmm9
1299 .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
1303 mulx 8($ap), %rax, %r9
1307 mulx 16($ap), %rax, %r10
1311 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1315 mulx 32($ap), %rax, %r12
1319 mulx 40($ap), %rax, %r13
1323 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1328 mulx 56($ap), %rax, %r15
1329 mov %rbx, 64(%rsp,%rcx,8)
1333 adcx %rdi, %r15 # cf=0
1336 jnz .Loop_mulx_gather
1340 mov %r10, 64+16(%rsp)
1341 mov %r11, 64+24(%rsp)
1342 mov %r12, 64+32(%rsp)
1343 mov %r13, 64+40(%rsp)
1344 mov %r14, 64+48(%rsp)
1345 mov %r15, 64+56(%rsp)
1347 mov 128(%rsp), %rdx # pull arguments
1348 mov 128+8(%rsp), $out
1349 mov 128+16(%rsp), %rbp
1360 call __rsaz_512_reducex
1370 adcq 104(%rsp), %r13
1371 adcq 112(%rsp), %r14
1372 adcq 120(%rsp), %r15
1375 call __rsaz_512_subtract
1377 leaq 128+24+48(%rsp), %rax
1379 $code.=<<___ if ($win64);
1380 movaps 0xa0-0xc8(%rax),%xmm6
1381 movaps 0xb0-0xc8(%rax),%xmm7
1382 movaps 0xc0-0xc8(%rax),%xmm8
1383 movaps 0xd0-0xc8(%rax),%xmm9
1384 movaps 0xe0-0xc8(%rax),%xmm10
1385 movaps 0xf0-0xc8(%rax),%xmm11
1386 movaps 0x100-0xc8(%rax),%xmm12
1387 movaps 0x110-0xc8(%rax),%xmm13
1388 movaps 0x120-0xc8(%rax),%xmm14
1389 movaps 0x130-0xc8(%rax),%xmm15
1394 movq -48(%rax), %r15
1396 movq -40(%rax), %r14
1398 movq -32(%rax), %r13
1400 movq -24(%rax), %r12
1402 movq -16(%rax), %rbp
1407 .cfi_def_cfa_register %rsp
1408 .Lmul_gather4_epilogue:
1411 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1415 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1417 .globl rsaz_512_mul_scatter4
1418 .type rsaz_512_mul_scatter4,\@function,6
1420 rsaz_512_mul_scatter4:
1437 .cfi_adjust_cfa_offset 128+24
1438 .Lmul_scatter4_body:
1439 leaq ($tbl,$pwr,8), $tbl
1440 movq $out, %xmm0 # off-load arguments
1447 $code.=<<___ if ($addx);
1448 movl \$0x80100,%r11d
1449 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1450 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1454 movq ($out),%rbx # pass b[0]
1469 call __rsaz_512_reduce
1471 $code.=<<___ if ($addx);
1472 jmp .Lmul_scatter_tail
1476 movq ($out), %rdx # pass b[0]
1477 call __rsaz_512_mulx
1482 movq 128(%rsp), %rdx # pull $n0
1492 call __rsaz_512_reducex
1502 adcq 104(%rsp), %r13
1503 adcq 112(%rsp), %r14
1504 adcq 120(%rsp), %r15
1508 call __rsaz_512_subtract
1510 movq %r8, 128*0($inp) # scatter
1511 movq %r9, 128*1($inp)
1512 movq %r10, 128*2($inp)
1513 movq %r11, 128*3($inp)
1514 movq %r12, 128*4($inp)
1515 movq %r13, 128*5($inp)
1516 movq %r14, 128*6($inp)
1517 movq %r15, 128*7($inp)
1519 leaq 128+24+48(%rsp), %rax
1521 movq -48(%rax), %r15
1523 movq -40(%rax), %r14
1525 movq -32(%rax), %r13
1527 movq -24(%rax), %r12
1529 movq -16(%rax), %rbp
1534 .cfi_def_cfa_register %rsp
1535 .Lmul_scatter4_epilogue:
1538 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1542 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1544 .globl rsaz_512_mul_by_one
1545 .type rsaz_512_mul_by_one,\@function,4
1547 rsaz_512_mul_by_one:
1563 .cfi_adjust_cfa_offset 128+24
1566 $code.=<<___ if ($addx);
1567 movl OPENSSL_ia32cap_P+8(%rip),%eax
1570 movq $mod, %rbp # reassign argument
1583 movdqa %xmm0, (%rsp)
1584 movdqa %xmm0, 16(%rsp)
1585 movdqa %xmm0, 32(%rsp)
1586 movdqa %xmm0, 48(%rsp)
1587 movdqa %xmm0, 64(%rsp)
1588 movdqa %xmm0, 80(%rsp)
1589 movdqa %xmm0, 96(%rsp)
1591 $code.=<<___ if ($addx);
1593 cmpl \$0x80100,%eax # check for MULX and ADO/CX
1597 call __rsaz_512_reduce
1599 $code.=<<___ if ($addx);
1603 movq 128(%rsp), %rdx # pull $n0
1604 call __rsaz_512_reducex
1617 leaq 128+24+48(%rsp), %rax
1619 movq -48(%rax), %r15
1621 movq -40(%rax), %r14
1623 movq -32(%rax), %r13
1625 movq -24(%rax), %r12
1627 movq -16(%rax), %rbp
1632 .cfi_def_cfa_register %rsp
1633 .Lmul_by_one_epilogue:
1636 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1639 { # __rsaz_512_reduce
1641 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1643 # clobbers: everything except %rbp and %rdi
1645 .type __rsaz_512_reduce,\@abi-omnipotent
1649 imulq 128+8(%rsp), %rbx
1652 jmp .Lreduction_loop
1683 movq 128+8(%rsp), %rsi
1724 jne .Lreduction_loop
1727 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1731 # __rsaz_512_reducex
1733 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1735 # clobbers: everything except %rbp and %rdi
1737 .type __rsaz_512_reducex,\@abi-omnipotent
1740 #movq 128+8(%rsp), %rdx # pull $n0
1742 xorq %rsi, %rsi # cf=0,of=0
1744 jmp .Lreduction_loopx
1749 mulx 0(%rbp), %rax, %r8
1753 mulx 8(%rbp), %rax, %r9
1757 mulx 16(%rbp), %rbx, %r10
1761 mulx 24(%rbp), %rbx, %r11
1765 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1771 mulx 128+8(%rsp), %rbx, %rdx
1774 mulx 40(%rbp), %rax, %r13
1778 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1782 mulx 56(%rbp), %rax, %r15
1785 adox %rsi, %r15 # %rsi is 0
1786 adcx %rsi, %r15 # cf=0
1789 jne .Lreduction_loopx
1792 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1795 { # __rsaz_512_subtract
1796 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1798 # clobbers: everything but %rdi, %rsi and %rbp
1800 .type __rsaz_512_subtract,\@abi-omnipotent
1802 __rsaz_512_subtract:
1856 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1861 # input: %rsi - ap, %rbp - bp
1863 # clobbers: everything
1864 my ($ap,$bp) = ("%rsi","%rbp");
1866 .type __rsaz_512_mul,\@abi-omnipotent
2007 .size __rsaz_512_mul,.-__rsaz_512_mul
2013 # input: %rsi - ap, %rbp - bp
2015 # clobbers: everything
2016 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
2018 .type __rsaz_512_mulx,\@abi-omnipotent
2021 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
2024 mulx 8($ap), %rax, %r9
2027 mulx 16($ap), %rbx, %r10
2030 mulx 24($ap), %rax, %r11
2033 mulx 32($ap), %rbx, %r12
2036 mulx 40($ap), %rax, %r13
2039 mulx 48($ap), %rbx, %r14
2042 mulx 56($ap), %rax, %r15
2048 xor $zero, $zero # cf=0,of=0
2054 mulx ($ap), %rax, %r8
2058 mulx 8($ap), %rax, %r9
2062 mulx 16($ap), %rax, %r10
2066 mulx 24($ap), %rax, %r11
2070 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
2074 mulx 40($ap), %rax, %r13
2078 mulx 48($ap), %rax, %r14
2082 mulx 56($ap), %rax, %r15
2083 movq 64($bp,%rcx,8), %rdx
2084 movq %rbx, 8+64-8(%rsp,%rcx,8)
2087 adcx $zero, %r15 # cf=0
2093 mulx ($ap), %rax, %r8
2097 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
2101 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
2105 mulx 24($ap), %rax, %r11
2109 mulx 32($ap), %rax, %r12
2113 mulx 40($ap), %rax, %r13
2117 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
2121 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
2126 mov %rbx, 8+64-8(%rsp)
2128 mov %r9, 8+64+8(%rsp)
2129 mov %r10, 8+64+16(%rsp)
2130 mov %r11, 8+64+24(%rsp)
2131 mov %r12, 8+64+32(%rsp)
2132 mov %r13, 8+64+40(%rsp)
2133 mov %r14, 8+64+48(%rsp)
2134 mov %r15, 8+64+56(%rsp)
2137 .size __rsaz_512_mulx,.-__rsaz_512_mulx
2141 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2143 .globl rsaz_512_scatter4
2144 .type rsaz_512_scatter4,\@abi-omnipotent
2147 leaq ($out,$power,8), $out
2155 leaq 128($out), $out
2159 .size rsaz_512_scatter4,.-rsaz_512_scatter4
2161 .globl rsaz_512_gather4
2162 .type rsaz_512_gather4,\@abi-omnipotent
2166 $code.=<<___ if ($win64);
2167 .LSEH_begin_rsaz_512_gather4:
2168 .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
2169 .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
2170 .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
2171 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
2172 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
2173 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
2174 .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
2175 .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
2176 .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
2177 .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
2178 .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
2182 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
2183 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
2185 pshufd \$0,%xmm8,%xmm8 # broadcast $power
2189 ########################################################################
2190 # calculate mask by comparing 0..15 to $power
2192 for($i=0;$i<4;$i++) {
2194 paddd %xmm`$i`,%xmm`$i+1`
2195 pcmpeqd %xmm8,%xmm`$i`
2196 movdqa %xmm7,%xmm`$i+3`
2201 paddd %xmm`$i`,%xmm`$i+1`
2202 pcmpeqd %xmm8,%xmm`$i`
2211 movdqa 16*0($inp),%xmm8
2212 movdqa 16*1($inp),%xmm9
2213 movdqa 16*2($inp),%xmm10
2214 movdqa 16*3($inp),%xmm11
2216 movdqa 16*4($inp),%xmm12
2218 movdqa 16*5($inp),%xmm13
2220 movdqa 16*6($inp),%xmm14
2222 movdqa 16*7($inp),%xmm15
2223 leaq 128($inp), $inp
2236 pshufd \$0x4e,%xmm8,%xmm9
2243 $code.=<<___ if ($win64);
2244 movaps 0x00(%rsp),%xmm6
2245 movaps 0x10(%rsp),%xmm7
2246 movaps 0x20(%rsp),%xmm8
2247 movaps 0x30(%rsp),%xmm9
2248 movaps 0x40(%rsp),%xmm10
2249 movaps 0x50(%rsp),%xmm11
2250 movaps 0x60(%rsp),%xmm12
2251 movaps 0x70(%rsp),%xmm13
2252 movaps 0x80(%rsp),%xmm14
2253 movaps 0x90(%rsp),%xmm15
2258 .LSEH_end_rsaz_512_gather4:
2259 .size rsaz_512_gather4,.-rsaz_512_gather4
2268 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2269 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2277 .extern __imp_RtlVirtualUnwind
2278 .type se_handler,\@abi-omnipotent
2292 mov 120($context),%rax # pull context->Rax
2293 mov 248($context),%rbx # pull context->Rip
2295 mov 8($disp),%rsi # disp->ImageBase
2296 mov 56($disp),%r11 # disp->HandlerData
2298 mov 0(%r11),%r10d # HandlerData[0]
2299 lea (%rsi,%r10),%r10 # end of prologue label
2300 cmp %r10,%rbx # context->Rip<end of prologue label
2301 jb .Lcommon_seh_tail
2303 mov 152($context),%rax # pull context->Rsp
2305 mov 4(%r11),%r10d # HandlerData[1]
2306 lea (%rsi,%r10),%r10 # epilogue label
2307 cmp %r10,%rbx # context->Rip>=epilogue label
2308 jae .Lcommon_seh_tail
2310 lea 128+24+48(%rax),%rax
2312 lea .Lmul_gather4_epilogue(%rip),%rbx
2314 jne .Lse_not_in_mul_gather4
2318 lea -48-0xa8(%rax),%rsi
2319 lea 512($context),%rdi
2321 .long 0xa548f3fc # cld; rep movsq
2323 .Lse_not_in_mul_gather4:
2330 mov %rbx,144($context) # restore context->Rbx
2331 mov %rbp,160($context) # restore context->Rbp
2332 mov %r12,216($context) # restore context->R12
2333 mov %r13,224($context) # restore context->R13
2334 mov %r14,232($context) # restore context->R14
2335 mov %r15,240($context) # restore context->R15
2340 mov %rax,152($context) # restore context->Rsp
2341 mov %rsi,168($context) # restore context->Rsi
2342 mov %rdi,176($context) # restore context->Rdi
2344 mov 40($disp),%rdi # disp->ContextRecord
2345 mov $context,%rsi # context
2346 mov \$154,%ecx # sizeof(CONTEXT)
2347 .long 0xa548f3fc # cld; rep movsq
2350 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2351 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2352 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2353 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2354 mov 40(%rsi),%r10 # disp->ContextRecord
2355 lea 56(%rsi),%r11 # &disp->HandlerData
2356 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2357 mov %r10,32(%rsp) # arg5
2358 mov %r11,40(%rsp) # arg6
2359 mov %r12,48(%rsp) # arg7
2360 mov %rcx,56(%rsp) # arg8, (NULL)
2361 call *__imp_RtlVirtualUnwind(%rip)
2363 mov \$1,%eax # ExceptionContinueSearch
2375 .size se_handler,.-se_handler
2379 .rva .LSEH_begin_rsaz_512_sqr
2380 .rva .LSEH_end_rsaz_512_sqr
2381 .rva .LSEH_info_rsaz_512_sqr
2383 .rva .LSEH_begin_rsaz_512_mul
2384 .rva .LSEH_end_rsaz_512_mul
2385 .rva .LSEH_info_rsaz_512_mul
2387 .rva .LSEH_begin_rsaz_512_mul_gather4
2388 .rva .LSEH_end_rsaz_512_mul_gather4
2389 .rva .LSEH_info_rsaz_512_mul_gather4
2391 .rva .LSEH_begin_rsaz_512_mul_scatter4
2392 .rva .LSEH_end_rsaz_512_mul_scatter4
2393 .rva .LSEH_info_rsaz_512_mul_scatter4
2395 .rva .LSEH_begin_rsaz_512_mul_by_one
2396 .rva .LSEH_end_rsaz_512_mul_by_one
2397 .rva .LSEH_info_rsaz_512_mul_by_one
2399 .rva .LSEH_begin_rsaz_512_gather4
2400 .rva .LSEH_end_rsaz_512_gather4
2401 .rva .LSEH_info_rsaz_512_gather4
2405 .LSEH_info_rsaz_512_sqr:
2408 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2409 .LSEH_info_rsaz_512_mul:
2412 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2413 .LSEH_info_rsaz_512_mul_gather4:
2416 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2417 .LSEH_info_rsaz_512_mul_scatter4:
2420 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2421 .LSEH_info_rsaz_512_mul_by_one:
2424 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
2425 .LSEH_info_rsaz_512_gather4:
2426 .byte 0x01,0x46,0x16,0x00
2427 .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
2428 .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
2429 .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
2430 .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
2431 .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
2432 .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
2433 .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
2434 .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
2435 .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
2436 .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
2437 .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
# Replace every `...` span in the generated text with the result of
# evaluating its contents as a Perl expression (e.g. frame-size arithmetic).
2441 $code =~ s{\`([^\`]*)\`}{eval $1}gem;