3 ##############################################################################
5 # Copyright (c) 2012, Intel Corporation #
7 # All rights reserved. #
9 # Redistribution and use in source and binary forms, with or without #
10 # modification, are permitted provided that the following conditions are #
13 # * Redistributions of source code must retain the above copyright #
14 # notice, this list of conditions and the following disclaimer. #
16 # * Redistributions in binary form must reproduce the above copyright #
17 # notice, this list of conditions and the following disclaimer in the #
18 # documentation and/or other materials provided with the #
21 # * Neither the name of the Intel Corporation nor the names of its #
22 # contributors may be used to endorse or promote products derived from #
23 # this software without specific prior written permission. #
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY #
27 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
38 ##############################################################################
39 # Developers and authors: #
40 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41 # (1) Intel Corporation, Israel Development Center, Haifa, Israel #
42 # (2) University of Haifa, Israel #
43 ##############################################################################
45 # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
46 # Exponentiation, Using Advanced Vector Instructions Architectures", #
47 # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
# pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
49 # [2] S. Gueron: "Efficient Software Implementations of Modular #
50 # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
51 # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
52 # Proceedings of 9th International Conference on Information Technology: #
53 # New Generations (ITNG 2012), pp.821-823 (2012) #
54 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
55 # resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
56 # on AVX2 capable x86_64 platforms", #
# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest #
58 ##############################################################################
60 # +13% improvement over original submission by <appro@openssl.org>
# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
# 2.3GHz Haswell	621		765/+23%	1113/+79%
65 # (*) if system doesn't support AVX2, for reference purposes;
69 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
71 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
73 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
74 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
75 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
76 die "can't locate x86_64-xlate.pl";
78 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
79 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
80 $avx = ($1>=2.19) + ($1>=2.22);
84 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
85 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
86 $avx = ($1>=2.09) + ($1>=2.10);
90 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
91 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
92 $avx = ($1>=10) + ($1>=11);
96 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
97 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
98 $avx = ($ver>=3.0) + ($ver>=3.01);
102 open OUT,"| \"$^X\" $xlate $flavour $output";
107 my $rp="%rdi"; # BN_ULONG *rp,
108 my $ap="%rsi"; # const BN_ULONG *ap,
109 my $np="%rdx"; # const BN_ULONG *np,
110 my $n0="%ecx"; # const BN_ULONG n0,
111 my $rep="%r8d"; # int repeat);
113 # The registers that hold the accumulated redundant result
# The AMM (Almost Montgomery Multiplication) works on 1024-bit operands
# with a redundant digit size of 29 bits.
# Therefore: ceil(1024/29) = 36 digits, at 4 digits per ymm register,
# 36/4 = 9 registers
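# A quick sanity check of that arithmetic (plain Perl, for
# illustration only):
#
#	use POSIX qw(ceil);
#	my $digits = ceil(1024/29);	# 36 digits of 29 bits
#	my $regs   = $digits/4;		# 4 digits per ymm => 9 registers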
126 # Registers that hold the broadcasted words of bp, currently used
129 # Registers that hold the broadcasted words of Y, currently used
134 my $AND_MASK="%ymm15";
135 # alu registers that hold the first words of the ACC
141 my $i="%r14d"; # loop counter
144 my $FrameSize=32*18+32*8; # place for A^2 and 2*A
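# (32*18 bytes hold the 72-digit redundant square: 2048 bits need
# ceil(2048/29) = 71 digits, rounded up to 18 ymm registers; the
# remaining 32*8 bytes hold the doubled input words that feed the
# off-diagonal products)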
151 $np="%r13"; # reassigned argument
156 .globl rsaz_1024_sqr_avx2
157 .type rsaz_1024_sqr_avx2,\@function,5
159 rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
169 $code.=<<___ if ($win64);
171 vmovaps %xmm6,-0xd8(%rax)
172 vmovaps %xmm7,-0xc8(%rax)
173 vmovaps %xmm8,-0xb8(%rax)
174 vmovaps %xmm9,-0xa8(%rax)
175 vmovaps %xmm10,-0x98(%rax)
176 vmovaps %xmm11,-0x88(%rax)
177 vmovaps %xmm12,-0x78(%rax)
178 vmovaps %xmm13,-0x68(%rax)
179 vmovaps %xmm14,-0x58(%rax)
180 vmovaps %xmm15,-0x48(%rax)
185 mov %rdx, $np # reassigned argument
186 sub \$$FrameSize, %rsp
188 sub \$-128, $rp # size optimization
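# ("sub \$-128" fits in a sign-extended 8-bit immediate, while
# "add \$128" would need a 32-bit one, hence "size optimization")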
192 and \$4095, $tmp # see if $np crosses page
195 vpxor $ACC9,$ACC9,$ACC9
196 jz .Lsqr_1024_no_n_copy
# unaligned 256-bit load that crosses page boundary can
# cause >2x performance degradation here, so if $np does
# cross page boundary, copy it to stack and make sure stack
# frame doesn't cross a page boundary either
203 vmovdqu 32*0-128($np), $ACC0
205 vmovdqu 32*1-128($np), $ACC1
206 vmovdqu 32*2-128($np), $ACC2
207 vmovdqu 32*3-128($np), $ACC3
208 vmovdqu 32*4-128($np), $ACC4
209 vmovdqu 32*5-128($np), $ACC5
210 vmovdqu 32*6-128($np), $ACC6
211 vmovdqu 32*7-128($np), $ACC7
212 vmovdqu 32*8-128($np), $ACC8
213 lea $FrameSize+128(%rsp),$np
214 vmovdqu $ACC0, 32*0-128($np)
215 vmovdqu $ACC1, 32*1-128($np)
216 vmovdqu $ACC2, 32*2-128($np)
217 vmovdqu $ACC3, 32*3-128($np)
218 vmovdqu $ACC4, 32*4-128($np)
219 vmovdqu $ACC5, 32*5-128($np)
220 vmovdqu $ACC6, 32*6-128($np)
221 vmovdqu $ACC7, 32*7-128($np)
222 vmovdqu $ACC8, 32*8-128($np)
223 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
225 .Lsqr_1024_no_n_copy:
228 vmovdqu 32*1-128($ap), $ACC1
229 vmovdqu 32*2-128($ap), $ACC2
230 vmovdqu 32*3-128($ap), $ACC3
231 vmovdqu 32*4-128($ap), $ACC4
232 vmovdqu 32*5-128($ap), $ACC5
233 vmovdqu 32*6-128($ap), $ACC6
234 vmovdqu 32*7-128($ap), $ACC7
235 vmovdqu 32*8-128($ap), $ACC8
237 lea 192(%rsp), $tp0 # 64+128=192
238 vpbroadcastq .Land_mask(%rip), $AND_MASK
239 jmp .LOOP_GRANDE_SQR_1024
242 .LOOP_GRANDE_SQR_1024:
243 lea 32*18+128(%rsp), $aap # size optimization
244 lea 448(%rsp), $tp1 # 64+128+256=448
# the squaring is performed as described in Variant B of
# "Speeding up Big-Number Squaring", so start by calculating
# the double of the input
249 vpaddq $ACC1, $ACC1, $ACC1
250 vpbroadcastq 32*0-128($ap), $B1
251 vpaddq $ACC2, $ACC2, $ACC2
252 vmovdqa $ACC1, 32*0-128($aap)
253 vpaddq $ACC3, $ACC3, $ACC3
254 vmovdqa $ACC2, 32*1-128($aap)
255 vpaddq $ACC4, $ACC4, $ACC4
256 vmovdqa $ACC3, 32*2-128($aap)
257 vpaddq $ACC5, $ACC5, $ACC5
258 vmovdqa $ACC4, 32*3-128($aap)
259 vpaddq $ACC6, $ACC6, $ACC6
260 vmovdqa $ACC5, 32*4-128($aap)
261 vpaddq $ACC7, $ACC7, $ACC7
262 vmovdqa $ACC6, 32*5-128($aap)
263 vpaddq $ACC8, $ACC8, $ACC8
264 vmovdqa $ACC7, 32*6-128($aap)
265 vpxor $ACC9, $ACC9, $ACC9
266 vmovdqa $ACC8, 32*7-128($aap)
268 vpmuludq 32*0-128($ap), $B1, $ACC0
269 vpbroadcastq 32*1-128($ap), $B2
270 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
271 vpmuludq $B1, $ACC1, $ACC1
272 vmovdqu $ACC9, 32*10-448($tp1)
273 vpmuludq $B1, $ACC2, $ACC2
274 vmovdqu $ACC9, 32*11-448($tp1)
275 vpmuludq $B1, $ACC3, $ACC3
276 vmovdqu $ACC9, 32*12-448($tp1)
277 vpmuludq $B1, $ACC4, $ACC4
278 vmovdqu $ACC9, 32*13-448($tp1)
279 vpmuludq $B1, $ACC5, $ACC5
280 vmovdqu $ACC9, 32*14-448($tp1)
281 vpmuludq $B1, $ACC6, $ACC6
282 vmovdqu $ACC9, 32*15-448($tp1)
283 vpmuludq $B1, $ACC7, $ACC7
284 vmovdqu $ACC9, 32*16-448($tp1)
285 vpmuludq $B1, $ACC8, $ACC8
286 vpbroadcastq 32*2-128($ap), $B1
287 vmovdqu $ACC9, 32*17-448($tp1)
298 vpbroadcastq 32*1-128($tpa), $B2
299 vpmuludq 32*0-128($ap), $B1, $ACC0
300 vpaddq 32*0-192($tp0), $ACC0, $ACC0
301 vpmuludq 32*0-128($aap), $B1, $ACC1
302 vpaddq 32*1-192($tp0), $ACC1, $ACC1
303 vpmuludq 32*1-128($aap), $B1, $ACC2
304 vpaddq 32*2-192($tp0), $ACC2, $ACC2
305 vpmuludq 32*2-128($aap), $B1, $ACC3
306 vpaddq 32*3-192($tp0), $ACC3, $ACC3
307 vpmuludq 32*3-128($aap), $B1, $ACC4
308 vpaddq 32*4-192($tp0), $ACC4, $ACC4
309 vpmuludq 32*4-128($aap), $B1, $ACC5
310 vpaddq 32*5-192($tp0), $ACC5, $ACC5
311 vpmuludq 32*5-128($aap), $B1, $ACC6
312 vpaddq 32*6-192($tp0), $ACC6, $ACC6
313 vpmuludq 32*6-128($aap), $B1, $ACC7
314 vpaddq 32*7-192($tp0), $ACC7, $ACC7
315 vpmuludq 32*7-128($aap), $B1, $ACC8
316 vpbroadcastq 32*2-128($tpa), $B1
317 vpaddq 32*8-192($tp0), $ACC8, $ACC8
319 vmovdqu $ACC0, 32*0-192($tp0)
320 vmovdqu $ACC1, 32*1-192($tp0)
322 vpmuludq 32*1-128($ap), $B2, $TEMP0
323 vpaddq $TEMP0, $ACC2, $ACC2
324 vpmuludq 32*1-128($aap), $B2, $TEMP1
325 vpaddq $TEMP1, $ACC3, $ACC3
326 vpmuludq 32*2-128($aap), $B2, $TEMP2
327 vpaddq $TEMP2, $ACC4, $ACC4
328 vpmuludq 32*3-128($aap), $B2, $TEMP0
329 vpaddq $TEMP0, $ACC5, $ACC5
330 vpmuludq 32*4-128($aap), $B2, $TEMP1
331 vpaddq $TEMP1, $ACC6, $ACC6
332 vpmuludq 32*5-128($aap), $B2, $TEMP2
333 vpaddq $TEMP2, $ACC7, $ACC7
334 vpmuludq 32*6-128($aap), $B2, $TEMP0
335 vpaddq $TEMP0, $ACC8, $ACC8
336 vpmuludq 32*7-128($aap), $B2, $ACC0
337 vpbroadcastq 32*3-128($tpa), $B2
338 vpaddq 32*9-192($tp0), $ACC0, $ACC0
340 vmovdqu $ACC2, 32*2-192($tp0)
341 vmovdqu $ACC3, 32*3-192($tp0)
343 vpmuludq 32*2-128($ap), $B1, $TEMP2
344 vpaddq $TEMP2, $ACC4, $ACC4
345 vpmuludq 32*2-128($aap), $B1, $TEMP0
346 vpaddq $TEMP0, $ACC5, $ACC5
347 vpmuludq 32*3-128($aap), $B1, $TEMP1
348 vpaddq $TEMP1, $ACC6, $ACC6
349 vpmuludq 32*4-128($aap), $B1, $TEMP2
350 vpaddq $TEMP2, $ACC7, $ACC7
351 vpmuludq 32*5-128($aap), $B1, $TEMP0
352 vpaddq $TEMP0, $ACC8, $ACC8
353 vpmuludq 32*6-128($aap), $B1, $TEMP1
354 vpaddq $TEMP1, $ACC0, $ACC0
355 vpmuludq 32*7-128($aap), $B1, $ACC1
356 vpbroadcastq 32*4-128($tpa), $B1
357 vpaddq 32*10-448($tp1), $ACC1, $ACC1
359 vmovdqu $ACC4, 32*4-192($tp0)
360 vmovdqu $ACC5, 32*5-192($tp0)
362 vpmuludq 32*3-128($ap), $B2, $TEMP0
363 vpaddq $TEMP0, $ACC6, $ACC6
364 vpmuludq 32*3-128($aap), $B2, $TEMP1
365 vpaddq $TEMP1, $ACC7, $ACC7
366 vpmuludq 32*4-128($aap), $B2, $TEMP2
367 vpaddq $TEMP2, $ACC8, $ACC8
368 vpmuludq 32*5-128($aap), $B2, $TEMP0
369 vpaddq $TEMP0, $ACC0, $ACC0
370 vpmuludq 32*6-128($aap), $B2, $TEMP1
371 vpaddq $TEMP1, $ACC1, $ACC1
372 vpmuludq 32*7-128($aap), $B2, $ACC2
373 vpbroadcastq 32*5-128($tpa), $B2
374 vpaddq 32*11-448($tp1), $ACC2, $ACC2
376 vmovdqu $ACC6, 32*6-192($tp0)
377 vmovdqu $ACC7, 32*7-192($tp0)
379 vpmuludq 32*4-128($ap), $B1, $TEMP0
380 vpaddq $TEMP0, $ACC8, $ACC8
381 vpmuludq 32*4-128($aap), $B1, $TEMP1
382 vpaddq $TEMP1, $ACC0, $ACC0
383 vpmuludq 32*5-128($aap), $B1, $TEMP2
384 vpaddq $TEMP2, $ACC1, $ACC1
385 vpmuludq 32*6-128($aap), $B1, $TEMP0
386 vpaddq $TEMP0, $ACC2, $ACC2
387 vpmuludq 32*7-128($aap), $B1, $ACC3
388 vpbroadcastq 32*6-128($tpa), $B1
389 vpaddq 32*12-448($tp1), $ACC3, $ACC3
391 vmovdqu $ACC8, 32*8-192($tp0)
392 vmovdqu $ACC0, 32*9-192($tp0)
395 vpmuludq 32*5-128($ap), $B2, $TEMP2
396 vpaddq $TEMP2, $ACC1, $ACC1
397 vpmuludq 32*5-128($aap), $B2, $TEMP0
398 vpaddq $TEMP0, $ACC2, $ACC2
399 vpmuludq 32*6-128($aap), $B2, $TEMP1
400 vpaddq $TEMP1, $ACC3, $ACC3
401 vpmuludq 32*7-128($aap), $B2, $ACC4
402 vpbroadcastq 32*7-128($tpa), $B2
403 vpaddq 32*13-448($tp1), $ACC4, $ACC4
405 vmovdqu $ACC1, 32*10-448($tp1)
406 vmovdqu $ACC2, 32*11-448($tp1)
408 vpmuludq 32*6-128($ap), $B1, $TEMP0
409 vpaddq $TEMP0, $ACC3, $ACC3
410 vpmuludq 32*6-128($aap), $B1, $TEMP1
411 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
412 vpaddq $TEMP1, $ACC4, $ACC4
413 vpmuludq 32*7-128($aap), $B1, $ACC5
414 vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
415 vpaddq 32*14-448($tp1), $ACC5, $ACC5
417 vmovdqu $ACC3, 32*12-448($tp1)
418 vmovdqu $ACC4, 32*13-448($tp1)
421 vpmuludq 32*7-128($ap), $B2, $TEMP0
422 vpaddq $TEMP0, $ACC5, $ACC5
423 vpmuludq 32*7-128($aap), $B2, $ACC6
424 vpaddq 32*15-448($tp1), $ACC6, $ACC6
426 vpmuludq 32*8-128($ap), $ACC0, $ACC7
427 vmovdqu $ACC5, 32*14-448($tp1)
428 vpaddq 32*16-448($tp1), $ACC7, $ACC7
429 vmovdqu $ACC6, 32*15-448($tp1)
430 vmovdqu $ACC7, 32*16-448($tp1)
# we need to fix indices 32-39 to avoid overflow
443 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
444 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
445 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
446 lea 192(%rsp), $tp0 # 64+128=192
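# carry propagation across the four 64-bit lanes of a ymm register:
# vpsrlq by 29 extracts the carry of each digit, vpermq with 0x93
# rotates the carries one lane up (the top lane wraps around to the
# bottom), and vpblendd with 3 zeroes the wrapped-around carry in
# the bottom lane so that it is routed into the next vector instead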
448 vpsrlq \$29, $ACC8, $TEMP1
449 vpand $AND_MASK, $ACC8, $ACC8
450 vpsrlq \$29, $ACC1, $TEMP2
451 vpand $AND_MASK, $ACC1, $ACC1
453 vpermq \$0x93, $TEMP1, $TEMP1
454 vpxor $ZERO, $ZERO, $ZERO
455 vpermq \$0x93, $TEMP2, $TEMP2
457 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
458 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
459 vpaddq $TEMP0, $ACC8, $ACC8
460 vpblendd \$3, $TEMP2, $ZERO, $TEMP2
461 vpaddq $TEMP1, $ACC1, $ACC1
462 vpaddq $TEMP2, $ACC2, $ACC2
463 vmovdqu $ACC1, 32*9-192($tp0)
464 vmovdqu $ACC2, 32*10-192($tp0)
470 vmovdqu 32*1(%rsp), $ACC1
471 vmovdqu 32*2-192($tp0), $ACC2
472 vmovdqu 32*3-192($tp0), $ACC3
473 vmovdqu 32*4-192($tp0), $ACC4
474 vmovdqu 32*5-192($tp0), $ACC5
475 vmovdqu 32*6-192($tp0), $ACC6
476 vmovdqu 32*7-192($tp0), $ACC7
and \$0x1fffffff, %eax # mask down to one 29-bit digit
484 imulq -128($np), %rax
485 vpbroadcastq $Y1, $Y1
488 imulq 8-128($np), %rax
492 imulq 16-128($np), %rax
495 imulq 24-128($np), %rdx
500 and \$0x1fffffff, %eax
503 jmp .LOOP_REDUCE_1024
508 vpbroadcastq $Y2, $Y2
510 vpmuludq 32*1-128($np), $Y1, $TEMP0
512 imulq -128($np), %rax
513 vpaddq $TEMP0, $ACC1, $ACC1
515 vpmuludq 32*2-128($np), $Y1, $TEMP1
517 imulq 8-128($np), %rax
518 vpaddq $TEMP1, $ACC2, $ACC2
519 vpmuludq 32*3-128($np), $Y1, $TEMP2
524 imulq 16-128($np), %rax
526 vpaddq $TEMP2, $ACC3, $ACC3
527 vpmuludq 32*4-128($np), $Y1, $TEMP0
530 vpaddq $TEMP0, $ACC4, $ACC4
531 vpmuludq 32*5-128($np), $Y1, $TEMP1
534 vpaddq $TEMP1, $ACC5, $ACC5
535 vpmuludq 32*6-128($np), $Y1, $TEMP2
536 and \$0x1fffffff, %eax
537 vpaddq $TEMP2, $ACC6, $ACC6
538 vpmuludq 32*7-128($np), $Y1, $TEMP0
539 vpaddq $TEMP0, $ACC7, $ACC7
540 vpmuludq 32*8-128($np), $Y1, $TEMP1
542 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
543 vpaddq $TEMP1, $ACC8, $ACC8
544 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
545 vpbroadcastq $Y1, $Y1
547 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
548 vmovdqu 32*3-8-128($np), $TEMP1
550 imulq -128($np), %rax
551 vpaddq $TEMP2, $ACC1, $ACC1
552 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
553 vmovdqu 32*4-8-128($np), $TEMP2
556 imulq 8-128($np), %rax
557 vpaddq $TEMP0, $ACC2, $ACC2
560 vpmuludq $Y2, $TEMP1, $TEMP1
561 vmovdqu 32*5-8-128($np), $TEMP0
563 vpaddq $TEMP1, $ACC3, $ACC3
564 vpmuludq $Y2, $TEMP2, $TEMP2
565 vmovdqu 32*6-8-128($np), $TEMP1
569 vpaddq $TEMP2, $ACC4, $ACC4
570 vpmuludq $Y2, $TEMP0, $TEMP0
571 .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
572 and \$0x1fffffff, %eax
573 vpaddq $TEMP0, $ACC5, $ACC5
574 vpmuludq $Y2, $TEMP1, $TEMP1
575 vmovdqu 32*8-8-128($np), $TEMP0
576 vpaddq $TEMP1, $ACC6, $ACC6
577 vpmuludq $Y2, $TEMP2, $TEMP2
578 vmovdqu 32*9-8-128($np), $ACC9
579 vmovd %eax, $ACC0 # borrow ACC0 for Y2
580 imulq -128($np), %rax
581 vpaddq $TEMP2, $ACC7, $ACC7
582 vpmuludq $Y2, $TEMP0, $TEMP0
583 vmovdqu 32*1-16-128($np), $TEMP1
584 vpbroadcastq $ACC0, $ACC0
585 vpaddq $TEMP0, $ACC8, $ACC8
586 vpmuludq $Y2, $ACC9, $ACC9
587 vmovdqu 32*2-16-128($np), $TEMP2
($ACC0,$Y2)=($Y2,$ACC0); # swap the names at the Perl level; no code emitted
593 vmovdqu 32*1-24-128($np), $ACC0
594 vpmuludq $Y1, $TEMP1, $TEMP1
595 vmovdqu 32*3-16-128($np), $TEMP0
596 vpaddq $TEMP1, $ACC1, $ACC1
597 vpmuludq $Y2, $ACC0, $ACC0
598 vpmuludq $Y1, $TEMP2, $TEMP2
599 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
600 vpaddq $ACC1, $ACC0, $ACC0
601 vpaddq $TEMP2, $ACC2, $ACC2
602 vpmuludq $Y1, $TEMP0, $TEMP0
603 vmovdqu 32*5-16-128($np), $TEMP2
606 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
607 vpaddq $TEMP0, $ACC3, $ACC3
608 vpmuludq $Y1, $TEMP1, $TEMP1
609 vmovdqu 32*6-16-128($np), $TEMP0
610 vpaddq $TEMP1, $ACC4, $ACC4
611 vpmuludq $Y1, $TEMP2, $TEMP2
612 vmovdqu 32*7-16-128($np), $TEMP1
613 vpaddq $TEMP2, $ACC5, $ACC5
614 vpmuludq $Y1, $TEMP0, $TEMP0
615 vmovdqu 32*8-16-128($np), $TEMP2
616 vpaddq $TEMP0, $ACC6, $ACC6
617 vpmuludq $Y1, $TEMP1, $TEMP1
619 vmovdqu 32*9-16-128($np), $TEMP0
621 vpaddq $TEMP1, $ACC7, $ACC7
622 vpmuludq $Y1, $TEMP2, $TEMP2
623 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
626 vpaddq $TEMP2, $ACC8, $ACC8
627 vpmuludq $Y1, $TEMP0, $TEMP0
628 and \$0x1fffffff, %eax
630 vmovdqu 32*3-24-128($np), $TEMP2
632 vpaddq $TEMP0, $ACC9, $ACC9
633 vpbroadcastq $Y1, $Y1
635 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
636 vmovdqu 32*4-24-128($np), $TEMP0
638 imulq -128($np), %rax
640 vpaddq $TEMP1, $ACC2, $ACC1
641 vpmuludq $Y2, $TEMP2, $TEMP2
642 vmovdqu 32*5-24-128($np), $TEMP1
645 imulq 8-128($np), %rax
649 vpaddq $TEMP2, $ACC3, $ACC2
650 vpmuludq $Y2, $TEMP0, $TEMP0
651 vmovdqu 32*6-24-128($np), $TEMP2
654 imulq 16-128($np), %rax
655 vpaddq $TEMP0, $ACC4, $ACC3
656 vpmuludq $Y2, $TEMP1, $TEMP1
657 vmovdqu 32*7-24-128($np), $TEMP0
658 imulq 24-128($np), %rdx # future $r3
661 vpaddq $TEMP1, $ACC5, $ACC4
662 vpmuludq $Y2, $TEMP2, $TEMP2
663 vmovdqu 32*8-24-128($np), $TEMP1
666 vpmuludq $Y2, $TEMP0, $TEMP0
667 vpaddq $TEMP2, $ACC6, $ACC5
668 vmovdqu 32*9-24-128($np), $TEMP2
669 and \$0x1fffffff, %eax
670 vpaddq $TEMP0, $ACC7, $ACC6
671 vpmuludq $Y2, $TEMP1, $TEMP1
673 vpaddq $TEMP1, $ACC8, $ACC7
674 vpmuludq $Y2, $TEMP2, $TEMP2
675 vpaddq $TEMP2, $ACC9, $ACC8
680 jnz .LOOP_REDUCE_1024
($ACC0,$Y2)=($Y2,$ACC0); # swap the names back after the loop
684 lea 448(%rsp), $tp1 # size optimization
685 vpaddq $ACC9, $Y2, $ACC0
686 vpxor $ZERO, $ZERO, $ZERO
688 vpaddq 32*9-192($tp0), $ACC0, $ACC0
689 vpaddq 32*10-448($tp1), $ACC1, $ACC1
690 vpaddq 32*11-448($tp1), $ACC2, $ACC2
691 vpaddq 32*12-448($tp1), $ACC3, $ACC3
692 vpaddq 32*13-448($tp1), $ACC4, $ACC4
693 vpaddq 32*14-448($tp1), $ACC5, $ACC5
694 vpaddq 32*15-448($tp1), $ACC6, $ACC6
695 vpaddq 32*16-448($tp1), $ACC7, $ACC7
696 vpaddq 32*17-448($tp1), $ACC8, $ACC8
698 vpsrlq \$29, $ACC0, $TEMP1
699 vpand $AND_MASK, $ACC0, $ACC0
700 vpsrlq \$29, $ACC1, $TEMP2
701 vpand $AND_MASK, $ACC1, $ACC1
702 vpsrlq \$29, $ACC2, $TEMP3
703 vpermq \$0x93, $TEMP1, $TEMP1
704 vpand $AND_MASK, $ACC2, $ACC2
705 vpsrlq \$29, $ACC3, $TEMP4
706 vpermq \$0x93, $TEMP2, $TEMP2
707 vpand $AND_MASK, $ACC3, $ACC3
708 vpermq \$0x93, $TEMP3, $TEMP3
710 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
711 vpermq \$0x93, $TEMP4, $TEMP4
712 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
713 vpaddq $TEMP0, $ACC0, $ACC0
714 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
715 vpaddq $TEMP1, $ACC1, $ACC1
716 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
717 vpaddq $TEMP2, $ACC2, $ACC2
718 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
719 vpaddq $TEMP3, $ACC3, $ACC3
720 vpaddq $TEMP4, $ACC4, $ACC4
722 vpsrlq \$29, $ACC0, $TEMP1
723 vpand $AND_MASK, $ACC0, $ACC0
724 vpsrlq \$29, $ACC1, $TEMP2
725 vpand $AND_MASK, $ACC1, $ACC1
726 vpsrlq \$29, $ACC2, $TEMP3
727 vpermq \$0x93, $TEMP1, $TEMP1
728 vpand $AND_MASK, $ACC2, $ACC2
729 vpsrlq \$29, $ACC3, $TEMP4
730 vpermq \$0x93, $TEMP2, $TEMP2
731 vpand $AND_MASK, $ACC3, $ACC3
732 vpermq \$0x93, $TEMP3, $TEMP3
734 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
735 vpermq \$0x93, $TEMP4, $TEMP4
736 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
737 vpaddq $TEMP0, $ACC0, $ACC0
738 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
739 vpaddq $TEMP1, $ACC1, $ACC1
740 vmovdqu $ACC0, 32*0-128($rp)
741 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
742 vpaddq $TEMP2, $ACC2, $ACC2
743 vmovdqu $ACC1, 32*1-128($rp)
744 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
745 vpaddq $TEMP3, $ACC3, $ACC3
746 vmovdqu $ACC2, 32*2-128($rp)
747 vpaddq $TEMP4, $ACC4, $ACC4
748 vmovdqu $ACC3, 32*3-128($rp)
752 vpsrlq \$29, $ACC4, $TEMP1
753 vpand $AND_MASK, $ACC4, $ACC4
754 vpsrlq \$29, $ACC5, $TEMP2
755 vpand $AND_MASK, $ACC5, $ACC5
756 vpsrlq \$29, $ACC6, $TEMP3
757 vpermq \$0x93, $TEMP1, $TEMP1
758 vpand $AND_MASK, $ACC6, $ACC6
759 vpsrlq \$29, $ACC7, $TEMP4
760 vpermq \$0x93, $TEMP2, $TEMP2
761 vpand $AND_MASK, $ACC7, $ACC7
762 vpsrlq \$29, $ACC8, $TEMP5
763 vpermq \$0x93, $TEMP3, $TEMP3
764 vpand $AND_MASK, $ACC8, $ACC8
765 vpermq \$0x93, $TEMP4, $TEMP4
767 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
768 vpermq \$0x93, $TEMP5, $TEMP5
769 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
770 vpaddq $TEMP0, $ACC4, $ACC4
771 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
772 vpaddq $TEMP1, $ACC5, $ACC5
773 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
774 vpaddq $TEMP2, $ACC6, $ACC6
775 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
776 vpaddq $TEMP3, $ACC7, $ACC7
777 vpaddq $TEMP4, $ACC8, $ACC8
779 vpsrlq \$29, $ACC4, $TEMP1
780 vpand $AND_MASK, $ACC4, $ACC4
781 vpsrlq \$29, $ACC5, $TEMP2
782 vpand $AND_MASK, $ACC5, $ACC5
783 vpsrlq \$29, $ACC6, $TEMP3
784 vpermq \$0x93, $TEMP1, $TEMP1
785 vpand $AND_MASK, $ACC6, $ACC6
786 vpsrlq \$29, $ACC7, $TEMP4
787 vpermq \$0x93, $TEMP2, $TEMP2
788 vpand $AND_MASK, $ACC7, $ACC7
789 vpsrlq \$29, $ACC8, $TEMP5
790 vpermq \$0x93, $TEMP3, $TEMP3
791 vpand $AND_MASK, $ACC8, $ACC8
792 vpermq \$0x93, $TEMP4, $TEMP4
794 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
795 vpermq \$0x93, $TEMP5, $TEMP5
796 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
797 vpaddq $TEMP0, $ACC4, $ACC4
798 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
799 vpaddq $TEMP1, $ACC5, $ACC5
800 vmovdqu $ACC4, 32*4-128($rp)
801 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
802 vpaddq $TEMP2, $ACC6, $ACC6
803 vmovdqu $ACC5, 32*5-128($rp)
804 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
805 vpaddq $TEMP3, $ACC7, $ACC7
806 vmovdqu $ACC6, 32*6-128($rp)
807 vpaddq $TEMP4, $ACC8, $ACC8
808 vmovdqu $ACC7, 32*7-128($rp)
809 vmovdqu $ACC8, 32*8-128($rp)
813 jne .LOOP_GRANDE_SQR_1024
818 $code.=<<___ if ($win64);
819 movaps -0xd8(%rax),%xmm6
820 movaps -0xc8(%rax),%xmm7
821 movaps -0xb8(%rax),%xmm8
822 movaps -0xa8(%rax),%xmm9
823 movaps -0x98(%rax),%xmm10
824 movaps -0x88(%rax),%xmm11
825 movaps -0x78(%rax),%xmm12
826 movaps -0x68(%rax),%xmm13
827 movaps -0x58(%rax),%xmm14
828 movaps -0x48(%rax),%xmm15
837 lea (%rax),%rsp # restore %rsp
840 .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
845 my $rp="%rdi"; # BN_ULONG *rp,
846 my $ap="%rsi"; # const BN_ULONG *ap,
847 my $bp="%rdx"; # const BN_ULONG *bp,
848 my $np="%rcx"; # const BN_ULONG *np,
849 my $n0="%r8d"; # unsigned int n0);
851 # The registers that hold the accumulated redundant result
# The AMM (Almost Montgomery Multiplication) works on 1024-bit operands
# with a redundant digit size of 29 bits.
# Therefore: ceil(1024/29) = 36 digits, at 4 digits per ymm register,
# 36/4 = 9 registers
865 # Registers that hold the broadcasted words of multiplier, currently used
874 my $AND_MASK="%ymm15";
876 # alu registers that hold the first words of the ACC
885 $bp="%r13"; # reassigned argument
888 .globl rsaz_1024_mul_avx2
889 .type rsaz_1024_mul_avx2,\@function,5
900 $code.=<<___ if ($win64);
903 vmovaps %xmm6,-0xd8(%rax)
904 vmovaps %xmm7,-0xc8(%rax)
905 vmovaps %xmm8,-0xb8(%rax)
906 vmovaps %xmm9,-0xa8(%rax)
907 vmovaps %xmm10,-0x98(%rax)
908 vmovaps %xmm11,-0x88(%rax)
909 vmovaps %xmm12,-0x78(%rax)
910 vmovaps %xmm13,-0x68(%rax)
911 vmovaps %xmm14,-0x58(%rax)
912 vmovaps %xmm15,-0x48(%rax)
918 mov %rdx, $bp # reassigned argument
921 # unaligned 256-bit load that crosses page boundary can
922 # cause severe performance degradation here, so if $ap does
923 # cross page boundary, swap it with $bp [meaning that caller
924 # is advised to lay down $ap and $bp next to each other, so
925 # that only one can cross page boundary].
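# (the swap is safe because multiplication is commutative and bp is
# read only 8 bytes at a time through vpbroadcastq, whose naturally
# aligned loads never split a page)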
936 sub \$-128,$ap # size optimization
940 and \$4095, $tmp # see if $np crosses page
944 jz .Lmul_1024_no_n_copy
# unaligned 256-bit load that crosses page boundary can
# cause severe performance degradation here, so if $np does
# cross page boundary, copy it to stack and make sure stack
# frame doesn't cross a page boundary either
951 vmovdqu 32*0-128($np), $ACC0
953 vmovdqu 32*1-128($np), $ACC1
954 vmovdqu 32*2-128($np), $ACC2
955 vmovdqu 32*3-128($np), $ACC3
956 vmovdqu 32*4-128($np), $ACC4
957 vmovdqu 32*5-128($np), $ACC5
958 vmovdqu 32*6-128($np), $ACC6
959 vmovdqu 32*7-128($np), $ACC7
960 vmovdqu 32*8-128($np), $ACC8
962 vmovdqu $ACC0, 32*0-128($np)
963 vpxor $ACC0, $ACC0, $ACC0
964 vmovdqu $ACC1, 32*1-128($np)
965 vpxor $ACC1, $ACC1, $ACC1
966 vmovdqu $ACC2, 32*2-128($np)
967 vpxor $ACC2, $ACC2, $ACC2
968 vmovdqu $ACC3, 32*3-128($np)
969 vpxor $ACC3, $ACC3, $ACC3
970 vmovdqu $ACC4, 32*4-128($np)
971 vpxor $ACC4, $ACC4, $ACC4
972 vmovdqu $ACC5, 32*5-128($np)
973 vpxor $ACC5, $ACC5, $ACC5
974 vmovdqu $ACC6, 32*6-128($np)
975 vpxor $ACC6, $ACC6, $ACC6
976 vmovdqu $ACC7, 32*7-128($np)
977 vpxor $ACC7, $ACC7, $ACC7
978 vmovdqu $ACC8, 32*8-128($np)
980 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
981 .Lmul_1024_no_n_copy:
985 vpbroadcastq ($bp), $Bi
986 vmovdqu $ACC0, (%rsp) # clear top of stack
993 vmovdqu .Land_mask(%rip), $AND_MASK
995 vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
1000 vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
1002 imulq -128($ap), %rax
1005 imulq 8-128($ap), $r1
1010 and \$0x1fffffff, %eax
1013 imulq 16-128($ap), $r2
1017 imulq 24-128($ap), $r3
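# (the interleaved scalar imulq chain above tracks the low words of
# the accumulator in order to derive the next Montgomery multiplier,
# overlapping scalar and vector execution)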
1019 vpmuludq 32*1-128($ap),$Bi,$TEMP0
1021 vpaddq $TEMP0,$ACC1,$ACC1
1022 vpmuludq 32*2-128($ap),$Bi,$TEMP1
1023 vpbroadcastq $Yi, $Yi
1024 vpaddq $TEMP1,$ACC2,$ACC2
1025 vpmuludq 32*3-128($ap),$Bi,$TEMP2
1026 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
1027 vpaddq $TEMP2,$ACC3,$ACC3
1028 vpmuludq 32*4-128($ap),$Bi,$TEMP0
1029 vpaddq $TEMP0,$ACC4,$ACC4
1030 vpmuludq 32*5-128($ap),$Bi,$TEMP1
1031 vpaddq $TEMP1,$ACC5,$ACC5
1032 vpmuludq 32*6-128($ap),$Bi,$TEMP2
1033 vpaddq $TEMP2,$ACC6,$ACC6
1034 vpmuludq 32*7-128($ap),$Bi,$TEMP0
1035 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
1036 vpaddq $TEMP0,$ACC7,$ACC7
1037 vpmuludq 32*8-128($ap),$Bi,$TEMP1
1038 vpbroadcastq 8($bp), $Bi
1039 vpaddq $TEMP1,$ACC8,$ACC8
1042 imulq -128($np),%rax
1045 imulq 8-128($np),%rax
1048 imulq 16-128($np),%rax
1051 imulq 24-128($np),%rdx
1055 vpmuludq 32*1-128($np),$Yi,$TEMP2
1057 vpaddq $TEMP2,$ACC1,$ACC1
1058 vpmuludq 32*2-128($np),$Yi,$TEMP0
1059 vpaddq $TEMP0,$ACC2,$ACC2
1060 vpmuludq 32*3-128($np),$Yi,$TEMP1
1061 vpaddq $TEMP1,$ACC3,$ACC3
1062 vpmuludq 32*4-128($np),$Yi,$TEMP2
1063 vpaddq $TEMP2,$ACC4,$ACC4
1064 vpmuludq 32*5-128($np),$Yi,$TEMP0
1065 vpaddq $TEMP0,$ACC5,$ACC5
1066 vpmuludq 32*6-128($np),$Yi,$TEMP1
1067 vpaddq $TEMP1,$ACC6,$ACC6
1068 vpmuludq 32*7-128($np),$Yi,$TEMP2
1069 vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
1070 vpaddq $TEMP2,$ACC7,$ACC7
1071 vpmuludq 32*8-128($np),$Yi,$TEMP0
1072 vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
1073 vpaddq $TEMP0,$ACC8,$ACC8
1076 imulq -128($ap),%rax
1078 vmovdqu -8+32*1-128($ap),$TEMP1
1080 imulq 8-128($ap),%rax
1082 vmovdqu -8+32*2-128($ap),$TEMP2
1086 and \$0x1fffffff, %eax
1088 imulq 16-128($ap),%rbx
1090 vpmuludq $Bi,$TEMP1,$TEMP1
1092 vmovdqu -8+32*3-128($ap),$TEMP0
1093 vpaddq $TEMP1,$ACC1,$ACC1
1094 vpmuludq $Bi,$TEMP2,$TEMP2
1095 vpbroadcastq $Yi, $Yi
1096 vmovdqu -8+32*4-128($ap),$TEMP1
1097 vpaddq $TEMP2,$ACC2,$ACC2
1098 vpmuludq $Bi,$TEMP0,$TEMP0
1099 vmovdqu -8+32*5-128($ap),$TEMP2
1100 vpaddq $TEMP0,$ACC3,$ACC3
1101 vpmuludq $Bi,$TEMP1,$TEMP1
1102 vmovdqu -8+32*6-128($ap),$TEMP0
1103 vpaddq $TEMP1,$ACC4,$ACC4
1104 vpmuludq $Bi,$TEMP2,$TEMP2
1105 vmovdqu -8+32*7-128($ap),$TEMP1
1106 vpaddq $TEMP2,$ACC5,$ACC5
1107 vpmuludq $Bi,$TEMP0,$TEMP0
1108 vmovdqu -8+32*8-128($ap),$TEMP2
1109 vpaddq $TEMP0,$ACC6,$ACC6
1110 vpmuludq $Bi,$TEMP1,$TEMP1
1111 vmovdqu -8+32*9-128($ap),$ACC9
1112 vpaddq $TEMP1,$ACC7,$ACC7
1113 vpmuludq $Bi,$TEMP2,$TEMP2
1114 vpaddq $TEMP2,$ACC8,$ACC8
1115 vpmuludq $Bi,$ACC9,$ACC9
1116 vpbroadcastq 16($bp), $Bi
1119 imulq -128($np),%rax
1121 vmovdqu -8+32*1-128($np),$TEMP0
1123 imulq 8-128($np),%rax
1125 vmovdqu -8+32*2-128($np),$TEMP1
1127 imulq 16-128($np),%rdx
1131 vpmuludq $Yi,$TEMP0,$TEMP0
1133 vmovdqu -8+32*3-128($np),$TEMP2
1134 vpaddq $TEMP0,$ACC1,$ACC1
1135 vpmuludq $Yi,$TEMP1,$TEMP1
1136 vmovdqu -8+32*4-128($np),$TEMP0
1137 vpaddq $TEMP1,$ACC2,$ACC2
1138 vpmuludq $Yi,$TEMP2,$TEMP2
1139 vmovdqu -8+32*5-128($np),$TEMP1
1140 vpaddq $TEMP2,$ACC3,$ACC3
1141 vpmuludq $Yi,$TEMP0,$TEMP0
1142 vmovdqu -8+32*6-128($np),$TEMP2
1143 vpaddq $TEMP0,$ACC4,$ACC4
1144 vpmuludq $Yi,$TEMP1,$TEMP1
1145 vmovdqu -8+32*7-128($np),$TEMP0
1146 vpaddq $TEMP1,$ACC5,$ACC5
1147 vpmuludq $Yi,$TEMP2,$TEMP2
1148 vmovdqu -8+32*8-128($np),$TEMP1
1149 vpaddq $TEMP2,$ACC6,$ACC6
1150 vpmuludq $Yi,$TEMP0,$TEMP0
1151 vmovdqu -8+32*9-128($np),$TEMP2
1152 vpaddq $TEMP0,$ACC7,$ACC7
1153 vpmuludq $Yi,$TEMP1,$TEMP1
1154 vpaddq $TEMP1,$ACC8,$ACC8
1155 vpmuludq $Yi,$TEMP2,$TEMP2
1156 vpaddq $TEMP2,$ACC9,$ACC9
1158 vmovdqu -16+32*1-128($ap),$TEMP0
1160 imulq -128($ap),%rax
1163 vmovdqu -16+32*2-128($ap),$TEMP1
1166 and \$0x1fffffff, %eax
1168 imulq 8-128($ap),%rbx
1170 vpmuludq $Bi,$TEMP0,$TEMP0
1172 vmovdqu -16+32*3-128($ap),$TEMP2
1173 vpaddq $TEMP0,$ACC1,$ACC1
1174 vpmuludq $Bi,$TEMP1,$TEMP1
1175 vpbroadcastq $Yi, $Yi
1176 vmovdqu -16+32*4-128($ap),$TEMP0
1177 vpaddq $TEMP1,$ACC2,$ACC2
1178 vpmuludq $Bi,$TEMP2,$TEMP2
1179 vmovdqu -16+32*5-128($ap),$TEMP1
1180 vpaddq $TEMP2,$ACC3,$ACC3
1181 vpmuludq $Bi,$TEMP0,$TEMP0
1182 vmovdqu -16+32*6-128($ap),$TEMP2
1183 vpaddq $TEMP0,$ACC4,$ACC4
1184 vpmuludq $Bi,$TEMP1,$TEMP1
1185 vmovdqu -16+32*7-128($ap),$TEMP0
1186 vpaddq $TEMP1,$ACC5,$ACC5
1187 vpmuludq $Bi,$TEMP2,$TEMP2
1188 vmovdqu -16+32*8-128($ap),$TEMP1
1189 vpaddq $TEMP2,$ACC6,$ACC6
1190 vpmuludq $Bi,$TEMP0,$TEMP0
1191 vmovdqu -16+32*9-128($ap),$TEMP2
1192 vpaddq $TEMP0,$ACC7,$ACC7
1193 vpmuludq $Bi,$TEMP1,$TEMP1
1194 vpaddq $TEMP1,$ACC8,$ACC8
1195 vpmuludq $Bi,$TEMP2,$TEMP2
1196 vpbroadcastq 24($bp), $Bi
1197 vpaddq $TEMP2,$ACC9,$ACC9
1199 vmovdqu -16+32*1-128($np),$TEMP0
1201 imulq -128($np),%rax
1203 vmovdqu -16+32*2-128($np),$TEMP1
1204 imulq 8-128($np),%rdx
1208 vpmuludq $Yi,$TEMP0,$TEMP0
1210 vmovdqu -16+32*3-128($np),$TEMP2
1211 vpaddq $TEMP0,$ACC1,$ACC1
1212 vpmuludq $Yi,$TEMP1,$TEMP1
1213 vmovdqu -16+32*4-128($np),$TEMP0
1214 vpaddq $TEMP1,$ACC2,$ACC2
1215 vpmuludq $Yi,$TEMP2,$TEMP2
1216 vmovdqu -16+32*5-128($np),$TEMP1
1217 vpaddq $TEMP2,$ACC3,$ACC3
1218 vpmuludq $Yi,$TEMP0,$TEMP0
1219 vmovdqu -16+32*6-128($np),$TEMP2
1220 vpaddq $TEMP0,$ACC4,$ACC4
1221 vpmuludq $Yi,$TEMP1,$TEMP1
1222 vmovdqu -16+32*7-128($np),$TEMP0
1223 vpaddq $TEMP1,$ACC5,$ACC5
1224 vpmuludq $Yi,$TEMP2,$TEMP2
1225 vmovdqu -16+32*8-128($np),$TEMP1
1226 vpaddq $TEMP2,$ACC6,$ACC6
1227 vpmuludq $Yi,$TEMP0,$TEMP0
1228 vmovdqu -16+32*9-128($np),$TEMP2
1229 vpaddq $TEMP0,$ACC7,$ACC7
1230 vpmuludq $Yi,$TEMP1,$TEMP1
1231 vmovdqu -24+32*1-128($ap),$TEMP0
1232 vpaddq $TEMP1,$ACC8,$ACC8
1233 vpmuludq $Yi,$TEMP2,$TEMP2
1234 vmovdqu -24+32*2-128($ap),$TEMP1
1235 vpaddq $TEMP2,$ACC9,$ACC9
1238 imulq -128($ap),%rbx
1243 and \$0x1fffffff, %eax
1245 vpmuludq $Bi,$TEMP0,$TEMP0
1247 vmovdqu -24+32*3-128($ap),$TEMP2
1248 vpaddq $TEMP0,$ACC1,$ACC1
1249 vpmuludq $Bi,$TEMP1,$TEMP1
1250 vpbroadcastq $Yi, $Yi
1251 vmovdqu -24+32*4-128($ap),$TEMP0
1252 vpaddq $TEMP1,$ACC2,$ACC2
1253 vpmuludq $Bi,$TEMP2,$TEMP2
1254 vmovdqu -24+32*5-128($ap),$TEMP1
1255 vpaddq $TEMP2,$ACC3,$ACC3
1256 vpmuludq $Bi,$TEMP0,$TEMP0
1257 vmovdqu -24+32*6-128($ap),$TEMP2
1258 vpaddq $TEMP0,$ACC4,$ACC4
1259 vpmuludq $Bi,$TEMP1,$TEMP1
1260 vmovdqu -24+32*7-128($ap),$TEMP0
1261 vpaddq $TEMP1,$ACC5,$ACC5
1262 vpmuludq $Bi,$TEMP2,$TEMP2
1263 vmovdqu -24+32*8-128($ap),$TEMP1
1264 vpaddq $TEMP2,$ACC6,$ACC6
1265 vpmuludq $Bi,$TEMP0,$TEMP0
1266 vmovdqu -24+32*9-128($ap),$TEMP2
1267 vpaddq $TEMP0,$ACC7,$ACC7
1268 vpmuludq $Bi,$TEMP1,$TEMP1
1269 vpaddq $TEMP1,$ACC8,$ACC8
1270 vpmuludq $Bi,$TEMP2,$TEMP2
1271 vpbroadcastq 32($bp), $Bi
1272 vpaddq $TEMP2,$ACC9,$ACC9
1273 add \$32, $bp # $bp++
1275 vmovdqu -24+32*1-128($np),$TEMP0
1276 imulq -128($np),%rax
1280 vmovdqu -24+32*2-128($np),$TEMP1
1281 vpmuludq $Yi,$TEMP0,$TEMP0
1283 vmovdqu -24+32*3-128($np),$TEMP2
1284 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
1285 vpmuludq $Yi,$TEMP1,$TEMP1
1286 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
1287 vpaddq $TEMP1,$ACC2,$ACC1
1288 vmovdqu -24+32*4-128($np),$TEMP0
1289 vpmuludq $Yi,$TEMP2,$TEMP2
1290 vmovdqu -24+32*5-128($np),$TEMP1
1291 vpaddq $TEMP2,$ACC3,$ACC2
1292 vpmuludq $Yi,$TEMP0,$TEMP0
1293 vmovdqu -24+32*6-128($np),$TEMP2
1294 vpaddq $TEMP0,$ACC4,$ACC3
1295 vpmuludq $Yi,$TEMP1,$TEMP1
1296 vmovdqu -24+32*7-128($np),$TEMP0
1297 vpaddq $TEMP1,$ACC5,$ACC4
1298 vpmuludq $Yi,$TEMP2,$TEMP2
1299 vmovdqu -24+32*8-128($np),$TEMP1
1300 vpaddq $TEMP2,$ACC6,$ACC5
1301 vpmuludq $Yi,$TEMP0,$TEMP0
1302 vmovdqu -24+32*9-128($np),$TEMP2
1304 vpaddq $TEMP0,$ACC7,$ACC6
1305 vpmuludq $Yi,$TEMP1,$TEMP1
1307 vpaddq $TEMP1,$ACC8,$ACC7
1308 vpmuludq $Yi,$TEMP2,$TEMP2
1310 vpaddq $TEMP2,$ACC9,$ACC8
1316 # (*) Original implementation was correcting ACC1-ACC3 for overflow
1317 # after 7 loop runs, or after 28 iterations, or 56 additions.
# But as we underutilize resources, it's possible to correct in
# each iteration with marginal performance loss. And since we do
# it in every iteration, we need to correct fewer digits, which
# avoids the performance penalty altogether. Also note that we
# correct only three digits out of four; this works because the
# most significant digit is subject to fewer additions.
vpermq \$0, $AND_MASK, $AND_MASK # broadcast lane 0: make the 29-bit mask uniform in all lanes
1330 vpaddq (%rsp), $TEMP1, $ACC0
1332 vpsrlq \$29, $ACC0, $TEMP1
1333 vpand $AND_MASK, $ACC0, $ACC0
1334 vpsrlq \$29, $ACC1, $TEMP2
1335 vpand $AND_MASK, $ACC1, $ACC1
1336 vpsrlq \$29, $ACC2, $TEMP3
1337 vpermq \$0x93, $TEMP1, $TEMP1
1338 vpand $AND_MASK, $ACC2, $ACC2
1339 vpsrlq \$29, $ACC3, $TEMP4
1340 vpermq \$0x93, $TEMP2, $TEMP2
1341 vpand $AND_MASK, $ACC3, $ACC3
1343 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1344 vpermq \$0x93, $TEMP3, $TEMP3
1345 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1346 vpermq \$0x93, $TEMP4, $TEMP4
1347 vpaddq $TEMP0, $ACC0, $ACC0
1348 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1349 vpaddq $TEMP1, $ACC1, $ACC1
1350 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1351 vpaddq $TEMP2, $ACC2, $ACC2
1352 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1353 vpaddq $TEMP3, $ACC3, $ACC3
1354 vpaddq $TEMP4, $ACC4, $ACC4
1356 vpsrlq \$29, $ACC0, $TEMP1
1357 vpand $AND_MASK, $ACC0, $ACC0
1358 vpsrlq \$29, $ACC1, $TEMP2
1359 vpand $AND_MASK, $ACC1, $ACC1
1360 vpsrlq \$29, $ACC2, $TEMP3
1361 vpermq \$0x93, $TEMP1, $TEMP1
1362 vpand $AND_MASK, $ACC2, $ACC2
1363 vpsrlq \$29, $ACC3, $TEMP4
1364 vpermq \$0x93, $TEMP2, $TEMP2
1365 vpand $AND_MASK, $ACC3, $ACC3
1366 vpermq \$0x93, $TEMP3, $TEMP3
1368 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1369 vpermq \$0x93, $TEMP4, $TEMP4
1370 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1371 vpaddq $TEMP0, $ACC0, $ACC0
1372 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1373 vpaddq $TEMP1, $ACC1, $ACC1
1374 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1375 vpaddq $TEMP2, $ACC2, $ACC2
1376 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1377 vpaddq $TEMP3, $ACC3, $ACC3
1378 vpaddq $TEMP4, $ACC4, $ACC4
1380 vmovdqu $ACC0, 0-128($rp)
1381 vmovdqu $ACC1, 32-128($rp)
1382 vmovdqu $ACC2, 64-128($rp)
1383 vmovdqu $ACC3, 96-128($rp)
1388 vpsrlq \$29, $ACC4, $TEMP1
1389 vpand $AND_MASK, $ACC4, $ACC4
1390 vpsrlq \$29, $ACC5, $TEMP2
1391 vpand $AND_MASK, $ACC5, $ACC5
1392 vpsrlq \$29, $ACC6, $TEMP3
1393 vpermq \$0x93, $TEMP1, $TEMP1
1394 vpand $AND_MASK, $ACC6, $ACC6
1395 vpsrlq \$29, $ACC7, $TEMP4
1396 vpermq \$0x93, $TEMP2, $TEMP2
1397 vpand $AND_MASK, $ACC7, $ACC7
1398 vpsrlq \$29, $ACC8, $TEMP5
1399 vpermq \$0x93, $TEMP3, $TEMP3
1400 vpand $AND_MASK, $ACC8, $ACC8
1401 vpermq \$0x93, $TEMP4, $TEMP4
1403 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1404 vpermq \$0x93, $TEMP5, $TEMP5
1405 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1406 vpaddq $TEMP0, $ACC4, $ACC4
1407 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1408 vpaddq $TEMP1, $ACC5, $ACC5
1409 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1410 vpaddq $TEMP2, $ACC6, $ACC6
1411 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1412 vpaddq $TEMP3, $ACC7, $ACC7
1413 vpaddq $TEMP4, $ACC8, $ACC8
1415 vpsrlq \$29, $ACC4, $TEMP1
1416 vpand $AND_MASK, $ACC4, $ACC4
1417 vpsrlq \$29, $ACC5, $TEMP2
1418 vpand $AND_MASK, $ACC5, $ACC5
1419 vpsrlq \$29, $ACC6, $TEMP3
1420 vpermq \$0x93, $TEMP1, $TEMP1
1421 vpand $AND_MASK, $ACC6, $ACC6
1422 vpsrlq \$29, $ACC7, $TEMP4
1423 vpermq \$0x93, $TEMP2, $TEMP2
1424 vpand $AND_MASK, $ACC7, $ACC7
1425 vpsrlq \$29, $ACC8, $TEMP5
1426 vpermq \$0x93, $TEMP3, $TEMP3
1427 vpand $AND_MASK, $ACC8, $ACC8
1428 vpermq \$0x93, $TEMP4, $TEMP4
1430 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1431 vpermq \$0x93, $TEMP5, $TEMP5
1432 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1433 vpaddq $TEMP0, $ACC4, $ACC4
1434 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1435 vpaddq $TEMP1, $ACC5, $ACC5
1436 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1437 vpaddq $TEMP2, $ACC6, $ACC6
1438 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1439 vpaddq $TEMP3, $ACC7, $ACC7
1440 vpaddq $TEMP4, $ACC8, $ACC8
1442 vmovdqu $ACC4, 128-128($rp)
1443 vmovdqu $ACC5, 160-128($rp)
1444 vmovdqu $ACC6, 192-128($rp)
1445 vmovdqu $ACC7, 224-128($rp)
1446 vmovdqu $ACC8, 256-128($rp)
1451 $code.=<<___ if ($win64);
1452 movaps -0xd8(%rax),%xmm6
1453 movaps -0xc8(%rax),%xmm7
1454 movaps -0xb8(%rax),%xmm8
1455 movaps -0xa8(%rax),%xmm9
1456 movaps -0x98(%rax),%xmm10
1457 movaps -0x88(%rax),%xmm11
1458 movaps -0x78(%rax),%xmm12
1459 movaps -0x68(%rax),%xmm13
1460 movaps -0x58(%rax),%xmm14
1461 movaps -0x48(%rax),%xmm15
1470 lea (%rax),%rsp # restore %rsp
1471 .Lmul_1024_epilogue:
1473 .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
1477 my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
1478 my @T = map("%r$_",(8..11));
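# Conversion between the two representations: normal form is 16 words
# of 64 bits, redundant form is 36 digits of 29 bits, one digit per
# 64-bit lane.  Digit j covers bits 29*j .. 29*j+28 of the 1024-bit
# value, i.e.
#
#	digit(j) = (value >> (29*j)) & (2^29 - 1)
#
# norm2red extracts the digits with shifts; red2norm reverses this by
# shifting every digit back to its bit position and adding them up,
# rippling the carry into the next word.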
1481 .globl rsaz_1024_red2norm_avx2
1482 .type rsaz_1024_red2norm_avx2,\@abi-omnipotent
1484 rsaz_1024_red2norm_avx2:
1485 sub \$-128,$inp # size optimization
1489 for ($j=0,$i=0; $i<16; $i++) {
1491 while (29*$j<64*($i+1)) { # load data till boundary
1492 $code.=" mov `8*$j-128`($inp), @T[0]\n";
1493 $j++; $k++; push(@T,shift(@T));
1496 while ($k>1) { # shift loaded data but last value
1497 $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
1500 $code.=<<___; # shift last value
1502 shl \$`29*($j-1)`, @T[-1]
1503 shr \$`-29*($j-1)`, @T[0]
1505 while ($l) { # accumulate all values
1506 $code.=" add @T[-$l], %rax\n";
adc \$0, @T[0] # absorb any pending carry
1511 mov %rax, 8*$i($out)
1518 .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
1520 .globl rsaz_1024_norm2red_avx2
1521 .type rsaz_1024_norm2red_avx2,\@abi-omnipotent
1523 rsaz_1024_norm2red_avx2:
1524 sub \$-128,$out # size optimization
1526 mov \$0x1fffffff,%eax
1528 for ($j=0,$i=0; $i<16; $i++) {
1529 $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
1530 $code.=" xor @T[1],@T[1]\n" if ($i==15);
1532 while (29*($j+1)<64*($i+1)) {
1535 shr \$`29*$j`,@T[-$k]
1536 and %rax,@T[-$k] # &0x1fffffff
1537 mov @T[-$k],`8*$j-128`($out)
1542 shrd \$`29*$j`,@T[1],@T[0]
1544 mov @T[0],`8*$j-128`($out)
1550 mov @T[0],`8*$j-128`($out) # zero
1551 mov @T[0],`8*($j+1)-128`($out)
1552 mov @T[0],`8*($j+2)-128`($out)
1553 mov @T[0],`8*($j+3)-128`($out)
1555 .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
1559 my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1562 .globl rsaz_1024_scatter5_avx2
1563 .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
1565 rsaz_1024_scatter5_avx2:
1567 vmovdqu .Lscatter_permd(%rip),%ymm5
1569 lea ($out,$power),$out
1571 jmp .Loop_scatter_1024
1575 vmovdqu ($inp),%ymm0
1577 vpermd %ymm0,%ymm5,%ymm0
1578 vmovdqu %xmm0,($out)
1579 lea 16*32($out),$out
1581 jnz .Loop_scatter_1024
1585 .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
1587 .globl rsaz_1024_gather5_avx2
1588 .type rsaz_1024_gather5_avx2,\@abi-omnipotent
1590 rsaz_1024_gather5_avx2:
1592 $code.=<<___ if ($win64);
1593 lea -0x88(%rsp),%rax
1595 .LSEH_begin_rsaz_1024_gather5:
1596 # I can't trust assembler to use specific encoding:-(
1597 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
1598 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
1599 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
1600 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
1601 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
1602 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
1603 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
1604 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
1605 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
1606 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
1607 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
1610 lea .Lgather_table(%rip),%r11
1613 shr \$2,%eax # cache line number
1614 shl \$4,$power # offset within cache line
1616 vmovdqu -32(%r11),%ymm7 # .Lgather_permd
1617 vpbroadcastb 8(%r11,%rax), %xmm8
1618 vpbroadcastb 7(%r11,%rax), %xmm9
1619 vpbroadcastb 6(%r11,%rax), %xmm10
1620 vpbroadcastb 5(%r11,%rax), %xmm11
1621 vpbroadcastb 4(%r11,%rax), %xmm12
1622 vpbroadcastb 3(%r11,%rax), %xmm13
1623 vpbroadcastb 2(%r11,%rax), %xmm14
1624 vpbroadcastb 1(%r11,%rax), %xmm15
1626 lea 64($inp,$power),$inp
1627 mov \$64,%r11 # size optimization
1629 jmp .Loop_gather_1024
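# all eight cache lines spanned by a table row are loaded on every
# iteration and combined with and/or; exactly one of the byte masks
# prepared above is all-ones, so the selection leaves no
# secret-dependent memory access pattern behind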
1633 vpand -64($inp), %xmm8,%xmm0
1634 vpand ($inp), %xmm9,%xmm1
1635 vpand 64($inp), %xmm10,%xmm2
1636 vpand ($inp,%r11,2), %xmm11,%xmm3
1637 vpor %xmm0,%xmm1,%xmm1
1638 vpand 64($inp,%r11,2), %xmm12,%xmm4
1639 vpor %xmm2,%xmm3,%xmm3
1640 vpand ($inp,%r11,4), %xmm13,%xmm5
1641 vpor %xmm1,%xmm3,%xmm3
1642 vpand 64($inp,%r11,4), %xmm14,%xmm6
1643 vpor %xmm4,%xmm5,%xmm5
1644 vpand -128($inp,%r11,8), %xmm15,%xmm2
1645 lea ($inp,%r11,8),$inp
1646 vpor %xmm3,%xmm5,%xmm5
1647 vpor %xmm2,%xmm6,%xmm6
1648 vpor %xmm5,%xmm6,%xmm6
1649 vpermd %ymm6,%ymm7,%ymm6
1650 vmovdqu %ymm6,($out)
1653 jnz .Loop_gather_1024
1655 vpxor %ymm0,%ymm0,%ymm0
1656 vmovdqu %ymm0,($out)
1659 $code.=<<___ if ($win64);
1661 movaps 0x10(%rsp),%xmm7
1662 movaps 0x20(%rsp),%xmm8
1663 movaps 0x30(%rsp),%xmm9
1664 movaps 0x40(%rsp),%xmm10
1665 movaps 0x50(%rsp),%xmm11
1666 movaps 0x60(%rsp),%xmm12
1667 movaps 0x70(%rsp),%xmm13
1668 movaps 0x80(%rsp),%xmm14
1669 movaps 0x90(%rsp),%xmm15
1671 .LSEH_end_rsaz_1024_gather5:
1675 .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
1680 .extern OPENSSL_ia32cap_P
1681 .globl rsaz_avx2_eligible
1682 .type rsaz_avx2_eligible,\@abi-omnipotent
mov OPENSSL_ia32cap_P+8(%rip),%eax # third capability dword (CPUID.7.EBX)
1687 $code.=<<___ if ($addx);
1688 mov \$`1<<8|1<<19`,%ecx
1691 cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
1698 .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 # 29-bit digit mask, top lane deliberately all-ones
.long 0,2,4,6,7,7,7,7 # pack the low dword of each digit lane
.long 0,7,1,7,2,7,3,7 # spread dwords back into 64-bit lanes (index 7 reads zero)
.byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0 # sliding 0xff: each broadcast picks exactly one byte mask
1719 .extern __imp_RtlVirtualUnwind
1720 .type rsaz_se_handler,\@abi-omnipotent
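# unwind helper: if RIP falls inside the function body, recover the
# saved non-volatile GPRs and the xmm save area from the frame, then
# hand the rest over to RtlVirtualUnwind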
1734 mov 120($context),%rax # pull context->Rax
1735 mov 248($context),%rbx # pull context->Rip
1737 mov 8($disp),%rsi # disp->ImageBase
1738 mov 56($disp),%r11 # disp->HandlerData
1740 mov 0(%r11),%r10d # HandlerData[0]
1741 lea (%rsi,%r10),%r10 # prologue label
1742 cmp %r10,%rbx # context->Rip<prologue label
1743 jb .Lcommon_seh_tail
1745 mov 152($context),%rax # pull context->Rsp
1747 mov 4(%r11),%r10d # HandlerData[1]
1748 lea (%rsi,%r10),%r10 # epilogue label
1749 cmp %r10,%rbx # context->Rip>=epilogue label
1750 jae .Lcommon_seh_tail
1752 mov 160($context),%rax # pull context->Rbp
1760 mov %r15,240($context)
1761 mov %r14,232($context)
1762 mov %r13,224($context)
1763 mov %r12,216($context)
1764 mov %rbp,160($context)
1765 mov %rbx,144($context)
1767 lea -0xd8(%rax),%rsi # %xmm save area
1768 lea 512($context),%rdi # & context.Xmm6
1769 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1770 .long 0xa548f3fc # cld; rep movsq
1775 mov %rax,152($context) # restore context->Rsp
1776 mov %rsi,168($context) # restore context->Rsi
1777 mov %rdi,176($context) # restore context->Rdi
1779 mov 40($disp),%rdi # disp->ContextRecord
1780 mov $context,%rsi # context
1781 mov \$154,%ecx # sizeof(CONTEXT)
1782 .long 0xa548f3fc # cld; rep movsq
1785 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1786 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1787 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1788 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1789 mov 40(%rsi),%r10 # disp->ContextRecord
1790 lea 56(%rsi),%r11 # &disp->HandlerData
1791 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1792 mov %r10,32(%rsp) # arg5
1793 mov %r11,40(%rsp) # arg6
1794 mov %r12,48(%rsp) # arg7
1795 mov %rcx,56(%rsp) # arg8, (NULL)
1796 call *__imp_RtlVirtualUnwind(%rip)
1798 mov \$1,%eax # ExceptionContinueSearch
1810 .size rsaz_se_handler,.-rsaz_se_handler
1814 .rva .LSEH_begin_rsaz_1024_sqr_avx2
1815 .rva .LSEH_end_rsaz_1024_sqr_avx2
1816 .rva .LSEH_info_rsaz_1024_sqr_avx2
1818 .rva .LSEH_begin_rsaz_1024_mul_avx2
1819 .rva .LSEH_end_rsaz_1024_mul_avx2
1820 .rva .LSEH_info_rsaz_1024_mul_avx2
1822 .rva .LSEH_begin_rsaz_1024_gather5
1823 .rva .LSEH_end_rsaz_1024_gather5
1824 .rva .LSEH_info_rsaz_1024_gather5
1827 .LSEH_info_rsaz_1024_sqr_avx2:
1829 .rva rsaz_se_handler
1830 .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
1831 .LSEH_info_rsaz_1024_mul_avx2:
1833 .rva rsaz_se_handler
1834 .rva .Lmul_1024_body,.Lmul_1024_epilogue
1835 .LSEH_info_rsaz_1024_gather5:
1836 .byte 0x01,0x33,0x16,0x00
1837 .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
1838 .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
1839 .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
1840 .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
1841 .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
1842 .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
1843 .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
1844 .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
1845 .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
1846 .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
1847 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
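# post-process the generated code: reduce shift amounts mod 64 and
# rewrite instructions that were spelled with %ymm names above but
# formally operate on %xmm registers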
1851 foreach (split("\n",$code)) {
1852 s/\`([^\`]*)\`/eval($1)/ge;
1854 s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
1856 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1857 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1858 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1859 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1860 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
1865 print <<___; # assembler is too old
1868 .globl rsaz_avx2_eligible
1869 .type rsaz_avx2_eligible,\@abi-omnipotent
1873 .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
1875 .globl rsaz_1024_sqr_avx2
1876 .globl rsaz_1024_mul_avx2
1877 .globl rsaz_1024_norm2red_avx2
1878 .globl rsaz_1024_red2norm_avx2
1879 .globl rsaz_1024_scatter5_avx2
1880 .globl rsaz_1024_gather5_avx2
1881 .type rsaz_1024_sqr_avx2,\@abi-omnipotent
1884 rsaz_1024_norm2red_avx2:
1885 rsaz_1024_red2norm_avx2:
1886 rsaz_1024_scatter5_avx2:
1887 rsaz_1024_gather5_avx2:
1888 .byte 0x0f,0x0b # ud2
1890 .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2