bn/asm/rsaz-avx2.pl: minor optimization [for Decoded ICache].
1 #!/usr/bin/env perl
2
3 #******************************************************************************
4 #* Copyright(c) 2012, Intel Corp.                                             
5 #* Developers and authors:                                                    
6 #* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   
7 #* (1) Intel Corporation, Israel Development Center, Haifa, Israel
8 #* (2) University of Haifa, Israel                                              
9 #******************************************************************************
10 #* LICENSE:                                                                
11 #* This submission to OpenSSL is to be made available under the OpenSSL  
12 #* license, and only to the OpenSSL project, in order to allow integration    
13 #* into the publicly distributed code. 
14 #* The use of this code, or portions of this code, or concepts embedded in
15 #* this code, or modification of this code and/or algorithm(s) in it, or the
16 #* use of this code for any other purpose than stated above, requires special
17 #* licensing.                                                                  
18 #******************************************************************************
19 #* DISCLAIMER:                                                                
20 #* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS     
21 #* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
22 #* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
23 #* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT
24 #* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 
25 #* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF    
26 #* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS   
27 #* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN    
28 #* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)    
29 #* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
30 #* POSSIBILITY OF SUCH DAMAGE.                                                
31 #******************************************************************************
32 #* Reference:                                                                 
33 #* [1]  S. Gueron, V. Krasnov: "Software Implementation of Modular
34 #*      Exponentiation,  Using Advanced Vector Instructions Architectures",
35 #*      F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
36 #*      pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
37 #* [2]  S. Gueron: "Efficient Software Implementations of Modular
38 #*      Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
39 #* [3]  S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE
40 #*      Proceedings of 9th International Conference on Information Technology:
41 #*      New Generations (ITNG 2012), pp.821-823 (2012)
42 #* [4]  S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
43 #*      resistant 1024-bit modular exponentiation, for optimizing RSA2048
44 #*      on AVX2 capable x86_64 platforms",
45 #*      http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
46 #******************************************************************************
47
48 # +10% improvement by <appro@openssl.org>
49 #
50 # rsa2048 sign/sec      OpenSSL 1.0.1   scalar(*)       this
51 # 2.3GHz Haswell        621             732/+18%        1112/+79%
52 #
53 # (*)   the scalar code path, i.e. what is used if the system doesn't
#       support AVX2; shown for reference purposes.
54
55 $flavour = shift;
56 $output  = shift;
57 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
58
59 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
60
61 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
62 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
63 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
64 die "can't locate x86_64-xlate.pl";
65
66 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
67                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
68         $avx = ($1>=2.19) + ($1>=2.22);
69 }
70
71 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
72             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
73         $avx = ($1>=2.09) + ($1>=2.10);
74 }
75
76 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
77             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
78         $avx = ($1>=10) + ($1>=11);
79 }
80
81 open OUT,"| $^X $xlate $flavour $output";
82 *STDOUT = *OUT;
83
84 if ($avx>1) {{{
85 { # void AMS_WW(
86 my $rp="%rdi";  # BN_ULONG *rp,
87 my $ap="%rsi";  # const BN_ULONG *ap,
88 my $np="%rdx";  # const BN_ULONG *np,
89 my $n0="%ecx";  # const BN_ULONG n0,
90 my $rep="%r8d"; # int repeat);
91
92 # The registers that hold the accumulated redundant result
93 # The AMM works on 1024 bit operands, and redundant word size is 29
94 # Therefore: ceil(1024/29)/4 = 9 (see the conversion sketch further below)
95 my $ACC0="%ymm0";
96 my $ACC1="%ymm1";
97 my $ACC2="%ymm2";
98 my $ACC3="%ymm3";
99 my $ACC4="%ymm4";
100 my $ACC5="%ymm5";
101 my $ACC6="%ymm6";
102 my $ACC7="%ymm7";
103 my $ACC8="%ymm8";
104 my $ACC9="%ymm9";
105 # Registers that hold the broadcasted words of bp, currently used
106 my $B1="%ymm10";
107 my $B2="%ymm11";
108 # Registers that hold the broadcasted words of Y, currently used
109 my $Y1="%ymm12";
110 my $Y2="%ymm13";
111 # Helper registers
112 my $TEMP1="%ymm14";
113 my $AND_MASK="%ymm15";
114 # alu registers that hold the first words of the ACC
115 my $r0="%r9";
116 my $r1="%r10";
117 my $r2="%r11";
118 my $r3="%r12";
119
120 my $i="%r14d";                  # loop counter
121 my $tmp = "%r15";
122
123 my $FrameSize=32*18+32*8;       # place for A^2 and 2*A
124
125 my $aap=$r0;
126 my $tp0="%rbx";
127 my $tp1=$r3;
128 my $tpa=$tmp;
129
130 $np="%r13";                     # reassigned argument
131
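# A minimal conversion sketch, for exposition only: it is never called by this
# module and the helper name is made up.  It splits a 1024-bit value, given as
# a Math::BigInt, into the ceil(1024/29) = 36 base-2^29 digits that the code
# keeps in the 9 ymm accumulator registers, four 64-bit lanes per register.
sub __to_redundant_base29_sketch {
	my ($x, $ndigits) = @_;			# $x is a Math::BigInt, $ndigits is 36 here
	my $mask = (1 << 29) - 1;
	my @digits;
	for (1 .. $ndigits) {
		push @digits, ($x & $mask)->numify();	# low 29 bits of $x
		$x = $x >> 29;
	}
	return @digits;				# least significant digit first
}
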
132 $code.=<<___;
133 .text
134
135 .globl  rsaz_1024_sqr_avx2
136 .type   rsaz_1024_sqr_avx2,\@function,5
137 .align  64
138 rsaz_1024_sqr_avx2:             # 702 cycles, 14% faster than rsaz_1024_mul_avx2
139         lea     (%rsp), %rax
140         push    %rbx
141         push    %rbp
142         push    %r12
143         push    %r13
144         push    %r14
145         push    %r15
146 ___
147 $code.=<<___ if ($win64);
148         lea     -0xa8(%rsp),%rsp
149         movaps  %xmm6,-0xd8(%rax)
150         movaps  %xmm7,-0xc8(%rax)
151         movaps  %xmm8,-0xb8(%rax)
152         movaps  %xmm9,-0xa8(%rax)
153         movaps  %xmm10,-0x98(%rax)
154         movaps  %xmm11,-0x88(%rax)
155         movaps  %xmm12,-0x78(%rax)
156         movaps  %xmm13,-0x68(%rax)
157         movaps  %xmm14,-0x58(%rax)
158         movaps  %xmm15,-0x48(%rax)
159 .Lsqr_1024_body:
160 ___
161 $code.=<<___;
162         mov     %rax,%rbp
163         vzeroall
164         mov     %rdx, $np                       # reassigned argument
165         sub     \$$FrameSize, %rsp
166         mov     $np, $tmp
167         sub     \$-128, $rp                     # size optimization
168         sub     \$-128, $ap
169         sub     \$-128, $np
170
171         and     \$4095, $tmp                    # see if $np crosses page
172         add     \$32*10, $tmp
173         shr     \$12, $tmp
174         jz      .Lsqr_1024_no_n_copy
175
176         # unaligned 256-bit load that crosses page boundary can
177         # cause >2x performance degradation here, so if $np does
178         # cross page boundary, copy it to stack and make sure stack
179         # frame doesn't...
180         sub             \$32*10,%rsp
181         vmovdqu         32*0-128($np), $ACC0
182         and             \$-2048, %rsp
183         vmovdqu         32*1-128($np), $ACC1
184         vmovdqu         32*2-128($np), $ACC2
185         vmovdqu         32*3-128($np), $ACC3
186         vmovdqu         32*4-128($np), $ACC4
187         vmovdqu         32*5-128($np), $ACC5
188         vmovdqu         32*6-128($np), $ACC6
189         vmovdqu         32*7-128($np), $ACC7
190         vmovdqu         32*8-128($np), $ACC8
191         lea             $FrameSize+128(%rsp),$np
192         vmovdqu         $ACC0, 32*0-128($np)
193         vmovdqu         $ACC1, 32*1-128($np)
194         vmovdqu         $ACC2, 32*2-128($np)
195         vmovdqu         $ACC3, 32*3-128($np)
196         vmovdqu         $ACC4, 32*4-128($np)
197         vmovdqu         $ACC5, 32*5-128($np)
198         vmovdqu         $ACC6, 32*6-128($np)
199         vmovdqu         $ACC7, 32*7-128($np)
200         vmovdqu         $ACC8, 32*8-128($np)
201         vmovdqu         $ACC9, 32*9-128($np)    # $ACC9 is zero after vzeroall
202
203 .Lsqr_1024_no_n_copy:
204         and             \$-1024, %rsp
205
206         vmovdqu         32*1-128($ap), $ACC1
207         vmovdqu         32*2-128($ap), $ACC2
208         vmovdqu         32*3-128($ap), $ACC3
209         vmovdqu         32*4-128($ap), $ACC4
210         vmovdqu         32*5-128($ap), $ACC5
211         vmovdqu         32*6-128($ap), $ACC6
212         vmovdqu         32*7-128($ap), $ACC7
213         vmovdqu         32*8-128($ap), $ACC8
214
215         lea     192(%rsp), $tp0                 # 64+128=192
216         vpbroadcastq    .Land_mask(%rip), $AND_MASK
217         jmp     .LOOP_GRANDE_SQR_1024
218
219 .align  32
220 .LOOP_GRANDE_SQR_1024:
221         lea     32*18+128(%rsp), $aap           # size optimization
222         lea     448(%rsp), $tp1                 # 64+128+256=448
223
224         # the squaring is performed as described in Variant B of
225         # "Speeding up Big-Number Squaring", so start by calculating
226         # the A*2=A+A vector
227         vpaddq          $ACC1, $ACC1, $ACC1
228          vpbroadcastq   32*0-128($ap), $B1
229         vpaddq          $ACC2, $ACC2, $ACC2
230         vmovdqa         $ACC1, 32*0-128($aap)
231         vpaddq          $ACC3, $ACC3, $ACC3
232         vmovdqa         $ACC2, 32*1-128($aap)
233         vpaddq          $ACC4, $ACC4, $ACC4
234         vmovdqa         $ACC3, 32*2-128($aap)
235         vpaddq          $ACC5, $ACC5, $ACC5
236         vmovdqa         $ACC4, 32*3-128($aap)
237         vpaddq          $ACC6, $ACC6, $ACC6
238         vmovdqa         $ACC5, 32*4-128($aap)
239         vpaddq          $ACC7, $ACC7, $ACC7
240         vmovdqa         $ACC6, 32*5-128($aap)
241         vpaddq          $ACC8, $ACC8, $ACC8
242         vmovdqa         $ACC7, 32*6-128($aap)
243         vpxor           $ACC9, $ACC9, $ACC9
244         vmovdqa         $ACC8, 32*7-128($aap)
245
246         vpmuludq        32*0-128($ap), $B1, $ACC0
247          vpbroadcastq   32*1-128($ap), $B2
248          vmovdqu        $ACC9, 32*9-192($tp0)   # zero upper half
249         vpmuludq        $B1, $ACC1, $ACC1
250          vmovdqu        $ACC9, 32*10-448($tp1)
251         vpmuludq        $B1, $ACC2, $ACC2
252          vmovdqu        $ACC9, 32*11-448($tp1)
253         vpmuludq        $B1, $ACC3, $ACC3
254          vmovdqu        $ACC9, 32*12-448($tp1)
255         vpmuludq        $B1, $ACC4, $ACC4
256          vmovdqu        $ACC9, 32*13-448($tp1)
257         vpmuludq        $B1, $ACC5, $ACC5
258          vmovdqu        $ACC9, 32*14-448($tp1)
259         vpmuludq        $B1, $ACC6, $ACC6
260          vmovdqu        $ACC9, 32*15-448($tp1)
261         vpmuludq        $B1, $ACC7, $ACC7
262          vmovdqu        $ACC9, 32*16-448($tp1)
263         vpmuludq        $B1, $ACC8, $ACC8
264          vpbroadcastq   32*2-128($ap), $B1
265          vmovdqu        $ACC9, 32*17-448($tp1)
266
267         mov     $ap, $tpa
268         mov     \$4, $i
269         jmp     .Lsqr_entry_1024
270 ___
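# A scalar sketch, for exposition only (never called, hypothetical name), of
# the "Variant B" squaring set up above: the doubled vector 2*A is computed
# once and stored at $aap, the diagonal terms use a[i]*a[i], and every cross
# term a[i]*a[j], i<j, is taken exactly once as a[i]*(2*a[j]), which is what
# the vpmuludq instructions read from $aap.  Digits should be Math::BigInt
# (or small enough native integers) so the unreduced column sums stay exact.
sub __sqr_variant_b_sketch {
	my @a   = @_;				# base-2^29 digits, LS digit first
	my @aap = map { 2 * $_ } @a;		# the A+A vector
	my @t   = (0) x (2 * @a);		# double length, carries not propagated
	for my $i (0 .. $#a) {
		$t[2 * $i] += $a[$i] * $a[$i];
		for my $j ($i + 1 .. $#a) {
			$t[$i + $j] += $a[$i] * $aap[$j];
		}
	}
	return @t;
}
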
271 $TEMP0=$Y1;
272 $TEMP2=$Y2;
273 $code.=<<___;
274 .align  32
275 .LOOP_SQR_1024:
276          vpbroadcastq   32*1-128($tpa), $B2
277         vpmuludq        32*0-128($ap), $B1, $ACC0
278         vpaddq          32*0-192($tp0), $ACC0, $ACC0
279         vpmuludq        32*0-128($aap), $B1, $ACC1
280         vpaddq          32*1-192($tp0), $ACC1, $ACC1
281         vpmuludq        32*1-128($aap), $B1, $ACC2
282         vpaddq          32*2-192($tp0), $ACC2, $ACC2
283         vpmuludq        32*2-128($aap), $B1, $ACC3
284         vpaddq          32*3-192($tp0), $ACC3, $ACC3
285         vpmuludq        32*3-128($aap), $B1, $ACC4
286         vpaddq          32*4-192($tp0), $ACC4, $ACC4
287         vpmuludq        32*4-128($aap), $B1, $ACC5
288         vpaddq          32*5-192($tp0), $ACC5, $ACC5
289         vpmuludq        32*5-128($aap), $B1, $ACC6
290         vpaddq          32*6-192($tp0), $ACC6, $ACC6
291         vpmuludq        32*6-128($aap), $B1, $ACC7
292         vpaddq          32*7-192($tp0), $ACC7, $ACC7
293         vpmuludq        32*7-128($aap), $B1, $ACC8
294          vpbroadcastq   32*2-128($tpa), $B1
295         vpaddq          32*8-192($tp0), $ACC8, $ACC8
296 .Lsqr_entry_1024:
297         vmovdqu         $ACC0, 32*0-192($tp0)
298         vmovdqu         $ACC1, 32*1-192($tp0)
299
300         vpmuludq        32*1-128($ap), $B2, $TEMP0
301         vpaddq          $TEMP0, $ACC2, $ACC2
302         vpmuludq        32*1-128($aap), $B2, $TEMP1
303         vpaddq          $TEMP1, $ACC3, $ACC3
304         vpmuludq        32*2-128($aap), $B2, $TEMP2
305         vpaddq          $TEMP2, $ACC4, $ACC4
306         vpmuludq        32*3-128($aap), $B2, $TEMP0
307         vpaddq          $TEMP0, $ACC5, $ACC5
308         vpmuludq        32*4-128($aap), $B2, $TEMP1
309         vpaddq          $TEMP1, $ACC6, $ACC6
310         vpmuludq        32*5-128($aap), $B2, $TEMP2
311         vpaddq          $TEMP2, $ACC7, $ACC7
312         vpmuludq        32*6-128($aap), $B2, $TEMP0
313         vpaddq          $TEMP0, $ACC8, $ACC8
314         vpmuludq        32*7-128($aap), $B2, $ACC0
315          vpbroadcastq   32*3-128($tpa), $B2
316         vpaddq          32*9-192($tp0), $ACC0, $ACC0
317
318         vmovdqu         $ACC2, 32*2-192($tp0)
319         vmovdqu         $ACC3, 32*3-192($tp0)
320
321         vpmuludq        32*2-128($ap), $B1, $TEMP2
322         vpaddq          $TEMP2, $ACC4, $ACC4
323         vpmuludq        32*2-128($aap), $B1, $TEMP0
324         vpaddq          $TEMP0, $ACC5, $ACC5
325         vpmuludq        32*3-128($aap), $B1, $TEMP1
326         vpaddq          $TEMP1, $ACC6, $ACC6
327         vpmuludq        32*4-128($aap), $B1, $TEMP2
328         vpaddq          $TEMP2, $ACC7, $ACC7
329         vpmuludq        32*5-128($aap), $B1, $TEMP0
330         vpaddq          $TEMP0, $ACC8, $ACC8
331         vpmuludq        32*6-128($aap), $B1, $TEMP1
332         vpaddq          $TEMP1, $ACC0, $ACC0
333         vpmuludq        32*7-128($aap), $B1, $ACC1
334          vpbroadcastq   32*4-128($tpa), $B1
335         vpaddq          32*10-448($tp1), $ACC1, $ACC1
336
337         vmovdqu         $ACC4, 32*4-192($tp0)
338         vmovdqu         $ACC5, 32*5-192($tp0)
339
340         vpmuludq        32*3-128($ap), $B2, $TEMP0
341         vpaddq          $TEMP0, $ACC6, $ACC6
342         vpmuludq        32*3-128($aap), $B2, $TEMP1
343         vpaddq          $TEMP1, $ACC7, $ACC7
344         vpmuludq        32*4-128($aap), $B2, $TEMP2
345         vpaddq          $TEMP2, $ACC8, $ACC8
346         vpmuludq        32*5-128($aap), $B2, $TEMP0
347         vpaddq          $TEMP0, $ACC0, $ACC0
348         vpmuludq        32*6-128($aap), $B2, $TEMP1
349         vpaddq          $TEMP1, $ACC1, $ACC1
350         vpmuludq        32*7-128($aap), $B2, $ACC2
351          vpbroadcastq   32*5-128($tpa), $B2
352         vpaddq          32*11-448($tp1), $ACC2, $ACC2   
353
354         vmovdqu         $ACC6, 32*6-192($tp0)
355         vmovdqu         $ACC7, 32*7-192($tp0)
356
357         vpmuludq        32*4-128($ap), $B1, $TEMP0
358         vpaddq          $TEMP0, $ACC8, $ACC8
359         vpmuludq        32*4-128($aap), $B1, $TEMP1
360         vpaddq          $TEMP1, $ACC0, $ACC0
361         vpmuludq        32*5-128($aap), $B1, $TEMP2
362         vpaddq          $TEMP2, $ACC1, $ACC1
363         vpmuludq        32*6-128($aap), $B1, $TEMP0
364         vpaddq          $TEMP0, $ACC2, $ACC2
365         vpmuludq        32*7-128($aap), $B1, $ACC3
366          vpbroadcastq   32*6-128($tpa), $B1
367         vpaddq          32*12-448($tp1), $ACC3, $ACC3
368
369         vmovdqu         $ACC8, 32*8-192($tp0)
370         vmovdqu         $ACC0, 32*9-192($tp0)
371         lea             8($tp0), $tp0
372
373         vpmuludq        32*5-128($ap), $B2, $TEMP2
374         vpaddq          $TEMP2, $ACC1, $ACC1
375         vpmuludq        32*5-128($aap), $B2, $TEMP0
376         vpaddq          $TEMP0, $ACC2, $ACC2
377         vpmuludq        32*6-128($aap), $B2, $TEMP1
378         vpaddq          $TEMP1, $ACC3, $ACC3
379         vpmuludq        32*7-128($aap), $B2, $ACC4
380          vpbroadcastq   32*7-128($tpa), $B2
381         vpaddq          32*13-448($tp1), $ACC4, $ACC4
382
383         vmovdqu         $ACC1, 32*10-448($tp1)
384         vmovdqu         $ACC2, 32*11-448($tp1)
385
386         vpmuludq        32*6-128($ap), $B1, $TEMP0
387         vpaddq          $TEMP0, $ACC3, $ACC3
388         vpmuludq        32*6-128($aap), $B1, $TEMP1
389          vpbroadcastq   32*8-128($tpa), $ACC0           # borrow $ACC0 for $B1
390         vpaddq          $TEMP1, $ACC4, $ACC4
391         vpmuludq        32*7-128($aap), $B1, $ACC5
392          vpbroadcastq   32*0+8-128($tpa), $B1           # for next iteration
393         vpaddq          32*14-448($tp1), $ACC5, $ACC5
394
395         vmovdqu         $ACC3, 32*12-448($tp1)
396         vmovdqu         $ACC4, 32*13-448($tp1)
397         lea             8($tpa), $tpa
398
399         vpmuludq        32*7-128($ap), $B2, $TEMP0
400         vpaddq          $TEMP0, $ACC5, $ACC5
401         vpmuludq        32*7-128($aap), $B2, $ACC6
402         vpaddq          32*15-448($tp1), $ACC6, $ACC6
403
404         vpmuludq        32*8-128($ap), $ACC0, $ACC7
405         vmovdqu         $ACC5, 32*14-448($tp1)
406         vpaddq          32*16-448($tp1), $ACC7, $ACC7
407         vmovdqu         $ACC6, 32*15-448($tp1)
408         vmovdqu         $ACC7, 32*16-448($tp1)
409         lea             8($tp1), $tp1
410
411         dec     $i        
412         jnz     .LOOP_SQR_1024
413 ___
414 $ZERO = $ACC9;
415 $TEMP0 = $B1;
416 $TEMP2 = $B2;
417 $TEMP3 = $Y1;
418 $TEMP4 = $Y2;
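# A scalar sketch, for exposition only (never called, hypothetical name), of
# the partial carry propagation performed by the vpsrlq/vpand/vpermq/vpblendd
# sequences below: the bits above 2^29 of every digit are split off and added
# back one digit position higher.  A single pass does not fully normalize the
# digits, it only keeps them from overflowing their 64-bit lanes.
sub __carry_pass_29_sketch {
	my @d    = @_;				# base-2^29 digits, LS digit first
	my $mask = (1 << 29) - 1;
	my @hi   = map { $_ >> 29 } @d;		# vpsrlq \$29
	@d       = map { $_ & $mask } @d;	# vpand .Land_mask
	unshift @hi, 0;				# vpermq/vpblendd: up one digit
	$d[$_] += $hi[$_] for 0 .. $#d;		# vpaddq
	return (@d, $hi[-1]);			# carry out of the top digit
}
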
419 $code.=<<___;
420 # we need to fix indexes 32-39 to avoid overflow
421         vmovdqu         32*8(%rsp), $ACC8               # 32*8-192($tp0),
422         vmovdqu         32*9(%rsp), $ACC1               # 32*9-192($tp0)
423         vmovdqu         32*10(%rsp), $ACC2              # 32*10-192($tp0)
424         lea             192(%rsp), $tp0                 # 64+128=192
425
426         vpsrlq          \$29, $ACC8, $TEMP1
427         vpand           $AND_MASK, $ACC8, $ACC8
428         vpsrlq          \$29, $ACC1, $TEMP2
429         vpand           $AND_MASK, $ACC1, $ACC1
430
431         vpermq          \$0x93, $TEMP1, $TEMP1
432         vpxor           $ZERO, $ZERO, $ZERO
433         vpermq          \$0x93, $TEMP2, $TEMP2
434
435         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
436         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
437         vpaddq          $TEMP0, $ACC8, $ACC8
438         vpblendd        \$3, $TEMP2, $ZERO, $TEMP2
439         vpaddq          $TEMP1, $ACC1, $ACC1
440         vpaddq          $TEMP2, $ACC2, $ACC2
441         vmovdqu         $ACC1, 32*9-192($tp0)
442         vmovdqu         $ACC2, 32*10-192($tp0)
443
444         mov     (%rsp), %rax
445         mov     8(%rsp), $r1
446         mov     16(%rsp), $r2
447         mov     24(%rsp), $r3
448         vmovdqu 32*1(%rsp), $ACC1
449         vmovdqu 32*2-192($tp0), $ACC2
450         vmovdqu 32*3-192($tp0), $ACC3
451         vmovdqu 32*4-192($tp0), $ACC4
452         vmovdqu 32*5-192($tp0), $ACC5
453         vmovdqu 32*6-192($tp0), $ACC6
454         vmovdqu 32*7-192($tp0), $ACC7
455
456         mov     %rax, $r0
457         imull   $n0, %eax
458         and     \$0x1fffffff, %eax
459         vmovd   %eax, $Y1
460
461         mov     %rax, %rdx
462         imulq   -128($np), %rax
463          vpbroadcastq   $Y1, $Y1
464         add     %rax, $r0
465         mov     %rdx, %rax
466         imulq   8-128($np), %rax
467         shr     \$29, $r0
468         add     %rax, $r1
469         mov     %rdx, %rax
470         imulq   16-128($np), %rax
471         add     $r0, $r1
472         add     %rax, $r2
473         imulq   24-128($np), %rdx
474         add     %rdx, $r3
475
476         mov     $r1, %rax
477         imull   $n0, %eax
478         and     \$0x1fffffff, %eax
479
480         mov \$9, $i
481         jmp .LOOP_REDUCE_1024
482
483 .align  32
484 .LOOP_REDUCE_1024:
485         vmovd   %eax, $Y2
486         vpbroadcastq    $Y2, $Y2
487
488         vpmuludq        32*1-128($np), $Y1, $TEMP0
489          mov    %rax, %rdx
490          imulq  -128($np), %rax
491         vpaddq          $TEMP0, $ACC1, $ACC1
492          add    %rax, $r1
493         vpmuludq        32*2-128($np), $Y1, $TEMP1
494          mov    %rdx, %rax
495          imulq  8-128($np), %rax
496         vpaddq          $TEMP1, $ACC2, $ACC2
497         vpmuludq        32*3-128($np), $Y1, $TEMP2
498          .byte  0x67
499          add    %rax, $r2
500          .byte  0x67
501          mov    %rdx, %rax
502          imulq  16-128($np), %rax
503          shr    \$29, $r1
504         vpaddq          $TEMP2, $ACC3, $ACC3
505         vpmuludq        32*4-128($np), $Y1, $TEMP0
506          add    %rax, $r3
507          add    $r1, $r2
508         vpaddq          $TEMP0, $ACC4, $ACC4
509         vpmuludq        32*5-128($np), $Y1, $TEMP1
510          mov    $r2, %rax
511          imull  $n0, %eax
512         vpaddq          $TEMP1, $ACC5, $ACC5
513         vpmuludq        32*6-128($np), $Y1, $TEMP2
514          and    \$0x1fffffff, %eax
515         vpaddq          $TEMP2, $ACC6, $ACC6
516         vpmuludq        32*7-128($np), $Y1, $TEMP0
517         vpaddq          $TEMP0, $ACC7, $ACC7
518         vpmuludq        32*8-128($np), $Y1, $TEMP1
519          vmovd  %eax, $Y1
520          #vmovdqu       32*1-8-128($np), $TEMP2         # moved below
521         vpaddq          $TEMP1, $ACC8, $ACC8
522          #vmovdqu       32*2-8-128($np), $TEMP0         # moved below
523          vpbroadcastq   $Y1, $Y1
524
525         vpmuludq        32*1-8-128($np), $Y2, $TEMP2    # see above
526         vmovdqu         32*3-8-128($np), $TEMP1
527          mov    %rax, %rdx
528          imulq  -128($np), %rax
529         vpaddq          $TEMP2, $ACC1, $ACC1
530         vpmuludq        32*2-8-128($np), $Y2, $TEMP0    # see above
531         vmovdqu         32*4-8-128($np), $TEMP2
532          add    %rax, $r2
533          mov    %rdx, %rax
534          imulq  8-128($np), %rax
535         vpaddq          $TEMP0, $ACC2, $ACC2
536          add    $r3, %rax
537          shr    \$29, $r2
538         vpmuludq        $Y2, $TEMP1, $TEMP1
539         vmovdqu         32*5-8-128($np), $TEMP0
540          add    $r2, %rax
541         vpaddq          $TEMP1, $ACC3, $ACC3
542         vpmuludq        $Y2, $TEMP2, $TEMP2
543         vmovdqu         32*6-8-128($np), $TEMP1
544          .byte  0x67
545          mov    %rax, $r3
546          imull  $n0, %eax
547         vpaddq          $TEMP2, $ACC4, $ACC4
548         vpmuludq        $Y2, $TEMP0, $TEMP0
549         .byte   0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00    # vmovdqu               32*7-8-128($np), $TEMP2
550          and    \$0x1fffffff, %eax
551         vpaddq          $TEMP0, $ACC5, $ACC5
552         vpmuludq        $Y2, $TEMP1, $TEMP1
553         vmovdqu         32*8-8-128($np), $TEMP0
554         vpaddq          $TEMP1, $ACC6, $ACC6
555         vpmuludq        $Y2, $TEMP2, $TEMP2
556         vmovdqu         32*9-8-128($np), $ACC9
557          vmovd  %eax, $ACC0                     # borrow ACC0 for Y2
558          imulq  -128($np), %rax
559         vpaddq          $TEMP2, $ACC7, $ACC7
560         vpmuludq        $Y2, $TEMP0, $TEMP0
561          vmovdqu        32*1-16-128($np), $TEMP1
562          vpbroadcastq   $ACC0, $ACC0
563         vpaddq          $TEMP0, $ACC8, $ACC8
564         vpmuludq        $Y2, $ACC9, $ACC9
565          vmovdqu        32*2-16-128($np), $TEMP2
566          add    %rax, $r3
567
568 ___
569 ($ACC0,$Y2)=($Y2,$ACC0);
570 $code.=<<___;
571          vmovdqu        32*1-24-128($np), $ACC0
572         vpmuludq        $Y1, $TEMP1, $TEMP1
573         vmovdqu         32*3-16-128($np), $TEMP0
574         vpaddq          $TEMP1, $ACC1, $ACC1
575          vpmuludq       $Y2, $ACC0, $ACC0
576         vpmuludq        $Y1, $TEMP2, $TEMP2
577         .byte   0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff    # vmovdqu               32*4-16-128($np), $TEMP1
578          vpaddq         $ACC1, $ACC0, $ACC0
579         vpaddq          $TEMP2, $ACC2, $ACC2
580         vpmuludq        $Y1, $TEMP0, $TEMP0
581         vmovdqu         32*5-16-128($np), $TEMP2
582          .byte  0x67
583          vmovq          $ACC0, %rax
584          vmovdqu        $ACC0, (%rsp)           # transfer $r0-$r3
585         vpaddq          $TEMP0, $ACC3, $ACC3
586         vpmuludq        $Y1, $TEMP1, $TEMP1
587         vmovdqu         32*6-16-128($np), $TEMP0
588         vpaddq          $TEMP1, $ACC4, $ACC4
589         vpmuludq        $Y1, $TEMP2, $TEMP2
590         vmovdqu         32*7-16-128($np), $TEMP1
591         vpaddq          $TEMP2, $ACC5, $ACC5
592         vpmuludq        $Y1, $TEMP0, $TEMP0
593         vmovdqu         32*8-16-128($np), $TEMP2
594         vpaddq          $TEMP0, $ACC6, $ACC6
595         vpmuludq        $Y1, $TEMP1, $TEMP1
596          shr    \$29, $r3
597         vmovdqu         32*9-16-128($np), $TEMP0
598          add    $r3, %rax
599         vpaddq          $TEMP1, $ACC7, $ACC7
600         vpmuludq        $Y1, $TEMP2, $TEMP2
601          #vmovdqu       32*2-24-128($np), $TEMP1        # moved below
602          mov    %rax, $r0
603          imull  $n0, %eax
604         vpaddq          $TEMP2, $ACC8, $ACC8
605         vpmuludq        $Y1, $TEMP0, $TEMP0
606          and    \$0x1fffffff, %eax
607          vmovd  %eax, $Y1
608          vmovdqu        32*3-24-128($np), $TEMP2
609         .byte   0x67
610         vpaddq          $TEMP0, $ACC9, $ACC9
611          vpbroadcastq   $Y1, $Y1
612
613         vpmuludq        32*2-24-128($np), $Y2, $TEMP1   # see above
614         vmovdqu         32*4-24-128($np), $TEMP0
615          mov    %rax, %rdx
616          imulq  -128($np), %rax
617          mov    8(%rsp), $r1
618         vpaddq          $TEMP1, $ACC2, $ACC1
619         vpmuludq        $Y2, $TEMP2, $TEMP2
620         vmovdqu         32*5-24-128($np), $TEMP1
621          add    %rax, $r0
622          mov    %rdx, %rax
623          imulq  8-128($np), %rax
624          .byte  0x67
625          shr    \$29, $r0
626          mov    16(%rsp), $r2
627         vpaddq          $TEMP2, $ACC3, $ACC2
628         vpmuludq        $Y2, $TEMP0, $TEMP0
629         vmovdqu         32*6-24-128($np), $TEMP2
630          add    %rax, $r1
631          mov    %rdx, %rax
632          imulq  16-128($np), %rax
633         vpaddq          $TEMP0, $ACC4, $ACC3
634         vpmuludq        $Y2, $TEMP1, $TEMP1
635         vmovdqu         32*7-24-128($np), $TEMP0
636          imulq  24-128($np), %rdx               # future $r3
637          add    %rax, $r2
638          lea    ($r0,$r1), %rax
639         vpaddq          $TEMP1, $ACC5, $ACC4
640         vpmuludq        $Y2, $TEMP2, $TEMP2
641         vmovdqu         32*8-24-128($np), $TEMP1
642          mov    %rax, $r1
643          imull  $n0, %eax
644         vpmuludq        $Y2, $TEMP0, $TEMP0
645         vpaddq          $TEMP2, $ACC6, $ACC5
646         vmovdqu         32*9-24-128($np), $TEMP2
647          and    \$0x1fffffff, %eax
648         vpaddq          $TEMP0, $ACC7, $ACC6
649         vpmuludq        $Y2, $TEMP1, $TEMP1
650          add    24(%rsp), %rdx
651         vpaddq          $TEMP1, $ACC8, $ACC7
652         vpmuludq        $Y2, $TEMP2, $TEMP2
653         vpaddq          $TEMP2, $ACC9, $ACC8
654          vmovq  $r3, $ACC9
655          mov    %rdx, $r3
656
657         dec     $i
658         jnz     .LOOP_REDUCE_1024
659 ___
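# A scalar sketch, for exposition only (never called, hypothetical name), of
# what .LOOP_REDUCE_1024 above computes: a word-by-word Montgomery reduction
# in radix 2^29 (the assembly retires four digits per iteration, 9 iterations,
# 36 digits in total).  $n0 is the usual -1/n mod 2^64 constant; masking it
# with 0x1fffffff, as the code does, yields -1/n mod 2^29.  Operands are
# Math::BigInt.
sub __mont_reduce_29_sketch {
	my ($t, $n, $n0, $ndigits) = @_;	# $ndigits is 36 for 1024-bit $n
	my $mask = (1 << 29) - 1;
	for (1 .. $ndigits) {
		my $y = (($t & $mask) * ($n0 & $mask)) & $mask;
		$t = ($t + $y * $n) >> 29;	# low digit is now zero, drop it
	}
	return $t;	# congruent to t/2^(29*$ndigits) mod $n, may still be >= $n
}
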
660 ($ACC0,$Y2)=($Y2,$ACC0);
661 $code.=<<___;
662         lea     448(%rsp), $tp1                 # size optimization
663         vpaddq  $ACC9, $Y2, $ACC0
664         vpxor   $ZERO, $ZERO, $ZERO
665
666         vpaddq          32*9-192($tp0), $ACC0, $ACC0
667         vpaddq          32*10-448($tp1), $ACC1, $ACC1
668         vpaddq          32*11-448($tp1), $ACC2, $ACC2
669         vpaddq          32*12-448($tp1), $ACC3, $ACC3
670         vpaddq          32*13-448($tp1), $ACC4, $ACC4
671         vpaddq          32*14-448($tp1), $ACC5, $ACC5
672         vpaddq          32*15-448($tp1), $ACC6, $ACC6
673         vpaddq          32*16-448($tp1), $ACC7, $ACC7
674         vpaddq          32*17-448($tp1), $ACC8, $ACC8
675
676         vpsrlq          \$29, $ACC0, $TEMP1
677         vpand           $AND_MASK, $ACC0, $ACC0
678         vpsrlq          \$29, $ACC1, $TEMP2
679         vpand           $AND_MASK, $ACC1, $ACC1
680         vpsrlq          \$29, $ACC2, $TEMP3
681         vpermq          \$0x93, $TEMP1, $TEMP1
682         vpand           $AND_MASK, $ACC2, $ACC2
683         vpsrlq          \$29, $ACC3, $TEMP4
684         vpermq          \$0x93, $TEMP2, $TEMP2
685         vpand           $AND_MASK, $ACC3, $ACC3
686         vpermq          \$0x93, $TEMP3, $TEMP3
687
688         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
689         vpermq          \$0x93, $TEMP4, $TEMP4
690         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
691         vpaddq          $TEMP0, $ACC0, $ACC0
692         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
693         vpaddq          $TEMP1, $ACC1, $ACC1
694         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
695         vpaddq          $TEMP2, $ACC2, $ACC2
696         vpblendd        \$3, $TEMP4, $ZERO, $TEMP4
697         vpaddq          $TEMP3, $ACC3, $ACC3
698         vpaddq          $TEMP4, $ACC4, $ACC4
699
700         vpsrlq          \$29, $ACC0, $TEMP1
701         vpand           $AND_MASK, $ACC0, $ACC0
702         vpsrlq          \$29, $ACC1, $TEMP2
703         vpand           $AND_MASK, $ACC1, $ACC1
704         vpsrlq          \$29, $ACC2, $TEMP3
705         vpermq          \$0x93, $TEMP1, $TEMP1
706         vpand           $AND_MASK, $ACC2, $ACC2
707         vpsrlq          \$29, $ACC3, $TEMP4
708         vpermq          \$0x93, $TEMP2, $TEMP2
709         vpand           $AND_MASK, $ACC3, $ACC3
710         vpermq          \$0x93, $TEMP3, $TEMP3
711
712         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
713         vpermq          \$0x93, $TEMP4, $TEMP4
714         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
715         vpaddq          $TEMP0, $ACC0, $ACC0
716         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
717         vpaddq          $TEMP1, $ACC1, $ACC1
718         vmovdqu         $ACC0, 32*0-128($rp)
719         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
720         vpaddq          $TEMP2, $ACC2, $ACC2
721         vmovdqu         $ACC1, 32*1-128($rp)
722         vpblendd        \$3, $TEMP4, $ZERO, $TEMP4
723         vpaddq          $TEMP3, $ACC3, $ACC3
724         vmovdqu         $ACC2, 32*2-128($rp)
725         vpaddq          $TEMP4, $ACC4, $ACC4
726         vmovdqu         $ACC3, 32*3-128($rp)
727 ___
728 $TEMP5=$ACC0;
729 $code.=<<___;
730         vpsrlq          \$29, $ACC4, $TEMP1
731         vpand           $AND_MASK, $ACC4, $ACC4
732         vpsrlq          \$29, $ACC5, $TEMP2
733         vpand           $AND_MASK, $ACC5, $ACC5
734         vpsrlq          \$29, $ACC6, $TEMP3
735         vpermq          \$0x93, $TEMP1, $TEMP1
736         vpand           $AND_MASK, $ACC6, $ACC6
737         vpsrlq          \$29, $ACC7, $TEMP4
738         vpermq          \$0x93, $TEMP2, $TEMP2
739         vpand           $AND_MASK, $ACC7, $ACC7
740         vpsrlq          \$29, $ACC8, $TEMP5
741         vpermq          \$0x93, $TEMP3, $TEMP3
742         vpand           $AND_MASK, $ACC8, $ACC8
743         vpermq          \$0x93, $TEMP4, $TEMP4
744
745         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
746         vpermq          \$0x93, $TEMP5, $TEMP5
747         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
748         vpaddq          $TEMP0, $ACC4, $ACC4
749         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
750         vpaddq          $TEMP1, $ACC5, $ACC5
751         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
752         vpaddq          $TEMP2, $ACC6, $ACC6
753         vpblendd        \$3, $TEMP4, $TEMP5, $TEMP4
754         vpaddq          $TEMP3, $ACC7, $ACC7
755         vpaddq          $TEMP4, $ACC8, $ACC8
756      
757         vpsrlq          \$29, $ACC4, $TEMP1
758         vpand           $AND_MASK, $ACC4, $ACC4
759         vpsrlq          \$29, $ACC5, $TEMP2
760         vpand           $AND_MASK, $ACC5, $ACC5
761         vpsrlq          \$29, $ACC6, $TEMP3
762         vpermq          \$0x93, $TEMP1, $TEMP1
763         vpand           $AND_MASK, $ACC6, $ACC6
764         vpsrlq          \$29, $ACC7, $TEMP4
765         vpermq          \$0x93, $TEMP2, $TEMP2
766         vpand           $AND_MASK, $ACC7, $ACC7
767         vpsrlq          \$29, $ACC8, $TEMP5
768         vpermq          \$0x93, $TEMP3, $TEMP3
769         vpand           $AND_MASK, $ACC8, $ACC8
770         vpermq          \$0x93, $TEMP4, $TEMP4
771
772         vpblendd        \$3, $ZERO, $TEMP1, $TEMP0
773         vpermq          \$0x93, $TEMP5, $TEMP5
774         vpblendd        \$3, $TEMP1, $TEMP2, $TEMP1
775         vpaddq          $TEMP0, $ACC4, $ACC4
776         vpblendd        \$3, $TEMP2, $TEMP3, $TEMP2
777         vpaddq          $TEMP1, $ACC5, $ACC5
778         vmovdqu         $ACC4, 32*4-128($rp)
779         vpblendd        \$3, $TEMP3, $TEMP4, $TEMP3
780         vpaddq          $TEMP2, $ACC6, $ACC6
781         vmovdqu         $ACC5, 32*5-128($rp)
782         vpblendd        \$3, $TEMP4, $TEMP5, $TEMP4
783         vpaddq          $TEMP3, $ACC7, $ACC7
784         vmovdqu         $ACC6, 32*6-128($rp)
785         vpaddq          $TEMP4, $ACC8, $ACC8
786         vmovdqu         $ACC7, 32*7-128($rp)
787         vmovdqu         $ACC8, 32*8-128($rp)
788
789         mov     $rp, $ap
790         dec     $rep
791         jne     .LOOP_GRANDE_SQR_1024
792
793         vzeroall
794         mov     %rbp, %rax
795 ___
796 $code.=<<___ if ($win64);
797         movaps  -0xd8(%rax),%xmm6
798         movaps  -0xc8(%rax),%xmm7
799         movaps  -0xb8(%rax),%xmm8
800         movaps  -0xa8(%rax),%xmm9
801         movaps  -0x98(%rax),%xmm10
802         movaps  -0x88(%rax),%xmm11
803         movaps  -0x78(%rax),%xmm12
804         movaps  -0x68(%rax),%xmm13
805         movaps  -0x58(%rax),%xmm14
806         movaps  -0x48(%rax),%xmm15
807 ___
808 $code.=<<___;
809         mov     -48(%rax),%r15
810         mov     -40(%rax),%r14
811         mov     -32(%rax),%r13
812         mov     -24(%rax),%r12
813         mov     -16(%rax),%rbp
814         mov     -8(%rax),%rbx
815         lea     (%rax),%rsp             # restore %rsp
816 .Lsqr_1024_epilogue:
817         ret
818 .size   rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
819 ___
820 }
821
822 { # void AMM_WW(
823 my $rp="%rdi";  # BN_ULONG *rp,
824 my $ap="%rsi";  # const BN_ULONG *ap,
825 my $bp="%rdx";  # const BN_ULONG *bp,
826 my $np="%rcx";  # const BN_ULONG *np,
827 my $n0="%r8d";  # unsigned int n0);
828
829 # The registers that hold the accumulated redundant result
830 # The AMM works on 1024 bit operands, and redundant word size is 29
831 # Therefore: ceil(1024/29)/4 = 9
832 my $ACC0="%ymm0";
833 my $ACC1="%ymm1";
834 my $ACC2="%ymm2";
835 my $ACC3="%ymm3";
836 my $ACC4="%ymm4";
837 my $ACC5="%ymm5";
838 my $ACC6="%ymm6";
839 my $ACC7="%ymm7";
840 my $ACC8="%ymm8";
841 my $ACC9="%ymm9";
842
843 # Registers that hold the broadcasted words of multiplier, currently used
844 my $Bi="%ymm10";
845 my $Yi="%ymm11";
846
847 # Helper registers
848 my $TEMP0=$ACC0;
849 my $TEMP1="%ymm12";
850 my $TEMP2="%ymm13";
851 my $ZERO="%ymm14";
852 my $AND_MASK="%ymm15";
853
854 # alu registers that hold the first words of the ACC
855 my $r0="%r9";
856 my $r1="%r10";
857 my $r2="%r11";
858 my $r3="%r12";
859
860 my $i="%r14d";
861 my $tmp="%r15";
862
863 $bp="%r13";     # reassigned argument
864
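# A scalar sketch, for exposition only (never called, hypothetical name), of
# the interleaved multiply-and-reduce done by .Loop_mul_1024 below: for each
# base-2^29 digit of $bp the accumulator gains b_i*A, is made divisible by
# 2^29 with the help of $n0, and is shifted down one digit, so that after all
# 36 digits the result is congruent to A*B*2^(-29*36) mod N (no final
# subtraction is performed).  Operands are Math::BigInt.
sub __amm_mul_sketch {
	my ($a, $b, $n, $n0, $ndigits) = @_;	# $ndigits is 36 for 1024 bits
	my $mask = (1 << 29) - 1;
	my $acc  = 0 * $a;			# Math::BigInt zero
	for my $i (0 .. $ndigits - 1) {
		my $bi = ($b >> (29 * $i)) & $mask;	# next broadcast word of B
		$acc += $bi * $a;			# the vpmuludq/vpaddq columns
		my $y  = (($acc & $mask) * ($n0 & $mask)) & $mask;
		$acc += $y * $n;			# low digit becomes zero
		$acc  = $acc >> 29;			# drop it
	}
	return $acc;				# may still be >= $n
}
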
865 $code.=<<___;
866 .globl  rsaz_1024_mul_avx2
867 .type   rsaz_1024_mul_avx2,\@function,5
868 .align  64
869 rsaz_1024_mul_avx2:
870         lea     (%rsp), %rax
871         push    %rbx
872         push    %rbp
873         push    %r12
874         push    %r13
875         push    %r14
876         push    %r15
877 ___
878 $code.=<<___ if ($win64);
879         lea     -0xa8(%rsp),%rsp
880         movaps  %xmm6,-0xd8(%rax)
881         movaps  %xmm7,-0xc8(%rax)
882         movaps  %xmm8,-0xb8(%rax)
883         movaps  %xmm9,-0xa8(%rax)
884         movaps  %xmm10,-0x98(%rax)
885         movaps  %xmm11,-0x88(%rax)
886         movaps  %xmm12,-0x78(%rax)
887         movaps  %xmm13,-0x68(%rax)
888         movaps  %xmm14,-0x58(%rax)
889         movaps  %xmm15,-0x48(%rax)
890 .Lmul_1024_body:
891 ___
892 $code.=<<___;
893         mov     %rax,%rbp
894         vzeroall
895         mov     %rdx, $bp       # reassigned argument
896         sub     \$64,%rsp
897
898         # unaligned 256-bit load that crosses page boundary can
899         # cause severe performance degradation here, so if $ap does
900         # cross page boundary, swap it with $bp [meaning that caller
901         # is advised to lay down $ap and $bp next to each other, so
902         # that only one can cross page boundary].
903         mov     $ap, $tmp
904         and     \$4095, $tmp
905         add     \$32*10, $tmp
906         shr     \$12, $tmp
907         mov     $ap, $tmp
908         cmovnz  $bp, $ap
909         cmovnz  $tmp, $bp
910
911         mov     $np, $tmp
912         sub     \$-128,$ap      # size optimization
913         sub     \$-128,$np
914         sub     \$-128,$rp
915
916         and     \$4095, $tmp    # see if $np crosses page
917         add     \$32*10, $tmp
918         shr     \$12, $tmp
919         jz      .Lmul_1024_no_n_copy
920
921         # unaligned 256-bit load that crosses page boundary can
922         # cause severe performance degradation here, so if $np does
923         # cross page boundary, copy it to stack and make sure stack
924         # frame doesn't...
925         sub             \$32*10,%rsp
926         vmovdqu         32*0-128($np), $ACC0
927         and             \$-512, %rsp
928         vmovdqu         32*1-128($np), $ACC1
929         vmovdqu         32*2-128($np), $ACC2
930         vmovdqu         32*3-128($np), $ACC3
931         vmovdqu         32*4-128($np), $ACC4
932         vmovdqu         32*5-128($np), $ACC5
933         vmovdqu         32*6-128($np), $ACC6
934         vmovdqu         32*7-128($np), $ACC7
935         vmovdqu         32*8-128($np), $ACC8
936         lea             64+128(%rsp),$np
937         vmovdqu         $ACC0, 32*0-128($np)
938         vpxor           $ACC0, $ACC0, $ACC0
939         vmovdqu         $ACC1, 32*1-128($np)
940         vpxor           $ACC1, $ACC1, $ACC1
941         vmovdqu         $ACC2, 32*2-128($np)
942         vpxor           $ACC2, $ACC2, $ACC2
943         vmovdqu         $ACC3, 32*3-128($np)
944         vpxor           $ACC3, $ACC3, $ACC3
945         vmovdqu         $ACC4, 32*4-128($np)
946         vpxor           $ACC4, $ACC4, $ACC4
947         vmovdqu         $ACC5, 32*5-128($np)
948         vpxor           $ACC5, $ACC5, $ACC5
949         vmovdqu         $ACC6, 32*6-128($np)
950         vpxor           $ACC6, $ACC6, $ACC6
951         vmovdqu         $ACC7, 32*7-128($np)
952         vpxor           $ACC7, $ACC7, $ACC7
953         vmovdqu         $ACC8, 32*8-128($np)
954         vmovdqa         $ACC0, $ACC8
955         vmovdqu         $ACC9, 32*9-128($np)    # $ACC9 is zero after vzeroall
956 .Lmul_1024_no_n_copy:
957         and     \$-64,%rsp
958
959         mov     ($bp), %rbx
960         vpbroadcastq ($bp), $Bi
961         vmovdqu $ACC0, (%rsp)                   # clear top of stack
962         xor     $r0, $r0
963         xor     $r1, $r1
964         xor     $r2, $r2
965         xor     $r3, $r3
966
967         vmovdqu .Land_mask(%rip), $AND_MASK
968         mov     \$9, $i
969         jmp     .Loop_mul_1024
970
971 .align  32
972 .Loop_mul_1024:
973          vpsrlq         \$29, $ACC3, $ACC9              # correct $ACC3(*)
974         mov     %rbx, %rax
975         imulq   -128($ap), %rax
976         add     $r0, %rax
977         mov     %rbx, $r1
978         imulq   8-128($ap), $r1
979         add     8(%rsp), $r1
980
981         mov     %rax, $r0
982         imull   $n0, %eax
983         and     \$0x1fffffff, %eax
984
985          mov    %rbx, $r2
986          imulq  16-128($ap), $r2
987          add    16(%rsp), $r2
988
989          mov    %rbx, $r3
990          imulq  24-128($ap), $r3
991          add    24(%rsp), $r3
992         vpmuludq        32*1-128($ap),$Bi,$TEMP0
993          vmovd          %eax, $Yi
994         vpaddq          $TEMP0,$ACC1,$ACC1
995         vpmuludq        32*2-128($ap),$Bi,$TEMP1
996          vpbroadcastq   $Yi, $Yi
997         vpaddq          $TEMP1,$ACC2,$ACC2
998         vpmuludq        32*3-128($ap),$Bi,$TEMP2
999          vpand          $AND_MASK, $ACC3, $ACC3         # correct $ACC3
1000         vpaddq          $TEMP2,$ACC3,$ACC3
1001         vpmuludq        32*4-128($ap),$Bi,$TEMP0
1002         vpaddq          $TEMP0,$ACC4,$ACC4
1003         vpmuludq        32*5-128($ap),$Bi,$TEMP1
1004         vpaddq          $TEMP1,$ACC5,$ACC5
1005         vpmuludq        32*6-128($ap),$Bi,$TEMP2
1006         vpaddq          $TEMP2,$ACC6,$ACC6
1007         vpmuludq        32*7-128($ap),$Bi,$TEMP0
1008          vpermq         \$0x93, $ACC9, $ACC9            # correct $ACC3
1009         vpaddq          $TEMP0,$ACC7,$ACC7
1010         vpmuludq        32*8-128($ap),$Bi,$TEMP1
1011          vpbroadcastq   8($bp), $Bi
1012         vpaddq          $TEMP1,$ACC8,$ACC8
1013
1014         mov     %rax,%rdx
1015         imulq   -128($np),%rax
1016         add     %rax,$r0
1017         mov     %rdx,%rax
1018         imulq   8-128($np),%rax
1019         add     %rax,$r1
1020         mov     %rdx,%rax
1021         imulq   16-128($np),%rax
1022         add     %rax,$r2
1023         shr     \$29, $r0
1024         imulq   24-128($np),%rdx
1025         add     %rdx,$r3
1026         add     $r0, $r1
1027
1028         vpmuludq        32*1-128($np),$Yi,$TEMP2
1029          vmovq          $Bi, %rbx
1030         vpaddq          $TEMP2,$ACC1,$ACC1
1031         vpmuludq        32*2-128($np),$Yi,$TEMP0
1032         vpaddq          $TEMP0,$ACC2,$ACC2
1033         vpmuludq        32*3-128($np),$Yi,$TEMP1
1034         vpaddq          $TEMP1,$ACC3,$ACC3
1035         vpmuludq        32*4-128($np),$Yi,$TEMP2
1036         vpaddq          $TEMP2,$ACC4,$ACC4
1037         vpmuludq        32*5-128($np),$Yi,$TEMP0
1038         vpaddq          $TEMP0,$ACC5,$ACC5
1039         vpmuludq        32*6-128($np),$Yi,$TEMP1
1040         vpaddq          $TEMP1,$ACC6,$ACC6
1041         vpmuludq        32*7-128($np),$Yi,$TEMP2
1042          vpblendd       \$3, $ZERO, $ACC9, $ACC9        # correct $ACC3
1043         vpaddq          $TEMP2,$ACC7,$ACC7
1044         vpmuludq        32*8-128($np),$Yi,$TEMP0
1045          vpaddq         $ACC9, $ACC3, $ACC3             # correct $ACC3
1046         vpaddq          $TEMP0,$ACC8,$ACC8
1047
1048         mov     %rbx, %rax
1049         imulq   -128($ap),%rax
1050         add     %rax,$r1
1051          vmovdqu        -8+32*1-128($ap),$TEMP1
1052         mov     %rbx, %rax
1053         imulq   8-128($ap),%rax
1054         add     %rax,$r2
1055          vmovdqu        -8+32*2-128($ap),$TEMP2
1056
1057         mov     $r1, %rax
1058         imull   $n0, %eax
1059         and     \$0x1fffffff, %eax
1060
1061          imulq  16-128($ap),%rbx
1062          add    %rbx,$r3
1063         vpmuludq        $Bi,$TEMP1,$TEMP1
1064          vmovd          %eax, $Yi
1065         vmovdqu         -8+32*3-128($ap),$TEMP0
1066         vpaddq          $TEMP1,$ACC1,$ACC1
1067         vpmuludq        $Bi,$TEMP2,$TEMP2
1068          vpbroadcastq   $Yi, $Yi
1069         vmovdqu         -8+32*4-128($ap),$TEMP1
1070         vpaddq          $TEMP2,$ACC2,$ACC2
1071         vpmuludq        $Bi,$TEMP0,$TEMP0
1072         vmovdqu         -8+32*5-128($ap),$TEMP2
1073         vpaddq          $TEMP0,$ACC3,$ACC3
1074         vpmuludq        $Bi,$TEMP1,$TEMP1
1075         vmovdqu         -8+32*6-128($ap),$TEMP0
1076         vpaddq          $TEMP1,$ACC4,$ACC4
1077         vpmuludq        $Bi,$TEMP2,$TEMP2
1078         vmovdqu         -8+32*7-128($ap),$TEMP1
1079         vpaddq          $TEMP2,$ACC5,$ACC5
1080         vpmuludq        $Bi,$TEMP0,$TEMP0
1081         vmovdqu         -8+32*8-128($ap),$TEMP2
1082         vpaddq          $TEMP0,$ACC6,$ACC6
1083         vpmuludq        $Bi,$TEMP1,$TEMP1
1084         vmovdqu         -8+32*9-128($ap),$ACC9
1085         vpaddq          $TEMP1,$ACC7,$ACC7
1086         vpmuludq        $Bi,$TEMP2,$TEMP2
1087         vpaddq          $TEMP2,$ACC8,$ACC8
1088         vpmuludq        $Bi,$ACC9,$ACC9
1089          vpbroadcastq   16($bp), $Bi
1090
1091         mov     %rax,%rdx
1092         imulq   -128($np),%rax
1093         add     %rax,$r1
1094          vmovdqu        -8+32*1-128($np),$TEMP0
1095         mov     %rdx,%rax
1096         imulq   8-128($np),%rax
1097         add     %rax,$r2
1098          vmovdqu        -8+32*2-128($np),$TEMP1
1099         shr     \$29, $r1
1100         imulq   16-128($np),%rdx
1101         add     %rdx,$r3
1102         add     $r1, $r2
1103
1104         vpmuludq        $Yi,$TEMP0,$TEMP0
1105          vmovq          $Bi, %rbx
1106         vmovdqu         -8+32*3-128($np),$TEMP2
1107         vpaddq          $TEMP0,$ACC1,$ACC1
1108         vpmuludq        $Yi,$TEMP1,$TEMP1
1109         vmovdqu         -8+32*4-128($np),$TEMP0
1110         vpaddq          $TEMP1,$ACC2,$ACC2
1111         vpmuludq        $Yi,$TEMP2,$TEMP2
1112         vmovdqu         -8+32*5-128($np),$TEMP1
1113         vpaddq          $TEMP2,$ACC3,$ACC3
1114         vpmuludq        $Yi,$TEMP0,$TEMP0
1115         vmovdqu         -8+32*6-128($np),$TEMP2
1116         vpaddq          $TEMP0,$ACC4,$ACC4
1117         vpmuludq        $Yi,$TEMP1,$TEMP1
1118         vmovdqu         -8+32*7-128($np),$TEMP0
1119         vpaddq          $TEMP1,$ACC5,$ACC5
1120         vpmuludq        $Yi,$TEMP2,$TEMP2
1121         vmovdqu         -8+32*8-128($np),$TEMP1
1122         vpaddq          $TEMP2,$ACC6,$ACC6
1123         vpmuludq        $Yi,$TEMP0,$TEMP0
1124         vmovdqu         -8+32*9-128($np),$TEMP2
1125         vpaddq          $TEMP0,$ACC7,$ACC7
1126         vpmuludq        $Yi,$TEMP1,$TEMP1
1127         vpaddq          $TEMP1,$ACC8,$ACC8
1128         vpmuludq        $Yi,$TEMP2,$TEMP2
1129         vpaddq          $TEMP2,$ACC9,$ACC9
1130
1131          vmovdqu        -16+32*1-128($ap),$TEMP0
1132         mov     %rbx,%rax
1133         imulq   -128($ap),%rax
1134         add     $r2,%rax
1135
1136          vmovdqu        -16+32*2-128($ap),$TEMP1
1137         mov     %rax,$r2
1138         imull   $n0, %eax
1139         and     \$0x1fffffff, %eax
1140
1141          imulq  8-128($ap),%rbx
1142          add    %rbx,$r3
1143         vpmuludq        $Bi,$TEMP0,$TEMP0
1144          vmovd          %eax, $Yi
1145         vmovdqu         -16+32*3-128($ap),$TEMP2
1146         vpaddq          $TEMP0,$ACC1,$ACC1
1147         vpmuludq        $Bi,$TEMP1,$TEMP1
1148          vpbroadcastq   $Yi, $Yi
1149         vmovdqu         -16+32*4-128($ap),$TEMP0
1150         vpaddq          $TEMP1,$ACC2,$ACC2
1151         vpmuludq        $Bi,$TEMP2,$TEMP2
1152         vmovdqu         -16+32*5-128($ap),$TEMP1
1153         vpaddq          $TEMP2,$ACC3,$ACC3
1154         vpmuludq        $Bi,$TEMP0,$TEMP0
1155         vmovdqu         -16+32*6-128($ap),$TEMP2
1156         vpaddq          $TEMP0,$ACC4,$ACC4
1157         vpmuludq        $Bi,$TEMP1,$TEMP1
1158         vmovdqu         -16+32*7-128($ap),$TEMP0
1159         vpaddq          $TEMP1,$ACC5,$ACC5
1160         vpmuludq        $Bi,$TEMP2,$TEMP2
1161         vmovdqu         -16+32*8-128($ap),$TEMP1
1162         vpaddq          $TEMP2,$ACC6,$ACC6
1163         vpmuludq        $Bi,$TEMP0,$TEMP0
1164         vmovdqu         -16+32*9-128($ap),$TEMP2
1165         vpaddq          $TEMP0,$ACC7,$ACC7
1166         vpmuludq        $Bi,$TEMP1,$TEMP1
1167         vpaddq          $TEMP1,$ACC8,$ACC8
1168         vpmuludq        $Bi,$TEMP2,$TEMP2
1169          vpbroadcastq   24($bp), $Bi
1170         vpaddq          $TEMP2,$ACC9,$ACC9
1171
1172          vmovdqu        -16+32*1-128($np),$TEMP0
1173         mov     %rax,%rdx
1174         imulq   -128($np),%rax
1175         add     %rax,$r2
1176          vmovdqu        -16+32*2-128($np),$TEMP1
1177         imulq   8-128($np),%rdx
1178         add     %rdx,$r3
1179         shr     \$29, $r2
1180
1181         vpmuludq        $Yi,$TEMP0,$TEMP0
1182          vmovq          $Bi, %rbx
1183         vmovdqu         -16+32*3-128($np),$TEMP2
1184         vpaddq          $TEMP0,$ACC1,$ACC1
1185         vpmuludq        $Yi,$TEMP1,$TEMP1
1186         vmovdqu         -16+32*4-128($np),$TEMP0
1187         vpaddq          $TEMP1,$ACC2,$ACC2
1188         vpmuludq        $Yi,$TEMP2,$TEMP2
1189         vmovdqu         -16+32*5-128($np),$TEMP1
1190         vpaddq          $TEMP2,$ACC3,$ACC3
1191         vpmuludq        $Yi,$TEMP0,$TEMP0
1192         vmovdqu         -16+32*6-128($np),$TEMP2
1193         vpaddq          $TEMP0,$ACC4,$ACC4
1194         vpmuludq        $Yi,$TEMP1,$TEMP1
1195         vmovdqu         -16+32*7-128($np),$TEMP0
1196         vpaddq          $TEMP1,$ACC5,$ACC5
1197         vpmuludq        $Yi,$TEMP2,$TEMP2
1198         vmovdqu         -16+32*8-128($np),$TEMP1
1199         vpaddq          $TEMP2,$ACC6,$ACC6
1200         vpmuludq        $Yi,$TEMP0,$TEMP0
1201         vmovdqu         -16+32*9-128($np),$TEMP2
1202         vpaddq          $TEMP0,$ACC7,$ACC7
1203         vpmuludq        $Yi,$TEMP1,$TEMP1
1204          vmovdqu        -24+32*1-128($ap),$TEMP0
1205         vpaddq          $TEMP1,$ACC8,$ACC8
1206         vpmuludq        $Yi,$TEMP2,$TEMP2
1207          vmovdqu        -24+32*2-128($ap),$TEMP1
1208         vpaddq          $TEMP2,$ACC9,$ACC9
1209
1210         add     $r2, $r3
1211         imulq   -128($ap),%rbx
1212         add     %rbx,$r3
1213
1214         mov     $r3, %rax
1215         imull   $n0, %eax
1216         and     \$0x1fffffff, %eax
1217
1218         vpmuludq        $Bi,$TEMP0,$TEMP0
1219          vmovd          %eax, $Yi
1220         vmovdqu         -24+32*3-128($ap),$TEMP2
1221         vpaddq          $TEMP0,$ACC1,$ACC1
1222         vpmuludq        $Bi,$TEMP1,$TEMP1
1223          vpbroadcastq   $Yi, $Yi
1224         vmovdqu         -24+32*4-128($ap),$TEMP0
1225         vpaddq          $TEMP1,$ACC2,$ACC2
1226         vpmuludq        $Bi,$TEMP2,$TEMP2
1227         vmovdqu         -24+32*5-128($ap),$TEMP1
1228         vpaddq          $TEMP2,$ACC3,$ACC3
1229         vpmuludq        $Bi,$TEMP0,$TEMP0
1230         vmovdqu         -24+32*6-128($ap),$TEMP2
1231         vpaddq          $TEMP0,$ACC4,$ACC4
1232         vpmuludq        $Bi,$TEMP1,$TEMP1
1233         vmovdqu         -24+32*7-128($ap),$TEMP0
1234         vpaddq          $TEMP1,$ACC5,$ACC5
1235         vpmuludq        $Bi,$TEMP2,$TEMP2
1236         vmovdqu         -24+32*8-128($ap),$TEMP1
1237         vpaddq          $TEMP2,$ACC6,$ACC6
1238         vpmuludq        $Bi,$TEMP0,$TEMP0
1239         vmovdqu         -24+32*9-128($ap),$TEMP2
1240         vpaddq          $TEMP0,$ACC7,$ACC7
1241         vpmuludq        $Bi,$TEMP1,$TEMP1
1242         vpaddq          $TEMP1,$ACC8,$ACC8
1243         vpmuludq        $Bi,$TEMP2,$TEMP2
1244          vpbroadcastq   32($bp), $Bi
1245         vpaddq          $TEMP2,$ACC9,$ACC9
1246          add            \$32, $bp                       # $bp++
1247
1248         vmovdqu         -24+32*1-128($np),$TEMP0
1249         imulq   -128($np),%rax
1250         add     %rax,$r3
1251         shr     \$29, $r3
1252
1253         vmovdqu         -24+32*2-128($np),$TEMP1
1254         vpmuludq        $Yi,$TEMP0,$TEMP0
1255          vmovq          $Bi, %rbx
1256         vmovdqu         -24+32*3-128($np),$TEMP2
1257         vpaddq          $TEMP0,$ACC1,$ACC0              # $ACC0==$TEMP0
1258         vpmuludq        $Yi,$TEMP1,$TEMP1
1259          vmovdqu        $ACC0, (%rsp)                   # transfer $r0-$r3
1260         vpaddq          $TEMP1,$ACC2,$ACC1
1261         vmovdqu         -24+32*4-128($np),$TEMP0
1262         vpmuludq        $Yi,$TEMP2,$TEMP2
1263         vmovdqu         -24+32*5-128($np),$TEMP1
1264         vpaddq          $TEMP2,$ACC3,$ACC2
1265         vpmuludq        $Yi,$TEMP0,$TEMP0
1266         vmovdqu         -24+32*6-128($np),$TEMP2
1267         vpaddq          $TEMP0,$ACC4,$ACC3
1268         vpmuludq        $Yi,$TEMP1,$TEMP1
1269         vmovdqu         -24+32*7-128($np),$TEMP0
1270         vpaddq          $TEMP1,$ACC5,$ACC4
1271         vpmuludq        $Yi,$TEMP2,$TEMP2
1272         vmovdqu         -24+32*8-128($np),$TEMP1
1273         vpaddq          $TEMP2,$ACC6,$ACC5
1274         vpmuludq        $Yi,$TEMP0,$TEMP0
1275         vmovdqu         -24+32*9-128($np),$TEMP2
1276          mov    $r3, $r0
1277         vpaddq          $TEMP0,$ACC7,$ACC6
1278         vpmuludq        $Yi,$TEMP1,$TEMP1
1279          add    (%rsp), $r0
1280         vpaddq          $TEMP1,$ACC8,$ACC7
1281         vpmuludq        $Yi,$TEMP2,$TEMP2
1282          vmovq  $r3, $TEMP1
1283         vpaddq          $TEMP2,$ACC9,$ACC8
1284
1285         dec     $i
1286         jnz     .Loop_mul_1024
1287 ___
1288
1289 # (*)   The original implementation corrected ACC1-ACC3 for overflow
1290 #       after 7 loop runs, i.e. after 28 iterations, or 56 additions.
1291 #       But since we underutilize resources, it is possible to correct
1292 #       in each iteration at marginal performance loss. And because we
1293 #       do it in each iteration, we can correct fewer digits, which
1294 #       avoids the performance penalty completely. Also note that we
1295 #       correct only three digits out of four; this works because the
1296 #       most significant digit is subjected to fewer additions.
1297
$TEMP0 = $ACC9;
$TEMP3 = $Bi;
$TEMP4 = $Yi;
$code.=<<___;
	vpermq		\$0, $AND_MASK, $AND_MASK
	vpaddq		(%rsp), $TEMP1, $ACC0

	vpsrlq		\$29, $ACC0, $TEMP1
	vpand		$AND_MASK, $ACC0, $ACC0
	vpsrlq		\$29, $ACC1, $TEMP2
	vpand		$AND_MASK, $ACC1, $ACC1
	vpsrlq		\$29, $ACC2, $TEMP3
	vpermq		\$0x93, $TEMP1, $TEMP1
	vpand		$AND_MASK, $ACC2, $ACC2
	vpsrlq		\$29, $ACC3, $TEMP4
	vpermq		\$0x93, $TEMP2, $TEMP2
	vpand		$AND_MASK, $ACC3, $ACC3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq		\$0x93, $TEMP3, $TEMP3
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpermq		\$0x93, $TEMP4, $TEMP4
	vpaddq		$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq		$TEMP1, $ACC1, $ACC1
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq		$TEMP2, $ACC2, $ACC2
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq		$TEMP3, $ACC3, $ACC3
	vpaddq		$TEMP4, $ACC4, $ACC4

	vpsrlq		\$29, $ACC0, $TEMP1
	vpand		$AND_MASK, $ACC0, $ACC0
	vpsrlq		\$29, $ACC1, $TEMP2
	vpand		$AND_MASK, $ACC1, $ACC1
	vpsrlq		\$29, $ACC2, $TEMP3
	vpermq		\$0x93, $TEMP1, $TEMP1
	vpand		$AND_MASK, $ACC2, $ACC2
	vpsrlq		\$29, $ACC3, $TEMP4
	vpermq		\$0x93, $TEMP2, $TEMP2
	vpand		$AND_MASK, $ACC3, $ACC3
	vpermq		\$0x93, $TEMP3, $TEMP3

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq		\$0x93, $TEMP4, $TEMP4
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq		$TEMP0, $ACC0, $ACC0
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq		$TEMP1, $ACC1, $ACC1
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq		$TEMP2, $ACC2, $ACC2
	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
	vpaddq		$TEMP3, $ACC3, $ACC3
	vpaddq		$TEMP4, $ACC4, $ACC4

	vmovdqu		$ACC0, 0-128($rp)
	vmovdqu		$ACC1, 32-128($rp)
	vmovdqu		$ACC2, 64-128($rp)
	vmovdqu		$ACC3, 96-128($rp)
___

$TEMP5=$ACC0;
$code.=<<___;
	vpsrlq		\$29, $ACC4, $TEMP1
	vpand		$AND_MASK, $ACC4, $ACC4
	vpsrlq		\$29, $ACC5, $TEMP2
	vpand		$AND_MASK, $ACC5, $ACC5
	vpsrlq		\$29, $ACC6, $TEMP3
	vpermq		\$0x93, $TEMP1, $TEMP1
	vpand		$AND_MASK, $ACC6, $ACC6
	vpsrlq		\$29, $ACC7, $TEMP4
	vpermq		\$0x93, $TEMP2, $TEMP2
	vpand		$AND_MASK, $ACC7, $ACC7
	vpsrlq		\$29, $ACC8, $TEMP5
	vpermq		\$0x93, $TEMP3, $TEMP3
	vpand		$AND_MASK, $ACC8, $ACC8
	vpermq		\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq		\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq		$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq		$TEMP1, $ACC5, $ACC5
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq		$TEMP2, $ACC6, $ACC6
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq		$TEMP3, $ACC7, $ACC7
	vpaddq		$TEMP4, $ACC8, $ACC8

	vpsrlq		\$29, $ACC4, $TEMP1
	vpand		$AND_MASK, $ACC4, $ACC4
	vpsrlq		\$29, $ACC5, $TEMP2
	vpand		$AND_MASK, $ACC5, $ACC5
	vpsrlq		\$29, $ACC6, $TEMP3
	vpermq		\$0x93, $TEMP1, $TEMP1
	vpand		$AND_MASK, $ACC6, $ACC6
	vpsrlq		\$29, $ACC7, $TEMP4
	vpermq		\$0x93, $TEMP2, $TEMP2
	vpand		$AND_MASK, $ACC7, $ACC7
	vpsrlq		\$29, $ACC8, $TEMP5
	vpermq		\$0x93, $TEMP3, $TEMP3
	vpand		$AND_MASK, $ACC8, $ACC8
	vpermq		\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq		\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq		$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq		$TEMP1, $ACC5, $ACC5
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq		$TEMP2, $ACC6, $ACC6
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq		$TEMP3, $ACC7, $ACC7
	vpaddq		$TEMP4, $ACC8, $ACC8

	vmovdqu		$ACC4, 128-128($rp)
	vmovdqu		$ACC5, 160-128($rp)
	vmovdqu		$ACC6, 192-128($rp)
	vmovdqu		$ACC7, 224-128($rp)
	vmovdqu		$ACC8, 256-128($rp)
	vzeroupper

	mov	%rbp, %rax
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lmul_1024_epilogue:
	ret
.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
___
}
{
my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
my @T = map("%r$_",(8..11));

$code.=<<___;
.globl	rsaz_1024_red2norm_avx2
.type	rsaz_1024_red2norm_avx2,\@abi-omnipotent
.align	32
rsaz_1024_red2norm_avx2:
	sub	\$-128,$inp	# size optimization
	xor	%rax,%rax
___

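# The loop below emits straight-line code that converts the redundant
# 2^29 representation back to the canonical 2^64 one.  For each of the
# 16 output words it loads every 29-bit digit overlapping the
# corresponding 64-bit window, shifts each digit into position, and sums
# them in %rax; the bits of the topmost digit that spill past the window
# become the carry into the next word.  Roughly (an illustrative formula,
# not literal code):
#
#	out[i]    = carry_in + sum_over_j( digit[j] << (29*j - 64*i) )
#	carry_in' = bits of the last digit shifted out of word i
#
# plus the carry bit of the 64-bit additions, absorbed by the adc below.
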
for ($j=0,$i=0; $i<16; $i++) {
    my $k=0;
    while (29*$j<64*($i+1)) {	# load data till boundary
	$code.="	mov	`8*$j-128`($inp), @T[0]\n";
	$j++; $k++; push(@T,shift(@T));
    }
    $l=$k;
    while ($k>1) {		# shift all loaded values but the last one
	$code.="	shl	\$`29*($j-$k)`,@T[-$k]\n";
	$k--;
    }
    $code.=<<___;		# shift last value
	mov	@T[-1], @T[0]
	shl	\$`29*($j-1)`, @T[-1]
	shr	\$`-29*($j-1)`, @T[0]
___
    while ($l) {		# accumulate all values
	$code.="	add	@T[-$l], %rax\n";
	$l--;
    }
	$code.=<<___;
	adc	\$0, @T[0]	# consume any carry
	mov	%rax, 8*$i($out)
	mov	@T[0], %rax
___
    push(@T,shift(@T));
}
$code.=<<___;
	ret
.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2

.globl	rsaz_1024_norm2red_avx2
.type	rsaz_1024_norm2red_avx2,\@abi-omnipotent
.align	32
rsaz_1024_norm2red_avx2:
	sub	\$-128,$out	# size optimization
	mov	($inp),@T[0]
	mov	\$0x1fffffff,%eax
___
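# The loop below performs the opposite conversion: it splits 16 canonical
# 64-bit words into 29-bit digits, i.e. digit[j] = (x >> (29*j)) & 0x1fffffff
# when x is viewed as one 1024-bit integer.  Digits contained in a single
# word are extracted with shr+and, digits straddling a word boundary with
# shrd pulling in bits from the next word; %rax holds the 0x1fffffff mask
# throughout, and the tail of the output is zero-padded.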
for ($j=0,$i=0; $i<16; $i++) {
    $code.="	mov	`8*($i+1)`($inp),@T[1]\n"	if ($i<15);
    $code.="	xor	@T[1],@T[1]\n"			if ($i==15);
    my $k=1;
    while (29*($j+1)<64*($i+1)) {
	$code.=<<___;
	mov	@T[0],@T[-$k]
	shr	\$`29*$j`,@T[-$k]
	and	%rax,@T[-$k]				# &0x1fffffff
	mov	@T[-$k],`8*$j-128`($out)
___
	$j++; $k++;
    }
    $code.=<<___;
	shrd	\$`29*$j`,@T[1],@T[0]
	and	%rax,@T[0]
	mov	@T[0],`8*$j-128`($out)
___
    $j++;
    push(@T,shift(@T));
}
$code.=<<___;
	mov	@T[0],`8*$j-128`($out)			# zero
	mov	@T[0],`8*($j+1)-128`($out)
	mov	@T[0],`8*($j+2)-128`($out)
	mov	@T[0],`8*($j+3)-128`($out)
	ret
.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
___
}
{
my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");

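# rsaz_1024_scatter5_avx2 stores one 1024-bit value into a table of 32
# pre-computed powers ("5" = 5-bit window) using a 16-byte stride per
# entry, and rsaz_1024_gather5_avx2 reads one entry back.  The gather
# side loads from all candidate cache lines on every iteration and picks
# the requested one with byte masks, so that, at cache-line granularity,
# the access pattern does not depend on which power is being fetched.
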
$code.=<<___;
.globl	rsaz_1024_scatter5_avx2
.type	rsaz_1024_scatter5_avx2,\@abi-omnipotent
.align	32
rsaz_1024_scatter5_avx2:
	vzeroupper
	vmovdqu	.Lscatter_permd(%rip),%ymm5
	shl	\$4,$power
	lea	($out,$power),$out
	mov	\$9,%eax
	jmp	.Loop_scatter_1024

.align	32
.Loop_scatter_1024:
	vmovdqu		($inp),%ymm0
	lea		32($inp),$inp
	vpermd		%ymm0,%ymm5,%ymm0
	vmovdqu		%xmm0,($out)
	lea		16*32($out),$out
	dec	%eax
	jnz	.Loop_scatter_1024

	vzeroupper
	ret
.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2

.globl	rsaz_1024_gather5_avx2
.type	rsaz_1024_gather5_avx2,\@abi-omnipotent
.align	32
rsaz_1024_gather5_avx2:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_rsaz_1024_gather5:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	vzeroupper
	lea	.Lgather_table(%rip),%r11
	mov	$power,%eax
	and	\$3,$power
	shr	\$2,%eax			# cache line number
	shl	\$4,$power			# offset within cache line

	vmovdqu		-32(%r11),%ymm7		# .Lgather_permd
	vpbroadcastb	8(%r11,%rax), %xmm8
	vpbroadcastb	7(%r11,%rax), %xmm9
	vpbroadcastb	6(%r11,%rax), %xmm10
	vpbroadcastb	5(%r11,%rax), %xmm11
	vpbroadcastb	4(%r11,%rax), %xmm12
	vpbroadcastb	3(%r11,%rax), %xmm13
	vpbroadcastb	2(%r11,%rax), %xmm14
	vpbroadcastb	1(%r11,%rax), %xmm15

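	# each of the eight byte masks prepared above is all-ones for
	# exactly one of the eight cache lines spanned by a table row and
	# zero for the others, so the loop below reads all eight 16-byte
	# candidates every time and lets vpand/vpor keep only the one
	# that belongs to the requested power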
	lea	($inp,$power),$inp
	mov	\$64,%r11			# size optimization
	mov	\$9,%eax
	jmp	.Loop_gather_1024

.align	32
.Loop_gather_1024:
	vpand		($inp),			%xmm8,%xmm0
	vpand		($inp,%r11),		%xmm9,%xmm1
	vpand		($inp,%r11,2),		%xmm10,%xmm2
	vpand		64($inp,%r11,2),	%xmm11,%xmm3
	 vpor					%xmm0,%xmm1,%xmm1
	vpand		($inp,%r11,4),		%xmm12,%xmm4
	 vpor					%xmm2,%xmm3,%xmm3
	vpand		64($inp,%r11,4),	%xmm13,%xmm5
	 vpor					%xmm1,%xmm3,%xmm3
	vpand		-128($inp,%r11,8),	%xmm14,%xmm6
	 vpor					%xmm4,%xmm5,%xmm5
	vpand		-64($inp,%r11,8),	%xmm15,%xmm2
	lea		($inp,%r11,8),$inp
	 vpor					%xmm3,%xmm5,%xmm5
	 vpor					%xmm2,%xmm6,%xmm6
	 vpor					%xmm5,%xmm6,%xmm6
	vpermd		%ymm6,%ymm7,%ymm6
	vmovdqu		%ymm6,($out)
	lea		32($out),$out
	dec	%eax
	jnz	.Loop_gather_1024

	vpxor	%ymm0,%ymm0,%ymm0
	vmovdqu	%ymm0,($out)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_rsaz_1024_gather5:
___
$code.=<<___;
	ret
.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
}

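# rsaz_avx2_eligible tells the caller whether this code path may be used:
# it returns 1 if the AVX2 feature flag (bit 5 of the third
# OPENSSL_ia32cap_P word, i.e. CPUID.(EAX=7,ECX=0):EBX) is set, and 0
# otherwise.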
$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
.align	32
rsaz_avx2_eligible:
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	and	\$`1<<5`,%eax
	shr	\$5,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

.align	64
.Land_mask:
	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
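	# only the three low lanes mask to 29 bits; the all-ones top lane
	# leaves the most significant digit of each group of four untouched
	# during the main loops (cf. the partial-correction note above),
	# while the final normalization broadcasts lane 0 to mask all four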
.Lscatter_permd:
	.long	0,2,4,6,7,7,7,7
.Lgather_permd:
	.long	0,7,1,7,2,7,3,7
.Lgather_table:
	.byte	0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
.align	64
___

if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

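# rsaz_se_handler is the Win64 unwind handler registered for the sqr and
# mul routines: when an exception unwinds through their bodies it picks
# the saved frame pointer out of the CONTEXT record, restores the
# callee-saved GPRs and the %xmm6-%xmm15 save area that the prologues
# laid out below it, and then chains to RtlVirtualUnwind.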
$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	rsaz_se_handler,\@abi-omnipotent
.align	16
rsaz_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	160($context),%rax	# pull context->Rbp

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	rsaz_se_handler,.-rsaz_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
	.rva	.LSEH_end_rsaz_1024_sqr_avx2
	.rva	.LSEH_info_rsaz_1024_sqr_avx2

	.rva	.LSEH_begin_rsaz_1024_mul_avx2
	.rva	.LSEH_end_rsaz_1024_mul_avx2
	.rva	.LSEH_info_rsaz_1024_mul_avx2

	.rva	.LSEH_begin_rsaz_1024_gather5
	.rva	.LSEH_end_rsaz_1024_gather5
	.rva	.LSEH_info_rsaz_1024_gather5
.section	.xdata
.align	8
.LSEH_info_rsaz_1024_sqr_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue
.LSEH_info_rsaz_1024_mul_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lmul_1024_body,.Lmul_1024_epilogue
.LSEH_info_rsaz_1024_gather5:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
___
}

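# Post-process the generated code: evaluate the arithmetic left inside
# `backticks`, reduce shift counts modulo 64 (so the negative shr counts
# written by the conversion loops become their legal equivalents), and
# rewrite %ymm register names to %xmm for the instruction forms used
# here that only take 128-bit registers (vmov[dq], 16-byte vmovdqu,
# vpinsr/vpextr, vpbroadcast[qd]).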
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
	print $_,"\n";
}

}}} else {{{
print <<___;	# assembler is too old
.text

.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
rsaz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

.globl	rsaz_1024_sqr_avx2
.globl	rsaz_1024_mul_avx2
.globl	rsaz_1024_norm2red_avx2
.globl	rsaz_1024_red2norm_avx2
.globl	rsaz_1024_scatter5_avx2
.globl	rsaz_1024_gather5_avx2
.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
rsaz_1024_sqr_avx2:
rsaz_1024_mul_avx2:
rsaz_1024_norm2red_avx2:
rsaz_1024_red2norm_avx2:
rsaz_1024_scatter5_avx2:
rsaz_1024_gather5_avx2:
	.byte	0x0f,0x0b	# ud2
	ret
.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}}}

close STDOUT;