843a102c8b58ad056fed5c45c6ead98c21542ae8
[openssl.git] / crypto / bn / asm / x86_64-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # October 2005.
11 #
12 # Montgomery multiplication routine for x86_64. While it gives a modest
13 # 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
14 # than twice as fast (>2x). The most common rsa1024 sign is improved by
15 # a respectable 50%. It remains to be seen if loop unrolling and a
16 # dedicated squaring routine can provide further improvement...
17
18 # July 2011.
19 #
20 # Add dedicated squaring procedure. Performance improvement varies
21 # from platform to platform, but on average it's ~5%/15%/25%/33%
22 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
24 # August 2011.
25 #
26 # Unroll and modulo-schedule inner loops in such a manner that they
27 # are "fallen through" for input lengths of 8, which is critical for
28 # 1024-bit RSA *sign*. Average performance improvement in comparison
29 # to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
30 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
32 # June 2013.
33 #
34 # Optimize reduction in squaring procedure and improve 1024+-bit RSA
35 # sign performance by 10-16% on Intel Sandy Bridge and later
36 # (virtually same on non-Intel processors).
37
38 # August 2013.
39 #
40 # Add MULX/ADOX/ADCX code path.
41
42 $flavour = shift;       # first argument: flavour string (tested below for nasm/masm/mingw64)
43 $output  = shift;       # second argument: output file name, handed on to the translator
44 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }   # a lone "name.ext" argument is the output
45
46 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
47 # Look for x86_64-xlate.pl next to this script first, then in ../../perlasm/.
48 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
50 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
51 die "can't locate x86_64-xlate.pl";
52 # Pipe everything printed to STDOUT through the translator; abort if it cannot be started.
53 open OUT,"| \"$^X\" \"$xlate\" $flavour $output" or die "can't call $xlate: $!";
54 *STDOUT=*OUT;
55 # Probe the assembler; $addx gates the code that dispatches to .Lmulx4x_enter/.Lsqrx8x_enter.
56 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
57                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
58         $addx = ($1>=2.22);
59 }
60
61 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
62             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
63         $addx = ($1>=2.10);
64 }
65
66 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
67             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
68         $addx = ($1>=11);
69 }
70
71 # int bn_mul_mont(
72 $rp="%rdi";     # BN_ULONG *rp,
73 $ap="%rsi";     # const BN_ULONG *ap,
74 $bp="%rdx";     # const BN_ULONG *bp,
75 $np="%rcx";     # const BN_ULONG *np,
76 $n0="%r8";      # const BN_ULONG *n0,
77 $num="%r9";     # int num);
78 $lo0="%r10";    # scratch accumulator
79 $hi0="%r11";    # running high word of the ap[j]*bp[i] products
80 $hi1="%r13";    # running high word of the np[j]*m1 products
81 $i="%r14";      # outer loop index
82 $j="%r15";      # inner loop index
83 $m0="%rbx";     # current multiplier word bp[i]
84 $m1="%rbp";     # tp[0]*n0 of the current outer iteration
85
86 $code=<<___;    # entry point: pick the generic, 4x or 8x-squaring code from num and ap/bp
87 .text
88
89 .extern OPENSSL_ia32cap_P
90
91 .globl  bn_mul_mont
92 .type   bn_mul_mont,\@function,6
93 .align  16
94 bn_mul_mont:
95         test    \$3,${num}d             # num not a multiple of 4: take the generic path
96         jnz     .Lmul_enter
97         cmp     \$8,${num}d             # fewer than 8 words: take the generic path
98         jb      .Lmul_enter
99 ___
100 $code.=<<___ if ($addx);
101         mov     OPENSSL_ia32cap_P+8(%rip),%r11d # capability word, tested at the 4x/8x entry points
102 ___
103 $code.=<<___;
104         cmp     $ap,$bp                 # ap differs from bp: not a squaring
105         jne     .Lmul4x_enter
106         test    \$7,${num}d             # squaring: is num a multiple of 8?
107         jz      .Lsqr8x_enter
108         jmp     .Lmul4x_enter
109
110 .align  16
111 .Lmul_enter:
112         push    %rbx
113         push    %rbp
114         push    %r12
115         push    %r13
116         push    %r14
117         push    %r15
118
119         mov     ${num}d,${num}d         # zero-extend num (upper half cleared)
120         lea     2($num),%r10
121         mov     %rsp,%r11
122         neg     %r10
123         lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+2))
124         and     \$-1024,%rsp            # minimize TLB usage
125
126         mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
127 .Lmul_body:
128         mov     $bp,%r12                # reassign $bp
129 ___
130                 $bp="%r12";     # heredocs from here on interpolate bp as %r12
131 $code.=<<___;
132         mov     ($n0),$n0               # pull n0[0] value
133         mov     ($bp),$m0               # m0=bp[0]
134         mov     ($ap),%rax
135
136         xor     $i,$i                   # i=0
137         xor     $j,$j                   # j=0
138
139         mov     $n0,$m1
140         mulq    $m0                     # ap[0]*bp[0]
141         mov     %rax,$lo0
142         mov     ($np),%rax
143
144         imulq   $lo0,$m1                # "tp[0]"*n0
145         mov     %rdx,$hi0
146
147         mulq    $m1                     # np[0]*m1
148         add     %rax,$lo0               # discarded
149         mov     8($ap),%rax
150         adc     \$0,%rdx
151         mov     %rdx,$hi1
152
153         lea     1($j),$j                # j++
154         jmp     .L1st_enter
155
156 .align  16
157 .L1st:
158         add     %rax,$hi1
159         mov     ($ap,$j,8),%rax
160         adc     \$0,%rdx
161         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
162         mov     $lo0,$hi0
163         adc     \$0,%rdx
164         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
165         mov     %rdx,$hi1
166
167 .L1st_enter:
168         mulq    $m0                     # ap[j]*bp[0]
169         add     %rax,$hi0
170         mov     ($np,$j,8),%rax
171         adc     \$0,%rdx
172         lea     1($j),$j                # j++
173         mov     %rdx,$lo0
174
175         mulq    $m1                     # np[j]*m1
176         cmp     $num,$j
177         jne     .L1st
178
179         add     %rax,$hi1
180         mov     ($ap),%rax              # ap[0]
181         adc     \$0,%rdx
182         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
183         adc     \$0,%rdx
184         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
185         mov     %rdx,$hi1
186         mov     $lo0,$hi0
187
188         xor     %rdx,%rdx
189         add     $hi0,$hi1
190         adc     \$0,%rdx
191         mov     $hi1,-8(%rsp,$num,8)
192         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
193
194         lea     1($i),$i                # i++
195         jmp     .Louter
196 .align  16
197 .Louter:
198         mov     ($bp,$i,8),$m0          # m0=bp[i]
199         xor     $j,$j                   # j=0
200         mov     $n0,$m1
201         mov     (%rsp),$lo0
202         mulq    $m0                     # ap[0]*bp[i]
203         add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
204         mov     ($np),%rax
205         adc     \$0,%rdx
206
207         imulq   $lo0,$m1                # tp[0]*n0
208         mov     %rdx,$hi0
209
210         mulq    $m1                     # np[0]*m1
211         add     %rax,$lo0               # discarded
212         mov     8($ap),%rax
213         adc     \$0,%rdx
214         mov     8(%rsp),$lo0            # tp[1]
215         mov     %rdx,$hi1
216
217         lea     1($j),$j                # j++
218         jmp     .Linner_enter
219
220 .align  16
221 .Linner:
222         add     %rax,$hi1
223         mov     ($ap,$j,8),%rax
224         adc     \$0,%rdx
225         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
226         mov     (%rsp,$j,8),$lo0
227         adc     \$0,%rdx
228         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
229         mov     %rdx,$hi1
230
231 .Linner_enter:
232         mulq    $m0                     # ap[j]*bp[i]
233         add     %rax,$hi0
234         mov     ($np,$j,8),%rax
235         adc     \$0,%rdx
236         add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
237         mov     %rdx,$hi0
238         adc     \$0,$hi0
239         lea     1($j),$j                # j++
240
241         mulq    $m1                     # np[j]*m1
242         cmp     $num,$j
243         jne     .Linner
244
245         add     %rax,$hi1
246         mov     ($ap),%rax              # ap[0]
247         adc     \$0,%rdx
248         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
249         mov     (%rsp,$j,8),$lo0
250         adc     \$0,%rdx
251         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
252         mov     %rdx,$hi1
253
254         xor     %rdx,%rdx
255         add     $hi0,$hi1
256         adc     \$0,%rdx
257         add     $lo0,$hi1               # pull upmost overflow bit
258         adc     \$0,%rdx
259         mov     $hi1,-8(%rsp,$num,8)
260         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
261
262         lea     1($i),$i                # i++
263         cmp     $num,$i
264         jl      .Louter
265
266         xor     $i,$i                   # i=0 and clear CF!
267         mov     (%rsp),%rax             # tp[0]
268         lea     (%rsp),$ap              # borrow ap for tp
269         mov     $num,$j                 # j=num
270         jmp     .Lsub
271 .align  16
272 .Lsub:  sbb     ($np,$i,8),%rax
273         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
274         mov     8($ap,$i,8),%rax        # tp[i+1]
275         lea     1($i),$i                # i++
276         dec     $j                      # doesn't affect CF!
277         jnz     .Lsub
278
279         sbb     \$0,%rax                # handle upmost overflow bit
280         xor     $i,$i
281         and     %rax,$ap
282         not     %rax
283         mov     $rp,$np
284         and     %rax,$np
285         mov     $num,$j                 # j=num
286         or      $np,$ap                 # ap=borrow?tp:rp
287 .align  16
288 .Lcopy:                                 # copy or in-place refresh
289         mov     ($ap,$i,8),%rax
290         mov     $i,(%rsp,$i,8)          # zap temporary vector
291         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
292         lea     1($i),$i
293         sub     \$1,$j
294         jnz     .Lcopy
295
296         mov     8(%rsp,$num,8),%rsi     # restore %rsp
297         mov     \$1,%rax                # return value 1
298         mov     (%rsi),%r15
299         mov     8(%rsi),%r14
300         mov     16(%rsi),%r13
301         mov     24(%rsi),%r12
302         mov     32(%rsi),%rbp
303         mov     40(%rsi),%rbx
304         lea     48(%rsi),%rsp
305 .Lmul_epilogue:
306         ret
307 .size   bn_mul_mont,.-bn_mul_mont
308 ___
309 {{{
310 my @A=("%r10","%r11");  # alternating accumulators for the ap[j]*bp[i] products
311 my @N=("%r13","%rdi");  # alternating accumulators for the np[j]*m1 products; %rdi is $rp, hence its save at tp[num+2]
312 $code.=<<___;
313 .type   bn_mul4x_mont,\@function,6
314 .align  16
315 bn_mul4x_mont:
316 .Lmul4x_enter:
317 ___
318 $code.=<<___ if ($addx);
319         and     \$0x80100,%r11d
320         cmp     \$0x80100,%r11d         # are all bits of the mask set?
321         je      .Lmulx4x_enter
322 ___
323 $code.=<<___;
324         push    %rbx
325         push    %rbp
326         push    %r12
327         push    %r13
328         push    %r14
329         push    %r15
330
331         mov     ${num}d,${num}d         # zero-extend num (upper half cleared)
332         lea     4($num),%r10
333         mov     %rsp,%r11
334         neg     %r10
335         lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
336         and     \$-1024,%rsp            # minimize TLB usage
337
338         mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
339 .Lmul4x_body:
340         mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
341         mov     %rdx,%r12               # reassign $bp
342 ___
343                 $bp="%r12";     # heredocs from here on interpolate bp as %r12
344 $code.=<<___;
345         mov     ($n0),$n0               # pull n0[0] value
346         mov     ($bp),$m0               # m0=bp[0]
347         mov     ($ap),%rax
348
349         xor     $i,$i                   # i=0
350         xor     $j,$j                   # j=0
351
352         mov     $n0,$m1
353         mulq    $m0                     # ap[0]*bp[0]
354         mov     %rax,$A[0]
355         mov     ($np),%rax
356
357         imulq   $A[0],$m1               # "tp[0]"*n0
358         mov     %rdx,$A[1]
359
360         mulq    $m1                     # np[0]*m1
361         add     %rax,$A[0]              # discarded
362         mov     8($ap),%rax
363         adc     \$0,%rdx
364         mov     %rdx,$N[1]
365
366         mulq    $m0
367         add     %rax,$A[1]
368         mov     8($np),%rax
369         adc     \$0,%rdx
370         mov     %rdx,$A[0]
371
372         mulq    $m1
373         add     %rax,$N[1]
374         mov     16($ap),%rax
375         adc     \$0,%rdx
376         add     $A[1],$N[1]
377         lea     4($j),$j                # j+=4
378         adc     \$0,%rdx
379         mov     $N[1],(%rsp)
380         mov     %rdx,$N[0]
381         jmp     .L1st4x
382 .align  16
383 .L1st4x:
384         mulq    $m0                     # ap[j]*bp[0]
385         add     %rax,$A[0]
386         mov     -16($np,$j,8),%rax
387         adc     \$0,%rdx
388         mov     %rdx,$A[1]
389
390         mulq    $m1                     # np[j]*m1
391         add     %rax,$N[0]
392         mov     -8($ap,$j,8),%rax
393         adc     \$0,%rdx
394         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
395         adc     \$0,%rdx
396         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
397         mov     %rdx,$N[1]
398
399         mulq    $m0                     # ap[j]*bp[0]
400         add     %rax,$A[1]
401         mov     -8($np,$j,8),%rax
402         adc     \$0,%rdx
403         mov     %rdx,$A[0]
404
405         mulq    $m1                     # np[j]*m1
406         add     %rax,$N[1]
407         mov     ($ap,$j,8),%rax
408         adc     \$0,%rdx
409         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
410         adc     \$0,%rdx
411         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
412         mov     %rdx,$N[0]
413
414         mulq    $m0                     # ap[j]*bp[0]
415         add     %rax,$A[0]
416         mov     ($np,$j,8),%rax
417         adc     \$0,%rdx
418         mov     %rdx,$A[1]
419
420         mulq    $m1                     # np[j]*m1
421         add     %rax,$N[0]
422         mov     8($ap,$j,8),%rax
423         adc     \$0,%rdx
424         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
425         adc     \$0,%rdx
426         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
427         mov     %rdx,$N[1]
428
429         mulq    $m0                     # ap[j]*bp[0]
430         add     %rax,$A[1]
431         mov     8($np,$j,8),%rax
432         adc     \$0,%rdx
433         lea     4($j),$j                # j+=4
434         mov     %rdx,$A[0]
435
436         mulq    $m1                     # np[j]*m1
437         add     %rax,$N[1]
438         mov     -16($ap,$j,8),%rax
439         adc     \$0,%rdx
440         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
441         adc     \$0,%rdx
442         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
443         mov     %rdx,$N[0]
444         cmp     $num,$j
445         jl      .L1st4x
446
447         mulq    $m0                     # ap[j]*bp[0]
448         add     %rax,$A[0]
449         mov     -16($np,$j,8),%rax
450         adc     \$0,%rdx
451         mov     %rdx,$A[1]
452
453         mulq    $m1                     # np[j]*m1
454         add     %rax,$N[0]
455         mov     -8($ap,$j,8),%rax
456         adc     \$0,%rdx
457         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
458         adc     \$0,%rdx
459         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
460         mov     %rdx,$N[1]
461
462         mulq    $m0                     # ap[j]*bp[0]
463         add     %rax,$A[1]
464         mov     -8($np,$j,8),%rax
465         adc     \$0,%rdx
466         mov     %rdx,$A[0]
467
468         mulq    $m1                     # np[j]*m1
469         add     %rax,$N[1]
470         mov     ($ap),%rax              # ap[0]
471         adc     \$0,%rdx
472         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
473         adc     \$0,%rdx
474         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
475         mov     %rdx,$N[0]
476
477         xor     $N[1],$N[1]
478         add     $A[0],$N[0]
479         adc     \$0,$N[1]
480         mov     $N[0],-8(%rsp,$j,8)
481         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
482
483         lea     1($i),$i                # i++
484 .align  4
485 .Louter4x:
486         mov     ($bp,$i,8),$m0          # m0=bp[i]
487         xor     $j,$j                   # j=0
488         mov     (%rsp),$A[0]
489         mov     $n0,$m1
490         mulq    $m0                     # ap[0]*bp[i]
491         add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
492         mov     ($np),%rax
493         adc     \$0,%rdx
494
495         imulq   $A[0],$m1               # tp[0]*n0
496         mov     %rdx,$A[1]
497
498         mulq    $m1                     # np[0]*m1
499         add     %rax,$A[0]              # "$N[0]", discarded
500         mov     8($ap),%rax
501         adc     \$0,%rdx
502         mov     %rdx,$N[1]
503
504         mulq    $m0                     # ap[j]*bp[i]
505         add     %rax,$A[1]
506         mov     8($np),%rax
507         adc     \$0,%rdx
508         add     8(%rsp),$A[1]           # +tp[1]
509         adc     \$0,%rdx
510         mov     %rdx,$A[0]
511
512         mulq    $m1                     # np[j]*m1
513         add     %rax,$N[1]
514         mov     16($ap),%rax
515         adc     \$0,%rdx
516         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
517         lea     4($j),$j                # j+=4
518         adc     \$0,%rdx
519         mov     $N[1],(%rsp)            # tp[j-1]
520         mov     %rdx,$N[0]
521         jmp     .Linner4x
522 .align  16
523 .Linner4x:
524         mulq    $m0                     # ap[j]*bp[i]
525         add     %rax,$A[0]
526         mov     -16($np,$j,8),%rax
527         adc     \$0,%rdx
528         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
529         adc     \$0,%rdx
530         mov     %rdx,$A[1]
531
532         mulq    $m1                     # np[j]*m1
533         add     %rax,$N[0]
534         mov     -8($ap,$j,8),%rax
535         adc     \$0,%rdx
536         add     $A[0],$N[0]
537         adc     \$0,%rdx
538         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
539         mov     %rdx,$N[1]
540
541         mulq    $m0                     # ap[j]*bp[i]
542         add     %rax,$A[1]
543         mov     -8($np,$j,8),%rax
544         adc     \$0,%rdx
545         add     -8(%rsp,$j,8),$A[1]
546         adc     \$0,%rdx
547         mov     %rdx,$A[0]
548
549         mulq    $m1                     # np[j]*m1
550         add     %rax,$N[1]
551         mov     ($ap,$j,8),%rax
552         adc     \$0,%rdx
553         add     $A[1],$N[1]
554         adc     \$0,%rdx
555         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
556         mov     %rdx,$N[0]
557
558         mulq    $m0                     # ap[j]*bp[i]
559         add     %rax,$A[0]
560         mov     ($np,$j,8),%rax
561         adc     \$0,%rdx
562         add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
563         adc     \$0,%rdx
564         mov     %rdx,$A[1]
565
566         mulq    $m1                     # np[j]*m1
567         add     %rax,$N[0]
568         mov     8($ap,$j,8),%rax
569         adc     \$0,%rdx
570         add     $A[0],$N[0]
571         adc     \$0,%rdx
572         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
573         mov     %rdx,$N[1]
574
575         mulq    $m0                     # ap[j]*bp[i]
576         add     %rax,$A[1]
577         mov     8($np,$j,8),%rax
578         adc     \$0,%rdx
579         add     8(%rsp,$j,8),$A[1]
580         adc     \$0,%rdx
581         lea     4($j),$j                # j+=4
582         mov     %rdx,$A[0]
583
584         mulq    $m1                     # np[j]*m1
585         add     %rax,$N[1]
586         mov     -16($ap,$j,8),%rax
587         adc     \$0,%rdx
588         add     $A[1],$N[1]
589         adc     \$0,%rdx
590         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
591         mov     %rdx,$N[0]
592         cmp     $num,$j
593         jl      .Linner4x
594
595         mulq    $m0                     # ap[j]*bp[i]
596         add     %rax,$A[0]
597         mov     -16($np,$j,8),%rax
598         adc     \$0,%rdx
599         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
600         adc     \$0,%rdx
601         mov     %rdx,$A[1]
602
603         mulq    $m1                     # np[j]*m1
604         add     %rax,$N[0]
605         mov     -8($ap,$j,8),%rax
606         adc     \$0,%rdx
607         add     $A[0],$N[0]
608         adc     \$0,%rdx
609         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
610         mov     %rdx,$N[1]
611
612         mulq    $m0                     # ap[j]*bp[i]
613         add     %rax,$A[1]
614         mov     -8($np,$j,8),%rax
615         adc     \$0,%rdx
616         add     -8(%rsp,$j,8),$A[1]
617         adc     \$0,%rdx
618         lea     1($i),$i                # i++
619         mov     %rdx,$A[0]
620
621         mulq    $m1                     # np[j]*m1
622         add     %rax,$N[1]
623         mov     ($ap),%rax              # ap[0]
624         adc     \$0,%rdx
625         add     $A[1],$N[1]
626         adc     \$0,%rdx
627         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
628         mov     %rdx,$N[0]
629
630         xor     $N[1],$N[1]
631         add     $A[0],$N[0]
632         adc     \$0,$N[1]
633         add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
634         adc     \$0,$N[1]
635         mov     $N[0],-8(%rsp,$j,8)
636         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
637
638         cmp     $num,$i
639         jl      .Louter4x
640 ___
641 {
642 my @ri=("%rax","%rdx",$m0,$m1);        # four registers holding tp[] words during the final subtraction
643 $code.=<<___;
644         mov     16(%rsp,$num,8),$rp     # restore $rp
645         mov     0(%rsp),@ri[0]          # tp[0]
646         pxor    %xmm0,%xmm0             # zero, stored over tp[] below
647         mov     8(%rsp),@ri[1]          # tp[1]
648         shr     \$2,$num                # num/=4
649         lea     (%rsp),$ap              # borrow ap for tp
650         xor     $i,$i                   # i=0 and clear CF!
651
652         sub     0($np),@ri[0]
653         mov     16($ap),@ri[2]          # tp[2]
654         mov     24($ap),@ri[3]          # tp[3]
655         sbb     8($np),@ri[1]
656         lea     -1($num),$j             # j=num/4-1
657         jmp     .Lsub4x
658 .align  16
659 .Lsub4x:
660         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
661         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
662         sbb     16($np,$i,8),@ri[2]
663         mov     32($ap,$i,8),@ri[0]     # tp[i+1]
664         mov     40($ap,$i,8),@ri[1]
665         sbb     24($np,$i,8),@ri[3]
666         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
667         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
668         sbb     32($np,$i,8),@ri[0]
669         mov     48($ap,$i,8),@ri[2]
670         mov     56($ap,$i,8),@ri[3]
671         sbb     40($np,$i,8),@ri[1]
672         lea     4($i),$i                # i+=4
673         dec     $j                      # doesn't affect CF!
674         jnz     .Lsub4x
675
676         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
677         mov     32($ap,$i,8),@ri[0]     # load overflow bit
678         sbb     16($np,$i,8),@ri[2]
679         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
680         sbb     24($np,$i,8),@ri[3]
681         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
682
683         sbb     \$0,@ri[0]              # handle upmost overflow bit
684         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
685         xor     $i,$i                   # i=0
686         and     @ri[0],$ap
687         not     @ri[0]
688         mov     $rp,$np
689         and     @ri[0],$np
690         lea     -1($num),$j
691         or      $np,$ap                 # ap=borrow?tp:rp
692
693         movdqu  ($ap),%xmm1
694         movdqa  %xmm0,(%rsp)
695         movdqu  %xmm1,($rp)
696         jmp     .Lcopy4x
697 .align  16
698 .Lcopy4x:                                       # copy or in-place refresh
699         movdqu  16($ap,$i),%xmm2
700         movdqu  32($ap,$i),%xmm1
701         movdqa  %xmm0,16(%rsp,$i)
702         movdqu  %xmm2,16($rp,$i)
703         movdqa  %xmm0,32(%rsp,$i)
704         movdqu  %xmm1,32($rp,$i)
705         lea     32($i),$i
706         dec     $j
707         jnz     .Lcopy4x
708
709         shl     \$2,$num                # num*=4, undoing the shr above
710         movdqu  16($ap,$i),%xmm2
711         movdqa  %xmm0,16(%rsp,$i)
712         movdqu  %xmm2,16($rp,$i)
713 ___
714 }
715 $code.=<<___;
716         mov     8(%rsp,$num,8),%rsi     # restore %rsp
717         mov     \$1,%rax                # return value 1
718         mov     (%rsi),%r15
719         mov     8(%rsi),%r14
720         mov     16(%rsi),%r13
721         mov     24(%rsi),%r12
722         mov     32(%rsi),%rbp
723         mov     40(%rsi),%rbx
724         lea     48(%rsi),%rsp
725 .Lmul4x_epilogue:
726         ret
727 .size   bn_mul4x_mont,.-bn_mul4x_mont
728 ___
729 }}}
730 \f{{{
731 ######################################################################
732 # void bn_sqr8x_mont(
733 my $rptr="%rdi";        # const BN_ULONG *rptr,
734 my $aptr="%rsi";        # const BN_ULONG *aptr,
735 my $bptr="%rdx";        # not used
736 my $nptr="%rcx";        # const BN_ULONG *nptr,
737 my $n0  ="%r8";         # const BN_ULONG *n0);
738 my $num ="%r9";         # int num, has to be divisible by 8
739
740 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
741 my @A0=("%r10","%r11");
742 my @A1=("%r12","%r13");
743 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
744
745 $code.=<<___;
746 .type   bn_sqr8x_mont,\@function,6
747 .align  32
748 bn_sqr8x_mont:
749 .Lsqr8x_enter:
750 ___
751 $code.=<<___ if ($addx);
752         and     \$0x80100,%r11d
753         cmp     \$0x80100,%r11d
754         je      .Lsqrx8x_enter
755 ___
756 $code.=<<___;
757         push    %rbx
758         push    %rbp
759         push    %r12
760         push    %r13
761         push    %r14
762         push    %r15
763
764         shl     \$3,${num}d             # convert $num to bytes
765         xor     %r10,%r10
766         mov     %rsp,%r11               # put aside %rsp
767         sub     $num,%r10               # -$num
768         mov     ($n0),$n0               # *n0
769         lea     -72(%rsp,%r10,2),%rsp   # alloca(frame+2*$num)
770         and     \$-1024,%rsp            # minimize TLB usage
771         ##############################################################
772         # Stack layout
773         #
774         # +0    saved $num, used in reduction section
775         # +8    &t[2*$num], used in reduction section
776         # +32   saved $rptr
777         # +40   saved $nptr
778         # +48   saved *n0
779         # +56   saved %rsp
780         # +64   t[2*$num]
781         #
782         mov     $rptr,32(%rsp)          # save $rptr
783         mov     $nptr,40(%rsp)
784         mov     $n0,  48(%rsp)
785         mov     %r11, 56(%rsp)          # save original %rsp
786 .Lsqr8x_body:
787         ##############################################################
788         # Squaring part:
789         #
790         # a) multiply-n-add everything but a[i]*a[i];
791         # b) shift result of a) by 1 to the left and accumulate
792         #    a[i]*a[i] products;
793         #
794         ##############################################################
795         #                                                     a[1]a[0]
796         #                                                 a[2]a[0]
797         #                                             a[3]a[0]
798         #                                             a[2]a[1]
799         #                                         a[4]a[0]
800         #                                         a[3]a[1]
801         #                                     a[5]a[0]
802         #                                     a[4]a[1]
803         #                                     a[3]a[2]
804         #                                 a[6]a[0]
805         #                                 a[5]a[1]
806         #                                 a[4]a[2]
807         #                             a[7]a[0]
808         #                             a[6]a[1]
809         #                             a[5]a[2]
810         #                             a[4]a[3]
811         #                         a[7]a[1]
812         #                         a[6]a[2]
813         #                         a[5]a[3]
814         #                     a[7]a[2]
815         #                     a[6]a[3]
816         #                     a[5]a[4]
817         #                 a[7]a[3]
818         #                 a[6]a[4]
819         #             a[7]a[4]
820         #             a[6]a[5]
821         #         a[7]a[5]
822         #     a[7]a[6]
823         #                                                     a[1]a[0]
824         #                                                 a[2]a[0]
825         #                                             a[3]a[0]
826         #                                         a[4]a[0]
827         #                                     a[5]a[0]
828         #                                 a[6]a[0]
829         #                             a[7]a[0]
830         #                                             a[2]a[1]
831         #                                         a[3]a[1]
832         #                                     a[4]a[1]
833         #                                 a[5]a[1]
834         #                             a[6]a[1]
835         #                         a[7]a[1]
836         #                                     a[3]a[2]
837         #                                 a[4]a[2]
838         #                             a[5]a[2]
839         #                         a[6]a[2]
840         #                     a[7]a[2]
841         #                             a[4]a[3]
842         #                         a[5]a[3]
843         #                     a[6]a[3]
844         #                 a[7]a[3]
845         #                     a[5]a[4]
846         #                 a[6]a[4]
847         #             a[7]a[4]
848         #             a[6]a[5]
849         #         a[7]a[5]
850         #     a[7]a[6]
851         #                                                         a[0]a[0]
852         #                                                 a[1]a[1]
853         #                                         a[2]a[2]
854         #                                 a[3]a[3]
855         #                         a[4]a[4]
856         #                 a[5]a[5]
857         #         a[6]a[6]
858         # a[7]a[7]
859
860         lea     32(%r10),$i             # $i=-($num-32)
861         lea     ($aptr,$num),$aptr      # end of a[] buffer, ($aptr,$i)=&ap[2]
862
863         mov     $num,$j                 # $j=$num
864
865                                         # comments apply to $num==8 case
866         mov     -32($aptr,$i),$a0       # a[0]
867         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
868         mov     -24($aptr,$i),%rax      # a[1]
869         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
870         mov     -16($aptr,$i),$ai       # a[2]
871         mov     %rax,$a1
872
873         mul     $a0                     # a[1]*a[0]
874         mov     %rax,$A0[0]             # a[1]*a[0]
875          mov    $ai,%rax                # a[2]
876         mov     %rdx,$A0[1]
877         mov     $A0[0],-24($tptr,$i)    # t[1]
878
879         mul     $a0                     # a[2]*a[0]
880         add     %rax,$A0[1]
881          mov    $ai,%rax
882         adc     \$0,%rdx
883         mov     $A0[1],-16($tptr,$i)    # t[2]
884         mov     %rdx,$A0[0]
885
886         lea     -16($i),$j              # j=-16
887
888
889          mov    8($aptr,$j),$ai         # a[3]
890         mul     $a1                     # a[2]*a[1]
891         mov     %rax,$A1[0]             # a[2]*a[1]+t[3]
892          mov    $ai,%rax
893         mov     %rdx,$A1[1]
894
895          lea    16($j),$j
896         mul     $a0                     # a[3]*a[0]
897         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
898          mov    $ai,%rax
899         mov     %rdx,$A0[1]
900         adc     \$0,$A0[1]
901         add     $A1[0],$A0[0]
902         adc     \$0,$A0[1]
903         mov     $A0[0],-8($tptr,$j)     # t[3]
904         jmp     .Lsqr4x_1st
905
906 .align  32
907 .Lsqr4x_1st:
908          mov    ($aptr,$j),$ai          # a[4]
909         mul     $a1                     # a[3]*a[1]
910         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
911          mov    $ai,%rax
912         mov     %rdx,$A1[0]
913         adc     \$0,$A1[0]
914
915         mul     $a0                     # a[4]*a[0]
916         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
917          mov    $ai,%rax                # a[3]
918          mov    8($aptr,$j),$ai         # a[5]
919         mov     %rdx,$A0[0]
920         adc     \$0,$A0[0]
921         add     $A1[1],$A0[1]
922         adc     \$0,$A0[0]
923
924
925         mul     $a1                     # a[4]*a[3]
926         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
927          mov    $ai,%rax
928          mov    $A0[1],($tptr,$j)       # t[4]
929         mov     %rdx,$A1[1]
930         adc     \$0,$A1[1]
931
932         mul     $a0                     # a[5]*a[2]
933         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
934          mov    $ai,%rax
935          mov    16($aptr,$j),$ai        # a[6]
936         mov     %rdx,$A0[1]
937         adc     \$0,$A0[1]
938         add     $A1[0],$A0[0]
939         adc     \$0,$A0[1]
940
941         mul     $a1                     # a[5]*a[3]
942         add     %rax,$A1[1]             # a[5]*a[3]+t[6]
943          mov    $ai,%rax
944          mov    $A0[0],8($tptr,$j)      # t[5]
945         mov     %rdx,$A1[0]
946         adc     \$0,$A1[0]
947
948         mul     $a0                     # a[6]*a[2]
949         add     %rax,$A0[1]             # a[6]*a[2]+a[5]*a[3]+t[6]
950          mov    $ai,%rax                # a[3]
951          mov    24($aptr,$j),$ai        # a[7]
952         mov     %rdx,$A0[0]
953         adc     \$0,$A0[0]
954         add     $A1[1],$A0[1]
955         adc     \$0,$A0[0]
956
957
958         mul     $a1                     # a[6]*a[5]
959         add     %rax,$A1[0]             # a[6]*a[5]+t[7]
960          mov    $ai,%rax
961          mov    $A0[1],16($tptr,$j)     # t[6]
962         mov     %rdx,$A1[1]
963         adc     \$0,$A1[1]
964
965         mul     $a0                     # a[7]*a[4]
966         add     %rax,$A0[0]             # a[7]*a[4]+a[6]*a[5]+t[7]
967          mov    $ai,%rax
968          lea    32($j),$j
969         mov     %rdx,$A0[1]
970         adc     \$0,$A0[1]
971         add     $A1[0],$A0[0]
972         adc     \$0,$A0[1]
973         mov     $A0[0],-8($tptr,$j)     # t[7]
974
975         cmp     \$0,$j
976         jne     .Lsqr4x_1st
977
978         mul     $a1                     # a[7]*a[5]
979         add     %rax,$A1[1]
980         lea     16($i),$i
981         adc     \$0,%rdx
982         add     $A0[1],$A1[1]
983         adc     \$0,%rdx
984
985         mov     $A1[1],($tptr)          # t[8]
986         mov     %rdx,$A1[0]
987         mov     %rdx,8($tptr)           # t[9]
988         jmp     .Lsqr4x_outer
989
990 .align  32
991 .Lsqr4x_outer:                          # comments apply to $num==6 case
992         mov     -32($aptr,$i),$a0       # a[0]
993         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
994         mov     -24($aptr,$i),%rax      # a[1]
995         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
996         mov     -16($aptr,$i),$ai       # a[2]
997         mov     %rax,$a1
998
999         mov     -24($tptr,$i),$A0[0]    # t[1]
1000         mul     $a0                     # a[1]*a[0]
1001         add     %rax,$A0[0]             # a[1]*a[0]+t[1]
1002          mov    $ai,%rax                # a[2]
1003         adc     \$0,%rdx
1004         mov     $A0[0],-24($tptr,$i)    # t[1]
1005         mov     %rdx,$A0[1]
1006
1007         mul     $a0                     # a[2]*a[0]
1008         add     %rax,$A0[1]
1009          mov    $ai,%rax
1010         adc     \$0,%rdx
1011         add     -16($tptr,$i),$A0[1]    # a[2]*a[0]+t[2]
1012         mov     %rdx,$A0[0]
1013         adc     \$0,$A0[0]
1014         mov     $A0[1],-16($tptr,$i)    # t[2]
1015
1016         lea     -16($i),$j              # j=-16
1017         xor     $A1[0],$A1[0]
1018
1019
1020          mov    8($aptr,$j),$ai         # a[3]
1021         mul     $a1                     # a[2]*a[1]
1022         add     %rax,$A1[0]             # a[2]*a[1]+t[3]
1023          mov    $ai,%rax
1024         adc     \$0,%rdx
1025         add     8($tptr,$j),$A1[0]
1026         mov     %rdx,$A1[1]
1027         adc     \$0,$A1[1]
1028
1029         mul     $a0                     # a[3]*a[0]
1030         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
1031          mov    $ai,%rax
1032         adc     \$0,%rdx
1033         add     $A1[0],$A0[0]
1034         mov     %rdx,$A0[1]
1035         adc     \$0,$A0[1]
1036         mov     $A0[0],8($tptr,$j)      # t[3]
1037
1038         lea     16($j),$j
1039         jmp     .Lsqr4x_inner
1040
1041 .align  32
1042 .Lsqr4x_inner:
1043          mov    ($aptr,$j),$ai          # a[4]
1044         mul     $a1                     # a[3]*a[1]
1045         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
1046          mov    $ai,%rax
1047         mov     %rdx,$A1[0]
1048         adc     \$0,$A1[0]
1049         add     ($tptr,$j),$A1[1]
1050         adc     \$0,$A1[0]
1051
1052         mul     $a0                     # a[4]*a[0]
1053         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
1054          mov    $ai,%rax                # a[3]
1055          mov    8($aptr,$j),$ai         # a[5]
1056         mov     %rdx,$A0[0]
1057         adc     \$0,$A0[0]
1058         add     $A1[1],$A0[1]
1059         adc     \$0,$A0[0]
1060
1061         mul     $a1                     # a[4]*a[3]
1062         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
1063         mov     $A0[1],($tptr,$j)       # t[4]
1064          mov    $ai,%rax
1065         mov     %rdx,$A1[1]
1066         adc     \$0,$A1[1]
1067         add     8($tptr,$j),$A1[0]
1068         lea     16($j),$j               # j++
1069         adc     \$0,$A1[1]
1070
1071         mul     $a0                     # a[5]*a[2]
1072         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
1073          mov    $ai,%rax
1074         adc     \$0,%rdx
1075         add     $A1[0],$A0[0]
1076         mov     %rdx,$A0[1]
1077         adc     \$0,$A0[1]
1078         mov     $A0[0],-8($tptr,$j)     # t[5], "preloaded t[1]" below
1079
1080         cmp     \$0,$j
1081         jne     .Lsqr4x_inner
1082
1083         mul     $a1                     # a[5]*a[3]
1084         add     %rax,$A1[1]
1085         adc     \$0,%rdx
1086         add     $A0[1],$A1[1]
1087         adc     \$0,%rdx
1088
1089         mov     $A1[1],($tptr)          # t[6], "preloaded t[2]" below
1090         mov     %rdx,$A1[0]
1091         mov     %rdx,8($tptr)           # t[7], "preloaded t[3]" below
1092
1093         add     \$16,$i
1094         jnz     .Lsqr4x_outer
1095
1096                                         # comments apply to $num==4 case
1097         mov     -32($aptr),$a0          # a[0]
1098         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
1099         mov     -24($aptr),%rax         # a[1]
1100         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
1101         mov     -16($aptr),$ai          # a[2]
1102         mov     %rax,$a1
1103
1104         mul     $a0                     # a[1]*a[0]
1105         add     %rax,$A0[0]             # a[1]*a[0]+t[1], preloaded t[1]
1106          mov    $ai,%rax                # a[2]
1107         mov     %rdx,$A0[1]
1108         adc     \$0,$A0[1]
1109
1110         mul     $a0                     # a[2]*a[0]
1111         add     %rax,$A0[1]
1112          mov    $ai,%rax
1113          mov    $A0[0],-24($tptr)       # t[1]
1114         mov     %rdx,$A0[0]
1115         adc     \$0,$A0[0]
1116         add     $A1[1],$A0[1]           # a[2]*a[0]+t[2], preloaded t[2]
1117          mov    -8($aptr),$ai           # a[3]
1118         adc     \$0,$A0[0]
1119
1120         mul     $a1                     # a[2]*a[1]
1121         add     %rax,$A1[0]             # a[2]*a[1]+t[3], preloaded t[3]
1122          mov    $ai,%rax
1123          mov    $A0[1],-16($tptr)       # t[2]
1124         mov     %rdx,$A1[1]
1125         adc     \$0,$A1[1]
1126
1127         mul     $a0                     # a[3]*a[0]
1128         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
1129          mov    $ai,%rax
1130         mov     %rdx,$A0[1]
1131         adc     \$0,$A0[1]
1132         add     $A1[0],$A0[0]
1133         adc     \$0,$A0[1]
1134         mov     $A0[0],-8($tptr)        # t[3]
1135
1136         mul     $a1                     # a[3]*a[1]
1137         add     %rax,$A1[1]
1138          mov    -16($aptr),%rax         # a[2]
1139         adc     \$0,%rdx
1140         add     $A0[1],$A1[1]
1141         adc     \$0,%rdx
1142
1143         mov     $A1[1],($tptr)          # t[4]
1144         mov     %rdx,$A1[0]
1145         mov     %rdx,8($tptr)           # t[5]
1146
1147         mul     $ai                     # a[2]*a[3]
1148 ___
1149 {
# Shift-and-add pass of the squaring.
#
# The heredoc below first finishes the words labelled t[5]..t[7]: it adds
# the product left in %rdx:%rax by the `mul` emitted just before this
# block and stores the (zero) $carry register as the top word.  It then
# walks t[] upwards from 64(%rsp), treating t[0] as zero, and for every
# pair of words computes
#     t[2*i], t[2*i+1] = (t[2*i], t[2*i+1]) << 1 | shift, plus a[i]*a[i]
# where $shift holds bit 63 of the previous odd word.
#
# $shift and $carry reuse the registers of $a0 and $a1, and @S names the
# four registers that receive the shifted words.  The carry out of each
# adc pair is parked in $carry as 0/-1 by `sbb $carry,$carry` and turned
# back into CF by `neg $carry`, since the shr/or/mul instructions in
# between rewrite the flags.
#
# NOTE(review): `lea ($j,reg,2)` only yields reg*2 if $j is zero at this
# point.  The loops emitted above exit only once $j has reached 0; confirm
# that nothing in between reloads it.
1150 my ($shift,$carry)=($a0,$a1);
1151 my @S=(@A1,$ai,$n0);
1152 $code.=<<___;
1153          add    \$16,$i
1154          xor    $shift,$shift
1155          sub    $num,$i                 # $i=16-$num
1156          xor    $carry,$carry
1157
1158         add     $A1[0],%rax             # t[5]
1159         adc     \$0,%rdx
1160         mov     %rax,8($tptr)           # t[5]
1161         mov     %rdx,16($tptr)          # t[6]
1162         mov     $carry,24($tptr)        # t[7]
1163
1164          mov    -16($aptr,$i),%rax      # a[0]
1165         lea     64(%rsp),$tptr
1166          xor    $A0[0],$A0[0]           # t[0]
1167          mov    8($tptr),$A0[1]         # t[1]
1168
1169         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1170         shr     \$63,$A0[0]
1171         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1172         shr     \$63,$A0[1]
1173         or      $A0[0],$S[1]            # | t[2*i]>>63
1174          mov    16($tptr),$A0[0]        # t[2*i+2]      # prefetch
1175         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1176         mul     %rax                    # a[i]*a[i]
1177         neg     $carry                  # mov $carry,cf
1178          mov    24($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1179         adc     %rax,$S[0]
1180          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1181         mov     $S[0],($tptr)
1182         adc     %rdx,$S[1]
1183
1184         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1185          mov    $S[1],8($tptr)
1186          sbb    $carry,$carry           # mov cf,$carry
1187         shr     \$63,$A0[0]
1188         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1189         shr     \$63,$A0[1]
1190         or      $A0[0],$S[3]            # | t[2*i]>>63
1191          mov    32($tptr),$A0[0]        # t[2*i+2]      # prefetch
1192         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1193         mul     %rax                    # a[i]*a[i]
1194         neg     $carry                  # mov $carry,cf
1195          mov    40($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1196         adc     %rax,$S[2]
1197          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1198         mov     $S[2],16($tptr)
1199         adc     %rdx,$S[3]
1200         lea     16($i),$i
1201         mov     $S[3],24($tptr)
1202         sbb     $carry,$carry           # mov cf,$carry
1203         lea     64($tptr),$tptr
1204         jmp     .Lsqr4x_shift_n_add
1205
1206 .align  32
1207 .Lsqr4x_shift_n_add:
1208         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1209         shr     \$63,$A0[0]
1210         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1211         shr     \$63,$A0[1]
1212         or      $A0[0],$S[1]            # | t[2*i]>>63
1213          mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
1214         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1215         mul     %rax                    # a[i]*a[i]
1216         neg     $carry                  # mov $carry,cf
1217          mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1218         adc     %rax,$S[0]
1219          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1220         mov     $S[0],-32($tptr)
1221         adc     %rdx,$S[1]
1222
1223         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1224          mov    $S[1],-24($tptr)
1225          sbb    $carry,$carry           # mov cf,$carry
1226         shr     \$63,$A0[0]
1227         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1228         shr     \$63,$A0[1]
1229         or      $A0[0],$S[3]            # | t[2*i]>>63
1230          mov    0($tptr),$A0[0]         # t[2*i+2]      # prefetch
1231         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1232         mul     %rax                    # a[i]*a[i]
1233         neg     $carry                  # mov $carry,cf
1234          mov    8($tptr),$A0[1]         # t[2*i+2+1]    # prefetch
1235         adc     %rax,$S[2]
1236          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1237         mov     $S[2],-16($tptr)
1238         adc     %rdx,$S[3]
1239
1240         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1241          mov    $S[3],-8($tptr)
1242          sbb    $carry,$carry           # mov cf,$carry
1243         shr     \$63,$A0[0]
1244         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1245         shr     \$63,$A0[1]
1246         or      $A0[0],$S[1]            # | t[2*i]>>63
1247          mov    16($tptr),$A0[0]        # t[2*i+2]      # prefetch
1248         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1249         mul     %rax                    # a[i]*a[i]
1250         neg     $carry                  # mov $carry,cf
1251          mov    24($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1252         adc     %rax,$S[0]
1253          mov    8($aptr,$i),%rax        # a[i+1]        # prefetch
1254         mov     $S[0],0($tptr)
1255         adc     %rdx,$S[1]
1256
1257         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1258          mov    $S[1],8($tptr)
1259          sbb    $carry,$carry           # mov cf,$carry
1260         shr     \$63,$A0[0]
1261         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1262         shr     \$63,$A0[1]
1263         or      $A0[0],$S[3]            # | t[2*i]>>63
1264          mov    32($tptr),$A0[0]        # t[2*i+2]      # prefetch
1265         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1266         mul     %rax                    # a[i]*a[i]
1267         neg     $carry                  # mov $carry,cf
1268          mov    40($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1269         adc     %rax,$S[2]
1270          mov    16($aptr,$i),%rax       # a[i+1]        # prefetch
1271         mov     $S[2],16($tptr)
1272         adc     %rdx,$S[3]
1273         mov     $S[3],24($tptr)
1274         sbb     $carry,$carry           # mov cf,$carry
1275         lea     64($tptr),$tptr
1276         add     \$32,$i
1277         jnz     .Lsqr4x_shift_n_add
1278
1279         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1280         shr     \$63,$A0[0]
1281         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1282         shr     \$63,$A0[1]
1283         or      $A0[0],$S[1]            # | t[2*i]>>63
1284          mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
1285         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1286         mul     %rax                    # a[i]*a[i]
1287         neg     $carry                  # mov $carry,cf
1288          mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1289         adc     %rax,$S[0]
1290          mov    -8($aptr),%rax          # a[i+1]        # prefetch
1291         mov     $S[0],-32($tptr)
1292         adc     %rdx,$S[1]
1293
1294         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1295          mov    $S[1],-24($tptr)
1296          sbb    $carry,$carry           # mov cf,$carry
1297         shr     \$63,$A0[0]
1298         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1299         shr     \$63,$A0[1]
1300         or      $A0[0],$S[3]            # | t[2*i]>>63
1301         mul     %rax                    # a[i]*a[i]
1302         neg     $carry                  # mov $carry,cf
1303         adc     %rax,$S[2]
1304         adc     %rdx,$S[3]
1305         mov     $S[2],-16($tptr)
1306         mov     $S[3],-8($tptr)
1307 ___
1308 }\f
1309 ######################################################################
1310 # Montgomery reduction part, "word-by-word" algorithm.
1311 #
1312 # This new path is inspired by multiple submissions from Intel, by
1313 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1314 # Vinodh Gopal...
1315 {
# Register roles for the reduction emitted below:
#   $nptr  - walks the modulus n[] in 8-word steps
#   $tptr  - walks the product t[] that starts at 64(%rsp)
#   $m0    - current multiplier, n0 times the lowest live word of t[]
#   $carry - scratch for the next multiplier inside .L8x_reduce, and the
#            saved carry (0/-1 via sbb, restored via neg) between passes
#            of .L8x_tail
#
# Frame slots used: 0(%rsp) end of n[], 8(%rsp) end of t[], 40(%rsp) the
# saved $nptr, 48(%rsp) n0.
#
# Flow: .L8x_reduction_loop loads eight words of t[] into %r8-%r15.
# .L8x_reduce runs eight times (%ecx counts down from 8); each pass adds
# $m0*n[0..7] into the window, which thereby moves up one word, and derives
# the next multiplier from the new %r8.  Each multiplier is put aside in
# the eight slots starting at 64(%rsp) and replayed by .L8x_tail against
# every further 8-word chunk of n[].  .L8x_no_tail adds the next eight
# words of t[], stores the sums and records the top-most carry at the end
# of t[].  $num is negated on entry and restored by the final `neg`.
#
# NOTE(review): `neg %r8` at the top of .L8x_reduce appears to replace an
# `add %rax,%r8` whose low word is zero by construction of $m0, leaving
# only the carry (set iff %r8 was non-zero) - confirm against the n0
# definition used by the callers.
1316 my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1317
1318 $code.=<<___;
1319         mov     40(%rsp),$nptr          # pull $nptr
1320         xor     %rax,%rax
1321         lea     ($nptr,$num),%rdx       # end of n[]
1322         lea     64(%rsp,$num,2),$tptr   # end of t[] buffer
1323         mov     %rdx,0(%rsp)
1324         mov     $tptr,8(%rsp)
1325         mov     %rax,($tptr)            # clear top-most carry bit
1326         lea     64(%rsp,$num),$tptr     # end of initial t[] window
1327         neg     $num
1328         jmp     .L8x_reduction_loop
1329
1330 .align  32
1331 .L8x_reduction_loop:
1332         lea     ($tptr,$num),$tptr      # start of current t[] window
1333         mov     8*0($tptr),$m0
1334         mov     8*1($tptr),%r9
1335         mov     8*2($tptr),%r10
1336         mov     8*3($tptr),%r11
1337         mov     8*4($tptr),%r12
1338         mov     8*5($tptr),%r13
1339         mov     8*6($tptr),%r14
1340         mov     8*7($tptr),%r15
1341         lea     8*8($tptr),$tptr
1342
1343         mov     $m0,%r8
1344         imulq   48(%rsp),$m0            # n0*a[0]
1345         mov     8*0($nptr),%rax         # n[0]
1346         mov     \$8,%ecx
1347         jmp     .L8x_reduce
1348
1349 .align  32
1350 .L8x_reduce:
1351         mulq    $m0
1352          mov    8*1($nptr),%rax         # n[1]
1353         neg     %r8
1354         mov     %rdx,%r8
1355         adc     \$0,%r8
1356
1357         mulq    $m0
1358         add     %rax,%r9
1359          mov    8*2($nptr),%rax
1360         adc     \$0,%rdx
1361         add     %r9,%r8
1362          mov    $m0,64-8(%rsp,%rcx,8)   # put aside n0*a[i]
1363         mov     %rdx,%r9
1364         adc     \$0,%r9
1365
1366         mulq    $m0
1367         add     %rax,%r10
1368          mov    8*3($nptr),%rax
1369         adc     \$0,%rdx
1370         add     %r10,%r9
1371          mov    48(%rsp),$carry         # pull n0, borrow $carry
1372         mov     %rdx,%r10
1373         adc     \$0,%r10
1374
1375         mulq    $m0
1376         add     %rax,%r11
1377          mov    8*4($nptr),%rax
1378         adc     \$0,%rdx
1379          imulq  %r8,$carry              # modulo-scheduled
1380         add     %r11,%r10
1381         mov     %rdx,%r11
1382         adc     \$0,%r11
1383
1384         mulq    $m0
1385         add     %rax,%r12
1386          mov    8*5($nptr),%rax
1387         adc     \$0,%rdx
1388         add     %r12,%r11
1389         mov     %rdx,%r12
1390         adc     \$0,%r12
1391
1392         mulq    $m0
1393         add     %rax,%r13
1394          mov    8*6($nptr),%rax
1395         adc     \$0,%rdx
1396         add     %r13,%r12
1397         mov     %rdx,%r13
1398         adc     \$0,%r13
1399
1400         mulq    $m0
1401         add     %rax,%r14
1402          mov    8*7($nptr),%rax
1403         adc     \$0,%rdx
1404         add     %r14,%r13
1405         mov     %rdx,%r14
1406         adc     \$0,%r14
1407
1408         mulq    $m0
1409          mov    $carry,$m0              # n0*a[i]
1410         add     %rax,%r15
1411          mov    8*0($nptr),%rax         # n[0]
1412         adc     \$0,%rdx
1413         add     %r15,%r14
1414         mov     %rdx,%r15
1415         adc     \$0,%r15
1416
1417         dec     %ecx
1418         jnz     .L8x_reduce
1419
1420         lea     8*8($nptr),$nptr
1421         xor     %rax,%rax
1422         mov     8(%rsp),%rdx            # pull end of t[]
1423         cmp     0(%rsp),$nptr           # end of n[]?
1424         jae     .L8x_no_tail
1425
1426         add     8*0($tptr),%r8
1427         adc     8*1($tptr),%r9
1428         adc     8*2($tptr),%r10
1429         adc     8*3($tptr),%r11
1430         adc     8*4($tptr),%r12
1431         adc     8*5($tptr),%r13
1432         adc     8*6($tptr),%r14
1433         adc     8*7($tptr),%r15
1434         sbb     $carry,$carry           # top carry
1435
1436         mov     64+56(%rsp),$m0         # pull n0*a[0]
1437         mov     \$8,%ecx
1438         mov     8*0($nptr),%rax
1439         jmp     .L8x_tail
1440
1441 .align  32
1442 .L8x_tail:
1443         mulq    $m0
1444         add     %rax,%r8
1445          mov    8*1($nptr),%rax
1446          mov    %r8,($tptr)             # save result
1447         mov     %rdx,%r8
1448         adc     \$0,%r8
1449
1450         mulq    $m0
1451         add     %rax,%r9
1452          mov    8*2($nptr),%rax
1453         adc     \$0,%rdx
1454         add     %r9,%r8
1455          lea    8($tptr),$tptr          # $tptr++
1456         mov     %rdx,%r9
1457         adc     \$0,%r9
1458
1459         mulq    $m0
1460         add     %rax,%r10
1461          mov    8*3($nptr),%rax
1462         adc     \$0,%rdx
1463         add     %r10,%r9
1464         mov     %rdx,%r10
1465         adc     \$0,%r10
1466
1467         mulq    $m0
1468         add     %rax,%r11
1469          mov    8*4($nptr),%rax
1470         adc     \$0,%rdx
1471         add     %r11,%r10
1472         mov     %rdx,%r11
1473         adc     \$0,%r11
1474
1475         mulq    $m0
1476         add     %rax,%r12
1477          mov    8*5($nptr),%rax
1478         adc     \$0,%rdx
1479         add     %r12,%r11
1480         mov     %rdx,%r12
1481         adc     \$0,%r12
1482
1483         mulq    $m0
1484         add     %rax,%r13
1485          mov    8*6($nptr),%rax
1486         adc     \$0,%rdx
1487         add     %r13,%r12
1488         mov     %rdx,%r13
1489         adc     \$0,%r13
1490
1491         mulq    $m0
1492         add     %rax,%r14
1493          mov    8*7($nptr),%rax
1494         adc     \$0,%rdx
1495         add     %r14,%r13
1496         mov     %rdx,%r14
1497         adc     \$0,%r14
1498
1499         mulq    $m0
1500          mov    64-16(%rsp,%rcx,8),$m0  # pull n0*a[i]
1501         add     %rax,%r15
1502         adc     \$0,%rdx
1503         add     %r15,%r14
1504          mov    8*0($nptr),%rax         # pull n[0]
1505         mov     %rdx,%r15
1506         adc     \$0,%r15
1507
1508         dec     %ecx
1509         jnz     .L8x_tail
1510
1511         lea     8*8($nptr),$nptr
1512         mov     8(%rsp),%rdx            # pull end of t[]
1513         cmp     0(%rsp),$nptr           # end of n[]?
1514         jae     .L8x_tail_done          # break out of loop
1515
1516          mov    64+56(%rsp),$m0         # pull n0*a[0]
1517         neg     $carry
1518          mov    8*0($nptr),%rax         # pull n[0]
1519         adc     8*0($tptr),%r8
1520         adc     8*1($tptr),%r9
1521         adc     8*2($tptr),%r10
1522         adc     8*3($tptr),%r11
1523         adc     8*4($tptr),%r12
1524         adc     8*5($tptr),%r13
1525         adc     8*6($tptr),%r14
1526         adc     8*7($tptr),%r15
1527         sbb     $carry,$carry           # top carry
1528
1529         mov     \$8,%ecx
1530         jmp     .L8x_tail
1531
1532 .align  32
1533 .L8x_tail_done:
1534         add     (%rdx),%r8              # can this overflow?
1535         xor     %rax,%rax
1536
1537         neg     $carry
1538 .L8x_no_tail:
1539         adc     8*0($tptr),%r8
1540         adc     8*1($tptr),%r9
1541         adc     8*2($tptr),%r10
1542         adc     8*3($tptr),%r11
1543         adc     8*4($tptr),%r12
1544         adc     8*5($tptr),%r13
1545         adc     8*6($tptr),%r14
1546         adc     8*7($tptr),%r15
1547         adc     \$0,%rax                # top-most carry
1548
1549         mov     40(%rsp),$nptr          # restore $nptr
1550
1551         mov     %r8,8*0($tptr)          # store top 512 bits
1552         mov     %r9,8*1($tptr)
1553          mov    $nptr,$num              # $num is %r9, can't be moved upwards
1554         mov     %r10,8*2($tptr)
1555          sub    0(%rsp),$num            # -$num
1556         mov     %r11,8*3($tptr)
1557         mov     %r12,8*4($tptr)
1558         mov     %r13,8*5($tptr)
1559         mov     %r14,8*6($tptr)
1560         mov     %r15,8*7($tptr)
1561         lea     8*8($tptr),$tptr
1562         mov     %rax,(%rdx)             # store top-most carry
1563
1564         cmp     %rdx,$tptr              # end of t[]?
1565         jb      .L8x_reduction_loop
1566
1567         neg     $num                    # restore $num
1568 ___
1569 }\f
1570 ##############################################################
1571 # Post-condition, 4x unrolled copy from bn_mul_mont
1572 #
1573 {
# Final conditional subtraction and copy-out.
#
# The reduced value sits in the upper half of t[], i.e. at 64(%rsp)+$num,
# which is where $tptr is pointed.  .Lsqr4x_sub writes t[]-n[] to rp[]
# four words per iteration; the borrow chain lives in CF across the loop,
# which is why the loop counter is stepped with lea/dec only.  After the
# loop the word following the result (the overflow bit) minus the final
# borrow is used as a mask in @ri[0]: `and`/`not`/`or` pick tp when the
# subtraction borrowed and rp otherwise, without a branch.
#
# .Lsqr4x_copy then copies the selected vector to rp[] 32 bytes per
# iteration (an in-place refresh when rp[] itself was selected) and, in
# the same pass, overwrites both halves of the temporary vector with the
# zero held in %xmm0.
#
# $nptr reuses the register of $aptr; @ri are the four working registers.
1574 my ($tptr,$nptr)=("%rbx",$aptr);
1575 my @ri=("%rax","%rdx","%r10","%r11");
1576 $code.=<<___;
1577         mov     64(%rsp,$num),@ri[0]    # tp[0]
1578         lea     64(%rsp,$num),$tptr     # upper half of t[2*$num] holds result
1579         mov     40(%rsp),$nptr          # restore $nptr
1580         shr     \$5,$num                # num/4
1581         mov     8($tptr),@ri[1]         # t[1]
1582         xor     $i,$i                   # i=0 and clear CF!
1583
1584         mov     32(%rsp),$rptr          # restore $rptr
1585         sub     0($nptr),@ri[0]
1586         mov     16($tptr),@ri[2]        # t[2]
1587         mov     24($tptr),@ri[3]        # t[3]
1588         sbb     8($nptr),@ri[1]
1589         lea     -1($num),$j             # j=num/4-1
1590         jmp     .Lsqr4x_sub
1591 .align  32
1592 .Lsqr4x_sub:
1593         mov     @ri[0],0($rptr)         # rp[i]=tp[i]-np[i]
1594         mov     @ri[1],8($rptr)         # rp[i]=tp[i]-np[i]
1595         sbb     16($nptr,$i,8),@ri[2]
1596         mov     32($tptr,$i,8),@ri[0]   # tp[i+1]
1597         mov     40($tptr,$i,8),@ri[1]
1598         sbb     24($nptr,$i,8),@ri[3]
1599         mov     @ri[2],16($rptr)        # rp[i]=tp[i]-np[i]
1600         mov     @ri[3],24($rptr)        # rp[i]=tp[i]-np[i]
1601         lea     32($rptr),$rptr
1602         sbb     32($nptr,$i,8),@ri[0]
1603         mov     48($tptr,$i,8),@ri[2]
1604         mov     56($tptr,$i,8),@ri[3]
1605         sbb     40($nptr,$i,8),@ri[1]
1606         lea     4($i),$i                # i++
1607         dec     $j                      # doesn't affect CF!
1608         jnz     .Lsqr4x_sub
1609
1610         mov     @ri[0],0($rptr)         # rp[i]=tp[i]-np[i]
1611         mov     32($tptr,$i,8),@ri[0]   # load overflow bit
1612         sbb     16($nptr,$i,8),@ri[2]
1613         mov     @ri[1],8($rptr)         # rp[i]=tp[i]-np[i]
1614         sbb     24($nptr,$i,8),@ri[3]
1615         mov     @ri[2],16($rptr)        # rp[i]=tp[i]-np[i]
1616
1617         sbb     \$0,@ri[0]              # handle upmost overflow bit
1618         mov     @ri[3],24($rptr)        # rp[i]=tp[i]-np[i]
1619         mov     32(%rsp),$rptr          # restore $rptr
1620         xor     $i,$i                   # i=0
1621         and     @ri[0],$tptr
1622         not     @ri[0]
1623         mov     $rptr,$nptr
1624         and     @ri[0],$nptr
1625         lea     -1($num),$j
1626         or      $nptr,$tptr             # tp=borrow?tp:rp
1627
1628         pxor    %xmm0,%xmm0
1629         lea     64(%rsp,$num,8),$nptr
1630         movdqu  ($tptr),%xmm1
1631         lea     ($nptr,$num,8),$nptr
1632         movdqa  %xmm0,64(%rsp)          # zap lower half of temporary vector
1633         movdqa  %xmm0,($nptr)           # zap upper half of temporary vector
1634         movdqu  %xmm1,($rptr)
1635         jmp     .Lsqr4x_copy
1636 .align  32
1637 .Lsqr4x_copy:                           # copy or in-place refresh
1638         movdqu  16($tptr,$i),%xmm2
1639         movdqu  32($tptr,$i),%xmm1
1640         movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
1641         movdqa  %xmm0,96(%rsp,$i)       # zap lower half of temporary vector
1642         movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
1643         movdqa  %xmm0,32($nptr,$i)      # zap upper half of temporary vector
1644         movdqu  %xmm2,16($rptr,$i)
1645         movdqu  %xmm1,32($rptr,$i)
1646         lea     32($i),$i
1647         dec     $j
1648         jnz     .Lsqr4x_copy
1649
1650         movdqu  16($tptr,$i),%xmm2
1651         movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
1652         movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
1653         movdqu  %xmm2,16($rptr,$i)
1654 ___
1655 }
# Exit sequence of bn_sqr8x_mont: fetch the stack pointer value saved at
# 56(%rsp), set the return value in %rax to 1, reload %r15, %r14, %r13,
# %r12, %rbp and %rbx from the six slots at 0..40 off that pointer, and
# leave %rsp 48 bytes past it before returning.
1656 $code.=<<___;
1657         mov     56(%rsp),%rsi           # restore %rsp
1658         mov     \$1,%rax
1659         mov     0(%rsi),%r15
1660         mov     8(%rsi),%r14
1661         mov     16(%rsi),%r13
1662         mov     24(%rsi),%r12
1663         mov     32(%rsi),%rbp
1664         mov     40(%rsi),%rbx
1665         lea     48(%rsi),%rsp
1666 .Lsqr8x_epilogue:
1667         ret
1668 .size   bn_sqr8x_mont,.-bn_sqr8x_mont
1669 ___
1670 }}}
1671 \f
1672 if ($addx) {{{
1673 my $bp="%rdx";  # original value
1674
1675 $code.=<<___;
1676 .type   bn_mulx4x_mont,\@function,6
1677 .align  32
1678 bn_mulx4x_mont:
1679 .Lmulx4x_enter:
1680         push    %rbx
1681         push    %rbp
1682         push    %r12
1683         push    %r13
1684         push    %r14
1685         push    %r15
1686
1687         shl     \$3,${num}d             # convert $num to bytes
1688         xor     %r10,%r10
1689         mov     %rsp,%r11               # put aside %rsp
1690         sub     $num,%r10               # -$num
1691         mov     ($n0),$n0               # *n0
1692         lea     -72(%rsp,%r10),%rsp     # alloca(frame+$num+8)
1693         lea     ($bp,$num),%r10
1694         and     \$-128,%rsp
1695         ##############################################################
1696         # Stack layout
1697         # +0    num
1698         # +8    off-loaded &b[i]
1699         # +16   end of b[num]
1700         # +24   saved n0
1701         # +32   saved rp
1702         # +40
1703         # +48   inner counter
1704         # +56   saved %rsp
1705         # +64   tmp[num+1]
1706         #
1707         mov     $num,0(%rsp)            # save $num
1708         shr     \$5,$num
1709         mov     %r10,16(%rsp)           # end of b[num]
1710         sub     \$1,$num
1711         mov     $n0, 24(%rsp)           # save *n0
1712         mov     $rp, 32(%rsp)           # save $rp
1713         mov     $num,48(%rsp)           # inner counter
1714         mov     %r11,56(%rsp)           # save original %rsp
1715         jmp     .Lmulx4x_body
1716
1717 .align  32
1718 .Lmulx4x_body:
1719 ___
1720 my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
1721    ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
1722 my $rptr=$bptr;
1723 $code.=<<___;
1724         lea     8($bp),$bptr
1725         mov     ($bp),%rdx              # b[0], $bp==%rdx actually
1726         lea     64+32(%rsp),$tptr
1727         mov     %rdx,$bi
1728         xor     $zero,$zero             # of=0,cf=0
1729
1730         mulx    0*8($aptr),$mi,%rax     # a[0]*b[0]
1731         mulx    1*8($aptr),%r11,%r14    # a[1]*b[0]
1732         adcx    %rax,%r11
1733         mov     $bptr,8(%rsp)           # off-load &b[i]
1734         mulx    2*8($aptr),%r12,%r13    # ...
1735         adcx    %r14,%r12
1736         adcx    $zero,%r13
1737
1738         mov     $mi,$bptr               # borrow $bptr
1739         imulq   24(%rsp),$mi            # "t[0]"*n0
1740         xor     $zero,$zero             # cf=0, of=0
1741
1742         mulx    3*8($aptr),%rax,%r14
1743          mov    $mi,%rdx
1744         lea     4*8($aptr),$aptr
1745         adcx    %rax,%r13
1746         adcx    $zero,%r14              # cf=0
1747
1748         mulx    0*8($nptr),%rax,%r10
1749         adcx    %rax,$bptr              # discarded
1750         adox    %r11,%r10
1751         mulx    1*8($nptr),%rax,%r11
1752         adcx    %rax,%r10
1753         adox    %r12,%r11
1754         mulx    2*8($nptr),%rax,%r12
1755         mov     48(%rsp),$bptr          # counter value
1756         mov     %r10,-4*8($tptr)
1757         adcx    %rax,%r11
1758         adox    %r13,%r12
1759         mulx    3*8($nptr),%rax,%r15
1760          .byte  0x66,0x66
1761          mov    $bi,%rdx
1762         mov     %r11,-3*8($tptr)
1763         adcx    %rax,%r12
1764         adox    $zero,%r15              # of=0
1765         lea     4*8($nptr),$nptr
1766         mov     %r12,-2*8($tptr)
1767
1768         #jmp    .Lmulx4x_1st
1769
1770 .align  32
1771 .Lmulx4x_1st:
1772         adcx    $zero,%r15              # cf=0, modulo-scheduled
1773         mulx    0*8($aptr),%r10,%rax    # a[4]*b[0]
1774         adcx    %r14,%r10
1775         mulx    1*8($aptr),%r11,%r14    # a[5]*b[0]
1776         adcx    %rax,%r11
1777         mulx    2*8($aptr),%r12,%rax    # ...
1778         adcx    %r14,%r12
1779         mulx    3*8($aptr),%r13,%r14
1780          .byte  0x66,0x66
1781          mov    $mi,%rdx
1782         adcx    %rax,%r13
1783         adcx    $zero,%r14              # cf=0
1784         lea     4*8($aptr),$aptr
1785         lea     4*8($tptr),$tptr
1786
1787         adox    %r15,%r10
1788         mulx    0*8($nptr),%rax,%r15
1789         adcx    %rax,%r10
1790         adox    %r15,%r11
1791         mulx    1*8($nptr),%rax,%r15
1792         adcx    %rax,%r11
1793         adox    %r15,%r12
1794         mulx    2*8($nptr),%rax,%r15
1795         mov     %r10,-5*8($tptr)
1796         adcx    %rax,%r12
1797         mov     %r11,-4*8($tptr)
1798         adox    %r15,%r13
1799         mulx    3*8($nptr),%rax,%r15
1800          mov    $bi,%rdx
1801         mov     %r12,-3*8($tptr)
1802         adcx    %rax,%r13
1803         adox    $zero,%r15
1804         lea     4*8($nptr),$nptr
1805         mov     %r13,-2*8($tptr)
1806
1807         dec     $bptr                   # of=0, pass cf
1808         jnz     .Lmulx4x_1st
1809
1810         mov     0(%rsp),$num            # load num
1811         mov     8(%rsp),$bptr           # re-load &b[i]
1812         adc     $zero,%r15              # modulo-scheduled
1813         add     %r15,%r14
1814         sbb     %r15,%r15               # top-most carry
1815         mov     %r14,-1*8($tptr)
1816         jmp     .Lmulx4x_outer
1817
1818 .align  32
1819 .Lmulx4x_outer:
1820         mov     ($bptr),%rdx            # b[i]
1821         lea     8($bptr),$bptr
1822         sub     $num,$aptr              # rewind $aptr
1823         mov     %r15,($tptr)            # save top-most carry
1824         mov     64(%rsp),%r10
1825         lea     64(%rsp),$tptr
1826         sub     $num,$nptr              # rewind $nptr
1827         xor     $zero,$zero             # cf=0, of=0
1828         mov     %rdx,$bi
1829
1830         mulx    0*8($aptr),$mi,%rax     # a[0]*b[i]
1831         adox    %r10,$mi
1832         mov     1*8($tptr),%r10
1833         mulx    1*8($aptr),%r11,%r14    # a[1]*b[i]
1834         adcx    %rax,%r11
1835         mov     $bptr,8(%rsp)           # off-load &b[i]
1836         mulx    2*8($aptr),%r12,%r13    # ...
1837         adox    %r10,%r11
1838         adcx    %r14,%r12
1839         adox    $zero,%r12
1840         .byte   0x66,0x66
1841         adcx    $zero,%r13
1842         mov     2*8($tptr),%r10
1843
1844         mov     $mi,$bptr               # borrow $bptr
1845         imulq   24(%rsp),$mi            # "t[0]"*n0
1846         xor     $zero,$zero             # cf=0, of=0
1847
1848         mulx    3*8($aptr),%rax,%r14
1849          mov    $mi,%rdx
1850         adox    %r10,%r12
1851         adcx    %rax,%r13
1852         adox    3*8($tptr),%r13
1853         adcx    $zero,%r14
1854         lea     4*8($aptr),$aptr
1855         lea     4*8($tptr),$tptr
1856         adox    $zero,%r14
1857
1858         mulx    0*8($nptr),%rax,%r10
1859         adcx    %rax,$bptr              # discarded
1860         adox    %r11,%r10
1861         mulx    1*8($nptr),%rax,%r11
1862         adcx    %rax,%r10
1863         adox    %r12,%r11
1864         mulx    2*8($nptr),%rax,%r12
1865         mov     %r10,-4*8($tptr)
1866         mov     0*8($tptr),%r10
1867         adcx    %rax,%r11
1868         adox    %r13,%r12
1869         mulx    3*8($nptr),%rax,%r15
1870          mov    $bi,%rdx
1871         mov     %r11,-3*8($tptr)
1872         adcx    %rax,%r12
1873         adox    $zero,%r15              # of=0
1874         mov     48(%rsp),$bptr          # counter value
1875         .byte   0x66,0x3e
1876         mov     %r12,-2*8($tptr)
1877         lea     4*8($nptr),$nptr
1878
1879         jmp     .Lmulx4x_inner
1880
1881 .align  32
1882 .Lmulx4x_inner:
1883         adcx    $zero,%r15              # cf=0, modulo-scheduled
1884         adox    %r10,%r14
1885         mulx    0*8($aptr),%r10,%rax    # a[4]*b[i]
1886         mov     1*8($tptr),%r13
1887         adcx    %r14,%r10
1888         mulx    1*8($aptr),%r11,%r14    # a[5]*b[i]
1889         adox    %rax,%r11
1890         mulx    2*8($aptr),%r12,%rax    # ...
1891         adcx    %r13,%r11
1892         adox    %r14,%r12
1893         mulx    3*8($aptr),%r13,%r14
1894          mov    $mi,%rdx
1895         adcx    2*8($tptr),%r12
1896         adox    %rax,%r13
1897         adcx    3*8($tptr),%r13
1898         adox    $zero,%r14              # of=0
1899         .byte   0x48,0x8d,0xb6,0x20,0x00,0x00,0x00      # lea   4*8($aptr),$aptr
1900         .byte   0x48,0x8d,0x9b,0x20,0x00,0x00,0x00      # lea   4*8($tptr),$tptr
1901         adcx    $zero,%r14              # cf=0
1902
1903         adox    %r15,%r10
1904         mulx    0*8($nptr),%rax,%r15
1905         adcx    %rax,%r10
1906         adox    %r15,%r11
1907         mulx    1*8($nptr),%rax,%r15
1908         adcx    %rax,%r11
1909         adox    %r15,%r12
1910         mulx    2*8($nptr),%rax,%r15
1911         mov     %r10,-5*8($tptr)
1912         mov     0*8($tptr),%r10
1913         adcx    %rax,%r12
1914         adox    %r15,%r13
1915         mulx    3*8($nptr),%rax,%r15
1916          mov    $bi,%rdx
1917         mov     %r11,-4*8($tptr)
1918         mov     %r12,-3*8($tptr)
1919         adcx    %rax,%r13
1920         adox    $zero,%r15
1921         lea     4*8($nptr),$nptr
1922         mov     %r13,-2*8($tptr)
1923
1924         dec     $bptr                   # of=0, pass cf
1925         jnz     .Lmulx4x_inner
1926
1927         mov     0(%rsp),$num            # load num
1928         mov     8(%rsp),$bptr           # re-load &b[i]
1929         adc     $zero,%r15              # modulo-scheduled
1930         sub     %r10,$zero              # pull top-most carry
1931         adc     %r15,%r14
1932         sbb     %r15,%r15               # top-most carry
1933         mov     %r14,-1*8($tptr)
1934
1935         cmp     16(%rsp),$bptr
1936         jne     .Lmulx4x_outer
1937
1938         neg     $num                    # -$num bytes, counts up to zero below
1939         mov     32(%rsp),$rptr          # restore rp
1940         lea     64(%rsp),$tptr          # start of tmp[]
1941
1942         xor     %rdx,%rdx               # carry passed between passes starts as 0
1943         pxor    %xmm0,%xmm0             # zero used to wipe tmp[]
1944         mov     0*8($nptr,$num),%r8
1945         mov     1*8($nptr,$num),%r9
1946         neg     %r8                     # first limb only: -n[0] rather than ~n[0]
1947         jmp     .Lmulx4x_sub_entry
1948
1949 .align  32
1950 .Lmulx4x_sub:                           # rp[] = tmp[] + (negated n[] & %r15), 4 limbs per pass
1951         mov     0*8($nptr,$num),%r8
1952         mov     1*8($nptr,$num),%r9
1953         not     %r8
1954 .Lmulx4x_sub_entry:                     # first pass enters here with %r8 = -n[0]
1955         mov     2*8($nptr,$num),%r10
1956         not     %r9
1957         and     %r15,%r8                # %r15 is the 0 or all-ones mask left by sbb
1958         mov     3*8($nptr,$num),%r11
1959         not     %r10
1960         and     %r15,%r9
1961         not     %r11
1962         and     %r15,%r10
1963         and     %r15,%r11
1964
1965         neg     %rdx                    # mov %rdx,%cf
1966         adc     0*8($tptr),%r8
1967         adc     1*8($tptr),%r9
1968         movdqa  %xmm0,($tptr)           # wipe tmp[] limbs already consumed
1969         adc     2*8($tptr),%r10
1970         adc     3*8($tptr),%r11
1971         movdqa  %xmm0,16($tptr)
1972         lea     4*8($tptr),$tptr
1973         sbb     %rdx,%rdx               # mov %cf,%rdx
1974
1975         mov     %r8,0*8($rptr)          # store result limbs to rp[]
1976         mov     %r9,1*8($rptr)
1977         mov     %r10,2*8($rptr)
1978         mov     %r11,3*8($rptr)
1979         lea     4*8($rptr),$rptr
1980
1981         add     \$32,$num               # 32 bytes done; loop ends when $num reaches zero
1982         jnz     .Lmulx4x_sub
1983
1984         mov     56(%rsp),%rsi           # restore %rsp
1985         mov     \$1,%rax
1986         mov     (%rsi),%r15
1987         mov     8(%rsi),%r14
1988         mov     16(%rsi),%r13
1989         mov     24(%rsi),%r12
1990         mov     32(%rsi),%rbp
1991         mov     40(%rsi),%rbx
1992         lea     48(%rsi),%rsp
1993 .Lmulx4x_epilogue:
1994         ret
1995 .size   bn_mulx4x_mont,.-bn_mulx4x_mont
1996 ___
1997 }\f{
1998 ######################################################################
1999 # void bn_sqrx8x_mont(
2000 my $rptr="%rdi";        # const BN_ULONG *rptr,
2001 my $aptr="%rsi";        # const BN_ULONG *aptr,
2002 my $bptr="%rdx";        # not used
2003 my $nptr="%rcx";        # const BN_ULONG *nptr,
2004 my $n0  ="%r8";         # const BN_ULONG *n0);
2005 my $num ="%r9";         # int num, has to be divisible by 8
2006
2007 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
2008 my @A0=("%r10","%r11");
2009 my @A1=("%r12","%r13");
2010 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2011
2012 $code.=<<___;
2013 .type   bn_sqrx8x_mont,\@function,6
2014 .align  32
2015 bn_sqrx8x_mont:
2016 .Lsqrx8x_enter:
2017         push    %rbx
2018         push    %rbp
2019         push    %r12
2020         push    %r13
2021         push    %r14
2022         push    %r15
2023
2024         shl     \$3,${num}d             # convert $num to bytes
2025         xor     %r10,%r10
2026         mov     %rsp,%r11               # put aside %rsp
2027         sub     $num,%r10               # -$num
2028         mov     ($n0),$n0               # *n0
2029         lea     -64(%rsp,%r10,2),%rsp   # alloca(frame+2*$num)
2030         and     \$-1024,%rsp            # minimize TLB usage
2031         ##############################################################
2032         # Stack layout
2033         #
2034         # +0    saved $num in squaring part; end of n[] in reduction section
2035         # +8    end of a[] in squaring part; end of t[] in reduction section
2036         # +16   intermediate carry bit
2037         # +24   initial $tptr in squaring part; top-most carry bit in reduction section
2038         # +32   saved *n0
2039         # +48   t[2*$num]
2040         #
2041         movq    $rptr,%xmm1             # save $rptr
2042         movq    $nptr,%xmm2             # save $nptr
2043         movq    %r10, %xmm3             # -$num
2044         movq    %r11, %xmm4             # save original %rsp
2045         mov     $n0,  32(%rsp)
2046 .Lsqrx8x_body:
2047         ##################################################################
2048         # Squaring part:
2049         #
2050         # a) multiply-n-add everything but a[i]*a[i];
2051         # b) shift result of a) by 1 to the left and accumulate
2052         #    a[i]*a[i] products;
2053         #
2054         ##################################################################
2055         # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2056         #                                                     a[1]a[0]
2057         #                                                 a[2]a[0]
2058         #                                             a[3]a[0]
2059         #                                             a[2]a[1]
2060         #                                         a[3]a[1]
2061         #                                     a[3]a[2]
2062         #
2063         #                                         a[4]a[0]
2064         #                                     a[5]a[0]
2065         #                                 a[6]a[0]
2066         #                             a[7]a[0]
2067         #                                     a[4]a[1]
2068         #                                 a[5]a[1]
2069         #                             a[6]a[1]
2070         #                         a[7]a[1]
2071         #                                 a[4]a[2]
2072         #                             a[5]a[2]
2073         #                         a[6]a[2]
2074         #                     a[7]a[2]
2075         #                             a[4]a[3]
2076         #                         a[5]a[3]
2077         #                     a[6]a[3]
2078         #                 a[7]a[3]
2079         #
2080         #                     a[5]a[4]
2081         #                 a[6]a[4]
2082         #             a[7]a[4]
2083         #             a[6]a[5]
2084         #         a[7]a[5]
2085         #     a[7]a[6]
2086         # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2087 ___
2088 {
2089 my ($zero,$carry)=("%rbp","%rcx");
2090 my $aaptr=$zero;
2091 $code.=<<___;
2092         pxor    %xmm0,%xmm0
2093         lea     48(%rsp),$tptr
2094         lea     ($aptr,$num),$aaptr
2095         mov     $num,(%rsp)             # save $num
2096         mov     $aaptr,8(%rsp)          # save end of $aptr
2097         jmp     .Lsqr8x_zero_start
2098
2099 .Lsqrx8x_zero:                          # clear t[] with zeroed %xmm0, 128 bytes per full pass
2100         movdqa  %xmm0,0*8($tptr)
2101         movdqa  %xmm0,2*8($tptr)
2102         movdqa  %xmm0,4*8($tptr)
2103         movdqa  %xmm0,6*8($tptr)
2104 .Lsqr8x_zero_start:                     # first pass enters here, so it clears only the upper 64 bytes
2105         movdqa  %xmm0,8*8($tptr)
2106         movdqa  %xmm0,10*8($tptr)
2107         movdqa  %xmm0,12*8($tptr)
2108         movdqa  %xmm0,14*8($tptr)
2109         lea     16*8($tptr),$tptr       # advance by 128 bytes
2110         sub     \$64,$num               # $num is in bytes, 64 consumed per pass
2111         jnz     .Lsqrx8x_zero
2112
2113         mov     0*8($aptr),%rdx         # a[0], modulo-scheduled
2114         xor     %r8,%r8                 # %r8-%r14 start out as zero accumulators
2115         xor     %r9,%r9
2116         xor     %r10,%r10
2117         xor     %r11,%r11
2118         xor     %r12,%r12
2119         xor     %r13,%r13
2120         xor     %r14,%r14
2121         lea     48(%rsp),$tptr          # rewind $tptr to start of t[]
2122         xor     $zero,$zero             # cf=0, of=0
2123         jmp     .Lsqrx8x_outer_loop
2124
2125 .align  32
2126 .Lsqrx8x_outer_loop:
2127         mulx    1*8($aptr),%rax,%rbx    # a[1]*a[0]
2128         adcx    %rax,%r8                # a[1]*a[0]+=t[1]
2129         adox    %rbx,%r9
2130         mulx    2*8($aptr),%rax,%rbx    # a[2]*a[0]
2131         adcx    %rax,%r9
2132         adox    %rbx,%r10
2133         .byte   0xc4,0xe2,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00    # mulx  3*8($aptr),%rax,%rbx    # ...
2134         adcx    %rax,%r10
2135         adox    %rbx,%r11
2136         .byte   0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00    # mulx  4*8($aptr),%rax,%rbx
2137         adcx    %rax,%r11
2138         adox    %rbx,%r12
2139         mulx    5*8($aptr),%rax,%rbx
2140         adcx    %rax,%r12
2141         adox    %rbx,%r13
2142         mulx    6*8($aptr),%rax,%rbx
2143         adcx    %rax,%r13
2144         adox    %rbx,%r14
2145         mulx    7*8($aptr),%rax,%r15
2146          mov    1*8($aptr),%rdx         # a[1]
2147         adcx    %rax,%r14
2148         adox    $zero,%r15
2149         adc     8*8($tptr),%r15
2150         sbb     $carry,$carry           # mov %cf,$carry
2151         xor     $zero,$zero             # cf=0, of=0
2152
2153         mov     %r8,1*8($tptr)          # t[1]
2154         mov     %r9,2*8($tptr)          # t[2]
2155
2156         mulx    2*8($aptr),%r8,%rbx     # a[2]*a[1]
2157         mulx    3*8($aptr),%r9,%rax     # a[3]*a[1]
2158         adcx    %r10,%r8
2159         adox    %rbx,%r9
2160         mulx    4*8($aptr),%r10,%rbx    # ...
2161         adcx    %r11,%r9
2162         adox    %rax,%r10
2163         .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00    # mulx  5*8($aptr),%r11,%rax
2164         adcx    %r12,%r10
2165         adox    %rbx,%r11
2166         .byte   0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00    # mulx  6*8($aptr),%r12,%rbx
2167         adcx    %r13,%r11
2168         adox    %r14,%r12
2169         .byte   0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00    # mulx  7*8($aptr),%r13,%r14
2170          mov    2*8($aptr),%rdx         # a[2]
2171         adcx    %rax,%r12
2172         adox    %rbx,%r13
2173         adcx    %r15,%r13
2174         adox    $zero,%r14              # of=0
2175         adcx    $zero,%r14              # cf=0
2176
2177         mov     %r8,3*8($tptr)          # t[3]
2178         mov     %r9,4*8($tptr)          # t[4]
2179
2180         mulx    3*8($aptr),%r8,%rbx     # a[3]*a[2]
2181         mulx    4*8($aptr),%r9,%rax     # a[4]*a[2]
2182         adcx    %r10,%r8
2183         adox    %rbx,%r9
2184         mulx    5*8($aptr),%r10,%rbx    # ...
2185         adcx    %r11,%r9
2186         adox    %rax,%r10
2187         .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00    # mulx  6*8($aptr),%r11,%rax
2188         adcx    %r12,%r10
2189         adox    %r13,%r11
2190         .byte   0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00    # mulx  7*8($aptr),%r12,%r13
2191         .byte   0x3e
2192          mov    3*8($aptr),%rdx         # a[3]
2193         adcx    %rbx,%r11
2194         adox    %rax,%r12
2195         adcx    %r14,%r12
2196         adox    $zero,%r13              # of=0
2197         adcx    $zero,%r13              # cf=0
2198
2199         mov     %r8,5*8($tptr)          # t[5]
2200         mov     %r9,6*8($tptr)          # t[6]
2201
2202         mulx    4*8($aptr),%r8,%rax     # a[4]*a[3]
2203         mulx    5*8($aptr),%r9,%rbx     # a[5]*a[3]
2204         adcx    %r10,%r8
2205         adox    %rax,%r9
2206         mulx    6*8($aptr),%r10,%rax    # ...
2207         adcx    %r11,%r9
2208         adox    %r12,%r10
2209         mulx    7*8($aptr),%r11,%r12
2210          mov    4*8($aptr),%rdx         # a[4]
2211          mov    5*8($aptr),%r14         # a[5]
2212         adcx    %rbx,%r10
2213         adox    %rax,%r11
2214          mov    6*8($aptr),%r15         # a[6]
2215         adcx    %r13,%r11
2216         adox    $zero,%r12              # of=0
2217         adcx    $zero,%r12              # cf=0
2218
2219         mov     %r8,7*8($tptr)          # t[7]
2220         mov     %r9,8*8($tptr)          # t[8]
2221
2222         mulx    %r14,%r9,%rax           # a[5]*a[4]
2223          mov    7*8($aptr),%r8          # a[7]
2224         adcx    %r10,%r9
2225         mulx    %r15,%r10,%rbx          # a[6]*a[4]
2226         adox    %rax,%r10
2227         adcx    %r11,%r10
2228         mulx    %r8,%r11,%rax           # a[7]*a[4]
2229          mov    %r14,%rdx               # a[5]
2230         adox    %rbx,%r11
2231         adcx    %r12,%r11
2232         #adox   $zero,%rax              # of=0
2233         adcx    $zero,%rax              # cf=0
2234
2235         mulx    %r15,%r14,%rbx          # a[6]*a[5]
2236         mulx    %r8,%r12,%r13           # a[7]*a[5]
2237          mov    %r15,%rdx               # a[6]
2238          lea    8*8($aptr),$aptr
2239         adcx    %r14,%r11
2240         adox    %rbx,%r12
2241         adcx    %rax,%r12
2242         .byte   0x66,0x66
2243         adox    $zero,%r13
2244
2245         mulx    %r8,%r8,%r14            # a[7]*a[6]
2246         adcx    %r8,%r13
2247         adcx    $zero,%r14
2248
2249         cmp     8(%rsp),$aptr
2250         je      .Lsqrx8x_outer_break
2251
2252         neg     $carry                  # mov $carry,%cf
2253         mov     $zero,%r15
2254         mov     8*8($tptr),%r8
2255         adc     9*8($tptr),%r9          # +=t[9]
2256         adc     10*8($tptr),%r10        # ...
2257         adc     11*8($tptr),%r11
2258         adc     12*8($tptr),%r12
2259         adc     13*8($tptr),%r13
2260         adc     14*8($tptr),%r14
2261         adc     15*8($tptr),%r15
2262         lea     8*8($tptr),$tptr
2263         sbb     $carry,$carry           # mov %cf,$carry
2264
2265         mov     -64($aptr),%rdx         # a[0]
2266         lea     ($aptr),$aaptr
2267         mov     $carry,16(%rsp)         # offload $carry
2268         mov     $tptr,24(%rsp)
2269
2270         lea     8*8($tptr),$tptr
2271         xor     %eax,%eax               # cf=0, of=0
2272         mov     \$-8,%rcx
2273         jmp     .Lsqrx8x_loop
2274
2275 .align  32
2276 .Lsqrx8x_loop:
2277         mov     %r8,%rbx
2278         mulx    0*8($aaptr),%rax,%r8    # a[8]*a[i]
2279         adcx    %rax,%rbx               # +=t[8]
2280         adox    %r9,%r8
2281
2282         mulx    1*8($aaptr),%rax,%r9    # ...
2283         adcx    %rax,%r8
2284         adox    %r10,%r9
2285
2286         mulx    2*8($aaptr),%rax,%r10
2287         adcx    %rax,%r9
2288         adox    %r11,%r10
2289
2290         mulx    3*8($aaptr),%rax,%r11
2291         adcx    %rax,%r10
2292         adox    %r12,%r11
2293
2294         .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00    # mulx  4*8($aaptr),%rax,%r12
2295         adcx    %rax,%r11
2296         adox    %r13,%r12
2297
2298         mulx    5*8($aaptr),%rax,%r13
2299         adcx    %rax,%r12
2300         adox    %r14,%r13
2301
2302         mulx    6*8($aaptr),%rax,%r14
2303          mov    %rbx,($tptr,%rcx,8)     # store t[8+i]
2304          mov    \$0,%ebx
2305         adcx    %rax,%r13
2306         adox    %r15,%r14
2307
2308         .byte   0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00    # mulx  7*8($aaptr),%rax,%r15
2309          mov    8($aptr,%rcx,8),%rdx    # a[i]
2310         adcx    %rax,%r14
2311         adox    %rbx,%r15               # %rbx is 0, of=0
2312         adcx    %rbx,%r15               # cf=0
2313
2314         inc     %rcx                    # of=0
2315         jnz     .Lsqrx8x_loop
2316
2317         lea     8*8($aaptr),$aaptr
2318         cmp     8(%rsp),$aaptr          # done?
2319         je      .Lsqrx8x_break
2320
2321         sub     16(%rsp),%rbx           # mov 16(%rsp),%cf
2322         mov     -64($aptr),%rdx
2323         adc     0*8($tptr),%r8
2324         adc     1*8($tptr),%r9
2325         adc     2*8($tptr),%r10
2326         adc     3*8($tptr),%r11
2327         adc     4*8($tptr),%r12
2328         adc     5*8($tptr),%r13
2329         adc     6*8($tptr),%r14
2330         adc     7*8($tptr),%r15
2331         lea     8*8($tptr),$tptr
2332         sbb     %rbx,%rbx               # mov %cf,%rbx
2333         xor     %eax,%eax               # cf=0, of=0
2334         mov     %rbx,16(%rsp)           # offload carry
2335         mov     \$-8,%rcx
2336         jmp     .Lsqrx8x_loop
2337
2338 .align  32
2339 .Lsqrx8x_break:                         # flush current window, set up next outer iteration
2340         sub     16(%rsp),%r8            # consume last carry (stored as 0 or -1)
2341         mov     24(%rsp),$aaptr         # initial $tptr
2342         mov     0*8($aptr),%rdx         # a[8], modulo-scheduled
2343         mov     %r8,0*8($tptr)
2344         lea     8*8($aaptr),$aaptr      # next window is 8 limbs further
2345         mov     %r9,1*8($tptr)
2346          mov    1*8($aaptr),%r8         # potentially forwarded store
2347         mov     %r10,2*8($tptr)
2348          mov    2*8($aaptr),%r9         # ...
2349         mov     %r11,3*8($tptr)
2350          mov    3*8($aaptr),%r10
2351         mov     %r12,4*8($tptr)
2352          mov    4*8($aaptr),%r11
2353         mov     %r13,5*8($tptr)
2354          mov    5*8($aaptr),%r12
2355         mov     %r14,6*8($tptr)
2356          mov    6*8($aaptr),%r13
2357         mov     %r15,7*8($tptr)
2358          mov    7*8($aaptr),%r14
2359         mov     $aaptr,$tptr            # $tptr = new window, copied before the register is cleared
2360         xor     $zero,$zero             # cf=0, of=0; $zero shares its register with $aaptr
2361         jmp     .Lsqrx8x_outer_loop
2362
2363 .align  32
2364 .Lsqrx8x_outer_break:
2365         mov     %r9,9*8($tptr)          # t[9]
2366          movq   %xmm3,%rcx              # -$num
2367         mov     %r10,10*8($tptr)        # ...
2368         mov     %r11,11*8($tptr)
2369         mov     %r12,12*8($tptr)
2370         mov     %r13,13*8($tptr)
2371         mov     %r14,14*8($tptr)
2372 ___
2373 }\f{
2374 my $i="%rcx";
2375 $code.=<<___;
2376         mov     (%rsp),$num             # restore $num
2377
2378         lea     48(%rsp),$tptr
2379         mov     ($aptr,$i),%rdx         # a[0]
2380
2381         mov     8($tptr),$A0[1]         # t[1]
2382         xor     $A0[0],$A0[0]           # t[0], of=0, cf=0
2383         adox    $A0[1],$A0[1]
2384          mov    16($tptr),$A1[0]        # t[2]  # prefetch
2385          mov    24($tptr),$A1[1]        # t[3]  # prefetch
2386         nop
2387         #jmp    .Lsqrx4x_shift_n_add    # happens to be aligned
2388
2389 .align  32
2390 .Lsqrx4x_shift_n_add:
2391         mulx    %rdx,%rax,%rbx
2392          adox   $A1[0],$A1[0]
2393         adcx    $A0[0],%rax
2394          .byte  0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov   8($aptr,$i),%rdx        # a[i+1]        # prefetch
2395          .byte  0x4c,0x8b,0x97,0x20,0x00,0x00,0x00      # mov   32($tptr),$A0[0]        # t[2*i+4]      # prefetch
2396          adox   $A1[1],$A1[1]
2397         adcx    $A0[1],%rbx
2398          mov    40($tptr),$A0[1]                # t[2*i+4+1]    # prefetch
2399         mov     %rax,0($tptr)
2400         mov     %rbx,8($tptr)
2401
2402         mulx    %rdx,%rax,%rbx
2403          adox   $A0[0],$A0[0]
2404         adcx    $A1[0],%rax
2405          mov    16($aptr,$i),%rdx       # a[i+2]        # prefetch
2406          mov    48($tptr),$A1[0]        # t[2*i+6]      # prefetch
2407          adox   $A0[1],$A0[1]
2408         adcx    $A1[1],%rbx
2409          mov    56($tptr),$A1[1]        # t[2*i+6+1]    # prefetch
2410         mov     %rax,16($tptr)
2411         mov     %rbx,24($tptr)
2412
2413         mulx    %rdx,%rax,%rbx
2414          adox   $A1[0],$A1[0]
2415         adcx    $A0[0],%rax
2416          mov    24($aptr,$i),%rdx       # a[i+3]        # prefetch
2417          lea    32($i),$i
2418          mov    64($tptr),$A0[0]        # t[2*i+8]      # prefetch
2419          adox   $A1[1],$A1[1]
2420         adcx    $A0[1],%rbx
2421          mov    72($tptr),$A0[1]        # t[2*i+8+1]    # prefetch
2422         mov     %rax,32($tptr)
2423         mov     %rbx,40($tptr)
2424
2425         mulx    %rdx,%rax,%rbx
2426          adox   $A0[0],$A0[0]
2427         adcx    $A1[0],%rax
2428         jrcxz   .Lsqrx4x_shift_n_add_break
2429          .byte  0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov   0($aptr,$i),%rdx        # a[i+4]        # prefetch
2430          adox   $A0[1],$A0[1]
2431         adcx    $A1[1],%rbx
2432          mov    80($tptr),$A1[0]        # t[2*i+10]     # prefetch
2433          mov    88($tptr),$A1[1]        # t[2*i+10+1]   # prefetch
2434         mov     %rax,48($tptr)
2435         mov     %rbx,56($tptr)
2436         lea     64($tptr),$tptr
2437         nop
2438         jmp     .Lsqrx4x_shift_n_add
2439
2440 .align  32
2441 .Lsqrx4x_shift_n_add_break:
2442         adcx    $A1[1],%rbx
2443         .byte   0x48,0x89,0x87,0x30,0x00,0x00,0x00      # mov   %rax,48($tptr)
2444         .byte   0x48,0x89,0x9f,0x38,0x00,0x00,0x00      # mov   %rbx,56($tptr)
2445         .byte   0x48,0x8d,0xbf,0x40,0x00,0x00,0x00      # lea   64($tptr),$tptr
2446 ___
2447 }\f
2448 ######################################################################
2449 # Montgomery reduction part, "word-by-word" algorithm.
2450 #
2451 # This new path is inspired by multiple submissions from Intel, by
2452 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
2453 # Vinodh Gopal...
2454 {
2455 my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
2456
2457 $code.=<<___;
2458         movq    %xmm2,$nptr
2459         mov     32(%rsp),%rbx           # n0
2460         mov     48(%rsp),%rdx           # "%r8", 8*0($tptr)
2461         lea     ($nptr,$num),%rax       # end of n[]
2462         #lea    48(%rsp,$num,2),$tptr   # end of t[] buffer
2463         mov     %rax, 0(%rsp)           # save end of n[]
2464         mov     $tptr,8(%rsp)           # save end of t[]
2465
2466         lea     48(%rsp),$tptr          # initial t[] window
2467         xor     %rax,%rax
2468         nop
2469         #jmp    .Lsqrx8x_reduction_loop
2470
2471 .align  32
2472 .Lsqrx8x_reduction_loop:
2473         mov     8*1($tptr),%r9
2474         mov     8*2($tptr),%r10
2475         mov     8*3($tptr),%r11
2476         mov     8*4($tptr),%r12
2477         mov     %rdx,%r8
2478         imulq   %rbx,%rdx               # n0*a[i]
2479         mov     8*5($tptr),%r13
2480         mov     8*6($tptr),%r14
2481         mov     8*7($tptr),%r15
2482         mov     %rax,24(%rsp)           # store top-most carry bit
2483
2484         lea     8*8($tptr),$tptr
2485         xor     $carry,$carry           # cf=0,of=0
2486         mov     \$-8,%rcx
2487         jmp     .Lsqrx8x_reduce
2488
2489 .align  32
2490 .Lsqrx8x_reduce:
2491         mov     %r8, %rbx
2492         mulx    8*0($nptr),%rax,%r8     # n[0]
2493         adcx    %rbx,%rax               # discarded
2494         adox    %r9,%r8
2495
2496         mulx    8*1($nptr),%rbx,%r9     # n[1]
2497         adcx    %rbx,%r8
2498         adox    %r10,%r9
2499
2500         mulx    8*2($nptr),%rbx,%r10
2501         adcx    %rbx,%r9
2502         adox    %r11,%r10
2503
2504         mulx    8*3($nptr),%rbx,%r11
2505         adcx    %rbx,%r10
2506         adox    %r12,%r11
2507
2508         .byte   0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00    # mulx  8*4($nptr),%rbx,%r12
2509          mov    %rdx,%rax
2510          mov    %r8,%rdx
2511         adcx    %rbx,%r11
2512         adox    %r13,%r12
2513
2514          mulx   32(%rsp),%rbx,%rdx      # %rdx discarded
2515          mov    %rax,%rdx
2516          mov    %rax,48+64(%rsp,%rcx,8) # put aside n0*a[i]
2517
2518         mulx    8*5($nptr),%rax,%r13
2519         adcx    %rax,%r12
2520         adox    %r14,%r13
2521
2522         mulx    8*6($nptr),%rax,%r14
2523         adcx    %rax,%r13
2524         adox    %r15,%r14
2525
2526         mulx    8*7($nptr),%rax,%r15
2527          mov    %rbx,%rdx
2528         adcx    %rax,%r14
2529         adox    $carry,%r15             # $carry is 0
2530         adcx    $carry,%r15             # cf=0
2531
2532         inc     %rcx                    # of=0
2533         jnz     .Lsqrx8x_reduce
2534
2535         lea     8*8($nptr),$nptr
2536         xor     %rax,%rax
2537         cmp     0(%rsp),$nptr           # end of n[]?
2538         jae     .Lsqrx8x_no_tail
2539
2540         mov     48(%rsp),%rdx           # pull n0*a[0]
2541         add     8*0($tptr),%r8
2542         adcx    8*1($tptr),%r9
2543         adcx    8*2($tptr),%r10
2544         adcx    8*3($tptr),%r11
2545         adcx    8*4($tptr),%r12
2546         adcx    8*5($tptr),%r13
2547         adcx    8*6($tptr),%r14
2548         adcx    8*7($tptr),%r15
2549         lea     8*8($tptr),$tptr
2550         sbb     $carry,$carry           # top carry
2551
2552         mov     \$-8,%rcx
2553         mov     $carry,16(%rsp)
2554         xor     $carry,$carry           # of=0, cf=0
2555         jmp     .Lsqrx8x_tail
2556
2557 .align  32
2558 .Lsqrx8x_tail:
2559         mov     %r8,%rbx
2560         mulx    8*0($nptr),%rax,%r8
2561         adcx    %rax,%rbx
2562         adox    %r9,%r8
2563
2564         mulx    8*1($nptr),%rax,%r9
2565         adcx    %rax,%r8
2566         adox    %r10,%r9
2567
2568         mulx    8*2($nptr),%rax,%r10
2569         adcx    %rax,%r9
2570         adox    %r11,%r10
2571
2572         mulx    8*3($nptr),%rax,%r11
2573         adcx    %rax,%r10
2574         adox    %r12,%r11
2575
2576         .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00    # mulx  8*4($nptr),%rax,%r12
2577         adcx    %rax,%r11
2578         adox    %r13,%r12
2579
2580         mulx    8*5($nptr),%rax,%r13
2581         adcx    %rax,%r12
2582         adox    %r14,%r13
2583
2584         mulx    8*6($nptr),%rax,%r14
2585         adcx    %rax,%r13
2586         adox    %r15,%r14
2587
2588         mulx    8*7($nptr),%rax,%r15
2589          mov    48+72(%rsp,%rcx,8),%rdx # pull n0*a[i]
2590         adcx    %rax,%r14
2591         .byte   0x66
2592         adox    $carry,%r15
2593          mov    %rbx,($tptr,%rcx,8)     # save result
2594          mov    %r8,%rbx
2595         adcx    $carry,%r15             # cf=0
2596
2597         inc     %rcx                    # of=0
2598         jnz     .Lsqrx8x_tail
2599
2600         lea     8*8($nptr),$nptr
2601         cmp     0(%rsp),$nptr           # end of n[]?
2602         jae     .Lsqrx8x_tail_done      # break out of loop
2603
2604         sub     16(%rsp),$carry         # neg   $carry
2605          mov    48(%rsp),%rdx           # pull n0*a[0]
2606         adcx    8*0($tptr),%r8
2607         adcx    8*1($tptr),%r9
2608         adcx    8*2($tptr),%r10
2609         adcx    8*3($tptr),%r11
2610         adcx    8*4($tptr),%r12
2611         adcx    8*5($tptr),%r13
2612         adcx    8*6($tptr),%r14
2613         adcx    8*7($tptr),%r15
2614         lea     8*8($tptr),$tptr
2615         sbb     $carry,$carry
2616
2617         mov     \$-8,%rcx
2618         mov     $carry,16(%rsp)
2619         xor     $carry,$carry           # of=0, cf=0
2620         jmp     .Lsqrx8x_tail
2621
2622 .align  32
2623 .Lsqrx8x_tail_done:
2624         add     24(%rsp),%r8            # can this overflow?
2625         xor     %rax,%rax
2626
2627         sub     16(%rsp),$carry         # neg $carry
2628 .Lsqrx8x_no_tail:                       # carry flag is 0
2629         adc     8*0($tptr),%r8
2630          movq   %xmm3,%rcx
2631         adc     8*1($tptr),%r9
2632          movq   %xmm2,$nptr             # restore $nptr
2633         adc     8*2($tptr),%r10
2634          lea    8*8($tptr),$carry       # borrow $carry
2635         adc     8*3($tptr),%r11
2636         adc     8*4($tptr),%r12
2637         adc     8*5($tptr),%r13
2638         adc     8*6($tptr),%r14
2639         adc     8*7($tptr),%r15
2640         adc     %rax,%rax               # top-most carry
2641
2642         cmp     8(%rsp),$carry          # end of t[]?
2643         mov     32(%rsp),%rbx           # n0
2644         mov     8*8($tptr,%rcx),%rdx    # modulo-scheduled "%r8"
2645
2646         lea     8*8($tptr,%rcx),$tptr   # start of current t[] window
2647         mov     %r8,-8*8($carry)        # store top 512 bits
2648         mov     %r9,-8*7($carry)
2649         mov     %r10,-8*6($carry)
2650         mov     %r11,-8*5($carry)
2651         mov     %r12,-8*4($carry)
2652         mov     %r13,-8*3($carry)
2653         mov     %r14,-8*2($carry)
2654         mov     %r15,-8*1($carry)
2655
2656         jb      .Lsqrx8x_reduction_loop
2657
2658         mov     %rcx,$num
2659         neg     $num                    # restore $num
2660 ___
2661 }\f
2662 ##############################################################
2663 # Post-condition, 8x unrolled: every pass adds eight words of the masked, negated n[]
2664 # to the upper half of t[], stores the sums through $rptr and overwrites t[] with %xmm0.
2665 {
2666 my ($rptr,$nptr,$lptr,$i)=($aptr,"%rbp","%rbx","%rcx");   # $rptr takes over $aptr's register; $i counts up to zero
2669 $code.=<<___;
2670         lea     ($nptr,$num),$nptr      # end of $nptr
2671         lea     48(%rsp,$num),$lptr     # end of lower half of t[2*num]
2672         lea     48(%rsp,$num),$tptr     # same address, start of upper half
2673         neg     %rax                    # top-most carry as mask
2674         xor     %rdx,%rdx               # carry kept between passes, 0 at start
2675         movq    %xmm1,$rptr             # restore $rptr
2676
2677         mov     0*8($nptr,$i),%r8
2678         mov     1*8($nptr,$i),%r9
2679         neg     %r8                     # lowest word is negated, all others inverted
2680         jmp     .Lsqrx8x_sub_entry
2681
2682 .align  32
2683 .Lsqrx8x_sub:
2684         mov     0*8($nptr,$i),%r8
2685         mov     1*8($nptr,$i),%r9
2686         not     %r8
2687 .Lsqrx8x_sub_entry:
2688         mov     2*8($nptr,$i),%r10
2689         not     %r9
2690         and     %rax,%r8                # mask keeps the word or turns it into zero
2691         mov     3*8($nptr,$i),%r11
2692         not     %r10
2693         and     %rax,%r9
2694         mov     4*8($nptr,$i),%r12
2695         not     %r11
2696         and     %rax,%r10
2697         mov     5*8($nptr,$i),%r13
2698         not     %r12
2699         and     %rax,%r11
2700         mov     6*8($nptr,$i),%r14
2701         not     %r13
2702         and     %rax,%r12
2703         mov     7*8($nptr,$i),%r15
2704         not     %r14
2705         and     %rax,%r13
2706         movdqa  %xmm0,0*8($lptr,$i)     # zap lower half
2707         not     %r15
2708         and     %rax,%r14
2709         movdqa  %xmm0,2*8($lptr,$i)
2710         and     %rax,%r15
2711
2712         neg     %rdx                    # mov %rdx,%cf
2713         movdqa  %xmm0,4*8($lptr,$i)
2714         adc     0*8($tptr),%r8
2715         adc     1*8($tptr),%r9
2716         movdqa  %xmm0,6*8($lptr,$i)
2717         adc     2*8($tptr),%r10
2718         adc     3*8($tptr),%r11
2719         movdqa  %xmm0,0*8($tptr)        # zap upper half
2720         adc     4*8($tptr),%r12
2721         adc     5*8($tptr),%r13
2722         movdqa  %xmm0,2*8($tptr)
2723         adc     6*8($tptr),%r14
2724         adc     7*8($tptr),%r15
2725         movdqa  %xmm0,4*8($tptr)
2726         sbb     %rdx,%rdx               # mov %cf,%rdx
2727         movdqa  %xmm0,6*8($tptr)
2728         lea     8*8($tptr),$tptr
2729
2730         mov     %r8,0*8($rptr)          # store eight result words
2731         mov     %r9,1*8($rptr)
2732         mov     %r10,2*8($rptr)
2733         mov     %r11,3*8($rptr)
2734         mov     %r12,4*8($rptr)
2735         mov     %r13,5*8($rptr)
2736         mov     %r14,6*8($rptr)
2737         mov     %r15,7*8($rptr)
2738         lea     8*8($rptr),$rptr
2739
2740         add     \$64,$i                 # next eight words, done when index hits zero
2741         jnz     .Lsqrx8x_sub
2742 ___
2743 }
2744 $code.=<<___;   # bn_sqrx8x_mont epilogue: reload saved registers, reset %rsp, return
2745         movq    %xmm4,%rsi              # pull saved stack pointer from %xmm4
2746         mov     \$1,%rax                # %rax is 1 on return
2747         mov     0(%rsi),%r15            # reload the six registers kept at 0..40(%rsi)
2748         mov     8(%rsi),%r14
2749         mov     16(%rsi),%r13
2750         mov     24(%rsi),%r12
2751         mov     32(%rsi),%rbp
2752         mov     40(%rsi),%rbx
2753         lea     48(%rsi),%rsp           # step %rsp past the 48-byte save area
2754 .Lsqrx8x_epilogue:
2755         ret
2756 .size   bn_sqrx8x_mont,.-bn_sqrx8x_mont
2757 ___
2758 }}}
2759 $code.=<<___;   # append the identification string and a 16-byte alignment directive
2760 .asciz  "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2761 .align  16
2762 ___
2763
2764 # Win64 SEH support. EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2765 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
2766 if ($win64) {   # handlers and .pdata/.xdata tables are emitted for Win64 targets only
2767 $rec="%rcx";    # 1st handler argument
2768 $frame="%rdx";  # 2nd handler argument
2769 $context="%r8"; # 3rd handler argument
2770 $disp="%r9";    # 4th handler argument
2771
2772 $code.=<<___;
2773 .extern __imp_RtlVirtualUnwind
2774 .type   mul_handler,\@abi-omnipotent
2775 .align  16
2776 mul_handler:
2777         push    %rsi
2778         push    %rdi
2779         push    %rbx
2780         push    %rbp
2781         push    %r12
2782         push    %r13
2783         push    %r14
2784         push    %r15
2785         pushfq
2786         sub     \$64,%rsp               # scratch frame, 32..56(%rsp) take arg5..arg8 below
2787
2788         mov     120($context),%rax      # pull context->Rax
2789         mov     248($context),%rbx      # pull context->Rip
2790
2791         mov     8($disp),%rsi           # disp->ImageBase
2792         mov     56($disp),%r11          # disp->HandlerData
2793
2794         mov     0(%r11),%r10d           # HandlerData[0]
2795         lea     (%rsi,%r10),%r10        # end of prologue label
2796         cmp     %r10,%rbx               # context->Rip<end of prologue label
2797         jb      .Lcommon_seh_tail
2798
2799         mov     152($context),%rax      # pull context->Rsp
2800
2801         mov     4(%r11),%r10d           # HandlerData[1]
2802         lea     (%rsi,%r10),%r10        # epilogue label
2803         cmp     %r10,%rbx               # context->Rip>=epilogue label
2804         jae     .Lcommon_seh_tail
2805
2806         mov     192($context),%r10      # pull $num
2807         mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
2808         lea     48(%rax),%rax           # step over the six saved registers read below
2809
2810         mov     -8(%rax),%rbx
2811         mov     -16(%rax),%rbp
2812         mov     -24(%rax),%r12
2813         mov     -32(%rax),%r13
2814         mov     -40(%rax),%r14
2815         mov     -48(%rax),%r15
2816         mov     %rbx,144($context)      # restore context->Rbx
2817         mov     %rbp,160($context)      # restore context->Rbp
2818         mov     %r12,216($context)      # restore context->R12
2819         mov     %r13,224($context)      # restore context->R13
2820         mov     %r14,232($context)      # restore context->R14
2821         mov     %r15,240($context)      # restore context->R15
2822
2823         jmp     .Lcommon_seh_tail       # shared tail is located inside sqr_handler
2824 .size   mul_handler,.-mul_handler
2825
2826 .type   sqr_handler,\@abi-omnipotent
2827 .align  16
2828 sqr_handler:
2829         push    %rsi
2830         push    %rdi
2831         push    %rbx
2832         push    %rbp
2833         push    %r12
2834         push    %r13
2835         push    %r14
2836         push    %r15
2837         pushfq
2838         sub     \$64,%rsp               # scratch frame, 32..56(%rsp) take arg5..arg8 below
2839
2840         mov     120($context),%rax      # pull context->Rax
2841         mov     248($context),%rbx      # pull context->Rip
2842
2843         mov     8($disp),%rsi           # disp->ImageBase
2844         mov     56($disp),%r11          # disp->HandlerData
2845
2846         mov     0(%r11),%r10d           # HandlerData[0]
2847         lea     (%rsi,%r10),%r10        # end of prologue label
2848         cmp     %r10,%rbx               # context->Rip<.Lsqr_body
2849         jb      .Lcommon_seh_tail
2850
2851         mov     152($context),%rax      # pull context->Rsp
2852
2853         mov     4(%r11),%r10d           # HandlerData[1]
2854         lea     (%rsi,%r10),%r10        # epilogue label
2855         cmp     %r10,%rbx               # context->Rip>=.Lsqr_epilogue
2856         jae     .Lcommon_seh_tail
2857
2858         mov     56(%rax),%rax           # pull saved stack pointer; NOTE(review): confirm mulx4x/sqrx8x keep it at 56(%rsp), the sqrx8x epilogue restores from %xmm4
2859         lea     48(%rax),%rax           # step over the six saved registers read below
2860
2861         mov     -8(%rax),%rbx
2862         mov     -16(%rax),%rbp
2863         mov     -24(%rax),%r12
2864         mov     -32(%rax),%r13
2865         mov     -40(%rax),%r14
2866         mov     -48(%rax),%r15
2867         mov     %rbx,144($context)      # restore context->Rbx
2868         mov     %rbp,160($context)      # restore context->Rbp
2869         mov     %r12,216($context)      # restore context->R12
2870         mov     %r13,224($context)      # restore context->R13
2871         mov     %r14,232($context)      # restore context->R14
2872         mov     %r15,240($context)      # restore context->R15
2873
2874 .Lcommon_seh_tail:
2875         mov     8(%rax),%rdi
2876         mov     16(%rax),%rsi
2877         mov     %rax,152($context)      # restore context->Rsp
2878         mov     %rsi,168($context)      # restore context->Rsi
2879         mov     %rdi,176($context)      # restore context->Rdi
2880
2881         mov     40($disp),%rdi          # disp->ContextRecord
2882         mov     $context,%rsi           # context
2883         mov     \$154,%ecx              # sizeof(CONTEXT) in quadwords, count for rep movsq
2884         .long   0xa548f3fc              # cld; rep movsq
2885
2886         mov     $disp,%rsi
2887         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
2888         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
2889         mov     0(%rsi),%r8             # arg3, disp->ControlPc
2890         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
2891         mov     40(%rsi),%r10           # disp->ContextRecord
2892         lea     56(%rsi),%r11           # &disp->HandlerData
2893         lea     24(%rsi),%r12           # &disp->EstablisherFrame
2894         mov     %r10,32(%rsp)           # arg5
2895         mov     %r11,40(%rsp)           # arg6
2896         mov     %r12,48(%rsp)           # arg7
2897         mov     %rcx,56(%rsp)           # arg8, (NULL)
2898         call    *__imp_RtlVirtualUnwind(%rip)
2899
2900         mov     \$1,%eax                # ExceptionContinueSearch
2901         add     \$64,%rsp
2902         popfq
2903         pop     %r15
2904         pop     %r14
2905         pop     %r13
2906         pop     %r12
2907         pop     %rbp
2908         pop     %rbx
2909         pop     %rdi
2910         pop     %rsi
2911         ret
2912 .size   sqr_handler,.-sqr_handler
2913
2914 .section        .pdata
2915 .align  4
2916         .rva    .LSEH_begin_bn_mul_mont         # one begin/end/info RVA triple per routine
2917         .rva    .LSEH_end_bn_mul_mont
2918         .rva    .LSEH_info_bn_mul_mont
2919
2920         .rva    .LSEH_begin_bn_mul4x_mont
2921         .rva    .LSEH_end_bn_mul4x_mont
2922         .rva    .LSEH_info_bn_mul4x_mont
2923
2924         .rva    .LSEH_begin_bn_sqr8x_mont
2925         .rva    .LSEH_end_bn_sqr8x_mont
2926         .rva    .LSEH_info_bn_sqr8x_mont
2927 ___
2928 $code.=<<___ if ($addx);   # table entries for the mulx4x/sqrx8x routines, only when $addx is true
2929         .rva    .LSEH_begin_bn_mulx4x_mont
2930         .rva    .LSEH_end_bn_mulx4x_mont
2931         .rva    .LSEH_info_bn_mulx4x_mont
2932
2933         .rva    .LSEH_begin_bn_sqrx8x_mont
2934         .rva    .LSEH_end_bn_sqrx8x_mont
2935         .rva    .LSEH_info_bn_sqrx8x_mont
2936 ___
2937 $code.=<<___;
2938 .section        .xdata
2939 .align  8
2940 .LSEH_info_bn_mul_mont:
2941         .byte   9,0,0,0                         # UNWIND_INFO: version 1, UNW_FLAG_EHANDLER, no unwind codes
2942         .rva    mul_handler
2943         .rva    .Lmul_body,.Lmul_epilogue       # HandlerData[]
2944 .LSEH_info_bn_mul4x_mont:
2945         .byte   9,0,0,0
2946         .rva    mul_handler
2947         .rva    .Lmul4x_body,.Lmul4x_epilogue   # HandlerData[]
2948 .LSEH_info_bn_sqr8x_mont:
2949         .byte   9,0,0,0
2950         .rva    sqr_handler
2951         .rva    .Lsqr8x_body,.Lsqr8x_epilogue   # HandlerData[]
2952 ___
2953 $code.=<<___ if ($addx);   # matching unwind info for the mulx4x/sqrx8x routines
2954 .LSEH_info_bn_mulx4x_mont:
2955         .byte   9,0,0,0
2956         .rva    sqr_handler
2957         .rva    .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
2958 .LSEH_info_bn_sqrx8x_mont:
2959         .byte   9,0,0,0
2960         .rva    sqr_handler
2961         .rva    .Lsqrx8x_body,.Lsqrx8x_epilogue # HandlerData[]
2962 ___
2963 }
2964
2965 print $code;    # emit the accumulated assembly text
2966 close STDOUT or die "error closing STDOUT: $!";   # buffered write errors (or, if STDOUT is a pipe, a failing child) only surface at close