bn/asm/x86_64-mont.pl: optimize reduction for Intel Core family.
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # October 2005.
11 #
12 # Montgomery multiplication routine for x86_64. While it gives a modest
13 # 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
14 # than twice (>2x) as fast. The most common rsa1024 sign is improved
15 # by a respectable 50%. It remains to be seen if loop unrolling and a
16 # dedicated squaring routine can provide further improvement...
17
18 # July 2011.
19 #
20 # Add dedicated squaring procedure. Performance improvement varies
21 # from platform to platform, but on average it's ~5%/15%/25%/33%
22 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
24 # August 2011.
25 #
26 # Unroll and modulo-schedule the inner loops in such a manner that
27 # they "fall through" for an input length of 8, which is critical for
28 # 1024-bit RSA *sign*. Average performance improvement in comparison
29 # to the *initial* 2005 version of this module is ~0%/30%/40%/45%
30 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
32 # June 2013.
33 #
34 # Optimize reduction in the squaring procedure and improve 1024+-bit RSA
35 # sign performance by 10-16% on Intel Sandy Bridge and later
36 # (virtually the same on non-Intel processors).
37
38 $flavour = shift;
39 $output  = shift;
40 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
41
42 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
43
44 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
45 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
46 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
47 die "can't locate x86_64-xlate.pl";
48
49 open OUT,"| \"$^X\" $xlate $flavour $output";
50 *STDOUT=*OUT;
51
52 # int bn_mul_mont(
53 $rp="%rdi";     # BN_ULONG *rp,
54 $ap="%rsi";     # const BN_ULONG *ap,
55 $bp="%rdx";     # const BN_ULONG *bp,
56 $np="%rcx";     # const BN_ULONG *np,
57 $n0="%r8";      # const BN_ULONG *n0,
58 $num="%r9";     # int num);
59 $lo0="%r10";
60 $hi0="%r11";
61 $hi1="%r13";
62 $i="%r14";
63 $j="%r15";
64 $m0="%rbx";
65 $m1="%rbp";
66
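# For reference, a minimal Perl sketch of the word-by-word Montgomery
# multiplication implemented below (illustrative only: mont_mul_ref is a
# hypothetical helper; $a, $n, $n0 are Math::BigInt values, @b holds the
# 64-bit limbs of the multiplier, and $n0 == -1/$n mod 2^64 as precomputed
# by the caller):
#
#     use Math::BigInt;
#     sub mont_mul_ref {
#         my ($a, $n, $n0, @b) = @_;
#         my $w = Math::BigInt->new(1)->blsft(64);      # 2^64
#         my $t = Math::BigInt->new(0);
#         for my $bi (@b) {                             # i-th outer iteration
#             $t->badd($a->copy->bmul($bi));            # t += a*b[i]
#             my $m = $t->copy->bmod($w)->bmul($n0)->bmod($w); # "tp[0]"*n0
#             $t->badd($n->copy->bmul($m));             # t += n*m, low word -> 0
#             $t->brsft(64);                            # exact division by 2^64
#         }
#         return $t;    # == a*b*2^(-64*num) mod n, possibly plus n
#     }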
67 $code=<<___;
68 .text
69
70 .globl  bn_mul_mont
71 .type   bn_mul_mont,\@function,6
72 .align  16
73 bn_mul_mont:
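        # Dispatch: input lengths that are not a multiple of 4, or are
        # shorter than 8 limbs, take the generic path at .Lmul_enter;
        # squaring inputs (bp==ap) whose length is divisible by 8 go to
        # .Lsqr8x_enter; everything else falls to .Lmul4x_enter.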
74         test    \$3,${num}d
75         jnz     .Lmul_enter
76         cmp     \$8,${num}d
77         jb      .Lmul_enter
78         cmp     $ap,$bp
79         jne     .Lmul4x_enter
80         test    \$7,${num}d
81         jz      .Lsqr8x_enter
82         jmp     .Lmul4x_enter
83
84 .align  16
85 .Lmul_enter:
86         push    %rbx
87         push    %rbp
88         push    %r12
89         push    %r13
90         push    %r14
91         push    %r15
92
93         mov     ${num}d,${num}d
94         lea     2($num),%r10
95         mov     %rsp,%r11
96         neg     %r10
97         lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+2))
98         and     \$-1024,%rsp            # minimize TLB usage
99
100         mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
101 .Lmul_body:
102         mov     $bp,%r12                # reassign $bp
103 ___
104                 $bp="%r12";
105 $code.=<<___;
106         mov     ($n0),$n0               # pull n0[0] value
107         mov     ($bp),$m0               # m0=bp[0]
108         mov     ($ap),%rax
109
110         xor     $i,$i                   # i=0
111         xor     $j,$j                   # j=0
112
113         mov     $n0,$m1
114         mulq    $m0                     # ap[0]*bp[0]
115         mov     %rax,$lo0
116         mov     ($np),%rax
117
118         imulq   $lo0,$m1                # "tp[0]"*n0
119         mov     %rdx,$hi0
120
121         mulq    $m1                     # np[0]*m1
122         add     %rax,$lo0               # discarded
123         mov     8($ap),%rax
124         adc     \$0,%rdx
125         mov     %rdx,$hi1
126
127         lea     1($j),$j                # j++
128         jmp     .L1st_enter
129
130 .align  16
131 .L1st:
132         add     %rax,$hi1
133         mov     ($ap,$j,8),%rax
134         adc     \$0,%rdx
135         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
136         mov     $lo0,$hi0
137         adc     \$0,%rdx
138         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
139         mov     %rdx,$hi1
140
141 .L1st_enter:
142         mulq    $m0                     # ap[j]*bp[0]
143         add     %rax,$hi0
144         mov     ($np,$j,8),%rax
145         adc     \$0,%rdx
146         lea     1($j),$j                # j++
147         mov     %rdx,$lo0
148
149         mulq    $m1                     # np[j]*m1
150         cmp     $num,$j
151         jne     .L1st
152
153         add     %rax,$hi1
154         mov     ($ap),%rax              # ap[0]
155         adc     \$0,%rdx
156         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
157         adc     \$0,%rdx
158         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
159         mov     %rdx,$hi1
160         mov     $lo0,$hi0
161
162         xor     %rdx,%rdx
163         add     $hi0,$hi1
164         adc     \$0,%rdx
165         mov     $hi1,-8(%rsp,$num,8)
166         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
167
168         lea     1($i),$i                # i++
169         jmp     .Louter
170 .align  16
171 .Louter:
172         mov     ($bp,$i,8),$m0          # m0=bp[i]
173         xor     $j,$j                   # j=0
174         mov     $n0,$m1
175         mov     (%rsp),$lo0
176         mulq    $m0                     # ap[0]*bp[i]
177         add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
178         mov     ($np),%rax
179         adc     \$0,%rdx
180
181         imulq   $lo0,$m1                # tp[0]*n0
182         mov     %rdx,$hi0
183
184         mulq    $m1                     # np[0]*m1
185         add     %rax,$lo0               # discarded
186         mov     8($ap),%rax
187         adc     \$0,%rdx
188         mov     8(%rsp),$lo0            # tp[1]
189         mov     %rdx,$hi1
190
191         lea     1($j),$j                # j++
192         jmp     .Linner_enter
193
194 .align  16
195 .Linner:
196         add     %rax,$hi1
197         mov     ($ap,$j,8),%rax
198         adc     \$0,%rdx
199         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
200         mov     (%rsp,$j,8),$lo0
201         adc     \$0,%rdx
202         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
203         mov     %rdx,$hi1
204
205 .Linner_enter:
206         mulq    $m0                     # ap[j]*bp[i]
207         add     %rax,$hi0
208         mov     ($np,$j,8),%rax
209         adc     \$0,%rdx
210         add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
211         mov     %rdx,$hi0
212         adc     \$0,$hi0
213         lea     1($j),$j                # j++
214
215         mulq    $m1                     # np[j]*m1
216         cmp     $num,$j
217         jne     .Linner
218
219         add     %rax,$hi1
220         mov     ($ap),%rax              # ap[0]
221         adc     \$0,%rdx
222         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
223         mov     (%rsp,$j,8),$lo0
224         adc     \$0,%rdx
225         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
226         mov     %rdx,$hi1
227
228         xor     %rdx,%rdx
229         add     $hi0,$hi1
230         adc     \$0,%rdx
231         add     $lo0,$hi1               # pull upmost overflow bit
232         adc     \$0,%rdx
233         mov     $hi1,-8(%rsp,$num,8)
234         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
235
236         lea     1($i),$i                # i++
237         cmp     $num,$i
238         jl      .Louter
239
240         xor     $i,$i                   # i=0 and clear CF!
241         mov     (%rsp),%rax             # tp[0]
242         lea     (%rsp),$ap              # borrow ap for tp
243         mov     $num,$j                 # j=num
244         jmp     .Lsub
245 .align  16
246 .Lsub:  sbb     ($np,$i,8),%rax
247         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
248         mov     8($ap,$i,8),%rax        # tp[i+1]
249         lea     1($i),$i                # i++
250         dec     $j                      # doesn't affect CF!
251         jnz     .Lsub
252
253         sbb     \$0,%rax                # handle upmost overflow bit
254         xor     $i,$i
255         and     %rax,$ap
256         not     %rax
257         mov     $rp,$np
258         and     %rax,$np
259         mov     $num,$j                 # j=num
260         or      $np,$ap                 # ap=borrow?tp:rp
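        # Branch-free select: %rax is now all ones if the subtraction
        # above borrowed (tp < np, so tp is the result) and zero
        # otherwise (rp already holds tp-np); the and/not/or sequence
        # picks the copy source without a data-dependent branch.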
261 .align  16
262 .Lcopy:                                 # copy or in-place refresh
263         mov     ($ap,$i,8),%rax
264         mov     $i,(%rsp,$i,8)          # zap temporary vector
265         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
266         lea     1($i),$i
267         sub     \$1,$j
268         jnz     .Lcopy
269
270         mov     8(%rsp,$num,8),%rsi     # restore %rsp
271         mov     \$1,%rax
272         mov     (%rsi),%r15
273         mov     8(%rsi),%r14
274         mov     16(%rsi),%r13
275         mov     24(%rsi),%r12
276         mov     32(%rsi),%rbp
277         mov     40(%rsi),%rbx
278         lea     48(%rsi),%rsp
279 .Lmul_epilogue:
280         ret
281 .size   bn_mul_mont,.-bn_mul_mont
282 ___
283 {{{
284 my @A=("%r10","%r11");
285 my @N=("%r13","%rdi");
286 $code.=<<___;
287 .type   bn_mul4x_mont,\@function,6
288 .align  16
289 bn_mul4x_mont:
290 .Lmul4x_enter:
291         push    %rbx
292         push    %rbp
293         push    %r12
294         push    %r13
295         push    %r14
296         push    %r15
297
298         mov     ${num}d,${num}d
299         lea     4($num),%r10
300         mov     %rsp,%r11
301         neg     %r10
302         lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
303         and     \$-1024,%rsp            # minimize TLB usage
304
305         mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
306 .Lmul4x_body:
307         mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
308         mov     %rdx,%r12               # reassign $bp
309 ___
310                 $bp="%r12";
311 $code.=<<___;
312         mov     ($n0),$n0               # pull n0[0] value
313         mov     ($bp),$m0               # m0=bp[0]
314         mov     ($ap),%rax
315
316         xor     $i,$i                   # i=0
317         xor     $j,$j                   # j=0
318
319         mov     $n0,$m1
320         mulq    $m0                     # ap[0]*bp[0]
321         mov     %rax,$A[0]
322         mov     ($np),%rax
323
324         imulq   $A[0],$m1               # "tp[0]"*n0
325         mov     %rdx,$A[1]
326
327         mulq    $m1                     # np[0]*m1
328         add     %rax,$A[0]              # discarded
329         mov     8($ap),%rax
330         adc     \$0,%rdx
331         mov     %rdx,$N[1]
332
333         mulq    $m0
334         add     %rax,$A[1]
335         mov     8($np),%rax
336         adc     \$0,%rdx
337         mov     %rdx,$A[0]
338
339         mulq    $m1
340         add     %rax,$N[1]
341         mov     16($ap),%rax
342         adc     \$0,%rdx
343         add     $A[1],$N[1]
344         lea     4($j),$j                # j+=4
345         adc     \$0,%rdx
346         mov     $N[1],(%rsp)
347         mov     %rdx,$N[0]
348         jmp     .L1st4x
349 .align  16
350 .L1st4x:
351         mulq    $m0                     # ap[j]*bp[0]
352         add     %rax,$A[0]
353         mov     -16($np,$j,8),%rax
354         adc     \$0,%rdx
355         mov     %rdx,$A[1]
356
357         mulq    $m1                     # np[j]*m1
358         add     %rax,$N[0]
359         mov     -8($ap,$j,8),%rax
360         adc     \$0,%rdx
361         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
362         adc     \$0,%rdx
363         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
364         mov     %rdx,$N[1]
365
366         mulq    $m0                     # ap[j]*bp[0]
367         add     %rax,$A[1]
368         mov     -8($np,$j,8),%rax
369         adc     \$0,%rdx
370         mov     %rdx,$A[0]
371
372         mulq    $m1                     # np[j]*m1
373         add     %rax,$N[1]
374         mov     ($ap,$j,8),%rax
375         adc     \$0,%rdx
376         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
377         adc     \$0,%rdx
378         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
379         mov     %rdx,$N[0]
380
381         mulq    $m0                     # ap[j]*bp[0]
382         add     %rax,$A[0]
383         mov     ($np,$j,8),%rax
384         adc     \$0,%rdx
385         mov     %rdx,$A[1]
386
387         mulq    $m1                     # np[j]*m1
388         add     %rax,$N[0]
389         mov     8($ap,$j,8),%rax
390         adc     \$0,%rdx
391         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
392         adc     \$0,%rdx
393         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
394         mov     %rdx,$N[1]
395
396         mulq    $m0                     # ap[j]*bp[0]
397         add     %rax,$A[1]
398         mov     8($np,$j,8),%rax
399         adc     \$0,%rdx
400         lea     4($j),$j                # j+=4
401         mov     %rdx,$A[0]
402
403         mulq    $m1                     # np[j]*m1
404         add     %rax,$N[1]
405         mov     -16($ap,$j,8),%rax
406         adc     \$0,%rdx
407         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
408         adc     \$0,%rdx
409         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
410         mov     %rdx,$N[0]
411         cmp     $num,$j
412         jl      .L1st4x
413
414         mulq    $m0                     # ap[j]*bp[0]
415         add     %rax,$A[0]
416         mov     -16($np,$j,8),%rax
417         adc     \$0,%rdx
418         mov     %rdx,$A[1]
419
420         mulq    $m1                     # np[j]*m1
421         add     %rax,$N[0]
422         mov     -8($ap,$j,8),%rax
423         adc     \$0,%rdx
424         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
425         adc     \$0,%rdx
426         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
427         mov     %rdx,$N[1]
428
429         mulq    $m0                     # ap[j]*bp[0]
430         add     %rax,$A[1]
431         mov     -8($np,$j,8),%rax
432         adc     \$0,%rdx
433         mov     %rdx,$A[0]
434
435         mulq    $m1                     # np[j]*m1
436         add     %rax,$N[1]
437         mov     ($ap),%rax              # ap[0]
438         adc     \$0,%rdx
439         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
440         adc     \$0,%rdx
441         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
442         mov     %rdx,$N[0]
443
444         xor     $N[1],$N[1]
445         add     $A[0],$N[0]
446         adc     \$0,$N[1]
447         mov     $N[0],-8(%rsp,$j,8)
448         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
449
450         lea     1($i),$i                # i++
451 .align  4
452 .Louter4x:
453         mov     ($bp,$i,8),$m0          # m0=bp[i]
454         xor     $j,$j                   # j=0
455         mov     (%rsp),$A[0]
456         mov     $n0,$m1
457         mulq    $m0                     # ap[0]*bp[i]
458         add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
459         mov     ($np),%rax
460         adc     \$0,%rdx
461
462         imulq   $A[0],$m1               # tp[0]*n0
463         mov     %rdx,$A[1]
464
465         mulq    $m1                     # np[0]*m1
466         add     %rax,$A[0]              # "$N[0]", discarded
467         mov     8($ap),%rax
468         adc     \$0,%rdx
469         mov     %rdx,$N[1]
470
471         mulq    $m0                     # ap[j]*bp[i]
472         add     %rax,$A[1]
473         mov     8($np),%rax
474         adc     \$0,%rdx
475         add     8(%rsp),$A[1]           # +tp[1]
476         adc     \$0,%rdx
477         mov     %rdx,$A[0]
478
479         mulq    $m1                     # np[j]*m1
480         add     %rax,$N[1]
481         mov     16($ap),%rax
482         adc     \$0,%rdx
483         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
484         lea     4($j),$j                # j+=4
485         adc     \$0,%rdx
486         mov     $N[1],(%rsp)            # tp[j-1]
487         mov     %rdx,$N[0]
488         jmp     .Linner4x
489 .align  16
490 .Linner4x:
491         mulq    $m0                     # ap[j]*bp[i]
492         add     %rax,$A[0]
493         mov     -16($np,$j,8),%rax
494         adc     \$0,%rdx
495         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
496         adc     \$0,%rdx
497         mov     %rdx,$A[1]
498
499         mulq    $m1                     # np[j]*m1
500         add     %rax,$N[0]
501         mov     -8($ap,$j,8),%rax
502         adc     \$0,%rdx
503         add     $A[0],$N[0]
504         adc     \$0,%rdx
505         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
506         mov     %rdx,$N[1]
507
508         mulq    $m0                     # ap[j]*bp[i]
509         add     %rax,$A[1]
510         mov     -8($np,$j,8),%rax
511         adc     \$0,%rdx
512         add     -8(%rsp,$j,8),$A[1]
513         adc     \$0,%rdx
514         mov     %rdx,$A[0]
515
516         mulq    $m1                     # np[j]*m1
517         add     %rax,$N[1]
518         mov     ($ap,$j,8),%rax
519         adc     \$0,%rdx
520         add     $A[1],$N[1]
521         adc     \$0,%rdx
522         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
523         mov     %rdx,$N[0]
524
525         mulq    $m0                     # ap[j]*bp[i]
526         add     %rax,$A[0]
527         mov     ($np,$j,8),%rax
528         adc     \$0,%rdx
529         add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
530         adc     \$0,%rdx
531         mov     %rdx,$A[1]
532
533         mulq    $m1                     # np[j]*m1
534         add     %rax,$N[0]
535         mov     8($ap,$j,8),%rax
536         adc     \$0,%rdx
537         add     $A[0],$N[0]
538         adc     \$0,%rdx
539         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
540         mov     %rdx,$N[1]
541
542         mulq    $m0                     # ap[j]*bp[i]
543         add     %rax,$A[1]
544         mov     8($np,$j,8),%rax
545         adc     \$0,%rdx
546         add     8(%rsp,$j,8),$A[1]
547         adc     \$0,%rdx
548         lea     4($j),$j                # j+=4
549         mov     %rdx,$A[0]
550
551         mulq    $m1                     # np[j]*m1
552         add     %rax,$N[1]
553         mov     -16($ap,$j,8),%rax
554         adc     \$0,%rdx
555         add     $A[1],$N[1]
556         adc     \$0,%rdx
557         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
558         mov     %rdx,$N[0]
559         cmp     $num,$j
560         jl      .Linner4x
561
562         mulq    $m0                     # ap[j]*bp[i]
563         add     %rax,$A[0]
564         mov     -16($np,$j,8),%rax
565         adc     \$0,%rdx
566         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
567         adc     \$0,%rdx
568         mov     %rdx,$A[1]
569
570         mulq    $m1                     # np[j]*m1
571         add     %rax,$N[0]
572         mov     -8($ap,$j,8),%rax
573         adc     \$0,%rdx
574         add     $A[0],$N[0]
575         adc     \$0,%rdx
576         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
577         mov     %rdx,$N[1]
578
579         mulq    $m0                     # ap[j]*bp[i]
580         add     %rax,$A[1]
581         mov     -8($np,$j,8),%rax
582         adc     \$0,%rdx
583         add     -8(%rsp,$j,8),$A[1]
584         adc     \$0,%rdx
585         lea     1($i),$i                # i++
586         mov     %rdx,$A[0]
587
588         mulq    $m1                     # np[j]*m1
589         add     %rax,$N[1]
590         mov     ($ap),%rax              # ap[0]
591         adc     \$0,%rdx
592         add     $A[1],$N[1]
593         adc     \$0,%rdx
594         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
595         mov     %rdx,$N[0]
596
597         xor     $N[1],$N[1]
598         add     $A[0],$N[0]
599         adc     \$0,$N[1]
600         add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
601         adc     \$0,$N[1]
602         mov     $N[0],-8(%rsp,$j,8)
603         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
604
605         cmp     $num,$i
606         jl      .Louter4x
607 ___
608 {
609 my @ri=("%rax","%rdx",$m0,$m1);
610 $code.=<<___;
611         mov     16(%rsp,$num,8),$rp     # restore $rp
612         mov     0(%rsp),@ri[0]          # tp[0]
613         pxor    %xmm0,%xmm0
614         mov     8(%rsp),@ri[1]          # tp[1]
615         shr     \$2,$num                # num/=4
616         lea     (%rsp),$ap              # borrow ap for tp
617         xor     $i,$i                   # i=0 and clear CF!
618
619         sub     0($np),@ri[0]
620         mov     16($ap),@ri[2]          # tp[2]
621         mov     24($ap),@ri[3]          # tp[3]
622         sbb     8($np),@ri[1]
623         lea     -1($num),$j             # j=num/4-1
624         jmp     .Lsub4x
625 .align  16
626 .Lsub4x:
627         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
628         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
629         sbb     16($np,$i,8),@ri[2]
630         mov     32($ap,$i,8),@ri[0]     # tp[i+1]
631         mov     40($ap,$i,8),@ri[1]
632         sbb     24($np,$i,8),@ri[3]
633         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
634         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
635         sbb     32($np,$i,8),@ri[0]
636         mov     48($ap,$i,8),@ri[2]
637         mov     56($ap,$i,8),@ri[3]
638         sbb     40($np,$i,8),@ri[1]
639         lea     4($i),$i                # i+=4
640         dec     $j                      # doesn't affect CF!
641         jnz     .Lsub4x
642
643         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
644         mov     32($ap,$i,8),@ri[0]     # load overflow bit
645         sbb     16($np,$i,8),@ri[2]
646         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
647         sbb     24($np,$i,8),@ri[3]
648         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
649
650         sbb     \$0,@ri[0]              # handle upmost overflow bit
651         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
652         xor     $i,$i                   # i=0
653         and     @ri[0],$ap
654         not     @ri[0]
655         mov     $rp,$np
656         and     @ri[0],$np
657         lea     -1($num),$j
658         or      $np,$ap                 # ap=borrow?tp:rp
659
660         movdqu  ($ap),%xmm1
661         movdqa  %xmm0,(%rsp)
662         movdqu  %xmm1,($rp)
663         jmp     .Lcopy4x
664 .align  16
665 .Lcopy4x:                                       # copy or in-place refresh
666         movdqu  16($ap,$i),%xmm2
667         movdqu  32($ap,$i),%xmm1
668         movdqa  %xmm0,16(%rsp,$i)
669         movdqu  %xmm2,16($rp,$i)
670         movdqa  %xmm0,32(%rsp,$i)
671         movdqu  %xmm1,32($rp,$i)
672         lea     32($i),$i
673         dec     $j
674         jnz     .Lcopy4x
675
676         shl     \$2,$num
677         movdqu  16($ap,$i),%xmm2
678         movdqa  %xmm0,16(%rsp,$i)
679         movdqu  %xmm2,16($rp,$i)
680 ___
681 }
682 $code.=<<___;
683         mov     8(%rsp,$num,8),%rsi     # restore %rsp
684         mov     \$1,%rax
685         mov     (%rsi),%r15
686         mov     8(%rsi),%r14
687         mov     16(%rsi),%r13
688         mov     24(%rsi),%r12
689         mov     32(%rsi),%rbp
690         mov     40(%rsi),%rbx
691         lea     48(%rsi),%rsp
692 .Lmul4x_epilogue:
693         ret
694 .size   bn_mul4x_mont,.-bn_mul4x_mont
695 ___
696 }}}
697 \f{{{
698 ######################################################################
699 # void bn_sqr8x_mont(
700 my $rptr="%rdi";        # const BN_ULONG *rptr,
701 my $aptr="%rsi";        # const BN_ULONG *aptr,
702 my $bptr="%rdx";        # not used
703 my $nptr="%rcx";        # const BN_ULONG *nptr,
704 my $n0  ="%r8";         # const BN_ULONG *n0,
705 my $num ="%r9";         # int num); has to be divisible by 8
706
707 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
708 my @A0=("%r10","%r11");
709 my @A1=("%r12","%r13");
710 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
711
712 $code.=<<___;
713 .type   bn_sqr8x_mont,\@function,6
714 .align  32
715 bn_sqr8x_mont:
716 .Lsqr8x_enter:
717         push    %rbx
718         push    %rbp
719         push    %r12
720         push    %r13
721         push    %r14
722         push    %r15
723
724         shl     \$3,${num}d             # convert $num to bytes
725         xor     %r10,%r10
726         mov     %rsp,%r11               # put aside %rsp
727         sub     $num,%r10               # -$num
728         mov     ($n0),$n0               # *n0
729         lea     -72(%rsp,%r10,2),%rsp   # alloca(frame+2*$num)
730         and     \$-1024,%rsp            # minimize TLB usage
731         ##############################################################
732         # Stack layout
733         #
734         # +0    saved $num, used in reduction section
735         # +8    &t[2*$num], used in reduction section
736         # +32   saved $rptr
737         # +40   saved $nptr
738         # +48   saved *n0
739         # +56   saved %rsp
740         # +64   t[2*$num]
741         #
742         mov     $rptr,32(%rsp)          # save $rptr
743         mov     $nptr,40(%rsp)
744         mov     $n0,  48(%rsp)
745         mov     %r11, 56(%rsp)          # save original %rsp
746 .Lsqr8x_body:
747         ##############################################################
748         # Squaring part:
749         #
750         # a) multiply-n-add everything but a[i]*a[i];
751         # b) shift the result of a) left by one bit and accumulate the
752         #    a[i]*a[i] products (see the worked example after the diagram);
753         #
754         ##############################################################
755         #                                                     a[1]a[0]
756         #                                                 a[2]a[0]
757         #                                             a[3]a[0]
758         #                                             a[2]a[1]
759         #                                         a[4]a[0]
760         #                                         a[3]a[1]
761         #                                     a[5]a[0]
762         #                                     a[4]a[1]
763         #                                     a[3]a[2]
764         #                                 a[6]a[0]
765         #                                 a[5]a[1]
766         #                                 a[4]a[2]
767         #                             a[7]a[0]
768         #                             a[6]a[1]
769         #                             a[5]a[2]
770         #                             a[4]a[3]
771         #                         a[7]a[1]
772         #                         a[6]a[2]
773         #                         a[5]a[3]
774         #                     a[7]a[2]
775         #                     a[6]a[3]
776         #                     a[5]a[4]
777         #                 a[7]a[3]
778         #                 a[6]a[4]
779         #             a[7]a[4]
780         #             a[6]a[5]
781         #         a[7]a[5]
782         #     a[7]a[6]
783         #                                                     a[1]a[0]
784         #                                                 a[2]a[0]
785         #                                             a[3]a[0]
786         #                                         a[4]a[0]
787         #                                     a[5]a[0]
788         #                                 a[6]a[0]
789         #                             a[7]a[0]
790         #                                             a[2]a[1]
791         #                                         a[3]a[1]
792         #                                     a[4]a[1]
793         #                                 a[5]a[1]
794         #                             a[6]a[1]
795         #                         a[7]a[1]
796         #                                     a[3]a[2]
797         #                                 a[4]a[2]
798         #                             a[5]a[2]
799         #                         a[6]a[2]
800         #                     a[7]a[2]
801         #                             a[4]a[3]
802         #                         a[5]a[3]
803         #                     a[6]a[3]
804         #                 a[7]a[3]
805         #                     a[5]a[4]
806         #                 a[6]a[4]
807         #             a[7]a[4]
808         #             a[6]a[5]
809         #         a[7]a[5]
810         #     a[7]a[6]
811         #                                                         a[0]a[0]
812         #                                                 a[1]a[1]
813         #                                         a[2]a[2]
814         #                                 a[3]a[3]
815         #                         a[4]a[4]
816         #                 a[5]a[5]
817         #         a[6]a[6]
818         # a[7]a[7]
819
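        # Worked example for $num==2 with W==2^64:
        #   (a[1]*W+a[0])^2 = 2*a[1]*a[0]*W + a[0]*a[0] + a[1]*a[1]*W^2
        # step a) computes the off-diagonal sum a[1]*a[0]*W once, the
        # left shift in step b) doubles it, and the diagonal squares
        # a[i]*a[i] are then accumulated.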
820         lea     32(%r10),$i             # $i=-($num-32)
821         lea     ($aptr,$num),$aptr      # end of a[] buffer, ($aptr,$i)=&ap[4]
822
823         mov     $num,$j                 # $j=$num
824
825                                         # comments apply to $num==8 case
826         mov     -32($aptr,$i),$a0       # a[0]
827         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
828         mov     -24($aptr,$i),%rax      # a[1]
829         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
830         mov     -16($aptr,$i),$ai       # a[2]
831         mov     %rax,$a1
832
833         mul     $a0                     # a[1]*a[0]
834         mov     %rax,$A0[0]             # a[1]*a[0]
835          mov    $ai,%rax                # a[2]
836         mov     %rdx,$A0[1]
837         mov     $A0[0],-24($tptr,$i)    # t[1]
838
839         mul     $a0                     # a[2]*a[0]
840         add     %rax,$A0[1]
841          mov    $ai,%rax
842         adc     \$0,%rdx
843         mov     $A0[1],-16($tptr,$i)    # t[2]
844         mov     %rdx,$A0[0]
845
846         lea     -16($i),$j              # j=-16
847
848
849          mov    8($aptr,$j),$ai         # a[3]
850         mul     $a1                     # a[2]*a[1]
851         mov     %rax,$A1[0]             # a[2]*a[1]+t[3]
852          mov    $ai,%rax
853         mov     %rdx,$A1[1]
854
855          lea    16($j),$j
856         mul     $a0                     # a[3]*a[0]
857         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
858          mov    $ai,%rax
859         mov     %rdx,$A0[1]
860         adc     \$0,$A0[1]
861         add     $A1[0],$A0[0]
862         adc     \$0,$A0[1]
863         mov     $A0[0],-8($tptr,$j)     # t[3]
864         jmp     .Lsqr4x_1st
865
866 .align  32
867 .Lsqr4x_1st:
868          mov    ($aptr,$j),$ai          # a[4]
869         mul     $a1                     # a[3]*a[1]
870         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
871          mov    $ai,%rax
872         mov     %rdx,$A1[0]
873         adc     \$0,$A1[0]
874
875         mul     $a0                     # a[4]*a[0]
876         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
877          mov    $ai,%rax                # a[4]
878          mov    8($aptr,$j),$ai         # a[5]
879         mov     %rdx,$A0[0]
880         adc     \$0,$A0[0]
881         add     $A1[1],$A0[1]
882         adc     \$0,$A0[0]
883
884
885         mul     $a1                     # a[4]*a[3]
886         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
887          mov    $ai,%rax
888          mov    $A0[1],($tptr,$j)       # t[4]
889         mov     %rdx,$A1[1]
890         adc     \$0,$A1[1]
891
892         mul     $a0                     # a[5]*a[2]
893         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
894          mov    $ai,%rax
895          mov    16($aptr,$j),$ai        # a[6]
896         mov     %rdx,$A0[1]
897         adc     \$0,$A0[1]
898         add     $A1[0],$A0[0]
899         adc     \$0,$A0[1]
900
901         mul     $a1                     # a[5]*a[3]
902         add     %rax,$A1[1]             # a[5]*a[3]+t[6]
903          mov    $ai,%rax
904          mov    $A0[0],8($tptr,$j)      # t[5]
905         mov     %rdx,$A1[0]
906         adc     \$0,$A1[0]
907
908         mul     $a0                     # a[6]*a[2]
909         add     %rax,$A0[1]             # a[6]*a[2]+a[5]*a[3]+t[6]
910          mov    $ai,%rax                # a[6]
911          mov    24($aptr,$j),$ai        # a[7]
912         mov     %rdx,$A0[0]
913         adc     \$0,$A0[0]
914         add     $A1[1],$A0[1]
915         adc     \$0,$A0[0]
916
917
918         mul     $a1                     # a[6]*a[5]
919         add     %rax,$A1[0]             # a[6]*a[5]+t[7]
920          mov    $ai,%rax
921          mov    $A0[1],16($tptr,$j)     # t[6]
922         mov     %rdx,$A1[1]
923         adc     \$0,$A1[1]
924
925         mul     $a0                     # a[7]*a[4]
926         add     %rax,$A0[0]             # a[7]*a[4]+a[6]*a[5]+t[6]
927          mov    $ai,%rax
928          lea    32($j),$j
929         mov     %rdx,$A0[1]
930         adc     \$0,$A0[1]
931         add     $A1[0],$A0[0]
932         adc     \$0,$A0[1]
933         mov     $A0[0],-8($tptr,$j)     # t[7]
934
935         cmp     \$0,$j
936         jne     .Lsqr4x_1st
937
938         mul     $a1                     # a[7]*a[5]
939         add     %rax,$A1[1]
940         lea     16($i),$i
941         adc     \$0,%rdx
942         add     $A0[1],$A1[1]
943         adc     \$0,%rdx
944
945         mov     $A1[1],($tptr)          # t[8]
946         mov     %rdx,$A1[0]
947         mov     %rdx,8($tptr)           # t[9]
948         jmp     .Lsqr4x_outer
949
950 .align  32
951 .Lsqr4x_outer:                          # comments apply to $num==6 case
952         mov     -32($aptr,$i),$a0       # a[0]
953         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
954         mov     -24($aptr,$i),%rax      # a[1]
955         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
956         mov     -16($aptr,$i),$ai       # a[2]
957         mov     %rax,$a1
958
959         mov     -24($tptr,$i),$A0[0]    # t[1]
960         mul     $a0                     # a[1]*a[0]
961         add     %rax,$A0[0]             # a[1]*a[0]+t[1]
962          mov    $ai,%rax                # a[2]
963         adc     \$0,%rdx
964         mov     $A0[0],-24($tptr,$i)    # t[1]
965         mov     %rdx,$A0[1]
966
967         mul     $a0                     # a[2]*a[0]
968         add     %rax,$A0[1]
969          mov    $ai,%rax
970         adc     \$0,%rdx
971         add     -16($tptr,$i),$A0[1]    # a[2]*a[0]+t[2]
972         mov     %rdx,$A0[0]
973         adc     \$0,$A0[0]
974         mov     $A0[1],-16($tptr,$i)    # t[2]
975
976         lea     -16($i),$j              # j=-16
977         xor     $A1[0],$A1[0]
978
979
980          mov    8($aptr,$j),$ai         # a[3]
981         mul     $a1                     # a[2]*a[1]
982         add     %rax,$A1[0]             # a[2]*a[1]+t[3]
983          mov    $ai,%rax
984         adc     \$0,%rdx
985         add     8($tptr,$j),$A1[0]
986         mov     %rdx,$A1[1]
987         adc     \$0,$A1[1]
988
989         mul     $a0                     # a[3]*a[0]
990         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
991          mov    $ai,%rax
992         adc     \$0,%rdx
993         add     $A1[0],$A0[0]
994         mov     %rdx,$A0[1]
995         adc     \$0,$A0[1]
996         mov     $A0[0],8($tptr,$j)      # t[3]
997
998         lea     16($j),$j
999         jmp     .Lsqr4x_inner
1000
1001 .align  32
1002 .Lsqr4x_inner:
1003          mov    ($aptr,$j),$ai          # a[4]
1004         mul     $a1                     # a[3]*a[1]
1005         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
1006          mov    $ai,%rax
1007         mov     %rdx,$A1[0]
1008         adc     \$0,$A1[0]
1009         add     ($tptr,$j),$A1[1]
1010         adc     \$0,$A1[0]
1011
1012         mul     $a0                     # a[4]*a[0]
1013         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
1014          mov    $ai,%rax                # a[4]
1015          mov    8($aptr,$j),$ai         # a[5]
1016         mov     %rdx,$A0[0]
1017         adc     \$0,$A0[0]
1018         add     $A1[1],$A0[1]
1019         adc     \$0,$A0[0]
1020
1021         mul     $a1                     # a[4]*a[3]
1022         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
1023         mov     $A0[1],($tptr,$j)       # t[4]
1024          mov    $ai,%rax
1025         mov     %rdx,$A1[1]
1026         adc     \$0,$A1[1]
1027         add     8($tptr,$j),$A1[0]
1028         lea     16($j),$j               # j++
1029         adc     \$0,$A1[1]
1030
1031         mul     $a0                     # a[5]*a[2]
1032         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
1033          mov    $ai,%rax
1034         adc     \$0,%rdx
1035         add     $A1[0],$A0[0]
1036         mov     %rdx,$A0[1]
1037         adc     \$0,$A0[1]
1038         mov     $A0[0],-8($tptr,$j)     # t[5], "preloaded t[1]" below
1039
1040         cmp     \$0,$j
1041         jne     .Lsqr4x_inner
1042
1043         mul     $a1                     # a[5]*a[3]
1044         add     %rax,$A1[1]
1045         adc     \$0,%rdx
1046         add     $A0[1],$A1[1]
1047         adc     \$0,%rdx
1048
1049         mov     $A1[1],($tptr)          # t[6], "preloaded t[2]" below
1050         mov     %rdx,$A1[0]
1051         mov     %rdx,8($tptr)           # t[7], "preloaded t[3]" below
1052
1053         add     \$16,$i
1054         jnz     .Lsqr4x_outer
1055
1056                                         # comments apply to $num==4 case
1057         mov     -32($aptr),$a0          # a[0]
1058         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
1059         mov     -24($aptr),%rax         # a[1]
1060         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
1061         mov     -16($aptr),$ai          # a[2]
1062         mov     %rax,$a1
1063
1064         mul     $a0                     # a[1]*a[0]
1065         add     %rax,$A0[0]             # a[1]*a[0]+t[1], preloaded t[1]
1066          mov    $ai,%rax                # a[2]
1067         mov     %rdx,$A0[1]
1068         adc     \$0,$A0[1]
1069
1070         mul     $a0                     # a[2]*a[0]
1071         add     %rax,$A0[1]
1072          mov    $ai,%rax
1073          mov    $A0[0],-24($tptr)       # t[1]
1074         mov     %rdx,$A0[0]
1075         adc     \$0,$A0[0]
1076         add     $A1[1],$A0[1]           # a[2]*a[0]+t[2], preloaded t[2]
1077          mov    -8($aptr),$ai           # a[3]
1078         adc     \$0,$A0[0]
1079
1080         mul     $a1                     # a[2]*a[1]
1081         add     %rax,$A1[0]             # a[2]*a[1]+t[3], preloaded t[3]
1082          mov    $ai,%rax
1083          mov    $A0[1],-16($tptr)       # t[2]
1084         mov     %rdx,$A1[1]
1085         adc     \$0,$A1[1]
1086
1087         mul     $a0                     # a[3]*a[0]
1088         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
1089          mov    $ai,%rax
1090         mov     %rdx,$A0[1]
1091         adc     \$0,$A0[1]
1092         add     $A1[0],$A0[0]
1093         adc     \$0,$A0[1]
1094         mov     $A0[0],-8($tptr)        # t[3]
1095
1096         mul     $a1                     # a[3]*a[1]
1097         add     %rax,$A1[1]
1098          mov    -16($aptr),%rax         # a[2]
1099         adc     \$0,%rdx
1100         add     $A0[1],$A1[1]
1101         adc     \$0,%rdx
1102
1103         mov     $A1[1],($tptr)          # t[4]
1104         mov     %rdx,$A1[0]
1105         mov     %rdx,8($tptr)           # t[5]
1106
1107         mul     $ai                     # a[2]*a[3]
1108 ___
1109 {
1110 my ($shift,$carry)=($a0,$a1);
1111 my @S=(@A1,$ai,$n0);
1112 $code.=<<___;
1113          add    \$16,$i
1114          xor    $shift,$shift
1115          sub    $num,$i                 # $i=16-$num
1116          xor    $carry,$carry
1117
1118         add     $A1[0],%rax             # t[5]
1119         adc     \$0,%rdx
1120         mov     %rax,8($tptr)           # t[5]
1121         mov     %rdx,16($tptr)          # t[6]
1122         mov     $carry,24($tptr)        # t[7]
1123
1124          mov    -16($aptr,$i),%rax      # a[0]
1125         lea     64(%rsp,$num,2),$tptr
1126          xor    $A0[0],$A0[0]           # t[0]
1127          mov    -24($tptr,$i,2),$A0[1]  # t[1]
1128
1129         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1130         shr     \$63,$A0[0]
1131         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1132         shr     \$63,$A0[1]
1133         or      $A0[0],$S[1]            # | t[2*i]>>63
1134          mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
1135         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1136         mul     %rax                    # a[i]*a[i]
1137         neg     $carry                  # mov $carry,cf
1138          mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1139         adc     %rax,$S[0]
1140          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1141         mov     $S[0],-32($tptr,$i,2)
1142         adc     %rdx,$S[1]
1143
1144         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1145          mov    $S[1],-24($tptr,$i,2)
1146          sbb    $carry,$carry           # mov cf,$carry
1147         shr     \$63,$A0[0]
1148         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1149         shr     \$63,$A0[1]
1150         or      $A0[0],$S[3]            # | t[2*i]>>63
1151          mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
1152         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1153         mul     %rax                    # a[i]*a[i]
1154         neg     $carry                  # mov $carry,cf
1155          mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
1156         adc     %rax,$S[2]
1157          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1158         mov     $S[2],-16($tptr,$i,2)
1159         adc     %rdx,$S[3]
1160         lea     16($i),$i
1161         mov     $S[3],-40($tptr,$i,2)
1162         sbb     $carry,$carry           # mov cf,$carry
1163         jmp     .Lsqr4x_shift_n_add
1164
1165 .align  32
1166 .Lsqr4x_shift_n_add:
1167         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1168         shr     \$63,$A0[0]
1169         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1170         shr     \$63,$A0[1]
1171         or      $A0[0],$S[1]            # | t[2*i]>>63
1172          mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
1173         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1174         mul     %rax                    # a[i]*a[i]
1175         neg     $carry                  # mov $carry,cf
1176          mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1177         adc     %rax,$S[0]
1178          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1179         mov     $S[0],-32($tptr,$i,2)
1180         adc     %rdx,$S[1]
1181
1182         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1183          mov    $S[1],-24($tptr,$i,2)
1184          sbb    $carry,$carry           # mov cf,$carry
1185         shr     \$63,$A0[0]
1186         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1187         shr     \$63,$A0[1]
1188         or      $A0[0],$S[3]            # | t[2*i]>>63
1189          mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
1190         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1191         mul     %rax                    # a[i]*a[i]
1192         neg     $carry                  # mov $carry,cf
1193          mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
1194         adc     %rax,$S[2]
1195          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1196         mov     $S[2],-16($tptr,$i,2)
1197         adc     %rdx,$S[3]
1198
1199         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1200          mov    $S[3],-8($tptr,$i,2)
1201          sbb    $carry,$carry           # mov cf,$carry
1202         shr     \$63,$A0[0]
1203         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1204         shr     \$63,$A0[1]
1205         or      $A0[0],$S[1]            # | t[2*i]>>63
1206          mov    16($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
1207         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1208         mul     %rax                    # a[i]*a[i]
1209         neg     $carry                  # mov $carry,cf
1210          mov    24($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1211         adc     %rax,$S[0]
1212          mov    8($aptr,$i),%rax        # a[i+1]        # prefetch
1213         mov     $S[0],0($tptr,$i,2)
1214         adc     %rdx,$S[1]
1215
1216         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1217          mov    $S[1],8($tptr,$i,2)
1218          sbb    $carry,$carry           # mov cf,$carry
1219         shr     \$63,$A0[0]
1220         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1221         shr     \$63,$A0[1]
1222         or      $A0[0],$S[3]            # | t[2*i]>>63
1223          mov    32($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
1224         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1225         mul     %rax                    # a[i]*a[i]
1226         neg     $carry                  # mov $carry,cf
1227          mov    40($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1228         adc     %rax,$S[2]
1229          mov    16($aptr,$i),%rax       # a[i+1]        # prefetch
1230         mov     $S[2],16($tptr,$i,2)
1231         adc     %rdx,$S[3]
1232         mov     $S[3],24($tptr,$i,2)
1233         sbb     $carry,$carry           # mov cf,$carry
1234         add     \$32,$i
1235         jnz     .Lsqr4x_shift_n_add
1236
1237         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1238         shr     \$63,$A0[0]
1239         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1240         shr     \$63,$A0[1]
1241         or      $A0[0],$S[1]            # | t[2*i]>>63
1242          mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
1243         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1244         mul     %rax                    # a[i]*a[i]
1245         neg     $carry                  # mov $carry,cf
1246          mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1247         adc     %rax,$S[0]
1248          mov    -8($aptr),%rax          # a[i+1]        # prefetch
1249         mov     $S[0],-32($tptr)
1250         adc     %rdx,$S[1]
1251
1252         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1253          mov    $S[1],-24($tptr)
1254          sbb    $carry,$carry           # mov cf,$carry
1255         shr     \$63,$A0[0]
1256         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1257         shr     \$63,$A0[1]
1258         or      $A0[0],$S[3]            # | t[2*i]>>63
1259         mul     %rax                    # a[i]*a[i]
1260         neg     $carry                  # mov $carry,cf
1261         adc     %rax,$S[2]
1262         adc     %rdx,$S[3]
1263         mov     $S[2],-16($tptr)
1264         mov     $S[3],-8($tptr)
1265 ___
1266 }\f
1267 ######################################################################
1268 # Montgomery reduction part, "word-by-word" algorithm.
1269 #
1270 # This new path is inspired by multiple submissions from Intel, by
1271 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1272 # Vinodh Gopal...
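# For orientation, a rough Perl model of this word-by-word reduction
# (illustrative only: redc_ref is a hypothetical helper; $t is the
# 2*$num-limb squaring result and $n the modulus as Math::BigInt values,
# $n0 == -1/$n mod 2^64 as elsewhere in this file):
#
#     sub redc_ref {
#         my ($t, $n, $n0, $num) = @_;
#         my $w = Math::BigInt->new(1)->blsft(64);      # 2^64
#         for (1 .. $num) {                             # retire one limb
#             my $m = $t->copy->bmod($w)->bmul($n0)->bmod($w);
#             $t->badd($n->copy->bmul($m))->brsft(64);  # t = (t+n*m)/2^64
#         }
#         return $t;    # t*2^(-64*num) mod n, possibly plus n
#     }
#
# The code below performs the same operation eight limbs at a time and
# puts the n0*a[i] multipliers aside so that the tail loops can reuse them.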
1273 {
1274 my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1275
1276 $code.=<<___;
1277         mov     40(%rsp),$nptr          # pull $nptr
1278         xor     %rax,%rax
1279         lea     ($nptr,$num),%rdx       # end of n[]
1280         lea     64(%rsp,$num,2),$tptr   # end of t[] buffer
1281         mov     %rdx,0(%rsp)
1282         mov     $tptr,8(%rsp)
1283         mov     %rax,($tptr)            # clear top-most carry bit
1284         lea     64(%rsp,$num),$tptr     # end of initial t[] window
1285         neg     $num
1286         jmp     .L8x_reduction_loop
1287
1288 .align  32
1289 .L8x_reduction_loop:
1290         lea     ($tptr,$num),$tptr      # start of current t[] window
1291         mov     8*0($tptr),$m0
1292         mov     8*1($tptr),%r9
1293         mov     8*2($tptr),%r10
1294         mov     8*3($tptr),%r11
1295         mov     8*4($tptr),%r12
1296         mov     8*5($tptr),%r13
1297         mov     8*6($tptr),%r14
1298         mov     8*7($tptr),%r15
1299         lea     8*8($tptr),$tptr
1300
1301         mov     $m0,%r8
1302         imulq   48(%rsp),$m0            # n0*a[0]
1303         mov     8*0($nptr),%rax         # n[0]
1304         mov     \$8,%ecx
1305         jmp     .L8x_reduce
1306
1307 .align  32
1308 .L8x_reduce:
1309         mulq    $m0
1310          mov    8*1($nptr),%rax         # n[1]
1311         neg     %r8
1312         mov     %rdx,%r8
1313         adc     \$0,%r8
1314
1315         mulq    $m0
1316         add     %rax,%r9
1317          mov    8*2($nptr),%rax
1318         adc     \$0,%rdx
1319         add     %r9,%r8
1320          mov    $m0,64-8(%rsp,%rcx,8)   # put aside n0*a[i]
1321         mov     %rdx,%r9
1322         adc     \$0,%r9
1323
1324         mulq    $m0
1325         add     %rax,%r10
1326          mov    8*3($nptr),%rax
1327         adc     \$0,%rdx
1328         add     %r10,%r9
1329          mov    48(%rsp),$carry         # pull n0, borrow $carry
1330         mov     %rdx,%r10
1331         adc     \$0,%r10
1332
1333         mulq    $m0
1334         add     %rax,%r11
1335          mov    8*4($nptr),%rax
1336         adc     \$0,%rdx
1337          imulq  %r8,$carry              # modulo-scheduled
1338         add     %r11,%r10
1339         mov     %rdx,%r11
1340         adc     \$0,%r11
1341
1342         mulq    $m0
1343         add     %rax,%r12
1344          mov    8*5($nptr),%rax
1345         adc     \$0,%rdx
1346         add     %r12,%r11
1347         mov     %rdx,%r12
1348         adc     \$0,%r12
1349
1350         mulq    $m0
1351         add     %rax,%r13
1352          mov    8*6($nptr),%rax
1353         adc     \$0,%rdx
1354         add     %r13,%r12
1355         mov     %rdx,%r13
1356         adc     \$0,%r13
1357
1358         mulq    $m0
1359         add     %rax,%r14
1360          mov    8*7($nptr),%rax
1361         adc     \$0,%rdx
1362         add     %r14,%r13
1363         mov     %rdx,%r14
1364         adc     \$0,%r14
1365
1366         mulq    $m0
1367          mov    $carry,$m0              # n0*a[i]
1368         add     %rax,%r15
1369          mov    8*0($nptr),%rax         # n[0]
1370         adc     \$0,%rdx
1371         add     %r15,%r14
1372         mov     %rdx,%r15
1373         adc     \$0,%r15
1374
1375         dec     %ecx
1376         jnz     .L8x_reduce
1377
1378         lea     8*8($nptr),$nptr
1379         xor     %rax,%rax
1380         mov     8(%rsp),%rdx            # pull end of t[]
1381         xor     $carry,$carry
1382         cmp     0(%rsp),$nptr           # end of n[]?
1383         jae     .L8x_no_tail
1384
1385         add     8*0($tptr),%r8
1386         adc     8*1($tptr),%r9
1387         adc     8*2($tptr),%r10
1388         adc     8*3($tptr),%r11
1389         adc     8*4($tptr),%r12
1390         adc     8*5($tptr),%r13
1391         adc     8*6($tptr),%r14
1392         adc     8*7($tptr),%r15
1393         sbb     $carry,$carry           # top carry
1394
1395         mov     64+56(%rsp),$m0         # pull n0*a[0]
1396         mov     \$8,%ecx
1397         mov     8*0($nptr),%rax
1398         jmp     .L8x_tail
1399
1400 .align  32
1401 .L8x_tail:
1402         mulq    $m0
1403         add     %rax,%r8
1404          mov    8*1($nptr),%rax
1405          mov    %r8,($tptr)             # save result
1406         mov     %rdx,%r8
1407         adc     \$0,%r8
1408
1409         mulq    $m0
1410         add     %rax,%r9
1411          mov    8*2($nptr),%rax
1412         adc     \$0,%rdx
1413         add     %r9,%r8
1414          lea    8($tptr),$tptr          # $tptr++
1415         mov     %rdx,%r9
1416         adc     \$0,%r9
1417
1418         mulq    $m0
1419         add     %rax,%r10
1420          mov    8*3($nptr),%rax
1421         adc     \$0,%rdx
1422         add     %r10,%r9
1423         mov     %rdx,%r10
1424         adc     \$0,%r10
1425
1426         mulq    $m0
1427         add     %rax,%r11
1428          mov    8*4($nptr),%rax
1429         adc     \$0,%rdx
1430         add     %r11,%r10
1431         mov     %rdx,%r11
1432         adc     \$0,%r11
1433
1434         mulq    $m0
1435         add     %rax,%r12
1436          mov    8*5($nptr),%rax
1437         adc     \$0,%rdx
1438         add     %r12,%r11
1439         mov     %rdx,%r12
1440         adc     \$0,%r12
1441
1442         mulq    $m0
1443         add     %rax,%r13
1444          mov    8*6($nptr),%rax
1445         adc     \$0,%rdx
1446         add     %r13,%r12
1447         mov     %rdx,%r13
1448         adc     \$0,%r13
1449
1450         mulq    $m0
1451         add     %rax,%r14
1452          mov    8*7($nptr),%rax
1453         adc     \$0,%rdx
1454         add     %r14,%r13
1455         mov     %rdx,%r14
1456         adc     \$0,%r14
1457
1458         mulq    $m0
1459          mov    64-16(%rsp,%rcx,8),$m0  # pull n0*a[i]
1460         add     %rax,%r15
1461         adc     \$0,%rdx
1462         add     %r15,%r14
1463          mov    8*0($nptr),%rax         # pull n[0]
1464         mov     %rdx,%r15
1465         adc     \$0,%r15
1466
1467         dec     %ecx
1468         jnz     .L8x_tail
1469
1470         lea     8*8($nptr),$nptr
1471         mov     8(%rsp),%rdx            # pull end of t[]
1472         cmp     0(%rsp),$nptr           # end of n[]?
1473         jae     .L8x_tail_done          # break out of loop
1474
1475          mov    64+56(%rsp),$m0         # pull n0*a[0]
1476         neg     $carry
1477          mov    8*0($nptr),%rax         # pull n[0]
1478         adc     8*0($tptr),%r8
1479         adc     8*1($tptr),%r9
1480         adc     8*2($tptr),%r10
1481         adc     8*3($tptr),%r11
1482         adc     8*4($tptr),%r12
1483         adc     8*5($tptr),%r13
1484         adc     8*6($tptr),%r14
1485         adc     8*7($tptr),%r15
1486         sbb     $carry,$carry           # top carry
1487
1488         mov     \$8,%ecx
1489         jmp     .L8x_tail
1490
1491 .align  32
1492 .L8x_tail_done:
1493         add     (%rdx),%r8              # can this overflow?
1494         adc     \$0,%r9
1495         adc     \$0,%r10
1496         adc     \$0,%r11
1497         adc     \$0,%r12
1498         adc     \$0,%r13
1499         adc     \$0,%r14
1500         adc     \$0,%r15
1501         sbb     %rax,%rax
1502
1503 .L8x_no_tail:
1504         neg     $carry
1505         adc     8*0($tptr),%r8
1506         adc     8*1($tptr),%r9
1507         adc     8*2($tptr),%r10
1508         adc     8*3($tptr),%r11
1509         adc     8*4($tptr),%r12
1510         adc     8*5($tptr),%r13
1511         adc     8*6($tptr),%r14
1512         adc     8*7($tptr),%r15
1513         sbb     $carry,$carry
1514         neg     %rax
1515         sub     $carry,%rax             # top-most carry
1516
1517         mov     40(%rsp),$nptr          # restore $nptr
1518
1519         mov     %r8,8*0($tptr)          # store top 512 bits
1520         mov     %r9,8*1($tptr)
1521          mov    $nptr,$num              # $num is %r9, can't be moved upwards
1522         mov     %r10,8*2($tptr)
1523          sub    0(%rsp),$num            # -$num
1524         mov     %r11,8*3($tptr)
1525         mov     %r12,8*4($tptr)
1526         mov     %r13,8*5($tptr)
1527         mov     %r14,8*6($tptr)
1528         mov     %r15,8*7($tptr)
1529         lea     8*8($tptr),$tptr
1530         mov     %rax,(%rdx)             # store top-most carry
1531
1532         cmp     %rdx,$tptr              # end of t[]?
1533         jb      .L8x_reduction_loop
1534
1535         neg     $num                    # restore $num
1536 ___
1537 }\f
1538 ##############################################################
1539 # Post-condition, 4x unrolled copy from bn_mul_mont
1540 #
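# Note that besides copying the result, the loop below zeroes ("zaps")
# the temporary vector so that intermediate values do not linger on the
# stack.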
{
my ($tptr,$nptr)=("%rbx",$aptr);
my @ri=("%rax","%rdx","%r10","%r11");
$code.=<<___;
	mov	64(%rsp,$num),@ri[0]	# tp[0]
	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
	mov	40(%rsp),$nptr		# restore $nptr
	shr	\$5,$num		# num/4
	mov	8($tptr),@ri[1]		# t[1]
	xor	$i,$i			# i=0 and clear CF!

	mov	32(%rsp),$rptr		# restore $rptr
	sub	0($nptr),@ri[0]
	mov	16($tptr),@ri[2]	# t[2]
	mov	24($tptr),@ri[3]	# t[3]
	sbb	8($nptr),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsqr4x_sub
.align	32
.Lsqr4x_sub:
	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($nptr,$i,8),@ri[2]
	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
	mov	40($tptr,$i,8),@ri[1]
	sbb	24($nptr,$i,8),@ri[3]
	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($nptr,$i,8),@ri[0]
	mov	48($tptr,$i,8),@ri[2]
	mov	56($tptr,$i,8),@ri[3]
	sbb	40($nptr,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsqr4x_sub

	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
	sbb	16($nptr,$i,8),@ri[2]
	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($nptr,$i,8),@ri[3]
	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$tptr
	not	@ri[0]
	mov	$rptr,$nptr
	and	@ri[0],$nptr
	lea	-1($num),$j
	or	$nptr,$tptr		# tp=borrow?tp:rp
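# Branchless select: the sbb above leaves a 0/-1 mask, and
# (tp & mask) | (rp & ~mask) picks tp when the subtraction borrowed
# (t < n, so the copy in rp is invalid) and rp otherwise, avoiding a
# conditional branch on the result.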

	pxor	%xmm0,%xmm0
	lea	64(%rsp,$num,8),$nptr
	movdqu	($tptr),%xmm1
	lea	($nptr,$num,8),$nptr
	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
	movdqu	%xmm1,($rptr)
	jmp	.Lsqr4x_copy
.align	32
.Lsqr4x_copy:				# copy or in-place refresh
	movdqu	16($tptr,$i),%xmm2
	movdqu	32($tptr,$i),%xmm1
	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
	movdqu	%xmm2,16($rptr,$i)
	movdqu	%xmm1,32($rptr,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lsqr4x_copy

	movdqu	16($tptr,$i),%xmm2
	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
	movdqu	%xmm2,16($rptr,$i)
___
}
$code.=<<___;
	mov	56(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lsqr8x_epilogue:
	ret
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
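# When the faulting Rip lies between a routine's prologue end and its
# epilogue, the handlers below recover the caller's stack pointer from
# the frame, reload the six non-volatile GPRs saved there into CONTEXT,
# and then let RtlVirtualUnwind continue the unwind.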
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lsqr8x_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lsqr8x_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lsqr8x_epilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lsqr8x_epilogue
	jae	.Lcommon_seh_tail

	mov	56(%rax),%rax		# pull saved stack pointer
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler
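# SEH metadata: each .pdata triple maps a routine's code range to its
# unwind information below, and each .xdata entry names the handler.
# mul_handler reads its prologue/epilogue labels from HandlerData[];
# sqr_handler hardcodes them, so its entry carries no extra data.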

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
___
}

print $code;
close STDOUT;