x86_64-mont.pl: further optimization resulting in up to 48% improvement
[openssl.git] / crypto / bn / asm / x86_64-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # October 2005.
11 #
12 # Montgomery multiplication routine for x86_64. While it gives a
13 # modest 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs
14 # more than twice as fast. The most common rsa1024 sign is improved
15 # by a respectable 50%. It remains to be seen if loop unrolling and
16 # a dedicated squaring routine can provide further improvement...
17
18 # July 2011.
19 #
20 # Add dedicated squaring procedure. Performance improvement varies
21 # from platform to platform, but on average it's ~5%/15%/25%/33%
22 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
24 # August 2011.
25 #
26 # Unroll and modulo-schedule the inner loops in such a manner that
27 # they "fall through" for the input length of 8, which is critical
28 # for 1024-bit RSA *sign*. Average performance improvement relative
29 # to the *initial* 2005 version of this module is ~0%/30%/40%/45%
30 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
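# For orientation, below is a minimal C sketch of the word-by-word
# Montgomery multiplication that all three code paths in this module
# implement. It is not part of the module itself and assumes C99 VLAs,
# gcc/clang's unsigned __int128, a 64-bit BN_ULONG, and that n0 is the
# Montgomery constant -1/np[0] mod 2^64 (as supplied via BN_MONT_CTX):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	typedef uint64_t BN_ULONG;
#	typedef unsigned __int128 u128;
#
#	/* rp[] = ap[]*bp[]*2^(-64*num) mod np[], num limbs each */
#	static void mont_mul_ref(BN_ULONG *rp, const BN_ULONG *ap,
#				 const BN_ULONG *bp, const BN_ULONG *np,
#				 BN_ULONG n0, int num)
#	{
#		BN_ULONG tp[num + 1];		/* t[], one extra limb */
#		memset(tp, 0, sizeof(tp));
#
#		for (int i = 0; i < num; i++) {
#			/* m1 zeroes the limb that is shifted out below */
#			BN_ULONG m1 = (tp[0] + ap[0] * bp[i]) * n0;
#			u128 cA = 0, cN = 0;	/* the two carry chains */
#			for (int j = 0; j < num; j++) {
#				cA += (u128)ap[j] * bp[i] + tp[j];
#				cN += (u128)np[j] * m1 + (BN_ULONG)cA;
#				if (j) tp[j - 1] = (BN_ULONG)cN;
#				cA >>= 64;	/* j==0 limb is the "discarded" zero */
#				cN >>= 64;
#			}
#			cA += cN + tp[num];
#			tp[num - 1] = (BN_ULONG)cA;
#			tp[num] = (BN_ULONG)(cA >> 64);	/* upmost overflow bit */
#		}
#
#		/* conditional final subtraction, done branch-free in the asm */
#		BN_ULONG sub[num], borrow = 0;
#		for (int j = 0; j < num; j++) {
#			u128 d = (u128)tp[j] - np[j] - borrow;
#			sub[j] = (BN_ULONG)d;
#			borrow = (BN_ULONG)(d >> 64) & 1;
#		}
#		memcpy(rp, (tp[num] || !borrow) ? sub : tp, num * 8);
#	}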
32 $flavour = shift;
33 $output  = shift;
34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
37
38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41 die "can't locate x86_64-xlate.pl";
42
43 open STDOUT,"| $^X $xlate $flavour $output";
44
45 # int bn_mul_mont(
46 $rp="%rdi";     # BN_ULONG *rp,
47 $ap="%rsi";     # const BN_ULONG *ap,
48 $bp="%rdx";     # const BN_ULONG *bp,
49 $np="%rcx";     # const BN_ULONG *np,
50 $n0="%r8";      # const BN_ULONG *n0,
51 $num="%r9";     # int num);
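#
# The six arguments arrive in the System V AMD64 argument registers
# rdi/rsi/rdx/rcx/r8/r9 as assigned above; spelled out, the prototype
# being implemented is (n0 pointing at -1/np[0] mod 2^64, see the
# reference sketch at the top of the file):
#
#	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
#			const BN_ULONG *bp, const BN_ULONG *np,
#			const BN_ULONG *n0, int num);
#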
52 $lo0="%r10";
53 $hi0="%r11";
54 $hi1="%r13";
55 $i="%r14";
56 $j="%r15";
57 $m0="%rbx";
58 $m1="%rbp";
59
60 $code=<<___;
61 .text
62
63 .globl  bn_mul_mont
64 .type   bn_mul_mont,\@function,6
65 .align  16
66 bn_mul_mont:
67         test    \$3,${num}d
68         jnz     .Lmul_enter
69         cmp     \$8,${num}d
70         jb      .Lmul_enter
71         cmp     $ap,$bp
72         jne     .Lmul4x_enter
73         jmp     .Lsqr4x_enter
74
75 .align  16
76 .Lmul_enter:
77         push    %rbx
78         push    %rbp
79         push    %r12
80         push    %r13
81         push    %r14
82         push    %r15
83
84         mov     ${num}d,${num}d
85         lea     2($num),%r10
86         mov     %rsp,%r11
87         neg     %r10
88         lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+2))
89         and     \$-1024,%rsp            # minimize TLB usage
90
91         mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
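#
# In loose C terms the prologue above amounts to the following sketch;
# the 1KB alignment keeps the whole vector within as few pages as
# possible, and the saved stack pointer doubles as tp[num+1]:
#
#	tp = alloca(8 * (num + 2));
#	tp = (BN_ULONG *)((uintptr_t)tp & -1024);
#	tp[num + 1] = saved_rsp;	/* restored in the epilogue */
#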
92 .Lmul_body:
93         mov     $bp,%r12                # reassign $bp
94 ___
95                 $bp="%r12";
96 $code.=<<___;
97         mov     ($n0),$n0               # pull n0[0] value
98         mov     ($bp),$m0               # m0=bp[0]
99         mov     ($ap),%rax
100
101         xor     $i,$i                   # i=0
102         xor     $j,$j                   # j=0
103
104         mov     $n0,$m1
105         mulq    $m0                     # ap[0]*bp[0]
106         mov     %rax,$lo0
107         mov     ($np),%rax
108
109         imulq   $lo0,$m1                # "tp[0]"*n0
110         mov     %rdx,$hi0
111
112         mulq    $m1                     # np[0]*m1
113         add     %rax,$lo0               # discarded
114         mov     8($ap),%rax
115         adc     \$0,%rdx
116         mov     %rdx,$hi1
117
118         lea     1($j),$j                # j++
119         jmp     .L1st_enter
120
121 .align  16
122 .L1st:
123         add     %rax,$hi1
124         mov     ($ap,$j,8),%rax
125         adc     \$0,%rdx
126         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
127         mov     $lo0,$hi0
128         adc     \$0,%rdx
129         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
130         mov     %rdx,$hi1
131
132 .L1st_enter:
133         mulq    $m0                     # ap[j]*bp[0]
134         add     %rax,$hi0
135         mov     ($np,$j,8),%rax
136         adc     \$0,%rdx
137         lea     1($j),$j                # j++
138         mov     %rdx,$lo0
139
140         mulq    $m1                     # np[j]*m1
141         cmp     $num,$j
142         jne     .L1st
143
144         add     %rax,$hi1
145         adc     \$0,%rdx
146         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
147         adc     \$0,%rdx
148         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
149         mov     %rdx,$hi1
150         mov     $lo0,$hi0
151
152         xor     %rdx,%rdx
153         add     $hi0,$hi1
154         adc     \$0,%rdx
155         mov     $hi1,-8(%rsp,$num,8)
156         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
157
158         lea     1($i),$i                # i++
159         jmp     .Louter
160 .align  16
161 .Louter:
162         mov     ($bp,$i,8),$m0          # m0=bp[i]
163         xor     $j,$j                   # j=0
164         mov     ($ap),%rax              # ap[0]
165         mov     $n0,$m1
166         mov     (%rsp),$lo0
167         mulq    $m0                     # ap[0]*bp[i]
168         add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
169         mov     ($np),%rax
170         adc     \$0,%rdx
171
172         imulq   $lo0,$m1                # tp[0]*n0
173         mov     %rdx,$hi0
174
175         mulq    $m1                     # np[0]*m1
176         add     %rax,$lo0               # discarded
177         mov     8($ap),%rax
178         adc     \$0,%rdx
179         mov     8(%rsp),$lo0            # tp[1]
180         mov     %rdx,$hi1
181
182         lea     1($j),$j                # j++
183         jmp     .Linner_enter
184
185 .align  16
186 .Linner:
187         add     %rax,$hi1
188         mov     ($ap,$j,8),%rax
189         adc     \$0,%rdx
190         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
191         mov     (%rsp,$j,8),$lo0
192         adc     \$0,%rdx
193         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
194         mov     %rdx,$hi1
195
196 .Linner_enter:
197         mulq    $m0                     # ap[j]*bp[i]
198         add     %rax,$hi0
199         mov     ($np,$j,8),%rax
200         adc     \$0,%rdx
201         add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
202         mov     %rdx,$hi0
203         adc     \$0,$hi0
204         lea     1($j),$j                # j++
205
206         mulq    $m1                     # np[j]*m1
207         cmp     $num,$j
208         jne     .Linner
209
210         add     %rax,$hi1
211         adc     \$0,%rdx
212         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
213         mov     (%rsp,$j,8),$lo0
214         adc     \$0,%rdx
215         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
216         mov     %rdx,$hi1
217
218         xor     %rdx,%rdx
219         add     $hi0,$hi1
220         adc     \$0,%rdx
221         add     $lo0,$hi1               # pull upmost overflow bit
222         adc     \$0,%rdx
223         mov     $hi1,-8(%rsp,$num,8)
224         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
225
226         lea     1($i),$i                # i++
227         cmp     $num,$i
228         jl      .Louter
229
230         xor     $i,$i                   # i=0 and clear CF!
231         mov     (%rsp),%rax             # tp[0]
232         lea     (%rsp),$ap              # borrow ap for tp
233         mov     $num,$j                 # j=num
234         jmp     .Lsub
235 .align  16
236 .Lsub:  sbb     ($np,$i,8),%rax
237         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
238         mov     8($ap,$i,8),%rax        # tp[i+1]
239         lea     1($i),$i                # i++
240         dec     $j                      # doesn't affect CF!
241         jnz     .Lsub
242
243         sbb     \$0,%rax                # handle upmost overflow bit
244         xor     $i,$i
245         and     %rax,$ap
246         not     %rax
247         mov     $rp,$np
248         and     %rax,$np
249         mov     $num,$j                 # j=num
250         or      $np,$ap                 # ap=borrow?tp:rp
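#
# The and/not/and/or sequence above is a branch-free select; in C
# terms (sketch only, mask/src/borrow are illustrative names; the
# subtracted copy is already in rp, the unreduced value still in tp):
#
#	mask = 0 - (uintptr_t)borrow;	/* all ones iff t < n   */
#	src  = (const BN_ULONG *)(((uintptr_t)tp &  mask) |
#				  ((uintptr_t)rp & ~mask));
#
# .Lcopy below then both writes the selected value to rp and
# overwrites the temporary vector, so no intermediate data stays
# behind on the stack.
#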
251 .align  16
252 .Lcopy:                                 # copy or in-place refresh
253         mov     ($ap,$i,8),%rax
254         mov     $i,(%rsp,$i,8)          # zap temporary vector
255         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
256         lea     1($i),$i
257         sub     \$1,$j
258         jnz     .Lcopy
259
260         mov     8(%rsp,$num,8),%rsi     # restore %rsp
261         mov     \$1,%rax
262         mov     (%rsi),%r15
263         mov     8(%rsi),%r14
264         mov     16(%rsi),%r13
265         mov     24(%rsi),%r12
266         mov     32(%rsi),%rbp
267         mov     40(%rsi),%rbx
268         lea     48(%rsi),%rsp
269 .Lmul_epilogue:
270         ret
271 .size   bn_mul_mont,.-bn_mul_mont
272 ___
273 {{{
274 my @A=("%r10","%r11");
275 my @N=("%r13","%rdi");
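# The 4x code keeps two accumulator pairs, @A for the ap[]*bp[i]
# products and @N for the np[]*m1 products, and alternates between the
# even and odd elements so that each mulq result is consumed one limb
# later, hiding multiplier latency. One turn of the unrolled inner
# loop corresponds to this C sketch (same conventions as the reference
# routine at the top of the file):
#
#	for (int j = 0; j < num; j += 4)	/* .Linner4x */
#		for (int k = j; k < j + 4; k++) {
#			cA += (u128)ap[k] * m0 + tp[k];
#			cN += (u128)np[k] * m1 + (BN_ULONG)cA;
#			if (k) tp[k - 1] = (BN_ULONG)cN;
#			cA >>= 64;
#			cN >>= 64;
#		}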
276 $code.=<<___;
277 .type   bn_mul4x_mont,\@function,6
278 .align  16
279 bn_mul4x_mont:
280 .Lmul4x_enter:
281         push    %rbx
282         push    %rbp
283         push    %r12
284         push    %r13
285         push    %r14
286         push    %r15
287
288         mov     ${num}d,${num}d
289         lea     4($num),%r10
290         mov     %rsp,%r11
291         neg     %r10
292         lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
293         and     \$-1024,%rsp            # minimize TLB usage
294
295         mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
296 .Lmul4x_body:
297         mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
298         mov     %rdx,%r12               # reassign $bp
299 ___
300                 $bp="%r12";
301 $code.=<<___;
302         mov     ($n0),$n0               # pull n0[0] value
303         mov     ($bp),$m0               # m0=bp[0]
304         mov     ($ap),%rax
305
306         xor     $i,$i                   # i=0
307         xor     $j,$j                   # j=0
308
309         mov     $n0,$m1
310         mulq    $m0                     # ap[0]*bp[0]
311         mov     %rax,$A[0]
312         mov     ($np),%rax
313
314         imulq   $A[0],$m1               # "tp[0]"*n0
315         mov     %rdx,$A[1]
316
317         mulq    $m1                     # np[0]*m1
318         add     %rax,$A[0]              # discarded
319         mov     8($ap),%rax
320         adc     \$0,%rdx
321         mov     %rdx,$N[1]
322
323         mulq    $m0
324         add     %rax,$A[1]
325         mov     8($np),%rax
326         adc     \$0,%rdx
327         mov     %rdx,$A[0]
328
329         mulq    $m1
330         add     %rax,$N[1]
331         mov     16($ap),%rax
332         adc     \$0,%rdx
333         add     $A[1],$N[1]
334         lea     4($j),$j                # j+=4
335         adc     \$0,%rdx
336         mov     $N[1],(%rsp)
337         mov     %rdx,$N[0]
338         jmp     .L1st4x
339 .align  16
340 .L1st4x:
341         mulq    $m0                     # ap[j]*bp[0]
342         add     %rax,$A[0]
343         mov     -16($np,$j,8),%rax
344         adc     \$0,%rdx
345         mov     %rdx,$A[1]
346
347         mulq    $m1                     # np[j]*m1
348         add     %rax,$N[0]
349         mov     -8($ap,$j,8),%rax
350         adc     \$0,%rdx
351         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
352         adc     \$0,%rdx
353         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
354         mov     %rdx,$N[1]
355
356         mulq    $m0                     # ap[j]*bp[0]
357         add     %rax,$A[1]
358         mov     -8($np,$j,8),%rax
359         adc     \$0,%rdx
360         mov     %rdx,$A[0]
361
362         mulq    $m1                     # np[j]*m1
363         add     %rax,$N[1]
364         mov     ($ap,$j,8),%rax
365         adc     \$0,%rdx
366         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
367         adc     \$0,%rdx
368         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
369         mov     %rdx,$N[0]
370
371         mulq    $m0                     # ap[j]*bp[0]
372         add     %rax,$A[0]
373         mov     ($np,$j,8),%rax
374         adc     \$0,%rdx
375         mov     %rdx,$A[1]
376
377         mulq    $m1                     # np[j]*m1
378         add     %rax,$N[0]
379         mov     8($ap,$j,8),%rax
380         adc     \$0,%rdx
381         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
382         adc     \$0,%rdx
383         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
384         mov     %rdx,$N[1]
385
386         mulq    $m0                     # ap[j]*bp[0]
387         add     %rax,$A[1]
388         mov     8($np,$j,8),%rax
389         adc     \$0,%rdx
390         lea     4($j),$j                # j+=4
391         mov     %rdx,$A[0]
392
393         mulq    $m1                     # np[j]*m1
394         add     %rax,$N[1]
395         mov     -16($ap,$j,8),%rax
396         adc     \$0,%rdx
397         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
398         adc     \$0,%rdx
399         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
400         mov     %rdx,$N[0]
401         cmp     $num,$j
402         jl      .L1st4x
403
404         mulq    $m0                     # ap[j]*bp[0]
405         add     %rax,$A[0]
406         mov     -16($np,$j,8),%rax
407         adc     \$0,%rdx
408         mov     %rdx,$A[1]
409
410         mulq    $m1                     # np[j]*m1
411         add     %rax,$N[0]
412         mov     -8($ap,$j,8),%rax
413         adc     \$0,%rdx
414         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
415         adc     \$0,%rdx
416         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
417         mov     %rdx,$N[1]
418
419         mulq    $m0                     # ap[j]*bp[0]
420         add     %rax,$A[1]
421         mov     -8($np,$j,8),%rax
422         adc     \$0,%rdx
423         mov     %rdx,$A[0]
424
425         mulq    $m1                     # np[j]*m1
426         add     %rax,$N[1]
427         mov     ($ap),%rax              # ap[0]
428         adc     \$0,%rdx
429         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
430         adc     \$0,%rdx
431         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
432         mov     %rdx,$N[0]
433
434         xor     $N[1],$N[1]
435         add     $A[0],$N[0]
436         adc     \$0,$N[1]
437         mov     $N[0],-8(%rsp,$j,8)
438         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
439
440         lea     1($i),$i                # i++
441 .align  4
442 .Louter4x:
443         mov     ($bp,$i,8),$m0          # m0=bp[i]
444         xor     $j,$j                   # j=0
445         mov     (%rsp),$A[0]
446         mov     $n0,$m1
447         mulq    $m0                     # ap[0]*bp[i]
448         add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
449         mov     ($np),%rax
450         adc     \$0,%rdx
451
452         imulq   $A[0],$m1               # tp[0]*n0
453         mov     %rdx,$A[1]
454
455         mulq    $m1                     # np[0]*m1
456         add     %rax,$A[0]              # "$N[0]", discarded
457         mov     8($ap),%rax
458         adc     \$0,%rdx
459         mov     %rdx,$N[1]
460
461         mulq    $m0                     # ap[j]*bp[i]
462         add     %rax,$A[1]
463         mov     8($np),%rax
464         adc     \$0,%rdx
465         add     8(%rsp),$A[1]           # +tp[1]
466         adc     \$0,%rdx
467         mov     %rdx,$A[0]
468
469         mulq    $m1                     # np[j]*m1
470         add     %rax,$N[1]
471         mov     16($ap),%rax
472         adc     \$0,%rdx
473         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
474         lea     4($j),$j                # j+=4
475         adc     \$0,%rdx
476         mov     $N[1],(%rsp)            # tp[j-1]
477         mov     %rdx,$N[0]
478         jmp     .Linner4x
479 .align  16
480 .Linner4x:
481         mulq    $m0                     # ap[j]*bp[i]
482         add     %rax,$A[0]
483         mov     -16($np,$j,8),%rax
484         adc     \$0,%rdx
485         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
486         adc     \$0,%rdx
487         mov     %rdx,$A[1]
488
489         mulq    $m1                     # np[j]*m1
490         add     %rax,$N[0]
491         mov     -8($ap,$j,8),%rax
492         adc     \$0,%rdx
493         add     $A[0],$N[0]
494         adc     \$0,%rdx
495         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
496         mov     %rdx,$N[1]
497
498         mulq    $m0                     # ap[j]*bp[i]
499         add     %rax,$A[1]
500         mov     -8($np,$j,8),%rax
501         adc     \$0,%rdx
502         add     -8(%rsp,$j,8),$A[1]
503         adc     \$0,%rdx
504         mov     %rdx,$A[0]
505
506         mulq    $m1                     # np[j]*m1
507         add     %rax,$N[1]
508         mov     ($ap,$j,8),%rax
509         adc     \$0,%rdx
510         add     $A[1],$N[1]
511         adc     \$0,%rdx
512         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
513         mov     %rdx,$N[0]
514
515         mulq    $m0                     # ap[j]*bp[i]
516         add     %rax,$A[0]
517         mov     ($np,$j,8),%rax
518         adc     \$0,%rdx
519         add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
520         adc     \$0,%rdx
521         mov     %rdx,$A[1]
522
523         mulq    $m1                     # np[j]*m1
524         add     %rax,$N[0]
525         mov     8($ap,$j,8),%rax
526         adc     \$0,%rdx
527         add     $A[0],$N[0]
528         adc     \$0,%rdx
529         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
530         mov     %rdx,$N[1]
531
532         mulq    $m0                     # ap[j]*bp[i]
533         add     %rax,$A[1]
534         mov     8($np,$j,8),%rax
535         adc     \$0,%rdx
536         add     8(%rsp,$j,8),$A[1]
537         adc     \$0,%rdx
538         lea     4($j),$j                # j+=4
539         mov     %rdx,$A[0]
540
541         mulq    $m1                     # np[j]*m1
542         add     %rax,$N[1]
543         mov     -16($ap,$j,8),%rax
544         adc     \$0,%rdx
545         add     $A[1],$N[1]
546         adc     \$0,%rdx
547         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
548         mov     %rdx,$N[0]
549         cmp     $num,$j
550         jl      .Linner4x
551
552         mulq    $m0                     # ap[j]*bp[i]
553         add     %rax,$A[0]
554         mov     -16($np,$j,8),%rax
555         adc     \$0,%rdx
556         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
557         adc     \$0,%rdx
558         mov     %rdx,$A[1]
559
560         mulq    $m1                     # np[j]*m1
561         add     %rax,$N[0]
562         mov     -8($ap,$j,8),%rax
563         adc     \$0,%rdx
564         add     $A[0],$N[0]
565         adc     \$0,%rdx
566         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
567         mov     %rdx,$N[1]
568
569         mulq    $m0                     # ap[j]*bp[i]
570         add     %rax,$A[1]
571         mov     -8($np,$j,8),%rax
572         adc     \$0,%rdx
573         add     -8(%rsp,$j,8),$A[1]
574         adc     \$0,%rdx
575         lea     1($i),$i                # i++
576         mov     %rdx,$A[0]
577
578         mulq    $m1                     # np[j]*m1
579         add     %rax,$N[1]
580         mov     ($ap),%rax              # ap[0]
581         adc     \$0,%rdx
582         add     $A[1],$N[1]
583         adc     \$0,%rdx
584         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
585         mov     %rdx,$N[0]
586
587         xor     $N[1],$N[1]
588         add     $A[0],$N[0]
589         adc     \$0,$N[1]
590         add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
591         adc     \$0,$N[1]
592         mov     $N[0],-8(%rsp,$j,8)
593         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
594
595         cmp     $num,$i
596         jl      .Louter4x
597 ___
598 {
599 my @ri=("%rax","%rdx",$m0,$m1);
600 $code.=<<___;
601         mov     16(%rsp,$num,8),$rp     # restore $rp
602         mov     0(%rsp),@ri[0]          # tp[0]
603         pxor    %xmm0,%xmm0
604         mov     8(%rsp),@ri[1]          # tp[1]
605         shr     \$2,$num                # num/=4
606         lea     (%rsp),$ap              # borrow ap for tp
607         xor     $i,$i                   # i=0 and clear CF!
608
609         sub     0($np),@ri[0]
610         mov     16($ap),@ri[2]          # tp[2]
611         mov     24($ap),@ri[3]          # tp[3]
612         sbb     8($np),@ri[1]
613         lea     -1($num),$j             # j=num/4-1
614         jmp     .Lsub4x
615 .align  16
616 .Lsub4x:
617         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
618         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
619         sbb     16($np,$i,8),@ri[2]
620         mov     32($ap,$i,8),@ri[0]     # tp[i+1]
621         mov     40($ap,$i,8),@ri[1]
622         sbb     24($np,$i,8),@ri[3]
623         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
624         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
625         sbb     32($np,$i,8),@ri[0]
626         mov     48($ap,$i,8),@ri[2]
627         mov     56($ap,$i,8),@ri[3]
628         sbb     40($np,$i,8),@ri[1]
629         lea     4($i),$i                # i+=4
630         dec     $j                      # doesn't affect CF!
631         jnz     .Lsub4x
632
633         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
634         mov     32($ap,$i,8),@ri[0]     # load overflow bit
635         sbb     16($np,$i,8),@ri[2]
636         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
637         sbb     24($np,$i,8),@ri[3]
638         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
639
640         sbb     \$0,@ri[0]              # handle upmost overflow bit
641         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
642         xor     $i,$i                   # i=0
643         and     @ri[0],$ap
644         not     @ri[0]
645         mov     $rp,$np
646         and     @ri[0],$np
647         lea     -1($num),$j
648         or      $np,$ap                 # ap=borrow?tp:rp
649
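#       %rsp was aligned to 1024 bytes above, so aligned movdqa stores
#       are safe for zapping the temporary vector, while rp carries no
#       alignment guarantee and is written with unaligned movdqu.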
650         movdqu  ($ap),%xmm1
651         movdqa  %xmm0,(%rsp)
652         movdqu  %xmm1,($rp)
653         jmp     .Lcopy4x
654 .align  16
655 .Lcopy4x:                                       # copy or in-place refresh
656         movdqu  16($ap,$i),%xmm2
657         movdqu  32($ap,$i),%xmm1
658         movdqa  %xmm0,16(%rsp,$i)
659         movdqu  %xmm2,16($rp,$i)
660         movdqa  %xmm0,32(%rsp,$i)
661         movdqu  %xmm1,32($rp,$i)
662         lea     32($i),$i
663         dec     $j
664         jnz     .Lcopy4x
665
666         shl     \$2,$num
667         movdqu  16($ap,$i),%xmm2
668         movdqa  %xmm0,16(%rsp,$i)
669         movdqu  %xmm2,16($rp,$i)
670 ___
671 }
672 $code.=<<___;
673         mov     8(%rsp,$num,8),%rsi     # restore %rsp
674         mov     \$1,%rax
675         mov     (%rsi),%r15
676         mov     8(%rsi),%r14
677         mov     16(%rsi),%r13
678         mov     24(%rsi),%r12
679         mov     32(%rsi),%rbp
680         mov     40(%rsi),%rbx
681         lea     48(%rsi),%rsp
682 .Lmul4x_epilogue:
683         ret
684 .size   bn_mul4x_mont,.-bn_mul4x_mont
685 ___
686 }}}
687 \f{{{
688 ######################################################################
689 # void bn_sqr4x_mont(
690 my $rptr="%rdi";        # const BN_ULONG *rptr,
691 my $aptr="%rsi";        # const BN_ULONG *aptr,
692 my $bptr="%rdx";        # not used
693 my $nptr="%rcx";        # const BN_ULONG *nptr,
694 my $n0  ="%r8";         # const BN_ULONG *n0);
695 my $num ="%r9";         # int num, has to be divisible by 4 and
696                         # not less than 8
697
698 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
699 my @A0=("%r10","%r11");
700 my @A1=("%r12","%r13");
701 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
702
703 $code.=<<___;
704 .type   bn_sqr4x_mont,\@function,6
705 .align  16
706 bn_sqr4x_mont:
707 .Lsqr4x_enter:
708         push    %rbx
709         push    %rbp
710         push    %r12
711         push    %r13
712         push    %r14
713         push    %r15
714
715         shl     \$3,${num}d             # convert $num to bytes
716         xor     %r10,%r10
717         mov     %rsp,%r11               # put aside %rsp
718         sub     $num,%r10               # -$num
719         mov     ($n0),$n0               # *n0
720         lea     -72(%rsp,%r10,2),%rsp   # alloca(frame+2*$num)
721         and     \$-1024,%rsp            # minimize TLB usage
722         ##############################################################
723         # Stack layout
724         #
725         # +0    saved $num, used in reduction section
726         # +8    &t[2*$num], used in reduction section
727         # +32   saved $rptr
728         # +40   saved $nptr
729         # +48   saved *n0
730         # +56   saved %rsp
731         # +64   t[2*$num]
732         #
733         mov     $rptr,32(%rsp)          # save $rptr
734         mov     $nptr,40(%rsp)
735         mov     $n0,  48(%rsp)
736         mov     %r11, 56(%rsp)          # save original %rsp
737 .Lsqr4x_body:
738         ##############################################################
739         # Squaring part:
740         #
741         # a) multiply-n-add everything but a[i]*a[i];
742         # b) shift result of a) by 1 to the left and accumulate
743         #    a[i]*a[i] products;
744         #
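#       For an n-limb a[] (t[] cleared first), the two phases amount
#       to this C sketch (illustrative only, same u128 conventions as
#       the reference routine at the top of the file):
#
#       /* a) cross products: t[] = sum over i<j of a[i]*a[j]*B^(i+j) */
#       for (int i = 0; i < n; i++) {
#               u128 c = 0;
#               for (int j = i + 1; j < n; j++) {
#                       c += (u128)a[i] * a[j] + t[i + j];
#                       t[i + j] = (BN_ULONG)c;
#                       c >>= 64;
#               }
#               t[i + n] = (BN_ULONG)c;
#       }
#       /* b) t = 2*t + squares: each a[i]*a[i] contributes its low
#          half at limb 2*i and its high half at limb 2*i+1 */
#       BN_ULONG cf = 0, shift = 0;
#       for (int i = 0; i < 2 * n; i++) {
#               u128 sq = (u128)a[i / 2] * a[i / 2];
#               u128 v = (u128)((t[i] << 1) | shift) + cf;
#               v += (i & 1) ? (BN_ULONG)(sq >> 64) : (BN_ULONG)sq;
#               shift = t[i] >> 63;
#               t[i] = (BN_ULONG)v;
#               cf = (BN_ULONG)(v >> 64);
#       }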
745         lea     32(%r10),$i             # $i=-($num-32)
746         lea     ($aptr,$num),$aptr      # end of a[] buffer, ($aptr,$i)=&ap[2]
747
748         mov     $num,$j                 # $j=$num
749
750                                         # comments apply to $num==8 case
751         mov     -32($aptr,$i),$a0       # a[0]
752         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
753         mov     -24($aptr,$i),%rax      # a[1]
754         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
755         mov     -16($aptr,$i),$ai       # a[2]
756         mov     %rax,$a1
757
758         mul     $a0                     # a[1]*a[0]
759         mov     %rax,$A0[0]             # a[1]*a[0]
760          mov    $ai,%rax                # a[2]
761         mov     %rdx,$A0[1]
762         mov     $A0[0],-24($tptr,$i)    # t[1]
763
764         xor     $A0[0],$A0[0]
765         mul     $a0                     # a[2]*a[0]
766         add     %rax,$A0[1]
767          mov    $ai,%rax
768         adc     %rdx,$A0[0]
769         mov     $A0[1],-16($tptr,$i)    # t[2]
770
771         lea     -16($i),$j              # j=-16
772
773
774          mov    8($aptr,$j),$ai         # a[3]
775         mul     $a1                     # a[2]*a[1]
776         mov     %rax,$A1[0]             # a[2]*a[1]+t[3]
777          mov    $ai,%rax
778         mov     %rdx,$A1[1]
779
780         xor     $A0[1],$A0[1]
781         add     $A1[0],$A0[0]
782          lea    16($j),$j
783         adc     \$0,$A0[1]
784         mul     $a0                     # a[3]*a[0]
785         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
786          mov    $ai,%rax
787         adc     %rdx,$A0[1]
788         mov     $A0[0],-8($tptr,$j)     # t[3]
789         jmp     .Lsqr4x_1st
790
791 .align  16
792 .Lsqr4x_1st:
793          mov    ($aptr,$j),$ai          # a[4]
794         xor     $A1[0],$A1[0]
795         mul     $a1                     # a[3]*a[1]
796         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
797          mov    $ai,%rax
798         adc     %rdx,$A1[0]
799
800         xor     $A0[0],$A0[0]
801         add     $A1[1],$A0[1]
802         adc     \$0,$A0[0]
803         mul     $a0                     # a[4]*a[0]
804         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
805          mov    $ai,%rax                # a[4]
806         adc     %rdx,$A0[0]
807         mov     $A0[1],($tptr,$j)       # t[4]
808
809
810          mov    8($aptr,$j),$ai         # a[5]
811         xor     $A1[1],$A1[1]
812         mul     $a1                     # a[4]*a[3]
813         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
814          mov    $ai,%rax
815         adc     %rdx,$A1[1]
816
817         xor     $A0[1],$A0[1]
818         add     $A1[0],$A0[0]
819          lea    16($j),$j
820         adc     \$0,$A0[1]
821         mul     $a0                     # a[5]*a[2]
822         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
823          mov    $ai,%rax
824         adc     %rdx,$A0[1]
825         mov     $A0[0],-8($tptr,$j)     # t[5]
826
827          mov    ($aptr,$j),$ai          # a[6]
828         xor     $A1[0],$A1[0]
829         mul     $a1                     # a[5]*a[3]
830         add     %rax,$A1[1]             # a[5]*a[3]+t[6]
831          mov    $ai,%rax
832         adc     %rdx,$A1[0]
833
834         xor     $A0[0],$A0[0]
835         add     $A1[1],$A0[1]
836         adc     \$0,$A0[0]
837         mul     $a0                     # a[6]*a[2]
838         add     %rax,$A0[1]             # a[6]*a[2]+a[5]*a[3]+t[6]
839          mov    $ai,%rax                # a[6]
840         adc     %rdx,$A0[0]
841         mov     $A0[1],($tptr,$j)       # t[6]
842
843
844          mov    8($aptr,$j),$ai         # a[7]
845         xor     $A1[1],$A1[1]
846         mul     $a1                     # a[6]*a[5]
847         add     %rax,$A1[0]             # a[6]*a[5]+t[7]
848          mov    $ai,%rax
849         adc     %rdx,$A1[1]
850
851         xor     $A0[1],$A0[1]
852         add     $A1[0],$A0[0]
853          lea    16($j),$j
854         adc     \$0,$A0[1]
855         mul     $a0                     # a[7]*a[4]
856         add     %rax,$A0[0]             # a[7]*a[4]+a[6]*a[5]+t[7]
857          mov    $ai,%rax
858         adc     %rdx,$A0[1]
859         mov     $A0[0],-8($tptr,$j)     # t[7]
860
861         cmp     \$0,$j
862         jne     .Lsqr4x_1st
863
864         xor     $A1[0],$A1[0]
865         add     $A0[1],$A1[1]
866         adc     \$0,$A1[0]
867         mul     $a1                     # a[7]*a[5]
868         add     %rax,$A1[1]
869         adc     %rdx,$A1[0]
870
871         mov     $A1[1],($tptr)          # t[8]
872         lea     16($i),$i
873         mov     $A1[0],8($tptr)         # t[9]
874         jmp     .Lsqr4x_outer
875
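#       Each trip around .Lsqr4x_outer advances the window base by two
#       limbs, so every pass accumulates two fewer cross products than
#       the one before, until only the final pair of limbs is left.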
876 .align  16
877 .Lsqr4x_outer:                          # comments apply to $num==6 case
878         mov     -32($aptr,$i),$a0       # a[0]
879         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
880         mov     -24($aptr,$i),%rax      # a[1]
881         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
882         mov     -16($aptr,$i),$ai       # a[2]
883         mov     %rax,$a1
884
885         mov     -24($tptr,$i),$A0[0]    # t[1]
886         xor     $A0[1],$A0[1]
887         mul     $a0                     # a[1]*a[0]
888         add     %rax,$A0[0]             # a[1]*a[0]+t[1]
889          mov    $ai,%rax                # a[2]
890         adc     %rdx,$A0[1]
891         mov     $A0[0],-24($tptr,$i)    # t[1]
892
893         xor     $A0[0],$A0[0]
894         add     -16($tptr,$i),$A0[1]    # a[2]*a[0]+t[2]
895         adc     \$0,$A0[0]
896         mul     $a0                     # a[2]*a[0]
897         add     %rax,$A0[1]
898          mov    $ai,%rax
899         adc     %rdx,$A0[0]
900         mov     $A0[1],-16($tptr,$i)    # t[2]
901
902         lea     -16($i),$j              # j=-16
903         xor     $A1[0],$A1[0]
904
905
906          mov    8($aptr,$j),$ai         # a[3]
907         xor     $A1[1],$A1[1]
908         add     8($tptr,$j),$A1[0]
909         adc     \$0,$A1[1]
910         mul     $a1                     # a[2]*a[1]
911         add     %rax,$A1[0]             # a[2]*a[1]+t[3]
912          mov    $ai,%rax
913         adc     %rdx,$A1[1]
914
915         xor     $A0[1],$A0[1]
916         add     $A1[0],$A0[0]
917         adc     \$0,$A0[1]
918         mul     $a0                     # a[3]*a[0]
919         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
920          mov    $ai,%rax
921         adc     %rdx,$A0[1]
922         mov     $A0[0],8($tptr,$j)      # t[3]
923
924         lea     16($j),$j
925         jmp     .Lsqr4x_inner
926
927 .align  16
928 .Lsqr4x_inner:
929          mov    ($aptr,$j),$ai          # a[4]
930         xor     $A1[0],$A1[0]
931         add     ($tptr,$j),$A1[1]
932         adc     \$0,$A1[0]
933         mul     $a1                     # a[3]*a[1]
934         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
935          mov    $ai,%rax
936         adc     %rdx,$A1[0]
937
938         xor     $A0[0],$A0[0]
939         add     $A1[1],$A0[1]
940         adc     \$0,$A0[0]
941         mul     $a0                     # a[4]*a[0]
942         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
943          mov    $ai,%rax                # a[4]
944         adc     %rdx,$A0[0]
945         mov     $A0[1],($tptr,$j)       # t[4]
946
947          mov    8($aptr,$j),$ai         # a[5]
948         xor     $A1[1],$A1[1]
949         add     8($tptr,$j),$A1[0]
950         adc     \$0,$A1[1]
951         mul     $a1                     # a[4]*a[3]
952         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
953          mov    $ai,%rax
954         adc     %rdx,$A1[1]
955
956         xor     $A0[1],$A0[1]
957         add     $A1[0],$A0[0]
958         lea     16($j),$j               # j+=16 (two limbs)
959         adc     \$0,$A0[1]
960         mul     $a0                     # a[5]*a[2]
961         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
962          mov    $ai,%rax
963         adc     %rdx,$A0[1]
964         mov     $A0[0],-8($tptr,$j)     # t[5]
965
966         cmp     \$0,$j
967         jne     .Lsqr4x_inner
968
969         xor     $A1[0],$A1[0]
970         add     $A0[1],$A1[1]
971         adc     \$0,$A1[0]
972         mul     $a1                     # a[5]*a[3]
973         add     %rax,$A1[1]
974         adc     %rdx,$A1[0]
975
976         mov     $A1[1],($tptr)          # t[6]
977         mov     $A1[0],8($tptr)         # t[7]
978
979         add     \$16,$i
980         jnz     .Lsqr4x_outer
981
982                                         # comments apply to $num==4 case
983         mov     -32($aptr),$a0          # a[0]
984         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
985         mov     -24($aptr),%rax         # a[1]
986         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
987         mov     -16($aptr),$ai          # a[2]
988         mov     %rax,$a1
989
990         mov     -24($tptr),$A0[0]       # t[1]
991         xor     $A0[1],$A0[1]
992         mul     $a0                     # a[1]*a[0]
993         add     %rax,$A0[0]             # a[1]*a[0]+t[1]
994          mov    $ai,%rax                # a[2]
995         adc     %rdx,$A0[1]
996         mov     $A0[0],-24($tptr)       # t[1]
997
998         xor     $A0[0],$A0[0]
999         add     -16($tptr),$A0[1]       # a[2]*a[0]+t[2]
1000         adc     \$0,$A0[0]
1001         mul     $a0                     # a[2]*a[0]
1002         add     %rax,$A0[1]
1003          mov    $ai,%rax
1004         adc     %rdx,$A0[0]
1005         mov     $A0[1],-16($tptr)       # t[2]
1006
1007         xor     $A1[0],$A1[0]
1008          mov    -8($aptr),$ai           # a[3]
1009         xor     $A1[1],$A1[1]
1010         add     -8($tptr),$A1[0]
1011         adc     \$0,$A1[1]
1012         mul     $a1                     # a[2]*a[1]
1013         add     %rax,$A1[0]             # a[2]*a[1]+t[3]
1014          mov    $ai,%rax
1015         adc     %rdx,$A1[1]
1016
1017         xor     $A0[1],$A0[1]
1018         add     $A1[0],$A0[0]
1019         adc     \$0,$A0[1]
1020         mul     $a0                     # a[3]*a[0]
1021         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
1022          mov    $ai,%rax
1023         adc     %rdx,$A0[1]
1024         mov     $A0[0],-8($tptr)        # t[3]
1025
1026         xor     $A1[0],$A1[0]
1027         add     $A0[1],$A1[1]
1028         adc     \$0,$A1[0]
1029         mul     $a1                     # a[3]*a[1]
1030         add     %rax,$A1[1]
1031          mov    -16($aptr),%rax         # a[2]
1032         adc     %rdx,$A1[0]
1033
1034         mov     $A1[1],($tptr)          # t[4]
1035         mov     $A1[0],8($tptr)         # t[5]
1036
1037         mul     $ai                     # a[2]*a[3]
1038 ___
1039 {
1040 my ($shift,$carry)=($a0,$a1);
1041 my @S=(@A1,$ai,$n0);
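# Interleaving note: CF cannot survive the shr/lea shifting steps, so
# the running carry of the a[i]*a[i] additions is parked in a register
# with sbb and brought back with neg around every adc pair. A C-ish
# sketch of the idiom:
#
#	carry = 0 - CF;		/* sbb carry,carry : save CF     */
#	...			/* flag-clobbering shift steps   */
#	CF = (carry != 0);	/* neg carry : restore CF        */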
1042 $code.=<<___;
1043          add    \$16,$i
1044          xor    $shift,$shift
1045          sub    $num,$i                 # $i=16-$num
1046          xor    $carry,$carry
1047
1048         add     $A1[0],%rax             # t[5]
1049         adc     \$0,%rdx
1050         mov     %rax,8($tptr)           # t[5]
1051         mov     %rdx,16($tptr)          # t[6]
1052         mov     $carry,24($tptr)        # t[7]
1053
1054          mov    -16($aptr,$i),%rax      # a[0]
1055         lea     64(%rsp,$num,2),$tptr
1056          xor    $A0[0],$A0[0]           # t[0]
1057          mov    -24($tptr,$i,2),$A0[1]  # t[1]
1058
1059         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1060         shr     \$63,$A0[0]
1061         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1062         shr     \$63,$A0[1]
1063         or      $A0[0],$S[1]            # | t[2*i]>>63
1064          mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
1065         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1066         mul     %rax                    # a[i]*a[i]
1067         neg     $carry                  # mov $carry,cf
1068          mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1069         adc     %rax,$S[0]
1070          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1071         mov     $S[0],-32($tptr,$i,2)
1072         adc     %rdx,$S[1]
1073
1074         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1075          mov    $S[1],-24($tptr,$i,2)
1076          sbb    $carry,$carry           # mov cf,$carry
1077         shr     \$63,$A0[0]
1078         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1079         shr     \$63,$A0[1]
1080         or      $A0[0],$S[3]            # | t[2*i]>>63
1081          mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
1082         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1083         mul     %rax                    # a[i]*a[i]
1084         neg     $carry                  # mov $carry,cf
1085          mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
1086         adc     %rax,$S[2]
1087          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1088         mov     $S[2],-16($tptr,$i,2)
1089         adc     %rdx,$S[3]
1090         lea     16($i),$i
1091         mov     $S[3],-40($tptr,$i,2)
1092         sbb     $carry,$carry           # mov cf,$carry
1093         jmp     .Lsqr4x_shift_n_add
1094
1095 .align  16
1096 .Lsqr4x_shift_n_add:
1097         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1098         shr     \$63,$A0[0]
1099         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1100         shr     \$63,$A0[1]
1101         or      $A0[0],$S[1]            # | t[2*i]>>63
1102          mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
1103         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1104         mul     %rax                    # a[i]*a[i]
1105         neg     $carry                  # mov $carry,cf
1106          mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1107         adc     %rax,$S[0]
1108          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1109         mov     $S[0],-32($tptr,$i,2)
1110         adc     %rdx,$S[1]
1111
1112         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1113          mov    $S[1],-24($tptr,$i,2)
1114          sbb    $carry,$carry           # mov cf,$carry
1115         shr     \$63,$A0[0]
1116         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1117         shr     \$63,$A0[1]
1118         or      $A0[0],$S[3]            # | t[2*i]>>63
1119          mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
1120         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1121         mul     %rax                    # a[i]*a[i]
1122         neg     $carry                  # mov $carry,cf
1123          mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
1124         adc     %rax,$S[2]
1125          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1126         mov     $S[2],-16($tptr,$i,2)
1127         adc     %rdx,$S[3]
1128
1129         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1130          mov    $S[3],-8($tptr,$i,2)
1131          sbb    $carry,$carry           # mov cf,$carry
1132         shr     \$63,$A0[0]
1133         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1134         shr     \$63,$A0[1]
1135         or      $A0[0],$S[1]            # | t[2*i]>>63
1136          mov    16($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
1137         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1138         mul     %rax                    # a[i]*a[i]
1139         neg     $carry                  # mov $carry,cf
1140          mov    24($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1141         adc     %rax,$S[0]
1142          mov    8($aptr,$i),%rax        # a[i+1]        # prefetch
1143         mov     $S[0],0($tptr,$i,2)
1144         adc     %rdx,$S[1]
1145
1146         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1147          mov    $S[1],8($tptr,$i,2)
1148          sbb    $carry,$carry           # mov cf,$carry
1149         shr     \$63,$A0[0]
1150         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1151         shr     \$63,$A0[1]
1152         or      $A0[0],$S[3]            # | t[2*i]>>63
1153          mov    32($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
1154         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1155         mul     %rax                    # a[i]*a[i]
1156         neg     $carry                  # mov $carry,cf
1157          mov    40($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1158         adc     %rax,$S[2]
1159          mov    16($aptr,$i),%rax       # a[i+1]        # prefetch
1160         mov     $S[2],16($tptr,$i,2)
1161         adc     %rdx,$S[3]
1162         mov     $S[3],24($tptr,$i,2)
1163         sbb     $carry,$carry           # mov cf,$carry
1164         add     \$32,$i
1165         jnz     .Lsqr4x_shift_n_add
1166
1167         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1168         shr     \$63,$A0[0]
1169         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1170         shr     \$63,$A0[1]
1171         or      $A0[0],$S[1]            # | t[2*i]>>63
1172          mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
1173         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1174         mul     %rax                    # a[i]*a[i]
1175         neg     $carry                  # mov $carry,cf
1176          mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1177         adc     %rax,$S[0]
1178          mov    -8($aptr),%rax          # a[i+1]        # prefetch
1179         mov     $S[0],-32($tptr)
1180         adc     %rdx,$S[1]
1181
1182         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1183          mov    $S[1],-24($tptr)
1184          sbb    $carry,$carry           # mov cf,$carry
1185         shr     \$63,$A0[0]
1186         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1187         shr     \$63,$A0[1]
1188         or      $A0[0],$S[3]            # | t[2*i]>>63
1189         mul     %rax                    # a[i]*a[i]
1190         neg     $carry                  # mov $carry,cf
1191         adc     %rax,$S[2]
1192         adc     %rdx,$S[3]
1193         mov     $S[2],-16($tptr)
1194         mov     $S[3],-8($tptr)
1195 ___
1196 }\f
1197 ##############################################################
1198 # Montgomery reduction part, "word-by-word" algorithm.
1199 #
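# In outline the reduction performs (C sketch, same conventions as the
# reference routine at the top of the file; t[] is the 2*num-limb
# squaring result, n[] the modulus, n0 as before):
#
#	BN_ULONG topbit = 0;
#	for (int i = 0; i < num; i++) {
#		BN_ULONG m = t[i] * n0;	/* t[i] + m*n[0] == 0 mod 2^64 */
#		u128 c = 0;
#		for (int j = 0; j < num; j++) {
#			c += (u128)n[j] * m + t[i + j];
#			t[i + j] = (BN_ULONG)c;	/* t[i] becomes 0 */
#			c >>= 64;
#		}
#		c += (u128)t[i + num] + topbit;
#		t[i + num] = (BN_ULONG)c;
#		topbit = (BN_ULONG)(c >> 64);
#	}
#	/* result is t[num..2*num-1], plus topbit */
#
# The code below performs two such steps per .Lsqr4x_mont_outer pass
# (m0 and m1), with the loads marked "modsched" hoisted across
# iterations.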
1200 {
1201 my ($topbit,$nptr)=("%rbp",$aptr);
1202 my ($m0,$m1)=($a0,$a1);
1203 my @Ni=("%rbx","%r9");
1204 $code.=<<___;
1205         mov     40(%rsp),$nptr          # restore $nptr
1206         mov     48(%rsp),$n0            # restore *n0
1207         xor     $j,$j
1208         mov     $num,0(%rsp)            # save $num
1209         sub     $num,$j                 # $j=-$num
1210          mov    64(%rsp),$A0[0]         # t[0]          # modsched #
1211          mov    $n0,$m0                 #               # modsched #
1212         lea     64(%rsp,$num,2),%rax    # end of t[] buffer
1213         lea     64(%rsp,$num),$tptr     # end of t[] window
1214         mov     %rax,8(%rsp)            # save end of t[] buffer
1215         lea     ($nptr,$num),$nptr      # end of n[] buffer
1216         xor     $topbit,$topbit         # $topbit=0
1217
1218         mov     0($nptr,$j),%rax        # n[0]          # modsched #
1219         mov     8($nptr,$j),$Ni[1]      # n[1]          # modsched #
1220          imulq  $A0[0],$m0              # m0=t[0]*n0    # modsched #
1221          mov    %rax,$Ni[0]             #               # modsched #
1222         jmp     .Lsqr4x_mont_outer
1223
1224 .align  16
1225 .Lsqr4x_mont_outer:
1226         xor     $A0[1],$A0[1]
1227         mul     $m0                     # n[0]*m0
1228         add     %rax,$A0[0]             # n[0]*m0+t[0]
1229          mov    $Ni[1],%rax
1230         adc     %rdx,$A0[1]
1231         mov     $n0,$m1
1232
1233         xor     $A0[0],$A0[0]
1234         add     8($tptr,$j),$A0[1]
1235         adc     \$0,$A0[0]
1236         mul     $m0                     # n[1]*m0
1237         add     %rax,$A0[1]             # n[1]*m0+t[1]
1238          mov    $Ni[0],%rax
1239         adc     %rdx,$A0[0]
1240
1241         imulq   $A0[1],$m1
1242
1243         mov     16($nptr,$j),$Ni[0]     # n[2]
1244         xor     $A1[1],$A1[1]
1245         add     $A0[1],$A1[0]
1246         adc     \$0,$A1[1]
1247         mul     $m1                     # n[0]*m1
1248         add     %rax,$A1[0]             # n[0]*m1+"t[1]"
1249          mov    $Ni[0],%rax
1250         adc     %rdx,$A1[1]
1251         mov     $A1[0],8($tptr,$j)      # "t[1]"
1252
1253         xor     $A0[1],$A0[1]
1254         add     16($tptr,$j),$A0[0]
1255         adc     \$0,$A0[1]
1256         mul     $m0                     # n[2]*m0
1257         add     %rax,$A0[0]             # n[2]*m0+t[2]
1258          mov    $Ni[1],%rax
1259         adc     %rdx,$A0[1]
1260
1261         mov     24($nptr,$j),$Ni[1]     # n[3]
1262         xor     $A1[0],$A1[0]
1263         add     $A0[0],$A1[1]
1264         adc     \$0,$A1[0]
1265         mul     $m1                     # n[1]*m1
1266         add     %rax,$A1[1]             # n[1]*m1+"t[2]"
1267          mov    $Ni[1],%rax
1268         adc     %rdx,$A1[0]
1269         mov     $A1[1],16($tptr,$j)     # "t[2]"
1270
1271         xor     $A0[0],$A0[0]
1272         add     24($tptr,$j),$A0[1]
1273         lea     32($j),$j
1274         adc     \$0,$A0[0]
1275         mul     $m0                     # n[3]*m0
1276         add     %rax,$A0[1]             # n[3]*m0+t[3]
1277          mov    $Ni[0],%rax
1278         adc     %rdx,$A0[0]
1279         jmp     .Lsqr4x_mont_inner
1280
1281 .align  16
1282 .Lsqr4x_mont_inner:
1283         mov     ($nptr,$j),$Ni[0]       # n[4]
1284         xor     $A1[1],$A1[1]
1285         add     $A0[1],$A1[0]
1286         adc     \$0,$A1[1]
1287         mul     $m1                     # n[2]*m1
1288         add     %rax,$A1[0]             # n[2]*m1+"t[3]"
1289          mov    $Ni[0],%rax
1290         adc     %rdx,$A1[1]
1291         mov     $A1[0],-8($tptr,$j)     # "t[3]"
1292
1293         xor     $A0[1],$A0[1]
1294         add     ($tptr,$j),$A0[0]
1295         adc     \$0,$A0[1]
1296         mul     $m0                     # n[4]*m0
1297         add     %rax,$A0[0]             # n[4]*m0+t[4]
1298          mov    $Ni[1],%rax
1299         adc     %rdx,$A0[1]
1300
1301         mov     8($nptr,$j),$Ni[1]      # n[5]
1302         xor     $A1[0],$A1[0]
1303         add     $A0[0],$A1[1]
1304         adc     \$0,$A1[0]
1305         mul     $m1                     # n[3]*m1
1306         add     %rax,$A1[1]             # n[3]*m1+"t[4]"
1307          mov    $Ni[1],%rax
1308         adc     %rdx,$A1[0]
1309         mov     $A1[1],($tptr,$j)       # "t[4]"
1310
1311         xor     $A0[0],$A0[0]
1312         add     8($tptr,$j),$A0[1]
1313         adc     \$0,$A0[0]
1314         mul     $m0                     # n[5]*m0
1315         add     %rax,$A0[1]             # n[5]*m0+t[5]
1316          mov    $Ni[0],%rax
1317         adc     %rdx,$A0[0]
1318
1319
1320         mov     16($nptr,$j),$Ni[0]     # n[6]
1321         xor     $A1[1],$A1[1]
1322         add     $A0[1],$A1[0]
1323         adc     \$0,$A1[1]
1324         mul     $m1                     # n[4]*m1
1325         add     %rax,$A1[0]             # n[4]*m1+"t[5]"
1326          mov    $Ni[0],%rax
1327         adc     %rdx,$A1[1]
1328         mov     $A1[0],8($tptr,$j)      # "t[5]"
1329
1330         xor     $A0[1],$A0[1]
1331         add     16($tptr,$j),$A0[0]
1332         adc     \$0,$A0[1]
1333         mul     $m0                     # n[6]*m0
1334         add     %rax,$A0[0]             # n[6]*m0+t[6]
1335          mov    $Ni[1],%rax
1336         adc     %rdx,$A0[1]
1337
1338         mov     24($nptr,$j),$Ni[1]     # n[7]
1339         xor     $A1[0],$A1[0]
1340         add     $A0[0],$A1[1]
1341         adc     \$0,$A1[0]
1342         mul     $m1                     # n[5]*m1
1343         add     %rax,$A1[1]             # n[5]*m1+"t[6]"
1344          mov    $Ni[1],%rax
1345         adc     %rdx,$A1[0]
1346         mov     $A1[1],16($tptr,$j)     # "t[6]"
1347
1348         xor     $A0[0],$A0[0]
1349         add     24($tptr,$j),$A0[1]
1350         lea     32($j),$j
1351         adc     \$0,$A0[0]
1352         mul     $m0                     # n[7]*m0
1353         add     %rax,$A0[1]             # n[7]*m0+t[7]
1354          mov    $Ni[0],%rax
1355         adc     %rdx,$A0[0]
1356         cmp     \$0,$j
1357         jne     .Lsqr4x_mont_inner
1358
1359          sub    0(%rsp),$j              # $j=-$num      # modsched #
1360          mov    $n0,$m0                 #               # modsched #
1361
1362         xor     $A1[1],$A1[1]
1363         add     $A0[1],$A1[0]
1364         adc     \$0,$A1[1]
1365         mul     $m1                     # n[6]*m1
1366         add     %rax,$A1[0]             # n[6]*m1+"t[7]"
1367         mov     $Ni[1],%rax
1368         adc     %rdx,$A1[1]
1369         mov     $A1[0],-8($tptr)        # "t[7]"
1370
1371         xor     $A0[1],$A0[1]
1372         add     ($tptr),$A0[0]          # +t[8]
1373         adc     \$0,$A0[1]
1374          mov    0($nptr,$j),$Ni[0]      # n[0]          # modsched #
1375         add     $topbit,$A0[0]
1376         adc     \$0,$A0[1]
1377
1378          imulq  16($tptr,$j),$m0        # m0=t[0]*n0    # modsched #
1379         xor     $A1[0],$A1[0]
1380          mov    8($nptr,$j),$Ni[1]      # n[1]          # modsched #
1381         add     $A0[0],$A1[1]
1382          mov    16($tptr,$j),$A0[0]     # t[0]          # modsched #
1383         adc     \$0,$A1[0]
1384         mul     $m1                     # n[7]*m1
1385         add     %rax,$A1[1]             # n[7]*m1+"t[8]"
1386          mov    $Ni[0],%rax             #               # modsched #
1387         adc     %rdx,$A1[0]
1388         mov     $A1[1],($tptr)          # "t[8]"
1389
1390         xor     $topbit,$topbit
1391         add     8($tptr),$A1[0]         # +t[9]
1392         adc     $topbit,$topbit
1393         add     $A0[1],$A1[0]
1394         lea     16($tptr),$tptr         # "t[$num]>>128"
1395         adc     \$0,$topbit
1396         mov     $A1[0],-8($tptr)        # "t[9]"
1397         cmp     8(%rsp),$tptr           # are we done?
1398         jb      .Lsqr4x_mont_outer
1399
1400         mov     0(%rsp),$num            # restore $num
1401         mov     $topbit,($tptr)         # save $topbit
1402 ___
1403 }\f
1404 ##############################################################
1405 # Post-condition, 4x unrolled copy from bn_mul_mont
1406 #
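# As in bn_mul_mont: subtract n[] once, select the reduced or the
# unreduced copy branch-free, and wipe t[] while copying so that no
# intermediate data survives on the stack. Roughly (sketch only; the
# result sits in the upper half of t[]):
#
#	src = borrow ? t + num : rp;	/* masked, not a real branch */
#	for (int i = 0; i < num; i++) rp[i] = src[i];
#	memset(t, 0, 2 * num * sizeof(BN_ULONG));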
1407 {
1408 my ($tptr,$nptr)=("%rbx",$aptr);
1409 my @ri=("%rax","%rdx","%r10","%r11");
1410 $code.=<<___;
1411         mov     64(%rsp,$num),@ri[0]    # tp[0]
1412         lea     64(%rsp,$num),$tptr     # upper half of t[2*$num] holds result
1413         mov     40(%rsp),$nptr          # restore $nptr
1414         shr     \$5,$num                # num/4
1415         mov     8($tptr),@ri[1]         # t[1]
1416         xor     $i,$i                   # i=0 and clear CF!
1417
1418         mov     32(%rsp),$rptr          # restore $rptr
1419         sub     0($nptr),@ri[0]
1420         mov     16($tptr),@ri[2]        # t[2]
1421         mov     24($tptr),@ri[3]        # t[3]
1422         sbb     8($nptr),@ri[1]
1423         lea     -1($num),$j             # j=num/4-1
1424         jmp     .Lsqr4x_sub
1425 .align  16
1426 .Lsqr4x_sub:
1427         mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1428         mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1429         sbb     16($nptr,$i,8),@ri[2]
1430         mov     32($tptr,$i,8),@ri[0]   # tp[i+1]
1431         mov     40($tptr,$i,8),@ri[1]
1432         sbb     24($nptr,$i,8),@ri[3]
1433         mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1434         mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1435         sbb     32($nptr,$i,8),@ri[0]
1436         mov     48($tptr,$i,8),@ri[2]
1437         mov     56($tptr,$i,8),@ri[3]
1438         sbb     40($nptr,$i,8),@ri[1]
1439         lea     4($i),$i                # i++
1440         dec     $j                      # doesn't affect CF!
1441         jnz     .Lsqr4x_sub
1442
1443         mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1444         mov     32($tptr,$i,8),@ri[0]   # load overflow bit
1445         sbb     16($nptr,$i,8),@ri[2]
1446         mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1447         sbb     24($nptr,$i,8),@ri[3]
1448         mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1449
1450         sbb     \$0,@ri[0]              # handle upmost overflow bit
1451         mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1452         xor     $i,$i                   # i=0
1453         and     @ri[0],$tptr
1454         not     @ri[0]
1455         mov     $rptr,$nptr
1456         and     @ri[0],$nptr
1457         lea     -1($num),$j
1458         or      $nptr,$tptr             # tp=borrow?tp:rp
1459
1460         pxor    %xmm0,%xmm0
1461         lea     64(%rsp,$num,8),$nptr
1462         movdqu  ($tptr),%xmm1
1463         lea     ($nptr,$num,8),$nptr
1464         movdqa  %xmm0,64(%rsp)          # zap lower half of temporary vector
1465         movdqa  %xmm0,($nptr)           # zap upper half of temporary vector
1466         movdqu  %xmm1,($rptr)
1467         jmp     .Lsqr4x_copy
1468 .align  16
1469 .Lsqr4x_copy:                           # copy or in-place refresh
1470         movdqu  16($tptr,$i),%xmm2
1471         movdqu  32($tptr,$i),%xmm1
1472         movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
1473         movdqa  %xmm0,96(%rsp,$i)       # zap lower half of temporary vector
1474         movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
1475         movdqa  %xmm0,32($nptr,$i)      # zap upper half of temporary vector
1476         movdqu  %xmm2,16($rptr,$i)
1477         movdqu  %xmm1,32($rptr,$i)
1478         lea     32($i),$i
1479         dec     $j
1480         jnz     .Lsqr4x_copy
1481
1482         movdqu  16($tptr,$i),%xmm2
1483         movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
1484         movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
1485         movdqu  %xmm2,16($rptr,$i)
1486 ___
1487 }
1488 $code.=<<___;
1489         mov     56(%rsp),%rsi           # restore %rsp
1490         mov     \$1,%rax
1491         mov     0(%rsi),%r15
1492         mov     8(%rsi),%r14
1493         mov     16(%rsi),%r13
1494         mov     24(%rsi),%r12
1495         mov     32(%rsi),%rbp
1496         mov     40(%rsi),%rbx
1497         lea     48(%rsi),%rsp
1498 .Lsqr4x_epilogue:
1499         ret
1500 .size   bn_sqr4x_mont,.-bn_sqr4x_mont
1501 ___
1502 }}}
1503 $code.=<<___;
1504 .asciz  "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1505 .align  16
1506 ___
1507
1508 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1509 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1510 if ($win64) {
1511 $rec="%rcx";
1512 $frame="%rdx";
1513 $context="%r8";
1514 $disp="%r9";
1515
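# Both handlers implement the same idea; in rough pseudo-C (a sketch
# only, with the CONTEXT/DISPATCHER_CONTEXT field offsets hard-coded
# in the code below):
#
#	if (context->Rip inside [body, epilogue)) {
#		ULONG64 *tp = (ULONG64 *)context->Rsp;
#		ULONG64 *sp = (ULONG64 *)tp[num + 1];	/* tp[num+1]   */
#		/* r15,r14,r13,r12,rbp,rbx sit at sp[0]..sp[5]         */
#		context->Rsp = (ULONG64)(sp + 6);	/* past pushes */
#	}
#	RtlVirtualUnwind(...);		/* continue the normal unwind */
#	return ExceptionContinueSearch;
#
# (sqr_handler fetches the saved stack pointer from its fixed +56
# frame slot instead, per the stack layout documented above.)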
1516 $code.=<<___;
1517 .extern __imp_RtlVirtualUnwind
1518 .type   mul_handler,\@abi-omnipotent
1519 .align  16
1520 mul_handler:
1521         push    %rsi
1522         push    %rdi
1523         push    %rbx
1524         push    %rbp
1525         push    %r12
1526         push    %r13
1527         push    %r14
1528         push    %r15
1529         pushfq
1530         sub     \$64,%rsp
1531
1532         mov     120($context),%rax      # pull context->Rax
1533         mov     248($context),%rbx      # pull context->Rip
1534
1535         mov     8($disp),%rsi           # disp->ImageBase
1536         mov     56($disp),%r11          # disp->HandlerData
1537
1538         mov     0(%r11),%r10d           # HandlerData[0]
1539         lea     (%rsi,%r10),%r10        # end of prologue label
1540         cmp     %r10,%rbx               # context->Rip<end of prologue label
1541         jb      .Lcommon_seh_tail
1542
1543         mov     152($context),%rax      # pull context->Rsp
1544
1545         mov     4(%r11),%r10d           # HandlerData[1]
1546         lea     (%rsi,%r10),%r10        # epilogue label
1547         cmp     %r10,%rbx               # context->Rip>=epilogue label
1548         jae     .Lcommon_seh_tail
1549
1550         mov     192($context),%r10      # pull $num
1551         mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
1552         lea     48(%rax),%rax
1553
1554         mov     -8(%rax),%rbx
1555         mov     -16(%rax),%rbp
1556         mov     -24(%rax),%r12
1557         mov     -32(%rax),%r13
1558         mov     -40(%rax),%r14
1559         mov     -48(%rax),%r15
1560         mov     %rbx,144($context)      # restore context->Rbx
1561         mov     %rbp,160($context)      # restore context->Rbp
1562         mov     %r12,216($context)      # restore context->R12
1563         mov     %r13,224($context)      # restore context->R13
1564         mov     %r14,232($context)      # restore context->R14
1565         mov     %r15,240($context)      # restore context->R15
1566
1567         jmp     .Lcommon_seh_tail
1568 .size   mul_handler,.-mul_handler
1569
1570 .type   sqr_handler,\@abi-omnipotent
1571 .align  16
1572 sqr_handler:
1573         push    %rsi
1574         push    %rdi
1575         push    %rbx
1576         push    %rbp
1577         push    %r12
1578         push    %r13
1579         push    %r14
1580         push    %r15
1581         pushfq
1582         sub     \$64,%rsp
1583
1584         mov     120($context),%rax      # pull context->Rax
1585         mov     248($context),%rbx      # pull context->Rip
1586
1587         lea     .Lsqr4x_body(%rip),%r10
1588         cmp     %r10,%rbx               # context->Rip<.Lsqr4x_body
1589         jb      .Lcommon_seh_tail
1590
1591         mov     152($context),%rax      # pull context->Rsp
1592
1593         lea     .Lsqr4x_epilogue(%rip),%r10
1594         cmp     %r10,%rbx               # context->Rip>=.Lsqr4x_epilogue
1595         jae     .Lcommon_seh_tail
1596
1597         mov     56(%rax),%rax           # pull saved stack pointer
1598         lea     48(%rax),%rax
1599
1600         mov     -8(%rax),%rbx
1601         mov     -16(%rax),%rbp
1602         mov     -24(%rax),%r12
1603         mov     -32(%rax),%r13
1604         mov     -40(%rax),%r14
1605         mov     -48(%rax),%r15
1606         mov     %rbx,144($context)      # restore context->Rbx
1607         mov     %rbp,160($context)      # restore context->Rbp
1608         mov     %r12,216($context)      # restore context->R12
1609         mov     %r13,224($context)      # restore context->R13
1610         mov     %r14,232($context)      # restore context->R14
1611         mov     %r15,240($context)      # restore context->R15
1612
1613 .Lcommon_seh_tail:
1614         mov     8(%rax),%rdi
1615         mov     16(%rax),%rsi
1616         mov     %rax,152($context)      # restore context->Rsp
1617         mov     %rsi,168($context)      # restore context->Rsi
1618         mov     %rdi,176($context)      # restore context->Rdi
1619
1620         mov     40($disp),%rdi          # disp->ContextRecord
1621         mov     $context,%rsi           # context
1622         mov     \$154,%ecx              # sizeof(CONTEXT)
1623         .long   0xa548f3fc              # cld; rep movsq
1624
1625         mov     $disp,%rsi
1626         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1627         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1628         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1629         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1630         mov     40(%rsi),%r10           # disp->ContextRecord
1631         lea     56(%rsi),%r11           # &disp->HandlerData
1632         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1633         mov     %r10,32(%rsp)           # arg5
1634         mov     %r11,40(%rsp)           # arg6
1635         mov     %r12,48(%rsp)           # arg7
1636         mov     %rcx,56(%rsp)           # arg8, (NULL)
1637         call    *__imp_RtlVirtualUnwind(%rip)
1638
1639         mov     \$1,%eax                # ExceptionContinueSearch
1640         add     \$64,%rsp
1641         popfq
1642         pop     %r15
1643         pop     %r14
1644         pop     %r13
1645         pop     %r12
1646         pop     %rbp
1647         pop     %rbx
1648         pop     %rdi
1649         pop     %rsi
1650         ret
1651 .size   sqr_handler,.-sqr_handler
1652
1653 .section        .pdata
1654 .align  4
1655         .rva    .LSEH_begin_bn_mul_mont
1656         .rva    .LSEH_end_bn_mul_mont
1657         .rva    .LSEH_info_bn_mul_mont
1658
1659         .rva    .LSEH_begin_bn_mul4x_mont
1660         .rva    .LSEH_end_bn_mul4x_mont
1661         .rva    .LSEH_info_bn_mul4x_mont
1662
1663         .rva    .LSEH_begin_bn_sqr4x_mont
1664         .rva    .LSEH_end_bn_sqr4x_mont
1665         .rva    .LSEH_info_bn_sqr4x_mont
1666
1667 .section        .xdata
1668 .align  8
1669 .LSEH_info_bn_mul_mont:
1670         .byte   9,0,0,0
1671         .rva    mul_handler
1672         .rva    .Lmul_body,.Lmul_epilogue       # HandlerData[]
1673 .LSEH_info_bn_mul4x_mont:
1674         .byte   9,0,0,0
1675         .rva    mul_handler
1676         .rva    .Lmul4x_body,.Lmul4x_epilogue   # HandlerData[]
1677 .LSEH_info_bn_sqr4x_mont:
1678         .byte   9,0,0,0
1679         .rva    sqr_handler
1680 ___
1681 }
1682
1683 print $code;
1684 close STDOUT;