x86_64 assembler pack update from HEAD.
[openssl.git] / crypto / bn / asm / x86_64-mont.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # October 2005.
11 #
12 # Montgomery multiplication routine for x86_64. While it gives modest
13 # 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
14 # than twice, >2x, as fast. Most common rsa1024 sign is improved by
15 # respectful 50%. It remains to be seen if loop unrolling and
16 # dedicated squaring routine can provide further improvement...
17
18 # July 2011.
19 #
20 # Add dedicated squaring procedure. Performance improvement varies
21 # from platform to platform, but in average it's ~5%/15%/25%/33%
22 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23
24 # August 2011.
25 #
26 # Unroll and modulo-schedule inner loops in such manner that they
27 # are "fallen through" for input lengths of 8, which is critical for
28 # 1024-bit RSA *sign*. Average performance improvement in comparison
29 # to *initial* version of this module from 2005 is ~0%/30%/40%/45%
30 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31
32 $flavour = shift;
33 $output  = shift;
34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
36 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
37
38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41 die "can't locate x86_64-xlate.pl";
42
43 open STDOUT,"| $^X $xlate $flavour $output";
44
45 # int bn_mul_mont(
46 $rp="%rdi";     # BN_ULONG *rp,
47 $ap="%rsi";     # const BN_ULONG *ap,
48 $bp="%rdx";     # const BN_ULONG *bp,
49 $np="%rcx";     # const BN_ULONG *np,
50 $n0="%r8";      # const BN_ULONG *n0,
51 $num="%r9";     # int num);
52 $lo0="%r10";
53 $hi0="%r11";
54 $hi1="%r13";
55 $i="%r14";
56 $j="%r15";
57 $m0="%rbx";
58 $m1="%rbp";
59
60 $code=<<___;
61 .text
62
63 .globl  bn_mul_mont
64 .type   bn_mul_mont,\@function,6
65 .align  16
66 bn_mul_mont:
67         test    \$3,${num}d
68         jnz     .Lmul_enter
69         cmp     \$8,${num}d
70         jb      .Lmul_enter
71         cmp     $ap,$bp
72         jne     .Lmul4x_enter
73         jmp     .Lsqr4x_enter
74
75 .align  16
76 .Lmul_enter:
77         push    %rbx
78         push    %rbp
79         push    %r12
80         push    %r13
81         push    %r14
82         push    %r15
83
84         mov     ${num}d,${num}d
85         lea     2($num),%r10
86         mov     %rsp,%r11
87         neg     %r10
88         lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+2))
89         and     \$-1024,%rsp            # minimize TLB usage
90
91         mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
92 .Lmul_body:
93         mov     $bp,%r12                # reassign $bp
94 ___
95                 $bp="%r12";
96 $code.=<<___;
97         mov     ($n0),$n0               # pull n0[0] value
98         mov     ($bp),$m0               # m0=bp[0]
99         mov     ($ap),%rax
100
101         xor     $i,$i                   # i=0
102         xor     $j,$j                   # j=0
103
104         mov     $n0,$m1
105         mulq    $m0                     # ap[0]*bp[0]
106         mov     %rax,$lo0
107         mov     ($np),%rax
108
109         imulq   $lo0,$m1                # "tp[0]"*n0
110         mov     %rdx,$hi0
111
112         mulq    $m1                     # np[0]*m1
113         add     %rax,$lo0               # discarded
114         mov     8($ap),%rax
115         adc     \$0,%rdx
116         mov     %rdx,$hi1
117
118         lea     1($j),$j                # j++
119         jmp     .L1st_enter
120
121 .align  16
122 .L1st:
123         add     %rax,$hi1
124         mov     ($ap,$j,8),%rax
125         adc     \$0,%rdx
126         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
127         mov     $lo0,$hi0
128         adc     \$0,%rdx
129         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
130         mov     %rdx,$hi1
131
132 .L1st_enter:
133         mulq    $m0                     # ap[j]*bp[0]
134         add     %rax,$hi0
135         mov     ($np,$j,8),%rax
136         adc     \$0,%rdx
137         lea     1($j),$j                # j++
138         mov     %rdx,$lo0
139
140         mulq    $m1                     # np[j]*m1
141         cmp     $num,$j
142         jne     .L1st
143
144         add     %rax,$hi1
145         mov     ($ap),%rax              # ap[0]
146         adc     \$0,%rdx
147         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
148         adc     \$0,%rdx
149         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
150         mov     %rdx,$hi1
151         mov     $lo0,$hi0
152
153         xor     %rdx,%rdx
154         add     $hi0,$hi1
155         adc     \$0,%rdx
156         mov     $hi1,-8(%rsp,$num,8)
157         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
158
159         lea     1($i),$i                # i++
160         jmp     .Louter
161 .align  16
162 .Louter:
163         mov     ($bp,$i,8),$m0          # m0=bp[i]
164         xor     $j,$j                   # j=0
165         mov     $n0,$m1
166         mov     (%rsp),$lo0
167         mulq    $m0                     # ap[0]*bp[i]
168         add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
169         mov     ($np),%rax
170         adc     \$0,%rdx
171
172         imulq   $lo0,$m1                # tp[0]*n0
173         mov     %rdx,$hi0
174
175         mulq    $m1                     # np[0]*m1
176         add     %rax,$lo0               # discarded
177         mov     8($ap),%rax
178         adc     \$0,%rdx
179         mov     8(%rsp),$lo0            # tp[1]
180         mov     %rdx,$hi1
181
182         lea     1($j),$j                # j++
183         jmp     .Linner_enter
184
185 .align  16
186 .Linner:
187         add     %rax,$hi1
188         mov     ($ap,$j,8),%rax
189         adc     \$0,%rdx
190         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
191         mov     (%rsp,$j,8),$lo0
192         adc     \$0,%rdx
193         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
194         mov     %rdx,$hi1
195
196 .Linner_enter:
197         mulq    $m0                     # ap[j]*bp[i]
198         add     %rax,$hi0
199         mov     ($np,$j,8),%rax
200         adc     \$0,%rdx
201         add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
202         mov     %rdx,$hi0
203         adc     \$0,$hi0
204         lea     1($j),$j                # j++
205
206         mulq    $m1                     # np[j]*m1
207         cmp     $num,$j
208         jne     .Linner
209
210         add     %rax,$hi1
211         mov     ($ap),%rax              # ap[0]
212         adc     \$0,%rdx
213         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
214         mov     (%rsp,$j,8),$lo0
215         adc     \$0,%rdx
216         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
217         mov     %rdx,$hi1
218
219         xor     %rdx,%rdx
220         add     $hi0,$hi1
221         adc     \$0,%rdx
222         add     $lo0,$hi1               # pull upmost overflow bit
223         adc     \$0,%rdx
224         mov     $hi1,-8(%rsp,$num,8)
225         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
226
227         lea     1($i),$i                # i++
228         cmp     $num,$i
229         jl      .Louter
230
231         xor     $i,$i                   # i=0 and clear CF!
232         mov     (%rsp),%rax             # tp[0]
233         lea     (%rsp),$ap              # borrow ap for tp
234         mov     $num,$j                 # j=num
235         jmp     .Lsub
236 .align  16
237 .Lsub:  sbb     ($np,$i,8),%rax
238         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
239         mov     8($ap,$i,8),%rax        # tp[i+1]
240         lea     1($i),$i                # i++
241         dec     $j                      # doesnn't affect CF!
242         jnz     .Lsub
243
244         sbb     \$0,%rax                # handle upmost overflow bit
245         xor     $i,$i
246         and     %rax,$ap
247         not     %rax
248         mov     $rp,$np
249         and     %rax,$np
250         mov     $num,$j                 # j=num
251         or      $np,$ap                 # ap=borrow?tp:rp
252 .align  16
253 .Lcopy:                                 # copy or in-place refresh
254         mov     ($ap,$i,8),%rax
255         mov     $i,(%rsp,$i,8)          # zap temporary vector
256         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
257         lea     1($i),$i
258         sub     \$1,$j
259         jnz     .Lcopy
260
261         mov     8(%rsp,$num,8),%rsi     # restore %rsp
262         mov     \$1,%rax
263         mov     (%rsi),%r15
264         mov     8(%rsi),%r14
265         mov     16(%rsi),%r13
266         mov     24(%rsi),%r12
267         mov     32(%rsi),%rbp
268         mov     40(%rsi),%rbx
269         lea     48(%rsi),%rsp
270 .Lmul_epilogue:
271         ret
272 .size   bn_mul_mont,.-bn_mul_mont
273 ___
274 {{{
275 my @A=("%r10","%r11");
276 my @N=("%r13","%rdi");
277 $code.=<<___;
278 .type   bn_mul4x_mont,\@function,6
279 .align  16
280 bn_mul4x_mont:
281 .Lmul4x_enter:
282         push    %rbx
283         push    %rbp
284         push    %r12
285         push    %r13
286         push    %r14
287         push    %r15
288
289         mov     ${num}d,${num}d
290         lea     4($num),%r10
291         mov     %rsp,%r11
292         neg     %r10
293         lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
294         and     \$-1024,%rsp            # minimize TLB usage
295
296         mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
297 .Lmul4x_body:
298         mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
299         mov     %rdx,%r12               # reassign $bp
300 ___
301                 $bp="%r12";
302 $code.=<<___;
303         mov     ($n0),$n0               # pull n0[0] value
304         mov     ($bp),$m0               # m0=bp[0]
305         mov     ($ap),%rax
306
307         xor     $i,$i                   # i=0
308         xor     $j,$j                   # j=0
309
310         mov     $n0,$m1
311         mulq    $m0                     # ap[0]*bp[0]
312         mov     %rax,$A[0]
313         mov     ($np),%rax
314
315         imulq   $A[0],$m1               # "tp[0]"*n0
316         mov     %rdx,$A[1]
317
318         mulq    $m1                     # np[0]*m1
319         add     %rax,$A[0]              # discarded
320         mov     8($ap),%rax
321         adc     \$0,%rdx
322         mov     %rdx,$N[1]
323
324         mulq    $m0
325         add     %rax,$A[1]
326         mov     8($np),%rax
327         adc     \$0,%rdx
328         mov     %rdx,$A[0]
329
330         mulq    $m1
331         add     %rax,$N[1]
332         mov     16($ap),%rax
333         adc     \$0,%rdx
334         add     $A[1],$N[1]
335         lea     4($j),$j                # j++
336         adc     \$0,%rdx
337         mov     $N[1],(%rsp)
338         mov     %rdx,$N[0]
339         jmp     .L1st4x
340 .align  16
341 .L1st4x:
342         mulq    $m0                     # ap[j]*bp[0]
343         add     %rax,$A[0]
344         mov     -16($np,$j,8),%rax
345         adc     \$0,%rdx
346         mov     %rdx,$A[1]
347
348         mulq    $m1                     # np[j]*m1
349         add     %rax,$N[0]
350         mov     -8($ap,$j,8),%rax
351         adc     \$0,%rdx
352         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
353         adc     \$0,%rdx
354         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
355         mov     %rdx,$N[1]
356
357         mulq    $m0                     # ap[j]*bp[0]
358         add     %rax,$A[1]
359         mov     -8($np,$j,8),%rax
360         adc     \$0,%rdx
361         mov     %rdx,$A[0]
362
363         mulq    $m1                     # np[j]*m1
364         add     %rax,$N[1]
365         mov     ($ap,$j,8),%rax
366         adc     \$0,%rdx
367         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
368         adc     \$0,%rdx
369         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
370         mov     %rdx,$N[0]
371
372         mulq    $m0                     # ap[j]*bp[0]
373         add     %rax,$A[0]
374         mov     ($np,$j,8),%rax
375         adc     \$0,%rdx
376         mov     %rdx,$A[1]
377
378         mulq    $m1                     # np[j]*m1
379         add     %rax,$N[0]
380         mov     8($ap,$j,8),%rax
381         adc     \$0,%rdx
382         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
383         adc     \$0,%rdx
384         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
385         mov     %rdx,$N[1]
386
387         mulq    $m0                     # ap[j]*bp[0]
388         add     %rax,$A[1]
389         mov     8($np,$j,8),%rax
390         adc     \$0,%rdx
391         lea     4($j),$j                # j++
392         mov     %rdx,$A[0]
393
394         mulq    $m1                     # np[j]*m1
395         add     %rax,$N[1]
396         mov     -16($ap,$j,8),%rax
397         adc     \$0,%rdx
398         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
399         adc     \$0,%rdx
400         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
401         mov     %rdx,$N[0]
402         cmp     $num,$j
403         jl      .L1st4x
404
405         mulq    $m0                     # ap[j]*bp[0]
406         add     %rax,$A[0]
407         mov     -16($np,$j,8),%rax
408         adc     \$0,%rdx
409         mov     %rdx,$A[1]
410
411         mulq    $m1                     # np[j]*m1
412         add     %rax,$N[0]
413         mov     -8($ap,$j,8),%rax
414         adc     \$0,%rdx
415         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
416         adc     \$0,%rdx
417         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
418         mov     %rdx,$N[1]
419
420         mulq    $m0                     # ap[j]*bp[0]
421         add     %rax,$A[1]
422         mov     -8($np,$j,8),%rax
423         adc     \$0,%rdx
424         mov     %rdx,$A[0]
425
426         mulq    $m1                     # np[j]*m1
427         add     %rax,$N[1]
428         mov     ($ap),%rax              # ap[0]
429         adc     \$0,%rdx
430         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
431         adc     \$0,%rdx
432         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
433         mov     %rdx,$N[0]
434
435         xor     $N[1],$N[1]
436         add     $A[0],$N[0]
437         adc     \$0,$N[1]
438         mov     $N[0],-8(%rsp,$j,8)
439         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
440
441         lea     1($i),$i                # i++
442 .align  4
443 .Louter4x:
444         mov     ($bp,$i,8),$m0          # m0=bp[i]
445         xor     $j,$j                   # j=0
446         mov     (%rsp),$A[0]
447         mov     $n0,$m1
448         mulq    $m0                     # ap[0]*bp[i]
449         add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
450         mov     ($np),%rax
451         adc     \$0,%rdx
452
453         imulq   $A[0],$m1               # tp[0]*n0
454         mov     %rdx,$A[1]
455
456         mulq    $m1                     # np[0]*m1
457         add     %rax,$A[0]              # "$N[0]", discarded
458         mov     8($ap),%rax
459         adc     \$0,%rdx
460         mov     %rdx,$N[1]
461
462         mulq    $m0                     # ap[j]*bp[i]
463         add     %rax,$A[1]
464         mov     8($np),%rax
465         adc     \$0,%rdx
466         add     8(%rsp),$A[1]           # +tp[1]
467         adc     \$0,%rdx
468         mov     %rdx,$A[0]
469
470         mulq    $m1                     # np[j]*m1
471         add     %rax,$N[1]
472         mov     16($ap),%rax
473         adc     \$0,%rdx
474         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
475         lea     4($j),$j                # j+=2
476         adc     \$0,%rdx
477         mov     $N[1],(%rsp)            # tp[j-1]
478         mov     %rdx,$N[0]
479         jmp     .Linner4x
480 .align  16
481 .Linner4x:
482         mulq    $m0                     # ap[j]*bp[i]
483         add     %rax,$A[0]
484         mov     -16($np,$j,8),%rax
485         adc     \$0,%rdx
486         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
487         adc     \$0,%rdx
488         mov     %rdx,$A[1]
489
490         mulq    $m1                     # np[j]*m1
491         add     %rax,$N[0]
492         mov     -8($ap,$j,8),%rax
493         adc     \$0,%rdx
494         add     $A[0],$N[0]
495         adc     \$0,%rdx
496         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
497         mov     %rdx,$N[1]
498
499         mulq    $m0                     # ap[j]*bp[i]
500         add     %rax,$A[1]
501         mov     -8($np,$j,8),%rax
502         adc     \$0,%rdx
503         add     -8(%rsp,$j,8),$A[1]
504         adc     \$0,%rdx
505         mov     %rdx,$A[0]
506
507         mulq    $m1                     # np[j]*m1
508         add     %rax,$N[1]
509         mov     ($ap,$j,8),%rax
510         adc     \$0,%rdx
511         add     $A[1],$N[1]
512         adc     \$0,%rdx
513         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
514         mov     %rdx,$N[0]
515
516         mulq    $m0                     # ap[j]*bp[i]
517         add     %rax,$A[0]
518         mov     ($np,$j,8),%rax
519         adc     \$0,%rdx
520         add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
521         adc     \$0,%rdx
522         mov     %rdx,$A[1]
523
524         mulq    $m1                     # np[j]*m1
525         add     %rax,$N[0]
526         mov     8($ap,$j,8),%rax
527         adc     \$0,%rdx
528         add     $A[0],$N[0]
529         adc     \$0,%rdx
530         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
531         mov     %rdx,$N[1]
532
533         mulq    $m0                     # ap[j]*bp[i]
534         add     %rax,$A[1]
535         mov     8($np,$j,8),%rax
536         adc     \$0,%rdx
537         add     8(%rsp,$j,8),$A[1]
538         adc     \$0,%rdx
539         lea     4($j),$j                # j++
540         mov     %rdx,$A[0]
541
542         mulq    $m1                     # np[j]*m1
543         add     %rax,$N[1]
544         mov     -16($ap,$j,8),%rax
545         adc     \$0,%rdx
546         add     $A[1],$N[1]
547         adc     \$0,%rdx
548         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
549         mov     %rdx,$N[0]
550         cmp     $num,$j
551         jl      .Linner4x
552
553         mulq    $m0                     # ap[j]*bp[i]
554         add     %rax,$A[0]
555         mov     -16($np,$j,8),%rax
556         adc     \$0,%rdx
557         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
558         adc     \$0,%rdx
559         mov     %rdx,$A[1]
560
561         mulq    $m1                     # np[j]*m1
562         add     %rax,$N[0]
563         mov     -8($ap,$j,8),%rax
564         adc     \$0,%rdx
565         add     $A[0],$N[0]
566         adc     \$0,%rdx
567         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
568         mov     %rdx,$N[1]
569
570         mulq    $m0                     # ap[j]*bp[i]
571         add     %rax,$A[1]
572         mov     -8($np,$j,8),%rax
573         adc     \$0,%rdx
574         add     -8(%rsp,$j,8),$A[1]
575         adc     \$0,%rdx
576         lea     1($i),$i                # i++
577         mov     %rdx,$A[0]
578
579         mulq    $m1                     # np[j]*m1
580         add     %rax,$N[1]
581         mov     ($ap),%rax              # ap[0]
582         adc     \$0,%rdx
583         add     $A[1],$N[1]
584         adc     \$0,%rdx
585         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
586         mov     %rdx,$N[0]
587
588         xor     $N[1],$N[1]
589         add     $A[0],$N[0]
590         adc     \$0,$N[1]
591         add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
592         adc     \$0,$N[1]
593         mov     $N[0],-8(%rsp,$j,8)
594         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
595
596         cmp     $num,$i
597         jl      .Louter4x
598 ___
599 {
600 my @ri=("%rax","%rdx",$m0,$m1);
601 $code.=<<___;
602         mov     16(%rsp,$num,8),$rp     # restore $rp
603         mov     0(%rsp),@ri[0]          # tp[0]
604         pxor    %xmm0,%xmm0
605         mov     8(%rsp),@ri[1]          # tp[1]
606         shr     \$2,$num                # num/=4
607         lea     (%rsp),$ap              # borrow ap for tp
608         xor     $i,$i                   # i=0 and clear CF!
609
610         sub     0($np),@ri[0]
611         mov     16($ap),@ri[2]          # tp[2]
612         mov     24($ap),@ri[3]          # tp[3]
613         sbb     8($np),@ri[1]
614         lea     -1($num),$j             # j=num/4-1
615         jmp     .Lsub4x
616 .align  16
617 .Lsub4x:
618         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
619         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
620         sbb     16($np,$i,8),@ri[2]
621         mov     32($ap,$i,8),@ri[0]     # tp[i+1]
622         mov     40($ap,$i,8),@ri[1]
623         sbb     24($np,$i,8),@ri[3]
624         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
625         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
626         sbb     32($np,$i,8),@ri[0]
627         mov     48($ap,$i,8),@ri[2]
628         mov     56($ap,$i,8),@ri[3]
629         sbb     40($np,$i,8),@ri[1]
630         lea     4($i),$i                # i++
631         dec     $j                      # doesnn't affect CF!
632         jnz     .Lsub4x
633
634         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
635         mov     32($ap,$i,8),@ri[0]     # load overflow bit
636         sbb     16($np,$i,8),@ri[2]
637         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
638         sbb     24($np,$i,8),@ri[3]
639         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
640
641         sbb     \$0,@ri[0]              # handle upmost overflow bit
642         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
643         xor     $i,$i                   # i=0
644         and     @ri[0],$ap
645         not     @ri[0]
646         mov     $rp,$np
647         and     @ri[0],$np
648         lea     -1($num),$j
649         or      $np,$ap                 # ap=borrow?tp:rp
650
651         movdqu  ($ap),%xmm1
652         movdqa  %xmm0,(%rsp)
653         movdqu  %xmm1,($rp)
654         jmp     .Lcopy4x
655 .align  16
656 .Lcopy4x:                                       # copy or in-place refresh
657         movdqu  16($ap,$i),%xmm2
658         movdqu  32($ap,$i),%xmm1
659         movdqa  %xmm0,16(%rsp,$i)
660         movdqu  %xmm2,16($rp,$i)
661         movdqa  %xmm0,32(%rsp,$i)
662         movdqu  %xmm1,32($rp,$i)
663         lea     32($i),$i
664         dec     $j
665         jnz     .Lcopy4x
666
667         shl     \$2,$num
668         movdqu  16($ap,$i),%xmm2
669         movdqa  %xmm0,16(%rsp,$i)
670         movdqu  %xmm2,16($rp,$i)
671 ___
672 }
673 $code.=<<___;
674         mov     8(%rsp,$num,8),%rsi     # restore %rsp
675         mov     \$1,%rax
676         mov     (%rsi),%r15
677         mov     8(%rsi),%r14
678         mov     16(%rsi),%r13
679         mov     24(%rsi),%r12
680         mov     32(%rsi),%rbp
681         mov     40(%rsi),%rbx
682         lea     48(%rsi),%rsp
683 .Lmul4x_epilogue:
684         ret
685 .size   bn_mul4x_mont,.-bn_mul4x_mont
686 ___
687 }}}
688 \f{{{
689 ######################################################################
690 # void bn_sqr4x_mont(
691 my $rptr="%rdi";        # const BN_ULONG *rptr,
692 my $aptr="%rsi";        # const BN_ULONG *aptr,
693 my $bptr="%rdx";        # not used
694 my $nptr="%rcx";        # const BN_ULONG *nptr,
695 my $n0  ="%r8";         # const BN_ULONG *n0);
696 my $num ="%r9";         # int num, has to be divisible by 4 and
697                         # not less than 8
698
699 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
700 my @A0=("%r10","%r11");
701 my @A1=("%r12","%r13");
702 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
703
704 $code.=<<___;
705 .type   bn_sqr4x_mont,\@function,6
706 .align  16
707 bn_sqr4x_mont:
708 .Lsqr4x_enter:
709         push    %rbx
710         push    %rbp
711         push    %r12
712         push    %r13
713         push    %r14
714         push    %r15
715
716         shl     \$3,${num}d             # convert $num to bytes
717         xor     %r10,%r10
718         mov     %rsp,%r11               # put aside %rsp
719         sub     $num,%r10               # -$num
720         mov     ($n0),$n0               # *n0
721         lea     -72(%rsp,%r10,2),%rsp   # alloca(frame+2*$num)
722         and     \$-1024,%rsp            # minimize TLB usage
723         ##############################################################
724         # Stack layout
725         #
726         # +0    saved $num, used in reduction section
727         # +8    &t[2*$num], used in reduction section
728         # +32   saved $rptr
729         # +40   saved $nptr
730         # +48   saved *n0
731         # +56   saved %rsp
732         # +64   t[2*$num]
733         #
734         mov     $rptr,32(%rsp)          # save $rptr
735         mov     $nptr,40(%rsp)
736         mov     $n0,  48(%rsp)
737         mov     %r11, 56(%rsp)          # save original %rsp
738 .Lsqr4x_body:
739         ##############################################################
740         # Squaring part:
741         #
742         # a) multiply-n-add everything but a[i]*a[i];
743         # b) shift result of a) by 1 to the left and accumulate
744         #    a[i]*a[i] products;
745         #
746         lea     32(%r10),$i             # $i=-($num-32)
747         lea     ($aptr,$num),$aptr      # end of a[] buffer, ($aptr,$i)=&ap[2]
748
749         mov     $num,$j                 # $j=$num
750
751                                         # comments apply to $num==8 case
752         mov     -32($aptr,$i),$a0       # a[0]
753         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
754         mov     -24($aptr,$i),%rax      # a[1]
755         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
756         mov     -16($aptr,$i),$ai       # a[2]
757         mov     %rax,$a1
758
759         mul     $a0                     # a[1]*a[0]
760         mov     %rax,$A0[0]             # a[1]*a[0]
761          mov    $ai,%rax                # a[2]
762         mov     %rdx,$A0[1]
763         mov     $A0[0],-24($tptr,$i)    # t[1]
764
765         xor     $A0[0],$A0[0]
766         mul     $a0                     # a[2]*a[0]
767         add     %rax,$A0[1]
768          mov    $ai,%rax
769         adc     %rdx,$A0[0]
770         mov     $A0[1],-16($tptr,$i)    # t[2]
771
772         lea     -16($i),$j              # j=-16
773
774
775          mov    8($aptr,$j),$ai         # a[3]
776         mul     $a1                     # a[2]*a[1]
777         mov     %rax,$A1[0]             # a[2]*a[1]+t[3]
778          mov    $ai,%rax
779         mov     %rdx,$A1[1]
780
781         xor     $A0[1],$A0[1]
782         add     $A1[0],$A0[0]
783          lea    16($j),$j
784         adc     \$0,$A0[1]
785         mul     $a0                     # a[3]*a[0]
786         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
787          mov    $ai,%rax
788         adc     %rdx,$A0[1]
789         mov     $A0[0],-8($tptr,$j)     # t[3]
790         jmp     .Lsqr4x_1st
791
792 .align  16
793 .Lsqr4x_1st:
794          mov    ($aptr,$j),$ai          # a[4]
795         xor     $A1[0],$A1[0]
796         mul     $a1                     # a[3]*a[1]
797         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
798          mov    $ai,%rax
799         adc     %rdx,$A1[0]
800
801         xor     $A0[0],$A0[0]
802         add     $A1[1],$A0[1]
803         adc     \$0,$A0[0]
804         mul     $a0                     # a[4]*a[0]
805         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
806          mov    $ai,%rax                # a[3]
807         adc     %rdx,$A0[0]
808         mov     $A0[1],($tptr,$j)       # t[4]
809
810
811          mov    8($aptr,$j),$ai         # a[5]
812         xor     $A1[1],$A1[1]
813         mul     $a1                     # a[4]*a[3]
814         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
815          mov    $ai,%rax
816         adc     %rdx,$A1[1]
817
818         xor     $A0[1],$A0[1]
819         add     $A1[0],$A0[0]
820         adc     \$0,$A0[1]
821         mul     $a0                     # a[5]*a[2]
822         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
823          mov    $ai,%rax
824         adc     %rdx,$A0[1]
825         mov     $A0[0],8($tptr,$j)      # t[5]
826
827          mov    16($aptr,$j),$ai        # a[6]
828         xor     $A1[0],$A1[0]
829         mul     $a1                     # a[5]*a[3]
830         add     %rax,$A1[1]             # a[5]*a[3]+t[6]
831          mov    $ai,%rax
832         adc     %rdx,$A1[0]
833
834         xor     $A0[0],$A0[0]
835         add     $A1[1],$A0[1]
836         adc     \$0,$A0[0]
837         mul     $a0                     # a[6]*a[2]
838         add     %rax,$A0[1]             # a[6]*a[2]+a[5]*a[3]+t[6]
839          mov    $ai,%rax                # a[3]
840         adc     %rdx,$A0[0]
841         mov     $A0[1],16($tptr,$j)     # t[6]
842
843
844          mov    24($aptr,$j),$ai        # a[7]
845         xor     $A1[1],$A1[1]
846         mul     $a1                     # a[6]*a[5]
847         add     %rax,$A1[0]             # a[6]*a[5]+t[7]
848          mov    $ai,%rax
849         adc     %rdx,$A1[1]
850
851         xor     $A0[1],$A0[1]
852         add     $A1[0],$A0[0]
853          lea    32($j),$j
854         adc     \$0,$A0[1]
855         mul     $a0                     # a[7]*a[4]
856         add     %rax,$A0[0]             # a[7]*a[4]+a[6]*a[5]+t[6]
857          mov    $ai,%rax
858         adc     %rdx,$A0[1]
859         mov     $A0[0],-8($tptr,$j)     # t[7]
860
861         cmp     \$0,$j
862         jne     .Lsqr4x_1st
863
864         xor     $A1[0],$A1[0]
865         add     $A0[1],$A1[1]
866         adc     \$0,$A1[0]
867         mul     $a1                     # a[7]*a[5]
868         add     %rax,$A1[1]
869         adc     %rdx,$A1[0]
870
871         mov     $A1[1],($tptr)          # t[8]
872         lea     16($i),$i
873         mov     $A1[0],8($tptr)         # t[9]
874         jmp     .Lsqr4x_outer
875
876 .align  16
877 .Lsqr4x_outer:                          # comments apply to $num==6 case
878         mov     -32($aptr,$i),$a0       # a[0]
879         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
880         mov     -24($aptr,$i),%rax      # a[1]
881         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
882         mov     -16($aptr,$i),$ai       # a[2]
883         mov     %rax,$a1
884
885         mov     -24($tptr,$i),$A0[0]    # t[1]
886         xor     $A0[1],$A0[1]
887         mul     $a0                     # a[1]*a[0]
888         add     %rax,$A0[0]             # a[1]*a[0]+t[1]
889          mov    $ai,%rax                # a[2]
890         adc     %rdx,$A0[1]
891         mov     $A0[0],-24($tptr,$i)    # t[1]
892
893         xor     $A0[0],$A0[0]
894         add     -16($tptr,$i),$A0[1]    # a[2]*a[0]+t[2]
895         adc     \$0,$A0[0]
896         mul     $a0                     # a[2]*a[0]
897         add     %rax,$A0[1]
898          mov    $ai,%rax
899         adc     %rdx,$A0[0]
900         mov     $A0[1],-16($tptr,$i)    # t[2]
901
902         lea     -16($i),$j              # j=-16
903         xor     $A1[0],$A1[0]
904
905
906          mov    8($aptr,$j),$ai         # a[3]
907         xor     $A1[1],$A1[1]
908         add     8($tptr,$j),$A1[0]
909         adc     \$0,$A1[1]
910         mul     $a1                     # a[2]*a[1]
911         add     %rax,$A1[0]             # a[2]*a[1]+t[3]
912          mov    $ai,%rax
913         adc     %rdx,$A1[1]
914
915         xor     $A0[1],$A0[1]
916         add     $A1[0],$A0[0]
917         adc     \$0,$A0[1]
918         mul     $a0                     # a[3]*a[0]
919         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
920          mov    $ai,%rax
921         adc     %rdx,$A0[1]
922         mov     $A0[0],8($tptr,$j)      # t[3]
923
924         lea     16($j),$j
925         jmp     .Lsqr4x_inner
926
927 .align  16
928 .Lsqr4x_inner:
929          mov    ($aptr,$j),$ai          # a[4]
930         xor     $A1[0],$A1[0]
931         add     ($tptr,$j),$A1[1]
932         adc     \$0,$A1[0]
933         mul     $a1                     # a[3]*a[1]
934         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
935          mov    $ai,%rax
936         adc     %rdx,$A1[0]
937
938         xor     $A0[0],$A0[0]
939         add     $A1[1],$A0[1]
940         adc     \$0,$A0[0]
941         mul     $a0                     # a[4]*a[0]
942         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
943          mov    $ai,%rax                # a[3]
944         adc     %rdx,$A0[0]
945         mov     $A0[1],($tptr,$j)       # t[4]
946
947          mov    8($aptr,$j),$ai         # a[5]
948         xor     $A1[1],$A1[1]
949         add     8($tptr,$j),$A1[0]
950         adc     \$0,$A1[1]
951         mul     $a1                     # a[4]*a[3]
952         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
953          mov    $ai,%rax
954         adc     %rdx,$A1[1]
955
956         xor     $A0[1],$A0[1]
957         add     $A1[0],$A0[0]
958         lea     16($j),$j               # j++
959         adc     \$0,$A0[1]
960         mul     $a0                     # a[5]*a[2]
961         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
962          mov    $ai,%rax
963         adc     %rdx,$A0[1]
964         mov     $A0[0],-8($tptr,$j)     # t[5], "preloaded t[1]" below
965
966         cmp     \$0,$j
967         jne     .Lsqr4x_inner
968
969         xor     $A1[0],$A1[0]
970         add     $A0[1],$A1[1]
971         adc     \$0,$A1[0]
972         mul     $a1                     # a[5]*a[3]
973         add     %rax,$A1[1]
974         adc     %rdx,$A1[0]
975
976         mov     $A1[1],($tptr)          # t[6], "preloaded t[2]" below
977         mov     $A1[0],8($tptr)         # t[7], "preloaded t[3]" below
978
979         add     \$16,$i
980         jnz     .Lsqr4x_outer
981
982                                         # comments apply to $num==4 case
983         mov     -32($aptr),$a0          # a[0]
984         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
985         mov     -24($aptr),%rax         # a[1]
986         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
987         mov     -16($aptr),$ai          # a[2]
988         mov     %rax,$a1
989
990         xor     $A0[1],$A0[1]
991         mul     $a0                     # a[1]*a[0]
992         add     %rax,$A0[0]             # a[1]*a[0]+t[1], preloaded t[1]
993          mov    $ai,%rax                # a[2]
994         adc     %rdx,$A0[1]
995         mov     $A0[0],-24($tptr)       # t[1]
996
997         xor     $A0[0],$A0[0]
998         add     $A1[1],$A0[1]           # a[2]*a[0]+t[2], preloaded t[2]
999         adc     \$0,$A0[0]
1000         mul     $a0                     # a[2]*a[0]
1001         add     %rax,$A0[1]
1002          mov    $ai,%rax
1003         adc     %rdx,$A0[0]
1004         mov     $A0[1],-16($tptr)       # t[2]
1005
1006          mov    -8($aptr),$ai           # a[3]
1007         mul     $a1                     # a[2]*a[1]
1008         add     %rax,$A1[0]             # a[2]*a[1]+t[3], preloaded t[3]
1009          mov    $ai,%rax
1010         adc     \$0,%rdx
1011
1012         xor     $A0[1],$A0[1]
1013         add     $A1[0],$A0[0]
1014          mov    %rdx,$A1[1]
1015         adc     \$0,$A0[1]
1016         mul     $a0                     # a[3]*a[0]
1017         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
1018          mov    $ai,%rax
1019         adc     %rdx,$A0[1]
1020         mov     $A0[0],-8($tptr)        # t[3]
1021
1022         xor     $A1[0],$A1[0]
1023         add     $A0[1],$A1[1]
1024         adc     \$0,$A1[0]
1025         mul     $a1                     # a[3]*a[1]
1026         add     %rax,$A1[1]
1027          mov    -16($aptr),%rax         # a[2]
1028         adc     %rdx,$A1[0]
1029
1030         mov     $A1[1],($tptr)          # t[4]
1031         mov     $A1[0],8($tptr)         # t[5]
1032
1033         mul     $ai                     # a[2]*a[3]
1034 ___
1035 {
1036 my ($shift,$carry)=($a0,$a1);
1037 my @S=(@A1,$ai,$n0);
1038 $code.=<<___;
1039          add    \$16,$i
1040          xor    $shift,$shift
1041          sub    $num,$i                 # $i=16-$num
1042          xor    $carry,$carry
1043
1044         add     $A1[0],%rax             # t[5]
1045         adc     \$0,%rdx
1046         mov     %rax,8($tptr)           # t[5]
1047         mov     %rdx,16($tptr)          # t[6]
1048         mov     $carry,24($tptr)        # t[7]
1049
1050          mov    -16($aptr,$i),%rax      # a[0]
1051         lea     64(%rsp,$num,2),$tptr
1052          xor    $A0[0],$A0[0]           # t[0]
1053          mov    -24($tptr,$i,2),$A0[1]  # t[1]
1054
1055         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1056         shr     \$63,$A0[0]
1057         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1058         shr     \$63,$A0[1]
1059         or      $A0[0],$S[1]            # | t[2*i]>>63
1060          mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
1061         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1062         mul     %rax                    # a[i]*a[i]
1063         neg     $carry                  # mov $carry,cf
1064          mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1065         adc     %rax,$S[0]
1066          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1067         mov     $S[0],-32($tptr,$i,2)
1068         adc     %rdx,$S[1]
1069
1070         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1071          mov    $S[1],-24($tptr,$i,2)
1072          sbb    $carry,$carry           # mov cf,$carry
1073         shr     \$63,$A0[0]
1074         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1075         shr     \$63,$A0[1]
1076         or      $A0[0],$S[3]            # | t[2*i]>>63
1077          mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
1078         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1079         mul     %rax                    # a[i]*a[i]
1080         neg     $carry                  # mov $carry,cf
1081          mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
1082         adc     %rax,$S[2]
1083          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1084         mov     $S[2],-16($tptr,$i,2)
1085         adc     %rdx,$S[3]
1086         lea     16($i),$i
1087         mov     $S[3],-40($tptr,$i,2)
1088         sbb     $carry,$carry           # mov cf,$carry
1089         jmp     .Lsqr4x_shift_n_add
1090
1091 .align  16
1092 .Lsqr4x_shift_n_add:
1093         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1094         shr     \$63,$A0[0]
1095         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1096         shr     \$63,$A0[1]
1097         or      $A0[0],$S[1]            # | t[2*i]>>63
1098          mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
1099         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1100         mul     %rax                    # a[i]*a[i]
1101         neg     $carry                  # mov $carry,cf
1102          mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1103         adc     %rax,$S[0]
1104          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1105         mov     $S[0],-32($tptr,$i,2)
1106         adc     %rdx,$S[1]
1107
1108         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1109          mov    $S[1],-24($tptr,$i,2)
1110          sbb    $carry,$carry           # mov cf,$carry
1111         shr     \$63,$A0[0]
1112         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1113         shr     \$63,$A0[1]
1114         or      $A0[0],$S[3]            # | t[2*i]>>63
1115          mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
1116         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1117         mul     %rax                    # a[i]*a[i]
1118         neg     $carry                  # mov $carry,cf
1119          mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
1120         adc     %rax,$S[2]
1121          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1122         mov     $S[2],-16($tptr,$i,2)
1123         adc     %rdx,$S[3]
1124
1125         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1126          mov    $S[3],-8($tptr,$i,2)
1127          sbb    $carry,$carry           # mov cf,$carry
1128         shr     \$63,$A0[0]
1129         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1130         shr     \$63,$A0[1]
1131         or      $A0[0],$S[1]            # | t[2*i]>>63
1132          mov    16($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
1133         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1134         mul     %rax                    # a[i]*a[i]
1135         neg     $carry                  # mov $carry,cf
1136          mov    24($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1137         adc     %rax,$S[0]
1138          mov    8($aptr,$i),%rax        # a[i+1]        # prefetch
1139         mov     $S[0],0($tptr,$i,2)
1140         adc     %rdx,$S[1]
1141
1142         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1143          mov    $S[1],8($tptr,$i,2)
1144          sbb    $carry,$carry           # mov cf,$carry
1145         shr     \$63,$A0[0]
1146         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1147         shr     \$63,$A0[1]
1148         or      $A0[0],$S[3]            # | t[2*i]>>63
1149          mov    32($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
1150         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1151         mul     %rax                    # a[i]*a[i]
1152         neg     $carry                  # mov $carry,cf
1153          mov    40($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1154         adc     %rax,$S[2]
1155          mov    16($aptr,$i),%rax       # a[i+1]        # prefetch
1156         mov     $S[2],16($tptr,$i,2)
1157         adc     %rdx,$S[3]
1158         mov     $S[3],24($tptr,$i,2)
1159         sbb     $carry,$carry           # mov cf,$carry
1160         add     \$32,$i
1161         jnz     .Lsqr4x_shift_n_add
1162
1163         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1164         shr     \$63,$A0[0]
1165         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1166         shr     \$63,$A0[1]
1167         or      $A0[0],$S[1]            # | t[2*i]>>63
1168          mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
1169         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1170         mul     %rax                    # a[i]*a[i]
1171         neg     $carry                  # mov $carry,cf
1172          mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1173         adc     %rax,$S[0]
1174          mov    -8($aptr),%rax          # a[i+1]        # prefetch
1175         mov     $S[0],-32($tptr)
1176         adc     %rdx,$S[1]
1177
1178         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1179          mov    $S[1],-24($tptr)
1180          sbb    $carry,$carry           # mov cf,$carry
1181         shr     \$63,$A0[0]
1182         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1183         shr     \$63,$A0[1]
1184         or      $A0[0],$S[3]            # | t[2*i]>>63
1185         mul     %rax                    # a[i]*a[i]
1186         neg     $carry                  # mov $carry,cf
1187         adc     %rax,$S[2]
1188         adc     %rdx,$S[3]
1189         mov     $S[2],-16($tptr)
1190         mov     $S[3],-8($tptr)
1191 ___
1192 }\f
1193 ##############################################################
1194 # Montgomery reduction part, "word-by-word" algorithm.
1195 #
1196 {
1197 my ($topbit,$nptr)=("%rbp",$aptr);
1198 my ($m0,$m1)=($a0,$a1);
1199 my @Ni=("%rbx","%r9");
1200 $code.=<<___;
1201         mov     40(%rsp),$nptr          # restore $nptr
1202         mov     48(%rsp),$n0            # restore *n0
1203         xor     $j,$j
1204         mov     $num,0(%rsp)            # save $num
1205         sub     $num,$j                 # $j=-$num
1206          mov    64(%rsp),$A0[0]         # t[0]          # modsched #
1207          mov    $n0,$m0                 #               # modsched #
1208         lea     64(%rsp,$num,2),%rax    # end of t[] buffer
1209         lea     64(%rsp,$num),$tptr     # end of t[] window
1210         mov     %rax,8(%rsp)            # save end of t[] buffer
1211         lea     ($nptr,$num),$nptr      # end of n[] buffer
1212         xor     $topbit,$topbit         # $topbit=0
1213
1214         mov     0($nptr,$j),%rax        # n[0]          # modsched #
1215         mov     8($nptr,$j),$Ni[1]      # n[1]          # modsched #
1216          imulq  $A0[0],$m0              # m0=t[0]*n0    # modsched #
1217          mov    %rax,$Ni[0]             #               # modsched #
1218         jmp     .Lsqr4x_mont_outer
1219
1220 .align  16
1221 .Lsqr4x_mont_outer:
1222         xor     $A0[1],$A0[1]
1223         mul     $m0                     # n[0]*m0
1224         add     %rax,$A0[0]             # n[0]*m0+t[0]
1225          mov    $Ni[1],%rax
1226         adc     %rdx,$A0[1]
1227         mov     $n0,$m1
1228
1229         xor     $A0[0],$A0[0]
1230         add     8($tptr,$j),$A0[1]
1231         adc     \$0,$A0[0]
1232         mul     $m0                     # n[1]*m0
1233         add     %rax,$A0[1]             # n[1]*m0+t[1]
1234          mov    $Ni[0],%rax
1235         adc     %rdx,$A0[0]
1236
1237         imulq   $A0[1],$m1
1238
1239         mov     16($nptr,$j),$Ni[0]     # n[2]
1240         xor     $A1[1],$A1[1]
1241         add     $A0[1],$A1[0]
1242         adc     \$0,$A1[1]
1243         mul     $m1                     # n[0]*m1
1244         add     %rax,$A1[0]             # n[0]*m1+"t[1]"
1245          mov    $Ni[0],%rax
1246         adc     %rdx,$A1[1]
1247         mov     $A1[0],8($tptr,$j)      # "t[1]"
1248
1249         xor     $A0[1],$A0[1]
1250         add     16($tptr,$j),$A0[0]
1251         adc     \$0,$A0[1]
1252         mul     $m0                     # n[2]*m0
1253         add     %rax,$A0[0]             # n[2]*m0+t[2]
1254          mov    $Ni[1],%rax
1255         adc     %rdx,$A0[1]
1256
1257         mov     24($nptr,$j),$Ni[1]     # n[3]
1258         xor     $A1[0],$A1[0]
1259         add     $A0[0],$A1[1]
1260         adc     \$0,$A1[0]
1261         mul     $m1                     # n[1]*m1
1262         add     %rax,$A1[1]             # n[1]*m1+"t[2]"
1263          mov    $Ni[1],%rax
1264         adc     %rdx,$A1[0]
1265         mov     $A1[1],16($tptr,$j)     # "t[2]"
1266
1267         xor     $A0[0],$A0[0]
1268         add     24($tptr,$j),$A0[1]
1269         lea     32($j),$j
1270         adc     \$0,$A0[0]
1271         mul     $m0                     # n[3]*m0
1272         add     %rax,$A0[1]             # n[3]*m0+t[3]
1273          mov    $Ni[0],%rax
1274         adc     %rdx,$A0[0]
1275         jmp     .Lsqr4x_mont_inner
1276
1277 .align  16
1278 .Lsqr4x_mont_inner:
1279         mov     ($nptr,$j),$Ni[0]       # n[4]
1280         xor     $A1[1],$A1[1]
1281         add     $A0[1],$A1[0]
1282         adc     \$0,$A1[1]
1283         mul     $m1                     # n[2]*m1
1284         add     %rax,$A1[0]             # n[2]*m1+"t[3]"
1285          mov    $Ni[0],%rax
1286         adc     %rdx,$A1[1]
1287         mov     $A1[0],-8($tptr,$j)     # "t[3]"
1288
1289         xor     $A0[1],$A0[1]
1290         add     ($tptr,$j),$A0[0]
1291         adc     \$0,$A0[1]
1292         mul     $m0                     # n[4]*m0
1293         add     %rax,$A0[0]             # n[4]*m0+t[4]
1294          mov    $Ni[1],%rax
1295         adc     %rdx,$A0[1]
1296
1297         mov     8($nptr,$j),$Ni[1]      # n[5]
1298         xor     $A1[0],$A1[0]
1299         add     $A0[0],$A1[1]
1300         adc     \$0,$A1[0]
1301         mul     $m1                     # n[3]*m1
1302         add     %rax,$A1[1]             # n[3]*m1+"t[4]"
1303          mov    $Ni[1],%rax
1304         adc     %rdx,$A1[0]
1305         mov     $A1[1],($tptr,$j)       # "t[4]"
1306
1307         xor     $A0[0],$A0[0]
1308         add     8($tptr,$j),$A0[1]
1309         adc     \$0,$A0[0]
1310         mul     $m0                     # n[5]*m0
1311         add     %rax,$A0[1]             # n[5]*m0+t[5]
1312          mov    $Ni[0],%rax
1313         adc     %rdx,$A0[0]
1314
1315
1316         mov     16($nptr,$j),$Ni[0]     # n[6]
1317         xor     $A1[1],$A1[1]
1318         add     $A0[1],$A1[0]
1319         adc     \$0,$A1[1]
1320         mul     $m1                     # n[4]*m1
1321         add     %rax,$A1[0]             # n[4]*m1+"t[5]"
1322          mov    $Ni[0],%rax
1323         adc     %rdx,$A1[1]
1324         mov     $A1[0],8($tptr,$j)      # "t[5]"
1325
1326         xor     $A0[1],$A0[1]
1327         add     16($tptr,$j),$A0[0]
1328         adc     \$0,$A0[1]
1329         mul     $m0                     # n[6]*m0
1330         add     %rax,$A0[0]             # n[6]*m0+t[6]
1331          mov    $Ni[1],%rax
1332         adc     %rdx,$A0[1]
1333
1334         mov     24($nptr,$j),$Ni[1]     # n[7]
1335         xor     $A1[0],$A1[0]
1336         add     $A0[0],$A1[1]
1337         adc     \$0,$A1[0]
1338         mul     $m1                     # n[5]*m1
1339         add     %rax,$A1[1]             # n[5]*m1+"t[6]"
1340          mov    $Ni[1],%rax
1341         adc     %rdx,$A1[0]
1342         mov     $A1[1],16($tptr,$j)     # "t[6]"
1343
1344         xor     $A0[0],$A0[0]
1345         add     24($tptr,$j),$A0[1]
1346         lea     32($j),$j
1347         adc     \$0,$A0[0]
1348         mul     $m0                     # n[7]*m0
1349         add     %rax,$A0[1]             # n[7]*m0+t[7]
1350          mov    $Ni[0],%rax
1351         adc     %rdx,$A0[0]
1352         cmp     \$0,$j
1353         jne     .Lsqr4x_mont_inner
1354
1355          sub    0(%rsp),$j              # $j=-$num      # modsched #
1356          mov    $n0,$m0                 #               # modsched #
1357
1358         xor     $A1[1],$A1[1]
1359         add     $A0[1],$A1[0]
1360         adc     \$0,$A1[1]
1361         mul     $m1                     # n[6]*m1
1362         add     %rax,$A1[0]             # n[6]*m1+"t[7]"
1363         mov     $Ni[1],%rax
1364         adc     %rdx,$A1[1]
1365         mov     $A1[0],-8($tptr)        # "t[7]"
1366
1367         xor     $A0[1],$A0[1]
1368         add     ($tptr),$A0[0]          # +t[8]
1369         adc     \$0,$A0[1]
1370          mov    0($nptr,$j),$Ni[0]      # n[0]          # modsched #
1371         add     $topbit,$A0[0]
1372         adc     \$0,$A0[1]
1373
1374          imulq  16($tptr,$j),$m0        # m0=t[0]*n0    # modsched #
1375         xor     $A1[0],$A1[0]
1376          mov    8($nptr,$j),$Ni[1]      # n[1]          # modsched #
1377         add     $A0[0],$A1[1]
1378          mov    16($tptr,$j),$A0[0]     # t[0]          # modsched #
1379         adc     \$0,$A1[0]
1380         mul     $m1                     # n[7]*m1
1381         add     %rax,$A1[1]             # n[7]*m1+"t[8]"
1382          mov    $Ni[0],%rax             #               # modsched #
1383         adc     %rdx,$A1[0]
1384         mov     $A1[1],($tptr)          # "t[8]"
1385
1386         xor     $topbit,$topbit
1387         add     8($tptr),$A1[0]         # +t[9]
1388         adc     $topbit,$topbit
1389         add     $A0[1],$A1[0]
1390         lea     16($tptr),$tptr         # "t[$num]>>128"
1391         adc     \$0,$topbit
1392         mov     $A1[0],-8($tptr)        # "t[9]"
1393         cmp     8(%rsp),$tptr           # are we done?
1394         jb      .Lsqr4x_mont_outer
1395
1396         mov     0(%rsp),$num            # restore $num
1397         mov     $topbit,($tptr)         # save $topbit
1398 ___
1399 }\f
1400 ##############################################################
1401 # Post-condition, 4x unrolled copy from bn_mul_mont
1402 #
1403 {
1404 my ($tptr,$nptr)=("%rbx",$aptr);
1405 my @ri=("%rax","%rdx","%r10","%r11");
1406 $code.=<<___;
1407         mov     64(%rsp,$num),@ri[0]    # tp[0]
1408         lea     64(%rsp,$num),$tptr     # upper half of t[2*$num] holds result
1409         mov     40(%rsp),$nptr          # restore $nptr
1410         shr     \$5,$num                # num/4
1411         mov     8($tptr),@ri[1]         # t[1]
1412         xor     $i,$i                   # i=0 and clear CF!
1413
1414         mov     32(%rsp),$rptr          # restore $rptr
1415         sub     0($nptr),@ri[0]
1416         mov     16($tptr),@ri[2]        # t[2]
1417         mov     24($tptr),@ri[3]        # t[3]
1418         sbb     8($nptr),@ri[1]
1419         lea     -1($num),$j             # j=num/4-1
1420         jmp     .Lsqr4x_sub
1421 .align  16
1422 .Lsqr4x_sub:
1423         mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1424         mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1425         sbb     16($nptr,$i,8),@ri[2]
1426         mov     32($tptr,$i,8),@ri[0]   # tp[i+1]
1427         mov     40($tptr,$i,8),@ri[1]
1428         sbb     24($nptr,$i,8),@ri[3]
1429         mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1430         mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1431         sbb     32($nptr,$i,8),@ri[0]
1432         mov     48($tptr,$i,8),@ri[2]
1433         mov     56($tptr,$i,8),@ri[3]
1434         sbb     40($nptr,$i,8),@ri[1]
1435         lea     4($i),$i                # i++
1436         dec     $j                      # doesn't affect CF!
1437         jnz     .Lsqr4x_sub
1438
1439         mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1440         mov     32($tptr,$i,8),@ri[0]   # load overflow bit
1441         sbb     16($nptr,$i,8),@ri[2]
1442         mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1443         sbb     24($nptr,$i,8),@ri[3]
1444         mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1445
1446         sbb     \$0,@ri[0]              # handle upmost overflow bit
1447         mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1448         xor     $i,$i                   # i=0
1449         and     @ri[0],$tptr
1450         not     @ri[0]
1451         mov     $rptr,$nptr
1452         and     @ri[0],$nptr
1453         lea     -1($num),$j
1454         or      $nptr,$tptr             # tp=borrow?tp:rp
1455
1456         pxor    %xmm0,%xmm0
1457         lea     64(%rsp,$num,8),$nptr
1458         movdqu  ($tptr),%xmm1
1459         lea     ($nptr,$num,8),$nptr
1460         movdqa  %xmm0,64(%rsp)          # zap lower half of temporary vector
1461         movdqa  %xmm0,($nptr)           # zap upper half of temporary vector
1462         movdqu  %xmm1,($rptr)
1463         jmp     .Lsqr4x_copy
1464 .align  16
1465 .Lsqr4x_copy:                           # copy or in-place refresh
1466         movdqu  16($tptr,$i),%xmm2
1467         movdqu  32($tptr,$i),%xmm1
1468         movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
1469         movdqa  %xmm0,96(%rsp,$i)       # zap lower half of temporary vector
1470         movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
1471         movdqa  %xmm0,32($nptr,$i)      # zap upper half of temporary vector
1472         movdqu  %xmm2,16($rptr,$i)
1473         movdqu  %xmm1,32($rptr,$i)
1474         lea     32($i),$i
1475         dec     $j
1476         jnz     .Lsqr4x_copy
1477
1478         movdqu  16($tptr,$i),%xmm2
1479         movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
1480         movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
1481         movdqu  %xmm2,16($rptr,$i)
1482 ___
1483 }
1484 $code.=<<___;
1485         mov     56(%rsp),%rsi           # restore %rsp
1486         mov     \$1,%rax
1487         mov     0(%rsi),%r15
1488         mov     8(%rsi),%r14
1489         mov     16(%rsi),%r13
1490         mov     24(%rsi),%r12
1491         mov     32(%rsi),%rbp
1492         mov     40(%rsi),%rbx
1493         lea     48(%rsi),%rsp
1494 .Lsqr4x_epilogue:
1495         ret
1496 .size   bn_sqr4x_mont,.-bn_sqr4x_mont
1497 ___
1498 }}}
1499 $code.=<<___;
1500 .asciz  "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1501 .align  16
1502 ___
1503
1504 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1505 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1506 if ($win64) {
1507 $rec="%rcx";
1508 $frame="%rdx";
1509 $context="%r8";
1510 $disp="%r9";
1511
1512 $code.=<<___;
1513 .extern __imp_RtlVirtualUnwind
1514 .type   mul_handler,\@abi-omnipotent
1515 .align  16
1516 mul_handler:
1517         push    %rsi
1518         push    %rdi
1519         push    %rbx
1520         push    %rbp
1521         push    %r12
1522         push    %r13
1523         push    %r14
1524         push    %r15
1525         pushfq
1526         sub     \$64,%rsp
1527
1528         mov     120($context),%rax      # pull context->Rax
1529         mov     248($context),%rbx      # pull context->Rip
1530
1531         mov     8($disp),%rsi           # disp->ImageBase
1532         mov     56($disp),%r11          # disp->HandlerData
1533
1534         mov     0(%r11),%r10d           # HandlerData[0]
1535         lea     (%rsi,%r10),%r10        # end of prologue label
1536         cmp     %r10,%rbx               # context->Rip<end of prologue label
1537         jb      .Lcommon_seh_tail
1538
1539         mov     152($context),%rax      # pull context->Rsp
1540
1541         mov     4(%r11),%r10d           # HandlerData[1]
1542         lea     (%rsi,%r10),%r10        # epilogue label
1543         cmp     %r10,%rbx               # context->Rip>=epilogue label
1544         jae     .Lcommon_seh_tail
1545
1546         mov     192($context),%r10      # pull $num
1547         mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
1548         lea     48(%rax),%rax
1549
1550         mov     -8(%rax),%rbx
1551         mov     -16(%rax),%rbp
1552         mov     -24(%rax),%r12
1553         mov     -32(%rax),%r13
1554         mov     -40(%rax),%r14
1555         mov     -48(%rax),%r15
1556         mov     %rbx,144($context)      # restore context->Rbx
1557         mov     %rbp,160($context)      # restore context->Rbp
1558         mov     %r12,216($context)      # restore context->R12
1559         mov     %r13,224($context)      # restore context->R13
1560         mov     %r14,232($context)      # restore context->R14
1561         mov     %r15,240($context)      # restore context->R15
1562
1563         jmp     .Lcommon_seh_tail
1564 .size   mul_handler,.-mul_handler
1565
1566 .type   sqr_handler,\@abi-omnipotent
1567 .align  16
1568 sqr_handler:
1569         push    %rsi
1570         push    %rdi
1571         push    %rbx
1572         push    %rbp
1573         push    %r12
1574         push    %r13
1575         push    %r14
1576         push    %r15
1577         pushfq
1578         sub     \$64,%rsp
1579
1580         mov     120($context),%rax      # pull context->Rax
1581         mov     248($context),%rbx      # pull context->Rip
1582
1583         lea     .Lsqr4x_body(%rip),%r10
1584         cmp     %r10,%rbx               # context->Rip<.Lsqr_body
1585         jb      .Lcommon_seh_tail
1586
1587         mov     152($context),%rax      # pull context->Rsp
1588
1589         lea     .Lsqr4x_epilogue(%rip),%r10
1590         cmp     %r10,%rbx               # context->Rip>=.Lsqr_epilogue
1591         jae     .Lcommon_seh_tail
1592
1593         mov     56(%rax),%rax           # pull saved stack pointer
1594         lea     48(%rax),%rax
1595
1596         mov     -8(%rax),%rbx
1597         mov     -16(%rax),%rbp
1598         mov     -24(%rax),%r12
1599         mov     -32(%rax),%r13
1600         mov     -40(%rax),%r14
1601         mov     -48(%rax),%r15
1602         mov     %rbx,144($context)      # restore context->Rbx
1603         mov     %rbp,160($context)      # restore context->Rbp
1604         mov     %r12,216($context)      # restore context->R12
1605         mov     %r13,224($context)      # restore context->R13
1606         mov     %r14,232($context)      # restore context->R14
1607         mov     %r15,240($context)      # restore context->R15
1608
1609 .Lcommon_seh_tail:
1610         mov     8(%rax),%rdi
1611         mov     16(%rax),%rsi
1612         mov     %rax,152($context)      # restore context->Rsp
1613         mov     %rsi,168($context)      # restore context->Rsi
1614         mov     %rdi,176($context)      # restore context->Rdi
1615
1616         mov     40($disp),%rdi          # disp->ContextRecord
1617         mov     $context,%rsi           # context
1618         mov     \$154,%ecx              # sizeof(CONTEXT)
1619         .long   0xa548f3fc              # cld; rep movsq
1620
1621         mov     $disp,%rsi
1622         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1623         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1624         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1625         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1626         mov     40(%rsi),%r10           # disp->ContextRecord
1627         lea     56(%rsi),%r11           # &disp->HandlerData
1628         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1629         mov     %r10,32(%rsp)           # arg5
1630         mov     %r11,40(%rsp)           # arg6
1631         mov     %r12,48(%rsp)           # arg7
1632         mov     %rcx,56(%rsp)           # arg8, (NULL)
1633         call    *__imp_RtlVirtualUnwind(%rip)
1634
1635         mov     \$1,%eax                # ExceptionContinueSearch
1636         add     \$64,%rsp
1637         popfq
1638         pop     %r15
1639         pop     %r14
1640         pop     %r13
1641         pop     %r12
1642         pop     %rbp
1643         pop     %rbx
1644         pop     %rdi
1645         pop     %rsi
1646         ret
1647 .size   sqr_handler,.-sqr_handler
1648
1649 .section        .pdata
1650 .align  4
1651         .rva    .LSEH_begin_bn_mul_mont
1652         .rva    .LSEH_end_bn_mul_mont
1653         .rva    .LSEH_info_bn_mul_mont
1654
1655         .rva    .LSEH_begin_bn_mul4x_mont
1656         .rva    .LSEH_end_bn_mul4x_mont
1657         .rva    .LSEH_info_bn_mul4x_mont
1658
1659         .rva    .LSEH_begin_bn_sqr4x_mont
1660         .rva    .LSEH_end_bn_sqr4x_mont
1661         .rva    .LSEH_info_bn_sqr4x_mont
1662
1663 .section        .xdata
1664 .align  8
1665 .LSEH_info_bn_mul_mont:
1666         .byte   9,0,0,0
1667         .rva    mul_handler
1668         .rva    .Lmul_body,.Lmul_epilogue       # HandlerData[]
1669 .LSEH_info_bn_mul4x_mont:
1670         .byte   9,0,0,0
1671         .rva    mul_handler
1672         .rva    .Lmul4x_body,.Lmul4x_epilogue   # HandlerData[]
1673 .LSEH_info_bn_sqr4x_mont:
1674         .byte   9,0,0,0
1675         .rva    sqr_handler
1676 ___
1677 }
1678
1679 print $code;
1680 close STDOUT;