This commit completes recent modular exponentiation optimizations on
[openssl.git] / crypto / bn / asm / x86_64-mont5.pl.
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # August 2011.
11 #
12 # Companion to x86_64-mont.pl that optimizes cache-timing attack
13 # countermeasures. The subroutines are produced by replacing bp[i]
14 # references in their x86_64-mont.pl counterparts with cache-neutral
15 # references to powers table computed in BN_mod_exp_mont_consttime.
16 # In addition subroutine that scatters elements of the powers table
17 # is implemented, so that scatter-/gathering can be tuned without
18 # bn_exp.c modifications.
19
# Command-line handling: the first argument selects the assembler
# "flavour" (elf, macosx, mingw64, nasm, ...), the second the output
# file.  A single argument containing a dot is taken as the output
# file name with the flavour left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets differ in calling convention (7th argument lives at
# 56(%rsp) rather than 8(%rsp)) and in assembler syntax.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator relative to this script:
# first next to it, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything we print through the translator.  Quote the
# interpreter and script paths so directories with spaces survive,
# and die if the pipe cannot be opened rather than silently printing
# to a dead handle.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
32
# Register-to-argument mapping for the generated routines, following
# the System V AMD64 argument order (the 7th argument is fetched from
# the stack in the code below).
# int bn_mul_mont_gather5(
$rp="%rdi";     # BN_ULONG *rp,
$ap="%rsi";     # const BN_ULONG *ap,
$bp="%rdx";     # const BN_ULONG *bp,
$np="%rcx";     # const BN_ULONG *np,
$n0="%r8";      # const BN_ULONG *n0,
$num="%r9";     # int num,
                # int idx);     # 0 to 2^5-1, "index" in $bp holding
                                # pre-computed powers of a', interlaced
                                # in such manner that b[0] is $bp[idx],
                                # b[1] is [2^5+idx], etc.
# Scratch registers used by the 1x (non-unrolled) loops below.
$lo0="%r10";    # low-word accumulator
$hi0="%r11";    # high-word carry of ap[j]*bp[i]
$hi1="%r13";    # high-word carry of np[j]*m1
$i="%r14";      # outer loop counter
$j="%r15";      # inner loop counter
$m0="%rbx";     # current (gathered) b[i] word
$m1="%rbp";     # Montgomery multiplier, tp[0]*n0
# Emit bn_mul_mont_gather5: Montgomery multiplication fused with a
# cache-neutral gather of b[i] from the interleaved powers table.
# The entry point dispatches to the 4x-unrolled variant below when
# num%4==0 and num>=8.  Each b[i] word is reassembled by reading one
# qword from each of the four cache lines spanned by the 2^5*8-byte
# stride and masking with %xmm4-%xmm7, so the access pattern is
# independent of idx (the cache-timing countermeasure).
$code=<<___;
.text

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	2($num),%r11
	mov	%rsp,%rax
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		# %rdx is clobbered by mulq, so from here on the powers
		# table pointer lives in %r12.
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	movq	%xmm0,$m0		# bp[1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	movq	%xmm0,$m0		# bp[i+1]

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesnn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
# Emit bn_mul4x_mont_gather5: the 4x-unrolled variant reached from
# bn_mul_mont_gather5 when num%4==0 and num>=8.  Same gather scheme
# as above; the inner loops process four limbs per iteration with
# rotating accumulator pairs.
my @A=("%r10","%r11");	# rotating high/low carries of ap[j]*bp[i]
my @N=("%r13","%rdi");	# rotating carries of np[j]*m1; %rdi ($rp) is
			# borrowed, hence $rp is spilled to the stack
			# below and restored before the final subtract
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d
	lea	4($num),%r11
	mov	%rsp,%rax		# !!!!
	neg	%r11
	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4))
	and	\$-1024,%rsp		# minimize TLB usage

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		# %rdx is clobbered by mulq, so the powers table pointer
		# lives in %r12 from here on.
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	mov	%r10,%r11
	shr	\$`log($N/8)/log(2)`,%r10
	and	\$`$N/8-1`,%r11
	not	%r10
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,%r10,8),%xmm7

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	movq	%xmm0,$m0		# m0=bp[0]
	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	xor	$j,$j			# j=0
	movq	`0*$STRIDE/4-96`($bp),%xmm0
	movq	`1*$STRIDE/4-96`($bp),%xmm1
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($bp),%xmm2
	pand	%xmm5,%xmm1

	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	movq	`3*$STRIDE/4-96`($bp),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	por	%xmm2,%xmm0
	lea	$STRIDE($bp),$bp
	por	%xmm3,%xmm0

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=2
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jl	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	movq	%xmm0,$m0		# bp[i+1]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jl	.Louter4x
___
{
# Post-processing: conditional subtraction of the modulus (4 limbs
# per iteration) followed by a copy-out that also zaps the temporary
# vector on the stack.
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesnn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:					# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
___
}}}
802
{
# Emit bn_scatter5: writes num words of inp into "column" idx of the
# powers table tbl, one word every 32*8 bytes, producing the
# interleaved layout that the gather code above reads back.  Keeping
# scatter here lets bn_exp.c stay independent of the layout.
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
				("%rdi","%rsi","%rdx","%rcx");	# Unix order
$code.=<<___;
.globl	bn_scatter5
.type	bn_scatter5,\@abi-omnipotent
.align	16
bn_scatter5:
	lea	($tbl,$idx,8),$tbl
.Lscatter:
	mov	($inp),%rax
	lea	8($inp),$inp
	mov	%rax,($tbl)
	lea	32*8($tbl),$tbl
	sub	\$1,$num
	jnz	.Lscatter
	ret
.size	bn_scatter5,.-bn_scatter5
___
}
# Constant data shared by both gather routines: four 64-bit mask
# slots selected by the index bits, plus the CRYPTOGAMS banner.
$code.=<<___;
.align	64
.Lmagic_masks:
	.long	0,0, 0,0, 0,0, -1,-1
	.long	0,0, 0,0, 0,0,  0,0
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Resolve backtick-quoted Perl expressions embedded in the generated
# assembly (compile-time constant folding on the Perl side).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
# STDOUT is a pipe to the xlate translator; write errors and a failing
# child only surface reliably at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";