crypto/bn/asm/x86_64-mont5.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16
  17 # August 2011.
  18 #
  19 # Companion to x86_64-mont.pl that optimizes cache-timing attack
  20 # countermeasures. The subroutines are produced by replacing bp[i]
  21 # references in their x86_64-mont.pl counterparts with cache-neutral
  22 # references to powers table computed in BN_mod_exp_mont_consttime.
  23 # In addition subroutine that scatters elements of the powers table
  24 # is implemented, so that scatter-/gathering can be tuned without
  25 # bn_exp.c modifications.
  26
  27 # August 2013.
  28 #
  29 # Add MULX/AD*X code paths and additional interfaces to optimize for
  30 # branch prediction unit. For input lengths that are multiples of 8
  31 # the np argument is not just modulus value, but one interleaved
  32 # with 0. This is to optimize post-condition...
  33
  34 $flavour = shift;           # perlasm flavour, handed on to x86_64-xlate.pl
  35 $output  = shift;           # output file name, handed on to x86_64-xlate.pl
  36 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }   # a lone argument containing '.' is the output file
  37
  38 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);   # picks the stack offset of the 7th argument
  39
  40 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;      # directory part of this script's path
  41 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  42 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  43 die "can't locate x86_64-xlate.pl";
  44
  45 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";   # fail loudly if the translator cannot be started
  46 *STDOUT=*OUT;               # STDOUT now feeds the translator pipe
  47
  48 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  49                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  50         $addx = ($1>=2.23);     # $addx gates the "if ($addx)" heredocs below
  51 }
  52
  53 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  54             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  55         $addx = ($1>=2.10);     # same gate, probed through nasm's version
  56 }
  57
  58 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  59             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  60         $addx = ($1>=12);       # same gate, probed through ml64's major version
  61 }
  62
  63 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
  64         my $ver = $2 + $3/100.0;        # 3.1->3.01, 3.10->3.10
  65         $addx = ($ver>=3.03);
  66 }
  67
  68 # int bn_mul_mont_gather5(
  69 $rp="%rdi";     # BN_ULONG *rp,
  70 $ap="%rsi";     # const BN_ULONG *ap,
  71 $bp="%rdx";     # const BN_ULONG *bp,
  72 $np="%rcx";     # const BN_ULONG *np,
  73 $n0="%r8";      # const BN_ULONG *n0,
  74 $num="%r9";     # int num,
  75                 # int idx);     # 0 to 2^5-1, "index" in $bp holding
  76                                 # pre-computed powers of a', interlaced
  77                                 # in such manner that b[0] is $bp[idx],
  78                                 # b[1] is [2^5+idx], etc.
  79 $lo0="%r10";    # low word of ap[j]*bp[i]; also carries the current tp[j]
  80 $hi0="%r11";    # high word of ap[j]*bp[i]
  81 $hi1="%r13";    # high word of np[j]*m1
  82 $i="%r14";      # outer loop counter
  83 $j="%r15";      # inner loop counter
  84 $m0="%rbx";     # gathered multiplier word bp[i]
  85 $m1="%rbp";     # tp[0]*n0, the multiplier applied to np[j]
  86
  87 $code=<<___;                            # entry: num not a multiple of 8 goes to .Lmul_enter
  88 .text
  89
  90 .extern OPENSSL_ia32cap_P
  91
  92 .globl  bn_mul_mont_gather5
  93 .type   bn_mul_mont_gather5,\@function,6
  94 .align  64
  95 bn_mul_mont_gather5:
  96 .cfi_startproc
  97         mov     ${num}d,${num}d
  98         mov     %rsp,%rax
  99 .cfi_def_cfa_register   %rax
 100         test    \$7,${num}d
 101         jnz     .Lmul_enter
 102 ___
 103 $code.=<<___ if ($addx);                # capability word examined at .Lmul4x_enter
 104         mov     OPENSSL_ia32cap_P+8(%rip),%r11d
 105 ___
 106 $code.=<<___;                           # generic path: save registers, page-walk the stack frame
 107         jmp     .Lmul4x_enter
 108
 109 .align  16
 110 .Lmul_enter:
 111         movd    `($win64?56:8)`(%rsp),%xmm5     # load 7th argument
 112         push    %rbx
 113 .cfi_push       %rbx
 114         push    %rbp
 115 .cfi_push       %rbp
 116         push    %r12
 117 .cfi_push       %r12
 118         push    %r13
 119 .cfi_push       %r13
 120         push    %r14
 121 .cfi_push       %r14
 122         push    %r15
 123 .cfi_push       %r15
 124
 125         neg     $num
 126         mov     %rsp,%r11
 127         lea     -280(%rsp,$num,8),%r10  # future alloca(8*(num+2)+256+8)
 128         neg     $num                    # restore $num
 129         and     \$-1024,%r10            # minimize TLB usage
 130
 131         # An OS-agnostic version of __chkstk.
 132         #
 133         # Some OSes (Windows) insist on stack being "wired" to
 134         # physical memory in strictly sequential manner, i.e. if stack
 135         # allocation spans two pages, then reference to farmost one can
 136         # be punishable by SEGV. But page walking can do good even on
 137         # other OSes, because it guarantees that villain thread hits
 138         # the guard page before it can make damage to innocent one...
 139         sub     %r10,%r11
 140         and     \$-4096,%r11
 141         lea     (%r10,%r11),%rsp
 142         mov     (%rsp),%r11
 143         cmp     %r10,%rsp
 144         ja      .Lmul_page_walk
 145         jmp     .Lmul_page_walk_done
 146
 147 .Lmul_page_walk:
 148         lea     -4096(%rsp),%rsp
 149         mov     (%rsp),%r11
 150         cmp     %r10,%rsp
 151         ja      .Lmul_page_walk
 152 .Lmul_page_walk_done:
 153
 154         lea     .Linc(%rip),%r10
 155         mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
 156 .cfi_cfa_expression     %rsp+8,$num,8,mul,plus,deref,+8
 157 .Lmul_body:
 158
 159         lea     128($bp),%r12           # reassign $bp (+size optimization)
 160 ___
 161                 $bp="%r12";             # follows the lea into %r12 emitted just above
 162                 $STRIDE=2**5*8;         # 5 is "window size"
 163                 $N=$STRIDE/4;           # should match cache line size
 164 $code.=<<___;                           # load the two .Linc vectors, point %r10 at the on-stack mask area
 165         movdqa  0(%r10),%xmm0           # 00000001000000010000000000000000
 166         movdqa  16(%r10),%xmm1          # 00000002000000020000000200000002
 167         lea     24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
 168         and     \$-16,%r10
 169
 170         pshufd  \$0,%xmm5,%xmm5         # broadcast index
 171         movdqa  %xmm1,%xmm4
 172         movdqa  %xmm1,%xmm2
 173 ___
 174 ########################################################################
 175 # calculate mask by comparing 0..31 to index and save result to stack
 176 #
 177 $code.=<<___;                           # seed the compare chain
 178         paddd   %xmm0,%xmm1
 179         pcmpeqd %xmm5,%xmm0             # compare to 1,0
 180         .byte   0x67
 181         movdqa  %xmm4,%xmm3
 182 ___
 183 for($k=0;$k<$STRIDE/16-4;$k+=4) {       # k=0,4,8: store mask vectors 0..11, four per pass
 184 $code.=<<___;
 185         paddd   %xmm1,%xmm2
 186         pcmpeqd %xmm5,%xmm1             # compare to 3,2
 187         movdqa  %xmm0,`16*($k+0)+112`(%r10)
 188         movdqa  %xmm4,%xmm0
 189
 190         paddd   %xmm2,%xmm3
 191         pcmpeqd %xmm5,%xmm2             # compare to 5,4
 192         movdqa  %xmm1,`16*($k+1)+112`(%r10)
 193         movdqa  %xmm4,%xmm1
 194
 195         paddd   %xmm3,%xmm0
 196         pcmpeqd %xmm5,%xmm3             # compare to 7,6
 197         movdqa  %xmm2,`16*($k+2)+112`(%r10)
 198         movdqa  %xmm4,%xmm2
 199
 200         paddd   %xmm0,%xmm1
 201         pcmpeqd %xmm5,%xmm0
 202         movdqa  %xmm3,`16*($k+3)+112`(%r10)
 203         movdqa  %xmm4,%xmm3
 204 ___
 205 }
 206 $code.=<<___;                           # last iteration can be optimized
 207         paddd   %xmm1,%xmm2
 208         pcmpeqd %xmm5,%xmm1
 209         movdqa  %xmm0,`16*($k+0)+112`(%r10)
 210
 211         paddd   %xmm2,%xmm3
 212         .byte   0x67
 213         pcmpeqd %xmm5,%xmm2
 214         movdqa  %xmm1,`16*($k+1)+112`(%r10)
 215
 216         pcmpeqd %xmm5,%xmm3
 217         movdqa  %xmm2,`16*($k+2)+112`(%r10)
 218         pand    `16*($k+0)-128`($bp),%xmm0      # while it's still in register
 219
 220         pand    `16*($k+1)-128`($bp),%xmm1
 221         pand    `16*($k+2)-128`($bp),%xmm2
 222         movdqa  %xmm3,`16*($k+3)+112`(%r10)
 223         pand    `16*($k+3)-128`($bp),%xmm3
 224         por     %xmm2,%xmm0
 225         por     %xmm3,%xmm1
 226 ___
 227 for($k=0;$k<$STRIDE/16-4;$k+=4) {       # k=0,4,8: mask table vectors 0..11; 12..15 were masked above
 228 $code.=<<___;
 229         movdqa  `16*($k+0)-128`($bp),%xmm4
 230         movdqa  `16*($k+1)-128`($bp),%xmm5
 231         movdqa  `16*($k+2)-128`($bp),%xmm2
 232         pand    `16*($k+0)+112`(%r10),%xmm4
 233         movdqa  `16*($k+3)-128`($bp),%xmm3
 234         pand    `16*($k+1)+112`(%r10),%xmm5
 235         por     %xmm4,%xmm0
 236         pand    `16*($k+2)+112`(%r10),%xmm2
 237         por     %xmm5,%xmm1
 238         pand    `16*($k+3)+112`(%r10),%xmm3
 239         por     %xmm2,%xmm0
 240         por     %xmm3,%xmm1
 241 ___
 242 }
 243 $code.=<<___;                           # merge accumulators into bp[0], first pass over ap[], head of outer loop
 244         por     %xmm1,%xmm0
 245         pshufd  \$0x4e,%xmm0,%xmm1
 246         por     %xmm1,%xmm0
 247         lea     $STRIDE($bp),$bp
 248         movq    %xmm0,$m0               # m0=bp[0]
 249
 250         mov     ($n0),$n0               # pull n0[0] value
 251         mov     ($ap),%rax
 252
 253         xor     $i,$i                   # i=0
 254         xor     $j,$j                   # j=0
 255
 256         mov     $n0,$m1
 257         mulq    $m0                     # ap[0]*bp[0]
 258         mov     %rax,$lo0
 259         mov     ($np),%rax
 260
 261         imulq   $lo0,$m1                # "tp[0]"*n0
 262         mov     %rdx,$hi0
 263
 264         mulq    $m1                     # np[0]*m1
 265         add     %rax,$lo0               # discarded
 266         mov     8($ap),%rax
 267         adc     \$0,%rdx
 268         mov     %rdx,$hi1
 269
 270         lea     1($j),$j                # j++
 271         jmp     .L1st_enter
 272
 273 .align  16
 274 .L1st:
 275         add     %rax,$hi1
 276         mov     ($ap,$j,8),%rax
 277         adc     \$0,%rdx
 278         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
 279         mov     $lo0,$hi0
 280         adc     \$0,%rdx
 281         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
 282         mov     %rdx,$hi1
 283
 284 .L1st_enter:
 285         mulq    $m0                     # ap[j]*bp[0]
 286         add     %rax,$hi0
 287         mov     ($np,$j,8),%rax
 288         adc     \$0,%rdx
 289         lea     1($j),$j                # j++
 290         mov     %rdx,$lo0
 291
 292         mulq    $m1                     # np[j]*m1
 293         cmp     $num,$j
 294         jne     .L1st                   # note that upon exit $j==$num, so
 295                                         # they can be used interchangeably
 296
 297         add     %rax,$hi1
 298         adc     \$0,%rdx
 299         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
 300         adc     \$0,%rdx
 301         mov     $hi1,-16(%rsp,$num,8)   # tp[num-1]
 302         mov     %rdx,$hi1
 303         mov     $lo0,$hi0
 304
 305         xor     %rdx,%rdx
 306         add     $hi0,$hi1
 307         adc     \$0,%rdx
 308         mov     $hi1,-8(%rsp,$num,8)
 309         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
 310
 311         lea     1($i),$i                # i++
 312         jmp     .Louter
 313 .align  16
 314 .Louter:
 315         lea     24+128(%rsp,$num,8),%rdx        # where 256-byte mask is (+size optimization)
 316         and     \$-16,%rdx
 317         pxor    %xmm4,%xmm4
 318         pxor    %xmm5,%xmm5
 319 ___
 320 for($k=0;$k<$STRIDE/16;$k+=4) {         # k=0,4,8,12: mask all 16 table vectors with the saved masks
 321 $code.=<<___;
 322         movdqa  `16*($k+0)-128`($bp),%xmm0
 323         movdqa  `16*($k+1)-128`($bp),%xmm1
 324         movdqa  `16*($k+2)-128`($bp),%xmm2
 325         movdqa  `16*($k+3)-128`($bp),%xmm3
 326         pand    `16*($k+0)-128`(%rdx),%xmm0
 327         pand    `16*($k+1)-128`(%rdx),%xmm1
 328         por     %xmm0,%xmm4
 329         pand    `16*($k+2)-128`(%rdx),%xmm2
 330         por     %xmm1,%xmm5
 331         pand    `16*($k+3)-128`(%rdx),%xmm3
 332         por     %xmm2,%xmm4
 333         por     %xmm3,%xmm5
 334 ___
 335 }
 336 $code.=<<___;                           # merge into bp[i], inner loop, final subtraction, copy-out, epilogue
 337         por     %xmm5,%xmm4
 338         pshufd  \$0x4e,%xmm4,%xmm0
 339         por     %xmm4,%xmm0
 340         lea     $STRIDE($bp),$bp
 341
 342         mov     ($ap),%rax              # ap[0]
 343         movq    %xmm0,$m0               # m0=bp[i]
 344
 345         xor     $j,$j                   # j=0
 346         mov     $n0,$m1
 347         mov     (%rsp),$lo0
 348
 349         mulq    $m0                     # ap[0]*bp[i]
 350         add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
 351         mov     ($np),%rax
 352         adc     \$0,%rdx
 353
 354         imulq   $lo0,$m1                # tp[0]*n0
 355         mov     %rdx,$hi0
 356
 357         mulq    $m1                     # np[0]*m1
 358         add     %rax,$lo0               # discarded
 359         mov     8($ap),%rax
 360         adc     \$0,%rdx
 361         mov     8(%rsp),$lo0            # tp[1]
 362         mov     %rdx,$hi1
 363
 364         lea     1($j),$j                # j++
 365         jmp     .Linner_enter
 366
 367 .align  16
 368 .Linner:
 369         add     %rax,$hi1
 370         mov     ($ap,$j,8),%rax
 371         adc     \$0,%rdx
 372         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
 373         mov     (%rsp,$j,8),$lo0
 374         adc     \$0,%rdx
 375         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
 376         mov     %rdx,$hi1
 377
 378 .Linner_enter:
 379         mulq    $m0                     # ap[j]*bp[i]
 380         add     %rax,$hi0
 381         mov     ($np,$j,8),%rax
 382         adc     \$0,%rdx
 383         add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
 384         mov     %rdx,$hi0
 385         adc     \$0,$hi0
 386         lea     1($j),$j                # j++
 387
 388         mulq    $m1                     # np[j]*m1
 389         cmp     $num,$j
 390         jne     .Linner                 # note that upon exit $j==$num, so
 391                                         # they can be used interchangeably
 392         add     %rax,$hi1
 393         adc     \$0,%rdx
 394         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
 395         mov     (%rsp,$num,8),$lo0
 396         adc     \$0,%rdx
 397         mov     $hi1,-16(%rsp,$num,8)   # tp[num-1]
 398         mov     %rdx,$hi1
 399
 400         xor     %rdx,%rdx
 401         add     $hi0,$hi1
 402         adc     \$0,%rdx
 403         add     $lo0,$hi1               # pull upmost overflow bit
 404         adc     \$0,%rdx
 405         mov     $hi1,-8(%rsp,$num,8)
 406         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
 407
 408         lea     1($i),$i                # i++
 409         cmp     $num,$i
 410         jb      .Louter
 411
 412         xor     $i,$i                   # i=0 and clear CF!
 413         mov     (%rsp),%rax             # tp[0]
 414         lea     (%rsp),$ap              # borrow ap for tp
 415         mov     $num,$j                 # j=num
 416         jmp     .Lsub
 417 .align  16
 418 .Lsub:  sbb     ($np,$i,8),%rax
 419         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
 420         mov     8($ap,$i,8),%rax        # tp[i+1]
 421         lea     1($i),$i                # i++
 422         dec     $j                      # doesnn't affect CF!
 423         jnz     .Lsub
 424
 425         sbb     \$0,%rax                # handle upmost overflow bit
 426         xor     $i,$i
 427         and     %rax,$ap
 428         not     %rax
 429         mov     $rp,$np
 430         and     %rax,$np
 431         mov     $num,$j                 # j=num
 432         or      $np,$ap                 # ap=borrow?tp:rp
 433 .align  16
 434 .Lcopy:                                 # copy or in-place refresh
 435         mov     ($ap,$i,8),%rax
 436         mov     $i,(%rsp,$i,8)          # zap temporary vector
 437         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
 438         lea     1($i),$i
 439         sub     \$1,$j
 440         jnz     .Lcopy
 441
 442         mov     8(%rsp,$num,8),%rsi     # restore %rsp
 443 .cfi_def_cfa    %rsi,8
 444         mov     \$1,%rax
 445
 446         mov     -48(%rsi),%r15
 447 .cfi_restore    %r15
 448         mov     -40(%rsi),%r14
 449 .cfi_restore    %r14
 450         mov     -32(%rsi),%r13
 451 .cfi_restore    %r13
 452         mov     -24(%rsi),%r12
 453 .cfi_restore    %r12
 454         mov     -16(%rsi),%rbp
 455 .cfi_restore    %rbp
 456         mov     -8(%rsi),%rbx
 457 .cfi_restore    %rbx
 458         lea     (%rsi),%rsp
 459 .cfi_def_cfa_register   %rsp
 460 .Lmul_epilogue:
 461         ret
 462 .cfi_endproc
 463 .size   bn_mul_mont_gather5,.-bn_mul_mont_gather5
 464 ___
 465 {{{
 466 my @A=("%r10","%r11");                  # register pair alternating as the ap[j]*bp[i] accumulator
 467 my @N=("%r13","%rdi");                  # register pair alternating as the np[j]*m1 accumulator; %rdi is also $rp
 468 $code.=<<___;                           # bn_mul4x_mont_gather5 entry; .Lmul4x_enter is the target of the jmp in bn_mul_mont_gather5
 469 .type   bn_mul4x_mont_gather5,\@function,6
 470 .align  32
 471 bn_mul4x_mont_gather5:
 472 .cfi_startproc
 473         .byte   0x67
 474         mov     %rsp,%rax
 475 .cfi_def_cfa_register   %rax
 476 .Lmul4x_enter:
 477 ___
 478 $code.=<<___ if ($addx);                # emitted only when $addx is set: go to .Lmulx4x_enter if all 0x80108 bits are present
 479         and     \$0x80108,%r11d
 480         cmp     \$0x80108,%r11d         # check for AD*X+BMI2+BMI1
 481         je      .Lmulx4x_enter
 482 ___
 483 $code.=<<___;                           # save registers, place the frame, call mul4x_internal, restore and return
 484         push    %rbx
 485 .cfi_push       %rbx
 486         push    %rbp
 487 .cfi_push       %rbp
 488         push    %r12
 489 .cfi_push       %r12
 490         push    %r13
 491 .cfi_push       %r13
 492         push    %r14
 493 .cfi_push       %r14
 494         push    %r15
 495 .cfi_push       %r15
 496 .Lmul4x_prologue:
 497
 498         .byte   0x67
 499         shl     \$3,${num}d             # convert $num to bytes
 500         lea     ($num,$num,2),%r10      # 3*$num in bytes
 501         neg     $num                    # -$num
 502
 503         ##############################################################
 504         # Ensure that stack frame doesn't alias with $rptr+3*$num
 505         # modulo 4096, which covers ret[num], am[num] and n[num]
 506         # (see bn_exp.c). This is done to allow memory disambiguation
 507         # logic do its magic. [Extra [num] is allocated in order
 508         # to align with bn_power5's frame, which is cleansed after
 509         # completing exponentiation. Extra 256 bytes is for power mask
 510         # calculated from 7th argument, the index.]
 511         #
 512         lea     -320(%rsp,$num,2),%r11
 513         mov     %rsp,%rbp
 514         sub     $rp,%r11
 515         and     \$4095,%r11
 516         cmp     %r11,%r10
 517         jb      .Lmul4xsp_alt
 518         sub     %r11,%rbp               # align with $rp
 519         lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
 520         jmp     .Lmul4xsp_done
 521
 522 .align  32
 523 .Lmul4xsp_alt:
 524         lea     4096-320(,$num,2),%r10
 525         lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
 526         sub     %r10,%r11
 527         mov     \$0,%r10
 528         cmovc   %r10,%r11
 529         sub     %r11,%rbp
 530 .Lmul4xsp_done:
 531         and     \$-64,%rbp
 532         mov     %rsp,%r11
 533         sub     %rbp,%r11
 534         and     \$-4096,%r11
 535         lea     (%rbp,%r11),%rsp
 536         mov     (%rsp),%r10
 537         cmp     %rbp,%rsp
 538         ja      .Lmul4x_page_walk
 539         jmp     .Lmul4x_page_walk_done
 540
 541 .Lmul4x_page_walk:
 542         lea     -4096(%rsp),%rsp
 543         mov     (%rsp),%r10
 544         cmp     %rbp,%rsp
 545         ja      .Lmul4x_page_walk
 546 .Lmul4x_page_walk_done:
 547
 548         neg     $num
 549
 550         mov     %rax,40(%rsp)
 551 .cfi_cfa_expression     %rsp+40,deref,+8
 552 .Lmul4x_body:
 553
 554         call    mul4x_internal
 555
 556         mov     40(%rsp),%rsi           # restore %rsp
 557 .cfi_def_cfa    %rsi,8
 558         mov     \$1,%rax
 559
 560         mov     -48(%rsi),%r15
 561 .cfi_restore    %r15
 562         mov     -40(%rsi),%r14
 563 .cfi_restore    %r14
 564         mov     -32(%rsi),%r13
 565 .cfi_restore    %r13
 566         mov     -24(%rsi),%r12
 567 .cfi_restore    %r12
 568         mov     -16(%rsi),%rbp
 569 .cfi_restore    %rbp
 570         mov     -8(%rsi),%rbx
 571 .cfi_restore    %rbx
 572         lea     (%rsi),%rsp
 573 .cfi_def_cfa_register   %rsp
 574 .Lmul4x_epilogue:
 575         ret
 576 .cfi_endproc
 577 .size   bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
 578
 579 .type   mul4x_internal,\@abi-omnipotent
 580 .align  32
 581 mul4x_internal:
 582         shl     \$5,$num                # $num was in bytes
 583         movd    `($win64?56:8)`(%rax),%xmm5     # load 7th argument, index
 584         lea     .Linc(%rip),%rax
 585         lea     128(%rdx,$num),%r13     # end of powers table (+size optimization)
 586         shr     \$5,$num                # restore $num
 587 ___
 588                 $bp="%r12";
 589                 $STRIDE=2**5*8;         # 5 is "window size"
 590                 $N=$STRIDE/4;           # should match cache line size
 591                 $tp=$i;
 592 $code.=<<___;
 593         movdqa  0(%rax),%xmm0           # 00000001000000010000000000000000
 594         movdqa  16(%rax),%xmm1          # 00000002000000020000000200000002
 595         lea     88-112(%rsp,$num),%r10  # place the mask after tp[num+1] (+ICache optimization)
 596         lea     128(%rdx),$bp           # size optimization
 597
 598         pshufd  \$0,%xmm5,%xmm5         # broadcast index
 599         movdqa  %xmm1,%xmm4
 600         .byte   0x67,0x67
 601         movdqa  %xmm1,%xmm2
 602 ___
 603 ########################################################################
 604 # calculate mask by comparing 0..31 to index and save result to stack
 605 #
 606 $code.=<<___;
 607         paddd   %xmm0,%xmm1
 608         pcmpeqd %xmm5,%xmm0             # compare to 1,0
 609         .byte   0x67
 610         movdqa  %xmm4,%xmm3
 611 ___
 612 for($i=0;$i<$STRIDE/16-4;$i+=4) {
 613 $code.=<<___;
 614         paddd   %xmm1,%xmm2
 615         pcmpeqd %xmm5,%xmm1             # compare to 3,2
 616         movdqa  %xmm0,`16*($i+0)+112`(%r10)
 617         movdqa  %xmm4,%xmm0
 618
 619         paddd   %xmm2,%xmm3
 620         pcmpeqd %xmm5,%xmm2             # compare to 5,4
 621         movdqa  %xmm1,`16*($i+1)+112`(%r10)
 622         movdqa  %xmm4,%xmm1
 623
 624         paddd   %xmm3,%xmm0
 625         pcmpeqd %xmm5,%xmm3             # compare to 7,6
 626         movdqa  %xmm2,`16*($i+2)+112`(%r10)
 627         movdqa  %xmm4,%xmm2
 628
 629         paddd   %xmm0,%xmm1
 630         pcmpeqd %xmm5,%xmm0
 631         movdqa  %xmm3,`16*($i+3)+112`(%r10)
 632         movdqa  %xmm4,%xmm3
 633 ___
 634 }
 635 $code.=<<___;                           # last iteration can be optimized
 636         paddd   %xmm1,%xmm2
 637         pcmpeqd %xmm5,%xmm1
 638         movdqa  %xmm0,`16*($i+0)+112`(%r10)
 639
 640         paddd   %xmm2,%xmm3
 641         .byte   0x67
 642         pcmpeqd %xmm5,%xmm2
 643         movdqa  %xmm1,`16*($i+1)+112`(%r10)
 644
 645         pcmpeqd %xmm5,%xmm3
 646         movdqa  %xmm2,`16*($i+2)+112`(%r10)
 647         pand    `16*($i+0)-128`($bp),%xmm0      # while it's still in register
 648
 649         pand    `16*($i+1)-128`($bp),%xmm1
 650         pand    `16*($i+2)-128`($bp),%xmm2
 651         movdqa  %xmm3,`16*($i+3)+112`(%r10)
 652         pand    `16*($i+3)-128`($bp),%xmm3
 653         por     %xmm2,%xmm0
 654         por     %xmm3,%xmm1
 655 ___
 656 for($i=0;$i<$STRIDE/16-4;$i+=4) {
 657 $code.=<<___;
 658         movdqa  `16*($i+0)-128`($bp),%xmm4
 659         movdqa  `16*($i+1)-128`($bp),%xmm5
 660         movdqa  `16*($i+2)-128`($bp),%xmm2
 661         pand    `16*($i+0)+112`(%r10),%xmm4
 662         movdqa  `16*($i+3)-128`($bp),%xmm3
 663         pand    `16*($i+1)+112`(%r10),%xmm5
 664         por     %xmm4,%xmm0
 665         pand    `16*($i+2)+112`(%r10),%xmm2
 666         por     %xmm5,%xmm1
 667         pand    `16*($i+3)+112`(%r10),%xmm3
 668         por     %xmm2,%xmm0
 669         por     %xmm3,%xmm1
 670 ___
 671 }
 672 $code.=<<___;
 673         por     %xmm1,%xmm0
 674         pshufd  \$0x4e,%xmm0,%xmm1
 675         por     %xmm1,%xmm0
 676         lea     $STRIDE($bp),$bp
 677         movq    %xmm0,$m0               # m0=bp[0]
 678
 679         mov     %r13,16+8(%rsp)         # save end of b[num]
 680         mov     $rp, 56+8(%rsp)         # save $rp
 681
 682         mov     ($n0),$n0               # pull n0[0] value
 683         mov     ($ap),%rax
 684         lea     ($ap,$num),$ap          # end of a[num]
 685         neg     $num
 686
 687         mov     $n0,$m1
 688         mulq    $m0                     # ap[0]*bp[0]
 689         mov     %rax,$A[0]
 690         mov     ($np),%rax
 691
 692         imulq   $A[0],$m1               # "tp[0]"*n0
 693         lea     64+8(%rsp),$tp
 694         mov     %rdx,$A[1]
 695
 696         mulq    $m1                     # np[0]*m1
 697         add     %rax,$A[0]              # discarded
 698         mov     8($ap,$num),%rax
 699         adc     \$0,%rdx
 700         mov     %rdx,$N[1]
 701
 702         mulq    $m0
 703         add     %rax,$A[1]
 704         mov     8*1($np),%rax
 705         adc     \$0,%rdx
 706         mov     %rdx,$A[0]
 707
 708         mulq    $m1
 709         add     %rax,$N[1]
 710         mov     16($ap,$num),%rax
 711         adc     \$0,%rdx
 712         add     $A[1],$N[1]
 713         lea     4*8($num),$j            # j=4
 714         lea     8*4($np),$np
 715         adc     \$0,%rdx
 716         mov     $N[1],($tp)
 717         mov     %rdx,$N[0]
 718         jmp     .L1st4x
 719
 720 .align  32
 721 .L1st4x:
 722         mulq    $m0                     # ap[j]*bp[0]
 723         add     %rax,$A[0]
 724         mov     -8*2($np),%rax
 725         lea     32($tp),$tp
 726         adc     \$0,%rdx
 727         mov     %rdx,$A[1]
 728
 729         mulq    $m1                     # np[j]*m1
 730         add     %rax,$N[0]
 731         mov     -8($ap,$j),%rax
 732         adc     \$0,%rdx
 733         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
 734         adc     \$0,%rdx
 735         mov     $N[0],-24($tp)          # tp[j-1]
 736         mov     %rdx,$N[1]
 737
 738         mulq    $m0                     # ap[j]*bp[0]
 739         add     %rax,$A[1]
 740         mov     -8*1($np),%rax
 741         adc     \$0,%rdx
 742         mov     %rdx,$A[0]
 743
 744         mulq    $m1                     # np[j]*m1
 745         add     %rax,$N[1]
 746         mov     ($ap,$j),%rax
 747         adc     \$0,%rdx
 748         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
 749         adc     \$0,%rdx
 750         mov     $N[1],-16($tp)          # tp[j-1]
 751         mov     %rdx,$N[0]
 752
 753         mulq    $m0                     # ap[j]*bp[0]
 754         add     %rax,$A[0]
 755         mov     8*0($np),%rax
 756         adc     \$0,%rdx
 757         mov     %rdx,$A[1]
 758
 759         mulq    $m1                     # np[j]*m1
 760         add     %rax,$N[0]
 761         mov     8($ap,$j),%rax
 762         adc     \$0,%rdx
 763         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
 764         adc     \$0,%rdx
 765         mov     $N[0],-8($tp)           # tp[j-1]
 766         mov     %rdx,$N[1]
 767
 768         mulq    $m0                     # ap[j]*bp[0]
 769         add     %rax,$A[1]
 770         mov     8*1($np),%rax
 771         adc     \$0,%rdx
 772         mov     %rdx,$A[0]
 773
 774         mulq    $m1                     # np[j]*m1
 775         add     %rax,$N[1]
 776         mov     16($ap,$j),%rax
 777         adc     \$0,%rdx
 778         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
 779         lea     8*4($np),$np
 780         adc     \$0,%rdx
 781         mov     $N[1],($tp)             # tp[j-1]
 782         mov     %rdx,$N[0]
 783
 784         add     \$32,$j                 # j+=4
 785         jnz     .L1st4x
 786
 787         mulq    $m0                     # ap[j]*bp[0]
 788         add     %rax,$A[0]
 789         mov     -8*2($np),%rax
 790         lea     32($tp),$tp
 791         adc     \$0,%rdx
 792         mov     %rdx,$A[1]
 793
 794         mulq    $m1                     # np[j]*m1
 795         add     %rax,$N[0]
 796         mov     -8($ap),%rax
 797         adc     \$0,%rdx
 798         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
 799         adc     \$0,%rdx
 800         mov     $N[0],-24($tp)          # tp[j-1]
 801         mov     %rdx,$N[1]
 802
 803         mulq    $m0                     # ap[j]*bp[0]
 804         add     %rax,$A[1]
 805         mov     -8*1($np),%rax
 806         adc     \$0,%rdx
 807         mov     %rdx,$A[0]
 808
 809         mulq    $m1                     # np[j]*m1
 810         add     %rax,$N[1]
 811         mov     ($ap,$num),%rax         # ap[0]
 812         adc     \$0,%rdx
 813         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
 814         adc     \$0,%rdx
 815         mov     $N[1],-16($tp)          # tp[j-1]
 816         mov     %rdx,$N[0]
 817
 818         lea     ($np,$num),$np          # rewind $np
 819
 820         xor     $N[1],$N[1]
 821         add     $A[0],$N[0]
 822         adc     \$0,$N[1]
 823         mov     $N[0],-8($tp)
 824
 825         jmp     .Louter4x
 826
 827 .align  32
 828 .Louter4x:
 829         lea     16+128($tp),%rdx        # where 256-byte mask is (+size optimization)
 830         pxor    %xmm4,%xmm4
 831         pxor    %xmm5,%xmm5
 832 ___
 833 for($i=0;$i<$STRIDE/16;$i+=4) {
 834 $code.=<<___;
 835         movdqa  `16*($i+0)-128`($bp),%xmm0
 836         movdqa  `16*($i+1)-128`($bp),%xmm1
 837         movdqa  `16*($i+2)-128`($bp),%xmm2
 838         movdqa  `16*($i+3)-128`($bp),%xmm3
 839         pand    `16*($i+0)-128`(%rdx),%xmm0
 840         pand    `16*($i+1)-128`(%rdx),%xmm1
 841         por     %xmm0,%xmm4
 842         pand    `16*($i+2)-128`(%rdx),%xmm2
 843         por     %xmm1,%xmm5
 844         pand    `16*($i+3)-128`(%rdx),%xmm3
 845         por     %xmm2,%xmm4
 846         por     %xmm3,%xmm5
 847 ___
 848 }
 849 $code.=<<___;
 850         por     %xmm5,%xmm4
 851         pshufd  \$0x4e,%xmm4,%xmm0
 852         por     %xmm4,%xmm0
 853         lea     $STRIDE($bp),$bp
 854         movq    %xmm0,$m0               # m0=bp[i]
 855
 856         mov     ($tp,$num),$A[0]
 857         mov     $n0,$m1
 858         mulq    $m0                     # ap[0]*bp[i]
 859         add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
 860         mov     ($np),%rax
 861         adc     \$0,%rdx
 862
 863         imulq   $A[0],$m1               # tp[0]*n0
 864         mov     %rdx,$A[1]
 865         mov     $N[1],($tp)             # store upmost overflow bit
 866
 867         lea     ($tp,$num),$tp          # rewind $tp
 868
 869         mulq    $m1                     # np[0]*m1
 870         add     %rax,$A[0]              # "$N[0]", discarded
 871         mov     8($ap,$num),%rax
 872         adc     \$0,%rdx
 873         mov     %rdx,$N[1]
 874
 875         mulq    $m0                     # ap[j]*bp[i]
 876         add     %rax,$A[1]
 877         mov     8*1($np),%rax
 878         adc     \$0,%rdx
 879         add     8($tp),$A[1]            # +tp[1]
 880         adc     \$0,%rdx
 881         mov     %rdx,$A[0]
 882
 883         mulq    $m1                     # np[j]*m1
 884         add     %rax,$N[1]
 885         mov     16($ap,$num),%rax
 886         adc     \$0,%rdx
 887         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
 888         lea     4*8($num),$j            # j=4
 889         lea     8*4($np),$np
 890         adc     \$0,%rdx
 891         mov     %rdx,$N[0]
 892         jmp     .Linner4x
 893
 894 .align  32
 895 .Linner4x:
 896         mulq    $m0                     # ap[j]*bp[i]
 897         add     %rax,$A[0]
 898         mov     -8*2($np),%rax
 899         adc     \$0,%rdx
 900         add     16($tp),$A[0]           # ap[j]*bp[i]+tp[j]
 901         lea     32($tp),$tp
 902         adc     \$0,%rdx
 903         mov     %rdx,$A[1]
 904
 905         mulq    $m1                     # np[j]*m1
 906         add     %rax,$N[0]
 907         mov     -8($ap,$j),%rax
 908         adc     \$0,%rdx
 909         add     $A[0],$N[0]
 910         adc     \$0,%rdx
 911         mov     $N[1],-32($tp)          # tp[j-1]
 912         mov     %rdx,$N[1]
 913
 914         mulq    $m0                     # ap[j]*bp[i]
 915         add     %rax,$A[1]
 916         mov     -8*1($np),%rax
 917         adc     \$0,%rdx
 918         add     -8($tp),$A[1]
 919         adc     \$0,%rdx
 920         mov     %rdx,$A[0]
 921
 922         mulq    $m1                     # np[j]*m1
 923         add     %rax,$N[1]
 924         mov     ($ap,$j),%rax
 925         adc     \$0,%rdx
 926         add     $A[1],$N[1]
 927         adc     \$0,%rdx
 928         mov     $N[0],-24($tp)          # tp[j-1]
 929         mov     %rdx,$N[0]
 930
 931         mulq    $m0                     # ap[j]*bp[i]
 932         add     %rax,$A[0]
 933         mov     8*0($np),%rax
 934         adc     \$0,%rdx
 935         add     ($tp),$A[0]             # ap[j]*bp[i]+tp[j]
 936         adc     \$0,%rdx
 937         mov     %rdx,$A[1]
 938
 939         mulq    $m1                     # np[j]*m1
 940         add     %rax,$N[0]
 941         mov     8($ap,$j),%rax
 942         adc     \$0,%rdx
 943         add     $A[0],$N[0]
 944         adc     \$0,%rdx
 945         mov     $N[1],-16($tp)          # tp[j-1]
 946         mov     %rdx,$N[1]
 947
 948         mulq    $m0                     # ap[j]*bp[i]
 949         add     %rax,$A[1]
 950         mov     8*1($np),%rax
 951         adc     \$0,%rdx
 952         add     8($tp),$A[1]
 953         adc     \$0,%rdx
 954         mov     %rdx,$A[0]
 955
 956         mulq    $m1                     # np[j]*m1
 957         add     %rax,$N[1]
 958         mov     16($ap,$j),%rax
 959         adc     \$0,%rdx
 960         add     $A[1],$N[1]
 961         lea     8*4($np),$np
 962         adc     \$0,%rdx
 963         mov     $N[0],-8($tp)           # tp[j-1]
 964         mov     %rdx,$N[0]
 965
 966         add     \$32,$j                 # j+=4
 967         jnz     .Linner4x
 968
 969         mulq    $m0                     # ap[j]*bp[i]
 970         add     %rax,$A[0]
 971         mov     -8*2($np),%rax
 972         adc     \$0,%rdx
 973         add     16($tp),$A[0]           # ap[j]*bp[i]+tp[j]
 974         lea     32($tp),$tp
 975         adc     \$0,%rdx
 976         mov     %rdx,$A[1]
 977
 978         mulq    $m1                     # np[j]*m1
 979         add     %rax,$N[0]
 980         mov     -8($ap),%rax
 981         adc     \$0,%rdx
 982         add     $A[0],$N[0]
 983         adc     \$0,%rdx
 984         mov     $N[1],-32($tp)          # tp[j-1]
 985         mov     %rdx,$N[1]
 986
 987         mulq    $m0                     # ap[j]*bp[i]
 988         add     %rax,$A[1]
 989         mov     $m1,%rax
 990         mov     -8*1($np),$m1
 991         adc     \$0,%rdx
 992         add     -8($tp),$A[1]
 993         adc     \$0,%rdx
 994         mov     %rdx,$A[0]
 995
 996         mulq    $m1                     # np[j]*m1
 997         add     %rax,$N[1]
 998         mov     ($ap,$num),%rax         # ap[0]
 999         adc     \$0,%rdx
1000         add     $A[1],$N[1]
1001         adc     \$0,%rdx
1002         mov     $N[0],-24($tp)          # tp[j-1]
1003         mov     %rdx,$N[0]
1004
1005         mov     $N[1],-16($tp)          # tp[j-1]
1006         lea     ($np,$num),$np          # rewind $np
1007
1008         xor     $N[1],$N[1]
1009         add     $A[0],$N[0]
1010         adc     \$0,$N[1]
1011         add     ($tp),$N[0]             # pull upmost overflow bit
1012         adc     \$0,$N[1]               # upmost overflow bit
1013         mov     $N[0],-8($tp)
1014
1015         cmp     16+8(%rsp),$bp
1016         jb      .Louter4x
1017 ___
1018 if (1) {
1019 $code.=<<___;
1020         xor     %rax,%rax
1021         sub     $N[0],$m1               # compare top-most words
1022         adc     $j,$j                   # $j is zero
1023         or      $j,$N[1]
1024         sub     $N[1],%rax              # %rax=-$N[1]
1025         lea     ($tp,$num),%rbx         # tptr in .sqr4x_sub
1026         mov     ($np),%r12
1027         lea     ($np),%rbp              # nptr in .sqr4x_sub
1028         mov     %r9,%rcx
1029         sar     \$3+2,%rcx
1030         mov     56+8(%rsp),%rdi         # rptr in .sqr4x_sub
1031         dec     %r12                    # so that after 'not' we get -n[0]
1032         xor     %r10,%r10
1033         mov     8*1(%rbp),%r13
1034         mov     8*2(%rbp),%r14
1035         mov     8*3(%rbp),%r15
1036         jmp     .Lsqr4x_sub_entry
1037 ___
1038 } else {
1039 my @ri=("%rax",$bp,$m0,$m1);
1040 my $rp="%rdx";
1041 $code.=<<___
1042         xor     \$1,$N[1]
1043         lea     ($tp,$num),$tp          # rewind $tp
1044         sar     \$5,$num                # cf=0
1045         lea     ($np,$N[1],8),$np
1046         mov     56+8(%rsp),$rp          # restore $rp
1047         jmp     .Lsub4x
1048
1049 .align  32
1050 .Lsub4x:
1051         .byte   0x66
1052         mov     8*0($tp),@ri[0]
1053         mov     8*1($tp),@ri[1]
1054         .byte   0x66
1055         sbb     16*0($np),@ri[0]
1056         mov     8*2($tp),@ri[2]
1057         sbb     16*1($np),@ri[1]
1058         mov     3*8($tp),@ri[3]
1059         lea     4*8($tp),$tp
1060         sbb     16*2($np),@ri[2]
1061         mov     @ri[0],8*0($rp)
1062         sbb     16*3($np),@ri[3]
1063         lea     16*4($np),$np
1064         mov     @ri[1],8*1($rp)
1065         mov     @ri[2],8*2($rp)
1066         mov     @ri[3],8*3($rp)
1067         lea     8*4($rp),$rp
1068
1069         inc     $num
1070         jnz     .Lsub4x
1071
1072         ret
1073 ___
1074 }
1075 $code.=<<___;
1076 .size   mul4x_internal,.-mul4x_internal
1077 ___
1078 }}}
1079 \f{{{
 1080 ######################################################################
      # Generator for the bn_power5 entry point.  As emitted below it:
      #   - optionally (only when $addx is set) tests the dword at
      #     OPENSSL_ia32cap_P+8 against mask 0x80108 and branches to
      #     .Lpowerx5_enter, a label defined outside this section;
      #   - pushes %rbx, %rbp, %r12-%r15 and carves a 64-byte aligned
      #     frame, reading one word per 4096-byte page on the way down;
      #   - stores *n0 at 32(%rsp) and the caller's %rsp at 40(%rsp);
      #   - calls __bn_sqr8x_internal followed by __bn_post4x_internal
      #     five times, then mul4x_internal once;
      #   - reloads the saved registers and returns 1 in %rax.
      # NOTE(review): five square passes plus one table multiply
      # presumably yield a^(2^5)*table[pwr]; the helper bodies are not
      # fully visible from here, so confirm against them.
      #
      # The "my" variables below map the C prototype onto the argument
      # registers that the heredocs interpolate.
 1081 # void bn_power5(
 1082 my $rptr="%rdi";        # BN_ULONG *rptr,
 1083 my $aptr="%rsi";        # const BN_ULONG *aptr,
 1084 my $bptr="%rdx";        # const void *table,
 1085 my $nptr="%rcx";        # const BN_ULONG *nptr,
 1086 my $n0  ="%r8";         # const BN_ULONG *n0);
 1087 my $num ="%r9";         # int num, has to be divisible by 8
 1088                         # int pwr
 1089
      # Scratch names.  bn_power5's own instructions do not reference
      # these; they are interpolated by the squaring code that follows in
      # the same heredoc.  Note the deliberate register sharing: $j is
      # %rcx (same as $nptr) and $tptr is %rdi (same as $rptr), which is
      # why bn_power5 parks $rptr/$nptr/$bptr in %xmm1/%xmm2/%xmm4 before
      # its first call.
 1090 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
 1091 my @A0=("%r10","%r11");
 1092 my @A1=("%r12","%r13");
 1093 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
 1094
      # Symbol header and entry: the caller's %rsp is copied to %rax
      # before anything is pushed, and %rax becomes the CFA register.
 1095 $code.=<<___;
 1096 .globl  bn_power5
 1097 .type   bn_power5,\@function,6
 1098 .align  32
 1099 bn_power5:
 1100 .cfi_startproc
 1101         mov     %rsp,%rax
 1102 .cfi_def_cfa_register   %rax
 1103 ___
      # Emitted only when $addx is true: runtime dispatch to
      # .Lpowerx5_enter (not defined in this section) when every bit of
      # 0x80108 is set in the capability dword.
 1104 $code.=<<___ if ($addx);
 1105         mov     OPENSSL_ia32cap_P+8(%rip),%r11d
 1106         and     \$0x80108,%r11d
 1107         cmp     \$0x80108,%r11d         # check for AD*X+BMI2+BMI1
 1108         je      .Lpowerx5_enter
 1109 ___
      # Prologue, frame allocation, the square/post/multiply sequence and
      # the epilogue.  This heredoc is NOT closed at the end of bn_power5:
      # it continues through the start of bn_sqr8x_internal, so nothing
      # but assembly text may be inserted below this line.
 1110 $code.=<<___;
 1111         push    %rbx
 1112 .cfi_push       %rbx
 1113         push    %rbp
 1114 .cfi_push       %rbp
 1115         push    %r12
 1116 .cfi_push       %r12
 1117         push    %r13
 1118 .cfi_push       %r13
 1119         push    %r14
 1120 .cfi_push       %r14
 1121         push    %r15
 1122 .cfi_push       %r15
 1123 .Lpower5_prologue:
 1124
 1125         shl     \$3,${num}d             # convert $num to bytes
 1126         lea     ($num,$num,2),%r10d     # 3*$num
 1127         neg     $num
 1128         mov     ($n0),$n0               # *n0
 1129
 1130         ##############################################################
 1131         # Ensure that stack frame doesn't alias with $rptr+3*$num
 1132         # modulo 4096, which covers ret[num], am[num] and n[num]
 1133         # (see bn_exp.c). This is done to allow memory disambiguation
 1134         # logic do its magic. [Extra 256 bytes is for power mask
 1135         # calculated from 7th argument, the index.]
 1136         #
 1137         lea     -320(%rsp,$num,2),%r11
 1138         mov     %rsp,%rbp
 1139         sub     $rptr,%r11
 1140         and     \$4095,%r11
 1141         cmp     %r11,%r10
 1142         jb      .Lpwr_sp_alt
 1143         sub     %r11,%rbp               # align with $aptr
 1144         lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
 1145         jmp     .Lpwr_sp_done
 1146
 1147 .align  32
 1148 .Lpwr_sp_alt:
 1149         lea     4096-320(,$num,2),%r10
 1150         lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*num*8+256)
 1151         sub     %r10,%r11
 1152         mov     \$0,%r10
 1153         cmovc   %r10,%r11
 1154         sub     %r11,%rbp
 1155 .Lpwr_sp_done:
 1156         and     \$-64,%rbp
 1157         mov     %rsp,%r11
 1158         sub     %rbp,%r11
 1159         and     \$-4096,%r11
 1160         lea     (%rbp,%r11),%rsp
 1161         mov     (%rsp),%r10
 1162         cmp     %rbp,%rsp
 1163         ja      .Lpwr_page_walk
 1164         jmp     .Lpwr_page_walk_done
 1165
 1166 .Lpwr_page_walk:
 1167         lea     -4096(%rsp),%rsp
 1168         mov     (%rsp),%r10
 1169         cmp     %rbp,%rsp
 1170         ja      .Lpwr_page_walk
 1171 .Lpwr_page_walk_done:
 1172
 1173         mov     $num,%r10
 1174         neg     $num
 1175
 1176         ##############################################################
 1177         # Stack layout
 1178         #
 1179         # +0    saved $num, used in reduction section
 1180         # +8    &t[2*$num], used in reduction section
 1181         # +32   saved *n0
 1182         # +40   saved %rsp
 1183         # +48   t[2*$num]
 1184         #
 1185         mov     $n0,  32(%rsp)
 1186         mov     %rax, 40(%rsp)          # save original %rsp
 1187 .cfi_cfa_expression     %rsp+40,deref,+8
 1188 .Lpower5_body:
 1189         movq    $rptr,%xmm1             # save $rptr, used in sqr8x
 1190         movq    $nptr,%xmm2             # save $nptr
 1191         movq    %r10, %xmm3             # -$num, used in sqr8x
 1192         movq    $bptr,%xmm4
 1193
 1194         call    __bn_sqr8x_internal
 1195         call    __bn_post4x_internal
 1196         call    __bn_sqr8x_internal
 1197         call    __bn_post4x_internal
 1198         call    __bn_sqr8x_internal
 1199         call    __bn_post4x_internal
 1200         call    __bn_sqr8x_internal
 1201         call    __bn_post4x_internal
 1202         call    __bn_sqr8x_internal
 1203         call    __bn_post4x_internal
 1204
 1205         movq    %xmm2,$nptr
 1206         movq    %xmm4,$bptr
 1207         mov     $aptr,$rptr
 1208         mov     40(%rsp),%rax
 1209         lea     32(%rsp),$n0
 1210
 1211         call    mul4x_internal
 1212
 1213         mov     40(%rsp),%rsi           # restore %rsp
 1214 .cfi_def_cfa    %rsi,8
 1215         mov     \$1,%rax
 1216         mov     -48(%rsi),%r15
 1217 .cfi_restore    %r15
 1218         mov     -40(%rsi),%r14
 1219 .cfi_restore    %r14
 1220         mov     -32(%rsi),%r13
 1221 .cfi_restore    %r13
 1222         mov     -24(%rsi),%r12
 1223 .cfi_restore    %r12
 1224         mov     -16(%rsi),%rbp
 1225 .cfi_restore    %rbp
 1226         mov     -8(%rsi),%rbx
 1227 .cfi_restore    %rbx
 1228         lea     (%rsi),%rsp
 1229 .cfi_def_cfa_register   %rsp
 1230 .Lpower5_epilogue:
 1231         ret
 1232 .cfi_endproc
 1233 .size   bn_power5,.-bn_power5
1234
1235 .globl  bn_sqr8x_internal
1236 .hidden bn_sqr8x_internal
1237 .type   bn_sqr8x_internal,\@abi-omnipotent
1238 .align  32
1239 bn_sqr8x_internal:
1240 __bn_sqr8x_internal:
1241         ##############################################################
1242         # Squaring part:
1243         #
1244         # a) multiply-n-add everything but a[i]*a[i];
1245         # b) shift result of a) by 1 to the left and accumulate
1246         #    a[i]*a[i] products;
1247         #
1248         ##############################################################
1249         #                                                     a[1]a[0]
1250         #                                                 a[2]a[0]
1251         #                                             a[3]a[0]
1252         #                                             a[2]a[1]
1253         #                                         a[4]a[0]
1254         #                                         a[3]a[1]
1255         #                                     a[5]a[0]
1256         #                                     a[4]a[1]
1257         #                                     a[3]a[2]
1258         #                                 a[6]a[0]
1259         #                                 a[5]a[1]
1260         #                                 a[4]a[2]
1261         #                             a[7]a[0]
1262         #                             a[6]a[1]
1263         #                             a[5]a[2]
1264         #                             a[4]a[3]
1265         #                         a[7]a[1]
1266         #                         a[6]a[2]
1267         #                         a[5]a[3]
1268         #                     a[7]a[2]
1269         #                     a[6]a[3]
1270         #                     a[5]a[4]
1271         #                 a[7]a[3]
1272         #                 a[6]a[4]
1273         #             a[7]a[4]
1274         #             a[6]a[5]
1275         #         a[7]a[5]
1276         #     a[7]a[6]
1277         #                                                     a[1]a[0]
1278         #                                                 a[2]a[0]
1279         #                                             a[3]a[0]
1280         #                                         a[4]a[0]
1281         #                                     a[5]a[0]
1282         #                                 a[6]a[0]
1283         #                             a[7]a[0]
1284         #                                             a[2]a[1]
1285         #                                         a[3]a[1]
1286         #                                     a[4]a[1]
1287         #                                 a[5]a[1]
1288         #                             a[6]a[1]
1289         #                         a[7]a[1]
1290         #                                     a[3]a[2]
1291         #                                 a[4]a[2]
1292         #                             a[5]a[2]
1293         #                         a[6]a[2]
1294         #                     a[7]a[2]
1295         #                             a[4]a[3]
1296         #                         a[5]a[3]
1297         #                     a[6]a[3]
1298         #                 a[7]a[3]
1299         #                     a[5]a[4]
1300         #                 a[6]a[4]
1301         #             a[7]a[4]
1302         #             a[6]a[5]
1303         #         a[7]a[5]
1304         #     a[7]a[6]
1305         #                                                         a[0]a[0]
1306         #                                                 a[1]a[1]
1307         #                                         a[2]a[2]
1308         #                                 a[3]a[3]
1309         #                         a[4]a[4]
1310         #                 a[5]a[5]
1311         #         a[6]a[6]
1312         # a[7]a[7]
1313
1314         lea     32(%r10),$i             # $i=-($num-32)
1315         lea     ($aptr,$num),$aptr      # end of a[] buffer, ($aptr,$i)=&ap[2]
1316
1317         mov     $num,$j                 # $j=$num
1318
1319                                         # comments apply to $num==8 case
1320         mov     -32($aptr,$i),$a0       # a[0]
1321         lea     48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
1322         mov     -24($aptr,$i),%rax      # a[1]
1323         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
1324         mov     -16($aptr,$i),$ai       # a[2]
1325         mov     %rax,$a1
1326
1327         mul     $a0                     # a[1]*a[0]
1328         mov     %rax,$A0[0]             # a[1]*a[0]
1329          mov    $ai,%rax                # a[2]
1330         mov     %rdx,$A0[1]
1331         mov     $A0[0],-24($tptr,$i)    # t[1]
1332
1333         mul     $a0                     # a[2]*a[0]
1334         add     %rax,$A0[1]
1335          mov    $ai,%rax
1336         adc     \$0,%rdx
1337         mov     $A0[1],-16($tptr,$i)    # t[2]
1338         mov     %rdx,$A0[0]
1339
1340
1341          mov    -8($aptr,$i),$ai        # a[3]
1342         mul     $a1                     # a[2]*a[1]
1343         mov     %rax,$A1[0]             # a[2]*a[1]+t[3]
1344          mov    $ai,%rax
1345         mov     %rdx,$A1[1]
1346
1347          lea    ($i),$j
1348         mul     $a0                     # a[3]*a[0]
1349         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
1350          mov    $ai,%rax
1351         mov     %rdx,$A0[1]
1352         adc     \$0,$A0[1]
1353         add     $A1[0],$A0[0]
1354         adc     \$0,$A0[1]
1355         mov     $A0[0],-8($tptr,$j)     # t[3]
1356         jmp     .Lsqr4x_1st
1357
1358 .align  32
1359 .Lsqr4x_1st:
1360          mov    ($aptr,$j),$ai          # a[4]
1361         mul     $a1                     # a[3]*a[1]
1362         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
1363          mov    $ai,%rax
1364         mov     %rdx,$A1[0]
1365         adc     \$0,$A1[0]
1366
1367         mul     $a0                     # a[4]*a[0]
1368         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
1369          mov    $ai,%rax                # a[3]
1370          mov    8($aptr,$j),$ai         # a[5]
1371         mov     %rdx,$A0[0]
1372         adc     \$0,$A0[0]
1373         add     $A1[1],$A0[1]
1374         adc     \$0,$A0[0]
1375
1376
1377         mul     $a1                     # a[4]*a[3]
1378         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
1379          mov    $ai,%rax
1380          mov    $A0[1],($tptr,$j)       # t[4]
1381         mov     %rdx,$A1[1]
1382         adc     \$0,$A1[1]
1383
1384         mul     $a0                     # a[5]*a[2]
1385         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
1386          mov    $ai,%rax
1387          mov    16($aptr,$j),$ai        # a[6]
1388         mov     %rdx,$A0[1]
1389         adc     \$0,$A0[1]
1390         add     $A1[0],$A0[0]
1391         adc     \$0,$A0[1]
1392
1393         mul     $a1                     # a[5]*a[3]
1394         add     %rax,$A1[1]             # a[5]*a[3]+t[6]
1395          mov    $ai,%rax
1396          mov    $A0[0],8($tptr,$j)      # t[5]
1397         mov     %rdx,$A1[0]
1398         adc     \$0,$A1[0]
1399
1400         mul     $a0                     # a[6]*a[2]
1401         add     %rax,$A0[1]             # a[6]*a[2]+a[5]*a[3]+t[6]
1402          mov    $ai,%rax                # a[3]
1403          mov    24($aptr,$j),$ai        # a[7]
1404         mov     %rdx,$A0[0]
1405         adc     \$0,$A0[0]
1406         add     $A1[1],$A0[1]
1407         adc     \$0,$A0[0]
1408
1409
1410         mul     $a1                     # a[6]*a[5]
1411         add     %rax,$A1[0]             # a[6]*a[5]+t[7]
1412          mov    $ai,%rax
1413          mov    $A0[1],16($tptr,$j)     # t[6]
1414         mov     %rdx,$A1[1]
1415         adc     \$0,$A1[1]
1416          lea    32($j),$j
1417
1418         mul     $a0                     # a[7]*a[4]
1419         add     %rax,$A0[0]             # a[7]*a[4]+a[6]*a[5]+t[6]
1420          mov    $ai,%rax
1421         mov     %rdx,$A0[1]
1422         adc     \$0,$A0[1]
1423         add     $A1[0],$A0[0]
1424         adc     \$0,$A0[1]
1425         mov     $A0[0],-8($tptr,$j)     # t[7]
1426
1427         cmp     \$0,$j
1428         jne     .Lsqr4x_1st
1429
1430         mul     $a1                     # a[7]*a[5]
1431         add     %rax,$A1[1]
1432         lea     16($i),$i
1433         adc     \$0,%rdx
1434         add     $A0[1],$A1[1]
1435         adc     \$0,%rdx
1436
1437         mov     $A1[1],($tptr)          # t[8]
1438         mov     %rdx,$A1[0]
1439         mov     %rdx,8($tptr)           # t[9]
1440         jmp     .Lsqr4x_outer
1441
1442 .align  32
1443 .Lsqr4x_outer:                          # comments apply to $num==6 case
1444         mov     -32($aptr,$i),$a0       # a[0]
1445         lea     48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
1446         mov     -24($aptr,$i),%rax      # a[1]
1447         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
1448         mov     -16($aptr,$i),$ai       # a[2]
1449         mov     %rax,$a1
1450
1451         mul     $a0                     # a[1]*a[0]
1452         mov     -24($tptr,$i),$A0[0]    # t[1]
1453         add     %rax,$A0[0]             # a[1]*a[0]+t[1]
1454          mov    $ai,%rax                # a[2]
1455         adc     \$0,%rdx
1456         mov     $A0[0],-24($tptr,$i)    # t[1]
1457         mov     %rdx,$A0[1]
1458
1459         mul     $a0                     # a[2]*a[0]
1460         add     %rax,$A0[1]
1461          mov    $ai,%rax
1462         adc     \$0,%rdx
1463         add     -16($tptr,$i),$A0[1]    # a[2]*a[0]+t[2]
1464         mov     %rdx,$A0[0]
1465         adc     \$0,$A0[0]
1466         mov     $A0[1],-16($tptr,$i)    # t[2]
1467
1468         xor     $A1[0],$A1[0]
1469
1470          mov    -8($aptr,$i),$ai        # a[3]
1471         mul     $a1                     # a[2]*a[1]
1472         add     %rax,$A1[0]             # a[2]*a[1]+t[3]
1473          mov    $ai,%rax
1474         adc     \$0,%rdx
1475         add     -8($tptr,$i),$A1[0]
1476         mov     %rdx,$A1[1]
1477         adc     \$0,$A1[1]
1478
1479         mul     $a0                     # a[3]*a[0]
1480         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
1481          mov    $ai,%rax
1482         adc     \$0,%rdx
1483         add     $A1[0],$A0[0]
1484         mov     %rdx,$A0[1]
1485         adc     \$0,$A0[1]
1486         mov     $A0[0],-8($tptr,$i)     # t[3]
1487
1488         lea     ($i),$j
1489         jmp     .Lsqr4x_inner
1490
1491 .align  32
1492 .Lsqr4x_inner:
1493          mov    ($aptr,$j),$ai          # a[4]
1494         mul     $a1                     # a[3]*a[1]
1495         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
1496          mov    $ai,%rax
1497         mov     %rdx,$A1[0]
1498         adc     \$0,$A1[0]
1499         add     ($tptr,$j),$A1[1]
1500         adc     \$0,$A1[0]
1501
1502         .byte   0x67
1503         mul     $a0                     # a[4]*a[0]
1504         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
1505          mov    $ai,%rax                # a[3]
1506          mov    8($aptr,$j),$ai         # a[5]
1507         mov     %rdx,$A0[0]
1508         adc     \$0,$A0[0]
1509         add     $A1[1],$A0[1]
1510         adc     \$0,$A0[0]
1511
1512         mul     $a1                     # a[4]*a[3]
1513         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
1514         mov     $A0[1],($tptr,$j)       # t[4]
1515          mov    $ai,%rax
1516         mov     %rdx,$A1[1]
1517         adc     \$0,$A1[1]
1518         add     8($tptr,$j),$A1[0]
1519         lea     16($j),$j               # j++
1520         adc     \$0,$A1[1]
1521
1522         mul     $a0                     # a[5]*a[2]
1523         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
1524          mov    $ai,%rax
1525         adc     \$0,%rdx
1526         add     $A1[0],$A0[0]
1527         mov     %rdx,$A0[1]
1528         adc     \$0,$A0[1]
1529         mov     $A0[0],-8($tptr,$j)     # t[5], "preloaded t[1]" below
1530
1531         cmp     \$0,$j
1532         jne     .Lsqr4x_inner
1533
1534         .byte   0x67
1535         mul     $a1                     # a[5]*a[3]
1536         add     %rax,$A1[1]
1537         adc     \$0,%rdx
1538         add     $A0[1],$A1[1]
1539         adc     \$0,%rdx
1540
1541         mov     $A1[1],($tptr)          # t[6], "preloaded t[2]" below
1542         mov     %rdx,$A1[0]
1543         mov     %rdx,8($tptr)           # t[7], "preloaded t[3]" below
1544
1545         add     \$16,$i
1546         jnz     .Lsqr4x_outer
1547
1548                                         # comments apply to $num==4 case
1549         mov     -32($aptr),$a0          # a[0]
1550         lea     48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
1551         mov     -24($aptr),%rax         # a[1]
1552         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
1553         mov     -16($aptr),$ai          # a[2]
1554         mov     %rax,$a1
1555
1556         mul     $a0                     # a[1]*a[0]
1557         add     %rax,$A0[0]             # a[1]*a[0]+t[1], preloaded t[1]
1558          mov    $ai,%rax                # a[2]
1559         mov     %rdx,$A0[1]
1560         adc     \$0,$A0[1]
1561
1562         mul     $a0                     # a[2]*a[0]
1563         add     %rax,$A0[1]
1564          mov    $ai,%rax
1565          mov    $A0[0],-24($tptr)       # t[1]
1566         mov     %rdx,$A0[0]
1567         adc     \$0,$A0[0]
1568         add     $A1[1],$A0[1]           # a[2]*a[0]+t[2], preloaded t[2]
1569          mov    -8($aptr),$ai           # a[3]
1570         adc     \$0,$A0[0]
1571
1572         mul     $a1                     # a[2]*a[1]
1573         add     %rax,$A1[0]             # a[2]*a[1]+t[3], preloaded t[3]
1574          mov    $ai,%rax
1575          mov    $A0[1],-16($tptr)       # t[2]
1576         mov     %rdx,$A1[1]
1577         adc     \$0,$A1[1]
1578
1579         mul     $a0                     # a[3]*a[0]
1580         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
1581          mov    $ai,%rax
1582         mov     %rdx,$A0[1]
1583         adc     \$0,$A0[1]
1584         add     $A1[0],$A0[0]
1585         adc     \$0,$A0[1]
1586         mov     $A0[0],-8($tptr)        # t[3]
1587
1588         mul     $a1                     # a[3]*a[1]
1589         add     %rax,$A1[1]
1590          mov    -16($aptr),%rax         # a[2]
1591         adc     \$0,%rdx
1592         add     $A0[1],$A1[1]
1593         adc     \$0,%rdx
1594
1595         mov     $A1[1],($tptr)          # t[4]
1596         mov     %rdx,$A1[0]
1597         mov     %rdx,8($tptr)           # t[5]
1598
1599         mul     $ai                     # a[2]*a[3]
1600 ___
1601 {
1602 my ($shift,$carry)=($a0,$a1);
1603 my @S=(@A1,$ai,$n0);
1604 $code.=<<___;
1605          add    \$16,$i
1606          xor    $shift,$shift
1607          sub    $num,$i                 # $i=16-$num
1608          xor    $carry,$carry
1609
1610         add     $A1[0],%rax             # t[5]
1611         adc     \$0,%rdx
1612         mov     %rax,8($tptr)           # t[5]
1613         mov     %rdx,16($tptr)          # t[6]
1614         mov     $carry,24($tptr)        # t[7]
1615
1616          mov    -16($aptr,$i),%rax      # a[0]
1617         lea     48+8(%rsp),$tptr
1618          xor    $A0[0],$A0[0]           # t[0]
1619          mov    8($tptr),$A0[1]         # t[1]
1620
1621         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1622         shr     \$63,$A0[0]
1623         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1624         shr     \$63,$A0[1]
1625         or      $A0[0],$S[1]            # | t[2*i]>>63
1626          mov    16($tptr),$A0[0]        # t[2*i+2]      # prefetch
1627         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1628         mul     %rax                    # a[i]*a[i]
1629         neg     $carry                  # mov $carry,cf
1630          mov    24($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1631         adc     %rax,$S[0]
1632          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1633         mov     $S[0],($tptr)
1634         adc     %rdx,$S[1]
1635
1636         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1637          mov    $S[1],8($tptr)
1638          sbb    $carry,$carry           # mov cf,$carry
1639         shr     \$63,$A0[0]
1640         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1641         shr     \$63,$A0[1]
1642         or      $A0[0],$S[3]            # | t[2*i]>>63
1643          mov    32($tptr),$A0[0]        # t[2*i+2]      # prefetch
1644         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1645         mul     %rax                    # a[i]*a[i]
1646         neg     $carry                  # mov $carry,cf
1647          mov    40($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1648         adc     %rax,$S[2]
1649          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1650         mov     $S[2],16($tptr)
1651         adc     %rdx,$S[3]
1652         lea     16($i),$i
1653         mov     $S[3],24($tptr)
1654         sbb     $carry,$carry           # mov cf,$carry
1655         lea     64($tptr),$tptr
1656         jmp     .Lsqr4x_shift_n_add
1657
1658 .align  32
1659 .Lsqr4x_shift_n_add:
1660         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1661         shr     \$63,$A0[0]
1662         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1663         shr     \$63,$A0[1]
1664         or      $A0[0],$S[1]            # | t[2*i]>>63
1665          mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
1666         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1667         mul     %rax                    # a[i]*a[i]
1668         neg     $carry                  # mov $carry,cf
1669          mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1670         adc     %rax,$S[0]
1671          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1672         mov     $S[0],-32($tptr)
1673         adc     %rdx,$S[1]
1674
1675         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1676          mov    $S[1],-24($tptr)
1677          sbb    $carry,$carry           # mov cf,$carry
1678         shr     \$63,$A0[0]
1679         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1680         shr     \$63,$A0[1]
1681         or      $A0[0],$S[3]            # | t[2*i]>>63
1682          mov    0($tptr),$A0[0]         # t[2*i+2]      # prefetch
1683         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1684         mul     %rax                    # a[i]*a[i]
1685         neg     $carry                  # mov $carry,cf
1686          mov    8($tptr),$A0[1]         # t[2*i+2+1]    # prefetch
1687         adc     %rax,$S[2]
1688          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1689         mov     $S[2],-16($tptr)
1690         adc     %rdx,$S[3]
1691
1692         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1693          mov    $S[3],-8($tptr)
1694          sbb    $carry,$carry           # mov cf,$carry
1695         shr     \$63,$A0[0]
1696         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1697         shr     \$63,$A0[1]
1698         or      $A0[0],$S[1]            # | t[2*i]>>63
1699          mov    16($tptr),$A0[0]        # t[2*i+2]      # prefetch
1700         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1701         mul     %rax                    # a[i]*a[i]
1702         neg     $carry                  # mov $carry,cf
1703          mov    24($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1704         adc     %rax,$S[0]
1705          mov    8($aptr,$i),%rax        # a[i+1]        # prefetch
1706         mov     $S[0],0($tptr)
1707         adc     %rdx,$S[1]
1708
1709         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1710          mov    $S[1],8($tptr)
1711          sbb    $carry,$carry           # mov cf,$carry
1712         shr     \$63,$A0[0]
1713         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1714         shr     \$63,$A0[1]
1715         or      $A0[0],$S[3]            # | t[2*i]>>63
1716          mov    32($tptr),$A0[0]        # t[2*i+2]      # prefetch
1717         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1718         mul     %rax                    # a[i]*a[i]
1719         neg     $carry                  # mov $carry,cf
1720          mov    40($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1721         adc     %rax,$S[2]
1722          mov    16($aptr,$i),%rax       # a[i+1]        # prefetch
1723         mov     $S[2],16($tptr)
1724         adc     %rdx,$S[3]
1725         mov     $S[3],24($tptr)
1726         sbb     $carry,$carry           # mov cf,$carry
1727         lea     64($tptr),$tptr
1728         add     \$32,$i
1729         jnz     .Lsqr4x_shift_n_add
1730
1731         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1732         .byte   0x67
1733         shr     \$63,$A0[0]
1734         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1735         shr     \$63,$A0[1]
1736         or      $A0[0],$S[1]            # | t[2*i]>>63
1737          mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
1738         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1739         mul     %rax                    # a[i]*a[i]
1740         neg     $carry                  # mov $carry,cf
1741          mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1742         adc     %rax,$S[0]
1743          mov    -8($aptr),%rax          # a[i+1]        # prefetch
1744         mov     $S[0],-32($tptr)
1745         adc     %rdx,$S[1]
1746
1747         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1748          mov    $S[1],-24($tptr)
1749          sbb    $carry,$carry           # mov cf,$carry
1750         shr     \$63,$A0[0]
1751         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1752         shr     \$63,$A0[1]
1753         or      $A0[0],$S[3]            # | t[2*i]>>63
1754         mul     %rax                    # a[i]*a[i]
1755         neg     $carry                  # mov $carry,cf
1756         adc     %rax,$S[2]
1757         adc     %rdx,$S[3]
1758         mov     $S[2],-16($tptr)
1759         mov     $S[3],-8($tptr)
1760 ___
1761 }\f
1762 ######################################################################
1763 # Montgomery reduction part, "word-by-word" algorithm.
1764 #
1765 # This new path is inspired by multiple submissions from Intel, by
1766 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1767 # Vinodh Gopal...
1768 {
1769 my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1770
1771 $code.=<<___;           # word-by-word reduction of t[], 8 words of n[] per pass
1772         movq    %xmm2,$nptr             # reload $nptr when falling through from above
1773 __bn_sqr8x_reduction:
1774         xor     %rax,%rax               # top-most carry = 0
1775         lea     ($nptr,$num),%rcx       # end of n[]
1776         lea     48+8(%rsp,$num,2),%rdx  # end of t[] buffer
1777         mov     %rcx,0+8(%rsp)          # save end of n[]
1778         lea     48+8(%rsp,$num),$tptr   # end of initial t[] window
1779         mov     %rdx,8+8(%rsp)          # save end of t[]
1780         neg     $num                    # negated, added to $tptr below
1781         jmp     .L8x_reduction_loop
1782
1783 .align  32
1784 .L8x_reduction_loop:
1785         lea     ($tptr,$num),$tptr      # start of current t[] window
1786         .byte   0x66
1787         mov     8*0($tptr),$m0
1788         mov     8*1($tptr),%r9
1789         mov     8*2($tptr),%r10
1790         mov     8*3($tptr),%r11
1791         mov     8*4($tptr),%r12
1792         mov     8*5($tptr),%r13
1793         mov     8*6($tptr),%r14
1794         mov     8*7($tptr),%r15
1795         mov     %rax,(%rdx)             # store top-most carry bit
1796         lea     8*8($tptr),$tptr
1797
1798         .byte   0x67
1799         mov     $m0,%r8
1800         imulq   32+8(%rsp),$m0          # n0*a[0]
1801         mov     8*0($nptr),%rax         # n[0]
1802         mov     \$8,%ecx                # 8 iterations of .L8x_reduce
1803         jmp     .L8x_reduce
1804
1805 .align  32
1806 .L8x_reduce:
1807         mulq    $m0
1808          mov    8*1($nptr),%rax         # n[1]
1809         neg     %r8                     # cf=(%r8!=0), %r8 is overwritten next
1810         mov     %rdx,%r8
1811         adc     \$0,%r8
1812
1813         mulq    $m0
1814         add     %rax,%r9
1815          mov    8*2($nptr),%rax
1816         adc     \$0,%rdx
1817         add     %r9,%r8
1818          mov    $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
1819         mov     %rdx,%r9
1820         adc     \$0,%r9
1821
1822         mulq    $m0
1823         add     %rax,%r10
1824          mov    8*3($nptr),%rax
1825         adc     \$0,%rdx
1826         add     %r10,%r9
1827          mov    32+8(%rsp),$carry       # pull n0, borrow $carry
1828         mov     %rdx,%r10
1829         adc     \$0,%r10
1830
1831         mulq    $m0
1832         add     %rax,%r11
1833          mov    8*4($nptr),%rax
1834         adc     \$0,%rdx
1835          imulq  %r8,$carry              # modulo-scheduled
1836         add     %r11,%r10
1837         mov     %rdx,%r11
1838         adc     \$0,%r11
1839
1840         mulq    $m0
1841         add     %rax,%r12
1842          mov    8*5($nptr),%rax
1843         adc     \$0,%rdx
1844         add     %r12,%r11
1845         mov     %rdx,%r12
1846         adc     \$0,%r12
1847
1848         mulq    $m0
1849         add     %rax,%r13
1850          mov    8*6($nptr),%rax
1851         adc     \$0,%rdx
1852         add     %r13,%r12
1853         mov     %rdx,%r13
1854         adc     \$0,%r13
1855
1856         mulq    $m0
1857         add     %rax,%r14
1858          mov    8*7($nptr),%rax
1859         adc     \$0,%rdx
1860         add     %r14,%r13
1861         mov     %rdx,%r14
1862         adc     \$0,%r14
1863
1864         mulq    $m0
1865          mov    $carry,$m0              # n0*a[i]
1866         add     %rax,%r15
1867          mov    8*0($nptr),%rax         # n[0]
1868         adc     \$0,%rdx
1869         add     %r15,%r14
1870         mov     %rdx,%r15
1871         adc     \$0,%r15
1872
1873         dec     %ecx
1874         jnz     .L8x_reduce
1875
1876         lea     8*8($nptr),$nptr        # next 8 words of n[]
1877         xor     %rax,%rax
1878         mov     8+8(%rsp),%rdx          # pull end of t[]
1879         cmp     0+8(%rsp),$nptr         # end of n[]?
1880         jae     .L8x_no_tail            # taken with cf=0
1881
1882         .byte   0x66
1883         add     8*0($tptr),%r8
1884         adc     8*1($tptr),%r9
1885         adc     8*2($tptr),%r10
1886         adc     8*3($tptr),%r11
1887         adc     8*4($tptr),%r12
1888         adc     8*5($tptr),%r13
1889         adc     8*6($tptr),%r14
1890         adc     8*7($tptr),%r15
1891         sbb     $carry,$carry           # top carry
1892
1893         mov     48+56+8(%rsp),$m0       # pull n0*a[0]
1894         mov     \$8,%ecx                # 8 iterations of .L8x_tail
1895         mov     8*0($nptr),%rax
1896         jmp     .L8x_tail
1897
1898 .align  32
1899 .L8x_tail:
1900         mulq    $m0
1901         add     %rax,%r8
1902          mov    8*1($nptr),%rax
1903          mov    %r8,($tptr)             # save result
1904         mov     %rdx,%r8
1905         adc     \$0,%r8
1906
1907         mulq    $m0
1908         add     %rax,%r9
1909          mov    8*2($nptr),%rax
1910         adc     \$0,%rdx
1911         add     %r9,%r8
1912          lea    8($tptr),$tptr          # $tptr++
1913         mov     %rdx,%r9
1914         adc     \$0,%r9
1915
1916         mulq    $m0
1917         add     %rax,%r10
1918          mov    8*3($nptr),%rax
1919         adc     \$0,%rdx
1920         add     %r10,%r9
1921         mov     %rdx,%r10
1922         adc     \$0,%r10
1923
1924         mulq    $m0
1925         add     %rax,%r11
1926          mov    8*4($nptr),%rax
1927         adc     \$0,%rdx
1928         add     %r11,%r10
1929         mov     %rdx,%r11
1930         adc     \$0,%r11
1931
1932         mulq    $m0
1933         add     %rax,%r12
1934          mov    8*5($nptr),%rax
1935         adc     \$0,%rdx
1936         add     %r12,%r11
1937         mov     %rdx,%r12
1938         adc     \$0,%r12
1939
1940         mulq    $m0
1941         add     %rax,%r13
1942          mov    8*6($nptr),%rax
1943         adc     \$0,%rdx
1944         add     %r13,%r12
1945         mov     %rdx,%r13
1946         adc     \$0,%r13
1947
1948         mulq    $m0
1949         add     %rax,%r14
1950          mov    8*7($nptr),%rax
1951         adc     \$0,%rdx
1952         add     %r14,%r13
1953         mov     %rdx,%r14
1954         adc     \$0,%r14
1955
1956         mulq    $m0
1957          mov    48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
1958         add     %rax,%r15
1959         adc     \$0,%rdx
1960         add     %r15,%r14
1961          mov    8*0($nptr),%rax         # pull n[0]
1962         mov     %rdx,%r15
1963         adc     \$0,%r15
1964
1965         dec     %ecx
1966         jnz     .L8x_tail
1967
1968         lea     8*8($nptr),$nptr
1969         mov     8+8(%rsp),%rdx          # pull end of t[]
1970         cmp     0+8(%rsp),$nptr         # end of n[]?
1971         jae     .L8x_tail_done          # break out of loop
1972
1973          mov    48+56+8(%rsp),$m0       # pull n0*a[0]
1974         neg     $carry                  # mov $carry,cf
1975          mov    8*0($nptr),%rax         # pull n[0]
1976         adc     8*0($tptr),%r8
1977         adc     8*1($tptr),%r9
1978         adc     8*2($tptr),%r10
1979         adc     8*3($tptr),%r11
1980         adc     8*4($tptr),%r12
1981         adc     8*5($tptr),%r13
1982         adc     8*6($tptr),%r14
1983         adc     8*7($tptr),%r15
1984         sbb     $carry,$carry           # top carry
1985
1986         mov     \$8,%ecx
1987         jmp     .L8x_tail
1988
1989 .align  32
1990 .L8x_tail_done:
1991         xor     %rax,%rax
1992         add     (%rdx),%r8              # can this overflow?
1993         adc     \$0,%r9
1994         adc     \$0,%r10
1995         adc     \$0,%r11
1996         adc     \$0,%r12
1997         adc     \$0,%r13
1998         adc     \$0,%r14
1999         adc     \$0,%r15
2000         adc     \$0,%rax
2001
2002         neg     $carry                  # mov $carry,cf
2003 .L8x_no_tail:
2004         adc     8*0($tptr),%r8
2005         adc     8*1($tptr),%r9
2006         adc     8*2($tptr),%r10
2007         adc     8*3($tptr),%r11
2008         adc     8*4($tptr),%r12
2009         adc     8*5($tptr),%r13
2010         adc     8*6($tptr),%r14
2011         adc     8*7($tptr),%r15
2012         adc     \$0,%rax                # top-most carry
2013          mov    -8($nptr),%rcx          # np[num-1]
2014          xor    $carry,$carry
2015
2016         movq    %xmm2,$nptr             # restore $nptr
2017
2018         mov     %r8,8*0($tptr)          # store top 512 bits
2019         mov     %r9,8*1($tptr)
2020          movq   %xmm3,$num              # $num is %r9, can't be moved upwards
2021         mov     %r10,8*2($tptr)
2022         mov     %r11,8*3($tptr)
2023         mov     %r12,8*4($tptr)
2024         mov     %r13,8*5($tptr)
2025         mov     %r14,8*6($tptr)
2026         mov     %r15,8*7($tptr)
2027         lea     8*8($tptr),$tptr
2028
2029         cmp     %rdx,$tptr              # end of t[]?
2030         jb      .L8x_reduction_loop
2031         ret
2032 .size   bn_sqr8x_internal,.-bn_sqr8x_internal
2033 ___
2034 }\f
2035 ##############################################################
2036 # Post-condition, 4x unrolled
2037 # Copies t[] to $rptr; %rax (0 or 1 on entry) selects whether n[] is subtracted.
2038 {
2039 my ($tptr,$nptr)=("%rbx","%rbp");
2040 $code.=<<___;
2041 .type   __bn_post4x_internal,\@abi-omnipotent
2042 .align  32
2043 __bn_post4x_internal:
2044         mov     8*0($nptr),%r12
2045         lea     (%rdi,$num),$tptr       # %rdi was $tptr above
2046         mov     $num,%rcx
2047         movq    %xmm1,$rptr             # restore $rptr
2048         neg     %rax                    # 0/1 -> 0/-1 mask applied to ~n[]
2049         movq    %xmm1,$aptr             # prepare for back-to-back call
2050         sar     \$3+2,%rcx              # 4 words (32 bytes) per iteration
2051         dec     %r12                    # so that after 'not' we get -n[0]
2052         xor     %r10,%r10               # carry-in = 0
2053         mov     8*1($nptr),%r13
2054         mov     8*2($nptr),%r14
2055         mov     8*3($nptr),%r15
2056         jmp     .Lsqr4x_sub_entry
2057
2058 .align  16
2059 .Lsqr4x_sub:
2060         mov     8*0($nptr),%r12
2061         mov     8*1($nptr),%r13
2062         mov     8*2($nptr),%r14
2063         mov     8*3($nptr),%r15
2064 .Lsqr4x_sub_entry:
2065         lea     8*4($nptr),$nptr
2066         not     %r12
2067         not     %r13
2068         not     %r14
2069         not     %r15
2070         and     %rax,%r12
2071         and     %rax,%r13
2072         and     %rax,%r14
2073         and     %rax,%r15
2074
2075         neg     %r10                    # mov %r10,%cf
2076         adc     8*0($tptr),%r12
2077         adc     8*1($tptr),%r13
2078         adc     8*2($tptr),%r14
2079         adc     8*3($tptr),%r15
2080         mov     %r12,8*0($rptr)
2081         lea     8*4($tptr),$tptr
2082         mov     %r13,8*1($rptr)
2083         sbb     %r10,%r10               # mov %cf,%r10
2084         mov     %r14,8*2($rptr)
2085         mov     %r15,8*3($rptr)
2086         lea     8*4($rptr),$rptr
2087
2088         inc     %rcx                    # pass %cf
2089         jnz     .Lsqr4x_sub
2090
2091         mov     $num,%r10               # prepare for back-to-back call
2092         neg     $num                    # restore $num
2093         ret
2094 .size   __bn_post4x_internal,.-__bn_post4x_internal
2095 ___
2096 }
2097 {
2098 $code.=<<___;
2099 .globl  bn_from_montgomery
2100 .type   bn_from_montgomery,\@abi-omnipotent
2101 .align  32
2102 bn_from_montgomery:
2103         testl   \$7,`($win64?"48(%rsp)":"%r9d")`        # 6th argument modulo 8
2104         jz      bn_from_mont8x          # multiple of 8: take the 8x path
2105         xor     %eax,%eax               # otherwise return 0
2106         ret
2107 .size   bn_from_montgomery,.-bn_from_montgomery
2108
2109 .type   bn_from_mont8x,\@function,6
2110 .align  32
2111 bn_from_mont8x:
2112 .cfi_startproc
2113         .byte   0x67
2114         mov     %rsp,%rax               # keep original %rsp
2115 .cfi_def_cfa_register   %rax
2116         push    %rbx
2117 .cfi_push       %rbx
2118         push    %rbp
2119 .cfi_push       %rbp
2120         push    %r12
2121 .cfi_push       %r12
2122         push    %r13
2123 .cfi_push       %r13
2124         push    %r14
2125 .cfi_push       %r14
2126         push    %r15
2127 .cfi_push       %r15
2128 .Lfrom_prologue:
2129
2130         shl     \$3,${num}d             # convert $num to bytes
2131         lea     ($num,$num,2),%r10      # 3*$num in bytes
2132         neg     $num
2133         mov     ($n0),$n0               # *n0
2134
2135         ##############################################################
2136         # Ensure that stack frame doesn't alias with $rptr+3*$num
2137         # modulo 4096, which covers ret[num], am[num] and n[num]
2138         # (see bn_exp.c). The stack is allocated so as to align with
2139         # bn_power5's frame, and as bn_from_montgomery happens to be
2140         # the last operation, we use the opportunity to cleanse it.
2141         #
2142         lea     -320(%rsp,$num,2),%r11
2143         mov     %rsp,%rbp
2144         sub     $rptr,%r11
2145         and     \$4095,%r11
2146         cmp     %r11,%r10
2147         jb      .Lfrom_sp_alt
2148         sub     %r11,%rbp               # align with $aptr
2149         lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
2150         jmp     .Lfrom_sp_done
2151
2152 .align  32
2153 .Lfrom_sp_alt:
2154         lea     4096-320(,$num,2),%r10
2155         lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
2156         sub     %r10,%r11
2157         mov     \$0,%r10
2158         cmovc   %r10,%r11
2159         sub     %r11,%rbp
2160 .Lfrom_sp_done:
2161         and     \$-64,%rbp              # ensure alignment
2162         mov     %rsp,%r11
2163         sub     %rbp,%r11
2164         and     \$-4096,%r11
2165         lea     (%rbp,%r11),%rsp
2166         mov     (%rsp),%r10             # touch stack page
2167         cmp     %rbp,%rsp
2168         ja      .Lfrom_page_walk
2169         jmp     .Lfrom_page_walk_done
2170
2171 .Lfrom_page_walk:
2172         lea     -4096(%rsp),%rsp
2173         mov     (%rsp),%r10             # touch stack page
2174         cmp     %rbp,%rsp
2175         ja      .Lfrom_page_walk
2176 .Lfrom_page_walk_done:
2177
2178         mov     $num,%r10               # %r10 keeps the negated $num
2179         neg     $num                    # $num is positive again
2180
2181         ##############################################################
2182         # Stack layout
2183         #
2184         # +0    saved $num, used in reduction section
2185         # +8    &t[2*$num], used in reduction section
2186         # +32   saved *n0
2187         # +40   saved %rsp
2188         # +48   t[2*$num]
2189         #
2190         mov     $n0,  32(%rsp)
2191         mov     %rax, 40(%rsp)          # save original %rsp
2192 .cfi_cfa_expression     %rsp+40,deref,+8
2193 .Lfrom_body:
2194         mov     $num,%r11               # byte counter for the copy loop
2195         lea     48(%rsp),%rax
2196         pxor    %xmm0,%xmm0             # zero
2197         jmp     .Lmul_by_1
2198
2199 .align  32
2200 .Lmul_by_1:
2201         movdqu  ($aptr),%xmm1           # copy 64 bytes from $aptr to (%rax) and
2202         movdqu  16($aptr),%xmm2         # zero 64 bytes at (%rax,$num) per iteration
2203         movdqu  32($aptr),%xmm3
2204         movdqa  %xmm0,(%rax,$num)
2205         movdqu  48($aptr),%xmm4
2206         movdqa  %xmm0,16(%rax,$num)
2207         .byte   0x48,0x8d,0xb6,0x40,0x00,0x00,0x00      # lea   64($aptr),$aptr
2208         movdqa  %xmm1,(%rax)
2209         movdqa  %xmm0,32(%rax,$num)
2210         movdqa  %xmm2,16(%rax)
2211         movdqa  %xmm0,48(%rax,$num)
2212         movdqa  %xmm3,32(%rax)
2213         movdqa  %xmm4,48(%rax)
2214         lea     64(%rax),%rax
2215         sub     \$64,%r11
2216         jnz     .Lmul_by_1
2217
2218         movq    $rptr,%xmm1             # stash $rptr
2219         movq    $nptr,%xmm2             # stash $nptr
2220         .byte   0x67
2221         mov     $nptr,%rbp
2222         movq    %r10, %xmm3             # -num
2223 ___
2224 $code.=<<___ if ($addx);
2225         mov     OPENSSL_ia32cap_P+8(%rip),%r11d
2226         and     \$0x80108,%r11d
2227         cmp     \$0x80108,%r11d         # check for AD*X+BMI2+BMI1
2228         jne     .Lfrom_mont_nox
2229
2230         lea     (%rax,$num),$rptr
2231         call    __bn_sqrx8x_reduction
2232         call    __bn_postx4x_internal
2233
2234         pxor    %xmm0,%xmm0             # zero, used to wipe the frame below
2235         lea     48(%rsp),%rax
2236         jmp     .Lfrom_mont_zero
2237
2238 .align  32
2239 .Lfrom_mont_nox:
2240 ___
2241 $code.=<<___;
2242         call    __bn_sqr8x_reduction
2243         call    __bn_post4x_internal
2244
2245         pxor    %xmm0,%xmm0             # zero, used to wipe the frame below
2246         lea     48(%rsp),%rax
2247         jmp     .Lfrom_mont_zero
2248
2249 .align  32
2250 .Lfrom_mont_zero:
2251         mov     40(%rsp),%rsi           # restore %rsp
2252 .cfi_def_cfa    %rsi,8
2253         movdqa  %xmm0,16*0(%rax)        # wipe 64 bytes per iteration
2254         movdqa  %xmm0,16*1(%rax)
2255         movdqa  %xmm0,16*2(%rax)
2256         movdqa  %xmm0,16*3(%rax)
2257         lea     16*4(%rax),%rax
2258         sub     \$32,$num               # i.e. 2*$num bytes in total
2259         jnz     .Lfrom_mont_zero
2260
2261         mov     \$1,%rax                # return 1
2262         mov     -48(%rsi),%r15
2263 .cfi_restore    %r15
2264         mov     -40(%rsi),%r14
2265 .cfi_restore    %r14
2266         mov     -32(%rsi),%r13
2267 .cfi_restore    %r13
2268         mov     -24(%rsi),%r12
2269 .cfi_restore    %r12
2270         mov     -16(%rsi),%rbp
2271 .cfi_restore    %rbp
2272         mov     -8(%rsi),%rbx
2273 .cfi_restore    %rbx
2274         lea     (%rsi),%rsp
2275 .cfi_def_cfa_register   %rsp
2276 .Lfrom_epilogue:
2277         ret
2278 .cfi_endproc
2279 .size   bn_from_mont8x,.-bn_from_mont8x
2280 ___
2281 }
2282 }}}
2283 \f
2284 if ($addx) {{{
2285 my $bp="%rdx";  # restore original value
2286
2287 $code.=<<___;
2288 .type   bn_mulx4x_mont_gather5,\@function,6
2289 .align  32
2290 bn_mulx4x_mont_gather5:
2291 .cfi_startproc
2292         mov     %rsp,%rax               # keep original %rsp
2293 .cfi_def_cfa_register   %rax
2294 .Lmulx4x_enter:
2295         push    %rbx
2296 .cfi_push       %rbx
2297         push    %rbp
2298 .cfi_push       %rbp
2299         push    %r12
2300 .cfi_push       %r12
2301         push    %r13
2302 .cfi_push       %r13
2303         push    %r14
2304 .cfi_push       %r14
2305         push    %r15
2306 .cfi_push       %r15
2307 .Lmulx4x_prologue:
2308
2309         shl     \$3,${num}d             # convert $num to bytes
2310         lea     ($num,$num,2),%r10      # 3*$num in bytes
2311         neg     $num                    # -$num
2312         mov     ($n0),$n0               # *n0
2313
2314         ##############################################################
2315         # Ensure that stack frame doesn't alias with $rptr+3*$num
2316         # modulo 4096, which covers ret[num], am[num] and n[num]
2317         # (see bn_exp.c). This is done to allow memory disambiguation
2318         # logic to do its magic. [Extra [num] is allocated in order
2319         # to align with bn_power5's frame, which is cleansed after
2320         # completing exponentiation. Extra 256 bytes is for power mask
2321         # calculated from 7th argument, the index.]
2322         #
2323         lea     -320(%rsp,$num,2),%r11
2324         mov     %rsp,%rbp
2325         sub     $rp,%r11
2326         and     \$4095,%r11
2327         cmp     %r11,%r10
2328         jb      .Lmulx4xsp_alt
2329         sub     %r11,%rbp               # align with $aptr
2330         lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
2331         jmp     .Lmulx4xsp_done
2332
2333 .Lmulx4xsp_alt:
2334         lea     4096-320(,$num,2),%r10
2335         lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
2336         sub     %r10,%r11
2337         mov     \$0,%r10
2338         cmovc   %r10,%r11
2339         sub     %r11,%rbp
2340 .Lmulx4xsp_done:
2341         and     \$-64,%rbp              # ensure alignment
2342         mov     %rsp,%r11
2343         sub     %rbp,%r11
2344         and     \$-4096,%r11
2345         lea     (%rbp,%r11),%rsp
2346         mov     (%rsp),%r10             # touch stack page
2347         cmp     %rbp,%rsp
2348         ja      .Lmulx4x_page_walk
2349         jmp     .Lmulx4x_page_walk_done
2350
2351 .Lmulx4x_page_walk:
2352         lea     -4096(%rsp),%rsp
2353         mov     (%rsp),%r10             # touch stack page
2354         cmp     %rbp,%rsp
2355         ja      .Lmulx4x_page_walk
2356 .Lmulx4x_page_walk_done:
2357
2358         ##############################################################
2359         # Stack layout
2360         # +0    -num
2361         # +8    off-loaded &b[i]
2362         # +16   end of b[num]
2363         # +24   inner counter
2364         # +32   saved n0
2365         # +40   saved %rsp
2366         # +48
2367         # +56   saved rp
2368         # +64   tmp[num+1]
2369         #
2370         mov     $n0, 32(%rsp)           # save *n0
2371         mov     %rax,40(%rsp)           # save original %rsp
2372 .cfi_cfa_expression     %rsp+40,deref,+8
2373 .Lmulx4x_body:
2374         call    mulx4x_internal
2375
2376         mov     40(%rsp),%rsi           # restore %rsp
2377 .cfi_def_cfa    %rsi,8
2378         mov     \$1,%rax                # return 1
2379
2380         mov     -48(%rsi),%r15
2381 .cfi_restore    %r15
2382         mov     -40(%rsi),%r14
2383 .cfi_restore    %r14
2384         mov     -32(%rsi),%r13
2385 .cfi_restore    %r13
2386         mov     -24(%rsi),%r12
2387 .cfi_restore    %r12
2388         mov     -16(%rsi),%rbp
2389 .cfi_restore    %rbp
2390         mov     -8(%rsi),%rbx
2391 .cfi_restore    %rbx
2392         lea     (%rsi),%rsp
2393 .cfi_def_cfa_register   %rsp
2394 .Lmulx4x_epilogue:
2395         ret
2396 .cfi_endproc
2397 .size   bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2398
2399 .type   mulx4x_internal,\@abi-omnipotent
2400 .align  32
2401 mulx4x_internal:
2402         mov     $num,8(%rsp)            # save -$num (it was in bytes)
2403         mov     $num,%r10
2404         neg     $num                    # restore $num
2405         shl     \$5,$num
2406         neg     %r10                    # restore $num
2407         lea     128($bp,$num),%r13      # end of powers table (+size optimization)
2408         shr     \$5+5,$num
2409         movd    `($win64?56:8)`(%rax),%xmm5     # load 7th argument
2410         sub     \$1,$num
2411         lea     .Linc(%rip),%rax
2412         mov     %r13,16+8(%rsp)         # end of b[num]
2413         mov     $num,24+8(%rsp)         # inner counter
2414         mov     $rp, 56+8(%rsp)         # save $rp
2415 ___
2416 my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
2417    ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2418 my $rptr=$bptr;
2419 my $STRIDE=2**5*8;              # 5 is "window size"
2420 my $N=$STRIDE/4;                # should match cache line size
2421 $code.=<<___;
2422         movdqa  0(%rax),%xmm0           # 00000001000000010000000000000000
2423         movdqa  16(%rax),%xmm1          # 00000002000000020000000200000002
2424         lea     88-112(%rsp,%r10),%r10  # place the mask after tp[num+1] (+ICache optimization)
2425         lea     128($bp),$bptr          # size optimization
2426
2427         pshufd  \$0,%xmm5,%xmm5         # broadcast index
2428         movdqa  %xmm1,%xmm4             # keep a copy of the increment
2429         .byte   0x67
2430         movdqa  %xmm1,%xmm2
2431 ___
2432 ########################################################################
2433 # calculate mask by comparing 0..31 to index and save result to stack
2434 #
2435 $code.=<<___;
2436         .byte   0x67
2437         paddd   %xmm0,%xmm1
2438         pcmpeqd %xmm5,%xmm0             # compare to 1,0
2439         movdqa  %xmm4,%xmm3
2440 ___
2441 for($i=0;$i<$STRIDE/16-4;$i+=4) {
2442 $code.=<<___;
2443         paddd   %xmm1,%xmm2
2444         pcmpeqd %xmm5,%xmm1             # compare to 3,2
2445         movdqa  %xmm0,`16*($i+0)+112`(%r10)
2446         movdqa  %xmm4,%xmm0
2447
2448         paddd   %xmm2,%xmm3
2449         pcmpeqd %xmm5,%xmm2             # compare to 5,4
2450         movdqa  %xmm1,`16*($i+1)+112`(%r10)
2451         movdqa  %xmm4,%xmm1
2452
2453         paddd   %xmm3,%xmm0
2454         pcmpeqd %xmm5,%xmm3             # compare to 7,6
2455         movdqa  %xmm2,`16*($i+2)+112`(%r10)
2456         movdqa  %xmm4,%xmm2
2457
2458         paddd   %xmm0,%xmm1
2459         pcmpeqd %xmm5,%xmm0
2460         movdqa  %xmm3,`16*($i+3)+112`(%r10)
2461         movdqa  %xmm4,%xmm3
2462 ___
2463 }
2464 $code.=<<___;                           # last iteration can be optimized
2465         .byte   0x67
2466         paddd   %xmm1,%xmm2
2467         pcmpeqd %xmm5,%xmm1
2468         movdqa  %xmm0,`16*($i+0)+112`(%r10)
2469
2470         paddd   %xmm2,%xmm3
2471         pcmpeqd %xmm5,%xmm2
2472         movdqa  %xmm1,`16*($i+1)+112`(%r10)
2473
2474         pcmpeqd %xmm5,%xmm3
2475         movdqa  %xmm2,`16*($i+2)+112`(%r10)
2476
2477         pand    `16*($i+0)-128`($bptr),%xmm0    # while it's still in register
2478         pand    `16*($i+1)-128`($bptr),%xmm1
2479         pand    `16*($i+2)-128`($bptr),%xmm2
2480         movdqa  %xmm3,`16*($i+3)+112`(%r10)
2481         pand    `16*($i+3)-128`($bptr),%xmm3
2482         por     %xmm2,%xmm0
2483         por     %xmm3,%xmm1
2484 ___
2485 for($i=0;$i<$STRIDE/16-4;$i+=4) {
2486 $code.=<<___;
2487         movdqa  `16*($i+0)-128`($bptr),%xmm4
2488         movdqa  `16*($i+1)-128`($bptr),%xmm5
2489         movdqa  `16*($i+2)-128`($bptr),%xmm2
2490         pand    `16*($i+0)+112`(%r10),%xmm4
2491         movdqa  `16*($i+3)-128`($bptr),%xmm3
2492         pand    `16*($i+1)+112`(%r10),%xmm5
2493         por     %xmm4,%xmm0
2494         pand    `16*($i+2)+112`(%r10),%xmm2
2495         por     %xmm5,%xmm1
2496         pand    `16*($i+3)+112`(%r10),%xmm3
2497         por     %xmm2,%xmm0
2498         por     %xmm3,%xmm1
2499 ___
2500 }
2501 $code.=<<___;
2502         pxor    %xmm1,%xmm0
2503         pshufd  \$0x4e,%xmm0,%xmm1
2504         por     %xmm1,%xmm0
2505         lea     $STRIDE($bptr),$bptr
2506         movq    %xmm0,%rdx              # bp[0]
2507         lea     64+8*4+8(%rsp),$tptr
2508
2509         mov     %rdx,$bi
2510         mulx    0*8($aptr),$mi,%rax     # a[0]*b[0]
2511         mulx    1*8($aptr),%r11,%r12    # a[1]*b[0]
2512         add     %rax,%r11
2513         mulx    2*8($aptr),%rax,%r13    # ...
2514         adc     %rax,%r12
2515         adc     \$0,%r13
2516         mulx    3*8($aptr),%rax,%r14
2517
2518         mov     $mi,%r15
2519         imulq   32+8(%rsp),$mi          # "t[0]"*n0
2520         xor     $zero,$zero             # cf=0, of=0
2521         mov     $mi,%rdx
2522
2523         mov     $bptr,8+8(%rsp)         # off-load &b[i]
2524
2525         lea     4*8($aptr),$aptr
2526         adcx    %rax,%r13
2527         adcx    $zero,%r14              # cf=0
2528
2529         mulx    0*8($nptr),%rax,%r10
2530         adcx    %rax,%r15               # discarded
2531         adox    %r11,%r10
2532         mulx    1*8($nptr),%rax,%r11
2533         adcx    %rax,%r10
2534         adox    %r12,%r11
2535         mulx    2*8($nptr),%rax,%r12
2536         mov     24+8(%rsp),$bptr        # counter value
2537         mov     %r10,-8*4($tptr)
2538         adcx    %rax,%r11
2539         adox    %r13,%r12
2540         mulx    3*8($nptr),%rax,%r15
2541          mov    $bi,%rdx
2542         mov     %r11,-8*3($tptr)
2543         adcx    %rax,%r12
2544         adox    $zero,%r15              # of=0
2545         lea     4*8($nptr),$nptr
2546         mov     %r12,-8*2($tptr)
2547         jmp     .Lmulx4x_1st
2548
2549 .align  32
2550 .Lmulx4x_1st:
2551         adcx    $zero,%r15              # cf=0, modulo-scheduled
2552         mulx    0*8($aptr),%r10,%rax    # a[4]*b[0]
2553         adcx    %r14,%r10
2554         mulx    1*8($aptr),%r11,%r14    # a[5]*b[0]
2555         adcx    %rax,%r11
2556         mulx    2*8($aptr),%r12,%rax    # ...
2557         adcx    %r14,%r12
2558         mulx    3*8($aptr),%r13,%r14
2559          .byte  0x67,0x67
2560          mov    $mi,%rdx
2561         adcx    %rax,%r13
2562         adcx    $zero,%r14              # cf=0
2563         lea     4*8($aptr),$aptr
2564         lea     4*8($tptr),$tptr
2565
2566         adox    %r15,%r10
2567         mulx    0*8($nptr),%rax,%r15
2568         adcx    %rax,%r10
2569         adox    %r15,%r11
2570         mulx    1*8($nptr),%rax,%r15
2571         adcx    %rax,%r11
2572         adox    %r15,%r12
2573         mulx    2*8($nptr),%rax,%r15
2574         mov     %r10,-5*8($tptr)
2575         adcx    %rax,%r12
2576         mov     %r11,-4*8($tptr)
2577         adox    %r15,%r13
2578         mulx    3*8($nptr),%rax,%r15
2579          mov    $bi,%rdx
2580         mov     %r12,-3*8($tptr)
2581         adcx    %rax,%r13
2582         adox    $zero,%r15
2583         lea     4*8($nptr),$nptr
2584         mov     %r13,-2*8($tptr)
2585
2586         dec     $bptr                   # of=0, pass cf
2587         jnz     .Lmulx4x_1st
2588
2589         mov     8(%rsp),$num            # load -num
2590         adc     $zero,%r15              # modulo-scheduled
2591         lea     ($aptr,$num),$aptr      # rewind $aptr
2592         add     %r15,%r14
2593         mov     8+8(%rsp),$bptr         # re-load &b[i]
2594         adc     $zero,$zero             # top-most carry
2595         mov     %r14,-1*8($tptr)
2596         jmp     .Lmulx4x_outer
2597
2598 .align  32
2599 .Lmulx4x_outer:
2600         lea     16-256($tptr),%r10      # where 256-byte mask is (+density control)
2601         pxor    %xmm4,%xmm4
2602         .byte   0x67,0x67
2603         pxor    %xmm5,%xmm5
2604 ___
2605 for($i=0;$i<$STRIDE/16;$i+=4) {
2606 $code.=<<___;
2607         movdqa  `16*($i+0)-128`($bptr),%xmm0
2608         movdqa  `16*($i+1)-128`($bptr),%xmm1
2609         movdqa  `16*($i+2)-128`($bptr),%xmm2
2610         pand    `16*($i+0)+256`(%r10),%xmm0
2611         movdqa  `16*($i+3)-128`($bptr),%xmm3
2612         pand    `16*($i+1)+256`(%r10),%xmm1
2613         por     %xmm0,%xmm4
2614         pand    `16*($i+2)+256`(%r10),%xmm2
2615         por     %xmm1,%xmm5
2616         pand    `16*($i+3)+256`(%r10),%xmm3
2617         por     %xmm2,%xmm4
2618         por     %xmm3,%xmm5
2619 ___
2620 }
2621 $code.=<<___;
2622         por     %xmm5,%xmm4
2623         pshufd  \$0x4e,%xmm4,%xmm0
2624         por     %xmm4,%xmm0
2625         lea     $STRIDE($bptr),$bptr
2626         movq    %xmm0,%rdx              # m0=bp[i]
2627
2628         mov     $zero,($tptr)           # save top-most carry
2629         lea     4*8($tptr,$num),$tptr   # rewind $tptr
2630         mulx    0*8($aptr),$mi,%r11     # a[0]*b[i]
2631         xor     $zero,$zero             # cf=0, of=0
2632         mov     %rdx,$bi
2633         mulx    1*8($aptr),%r14,%r12    # a[1]*b[i]
2634         adox    -4*8($tptr),$mi         # +t[0]
2635         adcx    %r14,%r11
2636         mulx    2*8($aptr),%r15,%r13    # ...
2637         adox    -3*8($tptr),%r11
2638         adcx    %r15,%r12
2639         mulx    3*8($aptr),%rdx,%r14
2640         adox    -2*8($tptr),%r12
2641         adcx    %rdx,%r13
2642         lea     ($nptr,$num),$nptr      # rewind $nptr
2643         lea     4*8($aptr),$aptr
2644         adox    -1*8($tptr),%r13
2645         adcx    $zero,%r14
2646         adox    $zero,%r14
2647
2648         mov     $mi,%r15
2649         imulq   32+8(%rsp),$mi          # "t[0]"*n0
2650
2651         mov     $mi,%rdx
2652         xor     $zero,$zero             # cf=0, of=0
2653         mov     $bptr,8+8(%rsp)         # off-load &b[i]
2654
2655         mulx    0*8($nptr),%rax,%r10
2656         adcx    %rax,%r15               # discarded
2657         adox    %r11,%r10
2658         mulx    1*8($nptr),%rax,%r11
2659         adcx    %rax,%r10
2660         adox    %r12,%r11
2661         mulx    2*8($nptr),%rax,%r12
2662         adcx    %rax,%r11
2663         adox    %r13,%r12
2664         mulx    3*8($nptr),%rax,%r15
2665          mov    $bi,%rdx
2666         mov     24+8(%rsp),$bptr        # counter value
2667         mov     %r10,-8*4($tptr)
2668         adcx    %rax,%r12
2669         mov     %r11,-8*3($tptr)
2670         adox    $zero,%r15              # of=0
2671         mov     %r12,-8*2($tptr)
2672         lea     4*8($nptr),$nptr
2673         jmp     .Lmulx4x_inner
2674
2675 .align  32
2676 .Lmulx4x_inner:
2677         mulx    0*8($aptr),%r10,%rax    # a[4]*b[i]
2678         adcx    $zero,%r15              # cf=0, modulo-scheduled
2679         adox    %r14,%r10
2680         mulx    1*8($aptr),%r11,%r14    # a[5]*b[i]
2681         adcx    0*8($tptr),%r10
2682         adox    %rax,%r11
2683         mulx    2*8($aptr),%r12,%rax    # ...
2684         adcx    1*8($tptr),%r11
2685         adox    %r14,%r12
2686         mulx    3*8($aptr),%r13,%r14
2687          mov    $mi,%rdx
2688         adcx    2*8($tptr),%r12
2689         adox    %rax,%r13
2690         adcx    3*8($tptr),%r13
2691         adox    $zero,%r14              # of=0
2692         lea     4*8($aptr),$aptr
2693         lea     4*8($tptr),$tptr
2694         adcx    $zero,%r14              # cf=0
2695
2696         adox    %r15,%r10
2697         mulx    0*8($nptr),%rax,%r15
2698         adcx    %rax,%r10
2699         adox    %r15,%r11
2700         mulx    1*8($nptr),%rax,%r15
2701         adcx    %rax,%r11
2702         adox    %r15,%r12
2703         mulx    2*8($nptr),%rax,%r15
2704         mov     %r10,-5*8($tptr)
2705         adcx    %rax,%r12
2706         adox    %r15,%r13
2707         mov     %r11,-4*8($tptr)
2708         mulx    3*8($nptr),%rax,%r15
2709          mov    $bi,%rdx
2710         lea     4*8($nptr),$nptr
2711         mov     %r12,-3*8($tptr)
2712         adcx    %rax,%r13
2713         adox    $zero,%r15
2714         mov     %r13,-2*8($tptr)
2715
2716         dec     $bptr                   # of=0, pass cf
2717         jnz     .Lmulx4x_inner
2718
2719         mov     0+8(%rsp),$num          # load -num
2720         adc     $zero,%r15              # modulo-scheduled
2721         sub     0*8($tptr),$bptr        # pull top-most carry to %cf
2722         mov     8+8(%rsp),$bptr         # re-load &b[i]
2723         mov     16+8(%rsp),%r10
2724         adc     %r15,%r14
2725         lea     ($aptr,$num),$aptr      # rewind $aptr
2726         adc     $zero,$zero             # top-most carry
2727         mov     %r14,-1*8($tptr)
2728
2729         cmp     %r10,$bptr
2730         jb      .Lmulx4x_outer
2731
2732         mov     -8($nptr),%r10
2733         mov     $zero,%r8
2734         mov     ($nptr,$num),%r12
2735         lea     ($nptr,$num),%rbp       # rewind $nptr
2736         mov     $num,%rcx
2737         lea     ($tptr,$num),%rdi       # rewind $tptr
2738         xor     %eax,%eax
2739         xor     %r15,%r15
2740         sub     %r14,%r10               # compare top-most words
2741         adc     %r15,%r15
2742         or      %r15,%r8
2743         sar     \$3+2,%rcx
2744         sub     %r8,%rax                # %rax=-%r8
2745         mov     56+8(%rsp),%rdx         # restore rp
2746         dec     %r12                    # so that after 'not' we get -n[0]
2747         mov     8*1(%rbp),%r13
2748         xor     %r8,%r8
2749         mov     8*2(%rbp),%r14
2750         mov     8*3(%rbp),%r15
2751         jmp     .Lsqrx4x_sub_entry      # common post-condition
2752 .size   mulx4x_internal,.-mulx4x_internal
2753 ___
2754 }\f{
2755 ######################################################################
2756 # void bn_powerx5(
2757 my $rptr="%rdi";        # BN_ULONG *rptr,
2758 my $aptr="%rsi";        # const BN_ULONG *aptr,
2759 my $bptr="%rdx";        # const void *table,
2760 my $nptr="%rcx";        # const BN_ULONG *nptr,
2761 my $n0  ="%r8";         # const BN_ULONG *n0,
2762 my $num ="%r9";         # int num, has to be divisible by 8
2763                         # int pwr);	7th argument, the index into table
2764
2765 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);	# $tptr shares %rdi with $rptr
2766 my @A0=("%r10","%r11");	# register pairs that carry t[] words through
2767 my @A1=("%r12","%r13");	# the shift-n-add part of the squaring
2768 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2769
2770 $code.=<<___;
2771 .type   bn_powerx5,\@function,6
2772 .align  32
2773 bn_powerx5:
             # Entry point: builds a private stack frame, then runs five
             # __bn_sqrx8x_internal/__bn_postx4x_internal pairs followed by
             # one mulx4x_internal call, and returns 1 in %rax.
2774 .cfi_startproc
2775         mov     %rsp,%rax               # keep caller's %rsp, stored at 40(%rsp) below
2776 .cfi_def_cfa_register   %rax
2777 .Lpowerx5_enter:
2778         push    %rbx
2779 .cfi_push       %rbx
2780         push    %rbp
2781 .cfi_push       %rbp
2782         push    %r12
2783 .cfi_push       %r12
2784         push    %r13
2785 .cfi_push       %r13
2786         push    %r14
2787 .cfi_push       %r14
2788         push    %r15
2789 .cfi_push       %r15
2790 .Lpowerx5_prologue:
2791
2792         shl     \$3,${num}d             # convert $num to bytes
2793         lea     ($num,$num,2),%r10      # 3*$num in bytes
2794         neg     $num                    # $num is now -(size in bytes)
2795         mov     ($n0),$n0               # *n0
2796
2797         ##############################################################
2798         # Ensure that stack frame doesn't alias with $rptr+3*$num
2799         # modulo 4096, which covers ret[num], am[num] and n[num]
2800         # (see bn_exp.c). This is done to let the memory disambiguation
2801         # logic do its magic. [Extra 256 bytes is for power mask
2802         # calculated from 7th argument, the index.]
2803         #
2804         lea     -320(%rsp,$num,2),%r11
2805         mov     %rsp,%rbp
2806         sub     $rptr,%r11
2807         and     \$4095,%r11             # distance to $rptr modulo 4096
2808         cmp     %r11,%r10
2809         jb      .Lpwrx_sp_alt
2810         sub     %r11,%rbp               # align with $aptr
2811         lea     -320(%rbp,$num,2),%rbp  # future alloca(frame+2*$num*8+256)
2812         jmp     .Lpwrx_sp_done
2813
2814 .align  32
2815 .Lpwrx_sp_alt:
2816         lea     4096-320(,$num,2),%r10
2817         lea     -320(%rbp,$num,2),%rbp  # alloca(frame+2*$num*8+256)
2818         sub     %r10,%r11
2819         mov     \$0,%r10
2820         cmovc   %r10,%r11               # clamp to 0 if the subtraction borrowed
2821         sub     %r11,%rbp
2822 .Lpwrx_sp_done:
2823         and     \$-64,%rbp              # round frame address down to a multiple of 64
2824         mov     %rsp,%r11
2825         sub     %rbp,%r11
2826         and     \$-4096,%r11            # whole 4096-byte pages between %rsp and %rbp
2827         lea     (%rbp,%r11),%rsp
2828         mov     (%rsp),%r10             # touch the page
2829         cmp     %rbp,%rsp
2830         ja      .Lpwrx_page_walk
2831         jmp     .Lpwrx_page_walk_done
2832
2833 .Lpwrx_page_walk:
2834         lea     -4096(%rsp),%rsp        # step down one page at a time...
2835         mov     (%rsp),%r10             # ...reading from each one
2836         cmp     %rbp,%rsp
2837         ja      .Lpwrx_page_walk
2838 .Lpwrx_page_walk_done:
2839
2840         mov     $num,%r10               # negative byte count, parked in %xmm3 below
2841         neg     $num                    # $num is positive again
2842
2843         ##############################################################
2844         # Stack layout
2845         #
2846         # +0    saved $num, used in reduction section
2847         # +8    &t[2*$num], used in reduction section
2848         # +16   intermediate carry bit
2849         # +24   top-most carry bit, used in reduction section
2850         # +32   saved *n0
2851         # +40   saved %rsp
2852         # +48   t[2*$num]
2853         #
2854         # The subroutines entered through 'call' address the same
2855         # slots with an extra +8, e.g. 32+8(%rsp) for *n0.
2853         #
2854         pxor    %xmm0,%xmm0
2855         movq    $rptr,%xmm1             # save $rptr
2856         movq    $nptr,%xmm2             # save $nptr
2857         movq    %r10, %xmm3             # -$num
2858         movq    $bptr,%xmm4             # save $bptr, reloaded before mulx4x_internal
2859         mov     $n0,  32(%rsp)
2860         mov     %rax, 40(%rsp)          # save original %rsp
2861 .cfi_cfa_expression     %rsp+40,deref,+8
2862 .Lpowerx5_body:
2863
2864         call    __bn_sqrx8x_internal
2865         call    __bn_postx4x_internal
2866         call    __bn_sqrx8x_internal
2867         call    __bn_postx4x_internal
2868         call    __bn_sqrx8x_internal
2869         call    __bn_postx4x_internal
2870         call    __bn_sqrx8x_internal
2871         call    __bn_postx4x_internal
2872         call    __bn_sqrx8x_internal
2873         call    __bn_postx4x_internal
2874
2875         mov     %r10,$num               # -num
2876         mov     $aptr,$rptr
2877         movq    %xmm2,$nptr
2878         movq    %xmm4,$bptr
2879         mov     40(%rsp),%rax
2880
2881         call    mulx4x_internal
2882
2883         mov     40(%rsp),%rsi           # restore %rsp
2884 .cfi_def_cfa    %rsi,8
2885         mov     \$1,%rax                # return value
2886
2887         mov     -48(%rsi),%r15
2888 .cfi_restore    %r15
2889         mov     -40(%rsi),%r14
2890 .cfi_restore    %r14
2891         mov     -32(%rsi),%r13
2892 .cfi_restore    %r13
2893         mov     -24(%rsi),%r12
2894 .cfi_restore    %r12
2895         mov     -16(%rsi),%rbp
2896 .cfi_restore    %rbp
2897         mov     -8(%rsi),%rbx
2898 .cfi_restore    %rbx
2899         lea     (%rsi),%rsp             # back on the caller's stack
2900 .cfi_def_cfa_register   %rsp
2901 .Lpowerx5_epilogue:
2902         ret
2903 .cfi_endproc
2904 .size   bn_powerx5,.-bn_powerx5
2905
2906 .globl  bn_sqrx8x_internal
2907 .hidden bn_sqrx8x_internal
2908 .type   bn_sqrx8x_internal,\@abi-omnipotent
2909 .align  32
2910 bn_sqrx8x_internal:
2911 __bn_sqrx8x_internal:
2912         ##################################################################
2913         # Squaring part:
2914         #
2915         # a) multiply-n-add everything but a[i]*a[i];
2916         # b) shift result of a) by 1 to the left and accumulate
2917         #    a[i]*a[i] products;
2918         #
2919         ##################################################################
2920         # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2921         #                                                     a[1]a[0]
2922         #                                                 a[2]a[0]
2923         #                                             a[3]a[0]
2924         #                                             a[2]a[1]
2925         #                                         a[3]a[1]
2926         #                                     a[3]a[2]
2927         #
2928         #                                         a[4]a[0]
2929         #                                     a[5]a[0]
2930         #                                 a[6]a[0]
2931         #                             a[7]a[0]
2932         #                                     a[4]a[1]
2933         #                                 a[5]a[1]
2934         #                             a[6]a[1]
2935         #                         a[7]a[1]
2936         #                                 a[4]a[2]
2937         #                             a[5]a[2]
2938         #                         a[6]a[2]
2939         #                     a[7]a[2]
2940         #                             a[4]a[3]
2941         #                         a[5]a[3]
2942         #                     a[6]a[3]
2943         #                 a[7]a[3]
2944         #
2945         #                     a[5]a[4]
2946         #                 a[6]a[4]
2947         #             a[7]a[4]
2948         #             a[6]a[5]
2949         #         a[7]a[5]
2950         #     a[7]a[6]
2951         # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2952 ___
2953 {
2954 my ($zero,$carry)=("%rbp","%rcx");	# squaring-part scratch registers
2955 my $aaptr=$zero;	# shares %rbp with $zero, so $zero must be re-cleared after use as pointer
2956 $code.=<<___;
2957         lea     48+8(%rsp),$tptr                # start of t[]
2958         lea     ($aptr,$num),$aaptr
2959         mov     $num,0+8(%rsp)                  # save $num
2960         mov     $aaptr,8+8(%rsp)                # save end of $aptr
2961         jmp     .Lsqr8x_zero_start
2962
             # Clear t[]: a full pass stores 128 zero bytes and counts 64
             # off $num. The first pass enters at .Lsqr8x_zero_start and
             # therefore leaves the lowest 64 bytes untouched.
2963 .align  32
2964 .byte   0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00	# 12 bytes, presumably a long NOP used as padding
2965 .Lsqrx8x_zero:
2966         .byte   0x3e
2967         movdqa  %xmm0,0*8($tptr)
2968         movdqa  %xmm0,2*8($tptr)
2969         movdqa  %xmm0,4*8($tptr)
2970         movdqa  %xmm0,6*8($tptr)
2971 .Lsqr8x_zero_start:                     # aligned at 32
2972         movdqa  %xmm0,8*8($tptr)
2973         movdqa  %xmm0,10*8($tptr)
2974         movdqa  %xmm0,12*8($tptr)
2975         movdqa  %xmm0,14*8($tptr)
2976         lea     16*8($tptr),$tptr
2977         sub     \$64,$num
2978         jnz     .Lsqrx8x_zero
2979
2980         mov     0*8($aptr),%rdx         # a[0], modulo-scheduled
2981         #xor    %r9,%r9                 # t[1], ex-$num, zero already
2982         xor     %r10,%r10
2983         xor     %r11,%r11
2984         xor     %r12,%r12
2985         xor     %r13,%r13
2986         xor     %r14,%r14
2987         xor     %r15,%r15
2988         lea     48+8(%rsp),$tptr        # rewind $tptr to the start of t[]
2989         xor     $zero,$zero             # cf=0, of=0
2990         jmp     .Lsqrx8x_outer_loop
2991
2992 .align  32
2993 .Lsqrx8x_outer_loop:
             # Off-diagonal products inside the 8-word chunk at $aptr:
             # a[1..7]*a[0], a[2..7]*a[1], ... a[7]*a[6]. Entered with
             # %rdx=a[0] and cf=of=0; adcx and adox drive two independent
             # carry chains, so the instruction order matters.
2994         mulx    1*8($aptr),%r8,%rax     # a[1]*a[0]
2995         adcx    %r9,%r8                 # a[1]*a[0]+=t[1]
2996         adox    %rax,%r10
2997         mulx    2*8($aptr),%r9,%rax     # a[2]*a[0]
2998         adcx    %r10,%r9
2999         adox    %rax,%r11
3000         .byte   0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00    # mulx  3*8($aptr),%r10,%rax    # ...
3001         adcx    %r11,%r10
3002         adox    %rax,%r12
3003         .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00    # mulx  4*8($aptr),%r11,%rax
3004         adcx    %r12,%r11
3005         adox    %rax,%r13
3006         mulx    5*8($aptr),%r12,%rax    # a[5]*a[0]
3007         adcx    %r13,%r12
3008         adox    %rax,%r14
3009         mulx    6*8($aptr),%r13,%rax    # a[6]*a[0]
3010         adcx    %r14,%r13
3011         adox    %r15,%rax
3012         mulx    7*8($aptr),%r14,%r15    # a[7]*a[0]
3013          mov    1*8($aptr),%rdx         # a[1]
3014         adcx    %rax,%r14
3015         adox    $zero,%r15
3016         adc     8*8($tptr),%r15         # +=t[8]
3017         mov     %r8,1*8($tptr)          # t[1]
3018         mov     %r9,2*8($tptr)          # t[2]
3019         sbb     $carry,$carry           # mov %cf,$carry
3020         xor     $zero,$zero             # cf=0, of=0
3021
3022
3023         mulx    2*8($aptr),%r8,%rbx     # a[2]*a[1]
3024         mulx    3*8($aptr),%r9,%rax     # a[3]*a[1]
3025         adcx    %r10,%r8
3026         adox    %rbx,%r9
3027         mulx    4*8($aptr),%r10,%rbx    # ...
3028         adcx    %r11,%r9
3029         adox    %rax,%r10
3030         .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00    # mulx  5*8($aptr),%r11,%rax
3031         adcx    %r12,%r10
3032         adox    %rbx,%r11
3033         .byte   0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00    # mulx  6*8($aptr),%r12,%rbx
3034         adcx    %r13,%r11
3035         adox    %r14,%r12
3036         .byte   0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00    # mulx  7*8($aptr),%r13,%r14
3037          mov    2*8($aptr),%rdx         # a[2]
3038         adcx    %rax,%r12
3039         adox    %rbx,%r13
3040         adcx    %r15,%r13
3041         adox    $zero,%r14              # of=0
3042         adcx    $zero,%r14              # cf=0
3043
3044         mov     %r8,3*8($tptr)          # t[3]
3045         mov     %r9,4*8($tptr)          # t[4]
3046
3047         mulx    3*8($aptr),%r8,%rbx     # a[3]*a[2]
3048         mulx    4*8($aptr),%r9,%rax     # a[4]*a[2]
3049         adcx    %r10,%r8
3050         adox    %rbx,%r9
3051         mulx    5*8($aptr),%r10,%rbx    # ...
3052         adcx    %r11,%r9
3053         adox    %rax,%r10
3054         .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00    # mulx  6*8($aptr),%r11,%rax
3055         adcx    %r12,%r10
3056         adox    %r13,%r11
3057         .byte   0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00    # mulx  7*8($aptr),%r12,%r13
3058         .byte   0x3e
3059          mov    3*8($aptr),%rdx         # a[3]
3060         adcx    %rbx,%r11
3061         adox    %rax,%r12
3062         adcx    %r14,%r12
3063         mov     %r8,5*8($tptr)          # t[5]
3064         mov     %r9,6*8($tptr)          # t[6]
3065          mulx   4*8($aptr),%r8,%rax     # a[4]*a[3]
3066         adox    $zero,%r13              # of=0
3067         adcx    $zero,%r13              # cf=0
3068
3069         mulx    5*8($aptr),%r9,%rbx     # a[5]*a[3]
3070         adcx    %r10,%r8
3071         adox    %rax,%r9
3072         mulx    6*8($aptr),%r10,%rax    # ...
3073         adcx    %r11,%r9
3074         adox    %r12,%r10
3075         mulx    7*8($aptr),%r11,%r12
3076          mov    4*8($aptr),%rdx         # a[4]
3077          mov    5*8($aptr),%r14         # a[5]
3078         adcx    %rbx,%r10
3079         adox    %rax,%r11
3080          mov    6*8($aptr),%r15         # a[6]
3081         adcx    %r13,%r11
3082         adox    $zero,%r12              # of=0
3083         adcx    $zero,%r12              # cf=0
3084
3085         mov     %r8,7*8($tptr)          # t[7]
3086         mov     %r9,8*8($tptr)          # t[8]
3087
3088         mulx    %r14,%r9,%rax           # a[5]*a[4]
3089          mov    7*8($aptr),%r8          # a[7]
3090         adcx    %r10,%r9
3091         mulx    %r15,%r10,%rbx          # a[6]*a[4]
3092         adox    %rax,%r10
3093         adcx    %r11,%r10
3094         mulx    %r8,%r11,%rax           # a[7]*a[4]
3095          mov    %r14,%rdx               # a[5]
3096         adox    %rbx,%r11
3097         adcx    %r12,%r11
3098         #adox   $zero,%rax              # of=0
3099         adcx    $zero,%rax              # cf=0
3100
3101         mulx    %r15,%r14,%rbx          # a[6]*a[5]
3102         mulx    %r8,%r12,%r13           # a[7]*a[5]
3103          mov    %r15,%rdx               # a[6]
3104          lea    8*8($aptr),$aptr
3105         adcx    %r14,%r11
3106         adox    %rbx,%r12
3107         adcx    %rax,%r12
3108         adox    $zero,%r13
3109
3110         .byte   0x67,0x67
3111         mulx    %r8,%r8,%r14            # a[7]*a[6]
3112         adcx    %r8,%r13
3113         adcx    $zero,%r14
3114
3115         cmp     8+8(%rsp),$aptr         # reached the saved end of a[]?
3116         je      .Lsqrx8x_outer_break
3117
3118         neg     $carry                  # mov $carry,%cf
3119         mov     \$-8,%rcx               # counter for .Lsqrx8x_loop, runs -8..-1
3120         mov     $zero,%r15
3121         mov     8*8($tptr),%r8
3122         adcx    9*8($tptr),%r9          # +=t[9]
3123         adcx    10*8($tptr),%r10        # ...
3124         adcx    11*8($tptr),%r11
3125         adc     12*8($tptr),%r12
3126         adc     13*8($tptr),%r13
3127         adc     14*8($tptr),%r14
3128         adc     15*8($tptr),%r15
3129         lea     ($aptr),$aaptr          # $aaptr walks the chunks above the current one
3130         lea     2*64($tptr),$tptr
3131         sbb     %rax,%rax               # mov %cf,$carry
3132
3133         mov     -64($aptr),%rdx         # a[0]
3134         mov     %rax,16+8(%rsp)         # offload $carry
3135         mov     $tptr,24+8(%rsp)        # reloaded at .Lsqrx8x_break as "initial $tptr"
3136
3137         #lea    8*8($tptr),$tptr        # see 2*8*8($tptr) above
3138         xor     %eax,%eax               # cf=0, of=0
3139         jmp     .Lsqrx8x_loop
3140
3141 .align  32
3142 .Lsqrx8x_loop:
             # Multiplies the eight words at $aaptr by the word in %rdx and
             # accumulates into %r8..%r15; the lowest result word is stored
             # to t[] and the next multiplier is fetched. %rcx counts the
             # eight multipliers from -8 up to 0.
3143         mov     %r8,%rbx
3144         mulx    0*8($aaptr),%rax,%r8    # a[8]*a[i]
3145         adcx    %rax,%rbx               # +=t[8]
3146         adox    %r9,%r8
3147
3148         mulx    1*8($aaptr),%rax,%r9    # ...
3149         adcx    %rax,%r8
3150         adox    %r10,%r9
3151
3152         mulx    2*8($aaptr),%rax,%r10
3153         adcx    %rax,%r9
3154         adox    %r11,%r10
3155
3156         mulx    3*8($aaptr),%rax,%r11
3157         adcx    %rax,%r10
3158         adox    %r12,%r11
3159
3160         .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00    # mulx  4*8($aaptr),%rax,%r12
3161         adcx    %rax,%r11
3162         adox    %r13,%r12
3163
3164         mulx    5*8($aaptr),%rax,%r13
3165         adcx    %rax,%r12
3166         adox    %r14,%r13
3167
3168         mulx    6*8($aaptr),%rax,%r14
3169          mov    %rbx,($tptr,%rcx,8)     # store t[8+i]
3170          mov    \$0,%ebx                # %rbx=0, mov leaves the flags alone
3171         adcx    %rax,%r13
3172         adox    %r15,%r14
3173
3174         .byte   0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00    # mulx  7*8($aaptr),%rax,%r15
3175          mov    8($aptr,%rcx,8),%rdx    # a[i]
3176         adcx    %rax,%r14
3177         adox    %rbx,%r15               # %rbx is 0, of=0
3178         adcx    %rbx,%r15               # cf=0
3179
3180         .byte   0x67
3181         inc     %rcx                    # of=0
3182         jnz     .Lsqrx8x_loop
3183
3184         lea     8*8($aaptr),$aaptr      # next 8-word chunk
3185         mov     \$-8,%rcx
3186         cmp     8+8(%rsp),$aaptr        # done?
3187         je      .Lsqrx8x_break
3188
3189         sub     16+8(%rsp),%rbx         # mov 16(%rsp),%cf
3190         .byte   0x66
3191         mov     -64($aptr),%rdx
3192         adcx    0*8($tptr),%r8
3193         adcx    1*8($tptr),%r9
3194         adc     2*8($tptr),%r10
3195         adc     3*8($tptr),%r11
3196         adc     4*8($tptr),%r12
3197         adc     5*8($tptr),%r13
3198         adc     6*8($tptr),%r14
3199         adc     7*8($tptr),%r15
3200         lea     8*8($tptr),$tptr
3201         .byte   0x67
3202         sbb     %rax,%rax               # mov %cf,%rax
3203         xor     %ebx,%ebx               # cf=0, of=0
3204         mov     %rax,16+8(%rsp)         # offload carry
3205         jmp     .Lsqrx8x_loop
3206
3207 .align  32
3208 .Lsqrx8x_break:
             # The current row of products is complete: %r8..%r15 hold its
             # top eight words and 16+8(%rsp) holds the carry off-loaded by
             # .Lsqrx8x_loop as 0 or -1. That carry has to be added to %r8
             # and rippled through %r9..%r15. Folding it into %r8 alone
             # drops it whenever %r8 is all ones and produces a wrong
             # square (CVE-2017-3736).
             xor     $zero,$zero             # %rbp is free again: 0, cf=0, of=0
3209         sub     16+8(%rsp),%rbx         # mov 16(%rsp),%cf; %rbx is 0 on entry
             adcx    $zero,%r8               # consume last carry...
3210         mov     24+8(%rsp),$carry       # initial $tptr, borrow $carry
             adcx    $zero,%r9               # ...and propagate it upwards
3211         mov     0*8($aptr),%rdx         # a[8], modulo-scheduled
             adc     \$0,%r10
3213         mov     %r8,0*8($tptr)
             adc     \$0,%r11
             adc     \$0,%r12
             adc     \$0,%r13
             adc     \$0,%r14
             adc     \$0,%r15
3214         cmp     $carry,$tptr            # cf=0, of=0
3215         je      .Lsqrx8x_outer_loop
3216
             # $tptr moved past the initial window: park this row in t[]
             # and reload %r9..%r15 from the initial window instead.
3217         mov     %r9,1*8($tptr)
3218          mov    1*8($carry),%r9
3219         mov     %r10,2*8($tptr)
3220          mov    2*8($carry),%r10
3221         mov     %r11,3*8($tptr)
3222          mov    3*8($carry),%r11
3223         mov     %r12,4*8($tptr)
3224          mov    4*8($carry),%r12
3225         mov     %r13,5*8($tptr)
3226          mov    5*8($carry),%r13
3227         mov     %r14,6*8($tptr)
3228          mov    6*8($carry),%r14
3229         mov     %r15,7*8($tptr)
3230          mov    7*8($carry),%r15
3231         mov     $carry,$tptr
3232         jmp     .Lsqrx8x_outer_loop
3233
3234 .align  32
3235 .Lsqrx8x_outer_break:
             # Reached from .Lsqrx8x_outer_loop once $aptr hits the saved end
             # of a[]: store the words still held in %r9..%r14 and reload
             # -$num into %rcx for the code that follows.
3236         mov     %r9,9*8($tptr)          # t[9]
3237          movq   %xmm3,%rcx              # -$num
3238         mov     %r10,10*8($tptr)        # ...
3239         mov     %r11,11*8($tptr)
3240         mov     %r12,12*8($tptr)
3241         mov     %r13,13*8($tptr)
3242         mov     %r14,14*8($tptr)
3243 ___
3244 }\f{
3245 my $i="%rcx";	# byte offset into a[]: starts at -$num, +32 per pass, 0 ends the loop
3246 $code.=<<___;
             # Part b) of the squaring: double the accumulated t[] words and
             # add the a[i]*a[i] products. 'adox r,r' doubles a word on the
             # of chain, 'adcx' adds it to the square on the cf chain. Four
             # words of a[] (eight of t[]) are handled per pass.
3247         lea     48+8(%rsp),$tptr
3248         mov     ($aptr,$i),%rdx         # a[0]
3249
3250         mov     8($tptr),$A0[1]         # t[1]
3251         xor     $A0[0],$A0[0]           # t[0], of=0, cf=0
3252         mov     0+8(%rsp),$num          # restore $num
3253         adox    $A0[1],$A0[1]           # t[1]<<1
3254          mov    16($tptr),$A1[0]        # t[2]  # prefetch
3255          mov    24($tptr),$A1[1]        # t[3]  # prefetch
3256         #jmp    .Lsqrx4x_shift_n_add    # happens to be aligned
3257
3258 .align  32
3259 .Lsqrx4x_shift_n_add:
3260         mulx    %rdx,%rax,%rbx          # a[i]*a[i]
3261          adox   $A1[0],$A1[0]
3262         adcx    $A0[0],%rax
3263          .byte  0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov   8($aptr,$i),%rdx        # a[i+1]        # prefetch
3264          .byte  0x4c,0x8b,0x97,0x20,0x00,0x00,0x00      # mov   32($tptr),$A0[0]        # t[2*i+4]      # prefetch
3265          adox   $A1[1],$A1[1]
3266         adcx    $A0[1],%rbx
3267          mov    40($tptr),$A0[1]                # t[2*i+4+1]    # prefetch
3268         mov     %rax,0($tptr)
3269         mov     %rbx,8($tptr)
3270
3271         mulx    %rdx,%rax,%rbx          # a[i+1]*a[i+1]
3272          adox   $A0[0],$A0[0]
3273         adcx    $A1[0],%rax
3274          mov    16($aptr,$i),%rdx       # a[i+2]        # prefetch
3275          mov    48($tptr),$A1[0]        # t[2*i+6]      # prefetch
3276          adox   $A0[1],$A0[1]
3277         adcx    $A1[1],%rbx
3278          mov    56($tptr),$A1[1]        # t[2*i+6+1]    # prefetch
3279         mov     %rax,16($tptr)
3280         mov     %rbx,24($tptr)
3281
3282         mulx    %rdx,%rax,%rbx          # a[i+2]*a[i+2]
3283          adox   $A1[0],$A1[0]
3284         adcx    $A0[0],%rax
3285          mov    24($aptr,$i),%rdx       # a[i+3]        # prefetch
3286          lea    32($i),$i
3287          mov    64($tptr),$A0[0]        # t[2*i+8]      # prefetch
3288          adox   $A1[1],$A1[1]
3289         adcx    $A0[1],%rbx
3290          mov    72($tptr),$A0[1]        # t[2*i+8+1]    # prefetch
3291         mov     %rax,32($tptr)
3292         mov     %rbx,40($tptr)
3293
3294         mulx    %rdx,%rax,%rbx          # a[i+3]*a[i+3]
3295          adox   $A0[0],$A0[0]
3296         adcx    $A1[0],%rax
3297         jrcxz   .Lsqrx4x_shift_n_add_break      # $i is %rcx: leave once it reaches 0
3298          .byte  0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov   0($aptr,$i),%rdx        # a[i+4]        # prefetch
3299          adox   $A0[1],$A0[1]
3300         adcx    $A1[1],%rbx
3301          mov    80($tptr),$A1[0]        # t[2*i+10]     # prefetch
3302          mov    88($tptr),$A1[1]        # t[2*i+10+1]   # prefetch
3303         mov     %rax,48($tptr)
3304         mov     %rbx,56($tptr)
3305         lea     64($tptr),$tptr
3306         nop
3307         jmp     .Lsqrx4x_shift_n_add
3308
3309 .align  32
3310 .Lsqrx4x_shift_n_add_break:
3311         adcx    $A1[1],%rbx
3312         mov     %rax,48($tptr)
3313         mov     %rbx,56($tptr)
3314         lea     64($tptr),$tptr         # end of t[] buffer
3315 ___
3316 }\f
3317 ######################################################################
3318 # Montgomery reduction part, "word-by-word" algorithm.
3319 #
3320 # This new path is inspired by multiple submissions from Intel, by
3321 # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
3322 # Vinodh Gopal...
3323 {
3324 my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
3325
3326 $code.=<<___;
3327         movq    %xmm2,$nptr
3328 __bn_sqrx8x_reduction:
3329         xor     %eax,%eax               # initial top-most carry bit
3330         mov     32+8(%rsp),%rbx         # n0
3331         mov     48+8(%rsp),%rdx         # "%r8", 8*0($tptr)
3332         lea     -8*8($nptr,$num),%rcx   # end of n[]
3333         #lea    48+8(%rsp,$num,2),$tptr # end of t[] buffer
3334         mov     %rcx, 0+8(%rsp)         # save end of n[]
3335         mov     $tptr,8+8(%rsp)         # save end of t[]
3336
3337         lea     48+8(%rsp),$tptr                # initial t[] window
3338         jmp     .Lsqrx8x_reduction_loop
3339
3340 .align  32
3341 .Lsqrx8x_reduction_loop:
3342         mov     8*1($tptr),%r9
3343         mov     8*2($tptr),%r10
3344         mov     8*3($tptr),%r11
3345         mov     8*4($tptr),%r12
3346         mov     %rdx,%r8
3347         imulq   %rbx,%rdx               # n0*a[i]
3348         mov     8*5($tptr),%r13
3349         mov     8*6($tptr),%r14
3350         mov     8*7($tptr),%r15
3351         mov     %rax,24+8(%rsp)         # store top-most carry bit
3352
3353         lea     8*8($tptr),$tptr
3354         xor     $carry,$carry           # cf=0,of=0
3355         mov     \$-8,%rcx
3356         jmp     .Lsqrx8x_reduce
3357
3358 .align  32
3359 .Lsqrx8x_reduce:
3360         mov     %r8, %rbx
3361         mulx    8*0($nptr),%rax,%r8     # n[0]
3362         adcx    %rbx,%rax               # discarded
3363         adox    %r9,%r8
3364
3365         mulx    8*1($nptr),%rbx,%r9     # n[1]
3366         adcx    %rbx,%r8
3367         adox    %r10,%r9
3368
3369         mulx    8*2($nptr),%rbx,%r10
3370         adcx    %rbx,%r9
3371         adox    %r11,%r10
3372
3373         mulx    8*3($nptr),%rbx,%r11
3374         adcx    %rbx,%r10
3375         adox    %r12,%r11
3376
3377         .byte   0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00    # mulx  8*4($nptr),%rbx,%r12
3378          mov    %rdx,%rax
3379          mov    %r8,%rdx
3380         adcx    %rbx,%r11
3381         adox    %r13,%r12
3382
3383          mulx   32+8(%rsp),%rbx,%rdx    # %rdx discarded
3384          mov    %rax,%rdx
3385          mov    %rax,64+48+8(%rsp,%rcx,8)       # put aside n0*a[i]
3386
3387         mulx    8*5($nptr),%rax,%r13
3388         adcx    %rax,%r12
3389         adox    %r14,%r13
3390
3391         mulx    8*6($nptr),%rax,%r14
3392         adcx    %rax,%r13
3393         adox    %r15,%r14
3394
3395         mulx    8*7($nptr),%rax,%r15
3396          mov    %rbx,%rdx
3397         adcx    %rax,%r14
3398         adox    $carry,%r15             # $carry is 0
3399         adcx    $carry,%r15             # cf=0
3400
3401         .byte   0x67,0x67,0x67
3402         inc     %rcx                    # of=0
3403         jnz     .Lsqrx8x_reduce
3404
3405         mov     $carry,%rax             # xor   %rax,%rax
3406         cmp     0+8(%rsp),$nptr         # end of n[]?
3407         jae     .Lsqrx8x_no_tail
3408
3409         mov     48+8(%rsp),%rdx         # pull n0*a[0]
3410         add     8*0($tptr),%r8
3411         lea     8*8($nptr),$nptr
3412         mov     \$-8,%rcx
3413         adcx    8*1($tptr),%r9
3414         adcx    8*2($tptr),%r10
3415         adc     8*3($tptr),%r11
3416         adc     8*4($tptr),%r12
3417         adc     8*5($tptr),%r13
3418         adc     8*6($tptr),%r14
3419         adc     8*7($tptr),%r15
3420         lea     8*8($tptr),$tptr
3421         sbb     %rax,%rax               # top carry
3422
3423         xor     $carry,$carry           # of=0, cf=0
3424         mov     %rax,16+8(%rsp)
3425         jmp     .Lsqrx8x_tail
3426
3427 .align  32
3428 .Lsqrx8x_tail:
3429         mov     %r8,%rbx
3430         mulx    8*0($nptr),%rax,%r8
3431         adcx    %rax,%rbx
3432         adox    %r9,%r8
3433
3434         mulx    8*1($nptr),%rax,%r9
3435         adcx    %rax,%r8
3436         adox    %r10,%r9
3437
3438         mulx    8*2($nptr),%rax,%r10
3439         adcx    %rax,%r9
3440         adox    %r11,%r10
3441
3442         mulx    8*3($nptr),%rax,%r11
3443         adcx    %rax,%r10
3444         adox    %r12,%r11
3445
3446         .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00    # mulx  8*4($nptr),%rax,%r12
3447         adcx    %rax,%r11
3448         adox    %r13,%r12
3449
3450         mulx    8*5($nptr),%rax,%r13
3451         adcx    %rax,%r12
3452         adox    %r14,%r13
3453
3454         mulx    8*6($nptr),%rax,%r14
3455         adcx    %rax,%r13
3456         adox    %r15,%r14
3457
3458         mulx    8*7($nptr),%rax,%r15
3459          mov    72+48+8(%rsp,%rcx,8),%rdx       # pull n0*a[i]
3460         adcx    %rax,%r14
3461         adox    $carry,%r15
3462          mov    %rbx,($tptr,%rcx,8)     # save result
3463          mov    %r8,%rbx
3464         adcx    $carry,%r15             # cf=0
3465
3466         inc     %rcx                    # of=0
3467         jnz     .Lsqrx8x_tail
3468
3469         cmp     0+8(%rsp),$nptr         # end of n[]?
3470         jae     .Lsqrx8x_tail_done      # break out of loop
3471
3472         sub     16+8(%rsp),$carry       # mov 16(%rsp),%cf
3473          mov    48+8(%rsp),%rdx         # pull n0*a[0]
3474          lea    8*8($nptr),$nptr
3475         adc     8*0($tptr),%r8
3476         adc     8*1($tptr),%r9
3477         adc     8*2($tptr),%r10
3478         adc     8*3($tptr),%r11
3479         adc     8*4($tptr),%r12
3480         adc     8*5($tptr),%r13
3481         adc     8*6($tptr),%r14
3482         adc     8*7($tptr),%r15
3483         lea     8*8($tptr),$tptr
3484         sbb     %rax,%rax
3485         sub     \$8,%rcx                # mov   \$-8,%rcx
3486
3487         xor     $carry,$carry           # of=0, cf=0
3488         mov     %rax,16+8(%rsp)
3489         jmp     .Lsqrx8x_tail
3490
3491 .align  32
3492 .Lsqrx8x_tail_done:
3493         xor     %rax,%rax
3494         add     24+8(%rsp),%r8          # can this overflow?
3495         adc     \$0,%r9
3496         adc     \$0,%r10
3497         adc     \$0,%r11
3498         adc     \$0,%r12
3499         adc     \$0,%r13
3500         adc     \$0,%r14
3501         adc     \$0,%r15
3502         adc     \$0,%rax
3503
3504         sub     16+8(%rsp),$carry       # mov 16(%rsp),%cf
3505 .Lsqrx8x_no_tail:                       # %cf is 0 if jumped here
3506         adc     8*0($tptr),%r8
3507          movq   %xmm3,%rcx
3508         adc     8*1($tptr),%r9
3509          mov    8*7($nptr),$carry
3510          movq   %xmm2,$nptr             # restore $nptr
3511         adc     8*2($tptr),%r10
3512         adc     8*3($tptr),%r11
3513         adc     8*4($tptr),%r12
3514         adc     8*5($tptr),%r13
3515         adc     8*6($tptr),%r14
3516         adc     8*7($tptr),%r15
3517         adc     \$0,%rax                # top-most carry
3518
3519         mov     32+8(%rsp),%rbx         # n0
3520         mov     8*8($tptr,%rcx),%rdx    # modulo-scheduled "%r8"
3521
3522         mov     %r8,8*0($tptr)          # store top 512 bits
3523          lea    8*8($tptr),%r8          # borrow %r8
3524         mov     %r9,8*1($tptr)
3525         mov     %r10,8*2($tptr)
3526         mov     %r11,8*3($tptr)
3527         mov     %r12,8*4($tptr)
3528         mov     %r13,8*5($tptr)
3529         mov     %r14,8*6($tptr)
3530         mov     %r15,8*7($tptr)
3531
3532         lea     8*8($tptr,%rcx),$tptr   # start of current t[] window
3533         cmp     8+8(%rsp),%r8           # end of t[]?
3534         jb      .Lsqrx8x_reduction_loop
3535         ret
3536 .size   bn_sqrx8x_internal,.-bn_sqrx8x_internal
3537 ___
3538 }\f
3539 ##############################################################
3540 # Post-condition, 4x unrolled
3541 #
     # Emits __bn_postx4x_internal.  The routine negates %rax into a mask
     # and then, four 64-bit limbs per iteration, stores
     #     t[i] + (~n[i] & mask) + carry
     # through $rptr.  n[0] is decremented before the inversion, so its
     # term is (-n[0] & mask); the total is therefore t - n when the mask
     # is all ones and plain t when the mask is zero.
     #
     # Register use, as far as this block shows:
     #   %rcx   in: -$num (per the comments below).  Shifted right by 3+2
     #              it becomes minus the iteration count, i.e. one
     #              iteration per 4 limbs of 8 bytes.
     #   %rax   in: value that is negated into the mask; presumably the
     #              0/1 top carry left by the caller -- confirm at the
     #              call sites.
     #   $nptr  in: n[], advanced by 4 limbs per iteration.
     #   $tptr  in: t[]; it is NOT set up here (the lea that would do so
     #              is commented out), so the caller must provide it.
     #   %xmm1  in: saved pointer that is copied to both $rptr and $aptr.
     #   %r8        carry between iterations, kept as 0 or -1.
     #   %r9,%r10   copies of -$num; %r9 is negated back to $num on exit.
     #
3542 {
3543 my ($rptr,$nptr)=("%rdx","%rbp");
3544 $code.=<<___;
3545 .align  32
3546 __bn_postx4x_internal:
3547         mov     8*0($nptr),%r12
3548         mov     %rcx,%r10               # -$num
3549         mov     %rcx,%r9                # -$num
3550         neg     %rax                    # turn %rax into the and-mask
3551         sar     \$3+2,%rcx              # minus the number of 4-limb iterations
3552         #lea    48+8(%rsp,%r9),$tptr
3553         movq    %xmm1,$rptr             # restore $rptr
3554         movq    %xmm1,$aptr             # prepare for back-to-back call
3555         dec     %r12                    # so that after 'not' we get -n[0]
3556         mov     8*1($nptr),%r13
3557         xor     %r8,%r8                 # no incoming carry
3558         mov     8*2($nptr),%r14
3559         mov     8*3($nptr),%r15
3560         jmp     .Lsqrx4x_sub_entry
3561
3562 .align  16
3563 .Lsqrx4x_sub:
3564         mov     8*0($nptr),%r12
3565         mov     8*1($nptr),%r13
3566         mov     8*2($nptr),%r14
3567         mov     8*3($nptr),%r15
3568 .Lsqrx4x_sub_entry:
3569         andn    %rax,%r12,%r12          # %r12 = ~%r12 & mask, same for the next three
3570         lea     8*4($nptr),$nptr
3571         andn    %rax,%r13,%r13
3572         andn    %rax,%r14,%r14
3573         andn    %rax,%r15,%r15
3574
3575         neg     %r8                     # mov %r8,%cf
3576         adc     8*0($tptr),%r12
3577         adc     8*1($tptr),%r13
3578         adc     8*2($tptr),%r14
3579         adc     8*3($tptr),%r15
3580         mov     %r12,8*0($rptr)
3581         lea     8*4($tptr),$tptr
3582         mov     %r13,8*1($rptr)
3583         sbb     %r8,%r8                 # mov %cf,%r8
3584         mov     %r14,8*2($rptr)
3585         mov     %r15,8*3($rptr)
3586         lea     8*4($rptr),$rptr
3587
3588         inc     %rcx                    # counts up towards zero
3589         jnz     .Lsqrx4x_sub
3590
3591         neg     %r9                     # restore $num
3592
3593         ret
3594 .size   __bn_postx4x_internal,.-__bn_postx4x_internal
3595 ___
3596 }
3597 }}}
3598 {
3599 my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3600                                 ("%rdi","%esi","%rdx","%ecx");  # Unix order
     # Argument registers shared by the three routines emitted in this
     # scope: bn_get_bits5 uses $inp and $num, bn_scatter5 and bn_gather5
     # use all four.  $num and $idx are given by their 32-bit names.
3601 my $out=$inp;           # bn_gather5 stores its result through the first argument
3602 my $STRIDE=2**5*8;      # bytes per table row: 2**5 entries of 8 bytes each
3604
3605 $code.=<<___;
3606 .globl  bn_get_bits5
3607 .type   bn_get_bits5,\@abi-omnipotent
3608 .align  16
     # bn_get_bits5(inp, num): returns in %eax the 5 bits found at bit
     # offset num of the little-endian bit string at inp.  A 16-bit word is
     # loaded and shifted right.  When the offset inside that word is above
     # 11 the 5 bits would run past its end, so the load address is moved up
     # by one byte and the shift count reduced by 8.  Both adjustments are
     # done with cmova, so there is no branch on the offset.
3609 bn_get_bits5:
3610         lea     0($inp),%r10            # load base for the in-word case
3611         lea     1($inp),%r11            # load base moved up by one byte
3612         mov     $num,%ecx
3613         shr     \$4,$num                # index of the 16-bit word
3614         and     \$15,%ecx               # bit offset inside that word
3615         lea     -8(%ecx),%eax           # same offset seen from one byte up
3616         cmp     \$11,%ecx               # above 11: offset+5 exceeds 16
3617         cmova   %r11,%r10
3618         cmova   %eax,%ecx
3619         movzw   (%r10,$num,2),%eax
3620         shrl    %cl,%eax
3621         and     \$31,%eax               # keep the low 5 bits
3622         ret
3623 .size   bn_get_bits5,.-bn_get_bits5
3624
3625 .globl  bn_scatter5
3626 .type   bn_scatter5,\@abi-omnipotent
3627 .align  16
     # bn_scatter5(inp, num, tbl, idx): copies num 64-bit words from inp
     # into the table so that word j lands at qword index 32*j + idx of tbl,
     # i.e. consecutive input words are 32*8 bytes apart.  Returns at once
     # when num is zero.
3628 bn_scatter5:
3629         cmp     \$0, $num
3630         jz      .Lscatter_epilogue
3631         lea     ($tbl,$idx,8),$tbl      # column idx of the first row
3632 .Lscatter:
3633         mov     ($inp),%rax
3634         lea     8($inp),$inp
3635         mov     %rax,($tbl)
3636         lea     32*8($tbl),$tbl         # same column, next row
3637         sub     \$1,$num
3638         jnz     .Lscatter
3639 .Lscatter_epilogue:
3640         ret
3641 .size   bn_scatter5,.-bn_scatter5
3642
3643 .globl  bn_gather5
3644 .type   bn_gather5,\@abi-omnipotent
3645 .align  32
     # bn_gather5(out, num, tbl, idx): for each of num rows of the table
     # (one row is 32 qwords) stores the qword in column idx to out.  Every
     # row is read in full at addresses that do not depend on idx; the
     # wanted column is picked with and-masks that are computed once and
     # kept in a 256-byte stack area.  Unlike bn_scatter5 there is no test
     # for num being zero before the loop.
3646 bn_gather5:
3647 .LSEH_begin_bn_gather5:                 # Win64 thing, but harmless in other cases
3648         # I can't trust assembler to use specific encoding:-(
3649         .byte   0x4c,0x8d,0x14,0x24                     #lea    (%rsp),%r10
3650         .byte   0x48,0x81,0xec,0x08,0x01,0x00,0x00      #sub    \$0x108,%rsp
3651         lea     .Linc(%rip),%rax
3652         and     \$-16,%rsp              # shouldn't be formally required
3653
3654         movd    $idx,%xmm5
3655         movdqa  0(%rax),%xmm0           # 00000001000000010000000000000000
3656         movdqa  16(%rax),%xmm1          # 00000002000000020000000200000002
3657         lea     128($tbl),%r11          # size optimization
3658         lea     128(%rsp),%rax          # size optimization
3659
3660         pshufd  \$0,%xmm5,%xmm5         # broadcast $idx
3661         movdqa  %xmm1,%xmm4
3662         movdqa  %xmm1,%xmm2
3663 ___
3664 ########################################################################
3665 # calculate mask by comparing 0..31 to $idx and save result to stack
3666 #
     # Each 16-byte mask covers two table columns: its low qword is all
     # ones when $idx equals the even column, its high qword when $idx
     # equals the odd one.  $STRIDE/16 masks are written at offsets -128
     # and up from %rax, four per pass of this generator loop.  The store
     # of %xmm3 is deferred to the following pass, which is why it is
     # skipped on the first pass and emitted once more after the loop.
     # %xmm4 keeps the increment loaded from .Linc+16 throughout.
3667 for($i=0;$i<$STRIDE/16;$i+=4) {
3668 $code.=<<___;
3669         paddd   %xmm0,%xmm1
3670         pcmpeqd %xmm5,%xmm0             # compare to 1,0
3671 ___
3672 $code.=<<___    if ($i);
3673         movdqa  %xmm3,`16*($i-1)-128`(%rax)
3674 ___
3675 $code.=<<___;
3676         movdqa  %xmm4,%xmm3
3677
3678         paddd   %xmm1,%xmm2
3679         pcmpeqd %xmm5,%xmm1             # compare to 3,2
3680         movdqa  %xmm0,`16*($i+0)-128`(%rax)
3681         movdqa  %xmm4,%xmm0
3682
3683         paddd   %xmm2,%xmm3
3684         pcmpeqd %xmm5,%xmm2             # compare to 5,4
3685         movdqa  %xmm1,`16*($i+1)-128`(%rax)
3686         movdqa  %xmm4,%xmm1
3687
3688         paddd   %xmm3,%xmm0
3689         pcmpeqd %xmm5,%xmm3             # compare to 7,6
3690         movdqa  %xmm2,`16*($i+2)-128`(%rax)
3691         movdqa  %xmm4,%xmm2
3692 ___
3693 }
     # Store the last pending mask, then emit the gather loop proper.  Per
     # row the loop ANDs all $STRIDE/16 table words with their masks and
     # ORs the results into two accumulators (%xmm4, %xmm5).
3694 $code.=<<___;
3695         movdqa  %xmm3,`16*($i-1)-128`(%rax)
3696         jmp     .Lgather
3697
3698 .align  32
3699 .Lgather:
3700         pxor    %xmm4,%xmm4
3701         pxor    %xmm5,%xmm5
3702 ___
3703 for($i=0;$i<$STRIDE/16;$i+=4) {
3704 $code.=<<___;
3705         movdqa  `16*($i+0)-128`(%r11),%xmm0
3706         movdqa  `16*($i+1)-128`(%r11),%xmm1
3707         movdqa  `16*($i+2)-128`(%r11),%xmm2
3708         pand    `16*($i+0)-128`(%rax),%xmm0
3709         movdqa  `16*($i+3)-128`(%r11),%xmm3
3710         pand    `16*($i+1)-128`(%rax),%xmm1
3711         por     %xmm0,%xmm4
3712         pand    `16*($i+2)-128`(%rax),%xmm2
3713         por     %xmm1,%xmm5
3714         pand    `16*($i+3)-128`(%rax),%xmm3
3715         por     %xmm2,%xmm4
3716         por     %xmm3,%xmm5
3717 ___
3718 }
     # Merge the two accumulators, fold the high qword onto the low one
     # (pshufd 0x4e swaps the halves) and store the low qword.  Then step
     # to the next row and the next output word.
3719 $code.=<<___;
3720         por     %xmm5,%xmm4
3721         lea     $STRIDE(%r11),%r11
3722         pshufd  \$0x4e,%xmm4,%xmm0
3723         por     %xmm4,%xmm0
3724         movq    %xmm0,($out)            # m0=bp[0]
3725         lea     8($out),$out
3726         sub     \$1,$num
3727         jnz     .Lgather
3728
3729         lea     (%r10),%rsp             # back to the stack pointer saved on entry
3730         ret
3731 .LSEH_end_bn_gather5:
3732 .size   bn_gather5,.-bn_gather5
3733 ___
3734 }
3735 $code.=<<___;
3736 .align  64
3737 .Linc:
3738         .long   0,0, 1,1                # bn_gather5: initial counters, columns 0 and 1 as dword pairs
3739         .long   2,2, 2,2                # bn_gather5: step added to reach the next two columns
3740 .asciz  "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3741 ___
3742
3743 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3744 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
     #
     # Emitted only for Win64.  mul_handler compares context->Rip against
     # the three image-relative labels stored in HandlerData[] to decide
     # how much of the interrupted function's frame exists:
     #   Rip <  HandlerData[0]   nothing to restore, continue with the
     #                           value of context->Rax
     #   Rip <  HandlerData[1]   restore rbx,rbp,r12-r15 from below the
     #                           address in context->Rax
     #   Rip >= HandlerData[2]   nothing to restore, continue with
     #                           context->Rsp
     #   otherwise               fetch the saved stack pointer from the
     #                           frame (two layouts, chosen by comparing
     #                           Rip with .Lmul_epilogue), then restore
     #                           rbx,rbp,r12-r15 from below it
     # In every case Rsp, Rsi and Rdi are then written back to the context,
     # the context is copied to disp->ContextRecord, RtlVirtualUnwind is
     # called and 1 (ExceptionContinueSearch) is returned.
     # $rec and $frame are assigned but not referenced in the handler body.
3745 if ($win64) {
3746 $rec="%rcx";
3747 $frame="%rdx";
3748 $context="%r8";
3749 $disp="%r9";
3750
3751 $code.=<<___;
3752 .extern __imp_RtlVirtualUnwind
3753 .type   mul_handler,\@abi-omnipotent
3754 .align  16
3755 mul_handler:
3756         push    %rsi
3757         push    %rdi
3758         push    %rbx
3759         push    %rbp
3760         push    %r12
3761         push    %r13
3762         push    %r14
3763         push    %r15
3764         pushfq
3765         sub     \$64,%rsp
3766
3767         mov     120($context),%rax      # pull context->Rax
3768         mov     248($context),%rbx      # pull context->Rip
3769
3770         mov     8($disp),%rsi           # disp->ImageBase
3771         mov     56($disp),%r11          # disp->HandlerData
3772
3773         mov     0(%r11),%r10d           # HandlerData[0]
3774         lea     (%rsi,%r10),%r10        # end of prologue label
3775         cmp     %r10,%rbx               # context->Rip<end of prologue label
3776         jb      .Lcommon_seh_tail
3777
3778         mov     4(%r11),%r10d           # HandlerData[1]
3779         lea     (%rsi,%r10),%r10        # beginning of body label
3780         cmp     %r10,%rbx               # context->Rip<body label
3781         jb      .Lcommon_pop_regs
3782
3783         mov     152($context),%rax      # pull context->Rsp
3784
3785         mov     8(%r11),%r10d           # HandlerData[2]
3786         lea     (%rsi,%r10),%r10        # epilogue label
3787         cmp     %r10,%rbx               # context->Rip>=epilogue label
3788         jae     .Lcommon_seh_tail
3789
3790         lea     .Lmul_epilogue(%rip),%r10
3791         cmp     %r10,%rbx
3792         ja      .Lbody_40
3793
3794         mov     192($context),%r10      # pull $num
3795         mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
3796
3797         jmp     .Lcommon_pop_regs
3798
3799 .Lbody_40:
3800         mov     40(%rax),%rax           # pull saved stack pointer
3801 .Lcommon_pop_regs:
3802         mov     -8(%rax),%rbx
3803         mov     -16(%rax),%rbp
3804         mov     -24(%rax),%r12
3805         mov     -32(%rax),%r13
3806         mov     -40(%rax),%r14
3807         mov     -48(%rax),%r15
3808         mov     %rbx,144($context)      # restore context->Rbx
3809         mov     %rbp,160($context)      # restore context->Rbp
3810         mov     %r12,216($context)      # restore context->R12
3811         mov     %r13,224($context)      # restore context->R13
3812         mov     %r14,232($context)      # restore context->R14
3813         mov     %r15,240($context)      # restore context->R15
3814
3815 .Lcommon_seh_tail:
3816         mov     8(%rax),%rdi
3817         mov     16(%rax),%rsi
3818         mov     %rax,152($context)      # restore context->Rsp
3819         mov     %rsi,168($context)      # restore context->Rsi
3820         mov     %rdi,176($context)      # restore context->Rdi
3821
3822         mov     40($disp),%rdi          # disp->ContextRecord
3823         mov     $context,%rsi           # context
3824         mov     \$154,%ecx              # sizeof(CONTEXT), counted in qwords
3825         .long   0xa548f3fc              # cld; rep movsq
3826
3827         mov     $disp,%rsi
3828         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
3829         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
3830         mov     0(%rsi),%r8             # arg3, disp->ControlPc
3831         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
3832         mov     40(%rsi),%r10           # disp->ContextRecord
3833         lea     56(%rsi),%r11           # &disp->HandlerData
3834         lea     24(%rsi),%r12           # &disp->EstablisherFrame
3835         mov     %r10,32(%rsp)           # arg5
3836         mov     %r11,40(%rsp)           # arg6
3837         mov     %r12,48(%rsp)           # arg7
3838         mov     %rcx,56(%rsp)           # arg8, (NULL)
3839         call    *__imp_RtlVirtualUnwind(%rip)
3840
3841         mov     \$1,%eax                # ExceptionContinueSearch
3842         add     \$64,%rsp
3843         popfq
3844         pop     %r15
3845         pop     %r14
3846         pop     %r13
3847         pop     %r12
3848         pop     %rbp
3849         pop     %rbx
3850         pop     %rdi
3851         pop     %rsi
3852         ret
3853 .size   mul_handler,.-mul_handler
3854
3855 .section        .pdata
     # Function table: one triple of image-relative addresses per function
     # with unwind data, in the order begin label, end label, unwind info
     # label.
3856 .align  4
3857         .rva    .LSEH_begin_bn_mul_mont_gather5
3858         .rva    .LSEH_end_bn_mul_mont_gather5
3859         .rva    .LSEH_info_bn_mul_mont_gather5
3860
3861         .rva    .LSEH_begin_bn_mul4x_mont_gather5
3862         .rva    .LSEH_end_bn_mul4x_mont_gather5
3863         .rva    .LSEH_info_bn_mul4x_mont_gather5
3864
3865         .rva    .LSEH_begin_bn_power5
3866         .rva    .LSEH_end_bn_power5
3867         .rva    .LSEH_info_bn_power5
3868
3869         .rva    .LSEH_begin_bn_from_mont8x
3870         .rva    .LSEH_end_bn_from_mont8x
3871         .rva    .LSEH_info_bn_from_mont8x
3872 ___
     # The entries for bn_mulx4x_mont_gather5 and bn_powerx5 are emitted
     # only when $addx is true.
3873 $code.=<<___ if ($addx);
3874         .rva    .LSEH_begin_bn_mulx4x_mont_gather5
3875         .rva    .LSEH_end_bn_mulx4x_mont_gather5
3876         .rva    .LSEH_info_bn_mulx4x_mont_gather5
3877
3878         .rva    .LSEH_begin_bn_powerx5
3879         .rva    .LSEH_end_bn_powerx5
3880         .rva    .LSEH_info_bn_powerx5
3881 ___
3882 $code.=<<___;
3883         .rva    .LSEH_begin_bn_gather5
3884         .rva    .LSEH_end_bn_gather5
3885         .rva    .LSEH_info_bn_gather5
3886
3887 .section        .xdata
     # Unwind info records referenced from .pdata.  Every record that names
     # mul_handler starts with the bytes 9,0,0,0 (read against the x64
     # UNWIND_INFO layout: version 1 with the exception-handler flag, no
     # prologue bytes, no unwind codes -- confirm against the Microsoft
     # documentation), followed by the handler address and the three labels
     # that mul_handler reads as HandlerData[0..2].
3888 .align  8
3889 .LSEH_info_bn_mul_mont_gather5:
3890         .byte   9,0,0,0
3891         .rva    mul_handler
3892         .rva    .Lmul_body,.Lmul_body,.Lmul_epilogue            # HandlerData[]
3893 .align  8
3894 .LSEH_info_bn_mul4x_mont_gather5:
3895         .byte   9,0,0,0
3896         .rva    mul_handler
3897         .rva    .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue          # HandlerData[]
3898 .align  8
3899 .LSEH_info_bn_power5:
3900         .byte   9,0,0,0
3901         .rva    mul_handler
3902         .rva    .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue       # HandlerData[]
3903 .align  8
3904 .LSEH_info_bn_from_mont8x:
3905         .byte   9,0,0,0
3906         .rva    mul_handler
3907         .rva    .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue             # HandlerData[]
3908 ___
     # Records for the MULX/ADX entry points, emitted only when $addx is
     # true (matching the conditional .pdata entries).
3909 $code.=<<___ if ($addx);
3910 .align  8
3911 .LSEH_info_bn_mulx4x_mont_gather5:
3912         .byte   9,0,0,0
3913         .rva    mul_handler
3914         .rva    .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue       # HandlerData[]
3915 .align  8
3916 .LSEH_info_bn_powerx5:
3917         .byte   9,0,0,0
3918         .rva    mul_handler
3919         .rva    .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue    # HandlerData[]
3920 ___
     # bn_gather5 has no handler; its record describes the prologue with
     # unwind codes instead.  The prologue size 0x0b equals the 4+7 bytes
     # that bn_gather5 emits with .byte for its first two instructions, and
     # 0x0021 scaled by 8 is the 0x108 subtracted from the stack pointer.
3921 $code.=<<___;
3922 .align  8
3923 .LSEH_info_bn_gather5:
3924         .byte   0x01,0x0b,0x03,0x0a
3925         .byte   0x0b,0x01,0x21,0x00     # sub   rsp,0x108
3926         .byte   0x04,0xa3,0x00,0x00     # lea   r10,(rsp)
3927 .align  8
3928 ___
3929 }
3930
3931 $code =~ s/\`([^\`]*)\`/eval($1)/gem;   # replace each back-ticked expression with its value
3932
3933 print $code;
     # Buffered write errors only surface at close, and if STDOUT is a pipe
     # (presumably to the x86_64-xlate.pl filter set up earlier in the file)
     # close also reports a failing child, so its result must be checked.
3934 close STDOUT or die "error closing STDOUT: $!";