crypto/ec/asm/ecp_nistz256-x86_64.pl

   1 #!/usr/bin/env perl
   2
   3 ##############################################################################
   4 #                                                                            #
   5 # Copyright 2014 Intel Corporation                                           #
   6 #                                                                            #
   7 # Licensed under the Apache License, Version 2.0 (the "License");            #
   8 # you may not use this file except in compliance with the License.           #
   9 # You may obtain a copy of the License at                                    #
  10 #                                                                            #
  11 #    http://www.apache.org/licenses/LICENSE-2.0                              #
  12 #                                                                            #
  13 # Unless required by applicable law or agreed to in writing, software        #
  14 # distributed under the License is distributed on an "AS IS" BASIS,          #
  15 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
  16 # See the License for the specific language governing permissions and        #
  17 # limitations under the License.                                             #
  18 #                                                                            #
  19 ##############################################################################
  20 #                                                                            #
  21 #  Developers and authors:                                                   #
  22 #  Shay Gueron (1, 2), and Vlad Krasnov (1)                                  #
  23 #  (1) Intel Corporation, Israel Development Center                          #
  24 #  (2) University of Haifa                                                   #
  25 #  Reference:                                                                #
  26 #  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
  27 #                           256 Bit Primes"                                  #
  28 #                                                                            #
  29 ##############################################################################
  30
  31 # Further optimization by <appro@openssl.org>:
  32 #
  33 #               this/original
  34 # Opteron       +8-33%
  35 # Bulldozer     +10-30%
  36 # P4            +14-38%
  37 # Westmere      +8-23%
  38 # Sandy Bridge  +8-24%
  39 # Ivy Bridge    +7-25%
  40 # Haswell       +5-25%
  41 # Atom          +10-32%
  42 # VIA Nano      +37-130%
  43 #
  44 # Ranges denote minimum and maximum improvement coefficients depending
  45 # on benchmark. Lower coefficients are for ECDSA sign, relatively
  46 # fastest server-side operation.
  47
  48 $flavour = shift;
  49 $output  = shift;
  50 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  51
  52 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  53
  54 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  55 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  56 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  57 die "can't locate x86_64-xlate.pl";
  58
  59 open OUT,"| \"$^X\" $xlate $flavour $output";
  60 *STDOUT=*OUT;
  61
  62 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  63                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  64         $avx = ($1>=2.19) + ($1>=2.22);
  65         $addx = ($1>=2.23);
  66 }
  67
  68 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  69             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  70         $avx = ($1>=2.09) + ($1>=2.10);
  71         $addx = ($1>=2.10);
  72 }
  73
  74 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  75             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  76         $avx = ($1>=10) + ($1>=11);
  77         $addx = ($1>=12);
  78 }
  79
  80 if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
  81         my $ver = $2 + $3/100.0;        # 3.1->3.01, 3.10->3.10
  82         $avx = ($ver>=3.0) + ($ver>=3.01);
  83         $addx = ($ver>=3.03);
  84 }
  85
   # Read-only constants placed at the start of the text section.
   #
   #   .Lpoly     four 64-bit words, least significant first, of the modulus
   #              p = 2^256 - 2^224 + 2^192 + 2^96 - 1
   #   .LRR       2^512 mod p (multiplier used by ecp_nistz256_to_mont)
   #   .LOne, .LTwo, .LThree
   #              eight 32-bit lanes each holding 1, 2 or 3; not referenced in
   #              the part of the file shown here
   #   .LONE_mont equals 2^256 - p, i.e. 2^256 mod p; presumably the value 1 in
   #              Montgomery form -- confirm against its users
   86 $code.=<<___;
   87 .text
   88 .extern OPENSSL_ia32cap_P
   89
   90 # The polynomial
   91 .align 64
   92 .Lpoly:
   93 .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
   94
   95 # 2^512 mod P precomputed for NIST P256 polynomial
   96 .LRR:
   97 .quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
   98
   99 .LOne:
  100 .long 1,1,1,1,1,1,1,1
  101 .LTwo:
  102 .long 2,2,2,2,2,2,2,2
  103 .LThree:
  104 .long 3,3,3,3,3,3,3,3
  105 .LONE_mont:
  106 .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
  107 ___
 108
 109 {
 110 ################################################################################
 111 # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
 112
 113 my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
 114 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
 115 my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
 116
 117 $code.=<<___;
 118
 119 .globl  ecp_nistz256_mul_by_2
 120 .type   ecp_nistz256_mul_by_2,\@function,2
 121 .align  64
 122 ecp_nistz256_mul_by_2:
 123         push    %r12
 124         push    %r13
 125
 126         mov     8*0($a_ptr), $a0
 127         mov     8*1($a_ptr), $a1
 128         add     $a0, $a0                # a0:a3+a0:a3
 129         mov     8*2($a_ptr), $a2
 130         adc     $a1, $a1
 131         mov     8*3($a_ptr), $a3
 132         lea     .Lpoly(%rip), $a_ptr
 133          mov    $a0, $t0
 134         adc     $a2, $a2
 135         adc     $a3, $a3
 136          mov    $a1, $t1
 137         sbb     $t4, $t4
 138
 139         sub     8*0($a_ptr), $a0
 140          mov    $a2, $t2
 141         sbb     8*1($a_ptr), $a1
 142         sbb     8*2($a_ptr), $a2
 143          mov    $a3, $t3
 144         sbb     8*3($a_ptr), $a3
 145         test    $t4, $t4
 146
 147         cmovz   $t0, $a0
 148         cmovz   $t1, $a1
 149         mov     $a0, 8*0($r_ptr)
 150         cmovz   $t2, $a2
 151         mov     $a1, 8*1($r_ptr)
 152         cmovz   $t3, $a3
 153         mov     $a2, 8*2($r_ptr)
 154         mov     $a3, 8*3($r_ptr)
 155
 156         pop     %r13
 157         pop     %r12
 158         ret
 159 .size   ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
 160
  161 ################################################################################
  162 # void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]);
      #
      # Stores a/2 mod p in res.  The modulus at .Lpoly is added to a with the
      # carry kept as bit 256; when a is even the original value and a zero
      # carry are restored instead.  The 257-bit value is then shifted right by
      # one bit, each limb receiving the low bit of the limb above it.
  163 .globl  ecp_nistz256_div_by_2
  164 .type   ecp_nistz256_div_by_2,\@function,2
  165 .align  32
  166 ecp_nistz256_div_by_2:
  167         push    %r12
  168         push    %r13
  169
  170         mov     8*0($a_ptr), $a0
  171         mov     8*1($a_ptr), $a1
  172         mov     8*2($a_ptr), $a2
  173          mov    $a0, $t0
  174         mov     8*3($a_ptr), $a3
  175         lea     .Lpoly(%rip), $a_ptr
  176
  177          mov    $a1, $t1
  178         xor     $t4, $t4                # will receive bit 256 of a+p
  179         add     8*0($a_ptr), $a0
  180          mov    $a2, $t2
  181         adc     8*1($a_ptr), $a1
  182         adc     8*2($a_ptr), $a2
  183          mov    $a3, $t3
  184         adc     8*3($a_ptr), $a3
  185         adc     \$0, $t4
  186         xor     $a_ptr, $a_ptr          # borrow $a_ptr
  187         test    \$1, $t0                # low bit of the original a[0]
  188
  189         cmovz   $t0, $a0                # a even: use a itself, not a+p
  190         cmovz   $t1, $a1
  191         cmovz   $t2, $a2
  192         cmovz   $t3, $a3
  193         cmovz   $a_ptr, $t4             # ... and clear bit 256
  194
  195         mov     $a1, $t0                # a0:a3>>1
  196         shr     \$1, $a0
  197         shl     \$63, $t0
  198         mov     $a2, $t1
  199         shr     \$1, $a1
  200         or      $t0, $a0
  201         shl     \$63, $t1
  202         mov     $a3, $t2
  203         shr     \$1, $a2
  204         or      $t1, $a1
  205         shl     \$63, $t2
  206         shr     \$1, $a3
  207         shl     \$63, $t4               # bit 256 becomes the top result bit
  208         or      $t2, $a2
  209         or      $t4, $a3
  210
  211         mov     $a0, 8*0($r_ptr)
  212         mov     $a1, 8*1($r_ptr)
  213         mov     $a2, 8*2($r_ptr)
  214         mov     $a3, 8*3($r_ptr)
  215
  216         pop     %r13
  217         pop     %r12
  218         ret
  219 .size   ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
 220
 221 ################################################################################
 222 # void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]);
 223 .globl  ecp_nistz256_mul_by_3
 224 .type   ecp_nistz256_mul_by_3,\@function,2
 225 .align  32
 226 ecp_nistz256_mul_by_3:
 227         push    %r12
 228         push    %r13
 229
 230         mov     8*0($a_ptr), $a0
 231         xor     $t4, $t4
 232         mov     8*1($a_ptr), $a1
 233         add     $a0, $a0                # a0:a3+a0:a3
 234         mov     8*2($a_ptr), $a2
 235         adc     $a1, $a1
 236         mov     8*3($a_ptr), $a3
 237          mov    $a0, $t0
 238         adc     $a2, $a2
 239         adc     $a3, $a3
 240          mov    $a1, $t1
 241         adc     \$0, $t4
 242
 243         sub     \$-1, $a0
 244          mov    $a2, $t2
 245         sbb     .Lpoly+8*1(%rip), $a1
 246         sbb     \$0, $a2
 247          mov    $a3, $t3
 248         sbb     .Lpoly+8*3(%rip), $a3
 249         test    $t4, $t4
 250
 251         cmovz   $t0, $a0
 252         cmovz   $t1, $a1
 253         cmovz   $t2, $a2
 254         cmovz   $t3, $a3
 255
 256         xor     $t4, $t4
 257         add     8*0($a_ptr), $a0        # a0:a3+=a_ptr[0:3]
 258         adc     8*1($a_ptr), $a1
 259          mov    $a0, $t0
 260         adc     8*2($a_ptr), $a2
 261         adc     8*3($a_ptr), $a3
 262          mov    $a1, $t1
 263         adc     \$0, $t4
 264
 265         sub     \$-1, $a0
 266          mov    $a2, $t2
 267         sbb     .Lpoly+8*1(%rip), $a1
 268         sbb     \$0, $a2
 269          mov    $a3, $t3
 270         sbb     .Lpoly+8*3(%rip), $a3
 271         test    $t4, $t4
 272
 273         cmovz   $t0, $a0
 274         cmovz   $t1, $a1
 275         mov     $a0, 8*0($r_ptr)
 276         cmovz   $t2, $a2
 277         mov     $a1, 8*1($r_ptr)
 278         cmovz   $t3, $a3
 279         mov     $a2, 8*2($r_ptr)
 280         mov     $a3, 8*3($r_ptr)
 281
 282         pop %r13
 283         pop %r12
 284         ret
 285 .size   ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
 286
 287 ################################################################################
 288 # void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
 289 .globl  ecp_nistz256_add
 290 .type   ecp_nistz256_add,\@function,3
 291 .align  32
 292 ecp_nistz256_add:
 293         push    %r12
 294         push    %r13
 295
 296         mov     8*0($a_ptr), $a0
 297         xor     $t4, $t4
 298         mov     8*1($a_ptr), $a1
 299         mov     8*2($a_ptr), $a2
 300         mov     8*3($a_ptr), $a3
 301         lea     .Lpoly(%rip), $a_ptr
 302
 303         add     8*0($b_ptr), $a0
 304         adc     8*1($b_ptr), $a1
 305          mov    $a0, $t0
 306         adc     8*2($b_ptr), $a2
 307         adc     8*3($b_ptr), $a3
 308          mov    $a1, $t1
 309         adc     \$0, $t4
 310
 311         sub     8*0($a_ptr), $a0
 312          mov    $a2, $t2
 313         sbb     8*1($a_ptr), $a1
 314         sbb     8*2($a_ptr), $a2
 315          mov    $a3, $t3
 316         sbb     8*3($a_ptr), $a3
 317         test    $t4, $t4
 318
 319         cmovz   $t0, $a0
 320         cmovz   $t1, $a1
 321         mov     $a0, 8*0($r_ptr)
 322         cmovz   $t2, $a2
 323         mov     $a1, 8*1($r_ptr)
 324         cmovz   $t3, $a3
 325         mov     $a2, 8*2($r_ptr)
 326         mov     $a3, 8*3($r_ptr)
 327
 328         pop %r13
 329         pop %r12
 330         ret
 331 .size   ecp_nistz256_add,.-ecp_nistz256_add
 332
  333 ################################################################################
  334 # void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]);
      #
      # Stores a-b mod p in res.  The four-limb difference is formed first and
      # the final borrow is captured in a fifth register.  The modulus at
      # .Lpoly is then added unconditionally, and the plain difference is
      # restored when there was no borrow.
      # NOTE(review): this yields a value below p only if both inputs are
      # themselves below p -- confirm that callers guarantee it.
  335 .globl  ecp_nistz256_sub
  336 .type   ecp_nistz256_sub,\@function,3
  337 .align  32
  338 ecp_nistz256_sub:
  339         push    %r12
  340         push    %r13
  341
  342         mov     8*0($a_ptr), $a0
  343         xor     $t4, $t4
  344         mov     8*1($a_ptr), $a1
  345         mov     8*2($a_ptr), $a2
  346         mov     8*3($a_ptr), $a3
  347         lea     .Lpoly(%rip), $a_ptr
  348
  349         sub     8*0($b_ptr), $a0
  350         sbb     8*1($b_ptr), $a1
  351          mov    $a0, $t0
  352         sbb     8*2($b_ptr), $a2
  353         sbb     8*3($b_ptr), $a3
  354          mov    $a1, $t1
  355         sbb     \$0, $t4                # all ones if a-b borrowed, else zero
  356
  357         add     8*0($a_ptr), $a0
  358          mov    $a2, $t2
  359         adc     8*1($a_ptr), $a1
  360         adc     8*2($a_ptr), $a2
  361          mov    $a3, $t3
  362         adc     8*3($a_ptr), $a3
  363         test    $t4, $t4
  364
  365         cmovz   $t0, $a0                # no borrow: keep a-b without p added
  366         cmovz   $t1, $a1
  367         mov     $a0, 8*0($r_ptr)
  368         cmovz   $t2, $a2
  369         mov     $a1, 8*1($r_ptr)
  370         cmovz   $t3, $a3
  371         mov     $a2, 8*2($r_ptr)
  372         mov     $a3, 8*3($r_ptr)
  373
  374         pop %r13
  375         pop %r12
  376         ret
  377 .size   ecp_nistz256_sub,.-ecp_nistz256_sub
 378
  379 ################################################################################
  380 # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
      #
      # Stores -a mod p in res.  The value 0-a is formed in four limbs and the
      # final borrow is captured in a fifth register.  The modulus at .Lpoly is
      # then added, and the plain difference is restored when there was no
      # borrow, which only happens for a equal to zero.
  381 .globl  ecp_nistz256_neg
  382 .type   ecp_nistz256_neg,\@function,2
  383 .align  32
  384 ecp_nistz256_neg:
  385         push    %r12
  386         push    %r13
  387
  388         xor     $a0, $a0
  389         xor     $a1, $a1
  390         xor     $a2, $a2
  391         xor     $a3, $a3
  392         xor     $t4, $t4
  393
  394         sub     8*0($a_ptr), $a0
  395         sbb     8*1($a_ptr), $a1
  396         sbb     8*2($a_ptr), $a2
  397          mov    $a0, $t0
  398         sbb     8*3($a_ptr), $a3
  399         lea     .Lpoly(%rip), $a_ptr
  400          mov    $a1, $t1
  401         sbb     \$0, $t4                # all ones if 0-a borrowed, else zero
  402
  403         add     8*0($a_ptr), $a0
  404          mov    $a2, $t2
  405         adc     8*1($a_ptr), $a1
  406         adc     8*2($a_ptr), $a2
  407          mov    $a3, $t3
  408         adc     8*3($a_ptr), $a3
  409         test    $t4, $t4
  410
  411         cmovz   $t0, $a0                # no borrow: keep the zero result
  412         cmovz   $t1, $a1
  413         mov     $a0, 8*0($r_ptr)
  414         cmovz   $t2, $a2
  415         mov     $a1, 8*1($r_ptr)
  416         cmovz   $t3, $a3
  417         mov     $a2, 8*2($r_ptr)
  418         mov     $a3, 8*3($r_ptr)
  419
  420         pop %r13
  421         pop %r12
  422         ret
  423 .size   ecp_nistz256_neg,.-ecp_nistz256_neg
 424 ___
 425 }
 426 {
  427 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");      # result, a, b as passed in, working copy of the b pointer
  428 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));    # accumulator limbs in %r8..%r15
  429 my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");       # scratch; $t2 is the same register as $b_ptr, $t3 as $b_org
  430 my ($poly1,$poly3)=($acc6,$acc7);    # aliases used by mul_montq to hold .Lpoly[1] and .Lpoly[3]
 431
  432 $code.=<<___;
  433 ################################################################################
  434 # void ecp_nistz256_to_mont(
  435 #   uint64_t res[4],
  436 #   uint64_t in[4]);
      #
      # Points the third-argument register at .LRR and jumps to the .Lmul_mont
      # label inside ecp_nistz256_mul_mont, so the routine behaves like
      # ecp_nistz256_mul_mont(res, in, .LRR).
  437 .globl  ecp_nistz256_to_mont
  438 .type   ecp_nistz256_to_mont,\@function,2
  439 .align  32
  440 ecp_nistz256_to_mont:
  441 ___
      # .Lmul_mont lies after ecp_nistz256_mul_mont's own capability load, so the
      # same masked read of OPENSSL_ia32cap_P has to be repeated here.
      # NOTE(review): 0x80100 presumably selects the BMI2 and ADX feature bits;
      # confirm against the OPENSSL_ia32cap documentation.
  442 $code.=<<___    if ($addx);
  443         mov     \$0x80100, %ecx
  444         and     OPENSSL_ia32cap_P+8(%rip), %ecx
  445 ___
  446 $code.=<<___;
  447         lea     .LRR(%rip), $b_org
  448         jmp     .Lmul_mont
  449 .size   ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
 450
  451 ################################################################################
  452 # void ecp_nistz256_mul_mont(
  453 #   uint64_t res[4],
  454 #   uint64_t a[4],
  455 #   uint64_t b[4]);
      #
      # Public entry point for Montgomery multiplication.  It saves six
      # registers, loads the first word of b and all four words of a, and calls
      # one of two internal helpers, which store the product through the result
      # pointer.  ecp_nistz256_to_mont enters at the .Lmul_mont label.
  456
  457 .globl  ecp_nistz256_mul_mont
  458 .type   ecp_nistz256_mul_mont,\@function,3
  459 .align  32
  460 ecp_nistz256_mul_mont:
  461 ___
      # Only when the assembler supports it: read the masked capability bits that
      # select the mulx-based helper at run time.
  462 $code.=<<___    if ($addx);
  463         mov     \$0x80100, %ecx
  464         and     OPENSSL_ia32cap_P+8(%rip), %ecx
  465 ___
  466 $code.=<<___;
  467 .Lmul_mont:
  468         push    %rbp
  469         push    %rbx
  470         push    %r12
  471         push    %r13
  472         push    %r14
  473         push    %r15
  474 ___
      # Take the mulx path only if every bit of the mask is set.
  475 $code.=<<___    if ($addx);
  476         cmp     \$0x80100, %ecx
  477         je      .Lmul_montx
  478 ___
      # Generic path: the first word of b goes in %rax as the implicit mulq operand.
  479 $code.=<<___;
  480         mov     $b_org, $b_ptr
  481         mov     8*0($b_org), %rax
  482         mov     8*0($a_ptr), $acc1
  483         mov     8*1($a_ptr), $acc2
  484         mov     8*2($a_ptr), $acc3
  485         mov     8*3($a_ptr), $acc4
  486
  487         call    __ecp_nistz256_mul_montq
  488 ___
      # mulx path: the first word of b goes in %rdx as the implicit mulx operand,
      # and the pointer to a is biased by -128 before the helper is called.
  489 $code.=<<___    if ($addx);
  490         jmp     .Lmul_mont_done
  491
  492 .align  32
  493 .Lmul_montx:
  494         mov     $b_org, $b_ptr
  495         mov     8*0($b_org), %rdx
  496         mov     8*0($a_ptr), $acc1
  497         mov     8*1($a_ptr), $acc2
  498         mov     8*2($a_ptr), $acc3
  499         mov     8*3($a_ptr), $acc4
  500         lea     -128($a_ptr), $a_ptr    # control u-op density
  501
  502         call    __ecp_nistz256_mul_montx
  503 ___
  504 $code.=<<___;
  505 .Lmul_mont_done:
  506         pop     %r15
  507         pop     %r14
  508         pop     %r13
  509         pop     %r12
  510         pop     %rbx
  511         pop     %rbp
  512         ret
  513 .size   ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
 514
 515 .type   __ecp_nistz256_mul_montq,\@abi-omnipotent
 516 .align  32
 517 __ecp_nistz256_mul_montq:
 518         ########################################################################
 519         # Multiply a by b[0]
 520         mov     %rax, $t1
 521         mulq    $acc1
 522         mov     .Lpoly+8*1(%rip),$poly1
 523         mov     %rax, $acc0
 524         mov     $t1, %rax
 525         mov     %rdx, $acc1
 526
 527         mulq    $acc2
 528         mov     .Lpoly+8*3(%rip),$poly3
 529         add     %rax, $acc1
 530         mov     $t1, %rax
 531         adc     \$0, %rdx
 532         mov     %rdx, $acc2
 533
 534         mulq    $acc3
 535         add     %rax, $acc2
 536         mov     $t1, %rax
 537         adc     \$0, %rdx
 538         mov     %rdx, $acc3
 539
 540         mulq    $acc4
 541         add     %rax, $acc3
 542          mov    $acc0, %rax
 543         adc     \$0, %rdx
 544         xor     $acc5, $acc5
 545         mov     %rdx, $acc4
 546
 547         ########################################################################
 548         # First reduction step
 549         # Basically now we want to multiply acc[0] by p256,
 550         # and add the result to the acc.
 551         # Due to the special form of p256 we do some optimizations
 552         #
 553         # acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
 554         # then we add acc[0] and get acc[0] x 2^64
 555
 556         mulq    $poly1
 557         xor     $t0, $t0
 558         add     $acc0, $acc1            # +=acc[0]*2^64
 559         adc     \$0, %rdx
 560         add     %rax, $acc1
 561         mov     $acc0, %rax
 562
 563         # acc[0] x p256[2] = 0
 564         adc     %rdx, $acc2
 565         adc     \$0, $t0
 566
 567         mulq    $poly3
 568         xor     $acc0, $acc0
 569         add     $t0, $acc3
 570         adc     \$0, %rdx
 571         add     %rax, $acc3
 572          mov    8*1($b_ptr), %rax
 573         adc     %rdx, $acc4
 574         adc     \$0, $acc5
 575
 576         ########################################################################
 577         # Multiply by b[1]
 578         mov     %rax, $t1
 579         mulq    8*0($a_ptr)
 580         add     %rax, $acc1
 581         mov     $t1, %rax
 582         adc     \$0, %rdx
 583         mov     %rdx, $t0
 584
 585         mulq    8*1($a_ptr)
 586         add     $t0, $acc2
 587         adc     \$0, %rdx
 588         add     %rax, $acc2
 589         mov     $t1, %rax
 590         adc     \$0, %rdx
 591         mov     %rdx, $t0
 592
 593         mulq    8*2($a_ptr)
 594         add     $t0, $acc3
 595         adc     \$0, %rdx
 596         add     %rax, $acc3
 597         mov     $t1, %rax
 598         adc     \$0, %rdx
 599         mov     %rdx, $t0
 600
 601         mulq    8*3($a_ptr)
 602         add     $t0, $acc4
 603         adc     \$0, %rdx
 604         add     %rax, $acc4
 605          mov    $acc1, %rax
 606         adc     %rdx, $acc5
 607         adc     \$0, $acc0
 608
 609         ########################################################################
 610         # Second reduction step
 611         mulq    $poly1
 612         xor     $t0, $t0
 613         add     $acc1, $acc2
 614         adc     \$0, %rdx
 615         add     %rax, $acc2
 616         mov     $acc1, %rax
 617         adc     %rdx, $acc3
 618         adc     \$0, $t0
 619
 620         mulq    $poly3
 621         xor     $acc1, $acc1
 622         add     $t0, $acc4
 623         adc     \$0, %rdx
 624         add     %rax, $acc4
 625          mov    8*2($b_ptr), %rax
 626         adc     %rdx, $acc5
 627         adc     \$0, $acc0
 628
 629         ########################################################################
 630         # Multiply by b[2]
 631         mov     %rax, $t1
 632         mulq    8*0($a_ptr)
 633         add     %rax, $acc2
 634         mov     $t1, %rax
 635         adc     \$0, %rdx
 636         mov     %rdx, $t0
 637
 638         mulq    8*1($a_ptr)
 639         add     $t0, $acc3
 640         adc     \$0, %rdx
 641         add     %rax, $acc3
 642         mov     $t1, %rax
 643         adc     \$0, %rdx
 644         mov     %rdx, $t0
 645
 646         mulq    8*2($a_ptr)
 647         add     $t0, $acc4
 648         adc     \$0, %rdx
 649         add     %rax, $acc4
 650         mov     $t1, %rax
 651         adc     \$0, %rdx
 652         mov     %rdx, $t0
 653
 654         mulq    8*3($a_ptr)
 655         add     $t0, $acc5
 656         adc     \$0, %rdx
 657         add     %rax, $acc5
 658          mov    $acc2, %rax
 659         adc     %rdx, $acc0
 660         adc     \$0, $acc1
 661
 662         ########################################################################
 663         # Third reduction step
 664         mulq    $poly1
 665         xor     $t0, $t0
 666         add     $acc2, $acc3
 667         adc     \$0, %rdx
 668         add     %rax, $acc3
 669         mov     $acc2, %rax
 670         adc     %rdx, $acc4
 671         adc     \$0, $t0
 672
 673         mulq    $poly3
 674         xor     $acc2, $acc2
 675         add     $t0, $acc5
 676         adc     \$0, %rdx
 677         add     %rax, $acc5
 678          mov    8*3($b_ptr), %rax
 679         adc     %rdx, $acc0
 680         adc     \$0, $acc1
 681
 682         ########################################################################
 683         # Multiply by b[3]
 684         mov     %rax, $t1
 685         mulq    8*0($a_ptr)
 686         add     %rax, $acc3
 687         mov     $t1, %rax
 688         adc     \$0, %rdx
 689         mov     %rdx, $t0
 690
 691         mulq    8*1($a_ptr)
 692         add     $t0, $acc4
 693         adc     \$0, %rdx
 694         add     %rax, $acc4
 695         mov     $t1, %rax
 696         adc     \$0, %rdx
 697         mov     %rdx, $t0
 698
 699         mulq    8*2($a_ptr)
 700         add     $t0, $acc5
 701         adc     \$0, %rdx
 702         add     %rax, $acc5
 703         mov     $t1, %rax
 704         adc     \$0, %rdx
 705         mov     %rdx, $t0
 706
 707         mulq    8*3($a_ptr)
 708         add     $t0, $acc0
 709         adc     \$0, %rdx
 710         add     %rax, $acc0
 711          mov    $acc3, %rax
 712         adc     %rdx, $acc1
 713         adc     \$0, $acc2
 714
 715         ########################################################################
 716         # Final reduction step
 717         mulq    $poly1
 718         #xor    $t0, $t0
 719         add     $acc3, $acc4
 720         adc     \$0, %rdx
 721         add     %rax, $acc4
 722         mov     $acc3, %rax
 723         adc     %rdx, $acc5
 724         #adc    \$0, $t0                # doesn't overflow
 725
 726         mulq    $poly3
 727         #add    $t0, $acc0
 728         #adc    \$0, %rdx
 729          mov    $acc4, $t0
 730         add     %rax, $acc0
 731         adc     %rdx, $acc1
 732          mov    $acc5, $t1
 733         adc     \$0, $acc2
 734
 735         ########################################################################
 736         # Branch-less conditional subtraction of P
 737         sub     \$-1, $acc4             # .Lpoly[0]
 738          mov    $acc0, $t2
 739         sbb     $poly1, $acc5           # .Lpoly[1]
 740         sbb     \$0, $acc0              # .Lpoly[2]
 741          mov    $acc1, $t3
 742         sbb     $poly3, $acc1           # .Lpoly[3]
 743         neg     $acc2
 744
 745         cmovnc  $t0, $acc4
 746         cmovnc  $t1, $acc5
 747         mov     $acc4, 8*0($r_ptr)
 748         cmovnc  $t2, $acc0
 749         mov     $acc5, 8*1($r_ptr)
 750         cmovnc  $t3, $acc1
 751         mov     $acc0, 8*2($r_ptr)
 752         mov     $acc1, 8*3($r_ptr)
 753
 754         ret
 755 .size   __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
 756
  757 ################################################################################
  758 # void ecp_nistz256_sqr_mont(
  759 #   uint64_t res[4],
  760 #   uint64_t a[4]);
      #
      # Public entry point for Montgomery squaring.  It saves six registers,
      # loads the four words of a, and calls one of two internal helpers, which
      # store the result through the result pointer.
  761
  762 # we optimize the square according to S.Gueron and V.Krasnov,
  763 # "Speeding up Big-Number Squaring"
  764 .globl  ecp_nistz256_sqr_mont
  765 .type   ecp_nistz256_sqr_mont,\@function,2
  766 .align  32
  767 ecp_nistz256_sqr_mont:
  768 ___
      # Only when the assembler supports it: read the masked capability bits that
      # select the mulx-based helper at run time.
  769 $code.=<<___    if ($addx);
  770         mov     \$0x80100, %ecx
  771         and     OPENSSL_ia32cap_P+8(%rip), %ecx
  772 ___
  773 $code.=<<___;
  774         push    %rbp
  775         push    %rbx
  776         push    %r12
  777         push    %r13
  778         push    %r14
  779         push    %r15
  780 ___
      # Take the mulx path only if every bit of the mask is set.
  781 $code.=<<___    if ($addx);
  782         cmp     \$0x80100, %ecx
  783         je      .Lsqr_montx
  784 ___
      # Generic path: a[0] in %rax, a[1..3] in $acc6, $acc7 and $acc0.
  785 $code.=<<___;
  786         mov     8*0($a_ptr), %rax
  787         mov     8*1($a_ptr), $acc6
  788         mov     8*2($a_ptr), $acc7
  789         mov     8*3($a_ptr), $acc0
  790
  791         call    __ecp_nistz256_sqr_montq
  792 ___
      # mulx path: a[0] goes in %rdx instead, and the pointer to a is biased by
      # -128 before the helper is called.
  793 $code.=<<___    if ($addx);
  794         jmp     .Lsqr_mont_done
  795
  796 .align  32
  797 .Lsqr_montx:
  798         mov     8*0($a_ptr), %rdx
  799         mov     8*1($a_ptr), $acc6
  800         mov     8*2($a_ptr), $acc7
  801         mov     8*3($a_ptr), $acc0
  802         lea     -128($a_ptr), $a_ptr    # control u-op density
  803
  804         call    __ecp_nistz256_sqr_montx
  805 ___
  806 $code.=<<___;
  807 .Lsqr_mont_done:
  808         pop     %r15
  809         pop     %r14
  810         pop     %r13
  811         pop     %r12
  812         pop     %rbx
  813         pop     %rbp
  814         ret
  815 .size   ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
 816
 817 .type   __ecp_nistz256_sqr_montq,\@abi-omnipotent
 818 .align  32
 819 __ecp_nistz256_sqr_montq:
 820         mov     %rax, $acc5
 821         mulq    $acc6                   # a[1]*a[0]
 822         mov     %rax, $acc1
 823         mov     $acc7, %rax
 824         mov     %rdx, $acc2
 825
 826         mulq    $acc5                   # a[0]*a[2]
 827         add     %rax, $acc2
 828         mov     $acc0, %rax
 829         adc     \$0, %rdx
 830         mov     %rdx, $acc3
 831
 832         mulq    $acc5                   # a[0]*a[3]
 833         add     %rax, $acc3
 834          mov    $acc7, %rax
 835         adc     \$0, %rdx
 836         mov     %rdx, $acc4
 837
 838         #################################
 839         mulq    $acc6                   # a[1]*a[2]
 840         add     %rax, $acc3
 841         mov     $acc0, %rax
 842         adc     \$0, %rdx
 843         mov     %rdx, $t1
 844
 845         mulq    $acc6                   # a[1]*a[3]
 846         add     %rax, $acc4
 847          mov    $acc0, %rax
 848         adc     \$0, %rdx
 849         add     $t1, $acc4
 850         mov     %rdx, $acc5
 851         adc     \$0, $acc5
 852
 853         #################################
 854         mulq    $acc7                   # a[2]*a[3]
 855         xor     $acc7, $acc7
 856         add     %rax, $acc5
 857          mov    8*0($a_ptr), %rax
 858         mov     %rdx, $acc6
 859         adc     \$0, $acc6
 860
 861         add     $acc1, $acc1            # acc1:6<<1
 862         adc     $acc2, $acc2
 863         adc     $acc3, $acc3
 864         adc     $acc4, $acc4
 865         adc     $acc5, $acc5
 866         adc     $acc6, $acc6
 867         adc     \$0, $acc7
 868
 869         mulq    %rax
 870         mov     %rax, $acc0
 871         mov     8*1($a_ptr), %rax
 872         mov     %rdx, $t0
 873
 874         mulq    %rax
 875         add     $t0, $acc1
 876         adc     %rax, $acc2
 877         mov     8*2($a_ptr), %rax
 878         adc     \$0, %rdx
 879         mov     %rdx, $t0
 880
 881         mulq    %rax
 882         add     $t0, $acc3
 883         adc     %rax, $acc4
 884         mov     8*3($a_ptr), %rax
 885         adc     \$0, %rdx
 886         mov     %rdx, $t0
 887
 888         mulq    %rax
 889         add     $t0, $acc5
 890         adc     %rax, $acc6
 891          mov    $acc0, %rax
 892         adc     %rdx, $acc7
 893
 894         mov     .Lpoly+8*1(%rip), $a_ptr
 895         mov     .Lpoly+8*3(%rip), $t1
 896
 897         ##########################################
 898         # Now the reduction
 899         # First iteration
 900         mulq    $a_ptr
 901         #xor    $t0, $t0
 902         add     $acc0, $acc1
 903         adc     \$0, %rdx
 904         add     %rax, $acc1
 905         mov     $acc0, %rax
 906         adc     %rdx, $acc2     # doesn't overflow
 907         #adc    \$0, $t0
 908
 909         mulq    $t1
 910         xor     $acc0, $acc0
 911         #add    $t0, $acc3
 912         #adc    \$0, %rdx
 913         add     %rax, $acc3
 914          mov    $acc1, %rax
 915         adc     %rdx, $acc4
 916         adc     \$0, $acc0
 917
 918         ##########################################
 919         # Second iteration
 920         mulq    $a_ptr
 921         #xor    $t0, $t0
 922         add     $acc1, $acc2
 923         adc     \$0, %rdx
 924         add     %rax, $acc2
 925         mov     $acc1, %rax
 926         adc     %rdx, $acc3     # doesn't overflow
 927         #adc    \$0, $t0
 928
 929         mulq    $t1
 930         xor     $acc1, $acc1
 931         #add    $t0, $acc4
 932         #adc    \$0, %rdx
 933         add     %rax, $acc4
 934          mov    $acc2, %rax
 935         adc     %rdx, $acc0
 936         adc     \$0, $acc1
 937
 938         ##########################################
 939         # Third iteration
 940         mulq    $a_ptr
 941         #xor    $t0, $t0
 942         add     $acc2, $acc3
 943         adc     \$0, %rdx
 944         add     %rax, $acc3
 945         mov     $acc2, %rax
 946         adc     %rdx, $acc4     # doesn't overflow
 947         #adc    \$0, $t0
 948
 949         mulq    $t1
 950         xor     $acc2, $acc2
 951         #add    $t0, $acc0
 952         #adc    \$0, %rdx
 953         add     %rax, $acc0
 954          mov    $acc3, %rax
 955         adc     %rdx, $acc1
 956         adc     \$0, $acc2
 957
 958         ###########################################
 959         # Last iteration
 960         mulq    $a_ptr
 961         #xor    $t0, $t0
 962         add     $acc3, $acc4
 963         adc     \$0, %rdx
 964         add     %rax, $acc4
 965         mov     $acc3, %rax
 966         adc     %rdx, $acc0     # doesn't overflow
 967         #adc    \$0, $t0
 968
 969         mulq    $t1
 970         xor     $acc3, $acc3
 971         #add    $t0, $acc1
 972         #adc    \$0, %rdx
 973         add     %rax, $acc1
 974         adc     %rdx, $acc2
 975         adc     \$0, $acc3
 976
 977         ############################################
 978         # Add the rest of the acc
 979         add     $acc0, $acc5
 980          mov    $acc4, $acc0
 981         adc     $acc1, $acc6
 982         adc     $acc2, $acc7
 983          mov    $acc5, $acc1
 984         adc     \$0, $acc3
 985
 986         sub     \$-1, $acc4             # .Lpoly[0]
 987          mov    $acc6, $acc2
 988         sbb     $a_ptr, $acc5           # .Lpoly[1]
 989         sbb     \$0, $acc6              # .Lpoly[2]
 990          mov    $acc7, $t0
 991         sbb     $t1, $acc7              # .Lpoly[3]
 992         neg     $acc3
 993
 994         cmovnc  $acc0, $acc4
 995         cmovnc  $acc1, $acc5
 996         mov     $acc4, 8*0($r_ptr)
 997         cmovnc  $acc2, $acc6
 998         mov     $acc5, 8*1($r_ptr)
 999         cmovnc  $t0, $acc7
1000         mov     $acc6, 8*2($r_ptr)
1001         mov     $acc7, 8*3($r_ptr)
1002
1003         ret
1004 .size   __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
1005 ___
1006
1007 if ($addx) {
1008 $code.=<<___;
1009 .type   __ecp_nistz256_mul_montx,\@abi-omnipotent
1010 .align  32
1011 __ecp_nistz256_mul_montx:
             ########################################################################
             # Montgomery multiplication built on mulx/adcx/adox.
             # Every "Multiply by b[i]" step is followed by a reduction step that
             # folds the lowest live accumulator word into the words above it with
             # a 32-bit shift pair plus one multiplication by .Lpoly[3].
             # NOTE(review): on entry rdx presumably holds b[0] and $acc1..$acc4
             # hold a[0..3]; the a operand is read at 128($a_ptr), so the caller
             # appears to bias $a_ptr by -128. The caller is not visible here.
             # The four result words are stored at 8*0..8*3($r_ptr).
1012         ########################################################################
1013         # Multiply by b[0]
1014         mulx    $acc1, $acc0, $acc1
1015         mulx    $acc2, $t0, $acc2
1016         mov     \$32, $poly1
1017         xor     $acc5, $acc5            # cf=0
1018         mulx    $acc3, $t1, $acc3
1019         mov     .Lpoly+8*3(%rip), $poly3
1020         adc     $t0, $acc1
1021         mulx    $acc4, $t0, $acc4
1022          mov    $acc0, %rdx
1023         adc     $t1, $acc2
1024          shlx   $poly1,$acc0,$t1
1025         adc     $t0, $acc3
1026          shrx   $poly1,$acc0,$t0
1027         adc     \$0, $acc4
1028
1029         ########################################################################
1030         # First reduction step
1031         xor     $acc0, $acc0            # $acc0=0,cf=0,of=0
1032         adox    $t1, $acc1
1033         adox    $t0, $acc2
1034
1035         mulx    $poly3, $t0, $t1
1036          mov    8*1($b_ptr), %rdx
1037         adox    $t0, $acc3
1038         adcx    $t1, $acc4
1039
1040         adox    $acc0, $acc4
1041         adcx    $acc0, $acc5            # cf=0
1042         adox    $acc0, $acc5            # of=0
1043
1044         ########################################################################
1045         # Multiply by b[1]
1046         mulx    8*0+128($a_ptr), $t0, $t1
1047         adcx    $t0, $acc1
1048         adox    $t1, $acc2
1049
1050         mulx    8*1+128($a_ptr), $t0, $t1
1051         adcx    $t0, $acc2
1052         adox    $t1, $acc3
1053
1054         mulx    8*2+128($a_ptr), $t0, $t1
1055         adcx    $t0, $acc3
1056         adox    $t1, $acc4
1057
1058         mulx    8*3+128($a_ptr), $t0, $t1
1059          mov    $acc1, %rdx
1060         adcx    $t0, $acc4
1061          shlx   $poly1, $acc1, $t0
1062         adox    $t1, $acc5
1063          shrx   $poly1, $acc1, $t1
1064
1065         adcx    $acc0, $acc5
1066         adox    $acc0, $acc0
1067         adc     \$0, $acc0
1068
1069         ########################################################################
1070         # Second reduction step
1071         xor     $acc1 ,$acc1            # $acc1=0,cf=0,of=0
1072         adox    $t0, $acc2
1073         adox    $t1, $acc3
1074
1075         mulx    $poly3, $t0, $t1
1076          mov    8*2($b_ptr), %rdx
1077         adox    $t0, $acc4
1078         adcx    $t1, $acc5
1079
1080         adox    $acc1, $acc5
1081         adcx    $acc1, $acc0            # cf=0
1082         adox    $acc1, $acc0            # of=0
1083
1084         ########################################################################
1085         # Multiply by b[2]
1086         mulx    8*0+128($a_ptr), $t0, $t1
1087         adcx    $t0, $acc2
1088         adox    $t1, $acc3
1089
1090         mulx    8*1+128($a_ptr), $t0, $t1
1091         adcx    $t0, $acc3
1092         adox    $t1, $acc4
1093
1094         mulx    8*2+128($a_ptr), $t0, $t1
1095         adcx    $t0, $acc4
1096         adox    $t1, $acc5
1097
1098         mulx    8*3+128($a_ptr), $t0, $t1
1099          mov    $acc2, %rdx
1100         adcx    $t0, $acc5
1101          shlx   $poly1, $acc2, $t0
1102         adox    $t1, $acc0
1103          shrx   $poly1, $acc2, $t1
1104
1105         adcx    $acc1, $acc0
1106         adox    $acc1, $acc1
1107         adc     \$0, $acc1
1108
1109         ########################################################################
1110         # Third reduction step
1111         xor     $acc2, $acc2            # $acc2=0,cf=0,of=0
1112         adox    $t0, $acc3
1113         adox    $t1, $acc4
1114
1115         mulx    $poly3, $t0, $t1
1116          mov    8*3($b_ptr), %rdx
1117         adox    $t0, $acc5
1118         adcx    $t1, $acc0
1119
1120         adox    $acc2, $acc0
1121         adcx    $acc2, $acc1            # cf=0
1122         adox    $acc2, $acc1            # of=0
1123
1124         ########################################################################
1125         # Multiply by b[3]
1126         mulx    8*0+128($a_ptr), $t0, $t1
1127         adcx    $t0, $acc3
1128         adox    $t1, $acc4
1129
1130         mulx    8*1+128($a_ptr), $t0, $t1
1131         adcx    $t0, $acc4
1132         adox    $t1, $acc5
1133
1134         mulx    8*2+128($a_ptr), $t0, $t1
1135         adcx    $t0, $acc5
1136         adox    $t1, $acc0
1137
1138         mulx    8*3+128($a_ptr), $t0, $t1
1139          mov    $acc3, %rdx
1140         adcx    $t0, $acc0
1141          shlx   $poly1, $acc3, $t0
1142         adox    $t1, $acc1
1143          shrx   $poly1, $acc3, $t1
1144
1145         adcx    $acc2, $acc1
1146         adox    $acc2, $acc2
1147         adc     \$0, $acc2
1148
1149         ########################################################################
1150         # Fourth reduction step
1151         xor     $acc3, $acc3            # $acc3=0,cf=0,of=0
1152         adox    $t0, $acc4
1153         adox    $t1, $acc5
1154
1155         mulx    $poly3, $t0, $t1
1156          mov    $acc4, $t2
1157         mov     .Lpoly+8*1(%rip), $poly1
             # The carry out of $acc5 is held in OF, so the low product word has
             # to join the OF chain (adox) and the high word starts the CF chain
             # (adcx), exactly as in reduction steps one to three. Adding the low
             # word with adcx would push that carry into $acc1, one word too high.
1158         adox    $t0, $acc0
1159         adcx    $t1, $acc1
1160          mov    $acc5, $t3
1161
1162         adox    $acc3, $acc1            # carry out of $acc0
1163         adcx    $acc3, $acc2            # cf=0
1164         adox    $acc3, $acc2            # of=0
1165          mov    $acc0, $t0
1166
1167         ########################################################################
1168         # Branch-less conditional subtraction of P
             # P is subtracted unconditionally; $t2, $t3, $t0, $t1 keep the
             # unsubtracted words. bt copies bit 0 of the carry word $acc2 into CF
             # and cmovnc restores the saved words when that bit is clear.
1169         xor     %eax, %eax
1170         sbb     \$-1, $acc4             # .Lpoly[0]
1171         sbb     $poly1, $acc5           # .Lpoly[1]
1172         sbb     \$0, $acc0              # .Lpoly[2]
1173          mov    $acc1, $t1
1174         sbb     $poly3, $acc1           # .Lpoly[3]
1175
1176         bt      \$0,$acc2
1177         cmovnc  $t2, $acc4
1178         cmovnc  $t3, $acc5
1179         mov     $acc4, 8*0($r_ptr)
1180         cmovnc  $t0, $acc0
1181         mov     $acc5, 8*1($r_ptr)
1182         cmovnc  $t1, $acc1
1183         mov     $acc0, 8*2($r_ptr)
1184         mov     $acc1, 8*3($r_ptr)
1185
1186         ret
1187 .size   __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
1188
1189 .type   __ecp_nistz256_sqr_montx,\@abi-omnipotent
1190 .align  32
1191 __ecp_nistz256_sqr_montx:
             ########################################################################
             # Montgomery squaring built on mulx/adcx/adox.
             # Per the operand comments below, the implicit mulx source (rdx) is
             # a[0] on entry and $acc6, $acc7, $acc0 hold a[1], a[2], a[3].
             # NOTE(review): these are set up by the caller, which is not visible
             # here. The off-diagonal products are summed, doubled, and combined
             # with the squares of the words read from 128($a_ptr); four reduction
             # steps and a conditional subtraction of .Lpoly follow. The result is
             # stored at 8*0..8*3($r_ptr).
1192         mulx    $acc6, $acc1, $acc2     # a[0]*a[1]
1193         mulx    $acc7, $t0, $acc3       # a[0]*a[2]
1194         xor     %eax, %eax
1195         adc     $t0, $acc2
1196         mulx    $acc0, $t1, $acc4       # a[0]*a[3]
1197          mov    $acc6, %rdx
1198         adc     $t1, $acc3
1199         adc     \$0, $acc4
1200         xor     $acc5, $acc5            # $acc5=0,cf=0,of=0
1201
1202         #################################
1203         mulx    $acc7, $t0, $t1         # a[1]*a[2]
1204         adcx    $t0, $acc3
1205         adox    $t1, $acc4
1206
1207         mulx    $acc0, $t0, $t1         # a[1]*a[3]
1208          mov    $acc7, %rdx
1209         adcx    $t0, $acc4
1210         adox    $t1, $acc5
1211         adc     \$0, $acc5
1212
1213         #################################
             # The adcx chain doubles the off-diagonal sum while the adox chain
             # adds in the squares of the individual words.
1214         mulx    $acc0, $t0, $acc6       # a[2]*a[3]
1215          mov    8*0+128($a_ptr), %rdx
1216         xor     $acc7, $acc7            # $acc7=0,cf=0,of=0
1217          adcx   $acc1, $acc1            # acc1:6<<1
1218         adox    $t0, $acc5
1219          adcx   $acc2, $acc2
1220         adox    $acc7, $acc6            # of=0
1221
1222         mulx    %rdx, $acc0, $t1
1223         mov     8*1+128($a_ptr), %rdx
1224          adcx   $acc3, $acc3
1225         adox    $t1, $acc1
1226          adcx   $acc4, $acc4
1227         mulx    %rdx, $t0, $t4
1228         mov     8*2+128($a_ptr), %rdx
1229          adcx   $acc5, $acc5
1230         adox    $t0, $acc2
1231          adcx   $acc6, $acc6
             # NOTE(review): 0x67 is the address-size prefix byte; presumably
             # emitted here only as padding - confirm.
1232         .byte   0x67
1233         mulx    %rdx, $t0, $t1
1234         mov     8*3+128($a_ptr), %rdx
1235         adox    $t4, $acc3
1236          adcx   $acc7, $acc7
1237         adox    $t0, $acc4
             # All words of a have been loaded; $a_ptr is reused as the shift
             # count for shlx/shrx from here on.
1238          mov    \$32, $a_ptr
1239         adox    $t1, $acc5
1240         .byte   0x67,0x67
1241         mulx    %rdx, $t0, $t4
1242          mov    $acc0, %rdx
1243         adox    $t0, $acc6
1244          shlx   $a_ptr, $acc0, $t0
1245         adox    $t4, $acc7
1246          shrx   $a_ptr, $acc0, $t4
1247          mov    .Lpoly+8*3(%rip), $t1
1248
1249         # reduction step 1
1250         xor     $acc0, $acc0
1251         adcx    $t0, $acc1
1252         adcx    $t4, $acc2
1253
1254         mulx    $t1, $t0, $t4
1255          mov    $acc1, %rdx
1256         adcx    $t0, $acc3
1257          shlx   $a_ptr, $acc1, $t0
1258         adox    $t4, $acc0
1259          shrx   $a_ptr, $acc1, $t4
1260         adc     \$0, $acc0
1261
1262         # reduction step 2
1263         xor     $acc1, $acc1
1264         adcx    $t0, $acc2
1265         adcx    $t4, $acc3
1266
1267         mulx    $t1, $t0, $t4
1268          mov    $acc2, %rdx
1269         adcx    $t0, $acc0
1270          shlx   $a_ptr, $acc2, $t0
1271         adox    $t4, $acc1
1272          shrx   $a_ptr, $acc2, $t4
1273         adc     \$0, $acc1
1274
1275         # reduction step 3
1276         xor     $acc2, $acc2
1277         adcx    $t0, $acc3
1278         adcx    $t4, $acc0
1279
1280         mulx    $t1, $t0, $t4
1281          mov    $acc3, %rdx
1282         adcx    $t0, $acc1
1283          shlx   $a_ptr, $acc3, $t0
1284         adox    $t4, $acc2
1285          shrx   $a_ptr, $acc3, $t4
1286         adc     \$0, $acc2
1287
1288         # reduction step 4
1289         xor     $acc3, $acc3
1290         adcx    $t0, $acc0
1291         adcx    $t4, $acc1
1292
1293         mulx    $t1, $t0, $t4
1294         adcx    $t0, $acc2
1295         adox    $t4, $acc3
1296         adc     \$0, $acc3
1297
             # $acc0..$acc3 now hold the reduced lower half; add it to the upper
             # half in $acc4..$acc7 and collect the final carry in $t3. The mov
             # instructions save the sum words before P is subtracted from them.
1298         xor     $t3, $t3                # cf=0
1299         adc     $acc0, $acc4            # accumulate upper half
1300          mov    .Lpoly+8*1(%rip), $a_ptr
1301         adc     $acc1, $acc5
1302          mov    $acc4, $acc0
1303         adc     $acc2, $acc6
1304         adc     $acc3, $acc7
1305          mov    $acc5, $acc1
1306         adc     \$0, $t3
1307
             # Subtract P unconditionally, then restore the saved words with
             # cmovnc when bit 0 of the carry word $t3 is clear.
1308         xor     %eax, %eax              # cf=0
1309         sbb     \$-1, $acc4             # .Lpoly[0]
1310          mov    $acc6, $acc2
1311         sbb     $a_ptr, $acc5           # .Lpoly[1]
1312         sbb     \$0, $acc6              # .Lpoly[2]
1313          mov    $acc7, $acc3
1314         sbb     $t1, $acc7              # .Lpoly[3]
1315
1316         bt      \$0,$t3
1317         cmovnc  $acc0, $acc4
1318         cmovnc  $acc1, $acc5
1319         mov     $acc4, 8*0($r_ptr)
1320         cmovnc  $acc2, $acc6
1321         mov     $acc5, 8*1($r_ptr)
1322         cmovnc  $acc3, $acc7
1323         mov     $acc6, 8*2($r_ptr)
1324         mov     $acc7, 8*3($r_ptr)
1325
1326         ret
1327 .size   __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
1328 ___
1329 }
1330 }
1331 {
1332 my ($r_ptr,$in_ptr)=("%rdi","%rsi");
1333 my ($acc0,$acc1,$acc2,$acc3,$acc4)=map("%r$_",(8..12));
     # $t1 shares %rsi with $in_ptr; it is only written by the last load that
     # goes through $in_ptr.
1334 my ($t0,$t1)=("%rcx","%rsi");
1335
1336 $code.=<<___;
1337 ################################################################################
1338 # void ecp_nistz256_from_mont(
1339 #   uint64_t res[4],
1340 #   uint64_t in[4]);
1341 # This one performs Montgomery multiplication by 1, so we only need the reduction
     #
     # Four reduction iterations follow. Each one multiplies the lowest live
     # accumulator word by .Lpoly[1] and .Lpoly[3] and adds the products, plus the
     # word itself, into the words above it. Afterwards .Lpoly is subtracted under
     # a mask derived from the final carry and the four result words are stored
     # to res.
1342
1343 .globl  ecp_nistz256_from_mont
1344 .type   ecp_nistz256_from_mont,\@function,2
1345 .align  32
1346 ecp_nistz256_from_mont:
             # r12 is used as $acc4 below; r13 is saved and restored but is not
             # otherwise referenced in this function.
1347         push    %r12
1348         push    %r13
1349
1350         mov     8*0($in_ptr), %rax
1351         mov     8*1($in_ptr), $acc1
1352         mov     8*2($in_ptr), $acc2
1353         mov     8*3($in_ptr), $acc3
             # All input words are loaded; $in_ptr now points at .Lpoly.
1354         lea     .Lpoly(%rip), $in_ptr
1355         xor     $acc4, $acc4
1356         mov     %rax, $acc0
1357
1358         #########################################
1359         # First iteration
1360         mulq    1*8($in_ptr)
1361         xor     $t0, $t0
1362         add     $acc0, $acc1
1363         adc     \$0, %rdx
1364         add     %rax, $acc1
1365         mov     $acc0, %rax
1366         adc     %rdx, $acc2
1367         adc     \$0, $t0
1368
1369         mulq    3*8($in_ptr)
1370         xor     $acc0, $acc0
1371         add     $t0, $acc3
1372         adc     \$0, %rdx
1373         add     %rax, $acc3
1374          mov    $acc1, %rax
1375         adc     %rdx, $acc4
1376         adc     \$0, $acc0
1377
1378         #########################################
1379         # Second iteration
1380         mulq    1*8($in_ptr)
1381         xor     $t0, $t0
1382         add     $acc1, $acc2
1383         adc     \$0, %rdx
1384         add     %rax, $acc2
1385         mov     $acc1, %rax
1386         adc     %rdx, $acc3
1387         adc     \$0, $t0
1388
1389         mulq    3*8($in_ptr)
1390         xor     $acc1, $acc1
1391         add     $t0, $acc4
1392         adc     \$0, %rdx
1393         add     %rax, $acc4
1394          mov    $acc2, %rax
1395         adc     %rdx, $acc0
1396         adc     \$0, $acc1
1397
1398         ##########################################
1399         # Third iteration
1400         mulq    1*8($in_ptr)
1401         xor     $t0, $t0
1402         add     $acc2, $acc3
1403         adc     \$0, %rdx
1404         add     %rax, $acc3
1405         mov     $acc2, %rax
1406         adc     %rdx, $acc4
1407         adc     \$0, $t0
1408
1409         mulq    3*8($in_ptr)
1410         xor     $acc2, $acc2
1411         add     $t0, $acc0
1412         adc     \$0, %rdx
1413         add     %rax, $acc0
1414          mov    $acc3, %rax
1415         adc     %rdx, $acc1
1416         adc     \$0, $acc2
1417
1418         ###########################################
1419         # Last iteration
1420         mulq    1*8($in_ptr)
1421         xor     $t0, $t0
1422         add     $acc3, $acc4
1423         adc     \$0, %rdx
1424         add     %rax, $acc4
1425         mov     $acc3, %rax
1426         adc     %rdx, $acc0
1427         adc     \$0, $t0
1428
1429         mulq    3*8($in_ptr)
1430         add     $t0, $acc1
1431         adc     \$0, %rdx
1432         add     %rax, $acc1
1433         adc     %rdx, $acc2
             # $acc3 becomes an all-ones mask when the last addition carried out,
             # and zero otherwise.
1434         sbb     $acc3, $acc3
1435
1436         mov     0*8($in_ptr), %rax
1437         mov     1*8($in_ptr), %rdx
1438         mov     2*8($in_ptr), $t0
1439         mov     3*8($in_ptr), $t1
1440
             # Keep the words of .Lpoly only when the mask is set, so the
             # subtraction below is a no-op otherwise.
1441         and     $acc3, %rax
1442         and     $acc3, %rdx
1443         and     $acc3, $t0
1444         and     $acc3, $t1
1445
1446         sub     %rax, $acc4
1447         sbb     %rdx, $acc0
1448         mov     $acc4, 8*0($r_ptr)
1449         sbb     $t0, $acc1
1450         mov     $acc0, 8*1($r_ptr)
1451         sbb     $t1, $acc2
1452         mov     $acc1, 8*2($r_ptr)
1453         mov     $acc2, 8*3($r_ptr)
1454
1455         pop     %r13
1456         pop     %r12
1457         ret
1458 .size   ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
1459 ___
1460 }
1461 {
1462 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
     # Register names for the SSE scatter/gather routines in this scope:
     # xmm0-xmm7 hold the increment, the broadcast index and the result
     # accumulators; xmm8-xmm15 hold the running counter, the loaded table words
     # and the comparison mask.
     # A third list that aliased xmm8-xmm15 a second time was dropped: it named
     # $T2a, $T2b and $TMP2 twice within a single "my" and none of its names are
     # referenced anywhere in this scope.
1463 my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
1464 my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
1466
1467 $code.=<<___;
1468 ################################################################################
1469 # void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
     # Copies the 96 bytes at in_t into the table at val, at byte offset
     # (index-1)*96, i.e. the index is 1-based.
     # The loads and stores use movdqa, so both addresses must be 16-byte aligned.
1470 .globl  ecp_nistz256_scatter_w5
1471 .type   ecp_nistz256_scatter_w5,\@abi-omnipotent
1472 .align  32
1473 ecp_nistz256_scatter_w5:
             # index = 3*index - 3; the shift by 5 below turns it into (index-1)*96
1474         lea     -3($index,$index,2), $index
1475         movdqa  0x00($in_t), %xmm0
1476         shl     \$5, $index
1477         movdqa  0x10($in_t), %xmm1
1478         movdqa  0x20($in_t), %xmm2
1479         movdqa  0x30($in_t), %xmm3
1480         movdqa  0x40($in_t), %xmm4
1481         movdqa  0x50($in_t), %xmm5
1482         movdqa  %xmm0, 0x00($val,$index)
1483         movdqa  %xmm1, 0x10($val,$index)
1484         movdqa  %xmm2, 0x20($val,$index)
1485         movdqa  %xmm3, 0x30($val,$index)
1486         movdqa  %xmm4, 0x40($val,$index)
1487         movdqa  %xmm5, 0x50($val,$index)
1488
1489         ret
1490 .size   ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1491
1492 ################################################################################
1493 # void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
     # Reads all 16 entries (96 bytes each) of the table at in_t and writes to val
     # the one whose running counter equals index. The selection is done with
     # compare masks (pcmpeqd/pand/por), so every entry is loaded no matter which
     # index is requested.
     # NOTE(review): the counter starts from .LOne, whose contents are not visible
     # here; presumably it is 1 in every dword, which makes the index 1-based and
     # yields an all-zero result for index 0 - confirm.
1494 .globl  ecp_nistz256_gather_w5
1495 .type   ecp_nistz256_gather_w5,\@abi-omnipotent
1496 .align  32
1497 ecp_nistz256_gather_w5:
1498 ___
     # When AVX2 code is generated, test bit 5 of the third OPENSSL_ia32cap_P
     # dword at run time and tail-jump to the AVX2 variant if it is set.
1499 $code.=<<___    if ($avx>1);
1500         mov     OPENSSL_ia32cap_P+8(%rip), %eax
1501         test    \$`1<<5`, %eax
1502         jnz     .Lavx2_gather_w5
1503 ___
     # Win64 prologue: reserve 0xa8 bytes of stack and save xmm6-xmm15, which
     # are callee-saved under the Win64 ABI. The instructions are emitted as raw
     # bytes; NOTE(review): presumably so their encodings stay fixed for the
     # unwind data tied to the .LSEH_begin label - confirm.
1504 $code.=<<___    if ($win64);
1505         lea     -0x88(%rsp), %rax
1506 .LSEH_begin_ecp_nistz256_gather_w5:
1507         .byte   0x48,0x8d,0x60,0xe0             #lea    -0x20(%rax), %rsp
1508         .byte   0x0f,0x29,0x70,0xe0             #movaps %xmm6, -0x20(%rax)
1509         .byte   0x0f,0x29,0x78,0xf0             #movaps %xmm7, -0x10(%rax)
1510         .byte   0x44,0x0f,0x29,0x00             #movaps %xmm8, 0(%rax)
1511         .byte   0x44,0x0f,0x29,0x48,0x10        #movaps %xmm9, 0x10(%rax)
1512         .byte   0x44,0x0f,0x29,0x50,0x20        #movaps %xmm10, 0x20(%rax)
1513         .byte   0x44,0x0f,0x29,0x58,0x30        #movaps %xmm11, 0x30(%rax)
1514         .byte   0x44,0x0f,0x29,0x60,0x40        #movaps %xmm12, 0x40(%rax)
1515         .byte   0x44,0x0f,0x29,0x68,0x50        #movaps %xmm13, 0x50(%rax)
1516         .byte   0x44,0x0f,0x29,0x70,0x60        #movaps %xmm14, 0x60(%rax)
1517         .byte   0x44,0x0f,0x29,0x78,0x70        #movaps %xmm15, 0x70(%rax)
1518 ___
1519 $code.=<<___;
1520         movdqa  .LOne(%rip), $ONE
1521         movd    $index, $INDEX
1522
             # Clear the six 16-byte result accumulators.
1523         pxor    $Ra, $Ra
1524         pxor    $Rb, $Rb
1525         pxor    $Rc, $Rc
1526         pxor    $Rd, $Rd
1527         pxor    $Re, $Re
1528         pxor    $Rf, $Rf
1529
1530         movdqa  $ONE, $M0
             # Broadcast the index to all four dwords.
1531         pshufd  \$0, $INDEX, $INDEX
1532
1533         mov     \$16, %rax
1534 .Lselect_loop_sse_w5:
1535
             # The mask is all-ones when the current counter equals the index;
             # the counter is then advanced for the next entry.
1536         movdqa  $M0, $TMP0
1537         paddd   $ONE, $M0
1538         pcmpeqd $INDEX, $TMP0
1539
1540         movdqa  16*0($in_t), $T0a
1541         movdqa  16*1($in_t), $T0b
1542         movdqa  16*2($in_t), $T0c
1543         movdqa  16*3($in_t), $T0d
1544         movdqa  16*4($in_t), $T0e
1545         movdqa  16*5($in_t), $T0f
1546         lea 16*6($in_t), $in_t
1547
             # Mask the loaded entry and merge it into the accumulators.
1548         pand    $TMP0, $T0a
1549         pand    $TMP0, $T0b
1550         por     $T0a, $Ra
1551         pand    $TMP0, $T0c
1552         por     $T0b, $Rb
1553         pand    $TMP0, $T0d
1554         por     $T0c, $Rc
1555         pand    $TMP0, $T0e
1556         por     $T0d, $Rd
1557         pand    $TMP0, $T0f
1558         por     $T0e, $Re
1559         por     $T0f, $Rf
1560
1561         dec     %rax
1562         jnz     .Lselect_loop_sse_w5
1563
             # The result is written with unaligned stores.
1564         movdqu  $Ra, 16*0($val)
1565         movdqu  $Rb, 16*1($val)
1566         movdqu  $Rc, 16*2($val)
1567         movdqu  $Rd, 16*3($val)
1568         movdqu  $Re, 16*4($val)
1569         movdqu  $Rf, 16*5($val)
1570 ___
     # Win64 epilogue: restore xmm6-xmm15 and release the 0xa8-byte frame.
1571 $code.=<<___    if ($win64);
1572         movaps  (%rsp), %xmm6
1573         movaps  0x10(%rsp), %xmm7
1574         movaps  0x20(%rsp), %xmm8
1575         movaps  0x30(%rsp), %xmm9
1576         movaps  0x40(%rsp), %xmm10
1577         movaps  0x50(%rsp), %xmm11
1578         movaps  0x60(%rsp), %xmm12
1579         movaps  0x70(%rsp), %xmm13
1580         movaps  0x80(%rsp), %xmm14
1581         movaps  0x90(%rsp), %xmm15
1582         lea     0xa8(%rsp), %rsp
1583 .LSEH_end_ecp_nistz256_gather_w5:
1584 ___
1585 $code.=<<___;
1586         ret
1587 .size   ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1588
1589 ################################################################################
1590 # void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
     # Copies the 64 bytes at in_t into the table at val, at byte offset index*64.
     # The source is read with unaligned loads (movdqu); the stores use movdqa, so
     # the destination address must be 16-byte aligned.
1591 .globl  ecp_nistz256_scatter_w7
1592 .type   ecp_nistz256_scatter_w7,\@abi-omnipotent
1593 .align  32
1594 ecp_nistz256_scatter_w7:
1595         movdqu  0x00($in_t), %xmm0
             # index *= 64, the size of one table entry
1596         shl     \$6, $index
1597         movdqu  0x10($in_t), %xmm1
1598         movdqu  0x20($in_t), %xmm2
1599         movdqu  0x30($in_t), %xmm3
1600         movdqa  %xmm0, 0x00($val,$index)
1601         movdqa  %xmm1, 0x10($val,$index)
1602         movdqa  %xmm2, 0x20($val,$index)
1603         movdqa  %xmm3, 0x30($val,$index)
1604
1605         ret
1606 .size   ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1607
1608 ################################################################################
1609 # void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
     # Reads all 64 entries (64 bytes each) of the table at in_t and writes to val
     # the one whose running counter equals index. The selection is done with
     # compare masks (pcmpeqd/pand/por), so every entry is loaded no matter which
     # index is requested.
     # NOTE(review): the counter starts from .LOne, whose contents are not visible
     # here; presumably it is 1 in every dword, so index 0 selects nothing and the
     # result is all zero - confirm.
1610 .globl  ecp_nistz256_gather_w7
1611 .type   ecp_nistz256_gather_w7,\@abi-omnipotent
1612 .align  32
1613 ecp_nistz256_gather_w7:
1614 ___
     # When AVX2 code is generated, test bit 5 of the third OPENSSL_ia32cap_P
     # dword at run time and tail-jump to the AVX2 variant if it is set.
1615 $code.=<<___    if ($avx>1);
1616         mov     OPENSSL_ia32cap_P+8(%rip), %eax
1617         test    \$`1<<5`, %eax
1618         jnz     .Lavx2_gather_w7
1619 ___
     # Win64 prologue: reserve 0xa8 bytes of stack and save xmm6-xmm15, which
     # are callee-saved under the Win64 ABI. The instructions are emitted as raw
     # bytes; NOTE(review): presumably so their encodings stay fixed for the
     # unwind data tied to the .LSEH_begin label - confirm.
1620 $code.=<<___    if ($win64);
1621         lea     -0x88(%rsp), %rax
1622 .LSEH_begin_ecp_nistz256_gather_w7:
1623         .byte   0x48,0x8d,0x60,0xe0             #lea    -0x20(%rax), %rsp
1624         .byte   0x0f,0x29,0x70,0xe0             #movaps %xmm6, -0x20(%rax)
1625         .byte   0x0f,0x29,0x78,0xf0             #movaps %xmm7, -0x10(%rax)
1626         .byte   0x44,0x0f,0x29,0x00             #movaps %xmm8, 0(%rax)
1627         .byte   0x44,0x0f,0x29,0x48,0x10        #movaps %xmm9, 0x10(%rax)
1628         .byte   0x44,0x0f,0x29,0x50,0x20        #movaps %xmm10, 0x20(%rax)
1629         .byte   0x44,0x0f,0x29,0x58,0x30        #movaps %xmm11, 0x30(%rax)
1630         .byte   0x44,0x0f,0x29,0x60,0x40        #movaps %xmm12, 0x40(%rax)
1631         .byte   0x44,0x0f,0x29,0x68,0x50        #movaps %xmm13, 0x50(%rax)
1632         .byte   0x44,0x0f,0x29,0x70,0x60        #movaps %xmm14, 0x60(%rax)
1633         .byte   0x44,0x0f,0x29,0x78,0x70        #movaps %xmm15, 0x70(%rax)
1634 ___
1635 $code.=<<___;
1636         movdqa  .LOne(%rip), $M0
1637         movd    $index, $INDEX
1638
             # Clear the four 16-byte result accumulators.
1639         pxor    $Ra, $Ra
1640         pxor    $Rb, $Rb
1641         pxor    $Rc, $Rc
1642         pxor    $Rd, $Rd
1643
1644         movdqa  $M0, $ONE
             # Broadcast the index to all four dwords.
1645         pshufd  \$0, $INDEX, $INDEX
1646         mov     \$64, %rax
1647
1648 .Lselect_loop_sse_w7:
             # The mask is all-ones when the current counter equals the index;
             # the counter is then advanced for the next entry.
1649         movdqa  $M0, $TMP0
1650         paddd   $ONE, $M0
1651         movdqa  16*0($in_t), $T0a
1652         movdqa  16*1($in_t), $T0b
1653         pcmpeqd $INDEX, $TMP0
1654         movdqa  16*2($in_t), $T0c
1655         movdqa  16*3($in_t), $T0d
1656         lea     16*4($in_t), $in_t
1657
1658         pand    $TMP0, $T0a
1659         pand    $TMP0, $T0b
1660         por     $T0a, $Ra
1661         pand    $TMP0, $T0c
1662         por     $T0b, $Rb
1663         pand    $TMP0, $T0d
1664         por     $T0c, $Rc
             # Prefetch table data ahead of the already advanced pointer.
1665         prefetcht0      255($in_t)
1666         por     $T0d, $Rd
1667
1668         dec     %rax
1669         jnz     .Lselect_loop_sse_w7
1670
             # The result is written with unaligned stores.
1671         movdqu  $Ra, 16*0($val)
1672         movdqu  $Rb, 16*1($val)
1673         movdqu  $Rc, 16*2($val)
1674         movdqu  $Rd, 16*3($val)
1675 ___
     # Win64 epilogue: restore xmm6-xmm15 and release the 0xa8-byte frame.
1676 $code.=<<___    if ($win64);
1677         movaps  (%rsp), %xmm6
1678         movaps  0x10(%rsp), %xmm7
1679         movaps  0x20(%rsp), %xmm8
1680         movaps  0x30(%rsp), %xmm9
1681         movaps  0x40(%rsp), %xmm10
1682         movaps  0x50(%rsp), %xmm11
1683         movaps  0x60(%rsp), %xmm12
1684         movaps  0x70(%rsp), %xmm13
1685         movaps  0x80(%rsp), %xmm14
1686         movaps  0x90(%rsp), %xmm15
1687         lea     0xa8(%rsp), %rsp
1688 .LSEH_end_ecp_nistz256_gather_w7:
1689 ___
1690 $code.=<<___;
1691         ret
1692 .size   ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
1693 ___
1693 ___
1694 }
1695 if ($avx>1) {
     # AVX2 variant of the w5 gather, emitted only when AVX2 code generation is
     # enabled. It is entered through the .Lavx2_gather_w5 label, which the SSE
     # routine jumps to; the symbol itself is not declared .globl.
1696 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1697 my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
1698 my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
1699 my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
1700
1701 $code.=<<___;
1702 ################################################################################
1703 # void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
     # Processes two 96-byte table entries per iteration, for 8 iterations
     # (16 entries in total). Each entry is masked by comparing its counter with
     # the broadcast index and merged into three 32-byte accumulators, which are
     # stored to val. Every entry is loaded no matter which index is requested.
     # The table is read with vmovdqa, so in_t must be 32-byte aligned.
1704 .type   ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
1705 .align  32
1706 ecp_nistz256_avx2_gather_w5:
1707 .Lavx2_gather_w5:
1708         vzeroupper
1709 ___
     # Win64 prologue: reserve 0xa8 bytes of stack and save xmm6-xmm15, which
     # are callee-saved under the Win64 ABI. The instructions are emitted as raw
     # bytes; NOTE(review): presumably so their encodings stay fixed for the
     # unwind data tied to the .LSEH_begin label - confirm.
1710 $code.=<<___    if ($win64);
1711         lea     -0x88(%rsp), %rax
1712 .LSEH_begin_ecp_nistz256_avx2_gather_w5:
1713         .byte   0x48,0x8d,0x60,0xe0             #lea    -0x20(%rax), %rsp
1714         .byte   0xc5,0xf8,0x29,0x70,0xe0        #vmovaps %xmm6, -0x20(%rax)
1715         .byte   0xc5,0xf8,0x29,0x78,0xf0        #vmovaps %xmm7, -0x10(%rax)
1716         .byte   0xc5,0x78,0x29,0x40,0x00        #vmovaps %xmm8, 0(%rax)
1717         .byte   0xc5,0x78,0x29,0x48,0x10        #vmovaps %xmm9, 0x10(%rax)
1718         .byte   0xc5,0x78,0x29,0x50,0x20        #vmovaps %xmm10, 0x20(%rax)
1719         .byte   0xc5,0x78,0x29,0x58,0x30        #vmovaps %xmm11, 0x30(%rax)
1720         .byte   0xc5,0x78,0x29,0x60,0x40        #vmovaps %xmm12, 0x40(%rax)
1721         .byte   0xc5,0x78,0x29,0x68,0x50        #vmovaps %xmm13, 0x50(%rax)
1722         .byte   0xc5,0x78,0x29,0x70,0x60        #vmovaps %xmm14, 0x60(%rax)
1723         .byte   0xc5,0x78,0x29,0x78,0x70        #vmovaps %xmm15, 0x70(%rax)
1724 ___
1725 $code.=<<___;
1726         vmovdqa .LTwo(%rip), $TWO
1727
1728         vpxor   $Ra, $Ra, $Ra
1729         vpxor   $Rb, $Rb, $Rb
1730         vpxor   $Rc, $Rc, $Rc
1731
             # Two counters, one per entry handled in each iteration; both are
             # advanced by the vector loaded from .LTwo.
1732         vmovdqa .LOne(%rip), $M0
1733         vmovdqa .LTwo(%rip), $M1
1734
             # xmm1 is the low half of $INDEX. vpermd uses the all-zero $Ra as
             # the index vector, which copies dword 0 to every lane.
1735         vmovd   $index, %xmm1
1736         vpermd  $INDEX, $Ra, $INDEX
1737
1738         mov     \$8, %rax
1739 .Lselect_loop_avx2_w5:
1740
1741         vmovdqa 32*0($in_t), $T0a
1742         vmovdqa 32*1($in_t), $T0b
1743         vmovdqa 32*2($in_t), $T0c
1744
1745         vmovdqa 32*3($in_t), $T1a
1746         vmovdqa 32*4($in_t), $T1b
1747         vmovdqa 32*5($in_t), $T1c
1748
1749         vpcmpeqd        $INDEX, $M0, $TMP0
1750         vpcmpeqd        $INDEX, $M1, $TMP1
1751
1752         vpaddd  $TWO, $M0, $M0
1753         vpaddd  $TWO, $M1, $M1
1754         lea     32*6($in_t), $in_t
1755
1756         vpand   $TMP0, $T0a, $T0a
1757         vpand   $TMP0, $T0b, $T0b
1758         vpand   $TMP0, $T0c, $T0c
1759         vpand   $TMP1, $T1a, $T1a
1760         vpand   $TMP1, $T1b, $T1b
1761         vpand   $TMP1, $T1c, $T1c
1762
             # Merge the masked entries into the accumulators.
1763         vpxor   $T0a, $Ra, $Ra
1764         vpxor   $T0b, $Rb, $Rb
1765         vpxor   $T0c, $Rc, $Rc
1766         vpxor   $T1a, $Ra, $Ra
1767         vpxor   $T1b, $Rb, $Rb
1768         vpxor   $T1c, $Rc, $Rc
1769
1770         dec %rax
1771         jnz .Lselect_loop_avx2_w5
1772
1773         vmovdqu $Ra, 32*0($val)
1774         vmovdqu $Rb, 32*1($val)
1775         vmovdqu $Rc, 32*2($val)
1776         vzeroupper
1777 ___
     # Win64 epilogue: restore xmm6-xmm15 and release the 0xa8-byte frame.
1778 $code.=<<___    if ($win64);
1779         movaps  (%rsp), %xmm6
1780         movaps  0x10(%rsp), %xmm7
1781         movaps  0x20(%rsp), %xmm8
1782         movaps  0x30(%rsp), %xmm9
1783         movaps  0x40(%rsp), %xmm10
1784         movaps  0x50(%rsp), %xmm11
1785         movaps  0x60(%rsp), %xmm12
1786         movaps  0x70(%rsp), %xmm13
1787         movaps  0x80(%rsp), %xmm14
1788         movaps  0x90(%rsp), %xmm15
1789         lea     0xa8(%rsp), %rsp
1790 .LSEH_end_ecp_nistz256_avx2_gather_w5:
1791 ___
1792 $code.=<<___;
1793         ret
1794 .size   ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
1795 ___
1796 }
1797 if ($avx>1) {
     # AVX2 variant of the w7 gather. When AVX2 code generation is disabled, the
     # else branch at the end emits a stub under the same global symbol instead.
1798 my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1799 my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
1800 my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
1801 my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
1802 my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
1803
1804 $code.=<<___;
1805
1806 ################################################################################
1807 # void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
     # Processes three 64-byte table entries per iteration, for 21 iterations,
     # plus one more entry after the loop (64 entries in total). Each entry is
     # masked by comparing its counter with the broadcast index and merged into
     # two 32-byte accumulators, which are stored to val. Every entry is loaded
     # no matter which index is requested.
     # The table is read with vmovdqa, so in_t must be 32-byte aligned.
1808 .globl  ecp_nistz256_avx2_gather_w7
1809 .type   ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
1810 .align  32
1811 ecp_nistz256_avx2_gather_w7:
1812 .Lavx2_gather_w7:
1813         vzeroupper
1814 ___
     # Win64 prologue: reserve 0xa8 bytes of stack and save xmm6-xmm15, which
     # are callee-saved under the Win64 ABI. The instructions are emitted as raw
     # bytes; NOTE(review): presumably so their encodings stay fixed for the
     # unwind data tied to the .LSEH_begin label - confirm.
1815 $code.=<<___    if ($win64);
1816         lea     -0x88(%rsp), %rax
1817 .LSEH_begin_ecp_nistz256_avx2_gather_w7:
1818         .byte   0x48,0x8d,0x60,0xe0             #lea    -0x20(%rax), %rsp
1819         .byte   0xc5,0xf8,0x29,0x70,0xe0        #vmovaps %xmm6, -0x20(%rax)
1820         .byte   0xc5,0xf8,0x29,0x78,0xf0        #vmovaps %xmm7, -0x10(%rax)
1821         .byte   0xc5,0x78,0x29,0x40,0x00        #vmovaps %xmm8, 0(%rax)
1822         .byte   0xc5,0x78,0x29,0x48,0x10        #vmovaps %xmm9, 0x10(%rax)
1823         .byte   0xc5,0x78,0x29,0x50,0x20        #vmovaps %xmm10, 0x20(%rax)
1824         .byte   0xc5,0x78,0x29,0x58,0x30        #vmovaps %xmm11, 0x30(%rax)
1825         .byte   0xc5,0x78,0x29,0x60,0x40        #vmovaps %xmm12, 0x40(%rax)
1826         .byte   0xc5,0x78,0x29,0x68,0x50        #vmovaps %xmm13, 0x50(%rax)
1827         .byte   0xc5,0x78,0x29,0x70,0x60        #vmovaps %xmm14, 0x60(%rax)
1828         .byte   0xc5,0x78,0x29,0x78,0x70        #vmovaps %xmm15, 0x70(%rax)
1829 ___
1830 $code.=<<___;
1831         vmovdqa .LThree(%rip), $THREE
1832
1833         vpxor   $Ra, $Ra, $Ra
1834         vpxor   $Rb, $Rb, $Rb
1835
             # Three counters, one per entry handled in each iteration; all are
             # advanced by the vector loaded from .LThree.
1836         vmovdqa .LOne(%rip), $M0
1837         vmovdqa .LTwo(%rip), $M1
1838         vmovdqa .LThree(%rip), $M2
1839
             # xmm1 is the low half of $INDEX. vpermd uses the all-zero $Ra as
             # the index vector, which copies dword 0 to every lane.
1840         vmovd   $index, %xmm1
1841         vpermd  $INDEX, $Ra, $INDEX
1842         # Skip index = 0, because it is implicitly the point at infinity
1843
1844         mov     \$21, %rax
1845 .Lselect_loop_avx2_w7:
1846
1847         vmovdqa 32*0($in_t), $T0a
1848         vmovdqa 32*1($in_t), $T0b
1849
1850         vmovdqa 32*2($in_t), $T1a
1851         vmovdqa 32*3($in_t), $T1b
1852
1853         vmovdqa 32*4($in_t), $T2a
1854         vmovdqa 32*5($in_t), $T2b
1855
1856         vpcmpeqd        $INDEX, $M0, $TMP0
1857         vpcmpeqd        $INDEX, $M1, $TMP1
1858         vpcmpeqd        $INDEX, $M2, $TMP2
1859
1860         vpaddd  $THREE, $M0, $M0
1861         vpaddd  $THREE, $M1, $M1
1862         vpaddd  $THREE, $M2, $M2
1863         lea     32*6($in_t), $in_t
1864
1865         vpand   $TMP0, $T0a, $T0a
1866         vpand   $TMP0, $T0b, $T0b
1867         vpand   $TMP1, $T1a, $T1a
1868         vpand   $TMP1, $T1b, $T1b
1869         vpand   $TMP2, $T2a, $T2a
1870         vpand   $TMP2, $T2b, $T2b
1871
             # Merge the masked entries into the accumulators.
1872         vpxor   $T0a, $Ra, $Ra
1873         vpxor   $T0b, $Rb, $Rb
1874         vpxor   $T1a, $Ra, $Ra
1875         vpxor   $T1b, $Rb, $Rb
1876         vpxor   $T2a, $Ra, $Ra
1877         vpxor   $T2b, $Rb, $Rb
1878
1879         dec %rax
1880         jnz .Lselect_loop_avx2_w7
1881
1882
             # The loop covered 63 entries; handle the one remaining entry with
             # the first counter only.
1883         vmovdqa 32*0($in_t), $T0a
1884         vmovdqa 32*1($in_t), $T0b
1885
1886         vpcmpeqd        $INDEX, $M0, $TMP0
1887
1888         vpand   $TMP0, $T0a, $T0a
1889         vpand   $TMP0, $T0b, $T0b
1890
1891         vpxor   $T0a, $Ra, $Ra
1892         vpxor   $T0b, $Rb, $Rb
1893
1894         vmovdqu $Ra, 32*0($val)
1895         vmovdqu $Rb, 32*1($val)
1896         vzeroupper
1897 ___
     # Win64 epilogue: restore xmm6-xmm15 and release the 0xa8-byte frame.
1898 $code.=<<___    if ($win64);
1899         movaps  (%rsp), %xmm6
1900         movaps  0x10(%rsp), %xmm7
1901         movaps  0x20(%rsp), %xmm8
1902         movaps  0x30(%rsp), %xmm9
1903         movaps  0x40(%rsp), %xmm10
1904         movaps  0x50(%rsp), %xmm11
1905         movaps  0x60(%rsp), %xmm12
1906         movaps  0x70(%rsp), %xmm13
1907         movaps  0x80(%rsp), %xmm14
1908         movaps  0x90(%rsp), %xmm15
1909         lea     0xa8(%rsp), %rsp
1910 .LSEH_end_ecp_nistz256_avx2_gather_w7:
1911 ___
1912 $code.=<<___;
1913         ret
1914 .size   ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
1915 ___
1916 } else {
     # No AVX2 code generation: still define the global symbol, but make it trap
     # with ud2 if it is ever called.
1917 $code.=<<___;
1918 .globl  ecp_nistz256_avx2_gather_w7
1919 .type   ecp_nistz256_avx2_gather_w7,\@function,3
1920 .align  32
1921 ecp_nistz256_avx2_gather_w7:
1922         .byte   0x0f,0x0b       # ud2
1923         ret
1924 .size   ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
1925 ___
1926 }
1927 {{{
1928 ########################################################################
1929 # This block implements higher level point_double, point_add and
1930 # point_add_affine. The key to performance in this case is to allow
1931 # out-of-order execution logic to overlap computations from next step
1932 # with tail processing from current step. By using tailored calling
1933 # sequence we minimize inter-step overhead to give processor better
1934 # shot at overlapping operations...
1935 #
1936 # You will notice that input data is copied to stack. Trouble is that
1937 # there are no registers to spare for holding original pointers and
1938 # reloading the pointers would create undesired dependencies on
1939 # effective address calculation paths. In other words, it's all done
1940 # to favour out-of-order execution logic.
1941 #                                               <appro@openssl.org>
1942
1943 my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
     # Pointer registers (line above): results are stored through $r_ptr,
     # operands are addressed through $a_ptr and $b_ptr, and $b_org (%rdx) is
     # copied into $a_ptr or $b_ptr ("reassign") on entry to the point_add
     # routines.
     #
     # $acc0..$acc7 are %r8..%r15, in that order.
1944 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
     # Scratch registers. Note that $t3 and $t4 are BOTH $acc4 at this scope;
     # the inner blocks that need two distinct registers redeclare $t3/$t4
     # with their own `my`.
1945 my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
     # Registers for the modulus words that the add/sub helpers combine with
     # limbs 1 and 3; point_double loads them from .Lpoly+8*1 and .Lpoly+8*3.
1946 my ($poly1,$poly3)=($acc6,$acc7);
1947
1948 sub load_for_mul () {
     # Returns (does not append to $code) the assembler text that loads the
     # registers ahead of a multiplication call; it is spliced into the
     # heredocs below via `&load_for_mul(...)`.
     #   $a    - memory operand; its four 64-bit words go to $acc1..$acc4 and
     #           its address (plus $bias) goes to $a_ptr
     #   $b    - memory operand; its first word goes to $src0 and its address
     #           goes to $b_ptr
     #   $src0 - register that receives the first word of $b
     # The sub is declared with an empty prototype although it takes three
     # arguments; the callers in this file use the &name(...) form, which
     # bypasses prototype checking.
1949 my ($a,$b,$src0) = @_;
     # $a_ptr is biased by -128 unless $src0 is "%rax".
     # NOTE(review): presumably this matches the addressing used by the "x"
     # flavour of the multiplication routine - confirm against its definition.
1950 my $bias = $src0 eq "%rax" ? 0 : -128;
1951
1952 "       mov     $b, $src0
1953         lea     $b, $b_ptr
1954         mov     8*0+$a, $acc1
1955         mov     8*1+$a, $acc2
1956         lea     $bias+$a, $a_ptr
1957         mov     8*2+$a, $acc3
1958         mov     8*3+$a, $acc4"
1959 }
1960
1961 sub load_for_sqr () {
     # Returns (does not append to $code) the assembler text that loads the
     # registers ahead of a squaring call; it is spliced into the heredocs
     # below via `&load_for_sqr(...)`.
     #   $a    - memory operand; word 0 goes to $src0, words 1..3 go to
     #           $acc6, $acc7 and $acc0, and its address (plus $bias) goes
     #           to $a_ptr
     #   $src0 - register that receives word 0 of $a
     # The sub is declared with an empty prototype although it takes two
     # arguments; the callers in this file use the &name(...) form, which
     # bypasses prototype checking.
1962 my ($a,$src0) = @_;
     # $a_ptr is biased by -128 unless $src0 is "%rax".
     # NOTE(review): presumably this matches the addressing used by the "x"
     # flavour of the squaring routine - confirm against its definition.
1963 my $bias = $src0 eq "%rax" ? 0 : -128;
1964
1965 "       mov     8*0+$a, $src0
1966         mov     8*1+$a, $acc6
1967         lea     $bias+$a, $a_ptr
1968         mov     8*2+$a, $acc7
1969         mov     8*3+$a, $acc0"
1970 }
1971
1972                                                                         {
1973 ########################################################################
1974 # operate in 4-5-0-1 "name space" that matches multiplication output
1975 #
1976 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1977
     # Four internal helpers.  Each treats ($a0,$a1,$a2,$a3) as a 256-bit
     # value, least significant word first, and uses the modulus words
     # (-1, $poly1, 0, $poly3); $poly1/$poly3 must be loaded by the caller.
     #   __ecp_nistz256_add_toq   - a += *$b_ptr; result stored to *$r_ptr
     #   __ecp_nistz256_sub_fromq - a -= *$b_ptr; result stored to *$r_ptr
     #   __ecp_nistz256_subq      - a = ($t0,$t1,$t2,$t3) - a; result is left
     #                              in registers only, nothing is stored
     #   __ecp_nistz256_mul_by_2q - a += a; result stored to *$r_ptr
     # All of them clobber $t0..$t4 (here $t3/$t4 are $acc2/$acc3).
     #
     # The two additive helpers keep the carry out of bit 255 in $t4 and
     # subtract the modulus from the full 257-bit sum; the unreduced sum is
     # restored only if that subtraction borrows.  Testing just the carry (as
     # an earlier revision did) leaves sums in [p, 2^256) unreduced.
     # The subtractive helpers add the modulus back iff the subtraction
     # borrowed.
1978 $code.=<<___;
1979 .type   __ecp_nistz256_add_toq,\@abi-omnipotent
1980 .align  32
1981 __ecp_nistz256_add_toq:
             xor     $t4, $t4
1982         add     8*0($b_ptr), $a0
1983         adc     8*1($b_ptr), $a1
1984          mov    $a0, $t0
1985         adc     8*2($b_ptr), $a2
1986         adc     8*3($b_ptr), $a3
1987          mov    $a1, $t1
1988         adc     \$0, $t4
1989
1990         sub     \$-1, $a0
1991          mov    $a2, $t2
1992         sbb     $poly1, $a1
1993         sbb     \$0, $a2
1994          mov    $a3, $t3
1995         sbb     $poly3, $a3
1996         sbb     \$0, $t4
1997
1998         cmovc   $t0, $a0
1999         cmovc   $t1, $a1
2000         mov     $a0, 8*0($r_ptr)
2001         cmovc   $t2, $a2
2002         mov     $a1, 8*1($r_ptr)
2003         cmovc   $t3, $a3
2004         mov     $a2, 8*2($r_ptr)
2005         mov     $a3, 8*3($r_ptr)
2006
2007         ret
2008 .size   __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
2009
2010 .type   __ecp_nistz256_sub_fromq,\@abi-omnipotent
2011 .align  32
2012 __ecp_nistz256_sub_fromq:
2013         sub     8*0($b_ptr), $a0
2014         sbb     8*1($b_ptr), $a1
2015          mov    $a0, $t0
2016         sbb     8*2($b_ptr), $a2
2017         sbb     8*3($b_ptr), $a3
2018          mov    $a1, $t1
2019         sbb     $t4, $t4
2020
2021         add     \$-1, $a0
2022          mov    $a2, $t2
2023         adc     $poly1, $a1
2024         adc     \$0, $a2
2025          mov    $a3, $t3
2026         adc     $poly3, $a3
2027         test    $t4, $t4
2028
2029         cmovz   $t0, $a0
2030         cmovz   $t1, $a1
2031         mov     $a0, 8*0($r_ptr)
2032         cmovz   $t2, $a2
2033         mov     $a1, 8*1($r_ptr)
2034         cmovz   $t3, $a3
2035         mov     $a2, 8*2($r_ptr)
2036         mov     $a3, 8*3($r_ptr)
2037
2038         ret
2039 .size   __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
2040
2041 .type   __ecp_nistz256_subq,\@abi-omnipotent
2042 .align  32
2043 __ecp_nistz256_subq:
2044         sub     $a0, $t0
2045         sbb     $a1, $t1
2046          mov    $t0, $a0
2047         sbb     $a2, $t2
2048         sbb     $a3, $t3
2049          mov    $t1, $a1
2050         sbb     $t4, $t4
2051
2052         add     \$-1, $t0
2053          mov    $t2, $a2
2054         adc     $poly1, $t1
2055         adc     \$0, $t2
2056          mov    $t3, $a3
2057         adc     $poly3, $t3
2058         test    $t4, $t4
2059
2060         cmovnz  $t0, $a0
2061         cmovnz  $t1, $a1
2062         cmovnz  $t2, $a2
2063         cmovnz  $t3, $a3
2064
2065         ret
2066 .size   __ecp_nistz256_subq,.-__ecp_nistz256_subq
2067
2068 .type   __ecp_nistz256_mul_by_2q,\@abi-omnipotent
2069 .align  32
2070 __ecp_nistz256_mul_by_2q:
             xor     $t4, $t4
2071         add     $a0, $a0                # a0:a3+a0:a3
2072         adc     $a1, $a1
2073          mov    $a0, $t0
2074         adc     $a2, $a2
2075         adc     $a3, $a3
2076          mov    $a1, $t1
2077         adc     \$0, $t4
2078
2079         sub     \$-1, $a0
2080          mov    $a2, $t2
2081         sbb     $poly1, $a1
2082         sbb     \$0, $a2
2083          mov    $a3, $t3
2084         sbb     $poly3, $a3
2085         sbb     \$0, $t4
2086
2087         cmovc   $t0, $a0
2088         cmovc   $t1, $a1
2089         mov     $a0, 8*0($r_ptr)
2090         cmovc   $t2, $a2
2091         mov     $a1, 8*1($r_ptr)
2092         cmovc   $t3, $a3
2093         mov     $a2, 8*2($r_ptr)
2094         mov     $a3, 8*3($r_ptr)
2095
2096         ret
2097 .size   __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
2098 ___
2099                                                                         }
2100 sub gen_double () {
         # Appends the point-doubling routine to $code.
         #   $x ne "x": emits the exported ecp_nistz256_point_double, built on
         #              the "...q" helpers.  When $addx is set it starts with a
         #              check of OPENSSL_ia32cap_P+8 against mask 0x80100 and
         #              jumps to .Lpoint_doublex if both bits are set.
         #   $x eq "x": emits ecp_nistz256_point_doublex (label .Lpoint_doublex),
         #              built on the "...x" helpers, with $src0 = %rdx and
         #              input pointers biased by 128.
         # The input point is read at offsets 0x00 (x), 0x20 (y) and 0x40 (z)
         # from $a_ptr; the result is written at the same offsets from $r_ptr.
         # The three output addresses are parked in %xmm0, %xmm1 and %xmm2
         # because $r_ptr is reused for intermediate results.
         # Stack frame: five 32-byte slots (S, M, Zsqr, in_x, tmp0) plus 8 bytes.
         # Declared with an empty prototype; callers use &gen_double(...).
2101     my $x = shift;
2102     my ($src0,$sfx,$bias);
         # Byte offsets of the temporaries within the stack frame.
2103     my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
2104
2105     if ($x ne "x") {
2106         $src0 = "%rax";
2107         $sfx  = "";
2108         $bias = 0;
2109
2110 $code.=<<___;
2111 .globl  ecp_nistz256_point_double
2112 .type   ecp_nistz256_point_double,\@function,2
2113 .align  32
2114 ecp_nistz256_point_double:
2115 ___
2116 $code.=<<___    if ($addx);
2117         mov     \$0x80100, %ecx
2118         and     OPENSSL_ia32cap_P+8(%rip), %ecx
2119         cmp     \$0x80100, %ecx
2120         je      .Lpoint_doublex
2121 ___
2122     } else {
2123         $src0 = "%rdx";
2124         $sfx  = "x";
2125         $bias = 128;
2126
2127 $code.=<<___;
2128 .type   ecp_nistz256_point_doublex,\@function,2
2129 .align  32
2130 ecp_nistz256_point_doublex:
2131 .Lpoint_doublex:
2132 ___
2133     }
     # Common body: prologue, then S, Zsqr, res_z, M and res_y = S^2 are
     # computed up to the point where res_y has to be halved.
2134 $code.=<<___;
2135         push    %rbp
2136         push    %rbx
2137         push    %r12
2138         push    %r13
2139         push    %r14
2140         push    %r15
2141         sub     \$32*5+8, %rsp
2142
2143         movdqu  0x00($a_ptr), %xmm0             # copy  *(P256_POINT *)$a_ptr.x
2144         mov     $a_ptr, $b_ptr                  # backup copy
2145         movdqu  0x10($a_ptr), %xmm1
2146          mov    0x20+8*0($a_ptr), $acc4         # load in_y in "5-4-0-1" order
2147          mov    0x20+8*1($a_ptr), $acc5
2148          mov    0x20+8*2($a_ptr), $acc0
2149          mov    0x20+8*3($a_ptr), $acc1
2150          mov    .Lpoly+8*1(%rip), $poly1
2151          mov    .Lpoly+8*3(%rip), $poly3
2152         movdqa  %xmm0, $in_x(%rsp)
2153         movdqa  %xmm1, $in_x+0x10(%rsp)
2154         lea     0x20($r_ptr), $acc2
2155         lea     0x40($r_ptr), $acc3
2156         movq    $r_ptr, %xmm0
2157         movq    $acc2, %xmm1
2158         movq    $acc3, %xmm2
2159
2160         lea     $S(%rsp), $r_ptr
2161         call    __ecp_nistz256_mul_by_2$x       # p256_mul_by_2(S, in_y);
2162
2163         mov     0x40+8*0($a_ptr), $src0
2164         mov     0x40+8*1($a_ptr), $acc6
2165         mov     0x40+8*2($a_ptr), $acc7
2166         mov     0x40+8*3($a_ptr), $acc0
2167         lea     0x40-$bias($a_ptr), $a_ptr
2168         lea     $Zsqr(%rsp), $r_ptr
2169         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Zsqr, in_z);
2170
2171         `&load_for_sqr("$S(%rsp)", "$src0")`
2172         lea     $S(%rsp), $r_ptr
2173         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(S, S);
2174
2175         mov     0x20($b_ptr), $src0             # $b_ptr is still valid
2176         mov     0x40+8*0($b_ptr), $acc1
2177         mov     0x40+8*1($b_ptr), $acc2
2178         mov     0x40+8*2($b_ptr), $acc3
2179         mov     0x40+8*3($b_ptr), $acc4
2180         lea     0x40-$bias($b_ptr), $a_ptr
2181         lea     0x20($b_ptr), $b_ptr
2182         movq    %xmm2, $r_ptr
2183         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(res_z, in_z, in_y);
2184         call    __ecp_nistz256_mul_by_2$x       # p256_mul_by_2(res_z, res_z);
2185
2186         mov     $in_x+8*0(%rsp), $acc4          # "5-4-0-1" order
2187         mov     $in_x+8*1(%rsp), $acc5
2188         lea     $Zsqr(%rsp), $b_ptr
2189         mov     $in_x+8*2(%rsp), $acc0
2190         mov     $in_x+8*3(%rsp), $acc1
2191         lea     $M(%rsp), $r_ptr
2192         call    __ecp_nistz256_add_to$x         # p256_add(M, in_x, Zsqr);
2193
2194         mov     $in_x+8*0(%rsp), $acc4          # "5-4-0-1" order
2195         mov     $in_x+8*1(%rsp), $acc5
2196         lea     $Zsqr(%rsp), $b_ptr
2197         mov     $in_x+8*2(%rsp), $acc0
2198         mov     $in_x+8*3(%rsp), $acc1
2199         lea     $Zsqr(%rsp), $r_ptr
2200         call    __ecp_nistz256_sub_from$x       # p256_sub(Zsqr, in_x, Zsqr);
2201
2202         `&load_for_sqr("$S(%rsp)", "$src0")`
2203         movq    %xmm1, $r_ptr
2204         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(res_y, S);
2205 ___
2206 {
2207 ######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
2208 # operate in 4-5-6-7 "name space" that matches squaring output
2209 #
     # Inline halving: if word 0 is odd the modulus is added first (carry kept
     # in $t4), then the 257-bit value is shifted right by one bit and stored
     # to *$r_ptr.
     # Within this scope the modulus words are read from $a_ptr and the
     # file-level $t1 (%rbp).
     # NOTE(review): presumably the squaring routine leaves them there -
     # confirm against __ecp_nistz256_sqr_mont.
2210 my ($poly1,$poly3)=($a_ptr,$t1);
2211 my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
2212
2213 $code.=<<___;
2214         xor     $t4, $t4
2215         mov     $a0, $t0
2216         add     \$-1, $a0
2217         mov     $a1, $t1
2218         adc     $poly1, $a1
2219         mov     $a2, $t2
2220         adc     \$0, $a2
2221         mov     $a3, $t3
2222         adc     $poly3, $a3
2223         adc     \$0, $t4
2224         xor     $a_ptr, $a_ptr          # borrow $a_ptr
2225         test    \$1, $t0
2226
2227         cmovz   $t0, $a0
2228         cmovz   $t1, $a1
2229         cmovz   $t2, $a2
2230         cmovz   $t3, $a3
2231         cmovz   $a_ptr, $t4
2232
2233         mov     $a1, $t0                # a0:a3>>1
2234         shr     \$1, $a0
2235         shl     \$63, $t0
2236         mov     $a2, $t1
2237         shr     \$1, $a1
2238         or      $t0, $a0
2239         shl     \$63, $t1
2240         mov     $a3, $t2
2241         shr     \$1, $a2
2242         or      $t1, $a1
2243         shl     \$63, $t2
2244         mov     $a0, 8*0($r_ptr)
2245         shr     \$1, $a3
2246         mov     $a1, 8*1($r_ptr)
2247         shl     \$63, $t4
2248         or      $t2, $a2
2249         or      $t4, $a3
2250         mov     $a2, 8*2($r_ptr)
2251         mov     $a3, 8*3($r_ptr)
2252 ___
2253 }
     # Remainder of the routine: M = 3*M*Zsqr, S = S*in_x, res_x = M^2 - 2*S,
     # res_y = (S - res_x)*M - res_y, then the epilogue.
     # Before the last multiplication `xor %ecx, %ecx` sets ZF and only
     # mov/lea follow, so the two cmovz instructions there always move.
2254 $code.=<<___;
2255         `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
2256         lea     $M(%rsp), $r_ptr
2257         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(M, M, Zsqr);
2258
2259         lea     $tmp0(%rsp), $r_ptr
2260         call    __ecp_nistz256_mul_by_2$x
2261
2262         lea     $M(%rsp), $b_ptr
2263         lea     $M(%rsp), $r_ptr
2264         call    __ecp_nistz256_add_to$x         # p256_mul_by_3(M, M);
2265
2266         `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
2267         lea     $S(%rsp), $r_ptr
2268         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S, S, in_x);
2269
2270         lea     $tmp0(%rsp), $r_ptr
2271         call    __ecp_nistz256_mul_by_2$x       # p256_mul_by_2(tmp0, S);
2272
2273         `&load_for_sqr("$M(%rsp)", "$src0")`
2274         movq    %xmm0, $r_ptr
2275         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(res_x, M);
2276
2277         lea     $tmp0(%rsp), $b_ptr
2278         mov     $acc6, $acc0                    # harmonize sqr output and sub input
2279         mov     $acc7, $acc1
2280         mov     $a_ptr, $poly1
2281         mov     $t1, $poly3
2282         call    __ecp_nistz256_sub_from$x       # p256_sub(res_x, res_x, tmp0);
2283
2284         mov     $S+8*0(%rsp), $t0
2285         mov     $S+8*1(%rsp), $t1
2286         mov     $S+8*2(%rsp), $t2
2287         mov     $S+8*3(%rsp), $acc2             # "4-5-0-1" order
2288         lea     $S(%rsp), $r_ptr
2289         call    __ecp_nistz256_sub$x            # p256_sub(S, S, res_x);
2290
2291         mov     $M(%rsp), $src0
2292         lea     $M(%rsp), $b_ptr
2293         mov     $acc4, $acc6                    # harmonize sub output and mul input
2294         xor     %ecx, %ecx
2295         mov     $acc4, $S+8*0(%rsp)             # have to save:-(
2296         mov     $acc5, $acc2
2297         mov     $acc5, $S+8*1(%rsp)
2298         cmovz   $acc0, $acc3
2299         mov     $acc0, $S+8*2(%rsp)
2300         lea     $S-$bias(%rsp), $a_ptr
2301         cmovz   $acc1, $acc4
2302         mov     $acc1, $S+8*3(%rsp)
2303         mov     $acc6, $acc1
2304         lea     $S(%rsp), $r_ptr
2305         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S, S, M);
2306
2307         movq    %xmm1, $b_ptr
2308         movq    %xmm1, $r_ptr
2309         call    __ecp_nistz256_sub_from$x       # p256_sub(res_y, S, res_y);
2310
2311         add     \$32*5+8, %rsp
2312         pop     %r15
2313         pop     %r14
2314         pop     %r13
2315         pop     %r12
2316         pop     %rbx
2317         pop     %rbp
2318         ret
2319 .size   ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
2320 ___
2321 }
2322 &gen_double("q");
2323
2324 sub gen_add () {
         # Appends the point-addition routine to $code.
         #   $x ne "x": emits the exported ecp_nistz256_point_add, built on the
         #              "...q" helpers.  When $addx is set it starts with a
         #              check of OPENSSL_ia32cap_P+8 against mask 0x80100 and
         #              jumps to .Lpoint_addx if both bits are set.
         #   $x eq "x": emits ecp_nistz256_point_addx (label .Lpoint_addx),
         #              built on the "...x" helpers, with $src0 = %rdx and
         #              input pointers biased by 128.
         # Both inputs are points with 32-byte x, y, z at offsets 0x00, 0x20,
         # 0x40; both are copied to the stack.  The result pointer is parked in
         # %xmm0.  Stack frame: eighteen 32-byte slots plus 8 bytes.
         #
         # %xmm5 (in1infty) / %xmm4 (in2infty) become all-ones masks when every
         # bit of the x and y coordinates of in1 / in2 is zero (z is not part of
         # that test).  At the end the computed result is replaced by in2 where
         # in1infty is set and by in1 where in2infty is set.
         #
         # Special case H == 0 (U1 == U2) with neither mask set:
         #   R != 0 (S1 != S2): an all-zero result is written.
         #   R == 0 (S1 == S2): control goes to .Ladd_proceed and the generic
         #                      formulas run with H = R = 0.
         # NOTE(review): that second case is "both inputs are the same point";
         # the generic path presumably ends in an all-zero result instead of
         # the doubled point.  Confirm whether callers guarantee distinct
         # inputs or whether this must branch into the doubling routine.
         # Declared with an empty prototype; callers use &gen_add(...).
2325     my $x = shift;
2326     my ($src0,$sfx,$bias);
         # Byte offsets of the temporaries and input copies in the stack frame.
2327     my ($H,$Hsqr,$R,$Rsqr,$Hcub,
2328         $U1,$U2,$S1,$S2,
2329         $res_x,$res_y,$res_z,
2330         $in1_x,$in1_y,$in1_z,
2331         $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
         # Z1^2 and Z2^2 share the slots later used for H^2 and R^2.
2332     my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2333
2334     if ($x ne "x") {
2335         $src0 = "%rax";
2336         $sfx  = "";
2337         $bias = 0;
2338
2339 $code.=<<___;
2340 .globl  ecp_nistz256_point_add
2341 .type   ecp_nistz256_point_add,\@function,3
2342 .align  32
2343 ecp_nistz256_point_add:
2344 ___
2345 $code.=<<___    if ($addx);
2346         mov     \$0x80100, %ecx
2347         and     OPENSSL_ia32cap_P+8(%rip), %ecx
2348         cmp     \$0x80100, %ecx
2349         je      .Lpoint_addx
2350 ___
2351     } else {
2352         $src0 = "%rdx";
2353         $sfx  = "x";
2354         $bias = 128;
2355
2356 $code.=<<___;
2357 .type   ecp_nistz256_point_addx,\@function,3
2358 .align  32
2359 ecp_nistz256_point_addx:
2360 .Lpoint_addx:
2361 ___
2362     }
2363 $code.=<<___;
2364         push    %rbp
2365         push    %rbx
2366         push    %r12
2367         push    %r13
2368         push    %r14
2369         push    %r15
2370         sub     \$32*18+8, %rsp
2371
2372         movdqu  0x00($a_ptr), %xmm0             # copy  *(P256_POINT *)$a_ptr
2373         movdqu  0x10($a_ptr), %xmm1
2374         movdqu  0x20($a_ptr), %xmm2
2375         movdqu  0x30($a_ptr), %xmm3
2376         movdqu  0x40($a_ptr), %xmm4
2377         movdqu  0x50($a_ptr), %xmm5
2378         mov     $a_ptr, $b_ptr                  # reassign
2379         mov     $b_org, $a_ptr                  # reassign
2380         movdqa  %xmm0, $in1_x(%rsp)
2381         movdqa  %xmm1, $in1_x+0x10(%rsp)
2382         por     %xmm0, %xmm1
2383         movdqa  %xmm2, $in1_y(%rsp)
2384         movdqa  %xmm3, $in1_y+0x10(%rsp)
2385         por     %xmm2, %xmm3
2386         movdqa  %xmm4, $in1_z(%rsp)
2387         movdqa  %xmm5, $in1_z+0x10(%rsp)
2388         por     %xmm1, %xmm3
2389
2390         movdqu  0x00($a_ptr), %xmm0             # copy  *(P256_POINT *)$b_ptr
2391          pshufd \$0xb1, %xmm3, %xmm5
2392         movdqu  0x10($a_ptr), %xmm1
2393         movdqu  0x20($a_ptr), %xmm2
2394          por    %xmm3, %xmm5
2395         movdqu  0x30($a_ptr), %xmm3
2396          mov    0x40+8*0($a_ptr), $src0         # load original in2_z
2397          mov    0x40+8*1($a_ptr), $acc6
2398          mov    0x40+8*2($a_ptr), $acc7
2399          mov    0x40+8*3($a_ptr), $acc0
2400         movdqa  %xmm0, $in2_x(%rsp)
2401          pshufd \$0x1e, %xmm5, %xmm4
2402         movdqa  %xmm1, $in2_x+0x10(%rsp)
2403         por     %xmm0, %xmm1
2404          movq   $r_ptr, %xmm0                   # save $r_ptr
2405         movdqa  %xmm2, $in2_y(%rsp)
2406         movdqa  %xmm3, $in2_y+0x10(%rsp)
2407         por     %xmm2, %xmm3
2408          por    %xmm4, %xmm5
2409          pxor   %xmm4, %xmm4
2410         por     %xmm1, %xmm3
2411
2412         lea     0x40-$bias($a_ptr), $a_ptr      # $a_ptr is still valid
2413          mov    $src0, $in2_z+8*0(%rsp)         # make in2_z copy
2414          mov    $acc6, $in2_z+8*1(%rsp)
2415          mov    $acc7, $in2_z+8*2(%rsp)
2416          mov    $acc0, $in2_z+8*3(%rsp)
2417         lea     $Z2sqr(%rsp), $r_ptr            # Z2^2
2418         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Z2sqr, in2_z);
2419
2420         pcmpeqd %xmm4, %xmm5
2421         pshufd  \$0xb1, %xmm3, %xmm4
2422         por     %xmm3, %xmm4
2423         pshufd  \$0, %xmm5, %xmm5               # in1infty
2424         pshufd  \$0x1e, %xmm4, %xmm3
2425         por     %xmm3, %xmm4
2426         pxor    %xmm3, %xmm3
2427         pcmpeqd %xmm3, %xmm4
2428         pshufd  \$0, %xmm4, %xmm4               # in2infty
2429          mov    0x40+8*0($b_ptr), $src0         # load original in1_z
2430          mov    0x40+8*1($b_ptr), $acc6
2431          mov    0x40+8*2($b_ptr), $acc7
2432          mov    0x40+8*3($b_ptr), $acc0
2433
2434         lea     0x40-$bias($b_ptr), $a_ptr
2435         lea     $Z1sqr(%rsp), $r_ptr            # Z1^2
2436         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Z1sqr, in1_z);
2437
2438         `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
2439         lea     $S1(%rsp), $r_ptr               # S1 = Z2^3
2440         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S1, Z2sqr, in2_z);
2441
2442         `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2443         lea     $S2(%rsp), $r_ptr               # S2 = Z1^3
2444         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, Z1sqr, in1_z);
2445
2446         `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
2447         lea     $S1(%rsp), $r_ptr               # S1 = Y1*Z2^3
2448         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S1, S1, in1_y);
2449
2450         `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2451         lea     $S2(%rsp), $r_ptr               # S2 = Y2*Z1^3
2452         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, S2, in2_y);
2453
2454         lea     $S1(%rsp), $b_ptr
2455         lea     $R(%rsp), $r_ptr                # R = S2 - S1
2456         call    __ecp_nistz256_sub_from$x       # p256_sub(R, S2, S1);
2457
2458         or      $acc5, $acc4                    # see if result is zero
2459         movdqa  %xmm4, %xmm2
2460         or      $acc0, $acc4
2461         or      $acc1, $acc4
2462         por     %xmm5, %xmm2                    # in1infty || in2infty
2463         movq    $acc4, %xmm3
2464
2465         `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2466         lea     $U1(%rsp), $r_ptr               # U1 = X1*Z2^2
2467         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(U1, in1_x, Z2sqr);
2468
2469         `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
2470         lea     $U2(%rsp), $r_ptr               # U2 = X2*Z1^2
2471         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(U2, in2_x, Z1sqr);
2472
2473         lea     $U1(%rsp), $b_ptr
2474         lea     $H(%rsp), $r_ptr                # H = U2 - U1
2475         call    __ecp_nistz256_sub_from$x       # p256_sub(H, U2, U1);
2476
2477         or      $acc5, $acc4                    # see if result is zero
2478         or      $acc0, $acc4
2479         or      $acc1, $acc4
2480
2481         .byte   0x3e                            # predict taken
2482         jnz     .Ladd_proceed$x                 # is_equal(U1,U2)?
2483         movq    %xmm2, $acc0
2484         movq    %xmm3, $acc1
2485         test    $acc0, $acc0
2486         jnz     .Ladd_proceed$x                 # (in1infty || in2infty)?
2487         test    $acc1, $acc1
2488         jz      .Ladd_proceed$x                 # is_equal(S1,S2)?
2489
2490         movq    %xmm0, $r_ptr                   # restore $r_ptr
2491         pxor    %xmm0, %xmm0
2492         movdqu  %xmm0, 0x00($r_ptr)
2493         movdqu  %xmm0, 0x10($r_ptr)
2494         movdqu  %xmm0, 0x20($r_ptr)
2495         movdqu  %xmm0, 0x30($r_ptr)
2496         movdqu  %xmm0, 0x40($r_ptr)
2497         movdqu  %xmm0, 0x50($r_ptr)
2498         jmp     .Ladd_done$x
2499
2500 .align  32
2501 .Ladd_proceed$x:
2502         `&load_for_sqr("$R(%rsp)", "$src0")`
2503         lea     $Rsqr(%rsp), $r_ptr             # R^2
2504         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Rsqr, R);
2505
2506         `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2507         lea     $res_z(%rsp), $r_ptr            # Z3 = H*Z1*Z2
2508         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(res_z, H, in1_z);
2509
2510         `&load_for_sqr("$H(%rsp)", "$src0")`
2511         lea     $Hsqr(%rsp), $r_ptr             # H^2
2512         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Hsqr, H);
2513
2514         `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
2515         lea     $res_z(%rsp), $r_ptr            # Z3 = H*Z1*Z2
2516         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(res_z, res_z, in2_z);
2517
2518         `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
2519         lea     $Hcub(%rsp), $r_ptr             # H^3
2520         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(Hcub, Hsqr, H);
2521
2522         `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
2523         lea     $U2(%rsp), $r_ptr               # U1*H^2
2524         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(U2, U1, Hsqr);
2525 ___
2526 {
2527 #######################################################################
2528 # operate in 4-5-0-1 "name space" that matches multiplication output
2529 #
     # Within this scope $acc0..$acc3 name the registers holding the
     # multiplication output, and $t3/$t4 are two distinct scratch registers.
2530 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2531 my ($poly1, $poly3)=($acc6,$acc7);
2532
     # The inline doubling below keeps the carry out of bit 255 in $t4 and
     # subtracts the modulus from the full 257-bit sum; the unreduced sum is
     # restored only if that subtraction borrows, so 2*U1*H^2 is fully reduced
     # (testing the carry alone leaves sums in [p, 2^256) unreduced).
     # The loads of Rsqr into $t0..$t3 are interleaved with the cmovc's; mov
     # does not alter the flags.
2533 $code.=<<___;
2534         #lea    $U2(%rsp), $a_ptr
2535         #lea    $Hsqr(%rsp), $r_ptr     # 2*U1*H^2
2536         #call   __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
2537
             xor     $t4, $t4
2538         add     $acc0, $acc0            # a0:a3+a0:a3
2539         lea     $Rsqr(%rsp), $a_ptr
2540         adc     $acc1, $acc1
2541          mov    $acc0, $t0
2542         adc     $acc2, $acc2
2543         adc     $acc3, $acc3
2544          mov    $acc1, $t1
2545         adc     \$0, $t4
2546
2547         sub     \$-1, $acc0
2548          mov    $acc2, $t2
2549         sbb     $poly1, $acc1
2550         sbb     \$0, $acc2
2551          mov    $acc3, $t3
2552         sbb     $poly3, $acc3
2553         sbb     \$0, $t4
2554
2555         cmovc   $t0, $acc0
2556         mov     8*0($a_ptr), $t0
2557         cmovc   $t1, $acc1
2558         mov     8*1($a_ptr), $t1
2559         cmovc   $t2, $acc2
2560         mov     8*2($a_ptr), $t2
2561         cmovc   $t3, $acc3
2562         mov     8*3($a_ptr), $t3
2563
2564         call    __ecp_nistz256_sub$x            # p256_sub(res_x, Rsqr, Hsqr);
2565
2566         lea     $Hcub(%rsp), $b_ptr
2567         lea     $res_x(%rsp), $r_ptr
2568         call    __ecp_nistz256_sub_from$x       # p256_sub(res_x, res_x, Hcub);
2569
2570         mov     $U2+8*0(%rsp), $t0
2571         mov     $U2+8*1(%rsp), $t1
2572         mov     $U2+8*2(%rsp), $t2
2573         mov     $U2+8*3(%rsp), $t3
2574         lea     $res_y(%rsp), $r_ptr
2575
2576         call    __ecp_nistz256_sub$x            # p256_sub(res_y, U2, res_x);
2577
2578         mov     $acc0, 8*0($r_ptr)              # save the result, as
2579         mov     $acc1, 8*1($r_ptr)              # __ecp_nistz256_sub doesn't
2580         mov     $acc2, 8*2($r_ptr)
2581         mov     $acc3, 8*3($r_ptr)
2582 ___
2583 }
     # Tail: res_y = R*res_y - S1*Hcub, then the constant-time selection of
     # the final x, y, z from (result, in2, in1) and the epilogue.
2584 $code.=<<___;
2585         `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
2586         lea     $S2(%rsp), $r_ptr
2587         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, S1, Hcub);
2588
2589         `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
2590         lea     $res_y(%rsp), $r_ptr
2591         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(res_y, R, res_y);
2592
2593         lea     $S2(%rsp), $b_ptr
2594         lea     $res_y(%rsp), $r_ptr
2595         call    __ecp_nistz256_sub_from$x       # p256_sub(res_y, res_y, S2);
2596
2597         movq    %xmm0, $r_ptr           # restore $r_ptr
2598
2599         movdqa  %xmm5, %xmm0            # copy_conditional(res_z, in2_z, in1infty);
2600         movdqa  %xmm5, %xmm1
2601         pandn   $res_z(%rsp), %xmm0
2602         movdqa  %xmm5, %xmm2
2603         pandn   $res_z+0x10(%rsp), %xmm1
2604         movdqa  %xmm5, %xmm3
2605         pand    $in2_z(%rsp), %xmm2
2606         pand    $in2_z+0x10(%rsp), %xmm3
2607         por     %xmm0, %xmm2
2608         por     %xmm1, %xmm3
2609
2610         movdqa  %xmm4, %xmm0            # copy_conditional(res_z, in1_z, in2infty);
2611         movdqa  %xmm4, %xmm1
2612         pandn   %xmm2, %xmm0
2613         movdqa  %xmm4, %xmm2
2614         pandn   %xmm3, %xmm1
2615         movdqa  %xmm4, %xmm3
2616         pand    $in1_z(%rsp), %xmm2
2617         pand    $in1_z+0x10(%rsp), %xmm3
2618         por     %xmm0, %xmm2
2619         por     %xmm1, %xmm3
2620         movdqu  %xmm2, 0x40($r_ptr)
2621         movdqu  %xmm3, 0x50($r_ptr)
2622
2623         movdqa  %xmm5, %xmm0            # copy_conditional(res_x, in2_x, in1infty);
2624         movdqa  %xmm5, %xmm1
2625         pandn   $res_x(%rsp), %xmm0
2626         movdqa  %xmm5, %xmm2
2627         pandn   $res_x+0x10(%rsp), %xmm1
2628         movdqa  %xmm5, %xmm3
2629         pand    $in2_x(%rsp), %xmm2
2630         pand    $in2_x+0x10(%rsp), %xmm3
2631         por     %xmm0, %xmm2
2632         por     %xmm1, %xmm3
2633
2634         movdqa  %xmm4, %xmm0            # copy_conditional(res_x, in1_x, in2infty);
2635         movdqa  %xmm4, %xmm1
2636         pandn   %xmm2, %xmm0
2637         movdqa  %xmm4, %xmm2
2638         pandn   %xmm3, %xmm1
2639         movdqa  %xmm4, %xmm3
2640         pand    $in1_x(%rsp), %xmm2
2641         pand    $in1_x+0x10(%rsp), %xmm3
2642         por     %xmm0, %xmm2
2643         por     %xmm1, %xmm3
2644         movdqu  %xmm2, 0x00($r_ptr)
2645         movdqu  %xmm3, 0x10($r_ptr)
2646
2647         movdqa  %xmm5, %xmm0            # copy_conditional(res_y, in2_y, in1infty);
2648         movdqa  %xmm5, %xmm1
2649         pandn   $res_y(%rsp), %xmm0
2650         movdqa  %xmm5, %xmm2
2651         pandn   $res_y+0x10(%rsp), %xmm1
2652         movdqa  %xmm5, %xmm3
2653         pand    $in2_y(%rsp), %xmm2
2654         pand    $in2_y+0x10(%rsp), %xmm3
2655         por     %xmm0, %xmm2
2656         por     %xmm1, %xmm3
2657
2658         movdqa  %xmm4, %xmm0            # copy_conditional(res_y, in1_y, in2infty);
2659         movdqa  %xmm4, %xmm1
2660         pandn   %xmm2, %xmm0
2661         movdqa  %xmm4, %xmm2
2662         pandn   %xmm3, %xmm1
2663         movdqa  %xmm4, %xmm3
2664         pand    $in1_y(%rsp), %xmm2
2665         pand    $in1_y+0x10(%rsp), %xmm3
2666         por     %xmm0, %xmm2
2667         por     %xmm1, %xmm3
2668         movdqu  %xmm2, 0x20($r_ptr)
2669         movdqu  %xmm3, 0x30($r_ptr)
2670
2671 .Ladd_done$x:
2672         add     \$32*18+8, %rsp
2673         pop     %r15
2674         pop     %r14
2675         pop     %r13
2676         pop     %r12
2677         pop     %rbx
2678         pop     %rbp
2679         ret
2680 .size   ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
2681 ___
2682 }
2683 &gen_add("q");
2684
2685 sub gen_add_affine () {
         # Append to $code the assembly for one flavour of the mixed
         # addition ecp_nistz256_point_add_affine(r, a, b).  The emitted
         # routine copies a as 96 bytes (x, y, z) and b as 64 bytes (x, y)
         # to its stack frame and stores the sum to r with x at +0x00,
         # y at +0x20 and z at +0x40.
         #
         # Argument: flavour string.  "x" emits the non-exported variant
         # ecp_nistz256_point_add_affinex, which calls the helpers ending
         # in x; any other value (this file passes "q") emits the exported
         # entry point, which calls the helpers ending in that value.
         #
         # The only effect is the text appended to $code; callers in this
         # file ignore the return value.
2686     my $x = shift;
2687     my ($src0,$sfx,$bias);
         # Byte offsets from %rsp of fifteen 32-byte stack slots: seven
         # temporaries, the three result coordinates, and private copies
         # of the three coordinates of in1 and the two of in2.
2688     my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
2689         $res_x,$res_y,$res_z,
2690         $in1_x,$in1_y,$in1_z,
2691         $in2_x,$in2_y)=map(32*$_,(0..14));
         # Z1^2 shares the S2 slot; S2 is later computed in place from it.
2692     my $Z1sqr = $S2;
2693
         # Flavour-specific settings and entry label.
         #   $src0  register handed to load_for_mul/load_for_sqr and loaded
         #          directly with the first limb of in1_z below
         #   $sfx   suffix of the emitted symbol
         #   $bias  amount subtracted from the operand address placed in
         #          $a_ptr before the first square and multiply calls
         #          (presumably the x helpers address their operand at
         #          +128 - TODO confirm against their definitions)
2694     if ($x ne "x") {
2695         $src0 = "%rax";
2696         $sfx  = "";
2697         $bias = 0;
2698
2699 $code.=<<___;
2700 .globl  ecp_nistz256_point_add_affine
2701 .type   ecp_nistz256_point_add_affine,\@function,3
2702 .align  32
2703 ecp_nistz256_point_add_affine:
2704 ___
     # $addx is set outside this block (presumably it records that the
     # assembler can build the x code path - TODO confirm).  When it is
     # true the exported entry point jumps to the x variant at run time
     # if both bits of mask 0x80100 are set in the dword at
     # OPENSSL_ia32cap_P+8 (presumably the BMI2 and ADX capability bits -
     # TODO confirm against the capability vector documentation).
2705 $code.=<<___    if ($addx);
2706         mov     \$0x80100, %ecx
2707         and     OPENSSL_ia32cap_P+8(%rip), %ecx
2708         cmp     \$0x80100, %ecx
2709         je      .Lpoint_add_affinex
2710 ___
2711     } else {
2712         $src0 = "%rdx";
2713         $sfx  = "x";
2714         $bias = 128;
2715
2716 $code.=<<___;
2717 .type   ecp_nistz256_point_add_affinex,\@function,3
2718 .align  32
2719 ecp_nistz256_point_add_affinex:
2720 .Lpoint_add_affinex:
2721 ___
2722     }
     # Body shared by both flavours.  The emitted code
     #   1. pushes %rbp, %rbx and %r12-%r15 and reserves 32*15+8 bytes;
     #   2. copies in1 and in2 to the stack while OR-ing together the x
     #      and y limbs of each, which yields the broadcast masks
     #      in1infty (%xmm5) and in2infty (%xmm4): all ones when x and y
     #      are entirely zero, zero otherwise; $r_ptr is parked in %xmm0
     #      for the duration of the arithmetic;
     #   3. computes, with Z2 implicitly equal to one,
     #        Z1sqr = in1_z^2          U2    = Z1sqr*in2_x
     #        H     = U2 - in1_x       S2    = Z1sqr*in1_z
     #        res_z = H*in1_z          S2    = S2*in2_y
     #        R     = S2 - in1_y       Hsqr  = H^2
     #        Rsqr  = R^2              Hcub  = Hsqr*H
     #        U2    = Hsqr*in1_x       (left in registers for the next part)
2723 $code.=<<___;
2724         push    %rbp
2725         push    %rbx
2726         push    %r12
2727         push    %r13
2728         push    %r14
2729         push    %r15
2730         sub     \$32*15+8, %rsp
2731
2732         movdqu  0x00($a_ptr), %xmm0     # copy  *(P256_POINT *)$a_ptr
2733         mov     $b_org, $b_ptr          # reassign
2734         movdqu  0x10($a_ptr), %xmm1
2735         movdqu  0x20($a_ptr), %xmm2
2736         movdqu  0x30($a_ptr), %xmm3
2737         movdqu  0x40($a_ptr), %xmm4
2738         movdqu  0x50($a_ptr), %xmm5
2739          mov    0x40+8*0($a_ptr), $src0 # load original in1_z
2740          mov    0x40+8*1($a_ptr), $acc6
2741          mov    0x40+8*2($a_ptr), $acc7
2742          mov    0x40+8*3($a_ptr), $acc0
2743         movdqa  %xmm0, $in1_x(%rsp)
2744         movdqa  %xmm1, $in1_x+0x10(%rsp)
2745         por     %xmm0, %xmm1
2746         movdqa  %xmm2, $in1_y(%rsp)
2747         movdqa  %xmm3, $in1_y+0x10(%rsp)
2748         por     %xmm2, %xmm3
2749         movdqa  %xmm4, $in1_z(%rsp)
2750         movdqa  %xmm5, $in1_z+0x10(%rsp)
2751         por     %xmm1, %xmm3
2752
2753         movdqu  0x00($b_ptr), %xmm0     # copy  *(P256_POINT_AFFINE *)$b_ptr
2754          pshufd \$0xb1, %xmm3, %xmm5
2755         movdqu  0x10($b_ptr), %xmm1
2756         movdqu  0x20($b_ptr), %xmm2
2757          por    %xmm3, %xmm5
2758         movdqu  0x30($b_ptr), %xmm3
2759         movdqa  %xmm0, $in2_x(%rsp)
2760          pshufd \$0x1e, %xmm5, %xmm4
2761         movdqa  %xmm1, $in2_x+0x10(%rsp)
2762         por     %xmm0, %xmm1
2763          movq   $r_ptr, %xmm0           # save $r_ptr
2764         movdqa  %xmm2, $in2_y(%rsp)
2765         movdqa  %xmm3, $in2_y+0x10(%rsp)
2766         por     %xmm2, %xmm3
2767          por    %xmm4, %xmm5
2768          pxor   %xmm4, %xmm4
2769         por     %xmm1, %xmm3
2770
2771         lea     0x40-$bias($a_ptr), $a_ptr      # $a_ptr is still valid
2772         lea     $Z1sqr(%rsp), $r_ptr            # Z1^2
2773         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Z1sqr, in1_z);
2774
2775         pcmpeqd %xmm4, %xmm5
2776         pshufd  \$0xb1, %xmm3, %xmm4
2777          mov    0x00($b_ptr), $src0             # $b_ptr is still valid
2778          #lea   0x00($b_ptr), $b_ptr
2779          mov    $acc4, $acc1                    # harmonize sqr output and mul input
2780         por     %xmm3, %xmm4
2781         pshufd  \$0, %xmm5, %xmm5               # in1infty
2782         pshufd  \$0x1e, %xmm4, %xmm3
2783          mov    $acc5, $acc2
2784         por     %xmm3, %xmm4
2785         pxor    %xmm3, %xmm3
2786          mov    $acc6, $acc3
2787         pcmpeqd %xmm3, %xmm4
2788         pshufd  \$0, %xmm4, %xmm4               # in2infty
2789
2790         lea     $Z1sqr-$bias(%rsp), $a_ptr
2791         mov     $acc7, $acc4
2792         lea     $U2(%rsp), $r_ptr               # U2 = X2*Z1^2
2793         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(U2, Z1sqr, in2_x);
2794
2795         lea     $in1_x(%rsp), $b_ptr
2796         lea     $H(%rsp), $r_ptr                # H = U2 - U1
2797         call    __ecp_nistz256_sub_from$x       # p256_sub(H, U2, in1_x);
2798
2799         `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2800         lea     $S2(%rsp), $r_ptr               # S2 = Z1^3
2801         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, Z1sqr, in1_z);
2802
2803         `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2804         lea     $res_z(%rsp), $r_ptr            # Z3 = H*Z1*Z2
2805         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(res_z, H, in1_z);
2806
2807         `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2808         lea     $S2(%rsp), $r_ptr               # S2 = Y2*Z1^3
2809         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, S2, in2_y);
2810
2811         lea     $in1_y(%rsp), $b_ptr
2812         lea     $R(%rsp), $r_ptr                # R = S2 - S1
2813         call    __ecp_nistz256_sub_from$x       # p256_sub(R, S2, in1_y);
2814
2815         `&load_for_sqr("$H(%rsp)", "$src0")`
2816         lea     $Hsqr(%rsp), $r_ptr             # H^2
2817         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Hsqr, H);
2818
2819         `&load_for_sqr("$R(%rsp)", "$src0")`
2820         lea     $Rsqr(%rsp), $r_ptr             # R^2
2821         call    __ecp_nistz256_sqr_mont$x       # p256_sqr_mont(Rsqr, R);
2822
2823         `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
2824         lea     $Hcub(%rsp), $r_ptr             # H^3
2825         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(Hcub, Hsqr, H);
2826
2827         `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2828         lea     $U2(%rsp), $r_ptr               # U1*H^2
2829         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(U2, in1_x, Hsqr);
2830 ___
2831 {
2832 #######################################################################
2833 # operate in 4-5-0-1 "name space" that matches multiplication output
2834 #
2835 my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2836 my ($poly1, $poly3)=($acc6,$acc7);
2837
     # Inline replacement for the commented-out __ecp_nistz256_mul_by_2
     # call: the product U1*H^2 still held in registers is doubled and
     # reduced, then
     #     res_x = Rsqr - 2*U1*H^2 - Hcub      H = U2 - res_x
     #
     # Reduction of the doubled value is strict.  The four-limb constant
     # (-1, $poly1, 0, $poly3) - presumably the P-256 modulus, assumed to
     # be left in those registers by the multiply helper; TODO confirm -
     # is subtracted, and the borrow of that subtraction is folded into
     # $t4, which holds zero or all ones according to the carry out of
     # the doubling.  The unreduced copy is restored only when there was
     # no carry and the subtraction borrowed.  Selecting on the carry
     # alone would leave doubled values in [p, 2^256) unreduced, and
     # __ecp_nistz256_sub would then be handed a subtrahend that is not
     # below the modulus.
2838 $code.=<<___;
2839         #lea    $U2(%rsp), $a_ptr
2840         #lea    $Hsqr(%rsp), $r_ptr     # 2*U1*H^2
2841         #call   __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
2842
2843         add     $acc0, $acc0            # a0:a3+a0:a3
2844         lea     $Rsqr(%rsp), $a_ptr
2845         adc     $acc1, $acc1
2846          mov    $acc0, $t0
2847         adc     $acc2, $acc2
2848         adc     $acc3, $acc3
2849          mov    $acc1, $t1
2850         sbb     $t4, $t4
2851
2852         sub     \$-1, $acc0
2853          mov    $acc2, $t2
2854         sbb     $poly1, $acc1
2855         sbb     \$0, $acc2
2856          mov    $acc3, $t3
2857         sbb     $poly3, $acc3
2858         sbb     \$0, $t4                # CF set only if no carry and the subtraction borrowed
2859
2860         cmovc   $t0, $acc0
2861         mov     8*0($a_ptr), $t0
2862         cmovc   $t1, $acc1
2863         mov     8*1($a_ptr), $t1
2864         cmovc   $t2, $acc2
2865         mov     8*2($a_ptr), $t2
2866         cmovc   $t3, $acc3
2867         mov     8*3($a_ptr), $t3
2868
2869         call    __ecp_nistz256_sub$x            # p256_sub(res_x, Rsqr, Hsqr);
2870
2871         lea     $Hcub(%rsp), $b_ptr
2872         lea     $res_x(%rsp), $r_ptr
2873         call    __ecp_nistz256_sub_from$x       # p256_sub(res_x, res_x, Hcub);
2874
2875         mov     $U2+8*0(%rsp), $t0
2876         mov     $U2+8*1(%rsp), $t1
2877         mov     $U2+8*2(%rsp), $t2
2878         mov     $U2+8*3(%rsp), $t3
2879         lea     $H(%rsp), $r_ptr
2880
2881         call    __ecp_nistz256_sub$x            # p256_sub(H, U2, res_x);
2882
2883         mov     $acc0, 8*0($r_ptr)              # save the result, as
2884         mov     $acc1, 8*1($r_ptr)              # __ecp_nistz256_sub doesn't
2885         mov     $acc2, 8*2($r_ptr)
2886         mov     $acc3, 8*3($r_ptr)
2887 ___
2888 }
     # Remaining arithmetic and output selection:
     #     S2 = Hcub*in1_y      H = H*R      res_y = H - S2
     # Each output coordinate is then chosen with pand/pandn/por, without
     # branching on the masks:
     #     in2infty set        -> the corresponding coordinate of in1
     #     else in1infty set   -> in2_x, in2_y and .LONE_mont for z
     #     otherwise           -> the computed res_x, res_y, res_z
     # Finally the frame is released and the saved registers are popped.
2889 $code.=<<___;
2890         `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
2891         lea     $S2(%rsp), $r_ptr
2892         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(S2, Hcub, in1_y);
2893
2894         `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
2895         lea     $H(%rsp), $r_ptr
2896         call    __ecp_nistz256_mul_mont$x       # p256_mul_mont(H, H, R);
2897
2898         lea     $S2(%rsp), $b_ptr
2899         lea     $res_y(%rsp), $r_ptr
2900         call    __ecp_nistz256_sub_from$x       # p256_sub(res_y, H, S2);
2901
2902         movq    %xmm0, $r_ptr           # restore $r_ptr
2903
2904         movdqa  %xmm5, %xmm0            # copy_conditional(res_z, ONE, in1infty);
2905         movdqa  %xmm5, %xmm1
2906         pandn   $res_z(%rsp), %xmm0
2907         movdqa  %xmm5, %xmm2
2908         pandn   $res_z+0x10(%rsp), %xmm1
2909         movdqa  %xmm5, %xmm3
2910         pand    .LONE_mont(%rip), %xmm2
2911         pand    .LONE_mont+0x10(%rip), %xmm3
2912         por     %xmm0, %xmm2
2913         por     %xmm1, %xmm3
2914
2915         movdqa  %xmm4, %xmm0            # copy_conditional(res_z, in1_z, in2infty);
2916         movdqa  %xmm4, %xmm1
2917         pandn   %xmm2, %xmm0
2918         movdqa  %xmm4, %xmm2
2919         pandn   %xmm3, %xmm1
2920         movdqa  %xmm4, %xmm3
2921         pand    $in1_z(%rsp), %xmm2
2922         pand    $in1_z+0x10(%rsp), %xmm3
2923         por     %xmm0, %xmm2
2924         por     %xmm1, %xmm3
2925         movdqu  %xmm2, 0x40($r_ptr)
2926         movdqu  %xmm3, 0x50($r_ptr)
2927
2928         movdqa  %xmm5, %xmm0            # copy_conditional(res_x, in2_x, in1infty);
2929         movdqa  %xmm5, %xmm1
2930         pandn   $res_x(%rsp), %xmm0
2931         movdqa  %xmm5, %xmm2
2932         pandn   $res_x+0x10(%rsp), %xmm1
2933         movdqa  %xmm5, %xmm3
2934         pand    $in2_x(%rsp), %xmm2
2935         pand    $in2_x+0x10(%rsp), %xmm3
2936         por     %xmm0, %xmm2
2937         por     %xmm1, %xmm3
2938
2939         movdqa  %xmm4, %xmm0            # copy_conditional(res_x, in1_x, in2infty);
2940         movdqa  %xmm4, %xmm1
2941         pandn   %xmm2, %xmm0
2942         movdqa  %xmm4, %xmm2
2943         pandn   %xmm3, %xmm1
2944         movdqa  %xmm4, %xmm3
2945         pand    $in1_x(%rsp), %xmm2
2946         pand    $in1_x+0x10(%rsp), %xmm3
2947         por     %xmm0, %xmm2
2948         por     %xmm1, %xmm3
2949         movdqu  %xmm2, 0x00($r_ptr)
2950         movdqu  %xmm3, 0x10($r_ptr)
2951
2952         movdqa  %xmm5, %xmm0            # copy_conditional(res_y, in2_y, in1infty);
2953         movdqa  %xmm5, %xmm1
2954         pandn   $res_y(%rsp), %xmm0
2955         movdqa  %xmm5, %xmm2
2956         pandn   $res_y+0x10(%rsp), %xmm1
2957         movdqa  %xmm5, %xmm3
2958         pand    $in2_y(%rsp), %xmm2
2959         pand    $in2_y+0x10(%rsp), %xmm3
2960         por     %xmm0, %xmm2
2961         por     %xmm1, %xmm3
2962
2963         movdqa  %xmm4, %xmm0            # copy_conditional(res_y, in1_y, in2infty);
2964         movdqa  %xmm4, %xmm1
2965         pandn   %xmm2, %xmm0
2966         movdqa  %xmm4, %xmm2
2967         pandn   %xmm3, %xmm1
2968         movdqa  %xmm4, %xmm3
2969         pand    $in1_y(%rsp), %xmm2
2970         pand    $in1_y+0x10(%rsp), %xmm3
2971         por     %xmm0, %xmm2
2972         por     %xmm1, %xmm3
2973         movdqu  %xmm2, 0x20($r_ptr)
2974         movdqu  %xmm3, 0x30($r_ptr)
2975
2976         add     \$32*15+8, %rsp
2977         pop     %r15
2978         pop     %r14
2979         pop     %r13
2980         pop     %r12
2981         pop     %rbx
2982         pop     %rbp
2983         ret
2984 .size   ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
2985 ___
2986 }
2987 &gen_add_affine("q");
2988
2989 ########################################################################
2990 # AD*X magic
2991 #
2992 if ($addx) {                                                            {
     # Everything in this block is emitted only when $addx is true ($addx
     # is set outside this chunk).  The inner braces scope the register
     # aliases used by four x-flavour helpers; after them the x flavours
     # of gen_double, gen_add and gen_add_affine are instantiated.
2993 ########################################################################
2994 # operate in 4-5-0-1 "name space" that matches multiplication output
2995 #
2996 my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2997
     # Helpers working on the four-limb value in $a0..$a3.  The constant
     # (-1, $poly1, 0, $poly3) plays the role of the modulus (presumably
     # the P-256 prime; $poly1 and $poly3 are not loaded here, so callers
     # must have them set up - TODO confirm):
     #   __ecp_nistz256_add_tox    a = a + [$b_ptr], stored to [$r_ptr]
     #   __ecp_nistz256_sub_fromx  a = a - [$b_ptr], stored to [$r_ptr]
     #   __ecp_nistz256_subx       a = t - a with t in $t0..$t3; the
     #                             result stays in registers only
     #   __ecp_nistz256_mul_by_2x  a = a + a, stored to [$r_ptr]
     #
     # The two additive helpers subtract the constant from the 257-bit
     # sum (carry kept in $t4), fold the borrow of that subtraction into
     # $t4, and restore the unreduced copy only when the combined
     # subtraction borrows, so a sum that lands in [p, 2^256) without
     # carry is still reduced.  The two subtractive helpers add the
     # constant back only when the subtraction borrowed.
2998 $code.=<<___;
2999 .type   __ecp_nistz256_add_tox,\@abi-omnipotent
3000 .align  32
3001 __ecp_nistz256_add_tox:
3002         xor     $t4, $t4
3003         adc     8*0($b_ptr), $a0
3004         adc     8*1($b_ptr), $a1
3005          mov    $a0, $t0
3006         adc     8*2($b_ptr), $a2
3007         adc     8*3($b_ptr), $a3
3008          mov    $a1, $t1
3009         adc     \$0, $t4
3010
3011         xor     $t3, $t3
3012         sbb     \$-1, $a0
3013          mov    $a2, $t2
3014         sbb     $poly1, $a1
3015         sbb     \$0, $a2
3016          mov    $a3, $t3
3017         sbb     $poly3, $a3
3018
3019         sbb     \$0, $t4                # CF set only if the 257-bit sum is below the constant
3020         cmovc   $t0, $a0
3021         cmovc   $t1, $a1
3022         mov     $a0, 8*0($r_ptr)
3023         cmovc   $t2, $a2
3024         mov     $a1, 8*1($r_ptr)
3025         cmovc   $t3, $a3
3026         mov     $a2, 8*2($r_ptr)
3027         mov     $a3, 8*3($r_ptr)
3028
3029         ret
3030 .size   __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
3031
3032 .type   __ecp_nistz256_sub_fromx,\@abi-omnipotent
3033 .align  32
3034 __ecp_nistz256_sub_fromx:
3035         xor     $t4, $t4
3036         sbb     8*0($b_ptr), $a0
3037         sbb     8*1($b_ptr), $a1
3038          mov    $a0, $t0
3039         sbb     8*2($b_ptr), $a2
3040         sbb     8*3($b_ptr), $a3
3041          mov    $a1, $t1
3042         sbb     \$0, $t4
3043
3044         xor     $t3, $t3
3045         adc     \$-1, $a0
3046          mov    $a2, $t2
3047         adc     $poly1, $a1
3048         adc     \$0, $a2
3049          mov    $a3, $t3
3050         adc     $poly3, $a3
3051
3052         bt      \$0, $t4
3053         cmovnc  $t0, $a0
3054         cmovnc  $t1, $a1
3055         mov     $a0, 8*0($r_ptr)
3056         cmovnc  $t2, $a2
3057         mov     $a1, 8*1($r_ptr)
3058         cmovnc  $t3, $a3
3059         mov     $a2, 8*2($r_ptr)
3060         mov     $a3, 8*3($r_ptr)
3061
3062         ret
3063 .size   __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
3064
3065 .type   __ecp_nistz256_subx,\@abi-omnipotent
3066 .align  32
3067 __ecp_nistz256_subx:
3068         xor     $t4, $t4
3069         sbb     $a0, $t0
3070         sbb     $a1, $t1
3071          mov    $t0, $a0
3072         sbb     $a2, $t2
3073         sbb     $a3, $t3
3074          mov    $t1, $a1
3075         sbb     \$0, $t4
3076
3077         xor     $a3 ,$a3
3078         adc     \$-1, $t0
3079          mov    $t2, $a2
3080         adc     $poly1, $t1
3081         adc     \$0, $t2
3082          mov    $t3, $a3
3083         adc     $poly3, $t3
3084
3085         bt      \$0, $t4
3086         cmovc   $t0, $a0
3087         cmovc   $t1, $a1
3088         cmovc   $t2, $a2
3089         cmovc   $t3, $a3
3090
3091         ret
3092 .size   __ecp_nistz256_subx,.-__ecp_nistz256_subx
3093
3094 .type   __ecp_nistz256_mul_by_2x,\@abi-omnipotent
3095 .align  32
3096 __ecp_nistz256_mul_by_2x:
3097         xor     $t4, $t4
3098         adc     $a0, $a0                # a0:a3+a0:a3
3099         adc     $a1, $a1
3100          mov    $a0, $t0
3101         adc     $a2, $a2
3102         adc     $a3, $a3
3103          mov    $a1, $t1
3104         adc     \$0, $t4
3105
3106         xor     $t3, $t3
3107         sbb     \$-1, $a0
3108          mov    $a2, $t2
3109         sbb     $poly1, $a1
3110         sbb     \$0, $a2
3111          mov    $a3, $t3
3112         sbb     $poly3, $a3
3113
3114         sbb     \$0, $t4                # CF set only if the 257-bit sum is below the constant
3115         cmovc   $t0, $a0
3116         cmovc   $t1, $a1
3117         mov     $a0, 8*0($r_ptr)
3118         cmovc   $t2, $a2
3119         mov     $a1, 8*1($r_ptr)
3120         cmovc   $t3, $a3
3121         mov     $a2, 8*2($r_ptr)
3122         mov     $a3, 8*3($r_ptr)
3123
3124         ret
3125 .size   __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
3126 ___
3127                                                                         }
     # Instantiate the x flavour of the three point routines.
3128 &gen_double("x");
3129 &gen_add("x");
3130 &gen_add_affine("x");
3131 }
3132 }}}
3133
3134 ########################################################################
3135 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
3136 #
     # The table source is looked for in the current directory first and
     # then as ${dir}../ecp_nistz256_table.c; the script dies with the
     # system error if neither can be opened.
3137 open TABLE,"<ecp_nistz256_table.c"              or
3138 open TABLE,"<${dir}../ecp_nistz256_table.c"     or
3139 die "failed to open ecp_nistz256_table.c:",$!;
3140
3141 use integer;
3142
     # Every TOBN(hi,lo) occurrence appends two values to @arr, the second
     # macro argument first.  Only lowercase hex literals are recognised.
3143 foreach(<TABLE>) {
3144         s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
3145 }
3146 close TABLE;
3147
     # Exactly 64*16*37 values are expected; any other count means the
     # table source changed shape or the pattern above missed entries.
3148 die "insane number of elements" if ($#arr != 64*16*37-1);
3149
     # Emit the table as the object ecp_nistz256_precomputed in .text,
     # aligned to 4096 bytes, sixteen .long values per output line.
3150 print <<___;
3151 .text
3152 .globl  ecp_nistz256_precomputed
3153 .type   ecp_nistz256_precomputed,\@object
3154 .align  4096
3155 ecp_nistz256_precomputed:
3156 ___
3157 while (@line=splice(@arr,0,16)) {
3158         print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";
3159 }
3160 print <<___;
3161 .size   ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
3162 ___
3163
     # Replace each backtick-quoted Perl snippet embedded in $code (for
     # example the load_for_mul calls) with the result of evaluating it,
     # then write the assembly out.  The close is checked so that a failed
     # flush, or a failing consumer when STDOUT is a pipe, aborts the
     # build instead of leaving a truncated file behind a zero exit code.
3164 $code =~ s/\`([^\`]*)\`/eval $1/gem;
3165 print $code;
3166 close STDOUT or die "error closing STDOUT: $!";