crypto/ec/asm/x25519-x86_64.pl

   1 #!/usr/bin/env perl
   2 # Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 # ====================================================================
  10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11 # project. The module is, however, dual licensed under OpenSSL and
  12 # CRYPTOGAMS licenses depending on where you obtain it. For further
  13 # details see http://www.openssl.org/~appro/cryptogams/.
  14 # ====================================================================
  15 #
  16 # X25519 lower-level primitives for x86_64.
  17 #
  18 # February 2018.
  19 #
  20 # This module implements radix 2^51 multiplication and squaring, and
  21 # radix 2^64 multiplication, squaring, addition, subtraction and final
  22 # reduction. The latter radix is used on ADCX/ADOX-capable processors
  23 # such as Broadwell. On a related note, one should mention that there
  24 # are vector implementations that provide significantly better
  25 # performance on some processors(*), but they are large and overly
  26 # complex, which, combined with their being effectively
  27 # processor-specific, makes the undertaking hard to justify. The goal
  28 # for this implementation is rather versatility and simplicity [and
  29 # ultimately formal verification].
  30 #
  31 # (*)   For example sandy2x should provide ~30% improvement on Sandy
  32 #       Bridge, but only nominal ~5% on Haswell [and big loss on
  33 #       Broadwell and successors].
  34 #
  35 ######################################################################
  36 # Improvement coefficients:
  37 #
  38 #                       amd64-51(*)     gcc-5.x(**)
  39 #
  40 # P4                    +22%            +40%
  41 # Sandy Bridge          -3%             +11%
  42 # Haswell               -1%             +13%
  43 # Broadwell(***)        +30%            +35%
  44 # Skylake(***)          +33%            +47%
  45 # Silvermont            +20%            +26%
  46 # Goldmont              +40%            +50%
  47 # Bulldozer             +20%            +9%
  48 # Ryzen(***)            +43%            +40%
  49 # VIA                   +170%           +120%
  50 #
  51 # (*)   amd64-51 is a popular assembly implementation with 2^51 radix;
  52 #       only multiplication and squaring subroutines were linked
  53 #       for comparison, but not the complete ladder step; gain on most
  54 #       processors is because this module refrains from shld, and
  55 #       minor regression on others is because this does result in
  56 #       higher instruction count;
  57 # (**)  compiler is free to inline functions, in assembly one would
  58 #       need to implement ladder step to do that, and it will improve
  59 #       performance by several percent;
  60 # (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
  61 #       C implementation, so that comparison is always against
  62 #       2^51 radix;
  63
  64 # $output is the last argument if it looks like a file (it has an extension)
  65 # $flavour is the first argument if it doesn't look like a file
     # Both are removed from @ARGV; either one stays undef when it is absent.
  66 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  67 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  68
     # Win64 conventions are selected for nasm/masm/mingw64 flavours, or when
     # the output file name ends in .asm.
  69 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  70
     # Locate the perlasm translator: first in this script's own directory,
     # then in ../../perlasm relative to it; give up if neither file exists.
  71 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  72 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  73 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  74 die "can't locate x86_64-xlate.pl";
  75
     # From here on everything printed to STDOUT is piped into the translator,
     # which is run with the current perl interpreter ($^X) and is handed the
     # flavour and the output file name.
  76 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  77     or die "can't call $xlate: $!";
  78 *STDOUT=*OUT;
  79
  80 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  81                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
             # $addx gates generation of the base 2^64 code (mulx/adcx/adox)
             # further down; GNU as is accepted from version 2.23 on.
  82         $addx = ($1>=2.23);
  83 }
  84
     # The remaining probes only run while $addx is still false.
     # NASM (Win64 builds only) is accepted from version 2.10 on.
  85 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  86             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  87         $addx = ($1>=2.10);
  88 }
  89
     # Microsoft ml64 (Win64 builds only) is accepted from major version 12 on.
  90 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  91             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  92         $addx = ($1>=12);
  93 }
  94
     # clang/LLVM is accepted from version 3.3 on.  "clang" is deliberately not
     # anchored to the start of the output: without /m a leading ^ only matches
     # at the very beginning of the banner, so vendor-prefixed banners such as
     # "Apple clang version ..." or "Ubuntu clang version ..." were not
     # recognised and the ADX code path was silently left out.
  95 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
  96         my $ver = $2 + $3/100.0;        # 3.1->3.01, 3.10->3.10
  97         $addx = ($ver>=3.03);
  98 }
  99
 100 $code.=<<___;
 101 .text
 102
 103 .globl  x25519_fe51_mul
 104 .type   x25519_fe51_mul,\@function,3
 105 .align  32
     # x25519_fe51_mul: h = f*g in radix 2^51, five 64-bit limbs per operand.
     # Arguments: %rdi = h (result), %rsi = f, %rdx = g.
     # The 5x5 limb products are accumulated into five 128-bit sums h0..h4,
     # held as low:high pairs %rbx:%rcx, %r8:%r9, %r10:%r11, %r12:%r13 and
     # %r14:%r15.  Products that would land above limb 4 use g[i]*19 instead
     # of g[i], which folds them back into the low limbs.
     # Carry propagation, the store to h and the epilogue are done by the
     # shared tail .Lreduce51 located in x25519_fe51_sqr.
     # Stack scratch: 8*0..8*2(%rsp) = g[0..2], 8*4(%rsp) = saved %rdi.
 106 x25519_fe51_mul:
 107 .cfi_startproc
 108         push    %rbp
 109 .cfi_push       %rbp
 110         push    %rbx
 111 .cfi_push       %rbx
 112         push    %r12
 113 .cfi_push       %r12
 114         push    %r13
 115 .cfi_push       %r13
 116         push    %r14
 117 .cfi_push       %r14
 118         push    %r15
 119 .cfi_push       %r15
 120         lea     -8*5(%rsp),%rsp
 121 .cfi_adjust_cfa_offset  40
 122 .Lfe51_mul_body:
 123
 124         mov     8*0(%rsi),%rax          # f[0]
 125         mov     8*0(%rdx),%r11          # load g[0-4]
 126         mov     8*1(%rdx),%r12
 127         mov     8*2(%rdx),%r13
 128         mov     8*3(%rdx),%rbp
 129         mov     8*4(%rdx),%r14
 130
 131         mov     %rdi,8*4(%rsp)          # offload 1st argument
 132         mov     %rax,%rdi
 133         mulq    %r11                    # f[0]*g[0]
 134         mov     %r11,8*0(%rsp)          # offload g[0]
 135         mov     %rax,%rbx               # %rbx:%rcx = h0
 136         mov     %rdi,%rax
 137         mov     %rdx,%rcx
 138         mulq    %r12                    # f[0]*g[1]
 139         mov     %r12,8*1(%rsp)          # offload g[1]
 140         mov     %rax,%r8                # %r8:%r9 = h1
 141         mov     %rdi,%rax
 142         lea     (%r14,%r14,8),%r15      # g[4]*9
 143         mov     %rdx,%r9
 144         mulq    %r13                    # f[0]*g[2]
 145         mov     %r13,8*2(%rsp)          # offload g[2]
 146         mov     %rax,%r10               # %r10:%r11 = h2
 147         mov     %rdi,%rax
 148         lea     (%r14,%r15,2),%rdi      # g[4]*19
 149         mov     %rdx,%r11
 150         mulq    %rbp                    # f[0]*g[3]
 151         mov     %rax,%r12               # %r12:%r13 = h3
 152         mov     8*0(%rsi),%rax          # f[0]
 153         mov     %rdx,%r13
 154         mulq    %r14                    # f[0]*g[4]
 155         mov     %rax,%r14               # %r14:%r15 = h4
 156         mov     8*1(%rsi),%rax          # f[1]
 157         mov     %rdx,%r15
 158
 159         mulq    %rdi                    # f[1]*g[4]*19
 160         add     %rax,%rbx
 161         mov     8*2(%rsi),%rax          # f[2]
 162         adc     %rdx,%rcx
 163         mulq    %rdi                    # f[2]*g[4]*19
 164         add     %rax,%r8
 165         mov     8*3(%rsi),%rax          # f[3]
 166         adc     %rdx,%r9
 167         mulq    %rdi                    # f[3]*g[4]*19
 168         add     %rax,%r10
 169         mov     8*4(%rsi),%rax          # f[4]
 170         adc     %rdx,%r11
 171         mulq    %rdi                    # f[4]*g[4]*19
 172         imulq   \$19,%rbp,%rdi          # g[3]*19
 173         add     %rax,%r12
 174         mov     8*1(%rsi),%rax          # f[1]
 175         adc     %rdx,%r13
 176         mulq    %rbp                    # f[1]*g[3]
 177         mov     8*2(%rsp),%rbp          # g[2]
 178         add     %rax,%r14
 179         mov     8*2(%rsi),%rax          # f[2]
 180         adc     %rdx,%r15
 181
 182         mulq    %rdi                    # f[2]*g[3]*19
 183         add     %rax,%rbx
 184         mov     8*3(%rsi),%rax          # f[3]
 185         adc     %rdx,%rcx
 186         mulq    %rdi                    # f[3]*g[3]*19
 187         add     %rax,%r8
 188         mov     8*4(%rsi),%rax          # f[4]
 189         adc     %rdx,%r9
 190         mulq    %rdi                    # f[4]*g[3]*19
 191         imulq   \$19,%rbp,%rdi          # g[2]*19
 192         add     %rax,%r10
 193         mov     8*1(%rsi),%rax          # f[1]
 194         adc     %rdx,%r11
 195         mulq    %rbp                    # f[1]*g[2]
 196         add     %rax,%r12
 197         mov     8*2(%rsi),%rax          # f[2]
 198         adc     %rdx,%r13
 199         mulq    %rbp                    # f[2]*g[2]
 200         mov     8*1(%rsp),%rbp          # g[1]
 201         add     %rax,%r14
 202         mov     8*3(%rsi),%rax          # f[3]
 203         adc     %rdx,%r15
 204
 205         mulq    %rdi                    # f[3]*g[2]*19
 206         add     %rax,%rbx
 207         mov     8*4(%rsi),%rax          # f[4]
 208         adc     %rdx,%rcx
 209         mulq    %rdi                    # f[4]*g[2]*19
 210         add     %rax,%r8
 211         mov     8*1(%rsi),%rax          # f[1]
 212         adc     %rdx,%r9
 213         mulq    %rbp                    # f[1]*g[1]
 214         imulq   \$19,%rbp,%rdi          # g[1]*19
 215         add     %rax,%r10
 216         mov     8*2(%rsi),%rax          # f[2]
 217         adc     %rdx,%r11
 218         mulq    %rbp                    # f[2]*g[1]
 219         add     %rax,%r12
 220         mov     8*3(%rsi),%rax          # f[3]
 221         adc     %rdx,%r13
 222         mulq    %rbp                    # f[3]*g[1]
 223         mov     8*0(%rsp),%rbp          # g[0]
 224         add     %rax,%r14
 225         mov     8*4(%rsi),%rax          # f[4]
 226         adc     %rdx,%r15
 227
 228         mulq    %rdi                    # f[4]*g[1]*19
 229         add     %rax,%rbx
 230         mov     8*1(%rsi),%rax          # f[1]
 231         adc     %rdx,%rcx
 232         mul     %rbp                    # f[1]*g[0]
 233         add     %rax,%r8
 234         mov     8*2(%rsi),%rax          # f[2]
 235         adc     %rdx,%r9
 236         mul     %rbp                    # f[2]*g[0]
 237         add     %rax,%r10
 238         mov     8*3(%rsi),%rax          # f[3]
 239         adc     %rdx,%r11
 240         mul     %rbp                    # f[3]*g[0]
 241         add     %rax,%r12
 242         mov     8*4(%rsi),%rax          # f[4]
 243         adc     %rdx,%r13
 244         mulq    %rbp                    # f[4]*g[0]
 245         add     %rax,%r14
 246         adc     %rdx,%r15
 247
 248         mov     8*4(%rsp),%rdi          # restore 1st argument
 249         jmp     .Lreduce51              # shared reduction, store and epilogue
 250 .Lfe51_mul_epilogue:
 251 .cfi_endproc
 252 .size   x25519_fe51_mul,.-x25519_fe51_mul
 253
 254 .globl  x25519_fe51_sqr
 255 .type   x25519_fe51_sqr,\@function,2
 256 .align  32
     # x25519_fe51_sqr: h = g*g in radix 2^51, five 64-bit limbs per operand.
     # Arguments: %rdi = h (result), %rsi = g.
     # Cross products are computed once against a doubled limb (2*g[i]);
     # products that would land above limb 4 use a limb pre-multiplied by 19.
     # The five 128-bit sums h0..h4 live in %rbx:%rcx, %r8:%r9, %r10:%r11,
     # %r12:%r13 and %r14:%r15 (low:high) and are handed to .Lreduce51 below.
     # Stack scratch: 8*0(%rsp) = g[2], 8*4(%rsp) = saved %rdi.
 257 x25519_fe51_sqr:
 258 .cfi_startproc
 259         push    %rbp
 260 .cfi_push       %rbp
 261         push    %rbx
 262 .cfi_push       %rbx
 263         push    %r12
 264 .cfi_push       %r12
 265         push    %r13
 266 .cfi_push       %r13
 267         push    %r14
 268 .cfi_push       %r14
 269         push    %r15
 270 .cfi_push       %r15
 271         lea     -8*5(%rsp),%rsp
 272 .cfi_adjust_cfa_offset  40
 273 .Lfe51_sqr_body:
 274
 275         mov     8*0(%rsi),%rax          # g[0]
 276         mov     8*2(%rsi),%r15          # g[2]
 277         mov     8*4(%rsi),%rbp          # g[4]
 278
 279         mov     %rdi,8*4(%rsp)          # offload 1st argument
 280         lea     (%rax,%rax),%r14        # 2*g[0]
 281         mulq    %rax                    # g[0]*g[0]
 282         mov     %rax,%rbx               # %rbx:%rcx = h0
 283         mov     8*1(%rsi),%rax          # g[1]
 284         mov     %rdx,%rcx
 285         mulq    %r14                    # 2*g[0]*g[1]
 286         mov     %rax,%r8                # %r8:%r9 = h1
 287         mov     %r15,%rax
 288         mov     %r15,8*0(%rsp)          # offload g[2]
 289         mov     %rdx,%r9
 290         mulq    %r14                    # 2*g[0]*g[2]
 291         mov     %rax,%r10               # %r10:%r11 = h2
 292         mov     8*3(%rsi),%rax          # g[3]
 293         mov     %rdx,%r11
 294         imulq   \$19,%rbp,%rdi          # g[4]*19
 295         mulq    %r14                    # 2*g[0]*g[3]
 296         mov     %rax,%r12               # %r12:%r13 = h3
 297         mov     %rbp,%rax
 298         mov     %rdx,%r13
 299         mulq    %r14                    # 2*g[0]*g[4]
 300         mov     %rax,%r14               # %r14:%r15 = h4
 301         mov     %rbp,%rax
 302         mov     %rdx,%r15
 303
 304         mulq    %rdi                    # g[4]*g[4]*19
 305         add     %rax,%r12
 306         mov     8*1(%rsi),%rax          # g[1]
 307         adc     %rdx,%r13
 308
 309         mov     8*3(%rsi),%rsi          # g[3]
 310         lea     (%rax,%rax),%rbp        # 2*g[1]
 311         mulq    %rax                    # g[1]*g[1]
 312         add     %rax,%r10
 313         mov     8*0(%rsp),%rax          # g[2]
 314         adc     %rdx,%r11
 315         mulq    %rbp                    # 2*g[1]*g[2]
 316         add     %rax,%r12
 317         mov     %rbp,%rax
 318         adc     %rdx,%r13
 319         mulq    %rsi                    # 2*g[1]*g[3]
 320         add     %rax,%r14
 321         mov     %rbp,%rax
 322         adc     %rdx,%r15
 323         imulq   \$19,%rsi,%rbp          # g[3]*19
 324         mulq    %rdi                    # 2*g[1]*g[4]*19
 325         add     %rax,%rbx
 326         lea     (%rsi,%rsi),%rax        # 2*g[3]
 327         adc     %rdx,%rcx
 328
 329         mulq    %rdi                    # 2*g[3]*g[4]*19
 330         add     %rax,%r10
 331         mov     %rsi,%rax
 332         adc     %rdx,%r11
 333         mulq    %rbp                    # g[3]*g[3]*19
 334         add     %rax,%r8
 335         mov     8*0(%rsp),%rax          # g[2]
 336         adc     %rdx,%r9
 337
 338         lea     (%rax,%rax),%rsi        # 2*g[2]
 339         mulq    %rax                    # g[2]*g[2]
 340         add     %rax,%r14
 341         mov     %rbp,%rax
 342         adc     %rdx,%r15
 343         mulq    %rsi                    # 2*g[2]*g[3]*19
 344         add     %rax,%rbx
 345         mov     %rsi,%rax
 346         adc     %rdx,%rcx
 347         mulq    %rdi                    # 2*g[2]*g[4]*19
 348         add     %rax,%r8
 349         adc     %rdx,%r9
 350
 351         mov     8*4(%rsp),%rdi          # restore 1st argument
 352         jmp     .Lreduce51
 353
 354 .align  32
     # .Lreduce51: shared tail of x25519_fe51_mul, x25519_fe51_sqr and
     # x25519_fe51_mul121666.  On entry h0..h4 are 128-bit low:high pairs in
     # %rbx:%rcx, %r8:%r9, %r10:%r11, %r12:%r13, %r14:%r15 and %rdi points at
     # the result.  Each sum is split at bit 51; the upper part is carried
     # into the next limb and the carry out of h4 is multiplied by 19 and
     # added to limb 0.  g1 and g3 receive a last carry without being masked
     # again.  The tail then restores the callee-saved registers from the
     # 6-push + 40-byte frame set up by its callers and returns.
 355 .Lreduce51:
 356         mov     \$0x7ffffffffffff,%rbp  # mask = 2^51-1
 357
 358         mov     %r10,%rdx
 359         shr     \$51,%r10
 360         shl     \$13,%r11
 361         and     %rbp,%rdx               # %rdx = g2 = h2 & mask
 362         or      %r10,%r11               # h2>>51
 363         add     %r11,%r12
 364         adc     \$0,%r13                # h3 += h2>>51
 365
 366         mov     %rbx,%rax
 367         shr     \$51,%rbx
 368         shl     \$13,%rcx
 369         and     %rbp,%rax               # %rax = g0 = h0 & mask
 370         or      %rbx,%rcx               # h0>>51
 371         add     %rcx,%r8                # h1 += h0>>51
 372         adc     \$0,%r9
 373
 374         mov     %r12,%rbx
 375         shr     \$51,%r12
 376         shl     \$13,%r13
 377         and     %rbp,%rbx               # %rbx = g3 = h3 & mask
 378         or      %r12,%r13               # h3>>51
 379         add     %r13,%r14               # h4 += h3>>51
 380         adc     \$0,%r15
 381
 382         mov     %r8,%rcx
 383         shr     \$51,%r8
 384         shl     \$13,%r9
 385         and     %rbp,%rcx               # %rcx = g1 = h1 & mask
 386         or      %r8,%r9                 # h1>>51
 387         add     %r9,%rdx                # g2 += h1>>51
 388
 389         mov     %r14,%r10
 390         shr     \$51,%r14
 391         shl     \$13,%r15
 392         and     %rbp,%r10               # %r10 = g4 = h4 & mask
 393         or      %r14,%r15               # h4>>51
 394
 395         lea     (%r15,%r15,8),%r14      # (h4>>51)*9
 396         lea     (%r15,%r14,2),%r15      # (h4>>51)*19
 397         add     %r15,%rax               # g0 += (h4>>51)*19
 398
 399         mov     %rdx,%r8
 400         and     %rbp,%rdx               # g2 &= mask
 401         shr     \$51,%r8
 402         add     %r8,%rbx                # g3 += g2>>51
 403
 404         mov     %rax,%r9
 405         and     %rbp,%rax               # g0 &= mask
 406         shr     \$51,%r9
 407         add     %r9,%rcx                # g1 += g0>>51
 408
 409         mov     %rax,8*0(%rdi)          # save the result
 410         mov     %rcx,8*1(%rdi)
 411         mov     %rdx,8*2(%rdi)
 412         mov     %rbx,8*3(%rdi)
 413         mov     %r10,8*4(%rdi)
 414
 415         mov     8*5(%rsp),%r15
 416 .cfi_restore    %r15
 417         mov     8*6(%rsp),%r14
 418 .cfi_restore    %r14
 419         mov     8*7(%rsp),%r13
 420 .cfi_restore    %r13
 421         mov     8*8(%rsp),%r12
 422 .cfi_restore    %r12
 423         mov     8*9(%rsp),%rbx
 424 .cfi_restore    %rbx
 425         mov     8*10(%rsp),%rbp
 426 .cfi_restore    %rbp
 427         lea     8*11(%rsp),%rsp
     # The frame (6 pushes + 40 bytes = 88 bytes) has just been released, so
     # the CFA offset has to go down by 88, back to 8 at the ret below.
 428 .cfi_adjust_cfa_offset  -88
 429 .Lfe51_sqr_epilogue:
 430         ret
 431 .cfi_endproc
 432 .size   x25519_fe51_sqr,.-x25519_fe51_sqr
 433
 434 .globl  x25519_fe51_mul121666
 435 .type   x25519_fe51_mul121666,\@function,2
 436 .align  32
     # x25519_fe51_mul121666: h = f*121666 in radix 2^51.
     # Arguments: %rdi = h (result), %rsi = f (five 64-bit limbs).
     # Each limb is multiplied by the constant into a 128-bit low:high pair
     # (h0..h4 in %rbx:%rcx, %r8:%r9, %r10:%r11, %r12:%r13, %r14:%r15) and
     # the shared tail .Lreduce51 in x25519_fe51_sqr finishes the job.
     # The 40 bytes of stack scratch are not used here; the frame is laid
     # out this way because .Lreduce51 restores the saved registers from
     # 8*5(%rsp) upwards.  %rdi is left untouched, so no reload is needed.
 437 x25519_fe51_mul121666:
 438 .cfi_startproc
 439         push    %rbp
 440 .cfi_push       %rbp
 441         push    %rbx
 442 .cfi_push       %rbx
 443         push    %r12
 444 .cfi_push       %r12
 445         push    %r13
 446 .cfi_push       %r13
 447         push    %r14
 448 .cfi_push       %r14
 449         push    %r15
 450 .cfi_push       %r15
 451         lea     -8*5(%rsp),%rsp
 452 .cfi_adjust_cfa_offset  40
 453 .Lfe51_mul121666_body:
 454         mov     \$121666,%eax
 455
 456         mulq    8*0(%rsi)
 457         mov     %rax,%rbx               # %rbx:%rcx = h0
 458         mov     \$121666,%eax           # mulq overwrote %rax, reload constant
 459         mov     %rdx,%rcx
 460         mulq    8*1(%rsi)
 461         mov     %rax,%r8                # %r8:%r9 = h1
 462         mov     \$121666,%eax
 463         mov     %rdx,%r9
 464         mulq    8*2(%rsi)
 465         mov     %rax,%r10               # %r10:%r11 = h2
 466         mov     \$121666,%eax
 467         mov     %rdx,%r11
 468         mulq    8*3(%rsi)
 469         mov     %rax,%r12               # %r12:%r13 = h3
 470         mov     \$121666,%eax           # reload constant
 471         mov     %rdx,%r13
 472         mulq    8*4(%rsi)
 473         mov     %rax,%r14               # %r14:%r15 = h4
 474         mov     %rdx,%r15
 475
 476         jmp     .Lreduce51
 477 .Lfe51_mul121666_epilogue:
 478 .cfi_endproc
 479 .size   x25519_fe51_mul121666,.-x25519_fe51_mul121666
 480 ___
 481 ########################################################################
 482 # Base 2^64 subroutines modulo 2*(2^255-19)
 483 #
 484 if ($addx) {
 485 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));
 486
 487 $code.=<<___;
 488 .extern OPENSSL_ia32cap_P
 489 .globl  x25519_fe64_eligible
 490 .type   x25519_fe64_eligible,\@abi-omnipotent
 491 .align  32
     # x25519_fe64_eligible: takes no arguments.  Returns 0x80100 in %eax
     # when both bits of the mask 0x80100 are set in the 32-bit word at
     # OPENSSL_ia32cap_P+8, and 0 otherwise.
     # NOTE(review): the two bits are presumably the BMI2 (bit 8) and ADX
     # (bit 19) capability flags needed by the mulx/adcx/adox code below -
     # confirm against the OPENSSL_ia32cap documentation.
 492 x25519_fe64_eligible:
 493 .cfi_startproc
 494         mov     OPENSSL_ia32cap_P+8(%rip),%ecx
 495         xor     %eax,%eax               # default answer: not eligible
 496         and     \$0x80100,%ecx
 497         cmp     \$0x80100,%ecx          # both bits set?
 498         cmove   %ecx,%eax
 499         ret
 500 .cfi_endproc
 501 .size   x25519_fe64_eligible,.-x25519_fe64_eligible
 502
 503 .globl  x25519_fe64_mul
 504 .type   x25519_fe64_mul,\@function,3
 505 .align  32
     # x25519_fe64_mul: r = a*b in radix 2^64, four 64-bit limbs per operand.
     # Arguments: %rdi = r (result), %rsi = a, %rdx = b.
     # A 4x4 schoolbook multiplication with mulx builds the 512-bit product
     # in %r8..%r15 (low to high).  Two independent carry chains are kept
     # in CF (adcx) and OF (adox); %rdi is zeroed and serves as the zero
     # operand that terminates each chain.  The shared tail .Lreduce64 in
     # x25519_fe64_sqr folds the upper half back in, stores r and returns.
     # Stack: (%rsp) = b[2], 8*2(%rsp) = saved %rdi (pushed last).
 506 x25519_fe64_mul:
 507 .cfi_startproc
 508         push    %rbp
 509 .cfi_push       %rbp
 510         push    %rbx
 511 .cfi_push       %rbx
 512         push    %r12
 513 .cfi_push       %r12
 514         push    %r13
 515 .cfi_push       %r13
 516         push    %r14
 517 .cfi_push       %r14
 518         push    %r15
 519 .cfi_push       %r15
 520         push    %rdi                    # offload dst
 521 .cfi_push       %rdi
 522         lea     -8*2(%rsp),%rsp
 523 .cfi_adjust_cfa_offset  16
 524 .Lfe64_mul_body:
 525
 526         mov     %rdx,%rax               # b pointer; %rdx is needed by mulx
 527         mov     8*0(%rdx),%rbp          # b[0]
 528         mov     8*0(%rsi),%rdx          # a[0]
 529         mov     8*1(%rax),%rcx          # b[1]
 530         mov     8*2(%rax),$acc6         # b[2]
 531         mov     8*3(%rax),$acc7         # b[3]
 532
 533         mulx    %rbp,$acc0,%rax         # a[0]*b[0]
 534         xor     %edi,%edi               # cf=0,of=0
 535         mulx    %rcx,$acc1,%rbx         # a[0]*b[1]
 536         adcx    %rax,$acc1
 537         mulx    $acc6,$acc2,%rax        # a[0]*b[2]
 538         adcx    %rbx,$acc2
 539         mulx    $acc7,$acc3,$acc4       # a[0]*b[3]
 540          mov    8*1(%rsi),%rdx          # a[1]
 541         adcx    %rax,$acc3
 542         mov     $acc6,(%rsp)            # offload b[2]
 543         adcx    %rdi,$acc4              # cf=0
 544
 545         mulx    %rbp,%rax,%rbx          # a[1]*b[0]
 546         adox    %rax,$acc1
 547         adcx    %rbx,$acc2
 548         mulx    %rcx,%rax,%rbx          # a[1]*b[1]
 549         adox    %rax,$acc2
 550         adcx    %rbx,$acc3
 551         mulx    $acc6,%rax,%rbx         # a[1]*b[2]
 552         adox    %rax,$acc3
 553         adcx    %rbx,$acc4
 554         mulx    $acc7,%rax,$acc5        # a[1]*b[3]
 555          mov    8*2(%rsi),%rdx          # a[2]
 556         adox    %rax,$acc4
 557         adcx    %rdi,$acc5              # cf=0
 558         adox    %rdi,$acc5              # of=0
 559
 560         mulx    %rbp,%rax,%rbx          # a[2]*b[0]
 561         adcx    %rax,$acc2
 562         adox    %rbx,$acc3
 563         mulx    %rcx,%rax,%rbx          # a[2]*b[1]
 564         adcx    %rax,$acc3
 565         adox    %rbx,$acc4
 566         mulx    $acc6,%rax,%rbx         # a[2]*b[2]
 567         adcx    %rax,$acc4
 568         adox    %rbx,$acc5
 569         mulx    $acc7,%rax,$acc6        # a[2]*b[3]
 570          mov    8*3(%rsi),%rdx          # a[3]
 571         adcx    %rax,$acc5
 572         adox    %rdi,$acc6              # of=0
 573         adcx    %rdi,$acc6              # cf=0
 574
 575         mulx    %rbp,%rax,%rbx          # a[3]*b[0]
 576         adox    %rax,$acc3
 577         adcx    %rbx,$acc4
 578         mulx    %rcx,%rax,%rbx          # a[3]*b[1]
 579         adox    %rax,$acc4
 580         adcx    %rbx,$acc5
 581         mulx    (%rsp),%rax,%rbx        # a[3]*b[2]
 582         adox    %rax,$acc5
 583         adcx    %rbx,$acc6
 584         mulx    $acc7,%rax,$acc7        # a[3]*b[3]
 585          mov    \$38,%edx               # multiplier expected by .Lreduce64
 586         adox    %rax,$acc6
 587         adcx    %rdi,$acc7              # cf=0
 588         adox    %rdi,$acc7              # of=0
 589
 590         jmp     .Lreduce64
 591 .Lfe64_mul_epilogue:
 592 .cfi_endproc
 593 .size   x25519_fe64_mul,.-x25519_fe64_mul
 594
 595 .globl  x25519_fe64_sqr
 596 .type   x25519_fe64_sqr,\@function,2
 597 .align  32
     # x25519_fe64_sqr: r = a*a in radix 2^64, four 64-bit limbs per operand.
     # Arguments: %rdi = r (result), %rsi = a.
     # The off-diagonal products a[i]*a[j], i<j, are summed once into
     # %r9..%r14, then doubled with an adcx chain while an adox chain adds
     # the diagonal squares a[i]*a[i].  The 512-bit result in %r8..%r15
     # (low to high) is handed to .Lreduce64 below.
     # Stack: 8*2(%rsp) = saved %rdi (pushed last); %rdi itself is zeroed
     # and used as the zero operand that terminates the carry chains.
 598 x25519_fe64_sqr:
 599 .cfi_startproc
 600         push    %rbp
 601 .cfi_push       %rbp
 602         push    %rbx
 603 .cfi_push       %rbx
 604         push    %r12
 605 .cfi_push       %r12
 606         push    %r13
 607 .cfi_push       %r13
 608         push    %r14
 609 .cfi_push       %r14
 610         push    %r15
 611 .cfi_push       %r15
 612         push    %rdi                    # offload dst
 613 .cfi_push       %rdi
 614         lea     -8*2(%rsp),%rsp
 615 .cfi_adjust_cfa_offset  16
 616 .Lfe64_sqr_body:
 617
 618         mov     8*0(%rsi),%rdx          # a[0]
 619         mov     8*1(%rsi),%rcx          # a[1]
 620         mov     8*2(%rsi),%rbp          # a[2]
 621         mov     8*3(%rsi),%rsi          # a[3]
 622
 623         ################################################################
 624         mulx    %rdx,$acc0,$acc7        # a[0]*a[0]
 625         mulx    %rcx,$acc1,%rax         # a[0]*a[1]
 626         xor     %edi,%edi               # cf=0,of=0
 627         mulx    %rbp,$acc2,%rbx         # a[0]*a[2]
 628         adcx    %rax,$acc2
 629         mulx    %rsi,$acc3,$acc4        # a[0]*a[3]
 630          mov    %rcx,%rdx               # a[1]
 631         adcx    %rbx,$acc3
 632         adcx    %rdi,$acc4              # cf=0
 633
 634         ################################################################
 635         mulx    %rbp,%rax,%rbx          # a[1]*a[2]
 636         adox    %rax,$acc3
 637         adcx    %rbx,$acc4
 638         mulx    %rsi,%rax,$acc5         # a[1]*a[3]
 639          mov    %rbp,%rdx               # a[2]
 640         adox    %rax,$acc4
 641         adcx    %rdi,$acc5
 642
 643         ################################################################
 644         mulx    %rsi,%rax,$acc6         # a[2]*a[3]
 645          mov    %rcx,%rdx               # a[1]
 646         adox    %rax,$acc5
 647         adcx    %rdi,$acc6              # cf=0
 648         adox    %rdi,$acc6              # of=0
 649
 650          adcx   $acc1,$acc1             # acc1:6<<1
 651         adox    $acc7,$acc1             # + high half of a[0]*a[0]
 652          adcx   $acc2,$acc2
 653         mulx    %rdx,%rax,%rbx          # a[1]*a[1]
 654          mov    %rbp,%rdx               # a[2]
 655          adcx   $acc3,$acc3
 656         adox    %rax,$acc2
 657          adcx   $acc4,$acc4
 658         adox    %rbx,$acc3
 659         mulx    %rdx,%rax,%rbx          # a[2]*a[2]
 660          mov    %rsi,%rdx               # a[3]
 661          adcx   $acc5,$acc5
 662         adox    %rax,$acc4
 663          adcx   $acc6,$acc6
 664         adox    %rbx,$acc5
 665         mulx    %rdx,%rax,$acc7         # a[3]*a[3]
 666          mov    \$38,%edx               # multiplier expected by .Lreduce64
 667         adox    %rax,$acc6
 668         adcx    %rdi,$acc7              # cf=0
 669         adox    %rdi,$acc7              # of=0
 670         jmp     .Lreduce64
 671
 672 .align  32
     # .Lreduce64: shared tail of x25519_fe64_mul and x25519_fe64_sqr.
     # On entry: 512-bit value in %r8..%r15 (low to high), %rdx = 38,
     # %rdi = 0, CF = OF = 0, and the result pointer saved at 8*2(%rsp).
     # The upper four limbs are multiplied by 38 and added to the lower
     # four (2^256 is congruent to 38 modulo 2^255-19); the limb that
     # overflows from that sum is folded the same way, and a final carry
     # adds one more 38.  The result is stored, the callee-saved registers
     # are restored from the 7-push + 16-byte frame, and control returns to
     # the original caller.
 673 .Lreduce64:
 674         mulx    $acc4,%rax,%rbx
 675         adcx    %rax,$acc0
 676         adox    %rbx,$acc1
 677         mulx    $acc5,%rax,%rbx
 678         adcx    %rax,$acc1
 679         adox    %rbx,$acc2
 680         mulx    $acc6,%rax,%rbx
 681         adcx    %rax,$acc2
 682         adox    %rbx,$acc3
 683         mulx    $acc7,%rax,$acc4
 684         adcx    %rax,$acc3
 685         adox    %rdi,$acc4
 686         adcx    %rdi,$acc4
 687
 688         mov     8*2(%rsp),%rdi          # restore dst
 689         imulq   %rdx,$acc4              # overflow limb times 38
 690
 691         add     $acc4,$acc0
 692         adc     \$0,$acc1
 693         adc     \$0,$acc2
 694         adc     \$0,$acc3
 695
 696         sbb     %rax,%rax               # cf -> mask
 697         and     \$38,%rax
 698
 699         add     %rax,$acc0
 700         mov     $acc1,8*1(%rdi)
 701         mov     $acc2,8*2(%rdi)
 702         mov     $acc3,8*3(%rdi)
 703         mov     $acc0,8*0(%rdi)
 704
 705         mov     8*3(%rsp),%r15
 706 .cfi_restore    %r15
 707         mov     8*4(%rsp),%r14
 708 .cfi_restore    %r14
 709         mov     8*5(%rsp),%r13
 710 .cfi_restore    %r13
 711         mov     8*6(%rsp),%r12
 712 .cfi_restore    %r12
 713         mov     8*7(%rsp),%rbx
 714 .cfi_restore    %rbx
 715         mov     8*8(%rsp),%rbp
 716 .cfi_restore    %rbp
 717         lea     8*9(%rsp),%rsp
     # The frame (7 pushes + 16 bytes = 72 bytes) has just been released, so
     # the CFA offset has to go down by 72, back to 8 at the ret below.
 718 .cfi_adjust_cfa_offset  -72
 719 .Lfe64_sqr_epilogue:
 720         ret
 721 .cfi_endproc
 722 .size   x25519_fe64_sqr,.-x25519_fe64_sqr
 723
 724 .globl  x25519_fe64_mul121666
 725 .type   x25519_fe64_mul121666,\@function,2
 726 .align  32
     # x25519_fe64_mul121666: r = a*121666 in radix 2^64.
     # Arguments: %rdi = r (result), %rsi = a (four 64-bit limbs).
     # Leaf routine: uses only %rax, %rcx, %rdx and %r8..%r11, no stack.
     # The limb that overflows past 2^256 is multiplied by 38 and added
     # back to the low limb; a carry out of that addition adds another 38.
 727 x25519_fe64_mul121666:
 728 .Lfe64_mul121666_body:
 729 .cfi_startproc
 730         mov     \$121666,%edx
 731         mulx    8*0(%rsi),$acc0,%rcx
 732         mulx    8*1(%rsi),$acc1,%rax
 733         add     %rcx,$acc1
 734         mulx    8*2(%rsi),$acc2,%rcx
 735         adc     %rax,$acc2
 736         mulx    8*3(%rsi),$acc3,%rax
 737         adc     %rcx,$acc3
 738         adc     \$0,%rax                # %rax = limb above 2^256
 739
 740         imulq   \$38,%rax,%rax
 741
 742         add     %rax,$acc0
 743         adc     \$0,$acc1
 744         adc     \$0,$acc2
 745         adc     \$0,$acc3
 746
 747         sbb     %rax,%rax               # cf -> mask
 748         and     \$38,%rax
 749
 750         add     %rax,$acc0
 751         mov     $acc1,8*1(%rdi)
 752         mov     $acc2,8*2(%rdi)
 753         mov     $acc3,8*3(%rdi)
 754         mov     $acc0,8*0(%rdi)
 755
 756 .Lfe64_mul121666_epilogue:
 757         ret
 758 .cfi_endproc
 759 .size   x25519_fe64_mul121666,.-x25519_fe64_mul121666
 760
 761 .globl  x25519_fe64_add
 762 .type   x25519_fe64_add,\@function,3
 763 .align  32
     # x25519_fe64_add: r = a + b in radix 2^64, four 64-bit limbs each.
     # Arguments: %rdi = r (result), %rsi = a, %rdx = b.
     # A carry out of the 256-bit addition is replaced by adding 38
     # (2^256 is congruent to 38 modulo 2^255-19); should that addition
     # carry out as well, 38 is added to the low limb once more.
     # Stores of the upper limbs are interleaved with the second fold.
 764 x25519_fe64_add:
 765 .Lfe64_add_body:
 766 .cfi_startproc
 767         mov     8*0(%rsi),$acc0
 768         mov     8*1(%rsi),$acc1
 769         mov     8*2(%rsi),$acc2
 770         mov     8*3(%rsi),$acc3
 771
 772         add     8*0(%rdx),$acc0
 773         adc     8*1(%rdx),$acc1
 774         adc     8*2(%rdx),$acc2
 775         adc     8*3(%rdx),$acc3
 776
 777         sbb     %rax,%rax               # cf -> mask
 778         and     \$38,%rax
 779
 780         add     %rax,$acc0
 781         adc     \$0,$acc1
 782         adc     \$0,$acc2
 783         mov     $acc1,8*1(%rdi)
 784         adc     \$0,$acc3
 785         mov     $acc2,8*2(%rdi)
 786         sbb     %rax,%rax               # cf -> mask
 787         mov     $acc3,8*3(%rdi)
 788         and     \$38,%rax
 789
 790         add     %rax,$acc0
 791         mov     $acc0,8*0(%rdi)
 792
 793 .Lfe64_add_epilogue:
 794         ret
 795 .cfi_endproc
 796 .size   x25519_fe64_add,.-x25519_fe64_add
 797
 798 .globl  x25519_fe64_sub
 799 .type   x25519_fe64_sub,\@function,3
 800 .align  32
     # x25519_fe64_sub: r = a - b in radix 2^64, four 64-bit limbs each.
     # Arguments: %rdi = r (result), %rsi = a, %rdx = b.
     # A borrow out of the 256-bit subtraction is compensated by
     # subtracting 38 (2^256 is congruent to 38 modulo 2^255-19); should
     # that subtraction borrow as well, 38 is subtracted from the low limb
     # once more.  Stores of the upper limbs are interleaved with the
     # second fold.
 801 x25519_fe64_sub:
 802 .Lfe64_sub_body:
 803 .cfi_startproc
 804         mov     8*0(%rsi),$acc0
 805         mov     8*1(%rsi),$acc1
 806         mov     8*2(%rsi),$acc2
 807         mov     8*3(%rsi),$acc3
 808
 809         sub     8*0(%rdx),$acc0
 810         sbb     8*1(%rdx),$acc1
 811         sbb     8*2(%rdx),$acc2
 812         sbb     8*3(%rdx),$acc3
 813
 814         sbb     %rax,%rax               # cf -> mask
 815         and     \$38,%rax
 816
 817         sub     %rax,$acc0
 818         sbb     \$0,$acc1
 819         sbb     \$0,$acc2
 820         mov     $acc1,8*1(%rdi)
 821         sbb     \$0,$acc3
 822         mov     $acc2,8*2(%rdi)
 823         sbb     %rax,%rax               # cf -> mask
 824         mov     $acc3,8*3(%rdi)
 825         and     \$38,%rax
 826
 827         sub     %rax,$acc0
 828         mov     $acc0,8*0(%rdi)
 829
 830 .Lfe64_sub_epilogue:
 831         ret
 832 .cfi_endproc
 833 .size   x25519_fe64_sub,.-x25519_fe64_sub
 834
 835 .globl  x25519_fe64_tobytes
 836 .type   x25519_fe64_tobytes,\@function,2
 837 .align  32
     # x25519_fe64_tobytes: final reduction modulo 2^255-19.
     # Arguments: %rdi = output (four 64-bit words), %rsi = input limbs.
     # Step 1 clears bit 255 of the input and adds 19 for it if it was set,
     # plus a further 19 unconditionally.  Step 2 looks at bit 255 of that
     # sum: if it is clear, the unconditional 19 is subtracted again; in
     # either case bit 255 is cleared before the result is stored.
 838 x25519_fe64_tobytes:
 839 .Lfe64_to_body:
 840 .cfi_startproc
 841         mov     8*0(%rsi),$acc0
 842         mov     8*1(%rsi),$acc1
 843         mov     8*2(%rsi),$acc2
 844         mov     8*3(%rsi),$acc3
 845
 846         ################################# reduction modulo 2^255-19
 847         lea     ($acc3,$acc3),%rax
 848         sar     \$63,$acc3              # most significant bit -> mask
 849         shr     \$1,%rax                # most significant bit cleared
 850         and     \$19,$acc3
 851         add     \$19,$acc3              # compare to modulus in the same go
 852
 853         add     $acc3,$acc0             # add 19 or 38
 854         adc     \$0,$acc1
 855         adc     \$0,$acc2
 856         adc     \$0,%rax
 857
 858         lea     (%rax,%rax),$acc3
 859         sar     \$63,%rax               # most significant bit -> mask
 860         shr     \$1,$acc3               # most significant bit cleared
 861         not     %rax
 862         and     \$19,%rax               # 19 if bit 255 was clear, else 0
 863
 864         sub     %rax,$acc0
 865         sbb     \$0,$acc1
 866         sbb     \$0,$acc2
 867         sbb     \$0,$acc3
 868
 869         mov     $acc0,8*0(%rdi)
 870         mov     $acc1,8*1(%rdi)
 871         mov     $acc2,8*2(%rdi)
 872         mov     $acc3,8*3(%rdi)
 873
 874 .Lfe64_to_epilogue:
 875         ret
 876 .cfi_endproc
 877 .size   x25519_fe64_tobytes,.-x25519_fe64_tobytes
 878 ___
 879 } else {
 880 $code.=<<___;
     # x25519_fe64_eligible (fallback): clears %eax and returns, so the
     # function result is always 0.
     # NOTE(review): presumably 0 tells the caller not to use the fe64
     # routines, which in this branch are only trapping stubs -- confirm
     # against the caller and the condition of the enclosing if.
 881 .globl  x25519_fe64_eligible
 882 .type   x25519_fe64_eligible,\@abi-omnipotent
 883 .align  32
 884 x25519_fe64_eligible:
 885 .cfi_startproc
 886         xor     %eax,%eax               # return value 0
 887         ret
 888 .cfi_endproc
 889 .size   x25519_fe64_eligible,.-x25519_fe64_eligible
 890
     # Fallback stubs: the six fe64 entry points below are labels on one
     # and the same address, which holds a ud2 instruction, so calling
     # any of them raises an invalid-opcode exception instead of computing
     # a result.  Only x25519_fe64_mul is given .type and .size.
 891 .globl  x25519_fe64_mul
 892 .type   x25519_fe64_mul,\@abi-omnipotent
 893 .globl  x25519_fe64_sqr
 894 .globl  x25519_fe64_mul121666
 895 .globl  x25519_fe64_add
 896 .globl  x25519_fe64_sub
 897 .globl  x25519_fe64_tobytes
 898 x25519_fe64_mul:
 899 x25519_fe64_sqr:
 900 x25519_fe64_mul121666:
 901 x25519_fe64_add:
 902 x25519_fe64_sub:
 903 x25519_fe64_tobytes:
 904 .cfi_startproc
 905         .byte   0x0f,0x0b       # ud2
 906         ret                     # placed after ud2, which traps first
 907 .cfi_endproc
 908 .size   x25519_fe64_mul,.-x25519_fe64_mul
 909 ___
 910 }
 911 $code.=<<___;       # NUL-terminated identification string, appended whichever branch above ran
 912 .asciz  "X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 913 ___
 914
 915 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
 916 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
     #
     # Win64-only structured exception handling support.  The variables set
     # below name the registers that carry the four handler arguments, in
     # the order of the prototype above.  The handler code in this section
     # dereferences $context and $disp; it does not reference $rec or $frame.
 917 if ($win64) {
 918 $rec="%rcx";                # 1st argument, EXCEPTION_RECORD *rec
 919 $frame="%rdx";              # 2nd argument, ULONG64 frame
 920 $context="%r8";             # 3rd argument, CONTEXT *context
 921 $disp="%r9";                # 4th argument, DISPATCHER_CONTEXT *disp
 922
 923 $code.=<<___;
 924 .extern __imp_RtlVirtualUnwind
 925
     # short_handler: exception handler that consults only HandlerData[0].
     # It pushes rsi, rdi, rbx, rbp, r12-r15 and the flags, reserves 64
     # bytes of stack, then picks the value that the shared tail stores as
     # context->Rsp: context->Rax while context->Rip is still below the
     # end-of-prologue label, context->Rsp otherwise.  It does not return
     # by itself; both paths jump to .Lcommon_seh_tail inside full_handler,
     # which releases this frame and returns.
     # NOTE(review): using context->Rax inside the prologue presumably
     # relies on the function prologue copying the entry stack pointer
     # into rax -- confirm against the prologue emitted by the translator.
 926 .type   short_handler,\@abi-omnipotent
 927 .align  16
 928 short_handler:
 929         push    %rsi
 930         push    %rdi
 931         push    %rbx
 932         push    %rbp
 933         push    %r12
 934         push    %r13
 935         push    %r14
 936         push    %r15
 937         pushfq
 938         sub     \$64,%rsp               # room for the RtlVirtualUnwind call in the tail
 939
 940         mov     120($context),%rax      # pull context->Rax
 941         mov     248($context),%rbx      # pull context->Rip
 942
 943         mov     8($disp),%rsi           # disp->ImageBase
 944         mov     56($disp),%r11          # disp->HandlerData
 945
 946         mov     0(%r11),%r10d           # HandlerData[0]
 947         lea     (%rsi,%r10),%r10        # end of prologue label
 948         cmp     %r10,%rbx               # context->Rip<end of prologue label
 949         jb      .Lcommon_seh_tail       # still in prologue: keep context->Rax
 950
 951         mov     152($context),%rax      # pull context->Rsp
 952         jmp     .Lcommon_seh_tail       # tail is part of full_handler
 953 .size   short_handler,.-short_handler
 954
     # full_handler: exception handler for routines that keep saved
     # registers on the stack.  HandlerData[] supplies three 32-bit values:
     # [0] and [1] are image-relative addresses of the end-of-prologue and
     # epilogue labels, [2] is a byte offset that is added to context->Rsp.
     #  - Rip before the end of the prologue: context->Rax is handed to the
     #    tail and no registers are reloaded.
     #  - Rip at or past the epilogue label: context->Rsp is handed to the
     #    tail as is.
     #  - otherwise: Rsp + HandlerData[2] is formed and rbp, rbx, r12-r15
     #    are copied from the six slots just below it into the CONTEXT.
     # The shared tail (.Lcommon_seh_tail, also entered from short_handler)
     # stores Rsp, Rsi and Rdi into the CONTEXT, copies the CONTEXT to
     # disp->ContextRecord, calls RtlVirtualUnwind and returns
     # ExceptionContinueSearch.
 955 .type   full_handler,\@abi-omnipotent
 956 .align  16
 957 full_handler:
 958         push    %rsi
 959         push    %rdi
 960         push    %rbx
 961         push    %rbp
 962         push    %r12
 963         push    %r13
 964         push    %r14
 965         push    %r15
 966         pushfq
 967         sub     \$64,%rsp               # room for the RtlVirtualUnwind call in the tail
 968
 969         mov     120($context),%rax      # pull context->Rax
 970         mov     248($context),%rbx      # pull context->Rip
 971
 972         mov     8($disp),%rsi           # disp->ImageBase
 973         mov     56($disp),%r11          # disp->HandlerData
 974
 975         mov     0(%r11),%r10d           # HandlerData[0]
 976         lea     (%rsi,%r10),%r10        # end of prologue label
 977         cmp     %r10,%rbx               # context->Rip<end of prologue label
 978         jb      .Lcommon_seh_tail
 979
 980         mov     152($context),%rax      # pull context->Rsp
 981
 982         mov     4(%r11),%r10d           # HandlerData[1]
 983         lea     (%rsi,%r10),%r10        # epilogue label
 984         cmp     %r10,%rbx               # context->Rip>=epilogue label
 985         jae     .Lcommon_seh_tail
 986
 987         mov     8(%r11),%r10d           # HandlerData[2]
 988         lea     (%rax,%r10),%rax        # Rsp + offset, just above the saved registers
 989
 990         mov     -8(%rax),%rbp
 991         mov     -16(%rax),%rbx
 992         mov     -24(%rax),%r12
 993         mov     -32(%rax),%r13
 994         mov     -40(%rax),%r14
 995         mov     -48(%rax),%r15
 996         mov     %rbx,144($context)      # restore context->Rbx
 997         mov     %rbp,160($context)      # restore context->Rbp
 998         mov     %r12,216($context)      # restore context->R12
 999         mov     %r13,224($context)      # restore context->R13
1000         mov     %r14,232($context)      # restore context->R14
1001         mov     %r15,240($context)      # restore context->R15
1002
1003 .Lcommon_seh_tail:
             # On entry rax holds the value to record as context->Rsp.
             # NOTE(review): 8(%rax) and 16(%rax) are presumably the slots in
             # which the function prologue parked rdi and rsi -- confirm
             # against the prologue emitted by the translator.
1004         mov     8(%rax),%rdi
1005         mov     16(%rax),%rsi
1006         mov     %rax,152($context)      # restore context->Rsp
1007         mov     %rsi,168($context)      # restore context->Rsi
1008         mov     %rdi,176($context)      # restore context->Rdi
1009
1010         mov     40($disp),%rdi          # disp->ContextRecord
1011         mov     $context,%rsi           # context
1012         mov     \$154,%ecx              # sizeof(CONTEXT) in quadwords, count for rep movsq
1013         .long   0xa548f3fc              # cld; rep movsq
1014
1015         mov     $disp,%rsi
1016         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1017         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1018         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1019         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1020         mov     40(%rsi),%r10           # disp->ContextRecord
1021         lea     56(%rsi),%r11           # &disp->HandlerData
1022         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1023         mov     %r10,32(%rsp)           # arg5
1024         mov     %r11,40(%rsp)           # arg6
1025         mov     %r12,48(%rsp)           # arg7
1026         mov     %rcx,56(%rsp)           # arg8, (NULL)
1027         call    *__imp_RtlVirtualUnwind(%rip)
1028
1029         mov     \$1,%eax                # ExceptionContinueSearch
1030         add     \$64,%rsp               # release the scratch area
1031         popfq                           # pops mirror the pushes at handler entry
1032         pop     %r15
1033         pop     %r14
1034         pop     %r13
1035         pop     %r12
1036         pop     %rbp
1037         pop     %rbx
1038         pop     %rdi
1039         pop     %rsi
1040         ret
1041 .size   full_handler,.-full_handler
1042
     # .pdata: one entry of three image-relative addresses per function --
     # its begin label, its end label and the label of its unwind info,
     # which is defined in the .xdata section.
     # NOTE(review): the .LSEH_begin_ and .LSEH_end_ labels are not defined
     # in this part of the file; presumably the translator or the function
     # definitions earlier in the file provide them -- confirm.
1043 .section        .pdata
1044 .align  4
1045         .rva    .LSEH_begin_x25519_fe51_mul
1046         .rva    .LSEH_end_x25519_fe51_mul
1047         .rva    .LSEH_info_x25519_fe51_mul
1048
1049         .rva    .LSEH_begin_x25519_fe51_sqr
1050         .rva    .LSEH_end_x25519_fe51_sqr
1051         .rva    .LSEH_info_x25519_fe51_sqr
1052
1053         .rva    .LSEH_begin_x25519_fe51_mul121666
1054         .rva    .LSEH_end_x25519_fe51_mul121666
1055         .rva    .LSEH_info_x25519_fe51_mul121666
1056 ___
     # The fe64 entries are appended only when $addx is true.
1057 $code.=<<___    if ($addx);
1058         .rva    .LSEH_begin_x25519_fe64_mul
1059         .rva    .LSEH_end_x25519_fe64_mul
1060         .rva    .LSEH_info_x25519_fe64_mul
1061
1062         .rva    .LSEH_begin_x25519_fe64_sqr
1063         .rva    .LSEH_end_x25519_fe64_sqr
1064         .rva    .LSEH_info_x25519_fe64_sqr
1065
1066         .rva    .LSEH_begin_x25519_fe64_mul121666
1067         .rva    .LSEH_end_x25519_fe64_mul121666
1068         .rva    .LSEH_info_x25519_fe64_mul121666
1069
1070         .rva    .LSEH_begin_x25519_fe64_add
1071         .rva    .LSEH_end_x25519_fe64_add
1072         .rva    .LSEH_info_x25519_fe64_add
1073
1074         .rva    .LSEH_begin_x25519_fe64_sub
1075         .rva    .LSEH_end_x25519_fe64_sub
1076         .rva    .LSEH_info_x25519_fe64_sub
1077
1078         .rva    .LSEH_begin_x25519_fe64_tobytes
1079         .rva    .LSEH_end_x25519_fe64_tobytes
1080         .rva    .LSEH_info_x25519_fe64_tobytes
1081 ___
1082 $code.=<<___;
     # .xdata: the unwind info blocks that .pdata points at.  Each block is
     # a 4-byte header, the image-relative address of the handler, then the
     # HandlerData[] words that the handler reads.
     # NOTE(review): header byte 9 presumably encodes UNWIND_INFO version 1
     # with the exception-handler flag and no unwind codes -- confirm
     # against the Windows x64 unwind data documentation.
1083 .section        .xdata
1084 .align  8
1085 .LSEH_info_x25519_fe51_mul:
1086         .byte   9,0,0,0
1087         .rva    full_handler
1088         .rva    .Lfe51_mul_body,.Lfe51_mul_epilogue     # HandlerData[]
1089         .long   88,0                    # HandlerData[2], offset added to Rsp by full_handler; the 0 is not read by it
1090 .LSEH_info_x25519_fe51_sqr:
1091         .byte   9,0,0,0
1092         .rva    full_handler
1093         .rva    .Lfe51_sqr_body,.Lfe51_sqr_epilogue     # HandlerData[]
1094         .long   88,0
1095 .LSEH_info_x25519_fe51_mul121666:
1096         .byte   9,0,0,0
1097         .rva    full_handler
1098         .rva    .Lfe51_mul121666_body,.Lfe51_mul121666_epilogue # HandlerData[]
1099         .long   88,0
1100 ___
     # The fe64 unwind info is appended only when $addx is true.
1101 $code.=<<___    if ($addx);
1102 .LSEH_info_x25519_fe64_mul:
1103         .byte   9,0,0,0
1104         .rva    full_handler
1105         .rva    .Lfe64_mul_body,.Lfe64_mul_epilogue     # HandlerData[]
1106         .long   72,0                    # HandlerData[2], offset added to Rsp by full_handler
1107 .LSEH_info_x25519_fe64_sqr:
1108         .byte   9,0,0,0
1109         .rva    full_handler
1110         .rva    .Lfe64_sqr_body,.Lfe64_sqr_epilogue     # HandlerData[]
1111         .long   72,0
1112 .LSEH_info_x25519_fe64_mul121666:
1113         .byte   9,0,0,0
1114         .rva    short_handler           # reads HandlerData[0] only, so no offset word follows
1115         .rva    .Lfe64_mul121666_body,.Lfe64_mul121666_epilogue # HandlerData[]
1116 .LSEH_info_x25519_fe64_add:
1117         .byte   9,0,0,0
1118         .rva    short_handler
1119         .rva    .Lfe64_add_body,.Lfe64_add_epilogue     # HandlerData[]
1120 .LSEH_info_x25519_fe64_sub:
1121         .byte   9,0,0,0
1122         .rva    short_handler
1123         .rva    .Lfe64_sub_body,.Lfe64_sub_epilogue     # HandlerData[]
1124 .LSEH_info_x25519_fe64_tobytes:
1125         .byte   9,0,0,0
1126         .rva    short_handler
1127         .rva    .Lfe64_to_body,.Lfe64_to_epilogue       # HandlerData[]
1128 ___
1129 }
1130
1131 $code =~ s{\`([^\`]*)\`}{eval($1)}gem;    # replace each back-quoted expression by its evaluated value
1132 print($code);                             # emit the finished text
1133 die "error closing STDOUT: $!" unless close(STDOUT);      # write errors can surface only at close