crypto/ec/asm/x25519-x86_64.pl

   1 #!/usr/bin/env perl
   2 # Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 # ====================================================================
  10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11 # project. The module is, however, dual licensed under OpenSSL and
  12 # CRYPTOGAMS licenses depending on where you obtain it. For further
  13 # details see http://www.openssl.org/~appro/cryptogams/.
  14 # ====================================================================
  15 #
  16 # X25519 lower-level primitives for x86_64.
  17 #
  18 # February 2018.
  19 #
  20 # This module implements radix 2^51 multiplication and squaring, and
  21 # radix 2^64 multiplication, squaring, addition, subtraction and final
  22 # reduction. Latter radix is used on ADCX/ADOX-capable processors such
  23 # as Broadwell. On related note one should mention that there are
  24 # vector implementations that provide significantly better performance
  25 # on some processors(*), but they are large and overly complex. Which
  26 # in combination with them being effectively processor-specific makes
  27 # the undertaking hard to justify. The goal for this implementation
  28 # is rather versatility and simplicity [and ultimately formal
  29 # verification].
  30 #
  31 # (*)   For example sandy2x should provide ~30% improvement on Sandy
  32 #       Bridge, but only nominal ~5% on Haswell [and big loss on
  33 #       Broadwell and successors].
  34 #
  35 ######################################################################
  36 # Improvement coefficients:
  37 #
  38 #                       amd64-51(*)     gcc-5.x(**)
  39 #
  40 # P4                    +22%            +40%
  41 # Sandy Bridge          -3%             +11%
  42 # Haswell               -1%             +13%
  43 # Broadwell(***)        +30%            +35%
  44 # Skylake(***)          +33%            +47%
  45 # Silvermont            +20%            +26%
  46 # Goldmont              +40%            +50%
  47 # Bulldozer             +20%            +9%
  48 # Ryzen(***)            +43%            +40%
  49 # VIA                   +170%           +120%
  50 #
  51 # (*)   amd64-51 is popular assembly implementation with 2^51 radix,
  52 #       only multiplication and squaring subroutines were linked
  53 #       for comparison, but not complete ladder step; gain on most
  54 #       processors is because this module refrains from shld, and
  55 #       minor regression on others is because this does result in
  56 #       higher instruction count;
  57 # (**)  compiler is free to inline functions, in assembly one would
  58 #       need to implement ladder step to do that, and it will improve
  59 #       performance by several percent;
  60 # (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
  61 #       C implementation, so that comparison is always against
  62 #       2^51 radix;
  63
  64 # $output is the last argument if it looks like a file (it has an extension)
  65 # $flavour is the first argument if it doesn't look like a file
  66 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  67 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  68
  69 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);      # Win64 target: nasm/masm/mingw64 flavour or .asm output
  70
  71 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;   # directory part of this script's own path
  72 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or         # translator next to this script ...
  73 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or    # ... or under ../../perlasm
  74 die "can't locate x86_64-xlate.pl";
  75
  76 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""        # pipe into the translator, run by the current perl
  77     or die "can't call $xlate: $!";
  78 *STDOUT=*OUT;                           # STDOUT now refers to that pipe
  79
  80 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`   # probe the assembler behind $ENV{CC}
  81                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  82         $addx = ($1>=2.23);             # GNU as 2.23 or newer: emit the base 2^64 (ADCX/ADOX) code
  83 }
  84
  85 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  86             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  87         $addx = ($1>=2.10);             # NASM 2.10 or newer
  88 }
  89
  90 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  91             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  92         $addx = ($1>=12);               # ml64 major version 12 or newer
  93 }
  94
  95 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
  96         my $ver = $2 + $3/100.0;        # 3.1->3.01, 3.10->3.10
  97         $addx = ($ver>=3.03);           # clang/LLVM 3.3 or newer (3.3 is encoded as 3.03 above)
  98 }
  99
 100 $code.=<<___;
 101 .text
 102
 103 .globl  x25519_fe51_mul                 # h = f*g mod 2^255-19 on five 51-bit limbs
 104 .type   x25519_fe51_mul,\@function,3
 105 .align  32
 106 x25519_fe51_mul:                        # %rdi = h (out), %rsi = f, %rdx = g
 107 .cfi_startproc
 108         push    %rbp
 109 .cfi_push       %rbp
 110         push    %rbx
 111 .cfi_push       %rbx
 112         push    %r12
 113 .cfi_push       %r12
 114         push    %r13
 115 .cfi_push       %r13
 116         push    %r14
 117 .cfi_push       %r14
 118         push    %r15
 119 .cfi_push       %r15
 120         lea     -8*5(%rsp),%rsp         # 40-byte scratch: g[0-2] at 0..16, h pointer at 32
 121 .cfi_adjust_cfa_offset  40
 122 .Lfe51_mul_body:
 123
 124         mov     8*0(%rsi),%rax          # f[0]
 125         mov     8*0(%rdx),%r11          # load g[0-4]
 126         mov     8*1(%rdx),%r12
 127         mov     8*2(%rdx),%r13
 128         mov     8*3(%rdx),%rbp
 129         mov     8*4(%rdx),%r14
 130
 131         mov     %rdi,8*4(%rsp)          # offload 1st argument
 132         mov     %rax,%rdi               # keep f[0], mulq overwrites %rax
 133         mulq    %r11                    # f[0]*g[0]
 134         mov     %r11,8*0(%rsp)          # offload g[0]
 135         mov     %rax,%rbx               # %rbx:%rcx = h0
 136         mov     %rdi,%rax
 137         mov     %rdx,%rcx
 138         mulq    %r12                    # f[0]*g[1]
 139         mov     %r12,8*1(%rsp)          # offload g[1]
 140         mov     %rax,%r8                # %r8:%r9 = h1
 141         mov     %rdi,%rax
 142         lea     (%r14,%r14,8),%r15      # g[4]*9
 143         mov     %rdx,%r9
 144         mulq    %r13                    # f[0]*g[2]
 145         mov     %r13,8*2(%rsp)          # offload g[2]
 146         mov     %rax,%r10               # %r10:%r11 = h2
 147         mov     %rdi,%rax
 148         lea     (%r14,%r15,2),%rdi      # g[4]*19
 149         mov     %rdx,%r11
 150         mulq    %rbp                    # f[0]*g[3]
 151         mov     %rax,%r12               # %r12:%r13 = h3
 152         mov     8*0(%rsi),%rax          # f[0]
 153         mov     %rdx,%r13
 154         mulq    %r14                    # f[0]*g[4]
 155         mov     %rax,%r14               # %r14:%r15 = h4
 156         mov     8*1(%rsi),%rax          # f[1]
 157         mov     %rdx,%r15
 158
 159         mulq    %rdi                    # f[1]*g[4]*19
 160         add     %rax,%rbx
 161         mov     8*2(%rsi),%rax          # f[2]
 162         adc     %rdx,%rcx
 163         mulq    %rdi                    # f[2]*g[4]*19
 164         add     %rax,%r8
 165         mov     8*3(%rsi),%rax          # f[3]
 166         adc     %rdx,%r9
 167         mulq    %rdi                    # f[3]*g[4]*19
 168         add     %rax,%r10
 169         mov     8*4(%rsi),%rax          # f[4]
 170         adc     %rdx,%r11
 171         mulq    %rdi                    # f[4]*g[4]*19
 172         imulq   \$19,%rbp,%rdi          # g[3]*19
 173         add     %rax,%r12
 174         mov     8*1(%rsi),%rax          # f[1]
 175         adc     %rdx,%r13
 176         mulq    %rbp                    # f[1]*g[3]
 177         mov     8*2(%rsp),%rbp          # g[2]
 178         add     %rax,%r14
 179         mov     8*2(%rsi),%rax          # f[2]
 180         adc     %rdx,%r15
 181
 182         mulq    %rdi                    # f[2]*g[3]*19
 183         add     %rax,%rbx
 184         mov     8*3(%rsi),%rax          # f[3]
 185         adc     %rdx,%rcx
 186         mulq    %rdi                    # f[3]*g[3]*19
 187         add     %rax,%r8
 188         mov     8*4(%rsi),%rax          # f[4]
 189         adc     %rdx,%r9
 190         mulq    %rdi                    # f[4]*g[3]*19
 191         imulq   \$19,%rbp,%rdi          # g[2]*19
 192         add     %rax,%r10
 193         mov     8*1(%rsi),%rax          # f[1]
 194         adc     %rdx,%r11
 195         mulq    %rbp                    # f[1]*g[2]
 196         add     %rax,%r12
 197         mov     8*2(%rsi),%rax          # f[2]
 198         adc     %rdx,%r13
 199         mulq    %rbp                    # f[2]*g[2]
 200         mov     8*1(%rsp),%rbp          # g[1]
 201         add     %rax,%r14
 202         mov     8*3(%rsi),%rax          # f[3]
 203         adc     %rdx,%r15
 204
 205         mulq    %rdi                    # f[3]*g[2]*19
 206         add     %rax,%rbx
 207         mov     8*4(%rsi),%rax          # f[4]
 208         adc     %rdx,%rcx
 209         mulq    %rdi                    # f[4]*g[2]*19
 210         add     %rax,%r8
 211         mov     8*1(%rsi),%rax          # f[1]
 212         adc     %rdx,%r9
 213         mulq    %rbp                    # f[1]*g[1]
 214         imulq   \$19,%rbp,%rdi          # g[1]*19
 215         add     %rax,%r10
 216         mov     8*2(%rsi),%rax          # f[2]
 217         adc     %rdx,%r11
 218         mulq    %rbp                    # f[2]*g[1]
 219         add     %rax,%r12
 220         mov     8*3(%rsi),%rax          # f[3]
 221         adc     %rdx,%r13
 222         mulq    %rbp                    # f[3]*g[1]
 223         mov     8*0(%rsp),%rbp          # g[0]
 224         add     %rax,%r14
 225         mov     8*4(%rsi),%rax          # f[4]
 226         adc     %rdx,%r15
 227
 228         mulq    %rdi                    # f[4]*g[1]*19
 229         add     %rax,%rbx
 230         mov     8*1(%rsi),%rax          # f[1]
 231         adc     %rdx,%rcx
 232         mul     %rbp                    # f[1]*g[0]
 233         add     %rax,%r8
 234         mov     8*2(%rsi),%rax          # f[2]
 235         adc     %rdx,%r9
 236         mul     %rbp                    # f[2]*g[0]
 237         add     %rax,%r10
 238         mov     8*3(%rsi),%rax          # f[3]
 239         adc     %rdx,%r11
 240         mul     %rbp                    # f[3]*g[0]
 241         add     %rax,%r12
 242         mov     8*4(%rsi),%rax          # f[4]
 243         adc     %rdx,%r13
 244         mulq    %rbp                    # f[4]*g[0]
 245         add     %rax,%r14
 246         adc     %rdx,%r15
 247
 248         mov     8*4(%rsp),%rdi          # restore 1st argument
 249         jmp     .Lreduce51              # shared carry/store/epilogue tail inside x25519_fe51_sqr
 250 .Lfe51_mul_epilogue:
 251 .cfi_endproc
 252 .size   x25519_fe51_mul,.-x25519_fe51_mul
 253
 254 .globl  x25519_fe51_sqr                 # h = g*g mod 2^255-19 on five 51-bit limbs
 255 .type   x25519_fe51_sqr,\@function,2
 256 .align  32
 257 x25519_fe51_sqr:                        # %rdi = h (out), %rsi = g
 258 .cfi_startproc
 259         push    %rbp
 260 .cfi_push       %rbp
 261         push    %rbx
 262 .cfi_push       %rbx
 263         push    %r12
 264 .cfi_push       %r12
 265         push    %r13
 266 .cfi_push       %r13
 267         push    %r14
 268 .cfi_push       %r14
 269         push    %r15
 270 .cfi_push       %r15
 271         lea     -8*5(%rsp),%rsp         # 40-byte scratch: g[2] at 0, h pointer at 32
 272 .cfi_adjust_cfa_offset  40
 273 .Lfe51_sqr_body:
 274
 275         mov     8*0(%rsi),%rax          # g[0]
 276         mov     8*2(%rsi),%r15          # g[2]
 277         mov     8*4(%rsi),%rbp          # g[4]
 278
 279         mov     %rdi,8*4(%rsp)          # offload 1st argument
 280         lea     (%rax,%rax),%r14        # 2*g[0]
 281         mulq    %rax                    # g[0]*g[0]
 282         mov     %rax,%rbx               # %rbx:%rcx = h0
 283         mov     8*1(%rsi),%rax          # g[1]
 284         mov     %rdx,%rcx
 285         mulq    %r14                    # 2*g[0]*g[1]
 286         mov     %rax,%r8                # %r8:%r9 = h1
 287         mov     %r15,%rax
 288         mov     %r15,8*0(%rsp)          # offload g[2]
 289         mov     %rdx,%r9
 290         mulq    %r14                    # 2*g[0]*g[2]
 291         mov     %rax,%r10               # %r10:%r11 = h2
 292         mov     8*3(%rsi),%rax          # g[3]
 293         mov     %rdx,%r11
 294         imulq   \$19,%rbp,%rdi          # g[4]*19
 295         mulq    %r14                    # 2*g[0]*g[3]
 296         mov     %rax,%r12               # %r12:%r13 = h3
 297         mov     %rbp,%rax
 298         mov     %rdx,%r13
 299         mulq    %r14                    # 2*g[0]*g[4]
 300         mov     %rax,%r14               # %r14:%r15 = h4
 301         mov     %rbp,%rax
 302         mov     %rdx,%r15
 303
 304         mulq    %rdi                    # g[4]*g[4]*19
 305         add     %rax,%r12
 306         mov     8*1(%rsi),%rax          # g[1]
 307         adc     %rdx,%r13
 308
 309         mov     8*3(%rsi),%rsi          # g[3]
 310         lea     (%rax,%rax),%rbp        # 2*g[1]
 311         mulq    %rax                    # g[1]*g[1]
 312         add     %rax,%r10
 313         mov     8*0(%rsp),%rax          # g[2]
 314         adc     %rdx,%r11
 315         mulq    %rbp                    # 2*g[1]*g[2]
 316         add     %rax,%r12
 317         mov     %rbp,%rax
 318         adc     %rdx,%r13
 319         mulq    %rsi                    # 2*g[1]*g[3]
 320         add     %rax,%r14
 321         mov     %rbp,%rax
 322         adc     %rdx,%r15
 323         imulq   \$19,%rsi,%rbp          # g[3]*19
 324         mulq    %rdi                    # 2*g[1]*g[4]*19
 325         add     %rax,%rbx
 326         lea     (%rsi,%rsi),%rax        # 2*g[3]
 327         adc     %rdx,%rcx
 328
 329         mulq    %rdi                    # 2*g[3]*g[4]*19
 330         add     %rax,%r10
 331         mov     %rsi,%rax
 332         adc     %rdx,%r11
 333         mulq    %rbp                    # g[3]*g[3]*19
 334         add     %rax,%r8
 335         mov     8*0(%rsp),%rax          # g[2]
 336         adc     %rdx,%r9
 337
 338         lea     (%rax,%rax),%rsi        # 2*g[2]
 339         mulq    %rax                    # g[2]*g[2]
 340         add     %rax,%r14
 341         mov     %rbp,%rax
 342         adc     %rdx,%r15
 343         mulq    %rsi                    # 2*g[2]*g[3]*19
 344         add     %rax,%rbx
 345         mov     %rsi,%rax
 346         adc     %rdx,%rcx
 347         mulq    %rdi                    # 2*g[2]*g[4]*19
 348         add     %rax,%r8
 349         adc     %rdx,%r9
 350
 351         mov     8*4(%rsp),%rdi          # restore 1st argument
 352         jmp     .Lreduce51
 353
 354 .align  32
 355 .Lreduce51:                             # shared tail, also entered from x25519_fe51_mul and x25519_fe51_mul121666; in: h0-h4 pairs, %rdi = h
 356         mov     \$0x7ffffffffffff,%rbp  # mask = 2^51-1
 357
 358         mov     %r10,%rdx
 359         shr     \$51,%r10
 360         shl     \$13,%r11
 361         and     %rbp,%rdx               # %rdx = g2 = h2 & mask
 362         or      %r10,%r11               # h2>>51
 363         add     %r11,%r12
 364         adc     \$0,%r13                # h3 += h2>>51
 365
 366         mov     %rbx,%rax
 367         shr     \$51,%rbx
 368         shl     \$13,%rcx
 369         and     %rbp,%rax               # %rax = g0 = h0 & mask
 370         or      %rbx,%rcx               # h0>>51
 371         add     %rcx,%r8                # h1 += h0>>51
 372         adc     \$0,%r9
 373
 374         mov     %r12,%rbx
 375         shr     \$51,%r12
 376         shl     \$13,%r13
 377         and     %rbp,%rbx               # %rbx = g3 = h3 & mask
 378         or      %r12,%r13               # h3>>51
 379         add     %r13,%r14               # h4 += h3>>51
 380         adc     \$0,%r15
 381
 382         mov     %r8,%rcx
 383         shr     \$51,%r8
 384         shl     \$13,%r9
 385         and     %rbp,%rcx               # %rcx = g1 = h1 & mask
 386         or      %r8,%r9                 # h1>>51
 387         add     %r9,%rdx                # g2 += h1>>51
 388
 389         mov     %r14,%r10
 390         shr     \$51,%r14
 391         shl     \$13,%r15
 392         and     %rbp,%r10               # %r10 = g4 = h4 & mask
 393         or      %r14,%r15               # h4>>51
 394
 395         lea     (%r15,%r15,8),%r14      # (h4>>51)*9
 396         lea     (%r15,%r14,2),%r15      # (h4>>51)*19
 397         add     %r15,%rax               # g0 += (h4>>51)*19
 398
 399         mov     %rdx,%r8
 400         and     %rbp,%rdx               # g2 &= mask
 401         shr     \$51,%r8
 402         add     %r8,%rbx                # g3 += g2>>51
 403
 404         mov     %rax,%r9
 405         and     %rbp,%rax               # g0 &= mask
 406         shr     \$51,%r9
 407         add     %r9,%rcx                # g1 += g0>>51
 408
 409         mov     %rax,8*0(%rdi)          # save the result
 410         mov     %rcx,8*1(%rdi)
 411         mov     %rdx,8*2(%rdi)
 412         mov     %rbx,8*3(%rdi)
 413         mov     %r10,8*4(%rdi)
 414
 415         mov     8*5(%rsp),%r15          # saved registers sit above the 40-byte scratch
 416 .cfi_restore    %r15
 417         mov     8*6(%rsp),%r14
 418 .cfi_restore    %r14
 419         mov     8*7(%rsp),%r13
 420 .cfi_restore    %r13
 421         mov     8*8(%rsp),%r12
 422 .cfi_restore    %r12
 423         mov     8*9(%rsp),%rbx
 424 .cfi_restore    %rbx
 425         mov     8*10(%rsp),%rbp
 426 .cfi_restore    %rbp
 427         lea     8*11(%rsp),%rsp         # release 40-byte scratch + 6 saved registers
 428 .cfi_adjust_cfa_offset  -88             # stack shrank by 88 bytes, so the CFA offset decreases
 429 .Lfe51_sqr_epilogue:
 430         ret
 431 .cfi_endproc
 432 .size   x25519_fe51_sqr,.-x25519_fe51_sqr
 433
 434 .globl  x25519_fe51_mul121666           # h = f*121666 mod 2^255-19 on five 51-bit limbs
 435 .type   x25519_fe51_mul121666,\@function,2
 436 .align  32
 437 x25519_fe51_mul121666:                  # %rdi = h (out), %rsi = f
 438 .cfi_startproc
 439         push    %rbp
 440 .cfi_push       %rbp
 441         push    %rbx
 442 .cfi_push       %rbx
 443         push    %r12
 444 .cfi_push       %r12
 445         push    %r13
 446 .cfi_push       %r13
 447         push    %r14
 448 .cfi_push       %r14
 449         push    %r15
 450 .cfi_push       %r15
 451         lea     -8*5(%rsp),%rsp         # same frame shape as x25519_fe51_sqr, whose .Lreduce51 tail unwinds it
 452 .cfi_adjust_cfa_offset  40
 453 .Lfe51_mul121666_body:
 454         mov     \$121666,%eax           # 32-bit move zero-extends into %rax
 455
 456         mulq    8*0(%rsi)               # 121666*f[0]
 457         mov     %rax,%rbx               # %rbx:%rcx = h0
 458         mov     \$121666,%eax
 459         mov     %rdx,%rcx
 460         mulq    8*1(%rsi)               # 121666*f[1]
 461         mov     %rax,%r8                # %r8:%r9 = h1
 462         mov     \$121666,%eax
 463         mov     %rdx,%r9
 464         mulq    8*2(%rsi)               # 121666*f[2]
 465         mov     %rax,%r10               # %r10:%r11 = h2
 466         mov     \$121666,%eax
 467         mov     %rdx,%r11
 468         mulq    8*3(%rsi)               # 121666*f[3]
 469         mov     %rax,%r12               # %r12:%r13 = h3
 470         mov     \$121666,%eax           # reload the multiplier, mulq overwrote %rax
 471         mov     %rdx,%r13
 472         mulq    8*4(%rsi)               # 121666*f[4]
 473         mov     %rax,%r14               # %r14:%r15 = h4
 474         mov     %rdx,%r15
 475
 476         jmp     .Lreduce51              # %rdi was never modified, so it still holds h
 477 .Lfe51_mul121666_epilogue:
 478 .cfi_endproc
 479 .size   x25519_fe51_mul121666,.-x25519_fe51_mul121666
 480 ___
 481 ########################################################################
 482 # Base 2^64 subroutines modulo 2*(2^255-19)
 483 #
 484 if ($addx) {
 485 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));
 486
 487 $code.=<<___;
 488 .extern OPENSSL_ia32cap_P
 489 .globl  x25519_fe64_eligible            # returns non-zero in %eax when the fe64 code path may be used
 490 .type   x25519_fe64_eligible,\@abi-omnipotent
 491 .align  32
 492 x25519_fe64_eligible:
 493         mov     OPENSSL_ia32cap_P+8(%rip),%ecx  # 32-bit word at byte offset 8 of the capability vector
 494         xor     %eax,%eax               # default answer: 0
 495         and     \$0x80100,%ecx          # keep bits 8 and 19; presumably BMI2 and ADX - confirm against OPENSSL_ia32cap docs
 496         cmp     \$0x80100,%ecx          # both bits must be set
 497         cmove   %ecx,%eax               # if so return 0x80100, otherwise keep 0
 498         ret
 499 .size   x25519_fe64_eligible,.-x25519_fe64_eligible
 500
 501 .globl  x25519_fe64_mul                 # r = a*b on four 64-bit limbs, folded by .Lreduce64
 502 .type   x25519_fe64_mul,\@function,3
 503 .align  32
 504 x25519_fe64_mul:                        # %rdi = r (out), %rsi = a, %rdx = b
 505 .cfi_startproc
 506         push    %rbp
 507 .cfi_push       %rbp
 508         push    %rbx
 509 .cfi_push       %rbx
 510         push    %r12
 511 .cfi_push       %r12
 512         push    %r13
 513 .cfi_push       %r13
 514         push    %r14
 515 .cfi_push       %r14
 516         push    %r15
 517 .cfi_push       %r15
 518         push    %rdi                    # offload dst
 519 .cfi_push       %rdi
 520         lea     -8*2(%rsp),%rsp         # 16-byte scratch; b[2] is parked at 0, dst ends up at 16
 521 .cfi_adjust_cfa_offset  16
 522 .Lfe64_mul_body:
 523
 524         mov     %rdx,%rax               # b pointer; %rdx becomes the implicit mulx operand
 525         mov     8*0(%rdx),%rbp          # b[0]
 526         mov     8*0(%rsi),%rdx          # a[0]
 527         mov     8*1(%rax),%rcx          # b[1]
 528         mov     8*2(%rax),$acc6         # b[2]
 529         mov     8*3(%rax),$acc7         # b[3]
 530
 531         mulx    %rbp,$acc0,%rax         # a[0]*b[0]
 532         xor     %edi,%edi               # cf=0,of=0
 533         mulx    %rcx,$acc1,%rbx         # a[0]*b[1]
 534         adcx    %rax,$acc1
 535         mulx    $acc6,$acc2,%rax        # a[0]*b[2]
 536         adcx    %rbx,$acc2
 537         mulx    $acc7,$acc3,$acc4       # a[0]*b[3]
 538          mov    8*1(%rsi),%rdx          # a[1]
 539         adcx    %rax,$acc3
 540         mov     $acc6,(%rsp)            # offload b[2]
 541         adcx    %rdi,$acc4              # cf=0
 542
 543         mulx    %rbp,%rax,%rbx          # a[1]*b[0]
 544         adox    %rax,$acc1
 545         adcx    %rbx,$acc2
 546         mulx    %rcx,%rax,%rbx          # a[1]*b[1]
 547         adox    %rax,$acc2
 548         adcx    %rbx,$acc3
 549         mulx    $acc6,%rax,%rbx         # a[1]*b[2]
 550         adox    %rax,$acc3
 551         adcx    %rbx,$acc4
 552         mulx    $acc7,%rax,$acc5        # a[1]*b[3]
 553          mov    8*2(%rsi),%rdx          # a[2]
 554         adox    %rax,$acc4
 555         adcx    %rdi,$acc5              # cf=0
 556         adox    %rdi,$acc5              # of=0
 557
 558         mulx    %rbp,%rax,%rbx          # a[2]*b[0]
 559         adcx    %rax,$acc2
 560         adox    %rbx,$acc3
 561         mulx    %rcx,%rax,%rbx          # a[2]*b[1]
 562         adcx    %rax,$acc3
 563         adox    %rbx,$acc4
 564         mulx    $acc6,%rax,%rbx         # a[2]*b[2]
 565         adcx    %rax,$acc4
 566         adox    %rbx,$acc5
 567         mulx    $acc7,%rax,$acc6        # a[2]*b[3]
 568          mov    8*3(%rsi),%rdx          # a[3]
 569         adcx    %rax,$acc5
 570         adox    %rdi,$acc6              # of=0
 571         adcx    %rdi,$acc6              # cf=0
 572
 573         mulx    %rbp,%rax,%rbx          # a[3]*b[0]
 574         adox    %rax,$acc3
 575         adcx    %rbx,$acc4
 576         mulx    %rcx,%rax,%rbx          # a[3]*b[1]
 577         adox    %rax,$acc4
 578         adcx    %rbx,$acc5
 579         mulx    (%rsp),%rax,%rbx        # a[3]*b[2]
 580         adox    %rax,$acc5
 581         adcx    %rbx,$acc6
 582         mulx    $acc7,%rax,$acc7        # a[3]*b[3]
 583          mov    \$38,%edx               # multiplier used by .Lreduce64
 584         adox    %rax,$acc6
 585         adcx    %rdi,$acc7              # cf=0
 586         adox    %rdi,$acc7              # of=0
 587
 588         jmp     .Lreduce64              # shared fold/store/epilogue tail inside x25519_fe64_sqr
 589 .Lfe64_mul_epilogue:
 590 .cfi_endproc
 591 .size   x25519_fe64_mul,.-x25519_fe64_mul
 592
 593 .globl  x25519_fe64_sqr                 # r = a*a on four 64-bit limbs, folded by .Lreduce64
 594 .type   x25519_fe64_sqr,\@function,2
 595 .align  32
 596 x25519_fe64_sqr:                        # %rdi = r (out), %rsi = a
 597 .cfi_startproc
 598         push    %rbp
 599 .cfi_push       %rbp
 600         push    %rbx
 601 .cfi_push       %rbx
 602         push    %r12
 603 .cfi_push       %r12
 604         push    %r13
 605 .cfi_push       %r13
 606         push    %r14
 607 .cfi_push       %r14
 608         push    %r15
 609 .cfi_push       %r15
 610         push    %rdi                    # offload dst
 611 .cfi_push       %rdi
 612         lea     -8*2(%rsp),%rsp         # 16-byte scratch, keeps the frame shape x25519_fe64_mul uses
 613 .cfi_adjust_cfa_offset  16
 614 .Lfe64_sqr_body:
 615
 616         mov     8*0(%rsi),%rdx          # a[0]
 617         mov     8*1(%rsi),%rcx          # a[1]
 618         mov     8*2(%rsi),%rbp          # a[2]
 619         mov     8*3(%rsi),%rsi          # a[3]
 620
 621         ################################################################
 622         mulx    %rdx,$acc0,$acc7        # a[0]*a[0]
 623         mulx    %rcx,$acc1,%rax         # a[0]*a[1]
 624         xor     %edi,%edi               # cf=0,of=0
 625         mulx    %rbp,$acc2,%rbx         # a[0]*a[2]
 626         adcx    %rax,$acc2
 627         mulx    %rsi,$acc3,$acc4        # a[0]*a[3]
 628          mov    %rcx,%rdx               # a[1]
 629         adcx    %rbx,$acc3
 630         adcx    %rdi,$acc4              # cf=0
 631
 632         ################################################################
 633         mulx    %rbp,%rax,%rbx          # a[1]*a[2]
 634         adox    %rax,$acc3
 635         adcx    %rbx,$acc4
 636         mulx    %rsi,%rax,$acc5         # a[1]*a[3]
 637          mov    %rbp,%rdx               # a[2]
 638         adox    %rax,$acc4
 639         adcx    %rdi,$acc5
 640
 641         ################################################################
 642         mulx    %rsi,%rax,$acc6         # a[2]*a[3]
 643          mov    %rcx,%rdx               # a[1]
 644         adox    %rax,$acc5
 645         adcx    %rdi,$acc6              # cf=0
 646         adox    %rdi,$acc6              # of=0
 647
 648          adcx   $acc1,$acc1             # acc1:6<<1
 649         adox    $acc7,$acc1             # add high half of a[0]*a[0]
 650          adcx   $acc2,$acc2
 651         mulx    %rdx,%rax,%rbx          # a[1]*a[1]
 652          mov    %rbp,%rdx               # a[2]
 653          adcx   $acc3,$acc3
 654         adox    %rax,$acc2
 655          adcx   $acc4,$acc4
 656         adox    %rbx,$acc3
 657         mulx    %rdx,%rax,%rbx          # a[2]*a[2]
 658          mov    %rsi,%rdx               # a[3]
 659          adcx   $acc5,$acc5
 660         adox    %rax,$acc4
 661          adcx   $acc6,$acc6
 662         adox    %rbx,$acc5
 663         mulx    %rdx,%rax,$acc7         # a[3]*a[3]
 664          mov    \$38,%edx               # multiplier used by .Lreduce64
 665         adox    %rax,$acc6
 666         adcx    %rdi,$acc7              # cf=0
 667         adox    %rdi,$acc7              # of=0
 668         jmp     .Lreduce64
 669
 670 .align  32
 671 .Lreduce64:                             # shared tail, also entered from x25519_fe64_mul; in: t[0-7] in r8-r15, %rdx = 38, %rdi = 0
 672         mulx    $acc4,%rax,%rbx         # t[4]*38
 673         adcx    %rax,$acc0
 674         adox    %rbx,$acc1
 675         mulx    $acc5,%rax,%rbx         # t[5]*38
 676         adcx    %rax,$acc1
 677         adox    %rbx,$acc2
 678         mulx    $acc6,%rax,%rbx         # t[6]*38
 679         adcx    %rax,$acc2
 680         adox    %rbx,$acc3
 681         mulx    $acc7,%rax,$acc4        # t[7]*38, high half becomes the new top limb
 682         adcx    %rax,$acc3
 683         adox    %rdi,$acc4
 684         adcx    %rdi,$acc4
 685
 686         mov     8*2(%rsp),%rdi          # restore dst
 687         imulq   %rdx,$acc4              # top limb * 38, folded back into the low limb
 688
 689         add     $acc4,$acc0
 690         adc     \$0,$acc1
 691         adc     \$0,$acc2
 692         adc     \$0,$acc3
 693
 694         sbb     %rax,%rax               # cf -> mask
 695         and     \$38,%rax               # 38 if the addition carried out of 2^256, else 0
 696
 697         add     %rax,$acc0
 698         mov     $acc1,8*1(%rdi)
 699         mov     $acc2,8*2(%rdi)
 700         mov     $acc3,8*3(%rdi)
 701         mov     $acc0,8*0(%rdi)
 702
 703         mov     8*3(%rsp),%r15          # saved registers sit above scratch and dst
 704 .cfi_restore    %r15
 705         mov     8*4(%rsp),%r14
 706 .cfi_restore    %r14
 707         mov     8*5(%rsp),%r13
 708 .cfi_restore    %r13
 709         mov     8*6(%rsp),%r12
 710 .cfi_restore    %r12
 711         mov     8*7(%rsp),%rbx
 712 .cfi_restore    %rbx
 713         mov     8*8(%rsp),%rbp
 714 .cfi_restore    %rbp
 715         lea     8*9(%rsp),%rsp          # release 16-byte scratch + dst + 6 saved registers
 716 .cfi_adjust_cfa_offset  -72             # stack shrank by 72 bytes, so the CFA offset decreases
 717 .Lfe64_sqr_epilogue:
 718         ret
 719 .cfi_endproc
 720 .size   x25519_fe64_sqr,.-x25519_fe64_sqr
 721
 722 .globl  x25519_fe64_mul121666           # r = a*121666 on four 64-bit limbs, overflow folded back via *38
 723 .type   x25519_fe64_mul121666,\@function,2
 724 .align  32
 725 x25519_fe64_mul121666:                  # %rdi = r (out), %rsi = a
 726 .Lfe64_mul121666_body:
 727         mov     \$121666,%edx           # implicit mulx operand
 728         mulx    8*0(%rsi),$acc0,%rcx    # 121666*a[0]
 729         mulx    8*1(%rsi),$acc1,%rax    # 121666*a[1]
 730         add     %rcx,$acc1
 731         mulx    8*2(%rsi),$acc2,%rcx    # 121666*a[2]
 732         adc     %rax,$acc2
 733         mulx    8*3(%rsi),$acc3,%rax    # 121666*a[3]
 734         adc     %rcx,$acc3
 735         adc     \$0,%rax                # %rax = limb above 2^256
 736
 737         imulq   \$38,%rax,%rax          # fold it: 2^256 = 38 mod 2^255-19
 738
 739         add     %rax,$acc0
 740         adc     \$0,$acc1
 741         adc     \$0,$acc2
 742         adc     \$0,$acc3
 743
 744         sbb     %rax,%rax               # cf -> mask
 745         and     \$38,%rax               # 38 if that addition carried out, else 0
 746
 747         add     %rax,$acc0
 748         mov     $acc1,8*1(%rdi)
 749         mov     $acc2,8*2(%rdi)
 750         mov     $acc3,8*3(%rdi)
 751         mov     $acc0,8*0(%rdi)
 752
 753 .Lfe64_mul121666_epilogue:
 754         ret
 755 .size   x25519_fe64_mul121666,.-x25519_fe64_mul121666
 756
 757 .globl  x25519_fe64_add                 # r = a+b on four 64-bit limbs, carry out folded back as +38
 758 .type   x25519_fe64_add,\@function,3
 759 .align  32
 760 x25519_fe64_add:                        # %rdi = r (out), %rsi = a, %rdx = b
 761 .Lfe64_add_body:
 762         mov     8*0(%rsi),$acc0
 763         mov     8*1(%rsi),$acc1
 764         mov     8*2(%rsi),$acc2
 765         mov     8*3(%rsi),$acc3
 766
 767         add     8*0(%rdx),$acc0
 768         adc     8*1(%rdx),$acc1
 769         adc     8*2(%rdx),$acc2
 770         adc     8*3(%rdx),$acc3
 771
 772         sbb     %rax,%rax               # cf -> mask
 773         and     \$38,%rax               # 38 if the sum carried out of 2^256, else 0
 774
 775         add     %rax,$acc0
 776         adc     \$0,$acc1
 777         adc     \$0,$acc2
 778         mov     $acc1,8*1(%rdi)
 779         adc     \$0,$acc3
 780         mov     $acc2,8*2(%rdi)
 781         sbb     %rax,%rax               # cf -> mask
 782         mov     $acc3,8*3(%rdi)
 783         and     \$38,%rax               # second fold in case the correction itself carried out
 784
 785         add     %rax,$acc0
 786         mov     $acc0,8*0(%rdi)
 787
 788 .Lfe64_add_epilogue:
 789         ret
 790 .size   x25519_fe64_add,.-x25519_fe64_add
 791
 792 .globl  x25519_fe64_sub                 # r = a-b on four 64-bit limbs, borrow folded back as -38
 793 .type   x25519_fe64_sub,\@function,3
 794 .align  32
 795 x25519_fe64_sub:                        # %rdi = r (out), %rsi = a, %rdx = b
 796 .Lfe64_sub_body:
 797         mov     8*0(%rsi),$acc0
 798         mov     8*1(%rsi),$acc1
 799         mov     8*2(%rsi),$acc2
 800         mov     8*3(%rsi),$acc3
 801
 802         sub     8*0(%rdx),$acc0
 803         sbb     8*1(%rdx),$acc1
 804         sbb     8*2(%rdx),$acc2
 805         sbb     8*3(%rdx),$acc3
 806
 807         sbb     %rax,%rax               # cf -> mask
 808         and     \$38,%rax               # 38 if the difference borrowed, else 0
 809
 810         sub     %rax,$acc0
 811         sbb     \$0,$acc1
 812         sbb     \$0,$acc2
 813         mov     $acc1,8*1(%rdi)
 814         sbb     \$0,$acc3
 815         mov     $acc2,8*2(%rdi)
 816         sbb     %rax,%rax               # cf -> mask
 817         mov     $acc3,8*3(%rdi)
 818         and     \$38,%rax               # second fold in case the correction itself borrowed
 819
 820         sub     %rax,$acc0
 821         mov     $acc0,8*0(%rdi)
 822
 823 .Lfe64_sub_epilogue:
 824         ret
 825 .size   x25519_fe64_sub,.-x25519_fe64_sub
 826
 827 .globl  x25519_fe64_tobytes             # final reduction of a four-limb value modulo 2^255-19, stored as four 64-bit words
 828 .type   x25519_fe64_tobytes,\@function,2
 829 .align  32
 830 x25519_fe64_tobytes:                    # %rdi = output, %rsi = input limbs
 831 .Lfe64_to_body:
 832         mov     8*0(%rsi),$acc0
 833         mov     8*1(%rsi),$acc1
 834         mov     8*2(%rsi),$acc2
 835         mov     8*3(%rsi),$acc3
 836
 837         ################################# reduction modulo 2^255-19
 838         lea     ($acc3,$acc3),%rax
 839         sar     \$63,$acc3              # most significant bit -> mask
 840         shr     \$1,%rax                # most significant bit cleared
 841         and     \$19,$acc3
 842         add     \$19,$acc3              # compare to modulus in the same go
 843
 844         add     $acc3,$acc0
 845         adc     \$0,$acc1
 846         adc     \$0,$acc2
 847         adc     \$0,%rax
 848
 849         lea     (%rax,%rax),$acc3
 850         sar     \$63,%rax               # most significant bit -> mask
 851         shr     \$1,$acc3               # most significant bit cleared
 852         not     %rax                    # invert: all ones when bit 255 stayed clear
 853         and     \$19,%rax               # undo the speculative +19 in that case
 854
 855         sub     %rax,$acc0
 856         sbb     \$0,$acc1
 857         sbb     \$0,$acc2
 858         sbb     \$0,$acc3
 859
 860         mov     $acc0,8*0(%rdi)
 861         mov     $acc1,8*1(%rdi)
 862         mov     $acc2,8*2(%rdi)
 863         mov     $acc3,8*3(%rdi)
 864
 865 .Lfe64_to_epilogue:
 866         ret
 867 .size   x25519_fe64_tobytes,.-x25519_fe64_tobytes
 868 ___
 869 } else {
 870 $code.=<<___;
 871 .globl  x25519_fe64_eligible
 872 .type   x25519_fe64_eligible,\@abi-omnipotent
 873 .align  32
 874 x25519_fe64_eligible:           # fallback variant emitted on this branch: unconditionally returns 0
 875         xor     %eax,%eax               # return value 0; NOTE(review): presumably read by the caller as "fe64 path unavailable" - confirm at call site
 876         ret
 877 .size   x25519_fe64_eligible,.-x25519_fe64_eligible
 878
 879 .globl  x25519_fe64_mul         # fallback branch: every fe64 entry point is exported, but only as a trapping stub
 880 .type   x25519_fe64_mul,\@abi-omnipotent
 881 .globl  x25519_fe64_sqr
 882 .globl  x25519_fe64_mul121666
 883 .globl  x25519_fe64_add
 884 .globl  x25519_fe64_sub
 885 .globl  x25519_fe64_tobytes
 886 x25519_fe64_mul:                # all six labels below resolve to the same address
 887 x25519_fe64_sqr:
 888 x25519_fe64_mul121666:
 889 x25519_fe64_add:
 890 x25519_fe64_sub:
 891 x25519_fe64_tobytes:
 892         .byte   0x0f,0x0b       # ud2: invalid-opcode trap if any of the stubs is ever called
 893         ret
 894 .size   x25519_fe64_mul,.-x25519_fe64_mul       # type and size are recorded for the first alias only
 895 ___
 896 }
 897 $code.=<<___;   # identification string, appended after either branch above; \@ keeps Perl from interpolating an array in the heredoc
 898 .asciz  "X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 899 ___
 900
 901 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
 902 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
 903 if ($win64) {   # Win64 only: emit the SEH handlers plus the .pdata/.xdata tables that reference them
 904 $rec="%rcx";    # 1st handler argument; not referenced by the handler code that follows
 905 $frame="%rdx";  # 2nd handler argument; not referenced by the handler code that follows
 906 $context="%r8"; # 3rd handler argument: CONTEXT being patched
 907 $disp="%r9";    # 4th handler argument: DISPATCHER_CONTEXT
 908
 909 $code.=<<___;
 910 .extern __imp_RtlVirtualUnwind  # called indirectly, RIP-relative, at the end of the common handler tail
 911
 912 .type   short_handler,\@abi-omnipotent
 913 .align  16
 914 short_handler:                  # SEH handler that recovers only Rsp, Rsi and Rdi; finishes in .Lcommon_seh_tail inside full_handler
 915         push    %rsi                    # save registers and flags; the shared tail pops them in reverse order before ret
 916         push    %rdi
 917         push    %rbx
 918         push    %rbp
 919         push    %r12
 920         push    %r13
 921         push    %r14
 922         push    %r15
 923         pushfq
 924         sub     \$64,%rsp               # scratch area; the shared tail stores RtlVirtualUnwind args 5..8 at 32..56(%rsp)
 925
 926         mov     120($context),%rax      # pull context->Rax
 927         mov     248($context),%rbx      # pull context->Rip
 928
 929         mov     8($disp),%rsi           # disp->ImageBase
 930         mov     56($disp),%r11          # disp->HandlerData
 931
 932         mov     0(%r11),%r10d           # HandlerData[0]
 933         lea     (%rsi,%r10),%r10        # end of prologue label
 934         cmp     %r10,%rbx               # context->Rip<end of prologue label
 935         jb      .Lcommon_seh_tail       # fault before the body label: tail uses context->Rax as the base address
 936
 937         mov     152($context),%rax      # pull context->Rsp
 938         jmp     .Lcommon_seh_tail       # otherwise the tail uses context->Rsp; HandlerData[1] is never read here
 939 .size   short_handler,.-short_handler
 940
 941 .type   full_handler,\@abi-omnipotent
 942 .align  16
 943 full_handler:                   # SEH handler that also recovers rbx, rbp and r12-r15 from the faulting routine's stack
 944         push    %rsi                    # save registers and flags; popped in reverse order before ret
 945         push    %rdi
 946         push    %rbx
 947         push    %rbp
 948         push    %r12
 949         push    %r13
 950         push    %r14
 951         push    %r15
 952         pushfq
 953         sub     \$64,%rsp               # scratch area; RtlVirtualUnwind args 5..8 are stored at 32..56(%rsp) below
 954
 955         mov     120($context),%rax      # pull context->Rax
 956         mov     248($context),%rbx      # pull context->Rip
 957
 958         mov     8($disp),%rsi           # disp->ImageBase
 959         mov     56($disp),%r11          # disp->HandlerData
 960
 961         mov     0(%r11),%r10d           # HandlerData[0]
 962         lea     (%rsi,%r10),%r10        # end of prologue label
 963         cmp     %r10,%rbx               # context->Rip<end of prologue label
 964         jb      .Lcommon_seh_tail       # fault before the body label: tail uses context->Rax as the base address
 965
 966         mov     152($context),%rax      # pull context->Rsp
 967
 968         mov     4(%r11),%r10d           # HandlerData[1]
 969         lea     (%rsi,%r10),%r10        # epilogue label
 970         cmp     %r10,%rbx               # context->Rip>=epilogue label
 971         jae     .Lcommon_seh_tail       # at or past the epilogue label: context->Rsp is used as is
 972
 973         mov     8(%r11),%r10d           # HandlerData[2]
 974         lea     (%rax,%r10),%rax        # context->Rsp + HandlerData[2]: address just above the saved registers
 975
 976         mov     -8(%rax),%rbp           # saved rbp
 977         mov     -16(%rax),%rbx          # saved rbx
 978         mov     -24(%rax),%r12          # saved r12
 979         mov     -32(%rax),%r13          # saved r13
 980         mov     -40(%rax),%r14          # saved r14
 981         mov     -48(%rax),%r15          # saved r15
 982         mov     %rbx,144($context)      # restore context->Rbx
 983         mov     %rbp,160($context)      # restore context->Rbp
 984         mov     %r12,216($context)      # restore context->R12
 985         mov     %r13,224($context)      # restore context->R13
 986         mov     %r14,232($context)      # restore context->R14
 987         mov     %r15,240($context)      # restore context->R15
 988
 989 .Lcommon_seh_tail:              # shared by short_handler and full_handler; rax holds the recovered stack pointer
 990         mov     8(%rax),%rdi            # NOTE(review): presumably the rdi copy stored by the Win64 function prologue - confirm in x86_64-xlate.pl
 991         mov     16(%rax),%rsi           # NOTE(review): presumably the rsi copy stored by the same prologue - confirm
 992         mov     %rax,152($context)      # restore context->Rsp
 993         mov     %rsi,168($context)      # restore context->Rsi
 994         mov     %rdi,176($context)      # restore context->Rdi
 995
 996         mov     40($disp),%rdi          # disp->ContextRecord
 997         mov     $context,%rsi           # context
 998         mov     \$154,%ecx              # sizeof(CONTEXT) in quadwords, the unit rep movsq copies
 999         .long   0xa548f3fc              # cld; rep movsq (bytes fc f3 48 a5): copy patched context into disp->ContextRecord
1000
1001         mov     $disp,%rsi              # rsi was consumed by the copy; reload disp
1002         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1003         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1004         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1005         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1006         mov     40(%rsi),%r10           # disp->ContextRecord
1007         lea     56(%rsi),%r11           # &disp->HandlerData
1008         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1009         mov     %r10,32(%rsp)           # arg5
1010         mov     %r11,40(%rsp)           # arg6
1011         mov     %r12,48(%rsp)           # arg7
1012         mov     %rcx,56(%rsp)           # arg8, (NULL)
1013         call    *__imp_RtlVirtualUnwind(%rip)   # indirect call through the import pointer
1014
1015         mov     \$1,%eax                # ExceptionContinueSearch
1016         add     \$64,%rsp               # release the scratch area
1017         popfq
1018         pop     %r15
1019         pop     %r14
1020         pop     %r13
1021         pop     %r12
1022         pop     %rbp
1023         pop     %rbx
1024         pop     %rdi
1025         pop     %rsi
1026         ret
1027 .size   full_handler,.-full_handler
1028
1029 .section        .pdata          # function table: one triple per routine = begin RVA, end RVA, RVA of its unwind info in .xdata
1030 .align  4
1031         .rva    .LSEH_begin_x25519_fe51_mul
1032         .rva    .LSEH_end_x25519_fe51_mul
1033         .rva    .LSEH_info_x25519_fe51_mul
1034
1035         .rva    .LSEH_begin_x25519_fe51_sqr
1036         .rva    .LSEH_end_x25519_fe51_sqr
1037         .rva    .LSEH_info_x25519_fe51_sqr
1038
1039         .rva    .LSEH_begin_x25519_fe51_mul121666
1040         .rva    .LSEH_end_x25519_fe51_mul121666
1041         .rva    .LSEH_info_x25519_fe51_mul121666
1042 ___
1043 $code.=<<___    if ($addx);     # fe64 entries are appended only when addx is set; the fe51 entries above are unconditional
1044         .rva    .LSEH_begin_x25519_fe64_mul
1045         .rva    .LSEH_end_x25519_fe64_mul
1046         .rva    .LSEH_info_x25519_fe64_mul
1047
1048         .rva    .LSEH_begin_x25519_fe64_sqr
1049         .rva    .LSEH_end_x25519_fe64_sqr
1050         .rva    .LSEH_info_x25519_fe64_sqr
1051
1052         .rva    .LSEH_begin_x25519_fe64_mul121666
1053         .rva    .LSEH_end_x25519_fe64_mul121666
1054         .rva    .LSEH_info_x25519_fe64_mul121666
1055
1056         .rva    .LSEH_begin_x25519_fe64_add
1057         .rva    .LSEH_end_x25519_fe64_add
1058         .rva    .LSEH_info_x25519_fe64_add
1059
1060         .rva    .LSEH_begin_x25519_fe64_sub
1061         .rva    .LSEH_end_x25519_fe64_sub
1062         .rva    .LSEH_info_x25519_fe64_sub
1063
1064         .rva    .LSEH_begin_x25519_fe64_tobytes
1065         .rva    .LSEH_end_x25519_fe64_tobytes
1066         .rva    .LSEH_info_x25519_fe64_tobytes
1067 ___
1068 $code.=<<___;   # unwind info records referenced from .pdata; the fe51 records are unconditional
1069 .section        .xdata
1070 .align  8
1071 .LSEH_info_x25519_fe51_mul:
1072         .byte   9,0,0,0         # UNWIND_INFO header: version 1, flags 1 (UNW_FLAG_EHANDLER), prolog size 0, no unwind codes
1073         .rva    full_handler    # language-specific handler for this routine
1074         .rva    .Lfe51_mul_body,.Lfe51_mul_epilogue     # HandlerData[]
1075         .long   88,0            # HandlerData[2] = 88, added by full_handler to context->Rsp to reach the saved registers; the 0 is not read by full_handler
1076 .LSEH_info_x25519_fe51_sqr:
1077         .byte   9,0,0,0         # same header as above
1078         .rva    full_handler
1079         .rva    .Lfe51_sqr_body,.Lfe51_sqr_epilogue     # HandlerData[]
1080         .long   88,0            # HandlerData[2] = 88
1081 .LSEH_info_x25519_fe51_mul121666:
1082         .byte   9,0,0,0         # same header as above
1083         .rva    full_handler
1084         .rva    .Lfe51_mul121666_body,.Lfe51_mul121666_epilogue # HandlerData[]
1085         .long   88,0            # HandlerData[2] = 88
1086 ___
1087 $code.=<<___    if ($addx);     # fe64 records are appended only when addx is set, matching the .pdata entries
1088 .LSEH_info_x25519_fe64_mul:
1089         .byte   9,0,0,0         # UNWIND_INFO header: version 1, flags 1 (UNW_FLAG_EHANDLER), prolog size 0, no unwind codes
1090         .rva    full_handler
1091         .rva    .Lfe64_mul_body,.Lfe64_mul_epilogue     # HandlerData[]
1092         .long   72,0            # HandlerData[2] = 72, added by full_handler to context->Rsp to reach the saved registers
1093 .LSEH_info_x25519_fe64_sqr:
1094         .byte   9,0,0,0         # same header as above
1095         .rva    full_handler
1096         .rva    .Lfe64_sqr_body,.Lfe64_sqr_epilogue     # HandlerData[]
1097         .long   72,0            # HandlerData[2] = 72
1098 .LSEH_info_x25519_fe64_mul121666:
1099         .byte   9,0,0,0         # same header as above
1100         .rva    short_handler   # short_handler reads HandlerData[0] only, so no third entry follows
1101         .rva    .Lfe64_mul121666_body,.Lfe64_mul121666_epilogue # HandlerData[]
1102 .LSEH_info_x25519_fe64_add:
1103         .byte   9,0,0,0         # same header as above
1104         .rva    short_handler
1105         .rva    .Lfe64_add_body,.Lfe64_add_epilogue     # HandlerData[]
1106 .LSEH_info_x25519_fe64_sub:
1107         .byte   9,0,0,0         # same header as above
1108         .rva    short_handler
1109         .rva    .Lfe64_sub_body,.Lfe64_sub_epilogue     # HandlerData[]
1110 .LSEH_info_x25519_fe64_tobytes:
1111         .byte   9,0,0,0         # same header as above
1112         .rva    short_handler
1113         .rva    .Lfe64_to_body,.Lfe64_to_epilogue       # HandlerData[]
1114 ___
1115 }
1116
1117 $code =~ s/\`([^\`]*)\`/eval $1/gem;    # replace every backtick-quoted span with the value of the Perl expression inside it
1118 print $code;                            # emit the generated assembly on STDOUT
1119 close STDOUT or die "error closing STDOUT: $!"; # buffered write errors (and, if STDOUT is a pipe, a failing filter) only surface at close; fail the build instead of leaving truncated output