crypto/ec/asm/x25519-x86_64.pl

   1 #!/usr/bin/env perl
   2 # Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 # ====================================================================
  10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11 # project. The module is, however, dual licensed under OpenSSL and
  12 # CRYPTOGAMS licenses depending on where you obtain it. For further
  13 # details see http://www.openssl.org/~appro/cryptogams/.
  14 # ====================================================================
  15 #
  16 # X25519 lower-level primitives for x86_64.
  17 #
  18 # February 2018.
  19 #
  20 # This module implements radix 2^51 multiplication and squaring, and
  21 # radix 2^64 multiplication, squaring, addition, subtraction and final
  22 # reduction. Latter radix is used on ADCX/ADOX-capable processors such
  23 # as Broadwell. On related note one should mention that there are
  24 # vector implementations that provide significantly better performance
  25 # on some processors(*), but they are large and overly complex. Which
  26 # in combination with them being effectively processor-specific makes
  27 # the undertaking hard to justify. The goal for this implementation
  28 # is rather versatility and simplicity [and ultimately formal
  29 # verification].
  30 #
  31 # (*)   For example sandy2x should provide ~30% improvement on Sandy
  32 #       Bridge, but only nominal ~5% on Haswell [and big loss on
  33 #       Broadwell and successors].
  34 #
  35 ######################################################################
  36 # Improvement coefficients:
  37 #
  38 #                       amd64-51(*)     gcc-5.x(**)
  39 #
  40 # P4                    +22%            +40%
  41 # Sandy Bridge          -3%             +11%
  42 # Haswell               -1%             +13%
  43 # Broadwell(***)        +26%            +30%
  44 # Skylake(***)          +30%            +47%
  45 # Silvermont            +20%            +26%
  46 # Goldmont              +40%            +50%
  47 # Bulldozer             +20%            +9%
  48 # Ryzen(***)            +35%            +32%
  49 # VIA                   +170%           +120%
  50 #
  51 # (*)   amd64-51 is popular assembly implementation with 2^51 radix,
  52 #       only multiplication and squaring subroutines were linked
  53 #       for comparison, but not complete ladder step; gain on most
  54 #       processors is because this module refrains from shld, and
  55 #       minor regression on others is because this does result in
  56 #       higher instruction count;
  57 # (**)  compiler is free to inline functions, in assembly one would
  58 #       need to implement ladder step to do that, and it will improve
  59 #       performance by several percent;
  60 # (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
  61 #       C implementation, so that comparison is always against
  62 #       2^51 radix;
  63
  64 $flavour = shift;
  65 $output  = shift;
     # A lone argument containing a dot is the output file name, not a flavour.
  66 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  67
     # Win64 ABI is implied by the flavour or by an .asm output file.
  68 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  69
     # Look for the perlasm translator beside this script, then in ../../perlasm.
  70 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  71 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  72 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  73 die "can't locate x86_64-xlate.pl";
  74
     # Everything printed from here on is piped through the translator; abort
     # if the pipe cannot be started rather than silently producing nothing.
  75 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
         or die "can't call $xlate: $!";
  76 *STDOUT=*OUT;
  77
     # Decide whether the assembler in use understands ADCX/ADOX/MULX.
     # $addx stays false unless one of the probes below succeeds.
  78 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  79                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  80         $addx = ($1>=2.23);
  81 }
  82
  83 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  84             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  85         $addx = ($1>=2.10);
  86 }
  87
  88 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  89             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  90         $addx = ($1>=12);
  91 }
  92
     # The major version may have more than one digit (clang 10 and later).
  93 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
  94         my $ver = $2 + $3/100.0;        # 3.1->3.01, 3.10->3.10
  95         $addx = ($ver>=3.03);
  96 }
  97
  98 $code.=<<___;
  99 .text
 100
     # x25519_fe51_mul: h = f*g in radix 2^51.
     #   %rdi - h, output, five 64-bit limbs (parked on the stack meanwhile)
     #   %rsi - f, five 64-bit limbs
     #   %rdx - g, five 64-bit limbs
     # Limb products whose weight reaches 2^255 are taken against g[i]*19
     # so that they fold back into the low columns.  The five double-width
     # column sums h0..h4 are left in %rbx:%rcx, %r8:%r9, %r10:%r11,
     # %r12:%r13 and %r14:%r15 for .Lreduce51, which also pops the frame
     # set up here.
 101 .globl  x25519_fe51_mul
 102 .type   x25519_fe51_mul,\@function,3
 103 .align  32
 104 x25519_fe51_mul:
 105         push    %rbp
 106         push    %rbx
 107         push    %r12
 108         push    %r13
 109         push    %r14
 110         push    %r15
 111         lea     -8*5(%rsp),%rsp
 112
 113         mov     8*0(%rsi),%rax          # f[0]
 114         mov     8*0(%rdx),%r11          # load g[0-4]
 115         mov     8*1(%rdx),%r12
 116         mov     8*2(%rdx),%r13
 117         mov     8*3(%rdx),%rbp
 118         mov     8*4(%rdx),%r14
 119
 120         mov     %rdi,8*4(%rsp)          # offload 1st argument
 121         mov     %rax,%rdi
 122         mulq    %r11                    # f[0]*g[0]
 123         mov     %r11,8*0(%rsp)          # offload g[0]
 124         mov     %rax,%rbx               # %rbx:%rcx = h0
 125         mov     %rdi,%rax
 126         mov     %rdx,%rcx
 127         mulq    %r12                    # f[0]*g[1]
 128         mov     %r12,8*1(%rsp)          # offload g[1]
 129         mov     %rax,%r8                # %r8:%r9 = h1
 130         mov     %rdi,%rax
 131         lea     (%r14,%r14,8),%r15      # g[4]*9
 132         mov     %rdx,%r9
 133         mulq    %r13                    # f[0]*g[2]
 134         mov     %r13,8*2(%rsp)          # offload g[2]
 135         mov     %rax,%r10               # %r10:%r11 = h2
 136         mov     %rdi,%rax
 137         lea     (%r14,%r15,2),%rdi      # g[4]*19
 138         mov     %rdx,%r11
 139         mulq    %rbp                    # f[0]*g[3]
 140         mov     %rax,%r12               # %r12:%r13 = h3
 141         mov     8*0(%rsi),%rax          # f[0]
 142         mov     %rdx,%r13
 143         mulq    %r14                    # f[0]*g[4]
 144         mov     %rax,%r14               # %r14:%r15 = h4
 145         mov     8*1(%rsi),%rax          # f[1]
 146         mov     %rdx,%r15
 147
 148         mulq    %rdi                    # f[1]*g[4]*19
 149         add     %rax,%rbx
 150         mov     8*2(%rsi),%rax          # f[2]
 151         adc     %rdx,%rcx
 152         mulq    %rdi                    # f[2]*g[4]*19
 153         add     %rax,%r8
 154         mov     8*3(%rsi),%rax          # f[3]
 155         adc     %rdx,%r9
 156         mulq    %rdi                    # f[3]*g[4]*19
 157         add     %rax,%r10
 158         mov     8*4(%rsi),%rax          # f[4]
 159         adc     %rdx,%r11
 160         mulq    %rdi                    # f[4]*g[4]*19
 161         imulq   \$19,%rbp,%rdi          # g[3]*19
 162         add     %rax,%r12
 163         mov     8*1(%rsi),%rax          # f[1]
 164         adc     %rdx,%r13
 165         mulq    %rbp                    # f[1]*g[3]
 166         mov     8*2(%rsp),%rbp          # g[2]
 167         add     %rax,%r14
 168         mov     8*2(%rsi),%rax          # f[2]
 169         adc     %rdx,%r15
 170
 171         mulq    %rdi                    # f[2]*g[3]*19
 172         add     %rax,%rbx
 173         mov     8*3(%rsi),%rax          # f[3]
 174         adc     %rdx,%rcx
 175         mulq    %rdi                    # f[3]*g[3]*19
 176         add     %rax,%r8
 177         mov     8*4(%rsi),%rax          # f[4]
 178         adc     %rdx,%r9
 179         mulq    %rdi                    # f[4]*g[3]*19
 180         imulq   \$19,%rbp,%rdi          # g[2]*19
 181         add     %rax,%r10
 182         mov     8*1(%rsi),%rax          # f[1]
 183         adc     %rdx,%r11
 184         mulq    %rbp                    # f[1]*g[2]
 185         add     %rax,%r12
 186         mov     8*2(%rsi),%rax          # f[2]
 187         adc     %rdx,%r13
 188         mulq    %rbp                    # f[2]*g[2]
 189         mov     8*1(%rsp),%rbp          # g[1]
 190         add     %rax,%r14
 191         mov     8*3(%rsi),%rax          # f[3]
 192         adc     %rdx,%r15
 193
 194         mulq    %rdi                    # f[3]*g[2]*19
 195         add     %rax,%rbx
 196         mov     8*4(%rsi),%rax          # f[4]
 197         adc     %rdx,%rcx
 198         mulq    %rdi                    # f[4]*g[2]*19
 199         add     %rax,%r8
 200         mov     8*1(%rsi),%rax          # f[1]
 201         adc     %rdx,%r9
 202         mulq    %rbp                    # f[1]*g[1]
 203         imulq   \$19,%rbp,%rdi          # g[1]*19
 204         add     %rax,%r10
 205         mov     8*2(%rsi),%rax          # f[2]
 206         adc     %rdx,%r11
 207         mulq    %rbp                    # f[2]*g[1]
 208         add     %rax,%r12
 209         mov     8*3(%rsi),%rax          # f[3]
 210         adc     %rdx,%r13
 211         mulq    %rbp                    # f[3]*g[1]
 212         mov     8*0(%rsp),%rbp          # g[0]
 213         add     %rax,%r14
 214         mov     8*4(%rsi),%rax          # f[4]
 215         adc     %rdx,%r15
 216
 217         mulq    %rdi                    # f[4]*g[1]*19
 218         add     %rax,%rbx
 219         mov     8*1(%rsi),%rax          # f[1]
 220         adc     %rdx,%rcx
 221         mul     %rbp                    # f[1]*g[0]
 222         add     %rax,%r8
 223         mov     8*2(%rsi),%rax          # f[2]
 224         adc     %rdx,%r9
 225         mul     %rbp                    # f[2]*g[0]
 226         add     %rax,%r10
 227         mov     8*3(%rsi),%rax          # f[3]
 228         adc     %rdx,%r11
 229         mul     %rbp                    # f[3]*g[0]
 230         add     %rax,%r12
 231         mov     8*4(%rsi),%rax          # f[4]
 232         adc     %rdx,%r13
 233         mulq    %rbp                    # f[4]*g[0]
 234         add     %rax,%r14
 235         adc     %rdx,%r15
 236
 237         mov     8*4(%rsp),%rdi          # restore 1st argument
 238         jmp     .Lreduce51
 239 .size   x25519_fe51_mul,.-x25519_fe51_mul
 240
 241 .globl  x25519_fe51_sqr
     # x25519_fe51_sqr: h = g*g in radix 2^51.
     #   %rdi - h, output, five 64-bit limbs (parked on the stack meanwhile)
     #   %rsi - g, five 64-bit limbs
     # Cross products are taken once against a doubled operand; terms whose
     # weight reaches 2^255 are taken against a limb multiplied by 19.
     # Continues at .Lreduce51, the tail shared by all fe51 entry points.
 242 .type   x25519_fe51_sqr,\@function,2
 243 .align  32
 244 x25519_fe51_sqr:
 245         push    %rbp
 246         push    %rbx
 247         push    %r12
 248         push    %r13
 249         push    %r14
 250         push    %r15
 251         lea     -8*5(%rsp),%rsp
 252
 253         mov     8*0(%rsi),%rax          # g[0]
 254         mov     8*2(%rsi),%r15          # g[2]
 255         mov     8*4(%rsi),%rbp          # g[4]
 256
 257         mov     %rdi,8*4(%rsp)          # offload 1st argument
 258         lea     (%rax,%rax),%r14        # 2*g[0]
 259         mulq    %rax                    # g[0]*g[0]
 260         mov     %rax,%rbx               # %rbx:%rcx = h0
 261         mov     8*1(%rsi),%rax          # g[1]
 262         mov     %rdx,%rcx
 263         mulq    %r14                    # 2*g[0]*g[1]
 264         mov     %rax,%r8                # %r8:%r9 = h1
 265         mov     %r15,%rax
 266         mov     %r15,8*0(%rsp)          # offload g[2]
 267         mov     %rdx,%r9
 268         mulq    %r14                    # 2*g[0]*g[2]
 269         mov     %rax,%r10               # %r10:%r11 = h2
 270         mov     8*3(%rsi),%rax          # g[3]
 271         mov     %rdx,%r11
 272         imulq   \$19,%rbp,%rdi          # g[4]*19
 273         mulq    %r14                    # 2*g[0]*g[3]
 274         mov     %rax,%r12               # %r12:%r13 = h3
 275         mov     %rbp,%rax
 276         mov     %rdx,%r13
 277         mulq    %r14                    # 2*g[0]*g[4]
 278         mov     %rax,%r14               # %r14:%r15 = h4
 279         mov     %rbp,%rax
 280         mov     %rdx,%r15
 281
 282         mulq    %rdi                    # g[4]*g[4]*19
 283         add     %rax,%r12
 284         mov     8*1(%rsi),%rax          # g[1]
 285         adc     %rdx,%r13
 286
 287         mov     8*3(%rsi),%rsi          # g[3]
 288         lea     (%rax,%rax),%rbp        # 2*g[1]
 289         mulq    %rax                    # g[1]*g[1]
 290         add     %rax,%r10
 291         mov     8*0(%rsp),%rax          # g[2]
 292         adc     %rdx,%r11
 293         mulq    %rbp                    # 2*g[1]*g[2]
 294         add     %rax,%r12
 295         mov     %rbp,%rax
 296         adc     %rdx,%r13
 297         mulq    %rsi                    # 2*g[1]*g[3]
 298         add     %rax,%r14
 299         mov     %rbp,%rax
 300         adc     %rdx,%r15
 301         imulq   \$19,%rsi,%rbp          # g[3]*19
 302         mulq    %rdi                    # 2*g[1]*g[4]*19
 303         add     %rax,%rbx
 304         lea     (%rsi,%rsi),%rax        # 2*g[3]
 305         adc     %rdx,%rcx
 306
 307         mulq    %rdi                    # 2*g[3]*g[4]*19
 308         add     %rax,%r10
 309         mov     %rsi,%rax
 310         adc     %rdx,%r11
 311         mulq    %rbp                    # g[3]*g[3]*19
 312         add     %rax,%r8
 313         mov     8*0(%rsp),%rax          # g[2]
 314         adc     %rdx,%r9
 315
 316         lea     (%rax,%rax),%rsi        # 2*g[2]
 317         mulq    %rax                    # g[2]*g[2]
 318         add     %rax,%r14
 319         mov     %rbp,%rax
 320         adc     %rdx,%r15
 321         mulq    %rsi                    # 2*g[2]*g[3]*19
 322         add     %rax,%rbx
 323         mov     %rsi,%rax
 324         adc     %rdx,%rcx
 325         mulq    %rdi                    # 2*g[2]*g[4]*19
 326         add     %rax,%r8
 327         adc     %rdx,%r9
 328
 329         mov     8*4(%rsp),%rdi          # restore 1st argument
 330         jmp     .Lreduce51
 331
     # .Lreduce51: tail shared by the fe51 entry points.  Expects the output
     # pointer in %rdi, the double-width sums h0..h4 in %rbx:%rcx, %r8:%r9,
     # %r10:%r11, %r12:%r13, %r14:%r15, and the frame built by the entry
     # points (five scratch slots below the six saved registers).  Each sum
     # is split at bit 51 and the upper part is added to the next column;
     # the upper part of h4 is multiplied by 19 and added to limb 0.  Limbs
     # 1 and 3 are stored right after receiving their last carry, without
     # being masked again.
 332 .align  32
 333 .Lreduce51:
 334         mov     \$0x7ffffffffffff,%rbp
 335
 336         mov     %r10,%rdx
 337         shr     \$51,%r10
 338         shl     \$13,%r11
 339         and     %rbp,%rdx               # %rdx = g2 = h2 & mask
 340         or      %r10,%r11               # h2>>51
 341         add     %r11,%r12
 342         adc     \$0,%r13                # h3 += h2>>51
 343
 344         mov     %rbx,%rax
 345         shr     \$51,%rbx
 346         shl     \$13,%rcx
 347         and     %rbp,%rax               # %rax = g0 = h0 & mask
 348         or      %rbx,%rcx               # h0>>51
 349         add     %rcx,%r8                # h1 += h0>>51
 350         adc     \$0,%r9
 351
 352         mov     %r12,%rbx
 353         shr     \$51,%r12
 354         shl     \$13,%r13
 355         and     %rbp,%rbx               # %rbx = g3 = h3 & mask
 356         or      %r12,%r13               # h3>>51
 357         add     %r13,%r14               # h4 += h3>>51
 358         adc     \$0,%r15
 359
 360         mov     %r8,%rcx
 361         shr     \$51,%r8
 362         shl     \$13,%r9
 363         and     %rbp,%rcx               # %rcx = g1 = h1 & mask
 364         or      %r8,%r9                 # h1>>51
 365         add     %r9,%rdx                # g2 += h1>>51
 366
 367         mov     %r14,%r10
 368         shr     \$51,%r14
 369         shl     \$13,%r15
 370         and     %rbp,%r10               # %r10 = g4 = h4 & mask
 371         or      %r14,%r15               # h4>>51
 372
 373         lea     (%r15,%r15,8),%r14      # (h4>>51)*9
 374         lea     (%r15,%r14,2),%r15      # (h4>>51)*19
 375         add     %r15,%rax               # g0 += (h4>>51)*19
 376
 377         mov     %rdx,%r8
 378         and     %rbp,%rdx               # g2 &= mask
 379         shr     \$51,%r8
 380         add     %r8,%rbx                # g3 += g2>>51
 381
 382         mov     %rax,%r9
 383         and     %rbp,%rax               # g0 &= mask
 384         shr     \$51,%r9
 385         add     %r9,%rcx                # g1 += g0>>51
 386
 387         mov     %rax,8*0(%rdi)          # save the result
 388         mov     %rcx,8*1(%rdi)
 389         mov     %rdx,8*2(%rdi)
 390         mov     %rbx,8*3(%rdi)
 391         mov     %r10,8*4(%rdi)
 392
 393         mov     8*5(%rsp),%r15          # saved registers sit above the five scratch slots
 394         mov     8*6(%rsp),%r14
 395         mov     8*7(%rsp),%r13
 396         mov     8*8(%rsp),%r12
 397         mov     8*9(%rsp),%rbx
 398         mov     8*10(%rsp),%rbp
 399         lea     8*11(%rsp),%rsp
 400         ret
 401 .size   x25519_fe51_sqr,.-x25519_fe51_sqr
 402
 403 .globl  x25519_fe51_mul121666
     # x25519_fe51_mul121666: h = f*121666 in radix 2^51.
     #   %rdi - h, output, five 64-bit limbs (left untouched until the tail)
     #   %rsi - f, five 64-bit limbs
     # Every limb is multiplied by the constant and the five double-width
     # products are handed to .Lreduce51.  The frame is built exactly like
     # in the other fe51 entry points because .Lreduce51 pops it.
 404 .type   x25519_fe51_mul121666,\@function,2
 405 .align  32
 406 x25519_fe51_mul121666:
 407         push    %rbp
 408         push    %rbx
 409         push    %r12
 410         push    %r13
 411         push    %r14
 412         push    %r15
 413         mov     \$121666,%eax
 414         lea     -8*5(%rsp),%rsp
 415
 416         mulq    8*0(%rsi)
 417         mov     %rax,%rbx               # %rbx:%rcx = h0
 418         mov     \$121666,%eax
 419         mov     %rdx,%rcx
 420         mulq    8*1(%rsi)
 421         mov     %rax,%r8                # %r8:%r9 = h1
 422         mov     \$121666,%eax
 423         mov     %rdx,%r9
 424         mulq    8*2(%rsi)
 425         mov     %rax,%r10               # %r10:%r11 = h2
 426         mov     \$121666,%eax
 427         mov     %rdx,%r11
 428         mulq    8*3(%rsi)
 429         mov     %rax,%r12               # %r12:%r13 = h3
 430         mov     \$121666,%eax           # reload the constant, mulq clobbers %rax
 431         mov     %rdx,%r13
 432         mulq    8*4(%rsi)
 433         mov     %rax,%r14               # %r14:%r15 = h4
 434         mov     %rdx,%r15
 435
 436         jmp     .Lreduce51
 437 .size   x25519_fe51_mul121666,.-x25519_fe51_mul121666
 438 ___
 439 ########################################################################
 440 # Base 2^64 subroutines modulo 2*(2^255-19)
 441 #
 442 if ($addx) {
     # Accumulator names acc0..acc7 map onto %r8..%r15.
 443 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));
 444
 445 $code.=<<___;
 446 .extern OPENSSL_ia32cap_P
     # x25519_fe64_eligible: returns non-zero (0x80100) in %eax when both
     # bits of mask 0x80100 are set in the 32-bit word at byte offset 8 of
     # OPENSSL_ia32cap_P, zero otherwise.  Those are bits 8 and 19, which
     # the OPENSSL_ia32cap documentation lists as the BMI2 and ADX feature
     # flags, i.e. what the mulx/adcx/adox code below needs.
 447 .globl  x25519_fe64_eligible
 448 .type   x25519_fe64_eligible,\@abi-omnipotent
 449 .align  32
 450 x25519_fe64_eligible:
 451         mov     OPENSSL_ia32cap_P+8(%rip),%ecx
 452         xor     %eax,%eax
 453         and     \$0x80100,%ecx
 454         cmp     \$0x80100,%ecx          # both bits are required
 455         cmove   %ecx,%eax
 456         ret
 457 .size   x25519_fe64_eligible,.-x25519_fe64_eligible
 458
 459 .globl  x25519_fe64_mul
     # x25519_fe64_mul: dst = a*b in radix 2^64.
     #   %rdi - dst, four 64-bit limbs (pushed on the stack, restored by
     #          .Lreduce64)
     #   %rsi - a, four 64-bit limbs
     #   %rdx - b, four 64-bit limbs
     # The 4x4 limb product is accumulated row by row in %r8-%r15 with
     # mulx and two independent carry chains: adcx uses CF, adox uses OF.
     # %rdi is zeroed and serves as the zero addend that flushes a chain.
     # On exit to .Lreduce64 both chains are clear and %rdx holds 38.
 460 .type   x25519_fe64_mul,\@function,3
 461 .align  32
 462 x25519_fe64_mul:
 463         push    %rbp
 464         push    %rbx
 465         push    %r12
 466         push    %r13
 467         push    %r14
 468         push    %r15
 469         push    %rdi                    # offload dst
 470         lea     -8*2(%rsp),%rsp
 471
 472         mov     %rdx,%rax               # mulx needs %rdx, keep the b pointer in %rax
 473         mov     8*0(%rdx),%rbp          # b[0]
 474         mov     8*0(%rsi),%rdx          # a[0]
 475         mov     8*1(%rax),%rcx          # b[1]
 476         mov     8*2(%rax),$acc6         # b[2]
 477         mov     8*3(%rax),$acc7         # b[3]
 478
 479         mulx    %rbp,$acc0,%rax         # a[0]*b[0]
 480         xor     %edi,%edi               # cf=0,of=0
 481         mulx    %rcx,$acc1,%rbx         # a[0]*b[1]
 482         adcx    %rax,$acc1
 483         mulx    $acc6,$acc2,%rax        # a[0]*b[2]
 484         adcx    %rbx,$acc2
 485         mulx    $acc7,$acc3,$acc4       # a[0]*b[3]
 486          mov    8*1(%rsi),%rdx          # a[1]
 487         adcx    %rax,$acc3
 488         mov     $acc6,(%rsp)            # offload b[2]
 489         adcx    %rdi,$acc4              # cf=0
 490
 491         mulx    %rbp,%rax,%rbx          # a[1]*b[0]
 492         adox    %rax,$acc1
 493         adcx    %rbx,$acc2
 494         mulx    %rcx,%rax,%rbx          # a[1]*b[1]
 495         adox    %rax,$acc2
 496         adcx    %rbx,$acc3
 497         mulx    $acc6,%rax,%rbx         # a[1]*b[2]
 498         adox    %rax,$acc3
 499         adcx    %rbx,$acc4
 500         mulx    $acc7,%rax,$acc5        # a[1]*b[3]
 501          mov    8*2(%rsi),%rdx          # a[2]
 502         adox    %rax,$acc4
 503         adcx    %rdi,$acc5              # cf=0
 504         adox    %rdi,$acc5              # of=0
 505
 506         mulx    %rbp,%rax,%rbx          # a[2]*b[0]
 507         adcx    %rax,$acc2
 508         adox    %rbx,$acc3
 509         mulx    %rcx,%rax,%rbx          # a[2]*b[1]
 510         adcx    %rax,$acc3
 511         adox    %rbx,$acc4
 512         mulx    $acc6,%rax,%rbx         # a[2]*b[2]
 513         adcx    %rax,$acc4
 514         adox    %rbx,$acc5
 515         mulx    $acc7,%rax,$acc6        # a[2]*b[3]
 516          mov    8*3(%rsi),%rdx          # a[3]
 517         adcx    %rax,$acc5
 518         adox    %rdi,$acc6              # of=0
 519         adcx    %rdi,$acc6              # cf=0
 520
 521         mulx    %rbp,%rax,%rbx          # a[3]*b[0]
 522         adox    %rax,$acc3
 523         adcx    %rbx,$acc4
 524         mulx    %rcx,%rax,%rbx          # a[3]*b[1]
 525         adox    %rax,$acc4
 526         adcx    %rbx,$acc5
 527         mulx    (%rsp),%rax,%rbx        # a[3]*b[2]
 528         adox    %rax,$acc5
 529         adcx    %rbx,$acc6
 530         mulx    $acc7,%rax,$acc7        # a[3]*b[3]
 531          mov    \$38,%edx               # multiplier expected by .Lreduce64
 532         adox    %rax,$acc6
 533         adcx    %rdi,$acc7              # cf=0
 534         adox    %rdi,$acc7              # of=0
 535
 536         jmp     .Lreduce64
 537 .size   x25519_fe64_mul,.-x25519_fe64_mul
 538
 539 .globl  x25519_fe64_sqr
     # x25519_fe64_sqr: dst = a*a in radix 2^64.
     #   %rdi - dst, four 64-bit limbs (pushed on the stack, restored by
     #          .Lreduce64)
     #   %rsi - a, four 64-bit limbs
     # The off-diagonal products a[i]*a[j], i<j, are summed first, then
     # doubled on the CF chain while the diagonal squares are added on the
     # OF chain.  Continues at .Lreduce64 with %rdi zero, %rdx equal to 38
     # and both carry chains clear.
 540 .type   x25519_fe64_sqr,\@function,2
 541 .align  32
 542 x25519_fe64_sqr:
 543         push    %rbp
 544         push    %rbx
 545         push    %r12
 546         push    %r13
 547         push    %r14
 548         push    %r15
 549         push    %rdi                    # offload dst
 550         lea     -8*2(%rsp),%rsp
 551
 552         mov     8*0(%rsi),%rdx          # a[0]
 553         mov     8*1(%rsi),%rcx          # a[1]
 554         mov     8*2(%rsi),%rbp          # a[2]
 555         mov     8*3(%rsi),%rsi          # a[3]
 556
 557         ################################################################
 558         mulx    %rdx,$acc0,$acc7        # a[0]*a[0]
 559         mulx    %rcx,$acc1,%rax         # a[0]*a[1]
 560         xor     %edi,%edi               # cf=0,of=0
 561         mulx    %rbp,$acc2,%rbx         # a[0]*a[2]
 562         adcx    %rax,$acc2
 563         mulx    %rsi,$acc3,$acc4        # a[0]*a[3]
 564          mov    %rcx,%rdx               # a[1]
 565         adcx    %rbx,$acc3
 566         adcx    %rdi,$acc4              # cf=0
 567
 568         ################################################################
 569         mulx    %rbp,%rax,%rbx          # a[1]*a[2]
 570         adox    %rax,$acc3
 571         adcx    %rbx,$acc4
 572         mulx    %rsi,%rax,$acc5         # a[1]*a[3]
 573          mov    %rbp,%rdx               # a[2]
 574         adox    %rax,$acc4
 575         adcx    %rdi,$acc5
 576
 577         ################################################################
 578         mulx    %rsi,%rax,$acc6         # a[2]*a[3]
 579          mov    %rcx,%rdx               # a[1]
 580         adox    %rax,$acc5
 581         adcx    %rdi,$acc6              # cf=0
 582         adox    %rdi,$acc6              # of=0
 583
 584          adcx   $acc1,$acc1             # acc1:6<<1
 585         adox    $acc7,$acc1
 586          adcx   $acc2,$acc2
 587         mulx    %rdx,%rax,%rbx          # a[1]*a[1]
 588          mov    %rbp,%rdx               # a[2]
 589          adcx   $acc3,$acc3
 590         adox    %rax,$acc2
 591          adcx   $acc4,$acc4
 592         adox    %rbx,$acc3
 593         mulx    %rdx,%rax,%rbx          # a[2]*a[2]
 594          mov    %rsi,%rdx               # a[3]
 595          adcx   $acc5,$acc5
 596         adox    %rax,$acc4
 597          adcx   $acc6,$acc6
 598         adox    %rbx,$acc5
 599         mulx    %rdx,%rax,$acc7         # a[3]*a[3]
 600          mov    \$38,%edx               # multiplier expected by .Lreduce64
 601         adox    %rax,$acc6
 602         adcx    %rdi,$acc7              # cf=0
 603         adox    %rdi,$acc7              # of=0
 604         jmp     .Lreduce64
 605
     # .Lreduce64: tail shared by x25519_fe64_mul and x25519_fe64_sqr.
     # Expects the 512-bit product in %r8-%r15, %rdx equal to 38, %rdi
     # zero, CF and OF clear, and dst saved at 8*2(%rsp).  Because 2^256 is
     # congruent to 38 modulo 2*(2^255-19), the upper four limbs are
     # multiplied by 38 and added to the lower four; the word carried out
     # of that sum is folded the same way, and so is one last carry.
 606 .align  32
 607 .Lreduce64:
 608         mulx    $acc4,%rax,%rbx
 609         adcx    %rax,$acc0
 610         adox    %rbx,$acc1
 611         mulx    $acc5,%rax,%rbx
 612         adcx    %rax,$acc1
 613         adox    %rbx,$acc2
 614         mulx    $acc6,%rax,%rbx
 615         adcx    %rax,$acc2
 616         adox    %rbx,$acc3
 617         mulx    $acc7,%rax,$acc4
 618         adcx    %rax,$acc3
 619         adox    %rdi,$acc4
 620         adcx    %rdi,$acc4
 621
 622         mov     8*2(%rsp),%rdi          # restore dst
 623         imulq   %rdx,$acc4              # carried-out word times 38
 624
 625         add     $acc4,$acc0
 626         adc     \$0,$acc1
 627         adc     \$0,$acc2
 628         adc     \$0,$acc3
 629
 630         sbb     %rax,%rax               # cf -> mask
 631         and     \$38,%rax
 632
 633         add     %rax,$acc0
 634         adc     \$0,$acc1
 635         mov     $acc0,8*0(%rdi)
 636         adc     \$0,$acc2
 637         mov     $acc1,8*1(%rdi)
 638         adc     \$0,$acc3
 639         mov     $acc2,8*2(%rdi)
 640         mov     $acc3,8*3(%rdi)
 641
 642         mov     8*3(%rsp),%r15          # saved registers sit above dst and two scratch slots
 643         mov     8*4(%rsp),%r14
 644         mov     8*5(%rsp),%r13
 645         mov     8*6(%rsp),%r12
 646         mov     8*7(%rsp),%rbx
 647         mov     8*8(%rsp),%rbp
 648         lea     8*9(%rsp),%rsp
 649         ret
 650 .size   x25519_fe64_sqr,.-x25519_fe64_sqr
 651
 652 .globl  x25519_fe64_mul121666
     # x25519_fe64_mul121666: dst = a*121666 in radix 2^64.
     #   %rdi - dst, four 64-bit limbs
     #   %rsi - a, four 64-bit limbs
     # The word carried out of the 4-limb product is multiplied by 38 and
     # added back, and a carry out of that addition is folded once more.
     # Only %rax, %rcx, %rdx and %r8-%r11 are written, nothing is pushed.
 653 .type   x25519_fe64_mul121666,\@function,2
 654 .align  32
 655 x25519_fe64_mul121666:
 656         mov     \$121666,%edx
 657         mulx    8*0(%rsi),$acc0,%rcx
 658         mulx    8*1(%rsi),$acc1,%rax
 659         add     %rcx,$acc1
 660         mulx    8*2(%rsi),$acc2,%rcx
 661         adc     %rax,$acc2
 662         mulx    8*3(%rsi),$acc3,%rax
 663         adc     %rcx,$acc3
 664         adc     \$0,%rax                # word carried out of the product
 665
 666         imulq   \$38,%rax,%rax
 667
 668         add     %rax,$acc0
 669         adc     \$0,$acc1
 670         adc     \$0,$acc2
 671         adc     \$0,$acc3
 672
 673         sbb     %rax,%rax               # cf -> mask
 674         and     \$38,%rax
 675
 676         add     %rax,$acc0
 677         adc     \$0,$acc1
 678         mov     $acc0,8*0(%rdi)
 679         adc     \$0,$acc2
 680         mov     $acc1,8*1(%rdi)
 681         adc     \$0,$acc3
 682         mov     $acc2,8*2(%rdi)
 683         mov     $acc3,8*3(%rdi)
 684
 685         ret
 686 .size   x25519_fe64_mul121666,.-x25519_fe64_mul121666
 687
 688 .globl  x25519_fe64_add
     # x25519_fe64_add: dst = a+b modulo 2*(2^255-19), radix 2^64.
     #   %rdi - dst, four 64-bit limbs
     #   %rsi - a, four 64-bit limbs
     #   %rdx - b, four 64-bit limbs
     # A carry out of bit 255 stands for 2^256, which is congruent to 38,
     # so 38 is added back.  That correction can itself carry when a+b is
     # within 38 of 2^257; the wrapped value is then below 38, so a second
     # +38 on the lowest limb finishes the job and cannot carry again.
 689 .type   x25519_fe64_add,\@function,3
 690 .align  32
 691 x25519_fe64_add:
 692         mov     8*0(%rsi),$acc0
 693         mov     8*1(%rsi),$acc1
 694         mov     8*2(%rsi),$acc2
 695         mov     8*3(%rsi),$acc3
 696
 697         add     8*0(%rdx),$acc0
 698         adc     8*1(%rdx),$acc1
 699         adc     8*2(%rdx),$acc2
 700         adc     8*3(%rdx),$acc3
 701
 702         sbb     %rax,%rax               # cf -> mask
 703         and     \$38,%rax
 704
 705         add     %rax,$acc0
 706         adc     \$0,$acc1
 708         adc     \$0,$acc2
 709         mov     $acc1,8*1(%rdi)
 710         adc     \$0,$acc3
 711         mov     $acc2,8*2(%rdi)
             sbb     %rax,%rax               # second cf -> mask (mov keeps flags)
 712         mov     $acc3,8*3(%rdi)
             and     \$38,%rax

             add     %rax,$acc0              # cannot carry, see above
             mov     $acc0,8*0(%rdi)
 713
 714         ret
 715 .size   x25519_fe64_add,.-x25519_fe64_add
 716
 717 .globl  x25519_fe64_sub
     # x25519_fe64_sub: dst = a-b modulo 2*(2^255-19), radix 2^64.
     #   %rdi - dst, four 64-bit limbs
     #   %rsi - a, four 64-bit limbs
     #   %rdx - b, four 64-bit limbs
     # A borrow out of bit 255 means 2^256 was implicitly added; since
     # 2^256 is congruent to 38, 38 is subtracted.  That correction can
     # itself borrow when the wrapped difference is below 38; the value is
     # then within 38 of 2^256, so a second -38 on the lowest limb finishes
     # the job and cannot borrow again.
 718 .type   x25519_fe64_sub,\@function,3
 719 .align  32
 720 x25519_fe64_sub:
 721         mov     8*0(%rsi),$acc0
 722         mov     8*1(%rsi),$acc1
 723         mov     8*2(%rsi),$acc2
 724         mov     8*3(%rsi),$acc3
 725
 726         sub     8*0(%rdx),$acc0
 727         sbb     8*1(%rdx),$acc1
 728         sbb     8*2(%rdx),$acc2
 729         sbb     8*3(%rdx),$acc3
 730
 731         sbb     %rax,%rax               # cf -> mask
 732         and     \$38,%rax
 733
 734         sub     %rax,$acc0
 735         sbb     \$0,$acc1
 737         sbb     \$0,$acc2
 738         mov     $acc1,8*1(%rdi)
 739         sbb     \$0,$acc3
 740         mov     $acc2,8*2(%rdi)
             sbb     %rax,%rax               # second cf -> mask (mov keeps flags)
 741         mov     $acc3,8*3(%rdi)
             and     \$38,%rax

             sub     %rax,$acc0              # cannot borrow, see above
             mov     $acc0,8*0(%rdi)
 742
 743         ret
 744 .size   x25519_fe64_sub,.-x25519_fe64_sub
 745
 746 .globl  x25519_fe64_tobytes
     # x25519_fe64_tobytes: dst = canonical representative of a modulo
     # 2^255-19, four 64-bit limbs.
     #   %rdi - dst
     #   %rsi - a, any 256-bit value
     # Step 1 clears bit 255 and adds 19 for it (2^255 is congruent to 19),
     # giving y below 2^255+19.  The same addition also adds a further 19,
     # so bit 255 of the sum tells whether y reached the modulus.
     # Step 2: if that bit is set the answer is the sum with the bit
     # cleared, i.e. y-(2^255-19); otherwise the extra 19 is taken back.
     # Folding bit 255 alone would leave inputs in the range 2^255-19
     # through 2^255-1 unreduced.
 747 .type   x25519_fe64_tobytes,\@function,2
 748 .align  32
 749 x25519_fe64_tobytes:
 750         mov     8*0(%rsi),$acc0
 751         mov     8*1(%rsi),$acc1
 752         mov     8*2(%rsi),$acc2
 753         mov     8*3(%rsi),$acc3
 754
 755         ################################# reduction modulo 2^255-19
 756         lea     ($acc3,$acc3),%rax
 757         sar     \$63,$acc3              # most significant bit -> mask
 758         shr     \$1,%rax                # most significant bit cleared
 759         and     \$19,$acc3
             add     \$19,$acc3              # compare to modulus in the same go
 760
 761         add     $acc3,$acc0
 762         adc     \$0,$acc1
 763         adc     \$0,$acc2
 764         adc     \$0,%rax
 765
 766         lea     (%rax,%rax),$acc3
 767         sar     \$63,%rax               # most significant bit -> mask
 768         shr     \$1,$acc3               # most significant bit cleared
             not     %rax                    # mask is set when y was below the modulus
 769         and     \$19,%rax
 770
 771         sub     %rax,$acc0              # take the extra 19 back if not needed
 772         sbb     \$0,$acc1
 773         sbb     \$0,$acc2
 774         sbb     \$0,$acc3
 775
 776         mov     $acc0,8*0(%rdi)
 777         mov     $acc1,8*1(%rdi)
 778         mov     $acc2,8*2(%rdi)
 779         mov     $acc3,8*3(%rdi)
 780
 781         ret
 782 .size   x25519_fe64_tobytes,.-x25519_fe64_tobytes
 783 ___
 784 } else {
     # The assembler cannot encode ADCX/ADOX/MULX: x25519_fe64_eligible
     # always returns 0 and the remaining fe64 symbols are defined only so
     # that references to them resolve.  They all alias one ud2, so calling
     # any of them traps.  Each label is defined exactly once; a repeated
     # label is rejected by the assembler.
 785 $code.=<<___;
 786 .globl  x25519_fe64_eligible
 787 .type   x25519_fe64_eligible,\@abi-omnipotent
 788 .align  32
 789 x25519_fe64_eligible:
 790         xor     %eax,%eax
 791         ret
 792 .size   x25519_fe64_eligible,.-x25519_fe64_eligible
 793
 794 .globl  x25519_fe64_mul
 795 .globl  x25519_fe64_sqr
 796 .globl  x25519_fe64_mul121666
 797 .globl  x25519_fe64_add
 798 .globl  x25519_fe64_sub
 799 .globl  x25519_fe64_tobytes
 800 x25519_fe64_mul:
 801 x25519_fe64_sqr:
 802 x25519_fe64_mul121666:
 803 x25519_fe64_add:
 804 x25519_fe64_sub:
 806 x25519_fe64_tobytes:
 807         .byte   0x0f,0x0b       # ud2
 808 ___
 809 }
 810 $code.=<<___;
 811 .asciz  "X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 812 ___
 813
     # Evaluate any back-quoted Perl expressions embedded in the assembly,
     # then send the result through the translator pipe.
 814 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 815 print $code;
     # Close the STDOUT handle itself (not an undefined scalar) so the pipe
     # is flushed and the translator's exit status is checked.
 816 close STDOUT or die "error closing STDOUT: $!";