2 # Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # X25519 lower-level primitives for x86_64.
20 # This module implements radix 2^51 multiplication and squaring, and
21 # radix 2^64 multiplication, squaring, addition, subtraction and final
22 # reduction. Latter radix is used on ADCX/ADOX-capable processors such
23 # as Broadwell. On related note one should mention that there are
24 # vector implementations that provide significantly better performance
25 # on some processors(*), but they are large and overly complex. Which
26 # in combination with them being effectively processor-specific makes
27 # the undertaking hard to justify. The goal for this implementation
28 # is rather versatility and simplicity [and ultimately formal
31 # (*) For example sandy2x should provide ~30% improvement on Sandy
32 # Bridge, but only nominal ~5% on Haswell [and big loss on
33 # Broadwell and successors].
35 ######################################################################
36 # Improvement coefficients:
38 # amd64-51(*) gcc-5.x(**)
41 # Sandy Bridge -3% +11%
43 # Broadwell(***) +26% +30%
44 # Skylake(***) +30% +47%
45 # Silvermont +20% +26%
48 # Ryzen(***) +35% +32%
51 # (*) amd64-51 is popular assembly implementation with 2^51 radix,
52 # only multiplication and squaring subroutines were linked
53 # for comparison, but not complete ladder step; gain on most
54 # processors is because this module refrains from shld, and
55 # minor regression on others is because this does result in
56 # higher instruction count;
57 # (**) compiler is free to inline functions, in assembly one would
58 # need to implement ladder step to do that, and it will improve
59 # performance by several percent;
60 # (***) ADCX/ADOX result for 2^64 radix, there is no corresponding
61 # C implementation, so that comparison is always against
# A first argument that contains a dot is taken to be the output file
# name; in that case no flavour was supplied.
66 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 targets are recognised either by flavour (nasm, masm, mingw64)
# or by an output file name ending in .asm.
68 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate x86_64-xlate.pl: first next to this script, then under
# ../../perlasm/ relative to it; give up if neither file exists.
70 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
71 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
72 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
73 die "can't locate x86_64-xlate.pl";
# Pipe everything written to OUT through the translator, run with the
# same perl binary ($^X).  The open is checked so that a failure to
# start the translator aborts generation instead of silently losing
# the output.
75 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
# Toolchain probing.  Each condition below pattern-matches the version
# banner printed by a tool.  The statements guarded by these conditions
# are not visible in this excerpt.
# NOTE(review): presumably they set $addx when the tool is recent enough
# to assemble ADCX/ADOX -- confirm against the full file.
#
# Probe 1: GNU assembler, queried through the compiler driver $ENV{CC}.
78 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
79 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
# Probe 2: NASM.  Tried only while $addx is still false, on Win64, and
# when either the flavour or $ENV{ASM} mentions nasm.
83 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
84 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
# Probe 3: Microsoft ml64.  Tried only while $addx is still false, on
# Win64, and when the flavour mentions masm or $ENV{ASM} mentions ml64.
88 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
89 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
# Probe 4: clang/LLVM, queried through "$ENV{CC} -v"; tried only while
# $addx is still false.  $2 captures the major and $3 the minor version.
93 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
# Fold major.minor into one number; the minor is divided by 100 so that
# a two-digit minor such as 10 still compares greater than 9.
94 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
# x25519_fe51_mul: radix 2^51 multiplication, declared with three
# arguments.  As used below, %rdi (1st argument) is parked on the stack
# and restored at the end, %rsi addresses f[0-4] and %rdx addresses
# g[0-4].  The partial products f[i]*g[j] are gathered into five 128-bit
# accumulators h0-h4 held in register pairs; products whose limb indices
# add up to five or more use a g[j] that was pre-multiplied by 19.
# Only part of the instruction stream is visible in this excerpt.
101 .globl x25519_fe51_mul
102 .type x25519_fe51_mul,\@function,3
113 mov 8*0(%rsi),%rax # f[0]
114 mov 8*0(%rdx),%r11 # load g[0-4]
120 mov %rdi,8*4(%rsp) # offload 1st argument
122 mulq %r11 # f[0]*g[0]
123 mov %r11,8*0(%rsp) # offload g[0]
124 mov %rax,%rbx # %rbx:%rcx = h0
127 mulq %r12 # f[0]*g[1]
128 mov %r12,8*1(%rsp) # offload g[1]
129 mov %rax,%r8 # %r8:%r9 = h1
131 lea (%r14,%r14,8),%r15 # %r15 = g[4]*9, first step of g[4]*19
133 mulq %r13 # f[0]*g[2]
134 mov %r13,8*2(%rsp) # offload g[2]
135 mov %rax,%r10 # %r10:%r11 = h2
137 lea (%r14,%r15,2),%rdi # g[4]*19
139 mulq %rbp # f[0]*g[3]
140 mov %rax,%r12 # %r12:%r13 = h3
141 mov 8*0(%rsi),%rax # f[0]
143 mulq %r14 # f[0]*g[4]
144 mov %rax,%r14 # %r14:%r15 = h4
145 mov 8*1(%rsi),%rax # f[1]
148 mulq %rdi # f[1]*g[4]*19
150 mov 8*2(%rsi),%rax # f[2]
152 mulq %rdi # f[2]*g[4]*19
154 mov 8*3(%rsi),%rax # f[3]
156 mulq %rdi # f[3]*g[4]*19
158 mov 8*4(%rsi),%rax # f[4]
160 mulq %rdi # f[4]*g[4]*19
161 imulq \$19,%rbp,%rdi # g[3]*19
163 mov 8*1(%rsi),%rax # f[1]
165 mulq %rbp # f[1]*g[3]
166 mov 8*2(%rsp),%rbp # g[2]
168 mov 8*2(%rsi),%rax # f[2]
171 mulq %rdi # f[2]*g[3]*19
173 mov 8*3(%rsi),%rax # f[3]
175 mulq %rdi # f[3]*g[3]*19
177 mov 8*4(%rsi),%rax # f[4]
179 mulq %rdi # f[4]*g[3]*19
180 imulq \$19,%rbp,%rdi # g[2]*19
182 mov 8*1(%rsi),%rax # f[1]
184 mulq %rbp # f[1]*g[2]
186 mov 8*2(%rsi),%rax # f[2]
188 mulq %rbp # f[2]*g[2]
189 mov 8*1(%rsp),%rbp # g[1]
191 mov 8*3(%rsi),%rax # f[3]
194 mulq %rdi # f[3]*g[2]*19
196 mov 8*4(%rsi),%rax # f[4]
198 mulq %rdi # f[4]*g[2]*19
200 mov 8*1(%rsi),%rax # f[1]
202 mulq %rbp # f[1]*g[1]
205 mov 8*2(%rsi),%rax # f[2]
207 mulq %rbp # f[2]*g[1]
209 mov 8*3(%rsi),%rax # f[3]
211 mulq %rbp # f[3]*g[1]
212 mov 8*0(%rsp),%rbp # g[0]
214 mov 8*4(%rsi),%rax # f[4]
217 mulq %rdi # f[4]*g[1]*19
219 mov 8*1(%rsi),%rax # f[1]
223 mov 8*2(%rsi),%rax # f[2]
227 mov 8*3(%rsi),%rax # f[3]
231 mov 8*4(%rsi),%rax # f[4]
233 mulq %rbp # f[4]*g[0]
237 mov 8*4(%rsp),%rdi # restore 1st argument
239 .size x25519_fe51_mul,.-x25519_fe51_mul
# x25519_fe51_sqr: radix 2^51 squaring, declared with two arguments.
# %rdi (1st argument) is parked on the stack and %rsi addresses g[0-4].
# As annotated per instruction, cross terms carry a factor of two and
# terms that wrap past the top limb carry a factor of 19.  The tail of
# the block masks every limb to 51 bits and propagates the carries.
# Only part of the instruction stream is visible in this excerpt.
241 .globl x25519_fe51_sqr
242 .type x25519_fe51_sqr,\@function,2
253 mov 8*0(%rsi),%rax # g[0]
254 mov 8*2(%rsi),%r15 # g[2]
255 mov 8*4(%rsi),%rbp # g[4]
257 mov %rdi,8*4(%rsp) # offload 1st argument
259 mulq %rax # g[0]*g[0]
261 mov 8*1(%rsi),%rax # g[1]
263 mulq %r14 # 2*g[0]*g[1]
266 mov %r15,8*0(%rsp) # offload g[2]
268 mulq %r14 # 2*g[0]*g[2]
272 imulq \$19,%rbp,%rdi # g[4]*19
273 mulq %r14 # 2*g[0]*g[3]
277 mulq %r14 # 2*g[0]*g[4]
282 mulq %rdi # g[4]*g[4]*19
284 mov 8*1(%rsi),%rax # g[1]
287 mov 8*3(%rsi),%rsi # g[3], overwrites the input pointer
289 mulq %rax # g[1]*g[1]
291 mov 8*0(%rsp),%rax # g[2]
293 mulq %rbp # 2*g[1]*g[2]
297 mulq %rsi # 2*g[1]*g[3]
301 imulq \$19,%rsi,%rbp # g[3]*19
302 mulq %rdi # 2*g[1]*g[4]*19
307 mulq %rdi # 2*g[3]*g[4]*19
311 mulq %rbp # g[3]*g[3]*19
313 mov 8*0(%rsp),%rax # g[2]
317 mulq %rax # g[2]*g[2]
321 mulq %rsi # 2*g[2]*g[3]*19
325 mulq %rdi # 2*g[2]*g[4]*19
329 mov 8*4(%rsp),%rdi # restore 1st argument
# Carry propagation: each 128-bit hN is split into its low 51 bits (gN)
# and the remainder, which is added to the next limb; the remainder of
# h4 is multiplied by 19 and folded back into g0.
334 mov \$0x7ffffffffffff,%rbp # %rbp = mask = 2^51-1
339 and %rbp,%rdx # %rdx = g2 = h2 & mask
340 or %r10,%r11 # h2>>51
342 adc \$0,%r13 # h3 += h2>>51
347 and %rbp,%rax # %rax = g0 = h0 & mask
348 or %rbx,%rcx # h0>>51
349 add %rcx,%r8 # h1 += h0>>51
355 and %rbp,%rbx # %rbx = g3 = h3 & mask
356 or %r12,%r13 # h3>>51
357 add %r13,%r14 # h4 += h3>>51
363 and %rbp,%rcx # %rcx = g1 = h1 & mask
365 add %r9,%rdx # g2 += h1>>51
370 and %rbp,%r10 # %r10 = g4 = h4 & mask
371 or %r14,%r15 # h4>>51
373 lea (%r15,%r15,8),%r14 # (h4>>51)*9
374 lea (%r15,%r14,2),%r15 # (h4>>51)*19
375 add %r15,%rax # g0 += (h4>>51)*19
378 and %rbp,%rdx # g2 &= mask
380 add %r8,%rbx # g3 += g2>>51
383 and %rbp,%rax # g0 &= mask
385 add %r9,%rcx # g1 += g0>>51
387 mov %rax,8*0(%rdi) # save the result
401 .size x25519_fe51_sqr,.-x25519_fe51_sqr
# x25519_fe51_mul121666: declared with two arguments.  The visible code
# collects five products into the register pairs h0-h4 and reloads the
# constant 121666 into %eax in between; the multiplications themselves
# and the remainder of the body lie outside this excerpt.
403 .globl x25519_fe51_mul121666
404 .type x25519_fe51_mul121666,\@function,2
406 x25519_fe51_mul121666:
417 mov %rax,%rbx # %rbx:%rcx = h0
421 mov %rax,%r8 # %r8:%r9 = h1
425 mov %rax,%r10 # %r10:%r11 = h2
429 mov %rax,%r12 # %r12:%r13 = h3
430 mov \$121666,%eax # reload the constant 121666
433 mov %rax,%r14 # %r14:%r15 = h4
437 .size x25519_fe51_mul121666,.-x25519_fe51_mul121666
439 ########################################################################
440 # Base 2^64 subroutines modulo 2*(2^255-19)
# Symbolic names for eight accumulators: acc0..acc7 map to %r8..%r15.
443 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15));
# x25519_fe64_eligible: reads a word of the externally defined
# OPENSSL_ia32cap_P.  The test applied to it is outside this excerpt.
# NOTE(review): presumably reports whether the ADCX/ADOX code path may
# be used -- confirm against the full file.
446 .extern OPENSSL_ia32cap_P
447 .globl x25519_fe64_eligible
448 .type x25519_fe64_eligible,\@abi-omnipotent
450 x25519_fe64_eligible:
451 mov OPENSSL_ia32cap_P+8(%rip),%ecx # 32-bit word at byte offset 8
457 .size x25519_fe64_eligible,.-x25519_fe64_eligible
# x25519_fe64_mul: radix 2^64 multiplication, declared with three
# arguments.  dst (%rdi) is pushed on the stack, %rsi addresses a[0-3]
# and b[0-3] is read through %rdx and %rax.  Every a[i] is loaded into
# %rdx, the implicit mulx multiplicand, and multiplied by all four b[j].
# %rdi is zeroed and then added with adcx (carry flag) and adox
# (overflow flag) to fold the pending carries into the top accumulator
# of each row.  b[2] is spilled to the stack because its register is
# later reused as an accumulator.
# Only part of the instruction stream is visible in this excerpt.
459 .globl x25519_fe64_mul
460 .type x25519_fe64_mul,\@function,3
469 push %rdi # offload dst
473 mov 8*0(%rdx),%rbp # b[0]
474 mov 8*0(%rsi),%rdx # a[0]
475 mov 8*1(%rax),%rcx # b[1]
476 mov 8*2(%rax),$acc6 # b[2]
477 mov 8*3(%rax),$acc7 # b[3]
479 mulx %rbp,$acc0,%rax # a[0]*b[0]
480 xor %edi,%edi # cf=0,of=0
481 mulx %rcx,$acc1,%rbx # a[0]*b[1]
483 mulx $acc6,$acc2,%rax # a[0]*b[2]
485 mulx $acc7,$acc3,$acc4 # a[0]*b[3]
486 mov 8*1(%rsi),%rdx # a[1]
488 mov $acc6,(%rsp) # offload b[2]
489 adcx %rdi,$acc4 # cf=0
491 mulx %rbp,%rax,%rbx # a[1]*b[0]
494 mulx %rcx,%rax,%rbx # a[1]*b[1]
497 mulx $acc6,%rax,%rbx # a[1]*b[2]
500 mulx $acc7,%rax,$acc5 # a[1]*b[3]
501 mov 8*2(%rsi),%rdx # a[2]
503 adcx %rdi,$acc5 # cf=0
504 adox %rdi,$acc5 # of=0
506 mulx %rbp,%rax,%rbx # a[2]*b[0]
509 mulx %rcx,%rax,%rbx # a[2]*b[1]
512 mulx $acc6,%rax,%rbx # a[2]*b[2]
515 mulx $acc7,%rax,$acc6 # a[2]*b[3]
516 mov 8*3(%rsi),%rdx # a[3]
518 adox %rdi,$acc6 # of=0
519 adcx %rdi,$acc6 # cf=0
521 mulx %rbp,%rax,%rbx # a[3]*b[0]
524 mulx %rcx,%rax,%rbx # a[3]*b[1]
527 mulx (%rsp),%rax,%rbx # a[3]*b[2], b[2] read back from the stack
530 mulx $acc7,%rax,$acc7 # a[3]*b[3]
533 adcx %rdi,$acc7 # cf=0
534 adox %rdi,$acc7 # of=0
537 .size x25519_fe64_mul,.-x25519_fe64_mul
# x25519_fe64_sqr: radix 2^64 squaring, declared with two arguments.
# dst (%rdi) is pushed on the stack and a[0-3] are loaded into
# registers, the last load overwriting the input pointer.  The visible
# code forms the cross products, doubles the accumulators (the adcx of
# a register with itself) and then adds the squares a[i]*a[i].  The
# tail restores dst and converts the final carry into a mask.
# Only part of the instruction stream is visible in this excerpt.
539 .globl x25519_fe64_sqr
540 .type x25519_fe64_sqr,\@function,2
549 push %rdi # offload dst
552 mov 8*0(%rsi),%rdx # a[0]
553 mov 8*1(%rsi),%rcx # a[1]
554 mov 8*2(%rsi),%rbp # a[2]
555 mov 8*3(%rsi),%rsi # a[3]
557 ################################################################
# products involving a[0]
558 mulx %rdx,$acc0,$acc7 # a[0]*a[0]
559 mulx %rcx,$acc1,%rax # a[0]*a[1]
560 xor %edi,%edi # cf=0,of=0
561 mulx %rbp,$acc2,%rbx # a[0]*a[2]
563 mulx %rsi,$acc3,$acc4 # a[0]*a[3]
566 adcx %rdi,$acc4 # cf=0
568 ################################################################
# cross products a[1]*a[2] and a[1]*a[3]
569 mulx %rbp,%rax,%rbx # a[1]*a[2]
572 mulx %rsi,%rax,$acc5 # a[1]*a[3]
577 ################################################################
# cross product a[2]*a[3], then doubling and the remaining squares
578 mulx %rsi,%rax,$acc6 # a[2]*a[3]
581 adcx %rdi,$acc6 # cf=0
582 adox %rdi,$acc6 # of=0
584 adcx $acc1,$acc1 # acc1:6<<1
587 mulx %rdx,%rax,%rbx # a[1]*a[1]
593 mulx %rdx,%rax,%rbx # a[2]*a[2]
599 mulx %rdx,%rax,$acc7 # a[3]*a[3]
602 adcx %rdi,$acc7 # cf=0
603 adox %rdi,$acc7 # of=0
617 mulx $acc7,%rax,$acc4 # acc7 times the value in %rdx (set outside this excerpt)
622 mov 8*2(%rsp),%rdi # restore dst
630 sbb %rax,%rax # cf -> mask
650 .size x25519_fe64_sqr,.-x25519_fe64_sqr
# x25519_fe64_mul121666: declared with two arguments.  The four limbs
# addressed by %rsi are each multiplied by the value in %rdx, the
# implicit mulx operand, which is loaded outside this excerpt.
# NOTE(review): presumably %rdx holds the constant 121666 -- confirm.
652 .globl x25519_fe64_mul121666
653 .type x25519_fe64_mul121666,\@function,2
655 x25519_fe64_mul121666:
657 mulx 8*0(%rsi),$acc0,%rcx # limb 0: low half to acc0, high half to %rcx
658 mulx 8*1(%rsi),$acc1,%rax # limb 1: low half to acc1, high half to %rax
660 mulx 8*2(%rsi),$acc2,%rcx # limb 2: low half to acc2, high half to %rcx
662 mulx 8*3(%rsi),$acc3,%rax # limb 3: low half to acc3, high half to %rax
673 sbb %rax,%rax # cf -> mask
686 .size x25519_fe64_mul121666,.-x25519_fe64_mul121666
# x25519_fe64_add: declared with three arguments.  Only one instruction
# of the body is visible here: it turns the carry flag into an all-ones
# or all-zeros mask in %rax.
688 .globl x25519_fe64_add
689 .type x25519_fe64_add,\@function,3
702 sbb %rax,%rax # cf -> mask
715 .size x25519_fe64_add,.-x25519_fe64_add
# x25519_fe64_sub: declared with three arguments.  Only one instruction
# of the body is visible here: it turns the carry (borrow) flag into an
# all-ones or all-zeros mask in %rax.
717 .globl x25519_fe64_sub
718 .type x25519_fe64_sub,\@function,3
731 sbb %rax,%rax # cf -> mask
744 .size x25519_fe64_sub,.-x25519_fe64_sub
# x25519_fe64_tobytes: declared with two arguments.  The visible part
# is the final reduction: the most significant bit of the top limb is
# split off twice, each time yielding a copy of the limb with that bit
# cleared and a sign-extended mask derived from that bit.  How the mask
# is applied lies outside this excerpt.
746 .globl x25519_fe64_tobytes
747 .type x25519_fe64_tobytes,\@function,2
755 ################################# reduction modulo 2^255-19
756 lea ($acc3,$acc3),%rax # %rax = top limb doubled, pushing its top bit out
757 sar \$63,$acc3 # most significant bit -> mask
758 shr \$1,%rax # most significant bit cleared
766 lea (%rax,%rax),$acc3 # same split again with the two registers swapped
767 sar \$63,%rax # most significant bit -> mask
768 shr \$1,$acc3 # most significant bit cleared
782 .size x25519_fe64_tobytes,.-x25519_fe64_tobytes
# Second set of definitions for the x25519_fe64_* symbols.
# NOTE(review): presumably emitted instead of the ADCX/ADOX code when
# the assembler cannot handle those instructions -- the selecting Perl
# conditional is outside this excerpt, confirm against the full file.
# The body of x25519_fe64_eligible is not visible here; the visible
# body of the remaining entry points is a single ud2 instruction.
786 .globl x25519_fe64_eligible
787 .type x25519_fe64_eligible,\@abi-omnipotent
789 x25519_fe64_eligible:
792 .size x25519_fe64_eligible,.-x25519_fe64_eligible
794 .globl x25519_fe64_mul
795 .type x25519_fe64_mul,\@abi-omnipotent
796 .globl x25519_fe64_sqr
797 .globl x25519_fe64_mul121666
798 .globl x25519_fe64_add
799 .globl x25519_fe64_sub
800 .globl x25519_fe64_tobytes
803 x25519_fe64_mul121666:
807 .byte 0x0f,0x0b # ud2, raises an invalid-opcode exception if executed
809 .size x25519_fe64_mul,.-x25519_fe64_mul
813 .asciz "X25519 primitives for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the generated text: every backtick-delimited span is
# replaced by the result of evaluating its contents as Perl.
816 $code =~ s/\`([^\`]*)\`/eval $1/gem;