# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module implements Poly1305 hash for x86_64.
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#		IALU/gcc-4.8(*)	AVX(**)		AVX2
#
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake	1.13/+120%	0.96		0.51
# Silvermont	2.83/+95%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors; in both cases we are comparing to
#	__int128 code;
# (**)	an SSE2 implementation was attempted, but among non-AVX
#	processors it was faster than the integer-only code only on
#	older Intel P4 and Core processors, by 30-50% (less so the
#	newer the processor), while being slower on contemporary ones,
#	for example almost 2x slower on Atom; as the former are
#	naturally disappearing, SSE2 is deemed unnecessary;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" $xlate $flavour $output";
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
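#
# A hedged C sketch of the same computation (illustration only, not
# part of the generated code; assumes GCC/Clang unsigned __int128):
#
#	typedef unsigned long long u64;
#	typedef unsigned __int128 u128;
#
#	static void iteration(u64 h[3], u64 r0, u64 r1, u64 s1)
#	{					/* s1 = r1 + (r1>>2) */
#		u128 d0 = (u128)h[0]*r0 + (u128)h[1]*s1;  /* 2^0 column */
#		u128 d1 = (u128)h[0]*r1 + (u128)h[1]*r0
#			+ (u128)h[2]*s1;		 /* 2^64 column */
#		u64  d2, t0, t1, c;
#
#		d1 += (u64)(d0>>64);
#		d2  = h[2]*r0 + (u64)(d1>>64);		/* 2^128 column */
#		t0  = (u64)d0;
#		t1  = (u64)d1;
#		c   = (d2 & ~(u64)3) + (d2>>2);	/* 5*(d2>>2), 2^130 = 5 */
#		h[0] = t0 + c;
#		h[1] = t1 + (h[0] < c);			/* carry */
#		h[2] = (d2 & 3) + (h[1] < t1);		/* carry */
#	}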
	mov	%rax,$h0		# future $h0
	mov	$h2,$h1			# borrow $h1
	imulq	$s1,$h1			# h2*s1
	imulq	$r0,$h2			# h2*r0
	mov	\$-4,%rax		# mask value
	and	$d3,%rax		# last reduction step
########################################################################
# Layout of opaque area is as follows:
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64
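#
# An equivalent C view, a sketch for orientation only (the struct name
# is hypothetical; offsets match the 0($ctx)/24($ctx) accesses below):
#
#	struct poly1305_internal {
#		unsigned __int64 h[3];	# bytes  0..23
#		unsigned __int64 r[2];	# bytes 24..39
#	};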
.extern	OPENSSL_ia32cap_P

.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.hidden	poly1305_emit

.type	poly1305_init,\@function,3
	mov	%rax,0($ctx)		# initialize hash value
	lea	poly1305_blocks(%rip),%r10
	lea	poly1305_emit(%rip),%r11
$code.=<<___	if ($avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
$code.=<<___	if ($avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
$code.=<<___	if ($flavour !~ /elf32/);
$code.=<<___	if ($flavour =~ /elf32/);
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,\@function,4
	jz	.Lno_data		# too short
	mov	$len,%r15		# reassign $len
	mov	24($ctx),$r0		# load r
	mov	0($ctx),$h0		# load hash value
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
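	# ($r1 has its low two bits clear thanks to key clamping, so
	#  s1 = r1 + (r1>>2) is exactly 5*r1/4; it stands in for r1
	#  wherever a partial product lands at 2^128 or above, since
	#  2^130 = 5 mod 2^130-5.)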
	add	0($inp),$h0		# accumulate input
&poly1305_iteration();
	mov	$h0,0($ctx)		# store hash value
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,\@function,3
	mov	0($ctx),%r8		# load hash value
	add	\$5,%r8			# compare to modulus
	shr	\$2,%r10		# did 130-bit value overflow?
	add	0($nonce),%rax		# accumulate nonce
	mov	%rax,0($mac)		# write result
.size	poly1305_emit,.-poly1305_emit
########################################################################
# Layout of opaque area is as follows:
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are the base 2^26 digits of the powers of the multiplier
# key. There are 5 digits, but the last four are interleaved with
# their multiples of 5, for a total of 9 elements:
# r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
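#
# In C terms the store pattern used by __poly1305_init_avx below is
# (a sketch; "table" names the r[9] array above, and "-64" compensates
# for the lea 48+64($ctx) size optimization):
#
#	table[n][0] = nth digit of r^2;		# 16*n+0-64($ctx)
#	table[n][1] = nth digit of r^1;		# 16*n+4-64($ctx)
#	table[n][2] = nth digit of r^4;		# 16*n+8-64($ctx)
#	table[n][3] = nth digit of r^3;		# 16*n+12-64($ctx)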
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
    map("%xmm$_",(0..15));

.type	__poly1305_block,\@abi-omnipotent
&poly1305_iteration();
.size	__poly1305_block,.-__poly1305_block

.type	__poly1305_init_avx,\@abi-omnipotent
	lea	48+64($ctx),$ctx	# size optimization
	call	__poly1305_block	# r^2
	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
	mov	%eax,`16*0+0-64`($ctx)
	mov	%edx,`16*0+4-64`($ctx)
	mov	%eax,`16*1+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*1+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*2+0-64`($ctx)
	mov	%edx,`16*2+4-64`($ctx)
	mov	%eax,`16*3+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*3+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*4+0-64`($ctx)
	mov	%edx,`16*4+4-64`($ctx)
	mov	%eax,`16*5+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*5+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*6+0-64`($ctx)
	mov	%edx,`16*6+4-64`($ctx)
	mov	$d1#d,`16*7+0-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d2#d,`16*7+4-64`($ctx)
	lea	($d2,$d2,4),$d2		# *5
	mov	$d1#d,`16*8+0-64`($ctx)
	mov	$d2#d,`16*8+4-64`($ctx)
	call	__poly1305_block	# r^3
	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
	mov	%eax,`16*0+12-64`($ctx)
	mov	%edx,`16*1+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%edx,`16*2+12-64`($ctx)
	mov	%eax,`16*3+12-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%eax,`16*4+12-64`($ctx)
	mov	%edx,`16*5+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%edx,`16*6+12-64`($ctx)
	mov	$d1#d,`16*7+12-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+12-64`($ctx)
	call	__poly1305_block	# r^4
	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
	mov	%eax,`16*0+8-64`($ctx)
	mov	%edx,`16*1+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%edx,`16*2+8-64`($ctx)
	mov	%eax,`16*3+8-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%eax,`16*4+8-64`($ctx)
	mov	%edx,`16*5+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%edx,`16*6+8-64`($ctx)
	mov	$d1#d,`16*7+8-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+8-64`($ctx)
	lea	-48-64($ctx),$ctx	# size [de-]optimization
.size	__poly1305_init_avx,.-__poly1305_init_avx

.type	poly1305_blocks_avx,\@function,4
	mov	20($ctx),%r8d		# is_base2_26
	mov	$len,%r15		# reassign $len
	mov	0($ctx),$d1		# load hash value
	mov	24($ctx),$r0		# load r
################################# base 2^26 -> base 2^64
	and	\$`-1*(1<<31)`,$d1
	mov	$d2,$r1			# borrow $r1
	and	\$`-1*(1<<31)`,$d2
	adc	\$0,$h2			# can be partially reduced...
	mov	\$-4,$d2		# ... so reduce
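	# (In C terms the repacking above computes, from base 2^26
	#  digits h[0..4] that may not be fully carried:
	#
	#	h0 = h[0] + ((u64)h[1]<<26) + ((u64)h[2]<<52);
	#	h1 = (h[2]>>12) + ((u64)h[3]<<14) + ((u64)h[4]<<40);
	#	h2 = h[4]>>24;
	#
	#  with cross-limb carries propagated by adc, hence the partial
	#  reduction that follows.)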
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	add	0($inp),$h0		# accumulate input
	call	__poly1305_block
	test	$padbit,$padbit		# if $padbit is zero,
	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format
################################# base 2^64 -> base 2^26
	and	\$0x3ffffff,%rax	# h[0]
	and	\$0x3ffffff,%rdx	# h[1]
	and	\$0x3ffffff,$h0		# h[2]
	and	\$0x3ffffff,$h1		# h[3]
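	# (C sketch of the digit split:
	#
	#	h[0] =  h0 & 0x3ffffff;
	#	h[1] = (h0>>26) & 0x3ffffff;
	#	h[2] = ((h0>>52) | (h1<<12)) & 0x3ffffff;
	#	h[3] = (h1>>14) & 0x3ffffff;
	#	h[4] = (h1>>40) | (h2<<24);
	# )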
	jz	.Lstore_base2_26_avx
.Lstore_base2_64_avx:
	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
.Lstore_base2_26_avx:
	mov	%rax#d,0($ctx)		# store hash value base 2^26
.Lblocks_avx_epilogue:
	mov	$len,%r15		# reassign $len
	mov	24($ctx),$r0		# load r
	mov	0($ctx),$h0		# load hash value
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	add	0($inp),$h0		# accumulate input
	call	__poly1305_block
################################# base 2^64 -> base 2^26
	and	\$0x3ffffff,%rax	# h[0]
	and	\$0x3ffffff,%rdx	# h[1]
	and	\$0x3ffffff,$h0		# h[2]
	and	\$0x3ffffff,$h1		# h[3]
	movl	\$1,20($ctx)		# set is_base2_26
	call	__poly1305_init_avx
.Lbase2_64_avx_epilogue:
	vmovd	4*0($ctx),$H0		# load hash value
$code.=<<___	if (!$win64);
$code.=<<___	if ($win64);
	vmovdqa	%xmm6,0x50(%r11)
	vmovdqa	%xmm7,0x60(%r11)
	vmovdqa	%xmm8,0x70(%r11)
	vmovdqa	%xmm9,0x80(%r11)
	vmovdqa	%xmm10,0x90(%r11)
	vmovdqa	%xmm11,0xa0(%r11)
	vmovdqa	%xmm12,0xb0(%r11)
	vmovdqa	%xmm13,0xc0(%r11)
	vmovdqa	%xmm14,0xd0(%r11)
	vmovdqa	%xmm15,0xe0(%r11)
	vmovdqu	`16*3`($ctx),$D4	# preload r0^2
	lea	`16*3+64`($ctx),$ctx	# size optimization
	lea	.Lconst(%rip),%rcx
################################################################
	vmovdqu	16*2($inp),$T0
	vmovdqu	16*3($inp),$T1
	vmovdqa	64(%rcx),$MASK		# .Lmask26
	vpsrldq	\$6,$T0,$T2		# splat input
	vpunpckhqdq	$T1,$T0,$T4	# 4
	vpunpcklqdq	$T1,$T0,$T0	# 0:1
	vpunpcklqdq	$T3,$T2,$T3	# 2:3
	vpsrlq	\$40,$T4,$T4		# 4
	vpand	$MASK,$T0,$T0		# 0
	vpand	$MASK,$T1,$T1		# 1
	vpand	$MASK,$T2,$T2		# 2
	vpand	$MASK,$T3,$T3		# 3
	vpor	32(%rcx),$T4,$T4	# padbit, yes, always
	# expand and copy pre-calculated table to stack
	vmovdqu	`16*1-64`($ctx),$D1
	vmovdqu	`16*2-64`($ctx),$D2
	vpshufd	\$0xEE,$D4,$D3		# 34xx -> 3434
	vpshufd	\$0x44,$D4,$D0		# xx12 -> 1212
	vmovdqa	$D3,-0x90(%r11)
	vmovdqa	$D0,0x00(%rsp)
	vpshufd	\$0xEE,$D1,$D4
	vmovdqu	`16*3-64`($ctx),$D0
	vpshufd	\$0x44,$D1,$D1
	vmovdqa	$D4,-0x80(%r11)
	vmovdqa	$D1,0x10(%rsp)
	vpshufd	\$0xEE,$D2,$D3
	vmovdqu	`16*4-64`($ctx),$D1
	vpshufd	\$0x44,$D2,$D2
	vmovdqa	$D3,-0x70(%r11)
	vmovdqa	$D2,0x20(%rsp)
	vpshufd	\$0xEE,$D0,$D4
	vmovdqu	`16*5-64`($ctx),$D2
	vpshufd	\$0x44,$D0,$D0
	vmovdqa	$D4,-0x60(%r11)
	vmovdqa	$D0,0x30(%rsp)
	vpshufd	\$0xEE,$D1,$D3
	vmovdqu	`16*6-64`($ctx),$D0
	vpshufd	\$0x44,$D1,$D1
	vmovdqa	$D3,-0x50(%r11)
	vmovdqa	$D1,0x40(%rsp)
	vpshufd	\$0xEE,$D2,$D4
	vmovdqu	`16*7-64`($ctx),$D1
	vpshufd	\$0x44,$D2,$D2
	vmovdqa	$D4,-0x40(%r11)
	vmovdqa	$D2,0x50(%rsp)
	vpshufd	\$0xEE,$D0,$D3
	vmovdqu	`16*8-64`($ctx),$D2
	vpshufd	\$0x44,$D0,$D0
	vmovdqa	$D3,-0x30(%r11)
	vmovdqa	$D0,0x60(%rsp)
	vpshufd	\$0xEE,$D1,$D4
	vpshufd	\$0x44,$D1,$D1
	vmovdqa	$D4,-0x20(%r11)
	vmovdqa	$D1,0x70(%rsp)
	vpshufd	\$0xEE,$D2,$D3
	vmovdqa	0x00(%rsp),$D4		# preload r0^2
	vpshufd	\$0x44,$D2,$D2
	vmovdqa	$D3,-0x10(%r11)
	vmovdqa	$D2,0x80(%rsp)
################################################################
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
#   \___________________/
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
#   \___________________/ \____________________/
# Note that we start with inp[2:3]*r^2. This is because it
# doesn't depend on reduction in previous iteration.
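#
# For example, with just four blocks m0..m3 the two lanes evaluate
# (m0*r^2 + m2)*r^2 and (m1*r^2 + m3)*r, and their sum is
# m0*r^4 + m1*r^3 + m2*r^2 + m3*r, i.e. exactly the scalar Horner
# result for the same four blocks.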
################################################################
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
#
# though note that $Tx and $Hx are "reversed" in this section,
# and $D4 is preloaded with r0^2...
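#
# Equivalently in C, with s_i = 5*r_i (a sketch; each product is a
# 32x32->64-bit vpmuludq):
#
#	d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1;
#	d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2;
#	d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3;
#	d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4;
#	d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0;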
	vpmuludq	$T0,$D4,$D0	# d0 = h0*r0
	vpmuludq	$T1,$D4,$D1	# d1 = h1*r0
	vmovdqa	$H2,0x20(%r11)		# offload hash
	vpmuludq	$T2,$D4,$D2	# d2 = h2*r0
	vmovdqa	0x10(%rsp),$H2		# r1^2
	vpmuludq	$T3,$D4,$D3	# d3 = h3*r0
	vpmuludq	$T4,$D4,$D4	# d4 = h4*r0
	vmovdqa	$H0,0x00(%r11)		#
	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
	vmovdqa	$H1,0x10(%r11)		#
	vpmuludq	$T3,$H2,$H1	# h3*r1
	vpaddq	$H0,$D0,$D0		# d0 += h4*s1
	vpaddq	$H1,$D4,$D4		# d4 += h3*r1
	vmovdqa	$H3,0x30(%r11)		#
	vpmuludq	$T2,$H2,$H0	# h2*r1
	vpmuludq	$T1,$H2,$H1	# h1*r1
	vpaddq	$H0,$D3,$D3		# d3 += h2*r1
	vmovdqa	0x30(%rsp),$H3		# r2^2
	vpaddq	$H1,$D2,$D2		# d2 += h1*r1
	vmovdqa	$H4,0x40(%r11)		#
	vpmuludq	$T0,$H2,$H2	# h0*r1
	vpmuludq	$T2,$H3,$H0	# h2*r2
	vpaddq	$H2,$D1,$D1		# d1 += h0*r1
	vmovdqa	0x40(%rsp),$H4		# s2^2
	vpaddq	$H0,$D4,$D4		# d4 += h2*r2
	vpmuludq	$T1,$H3,$H1	# h1*r2
	vpmuludq	$T0,$H3,$H3	# h0*r2
	vpaddq	$H1,$D3,$D3		# d3 += h1*r2
	vmovdqa	0x50(%rsp),$H2		# r3^2
	vpaddq	$H3,$D2,$D2		# d2 += h0*r2
	vpmuludq	$T4,$H4,$H0	# h4*s2
	vpmuludq	$T3,$H4,$H4	# h3*s2
	vpaddq	$H0,$D1,$D1		# d1 += h4*s2
	vmovdqa	0x60(%rsp),$H3		# s3^2
	vpaddq	$H4,$D0,$D0		# d0 += h3*s2
	vmovdqa	0x80(%rsp),$H4		# s4^2
	vpmuludq	$T1,$H2,$H1	# h1*r3
	vpmuludq	$T0,$H2,$H2	# h0*r3
	vpaddq	$H1,$D4,$D4		# d4 += h1*r3
	vpaddq	$H2,$D3,$D3		# d3 += h0*r3
	vpmuludq	$T4,$H3,$H0	# h4*s3
	vpmuludq	$T3,$H3,$H1	# h3*s3
	vpaddq	$H0,$D2,$D2		# d2 += h4*s3
	vmovdqu	16*0($inp),$H0		# load input
	vpaddq	$H1,$D1,$D1		# d1 += h3*s3
	vpmuludq	$T2,$H3,$H3	# h2*s3
	vpmuludq	$T2,$H4,$T2	# h2*s4
	vpaddq	$H3,$D0,$D0		# d0 += h2*s3
	vmovdqu	16*1($inp),$H1		#
	vpaddq	$T2,$D1,$D1		# d1 += h2*s4
	vpmuludq	$T3,$H4,$T3	# h3*s4
	vpmuludq	$T4,$H4,$T4	# h4*s4
	vpsrldq	\$6,$H0,$H2		# splat input
	vpaddq	$T3,$D2,$D2		# d2 += h3*s4
	vpaddq	$T4,$D3,$D3		# d3 += h4*s4
	vpsrldq	\$6,$H1,$H3		#
	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
	vpmuludq	$T1,$H4,$T0	# h1*s4
	vpunpckhqdq	$H1,$H0,$H4	# 4
	vpaddq	$T4,$D4,$D4		# d4 += h0*r4
	vmovdqa	-0x90(%r11),$T4		# r0^4
	vpaddq	$T0,$D0,$D0		# d0 += h1*s4
	vpunpcklqdq	$H1,$H0,$H0	# 0:1
	vpunpcklqdq	$H3,$H2,$H3	# 2:3
	#vpsrlq	\$40,$H4,$H4		# 4
	vpsrldq	\$`40/8`,$H4,$H4	# 4
	vpand	$MASK,$H0,$H0		# 0
	vpand	$MASK,$H1,$H1		# 1
	vpand	0(%rcx),$H4,$H4		# .Lmask24
	vpand	$MASK,$H2,$H2		# 2
	vpand	$MASK,$H3,$H3		# 3
	vpor	32(%rcx),$H4,$H4	# padbit, yes, always
	vpaddq	0x00(%r11),$H0,$H0	# add hash value
	vpaddq	0x10(%r11),$H1,$H1
	vpaddq	0x20(%r11),$H2,$H2
	vpaddq	0x30(%r11),$H3,$H3
	vpaddq	0x40(%r11),$H4,$H4
################################################################
# Now we accumulate (inp[0:1]+hash)*r^4
################################################################
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	vpmuludq	$H0,$T4,$T0	# h0*r0
	vpmuludq	$H1,$T4,$T1	# h1*r0
	vmovdqa	-0x80(%r11),$T2		# r1^4
	vpmuludq	$H2,$T4,$T0	# h2*r0
	vpmuludq	$H3,$T4,$T1	# h3*r0
	vpmuludq	$H4,$T4,$T4	# h4*r0
	vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
	vpaddq	$T0,$D0,$D0		# d0 += h4*s1
	vpmuludq	$H2,$T2,$T1	# h2*r1
	vpmuludq	$H3,$T2,$T0	# h3*r1
	vpaddq	$T1,$D3,$D3		# d3 += h2*r1
	vmovdqa	-0x60(%r11),$T3		# r2^4
	vpaddq	$T0,$D4,$D4		# d4 += h3*r1
	vpmuludq	$H1,$T2,$T1	# h1*r1
	vpmuludq	$H0,$T2,$T2	# h0*r1
	vpaddq	$T1,$D2,$D2		# d2 += h1*r1
	vpaddq	$T2,$D1,$D1		# d1 += h0*r1
	vmovdqa	-0x50(%r11),$T4		# s2^4
	vpmuludq	$H2,$T3,$T0	# h2*r2
	vpmuludq	$H1,$T3,$T1	# h1*r2
	vpaddq	$T0,$D4,$D4		# d4 += h2*r2
	vpaddq	$T1,$D3,$D3		# d3 += h1*r2
	vmovdqa	-0x40(%r11),$T2		# r3^4
	vpmuludq	$H0,$T3,$T3	# h0*r2
	vpmuludq	$H4,$T4,$T0	# h4*s2
	vpaddq	$T3,$D2,$D2		# d2 += h0*r2
	vpaddq	$T0,$D1,$D1		# d1 += h4*s2
	vmovdqa	-0x30(%r11),$T3		# s3^4
	vpmuludq	$H3,$T4,$T4	# h3*s2
	vpmuludq	$H1,$T2,$T1	# h1*r3
	vpaddq	$T4,$D0,$D0		# d0 += h3*s2
	vmovdqa	-0x10(%r11),$T4		# s4^4
	vpaddq	$T1,$D4,$D4		# d4 += h1*r3
	vpmuludq	$H0,$T2,$T2	# h0*r3
	vpmuludq	$H4,$T3,$T0	# h4*s3
	vpaddq	$T2,$D3,$D3		# d3 += h0*r3
	vpaddq	$T0,$D2,$D2		# d2 += h4*s3
	vmovdqu	16*2($inp),$T0		# load input
	vpmuludq	$H3,$T3,$T2	# h3*s3
	vpmuludq	$H2,$T3,$T3	# h2*s3
	vpaddq	$T2,$D1,$D1		# d1 += h3*s3
	vmovdqu	16*3($inp),$T1		#
	vpaddq	$T3,$D0,$D0		# d0 += h2*s3
	vpmuludq	$H2,$T4,$H2	# h2*s4
	vpmuludq	$H3,$T4,$H3	# h3*s4
	vpsrldq	\$6,$T0,$T2		# splat input
	vpaddq	$H2,$D1,$D1		# d1 += h2*s4
	vpmuludq	$H4,$T4,$H4	# h4*s4
	vpsrldq	\$6,$T1,$T3		#
	vpaddq	$H3,$D2,$H2		# h2 = d2 + h3*s4
	vpaddq	$H4,$D3,$H3		# h3 = d3 + h4*s4
	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4
	vpunpckhqdq	$T1,$T0,$T4	# 4
	vpaddq	$H4,$D4,$H4		# h4 = d4 + h0*r4
	vpaddq	$H0,$D0,$H0		# h0 = d0 + h1*s4
	vpunpcklqdq	$T1,$T0,$T0	# 0:1
	vpunpcklqdq	$T3,$T2,$T3	# 2:3
	#vpsrlq	\$40,$T4,$T4		# 4
	vpsrldq	\$`40/8`,$T4,$T4	# 4
	vmovdqa	0x00(%rsp),$D4		# preload r0^2
	vpand	$MASK,$T0,$T0		# 0
	vpand	$MASK,$T1,$T1		# 1
	vpand	0(%rcx),$T4,$T4		# .Lmask24
	vpand	$MASK,$T2,$T2		# 2
	vpand	$MASK,$T3,$T3		# 3
	vpor	32(%rcx),$T4,$T4	# padbit, yes, always
################################################################
# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
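#
# (scalar sketch of the carry schedule below, M = 0x3ffffff:
#
#	c = d3>>26; h3 = d3 & M; h4 += c;	# h3 -> h4
#	c = d0>>26; h0 = d0 & M; h1  = d1 + c;	# h0 -> h1
#	c = h1>>26; h1 &= M;     h2 += c;	# h1 -> h2
#	c = h4>>26; h4 &= M;     h0 += c*5;	# h4 -> h0, 2^130 = 5
#	c = h2>>26; h2 &= M;     h3 += c;	# h2 -> h3
#	c = h0>>26; h0 &= M;     h1 += c;	# h0 -> h1
#	c = h3>>26; h3 &= M;     h4 += c;	# h3 -> h4
#
#  the second h0->h1 and h3->h4 passes mop up the carries generated
#  by the h4->h0 and h2->h3 steps.)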
	vpaddq	$D3,$H4,$H4		# h3 -> h4
	vpaddq	$D0,$D1,$H1		# h0 -> h1
	vpaddq	$D1,$H2,$H2		# h1 -> h2
	vpaddq	$D0,$H0,$H0		# h4 -> h0
	vpaddq	$D2,$H3,$H3		# h2 -> h3
	vpaddq	$D0,$H1,$H1		# h0 -> h1
	vpaddq	$D3,$H4,$H4		# h3 -> h4
################################################################
# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
	vpshufd	\$0x10,$D4,$D4		# r0^n, xx12 -> x1x2
	vmovdqa	$H2,0x20(%r11)
	vmovdqa	$H0,0x00(%r11)
	vmovdqa	$H1,0x10(%r11)
	vmovdqa	$H3,0x30(%r11)
	vmovdqa	$H4,0x40(%r11)
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	vpmuludq	$T2,$D4,$D2	# d2 = h2*r0
	vpmuludq	$T0,$D4,$D0	# d0 = h0*r0
	vpshufd	\$0x10,`16*1-64`($ctx),$H2	# r1^n
	vpmuludq	$T1,$D4,$D1	# d1 = h1*r0
	vpmuludq	$T3,$D4,$D3	# d3 = h3*r0
	vpmuludq	$T4,$D4,$D4	# d4 = h4*r0
	vpmuludq	$T3,$H2,$H0	# h3*r1
	vpaddq	$H0,$D4,$D4		# d4 += h3*r1
	vpshufd	\$0x10,`16*2-64`($ctx),$H3	# s1^n
	vpmuludq	$T2,$H2,$H1	# h2*r1
	vpaddq	$H1,$D3,$D3		# d3 += h2*r1
	vpshufd	\$0x10,`16*3-64`($ctx),$H4	# r2^n
	vpmuludq	$T1,$H2,$H0	# h1*r1
	vpaddq	$H0,$D2,$D2		# d2 += h1*r1
	vpmuludq	$T0,$H2,$H2	# h0*r1
	vpaddq	$H2,$D1,$D1		# d1 += h0*r1
	vpmuludq	$T4,$H3,$H3	# h4*s1
	vpaddq	$H3,$D0,$D0		# d0 += h4*s1
	vpshufd	\$0x10,`16*4-64`($ctx),$H2	# s2^n
	vpmuludq	$T2,$H4,$H1	# h2*r2
	vpaddq	$H1,$D4,$D4		# d4 += h2*r2
	vpmuludq	$T1,$H4,$H0	# h1*r2
	vpaddq	$H0,$D3,$D3		# d3 += h1*r2
	vpshufd	\$0x10,`16*5-64`($ctx),$H3	# r3^n
	vpmuludq	$T0,$H4,$H4	# h0*r2
	vpaddq	$H4,$D2,$D2		# d2 += h0*r2
	vpmuludq	$T4,$H2,$H1	# h4*s2
	vpaddq	$H1,$D1,$D1		# d1 += h4*s2
	vpshufd	\$0x10,`16*6-64`($ctx),$H4	# s3^n
	vpmuludq	$T3,$H2,$H2	# h3*s2
	vpaddq	$H2,$D0,$D0		# d0 += h3*s2
	vpmuludq	$T1,$H3,$H0	# h1*r3
	vpaddq	$H0,$D4,$D4		# d4 += h1*r3
	vpmuludq	$T0,$H3,$H3	# h0*r3
	vpaddq	$H3,$D3,$D3		# d3 += h0*r3
	vpshufd	\$0x10,`16*7-64`($ctx),$H2	# r4^n
	vpmuludq	$T4,$H4,$H1	# h4*s3
	vpaddq	$H1,$D2,$D2		# d2 += h4*s3
	vpshufd	\$0x10,`16*8-64`($ctx),$H3	# s4^n
	vpmuludq	$T3,$H4,$H0	# h3*s3
	vpaddq	$H0,$D1,$D1		# d1 += h3*s3
	vpmuludq	$T2,$H4,$H4	# h2*s3
	vpaddq	$H4,$D0,$D0		# d0 += h2*s3
	vpmuludq	$T0,$H2,$H2	# h0*r4
	vpaddq	$H2,$D4,$D4		# h4 = d4 + h0*r4
	vpmuludq	$T4,$H3,$H1	# h4*s4
	vpaddq	$H1,$D3,$D3		# h3 = d3 + h4*s4
	vpmuludq	$T3,$H3,$H0	# h3*s4
	vpaddq	$H0,$D2,$D2		# h2 = d2 + h3*s4
	vpmuludq	$T2,$H3,$H1	# h2*s4
	vpaddq	$H1,$D1,$D1		# h1 = d1 + h2*s4
	vpmuludq	$T1,$H3,$H3	# h1*s4
	vpaddq	$H3,$D0,$D0		# h0 = d0 + h1*s4
	vmovdqu	16*0($inp),$H0		# load input
	vmovdqu	16*1($inp),$H1
	vpsrldq	\$6,$H0,$H2		# splat input
	vpunpckhqdq	$H1,$H0,$H4	# 4
	vpunpcklqdq	$H1,$H0,$H0	# 0:1
	vpunpcklqdq	$H3,$H2,$H3	# 2:3
	vpsrlq	\$40,$H4,$H4		# 4
	vpand	$MASK,$H0,$H0		# 0
	vpand	$MASK,$H1,$H1		# 1
	vpand	$MASK,$H2,$H2		# 2
	vpand	$MASK,$H3,$H3		# 3
	vpor	32(%rcx),$H4,$H4	# padbit, yes, always
	vpshufd	\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
	vpaddq	0x00(%r11),$H0,$H0
	vpaddq	0x10(%r11),$H1,$H1
	vpaddq	0x20(%r11),$H2,$H2
	vpaddq	0x30(%r11),$H3,$H3
	vpaddq	0x40(%r11),$H4,$H4
################################################################
# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
	vpmuludq	$H0,$T4,$T0	# h0*r0
	vpaddq	$T0,$D0,$D0		# d0 += h0*r0
	vpmuludq	$H1,$T4,$T1	# h1*r0
	vpaddq	$T1,$D1,$D1		# d1 += h1*r0
	vpmuludq	$H2,$T4,$T0	# h2*r0
	vpaddq	$T0,$D2,$D2		# d2 += h2*r0
	vpshufd	\$0x32,`16*1-64`($ctx),$T2	# r1^n
	vpmuludq	$H3,$T4,$T1	# h3*r0
	vpaddq	$T1,$D3,$D3		# d3 += h3*r0
	vpmuludq	$H4,$T4,$T4	# h4*r0
	vpaddq	$T4,$D4,$D4		# d4 += h4*r0
	vpmuludq	$H3,$T2,$T0	# h3*r1
	vpaddq	$T0,$D4,$D4		# d4 += h3*r1
	vpshufd	\$0x32,`16*2-64`($ctx),$T3	# s1
	vpmuludq	$H2,$T2,$T1	# h2*r1
	vpaddq	$T1,$D3,$D3		# d3 += h2*r1
	vpshufd	\$0x32,`16*3-64`($ctx),$T4	# r2
	vpmuludq	$H1,$T2,$T0	# h1*r1
	vpaddq	$T0,$D2,$D2		# d2 += h1*r1
	vpmuludq	$H0,$T2,$T2	# h0*r1
	vpaddq	$T2,$D1,$D1		# d1 += h0*r1
	vpmuludq	$H4,$T3,$T3	# h4*s1
	vpaddq	$T3,$D0,$D0		# d0 += h4*s1
	vpshufd	\$0x32,`16*4-64`($ctx),$T2	# s2
	vpmuludq	$H2,$T4,$T1	# h2*r2
	vpaddq	$T1,$D4,$D4		# d4 += h2*r2
	vpmuludq	$H1,$T4,$T0	# h1*r2
	vpaddq	$T0,$D3,$D3		# d3 += h1*r2
	vpshufd	\$0x32,`16*5-64`($ctx),$T3	# r3
	vpmuludq	$H0,$T4,$T4	# h0*r2
	vpaddq	$T4,$D2,$D2		# d2 += h0*r2
	vpmuludq	$H4,$T2,$T1	# h4*s2
	vpaddq	$T1,$D1,$D1		# d1 += h4*s2
	vpshufd	\$0x32,`16*6-64`($ctx),$T4	# s3
	vpmuludq	$H3,$T2,$T2	# h3*s2
	vpaddq	$T2,$D0,$D0		# d0 += h3*s2
	vpmuludq	$H1,$T3,$T0	# h1*r3
	vpaddq	$T0,$D4,$D4		# d4 += h1*r3
	vpmuludq	$H0,$T3,$T3	# h0*r3
	vpaddq	$T3,$D3,$D3		# d3 += h0*r3
	vpshufd	\$0x32,`16*7-64`($ctx),$T2	# r4
	vpmuludq	$H4,$T4,$T1	# h4*s3
	vpaddq	$T1,$D2,$D2		# d2 += h4*s3
	vpshufd	\$0x32,`16*8-64`($ctx),$T3	# s4
	vpmuludq	$H3,$T4,$T0	# h3*s3
	vpaddq	$T0,$D1,$D1		# d1 += h3*s3
	vpmuludq	$H2,$T4,$T4	# h2*s3
	vpaddq	$T4,$D0,$D0		# d0 += h2*s3
	vpmuludq	$H0,$T2,$T2	# h0*r4
	vpaddq	$T2,$D4,$D4		# d4 += h0*r4
	vpmuludq	$H4,$T3,$T1	# h4*s4
	vpaddq	$T1,$D3,$D3		# d3 += h4*s4
	vpmuludq	$H3,$T3,$T0	# h3*s4
	vpaddq	$T0,$D2,$D2		# d2 += h3*s4
	vpmuludq	$H2,$T3,$T1	# h2*s4
	vpaddq	$T1,$D1,$D1		# d1 += h2*s4
	vpmuludq	$H1,$T3,$T3	# h1*s4
	vpaddq	$T3,$D0,$D0		# d0 += h1*s4
################################################################
# horizontal addition
################################################################
	vpaddq	$H3,$D4,$D4		# h3 -> h4
	vpaddq	$H0,$D1,$D1		# h0 -> h1
	vpaddq	$H1,$D2,$D2		# h1 -> h2
	vpaddq	$H4,$D0,$D0		# h4 -> h0
	vpaddq	$H2,$D3,$D3		# h2 -> h3
	vpaddq	$H0,$D1,$D1		# h0 -> h1
	vpaddq	$H3,$D4,$D4		# h3 -> h4
	vmovd	$D0,`4*0-48-64`($ctx)	# save partially reduced
	vmovd	$D1,`4*1-48-64`($ctx)
	vmovd	$D2,`4*2-48-64`($ctx)
	vmovd	$D3,`4*3-48-64`($ctx)
	vmovd	$D4,`4*4-48-64`($ctx)
$code.=<<___	if ($win64);
	vmovdqa	0x50(%r11),%xmm6
	vmovdqa	0x60(%r11),%xmm7
	vmovdqa	0x70(%r11),%xmm8
	vmovdqa	0x80(%r11),%xmm9
	vmovdqa	0x90(%r11),%xmm10
	vmovdqa	0xa0(%r11),%xmm11
	vmovdqa	0xb0(%r11),%xmm12
	vmovdqa	0xc0(%r11),%xmm13
	vmovdqa	0xd0(%r11),%xmm14
	vmovdqa	0xe0(%r11),%xmm15
$code.=<<___	if (!$win64);
.size	poly1305_blocks_avx,.-poly1305_blocks_avx

.type	poly1305_emit_avx,\@function,3
	cmpl	\$0,20($ctx)		# is_base2_26?
	mov	0($ctx),%eax		# load hash value base 2^26
	shl	\$26,%rcx		# base 2^26 -> base 2^64
	mov	%r10,%rax		# could be partially reduced, so reduce
	add	\$5,%r8			# compare to modulus
	shr	\$2,%r10		# did 130-bit value overflow?
	add	0($nonce),%rax		# accumulate nonce
	mov	%rax,0($mac)		# write result
.size	poly1305_emit_avx,.-poly1305_emit_avx
my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
    map("%ymm$_",(0..15));
my $S4=$MASK;	# can be condensed; $S4 holds the s4 digits used below
.type	poly1305_blocks_avx2,\@function,4
poly1305_blocks_avx2:
	mov	20($ctx),%r8d		# is_base2_26
	mov	$len,%r15		# reassign $len
	mov	0($ctx),$d1		# load hash value
	mov	24($ctx),$r0		# load r
################################# base 2^26 -> base 2^64
	and	\$`-1*(1<<31)`,$d1
	mov	$d2,$r1			# borrow $r1
	and	\$`-1*(1<<31)`,$d2
	adc	\$0,$h2			# can be partially reduced...
	mov	\$-4,$d2		# ... so reduce
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

.Lbase2_26_pre_avx2:
	add	0($inp),$h0		# accumulate input
	call	__poly1305_block
	jnz	.Lbase2_26_pre_avx2
	test	$padbit,$padbit		# if $padbit is zero,
	jz	.Lstore_base2_64_avx2	# store hash in base 2^64 format
################################# base 2^64 -> base 2^26
	and	\$0x3ffffff,%rax	# h[0]
	and	\$0x3ffffff,%rdx	# h[1]
	and	\$0x3ffffff,$h0		# h[2]
	and	\$0x3ffffff,$h1		# h[3]
	jz	.Lstore_base2_26_avx2
.Lstore_base2_64_avx2:
	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
.Lstore_base2_26_avx2:
	mov	%rax#d,0($ctx)		# store hash value base 2^26
.Lblocks_avx2_epilogue:
.Lbase2_64_avx2_body:
	mov	$len,%r15		# reassign $len
	mov	24($ctx),$r0		# load r
	mov	0($ctx),$h0		# load hash value
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

.Lbase2_64_pre_avx2:
	add	0($inp),$h0		# accumulate input
	call	__poly1305_block
	jnz	.Lbase2_64_pre_avx2
################################# base 2^64 -> base 2^26
	and	\$0x3ffffff,%rax	# h[0]
	and	\$0x3ffffff,%rdx	# h[1]
	and	\$0x3ffffff,$h0		# h[2]
	and	\$0x3ffffff,$h1		# h[3]
	movl	\$1,20($ctx)		# set is_base2_26
	call	__poly1305_init_avx
.Lbase2_64_avx2_epilogue:
	vmovd	4*0($ctx),%x#$H0	# load hash value base 2^26
	vmovd	4*1($ctx),%x#$H1
	vmovd	4*2($ctx),%x#$H2
	vmovd	4*3($ctx),%x#$H3
	vmovd	4*4($ctx),%x#$H4
$code.=<<___	if (!$win64);
$code.=<<___	if ($win64);
	lea	-0xf8(%rsp),%r11
	vmovdqa	%xmm6,0x50(%r11)
	vmovdqa	%xmm7,0x60(%r11)
	vmovdqa	%xmm8,0x70(%r11)
	vmovdqa	%xmm9,0x80(%r11)
	vmovdqa	%xmm10,0x90(%r11)
	vmovdqa	%xmm11,0xa0(%r11)
	vmovdqa	%xmm12,0xb0(%r11)
	vmovdqa	%xmm13,0xc0(%r11)
	vmovdqa	%xmm14,0xd0(%r11)
	vmovdqa	%xmm15,0xe0(%r11)
	lea	48+64($ctx),$ctx	# size optimization
	lea	.Lconst(%rip),%rcx
	# expand and copy pre-calculated table to stack
	vmovdqu	`16*0-64`($ctx),%x#$T2
	vmovdqu	`16*1-64`($ctx),%x#$T3
	vmovdqu	`16*2-64`($ctx),%x#$T4
	vmovdqu	`16*3-64`($ctx),%x#$D0
	vmovdqu	`16*4-64`($ctx),%x#$D1
	vmovdqu	`16*5-64`($ctx),%x#$D2
	vmovdqu	`16*6-64`($ctx),%x#$D3
	vpermq	\$0x15,$T2,$T2		# 00003412 -> 12343434
	vmovdqu	`16*7-64`($ctx),%x#$D4
	vpermq	\$0x15,$T3,$T3
	vpshufd	\$0xc8,$T2,$T2		# 12343434 -> 14243444
	vmovdqu	`16*8-64`($ctx),%x#$MASK
	vpermq	\$0x15,$T4,$T4
	vpshufd	\$0xc8,$T3,$T3
	vmovdqa	$T2,0x00(%rsp)
	vpermq	\$0x15,$D0,$D0
	vpshufd	\$0xc8,$T4,$T4
	vmovdqa	$T3,0x20(%rsp)
	vpermq	\$0x15,$D1,$D1
	vpshufd	\$0xc8,$D0,$D0
	vmovdqa	$T4,0x40(%rsp)
	vpermq	\$0x15,$D2,$D2
	vpshufd	\$0xc8,$D1,$D1
	vmovdqa	$D0,0x60(%rsp)
	vpermq	\$0x15,$D3,$D3
	vpshufd	\$0xc8,$D2,$D2
	vmovdqa	$D1,0x80(%rsp)
	vpermq	\$0x15,$D4,$D4
	vpshufd	\$0xc8,$D3,$D3
	vmovdqa	$D2,0xa0(%rsp)
	vpermq	\$0x15,$MASK,$MASK
	vpshufd	\$0xc8,$D4,$D4
	vmovdqa	$D3,0xc0(%rsp)
	vpshufd	\$0xc8,$MASK,$MASK
	vmovdqa	$D4,0xe0(%rsp)
	vmovdqa	$MASK,0x100(%rsp)
	vmovdqa	64(%rcx),$MASK		# .Lmask26
################################################################
	vmovdqu	16*0($inp),%x#$T0
	vmovdqu	16*1($inp),%x#$T1
	vinserti128	\$1,16*2($inp),$T0,$T0
	vinserti128	\$1,16*3($inp),$T1,$T1
	vpsrldq	\$6,$T0,$T2		# splat input
	vpunpckhqdq	$T1,$T0,$T4	# 4
	vpunpcklqdq	$T3,$T2,$T2	# 2:3
	vpunpcklqdq	$T1,$T0,$T0	# 0:1
	vpsrlq	\$40,$T4,$T4		# 4
	vpand	$MASK,$T2,$T2		# 2
	vpand	$MASK,$T0,$T0		# 0
	vpand	$MASK,$T1,$T1		# 1
	vpand	$MASK,$T3,$T3		# 3
	vpor	32(%rcx),$T4,$T4	# padbit, yes, always
	lea	0x90(%rsp),%rax		# size optimization
	vpaddq	$H2,$T2,$H2		# accumulate input
################################################################
# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
#   \________/\__________/
################################################################
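# For example, with eight blocks lane k (k = 0..3) computes
# (inp[k]*r^4 + inp[k+4])*r^(4-k), and the horizontal sum of the four
# lanes is inp[0]*r^8 + inp[1]*r^7 + ... + inp[7]*r^1, the canonical
# Poly1305 evaluation of all eight blocks.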
	#vpaddq	$H2,$T2,$H2		# accumulate input
	vmovdqa	`32*0`(%rsp),$T0	# r0^4
	vmovdqa	`32*1`(%rsp),$T1	# r1^4
	vmovdqa	`32*3`(%rsp),$T2	# r2^4
	vmovdqa	`32*6-0x90`(%rax),$T3	# s3^4
	vmovdqa	`32*8-0x90`(%rax),$S4	# s4^4

# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
#
# however, as h2 is "chronologically" the first one available, the
# corresponding operations are pulled up, so the order is
#
# d4 = h2*r2   + h4*r0 + h3*r1 + h1*r3   + h0*r4
# d3 = h2*r1   + h3*r0 + h1*r2 + h0*r3   + h4*5*r4
# d2 = h2*r0   + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
	vpmuludq	$H2,$T0,$D2	# d2 = h2*r0
	vpmuludq	$H2,$T1,$D3	# d3 = h2*r1
	vpmuludq	$H2,$T2,$D4	# d4 = h2*r2
	vpmuludq	$H2,$T3,$D0	# d0 = h2*s3
	vpmuludq	$H2,$S4,$D1	# d1 = h2*s4
	vpmuludq	$H0,$T1,$T4	# h0*r1
	vpmuludq	$H1,$T1,$H2	# h1*r1, borrow $H2 as temp
	vpaddq	$T4,$D1,$D1		# d1 += h0*r1
	vpaddq	$H2,$D2,$D2		# d2 += h1*r1
	vpmuludq	$H3,$T1,$T4	# h3*r1
	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
	vpaddq	$T4,$D4,$D4		# d4 += h3*r1
	vpaddq	$H2,$D0,$D0		# d0 += h4*s1
	vmovdqa	`32*4-0x90`(%rax),$T1	# s2
	vpmuludq	$H0,$T0,$T4	# h0*r0
	vpmuludq	$H1,$T0,$H2	# h1*r0
	vpaddq	$T4,$D0,$D0		# d0 += h0*r0
	vpaddq	$H2,$D1,$D1		# d1 += h1*r0
	vpmuludq	$H3,$T0,$T4	# h3*r0
	vpmuludq	$H4,$T0,$H2	# h4*r0
	vmovdqu	16*0($inp),%x#$T0	# load input
	vpaddq	$T4,$D3,$D3		# d3 += h3*r0
	vpaddq	$H2,$D4,$D4		# d4 += h4*r0
	vinserti128	\$1,16*2($inp),$T0,$T0
	vpmuludq	$H3,$T1,$T4	# h3*s2
	vpmuludq	$H4,$T1,$H2	# h4*s2
	vmovdqu	16*1($inp),%x#$T1
	vpaddq	$T4,$D0,$D0		# d0 += h3*s2
	vpaddq	$H2,$D1,$D1		# d1 += h4*s2
	vmovdqa	`32*5-0x90`(%rax),$H2	# r3
	vpmuludq	$H1,$T2,$T4	# h1*r2
	vpmuludq	$H0,$T2,$T2	# h0*r2
	vpaddq	$T4,$D3,$D3		# d3 += h1*r2
	vpaddq	$T2,$D2,$D2		# d2 += h0*r2
	vinserti128	\$1,16*3($inp),$T1,$T1
	vpmuludq	$H1,$H2,$T4	# h1*r3
	vpmuludq	$H0,$H2,$H2	# h0*r3
	vpsrldq	\$6,$T0,$T2		# splat input
	vpaddq	$T4,$D4,$D4		# d4 += h1*r3
	vpaddq	$H2,$D3,$D3		# d3 += h0*r3
	vpmuludq	$H3,$T3,$T4	# h3*s3
	vpmuludq	$H4,$T3,$H2	# h4*s3
	vpaddq	$T4,$D1,$D1		# d1 += h3*s3
	vpaddq	$H2,$D2,$D2		# d2 += h4*s3
	vpunpckhqdq	$T1,$T0,$T4	# 4
	vpmuludq	$H3,$S4,$H3	# h3*s4
	vpmuludq	$H4,$S4,$H4	# h4*s4
	vpunpcklqdq	$T1,$T0,$T0	# 0:1
	vpaddq	$H3,$D2,$H2		# h2 = d2 + h3*s4
	vpaddq	$H4,$D3,$H3		# h3 = d3 + h4*s4
	vpunpcklqdq	$T3,$T2,$T3	# 2:3
	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
	vpmuludq	$H1,$S4,$H0	# h1*s4
	vmovdqa	64(%rcx),$MASK		# .Lmask26
	vpaddq	$H4,$D4,$H4		# h4 = d4 + h0*r4
	vpaddq	$H0,$D0,$H0		# h0 = d0 + h1*s4
################################################################
# lazy reduction (interleaved with tail of input splat)
	vpaddq	$D3,$H4,$H4		# h3 -> h4
	vpaddq	$D0,$D1,$H1		# h0 -> h1
	vpaddq	$D1,$H2,$H2		# h1 -> h2
	vpaddq	$D4,$H0,$H0		# h4 -> h0
	vpand	$MASK,$T2,$T2		# 2
	vpaddq	$D2,$H3,$H3		# h2 -> h3
	vpaddq	$T2,$H2,$H2		# modulo-scheduled
	vpaddq	$D0,$H1,$H1		# h0 -> h1
	vpsrlq	\$40,$T4,$T4		# 4
	vpaddq	$D3,$H4,$H4		# h3 -> h4
	vpand	$MASK,$T0,$T0		# 0
	vpand	$MASK,$T1,$T1		# 1
	vpand	$MASK,$T3,$T3		# 3
	vpor	32(%rcx),$T4,$T4	# padbit, yes, always
################################################################
# while the above multiplications were by r^4 in all lanes, in the
# last iteration we multiply the least significant lane by r^4 and
# the most significant one by r, so it is a copy of the above except
# that references to the precomputed table are displaced by 4...
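#
# (This works because, after the 00003412 -> 14243444 expansion above,
#  every other dword of a 32-byte table row is an r^4 digit: an
#  aligned load therefore multiplies all four lanes by r^4, while the
#  same row loaded 4 bytes up yields the r^4,r^3,r^2,r^1 digits across
#  lanes 0..3; an assumption inferred from the vpermq/vpshufd
#  shuffles, not stated in the original comments.)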
	#vpaddq	$H2,$T2,$H2		# accumulate input
	vmovdqu	`32*0+4`(%rsp),$T0	# r0^4
	vmovdqu	`32*1+4`(%rsp),$T1	# r1^4
	vmovdqu	`32*3+4`(%rsp),$T2	# r2^4
	vmovdqu	`32*6+4-0x90`(%rax),$T3	# s3^4
	vmovdqu	`32*8+4-0x90`(%rax),$S4	# s4^4
	vpmuludq	$H2,$T0,$D2	# d2 = h2*r0
	vpmuludq	$H2,$T1,$D3	# d3 = h2*r1
	vpmuludq	$H2,$T2,$D4	# d4 = h2*r2
	vpmuludq	$H2,$T3,$D0	# d0 = h2*s3
	vpmuludq	$H2,$S4,$D1	# d1 = h2*s4
	vpmuludq	$H0,$T1,$T4	# h0*r1
	vpmuludq	$H1,$T1,$H2	# h1*r1
	vpaddq	$T4,$D1,$D1		# d1 += h0*r1
	vpaddq	$H2,$D2,$D2		# d2 += h1*r1
	vpmuludq	$H3,$T1,$T4	# h3*r1
	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
	vpaddq	$T4,$D4,$D4		# d4 += h3*r1
	vpaddq	$H2,$D0,$D0		# d0 += h4*s1
	vpmuludq	$H0,$T0,$T4	# h0*r0
	vpmuludq	$H1,$T0,$H2	# h1*r0
	vpaddq	$T4,$D0,$D0		# d0 += h0*r0
	vmovdqu	`32*4+4-0x90`(%rax),$T1	# s2
	vpaddq	$H2,$D1,$D1		# d1 += h1*r0
	vpmuludq	$H3,$T0,$T4	# h3*r0
	vpmuludq	$H4,$T0,$H2	# h4*r0
	vpaddq	$T4,$D3,$D3		# d3 += h3*r0
	vpaddq	$H2,$D4,$D4		# d4 += h4*r0
	vpmuludq	$H3,$T1,$T4	# h3*s2
	vpmuludq	$H4,$T1,$H2	# h4*s2
	vpaddq	$T4,$D0,$D0		# d0 += h3*s2
	vpaddq	$H2,$D1,$D1		# d1 += h4*s2
	vmovdqu	`32*5+4-0x90`(%rax),$H2	# r3
	vpmuludq	$H1,$T2,$T4	# h1*r2
	vpmuludq	$H0,$T2,$T2	# h0*r2
	vpaddq	$T4,$D3,$D3		# d3 += h1*r2
	vpaddq	$T2,$D2,$D2		# d2 += h0*r2
	vpmuludq	$H1,$H2,$T4	# h1*r3
	vpmuludq	$H0,$H2,$H2	# h0*r3
	vpaddq	$T4,$D4,$D4		# d4 += h1*r3
	vpaddq	$H2,$D3,$D3		# d3 += h0*r3
	vpmuludq	$H3,$T3,$T4	# h3*s3
	vpmuludq	$H4,$T3,$H2	# h4*s3
	vpaddq	$T4,$D1,$D1		# d1 += h3*s3
	vpaddq	$H2,$D2,$D2		# d2 += h4*s3
	vpmuludq	$H3,$S4,$H3	# h3*s4
	vpmuludq	$H4,$S4,$H4	# h4*s4
	vpaddq	$H3,$D2,$H2		# h2 = d2 + h3*s4
	vpaddq	$H4,$D3,$H3		# h3 = d3 + h4*s4
	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4	# h0*r4
	vpmuludq	$H1,$S4,$H0	# h1*s4
	vmovdqa	64(%rcx),$MASK		# .Lmask26
	vpaddq	$H4,$D4,$H4		# h4 = d4 + h0*r4
	vpaddq	$H0,$D0,$H0		# h0 = d0 + h1*s4
################################################################
# horizontal addition
	vpermq	\$0x2,$H3,$T3
	vpermq	\$0x2,$H4,$T4
	vpermq	\$0x2,$H0,$T0
	vpermq	\$0x2,$D1,$T1
	vpermq	\$0x2,$H2,$T2
################################################################
	vpaddq	$D3,$H4,$H4		# h3 -> h4
	vpaddq	$D0,$D1,$H1		# h0 -> h1
	vpaddq	$D1,$H2,$H2		# h1 -> h2
	vpaddq	$D4,$H0,$H0		# h4 -> h0
	vpaddq	$D2,$H3,$H3		# h2 -> h3
	vpaddq	$D0,$H1,$H1		# h0 -> h1
	vpaddq	$D3,$H4,$H4		# h3 -> h4
	vmovd	%x#$H0,`4*0-48-64`($ctx)	# save partially reduced
	vmovd	%x#$H1,`4*1-48-64`($ctx)
	vmovd	%x#$H2,`4*2-48-64`($ctx)
	vmovd	%x#$H3,`4*3-48-64`($ctx)
	vmovd	%x#$H4,`4*4-48-64`($ctx)
$code.=<<___	if ($win64);
	vmovdqa	0x50(%r11),%xmm6
	vmovdqa	0x60(%r11),%xmm7
	vmovdqa	0x70(%r11),%xmm8
	vmovdqa	0x80(%r11),%xmm9
	vmovdqa	0x90(%r11),%xmm10
	vmovdqa	0xa0(%r11),%xmm11
	vmovdqa	0xb0(%r11),%xmm12
	vmovdqa	0xc0(%r11),%xmm13
	vmovdqa	0xd0(%r11),%xmm14
	vmovdqa	0xe0(%r11),%xmm15
$code.=<<___	if (!$win64);
.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2

.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.long	5,0,5,0,5,0,5,0
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	avx_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
	mov	208($context),%rax	# pull context->R11
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
.size	avx_handler,.-avx_handler

	.rva	.LSEH_begin_poly1305_init
	.rva	.LSEH_end_poly1305_init
	.rva	.LSEH_info_poly1305_init
	.rva	.LSEH_begin_poly1305_blocks
	.rva	.LSEH_end_poly1305_blocks
	.rva	.LSEH_info_poly1305_blocks
	.rva	.LSEH_begin_poly1305_emit
	.rva	.LSEH_end_poly1305_emit
	.rva	.LSEH_info_poly1305_emit
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_poly1305_blocks_avx
	.rva	.LSEH_info_poly1305_blocks_avx_1
	.rva	.LSEH_info_poly1305_blocks_avx_2
	.rva	.LSEH_end_poly1305_blocks_avx
	.rva	.LSEH_info_poly1305_blocks_avx_3
	.rva	.LSEH_begin_poly1305_emit_avx
	.rva	.LSEH_end_poly1305_emit_avx
	.rva	.LSEH_info_poly1305_emit_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_poly1305_blocks_avx2
	.rva	.Lbase2_64_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_1
	.rva	.Lbase2_64_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_2
	.rva	.LSEH_end_poly1305_blocks_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_3

.LSEH_info_poly1305_init:
	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
.LSEH_info_poly1305_blocks:
	.rva	.Lblocks_body,.Lblocks_epilogue
.LSEH_info_poly1305_emit:
	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]
.LSEH_info_poly1305_blocks_avx_2:
	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]
.LSEH_info_poly1305_blocks_avx_3:
	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]
.LSEH_info_poly1305_emit_avx:
	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]
.LSEH_info_poly1305_blocks_avx2_2:
	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]
.LSEH_info_poly1305_blocks_avx2_3:
	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]

foreach (split('\n',$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/%r([a-z]+)#d/%e$1/g;
	s/%r([0-9]+)#d/%r$1d/g;