2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements Poly1305 hash for x86_64.
25 # Add AVX512F+VL+BW code path.
27 # Numbers are cycles per processed byte with poly1305_blocks alone,
28 # measured with rdtsc at fixed clock frequency.
30 # IALU/gcc-4.8(*) AVX(**) AVX2
33 # Westmere 1.88/+120% -
34 # Sandy Bridge 1.39/+140% 1.10
35 # Haswell 1.14/+175% 1.11 0.65
36 # Skylake 1.13/+120% 0.96 0.51
37 # Silvermont 2.83/+95% -
38 # Goldmont 1.70/+180% -
39 # VIA Nano 1.82/+150% -
40 # Sledgehammer 1.38/+160% -
41 # Bulldozer 2.30/+130% 0.97
43 # (*) improvement coefficients relative to clang are more modest and
44 # are ~50% on most processors; in both cases we are comparing to __int128 code;
46 # (**) SSE2 implementation was attempted, but among non-AVX processors
47 # it was faster than integer-only code only on older Intel P4 and
48 # Core processors, by 30-50% (less so the newer the processor), but
49 # slower on contemporary ones, for example almost 2x slower on Atom;
50 # as the former are naturally disappearing, SSE2 is deemed unnecessary;
54 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
56 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
58 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
60 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
61 die "can't locate x86_64-xlate.pl";
63 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
64 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
65 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
68 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
69 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
70 $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
71 $avx += 2 if ($1==2.11 && $2>=8);
74 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
75 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
76 $avx = ($1>=10) + ($1>=12);
79 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
80 $avx = ($2>=3.0) + ($2>3.0);
83 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
86 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
87 my ($mac,$nonce)=($inp,$len); # *_emit arguments
88 my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
89 my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
91 sub poly1305_iteration {
92 # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
93 # output: $h0-$h2 *= $r0-$r1
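#
# For reference, one iteration amounts to the following illustrative
# C-style sketch (not the generated code; "u128"/"u64" and the carry
# shorthand are for exposition only, with s1 = r1 + (r1>>2)):
#
#	d0 = (u128)h0*r0 + (u128)h1*s1;		# h1*r1 term wraps as h1*s1
#	d1 = (u128)h0*r1 + (u128)h1*r0 + (u64)(d0>>64) + h2*s1;
#	d2 = h2*r0 + (u64)(d1>>64);		# h2 is only a few bits
#	h0 = (u64)d0; h1 = (u64)d1;
#	t  = d2 & -4; h2 = d2 & 3;		# bits at or above 2^130...
#	h0 += t + (t>>2);			# ...fold back, 2^130 == 5
#	h1 += carry; h2 += carry;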
101 mov %rax,$h0 # future $h0
111 mov $h2,$h1 # borrow $h1
115 imulq $s1,$h1 # h2*s1
120 imulq $r0,$h2 # h2*r0
122 mov \$-4,%rax # mask value
125 and $d3,%rax # last reduction step
136 ########################################################################
137 # Layout of opaque area is as follows.
139 # unsigned __int64 h[3]; # current hash value base 2^64
140 # unsigned __int64 r[2]; # key value base 2^64
145 .extern OPENSSL_ia32cap_P
148 .hidden poly1305_init
149 .globl poly1305_blocks
150 .hidden poly1305_blocks
152 .hidden poly1305_emit
154 .type poly1305_init,\@function,3
158 mov %rax,0($ctx) # initialize hash value
165 lea poly1305_blocks(%rip),%r10
166 lea poly1305_emit(%rip),%r11
168 $code.=<<___ if ($avx);
169 mov OPENSSL_ia32cap_P+4(%rip),%r9
170 lea poly1305_blocks_avx(%rip),%rax
171 lea poly1305_emit_avx(%rip),%rcx
172 bt \$`60-32`,%r9 # AVX?
176 $code.=<<___ if ($avx>1);
177 lea poly1305_blocks_avx2(%rip),%rax
178 bt \$`5+32`,%r9 # AVX2?
181 $code.=<<___ if ($avx>3);
182 mov \$`(1<<31|1<<21|1<<16)`,%rax
189 mov \$0x0ffffffc0fffffff,%rax
190 mov \$0x0ffffffc0ffffffc,%rcx
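# (Poly1305 key clamping: the two masks clear the top four bits of
#  bytes 3, 7, 11 and 15 of r and the bottom two bits of bytes 4, 8
#  and 12; in particular r1 becomes divisible by 4, which the
#  s1 = r1 + (r1>>2) trick below relies on)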
196 $code.=<<___ if ($flavour !~ /elf32/);
200 $code.=<<___ if ($flavour =~ /elf32/);
208 .size poly1305_init,.-poly1305_init
210 .type poly1305_blocks,\@function,4
215 jz .Lno_data # too short
225 mov $len,%r15 # reassign $len
227 mov 24($ctx),$r0 # load r
230 mov 0($ctx),$h0 # load hash value
237 add $r1,$s1 # s1 = r1 + (r1 >> 2)
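# (the weight-2^128 term h1*r1 reduces via 2^128 == 5/4 mod 2^130-5;
#  clamping makes r1 divisible by 4, so s1 = r1 + (r1>>2) = 5*r1/4 is
#  exact -- cf. the illustrative sketch in poly1305_iteration above)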
242 add 0($inp),$h0 # accumulate input
247 &poly1305_iteration();
253 mov $h0,0($ctx) # store hash value
267 .size poly1305_blocks,.-poly1305_blocks
269 .type poly1305_emit,\@function,3
273 mov 0($ctx),%r8 # load hash value
278 add \$5,%r8 # compare to modulus
282 shr \$2,%r10 # did 130-bit value overflow?
286 add 0($nonce),%rax # accumulate nonce
288 mov %rax,0($mac) # write result
292 .size poly1305_emit,.-poly1305_emit
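# Illustrative sketch of the final reduction performed by the *_emit
# routines: compute t = h + 5; if t reaches 2^130 then h >= 2^130-5,
# and the cmov chain selects t (i.e. h - (2^130-5)) in place of h;
# the tag is then (h + nonce) mod 2^128.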
296 ########################################################################
297 # Layout of opaque area is as follows.
299 # unsigned __int32 h[5]; # current hash value base 2^26
300 # unsigned __int32 is_base2_26;
301 # unsigned __int64 r[2]; # key value base 2^64
302 # unsigned __int64 pad;
303 # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
305 # where r^n are the base 2^26 digits of powers of the multiplier key. There
306 # are 5 digits, but the last four are interleaved with their multiples of 5,
307 # totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
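#
# For illustration, with h = SUM h[i]*2^(26*i) and r = SUM r[i]*2^(26*i)
# the wrapped product digits pick up a factor of 5, e.g.
#
#	d0 = h0*r0 + 5*(h4*r1 + h3*r2 + h2*r3 + h1*r4)
#
# because 2^130 == 5 (mod 2^130-5); storing 5*r1..5*r4 alongside
# r1..r4 turns every wrapped term into a plain 32x32->64 multiply.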
309 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
310 map("%xmm$_",(0..15));
313 .type __poly1305_block,\@abi-omnipotent
317 &poly1305_iteration();
320 .size __poly1305_block,.-__poly1305_block
322 .type __poly1305_init_avx,\@abi-omnipotent
329 lea 48+64($ctx),$ctx # size optimization
332 call __poly1305_block # r^2
334 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
340 mov %eax,`16*0+0-64`($ctx)
342 mov %edx,`16*0+4-64`($ctx)
349 mov %eax,`16*1+0-64`($ctx)
350 lea (%rax,%rax,4),%eax # *5
351 mov %edx,`16*1+4-64`($ctx)
352 lea (%rdx,%rdx,4),%edx # *5
353 mov %eax,`16*2+0-64`($ctx)
355 mov %edx,`16*2+4-64`($ctx)
366 mov %eax,`16*3+0-64`($ctx)
367 lea (%rax,%rax,4),%eax # *5
368 mov %edx,`16*3+4-64`($ctx)
369 lea (%rdx,%rdx,4),%edx # *5
370 mov %eax,`16*4+0-64`($ctx)
372 mov %edx,`16*4+4-64`($ctx)
381 mov %eax,`16*5+0-64`($ctx)
382 lea (%rax,%rax,4),%eax # *5
383 mov %edx,`16*5+4-64`($ctx)
384 lea (%rdx,%rdx,4),%edx # *5
385 mov %eax,`16*6+0-64`($ctx)
387 mov %edx,`16*6+4-64`($ctx)
393 mov $d1#d,`16*7+0-64`($ctx)
394 lea ($d1,$d1,4),$d1 # *5
395 mov $d2#d,`16*7+4-64`($ctx)
396 lea ($d2,$d2,4),$d2 # *5
397 mov $d1#d,`16*8+0-64`($ctx)
398 mov $d2#d,`16*8+4-64`($ctx)
401 call __poly1305_block # r^3
403 mov \$0x3ffffff,%eax # save r^3 base 2^26
407 mov %eax,`16*0+12-64`($ctx)
411 mov %edx,`16*1+12-64`($ctx)
412 lea (%rdx,%rdx,4),%edx # *5
414 mov %edx,`16*2+12-64`($ctx)
420 mov %eax,`16*3+12-64`($ctx)
421 lea (%rax,%rax,4),%eax # *5
423 mov %eax,`16*4+12-64`($ctx)
428 mov %edx,`16*5+12-64`($ctx)
429 lea (%rdx,%rdx,4),%edx # *5
431 mov %edx,`16*6+12-64`($ctx)
436 mov $d1#d,`16*7+12-64`($ctx)
437 lea ($d1,$d1,4),$d1 # *5
438 mov $d1#d,`16*8+12-64`($ctx)
441 call __poly1305_block # r^4
443 mov \$0x3ffffff,%eax # save r^4 base 2^26
447 mov %eax,`16*0+8-64`($ctx)
451 mov %edx,`16*1+8-64`($ctx)
452 lea (%rdx,%rdx,4),%edx # *5
454 mov %edx,`16*2+8-64`($ctx)
460 mov %eax,`16*3+8-64`($ctx)
461 lea (%rax,%rax,4),%eax # *5
463 mov %eax,`16*4+8-64`($ctx)
468 mov %edx,`16*5+8-64`($ctx)
469 lea (%rdx,%rdx,4),%edx # *5
471 mov %edx,`16*6+8-64`($ctx)
476 mov $d1#d,`16*7+8-64`($ctx)
477 lea ($d1,$d1,4),$d1 # *5
478 mov $d1#d,`16*8+8-64`($ctx)
480 lea -48-64($ctx),$ctx # size [de-]optimization
482 .size __poly1305_init_avx,.-__poly1305_init_avx
484 .type poly1305_blocks_avx,\@function,4
487 mov 20($ctx),%r8d # is_base2_26
513 mov $len,%r15 # reassign $len
515 mov 0($ctx),$d1 # load hash value
519 mov 24($ctx),$r0 # load r
522 ################################# base 2^26 -> base 2^64
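# (illustrative gathering of the five 26-bit digits d0..d4 into
#  64-bit words, which the shifts below implement:
#	h0 = d0 | d1<<26 | d2<<52
#	h1 = d2>>12 | d3<<14 | d4<<40
#	h2 = d4>>24 )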
524 and \$`-1*(1<<31)`,$d1
525 mov $d2,$r1 # borrow $r1
527 and \$`-1*(1<<31)`,$d2
541 adc \$0,$h2 # can be partially reduced...
543 mov \$-4,$d2 # ... so reduce
556 add $r1,$s1 # s1 = r1 + (r1 >> 2)
558 add 0($inp),$h0 # accumulate input
563 call __poly1305_block
565 test $padbit,$padbit # if $padbit is zero,
566 jz .Lstore_base2_64_avx # store hash in base 2^64 format
568 ################################# base 2^64 -> base 2^26
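# (illustrative splitting of h0:h1:h2 into five 26-bit digits, the
#  inverse of the gathering above:
#	d0 = h0 & 0x3ffffff;	d1 = (h0>>26) & 0x3ffffff;
#	d2 = (h0>>52 | h1<<12) & 0x3ffffff;
#	d3 = (h1>>14) & 0x3ffffff;	d4 = h1>>40 | h2<<24 )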
575 and \$0x3ffffff,%rax # h[0]
577 and \$0x3ffffff,%rdx # h[1]
581 and \$0x3ffffff,$h0 # h[2]
583 and \$0x3ffffff,$h1 # h[3]
587 jz .Lstore_base2_26_avx
597 .Lstore_base2_64_avx:
600 mov $h2,16($ctx) # note that is_base2_26 is zeroed
604 .Lstore_base2_26_avx:
605 mov %rax#d,0($ctx) # store hash value base 2^26
620 .Lblocks_avx_epilogue:
633 mov $len,%r15 # reassign $len
635 mov 24($ctx),$r0 # load r
638 mov 0($ctx),$h0 # load hash value
645 add $r1,$s1 # s1 = r1 + (r1 >> 2)
650 add 0($inp),$h0 # accumulate input
656 call __poly1305_block
659 ################################# base 2^64 -> base 2^26
666 and \$0x3ffffff,%rax # h[0]
668 and \$0x3ffffff,%rdx # h[1]
672 and \$0x3ffffff,$h0 # h[2]
674 and \$0x3ffffff,$h1 # h[3]
682 movl \$1,20($ctx) # set is_base2_26
684 call __poly1305_init_avx
697 .Lbase2_64_avx_epilogue:
702 vmovd 4*0($ctx),$H0 # load hash value
710 $code.=<<___ if (!$win64);
714 $code.=<<___ if ($win64);
717 vmovdqa %xmm6,0x50(%r11)
718 vmovdqa %xmm7,0x60(%r11)
719 vmovdqa %xmm8,0x70(%r11)
720 vmovdqa %xmm9,0x80(%r11)
721 vmovdqa %xmm10,0x90(%r11)
722 vmovdqa %xmm11,0xa0(%r11)
723 vmovdqa %xmm12,0xb0(%r11)
724 vmovdqa %xmm13,0xc0(%r11)
725 vmovdqa %xmm14,0xd0(%r11)
726 vmovdqa %xmm15,0xe0(%r11)
734 vmovdqu `16*3`($ctx),$D4 # preload r0^2
735 lea `16*3+64`($ctx),$ctx # size optimization
736 lea .Lconst(%rip),%rcx
738 ################################################################
740 vmovdqu 16*2($inp),$T0
741 vmovdqu 16*3($inp),$T1
742 vmovdqa 64(%rcx),$MASK # .Lmask26
744 vpsrldq \$6,$T0,$T2 # splat input
746 vpunpckhqdq $T1,$T0,$T4 # 4
747 vpunpcklqdq $T1,$T0,$T0 # 0:1
748 vpunpcklqdq $T3,$T2,$T3 # 2:3
750 vpsrlq \$40,$T4,$T4 # 4
752 vpand $MASK,$T0,$T0 # 0
754 vpand $MASK,$T1,$T1 # 1
756 vpand $MASK,$T2,$T2 # 2
757 vpand $MASK,$T3,$T3 # 3
758 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
762 # expand and copy pre-calculated table to stack
763 vmovdqu `16*1-64`($ctx),$D1
764 vmovdqu `16*2-64`($ctx),$D2
765 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
766 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
767 vmovdqa $D3,-0x90(%r11)
768 vmovdqa $D0,0x00(%rsp)
769 vpshufd \$0xEE,$D1,$D4
770 vmovdqu `16*3-64`($ctx),$D0
771 vpshufd \$0x44,$D1,$D1
772 vmovdqa $D4,-0x80(%r11)
773 vmovdqa $D1,0x10(%rsp)
774 vpshufd \$0xEE,$D2,$D3
775 vmovdqu `16*4-64`($ctx),$D1
776 vpshufd \$0x44,$D2,$D2
777 vmovdqa $D3,-0x70(%r11)
778 vmovdqa $D2,0x20(%rsp)
779 vpshufd \$0xEE,$D0,$D4
780 vmovdqu `16*5-64`($ctx),$D2
781 vpshufd \$0x44,$D0,$D0
782 vmovdqa $D4,-0x60(%r11)
783 vmovdqa $D0,0x30(%rsp)
784 vpshufd \$0xEE,$D1,$D3
785 vmovdqu `16*6-64`($ctx),$D0
786 vpshufd \$0x44,$D1,$D1
787 vmovdqa $D3,-0x50(%r11)
788 vmovdqa $D1,0x40(%rsp)
789 vpshufd \$0xEE,$D2,$D4
790 vmovdqu `16*7-64`($ctx),$D1
791 vpshufd \$0x44,$D2,$D2
792 vmovdqa $D4,-0x40(%r11)
793 vmovdqa $D2,0x50(%rsp)
794 vpshufd \$0xEE,$D0,$D3
795 vmovdqu `16*8-64`($ctx),$D2
796 vpshufd \$0x44,$D0,$D0
797 vmovdqa $D3,-0x30(%r11)
798 vmovdqa $D0,0x60(%rsp)
799 vpshufd \$0xEE,$D1,$D4
800 vpshufd \$0x44,$D1,$D1
801 vmovdqa $D4,-0x20(%r11)
802 vmovdqa $D1,0x70(%rsp)
803 vpshufd \$0xEE,$D2,$D3
804 vmovdqa 0x00(%rsp),$D4 # preload r0^2
805 vpshufd \$0x44,$D2,$D2
806 vmovdqa $D3,-0x10(%r11)
807 vmovdqa $D2,0x80(%rsp)
813 ################################################################
814 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
815 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
816 # \___________________/
817 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
818 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
819 # \___________________/ \____________________/
821 # Note that we start with inp[2:3]*r^2. This is because it
822 # doesn't depend on the reduction in the previous iteration.
823 ################################################################
824 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
825 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
826 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
827 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
828 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
830 # though note that $Tx and $Hx are "reversed" in this section,
831 # and $D4 is preloaded with r0^2...
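#
# In scalar terms the two interleaved streams evolve as (illustrative):
#
#	acc_even = acc_even*r^2 + inp[2k]	# blocks 0,2,4,...
#	acc_odd  = acc_odd *r^2 + inp[2k+1]	# blocks 1,3,5,...
#
# and are combined at the end as acc_even*r^2 + acc_odd*r, which the
# r^2:r^1 and r^4:r^3 tail multiplications below implement.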
833 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
834 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
835 vmovdqa $H2,0x20(%r11) # offload hash
836 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
837 vmovdqa 0x10(%rsp),$H2 # r1^2
838 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
839 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
841 vmovdqa $H0,0x00(%r11) #
842 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
843 vmovdqa $H1,0x10(%r11) #
844 vpmuludq $T3,$H2,$H1 # h3*r1
845 vpaddq $H0,$D0,$D0 # d0 += h4*s1
846 vpaddq $H1,$D4,$D4 # d4 += h3*r1
847 vmovdqa $H3,0x30(%r11) #
848 vpmuludq $T2,$H2,$H0 # h2*r1
849 vpmuludq $T1,$H2,$H1 # h1*r1
850 vpaddq $H0,$D3,$D3 # d3 += h2*r1
851 vmovdqa 0x30(%rsp),$H3 # r2^2
852 vpaddq $H1,$D2,$D2 # d2 += h1*r1
853 vmovdqa $H4,0x40(%r11) #
854 vpmuludq $T0,$H2,$H2 # h0*r1
855 vpmuludq $T2,$H3,$H0 # h2*r2
856 vpaddq $H2,$D1,$D1 # d1 += h0*r1
858 vmovdqa 0x40(%rsp),$H4 # s2^2
859 vpaddq $H0,$D4,$D4 # d4 += h2*r2
860 vpmuludq $T1,$H3,$H1 # h1*r2
861 vpmuludq $T0,$H3,$H3 # h0*r2
862 vpaddq $H1,$D3,$D3 # d3 += h1*r2
863 vmovdqa 0x50(%rsp),$H2 # r3^2
864 vpaddq $H3,$D2,$D2 # d2 += h0*r2
865 vpmuludq $T4,$H4,$H0 # h4*s2
866 vpmuludq $T3,$H4,$H4 # h3*s2
867 vpaddq $H0,$D1,$D1 # d1 += h4*s2
868 vmovdqa 0x60(%rsp),$H3 # s3^2
869 vpaddq $H4,$D0,$D0 # d0 += h3*s2
871 vmovdqa 0x80(%rsp),$H4 # s4^2
872 vpmuludq $T1,$H2,$H1 # h1*r3
873 vpmuludq $T0,$H2,$H2 # h0*r3
874 vpaddq $H1,$D4,$D4 # d4 += h1*r3
875 vpaddq $H2,$D3,$D3 # d3 += h0*r3
876 vpmuludq $T4,$H3,$H0 # h4*s3
877 vpmuludq $T3,$H3,$H1 # h3*s3
878 vpaddq $H0,$D2,$D2 # d2 += h4*s3
879 vmovdqu 16*0($inp),$H0 # load input
880 vpaddq $H1,$D1,$D1 # d1 += h3*s3
881 vpmuludq $T2,$H3,$H3 # h2*s3
882 vpmuludq $T2,$H4,$T2 # h2*s4
883 vpaddq $H3,$D0,$D0 # d0 += h2*s3
885 vmovdqu 16*1($inp),$H1 #
886 vpaddq $T2,$D1,$D1 # d1 += h2*s4
887 vpmuludq $T3,$H4,$T3 # h3*s4
888 vpmuludq $T4,$H4,$T4 # h4*s4
889 vpsrldq \$6,$H0,$H2 # splat input
890 vpaddq $T3,$D2,$D2 # d2 += h3*s4
891 vpaddq $T4,$D3,$D3 # d3 += h4*s4
892 vpsrldq \$6,$H1,$H3 #
893 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
894 vpmuludq $T1,$H4,$T0 # h1*s4
895 vpunpckhqdq $H1,$H0,$H4 # 4
896 vpaddq $T4,$D4,$D4 # d4 += h0*r4
897 vmovdqa -0x90(%r11),$T4 # r0^4
898 vpaddq $T0,$D0,$D0 # d0 += h1*s4
900 vpunpcklqdq $H1,$H0,$H0 # 0:1
901 vpunpcklqdq $H3,$H2,$H3 # 2:3
903 #vpsrlq \$40,$H4,$H4 # 4
904 vpsrldq \$`40/8`,$H4,$H4 # 4
906 vpand $MASK,$H0,$H0 # 0
908 vpand $MASK,$H1,$H1 # 1
909 vpand 0(%rcx),$H4,$H4 # .Lmask24
911 vpand $MASK,$H2,$H2 # 2
912 vpand $MASK,$H3,$H3 # 3
913 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
915 vpaddq 0x00(%r11),$H0,$H0 # add hash value
916 vpaddq 0x10(%r11),$H1,$H1
917 vpaddq 0x20(%r11),$H2,$H2
918 vpaddq 0x30(%r11),$H3,$H3
919 vpaddq 0x40(%r11),$H4,$H4
926 ################################################################
927 # Now we accumulate (inp[0:1]+hash)*r^4
928 ################################################################
929 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
930 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
931 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
932 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
933 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
935 vpmuludq $H0,$T4,$T0 # h0*r0
936 vpmuludq $H1,$T4,$T1 # h1*r0
939 vmovdqa -0x80(%r11),$T2 # r1^4
940 vpmuludq $H2,$T4,$T0 # h2*r0
941 vpmuludq $H3,$T4,$T1 # h3*r0
944 vpmuludq $H4,$T4,$T4 # h4*r0
945 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
948 vpaddq $T0,$D0,$D0 # d0 += h4*s1
949 vpmuludq $H2,$T2,$T1 # h2*r1
950 vpmuludq $H3,$T2,$T0 # h3*r1
951 vpaddq $T1,$D3,$D3 # d3 += h2*r1
952 vmovdqa -0x60(%r11),$T3 # r2^4
953 vpaddq $T0,$D4,$D4 # d4 += h3*r1
954 vpmuludq $H1,$T2,$T1 # h1*r1
955 vpmuludq $H0,$T2,$T2 # h0*r1
956 vpaddq $T1,$D2,$D2 # d2 += h1*r1
957 vpaddq $T2,$D1,$D1 # d1 += h0*r1
959 vmovdqa -0x50(%r11),$T4 # s2^4
960 vpmuludq $H2,$T3,$T0 # h2*r2
961 vpmuludq $H1,$T3,$T1 # h1*r2
962 vpaddq $T0,$D4,$D4 # d4 += h2*r2
963 vpaddq $T1,$D3,$D3 # d3 += h1*r2
964 vmovdqa -0x40(%r11),$T2 # r3^4
965 vpmuludq $H0,$T3,$T3 # h0*r2
966 vpmuludq $H4,$T4,$T0 # h4*s2
967 vpaddq $T3,$D2,$D2 # d2 += h0*r2
968 vpaddq $T0,$D1,$D1 # d1 += h4*s2
969 vmovdqa -0x30(%r11),$T3 # s3^4
970 vpmuludq $H3,$T4,$T4 # h3*s2
971 vpmuludq $H1,$T2,$T1 # h1*r3
972 vpaddq $T4,$D0,$D0 # d0 += h3*s2
974 vmovdqa -0x10(%r11),$T4 # s4^4
975 vpaddq $T1,$D4,$D4 # d4 += h1*r3
976 vpmuludq $H0,$T2,$T2 # h0*r3
977 vpmuludq $H4,$T3,$T0 # h4*s3
978 vpaddq $T2,$D3,$D3 # d3 += h0*r3
979 vpaddq $T0,$D2,$D2 # d2 += h4*s3
980 vmovdqu 16*2($inp),$T0 # load input
981 vpmuludq $H3,$T3,$T2 # h3*s3
982 vpmuludq $H2,$T3,$T3 # h2*s3
983 vpaddq $T2,$D1,$D1 # d1 += h3*s3
984 vmovdqu 16*3($inp),$T1 #
985 vpaddq $T3,$D0,$D0 # d0 += h2*s3
987 vpmuludq $H2,$T4,$H2 # h2*s4
988 vpmuludq $H3,$T4,$H3 # h3*s4
989 vpsrldq \$6,$T0,$T2 # splat input
990 vpaddq $H2,$D1,$D1 # d1 += h2*s4
991 vpmuludq $H4,$T4,$H4 # h4*s4
992 vpsrldq \$6,$T1,$T3 #
993 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
994 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
995 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
997 vpunpckhqdq $T1,$T0,$T4 # 4
998 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
999 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1001 vpunpcklqdq $T1,$T0,$T0 # 0:1
1002 vpunpcklqdq $T3,$T2,$T3 # 2:3
1004 #vpsrlq \$40,$T4,$T4 # 4
1005 vpsrldq \$`40/8`,$T4,$T4 # 4
1007 vmovdqa 0x00(%rsp),$D4 # preload r0^2
1008 vpand $MASK,$T0,$T0 # 0
1010 vpand $MASK,$T1,$T1 # 1
1011 vpand 0(%rcx),$T4,$T4 # .Lmask24
1013 vpand $MASK,$T2,$T2 # 2
1014 vpand $MASK,$T3,$T3 # 3
1015 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1017 ################################################################
1018 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1023 vpaddq $D3,$H4,$H4 # h3 -> h4
1027 vpaddq $D0,$D1,$H1 # h0 -> h1
1034 vpaddq $D1,$H2,$H2 # h1 -> h2
1038 vpaddq $D0,$H0,$H0 # h4 -> h0
1042 vpaddq $D2,$H3,$H3 # h2 -> h3
1046 vpaddq $D0,$H1,$H1 # h0 -> h1
1050 vpaddq $D3,$H4,$H4 # h3 -> h4
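# (per-limb the carry step is, illustratively,
#	c = h[i]>>26; h[i] &= 0x3ffffff; h[i+1] += c;
#  with the h4 -> h0 wrap scaled by 5, i.e. c + (c<<2), since
#  2^130 == 5; the interleaved passes above re-bound every limb
#  without a full modular reduction)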
1055 ################################################################
1056 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1058 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1069 vmovdqa $H2,0x20(%r11)
1070 vmovdqa $H0,0x00(%r11)
1071 vmovdqa $H1,0x10(%r11)
1072 vmovdqa $H3,0x30(%r11)
1073 vmovdqa $H4,0x40(%r11)
1075 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1076 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1077 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1078 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1079 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1081 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1082 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1083 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1084 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1085 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1086 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1088 vpmuludq $T3,$H2,$H0 # h3*r1
1089 vpaddq $H0,$D4,$D4 # d4 += h3*r1
1090 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1091 vpmuludq $T2,$H2,$H1 # h2*r1
1092 vpaddq $H1,$D3,$D3 # d3 += h2*r1
1093 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1094 vpmuludq $T1,$H2,$H0 # h1*r1
1095 vpaddq $H0,$D2,$D2 # d2 += h1*r1
1096 vpmuludq $T0,$H2,$H2 # h0*r1
1097 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1098 vpmuludq $T4,$H3,$H3 # h4*s1
1099 vpaddq $H3,$D0,$D0 # d0 += h4*s1
1101 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1102 vpmuludq $T2,$H4,$H1 # h2*r2
1103 vpaddq $H1,$D4,$D4 # d4 += h2*r2
1104 vpmuludq $T1,$H4,$H0 # h1*r2
1105 vpaddq $H0,$D3,$D3 # d3 += h1*r2
1106 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1107 vpmuludq $T0,$H4,$H4 # h0*r2
1108 vpaddq $H4,$D2,$D2 # d2 += h0*r2
1109 vpmuludq $T4,$H2,$H1 # h4*s2
1110 vpaddq $H1,$D1,$D1 # d1 += h4*s2
1111 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1112 vpmuludq $T3,$H2,$H2 # h3*s2
1113 vpaddq $H2,$D0,$D0 # d0 += h3*s2
1115 vpmuludq $T1,$H3,$H0 # h1*r3
1116 vpaddq $H0,$D4,$D4 # d4 += h1*r3
1117 vpmuludq $T0,$H3,$H3 # h0*r3
1118 vpaddq $H3,$D3,$D3 # d3 += h0*r3
1119 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1120 vpmuludq $T4,$H4,$H1 # h4*s3
1121 vpaddq $H1,$D2,$D2 # d2 += h4*s3
1122 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1123 vpmuludq $T3,$H4,$H0 # h3*s3
1124 vpaddq $H0,$D1,$D1 # d1 += h3*s3
1125 vpmuludq $T2,$H4,$H4 # h2*s3
1126 vpaddq $H4,$D0,$D0 # d0 += h2*s3
1128 vpmuludq $T0,$H2,$H2 # h0*r4
1129 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1130 vpmuludq $T4,$H3,$H1 # h4*s4
1131 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1132 vpmuludq $T3,$H3,$H0 # h3*s4
1133 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1134 vpmuludq $T2,$H3,$H1 # h2*s4
1135 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1136 vpmuludq $T1,$H3,$H3 # h1*s4
1137 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1141 vmovdqu 16*0($inp),$H0 # load input
1142 vmovdqu 16*1($inp),$H1
1144 vpsrldq \$6,$H0,$H2 # splat input
1146 vpunpckhqdq $H1,$H0,$H4 # 4
1147 vpunpcklqdq $H1,$H0,$H0 # 0:1
1148 vpunpcklqdq $H3,$H2,$H3 # 2:3
1150 vpsrlq \$40,$H4,$H4 # 4
1152 vpand $MASK,$H0,$H0 # 0
1154 vpand $MASK,$H1,$H1 # 1
1156 vpand $MASK,$H2,$H2 # 2
1157 vpand $MASK,$H3,$H3 # 3
1158 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1160 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1161 vpaddq 0x00(%r11),$H0,$H0
1162 vpaddq 0x10(%r11),$H1,$H1
1163 vpaddq 0x20(%r11),$H2,$H2
1164 vpaddq 0x30(%r11),$H3,$H3
1165 vpaddq 0x40(%r11),$H4,$H4
1167 ################################################################
1168 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1170 vpmuludq $H0,$T4,$T0 # h0*r0
1171 vpaddq $T0,$D0,$D0 # d0 += h0*r0
1172 vpmuludq $H1,$T4,$T1 # h1*r0
1173 vpaddq $T1,$D1,$D1 # d1 += h1*r0
1174 vpmuludq $H2,$T4,$T0 # h2*r0
1175 vpaddq $T0,$D2,$D2 # d2 += h2*r0
1176 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1177 vpmuludq $H3,$T4,$T1 # h3*r0
1178 vpaddq $T1,$D3,$D3 # d3 += h3*r0
1179 vpmuludq $H4,$T4,$T4 # h4*r0
1180 vpaddq $T4,$D4,$D4 # d4 += h4*r0
1182 vpmuludq $H3,$T2,$T0 # h3*r1
1183 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1184 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1185 vpmuludq $H2,$T2,$T1 # h2*r1
1186 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1187 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1188 vpmuludq $H1,$T2,$T0 # h1*r1
1189 vpaddq $T0,$D2,$D2 # d2 += h1*r1
1190 vpmuludq $H0,$T2,$T2 # h0*r1
1191 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1192 vpmuludq $H4,$T3,$T3 # h4*s1
1193 vpaddq $T3,$D0,$D0 # d0 += h4*s1
1195 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1196 vpmuludq $H2,$T4,$T1 # h2*r2
1197 vpaddq $T1,$D4,$D4 # d4 += h2*r2
1198 vpmuludq $H1,$T4,$T0 # h1*r2
1199 vpaddq $T0,$D3,$D3 # d3 += h1*r2
1200 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1201 vpmuludq $H0,$T4,$T4 # h0*r2
1202 vpaddq $T4,$D2,$D2 # d2 += h0*r2
1203 vpmuludq $H4,$T2,$T1 # h4*s2
1204 vpaddq $T1,$D1,$D1 # d1 += h4*s2
1205 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1206 vpmuludq $H3,$T2,$T2 # h3*s2
1207 vpaddq $T2,$D0,$D0 # d0 += h3*s2
1209 vpmuludq $H1,$T3,$T0 # h1*r3
1210 vpaddq $T0,$D4,$D4 # d4 += h1*r3
1211 vpmuludq $H0,$T3,$T3 # h0*r3
1212 vpaddq $T3,$D3,$D3 # d3 += h0*r3
1213 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1214 vpmuludq $H4,$T4,$T1 # h4*s3
1215 vpaddq $T1,$D2,$D2 # d2 += h4*s3
1216 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1217 vpmuludq $H3,$T4,$T0 # h3*s3
1218 vpaddq $T0,$D1,$D1 # d1 += h3*s3
1219 vpmuludq $H2,$T4,$T4 # h2*s3
1220 vpaddq $T4,$D0,$D0 # d0 += h2*s3
1222 vpmuludq $H0,$T2,$T2 # h0*r4
1223 vpaddq $T2,$D4,$D4 # d4 += h0*r4
1224 vpmuludq $H4,$T3,$T1 # h4*s4
1225 vpaddq $T1,$D3,$D3 # d3 += h4*s4
1226 vpmuludq $H3,$T3,$T0 # h3*s4
1227 vpaddq $T0,$D2,$D2 # d2 += h3*s4
1228 vpmuludq $H2,$T3,$T1 # h2*s4
1229 vpaddq $T1,$D1,$D1 # d1 += h2*s4
1230 vpmuludq $H1,$T3,$T3 # h1*s4
1231 vpaddq $T3,$D0,$D0 # d0 += h1*s4
1234 ################################################################
1235 # horizontal addition
1248 ################################################################
1253 vpaddq $H3,$D4,$D4 # h3 -> h4
1257 vpaddq $H0,$D1,$D1 # h0 -> h1
1264 vpaddq $H1,$D2,$D2 # h1 -> h2
1268 vpaddq $H4,$D0,$D0 # h4 -> h0
1272 vpaddq $H2,$D3,$D3 # h2 -> h3
1276 vpaddq $H0,$D1,$D1 # h0 -> h1
1280 vpaddq $H3,$D4,$D4 # h3 -> h4
1282 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1283 vmovd $D1,`4*1-48-64`($ctx)
1284 vmovd $D2,`4*2-48-64`($ctx)
1285 vmovd $D3,`4*3-48-64`($ctx)
1286 vmovd $D4,`4*4-48-64`($ctx)
1288 $code.=<<___ if ($win64);
1289 vmovdqa 0x50(%r11),%xmm6
1290 vmovdqa 0x60(%r11),%xmm7
1291 vmovdqa 0x70(%r11),%xmm8
1292 vmovdqa 0x80(%r11),%xmm9
1293 vmovdqa 0x90(%r11),%xmm10
1294 vmovdqa 0xa0(%r11),%xmm11
1295 vmovdqa 0xb0(%r11),%xmm12
1296 vmovdqa 0xc0(%r11),%xmm13
1297 vmovdqa 0xd0(%r11),%xmm14
1298 vmovdqa 0xe0(%r11),%xmm15
1302 $code.=<<___ if (!$win64);
1308 .size poly1305_blocks_avx,.-poly1305_blocks_avx
1310 .type poly1305_emit_avx,\@function,3
1313 cmpl \$0,20($ctx) # is_base2_26?
1316 mov 0($ctx),%eax # load hash value base 2^26
1322 shl \$26,%rcx # base 2^26 -> base 2^64
1338 mov %r10,%rax # could be partially reduced, so reduce
1349 add \$5,%r8 # compare to modulus
1353 shr \$2,%r10 # did 130-bit value overflow?
1357 add 0($nonce),%rax # accumulate nonce
1359 mov %rax,0($mac) # write result
1363 .size poly1305_emit_avx,.-poly1305_emit_avx
1367 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1368 map("%ymm$_",(0..15));
1372 .type poly1305_blocks_avx2,\@function,4
1374 poly1305_blocks_avx2:
1375 mov 20($ctx),%r8d # is_base2_26
1401 mov $len,%r15 # reassign $len
1403 mov 0($ctx),$d1 # load hash value
1407 mov 24($ctx),$r0 # load r
1410 ################################# base 2^26 -> base 2^64
1412 and \$`-1*(1<<31)`,$d1
1413 mov $d2,$r1 # borrow $r1
1415 and \$`-1*(1<<31)`,$d2
1429 adc \$0,$h2 # can be partially reduced...
1431 mov \$-4,$d2 # ... so reduce
1444 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1446 .Lbase2_26_pre_avx2:
1447 add 0($inp),$h0 # accumulate input
1453 call __poly1305_block
1457 jnz .Lbase2_26_pre_avx2
1459 test $padbit,$padbit # if $padbit is zero,
1460 jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1462 ################################# base 2^64 -> base 2^26
1469 and \$0x3ffffff,%rax # h[0]
1471 and \$0x3ffffff,%rdx # h[1]
1475 and \$0x3ffffff,$h0 # h[2]
1477 and \$0x3ffffff,$h1 # h[3]
1481 jz .Lstore_base2_26_avx2
1491 .Lstore_base2_64_avx2:
1494 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1498 .Lstore_base2_26_avx2:
1499 mov %rax#d,0($ctx) # store hash value base 2^26
1514 .Lblocks_avx2_epilogue:
1525 .Lbase2_64_avx2_body:
1527 mov $len,%r15 # reassign $len
1529 mov 24($ctx),$r0 # load r
1532 mov 0($ctx),$h0 # load hash value
1539 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1544 .Lbase2_64_pre_avx2:
1545 add 0($inp),$h0 # accumulate input
1551 call __poly1305_block
1555 jnz .Lbase2_64_pre_avx2
1558 ################################# base 2^64 -> base 2^26
1565 and \$0x3ffffff,%rax # h[0]
1567 and \$0x3ffffff,%rdx # h[1]
1571 and \$0x3ffffff,$h0 # h[2]
1573 and \$0x3ffffff,$h1 # h[3]
1581 movl \$1,20($ctx) # set is_base2_26
1583 call __poly1305_init_avx
1586 mov %r15,$len # restore $len
1587 mov OPENSSL_ia32cap_P+8(%rip),%r10d
1588 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1598 .Lbase2_64_avx2_epilogue:
1603 mov OPENSSL_ia32cap_P+8(%rip),%r10d
1604 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1605 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1606 vmovd 4*1($ctx),%x#$H1
1607 vmovd 4*2($ctx),%x#$H2
1608 vmovd 4*3($ctx),%x#$H3
1609 vmovd 4*4($ctx),%x#$H4
1613 $code.=<<___ if ($avx>2);
1617 cmp %r11d,%r10d # check for AVX512F+BW+VL
1621 $code.=<<___ if (!$win64);
1625 $code.=<<___ if ($win64);
1626 lea -0xf8(%rsp),%r11
1628 vmovdqa %xmm6,0x50(%r11)
1629 vmovdqa %xmm7,0x60(%r11)
1630 vmovdqa %xmm8,0x70(%r11)
1631 vmovdqa %xmm9,0x80(%r11)
1632 vmovdqa %xmm10,0x90(%r11)
1633 vmovdqa %xmm11,0xa0(%r11)
1634 vmovdqa %xmm12,0xb0(%r11)
1635 vmovdqa %xmm13,0xc0(%r11)
1636 vmovdqa %xmm14,0xd0(%r11)
1637 vmovdqa %xmm15,0xe0(%r11)
1641 lea .Lconst(%rip),%rcx
1642 lea 48+64($ctx),$ctx # size optimization
1643 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
1645 # expand and copy pre-calculated table to stack
1646 vmovdqu `16*0-64`($ctx),%x#$T2
1648 vmovdqu `16*1-64`($ctx),%x#$T3
1649 vmovdqu `16*2-64`($ctx),%x#$T4
1650 vmovdqu `16*3-64`($ctx),%x#$D0
1651 vmovdqu `16*4-64`($ctx),%x#$D1
1652 vmovdqu `16*5-64`($ctx),%x#$D2
1653 lea 0x90(%rsp),%rax # size optimization
1654 vmovdqu `16*6-64`($ctx),%x#$D3
1655 vpermd $T2,$T0,$T2 # 00003412 -> 14243444
1656 vmovdqu `16*7-64`($ctx),%x#$D4
1658 vmovdqu `16*8-64`($ctx),%x#$MASK
1660 vmovdqa $T2,0x00(%rsp)
1662 vmovdqa $T3,0x20-0x90(%rax)
1664 vmovdqa $T4,0x40-0x90(%rax)
1666 vmovdqa $D0,0x60-0x90(%rax)
1668 vmovdqa $D1,0x80-0x90(%rax)
1670 vmovdqa $D2,0xa0-0x90(%rax)
1671 vpermd $MASK,$T0,$MASK
1672 vmovdqa $D3,0xc0-0x90(%rax)
1673 vmovdqa $D4,0xe0-0x90(%rax)
1674 vmovdqa $MASK,0x100-0x90(%rax)
1675 vmovdqa 64(%rcx),$MASK # .Lmask26
1677 ################################################################
1679 vmovdqu 16*0($inp),%x#$T0
1680 vmovdqu 16*1($inp),%x#$T1
1681 vinserti128 \$1,16*2($inp),$T0,$T0
1682 vinserti128 \$1,16*3($inp),$T1,$T1
1685 vpsrldq \$6,$T0,$T2 # splat input
1687 vpunpckhqdq $T1,$T0,$T4 # 4
1688 vpunpcklqdq $T3,$T2,$T2 # 2:3
1689 vpunpcklqdq $T1,$T0,$T0 # 0:1
1694 vpsrlq \$40,$T4,$T4 # 4
1695 vpand $MASK,$T2,$T2 # 2
1696 vpand $MASK,$T0,$T0 # 0
1697 vpand $MASK,$T1,$T1 # 1
1698 vpand $MASK,$T3,$T3 # 3
1699 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1701 vpaddq $H2,$T2,$H2 # accumulate input
1708 ################################################################
1709 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1710 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1711 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1712 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1713 # \________/\__________/
1714 ################################################################
1715 #vpaddq $H2,$T2,$H2 # accumulate input
1717 vmovdqa `32*0`(%rsp),$T0 # r0^4
1719 vmovdqa `32*1`(%rsp),$T1 # r1^4
1721 vmovdqa `32*3`(%rsp),$T2 # r2^4
1723 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1724 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1726 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1727 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1728 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1729 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1730 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1732 # however, as h2 is "chronologically" the first one available, we pull the
1733 # corresponding operations up, so it's
1735 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1736 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1737 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1738 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1739 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1741 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1742 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1743 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1744 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1745 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1747 vpmuludq $H0,$T1,$T4 # h0*r1
1748 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1749 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1750 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1751 vpmuludq $H3,$T1,$T4 # h3*r1
1752 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1753 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1754 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1755 vmovdqa `32*4-0x90`(%rax),$T1 # s2
1757 vpmuludq $H0,$T0,$T4 # h0*r0
1758 vpmuludq $H1,$T0,$H2 # h1*r0
1759 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1760 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1761 vpmuludq $H3,$T0,$T4 # h3*r0
1762 vpmuludq $H4,$T0,$H2 # h4*r0
1763 vmovdqu 16*0($inp),%x#$T0 # load input
1764 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1765 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1766 vinserti128 \$1,16*2($inp),$T0,$T0
1768 vpmuludq $H3,$T1,$T4 # h3*s2
1769 vpmuludq $H4,$T1,$H2 # h4*s2
1770 vmovdqu 16*1($inp),%x#$T1
1771 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1772 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1773 vmovdqa `32*5-0x90`(%rax),$H2 # r3
1774 vpmuludq $H1,$T2,$T4 # h1*r2
1775 vpmuludq $H0,$T2,$T2 # h0*r2
1776 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1777 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1778 vinserti128 \$1,16*3($inp),$T1,$T1
1781 vpmuludq $H1,$H2,$T4 # h1*r3
1782 vpmuludq $H0,$H2,$H2 # h0*r3
1783 vpsrldq \$6,$T0,$T2 # splat input
1784 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1785 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1786 vpmuludq $H3,$T3,$T4 # h3*s3
1787 vpmuludq $H4,$T3,$H2 # h4*s3
1789 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1790 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1791 vpunpckhqdq $T1,$T0,$T4 # 4
1793 vpmuludq $H3,$S4,$H3 # h3*s4
1794 vpmuludq $H4,$S4,$H4 # h4*s4
1795 vpunpcklqdq $T1,$T0,$T0 # 0:1
1796 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1797 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1798 vpunpcklqdq $T3,$T2,$T3 # 2:3
1799 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
1800 vpmuludq $H1,$S4,$H0 # h1*s4
1801 vmovdqa 64(%rcx),$MASK # .Lmask26
1802 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1803 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1805 ################################################################
1806 # lazy reduction (interleaved with tail of input splat)
1810 vpaddq $D3,$H4,$H4 # h3 -> h4
1814 vpaddq $D0,$D1,$H1 # h0 -> h1
1823 vpaddq $D1,$H2,$H2 # h1 -> h2
1827 vpaddq $D4,$H0,$H0 # h4 -> h0
1829 vpand $MASK,$T2,$T2 # 2
1834 vpaddq $D2,$H3,$H3 # h2 -> h3
1836 vpaddq $T2,$H2,$H2 # modulo-scheduled
1841 vpaddq $D0,$H1,$H1 # h0 -> h1
1843 vpsrlq \$40,$T4,$T4 # 4
1847 vpaddq $D3,$H4,$H4 # h3 -> h4
1849 vpand $MASK,$T0,$T0 # 0
1850 vpand $MASK,$T1,$T1 # 1
1851 vpand $MASK,$T3,$T3 # 3
1852 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1859 ################################################################
1860 # while the above multiplications were by r^4 in all lanes, in the last
1861 # iteration we multiply the least significant lane by r^4 and the most
1862 # significant one by r, so this is a copy of the above except that
1863 # references to the precomputed table are displaced by 4...
1865 #vpaddq $H2,$T2,$H2 # accumulate input
1867 vmovdqu `32*0+4`(%rsp),$T0 # r0^4
1869 vmovdqu `32*1+4`(%rsp),$T1 # r1^4
1871 vmovdqu `32*3+4`(%rsp),$T2 # r2^4
1873 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
1874 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
1876 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1877 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1878 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1879 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1880 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1882 vpmuludq $H0,$T1,$T4 # h0*r1
1883 vpmuludq $H1,$T1,$H2 # h1*r1
1884 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1885 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1886 vpmuludq $H3,$T1,$T4 # h3*r1
1887 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
1888 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1889 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1891 vpmuludq $H0,$T0,$T4 # h0*r0
1892 vpmuludq $H1,$T0,$H2 # h1*r0
1893 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1894 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
1895 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1896 vpmuludq $H3,$T0,$T4 # h3*r0
1897 vpmuludq $H4,$T0,$H2 # h4*r0
1898 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1899 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1901 vpmuludq $H3,$T1,$T4 # h3*s2
1902 vpmuludq $H4,$T1,$H2 # h4*s2
1903 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1904 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1905 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
1906 vpmuludq $H1,$T2,$T4 # h1*r2
1907 vpmuludq $H0,$T2,$T2 # h0*r2
1908 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1909 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1911 vpmuludq $H1,$H2,$T4 # h1*r3
1912 vpmuludq $H0,$H2,$H2 # h0*r3
1913 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1914 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1915 vpmuludq $H3,$T3,$T4 # h3*s3
1916 vpmuludq $H4,$T3,$H2 # h4*s3
1917 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1918 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1920 vpmuludq $H3,$S4,$H3 # h3*s4
1921 vpmuludq $H4,$S4,$H4 # h4*s4
1922 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1923 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1924 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
1925 vpmuludq $H1,$S4,$H0 # h1*s4
1926 vmovdqa 64(%rcx),$MASK # .Lmask26
1927 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1928 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1930 ################################################################
1931 # horizontal addition
1944 vpermq \$0x2,$H3,$T3
1945 vpermq \$0x2,$H4,$T4
1946 vpermq \$0x2,$H0,$T0
1947 vpermq \$0x2,$D1,$T1
1948 vpermq \$0x2,$H2,$T2
1955 ################################################################
1960 vpaddq $D3,$H4,$H4 # h3 -> h4
1964 vpaddq $D0,$D1,$H1 # h0 -> h1
1971 vpaddq $D1,$H2,$H2 # h1 -> h2
1975 vpaddq $D4,$H0,$H0 # h4 -> h0
1979 vpaddq $D2,$H3,$H3 # h2 -> h3
1983 vpaddq $D0,$H1,$H1 # h0 -> h1
1987 vpaddq $D3,$H4,$H4 # h3 -> h4
1989 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
1990 vmovd %x#$H1,`4*1-48-64`($ctx)
1991 vmovd %x#$H2,`4*2-48-64`($ctx)
1992 vmovd %x#$H3,`4*3-48-64`($ctx)
1993 vmovd %x#$H4,`4*4-48-64`($ctx)
1995 $code.=<<___ if ($win64);
1996 vmovdqa 0x50(%r11),%xmm6
1997 vmovdqa 0x60(%r11),%xmm7
1998 vmovdqa 0x70(%r11),%xmm8
1999 vmovdqa 0x80(%r11),%xmm9
2000 vmovdqa 0x90(%r11),%xmm10
2001 vmovdqa 0xa0(%r11),%xmm11
2002 vmovdqa 0xb0(%r11),%xmm12
2003 vmovdqa 0xc0(%r11),%xmm13
2004 vmovdqa 0xd0(%r11),%xmm14
2005 vmovdqa 0xe0(%r11),%xmm15
2009 $code.=<<___ if (!$win64);
2015 .size poly1305_blocks_avx2,.-poly1305_blocks_avx2
2017 #######################################################################
2019 # On entry we have input length divisible by 64. But since the inner loop
2020 # processes 128 bytes per iteration, cases when the length is not divisible
2021 # by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
2022 # reason the stack layout is kept identical to poly1305_blocks_avx2. If not
2023 # for this tail, we wouldn't even have to allocate a stack frame...
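#
# Control flow, illustratively:
#
#	while (len >= 128)
#		{ multiply all 8 lanes by r^8, absorb 128 bytes }
#	if (len)		# remaining 64 bytes
#		goto .Ltail_avx2	# finished by the 4-way AVX2 code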
2025 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%ymm$_",(16..24));
2026 my ($M0,$M1,$M2,$M3,$M4) = map("%ymm$_",(25..29));
2027 my $PADBIT="%zmm30";
2028 my $GATHER="%ymm31";
2031 .type poly1305_blocks_avx512,\@function,4
2033 poly1305_blocks_avx512:
2037 $code.=<<___ if (!$win64);
2041 $code.=<<___ if ($win64);
2042 lea -0xf8(%rsp),%r11
2044 vmovdqa %xmm6,0x50(%r11)
2045 vmovdqa %xmm7,0x60(%r11)
2046 vmovdqa %xmm8,0x70(%r11)
2047 vmovdqa %xmm9,0x80(%r11)
2048 vmovdqa %xmm10,0x90(%r11)
2049 vmovdqa %xmm11,0xa0(%r11)
2050 vmovdqa %xmm12,0xb0(%r11)
2051 vmovdqa %xmm13,0xc0(%r11)
2052 vmovdqa %xmm14,0xd0(%r11)
2053 vmovdqa %xmm15,0xe0(%r11)
2057 lea .Lconst(%rip),%rcx
2058 lea 48+64($ctx),$ctx # size optimization
2059 vmovdqa 96(%rcx),$T2 # .Lpermd_avx2
2061 # expand pre-calculated table
2062 vmovdqu32 `16*0-64`($ctx),%x#$R0
2064 vmovdqu32 `16*1-64`($ctx),%x#$R1
2065 vmovdqu32 `16*2-64`($ctx),%x#$S1
2066 vmovdqu32 `16*3-64`($ctx),%x#$R2
2067 vmovdqu32 `16*4-64`($ctx),%x#$S2
2068 vmovdqu32 `16*5-64`($ctx),%x#$R3
2069 vmovdqu32 `16*6-64`($ctx),%x#$S3
2070 vmovdqu32 `16*7-64`($ctx),%x#$R4
2071 vmovdqu32 `16*8-64`($ctx),%x#$S4
2072 vpermd $R0,$T2,$R0 # 00003412 -> 14243444
2073 vmovdqa64 64(%rcx),$MASK # .Lmask26
2077 vmovdqa32 $R0,0x00(%rsp) # save in case $len%128 != 0
2078 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2080 vmovdqa32 $R1,0x20(%rsp)
2083 vmovdqa32 $S1,0x40(%rsp)
2086 vmovdqa32 $R2,0x60(%rsp)
2088 vmovdqa32 $S2,0x80(%rsp)
2089 vmovdqa32 $R3,0xa0(%rsp)
2090 vmovdqa32 $S3,0xc0(%rsp)
2091 vmovdqa32 $R4,0xe0(%rsp)
2092 vmovdqa32 $S4,0x100(%rsp)
2094 ################################################################
2095 # calculate 5th through 8th powers of the key
2097 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2098 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2099 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2100 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2101 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
2103 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2104 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2105 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2106 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2107 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2110 vpmuludq $T1,$S4,$M0
2111 vpmuludq $T1,$R0,$M1
2112 vpmuludq $T1,$R1,$M2
2113 vpmuludq $T1,$R2,$M3
2114 vpmuludq $T1,$R3,$M4
2116 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2117 vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2118 vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2119 vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2120 vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2122 vpmuludq $T2,$S3,$M0
2123 vpmuludq $T2,$S4,$M1
2124 vpmuludq $T2,$R1,$M3
2125 vpmuludq $T2,$R2,$M4
2126 vpmuludq $T2,$R0,$M2
2128 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2129 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2130 vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2131 vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2132 vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2134 vpmuludq $T3,$S2,$M0
2135 vpmuludq $T3,$R0,$M3
2136 vpmuludq $T3,$R1,$M4
2137 vpmuludq $T3,$S3,$M1
2138 vpmuludq $T3,$S4,$M2
2139 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2140 vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2141 vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2142 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2143 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2145 vpmuludq $T4,$S4,$M3
2146 vpmuludq $T4,$R0,$M4
2147 vpmuludq $T4,$S1,$M0
2148 vpmuludq $T4,$S2,$M1
2149 vpmuludq $T4,$S3,$M2
2150 vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
2151 vpaddq $M4,$D4,$D4 # d4 += r4'*r0
2152 vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
2153 vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
2154 vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
2156 ################################################################
2158 vmovdqu64 16*0($inp),%z#$T3
2159 vmovdqu64 16*4($inp),%z#$T4
2162 ################################################################
2166 vpandq $MASK,$D3,$D3
2167 vpaddq $M3,$D4,$D4 # d3 -> d4
2170 vpandq $MASK,$D0,$D0
2171 vpaddq $M0,$D1,$D1 # d0 -> d1
2174 vpandq $MASK,$D4,$D4
2177 vpandq $MASK,$D1,$D1
2178 vpaddq $M1,$D2,$D2 # d1 -> d2
2182 vpaddq $M4,$D0,$D0 # d4 -> d0
2185 vpandq $MASK,$D2,$D2
2186 vpaddq $M2,$D3,$D3 # d2 -> d3
2189 vpandq $MASK,$D0,$D0
2190 vpaddq $M0,$D1,$D1 # d0 -> d1
2193 vpandq $MASK,$D3,$D3
2194 vpaddq $M3,$D4,$D4 # d3 -> d4
2197 map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
2198 map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3));
2199 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2200 map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
2201 map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2202 map(s/%y/%z/,($MASK));
2204 ################################################################
2205 # at this point we have 14243444 in $R0-$S4 and 05060708 in $D0-$D4, ...
2208 vpunpcklqdq $T4,$T3,$T0 # transpose input
2209 vpunpckhqdq $T4,$T3,$T4
2211 # ... since input 64-bit lanes are ordered as 73625140, we could
2212 # "vperm" it to 76543210 (here and in each loop iteration), *or*
2213 # we could just flow along, hence the goal for $R0-$S4 is
2214 # 1858286838784888 ...
2216 mov \$0b0110011001100110,%eax
2217 mov \$0b1100110011001100,%r8d
2218 mov \$0b0101010101010101,%r9d
2223 vpbroadcastq %x#$D0,$M0 # 0808080808080808
2224 vpbroadcastq %x#$D1,$M1
2225 vpbroadcastq %x#$D2,$M2
2226 vpbroadcastq %x#$D3,$M3
2227 vpbroadcastq %x#$D4,$M4
2229 vpexpandd $D0,${D0}{%k1} # 05060708 -> -05--06--07--08-
2230 vpexpandd $D1,${D1}{%k1}
2231 vpexpandd $D2,${D2}{%k1}
2232 vpexpandd $D3,${D3}{%k1}
2233 vpexpandd $D4,${D4}{%k1}
2235 vpexpandd $R0,${D0}{%k2} # -05--06--07--08- -> 145-246-347-448-
2236 vpexpandd $R1,${D1}{%k2}
2237 vpexpandd $R2,${D2}{%k2}
2238 vpexpandd $R3,${D3}{%k2}
2239 vpexpandd $R4,${D4}{%k2}
2241 vpblendmd $M0,$D0,${R0}{%k3} # 1858286838784888
2242 vpblendmd $M1,$D1,${R1}{%k3}
2243 vpblendmd $M2,$D2,${R2}{%k3}
2244 vpblendmd $M3,$D3,${R3}{%k3}
2245 vpblendmd $M4,$D4,${R4}{%k3}
2247 vpslld \$2,$R1,$S1 # *5
2256 vpbroadcastq %x#$MASK,$MASK
2257 vpbroadcastq 32(%rcx),$PADBIT # .L129
2259 vpsrlq \$52,$T0,$T2 # splat input
2264 vpsrlq \$40,$T4,$T4 # 4
2265 vpandq $MASK,$T2,$T2 # 2
2266 vpandq $MASK,$T0,$T0 # 0
2267 vpandq $MASK,$T1,$T1 # 1
2268 vpandq $MASK,$T3,$T3 # 3
2269 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2271 vpaddq $H2,$T2,$H2 # accumulate input
2279 ################################################################
2280 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2281 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2282 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2283 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2284 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2285 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2286 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2287 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2288 # \________/\___________/
2289 ################################################################
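# i.e., restating the schedule above illustratively,
#
#	for (k = 0; k < 8; k++)
#		acc[k] = acc[k]*r^8 + inp[8*j+k]	# per iteration
#	hash = SUM_k acc[k]*r^(8-k)			# final combine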
2290 #vpaddq $H2,$T2,$H2 # accumulate input
2292 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2293 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2294 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2295 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2296 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2298 # however, as h2 is "chronologically" the first one available, we pull the
2299 # corresponding operations up, so it's
2301 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2302 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2303 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2304 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2305 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
2307 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2309 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2310 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2311 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2312 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2313 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2314 vpaddq $H1,$T1,$H1 # accumulate input
2318 vmovdqu64 16*0($inp),$T3 # load input
2319 vmovdqu64 16*4($inp),$T4
2321 vpmuludq $H0,$R3,$M3
2322 vpmuludq $H0,$R4,$M4
2323 vpmuludq $H0,$R0,$M0
2324 vpmuludq $H0,$R1,$M1
2325 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2326 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2327 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2328 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2330 vpmuludq $H1,$R2,$M3
2331 vpmuludq $H1,$R3,$M4
2332 vpmuludq $H1,$S4,$M0
2333 vpmuludq $H0,$R2,$M2
2334 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2335 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2336 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2337 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2339 vpunpcklqdq $T4,$T3,$T0 # transpose input
2340 vpunpckhqdq $T4,$T3,$T4
2342 vpmuludq $H3,$R0,$M3
2343 vpmuludq $H3,$R1,$M4
2344 vpmuludq $H1,$R0,$M1
2345 vpmuludq $H1,$R1,$M2
2346 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2347 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2348 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2349 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2351 vpmuludq $H4,$S4,$M3
2352 vpmuludq $H4,$R0,$M4
2353 vpmuludq $H3,$S2,$M0
2354 vpmuludq $H3,$S3,$M1
2355 vpaddq $M3,$D3,$D3 # d3 += h4*s4
2356 vpmuludq $H3,$S4,$M2
2357 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2358 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2359 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2360 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2362 vpmuludq $H4,$S1,$M0
2363 vpmuludq $H4,$S2,$M1
2364 vpmuludq $H4,$S3,$M2
2365 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2366 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2367 vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2369 ################################################################
2370 # lazy reduction (interleaved with input splat)
2372 vpsrlq \$52,$T0,$T2 # splat input
2376 vpandq $MASK,$D3,$D3
2377 vpaddq $H3,$D4,$H4 # h3 -> h4
2382 vpandq $MASK,$H0,$H0
2383 vpaddq $D0,$H1,$H1 # h0 -> h1
2385 vpandq $MASK,$T2,$T2 # 2
2388 vpandq $MASK,$H4,$H4
2391 vpandq $MASK,$H1,$H1
2392 vpaddq $D1,$H2,$H2 # h1 -> h2
2396 vpaddq $D4,$H0,$H0 # h4 -> h0
2398 vpaddq $T2,$H2,$H2 # modulo-scheduled
2402 vpandq $MASK,$H2,$H2
2403 vpaddq $D2,$D3,$H3 # h2 -> h3
2408 vpandq $MASK,$H0,$H0
2409 vpaddq $D0,$H1,$H1 # h0 -> h1
2411 vpsrlq \$40,$T4,$T4 # 4
2414 vpandq $MASK,$H3,$H3
2415 vpaddq $D3,$H4,$H4 # h3 -> h4
2417 vpandq $MASK,$T0,$T0 # 0
2418 vpandq $MASK,$T1,$T1 # 1
2419 vpandq $MASK,$T3,$T3 # 3
2420 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2426 ################################################################
2427 # while the above multiplications were by r^8 in all lanes, in the last
2428 # iteration we multiply the least significant lane by r^8 and the most
2429 # significant one by r, which is why the table gets shifted...
2431 vpsrlq \$32,$R0,$R0 # 0105020603070408
2441 ################################################################
2442 # load either next or last 64 byte of input
2443 lea ($inp,$len),$inp
2445 #vpaddq $H2,$T2,$H2 # accumulate input
2448 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2449 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2450 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2451 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2452 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2453 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2454 vpaddq $H1,$T1,$H1 # accumulate input
2458 vmovdqu64 16*0($inp),%x#$T0
2459 vpmuludq $H0,$R3,$M3
2460 vpmuludq $H0,$R4,$M4
2461 vpmuludq $H0,$R0,$M0
2462 vpmuludq $H0,$R1,$M1
2463 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2464 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2465 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2466 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2468 vmovdqu64 16*1($inp),%x#$T1
2469 vpmuludq $H1,$R2,$M3
2470 vpmuludq $H1,$R3,$M4
2471 vpmuludq $H1,$S4,$M0
2472 vpmuludq $H0,$R2,$M2
2473 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2474 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2475 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2476 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2478 vinserti64x2 \$1,16*2($inp),$T0,$T0
2479 vpmuludq $H3,$R0,$M3
2480 vpmuludq $H3,$R1,$M4
2481 vpmuludq $H1,$R0,$M1
2482 vpmuludq $H1,$R1,$M2
2483 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2484 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2485 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2486 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2488 vinserti64x2 \$1,16*3($inp),$T1,$T1
2489 vpmuludq $H4,$S4,$M3
2490 vpmuludq $H4,$R0,$M4
2491 vpmuludq $H3,$S2,$M0
2492 vpmuludq $H3,$S3,$M1
2493 vpmuludq $H3,$S4,$M2
2494 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2495 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2496 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2497 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2498 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2500 vpmuludq $H4,$S1,$M0
2501 vpmuludq $H4,$S2,$M1
2502 vpmuludq $H4,$S3,$M2
2503 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2504 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2505 vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2507 ################################################################
2508 # horizontal addition
2523 vpermq \$0x2,$H3,$D3
2524 vpermq \$0x2,$H4,$D4
2525 vpermq \$0x2,$H0,$D0
2526 vpermq \$0x2,$H1,$D1
2527 vpermq \$0x2,$H2,$D2
2534 vextracti64x4 \$0x1,$H3,%y#$D3
2535 vextracti64x4 \$0x1,$H4,%y#$D4
2536 vextracti64x4 \$0x1,$H0,%y#$D0
2537 vextracti64x4 \$0x1,$H1,%y#$D1
2538 vextracti64x4 \$0x1,$H2,%y#$D2
2539 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2540 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2541 vpaddq $D0,$H0,${H0}{%k3}{z}
2542 vpaddq $D1,$H1,${H1}{%k3}{z}
2543 vpaddq $D2,$H2,${H2}{%k3}{z}
2545 map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2546 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2548 ################################################################
2549 # lazy reduction (interleaved with input splat)
2552 vpandq $MASK,$H3,$H3
2553 vpsrldq \$6,$T0,$T2 # splat input
2555 vpunpckhqdq $T1,$T0,$T4 # 4
2556 vpaddq $D3,$H4,$H4 # h3 -> h4
2559 vpandq $MASK,$H0,$H0
2560 vpunpcklqdq $T3,$T2,$T2 # 2:3
2561 vpunpcklqdq $T1,$T0,$T0 # 0:1
2562 vpaddq $D0,$H1,$H1 # h0 -> h1
2565 vpandq $MASK,$H4,$H4
2568 vpandq $MASK,$H1,$H1
2571 vpaddq $D1,$H2,$H2 # h1 -> h2
2576 vpsrlq \$40,$T4,$T4 # 4
2577 vpaddq $D4,$H0,$H0 # h4 -> h0
2580 vpandq $MASK,$H2,$H2
2581 vpandq $MASK,$T2,$T2 # 2
2582 vpandq $MASK,$T0,$T0 # 0
2583 vpaddq $D2,$H3,$H3 # h2 -> h3
2586 vpandq $MASK,$H0,$H0
2587 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
2588 vpandq $MASK,$T1,$T1 # 1
2589 vpaddq $D0,$H1,$H1 # h0 -> h1
2592 vpandq $MASK,$H3,$H3
2593 vpandq $MASK,$T3,$T3 # 3
2594 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2595 vpaddq $D3,$H4,$H4 # h3 -> h4
2597 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2601 vpsubq $T2,$H2,$H2 # undo input accumulation
2602 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2603 vmovd %x#$H1,`4*1-48-64`($ctx)
2604 vmovd %x#$H2,`4*2-48-64`($ctx)
2605 vmovd %x#$H3,`4*3-48-64`($ctx)
2606 vmovd %x#$H4,`4*4-48-64`($ctx)
2609 $code.=<<___ if ($win64);
2610 movdqa 0x50(%r11),%xmm6
2611 movdqa 0x60(%r11),%xmm7
2612 movdqa 0x70(%r11),%xmm8
2613 movdqa 0x80(%r11),%xmm9
2614 movdqa 0x90(%r11),%xmm10
2615 movdqa 0xa0(%r11),%xmm11
2616 movdqa 0xb0(%r11),%xmm12
2617 movdqa 0xc0(%r11),%xmm13
2618 movdqa 0xd0(%r11),%xmm14
2619 movdqa 0xe0(%r11),%xmm15
2621 .Ldo_avx512_epilogue:
2623 $code.=<<___ if (!$win64);
2628 .size poly1305_blocks_avx512,.-poly1305_blocks_avx512
2631 ########################################################################
2632 # VPMADD52 version using 2^44 radix.
2634 # One can argue that base 2^52 would be more natural. Well, even though
2635 # some operations would be more natural, one has to recognize a couple of
2636 # things. First, base 2^52 provides no advantage over base 2^44 in the
2637 # amount of multiply-and-accumulate operations. Secondly, it makes it
2638 # impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2639 # reference implementations], which means that more such operations
2640 # would have to be performed in the inner loop, which in turn makes the
2641 # critical path longer. In other words, even though base 2^44 reduction
2642 # might look less elegant, the overall critical path is actually shorter...
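#
# For reference, the clamped 128-bit r splits illustratively as
#
#	r0 = r & ((1<<44)-1)
#	r1 = (r>>44) & ((1<<44)-1)
#	r2 = r>>88				# 42 bits
#
# and since the limb weights total 132 bits, wrapped product terms
# pick up a factor of 2^132 mod (2^130-5) = 20 = 5<<2, hence the
# "magic <<2" applied on top of the *5 when pre-computing s1 and s2.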
2645 .type poly1305_init_base2_44,\@function,3
2647 poly1305_init_base2_44:
2649 mov %rax,0($ctx) # initialize hash value
2654 lea poly1305_blocks_vpmadd52(%rip),%r10
2655 lea poly1305_emit_base2_44(%rip),%r11
2657 mov \$0x0ffffffc0fffffff,%rax
2658 mov \$0x0ffffffc0ffffffc,%rcx
2660 mov \$0x00000fffffffffff,%r8
2662 mov \$0x00000fffffffffff,%r9
2665 mov %r8,40($ctx) # r0
2668 mov %rax,48($ctx) # r1
2669 lea (%rax,%rax,4),%rax # *5
2670 mov %rcx,56($ctx) # r2
2671 shl \$2,%rax # magic <<2
2672 lea (%rcx,%rcx,4),%rcx # *5
2673 shl \$2,%rcx # magic <<2
2674 mov %rax,24($ctx) # s1
2675 mov %rcx,32($ctx) # s2
2677 $code.=<<___ if ($flavour !~ /elf32/);
2681 $code.=<<___ if ($flavour =~ /elf32/);
2688 .size poly1305_init_base2_44,.-poly1305_init_base2_44
2691 my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2692 my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2693 my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2696 .type poly1305_blocks_vpmadd52,\@function,4
2698 poly1305_blocks_vpmadd52:
2700 jz .Lno_data_vpmadd52 # too short
2705 lea .L2_44_inp_permd(%rip),%r10
2709 vmovq $padbit,%x#$PAD
2710 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
2711 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
2712 vpermq \$0xcf,$PAD,$PAD
2713 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
2715 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
2716 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
2717 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2718 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2720 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
2721 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
2727 vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2730 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2731 vpsrlvq $inp_shift,$T0,$T0
2732 vpandq $reduc_mask,$T0,$T0
2735 vpaddq $T0,$Dlo,$Dlo # accumulate input
2737 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
2738 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2739 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2741 vpxord $Dlo,$Dlo,$Dlo
2742 vpxord $Dhi,$Dhi,$Dhi
2744 vpmadd52luq $r2r1r0,$H0,$Dlo
2745 vpmadd52huq $r2r1r0,$H0,$Dhi
2747 vpmadd52luq $r1r0s2,$H1,$Dlo
2748 vpmadd52huq $r1r0s2,$H1,$Dhi
2750 vpmadd52luq $r0s2s1,$H2,$Dlo
2751 vpmadd52huq $r0s2s1,$H2,$Dhi
2753 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2754 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
2755 vpandq $reduc_mask,$Dlo,$Dlo
2757 vpaddq $T0,$Dhi,$Dhi
2759 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
2761 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
2763 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2764 vpandq $reduc_mask,$Dlo,$Dlo
2766 vpermq \$0b10010011,$T0,$T0
2768 vpaddq $T0,$Dlo,$Dlo
2770 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
2772 vpaddq $T0,$Dlo,$Dlo
2775 vpaddq $T0,$Dlo,$Dlo
2780 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
2784 .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
2788 .type poly1305_emit_base2_44,\@function,3
2790 poly1305_emit_base2_44:
2791 mov 0($ctx),%r8 # load hash value
2807 add \$5,%r8 # compare to modulus
2811 shr \$2,%r10 # did 130-bit value overflow?
2815 add 0($nonce),%rax # accumulate nonce
2817 mov %rax,0($mac) # write result
2821 .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
2828 .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
2830 .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
2832 .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
2834 .long 2,2,2,3,2,0,2,1
2837 .long 0,1,1,2,2,3,7,7
2841 .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
2850 .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2854 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2855 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2863 .extern __imp_RtlVirtualUnwind
2864 .type se_handler,\@abi-omnipotent
2878 mov 120($context),%rax # pull context->Rax
2879 mov 248($context),%rbx # pull context->Rip
2881 mov 8($disp),%rsi # disp->ImageBase
2882 mov 56($disp),%r11 # disp->HandlerData
2884 mov 0(%r11),%r10d # HandlerData[0]
2885 lea (%rsi,%r10),%r10 # prologue label
2886 cmp %r10,%rbx # context->Rip<.Lprologue
2887 jb .Lcommon_seh_tail
2889 mov 152($context),%rax # pull context->Rsp
2891 mov 4(%r11),%r10d # HandlerData[1]
2892 lea (%rsi,%r10),%r10 # epilogue label
2893 cmp %r10,%rbx # context->Rip>=.Lepilogue
2894 jae .Lcommon_seh_tail
2904 mov %rbx,144($context) # restore context->Rbx
2905 mov %rbp,160($context) # restore context->Rbp
2906 mov %r12,216($context) # restore context->R12
2907 mov %r13,224($context) # restore context->R13
2908 mov %r14,232($context) # restore context->R14
2909 mov %r15,240($context) # restore context->R15
2911 jmp .Lcommon_seh_tail
2912 .size se_handler,.-se_handler
2914 .type avx_handler,\@abi-omnipotent
2928 mov 120($context),%rax # pull context->Rax
2929 mov 248($context),%rbx # pull context->Rip
2931 mov 8($disp),%rsi # disp->ImageBase
2932 mov 56($disp),%r11 # disp->HandlerData
2934 mov 0(%r11),%r10d # HandlerData[0]
2935 lea (%rsi,%r10),%r10 # prologue label
2936 cmp %r10,%rbx # context->Rip<prologue label
2937 jb .Lcommon_seh_tail
2939 mov 152($context),%rax # pull context->Rsp
2941 mov 4(%r11),%r10d # HandlerData[1]
2942 lea (%rsi,%r10),%r10 # epilogue label
2943 cmp %r10,%rbx # context->Rip>=epilogue label
2944 jae .Lcommon_seh_tail
2946 mov 208($context),%rax # pull context->R11
2950 lea 512($context),%rdi # &context.Xmm6
2952 .long 0xa548f3fc # cld; rep movsq
2957 mov %rax,152($context) # restore context->Rsp
2958 mov %rsi,168($context) # restore context->Rsi
2959 mov %rdi,176($context) # restore context->Rdi
2961 mov 40($disp),%rdi # disp->ContextRecord
2962 mov $context,%rsi # context
2963 mov \$154,%ecx # sizeof(CONTEXT)
2964 .long 0xa548f3fc # cld; rep movsq
2967 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2968 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2969 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2970 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2971 mov 40(%rsi),%r10 # disp->ContextRecord
2972 lea 56(%rsi),%r11 # &disp->HandlerData
2973 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2974 mov %r10,32(%rsp) # arg5
2975 mov %r11,40(%rsp) # arg6
2976 mov %r12,48(%rsp) # arg7
2977 mov %rcx,56(%rsp) # arg8, (NULL)
2978 call *__imp_RtlVirtualUnwind(%rip)
2980 mov \$1,%eax # ExceptionContinueSearch
2992 .size avx_handler,.-avx_handler
2996 .rva .LSEH_begin_poly1305_init
2997 .rva .LSEH_end_poly1305_init
2998 .rva .LSEH_info_poly1305_init
3000 .rva .LSEH_begin_poly1305_blocks
3001 .rva .LSEH_end_poly1305_blocks
3002 .rva .LSEH_info_poly1305_blocks
3004 .rva .LSEH_begin_poly1305_emit
3005 .rva .LSEH_end_poly1305_emit
3006 .rva .LSEH_info_poly1305_emit
3008 $code.=<<___ if ($avx);
3009 .rva .LSEH_begin_poly1305_blocks_avx
3011 .rva .LSEH_info_poly1305_blocks_avx_1
3015 .rva .LSEH_info_poly1305_blocks_avx_2
3018 .rva .LSEH_end_poly1305_blocks_avx
3019 .rva .LSEH_info_poly1305_blocks_avx_3
3021 .rva .LSEH_begin_poly1305_emit_avx
3022 .rva .LSEH_end_poly1305_emit_avx
3023 .rva .LSEH_info_poly1305_emit_avx
3025 $code.=<<___ if ($avx>1);
3026 .rva .LSEH_begin_poly1305_blocks_avx2
3027 .rva .Lbase2_64_avx2
3028 .rva .LSEH_info_poly1305_blocks_avx2_1
3030 .rva .Lbase2_64_avx2
3032 .rva .LSEH_info_poly1305_blocks_avx2_2
3035 .rva .LSEH_end_poly1305_blocks_avx2
3036 .rva .LSEH_info_poly1305_blocks_avx2_3
3038 $code.=<<___ if ($avx>2);
3039 .rva .LSEH_begin_poly1305_blocks_avx512
3040 .rva .LSEH_end_poly1305_blocks_avx512
3041 .rva .LSEH_info_poly1305_blocks_avx512
3046 .LSEH_info_poly1305_init:
3049 .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
3051 .LSEH_info_poly1305_blocks:
3054 .rva .Lblocks_body,.Lblocks_epilogue
3056 .LSEH_info_poly1305_emit:
3059 .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
3061 $code.=<<___ if ($avx);
3062 .LSEH_info_poly1305_blocks_avx_1:
3065 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
3067 .LSEH_info_poly1305_blocks_avx_2:
3070 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
3072 .LSEH_info_poly1305_blocks_avx_3:
3075 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
3077 .LSEH_info_poly1305_emit_avx:
3080 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
3082 $code.=<<___ if ($avx>1);
3083 .LSEH_info_poly1305_blocks_avx2_1:
3086 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
3088 .LSEH_info_poly1305_blocks_avx2_2:
3091 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
3093 .LSEH_info_poly1305_blocks_avx2_3:
3096 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
3098 $code.=<<___ if ($avx>2);
3099 .LSEH_info_poly1305_blocks_avx512:
3102 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
3106 foreach (split('\n',$code)) {
3107 s/\`([^\`]*)\`/eval($1)/ge;
3108 s/%r([a-z]+)#d/%e$1/g;
3109 s/%r([0-9]+)#d/%r$1d/g;
3110 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;