poly1305/asm/poly1305-x86_64.pl: add VPMADD52 code path.
1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements Poly1305 hash for x86_64.
18 #
19 # March 2015
20 #
21 # Initial release.
22 #
23 # December 2016
24 #
25 # Add AVX512F+VL+BW code path.
26 #
27 # Numbers are cycles per processed byte with poly1305_blocks alone,
28 # measured with rdtsc at fixed clock frequency.
29 #
30 #               IALU/gcc-4.8(*) AVX(**)         AVX2
31 # P4            4.46/+120%      -
32 # Core 2        2.41/+90%       -
33 # Westmere      1.88/+120%      -
34 # Sandy Bridge  1.39/+140%      1.10
35 # Haswell       1.14/+175%      1.11            0.65
36 # Skylake       1.13/+120%      0.96            0.51
37 # Silvermont    2.83/+95%       -
38 # Goldmont      1.70/+180%      -
39 # VIA Nano      1.82/+150%      -
40 # Sledgehammer  1.38/+160%      -
41 # Bulldozer     2.30/+130%      0.97
42 #
43 # (*)   improvement coefficients relative to clang are more modest,
44 #       ~50% on most processors; in both cases we are comparing to
45 #       __int128 code;
46 # (**)  an SSE2 implementation was attempted; among non-AVX processors
47 #       it was faster than the integer-only code only on older Intel
48 #       P4 and Core processors, by 30-50% (less so on newer ones), but
49 #       slower on contemporary ones, e.g. almost 2x slower on Atom;
50 #       as the former are naturally disappearing, SSE2 is deemed unnecessary;
51
52 $flavour = shift;
53 $output  = shift;
54 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
55
56 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
57
58 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
60 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
61 die "can't locate x86_64-xlate.pl";
62
63 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
64                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
65         $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
66 }
67
68 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
69            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
70         $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
71         $avx += 2 if ($1==2.11 && $2>=8);
72 }
73
74 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
75            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
76         $avx = ($1>=10) + ($1>=12);
77 }
78
79 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
80         $avx = ($2>=3.0) + ($2>3.0);
81 }
82
83 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
84 *STDOUT=*OUT;
85
86 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
87 my ($mac,$nonce)=($inp,$len);   # *_emit arguments
88 my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
89 my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
90
91 sub poly1305_iteration {
92 # input:        copy of $r1 in %rax, $h0-$h2, $r0-$r1
93 # output:       $h0-$h2 *= $r0-$r1
94 $code.=<<___;
95         mulq    $h0                     # h0*r1
96         mov     %rax,$d2
97          mov    $r0,%rax
98         mov     %rdx,$d3
99
100         mulq    $h0                     # h0*r0
101         mov     %rax,$h0                # future $h0
102          mov    $r0,%rax
103         mov     %rdx,$d1
104
105         mulq    $h1                     # h1*r0
106         add     %rax,$d2
107          mov    $s1,%rax
108         adc     %rdx,$d3
109
110         mulq    $h1                     # h1*s1
111          mov    $h2,$h1                 # borrow $h1
112         add     %rax,$h0
113         adc     %rdx,$d1
114
115         imulq   $s1,$h1                 # h2*s1
116         add     $h1,$d2
117          mov    $d1,$h1
118         adc     \$0,$d3
119
120         imulq   $r0,$h2                 # h2*r0
121         add     $d2,$h1
122         mov     \$-4,%rax               # mask value
123         adc     $h2,$d3
124
125         and     $d3,%rax                # last reduction step
126         mov     $d3,$h2
127         shr     \$2,$d3
128         and     \$3,$h2
129         add     $d3,%rax
130         add     %rax,$h0
131         adc     \$0,$h1
132         adc     \$0,$h2
133 ___
134 }
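# For reference, the iteration above computes h = (h * r) "mod" 2^130-5 with
# h kept as three 64-bit words and r as two clamped 64-bit words.  A rough
# C model of the same arithmetic (illustration only, not generated code;
# the helper name and types are made up here, __int128 as in gcc/clang):
#
#   typedef unsigned long long u64;
#   typedef unsigned __int128 u128;
#
#   /* h[2] holds bits 128..129 plus a small excess, s1 = r1 + (r1 >> 2) */
#   static void poly1305_iteration_ref(u64 h[3], u64 r0, u64 r1, u64 s1)
#   {
#       u64 h0 = h[0], h1 = h[1], h2 = h[2], c;
#       u128 d0, d1;
#
#       /* r1 is a multiple of 4, so r1*2^128 == s1 (mod 2^130-5), which
#        * lets the h1*r1 and h2*r1 terms be folded down via s1 */
#       d0 = (u128)h0*r0 + (u128)h1*s1;
#       d1 = (u128)h0*r1 + (u128)h1*r0 + (u128)h2*s1;
#       h2 = h2*r0;
#
#       h0  = (u64)d0;
#       d1 += (u64)(d0 >> 64);
#       h1  = (u64)d1;
#       h2 += (u64)(d1 >> 64);
#
#       /* last reduction step: bits >= 2^130 fold back multiplied by 5,
#        * computed as (h2 & -4) + (h2 >> 2), same as the -4 mask above */
#       c   = (h2 & ~(u64)3) + (h2 >> 2);
#       h2 &= 3;
#       h0 += c;  c = (h0 < c);
#       h1 += c;  c = (h1 < c);
#       h2 += c;
#
#       h[0] = h0; h[1] = h1; h[2] = h2;
#   }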
135
136 ########################################################################
137 # The layout of the opaque area is as follows.
138 #
139 #       unsigned __int64 h[3];          # current hash value base 2^64
140 #       unsigned __int64 r[2];          # key value base 2^64
141
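#
# In other words, the scalar code below treats the context as if it were
# (illustration only, field names made up here):
#
#       struct { unsigned long long h[3], r[2]; };
#
# with h at offsets 0..16 and the clamped r at offsets 24/32.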
142 $code.=<<___;
143 .text
144
145 .extern OPENSSL_ia32cap_P
146
147 .globl  poly1305_init
148 .hidden poly1305_init
149 .globl  poly1305_blocks
150 .hidden poly1305_blocks
151 .globl  poly1305_emit
152 .hidden poly1305_emit
153
154 .type   poly1305_init,\@function,3
155 .align  32
156 poly1305_init:
157         xor     %rax,%rax
158         mov     %rax,0($ctx)            # initialize hash value
159         mov     %rax,8($ctx)
160         mov     %rax,16($ctx)
161
162         cmp     \$0,$inp
163         je      .Lno_key
164
165         lea     poly1305_blocks(%rip),%r10
166         lea     poly1305_emit(%rip),%r11
167 ___
168 $code.=<<___    if ($avx);
169         mov     OPENSSL_ia32cap_P+4(%rip),%r9
170         lea     poly1305_blocks_avx(%rip),%rax
171         lea     poly1305_emit_avx(%rip),%rcx
172         bt      \$`60-32`,%r9           # AVX?
173         cmovc   %rax,%r10
174         cmovc   %rcx,%r11
175 ___
176 $code.=<<___    if ($avx>1);
177         lea     poly1305_blocks_avx2(%rip),%rax
178         bt      \$`5+32`,%r9            # AVX2?
179         cmovc   %rax,%r10
180 ___
181 $code.=<<___    if ($avx>3);
182         mov     \$`(1<<31|1<<21|1<<16)`,%rax
183         shr     \$32,%r9
184         and     %rax,%r9
185         cmp     %rax,%r9
186         je      .Linit_base2_44
187 ___
188 $code.=<<___;
189         mov     \$0x0ffffffc0fffffff,%rax
190         mov     \$0x0ffffffc0ffffffc,%rcx
191         and     0($inp),%rax
192         and     8($inp),%rcx
193         mov     %rax,24($ctx)
194         mov     %rcx,32($ctx)
195 ___
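# The two masks above implement the standard Poly1305 clamping of r (see
# RFC 7539): the top four bits of key bytes 3, 7, 11, 15 and the bottom two
# bits of key bytes 4, 8, 12 are cleared, which bounds the limb products in
# the multiplication.  As a C sketch (illustration only, made-up helper,
# assuming a little-endian host as on x86_64):
#
#   #include <string.h>
#
#   static void poly1305_clamp_ref(unsigned long long r[2],
#                                  const unsigned char key[16])
#   {
#       memcpy(r, key, 16);                 /* little-endian load */
#       r[0] &= 0x0ffffffc0fffffffULL;      /* same masks as above */
#       r[1] &= 0x0ffffffc0ffffffcULL;
#   }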
196 $code.=<<___    if ($flavour !~ /elf32/);
197         mov     %r10,0(%rdx)
198         mov     %r11,8(%rdx)
199 ___
200 $code.=<<___    if ($flavour =~ /elf32/);
201         mov     %r10d,0(%rdx)
202         mov     %r11d,4(%rdx)
203 ___
204 $code.=<<___;
205         mov     \$1,%eax
206 .Lno_key:
207         ret
208 .size   poly1305_init,.-poly1305_init
209
210 .type   poly1305_blocks,\@function,4
211 .align  32
212 poly1305_blocks:
213 .Lblocks:
214         shr     \$4,$len
215         jz      .Lno_data               # too short
216
217         push    %rbx
218         push    %rbp
219         push    %r12
220         push    %r13
221         push    %r14
222         push    %r15
223 .Lblocks_body:
224
225         mov     $len,%r15               # reassign $len
226
227         mov     24($ctx),$r0            # load r
228         mov     32($ctx),$s1
229
230         mov     0($ctx),$h0             # load hash value
231         mov     8($ctx),$h1
232         mov     16($ctx),$h2
233
234         mov     $s1,$r1
235         shr     \$2,$s1
236         mov     $r1,%rax
237         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
238         jmp     .Loop
239
240 .align  32
241 .Loop:
242         add     0($inp),$h0             # accumulate input
243         adc     8($inp),$h1
244         lea     16($inp),$inp
245         adc     $padbit,$h2
246 ___
247         &poly1305_iteration();
248 $code.=<<___;
249         mov     $r1,%rax
250         dec     %r15                    # len-=16
251         jnz     .Loop
252
253         mov     $h0,0($ctx)             # store hash value
254         mov     $h1,8($ctx)
255         mov     $h2,16($ctx)
256
257         mov     0(%rsp),%r15
258         mov     8(%rsp),%r14
259         mov     16(%rsp),%r13
260         mov     24(%rsp),%r12
261         mov     32(%rsp),%rbp
262         mov     40(%rsp),%rbx
263         lea     48(%rsp),%rsp
264 .Lno_data:
265 .Lblocks_epilogue:
266         ret
267 .size   poly1305_blocks,.-poly1305_blocks
268
269 .type   poly1305_emit,\@function,3
270 .align  32
271 poly1305_emit:
272 .Lemit:
273         mov     0($ctx),%r8     # load hash value
274         mov     8($ctx),%r9
275         mov     16($ctx),%r10
276
277         mov     %r8,%rax
278         add     \$5,%r8         # compare to modulus
279         mov     %r9,%rcx
280         adc     \$0,%r9
281         adc     \$0,%r10
282         shr     \$2,%r10        # did 130-bit value overflow?
283         cmovnz  %r8,%rax
284         cmovnz  %r9,%rcx
285
286         add     0($nonce),%rax  # accumulate nonce
287         adc     8($nonce),%rcx
288         mov     %rax,0($mac)    # write result
289         mov     %rcx,8($mac)
290
291         ret
292 .size   poly1305_emit,.-poly1305_emit
293 ___
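# poly1305_emit above performs the final reduction modulo 2^130-5 without
# branching: it adds 5 to the accumulator and, if that carries into bit 130
# (i.e. h >= 2^130-5), keeps the incremented value, then adds the 128-bit
# nonce and discards the final carry.  A C model of the same selection
# (illustration only; the helper name is made up, __int128 as in gcc/clang):
#
#   #include <string.h>
#
#   static void poly1305_emit_ref(const unsigned long long h[3],
#                                 unsigned char mac[16],
#                                 const unsigned long long nonce[2])
#   {
#       unsigned long long g0, g1, g2, mask, h0, h1;
#       unsigned __int128 t;
#
#       g0 = h[0] + 5;                      /* speculatively compute h + 5 */
#       g1 = h[1] + (g0 < h[0]);
#       g2 = h[2] + (g1 < h[1]);
#
#       /* all-ones iff h + 5 reached 2^130, i.e. h >= 2^130 - 5 */
#       mask = 0 - (unsigned long long)((g2 >> 2) != 0);
#       h0 = (g0 & mask) | (h[0] & ~mask);
#       h1 = (g1 & mask) | (h[1] & ~mask);
#
#       /* add the nonce modulo 2^128 and store little-endian */
#       t  = (unsigned __int128)h0 + nonce[0];
#       h0 = (unsigned long long)t;
#       h1 = h1 + nonce[1] + (unsigned long long)(t >> 64);
#       memcpy(mac,     &h0, 8);
#       memcpy(mac + 8, &h1, 8);
#   }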
294 if ($avx) {
295
296 ########################################################################
297 # The layout of the opaque area is as follows.
298 #
299 #       unsigned __int32 h[5];          # current hash value base 2^26
300 #       unsigned __int32 is_base2_26;
301 #       unsigned __int64 r[2];          # key value base 2^64
302 #       unsigned __int64 pad;
303 #       struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
304 #
305 # where r^n are the base 2^26 digits of the powers of the multiplier key.
306 # There are 5 digits, but the last four are interleaved with their multiples
307 # of 5, for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
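#
# For illustration, the table can be pictured as being filled like this
# (made-up names, not generated code; rN[] are the five 26-bit digits of
# the corresponding power of r):
#
#   unsigned int r1[5], r2[5], r3[5], r4[5];   /* digits of r, r^2, r^3, r^4 */
#   unsigned int table[9][4];
#   int i, j;
#
#   table[0][0] = r2[0]; table[0][1] = r1[0];
#   table[0][2] = r4[0]; table[0][3] = r3[0];
#   for (i = 1; i < 5; i++) {
#       table[2*i-1][0] = r2[i]; table[2*i-1][1] = r1[i];
#       table[2*i-1][2] = r4[i]; table[2*i-1][3] = r3[i];
#       for (j = 0; j < 4; j++)
#           table[2*i][j] = 5 * table[2*i-1][j];
#   }
#
# so that one 16-byte row holds the same digit of all four powers, and a
# single vpshufd can replicate either the r^2:r^1 pair or the r^4:r^3 pair
# across both 64-bit lanes.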
308
309 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
310     map("%xmm$_",(0..15));
311
312 $code.=<<___;
313 .type   __poly1305_block,\@abi-omnipotent
314 .align  32
315 __poly1305_block:
316 ___
317         &poly1305_iteration();
318 $code.=<<___;
319         ret
320 .size   __poly1305_block,.-__poly1305_block
321
322 .type   __poly1305_init_avx,\@abi-omnipotent
323 .align  32
324 __poly1305_init_avx:
325         mov     $r0,$h0
326         mov     $r1,$h1
327         xor     $h2,$h2
328
329         lea     48+64($ctx),$ctx        # size optimization
330
331         mov     $r1,%rax
332         call    __poly1305_block        # r^2
333
334         mov     \$0x3ffffff,%eax        # save interleaved r^2 and r base 2^26
335         mov     \$0x3ffffff,%edx
336         mov     $h0,$d1
337         and     $h0#d,%eax
338         mov     $r0,$d2
339         and     $r0#d,%edx
340         mov     %eax,`16*0+0-64`($ctx)
341         shr     \$26,$d1
342         mov     %edx,`16*0+4-64`($ctx)
343         shr     \$26,$d2
344
345         mov     \$0x3ffffff,%eax
346         mov     \$0x3ffffff,%edx
347         and     $d1#d,%eax
348         and     $d2#d,%edx
349         mov     %eax,`16*1+0-64`($ctx)
350         lea     (%rax,%rax,4),%eax      # *5
351         mov     %edx,`16*1+4-64`($ctx)
352         lea     (%rdx,%rdx,4),%edx      # *5
353         mov     %eax,`16*2+0-64`($ctx)
354         shr     \$26,$d1
355         mov     %edx,`16*2+4-64`($ctx)
356         shr     \$26,$d2
357
358         mov     $h1,%rax
359         mov     $r1,%rdx
360         shl     \$12,%rax
361         shl     \$12,%rdx
362         or      $d1,%rax
363         or      $d2,%rdx
364         and     \$0x3ffffff,%eax
365         and     \$0x3ffffff,%edx
366         mov     %eax,`16*3+0-64`($ctx)
367         lea     (%rax,%rax,4),%eax      # *5
368         mov     %edx,`16*3+4-64`($ctx)
369         lea     (%rdx,%rdx,4),%edx      # *5
370         mov     %eax,`16*4+0-64`($ctx)
371         mov     $h1,$d1
372         mov     %edx,`16*4+4-64`($ctx)
373         mov     $r1,$d2
374
375         mov     \$0x3ffffff,%eax
376         mov     \$0x3ffffff,%edx
377         shr     \$14,$d1
378         shr     \$14,$d2
379         and     $d1#d,%eax
380         and     $d2#d,%edx
381         mov     %eax,`16*5+0-64`($ctx)
382         lea     (%rax,%rax,4),%eax      # *5
383         mov     %edx,`16*5+4-64`($ctx)
384         lea     (%rdx,%rdx,4),%edx      # *5
385         mov     %eax,`16*6+0-64`($ctx)
386         shr     \$26,$d1
387         mov     %edx,`16*6+4-64`($ctx)
388         shr     \$26,$d2
389
390         mov     $h2,%rax
391         shl     \$24,%rax
392         or      %rax,$d1
393         mov     $d1#d,`16*7+0-64`($ctx)
394         lea     ($d1,$d1,4),$d1         # *5
395         mov     $d2#d,`16*7+4-64`($ctx)
396         lea     ($d2,$d2,4),$d2         # *5
397         mov     $d1#d,`16*8+0-64`($ctx)
398         mov     $d2#d,`16*8+4-64`($ctx)
399
400         mov     $r1,%rax
401         call    __poly1305_block        # r^3
402
403         mov     \$0x3ffffff,%eax        # save r^3 base 2^26
404         mov     $h0,$d1
405         and     $h0#d,%eax
406         shr     \$26,$d1
407         mov     %eax,`16*0+12-64`($ctx)
408
409         mov     \$0x3ffffff,%edx
410         and     $d1#d,%edx
411         mov     %edx,`16*1+12-64`($ctx)
412         lea     (%rdx,%rdx,4),%edx      # *5
413         shr     \$26,$d1
414         mov     %edx,`16*2+12-64`($ctx)
415
416         mov     $h1,%rax
417         shl     \$12,%rax
418         or      $d1,%rax
419         and     \$0x3ffffff,%eax
420         mov     %eax,`16*3+12-64`($ctx)
421         lea     (%rax,%rax,4),%eax      # *5
422         mov     $h1,$d1
423         mov     %eax,`16*4+12-64`($ctx)
424
425         mov     \$0x3ffffff,%edx
426         shr     \$14,$d1
427         and     $d1#d,%edx
428         mov     %edx,`16*5+12-64`($ctx)
429         lea     (%rdx,%rdx,4),%edx      # *5
430         shr     \$26,$d1
431         mov     %edx,`16*6+12-64`($ctx)
432
433         mov     $h2,%rax
434         shl     \$24,%rax
435         or      %rax,$d1
436         mov     $d1#d,`16*7+12-64`($ctx)
437         lea     ($d1,$d1,4),$d1         # *5
438         mov     $d1#d,`16*8+12-64`($ctx)
439
440         mov     $r1,%rax
441         call    __poly1305_block        # r^4
442
443         mov     \$0x3ffffff,%eax        # save r^4 base 2^26
444         mov     $h0,$d1
445         and     $h0#d,%eax
446         shr     \$26,$d1
447         mov     %eax,`16*0+8-64`($ctx)
448
449         mov     \$0x3ffffff,%edx
450         and     $d1#d,%edx
451         mov     %edx,`16*1+8-64`($ctx)
452         lea     (%rdx,%rdx,4),%edx      # *5
453         shr     \$26,$d1
454         mov     %edx,`16*2+8-64`($ctx)
455
456         mov     $h1,%rax
457         shl     \$12,%rax
458         or      $d1,%rax
459         and     \$0x3ffffff,%eax
460         mov     %eax,`16*3+8-64`($ctx)
461         lea     (%rax,%rax,4),%eax      # *5
462         mov     $h1,$d1
463         mov     %eax,`16*4+8-64`($ctx)
464
465         mov     \$0x3ffffff,%edx
466         shr     \$14,$d1
467         and     $d1#d,%edx
468         mov     %edx,`16*5+8-64`($ctx)
469         lea     (%rdx,%rdx,4),%edx      # *5
470         shr     \$26,$d1
471         mov     %edx,`16*6+8-64`($ctx)
472
473         mov     $h2,%rax
474         shl     \$24,%rax
475         or      %rax,$d1
476         mov     $d1#d,`16*7+8-64`($ctx)
477         lea     ($d1,$d1,4),$d1         # *5
478         mov     $d1#d,`16*8+8-64`($ctx)
479
480         lea     -48-64($ctx),$ctx       # size [de-]optimization
481         ret
482 .size   __poly1305_init_avx,.-__poly1305_init_avx
483
484 .type   poly1305_blocks_avx,\@function,4
485 .align  32
486 poly1305_blocks_avx:
487         mov     20($ctx),%r8d           # is_base2_26
488         cmp     \$128,$len
489         jae     .Lblocks_avx
490         test    %r8d,%r8d
491         jz      .Lblocks
492
493 .Lblocks_avx:
494         and     \$-16,$len
495         jz      .Lno_data_avx
496
497         vzeroupper
498
499         test    %r8d,%r8d
500         jz      .Lbase2_64_avx
501
502         test    \$31,$len
503         jz      .Leven_avx
504
505         push    %rbx
506         push    %rbp
507         push    %r12
508         push    %r13
509         push    %r14
510         push    %r15
511 .Lblocks_avx_body:
512
513         mov     $len,%r15               # reassign $len
514
515         mov     0($ctx),$d1             # load hash value
516         mov     8($ctx),$d2
517         mov     16($ctx),$h2#d
518
519         mov     24($ctx),$r0            # load r
520         mov     32($ctx),$s1
521
522         ################################# base 2^26 -> base 2^64
523         mov     $d1#d,$h0#d
524         and     \$`-1*(1<<31)`,$d1
525         mov     $d2,$r1                 # borrow $r1
526         mov     $d2#d,$h1#d
527         and     \$`-1*(1<<31)`,$d2
528
529         shr     \$6,$d1
530         shl     \$52,$r1
531         add     $d1,$h0
532         shr     \$12,$h1
533         shr     \$18,$d2
534         add     $r1,$h0
535         adc     $d2,$h1
536
537         mov     $h2,$d1
538         shl     \$40,$d1
539         shr     \$24,$h2
540         add     $d1,$h1
541         adc     \$0,$h2                 # can be partially reduced...
542
543         mov     \$-4,$d2                # ... so reduce
544         mov     $h2,$d1
545         and     $h2,$d2
546         shr     \$2,$d1
547         and     \$3,$h2
548         add     $d2,$d1                 # =*5
549         add     $d1,$h0
550         adc     \$0,$h1
551         adc     \$0,$h2
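        # The block above is, in C terms, roughly the following (illustration
        # only, made-up helper; t[] are the five stored 26-bit digits,
        # __int128 as in gcc/clang):
        #
        #   static void from_base26_ref(unsigned long long h[3],
        #                               const unsigned int t[5])
        #   {
        #       unsigned __int128 lo, hi;
        #       unsigned long long c;
        #
        #       /* value = t0 + t1*2^26 + t2*2^52 + t3*2^78 + t4*2^104 */
        #       lo = (unsigned __int128)t[0]
        #          + ((unsigned __int128)t[1] << 26)
        #          + ((unsigned __int128)t[2] << 52)
        #          + ((unsigned __int128)t[3] << 78);
        #       hi = (unsigned __int128)t[4] << 40;  /* t4*2^104 = (t4<<40)*2^64 */
        #
        #       hi  += lo >> 64;
        #       h[0] = (unsigned long long)lo;
        #       h[1] = (unsigned long long)hi;
        #       h[2] = (unsigned long long)(hi >> 64);
        #
        #       /* the vector code leaves the digits only partially carried,
        #        * so h[2] may exceed 3; fold the excess above 2^130 back,
        #        * multiplied by 5 */
        #       c     = (h[2] & ~(unsigned long long)3) + (h[2] >> 2);
        #       h[2] &= 3;
        #       h[0] += c;  c = (h[0] < c);
        #       h[1] += c;  c = (h[1] < c);
        #       h[2] += c;
        #   }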
552
553         mov     $s1,$r1
554         mov     $s1,%rax
555         shr     \$2,$s1
556         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
557
558         add     0($inp),$h0             # accumulate input
559         adc     8($inp),$h1
560         lea     16($inp),$inp
561         adc     $padbit,$h2
562
563         call    __poly1305_block
564
565         test    $padbit,$padbit         # if $padbit is zero,
566         jz      .Lstore_base2_64_avx    # store hash in base 2^64 format
567
568         ################################# base 2^64 -> base 2^26
569         mov     $h0,%rax
570         mov     $h0,%rdx
571         shr     \$52,$h0
572         mov     $h1,$r0
573         mov     $h1,$r1
574         shr     \$26,%rdx
575         and     \$0x3ffffff,%rax        # h[0]
576         shl     \$12,$r0
577         and     \$0x3ffffff,%rdx        # h[1]
578         shr     \$14,$h1
579         or      $r0,$h0
580         shl     \$24,$h2
581         and     \$0x3ffffff,$h0         # h[2]
582         shr     \$40,$r1
583         and     \$0x3ffffff,$h1         # h[3]
584         or      $r1,$h2                 # h[4]
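        # The split above is the inverse conversion; as a C sketch
        # (illustration only, made-up helper), with d0:d1:d2 the base 2^64
        # words and d2 <= 3:
        #
        #   static void to_base26_ref(unsigned int t[5], unsigned long long d0,
        #                             unsigned long long d1, unsigned long long d2)
        #   {
        #       t[0] =  d0                       & 0x3ffffff;
        #       t[1] = (d0 >> 26)                & 0x3ffffff;
        #       t[2] = ((d0 >> 52) | (d1 << 12)) & 0x3ffffff;
        #       t[3] = (d1 >> 14)                & 0x3ffffff;
        #       t[4] = (unsigned int)((d1 >> 40) | (d2 << 24));
        #   }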
585
586         sub     \$16,%r15
587         jz      .Lstore_base2_26_avx
588
589         vmovd   %rax#d,$H0
590         vmovd   %rdx#d,$H1
591         vmovd   $h0#d,$H2
592         vmovd   $h1#d,$H3
593         vmovd   $h2#d,$H4
594         jmp     .Lproceed_avx
595
596 .align  32
597 .Lstore_base2_64_avx:
598         mov     $h0,0($ctx)
599         mov     $h1,8($ctx)
600         mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
601         jmp     .Ldone_avx
602
603 .align  16
604 .Lstore_base2_26_avx:
605         mov     %rax#d,0($ctx)          # store hash value base 2^26
606         mov     %rdx#d,4($ctx)
607         mov     $h0#d,8($ctx)
608         mov     $h1#d,12($ctx)
609         mov     $h2#d,16($ctx)
610 .align  16
611 .Ldone_avx:
612         mov     0(%rsp),%r15
613         mov     8(%rsp),%r14
614         mov     16(%rsp),%r13
615         mov     24(%rsp),%r12
616         mov     32(%rsp),%rbp
617         mov     40(%rsp),%rbx
618         lea     48(%rsp),%rsp
619 .Lno_data_avx:
620 .Lblocks_avx_epilogue:
621         ret
622
623 .align  32
624 .Lbase2_64_avx:
625         push    %rbx
626         push    %rbp
627         push    %r12
628         push    %r13
629         push    %r14
630         push    %r15
631 .Lbase2_64_avx_body:
632
633         mov     $len,%r15               # reassign $len
634
635         mov     24($ctx),$r0            # load r
636         mov     32($ctx),$s1
637
638         mov     0($ctx),$h0             # load hash value
639         mov     8($ctx),$h1
640         mov     16($ctx),$h2#d
641
642         mov     $s1,$r1
643         mov     $s1,%rax
644         shr     \$2,$s1
645         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
646
647         test    \$31,$len
648         jz      .Linit_avx
649
650         add     0($inp),$h0             # accumulate input
651         adc     8($inp),$h1
652         lea     16($inp),$inp
653         adc     $padbit,$h2
654         sub     \$16,%r15
655
656         call    __poly1305_block
657
658 .Linit_avx:
659         ################################# base 2^64 -> base 2^26
660         mov     $h0,%rax
661         mov     $h0,%rdx
662         shr     \$52,$h0
663         mov     $h1,$d1
664         mov     $h1,$d2
665         shr     \$26,%rdx
666         and     \$0x3ffffff,%rax        # h[0]
667         shl     \$12,$d1
668         and     \$0x3ffffff,%rdx        # h[1]
669         shr     \$14,$h1
670         or      $d1,$h0
671         shl     \$24,$h2
672         and     \$0x3ffffff,$h0         # h[2]
673         shr     \$40,$d2
674         and     \$0x3ffffff,$h1         # h[3]
675         or      $d2,$h2                 # h[4]
676
677         vmovd   %rax#d,$H0
678         vmovd   %rdx#d,$H1
679         vmovd   $h0#d,$H2
680         vmovd   $h1#d,$H3
681         vmovd   $h2#d,$H4
682         movl    \$1,20($ctx)            # set is_base2_26
683
684         call    __poly1305_init_avx
685
686 .Lproceed_avx:
687         mov     %r15,$len
688
689         mov     0(%rsp),%r15
690         mov     8(%rsp),%r14
691         mov     16(%rsp),%r13
692         mov     24(%rsp),%r12
693         mov     32(%rsp),%rbp
694         mov     40(%rsp),%rbx
695         lea     48(%rsp),%rax
696         lea     48(%rsp),%rsp
697 .Lbase2_64_avx_epilogue:
698         jmp     .Ldo_avx
699
700 .align  32
701 .Leven_avx:
702         vmovd           4*0($ctx),$H0           # load hash value
703         vmovd           4*1($ctx),$H1
704         vmovd           4*2($ctx),$H2
705         vmovd           4*3($ctx),$H3
706         vmovd           4*4($ctx),$H4
707
708 .Ldo_avx:
709 ___
710 $code.=<<___    if (!$win64);
711         lea             -0x58(%rsp),%r11
712         sub             \$0x178,%rsp
713 ___
714 $code.=<<___    if ($win64);
715         lea             -0xf8(%rsp),%r11
716         sub             \$0x218,%rsp
717         vmovdqa         %xmm6,0x50(%r11)
718         vmovdqa         %xmm7,0x60(%r11)
719         vmovdqa         %xmm8,0x70(%r11)
720         vmovdqa         %xmm9,0x80(%r11)
721         vmovdqa         %xmm10,0x90(%r11)
722         vmovdqa         %xmm11,0xa0(%r11)
723         vmovdqa         %xmm12,0xb0(%r11)
724         vmovdqa         %xmm13,0xc0(%r11)
725         vmovdqa         %xmm14,0xd0(%r11)
726         vmovdqa         %xmm15,0xe0(%r11)
727 .Ldo_avx_body:
728 ___
729 $code.=<<___;
730         sub             \$64,$len
731         lea             -32($inp),%rax
732         cmovc           %rax,$inp
733
734         vmovdqu         `16*3`($ctx),$D4        # preload r0^2
735         lea             `16*3+64`($ctx),$ctx    # size optimization
736         lea             .Lconst(%rip),%rcx
737
738         ################################################################
739         # load input
740         vmovdqu         16*2($inp),$T0
741         vmovdqu         16*3($inp),$T1
742         vmovdqa         64(%rcx),$MASK          # .Lmask26
743
744         vpsrldq         \$6,$T0,$T2             # splat input
745         vpsrldq         \$6,$T1,$T3
746         vpunpckhqdq     $T1,$T0,$T4             # 4
747         vpunpcklqdq     $T1,$T0,$T0             # 0:1
748         vpunpcklqdq     $T3,$T2,$T3             # 2:3
749
750         vpsrlq          \$40,$T4,$T4            # 4
751         vpsrlq          \$26,$T0,$T1
752         vpand           $MASK,$T0,$T0           # 0
753         vpsrlq          \$4,$T3,$T2
754         vpand           $MASK,$T1,$T1           # 1
755         vpsrlq          \$30,$T3,$T3
756         vpand           $MASK,$T2,$T2           # 2
757         vpand           $MASK,$T3,$T3           # 3
758         vpor            32(%rcx),$T4,$T4        # padbit, yes, always
759
760         jbe             .Lskip_loop_avx
761
762         # expand and copy pre-calculated table to stack
763         vmovdqu         `16*1-64`($ctx),$D1
764         vmovdqu         `16*2-64`($ctx),$D2
765         vpshufd         \$0xEE,$D4,$D3          # 34xx -> 3434
766         vpshufd         \$0x44,$D4,$D0          # xx12 -> 1212
767         vmovdqa         $D3,-0x90(%r11)
768         vmovdqa         $D0,0x00(%rsp)
769         vpshufd         \$0xEE,$D1,$D4
770         vmovdqu         `16*3-64`($ctx),$D0
771         vpshufd         \$0x44,$D1,$D1
772         vmovdqa         $D4,-0x80(%r11)
773         vmovdqa         $D1,0x10(%rsp)
774         vpshufd         \$0xEE,$D2,$D3
775         vmovdqu         `16*4-64`($ctx),$D1
776         vpshufd         \$0x44,$D2,$D2
777         vmovdqa         $D3,-0x70(%r11)
778         vmovdqa         $D2,0x20(%rsp)
779         vpshufd         \$0xEE,$D0,$D4
780         vmovdqu         `16*5-64`($ctx),$D2
781         vpshufd         \$0x44,$D0,$D0
782         vmovdqa         $D4,-0x60(%r11)
783         vmovdqa         $D0,0x30(%rsp)
784         vpshufd         \$0xEE,$D1,$D3
785         vmovdqu         `16*6-64`($ctx),$D0
786         vpshufd         \$0x44,$D1,$D1
787         vmovdqa         $D3,-0x50(%r11)
788         vmovdqa         $D1,0x40(%rsp)
789         vpshufd         \$0xEE,$D2,$D4
790         vmovdqu         `16*7-64`($ctx),$D1
791         vpshufd         \$0x44,$D2,$D2
792         vmovdqa         $D4,-0x40(%r11)
793         vmovdqa         $D2,0x50(%rsp)
794         vpshufd         \$0xEE,$D0,$D3
795         vmovdqu         `16*8-64`($ctx),$D2
796         vpshufd         \$0x44,$D0,$D0
797         vmovdqa         $D3,-0x30(%r11)
798         vmovdqa         $D0,0x60(%rsp)
799         vpshufd         \$0xEE,$D1,$D4
800         vpshufd         \$0x44,$D1,$D1
801         vmovdqa         $D4,-0x20(%r11)
802         vmovdqa         $D1,0x70(%rsp)
803         vpshufd         \$0xEE,$D2,$D3
804          vmovdqa        0x00(%rsp),$D4          # preload r0^2
805         vpshufd         \$0x44,$D2,$D2
806         vmovdqa         $D3,-0x10(%r11)
807         vmovdqa         $D2,0x80(%rsp)
808
809         jmp             .Loop_avx
810
811 .align  32
812 .Loop_avx:
813         ################################################################
814         # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
815         # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
816         #   \___________________/
817         # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
818         # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
819         #   \___________________/ \____________________/
820         #
821         # Note that we start with inp[2:3]*r^2. This is because it
822         # doesn't depend on the reduction in the previous iteration.
823         ################################################################
824         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
825         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
826         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
827         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
828         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
829         #
830         # though note that $Tx and $Hx are "reversed" in this section,
831         # and $D4 is preloaded with r0^2...
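        #
        # Each product h[i]*r[j] carries weight 2^(26*(i+j)); when i+j >= 5
        # that weight is 2^130 * 2^(26*(i+j-5)) == 5 * 2^(26*(i+j-5)) modulo
        # 2^130-5, which is why the precomputed 5*r digits ("s" values)
        # appear.  A scalar C model of what each 64-bit lane accumulates
        # (illustration only, made-up helper; s[i] = 5*r[i]):
        #
        #   static void mul_base26_ref(unsigned long long d[5],
        #                              const unsigned int h[5],
        #                              const unsigned int r[5],
        #                              const unsigned int s[5])
        #   {
        #       d[0] = (unsigned long long)h[0]*r[0] + (unsigned long long)h[1]*s[4]
        #            + (unsigned long long)h[2]*s[3] + (unsigned long long)h[3]*s[2]
        #            + (unsigned long long)h[4]*s[1];
        #       d[1] = (unsigned long long)h[0]*r[1] + (unsigned long long)h[1]*r[0]
        #            + (unsigned long long)h[2]*s[4] + (unsigned long long)h[3]*s[3]
        #            + (unsigned long long)h[4]*s[2];
        #       d[2] = (unsigned long long)h[0]*r[2] + (unsigned long long)h[1]*r[1]
        #            + (unsigned long long)h[2]*r[0] + (unsigned long long)h[3]*s[4]
        #            + (unsigned long long)h[4]*s[3];
        #       d[3] = (unsigned long long)h[0]*r[3] + (unsigned long long)h[1]*r[2]
        #            + (unsigned long long)h[2]*r[1] + (unsigned long long)h[3]*r[0]
        #            + (unsigned long long)h[4]*s[4];
        #       d[4] = (unsigned long long)h[0]*r[4] + (unsigned long long)h[1]*r[3]
        #            + (unsigned long long)h[2]*r[2] + (unsigned long long)h[3]*r[1]
        #            + (unsigned long long)h[4]*r[0];
        #   }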
832
833         vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
834         vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
835           vmovdqa       $H2,0x20(%r11)                          # offload hash
836         vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
837          vmovdqa        0x10(%rsp),$H2          # r1^2
838         vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
839         vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
840
841           vmovdqa       $H0,0x00(%r11)                          #
842         vpmuludq        0x20(%rsp),$T4,$H0      # h4*s1
843           vmovdqa       $H1,0x10(%r11)                          #
844         vpmuludq        $T3,$H2,$H1             # h3*r1
845         vpaddq          $H0,$D0,$D0             # d0 += h4*s1
846         vpaddq          $H1,$D4,$D4             # d4 += h3*r1
847           vmovdqa       $H3,0x30(%r11)                          #
848         vpmuludq        $T2,$H2,$H0             # h2*r1
849         vpmuludq        $T1,$H2,$H1             # h1*r1
850         vpaddq          $H0,$D3,$D3             # d3 += h2*r1
851          vmovdqa        0x30(%rsp),$H3          # r2^2
852         vpaddq          $H1,$D2,$D2             # d2 += h1*r1
853           vmovdqa       $H4,0x40(%r11)                          #
854         vpmuludq        $T0,$H2,$H2             # h0*r1
855          vpmuludq       $T2,$H3,$H0             # h2*r2
856         vpaddq          $H2,$D1,$D1             # d1 += h0*r1
857
858          vmovdqa        0x40(%rsp),$H4          # s2^2
859         vpaddq          $H0,$D4,$D4             # d4 += h2*r2
860         vpmuludq        $T1,$H3,$H1             # h1*r2
861         vpmuludq        $T0,$H3,$H3             # h0*r2
862         vpaddq          $H1,$D3,$D3             # d3 += h1*r2
863          vmovdqa        0x50(%rsp),$H2          # r3^2
864         vpaddq          $H3,$D2,$D2             # d2 += h0*r2
865         vpmuludq        $T4,$H4,$H0             # h4*s2
866         vpmuludq        $T3,$H4,$H4             # h3*s2
867         vpaddq          $H0,$D1,$D1             # d1 += h4*s2
868          vmovdqa        0x60(%rsp),$H3          # s3^2
869         vpaddq          $H4,$D0,$D0             # d0 += h3*s2
870
871          vmovdqa        0x80(%rsp),$H4          # s4^2
872         vpmuludq        $T1,$H2,$H1             # h1*r3
873         vpmuludq        $T0,$H2,$H2             # h0*r3
874         vpaddq          $H1,$D4,$D4             # d4 += h1*r3
875         vpaddq          $H2,$D3,$D3             # d3 += h0*r3
876         vpmuludq        $T4,$H3,$H0             # h4*s3
877         vpmuludq        $T3,$H3,$H1             # h3*s3
878         vpaddq          $H0,$D2,$D2             # d2 += h4*s3
879          vmovdqu        16*0($inp),$H0                          # load input
880         vpaddq          $H1,$D1,$D1             # d1 += h3*s3
881         vpmuludq        $T2,$H3,$H3             # h2*s3
882          vpmuludq       $T2,$H4,$T2             # h2*s4
883         vpaddq          $H3,$D0,$D0             # d0 += h2*s3
884
885          vmovdqu        16*1($inp),$H1                          #
886         vpaddq          $T2,$D1,$D1             # d1 += h2*s4
887         vpmuludq        $T3,$H4,$T3             # h3*s4
888         vpmuludq        $T4,$H4,$T4             # h4*s4
889          vpsrldq        \$6,$H0,$H2                             # splat input
890         vpaddq          $T3,$D2,$D2             # d2 += h3*s4
891         vpaddq          $T4,$D3,$D3             # d3 += h4*s4
892          vpsrldq        \$6,$H1,$H3                             #
893         vpmuludq        0x70(%rsp),$T0,$T4      # h0*r4
894         vpmuludq        $T1,$H4,$T0             # h1*s4
895          vpunpckhqdq    $H1,$H0,$H4             # 4
896         vpaddq          $T4,$D4,$D4             # d4 += h0*r4
897          vmovdqa        -0x90(%r11),$T4         # r0^4
898         vpaddq          $T0,$D0,$D0             # d0 += h1*s4
899
900         vpunpcklqdq     $H1,$H0,$H0             # 0:1
901         vpunpcklqdq     $H3,$H2,$H3             # 2:3
902
903         #vpsrlq         \$40,$H4,$H4            # 4
904         vpsrldq         \$`40/8`,$H4,$H4        # 4
905         vpsrlq          \$26,$H0,$H1
906         vpand           $MASK,$H0,$H0           # 0
907         vpsrlq          \$4,$H3,$H2
908         vpand           $MASK,$H1,$H1           # 1
909         vpand           0(%rcx),$H4,$H4         # .Lmask24
910         vpsrlq          \$30,$H3,$H3
911         vpand           $MASK,$H2,$H2           # 2
912         vpand           $MASK,$H3,$H3           # 3
913         vpor            32(%rcx),$H4,$H4        # padbit, yes, always
914
915         vpaddq          0x00(%r11),$H0,$H0      # add hash value
916         vpaddq          0x10(%r11),$H1,$H1
917         vpaddq          0x20(%r11),$H2,$H2
918         vpaddq          0x30(%r11),$H3,$H3
919         vpaddq          0x40(%r11),$H4,$H4
920
921         lea             16*2($inp),%rax
922         lea             16*4($inp),$inp
923         sub             \$64,$len
924         cmovc           %rax,$inp
925
926         ################################################################
927         # Now we accumulate (inp[0:1]+hash)*r^4
928         ################################################################
929         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
930         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
931         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
932         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
933         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
934
935         vpmuludq        $H0,$T4,$T0             # h0*r0
936         vpmuludq        $H1,$T4,$T1             # h1*r0
937         vpaddq          $T0,$D0,$D0
938         vpaddq          $T1,$D1,$D1
939          vmovdqa        -0x80(%r11),$T2         # r1^4
940         vpmuludq        $H2,$T4,$T0             # h2*r0
941         vpmuludq        $H3,$T4,$T1             # h3*r0
942         vpaddq          $T0,$D2,$D2
943         vpaddq          $T1,$D3,$D3
944         vpmuludq        $H4,$T4,$T4             # h4*r0
945          vpmuludq       -0x70(%r11),$H4,$T0     # h4*s1
946         vpaddq          $T4,$D4,$D4
947
948         vpaddq          $T0,$D0,$D0             # d0 += h4*s1
949         vpmuludq        $H2,$T2,$T1             # h2*r1
950         vpmuludq        $H3,$T2,$T0             # h3*r1
951         vpaddq          $T1,$D3,$D3             # d3 += h2*r1
952          vmovdqa        -0x60(%r11),$T3         # r2^4
953         vpaddq          $T0,$D4,$D4             # d4 += h3*r1
954         vpmuludq        $H1,$T2,$T1             # h1*r1
955         vpmuludq        $H0,$T2,$T2             # h0*r1
956         vpaddq          $T1,$D2,$D2             # d2 += h1*r1
957         vpaddq          $T2,$D1,$D1             # d1 += h0*r1
958
959          vmovdqa        -0x50(%r11),$T4         # s2^4
960         vpmuludq        $H2,$T3,$T0             # h2*r2
961         vpmuludq        $H1,$T3,$T1             # h1*r2
962         vpaddq          $T0,$D4,$D4             # d4 += h2*r2
963         vpaddq          $T1,$D3,$D3             # d3 += h1*r2
964          vmovdqa        -0x40(%r11),$T2         # r3^4
965         vpmuludq        $H0,$T3,$T3             # h0*r2
966         vpmuludq        $H4,$T4,$T0             # h4*s2
967         vpaddq          $T3,$D2,$D2             # d2 += h0*r2
968         vpaddq          $T0,$D1,$D1             # d1 += h4*s2
969          vmovdqa        -0x30(%r11),$T3         # s3^4
970         vpmuludq        $H3,$T4,$T4             # h3*s2
971          vpmuludq       $H1,$T2,$T1             # h1*r3
972         vpaddq          $T4,$D0,$D0             # d0 += h3*s2
973
974          vmovdqa        -0x10(%r11),$T4         # s4^4
975         vpaddq          $T1,$D4,$D4             # d4 += h1*r3
976         vpmuludq        $H0,$T2,$T2             # h0*r3
977         vpmuludq        $H4,$T3,$T0             # h4*s3
978         vpaddq          $T2,$D3,$D3             # d3 += h0*r3
979         vpaddq          $T0,$D2,$D2             # d2 += h4*s3
980          vmovdqu        16*2($inp),$T0                          # load input
981         vpmuludq        $H3,$T3,$T2             # h3*s3
982         vpmuludq        $H2,$T3,$T3             # h2*s3
983         vpaddq          $T2,$D1,$D1             # d1 += h3*s3
984          vmovdqu        16*3($inp),$T1                          #
985         vpaddq          $T3,$D0,$D0             # d0 += h2*s3
986
987         vpmuludq        $H2,$T4,$H2             # h2*s4
988         vpmuludq        $H3,$T4,$H3             # h3*s4
989          vpsrldq        \$6,$T0,$T2                             # splat input
990         vpaddq          $H2,$D1,$D1             # d1 += h2*s4
991         vpmuludq        $H4,$T4,$H4             # h4*s4
992          vpsrldq        \$6,$T1,$T3                             #
993         vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
994         vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
995         vpmuludq        -0x20(%r11),$H0,$H4     # h0*r4
996         vpmuludq        $H1,$T4,$H0             # h1*s4
997          vpunpckhqdq    $T1,$T0,$T4             # 4
998         vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
999         vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
1000
1001         vpunpcklqdq     $T1,$T0,$T0             # 0:1
1002         vpunpcklqdq     $T3,$T2,$T3             # 2:3
1003
1004         #vpsrlq         \$40,$T4,$T4            # 4
1005         vpsrldq         \$`40/8`,$T4,$T4        # 4
1006         vpsrlq          \$26,$T0,$T1
1007          vmovdqa        0x00(%rsp),$D4          # preload r0^2
1008         vpand           $MASK,$T0,$T0           # 0
1009         vpsrlq          \$4,$T3,$T2
1010         vpand           $MASK,$T1,$T1           # 1
1011         vpand           0(%rcx),$T4,$T4         # .Lmask24
1012         vpsrlq          \$30,$T3,$T3
1013         vpand           $MASK,$T2,$T2           # 2
1014         vpand           $MASK,$T3,$T3           # 3
1015         vpor            32(%rcx),$T4,$T4        # padbit, yes, always
1016
1017         ################################################################
1018         # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1019         # and P. Schwabe
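        # (The carries are only propagated far enough to keep every limb
        # within a few bits of 26 bits; nothing is reduced to canonical
        # form, and the carry out of h4 wraps around to h0 multiplied by 5,
        # done as c + 4*c with a shift.  A scalar model of the pass below,
        # illustration only, made-up helper:
        #
        #   static void lazy_reduce_ref(unsigned long long d[5])
        #   {
        #       const unsigned long long M = 0x3ffffff;
        #       unsigned long long c;
        #
        #       c = d[3] >> 26; d[3] &= M; d[4] += c;           /* h3 -> h4 */
        #       c = d[0] >> 26; d[0] &= M; d[1] += c;           /* h0 -> h1 */
        #       c = d[4] >> 26; d[4] &= M; d[0] += c * 5;       /* h4 -> h0 */
        #       c = d[1] >> 26; d[1] &= M; d[2] += c;           /* h1 -> h2 */
        #       c = d[2] >> 26; d[2] &= M; d[3] += c;           /* h2 -> h3 */
        #       c = d[0] >> 26; d[0] &= M; d[1] += c;           /* h0 -> h1 */
        #       c = d[3] >> 26; d[3] &= M; d[4] += c;           /* h3 -> h4 */
        #   }
        #
        # the two independent carry chains are interleaved below to hide
        # instruction latency.)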
1020
1021         vpsrlq          \$26,$H3,$D3
1022         vpand           $MASK,$H3,$H3
1023         vpaddq          $D3,$H4,$H4             # h3 -> h4
1024
1025         vpsrlq          \$26,$H0,$D0
1026         vpand           $MASK,$H0,$H0
1027         vpaddq          $D0,$D1,$H1             # h0 -> h1
1028
1029         vpsrlq          \$26,$H4,$D0
1030         vpand           $MASK,$H4,$H4
1031
1032         vpsrlq          \$26,$H1,$D1
1033         vpand           $MASK,$H1,$H1
1034         vpaddq          $D1,$H2,$H2             # h1 -> h2
1035
1036         vpaddq          $D0,$H0,$H0
1037         vpsllq          \$2,$D0,$D0
1038         vpaddq          $D0,$H0,$H0             # h4 -> h0
1039
1040         vpsrlq          \$26,$H2,$D2
1041         vpand           $MASK,$H2,$H2
1042         vpaddq          $D2,$H3,$H3             # h2 -> h3
1043
1044         vpsrlq          \$26,$H0,$D0
1045         vpand           $MASK,$H0,$H0
1046         vpaddq          $D0,$H1,$H1             # h0 -> h1
1047
1048         vpsrlq          \$26,$H3,$D3
1049         vpand           $MASK,$H3,$H3
1050         vpaddq          $D3,$H4,$H4             # h3 -> h4
1051
1052         ja              .Loop_avx
1053
1054 .Lskip_loop_avx:
1055         ################################################################
1056         # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1057
1058         vpshufd         \$0x10,$D4,$D4          # r0^n, xx12 -> x1x2
1059         add             \$32,$len
1060         jnz             .Long_tail_avx
1061
1062         vpaddq          $H2,$T2,$T2
1063         vpaddq          $H0,$T0,$T0
1064         vpaddq          $H1,$T1,$T1
1065         vpaddq          $H3,$T3,$T3
1066         vpaddq          $H4,$T4,$T4
1067
1068 .Long_tail_avx:
1069         vmovdqa         $H2,0x20(%r11)
1070         vmovdqa         $H0,0x00(%r11)
1071         vmovdqa         $H1,0x10(%r11)
1072         vmovdqa         $H3,0x30(%r11)
1073         vmovdqa         $H4,0x40(%r11)
1074
1075         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1076         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1077         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1078         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1079         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1080
1081         vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
1082         vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
1083          vpshufd        \$0x10,`16*1-64`($ctx),$H2              # r1^n
1084         vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
1085         vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
1086         vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
1087
1088         vpmuludq        $T3,$H2,$H0             # h3*r1
1089         vpaddq          $H0,$D4,$D4             # d4 += h3*r1
1090          vpshufd        \$0x10,`16*2-64`($ctx),$H3              # s1^n
1091         vpmuludq        $T2,$H2,$H1             # h2*r1
1092         vpaddq          $H1,$D3,$D3             # d3 += h2*r1
1093          vpshufd        \$0x10,`16*3-64`($ctx),$H4              # r2^n
1094         vpmuludq        $T1,$H2,$H0             # h1*r1
1095         vpaddq          $H0,$D2,$D2             # d2 += h1*r1
1096         vpmuludq        $T0,$H2,$H2             # h0*r1
1097         vpaddq          $H2,$D1,$D1             # d1 += h0*r1
1098         vpmuludq        $T4,$H3,$H3             # h4*s1
1099         vpaddq          $H3,$D0,$D0             # d0 += h4*s1
1100
1101          vpshufd        \$0x10,`16*4-64`($ctx),$H2              # s2^n
1102         vpmuludq        $T2,$H4,$H1             # h2*r2
1103         vpaddq          $H1,$D4,$D4             # d4 += h2*r2
1104         vpmuludq        $T1,$H4,$H0             # h1*r2
1105         vpaddq          $H0,$D3,$D3             # d3 += h1*r2
1106          vpshufd        \$0x10,`16*5-64`($ctx),$H3              # r3^n
1107         vpmuludq        $T0,$H4,$H4             # h0*r2
1108         vpaddq          $H4,$D2,$D2             # d2 += h0*r2
1109         vpmuludq        $T4,$H2,$H1             # h4*s2
1110         vpaddq          $H1,$D1,$D1             # d1 += h4*s2
1111          vpshufd        \$0x10,`16*6-64`($ctx),$H4              # s3^n
1112         vpmuludq        $T3,$H2,$H2             # h3*s2
1113         vpaddq          $H2,$D0,$D0             # d0 += h3*s2
1114
1115         vpmuludq        $T1,$H3,$H0             # h1*r3
1116         vpaddq          $H0,$D4,$D4             # d4 += h1*r3
1117         vpmuludq        $T0,$H3,$H3             # h0*r3
1118         vpaddq          $H3,$D3,$D3             # d3 += h0*r3
1119          vpshufd        \$0x10,`16*7-64`($ctx),$H2              # r4^n
1120         vpmuludq        $T4,$H4,$H1             # h4*s3
1121         vpaddq          $H1,$D2,$D2             # d2 += h4*s3
1122          vpshufd        \$0x10,`16*8-64`($ctx),$H3              # s4^n
1123         vpmuludq        $T3,$H4,$H0             # h3*s3
1124         vpaddq          $H0,$D1,$D1             # d1 += h3*s3
1125         vpmuludq        $T2,$H4,$H4             # h2*s3
1126         vpaddq          $H4,$D0,$D0             # d0 += h2*s3
1127
1128         vpmuludq        $T0,$H2,$H2             # h0*r4
1129         vpaddq          $H2,$D4,$D4             # h4 = d4 + h0*r4
1130         vpmuludq        $T4,$H3,$H1             # h4*s4
1131         vpaddq          $H1,$D3,$D3             # h3 = d3 + h4*s4
1132         vpmuludq        $T3,$H3,$H0             # h3*s4
1133         vpaddq          $H0,$D2,$D2             # h2 = d2 + h3*s4
1134         vpmuludq        $T2,$H3,$H1             # h2*s4
1135         vpaddq          $H1,$D1,$D1             # h1 = d1 + h2*s4
1136         vpmuludq        $T1,$H3,$H3             # h1*s4
1137         vpaddq          $H3,$D0,$D0             # h0 = d0 + h1*s4
1138
1139         jz              .Lshort_tail_avx
1140
1141         vmovdqu         16*0($inp),$H0          # load input
1142         vmovdqu         16*1($inp),$H1
1143
1144         vpsrldq         \$6,$H0,$H2             # splat input
1145         vpsrldq         \$6,$H1,$H3
1146         vpunpckhqdq     $H1,$H0,$H4             # 4
1147         vpunpcklqdq     $H1,$H0,$H0             # 0:1
1148         vpunpcklqdq     $H3,$H2,$H3             # 2:3
1149
1150         vpsrlq          \$40,$H4,$H4            # 4
1151         vpsrlq          \$26,$H0,$H1
1152         vpand           $MASK,$H0,$H0           # 0
1153         vpsrlq          \$4,$H3,$H2
1154         vpand           $MASK,$H1,$H1           # 1
1155         vpsrlq          \$30,$H3,$H3
1156         vpand           $MASK,$H2,$H2           # 2
1157         vpand           $MASK,$H3,$H3           # 3
1158         vpor            32(%rcx),$H4,$H4        # padbit, yes, always
1159
1160         vpshufd         \$0x32,`16*0-64`($ctx),$T4      # r0^n, 34xx -> x3x4
1161         vpaddq          0x00(%r11),$H0,$H0
1162         vpaddq          0x10(%r11),$H1,$H1
1163         vpaddq          0x20(%r11),$H2,$H2
1164         vpaddq          0x30(%r11),$H3,$H3
1165         vpaddq          0x40(%r11),$H4,$H4
1166
1167         ################################################################
1168         # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1169
1170         vpmuludq        $H0,$T4,$T0             # h0*r0
1171         vpaddq          $T0,$D0,$D0             # d0 += h0*r0
1172         vpmuludq        $H1,$T4,$T1             # h1*r0
1173         vpaddq          $T1,$D1,$D1             # d1 += h1*r0
1174         vpmuludq        $H2,$T4,$T0             # h2*r0
1175         vpaddq          $T0,$D2,$D2             # d2 += h2*r0
1176          vpshufd        \$0x32,`16*1-64`($ctx),$T2              # r1^n
1177         vpmuludq        $H3,$T4,$T1             # h3*r0
1178         vpaddq          $T1,$D3,$D3             # d3 += h3*r0
1179         vpmuludq        $H4,$T4,$T4             # h4*r0
1180         vpaddq          $T4,$D4,$D4             # d4 += h4*r0
1181
1182         vpmuludq        $H3,$T2,$T0             # h3*r1
1183         vpaddq          $T0,$D4,$D4             # d4 += h3*r1
1184          vpshufd        \$0x32,`16*2-64`($ctx),$T3              # s1
1185         vpmuludq        $H2,$T2,$T1             # h2*r1
1186         vpaddq          $T1,$D3,$D3             # d3 += h2*r1
1187          vpshufd        \$0x32,`16*3-64`($ctx),$T4              # r2
1188         vpmuludq        $H1,$T2,$T0             # h1*r1
1189         vpaddq          $T0,$D2,$D2             # d2 += h1*r1
1190         vpmuludq        $H0,$T2,$T2             # h0*r1
1191         vpaddq          $T2,$D1,$D1             # d1 += h0*r1
1192         vpmuludq        $H4,$T3,$T3             # h4*s1
1193         vpaddq          $T3,$D0,$D0             # d0 += h4*s1
1194
1195          vpshufd        \$0x32,`16*4-64`($ctx),$T2              # s2
1196         vpmuludq        $H2,$T4,$T1             # h2*r2
1197         vpaddq          $T1,$D4,$D4             # d4 += h2*r2
1198         vpmuludq        $H1,$T4,$T0             # h1*r2
1199         vpaddq          $T0,$D3,$D3             # d3 += h1*r2
1200          vpshufd        \$0x32,`16*5-64`($ctx),$T3              # r3
1201         vpmuludq        $H0,$T4,$T4             # h0*r2
1202         vpaddq          $T4,$D2,$D2             # d2 += h0*r2
1203         vpmuludq        $H4,$T2,$T1             # h4*s2
1204         vpaddq          $T1,$D1,$D1             # d1 += h4*s2
1205          vpshufd        \$0x32,`16*6-64`($ctx),$T4              # s3
1206         vpmuludq        $H3,$T2,$T2             # h3*s2
1207         vpaddq          $T2,$D0,$D0             # d0 += h3*s2
1208
1209         vpmuludq        $H1,$T3,$T0             # h1*r3
1210         vpaddq          $T0,$D4,$D4             # d4 += h1*r3
1211         vpmuludq        $H0,$T3,$T3             # h0*r3
1212         vpaddq          $T3,$D3,$D3             # d3 += h0*r3
1213          vpshufd        \$0x32,`16*7-64`($ctx),$T2              # r4
1214         vpmuludq        $H4,$T4,$T1             # h4*s3
1215         vpaddq          $T1,$D2,$D2             # d2 += h4*s3
1216          vpshufd        \$0x32,`16*8-64`($ctx),$T3              # s4
1217         vpmuludq        $H3,$T4,$T0             # h3*s3
1218         vpaddq          $T0,$D1,$D1             # d1 += h3*s3
1219         vpmuludq        $H2,$T4,$T4             # h2*s3
1220         vpaddq          $T4,$D0,$D0             # d0 += h2*s3
1221
1222         vpmuludq        $H0,$T2,$T2             # h0*r4
1223         vpaddq          $T2,$D4,$D4             # d4 += h0*r4
1224         vpmuludq        $H4,$T3,$T1             # h4*s4
1225         vpaddq          $T1,$D3,$D3             # d3 += h4*s4
1226         vpmuludq        $H3,$T3,$T0             # h3*s4
1227         vpaddq          $T0,$D2,$D2             # d2 += h3*s4
1228         vpmuludq        $H2,$T3,$T1             # h2*s4
1229         vpaddq          $T1,$D1,$D1             # d1 += h2*s4
1230         vpmuludq        $H1,$T3,$T3             # h1*s4
1231         vpaddq          $T3,$D0,$D0             # d0 += h1*s4
1232
1233 .Lshort_tail_avx:
1234         ################################################################
1235         # horizontal addition
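        # (each of the five accumulators D0..D4 holds two independent 64-bit
        # partial sums, one per 64-bit lane; shifting the high lane down by
        # 8 bytes and adding folds them into a single per-limb total in the
        # low lane, which is then lazily reduced and stored)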
1236
1237         vpsrldq         \$8,$D4,$T4
1238         vpsrldq         \$8,$D3,$T3
1239         vpsrldq         \$8,$D1,$T1
1240         vpsrldq         \$8,$D0,$T0
1241         vpsrldq         \$8,$D2,$T2
1242         vpaddq          $T3,$D3,$D3
1243         vpaddq          $T4,$D4,$D4
1244         vpaddq          $T0,$D0,$D0
1245         vpaddq          $T1,$D1,$D1
1246         vpaddq          $T2,$D2,$D2
1247
1248         ################################################################
1249         # lazy reduction
1250
1251         vpsrlq          \$26,$D3,$H3
1252         vpand           $MASK,$D3,$D3
1253         vpaddq          $H3,$D4,$D4             # h3 -> h4
1254
1255         vpsrlq          \$26,$D0,$H0
1256         vpand           $MASK,$D0,$D0
1257         vpaddq          $H0,$D1,$D1             # h0 -> h1
1258
1259         vpsrlq          \$26,$D4,$H4
1260         vpand           $MASK,$D4,$D4
1261
1262         vpsrlq          \$26,$D1,$H1
1263         vpand           $MASK,$D1,$D1
1264         vpaddq          $H1,$D2,$D2             # h1 -> h2
1265
1266         vpaddq          $H4,$D0,$D0
1267         vpsllq          \$2,$H4,$H4
1268         vpaddq          $H4,$D0,$D0             # h4 -> h0
1269
1270         vpsrlq          \$26,$D2,$H2
1271         vpand           $MASK,$D2,$D2
1272         vpaddq          $H2,$D3,$D3             # h2 -> h3
1273
1274         vpsrlq          \$26,$D0,$H0
1275         vpand           $MASK,$D0,$D0
1276         vpaddq          $H0,$D1,$D1             # h0 -> h1
1277
1278         vpsrlq          \$26,$D3,$H3
1279         vpand           $MASK,$D3,$D3
1280         vpaddq          $H3,$D4,$D4             # h3 -> h4
1281
1282         vmovd           $D0,`4*0-48-64`($ctx)   # save partially reduced
1283         vmovd           $D1,`4*1-48-64`($ctx)
1284         vmovd           $D2,`4*2-48-64`($ctx)
1285         vmovd           $D3,`4*3-48-64`($ctx)
1286         vmovd           $D4,`4*4-48-64`($ctx)
1287 ___
1288 $code.=<<___    if ($win64);
1289         vmovdqa         0x50(%r11),%xmm6
1290         vmovdqa         0x60(%r11),%xmm7
1291         vmovdqa         0x70(%r11),%xmm8
1292         vmovdqa         0x80(%r11),%xmm9
1293         vmovdqa         0x90(%r11),%xmm10
1294         vmovdqa         0xa0(%r11),%xmm11
1295         vmovdqa         0xb0(%r11),%xmm12
1296         vmovdqa         0xc0(%r11),%xmm13
1297         vmovdqa         0xd0(%r11),%xmm14
1298         vmovdqa         0xe0(%r11),%xmm15
1299         lea             0xf8(%r11),%rsp
1300 .Ldo_avx_epilogue:
1301 ___
1302 $code.=<<___    if (!$win64);
1303         lea             0x58(%r11),%rsp
1304 ___
1305 $code.=<<___;
1306         vzeroupper
1307         ret
1308 .size   poly1305_blocks_avx,.-poly1305_blocks_avx
1309
1310 .type   poly1305_emit_avx,\@function,3
1311 .align  32
1312 poly1305_emit_avx:
1313         cmpl    \$0,20($ctx)    # is_base2_26?
1314         je      .Lemit
1315
1316         mov     0($ctx),%eax    # load hash value base 2^26
1317         mov     4($ctx),%ecx
1318         mov     8($ctx),%r8d
1319         mov     12($ctx),%r11d
1320         mov     16($ctx),%r10d
1321
1322         shl     \$26,%rcx       # base 2^26 -> base 2^64
1323         mov     %r8,%r9
1324         shl     \$52,%r8
1325         add     %rcx,%rax
1326         shr     \$12,%r9
1327         add     %rax,%r8        # h0
1328         adc     \$0,%r9
1329
1330         shl     \$14,%r11
1331         mov     %r10,%rax
1332         shr     \$24,%r10
1333         add     %r11,%r9
1334         shl     \$40,%rax
1335         add     %rax,%r9        # h1
1336         adc     \$0,%r10        # h2
1337
1338         mov     %r10,%rax       # could be partially reduced, so reduce
1339         mov     %r10,%rcx
1340         and     \$3,%r10
1341         shr     \$2,%rax
1342         and     \$-4,%rcx
1343         add     %rcx,%rax
1344         add     %rax,%r8
1345         adc     \$0,%r9
1346         adc     \$0,%r10
1347
1348         mov     %r8,%rax
1349         add     \$5,%r8         # compare to modulus
1350         mov     %r9,%rcx
1351         adc     \$0,%r9
1352         adc     \$0,%r10
1353         shr     \$2,%r10        # did 130-bit value overflow?
1354         cmovnz  %r8,%rax
1355         cmovnz  %r9,%rcx
1356
1357         add     0($nonce),%rax  # accumulate nonce
1358         adc     8($nonce),%rcx
1359         mov     %rax,0($mac)    # write result
1360         mov     %rcx,8($mac)
1361
1362         ret
1363 .size   poly1305_emit_avx,.-poly1305_emit_avx
1364 ___
1365
1366 if ($avx>1) {
1367 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1368     map("%ymm$_",(0..15));
1369 my $S4=$MASK;
1370
1371 $code.=<<___;
1372 .type   poly1305_blocks_avx2,\@function,4
1373 .align  32
1374 poly1305_blocks_avx2:
1375         mov     20($ctx),%r8d           # is_base2_26
1376         cmp     \$128,$len
1377         jae     .Lblocks_avx2
1378         test    %r8d,%r8d
1379         jz      .Lblocks
1380
1381 .Lblocks_avx2:
1382         and     \$-16,$len
1383         jz      .Lno_data_avx2
1384
1385         vzeroupper
1386
1387         test    %r8d,%r8d
1388         jz      .Lbase2_64_avx2
1389
1390         test    \$63,$len
1391         jz      .Leven_avx2
1392
1393         push    %rbx
1394         push    %rbp
1395         push    %r12
1396         push    %r13
1397         push    %r14
1398         push    %r15
1399 .Lblocks_avx2_body:
1400
1401         mov     $len,%r15               # reassign $len
1402
1403         mov     0($ctx),$d1             # load hash value
1404         mov     8($ctx),$d2
1405         mov     16($ctx),$h2#d
1406
1407         mov     24($ctx),$r0            # load r
1408         mov     32($ctx),$s1
1409
1410         ################################# base 2^26 -> base 2^64
1411         mov     $d1#d,$h0#d
1412         and     \$`-1*(1<<31)`,$d1
1413         mov     $d2,$r1                 # borrow $r1
1414         mov     $d2#d,$h1#d
1415         and     \$`-1*(1<<31)`,$d2
1416
1417         shr     \$6,$d1
1418         shl     \$52,$r1
1419         add     $d1,$h0
1420         shr     \$12,$h1
1421         shr     \$18,$d2
1422         add     $r1,$h0
1423         adc     $d2,$h1
1424
1425         mov     $h2,$d1
1426         shl     \$40,$d1
1427         shr     \$24,$h2
1428         add     $d1,$h1
1429         adc     \$0,$h2                 # can be partially reduced...
1430
1431         mov     \$-4,$d2                # ... so reduce
1432         mov     $h2,$d1
1433         and     $h2,$d2
1434         shr     \$2,$d1
1435         and     \$3,$h2
1436         add     $d2,$d1                 # =*5
1437         add     $d1,$h0
1438         adc     \$0,$h1
1439         adc     \$0,$h2
1440
1441         mov     $s1,$r1
1442         mov     $s1,%rax
1443         shr     \$2,$s1
1444         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
1445
1446 .Lbase2_26_pre_avx2:
1447         add     0($inp),$h0             # accumulate input
1448         adc     8($inp),$h1
1449         lea     16($inp),$inp
1450         adc     $padbit,$h2
1451         sub     \$16,%r15
1452
1453         call    __poly1305_block
1454         mov     $r1,%rax
1455
1456         test    \$63,%r15
1457         jnz     .Lbase2_26_pre_avx2
1458
1459         test    $padbit,$padbit         # if $padbit is zero,
1460         jz      .Lstore_base2_64_avx2   # store hash in base 2^64 format
1461
1462         ################################# base 2^64 -> base 2^26
1463         mov     $h0,%rax
1464         mov     $h0,%rdx
1465         shr     \$52,$h0
1466         mov     $h1,$r0
1467         mov     $h1,$r1
1468         shr     \$26,%rdx
1469         and     \$0x3ffffff,%rax        # h[0]
1470         shl     \$12,$r0
1471         and     \$0x3ffffff,%rdx        # h[1]
1472         shr     \$14,$h1
1473         or      $r0,$h0
1474         shl     \$24,$h2
1475         and     \$0x3ffffff,$h0         # h[2]
1476         shr     \$40,$r1
1477         and     \$0x3ffffff,$h1         # h[3]
1478         or      $r1,$h2                 # h[4]
1479
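        ################################################################
        # the split above, as a rough C sketch (illustrative only):
        #
        #       #include <stdint.h>
        #
        #       static void split_64_to_26(const uint64_t h[3], uint32_t t[5])
        #       {
        #           t[0] = (uint32_t)( h[0]                     & 0x3ffffff);  /* bits   0.. 25 */
        #           t[1] = (uint32_t)((h[0] >> 26)              & 0x3ffffff);  /* bits  26.. 51 */
        #           t[2] = (uint32_t)((h[0] >> 52 | h[1] << 12) & 0x3ffffff);  /* bits  52.. 77 */
        #           t[3] = (uint32_t)((h[1] >> 14)              & 0x3ffffff);  /* bits  78..103 */
        #           t[4] = (uint32_t)((h[1] >> 40) | (h[2] << 24));            /* bits 104..129 */
        #       }
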
1480         test    %r15,%r15
1481         jz      .Lstore_base2_26_avx2
1482
1483         vmovd   %rax#d,%x#$H0
1484         vmovd   %rdx#d,%x#$H1
1485         vmovd   $h0#d,%x#$H2
1486         vmovd   $h1#d,%x#$H3
1487         vmovd   $h2#d,%x#$H4
1488         jmp     .Lproceed_avx2
1489
1490 .align  32
1491 .Lstore_base2_64_avx2:
1492         mov     $h0,0($ctx)
1493         mov     $h1,8($ctx)
1494         mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
1495         jmp     .Ldone_avx2
1496
1497 .align  16
1498 .Lstore_base2_26_avx2:
1499         mov     %rax#d,0($ctx)          # store hash value base 2^26
1500         mov     %rdx#d,4($ctx)
1501         mov     $h0#d,8($ctx)
1502         mov     $h1#d,12($ctx)
1503         mov     $h2#d,16($ctx)
1504 .align  16
1505 .Ldone_avx2:
1506         mov     0(%rsp),%r15
1507         mov     8(%rsp),%r14
1508         mov     16(%rsp),%r13
1509         mov     24(%rsp),%r12
1510         mov     32(%rsp),%rbp
1511         mov     40(%rsp),%rbx
1512         lea     48(%rsp),%rsp
1513 .Lno_data_avx2:
1514 .Lblocks_avx2_epilogue:
1515         ret
1516
1517 .align  32
1518 .Lbase2_64_avx2:
1519         push    %rbx
1520         push    %rbp
1521         push    %r12
1522         push    %r13
1523         push    %r14
1524         push    %r15
1525 .Lbase2_64_avx2_body:
1526
1527         mov     $len,%r15               # reassign $len
1528
1529         mov     24($ctx),$r0            # load r
1530         mov     32($ctx),$s1
1531
1532         mov     0($ctx),$h0             # load hash value
1533         mov     8($ctx),$h1
1534         mov     16($ctx),$h2#d
1535
1536         mov     $s1,$r1
1537         mov     $s1,%rax
1538         shr     \$2,$s1
1539         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
1540
1541         test    \$63,$len
1542         jz      .Linit_avx2
1543
1544 .Lbase2_64_pre_avx2:
1545         add     0($inp),$h0             # accumulate input
1546         adc     8($inp),$h1
1547         lea     16($inp),$inp
1548         adc     $padbit,$h2
1549         sub     \$16,%r15
1550
1551         call    __poly1305_block
1552         mov     $r1,%rax
1553
1554         test    \$63,%r15
1555         jnz     .Lbase2_64_pre_avx2
1556
1557 .Linit_avx2:
1558         ################################# base 2^64 -> base 2^26
1559         mov     $h0,%rax
1560         mov     $h0,%rdx
1561         shr     \$52,$h0
1562         mov     $h1,$d1
1563         mov     $h1,$d2
1564         shr     \$26,%rdx
1565         and     \$0x3ffffff,%rax        # h[0]
1566         shl     \$12,$d1
1567         and     \$0x3ffffff,%rdx        # h[1]
1568         shr     \$14,$h1
1569         or      $d1,$h0
1570         shl     \$24,$h2
1571         and     \$0x3ffffff,$h0         # h[2]
1572         shr     \$40,$d2
1573         and     \$0x3ffffff,$h1         # h[3]
1574         or      $d2,$h2                 # h[4]
1575
1576         vmovd   %rax#d,%x#$H0
1577         vmovd   %rdx#d,%x#$H1
1578         vmovd   $h0#d,%x#$H2
1579         vmovd   $h1#d,%x#$H3
1580         vmovd   $h2#d,%x#$H4
1581         movl    \$1,20($ctx)            # set is_base2_26
1582
1583         call    __poly1305_init_avx
1584
1585 .Lproceed_avx2:
1586         mov     %r15,$len                       # restore $len
1587         mov     OPENSSL_ia32cap_P+8(%rip),%r10d
1588         mov     \$`(1<<31|1<<30|1<<16)`,%r11d
1589
1590         mov     0(%rsp),%r15
1591         mov     8(%rsp),%r14
1592         mov     16(%rsp),%r13
1593         mov     24(%rsp),%r12
1594         mov     32(%rsp),%rbp
1595         mov     40(%rsp),%rbx
1596         lea     48(%rsp),%rax
1597         lea     48(%rsp),%rsp
1598 .Lbase2_64_avx2_epilogue:
1599         jmp     .Ldo_avx2
1600
1601 .align  32
1602 .Leven_avx2:
1603         mov             OPENSSL_ia32cap_P+8(%rip),%r10d
1604         mov             \$`(1<<31|1<<30|1<<16)`,%r11d
1605         vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
1606         vmovd           4*1($ctx),%x#$H1
1607         vmovd           4*2($ctx),%x#$H2
1608         vmovd           4*3($ctx),%x#$H3
1609         vmovd           4*4($ctx),%x#$H4
1610
1611 .Ldo_avx2:
1612 ___
1613 $code.=<<___            if ($avx>2);
1614         cmp             \$512,$len
1615         jb              .Lskip_avx512
1616         and             %r11d,%r10d
1617         cmp             %r11d,%r10d             # check for AVX512F+BW+VL
1618         je              .Lblocks_avx512
1619 .Lskip_avx512:
1620 ___
1621 $code.=<<___    if (!$win64);
1622         lea             -8(%rsp),%r11
1623         sub             \$0x128,%rsp
1624 ___
1625 $code.=<<___    if ($win64);
1626         lea             -0xf8(%rsp),%r11
1627         sub             \$0x1c8,%rsp
1628         vmovdqa         %xmm6,0x50(%r11)
1629         vmovdqa         %xmm7,0x60(%r11)
1630         vmovdqa         %xmm8,0x70(%r11)
1631         vmovdqa         %xmm9,0x80(%r11)
1632         vmovdqa         %xmm10,0x90(%r11)
1633         vmovdqa         %xmm11,0xa0(%r11)
1634         vmovdqa         %xmm12,0xb0(%r11)
1635         vmovdqa         %xmm13,0xc0(%r11)
1636         vmovdqa         %xmm14,0xd0(%r11)
1637         vmovdqa         %xmm15,0xe0(%r11)
1638 .Ldo_avx2_body:
1639 ___
1640 $code.=<<___;
1641         lea             .Lconst(%rip),%rcx
1642         lea             48+64($ctx),$ctx        # size optimization
1643         vmovdqa         96(%rcx),$T0            # .Lpermd_avx2
1644
1645         # expand and copy pre-calculated table to stack
1646         vmovdqu         `16*0-64`($ctx),%x#$T2
1647         and             \$-512,%rsp
1648         vmovdqu         `16*1-64`($ctx),%x#$T3
1649         vmovdqu         `16*2-64`($ctx),%x#$T4
1650         vmovdqu         `16*3-64`($ctx),%x#$D0
1651         vmovdqu         `16*4-64`($ctx),%x#$D1
1652         vmovdqu         `16*5-64`($ctx),%x#$D2
1653         lea             0x90(%rsp),%rax         # size optimization
1654         vmovdqu         `16*6-64`($ctx),%x#$D3
1655         vpermd          $T2,$T0,$T2             # 00003412 -> 14243444
1656         vmovdqu         `16*7-64`($ctx),%x#$D4
1657         vpermd          $T3,$T0,$T3
1658         vmovdqu         `16*8-64`($ctx),%x#$MASK
1659         vpermd          $T4,$T0,$T4
1660         vmovdqa         $T2,0x00(%rsp)
1661         vpermd          $D0,$T0,$D0
1662         vmovdqa         $T3,0x20-0x90(%rax)
1663         vpermd          $D1,$T0,$D1
1664         vmovdqa         $T4,0x40-0x90(%rax)
1665         vpermd          $D2,$T0,$D2
1666         vmovdqa         $D0,0x60-0x90(%rax)
1667         vpermd          $D3,$T0,$D3
1668         vmovdqa         $D1,0x80-0x90(%rax)
1669         vpermd          $D4,$T0,$D4
1670         vmovdqa         $D2,0xa0-0x90(%rax)
1671         vpermd          $MASK,$T0,$MASK
1672         vmovdqa         $D3,0xc0-0x90(%rax)
1673         vmovdqa         $D4,0xe0-0x90(%rax)
1674         vmovdqa         $MASK,0x100-0x90(%rax)
1675         vmovdqa         64(%rcx),$MASK          # .Lmask26
1676
1677         ################################################################
1678         # load input
1679         vmovdqu         16*0($inp),%x#$T0
1680         vmovdqu         16*1($inp),%x#$T1
1681         vinserti128     \$1,16*2($inp),$T0,$T0
1682         vinserti128     \$1,16*3($inp),$T1,$T1
1683         lea             16*4($inp),$inp
1684
1685         vpsrldq         \$6,$T0,$T2             # splat input
1686         vpsrldq         \$6,$T1,$T3
1687         vpunpckhqdq     $T1,$T0,$T4             # 4
1688         vpunpcklqdq     $T3,$T2,$T2             # 2:3
1689         vpunpcklqdq     $T1,$T0,$T0             # 0:1
1690
1691         vpsrlq          \$30,$T2,$T3
1692         vpsrlq          \$4,$T2,$T2
1693         vpsrlq          \$26,$T0,$T1
1694         vpsrlq          \$40,$T4,$T4            # 4
1695         vpand           $MASK,$T2,$T2           # 2
1696         vpand           $MASK,$T0,$T0           # 0
1697         vpand           $MASK,$T1,$T1           # 1
1698         vpand           $MASK,$T3,$T3           # 3
1699         vpor            32(%rcx),$T4,$T4        # padbit, yes, always
1700
1701         vpaddq          $H2,$T2,$H2             # accumulate input
1702         sub             \$64,$len
1703         jz              .Ltail_avx2
1704         jmp             .Loop_avx2
1705
1706 .align  32
1707 .Loop_avx2:
1708         ################################################################
1709         # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1710         # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1711         # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1712         # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1713         #   \________/\__________/
1714         ################################################################
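        # A minimal C model of this scheduling (illustrative only: a small
        # prime P stands in for 2^130-5 so the sketch fits in uint64_t, the
        # previous hash and the per-block padbit are omitted, blocks m[] are
        # assumed < P and n a non-zero multiple of 4):
        #
        #       #include <stdint.h>
        #       #include <stddef.h>
        #
        #       #define P 1000003ULL
        #
        #       static uint64_t horner4(const uint64_t *m, size_t n, uint64_t r)
        #       {
        #           uint64_t r2 = r*r % P, r4 = r2*r2 % P;
        #           uint64_t pw[4] = { r4, r2*r % P, r2, r };
        #           uint64_t lane[4] = { 0, 0, 0, 0 }, h = 0;
        #
        #           for (size_t i = 0; i < n; i += 4)       /* every lane steps by r^4, */
        #               for (int j = 0; j < 4; j++)         /* the last step by r^(4-j) */
        #                   lane[j] = (lane[j] + m[i+j]) * (i+4 < n ? r4 : pw[j]) % P;
        #           for (int j = 0; j < 4; j++)             /* horizontal addition */
        #               h = (h + lane[j]) % P;
        #           return h;
        #       }
        ################################################################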
1715         #vpaddq         $H2,$T2,$H2             # accumulate input
1716         vpaddq          $H0,$T0,$H0
1717         vmovdqa         `32*0`(%rsp),$T0        # r0^4
1718         vpaddq          $H1,$T1,$H1
1719         vmovdqa         `32*1`(%rsp),$T1        # r1^4
1720         vpaddq          $H3,$T3,$H3
1721         vmovdqa         `32*3`(%rsp),$T2        # r2^4
1722         vpaddq          $H4,$T4,$H4
1723         vmovdqa         `32*6-0x90`(%rax),$T3   # s3^4
1724         vmovdqa         `32*8-0x90`(%rax),$S4   # s4^4
1725
1726         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1727         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1728         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1729         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1730         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1731         #
1732         # however, as h2 is "chronologically" the first one available, pull the
1733         # corresponding operations up, so it's
1734         #
1735         # d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
1736         # d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
1737         # d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1738         # d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
1739         # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
1740
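        # one 64-bit lane of the product above, as a rough C sketch
        # (illustrative only; h[], r[] hold 26-bit limbs, s[k] = 5*r[k],
        # and the actual instruction order below starts from h2 as
        # explained above):
        #
        #       #include <stdint.h>
        #
        #       static void mul_26(unsigned __int128 d[5], const uint64_t h[5],
        #                          const uint64_t r[5], const uint64_t s[5])
        #       {
        #           for (int k = 0; k < 5; k++)
        #               d[k] = 0;
        #           for (int i = 0; i < 5; i++)
        #               for (int j = 0; j < 5; j++)
        #                   /* terms reaching 2^130 wrap with factor 5, hence s[j] */
        #                   d[(i+j) % 5] += (unsigned __int128)h[i] * (i+j < 5 ? r[j] : s[j]);
        #       }
        #
        # the vector code keeps each d[k] in a 64-bit lane and leaves the
        # carry propagation to the lazy reduction further down
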
1741         vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
1742         vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
1743         vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
1744         vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
1745         vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
1746
1747         vpmuludq        $H0,$T1,$T4             # h0*r1
1748         vpmuludq        $H1,$T1,$H2             # h1*r1, borrow $H2 as temp
1749         vpaddq          $T4,$D1,$D1             # d1 += h0*r1
1750         vpaddq          $H2,$D2,$D2             # d2 += h1*r1
1751         vpmuludq        $H3,$T1,$T4             # h3*r1
1752         vpmuludq        `32*2`(%rsp),$H4,$H2    # h4*s1
1753         vpaddq          $T4,$D4,$D4             # d4 += h3*r1
1754         vpaddq          $H2,$D0,$D0             # d0 += h4*s1
1755          vmovdqa        `32*4-0x90`(%rax),$T1   # s2
1756
1757         vpmuludq        $H0,$T0,$T4             # h0*r0
1758         vpmuludq        $H1,$T0,$H2             # h1*r0
1759         vpaddq          $T4,$D0,$D0             # d0 += h0*r0
1760         vpaddq          $H2,$D1,$D1             # d1 += h1*r0
1761         vpmuludq        $H3,$T0,$T4             # h3*r0
1762         vpmuludq        $H4,$T0,$H2             # h4*r0
1763          vmovdqu        16*0($inp),%x#$T0       # load input
1764         vpaddq          $T4,$D3,$D3             # d3 += h3*r0
1765         vpaddq          $H2,$D4,$D4             # d4 += h4*r0
1766          vinserti128    \$1,16*2($inp),$T0,$T0
1767
1768         vpmuludq        $H3,$T1,$T4             # h3*s2
1769         vpmuludq        $H4,$T1,$H2             # h4*s2
1770          vmovdqu        16*1($inp),%x#$T1
1771         vpaddq          $T4,$D0,$D0             # d0 += h3*s2
1772         vpaddq          $H2,$D1,$D1             # d1 += h4*s2
1773          vmovdqa        `32*5-0x90`(%rax),$H2   # r3
1774         vpmuludq        $H1,$T2,$T4             # h1*r2
1775         vpmuludq        $H0,$T2,$T2             # h0*r2
1776         vpaddq          $T4,$D3,$D3             # d3 += h1*r2
1777         vpaddq          $T2,$D2,$D2             # d2 += h0*r2
1778          vinserti128    \$1,16*3($inp),$T1,$T1
1779          lea            16*4($inp),$inp
1780
1781         vpmuludq        $H1,$H2,$T4             # h1*r3
1782         vpmuludq        $H0,$H2,$H2             # h0*r3
1783          vpsrldq        \$6,$T0,$T2             # splat input
1784         vpaddq          $T4,$D4,$D4             # d4 += h1*r3
1785         vpaddq          $H2,$D3,$D3             # d3 += h0*r3
1786         vpmuludq        $H3,$T3,$T4             # h3*s3
1787         vpmuludq        $H4,$T3,$H2             # h4*s3
1788          vpsrldq        \$6,$T1,$T3
1789         vpaddq          $T4,$D1,$D1             # d1 += h3*s3
1790         vpaddq          $H2,$D2,$D2             # d2 += h4*s3
1791          vpunpckhqdq    $T1,$T0,$T4             # 4
1792
1793         vpmuludq        $H3,$S4,$H3             # h3*s4
1794         vpmuludq        $H4,$S4,$H4             # h4*s4
1795          vpunpcklqdq    $T1,$T0,$T0             # 0:1
1796         vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
1797         vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
1798          vpunpcklqdq    $T3,$T2,$T3             # 2:3
1799         vpmuludq        `32*7-0x90`(%rax),$H0,$H4       # h0*r4
1800         vpmuludq        $H1,$S4,$H0             # h1*s4
1801         vmovdqa         64(%rcx),$MASK          # .Lmask26
1802         vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
1803         vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
1804
1805         ################################################################
1806         # lazy reduction (interleaved with tail of input splat)
1807
1808         vpsrlq          \$26,$H3,$D3
1809         vpand           $MASK,$H3,$H3
1810         vpaddq          $D3,$H4,$H4             # h3 -> h4
1811
1812         vpsrlq          \$26,$H0,$D0
1813         vpand           $MASK,$H0,$H0
1814         vpaddq          $D0,$D1,$H1             # h0 -> h1
1815
1816         vpsrlq          \$26,$H4,$D4
1817         vpand           $MASK,$H4,$H4
1818
1819          vpsrlq         \$4,$T3,$T2
1820
1821         vpsrlq          \$26,$H1,$D1
1822         vpand           $MASK,$H1,$H1
1823         vpaddq          $D1,$H2,$H2             # h1 -> h2
1824
1825         vpaddq          $D4,$H0,$H0
1826         vpsllq          \$2,$D4,$D4
1827         vpaddq          $D4,$H0,$H0             # h4 -> h0
1828
1829          vpand          $MASK,$T2,$T2           # 2
1830          vpsrlq         \$26,$T0,$T1
1831
1832         vpsrlq          \$26,$H2,$D2
1833         vpand           $MASK,$H2,$H2
1834         vpaddq          $D2,$H3,$H3             # h2 -> h3
1835
1836          vpaddq         $T2,$H2,$H2             # modulo-scheduled
1837          vpsrlq         \$30,$T3,$T3
1838
1839         vpsrlq          \$26,$H0,$D0
1840         vpand           $MASK,$H0,$H0
1841         vpaddq          $D0,$H1,$H1             # h0 -> h1
1842
1843          vpsrlq         \$40,$T4,$T4            # 4
1844
1845         vpsrlq          \$26,$H3,$D3
1846         vpand           $MASK,$H3,$H3
1847         vpaddq          $D3,$H4,$H4             # h3 -> h4
1848
1849          vpand          $MASK,$T0,$T0           # 0
1850          vpand          $MASK,$T1,$T1           # 1
1851          vpand          $MASK,$T3,$T3           # 3
1852          vpor           32(%rcx),$T4,$T4        # padbit, yes, always
1853
1854         sub             \$64,$len
1855         jnz             .Loop_avx2
1856
1857         .byte           0x66,0x90
1858 .Ltail_avx2:
1859         ################################################################
1860         # while the above multiplications were by r^4 in all lanes, in the
1861         # last iteration we multiply the least significant lane by r^4 and
1862         # the most significant one by r, so this is a copy of the above,
1863         # except that references to the precomputed table are displaced by 4...
1864
1865         #vpaddq         $H2,$T2,$H2             # accumulate input
1866         vpaddq          $H0,$T0,$H0
1867         vmovdqu         `32*0+4`(%rsp),$T0      # r0^4
1868         vpaddq          $H1,$T1,$H1
1869         vmovdqu         `32*1+4`(%rsp),$T1      # r1^4
1870         vpaddq          $H3,$T3,$H3
1871         vmovdqu         `32*3+4`(%rsp),$T2      # r2^4
1872         vpaddq          $H4,$T4,$H4
1873         vmovdqu         `32*6+4-0x90`(%rax),$T3 # s3^4
1874         vmovdqu         `32*8+4-0x90`(%rax),$S4 # s4^4
1875
1876         vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
1877         vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
1878         vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
1879         vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
1880         vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
1881
1882         vpmuludq        $H0,$T1,$T4             # h0*r1
1883         vpmuludq        $H1,$T1,$H2             # h1*r1
1884         vpaddq          $T4,$D1,$D1             # d1 += h0*r1
1885         vpaddq          $H2,$D2,$D2             # d2 += h1*r1
1886         vpmuludq        $H3,$T1,$T4             # h3*r1
1887         vpmuludq        `32*2+4`(%rsp),$H4,$H2  # h4*s1
1888         vpaddq          $T4,$D4,$D4             # d4 += h3*r1
1889         vpaddq          $H2,$D0,$D0             # d0 += h4*s1
1890
1891         vpmuludq        $H0,$T0,$T4             # h0*r0
1892         vpmuludq        $H1,$T0,$H2             # h1*r0
1893         vpaddq          $T4,$D0,$D0             # d0 += h0*r0
1894          vmovdqu        `32*4+4-0x90`(%rax),$T1 # s2
1895         vpaddq          $H2,$D1,$D1             # d1 += h1*r0
1896         vpmuludq        $H3,$T0,$T4             # h3*r0
1897         vpmuludq        $H4,$T0,$H2             # h4*r0
1898         vpaddq          $T4,$D3,$D3             # d3 += h3*r0
1899         vpaddq          $H2,$D4,$D4             # d4 += h4*r0
1900
1901         vpmuludq        $H3,$T1,$T4             # h3*s2
1902         vpmuludq        $H4,$T1,$H2             # h4*s2
1903         vpaddq          $T4,$D0,$D0             # d0 += h3*s2
1904         vpaddq          $H2,$D1,$D1             # d1 += h4*s2
1905          vmovdqu        `32*5+4-0x90`(%rax),$H2 # r3
1906         vpmuludq        $H1,$T2,$T4             # h1*r2
1907         vpmuludq        $H0,$T2,$T2             # h0*r2
1908         vpaddq          $T4,$D3,$D3             # d3 += h1*r2
1909         vpaddq          $T2,$D2,$D2             # d2 += h0*r2
1910
1911         vpmuludq        $H1,$H2,$T4             # h1*r3
1912         vpmuludq        $H0,$H2,$H2             # h0*r3
1913         vpaddq          $T4,$D4,$D4             # d4 += h1*r3
1914         vpaddq          $H2,$D3,$D3             # d3 += h0*r3
1915         vpmuludq        $H3,$T3,$T4             # h3*s3
1916         vpmuludq        $H4,$T3,$H2             # h4*s3
1917         vpaddq          $T4,$D1,$D1             # d1 += h3*s3
1918         vpaddq          $H2,$D2,$D2             # d2 += h4*s3
1919
1920         vpmuludq        $H3,$S4,$H3             # h3*s4
1921         vpmuludq        $H4,$S4,$H4             # h4*s4
1922         vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
1923         vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
1924         vpmuludq        `32*7+4-0x90`(%rax),$H0,$H4             # h0*r4
1925         vpmuludq        $H1,$S4,$H0             # h1*s4
1926         vmovdqa         64(%rcx),$MASK          # .Lmask26
1927         vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
1928         vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
1929
1930         ################################################################
1931         # horizontal addition
1932
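        # each lane now holds an independent partial hash already scaled by
        # its power of r, so the shuffles below simply sum the four lanes
        # limb-wise; as a trivial C sketch (illustrative only):
        #
        #       #include <stdint.h>
        #
        #       static void lane_sum(uint64_t h[5], const uint64_t lane[4][5])
        #       {
        #           for (int i = 0; i < 5; i++)
        #               h[i] = lane[0][i] + lane[1][i] + lane[2][i] + lane[3][i];
        #       }
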
1933         vpsrldq         \$8,$D1,$T1
1934         vpsrldq         \$8,$H2,$T2
1935         vpsrldq         \$8,$H3,$T3
1936         vpsrldq         \$8,$H4,$T4
1937         vpsrldq         \$8,$H0,$T0
1938         vpaddq          $T1,$D1,$D1
1939         vpaddq          $T2,$H2,$H2
1940         vpaddq          $T3,$H3,$H3
1941         vpaddq          $T4,$H4,$H4
1942         vpaddq          $T0,$H0,$H0
1943
1944         vpermq          \$0x2,$H3,$T3
1945         vpermq          \$0x2,$H4,$T4
1946         vpermq          \$0x2,$H0,$T0
1947         vpermq          \$0x2,$D1,$T1
1948         vpermq          \$0x2,$H2,$T2
1949         vpaddq          $T3,$H3,$H3
1950         vpaddq          $T4,$H4,$H4
1951         vpaddq          $T0,$H0,$H0
1952         vpaddq          $T1,$D1,$D1
1953         vpaddq          $T2,$H2,$H2
1954
1955         ################################################################
1956         # lazy reduction
1957
1958         vpsrlq          \$26,$H3,$D3
1959         vpand           $MASK,$H3,$H3
1960         vpaddq          $D3,$H4,$H4             # h3 -> h4
1961
1962         vpsrlq          \$26,$H0,$D0
1963         vpand           $MASK,$H0,$H0
1964         vpaddq          $D0,$D1,$H1             # h0 -> h1
1965
1966         vpsrlq          \$26,$H4,$D4
1967         vpand           $MASK,$H4,$H4
1968
1969         vpsrlq          \$26,$H1,$D1
1970         vpand           $MASK,$H1,$H1
1971         vpaddq          $D1,$H2,$H2             # h1 -> h2
1972
1973         vpaddq          $D4,$H0,$H0
1974         vpsllq          \$2,$D4,$D4
1975         vpaddq          $D4,$H0,$H0             # h4 -> h0
1976
1977         vpsrlq          \$26,$H2,$D2
1978         vpand           $MASK,$H2,$H2
1979         vpaddq          $D2,$H3,$H3             # h2 -> h3
1980
1981         vpsrlq          \$26,$H0,$D0
1982         vpand           $MASK,$H0,$H0
1983         vpaddq          $D0,$H1,$H1             # h0 -> h1
1984
1985         vpsrlq          \$26,$H3,$D3
1986         vpand           $MASK,$H3,$H3
1987         vpaddq          $D3,$H4,$H4             # h3 -> h4
1988
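        ################################################################
        # the carry chain above for one 64-bit lane, as a rough C sketch
        # (illustrative only); the h4 -> h0 step folds the carry back in
        # as carry*5 because 2^130 == 5 mod 2^130-5:
        #
        #       #include <stdint.h>
        #
        #       static void lazy_reduce(uint64_t h[5])
        #       {
        #           const uint64_t M = 0x3ffffff;
        #           uint64_t c, c4;
        #
        #           c  = h[3] >> 26; h[3] &= M; h[4] += c;          /* h3 -> h4 */
        #           c  = h[0] >> 26; h[0] &= M; h[1] += c;          /* h0 -> h1 */
        #           c4 = h[4] >> 26; h[4] &= M;
        #           c  = h[1] >> 26; h[1] &= M; h[2] += c;          /* h1 -> h2 */
        #           h[0] += c4 + (c4 << 2);                         /* h4 -> h0 */
        #           c  = h[2] >> 26; h[2] &= M; h[3] += c;          /* h2 -> h3 */
        #           c  = h[0] >> 26; h[0] &= M; h[1] += c;          /* h0 -> h1 */
        #           c  = h[3] >> 26; h[3] &= M; h[4] += c;          /* h3 -> h4 */
        #       }
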
1989         vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
1990         vmovd           %x#$H1,`4*1-48-64`($ctx)
1991         vmovd           %x#$H2,`4*2-48-64`($ctx)
1992         vmovd           %x#$H3,`4*3-48-64`($ctx)
1993         vmovd           %x#$H4,`4*4-48-64`($ctx)
1994 ___
1995 $code.=<<___    if ($win64);
1996         vmovdqa         0x50(%r11),%xmm6
1997         vmovdqa         0x60(%r11),%xmm7
1998         vmovdqa         0x70(%r11),%xmm8
1999         vmovdqa         0x80(%r11),%xmm9
2000         vmovdqa         0x90(%r11),%xmm10
2001         vmovdqa         0xa0(%r11),%xmm11
2002         vmovdqa         0xb0(%r11),%xmm12
2003         vmovdqa         0xc0(%r11),%xmm13
2004         vmovdqa         0xd0(%r11),%xmm14
2005         vmovdqa         0xe0(%r11),%xmm15
2006         lea             0xf8(%r11),%rsp
2007 .Ldo_avx2_epilogue:
2008 ___
2009 $code.=<<___    if (!$win64);
2010         lea             8(%r11),%rsp
2011 ___
2012 $code.=<<___;
2013         vzeroupper
2014         ret
2015 .size   poly1305_blocks_avx2,.-poly1305_blocks_avx2
2016 ___
2017 #######################################################################
2018 if ($avx>2) {
2019 # On entry we have an input length divisible by 64. But since the inner loop
2020 # processes 128 bytes per iteration, cases where the length is not divisible
2021 # by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
2022 # reason the stack layout is kept identical to that of poly1305_blocks_avx2.
2023 # If not for this tail, we wouldn't even have to allocate a stack frame...
2024
2025 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%ymm$_",(16..24));
2026 my ($M0,$M1,$M2,$M3,$M4) = map("%ymm$_",(25..29));
2027 my $PADBIT="%zmm30";
2028 my $GATHER="%ymm31";
2029
2030 $code.=<<___;
2031 .type   poly1305_blocks_avx512,\@function,4
2032 .align  32
2033 poly1305_blocks_avx512:
2034 .Lblocks_avx512:
2035         vzeroupper
2036 ___
2037 $code.=<<___    if (!$win64);
2038         lea             -8(%rsp),%r11
2039         sub             \$0x128,%rsp
2040 ___
2041 $code.=<<___    if ($win64);
2042         lea             -0xf8(%rsp),%r11
2043         sub             \$0x1c8,%rsp
2044         vmovdqa         %xmm6,0x50(%r11)
2045         vmovdqa         %xmm7,0x60(%r11)
2046         vmovdqa         %xmm8,0x70(%r11)
2047         vmovdqa         %xmm9,0x80(%r11)
2048         vmovdqa         %xmm10,0x90(%r11)
2049         vmovdqa         %xmm11,0xa0(%r11)
2050         vmovdqa         %xmm12,0xb0(%r11)
2051         vmovdqa         %xmm13,0xc0(%r11)
2052         vmovdqa         %xmm14,0xd0(%r11)
2053         vmovdqa         %xmm15,0xe0(%r11)
2054 .Ldo_avx512_body:
2055 ___
2056 $code.=<<___;
2057         lea             .Lconst(%rip),%rcx
2058         lea             48+64($ctx),$ctx        # size optimization
2059         vmovdqa         96(%rcx),$T2            # .Lpermd_avx2
2060
2061         # expand pre-calculated table
2062         vmovdqu32       `16*0-64`($ctx),%x#$R0
2063         and             \$-512,%rsp
2064         vmovdqu32       `16*1-64`($ctx),%x#$R1
2065         vmovdqu32       `16*2-64`($ctx),%x#$S1
2066         vmovdqu32       `16*3-64`($ctx),%x#$R2
2067         vmovdqu32       `16*4-64`($ctx),%x#$S2
2068         vmovdqu32       `16*5-64`($ctx),%x#$R3
2069         vmovdqu32       `16*6-64`($ctx),%x#$S3
2070         vmovdqu32       `16*7-64`($ctx),%x#$R4
2071         vmovdqu32       `16*8-64`($ctx),%x#$S4
2072         vpermd          $R0,$T2,$R0             # 00003412 -> 14243444
2073         vmovdqa64       64(%rcx),$MASK          # .Lmask26
2074         vpermd          $R1,$T2,$R1
2075         vpermd          $S1,$T2,$S1
2076         vpermd          $R2,$T2,$R2
2077         vmovdqa32       $R0,0x00(%rsp)          # save in case $len%128 != 0
2078          vpsrlq         \$32,$R0,$T0            # 14243444 -> 01020304
2079         vpermd          $S2,$T2,$S2
2080         vmovdqa32       $R1,0x20(%rsp)
2081          vpsrlq         \$32,$R1,$T1
2082         vpermd          $R3,$T2,$R3
2083         vmovdqa32       $S1,0x40(%rsp)
2084         vpermd          $S3,$T2,$S3
2085         vpermd          $R4,$T2,$R4
2086         vmovdqa32       $R2,0x60(%rsp)
2087         vpermd          $S4,$T2,$S4
2088         vmovdqa32       $S2,0x80(%rsp)
2089         vmovdqa32       $R3,0xa0(%rsp)
2090         vmovdqa32       $S3,0xc0(%rsp)
2091         vmovdqa32       $R4,0xe0(%rsp)
2092         vmovdqa32       $S4,0x100(%rsp)
2093
2094         ################################################################
2095         # calculate 5th through 8th powers of the key
2096         #
2097         # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2098         # d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2099         # d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
2100         # d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
2101         # d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
2102
2103         vpmuludq        $T0,$R0,$D0             # d0 = r0'*r0
2104         vpmuludq        $T0,$R1,$D1             # d1 = r0'*r1
2105         vpmuludq        $T0,$R2,$D2             # d2 = r0'*r2
2106         vpmuludq        $T0,$R3,$D3             # d3 = r0'*r3
2107         vpmuludq        $T0,$R4,$D4             # d4 = r0'*r4
2108          vpsrlq         \$32,$R2,$T2
2109
2110         vpmuludq        $T1,$S4,$M0
2111         vpmuludq        $T1,$R0,$M1
2112         vpmuludq        $T1,$R1,$M2
2113         vpmuludq        $T1,$R2,$M3
2114         vpmuludq        $T1,$R3,$M4
2115          vpsrlq         \$32,$R3,$T3
2116         vpaddq          $M0,$D0,$D0             # d0 += r1'*5*r4
2117         vpaddq          $M1,$D1,$D1             # d1 += r1'*r0
2118         vpaddq          $M2,$D2,$D2             # d2 += r1'*r1
2119         vpaddq          $M3,$D3,$D3             # d3 += r1'*r2
2120         vpaddq          $M4,$D4,$D4             # d4 += r1'*r3
2121
2122         vpmuludq        $T2,$S3,$M0
2123         vpmuludq        $T2,$S4,$M1
2124         vpmuludq        $T2,$R1,$M3
2125         vpmuludq        $T2,$R2,$M4
2126         vpmuludq        $T2,$R0,$M2
2127          vpsrlq         \$32,$R4,$T4
2128         vpaddq          $M0,$D0,$D0             # d0 += r2'*5*r3
2129         vpaddq          $M1,$D1,$D1             # d1 += r2'*5*r4
2130         vpaddq          $M3,$D3,$D3             # d3 += r2'*r1
2131         vpaddq          $M4,$D4,$D4             # d4 += r2'*r2
2132         vpaddq          $M2,$D2,$D2             # d2 += r2'*r0
2133
2134         vpmuludq        $T3,$S2,$M0
2135         vpmuludq        $T3,$R0,$M3
2136         vpmuludq        $T3,$R1,$M4
2137         vpmuludq        $T3,$S3,$M1
2138         vpmuludq        $T3,$S4,$M2
2139         vpaddq          $M0,$D0,$D0             # d0 += r3'*5*r2
2140         vpaddq          $M3,$D3,$D3             # d3 += r3'*r0
2141         vpaddq          $M4,$D4,$D4             # d4 += r3'*r1
2142         vpaddq          $M1,$D1,$D1             # d1 += r3'*5*r3
2143         vpaddq          $M2,$D2,$D2             # d2 += r3'*5*r4
2144
2145         vpmuludq        $T4,$S4,$M3
2146         vpmuludq        $T4,$R0,$M4
2147         vpmuludq        $T4,$S1,$M0
2148         vpmuludq        $T4,$S2,$M1
2149         vpmuludq        $T4,$S3,$M2
2150         vpaddq          $M3,$D3,$D3             # d3 += r4'*5*r4
2151         vpaddq          $M4,$D4,$D4             # d4 += r4'*r0
2152         vpaddq          $M0,$D0,$D0             # d0 += r4'*5*r1
2153         vpaddq          $M1,$D1,$D1             # d1 += r4'*5*r2
2154         vpaddq          $M2,$D2,$D2             # d2 += r4'*5*r3
2155
2156         ################################################################
2157         # load input
2158         vmovdqu64       16*0($inp),%z#$T3
2159         vmovdqu64       16*4($inp),%z#$T4
2160         lea             16*8($inp),$inp
2161
2162         ################################################################
2163         # lazy reduction
2164
2165         vpsrlq          \$26,$D3,$M3
2166         vpandq          $MASK,$D3,$D3
2167         vpaddq          $M3,$D4,$D4             # d3 -> d4
2168
2169         vpsrlq          \$26,$D0,$M0
2170         vpandq          $MASK,$D0,$D0
2171         vpaddq          $M0,$D1,$D1             # d0 -> d1
2172
2173         vpsrlq          \$26,$D4,$M4
2174         vpandq          $MASK,$D4,$D4
2175
2176         vpsrlq          \$26,$D1,$M1
2177         vpandq          $MASK,$D1,$D1
2178         vpaddq          $M1,$D2,$D2             # d1 -> d2
2179
2180         vpaddq          $M4,$D0,$D0
2181         vpsllq          \$2,$M4,$M4
2182         vpaddq          $M4,$D0,$D0             # d4 -> d0
2183
2184         vpsrlq          \$26,$D2,$M2
2185         vpandq          $MASK,$D2,$D2
2186         vpaddq          $M2,$D3,$D3             # d2 -> d3
2187
2188         vpsrlq          \$26,$D0,$M0
2189         vpandq          $MASK,$D0,$D0
2190         vpaddq          $M0,$D1,$D1             # d0 -> d1
2191
2192         vpsrlq          \$26,$D3,$M3
2193         vpandq          $MASK,$D3,$D3
2194         vpaddq          $M3,$D4,$D4             # d3 -> d4
2195
2196 ___
2197 map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));            # switch to %zmm domain
2198 map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3));
2199 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2200 map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
2201 map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2202 map(s/%y/%z/,($MASK));
2203 $code.=<<___;
2204         ################################################################
2205         # at this point we have 14243444 in $R0-$S4 and 05060708 in
2206         # $D0-$D4, ...
2207
2208         vpunpcklqdq     $T4,$T3,$T0     # transpose input
2209         vpunpckhqdq     $T4,$T3,$T4
2210
2211         # ... since input 64-bit lanes are ordered as 73625140, we could
2212         # "vperm" it to 76543210 (here and in each loop iteration), *or*
2213         # we could just flow along, hence the goal for $R0-$S4 is
2214         # 1858286838784888 ...
2215
2216         mov             \$0b0110011001100110,%eax
2217         mov             \$0b1100110011001100,%r8d
2218         mov             \$0b0101010101010101,%r9d
2219         kmovw           %eax,%k1
2220         kmovw           %r8d,%k2
2221         kmovw           %r9d,%k3
2222
2223         vpbroadcastq    %x#$D0,$M0      # 0808080808080808
2224         vpbroadcastq    %x#$D1,$M1
2225         vpbroadcastq    %x#$D2,$M2
2226         vpbroadcastq    %x#$D3,$M3
2227         vpbroadcastq    %x#$D4,$M4
2228
2229         vpexpandd       $D0,${D0}{%k1}  # 05060708 -> -05--06--07--08-
2230         vpexpandd       $D1,${D1}{%k1}
2231         vpexpandd       $D2,${D2}{%k1}
2232         vpexpandd       $D3,${D3}{%k1}
2233         vpexpandd       $D4,${D4}{%k1}
2234
2235         vpexpandd       $R0,${D0}{%k2}  # -05--06--07--08- -> 145-246-347-448-
2236         vpexpandd       $R1,${D1}{%k2}
2237         vpexpandd       $R2,${D2}{%k2}
2238         vpexpandd       $R3,${D3}{%k2}
2239         vpexpandd       $R4,${D4}{%k2}
2240
2241         vpblendmd       $M0,$D0,${R0}{%k3}      # 1858286838784888
2242         vpblendmd       $M1,$D1,${R1}{%k3}
2243         vpblendmd       $M2,$D2,${R2}{%k3}
2244         vpblendmd       $M3,$D3,${R3}{%k3}
2245         vpblendmd       $M4,$D4,${R4}{%k3}
2246
2247         vpslld          \$2,$R1,$S1             # *5
2248         vpslld          \$2,$R2,$S2
2249         vpslld          \$2,$R3,$S3
2250         vpslld          \$2,$R4,$S4
2251         vpaddd          $R1,$S1,$S1
2252         vpaddd          $R2,$S2,$S2
2253         vpaddd          $R3,$S3,$S3
2254         vpaddd          $R4,$S4,$S4
2255
2256         vpbroadcastq    %x#$MASK,$MASK
2257         vpbroadcastq    32(%rcx),$PADBIT        # .L129
2258
2259         vpsrlq          \$52,$T0,$T2            # splat input
2260         vpsllq          \$12,$T4,$T3
2261         vporq           $T3,$T2,$T2
2262         vpsrlq          \$26,$T0,$T1
2263         vpsrlq          \$14,$T4,$T3
2264         vpsrlq          \$40,$T4,$T4            # 4
2265         vpandq          $MASK,$T2,$T2           # 2
2266         vpandq          $MASK,$T0,$T0           # 0
2267         vpandq          $MASK,$T1,$T1           # 1
2268         vpandq          $MASK,$T3,$T3           # 3
2269         #vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2270
2271         vpaddq          $H2,$T2,$H2             # accumulate input
2272         mov             \$0x0f,%eax
2273         sub             \$192,$len
2274         jbe             .Ltail_avx512
2275         jmp             .Loop_avx512
2276
2277 .align  32
2278 .Loop_avx512:
2279         ################################################################
2280         # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2281         # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2282         # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2283         # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2284         # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2285         # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2286         # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2287         # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2288         #   \________/\___________/
2289         ################################################################
2290         #vpaddq         $H2,$T2,$H2             # accumulate input
2291
2292         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
2293         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
2294         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
2295         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
2296         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2297         #
2298         # however, as h2 is "chronologically" the first one available, pull the
2299         # corresponding operations up, so it's
2300         #
2301         # d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
2302         # d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
2303         # d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
2304         # d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
2305         # d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
2306
2307         vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
2308          vpaddq         $H0,$T0,$H0
2309         vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
2310         vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
2311         vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
2312          vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2313         vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
2314          vpaddq         $H1,$T1,$H1             # accumulate input
2315          vpaddq         $H3,$T3,$H3
2316          vpaddq         $H4,$T4,$H4
2317
2318           vmovdqu64     16*0($inp),$T3          # load input
2319           vmovdqu64     16*4($inp),$T4
2320           lea           16*8($inp),$inp
2321         vpmuludq        $H0,$R3,$M3
2322         vpmuludq        $H0,$R4,$M4
2323         vpmuludq        $H0,$R0,$M0
2324         vpmuludq        $H0,$R1,$M1
2325         vpaddq          $M3,$D3,$D3             # d3 += h0*r3
2326         vpaddq          $M4,$D4,$D4             # d4 += h0*r4
2327         vpaddq          $M0,$D0,$D0             # d0 += h0*r0
2328         vpaddq          $M1,$D1,$D1             # d1 += h0*r1
2329
2330         vpmuludq        $H1,$R2,$M3
2331         vpmuludq        $H1,$R3,$M4
2332         vpmuludq        $H1,$S4,$M0
2333         vpmuludq        $H0,$R2,$M2
2334         vpaddq          $M3,$D3,$D3             # d3 += h1*r2
2335         vpaddq          $M4,$D4,$D4             # d4 += h1*r3
2336         vpaddq          $M0,$D0,$D0             # d0 += h1*s4
2337         vpaddq          $M2,$D2,$D2             # d2 += h0*r2
2338
2339           vpunpcklqdq   $T4,$T3,$T0             # transpose input
2340           vpunpckhqdq   $T4,$T3,$T4
2341
2342         vpmuludq        $H3,$R0,$M3
2343         vpmuludq        $H3,$R1,$M4
2344         vpmuludq        $H1,$R0,$M1
2345         vpmuludq        $H1,$R1,$M2
2346         vpaddq          $M3,$D3,$D3             # d3 += h3*r0
2347         vpaddq          $M4,$D4,$D4             # d4 += h3*r1
2348         vpaddq          $M1,$D1,$D1             # d1 += h1*r0
2349         vpaddq          $M2,$D2,$D2             # d2 += h1*r1
2350
2351         vpmuludq        $H4,$S4,$M3
2352         vpmuludq        $H4,$R0,$M4
2353         vpmuludq        $H3,$S2,$M0
2354         vpmuludq        $H3,$S3,$M1
2355         vpaddq          $M3,$D3,$D3             # d3 += h4*s4
2356         vpmuludq        $H3,$S4,$M2
2357         vpaddq          $M4,$D4,$D4             # d4 += h4*r0
2358         vpaddq          $M0,$D0,$D0             # d0 += h3*s2
2359         vpaddq          $M1,$D1,$D1             # d1 += h3*s3
2360         vpaddq          $M2,$D2,$D2             # d2 += h3*s4
2361
2362         vpmuludq        $H4,$S1,$M0
2363         vpmuludq        $H4,$S2,$M1
2364         vpmuludq        $H4,$S3,$M2
2365         vpaddq          $M0,$D0,$H0             # h0 = d0 + h4*s1
2366         vpaddq          $M1,$D1,$H1             # h1 = d1 + h4*s2
2367         vpaddq          $M2,$D2,$H2             # h2 = d2 + h4*s3
2368
2369         ################################################################
2370         # lazy reduction (interleaved with input splat)
2371
2372          vpsrlq         \$52,$T0,$T2            # splat input
2373          vpsllq         \$12,$T4,$T3
2374
2375         vpsrlq          \$26,$D3,$H3
2376         vpandq          $MASK,$D3,$D3
2377         vpaddq          $H3,$D4,$H4             # h3 -> h4
2378
2379          vporq          $T3,$T2,$T2
2380
2381         vpsrlq          \$26,$H0,$D0
2382         vpandq          $MASK,$H0,$H0
2383         vpaddq          $D0,$H1,$H1             # h0 -> h1
2384
2385          vpandq         $MASK,$T2,$T2           # 2
2386
2387         vpsrlq          \$26,$H4,$D4
2388         vpandq          $MASK,$H4,$H4
2389
2390         vpsrlq          \$26,$H1,$D1
2391         vpandq          $MASK,$H1,$H1
2392         vpaddq          $D1,$H2,$H2             # h1 -> h2
2393
2394         vpaddq          $D4,$H0,$H0
2395         vpsllq          \$2,$D4,$D4
2396         vpaddq          $D4,$H0,$H0             # h4 -> h0
2397
2398          vpaddq         $T2,$H2,$H2             # modulo-scheduled
2399          vpsrlq         \$26,$T0,$T1
2400
2401         vpsrlq          \$26,$H2,$D2
2402         vpandq          $MASK,$H2,$H2
2403         vpaddq          $D2,$D3,$H3             # h2 -> h3
2404
2405          vpsrlq         \$14,$T4,$T3
2406
2407         vpsrlq          \$26,$H0,$D0
2408         vpandq          $MASK,$H0,$H0
2409         vpaddq          $D0,$H1,$H1             # h0 -> h1
2410
2411          vpsrlq         \$40,$T4,$T4            # 4
2412
2413         vpsrlq          \$26,$H3,$D3
2414         vpandq          $MASK,$H3,$H3
2415         vpaddq          $D3,$H4,$H4             # h3 -> h4
2416
2417          vpandq         $MASK,$T0,$T0           # 0
2418          vpandq         $MASK,$T1,$T1           # 1
2419          vpandq         $MASK,$T3,$T3           # 3
2420          #vporq         $PADBIT,$T4,$T4         # padbit, yes, always
2421
2422         sub             \$128,$len
2423         ja              .Loop_avx512
2424
2425 .Ltail_avx512:
2426         ################################################################
2427         # while the above multiplications were by r^8 in all lanes, in the
2428         # last iteration we multiply the least significant lane by r^8 and
2429         # the most significant one by r, which is why the table gets shifted...
2430
2431         vpsrlq          \$32,$R0,$R0            # 0105020603070408
2432         vpsrlq          \$32,$R1,$R1
2433         vpsrlq          \$32,$R2,$R2
2434         vpsrlq          \$32,$S3,$S3
2435         vpsrlq          \$32,$S4,$S4
2436         vpsrlq          \$32,$R3,$R3
2437         vpsrlq          \$32,$R4,$R4
2438         vpsrlq          \$32,$S1,$S1
2439         vpsrlq          \$32,$S2,$S2
2440
2441         ################################################################
2442         # load either the next or the last 64 bytes of input
2443         lea             ($inp,$len),$inp
2444
2445         #vpaddq         $H2,$T2,$H2             # accumulate input
2446         vpaddq          $H0,$T0,$H0
2447
2448         vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
2449         vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
2450         vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
2451         vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
2452         vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
2453          vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2454          vpaddq         $H1,$T1,$H1             # accumulate input
2455          vpaddq         $H3,$T3,$H3
2456          vpaddq         $H4,$T4,$H4
2457
2458           vmovdqu64     16*0($inp),%x#$T0
2459         vpmuludq        $H0,$R3,$M3
2460         vpmuludq        $H0,$R4,$M4
2461         vpmuludq        $H0,$R0,$M0
2462         vpmuludq        $H0,$R1,$M1
2463         vpaddq          $M3,$D3,$D3             # d3 += h0*r3
2464         vpaddq          $M4,$D4,$D4             # d4 += h0*r4
2465         vpaddq          $M0,$D0,$D0             # d0 += h0*r0
2466         vpaddq          $M1,$D1,$D1             # d1 += h0*r1
2467
2468           vmovdqu64     16*1($inp),%x#$T1
2469         vpmuludq        $H1,$R2,$M3
2470         vpmuludq        $H1,$R3,$M4
2471         vpmuludq        $H1,$S4,$M0
2472         vpmuludq        $H0,$R2,$M2
2473         vpaddq          $M3,$D3,$D3             # d3 += h1*r2
2474         vpaddq          $M4,$D4,$D4             # d4 += h1*r3
2475         vpaddq          $M0,$D0,$D0             # d0 += h1*s4
2476         vpaddq          $M2,$D2,$D2             # d2 += h0*r2
2477
2478           vinserti64x2  \$1,16*2($inp),$T0,$T0
2479         vpmuludq        $H3,$R0,$M3
2480         vpmuludq        $H3,$R1,$M4
2481         vpmuludq        $H1,$R0,$M1
2482         vpmuludq        $H1,$R1,$M2
2483         vpaddq          $M3,$D3,$D3             # d3 += h3*r0
2484         vpaddq          $M4,$D4,$D4             # d4 += h3*r1
2485         vpaddq          $M1,$D1,$D1             # d1 += h1*r0
2486         vpaddq          $M2,$D2,$D2             # d2 += h1*r1
2487
2488           vinserti64x2  \$1,16*3($inp),$T1,$T1
2489         vpmuludq        $H4,$S4,$M3
2490         vpmuludq        $H4,$R0,$M4
2491         vpmuludq        $H3,$S2,$M0
2492         vpmuludq        $H3,$S3,$M1
2493         vpmuludq        $H3,$S4,$M2
2494         vpaddq          $M3,$D3,$H3             # h3 = d3 + h4*s4
2495         vpaddq          $M4,$D4,$D4             # d4 += h4*r0
2496         vpaddq          $M0,$D0,$D0             # d0 += h3*s2
2497         vpaddq          $M1,$D1,$D1             # d1 += h3*s3
2498         vpaddq          $M2,$D2,$D2             # d2 += h3*s4
2499
2500         vpmuludq        $H4,$S1,$M0
2501         vpmuludq        $H4,$S2,$M1
2502         vpmuludq        $H4,$S3,$M2
2503         vpaddq          $M0,$D0,$H0             # h0 = d0 + h4*s1
2504         vpaddq          $M1,$D1,$H1             # h1 = d1 + h4*s2
2505         vpaddq          $M2,$D2,$H2             # h2 = d2 + h4*s3
2506
2507         ################################################################
2508         # horizontal addition
2509
2510         mov             \$1,%eax
2511         vpsrldq         \$8,$H3,$D3
2512         vpsrldq         \$8,$D4,$H4
2513         vpsrldq         \$8,$H0,$D0
2514         vpsrldq         \$8,$H1,$D1
2515         vpsrldq         \$8,$H2,$D2
2516         vpaddq          $D3,$H3,$H3
2517         vpaddq          $D4,$H4,$H4
2518         vpaddq          $D0,$H0,$H0
2519         vpaddq          $D1,$H1,$H1
2520         vpaddq          $D2,$H2,$H2
2521
2522         kmovw           %eax,%k3
2523         vpermq          \$0x2,$H3,$D3
2524         vpermq          \$0x2,$H4,$D4
2525         vpermq          \$0x2,$H0,$D0
2526         vpermq          \$0x2,$H1,$D1
2527         vpermq          \$0x2,$H2,$D2
2528         vpaddq          $D3,$H3,$H3
2529         vpaddq          $D4,$H4,$H4
2530         vpaddq          $D0,$H0,$H0
2531         vpaddq          $D1,$H1,$H1
2532         vpaddq          $D2,$H2,$H2
2533
2534         vextracti64x4   \$0x1,$H3,%y#$D3
2535         vextracti64x4   \$0x1,$H4,%y#$D4
2536         vextracti64x4   \$0x1,$H0,%y#$D0
2537         vextracti64x4   \$0x1,$H1,%y#$D1
2538         vextracti64x4   \$0x1,$H2,%y#$D2
2539         vpaddq          $D3,$H3,${H3}{%k3}{z}   # keep single qword in case
2540         vpaddq          $D4,$H4,${H4}{%k3}{z}   # it's passed to .Ltail_avx2
2541         vpaddq          $D0,$H0,${H0}{%k3}{z}
2542         vpaddq          $D1,$H1,${H1}{%k3}{z}
2543         vpaddq          $D2,$H2,${H2}{%k3}{z}
2544 ___
2545 map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2546 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2547 $code.=<<___;
2548         ################################################################
2549         # lazy reduction (interleaved with input splat)
2550
2551         vpsrlq          \$26,$H3,$D3
2552         vpandq          $MASK,$H3,$H3
2553          vpsrldq        \$6,$T0,$T2             # splat input
2554          vpsrldq        \$6,$T1,$T3
2555          vpunpckhqdq    $T1,$T0,$T4             # 4
2556         vpaddq          $D3,$H4,$H4             # h3 -> h4
2557
2558         vpsrlq          \$26,$H0,$D0
2559         vpandq          $MASK,$H0,$H0
2560          vpunpcklqdq    $T3,$T2,$T2             # 2:3
2561          vpunpcklqdq    $T1,$T0,$T0             # 0:1
2562         vpaddq          $D0,$H1,$H1             # h0 -> h1
2563
2564         vpsrlq          \$26,$H4,$D4
2565         vpandq          $MASK,$H4,$H4
2566
2567         vpsrlq          \$26,$H1,$D1
2568         vpandq          $MASK,$H1,$H1
2569          vpsrlq         \$30,$T2,$T3
2570          vpsrlq         \$4,$T2,$T2
2571         vpaddq          $D1,$H2,$H2             # h1 -> h2
2572
2573         vpaddq          $D4,$H0,$H0
2574         vpsllq          \$2,$D4,$D4
2575          vpsrlq         \$26,$T0,$T1
2576          vpsrlq         \$40,$T4,$T4            # 4
2577         vpaddq          $D4,$H0,$H0             # h4 -> h0
2578
2579         vpsrlq          \$26,$H2,$D2
2580         vpandq          $MASK,$H2,$H2
2581          vpandq         $MASK,$T2,$T2           # 2
2582          vpandq         $MASK,$T0,$T0           # 0
2583         vpaddq          $D2,$H3,$H3             # h2 -> h3
2584
2585         vpsrlq          \$26,$H0,$D0
2586         vpandq          $MASK,$H0,$H0
2587          vpaddq         $H2,$T2,$H2             # accumulate input for .Ltail_avx2
2588          vpandq         $MASK,$T1,$T1           # 1
2589         vpaddq          $D0,$H1,$H1             # h0 -> h1
2590
2591         vpsrlq          \$26,$H3,$D3
2592         vpandq          $MASK,$H3,$H3
2593          vpandq         $MASK,$T3,$T3           # 3
2594          vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2595         vpaddq          $D3,$H4,$H4             # h3 -> h4
2596
2597         lea             0x90(%rsp),%rax         # size optimization for .Ltail_avx2
2598         add             \$64,$len
2599         jnz             .Ltail_avx2
2600
2601         vpsubq          $T2,$H2,$H2             # undo input accumulation
2602         vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2603         vmovd           %x#$H1,`4*1-48-64`($ctx)
2604         vmovd           %x#$H2,`4*2-48-64`($ctx)
2605         vmovd           %x#$H3,`4*3-48-64`($ctx)
2606         vmovd           %x#$H4,`4*4-48-64`($ctx)
2607         vzeroall
2608 ___
2609 $code.=<<___    if ($win64);
2610         movdqa          0x50(%r11),%xmm6
2611         movdqa          0x60(%r11),%xmm7
2612         movdqa          0x70(%r11),%xmm8
2613         movdqa          0x80(%r11),%xmm9
2614         movdqa          0x90(%r11),%xmm10
2615         movdqa          0xa0(%r11),%xmm11
2616         movdqa          0xb0(%r11),%xmm12
2617         movdqa          0xc0(%r11),%xmm13
2618         movdqa          0xd0(%r11),%xmm14
2619         movdqa          0xe0(%r11),%xmm15
2620         lea             0xf8(%r11),%rsp
2621 .Ldo_avx512_epilogue:
2622 ___
2623 $code.=<<___    if (!$win64);
2624         lea             8(%r11),%rsp
2625 ___
2626 $code.=<<___;
2627         ret
2628 .size   poly1305_blocks_avx512,.-poly1305_blocks_avx512
2629 ___
2630 if ($avx>3) {
2631 ########################################################################
2632 # VPMADD52 version using 2^44 radix.
2633 #
2634 # One can argue that base 2^52 would be more natural. Well, even though
2635 # some operations would be more natural, one has to recognize a couple of
2636 # things. First, base 2^52 doesn't provide an advantage over base 2^44 if
2637 # you look at the number of multiply-and-accumulate operations. Second, it
2638 # makes it impossible to pre-compute multiples of 5 [referred to as s[]/sN
2639 # in reference implementations], which means that more such operations
2640 # would have to be performed in the inner loop, which in turn makes the
2641 # critical path longer. In other words, even though base 2^44 reduction
2642 # might look less elegant, the overall critical path is actually shorter...
2643
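#
# The 2^44-radix key layout set up below, as a rough C sketch (illustrative
# only; little-endian host assumed): r is clamped as usual and split into
# limbs at bit positions 0, 44 and 88, and s1/s2 are 20*r1/20*r2 because a
# product term landing at 2^132 wraps as 4*2^130 == 4*5 = 20 mod 2^130-5
# (the "magic <<2" below).
#
#	#include <stdint.h>
#	#include <string.h>
#
#	static void init_44(const uint8_t key[16], uint64_t r[3], uint64_t s[2])
#	{
#	    uint64_t lo, hi;
#
#	    memcpy(&lo, key, 8);
#	    memcpy(&hi, key + 8, 8);
#	    lo &= 0x0ffffffc0fffffff;                       /* clamp r */
#	    hi &= 0x0ffffffc0ffffffc;
#	    r[0] =  lo                   & 0xfffffffffff;   /* bits  0.. 43 */
#	    r[1] = (lo >> 44 | hi << 20) & 0xfffffffffff;   /* bits 44.. 87 */
#	    r[2] =  hi >> 24;                               /* bits 88..127 */
#	    s[0] = r[1] * 20;
#	    s[1] = r[2] * 20;
#	}
#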
2644 $code.=<<___;
2645 .type   poly1305_init_base2_44,\@function,3
2646 .align  32
2647 poly1305_init_base2_44:
2648         xor     %rax,%rax
2649         mov     %rax,0($ctx)            # initialize hash value
2650         mov     %rax,8($ctx)
2651         mov     %rax,16($ctx)
2652
2653 .Linit_base2_44:
2654         lea     poly1305_blocks_vpmadd52(%rip),%r10
2655         lea     poly1305_emit_base2_44(%rip),%r11
2656
2657         mov     \$0x0ffffffc0fffffff,%rax
2658         mov     \$0x0ffffffc0ffffffc,%rcx
2659         and     0($inp),%rax
2660         mov     \$0x00000fffffffffff,%r8
2661         and     8($inp),%rcx
2662         mov     \$0x00000fffffffffff,%r9
2663         and     %rax,%r8
2664         shrd    \$44,%rcx,%rax
2665         mov     %r8,40($ctx)            # r0
2666         and     %r9,%rax
2667         shr     \$24,%rcx
2668         mov     %rax,48($ctx)           # r1
2669         lea     (%rax,%rax,4),%rax      # *5
2670         mov     %rcx,56($ctx)           # r2
2671         shl     \$2,%rax                # magic <<2
2672         lea     (%rcx,%rcx,4),%rcx      # *5
2673         shl     \$2,%rcx                # magic <<2
2674         mov     %rax,24($ctx)           # s1
2675         mov     %rcx,32($ctx)           # s2
2676 ___
2677 $code.=<<___    if ($flavour !~ /elf32/);
2678         mov     %r10,0(%rdx)
2679         mov     %r11,8(%rdx)
2680 ___
2681 $code.=<<___    if ($flavour =~ /elf32/);
2682         mov     %r10d,0(%rdx)
2683         mov     %r11d,4(%rdx)
2684 ___
2685 $code.=<<___;
2686         mov     \$1,%eax
2687         ret
2688 .size   poly1305_init_base2_44,.-poly1305_init_base2_44
2689 ___
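#
# What each VPMADD52LUQ/VPMADD52HUQ pair in poly1305_blocks_vpmadd52 below
# accumulates per 64-bit lane is roughly the following (a C sketch,
# illustrative only):
#
#	#include <stdint.h>
#
#	static void madd52(uint64_t *lo, uint64_t *hi, uint64_t a, uint64_t b)
#	{
#	    unsigned __int128 p = (unsigned __int128)(a & ((1ULL << 52) - 1))
#	                        * (b & ((1ULL << 52) - 1));
#
#	    *lo += (uint64_t)p & ((1ULL << 52) - 1);        /* vpmadd52luq */
#	    *hi += (uint64_t)(p >> 52);                     /* vpmadd52huq */
#	}
#
# i.e. a 52x52-bit multiply whose low and high 52-bit halves are added to
# two separate 64-bit accumulators; the three such terms accumulated per
# lane in the loop below stay well within 64 bits, so carries only need to
# be resolved once per block by the shift/permute reduction that follows.
#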
2690 {
2691 my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2692 my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2693 my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
2694
2695 $code.=<<___;
2696 .type   poly1305_blocks_vpmadd52,\@function,4
2697 .align  32
2698 poly1305_blocks_vpmadd52:
2699         shr     \$4,$len
2700         jz      .Lno_data_vpmadd52              # too short
2701
2702         mov             \$7,%r10d
2703         mov             \$1,%r11d
2704         kmovw           %r10d,%k7
2705         lea             .L2_44_inp_permd(%rip),%r10
2706         shl             \$40,$padbit
2707         kmovw           %r11d,%k1
2708
2709         vmovq           $padbit,%x#$PAD
2710         vmovdqa64       0(%r10),$inp_permd      # .L2_44_inp_permd
2711         vmovdqa64       32(%r10),$inp_shift     # .L2_44_inp_shift
2712         vpermq          \$0xcf,$PAD,$PAD
2713         vmovdqa64       64(%r10),$reduc_mask    # .L2_44_mask
2714
2715         vmovdqu64       0($ctx),${Dlo}{%k7}{z}          # load hash value
2716         vmovdqu64       40($ctx),${r2r1r0}{%k7}{z}      # load keys
2717         vmovdqu64       32($ctx),${r1r0s2}{%k7}{z}
2718         vmovdqu64       24($ctx),${r0s2s1}{%k7}{z}
2719
2720         vmovdqa64       96(%r10),$reduc_rght    # .L2_44_shift_rgt
2721         vmovdqa64       128(%r10),$reduc_left   # .L2_44_shift_lft
2722
2723         jmp             .Loop_vpmadd52
2724
2725 .align  32
2726 .Loop_vpmadd52:
2727         vmovdqu32       0($inp),%x#$T0          # load input as ----3210
2728         lea             16($inp),$inp
2729
2730         vpermd          $T0,$inp_permd,$T0      # ----3210 -> --322110
2731         vpsrlvq         $inp_shift,$T0,$T0
2732         vpandq          $reduc_mask,$T0,$T0
2733         vporq           $PAD,$T0,$T0
2734
2735         vpaddq          $T0,$Dlo,$Dlo           # accumulate input
2736
2737         vpermq          \$0,$Dlo,${H0}{%k7}{z}  # smash hash value
2738         vpermq          \$0b01010101,$Dlo,${H1}{%k7}{z}
2739         vpermq          \$0b10101010,$Dlo,${H2}{%k7}{z}
2740
2741         vpxord          $Dlo,$Dlo,$Dlo
2742         vpxord          $Dhi,$Dhi,$Dhi
2743
2744         vpmadd52luq     $r2r1r0,$H0,$Dlo
2745         vpmadd52huq     $r2r1r0,$H0,$Dhi
2746
2747         vpmadd52luq     $r1r0s2,$H1,$Dlo
2748         vpmadd52huq     $r1r0s2,$H1,$Dhi
2749
2750         vpmadd52luq     $r0s2s1,$H2,$Dlo
2751         vpmadd52huq     $r0s2s1,$H2,$Dhi
2752
2753         vpsrlvq         $reduc_rght,$Dlo,$T0    # 0 in topmost qword
2754         vpsllvq         $reduc_left,$Dhi,$Dhi   # 0 in topmost qword
2755         vpandq          $reduc_mask,$Dlo,$Dlo
2756
2757         vpaddq          $T0,$Dhi,$Dhi
2758
2759         vpermq          \$0b10010011,$Dhi,$Dhi  # 0 in lowest qword
2760
2761         vpaddq          $Dhi,$Dlo,$Dlo          # note topmost qword :-)
2762
2763         vpsrlvq         $reduc_rght,$Dlo,$T0    # 0 in topmost qword
2764         vpandq          $reduc_mask,$Dlo,$Dlo
2765
2766         vpermq          \$0b10010011,$T0,$T0
2767
2768         vpaddq          $T0,$Dlo,$Dlo
2769
2770         vpermq          \$0b10010011,$Dlo,${T0}{%k1}{z}
2771
2772         vpaddq          $T0,$Dlo,$Dlo
2773         vpsllq          \$2,$T0,$T0
2774
2775         vpaddq          $T0,$Dlo,$Dlo
2776
2777         dec             $len                    # len-=16, one block per iteration
2778         jnz             .Loop_vpmadd52
2779
2780         vmovdqu64       $Dlo,0($ctx){%k7}       # store hash value
2781
2782 .Lno_data_vpmadd52:
2783         ret
2784 .size   poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
2785 ___
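# The two-pass vector carry chain above (shift right by 44/44/42, rotate the
# carries one lane up, then fold the carry out of the top limb back into the
# bottom one as *5, since 2^130 = 5 mod 2^130-5) is equivalent to the scalar
# pass sketched below.  Reference sketch only, not called by the generator;
# the result is left lazily reduced, which the next multiplication absorbs.
sub base2_44_reduce_sketch {
	require Math::BigInt;
	my ($d0,$d1,$d2) = map { $_->copy() } @_;	# unreduced Math::BigInt columns
	my $m44 = Math::BigInt->new(1)->blsft(44)->bsub(1);
	my $m42 = Math::BigInt->new(1)->blsft(42)->bsub(1);
	my $c;
	$c = $d0->copy()->brsft(44);	$d0->band($m44);
	$d1->badd($c);	$c = $d1->copy()->brsft(44);	$d1->band($m44);
	$d2->badd($c);	$c = $d2->copy()->brsft(42);	$d2->band($m42);
	$d0->badd($c->bmul(5));		# fold 2^130 carry back as *5
	return ($d0,$d1,$d2);
}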
2786 }
2787 $code.=<<___;
2788 .type   poly1305_emit_base2_44,\@function,3
2789 .align  32
2790 poly1305_emit_base2_44:
2791         mov     0($ctx),%r8     # load hash value
2792         mov     8($ctx),%r9
2793         mov     16($ctx),%r10
2794
2795         mov     %r9,%rax
2796         shr     \$20,%r9
2797         shl     \$44,%rax
2798         mov     %r10,%rcx
2799         shr     \$40,%r10
2800         shl     \$24,%rcx
2801
2802         add     %rax,%r8
2803         adc     %rcx,%r9
2804         adc     \$0,%r10
2805
2806         mov     %r8,%rax
2807         add     \$5,%r8         # compare to modulus
2808         mov     %r9,%rcx
2809         adc     \$0,%r9
2810         adc     \$0,%r10
2811         shr     \$2,%r10        # did 130-bit value overflow?
2812         cmovnz  %r8,%rax
2813         cmovnz  %r9,%rcx
2814
2815         add     0($nonce),%rax  # accumulate nonce
2816         adc     8($nonce),%rcx
2817         mov     %rax,0($mac)    # write result
2818         mov     %rcx,8($mac)
2819
2820         ret
2821 .size   poly1305_emit_base2_44,.-poly1305_emit_base2_44
2822 ___
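# poly1305_emit_base2_44 above re-assembles the 44/44/42-bit limbs into one
# 130-bit value, performs the final conditional subtraction of p = 2^130-5
# (the "add 5 and test bit 130" trick: the low 128 bits of h+5 equal those of
# h-p whenever h >= p), and adds the nonce modulo 2^128.  The sub below is a
# reference sketch of that computation, for illustration only; it is not
# called by the generator and its interface is made up.
sub poly1305_emit_sketch {
	my ($h0,$h1,$h2,$nonce_hex) = @_;	# hash limbs (Math::BigInt), 128-bit nonce as "0x..."
	require Math::BigInt;
	my $h = $h0 + $h1->copy()->blsft(44) + $h2->copy()->blsft(88);
	my $p = Math::BigInt->new(1)->blsft(130)->bsub(5);
	$h->bsub($p) if $h >= $p;		# at most one subtraction is needed here
	$h->badd(Math::BigInt->new($nonce_hex));
	return $h->band(Math::BigInt->new(1)->blsft(128)->bsub(1));	# 16-byte tag value
}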
2823 }       }       }
2824 $code.=<<___;
2825 .align  64
2826 .Lconst:
2827 .Lmask24:
2828 .long   0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
2829 .L129:
2830 .long   `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
2831 .Lmask26:
2832 .long   0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
2833 .Lpermd_avx2:
2834 .long   2,2,2,3,2,0,2,1
2835
2836 .L2_44_inp_permd:
2837 .long   0,1,1,2,2,3,7,7
2838 .L2_44_inp_shift:
2839 .quad   0,12,24,64
2840 .L2_44_mask:
2841 .quad   0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
2842 .L2_44_shift_rgt:
2843 .quad   44,44,42,64
2844 .L2_44_shift_lft:
2845 .quad   8,8,10,64
2846 ___
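# .L2_44_inp_permd/.L2_44_inp_shift/.L2_44_mask above are what the block loop
# uses to spread a 16-byte message block over the three 2^44-radix limbs
# (44+44+40 message bits): the permute duplicates dwords so each qword lane
# holds the right 64-bit window, the variable shift aligns it, the mask trims
# it, and the 2^128 pad bit is then OR-ed in as bit 40 of the top limb.  The
# sub below is a reference sketch of the same conversion, for illustration
# only; it is not called by the generator.
sub block_to_base2_44_sketch {
	my ($block_hex,$padbit) = @_;	# 16-byte block as "0x..." (little-endian value), padbit 0/1
	require Math::BigInt;
	my $m   = Math::BigInt->new($block_hex);
	my $m44 = Math::BigInt->new(1)->blsft(44)->bsub(1);
	my $t0  = $m->copy()->band($m44);		# bits   0..43
	my $t1  = $m->copy()->brsft(44)->band($m44);	# bits  44..87
	my $t2  = $m->copy()->brsft(88)			# bits 88..127
		     ->bior(Math::BigInt->new($padbit)->blsft(40));	# 2^128 pad bit
	return ($t0,$t1,$t2);
}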
2847 }
2848
2849 $code.=<<___;
2850 .asciz  "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2851 .align  16
2852 ___
2853
2854 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2855 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
2856 if ($win64) {
2857 $rec="%rcx";
2858 $frame="%rdx";
2859 $context="%r8";
2860 $disp="%r9";
2861
2862 $code.=<<___;
2863 .extern __imp_RtlVirtualUnwind
2864 .type   se_handler,\@abi-omnipotent
2865 .align  16
2866 se_handler:
2867         push    %rsi
2868         push    %rdi
2869         push    %rbx
2870         push    %rbp
2871         push    %r12
2872         push    %r13
2873         push    %r14
2874         push    %r15
2875         pushfq
2876         sub     \$64,%rsp
2877
2878         mov     120($context),%rax      # pull context->Rax
2879         mov     248($context),%rbx      # pull context->Rip
2880
2881         mov     8($disp),%rsi           # disp->ImageBase
2882         mov     56($disp),%r11          # disp->HandlerData
2883
2884         mov     0(%r11),%r10d           # HandlerData[0]
2885         lea     (%rsi,%r10),%r10        # prologue label
2886         cmp     %r10,%rbx               # context->Rip<.Lprologue
2887         jb      .Lcommon_seh_tail
2888
2889         mov     152($context),%rax      # pull context->Rsp
2890
2891         mov     4(%r11),%r10d           # HandlerData[1]
2892         lea     (%rsi,%r10),%r10        # epilogue label
2893         cmp     %r10,%rbx               # context->Rip>=.Lepilogue
2894         jae     .Lcommon_seh_tail
2895
2896         lea     48(%rax),%rax
2897
2898         mov     -8(%rax),%rbx
2899         mov     -16(%rax),%rbp
2900         mov     -24(%rax),%r12
2901         mov     -32(%rax),%r13
2902         mov     -40(%rax),%r14
2903         mov     -48(%rax),%r15
2904         mov     %rbx,144($context)      # restore context->Rbx
2905         mov     %rbp,160($context)      # restore context->Rbp
2906         mov     %r12,216($context)      # restore context->R12
2907         mov     %r13,224($context)      # restore context->R13
2908         mov     %r14,232($context)      # restore context->R14
2909         mov     %r15,240($context)      # restore context->R15
2910
2911         jmp     .Lcommon_seh_tail
2912 .size   se_handler,.-se_handler
2913
2914 .type   avx_handler,\@abi-omnipotent
2915 .align  16
2916 avx_handler:
2917         push    %rsi
2918         push    %rdi
2919         push    %rbx
2920         push    %rbp
2921         push    %r12
2922         push    %r13
2923         push    %r14
2924         push    %r15
2925         pushfq
2926         sub     \$64,%rsp
2927
2928         mov     120($context),%rax      # pull context->Rax
2929         mov     248($context),%rbx      # pull context->Rip
2930
2931         mov     8($disp),%rsi           # disp->ImageBase
2932         mov     56($disp),%r11          # disp->HandlerData
2933
2934         mov     0(%r11),%r10d           # HandlerData[0]
2935         lea     (%rsi,%r10),%r10        # prologue label
2936         cmp     %r10,%rbx               # context->Rip<prologue label
2937         jb      .Lcommon_seh_tail
2938
2939         mov     152($context),%rax      # pull context->Rsp
2940
2941         mov     4(%r11),%r10d           # HandlerData[1]
2942         lea     (%rsi,%r10),%r10        # epilogue label
2943         cmp     %r10,%rbx               # context->Rip>=epilogue label
2944         jae     .Lcommon_seh_tail
2945
2946         mov     208($context),%rax      # pull context->R11
2947
2948         lea     0x50(%rax),%rsi
2949         lea     0xf8(%rax),%rax
2950         lea     512($context),%rdi      # &context.Xmm6
2951         mov     \$20,%ecx
2952         .long   0xa548f3fc              # cld; rep movsq
2953
2954 .Lcommon_seh_tail:
2955         mov     8(%rax),%rdi
2956         mov     16(%rax),%rsi
2957         mov     %rax,152($context)      # restore context->Rsp
2958         mov     %rsi,168($context)      # restore context->Rsi
2959         mov     %rdi,176($context)      # restore context->Rdi
2960
2961         mov     40($disp),%rdi          # disp->ContextRecord
2962         mov     $context,%rsi           # context
2963         mov     \$154,%ecx              # sizeof(CONTEXT) in qwords
2964         .long   0xa548f3fc              # cld; rep movsq
2965
2966         mov     $disp,%rsi
2967         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
2968         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
2969         mov     0(%rsi),%r8             # arg3, disp->ControlPc
2970         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
2971         mov     40(%rsi),%r10           # disp->ContextRecord
2972         lea     56(%rsi),%r11           # &disp->HandlerData
2973         lea     24(%rsi),%r12           # &disp->EstablisherFrame
2974         mov     %r10,32(%rsp)           # arg5
2975         mov     %r11,40(%rsp)           # arg6
2976         mov     %r12,48(%rsp)           # arg7
2977         mov     %rcx,56(%rsp)           # arg8, (NULL)
2978         call    *__imp_RtlVirtualUnwind(%rip)
2979
2980         mov     \$1,%eax                # ExceptionContinueSearch
2981         add     \$64,%rsp
2982         popfq
2983         pop     %r15
2984         pop     %r14
2985         pop     %r13
2986         pop     %r12
2987         pop     %rbp
2988         pop     %rbx
2989         pop     %rdi
2990         pop     %rsi
2991         ret
2992 .size   avx_handler,.-avx_handler
2993
2994 .section        .pdata
2995 .align  4
2996         .rva    .LSEH_begin_poly1305_init
2997         .rva    .LSEH_end_poly1305_init
2998         .rva    .LSEH_info_poly1305_init
2999
3000         .rva    .LSEH_begin_poly1305_blocks
3001         .rva    .LSEH_end_poly1305_blocks
3002         .rva    .LSEH_info_poly1305_blocks
3003
3004         .rva    .LSEH_begin_poly1305_emit
3005         .rva    .LSEH_end_poly1305_emit
3006         .rva    .LSEH_info_poly1305_emit
3007 ___
3008 $code.=<<___ if ($avx);
3009         .rva    .LSEH_begin_poly1305_blocks_avx
3010         .rva    .Lbase2_64_avx
3011         .rva    .LSEH_info_poly1305_blocks_avx_1
3012
3013         .rva    .Lbase2_64_avx
3014         .rva    .Leven_avx
3015         .rva    .LSEH_info_poly1305_blocks_avx_2
3016
3017         .rva    .Leven_avx
3018         .rva    .LSEH_end_poly1305_blocks_avx
3019         .rva    .LSEH_info_poly1305_blocks_avx_3
3020
3021         .rva    .LSEH_begin_poly1305_emit_avx
3022         .rva    .LSEH_end_poly1305_emit_avx
3023         .rva    .LSEH_info_poly1305_emit_avx
3024 ___
3025 $code.=<<___ if ($avx>1);
3026         .rva    .LSEH_begin_poly1305_blocks_avx2
3027         .rva    .Lbase2_64_avx2
3028         .rva    .LSEH_info_poly1305_blocks_avx2_1
3029
3030         .rva    .Lbase2_64_avx2
3031         .rva    .Leven_avx2
3032         .rva    .LSEH_info_poly1305_blocks_avx2_2
3033
3034         .rva    .Leven_avx2
3035         .rva    .LSEH_end_poly1305_blocks_avx2
3036         .rva    .LSEH_info_poly1305_blocks_avx2_3
3037 ___
3038 $code.=<<___ if ($avx>2);
3039         .rva    .LSEH_begin_poly1305_blocks_avx512
3040         .rva    .LSEH_end_poly1305_blocks_avx512
3041         .rva    .LSEH_info_poly1305_blocks_avx512
3042 ___
3043 $code.=<<___;
3044 .section        .xdata
3045 .align  8
3046 .LSEH_info_poly1305_init:
3047         .byte   9,0,0,0
3048         .rva    se_handler
3049         .rva    .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
3050
3051 .LSEH_info_poly1305_blocks:
3052         .byte   9,0,0,0
3053         .rva    se_handler
3054         .rva    .Lblocks_body,.Lblocks_epilogue
3055
3056 .LSEH_info_poly1305_emit:
3057         .byte   9,0,0,0
3058         .rva    se_handler
3059         .rva    .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
3060 ___
3061 $code.=<<___ if ($avx);
3062 .LSEH_info_poly1305_blocks_avx_1:
3063         .byte   9,0,0,0
3064         .rva    se_handler
3065         .rva    .Lblocks_avx_body,.Lblocks_avx_epilogue         # HandlerData[]
3066
3067 .LSEH_info_poly1305_blocks_avx_2:
3068         .byte   9,0,0,0
3069         .rva    se_handler
3070         .rva    .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue     # HandlerData[]
3071
3072 .LSEH_info_poly1305_blocks_avx_3:
3073         .byte   9,0,0,0
3074         .rva    avx_handler
3075         .rva    .Ldo_avx_body,.Ldo_avx_epilogue                 # HandlerData[]
3076
3077 .LSEH_info_poly1305_emit_avx:
3078         .byte   9,0,0,0
3079         .rva    se_handler
3080         .rva    .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
3081 ___
3082 $code.=<<___ if ($avx>1);
3083 .LSEH_info_poly1305_blocks_avx2_1:
3084         .byte   9,0,0,0
3085         .rva    se_handler
3086         .rva    .Lblocks_avx2_body,.Lblocks_avx2_epilogue       # HandlerData[]
3087
3088 .LSEH_info_poly1305_blocks_avx2_2:
3089         .byte   9,0,0,0
3090         .rva    se_handler
3091         .rva    .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue   # HandlerData[]
3092
3093 .LSEH_info_poly1305_blocks_avx2_3:
3094         .byte   9,0,0,0
3095         .rva    avx_handler
3096         .rva    .Ldo_avx2_body,.Ldo_avx2_epilogue               # HandlerData[]
3097 ___
3098 $code.=<<___ if ($avx>2);
3099 .LSEH_info_poly1305_blocks_avx512:
3100         .byte   9,0,0,0
3101         .rva    avx_handler
3102         .rva    .Ldo_avx512_body,.Ldo_avx512_epilogue           # HandlerData[]
3103 ___
3104 }
3105
3106 foreach (split('\n',$code)) {
3107         s/\`([^\`]*)\`/eval($1)/ge;                             # expand `...` constant expressions
3108         s/%r([a-z]+)#d/%e$1/g;                                  # %rax#d -> %eax, etc.
3109         s/%r([0-9]+)#d/%r$1d/g;                                 # %r10#d -> %r10d, etc.
3110         s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;     # apply %x/%y/%z register-width overrides
3111
3112         print $_,"\n";
3113 }
3114 close STDOUT;