poly1305/asm/poly1305-x86_64.pl: add poly1305_blocks_vpmadd52_8x.
[openssl.git] / crypto / poly1305 / asm / poly1305-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements Poly1305 hash for x86_64.
18 #
19 # March 2015
20 #
21 # Initial release.
22 #
23 # December 2016
24 #
25 # Add AVX512F+VL+BW code path.
26 #
27 # Numbers are cycles per processed byte with poly1305_blocks alone,
28 # measured with rdtsc at fixed clock frequency.
29 #
30 #               IALU/gcc-4.8(*) AVX(**)         AVX2
31 # P4            4.46/+120%      -
32 # Core 2        2.41/+90%       -
33 # Westmere      1.88/+120%      -
34 # Sandy Bridge  1.39/+140%      1.10
35 # Haswell       1.14/+175%      1.11            0.65
36 # Skylake       1.13/+120%      0.96            0.51
37 # Silvermont    2.83/+95%       -
38 # Goldmont      1.70/+180%      -
39 # VIA Nano      1.82/+150%      -
40 # Sledgehammer  1.38/+160%      -
41 # Bulldozer     2.30/+130%      0.97
42 # Ryzen         1.15/?          1.08            1.18
43 #
44 # (*)   improvement coefficients relative to clang are more modest and
45 #       are ~50% on most processors, in both cases we are comparing to
46 #       __int128 code;
47 # (**)  SSE2 implementation was attempted, but among non-AVX processors
48 #       it was faster than integer-only code only on older Intel P4 and
49 #       Core processors, by 50-30% (less the newer the processor is), but
50 #       slower on contemporary ones, for example almost 2x slower on Atom;
51 #       as the former are naturally disappearing, SSE2 is deemed unnecessary;
52
# Positional arguments: an optional perlasm flavour followed by the output
# file name; both are forwarded to x86_64-xlate.pl further down.
53 $flavour = shift;
54 $output  = shift;
# A first argument containing a dot is taken to be the output file name,
# i.e. no flavour was given.
55 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
56
# $win64 selects the Win64 variants of the code emitted below; it is set for
# nasm/masm/mingw64 flavours or when the output file name ends in .asm.
57 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
58
# $dir is the directory part of this script's path, including the trailing
# separator. The translator is looked up next to the script first, then in
# ../../perlasm/ relative to it; generation stops if neither file exists.
59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
61 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
62 die "can't locate x86_64-xlate.pl";
63
# Probe the toolchain to decide which SIMD code paths may be emitted.
# $avx ends up as a capability level built by summing boolean version tests:
# non-zero enables the AVX code, >1 additionally the AVX2 entry point and >3
# the branch to .Linit_base2_44 (see the "if ($avx...)" guards below).
# NOTE(review): the captured version strings are compared as decimal
# numbers, so e.g. "2.9" would compare greater than "2.19" -- confirm this
# is acceptable for the toolchains in use.
#
# 1) GNU assembler, queried through the compiler driver named by $ENV{CC}.
64 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
65                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
66         $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
67 }
68
# 2) NASM, only for Win64 builds and only if nothing was detected so far.
#    2.11 with patch level 8 or higher is bumped to the same level as 2.12.
69 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
70            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
71         $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
72         $avx += 2 if ($1==2.11 && $2>=8);
73 }
74
# 3) Microsoft ml64, only for Win64 builds; the major version decides.
75 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
76            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
77         $avx = ($1>=10) + ($1>=12);
78 }
79
# 4) clang/LLVM as reported by "$ENV{CC} -v"; this probe yields at most 2.
80 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
81         $avx = ($2>=3.0) + ($2>3.0);
82 }
83
# Pipe everything this script prints through the perlasm translator, which
# is run with the current perl interpreter ($^X) and receives the flavour
# and the output file name. Abort right away if the pipe cannot be set up
# instead of silently carrying on with an unusable handle.
84 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
# From here on STDOUT is an alias for the pipe to the translator.
85 *STDOUT=*OUT;
86
# Symbolic names for the general-purpose registers used by the emitted code.
# $ctx/$inp/$len/$padbit are the argument registers of poly1305_blocks.
87 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
88 my ($mac,$nonce)=($inp,$len);   # *_emit arguments
# %r8-%r13 in order: $d1-$d3 receive partial products, $r0/$r1 hold the key
# and $s1 holds $r1 + ($r1 >> 2).
89 my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
# The three 64-bit limbs of the hash value (base 2^64 representation).
90 my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
91
# Append one multiply-and-reduce step, h = h*r mod 2^130-5, to $code.
# Besides the registers listed below, the emitted code reads $s1, which the
# callers set to $r1 + ($r1 >> 2), and it clobbers %rax, %rdx and $d1-$d3.
# The last step folds bits 2 and up of the top limb back into the low limb
# multiplied by 5, so the result is only partially reduced.
92 sub poly1305_iteration {
93 # input:        copy of $r1 in %rax, $h0-$h2, $r0-$r1
94 # output:       $h0-$h2 *= $r0-$r1
95 $code.=<<___;
96         mulq    $h0                     # h0*r1
97         mov     %rax,$d2
98          mov    $r0,%rax
99         mov     %rdx,$d3
100
101         mulq    $h0                     # h0*r0
102         mov     %rax,$h0                # future $h0
103          mov    $r0,%rax
104         mov     %rdx,$d1
105
106         mulq    $h1                     # h1*r0
107         add     %rax,$d2
108          mov    $s1,%rax
109         adc     %rdx,$d3
110
111         mulq    $h1                     # h1*s1
112          mov    $h2,$h1                 # borrow $h1
113         add     %rax,$h0
114         adc     %rdx,$d1
115
116         imulq   $s1,$h1                 # h2*s1
117         add     $h1,$d2
118          mov    $d1,$h1
119         adc     \$0,$d3
120
121         imulq   $r0,$h2                 # h2*r0
122         add     $d2,$h1
123         mov     \$-4,%rax               # mask value
124         adc     $h2,$d3
125
126         and     $d3,%rax                # last reduction step
127         mov     $d3,$h2
128         shr     \$2,$d3
129         and     \$3,$h2
130         add     $d3,%rax
131         add     %rax,$h0
132         adc     \$0,$h1
133         adc     \$0,$h2
134 ___
135 }
136
137 ########################################################################
138 # The layout of the opaque area is as follows.
139 #
140 #       unsigned __int64 h[3];          # current hash value base 2^64
141 #       unsigned __int64 r[2];          # key value base 2^64
142
# Prologue of the generated file and the start of poly1305_init: the hash
# value at 0/8/16($ctx) is zeroed, the function returns early (with %rax
# still zero) when the key pointer in $inp is NULL, and otherwise the
# scalar poly1305_blocks/poly1305_emit entry points are preselected in
# %r10/%r11.
143 $code.=<<___;
144 .text
145
146 .extern OPENSSL_ia32cap_P
147
148 .globl  poly1305_init
149 .hidden poly1305_init
150 .globl  poly1305_blocks
151 .hidden poly1305_blocks
152 .globl  poly1305_emit
153 .hidden poly1305_emit
154
155 .type   poly1305_init,\@function,3
156 .align  32
157 poly1305_init:
158         xor     %rax,%rax
159         mov     %rax,0($ctx)            # initialize hash value
160         mov     %rax,8($ctx)
161         mov     %rax,16($ctx)
162
163         cmp     \$0,$inp
164         je      .Lno_key
165
166         lea     poly1305_blocks(%rip),%r10
167         lea     poly1305_emit(%rip),%r11
168 ___
# Emitted only when the assembler supports AVX: at run time the *_avx entry
# points replace the scalar ones if the capability bit tested below is set.
169 $code.=<<___    if ($avx);
170         mov     OPENSSL_ia32cap_P+4(%rip),%r9
171         lea     poly1305_blocks_avx(%rip),%rax
172         lea     poly1305_emit_avx(%rip),%rcx
173         bt      \$`60-32`,%r9           # AVX?
174         cmovc   %rax,%r10
175         cmovc   %rcx,%r11
176 ___
# Likewise for AVX2; only the blocks entry point is replaced.
177 $code.=<<___    if ($avx>1);
178         lea     poly1305_blocks_avx2(%rip),%rax
179         bt      \$`5+32`,%r9            # AVX2?
180         cmovc   %rax,%r10
181 ___
# Highest assembler level: if bits 16, 21 and 31 of the upper half of %r9
# (loaded from OPENSSL_ia32cap_P+4 above) are all set, branch to
# .Linit_base2_44, which is defined outside this part of the file.
# NOTE(review): presumably these are the AVX512 feature bits needed by the
# base 2^44 code -- confirm against the OPENSSL_ia32cap documentation.
182 $code.=<<___    if ($avx>3);
183         mov     \$`(1<<31|1<<21|1<<16)`,%rax
184         shr     \$32,%r9
185         and     %rax,%r9
186         cmp     %rax,%r9
187         je      .Linit_base2_44
188 ___
# Mask the two key words read from $inp and keep them at 24/32($ctx).
189 $code.=<<___;
190         mov     \$0x0ffffffc0fffffff,%rax
191         mov     \$0x0ffffffc0ffffffc,%rcx
192         and     0($inp),%rax
193         and     8($inp),%rcx
194         mov     %rax,24($ctx)
195         mov     %rcx,32($ctx)
196 ___
# Hand the selected blocks/emit entry points back through the table that
# %rdx points to: as two 64-bit pointers, or as two 32-bit values 4 bytes
# apart for the elf32 flavour.
197 $code.=<<___    if ($flavour !~ /elf32/);
198         mov     %r10,0(%rdx)
199         mov     %r11,8(%rdx)
200 ___
201 $code.=<<___    if ($flavour =~ /elf32/);
202         mov     %r10d,0(%rdx)
203         mov     %r11d,4(%rdx)
204 ___
# Tail of poly1305_init (returns 1 in %eax once a key has been installed)
# followed by the first half of poly1305_blocks: $len is converted to a
# count of 16-byte blocks (the function returns immediately if that count
# is zero), callee-saved registers are pushed, key and hash value are
# loaded from the context, $s1 is derived from $r1, and the loop head adds
# the next input block plus $padbit to the hash value. The multiplication
# itself is appended by the poly1305_iteration() call that follows.
205 $code.=<<___;
206         mov     \$1,%eax
207 .Lno_key:
208         ret
209 .size   poly1305_init,.-poly1305_init
210
211 .type   poly1305_blocks,\@function,4
212 .align  32
213 poly1305_blocks:
214 .cfi_startproc
215 .Lblocks:
216         shr     \$4,$len
217         jz      .Lno_data               # too short
218
219         push    %rbx
220 .cfi_push       %rbx
221         push    %rbp
222 .cfi_push       %rbp
223         push    %r12
224 .cfi_push       %r12
225         push    %r13
226 .cfi_push       %r13
227         push    %r14
228 .cfi_push       %r14
229         push    %r15
230 .cfi_push       %r15
231 .Lblocks_body:
232
233         mov     $len,%r15               # reassign $len
234
235         mov     24($ctx),$r0            # load r
236         mov     32($ctx),$s1
237
238         mov     0($ctx),$h0             # load hash value
239         mov     8($ctx),$h1
240         mov     16($ctx),$h2
241
242         mov     $s1,$r1
243         shr     \$2,$s1
244         mov     $r1,%rax
245         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
246         jmp     .Loop
247
248 .align  32
249 .Loop:
250         add     0($inp),$h0             # accumulate input
251         adc     8($inp),$h1
252         lea     16($inp),$inp
253         adc     $padbit,$h2
254 ___
# Inline the h *= r step into the .Loop body of poly1305_blocks.
255         &poly1305_iteration();
# Rest of poly1305_blocks: %rax is reloaded with $r1 as the iteration code
# expects, the block counter in %r15 is decremented until it reaches zero,
# the hash value is written back to the context and the six callee-saved
# registers are restored from the stack.
# Then poly1305_emit: it loads the hash value, computes h+5 and, if that
# carries beyond bit 129, selects the low 128 bits of h+5 instead of h
# (i.e. h minus the modulus 2^130-5); finally the 128-bit nonce is added
# and the 16-byte result is written to $mac.
256 $code.=<<___;
257         mov     $r1,%rax
258         dec     %r15                    # len-=16
259         jnz     .Loop
260
261         mov     $h0,0($ctx)             # store hash value
262         mov     $h1,8($ctx)
263         mov     $h2,16($ctx)
264
265         mov     0(%rsp),%r15
266 .cfi_restore    %r15
267         mov     8(%rsp),%r14
268 .cfi_restore    %r14
269         mov     16(%rsp),%r13
270 .cfi_restore    %r13
271         mov     24(%rsp),%r12
272 .cfi_restore    %r12
273         mov     32(%rsp),%rbp
274 .cfi_restore    %rbp
275         mov     40(%rsp),%rbx
276 .cfi_restore    %rbx
277         lea     48(%rsp),%rsp
278 .cfi_adjust_cfa_offset  -48
279 .Lno_data:
280 .Lblocks_epilogue:
281         ret
282 .cfi_endproc
283 .size   poly1305_blocks,.-poly1305_blocks
284
285 .type   poly1305_emit,\@function,3
286 .align  32
287 poly1305_emit:
288 .Lemit:
289         mov     0($ctx),%r8     # load hash value
290         mov     8($ctx),%r9
291         mov     16($ctx),%r10
292
293         mov     %r8,%rax
294         add     \$5,%r8         # compare to modulus
295         mov     %r9,%rcx
296         adc     \$0,%r9
297         adc     \$0,%r10
298         shr     \$2,%r10        # did 130-bit value overfow?
299         cmovnz  %r8,%rax
300         cmovnz  %r9,%rcx
301
302         add     0($nonce),%rax  # accumulate nonce
303         adc     8($nonce),%rcx
304         mov     %rax,0($mac)    # write result
305         mov     %rcx,8($mac)
306
307         ret
308 .size   poly1305_emit,.-poly1305_emit
309 ___
310 if ($avx) {
311
312 ########################################################################
313 # The layout of the opaque area is as follows.
314 #
315 #       unsigned __int32 h[5];          # current hash value base 2^26
316 #       unsigned __int32 is_base2_26;
317 #       unsigned __int64 r[2];          # key value base 2^64
318 #       unsigned __int64 pad;
319 #       struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
320 #
321 # where r^n are base 2^26 digits of powers of the multiplier key. There are
322 # 5 digits, but the last four are interleaved with their multiples of 5,
323 # totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
324
# Symbolic names for %xmm0-%xmm15 as used by the AVX code: $H0-$H4 are
# loaded with the five hash digits, $T0-$T4 with the digits of the input,
# $D0-$D4 receive the products and $MASK holds the .Lmask26 constant.
# (Inside the main loop the roles of $Hx and $Tx are swapped, as the
# comment there notes.)
325 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
326     map("%xmm$_",(0..15));
327
# __poly1305_block: an out-of-line copy of the h *= r step that the AVX
# code reaches with "call". Its register contract is the one documented on
# poly1305_iteration; the closing ret and .size directive are emitted by
# the heredoc that follows the call below.
328 $code.=<<___;
329 .type   __poly1305_block,\@abi-omnipotent
330 .align  32
331 __poly1305_block:
332 ___
333         &poly1305_iteration();
334 $code.=<<___;
335         ret
336 .size   __poly1305_block,.-__poly1305_block
337
338 .type   __poly1305_init_avx,\@abi-omnipotent
339 .align  32
340 __poly1305_init_avx:
341         mov     $r0,$h0
342         mov     $r1,$h1
343         xor     $h2,$h2
344
345         lea     48+64($ctx),$ctx        # size optimization
346
347         mov     $r1,%rax
348         call    __poly1305_block        # r^2
349
350         mov     \$0x3ffffff,%eax        # save interleaved r^2 and r base 2^26
351         mov     \$0x3ffffff,%edx
352         mov     $h0,$d1
353         and     $h0#d,%eax
354         mov     $r0,$d2
355         and     $r0#d,%edx
356         mov     %eax,`16*0+0-64`($ctx)
357         shr     \$26,$d1
358         mov     %edx,`16*0+4-64`($ctx)
359         shr     \$26,$d2
360
361         mov     \$0x3ffffff,%eax
362         mov     \$0x3ffffff,%edx
363         and     $d1#d,%eax
364         and     $d2#d,%edx
365         mov     %eax,`16*1+0-64`($ctx)
366         lea     (%rax,%rax,4),%eax      # *5
367         mov     %edx,`16*1+4-64`($ctx)
368         lea     (%rdx,%rdx,4),%edx      # *5
369         mov     %eax,`16*2+0-64`($ctx)
370         shr     \$26,$d1
371         mov     %edx,`16*2+4-64`($ctx)
372         shr     \$26,$d2
373
374         mov     $h1,%rax
375         mov     $r1,%rdx
376         shl     \$12,%rax
377         shl     \$12,%rdx
378         or      $d1,%rax
379         or      $d2,%rdx
380         and     \$0x3ffffff,%eax
381         and     \$0x3ffffff,%edx
382         mov     %eax,`16*3+0-64`($ctx)
383         lea     (%rax,%rax,4),%eax      # *5
384         mov     %edx,`16*3+4-64`($ctx)
385         lea     (%rdx,%rdx,4),%edx      # *5
386         mov     %eax,`16*4+0-64`($ctx)
387         mov     $h1,$d1
388         mov     %edx,`16*4+4-64`($ctx)
389         mov     $r1,$d2
390
391         mov     \$0x3ffffff,%eax
392         mov     \$0x3ffffff,%edx
393         shr     \$14,$d1
394         shr     \$14,$d2
395         and     $d1#d,%eax
396         and     $d2#d,%edx
397         mov     %eax,`16*5+0-64`($ctx)
398         lea     (%rax,%rax,4),%eax      # *5
399         mov     %edx,`16*5+4-64`($ctx)
400         lea     (%rdx,%rdx,4),%edx      # *5
401         mov     %eax,`16*6+0-64`($ctx)
402         shr     \$26,$d1
403         mov     %edx,`16*6+4-64`($ctx)
404         shr     \$26,$d2
405
406         mov     $h2,%rax
407         shl     \$24,%rax
408         or      %rax,$d1
409         mov     $d1#d,`16*7+0-64`($ctx)
410         lea     ($d1,$d1,4),$d1         # *5
411         mov     $d2#d,`16*7+4-64`($ctx)
412         lea     ($d2,$d2,4),$d2         # *5
413         mov     $d1#d,`16*8+0-64`($ctx)
414         mov     $d2#d,`16*8+4-64`($ctx)
415
416         mov     $r1,%rax
417         call    __poly1305_block        # r^3
418
419         mov     \$0x3ffffff,%eax        # save r^3 base 2^26
420         mov     $h0,$d1
421         and     $h0#d,%eax
422         shr     \$26,$d1
423         mov     %eax,`16*0+12-64`($ctx)
424
425         mov     \$0x3ffffff,%edx
426         and     $d1#d,%edx
427         mov     %edx,`16*1+12-64`($ctx)
428         lea     (%rdx,%rdx,4),%edx      # *5
429         shr     \$26,$d1
430         mov     %edx,`16*2+12-64`($ctx)
431
432         mov     $h1,%rax
433         shl     \$12,%rax
434         or      $d1,%rax
435         and     \$0x3ffffff,%eax
436         mov     %eax,`16*3+12-64`($ctx)
437         lea     (%rax,%rax,4),%eax      # *5
438         mov     $h1,$d1
439         mov     %eax,`16*4+12-64`($ctx)
440
441         mov     \$0x3ffffff,%edx
442         shr     \$14,$d1
443         and     $d1#d,%edx
444         mov     %edx,`16*5+12-64`($ctx)
445         lea     (%rdx,%rdx,4),%edx      # *5
446         shr     \$26,$d1
447         mov     %edx,`16*6+12-64`($ctx)
448
449         mov     $h2,%rax
450         shl     \$24,%rax
451         or      %rax,$d1
452         mov     $d1#d,`16*7+12-64`($ctx)
453         lea     ($d1,$d1,4),$d1         # *5
454         mov     $d1#d,`16*8+12-64`($ctx)
455
456         mov     $r1,%rax
457         call    __poly1305_block        # r^4
458
459         mov     \$0x3ffffff,%eax        # save r^4 base 2^26
460         mov     $h0,$d1
461         and     $h0#d,%eax
462         shr     \$26,$d1
463         mov     %eax,`16*0+8-64`($ctx)
464
465         mov     \$0x3ffffff,%edx
466         and     $d1#d,%edx
467         mov     %edx,`16*1+8-64`($ctx)
468         lea     (%rdx,%rdx,4),%edx      # *5
469         shr     \$26,$d1
470         mov     %edx,`16*2+8-64`($ctx)
471
472         mov     $h1,%rax
473         shl     \$12,%rax
474         or      $d1,%rax
475         and     \$0x3ffffff,%eax
476         mov     %eax,`16*3+8-64`($ctx)
477         lea     (%rax,%rax,4),%eax      # *5
478         mov     $h1,$d1
479         mov     %eax,`16*4+8-64`($ctx)
480
481         mov     \$0x3ffffff,%edx
482         shr     \$14,$d1
483         and     $d1#d,%edx
484         mov     %edx,`16*5+8-64`($ctx)
485         lea     (%rdx,%rdx,4),%edx      # *5
486         shr     \$26,$d1
487         mov     %edx,`16*6+8-64`($ctx)
488
489         mov     $h2,%rax
490         shl     \$24,%rax
491         or      %rax,$d1
492         mov     $d1#d,`16*7+8-64`($ctx)
493         lea     ($d1,$d1,4),$d1         # *5
494         mov     $d1#d,`16*8+8-64`($ctx)
495
496         lea     -48-64($ctx),$ctx       # size [de-]optimization
497         ret
498 .size   __poly1305_init_avx,.-__poly1305_init_avx
499
500 .type   poly1305_blocks_avx,\@function,4
501 .align  32
502 poly1305_blocks_avx:
503 .cfi_startproc
504         mov     20($ctx),%r8d           # is_base2_26
505         cmp     \$128,$len
506         jae     .Lblocks_avx
507         test    %r8d,%r8d
508         jz      .Lblocks
509
510 .Lblocks_avx:
511         and     \$-16,$len
512         jz      .Lno_data_avx
513
514         vzeroupper
515
516         test    %r8d,%r8d
517         jz      .Lbase2_64_avx
518
519         test    \$31,$len
520         jz      .Leven_avx
521
522         push    %rbx
523 .cfi_push       %rbx
524         push    %rbp
525 .cfi_push       %rbp
526         push    %r12
527 .cfi_push       %r12
528         push    %r13
529 .cfi_push       %r13
530         push    %r14
531 .cfi_push       %r14
532         push    %r15
533 .cfi_push       %r15
534 .Lblocks_avx_body:
535
536         mov     $len,%r15               # reassign $len
537
538         mov     0($ctx),$d1             # load hash value
539         mov     8($ctx),$d2
540         mov     16($ctx),$h2#d
541
542         mov     24($ctx),$r0            # load r
543         mov     32($ctx),$s1
544
545         ################################# base 2^26 -> base 2^64
546         mov     $d1#d,$h0#d
547         and     \$`-1*(1<<31)`,$d1
548         mov     $d2,$r1                 # borrow $r1
549         mov     $d2#d,$h1#d
550         and     \$`-1*(1<<31)`,$d2
551
552         shr     \$6,$d1
553         shl     \$52,$r1
554         add     $d1,$h0
555         shr     \$12,$h1
556         shr     \$18,$d2
557         add     $r1,$h0
558         adc     $d2,$h1
559
560         mov     $h2,$d1
561         shl     \$40,$d1
562         shr     \$24,$h2
563         add     $d1,$h1
564         adc     \$0,$h2                 # can be partially reduced...
565
566         mov     \$-4,$d2                # ... so reduce
567         mov     $h2,$d1
568         and     $h2,$d2
569         shr     \$2,$d1
570         and     \$3,$h2
571         add     $d2,$d1                 # =*5
572         add     $d1,$h0
573         adc     \$0,$h1
574         adc     \$0,$h2
575
576         mov     $s1,$r1
577         mov     $s1,%rax
578         shr     \$2,$s1
579         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
580
581         add     0($inp),$h0             # accumulate input
582         adc     8($inp),$h1
583         lea     16($inp),$inp
584         adc     $padbit,$h2
585
586         call    __poly1305_block
587
588         test    $padbit,$padbit         # if $padbit is zero,
589         jz      .Lstore_base2_64_avx    # store hash in base 2^64 format
590
591         ################################# base 2^64 -> base 2^26
592         mov     $h0,%rax
593         mov     $h0,%rdx
594         shr     \$52,$h0
595         mov     $h1,$r0
596         mov     $h1,$r1
597         shr     \$26,%rdx
598         and     \$0x3ffffff,%rax        # h[0]
599         shl     \$12,$r0
600         and     \$0x3ffffff,%rdx        # h[1]
601         shr     \$14,$h1
602         or      $r0,$h0
603         shl     \$24,$h2
604         and     \$0x3ffffff,$h0         # h[2]
605         shr     \$40,$r1
606         and     \$0x3ffffff,$h1         # h[3]
607         or      $r1,$h2                 # h[4]
608
609         sub     \$16,%r15
610         jz      .Lstore_base2_26_avx
611
612         vmovd   %rax#d,$H0
613         vmovd   %rdx#d,$H1
614         vmovd   $h0#d,$H2
615         vmovd   $h1#d,$H3
616         vmovd   $h2#d,$H4
617         jmp     .Lproceed_avx
618
619 .align  32
620 .Lstore_base2_64_avx:
621         mov     $h0,0($ctx)
622         mov     $h1,8($ctx)
623         mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
624         jmp     .Ldone_avx
625
626 .align  16
627 .Lstore_base2_26_avx:
628         mov     %rax#d,0($ctx)          # store hash value base 2^26
629         mov     %rdx#d,4($ctx)
630         mov     $h0#d,8($ctx)
631         mov     $h1#d,12($ctx)
632         mov     $h2#d,16($ctx)
633 .align  16
634 .Ldone_avx:
635         mov     0(%rsp),%r15
636 .cfi_restore    %r15
637         mov     8(%rsp),%r14
638 .cfi_restore    %r14
639         mov     16(%rsp),%r13
640 .cfi_restore    %r13
641         mov     24(%rsp),%r12
642 .cfi_restore    %r12
643         mov     32(%rsp),%rbp
644 .cfi_restore    %rbp
645         mov     40(%rsp),%rbx
646 .cfi_restore    %rbx
647         lea     48(%rsp),%rsp
648 .cfi_adjust_cfa_offset  -48
649 .Lno_data_avx:
650 .Lblocks_avx_epilogue:
651         ret
652 .cfi_endproc
653
654 .align  32
655 .Lbase2_64_avx:
656 .cfi_startproc
657         push    %rbx
658 .cfi_push       %rbx
659         push    %rbp
660 .cfi_push       %rbp
661         push    %r12
662 .cfi_push       %r12
663         push    %r13
664 .cfi_push       %r13
665         push    %r14
666 .cfi_push       %r14
667         push    %r15
668 .cfi_push       %r15
669 .Lbase2_64_avx_body:
670
671         mov     $len,%r15               # reassign $len
672
673         mov     24($ctx),$r0            # load r
674         mov     32($ctx),$s1
675
676         mov     0($ctx),$h0             # load hash value
677         mov     8($ctx),$h1
678         mov     16($ctx),$h2#d
679
680         mov     $s1,$r1
681         mov     $s1,%rax
682         shr     \$2,$s1
683         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
684
685         test    \$31,$len
686         jz      .Linit_avx
687
688         add     0($inp),$h0             # accumulate input
689         adc     8($inp),$h1
690         lea     16($inp),$inp
691         adc     $padbit,$h2
692         sub     \$16,%r15
693
694         call    __poly1305_block
695
696 .Linit_avx:
697         ################################# base 2^64 -> base 2^26
698         mov     $h0,%rax
699         mov     $h0,%rdx
700         shr     \$52,$h0
701         mov     $h1,$d1
702         mov     $h1,$d2
703         shr     \$26,%rdx
704         and     \$0x3ffffff,%rax        # h[0]
705         shl     \$12,$d1
706         and     \$0x3ffffff,%rdx        # h[1]
707         shr     \$14,$h1
708         or      $d1,$h0
709         shl     \$24,$h2
710         and     \$0x3ffffff,$h0         # h[2]
711         shr     \$40,$d2
712         and     \$0x3ffffff,$h1         # h[3]
713         or      $d2,$h2                 # h[4]
714
715         vmovd   %rax#d,$H0
716         vmovd   %rdx#d,$H1
717         vmovd   $h0#d,$H2
718         vmovd   $h1#d,$H3
719         vmovd   $h2#d,$H4
720         movl    \$1,20($ctx)            # set is_base2_26
721
722         call    __poly1305_init_avx
723
724 .Lproceed_avx:
725         mov     %r15,$len
726
727         mov     0(%rsp),%r15
728 .cfi_restore    %r15
729         mov     8(%rsp),%r14
730 .cfi_restore    %r14
731         mov     16(%rsp),%r13
732 .cfi_restore    %r13
733         mov     24(%rsp),%r12
734 .cfi_restore    %r12
735         mov     32(%rsp),%rbp
736 .cfi_restore    %rbp
737         mov     40(%rsp),%rbx
738 .cfi_restore    %rbx
739         lea     48(%rsp),%rax
740         lea     48(%rsp),%rsp
741 .cfi_adjust_cfa_offset  -48
742 .Lbase2_64_avx_epilogue:
743         jmp     .Ldo_avx
744 .cfi_endproc
745
746 .align  32
747 .Leven_avx:
748 .cfi_startproc
749         vmovd           4*0($ctx),$H0           # load hash value
750         vmovd           4*1($ctx),$H1
751         vmovd           4*2($ctx),$H2
752         vmovd           4*3($ctx),$H3
753         vmovd           4*4($ctx),$H4
754
755 .Ldo_avx:
756 ___
757 $code.=<<___    if (!$win64);
758         lea             -0x58(%rsp),%r11
759 .cfi_def_cfa            %r11,0x60
760         sub             \$0x178,%rsp
761 ___
762 $code.=<<___    if ($win64);
763         lea             -0xf8(%rsp),%r11
764         sub             \$0x218,%rsp
765         vmovdqa         %xmm6,0x50(%r11)
766         vmovdqa         %xmm7,0x60(%r11)
767         vmovdqa         %xmm8,0x70(%r11)
768         vmovdqa         %xmm9,0x80(%r11)
769         vmovdqa         %xmm10,0x90(%r11)
770         vmovdqa         %xmm11,0xa0(%r11)
771         vmovdqa         %xmm12,0xb0(%r11)
772         vmovdqa         %xmm13,0xc0(%r11)
773         vmovdqa         %xmm14,0xd0(%r11)
774         vmovdqa         %xmm15,0xe0(%r11)
775 .Ldo_avx_body:
776 ___
777 $code.=<<___;
778         sub             \$64,$len
779         lea             -32($inp),%rax
780         cmovc           %rax,$inp
781
782         vmovdqu         `16*3`($ctx),$D4        # preload r0^2
783         lea             `16*3+64`($ctx),$ctx    # size optimization
784         lea             .Lconst(%rip),%rcx
785
786         ################################################################
787         # load input
788         vmovdqu         16*2($inp),$T0
789         vmovdqu         16*3($inp),$T1
790         vmovdqa         64(%rcx),$MASK          # .Lmask26
791
792         vpsrldq         \$6,$T0,$T2             # splat input
793         vpsrldq         \$6,$T1,$T3
794         vpunpckhqdq     $T1,$T0,$T4             # 4
795         vpunpcklqdq     $T1,$T0,$T0             # 0:1
796         vpunpcklqdq     $T3,$T2,$T3             # 2:3
797
798         vpsrlq          \$40,$T4,$T4            # 4
799         vpsrlq          \$26,$T0,$T1
800         vpand           $MASK,$T0,$T0           # 0
801         vpsrlq          \$4,$T3,$T2
802         vpand           $MASK,$T1,$T1           # 1
803         vpsrlq          \$30,$T3,$T3
804         vpand           $MASK,$T2,$T2           # 2
805         vpand           $MASK,$T3,$T3           # 3
806         vpor            32(%rcx),$T4,$T4        # padbit, yes, always
807
808         jbe             .Lskip_loop_avx
809
810         # expand and copy pre-calculated table to stack
811         vmovdqu         `16*1-64`($ctx),$D1
812         vmovdqu         `16*2-64`($ctx),$D2
813         vpshufd         \$0xEE,$D4,$D3          # 34xx -> 3434
814         vpshufd         \$0x44,$D4,$D0          # xx12 -> 1212
815         vmovdqa         $D3,-0x90(%r11)
816         vmovdqa         $D0,0x00(%rsp)
817         vpshufd         \$0xEE,$D1,$D4
818         vmovdqu         `16*3-64`($ctx),$D0
819         vpshufd         \$0x44,$D1,$D1
820         vmovdqa         $D4,-0x80(%r11)
821         vmovdqa         $D1,0x10(%rsp)
822         vpshufd         \$0xEE,$D2,$D3
823         vmovdqu         `16*4-64`($ctx),$D1
824         vpshufd         \$0x44,$D2,$D2
825         vmovdqa         $D3,-0x70(%r11)
826         vmovdqa         $D2,0x20(%rsp)
827         vpshufd         \$0xEE,$D0,$D4
828         vmovdqu         `16*5-64`($ctx),$D2
829         vpshufd         \$0x44,$D0,$D0
830         vmovdqa         $D4,-0x60(%r11)
831         vmovdqa         $D0,0x30(%rsp)
832         vpshufd         \$0xEE,$D1,$D3
833         vmovdqu         `16*6-64`($ctx),$D0
834         vpshufd         \$0x44,$D1,$D1
835         vmovdqa         $D3,-0x50(%r11)
836         vmovdqa         $D1,0x40(%rsp)
837         vpshufd         \$0xEE,$D2,$D4
838         vmovdqu         `16*7-64`($ctx),$D1
839         vpshufd         \$0x44,$D2,$D2
840         vmovdqa         $D4,-0x40(%r11)
841         vmovdqa         $D2,0x50(%rsp)
842         vpshufd         \$0xEE,$D0,$D3
843         vmovdqu         `16*8-64`($ctx),$D2
844         vpshufd         \$0x44,$D0,$D0
845         vmovdqa         $D3,-0x30(%r11)
846         vmovdqa         $D0,0x60(%rsp)
847         vpshufd         \$0xEE,$D1,$D4
848         vpshufd         \$0x44,$D1,$D1
849         vmovdqa         $D4,-0x20(%r11)
850         vmovdqa         $D1,0x70(%rsp)
851         vpshufd         \$0xEE,$D2,$D3
852          vmovdqa        0x00(%rsp),$D4          # preload r0^2
853         vpshufd         \$0x44,$D2,$D2
854         vmovdqa         $D3,-0x10(%r11)
855         vmovdqa         $D2,0x80(%rsp)
856
857         jmp             .Loop_avx
858
859 .align  32
860 .Loop_avx:
861         ################################################################
862         # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
863         # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
864         #   \___________________/
865         # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
866         # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
867         #   \___________________/ \____________________/
868         #
869         # Note that we start with inp[2:3]*r^2. This is because it
870         # doesn't depend on reduction in previous iteration.
871         ################################################################
872         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
873         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
874         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
875         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
876         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
877         #
878         # though note that $Tx and $Hx are "reversed" in this section,
879         # and $D4 is preloaded with r0^2...
880
881         vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
882         vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
883           vmovdqa       $H2,0x20(%r11)                          # offload hash
884         vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
885          vmovdqa        0x10(%rsp),$H2          # r1^2
886         vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
887         vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
888
889           vmovdqa       $H0,0x00(%r11)                          #
890         vpmuludq        0x20(%rsp),$T4,$H0      # h4*s1
891           vmovdqa       $H1,0x10(%r11)                          #
892         vpmuludq        $T3,$H2,$H1             # h3*r1
893         vpaddq          $H0,$D0,$D0             # d0 += h4*s1
894         vpaddq          $H1,$D4,$D4             # d4 += h3*r1
895           vmovdqa       $H3,0x30(%r11)                          #
896         vpmuludq        $T2,$H2,$H0             # h2*r1
897         vpmuludq        $T1,$H2,$H1             # h1*r1
898         vpaddq          $H0,$D3,$D3             # d3 += h2*r1
899          vmovdqa        0x30(%rsp),$H3          # r2^2
900         vpaddq          $H1,$D2,$D2             # d2 += h1*r1
901           vmovdqa       $H4,0x40(%r11)                          #
902         vpmuludq        $T0,$H2,$H2             # h0*r1
903          vpmuludq       $T2,$H3,$H0             # h2*r2
904         vpaddq          $H2,$D1,$D1             # d1 += h0*r1
905
906          vmovdqa        0x40(%rsp),$H4          # s2^2
907         vpaddq          $H0,$D4,$D4             # d4 += h2*r2
908         vpmuludq        $T1,$H3,$H1             # h1*r2
909         vpmuludq        $T0,$H3,$H3             # h0*r2
910         vpaddq          $H1,$D3,$D3             # d3 += h1*r2
911          vmovdqa        0x50(%rsp),$H2          # r3^2
912         vpaddq          $H3,$D2,$D2             # d2 += h0*r2
913         vpmuludq        $T4,$H4,$H0             # h4*s2
914         vpmuludq        $T3,$H4,$H4             # h3*s2
915         vpaddq          $H0,$D1,$D1             # d1 += h4*s2
916          vmovdqa        0x60(%rsp),$H3          # s3^2
917         vpaddq          $H4,$D0,$D0             # d0 += h3*s2
918
919          vmovdqa        0x80(%rsp),$H4          # s4^2
920         vpmuludq        $T1,$H2,$H1             # h1*r3
921         vpmuludq        $T0,$H2,$H2             # h0*r3
922         vpaddq          $H1,$D4,$D4             # d4 += h1*r3
923         vpaddq          $H2,$D3,$D3             # d3 += h0*r3
924         vpmuludq        $T4,$H3,$H0             # h4*s3
925         vpmuludq        $T3,$H3,$H1             # h3*s3
926         vpaddq          $H0,$D2,$D2             # d2 += h4*s3
927          vmovdqu        16*0($inp),$H0                          # load input
928         vpaddq          $H1,$D1,$D1             # d1 += h3*s3
929         vpmuludq        $T2,$H3,$H3             # h2*s3
930          vpmuludq       $T2,$H4,$T2             # h2*s4
931         vpaddq          $H3,$D0,$D0             # d0 += h2*s3
932
933          vmovdqu        16*1($inp),$H1                          #
934         vpaddq          $T2,$D1,$D1             # d1 += h2*s4
935         vpmuludq        $T3,$H4,$T3             # h3*s4
936         vpmuludq        $T4,$H4,$T4             # h4*s4
937          vpsrldq        \$6,$H0,$H2                             # splat input
938         vpaddq          $T3,$D2,$D2             # d2 += h3*s4
939         vpaddq          $T4,$D3,$D3             # d3 += h4*s4
940          vpsrldq        \$6,$H1,$H3                             #
941         vpmuludq        0x70(%rsp),$T0,$T4      # h0*r4
942         vpmuludq        $T1,$H4,$T0             # h1*s4
943          vpunpckhqdq    $H1,$H0,$H4             # 4
944         vpaddq          $T4,$D4,$D4             # d4 += h0*r4
945          vmovdqa        -0x90(%r11),$T4         # r0^4
946         vpaddq          $T0,$D0,$D0             # d0 += h1*s4
947
948         vpunpcklqdq     $H1,$H0,$H0             # 0:1
949         vpunpcklqdq     $H3,$H2,$H3             # 2:3
950
951         #vpsrlq         \$40,$H4,$H4            # 4
952         vpsrldq         \$`40/8`,$H4,$H4        # 4
953         vpsrlq          \$26,$H0,$H1
954         vpand           $MASK,$H0,$H0           # 0
955         vpsrlq          \$4,$H3,$H2
956         vpand           $MASK,$H1,$H1           # 1
957         vpand           0(%rcx),$H4,$H4         # .Lmask24
958         vpsrlq          \$30,$H3,$H3
959         vpand           $MASK,$H2,$H2           # 2
960         vpand           $MASK,$H3,$H3           # 3
961         vpor            32(%rcx),$H4,$H4        # padbit, yes, always
962
963         vpaddq          0x00(%r11),$H0,$H0      # add hash value
964         vpaddq          0x10(%r11),$H1,$H1
965         vpaddq          0x20(%r11),$H2,$H2
966         vpaddq          0x30(%r11),$H3,$H3
967         vpaddq          0x40(%r11),$H4,$H4
968
969         lea             16*2($inp),%rax
970         lea             16*4($inp),$inp
971         sub             \$64,$len
972         cmovc           %rax,$inp
973
974         ################################################################
975         # Now we accumulate (inp[0:1]+hash)*r^4
976         ################################################################
977         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
978         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
979         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
980         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
981         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
982
983         vpmuludq        $H0,$T4,$T0             # h0*r0
984         vpmuludq        $H1,$T4,$T1             # h1*r0
985         vpaddq          $T0,$D0,$D0
986         vpaddq          $T1,$D1,$D1
987          vmovdqa        -0x80(%r11),$T2         # r1^4
988         vpmuludq        $H2,$T4,$T0             # h2*r0
989         vpmuludq        $H3,$T4,$T1             # h3*r0
990         vpaddq          $T0,$D2,$D2
991         vpaddq          $T1,$D3,$D3
992         vpmuludq        $H4,$T4,$T4             # h4*r0
993          vpmuludq       -0x70(%r11),$H4,$T0     # h4*s1
994         vpaddq          $T4,$D4,$D4
995
996         vpaddq          $T0,$D0,$D0             # d0 += h4*s1
997         vpmuludq        $H2,$T2,$T1             # h2*r1
998         vpmuludq        $H3,$T2,$T0             # h3*r1
999         vpaddq          $T1,$D3,$D3             # d3 += h2*r1
1000          vmovdqa        -0x60(%r11),$T3         # r2^4
1001         vpaddq          $T0,$D4,$D4             # d4 += h3*r1
1002         vpmuludq        $H1,$T2,$T1             # h1*r1
1003         vpmuludq        $H0,$T2,$T2             # h0*r1
1004         vpaddq          $T1,$D2,$D2             # d2 += h1*r1
1005         vpaddq          $T2,$D1,$D1             # d1 += h0*r1
1006
1007          vmovdqa        -0x50(%r11),$T4         # s2^4
1008         vpmuludq        $H2,$T3,$T0             # h2*r2
1009         vpmuludq        $H1,$T3,$T1             # h1*r2
1010         vpaddq          $T0,$D4,$D4             # d4 += h2*r2
1011         vpaddq          $T1,$D3,$D3             # d3 += h1*r2
1012          vmovdqa        -0x40(%r11),$T2         # r3^4
1013         vpmuludq        $H0,$T3,$T3             # h0*r2
1014         vpmuludq        $H4,$T4,$T0             # h4*s2
1015         vpaddq          $T3,$D2,$D2             # d2 += h0*r2
1016         vpaddq          $T0,$D1,$D1             # d1 += h4*s2
1017          vmovdqa        -0x30(%r11),$T3         # s3^4
1018         vpmuludq        $H3,$T4,$T4             # h3*s2
1019          vpmuludq       $H1,$T2,$T1             # h1*r3
1020         vpaddq          $T4,$D0,$D0             # d0 += h3*s2
1021
1022          vmovdqa        -0x10(%r11),$T4         # s4^4
1023         vpaddq          $T1,$D4,$D4             # d4 += h1*r3
1024         vpmuludq        $H0,$T2,$T2             # h0*r3
1025         vpmuludq        $H4,$T3,$T0             # h4*s3
1026         vpaddq          $T2,$D3,$D3             # d3 += h0*r3
1027         vpaddq          $T0,$D2,$D2             # d2 += h4*s3
1028          vmovdqu        16*2($inp),$T0                          # load input
1029         vpmuludq        $H3,$T3,$T2             # h3*s3
1030         vpmuludq        $H2,$T3,$T3             # h2*s3
1031         vpaddq          $T2,$D1,$D1             # d1 += h3*s3
1032          vmovdqu        16*3($inp),$T1                          #
1033         vpaddq          $T3,$D0,$D0             # d0 += h2*s3
1034
1035         vpmuludq        $H2,$T4,$H2             # h2*s4
1036         vpmuludq        $H3,$T4,$H3             # h3*s4
1037          vpsrldq        \$6,$T0,$T2                             # splat input
1038         vpaddq          $H2,$D1,$D1             # d1 += h2*s4
1039         vpmuludq        $H4,$T4,$H4             # h4*s4
1040          vpsrldq        \$6,$T1,$T3                             #
1041         vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
1042         vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
1043         vpmuludq        -0x20(%r11),$H0,$H4     # h0*r4
1044         vpmuludq        $H1,$T4,$H0
1045          vpunpckhqdq    $T1,$T0,$T4             # 4
1046         vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
1047         vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
1048
1049         vpunpcklqdq     $T1,$T0,$T0             # 0:1
1050         vpunpcklqdq     $T3,$T2,$T3             # 2:3
1051
1052         #vpsrlq         \$40,$T4,$T4            # 4
1053         vpsrldq         \$`40/8`,$T4,$T4        # 4
1054         vpsrlq          \$26,$T0,$T1
1055          vmovdqa        0x00(%rsp),$D4          # preload r0^2
1056         vpand           $MASK,$T0,$T0           # 0
1057         vpsrlq          \$4,$T3,$T2
1058         vpand           $MASK,$T1,$T1           # 1
1059         vpand           0(%rcx),$T4,$T4         # .Lmask24
1060         vpsrlq          \$30,$T3,$T3
1061         vpand           $MASK,$T2,$T2           # 2
1062         vpand           $MASK,$T3,$T3           # 3
1063         vpor            32(%rcx),$T4,$T4        # padbit, yes, always
1064
1065         ################################################################
1066         # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1067         # and P. Schwabe
1068
1069         vpsrlq          \$26,$H3,$D3
1070         vpand           $MASK,$H3,$H3
1071         vpaddq          $D3,$H4,$H4             # h3 -> h4
1072
1073         vpsrlq          \$26,$H0,$D0
1074         vpand           $MASK,$H0,$H0
1075         vpaddq          $D0,$D1,$H1             # h0 -> h1
1076
1077         vpsrlq          \$26,$H4,$D0
1078         vpand           $MASK,$H4,$H4
1079
1080         vpsrlq          \$26,$H1,$D1
1081         vpand           $MASK,$H1,$H1
1082         vpaddq          $D1,$H2,$H2             # h1 -> h2
1083
1084         vpaddq          $D0,$H0,$H0
1085         vpsllq          \$2,$D0,$D0
1086         vpaddq          $D0,$H0,$H0             # h4 -> h0
1087
1088         vpsrlq          \$26,$H2,$D2
1089         vpand           $MASK,$H2,$H2
1090         vpaddq          $D2,$H3,$H3             # h2 -> h3
1091
1092         vpsrlq          \$26,$H0,$D0
1093         vpand           $MASK,$H0,$H0
1094         vpaddq          $D0,$H1,$H1             # h0 -> h1
1095
1096         vpsrlq          \$26,$H3,$D3
1097         vpand           $MASK,$H3,$H3
1098         vpaddq          $D3,$H4,$H4             # h3 -> h4
1099
1100         ja              .Loop_avx
1101
1102 .Lskip_loop_avx:
1103         ################################################################
1104         # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1105
1106         vpshufd         \$0x10,$D4,$D4          # r0^n, xx12 -> x1x2
1107         add             \$32,$len
1108         jnz             .Long_tail_avx
1109
1110         vpaddq          $H2,$T2,$T2
1111         vpaddq          $H0,$T0,$T0
1112         vpaddq          $H1,$T1,$T1
1113         vpaddq          $H3,$T3,$T3
1114         vpaddq          $H4,$T4,$T4
1115
1116 .Long_tail_avx:
1117         vmovdqa         $H2,0x20(%r11)
1118         vmovdqa         $H0,0x00(%r11)
1119         vmovdqa         $H1,0x10(%r11)
1120         vmovdqa         $H3,0x30(%r11)
1121         vmovdqa         $H4,0x40(%r11)
1122
1123         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1124         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1125         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1126         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1127         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1128
1129         vpmuludq        $T2,$D4,$D2             # d2 = h2*r0
1130         vpmuludq        $T0,$D4,$D0             # d0 = h0*r0
1131          vpshufd        \$0x10,`16*1-64`($ctx),$H2              # r1^n
1132         vpmuludq        $T1,$D4,$D1             # d1 = h1*r0
1133         vpmuludq        $T3,$D4,$D3             # d3 = h3*r0
1134         vpmuludq        $T4,$D4,$D4             # d4 = h4*r0
1135
1136         vpmuludq        $T3,$H2,$H0             # h3*r1
1137         vpaddq          $H0,$D4,$D4             # d4 += h3*r1
1138          vpshufd        \$0x10,`16*2-64`($ctx),$H3              # s1^n
1139         vpmuludq        $T2,$H2,$H1             # h2*r1
1140         vpaddq          $H1,$D3,$D3             # d3 += h2*r1
1141          vpshufd        \$0x10,`16*3-64`($ctx),$H4              # r2^n
1142         vpmuludq        $T1,$H2,$H0             # h1*r1
1143         vpaddq          $H0,$D2,$D2             # d2 += h1*r1
1144         vpmuludq        $T0,$H2,$H2             # h0*r1
1145         vpaddq          $H2,$D1,$D1             # d1 += h0*r1
1146         vpmuludq        $T4,$H3,$H3             # h4*s1
1147         vpaddq          $H3,$D0,$D0             # d0 += h4*s1
1148
1149          vpshufd        \$0x10,`16*4-64`($ctx),$H2              # s2^n
1150         vpmuludq        $T2,$H4,$H1             # h2*r2
1151         vpaddq          $H1,$D4,$D4             # d4 += h2*r2
1152         vpmuludq        $T1,$H4,$H0             # h1*r2
1153         vpaddq          $H0,$D3,$D3             # d3 += h1*r2
1154          vpshufd        \$0x10,`16*5-64`($ctx),$H3              # r3^n
1155         vpmuludq        $T0,$H4,$H4             # h0*r2
1156         vpaddq          $H4,$D2,$D2             # d2 += h0*r2
1157         vpmuludq        $T4,$H2,$H1             # h4*s2
1158         vpaddq          $H1,$D1,$D1             # d1 += h4*s2
1159          vpshufd        \$0x10,`16*6-64`($ctx),$H4              # s3^n
1160         vpmuludq        $T3,$H2,$H2             # h3*s2
1161         vpaddq          $H2,$D0,$D0             # d0 += h3*s2
1162
1163         vpmuludq        $T1,$H3,$H0             # h1*r3
1164         vpaddq          $H0,$D4,$D4             # d4 += h1*r3
1165         vpmuludq        $T0,$H3,$H3             # h0*r3
1166         vpaddq          $H3,$D3,$D3             # d3 += h0*r3
1167          vpshufd        \$0x10,`16*7-64`($ctx),$H2              # r4^n
1168         vpmuludq        $T4,$H4,$H1             # h4*s3
1169         vpaddq          $H1,$D2,$D2             # d2 += h4*s3
1170          vpshufd        \$0x10,`16*8-64`($ctx),$H3              # s4^n
1171         vpmuludq        $T3,$H4,$H0             # h3*s3
1172         vpaddq          $H0,$D1,$D1             # d1 += h3*s3
1173         vpmuludq        $T2,$H4,$H4             # h2*s3
1174         vpaddq          $H4,$D0,$D0             # d0 += h2*s3
1175
1176         vpmuludq        $T0,$H2,$H2             # h0*r4
1177         vpaddq          $H2,$D4,$D4             # h4 = d4 + h0*r4
1178         vpmuludq        $T4,$H3,$H1             # h4*s4
1179         vpaddq          $H1,$D3,$D3             # h3 = d3 + h4*s4
1180         vpmuludq        $T3,$H3,$H0             # h3*s4
1181         vpaddq          $H0,$D2,$D2             # h2 = d2 + h3*s4
1182         vpmuludq        $T2,$H3,$H1             # h2*s4
1183         vpaddq          $H1,$D1,$D1             # h1 = d1 + h2*s4
1184         vpmuludq        $T1,$H3,$H3             # h1*s4
1185         vpaddq          $H3,$D0,$D0             # h0 = d0 + h1*s4
1186
1187         jz              .Lshort_tail_avx
1188
1189         vmovdqu         16*0($inp),$H0          # load input
1190         vmovdqu         16*1($inp),$H1
1191
1192         vpsrldq         \$6,$H0,$H2             # splat input
1193         vpsrldq         \$6,$H1,$H3
1194         vpunpckhqdq     $H1,$H0,$H4             # 4
1195         vpunpcklqdq     $H1,$H0,$H0             # 0:1
1196         vpunpcklqdq     $H3,$H2,$H3             # 2:3
1197
1198         vpsrlq          \$40,$H4,$H4            # 4
1199         vpsrlq          \$26,$H0,$H1
1200         vpand           $MASK,$H0,$H0           # 0
1201         vpsrlq          \$4,$H3,$H2
1202         vpand           $MASK,$H1,$H1           # 1
1203         vpsrlq          \$30,$H3,$H3
1204         vpand           $MASK,$H2,$H2           # 2
1205         vpand           $MASK,$H3,$H3           # 3
1206         vpor            32(%rcx),$H4,$H4        # padbit, yes, always
1207
1208         vpshufd         \$0x32,`16*0-64`($ctx),$T4      # r0^n, 34xx -> x3x4
1209         vpaddq          0x00(%r11),$H0,$H0
1210         vpaddq          0x10(%r11),$H1,$H1
1211         vpaddq          0x20(%r11),$H2,$H2
1212         vpaddq          0x30(%r11),$H3,$H3
1213         vpaddq          0x40(%r11),$H4,$H4
1214
1215         ################################################################
1216         # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1217
1218         vpmuludq        $H0,$T4,$T0             # h0*r0
1219         vpaddq          $T0,$D0,$D0             # d0 += h0*r0
1220         vpmuludq        $H1,$T4,$T1             # h1*r0
1221         vpaddq          $T1,$D1,$D1             # d1 += h1*r0
1222         vpmuludq        $H2,$T4,$T0             # h2*r0
1223         vpaddq          $T0,$D2,$D2             # d2 += h2*r0
1224          vpshufd        \$0x32,`16*1-64`($ctx),$T2              # r1^n
1225         vpmuludq        $H3,$T4,$T1             # h3*r0
1226         vpaddq          $T1,$D3,$D3             # d3 += h3*r0
1227         vpmuludq        $H4,$T4,$T4             # h4*r0
1228         vpaddq          $T4,$D4,$D4             # d4 += h4*r0
1229
1230         vpmuludq        $H3,$T2,$T0             # h3*r1
1231         vpaddq          $T0,$D4,$D4             # d4 += h3*r1
1232          vpshufd        \$0x32,`16*2-64`($ctx),$T3              # s1
1233         vpmuludq        $H2,$T2,$T1             # h2*r1
1234         vpaddq          $T1,$D3,$D3             # d3 += h2*r1
1235          vpshufd        \$0x32,`16*3-64`($ctx),$T4              # r2
1236         vpmuludq        $H1,$T2,$T0             # h1*r1
1237         vpaddq          $T0,$D2,$D2             # d2 += h1*r1
1238         vpmuludq        $H0,$T2,$T2             # h0*r1
1239         vpaddq          $T2,$D1,$D1             # d1 += h0*r1
1240         vpmuludq        $H4,$T3,$T3             # h4*s1
1241         vpaddq          $T3,$D0,$D0             # d0 += h4*s1
1242
1243          vpshufd        \$0x32,`16*4-64`($ctx),$T2              # s2
1244         vpmuludq        $H2,$T4,$T1             # h2*r2
1245         vpaddq          $T1,$D4,$D4             # d4 += h2*r2
1246         vpmuludq        $H1,$T4,$T0             # h1*r2
1247         vpaddq          $T0,$D3,$D3             # d3 += h1*r2
1248          vpshufd        \$0x32,`16*5-64`($ctx),$T3              # r3
1249         vpmuludq        $H0,$T4,$T4             # h0*r2
1250         vpaddq          $T4,$D2,$D2             # d2 += h0*r2
1251         vpmuludq        $H4,$T2,$T1             # h4*s2
1252         vpaddq          $T1,$D1,$D1             # d1 += h4*s2
1253          vpshufd        \$0x32,`16*6-64`($ctx),$T4              # s3
1254         vpmuludq        $H3,$T2,$T2             # h3*s2
1255         vpaddq          $T2,$D0,$D0             # d0 += h3*s2
1256
1257         vpmuludq        $H1,$T3,$T0             # h1*r3
1258         vpaddq          $T0,$D4,$D4             # d4 += h1*r3
1259         vpmuludq        $H0,$T3,$T3             # h0*r3
1260         vpaddq          $T3,$D3,$D3             # d3 += h0*r3
1261          vpshufd        \$0x32,`16*7-64`($ctx),$T2              # r4
1262         vpmuludq        $H4,$T4,$T1             # h4*s3
1263         vpaddq          $T1,$D2,$D2             # d2 += h4*s3
1264          vpshufd        \$0x32,`16*8-64`($ctx),$T3              # s4
1265         vpmuludq        $H3,$T4,$T0             # h3*s3
1266         vpaddq          $T0,$D1,$D1             # d1 += h3*s3
1267         vpmuludq        $H2,$T4,$T4             # h2*s3
1268         vpaddq          $T4,$D0,$D0             # d0 += h2*s3
1269
1270         vpmuludq        $H0,$T2,$T2             # h0*r4
1271         vpaddq          $T2,$D4,$D4             # d4 += h0*r4
1272         vpmuludq        $H4,$T3,$T1             # h4*s4
1273         vpaddq          $T1,$D3,$D3             # d3 += h4*s4
1274         vpmuludq        $H3,$T3,$T0             # h3*s4
1275         vpaddq          $T0,$D2,$D2             # d2 += h3*s4
1276         vpmuludq        $H2,$T3,$T1             # h2*s4
1277         vpaddq          $T1,$D1,$D1             # d1 += h2*s4
1278         vpmuludq        $H1,$T3,$T3             # h1*s4
1279         vpaddq          $T3,$D0,$D0             # d0 += h1*s4
1280
1281 .Lshort_tail_avx:
1282         ################################################################
1283         # horizontal addition
1284
1285         vpsrldq         \$8,$D4,$T4
1286         vpsrldq         \$8,$D3,$T3
1287         vpsrldq         \$8,$D1,$T1
1288         vpsrldq         \$8,$D0,$T0
1289         vpsrldq         \$8,$D2,$T2
1290         vpaddq          $T3,$D3,$D3
1291         vpaddq          $T4,$D4,$D4
1292         vpaddq          $T0,$D0,$D0
1293         vpaddq          $T1,$D1,$D1
1294         vpaddq          $T2,$D2,$D2
1295
1296         ################################################################
1297         # lazy reduction
1298
1299         vpsrlq          \$26,$D3,$H3
1300         vpand           $MASK,$D3,$D3
1301         vpaddq          $H3,$D4,$D4             # h3 -> h4
1302
1303         vpsrlq          \$26,$D0,$H0
1304         vpand           $MASK,$D0,$D0
1305         vpaddq          $H0,$D1,$D1             # h0 -> h1
1306
1307         vpsrlq          \$26,$D4,$H4
1308         vpand           $MASK,$D4,$D4
1309
1310         vpsrlq          \$26,$D1,$H1
1311         vpand           $MASK,$D1,$D1
1312         vpaddq          $H1,$D2,$D2             # h1 -> h2
1313
1314         vpaddq          $H4,$D0,$D0
1315         vpsllq          \$2,$H4,$H4
1316         vpaddq          $H4,$D0,$D0             # h4 -> h0
1317
1318         vpsrlq          \$26,$D2,$H2
1319         vpand           $MASK,$D2,$D2
1320         vpaddq          $H2,$D3,$D3             # h2 -> h3
1321
1322         vpsrlq          \$26,$D0,$H0
1323         vpand           $MASK,$D0,$D0
1324         vpaddq          $H0,$D1,$D1             # h0 -> h1
1325
1326         vpsrlq          \$26,$D3,$H3
1327         vpand           $MASK,$D3,$D3
1328         vpaddq          $H3,$D4,$D4             # h3 -> h4
1329
1330         vmovd           $D0,`4*0-48-64`($ctx)   # save partially reduced
1331         vmovd           $D1,`4*1-48-64`($ctx)
1332         vmovd           $D2,`4*2-48-64`($ctx)
1333         vmovd           $D3,`4*3-48-64`($ctx)
1334         vmovd           $D4,`4*4-48-64`($ctx)
1335 ___
1336 $code.=<<___    if ($win64);
1337         vmovdqa         0x50(%r11),%xmm6
1338         vmovdqa         0x60(%r11),%xmm7
1339         vmovdqa         0x70(%r11),%xmm8
1340         vmovdqa         0x80(%r11),%xmm9
1341         vmovdqa         0x90(%r11),%xmm10
1342         vmovdqa         0xa0(%r11),%xmm11
1343         vmovdqa         0xb0(%r11),%xmm12
1344         vmovdqa         0xc0(%r11),%xmm13
1345         vmovdqa         0xd0(%r11),%xmm14
1346         vmovdqa         0xe0(%r11),%xmm15
1347         lea             0xf8(%r11),%rsp
1348 .Ldo_avx_epilogue:
1349 ___
1350 $code.=<<___    if (!$win64);
1351         lea             0x58(%r11),%rsp
1352 .cfi_def_cfa            %rsp,8
1353 ___
1354 $code.=<<___;
1355         vzeroupper
1356         ret
1357 .cfi_endproc
1358 .size   poly1305_blocks_avx,.-poly1305_blocks_avx
1359
# poly1305_emit_avx: take the hash value kept at $ctx as five 32-bit limbs
# in base 2^26, repack it into base 2^64, reduce it modulo 2^130-5, add the
# 128-bit value at $nonce and store the low 128 bits at $mac.
# If the is_base2_26 flag at 20($ctx) is zero, control goes to .Lemit
# instead (label defined outside this block; presumably the base 2^64
# variant of this routine - confirm).
# $ctx, $nonce and $mac are register names interpolated by the Perl script;
# their bindings are established outside this block.
1360 .type   poly1305_emit_avx,\@function,3
1361 .align  32
1362 poly1305_emit_avx:
1363         cmpl    \$0,20($ctx)    # is_base2_26?
1364         je      .Lemit          # flag clear: hand over to .Lemit
1365
1366         mov     0($ctx),%eax    # load hash value base 2^26
1367         mov     4($ctx),%ecx    # h[1]; 32-bit loads zero the upper register half
1368         mov     8($ctx),%r8d    # h[2]
1369         mov     12($ctx),%r11d  # h[3]
1370         mov     16($ctx),%r10d  # h[4]
1371
1372         shl     \$26,%rcx       # base 2^26 -> base 2^64
1373         mov     %r8,%r9         # second copy of h[2], for its upper bits
1374         shl     \$52,%r8        # low 12 bits of h[2] land in bits 52..63
1375         add     %rcx,%rax       # h[0] + (h[1]<<26)
1376         shr     \$12,%r9        # bits of h[2] that belong to the second word
1377         add     %rax,%r8        # h0
1378         adc     \$0,%r9         # carry out of h0
1379
1380         shl     \$14,%r11       # h[3]<<14
1381         mov     %r10,%rax       # second copy of h[4]
1382         shr     \$24,%r10       # bits of h[4] at 2^128 and above
1383         add     %r11,%r9
1384         shl     \$40,%rax       # h[4]<<40; bits shifted out are the ones kept in %r10
1385         add     %rax,%r9        # h1
1386         adc     \$0,%r10        # h2
1387
1388         mov     %r10,%rax       # could be partially reduced, so reduce
1389         mov     %r10,%rcx
1390         and     \$3,%r10        # h2 keeps only bits 128..129
1391         shr     \$2,%rax        # q = h2>>2, the part at 2^130 and above
1392         and     \$-4,%rcx       # 4*q
1393         add     %rcx,%rax       # 5*q, because 2^130 = 5 mod 2^130-5
1394         add     %rax,%r8        # fold 5*q into h0 ...
1395         adc     \$0,%r9         # ... and ripple the carry upwards
1396         adc     \$0,%r10
1397
1398         mov     %r8,%rax        # keep h0 for the case h is below the modulus
1399         add     \$5,%r8         # compare to modulus
1400         mov     %r9,%rcx        # keep h1; mov leaves CF intact for the adc below
1401         adc     \$0,%r9
1402         adc     \$0,%r10
1403         shr     \$2,%r10        # did 130-bit value overflow?
1404         cmovnz  %r8,%rax        # yes: take low words of h+5, i.e. h-(2^130-5)
1405         cmovnz  %r9,%rcx
1406
1407         add     0($nonce),%rax  # accumulate nonce
1408         adc     8($nonce),%rcx  # carry out of the upper word is discarded
1409         mov     %rax,0($mac)    # write result
1410         mov     %rcx,8($mac)
1411
1412         ret
1413 .size   poly1305_emit_avx,.-poly1305_emit_avx
1414 ___
1415
1416 if ($avx>1) {
1417 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1418     map("%ymm$_",(0..15));
1419 my $S4=$MASK;
1420
1421 $code.=<<___;
1422 .type   poly1305_blocks_avx2,\@function,4
1423 .align  32
1424 poly1305_blocks_avx2:
1425 .cfi_startproc
1426         mov     20($ctx),%r8d           # is_base2_26
1427         cmp     \$128,$len
1428         jae     .Lblocks_avx2
1429         test    %r8d,%r8d
1430         jz      .Lblocks
1431
1432 .Lblocks_avx2:
1433         and     \$-16,$len
1434         jz      .Lno_data_avx2
1435
1436         vzeroupper
1437
1438         test    %r8d,%r8d
1439         jz      .Lbase2_64_avx2
1440
1441         test    \$63,$len
1442         jz      .Leven_avx2
1443
1444         push    %rbx
1445 .cfi_push       %rbx
1446         push    %rbp
1447 .cfi_push       %rbp
1448         push    %r12
1449 .cfi_push       %r12
1450         push    %r13
1451 .cfi_push       %r13
1452         push    %r14
1453 .cfi_push       %r14
1454         push    %r15
1455 .cfi_push       %r15
1456 .Lblocks_avx2_body:
1457
1458         mov     $len,%r15               # reassign $len
1459
1460         mov     0($ctx),$d1             # load hash value
1461         mov     8($ctx),$d2
1462         mov     16($ctx),$h2#d
1463
1464         mov     24($ctx),$r0            # load r
1465         mov     32($ctx),$s1
1466
1467         ################################# base 2^26 -> base 2^64
1468         mov     $d1#d,$h0#d
1469         and     \$`-1*(1<<31)`,$d1
1470         mov     $d2,$r1                 # borrow $r1
1471         mov     $d2#d,$h1#d
1472         and     \$`-1*(1<<31)`,$d2
1473
1474         shr     \$6,$d1
1475         shl     \$52,$r1
1476         add     $d1,$h0
1477         shr     \$12,$h1
1478         shr     \$18,$d2
1479         add     $r1,$h0
1480         adc     $d2,$h1
1481
1482         mov     $h2,$d1
1483         shl     \$40,$d1
1484         shr     \$24,$h2
1485         add     $d1,$h1
1486         adc     \$0,$h2                 # can be partially reduced...
1487
1488         mov     \$-4,$d2                # ... so reduce
1489         mov     $h2,$d1
1490         and     $h2,$d2
1491         shr     \$2,$d1
1492         and     \$3,$h2
1493         add     $d2,$d1                 # =*5
1494         add     $d1,$h0
1495         adc     \$0,$h1
1496         adc     \$0,$h2
1497
1498         mov     $s1,$r1
1499         mov     $s1,%rax
1500         shr     \$2,$s1
1501         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
1502
1503 .Lbase2_26_pre_avx2:
1504         add     0($inp),$h0             # accumulate input
1505         adc     8($inp),$h1
1506         lea     16($inp),$inp
1507         adc     $padbit,$h2
1508         sub     \$16,%r15
1509
1510         call    __poly1305_block
1511         mov     $r1,%rax
1512
1513         test    \$63,%r15
1514         jnz     .Lbase2_26_pre_avx2
1515
1516         test    $padbit,$padbit         # if $padbit is zero,
1517         jz      .Lstore_base2_64_avx2   # store hash in base 2^64 format
1518
1519         ################################# base 2^64 -> base 2^26
1520         mov     $h0,%rax
1521         mov     $h0,%rdx
1522         shr     \$52,$h0
1523         mov     $h1,$r0
1524         mov     $h1,$r1
1525         shr     \$26,%rdx
1526         and     \$0x3ffffff,%rax        # h[0]
1527         shl     \$12,$r0
1528         and     \$0x3ffffff,%rdx        # h[1]
1529         shr     \$14,$h1
1530         or      $r0,$h0
1531         shl     \$24,$h2
1532         and     \$0x3ffffff,$h0         # h[2]
1533         shr     \$40,$r1
1534         and     \$0x3ffffff,$h1         # h[3]
1535         or      $r1,$h2                 # h[4]
1536
1537         test    %r15,%r15
1538         jz      .Lstore_base2_26_avx2
1539
1540         vmovd   %rax#d,%x#$H0
1541         vmovd   %rdx#d,%x#$H1
1542         vmovd   $h0#d,%x#$H2
1543         vmovd   $h1#d,%x#$H3
1544         vmovd   $h2#d,%x#$H4
1545         jmp     .Lproceed_avx2
1546
1547 .align  32
1548 .Lstore_base2_64_avx2:
1549         mov     $h0,0($ctx)
1550         mov     $h1,8($ctx)
1551         mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
1552         jmp     .Ldone_avx2
1553
1554 .align  16
1555 .Lstore_base2_26_avx2:
1556         mov     %rax#d,0($ctx)          # store hash value base 2^26
1557         mov     %rdx#d,4($ctx)
1558         mov     $h0#d,8($ctx)
1559         mov     $h1#d,12($ctx)
1560         mov     $h2#d,16($ctx)
1561 .align  16
1562 .Ldone_avx2:
1563         mov     0(%rsp),%r15
1564 .cfi_restore    %r15
1565         mov     8(%rsp),%r14
1566 .cfi_restore    %r14
1567         mov     16(%rsp),%r13
1568 .cfi_restore    %r13
1569         mov     24(%rsp),%r12
1570 .cfi_restore    %r12
1571         mov     32(%rsp),%rbp
1572 .cfi_restore    %rbp
1573         mov     40(%rsp),%rbx
1574 .cfi_restore    %rbx
1575         lea     48(%rsp),%rsp
1576 .cfi_adjust_cfa_offset  -48
1577 .Lno_data_avx2:
1578 .Lblocks_avx2_epilogue:
1579         ret
1580 .cfi_endproc
1581
1582 .align  32
1583 .Lbase2_64_avx2:
1584 .cfi_startproc
1585         push    %rbx
1586 .cfi_push       %rbx
1587         push    %rbp
1588 .cfi_push       %rbp
1589         push    %r12
1590 .cfi_push       %r12
1591         push    %r13
1592 .cfi_push       %r13
1593         push    %r14
1594 .cfi_push       %r14
1595         push    %r15
1596 .cfi_push       %r15
1597 .Lbase2_64_avx2_body:
1598
1599         mov     $len,%r15               # reassign $len
1600
1601         mov     24($ctx),$r0            # load r
1602         mov     32($ctx),$s1
1603
1604         mov     0($ctx),$h0             # load hash value
1605         mov     8($ctx),$h1
1606         mov     16($ctx),$h2#d
1607
1608         mov     $s1,$r1
1609         mov     $s1,%rax
1610         shr     \$2,$s1
1611         add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
1612
1613         test    \$63,$len
1614         jz      .Linit_avx2
1615
1616 .Lbase2_64_pre_avx2:
1617         add     0($inp),$h0             # accumulate input
1618         adc     8($inp),$h1
1619         lea     16($inp),$inp
1620         adc     $padbit,$h2
1621         sub     \$16,%r15
1622
1623         call    __poly1305_block
1624         mov     $r1,%rax
1625
1626         test    \$63,%r15
1627         jnz     .Lbase2_64_pre_avx2
1628
1629 .Linit_avx2:
1630         ################################# base 2^64 -> base 2^26
1631         mov     $h0,%rax
1632         mov     $h0,%rdx
1633         shr     \$52,$h0
1634         mov     $h1,$d1
1635         mov     $h1,$d2
1636         shr     \$26,%rdx
1637         and     \$0x3ffffff,%rax        # h[0]
1638         shl     \$12,$d1
1639         and     \$0x3ffffff,%rdx        # h[1]
1640         shr     \$14,$h1
1641         or      $d1,$h0
1642         shl     \$24,$h2
1643         and     \$0x3ffffff,$h0         # h[2]
1644         shr     \$40,$d2
1645         and     \$0x3ffffff,$h1         # h[3]
1646         or      $d2,$h2                 # h[4]
1647
1648         vmovd   %rax#d,%x#$H0
1649         vmovd   %rdx#d,%x#$H1
1650         vmovd   $h0#d,%x#$H2
1651         vmovd   $h1#d,%x#$H3
1652         vmovd   $h2#d,%x#$H4
1653         movl    \$1,20($ctx)            # set is_base2_26
1654
1655         call    __poly1305_init_avx
1656
1657 .Lproceed_avx2:
1658         mov     %r15,$len                       # restore $len
1659         mov     OPENSSL_ia32cap_P+8(%rip),%r10d
1660         mov     \$`(1<<31|1<<30|1<<16)`,%r11d
1661
1662         mov     0(%rsp),%r15
1663 .cfi_restore    %r15
1664         mov     8(%rsp),%r14
1665 .cfi_restore    %r14
1666         mov     16(%rsp),%r13
1667 .cfi_restore    %r13
1668         mov     24(%rsp),%r12
1669 .cfi_restore    %r12
1670         mov     32(%rsp),%rbp
1671 .cfi_restore    %rbp
1672         mov     40(%rsp),%rbx
1673 .cfi_restore    %rbx
1674         lea     48(%rsp),%rax
1675         lea     48(%rsp),%rsp
1676 .cfi_adjust_cfa_offset  -48
1677 .Lbase2_64_avx2_epilogue:
1678         jmp     .Ldo_avx2
1679 .cfi_endproc
1680
1681 .align  32
1682 .Leven_avx2:
1683 .cfi_startproc
             # Entry used when the hash is already stored as five 32-bit
             # base 2^26 limbs at the start of the context: preload the
             # capability word and mask for the check at .Ldo_avx2, then
             # fetch the limbs into the low dwords of the hash registers.
1684         mov             OPENSSL_ia32cap_P+8(%rip),%r10d
1685         mov             \$`(1<<31|1<<30|1<<16)`,%r11d
1686         vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
1687         vmovd           4*1($ctx),%x#$H1
1688         vmovd           4*2($ctx),%x#$H2
1689         vmovd           4*3($ctx),%x#$H3
1690         vmovd           4*4($ctx),%x#$H4
1691
1692 .Ldo_avx2:
1693 ___
1694 $code.=<<___            if ($avx>2);
             # Hand over to the AVX512 code only when at least 512 bytes
             # remain and every bit of the mask in %r11d is present in the
             # capability word in %r10d (both were loaded before reaching
             # .Ldo_avx2; %r10d is clobbered by the test).
1695         cmp             \$512,$len
1696         jb              .Lskip_avx512           # shorter input stays on the AVX2 path
1697         and             %r11d,%r10d
1698         cmp             %r11d,%r10d             # check for AVX512F+BW+VL
1699         je              .Lblocks_avx512
1700 .Lskip_avx512:
1701 ___
1702 $code.=<<___    if (!$win64);
             # %r11 anchors the frame (the epilogue restores the stack
             # pointer from it), because %rsp itself is re-aligned below.
1703         lea             -8(%rsp),%r11
1704 .cfi_def_cfa            %r11,16
1705         sub             \$0x128,%rsp            # room for the expanded table
1706 ___
1707 $code.=<<___    if ($win64);
             # Win64 build: %r11 anchors the frame, and %xmm6-%xmm15 are
             # stored at 0x50..0xe0 from it so that the epilogue can
             # restore them.
1708         lea             -0xf8(%rsp),%r11
1709         sub             \$0x1c8,%rsp
1710         vmovdqa         %xmm6,0x50(%r11)
1711         vmovdqa         %xmm7,0x60(%r11)
1712         vmovdqa         %xmm8,0x70(%r11)
1713         vmovdqa         %xmm9,0x80(%r11)
1714         vmovdqa         %xmm10,0x90(%r11)
1715         vmovdqa         %xmm11,0xa0(%r11)
1716         vmovdqa         %xmm12,0xb0(%r11)
1717         vmovdqa         %xmm13,0xc0(%r11)
1718         vmovdqa         %xmm14,0xd0(%r11)
1719         vmovdqa         %xmm15,0xe0(%r11)
1720 .Ldo_avx2_body:
1721 ___
1722 $code.=<<___;
1723         lea             .Lconst(%rip),%rcx
1724         lea             48+64($ctx),$ctx        # size optimization: table offsets below span -64..64
1725         vmovdqa         96(%rcx),$T0            # .Lpermd_avx2
1726
1727         # expand and copy pre-calculated table to stack
             # Nine 16-byte entries are read from the context, permuted
             # and written to nine 32-byte stack slots. The slot order,
             # as used by the multiplication code below, is
             # r0, r1, s1, r2, s2, r3, s3, r4, s4. The mask register
             # serves as scratch for the last entry and is reloaded
             # afterwards.
1728         vmovdqu         `16*0-64`($ctx),%x#$T2
1729         and             \$-512,%rsp             # align the table on a 512-byte boundary
1730         vmovdqu         `16*1-64`($ctx),%x#$T3
1731         vmovdqu         `16*2-64`($ctx),%x#$T4
1732         vmovdqu         `16*3-64`($ctx),%x#$D0
1733         vmovdqu         `16*4-64`($ctx),%x#$D1
1734         vmovdqu         `16*5-64`($ctx),%x#$D2
1735         lea             0x90(%rsp),%rax         # size optimization: slot offsets become -0x70..0x70
1736         vmovdqu         `16*6-64`($ctx),%x#$D3
1737         vpermd          $T2,$T0,$T2             # 00003412 -> 14243444
1738         vmovdqu         `16*7-64`($ctx),%x#$D4
1739         vpermd          $T3,$T0,$T3
1740         vmovdqu         `16*8-64`($ctx),%x#$MASK
1741         vpermd          $T4,$T0,$T4
1742         vmovdqa         $T2,0x00(%rsp)          # r0
1743         vpermd          $D0,$T0,$D0
1744         vmovdqa         $T3,0x20-0x90(%rax)     # r1
1745         vpermd          $D1,$T0,$D1
1746         vmovdqa         $T4,0x40-0x90(%rax)     # s1
1747         vpermd          $D2,$T0,$D2
1748         vmovdqa         $D0,0x60-0x90(%rax)     # r2
1749         vpermd          $D3,$T0,$D3
1750         vmovdqa         $D1,0x80-0x90(%rax)     # s2
1751         vpermd          $D4,$T0,$D4
1752         vmovdqa         $D2,0xa0-0x90(%rax)     # r3
1753         vpermd          $MASK,$T0,$MASK
1754         vmovdqa         $D3,0xc0-0x90(%rax)     # s3
1755         vmovdqa         $D4,0xe0-0x90(%rax)     # r4
1756         vmovdqa         $MASK,0x100-0x90(%rax)  # s4
1757         vmovdqa         64(%rcx),$MASK          # .Lmask26
1758
1759         ################################################################
1760         # load input
             # four 16-byte blocks: blocks 0 and 2 share one register
             # (low and high half), blocks 1 and 3 the other
1761         vmovdqu         16*0($inp),%x#$T0
1762         vmovdqu         16*1($inp),%x#$T1
1763         vinserti128     \$1,16*2($inp),$T0,$T0
1764         vinserti128     \$1,16*3($inp),$T1,$T1
1765         lea             16*4($inp),$inp
1766
1767         vpsrldq         \$6,$T0,$T2             # splat input
1768         vpsrldq         \$6,$T1,$T3
1769         vpunpckhqdq     $T1,$T0,$T4             # 4
1770         vpunpcklqdq     $T3,$T2,$T2             # 2:3
1771         vpunpcklqdq     $T1,$T0,$T0             # 0:1
1772
1773         vpsrlq          \$30,$T2,$T3
1774         vpsrlq          \$4,$T2,$T2
1775         vpsrlq          \$26,$T0,$T1
1776         vpsrlq          \$40,$T4,$T4            # 4
1777         vpand           $MASK,$T2,$T2           # 2
1778         vpand           $MASK,$T0,$T0           # 0
1779         vpand           $MASK,$T1,$T1           # 1
1780         vpand           $MASK,$T3,$T3           # 3
1781         vpor            32(%rcx),$T4,$T4        # padbit, yes, always
1782
             # only limb 2 is added here; the other four limbs are added
             # at the top of .Loop_avx2 / .Ltail_avx2
1783         vpaddq          $H2,$T2,$H2             # accumulate input
1784         sub             \$64,$len
1785         jz              .Ltail_avx2             # exactly 64 bytes: no loop iteration
1786         jmp             .Loop_avx2
1787
1788 .align  32
1789 .Loop_avx2:
1790         ################################################################
1791         # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1792         # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1793         # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1794         # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1795         #   \________/\__________/
1796         ################################################################
             # One iteration: finish adding the already split input to the
             # hash, multiply all lanes by r^4, then reduce lazily. The
             # more deeply indented instructions load and split the next
             # 64 bytes of input in between.
1797         #vpaddq         $H2,$T2,$H2             # accumulate input
1798         vpaddq          $H0,$T0,$H0
1799         vmovdqa         `32*0`(%rsp),$T0        # r0^4
1800         vpaddq          $H1,$T1,$H1
1801         vmovdqa         `32*1`(%rsp),$T1        # r1^4
1802         vpaddq          $H3,$T3,$H3
1803         vmovdqa         `32*3`(%rsp),$T2        # r2^4
1804         vpaddq          $H4,$T4,$H4
1805         vmovdqa         `32*6-0x90`(%rax),$T3   # s3^4
1806         vmovdqa         `32*8-0x90`(%rax),$S4   # s4^4
1807
1808         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1809         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1810         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1811         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1812         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1813         #
1814         # however, as h2 is "chronologically" first one available pull
1815         # corresponding operations up, so it's
1816         #
1817         # d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
1818         # d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
1819         # d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1820         # d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
1821         # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
             #
             # in the instruction comments below sN stands for 5*rN
1822
1823         vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
1824         vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
1825         vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
1826         vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
1827         vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
1828
1829         vpmuludq        $H0,$T1,$T4             # h0*r1
1830         vpmuludq        $H1,$T1,$H2             # h1*r1, borrow $H2 as temp
1831         vpaddq          $T4,$D1,$D1             # d1 += h0*r1
1832         vpaddq          $H2,$D2,$D2             # d2 += h1*r1
1833         vpmuludq        $H3,$T1,$T4             # h3*r1
1834         vpmuludq        `32*2`(%rsp),$H4,$H2    # h4*s1
1835         vpaddq          $T4,$D4,$D4             # d4 += h3*r1
1836         vpaddq          $H2,$D0,$D0             # d0 += h4*s1
1837          vmovdqa        `32*4-0x90`(%rax),$T1   # s2
1838
1839         vpmuludq        $H0,$T0,$T4             # h0*r0
1840         vpmuludq        $H1,$T0,$H2             # h1*r0
1841         vpaddq          $T4,$D0,$D0             # d0 += h0*r0
1842         vpaddq          $H2,$D1,$D1             # d1 += h1*r0
1843         vpmuludq        $H3,$T0,$T4             # h3*r0
1844         vpmuludq        $H4,$T0,$H2             # h4*r0
1845          vmovdqu        16*0($inp),%x#$T0       # load input
1846         vpaddq          $T4,$D3,$D3             # d3 += h3*r0
1847         vpaddq          $H2,$D4,$D4             # d4 += h4*r0
1848          vinserti128    \$1,16*2($inp),$T0,$T0
1849
1850         vpmuludq        $H3,$T1,$T4             # h3*s2
1851         vpmuludq        $H4,$T1,$H2             # h4*s2
1852          vmovdqu        16*1($inp),%x#$T1
1853         vpaddq          $T4,$D0,$D0             # d0 += h3*s2
1854         vpaddq          $H2,$D1,$D1             # d1 += h4*s2
1855          vmovdqa        `32*5-0x90`(%rax),$H2   # r3
1856         vpmuludq        $H1,$T2,$T4             # h1*r2
1857         vpmuludq        $H0,$T2,$T2             # h0*r2
1858         vpaddq          $T4,$D3,$D3             # d3 += h1*r2
1859         vpaddq          $T2,$D2,$D2             # d2 += h0*r2
1860          vinserti128    \$1,16*3($inp),$T1,$T1
1861          lea            16*4($inp),$inp
1862
1863         vpmuludq        $H1,$H2,$T4             # h1*r3
1864         vpmuludq        $H0,$H2,$H2             # h0*r3
1865          vpsrldq        \$6,$T0,$T2             # splat input
1866         vpaddq          $T4,$D4,$D4             # d4 += h1*r3
1867         vpaddq          $H2,$D3,$D3             # d3 += h0*r3
1868         vpmuludq        $H3,$T3,$T4             # h3*s3
1869         vpmuludq        $H4,$T3,$H2             # h4*s3
1870          vpsrldq        \$6,$T1,$T3
1871         vpaddq          $T4,$D1,$D1             # d1 += h3*s3
1872         vpaddq          $H2,$D2,$D2             # d2 += h4*s3
1873          vpunpckhqdq    $T1,$T0,$T4             # 4
1874
1875         vpmuludq        $H3,$S4,$H3             # h3*s4
1876         vpmuludq        $H4,$S4,$H4             # h4*s4
1877          vpunpcklqdq    $T1,$T0,$T0             # 0:1
1878         vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
1879         vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
1880          vpunpcklqdq    $T3,$T2,$T3             # 2:3
1881         vpmuludq        `32*7-0x90`(%rax),$H0,$H4       # h0*r4
1882         vpmuludq        $H1,$S4,$H0             # h1*s4
             # NOTE(review): the mask is reloaded on every pass, right
             # after the last use of the s4 operand; presumably the two
             # share a register - confirm against the register map.
1883         vmovdqa         64(%rcx),$MASK          # .Lmask26
1884         vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
1885         vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
1886
1887         ################################################################
1888         # lazy reduction (interleaved with tail of input splat)
1889
1890         vpsrlq          \$26,$H3,$D3
1891         vpand           $MASK,$H3,$H3
1892         vpaddq          $D3,$H4,$H4             # h3 -> h4
1893
1894         vpsrlq          \$26,$H0,$D0
1895         vpand           $MASK,$H0,$H0
1896         vpaddq          $D0,$D1,$H1             # h0 -> h1
1897
1898         vpsrlq          \$26,$H4,$D4
1899         vpand           $MASK,$H4,$H4
1900
1901          vpsrlq         \$4,$T3,$T2
1902
1903         vpsrlq          \$26,$H1,$D1
1904         vpand           $MASK,$H1,$H1
1905         vpaddq          $D1,$H2,$H2             # h1 -> h2
1906
             # the carry out of h4 re-enters h0 multiplied by 5 (c + 4*c)
1907         vpaddq          $D4,$H0,$H0
1908         vpsllq          \$2,$D4,$D4
1909         vpaddq          $D4,$H0,$H0             # h4 -> h0
1910
1911          vpand          $MASK,$T2,$T2           # 2
1912          vpsrlq         \$26,$T0,$T1
1913
1914         vpsrlq          \$26,$H2,$D2
1915         vpand           $MASK,$H2,$H2
1916         vpaddq          $D2,$H3,$H3             # h2 -> h3
1917
1918          vpaddq         $T2,$H2,$H2             # modulo-scheduled
1919          vpsrlq         \$30,$T3,$T3
1920
1921         vpsrlq          \$26,$H0,$D0
1922         vpand           $MASK,$H0,$H0
1923         vpaddq          $D0,$H1,$H1             # h0 -> h1
1924
1925          vpsrlq         \$40,$T4,$T4            # 4
1926
1927         vpsrlq          \$26,$H3,$D3
1928         vpand           $MASK,$H3,$H3
1929         vpaddq          $D3,$H4,$H4             # h3 -> h4
1930
1931          vpand          $MASK,$T0,$T0           # 0
1932          vpand          $MASK,$T1,$T1           # 1
1933          vpand          $MASK,$T3,$T3           # 3
1934          vpor           32(%rcx),$T4,$T4        # padbit, yes, always
1935
1936         sub             \$64,$len
1937         jnz             .Loop_avx2
1938
1939         .byte           0x66,0x90
1940 .Ltail_avx2:
1941         ################################################################
1942         # while above multiplications were by r^4 in all lanes, in last
1943         # iteration we multiply least significant lane by r^4 and most
1944         # significant one by r, so copy of above except that references
1945         # to the precomputed table are displaced by 4...
             #
             # The displaced loads are unaligned, hence vmovdqu. After the
             # multiplication the four lanes are summed, reduced, and the
             # five limbs are written back to the context.
1946
1947         #vpaddq         $H2,$T2,$H2             # accumulate input
1948         vpaddq          $H0,$T0,$H0
1949         vmovdqu         `32*0+4`(%rsp),$T0      # r0 of r^4..r^1
1950         vpaddq          $H1,$T1,$H1
1951         vmovdqu         `32*1+4`(%rsp),$T1      # r1 of r^4..r^1
1952         vpaddq          $H3,$T3,$H3
1953         vmovdqu         `32*3+4`(%rsp),$T2      # r2 of r^4..r^1
1954         vpaddq          $H4,$T4,$H4
1955         vmovdqu         `32*6+4-0x90`(%rax),$T3 # s3 of r^4..r^1
1956         vmovdqu         `32*8+4-0x90`(%rax),$S4 # s4 of r^4..r^1
1957
1958         vpmuludq        $H2,$T0,$D2             # d2 = h2*r0
1959         vpmuludq        $H2,$T1,$D3             # d3 = h2*r1
1960         vpmuludq        $H2,$T2,$D4             # d4 = h2*r2
1961         vpmuludq        $H2,$T3,$D0             # d0 = h2*s3
1962         vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
1963
1964         vpmuludq        $H0,$T1,$T4             # h0*r1
1965         vpmuludq        $H1,$T1,$H2             # h1*r1
1966         vpaddq          $T4,$D1,$D1             # d1 += h0*r1
1967         vpaddq          $H2,$D2,$D2             # d2 += h1*r1
1968         vpmuludq        $H3,$T1,$T4             # h3*r1
1969         vpmuludq        `32*2+4`(%rsp),$H4,$H2  # h4*s1
1970         vpaddq          $T4,$D4,$D4             # d4 += h3*r1
1971         vpaddq          $H2,$D0,$D0             # d0 += h4*s1
1972
1973         vpmuludq        $H0,$T0,$T4             # h0*r0
1974         vpmuludq        $H1,$T0,$H2             # h1*r0
1975         vpaddq          $T4,$D0,$D0             # d0 += h0*r0
1976          vmovdqu        `32*4+4-0x90`(%rax),$T1 # s2
1977         vpaddq          $H2,$D1,$D1             # d1 += h1*r0
1978         vpmuludq        $H3,$T0,$T4             # h3*r0
1979         vpmuludq        $H4,$T0,$H2             # h4*r0
1980         vpaddq          $T4,$D3,$D3             # d3 += h3*r0
1981         vpaddq          $H2,$D4,$D4             # d4 += h4*r0
1982
1983         vpmuludq        $H3,$T1,$T4             # h3*s2
1984         vpmuludq        $H4,$T1,$H2             # h4*s2
1985         vpaddq          $T4,$D0,$D0             # d0 += h3*s2
1986         vpaddq          $H2,$D1,$D1             # d1 += h4*s2
1987          vmovdqu        `32*5+4-0x90`(%rax),$H2 # r3
1988         vpmuludq        $H1,$T2,$T4             # h1*r2
1989         vpmuludq        $H0,$T2,$T2             # h0*r2
1990         vpaddq          $T4,$D3,$D3             # d3 += h1*r2
1991         vpaddq          $T2,$D2,$D2             # d2 += h0*r2
1992
1993         vpmuludq        $H1,$H2,$T4             # h1*r3
1994         vpmuludq        $H0,$H2,$H2             # h0*r3
1995         vpaddq          $T4,$D4,$D4             # d4 += h1*r3
1996         vpaddq          $H2,$D3,$D3             # d3 += h0*r3
1997         vpmuludq        $H3,$T3,$T4             # h3*s3
1998         vpmuludq        $H4,$T3,$H2             # h4*s3
1999         vpaddq          $T4,$D1,$D1             # d1 += h3*s3
2000         vpaddq          $H2,$D2,$D2             # d2 += h4*s3
2001
2002         vpmuludq        $H3,$S4,$H3             # h3*s4
2003         vpmuludq        $H4,$S4,$H4             # h4*s4
2004         vpaddq          $H3,$D2,$H2             # h2 = d2 + h3*s4
2005         vpaddq          $H4,$D3,$H3             # h3 = d3 + h4*s4
2006         vpmuludq        `32*7+4-0x90`(%rax),$H0,$H4             # h0*r4
2007         vpmuludq        $H1,$S4,$H0             # h1*s4
2008         vmovdqa         64(%rcx),$MASK          # .Lmask26
2009         vpaddq          $H4,$D4,$H4             # h4 = d4 + h0*r4
2010         vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
2011
2012         ################################################################
2013         # horizontal addition
             # step 1: within each 128-bit half add the upper quadword to
             #         the lower one
             # step 2: vpermq with selector 0x2 brings quadword 2 down to
             #         quadword 0, so the total ends up in quadword 0
2014
2015         vpsrldq         \$8,$D1,$T1
2016         vpsrldq         \$8,$H2,$T2
2017         vpsrldq         \$8,$H3,$T3
2018         vpsrldq         \$8,$H4,$T4
2019         vpsrldq         \$8,$H0,$T0
2020         vpaddq          $T1,$D1,$D1
2021         vpaddq          $T2,$H2,$H2
2022         vpaddq          $T3,$H3,$H3
2023         vpaddq          $T4,$H4,$H4
2024         vpaddq          $T0,$H0,$H0
2025
2026         vpermq          \$0x2,$H3,$T3
2027         vpermq          \$0x2,$H4,$T4
2028         vpermq          \$0x2,$H0,$T0
2029         vpermq          \$0x2,$D1,$T1
2030         vpermq          \$0x2,$H2,$T2
2031         vpaddq          $T3,$H3,$H3
2032         vpaddq          $T4,$H4,$H4
2033         vpaddq          $T0,$H0,$H0
2034         vpaddq          $T1,$D1,$D1
2035         vpaddq          $T2,$H2,$H2
2036
2037         ################################################################
2038         # lazy reduction
2039
2040         vpsrlq          \$26,$H3,$D3
2041         vpand           $MASK,$H3,$H3
2042         vpaddq          $D3,$H4,$H4             # h3 -> h4
2043
2044         vpsrlq          \$26,$H0,$D0
2045         vpand           $MASK,$H0,$H0
2046         vpaddq          $D0,$D1,$H1             # h0 -> h1
2047
2048         vpsrlq          \$26,$H4,$D4
2049         vpand           $MASK,$H4,$H4
2050
2051         vpsrlq          \$26,$H1,$D1
2052         vpand           $MASK,$H1,$H1
2053         vpaddq          $D1,$H2,$H2             # h1 -> h2
2054
             # the carry out of h4 re-enters h0 multiplied by 5 (c + 4*c)
2055         vpaddq          $D4,$H0,$H0
2056         vpsllq          \$2,$D4,$D4
2057         vpaddq          $D4,$H0,$H0             # h4 -> h0
2058
2059         vpsrlq          \$26,$H2,$D2
2060         vpand           $MASK,$H2,$H2
2061         vpaddq          $D2,$H3,$H3             # h2 -> h3
2062
2063         vpsrlq          \$26,$H0,$D0
2064         vpand           $MASK,$H0,$H0
2065         vpaddq          $D0,$H1,$H1             # h0 -> h1
2066
2067         vpsrlq          \$26,$H3,$D3
2068         vpand           $MASK,$H3,$H3
2069         vpaddq          $D3,$H4,$H4             # h3 -> h4
2070
             # the context pointer was advanced by 48+64 above, hence the
             # negative bias on the five 32-bit hash slots
2071         vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2072         vmovd           %x#$H1,`4*1-48-64`($ctx)
2073         vmovd           %x#$H2,`4*2-48-64`($ctx)
2074         vmovd           %x#$H3,`4*3-48-64`($ctx)
2075         vmovd           %x#$H4,`4*4-48-64`($ctx)
2076 ___
2077 $code.=<<___    if ($win64);
             # Win64 build: reload %xmm6-%xmm15 from the slots written by
             # the prologue, then recover the caller's stack pointer from
             # the frame anchor in %r11.
2078         vmovdqa         0x50(%r11),%xmm6
2079         vmovdqa         0x60(%r11),%xmm7
2080         vmovdqa         0x70(%r11),%xmm8
2081         vmovdqa         0x80(%r11),%xmm9
2082         vmovdqa         0x90(%r11),%xmm10
2083         vmovdqa         0xa0(%r11),%xmm11
2084         vmovdqa         0xb0(%r11),%xmm12
2085         vmovdqa         0xc0(%r11),%xmm13
2086         vmovdqa         0xd0(%r11),%xmm14
2087         vmovdqa         0xe0(%r11),%xmm15
2088         lea             0xf8(%r11),%rsp
2089 .Ldo_avx2_epilogue:
2090 ___
2091 $code.=<<___    if (!$win64);
2092         lea             8(%r11),%rsp            # undo the frame: %r11 was set to %rsp-8 on entry
2093 .cfi_def_cfa            %rsp,8
2094 ___
2095 $code.=<<___;
2096         vzeroupper                              # clear the upper halves of the vector registers
2097         ret
2098 .cfi_endproc
2099 .size   poly1305_blocks_avx2,.-poly1305_blocks_avx2
2100 ___
2101 #######################################################################
2102 if ($avx>2) {
2103 # On entry we have input length divisible by 64. But since inner loop
2104 # processes 128 bytes per iteration, cases when length is not divisible
2105 # by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
2106 # reason stack layout is kept identical to poly1305_blocks_avx2. If not
2107 # for this tail, we wouldn't have to even allocate stack frame...
2108
     # Register names private to the AVX512 code. They start out as
     # %ymm names; the groups used in 512-bit form are renamed to %zmm
     # further down.
2109 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%ymm$_",(16..24));   # key powers rN and sN = 5*rN
2110 my ($M0,$M1,$M2,$M3,$M4) = map("%ymm$_",(25..29));                    # product / carry temporaries
2111 my $PADBIT="%zmm30";                                                  # broadcast pad bit constant
2112 my $GATHER="%ymm31";
2113
2114 $code.=<<___;
2115 .type   poly1305_blocks_avx512,\@function,4
2116 .align  32
2117 poly1305_blocks_avx512:
2118 .cfi_startproc
             # reached from .Ldo_avx2 when at least 512 bytes remain and
             # the AVX512F+BW+VL check passes
2119 .Lblocks_avx512:
2120         vzeroupper
2121 ___
2122 $code.=<<___    if (!$win64);
             # same frame as the AVX2 code (0x128 bytes, anchored by %r11),
             # since a 64-byte remainder is passed to .Ltail_avx2
2123         lea             -8(%rsp),%r11
2124 .cfi_def_cfa            %r11,16
2125         sub             \$0x128,%rsp
2126 ___
2127 $code.=<<___    if ($win64);
             # Win64 build: same frame and save slots as the AVX2 code.
             # NOTE(review): %xmm6-%xmm8 are saved with vmovdqa, the rest
             # with vmovdqa32; the reason for the mix is not visible here.
2128         lea             -0xf8(%rsp),%r11
2129         sub             \$0x1c8,%rsp
2130         vmovdqa         %xmm6,0x50(%r11)
2131         vmovdqa         %xmm7,0x60(%r11)
2132         vmovdqa         %xmm8,0x70(%r11)
2133         vmovdqa32       %xmm9,0x80(%r11)
2134         vmovdqa32       %xmm10,0x90(%r11)
2135         vmovdqa32       %xmm11,0xa0(%r11)
2136         vmovdqa32       %xmm12,0xb0(%r11)
2137         vmovdqa32       %xmm13,0xc0(%r11)
2138         vmovdqa32       %xmm14,0xd0(%r11)
2139         vmovdqa32       %xmm15,0xe0(%r11)
2140 .Ldo_avx512_body:
2141 ___
2142 $code.=<<___;
2143         lea             .Lconst(%rip),%rcx
2144         lea             48+64($ctx),$ctx        # size optimization
2145         vmovdqa         96(%rcx),$T2            # .Lpermd_avx2
2146
2147         # expand pre-calculated table
             # The nine entries (r0, r1, s1, r2, s2, r3, s3, r4, s4) are
             # kept in registers for the multiplication below and are also
             # written to the same nine stack slots the AVX2 code uses.
2148         vmovdqu32       `16*0-64`($ctx),%x#$R0
2149         and             \$-512,%rsp
2150         vmovdqu32       `16*1-64`($ctx),%x#$R1
2151         vmovdqu32       `16*2-64`($ctx),%x#$S1
2152         vmovdqu32       `16*3-64`($ctx),%x#$R2
2153         vmovdqu32       `16*4-64`($ctx),%x#$S2
2154         vmovdqu32       `16*5-64`($ctx),%x#$R3
2155         vmovdqu32       `16*6-64`($ctx),%x#$S3
2156         vmovdqu32       `16*7-64`($ctx),%x#$R4
2157         vmovdqu32       `16*8-64`($ctx),%x#$S4
2158         vpermd          $R0,$T2,$R0             # 00003412 -> 14243444
2159         vmovdqa64       64(%rcx),$MASK          # .Lmask26
2160         vpermd          $R1,$T2,$R1
2161         vpermd          $S1,$T2,$S1
2162         vpermd          $R2,$T2,$R2
2163         vmovdqa32       $R0,0x00(%rsp)          # save in case $len%128 != 0
2164          vpsrlq         \$32,$R0,$T0            # 14243444 -> 01020304
2165         vpermd          $S2,$T2,$S2
2166         vmovdqa32       $R1,0x20(%rsp)
2167          vpsrlq         \$32,$R1,$T1
2168         vpermd          $R3,$T2,$R3
2169         vmovdqa32       $S1,0x40(%rsp)
2170         vpermd          $S3,$T2,$S3
2171         vpermd          $R4,$T2,$R4
2172         vmovdqa32       $R2,0x60(%rsp)
2173         vpermd          $S4,$T2,$S4
2174         vmovdqa32       $S2,0x80(%rsp)
2175         vmovdqa32       $R3,0xa0(%rsp)
2176         vmovdqa32       $S3,0xc0(%rsp)
2177         vmovdqa32       $R4,0xe0(%rsp)
2178         vmovdqa32       $S4,0x100(%rsp)
2179
2180         ################################################################
2181         # calculate 5th through 8th powers of the key
2182         #
2183         # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2184         # d1 = r0'*r1 + r1'*r0   + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2185         # d2 = r0'*r2 + r1'*r1   + r2'*r0   + r3'*5*r4 + r4'*5*r3
2186         # d3 = r0'*r3 + r1'*r2   + r2'*r1   + r3'*r0   + r4'*5*r4
2187         # d4 = r0'*r4 + r1'*r3   + r2'*r2   + r3'*r1   + r4'*r0
             #
             # The primed operands are the table entries shifted right by
             # 32 bits (the 01020304 form); the unprimed ones hold r^4 in
             # every multiplied position.
2188
2189         vpmuludq        $T0,$R0,$D0             # d0 = r0'*r0
2190         vpmuludq        $T0,$R1,$D1             # d1 = r0'*r1
2191         vpmuludq        $T0,$R2,$D2             # d2 = r0'*r2
2192         vpmuludq        $T0,$R3,$D3             # d3 = r0'*r3
2193         vpmuludq        $T0,$R4,$D4             # d4 = r0'*r4
2194          vpsrlq         \$32,$R2,$T2
2195
2196         vpmuludq        $T1,$S4,$M0
2197         vpmuludq        $T1,$R0,$M1
2198         vpmuludq        $T1,$R1,$M2
2199         vpmuludq        $T1,$R2,$M3
2200         vpmuludq        $T1,$R3,$M4
2201          vpsrlq         \$32,$R3,$T3
2202         vpaddq          $M0,$D0,$D0             # d0 += r1'*5*r4
2203         vpaddq          $M1,$D1,$D1             # d1 += r1'*r0
2204         vpaddq          $M2,$D2,$D2             # d2 += r1'*r1
2205         vpaddq          $M3,$D3,$D3             # d3 += r1'*r2
2206         vpaddq          $M4,$D4,$D4             # d4 += r1'*r3
2207
2208         vpmuludq        $T2,$S3,$M0
2209         vpmuludq        $T2,$S4,$M1
2210         vpmuludq        $T2,$R1,$M3
2211         vpmuludq        $T2,$R2,$M4
2212         vpmuludq        $T2,$R0,$M2
2213          vpsrlq         \$32,$R4,$T4
2214         vpaddq          $M0,$D0,$D0             # d0 += r2'*5*r3
2215         vpaddq          $M1,$D1,$D1             # d1 += r2'*5*r4
2216         vpaddq          $M3,$D3,$D3             # d3 += r2'*r1
2217         vpaddq          $M4,$D4,$D4             # d4 += r2'*r2
2218         vpaddq          $M2,$D2,$D2             # d2 += r2'*r0
2219
2220         vpmuludq        $T3,$S2,$M0
2221         vpmuludq        $T3,$R0,$M3
2222         vpmuludq        $T3,$R1,$M4
2223         vpmuludq        $T3,$S3,$M1
2224         vpmuludq        $T3,$S4,$M2
2225         vpaddq          $M0,$D0,$D0             # d0 += r3'*5*r2
2226         vpaddq          $M3,$D3,$D3             # d3 += r3'*r0
2227         vpaddq          $M4,$D4,$D4             # d4 += r3'*r1
2228         vpaddq          $M1,$D1,$D1             # d1 += r3'*5*r3
2229         vpaddq          $M2,$D2,$D2             # d2 += r3'*5*r4
2230
2231         vpmuludq        $T4,$S4,$M3
2232         vpmuludq        $T4,$R0,$M4
2233         vpmuludq        $T4,$S1,$M0
2234         vpmuludq        $T4,$S2,$M1
2235         vpmuludq        $T4,$S3,$M2
2236         vpaddq          $M3,$D3,$D3             # d3 += r4'*5*r4
2237         vpaddq          $M4,$D4,$D4             # d4 += r4'*r0
2238         vpaddq          $M0,$D0,$D0             # d0 += r4'*5*r1
2239         vpaddq          $M1,$D1,$D1             # d1 += r4'*5*r2
2240         vpaddq          $M2,$D2,$D2             # d2 += r4'*5*r3
2241
2242         ################################################################
2243         # load input
             # the first 128 bytes are fetched here, ahead of the
             # reduction; they are transposed and split further down
2244         vmovdqu64       16*0($inp),%z#$T3
2245         vmovdqu64       16*4($inp),%z#$T4
2246         lea             16*8($inp),$inp
2247
2248         ################################################################
2249         # lazy reduction
2250
2251         vpsrlq          \$26,$D3,$M3
2252         vpandq          $MASK,$D3,$D3
2253         vpaddq          $M3,$D4,$D4             # d3 -> d4
2254
2255         vpsrlq          \$26,$D0,$M0
2256         vpandq          $MASK,$D0,$D0
2257         vpaddq          $M0,$D1,$D1             # d0 -> d1
2258
2259         vpsrlq          \$26,$D4,$M4
2260         vpandq          $MASK,$D4,$D4
2261
2262         vpsrlq          \$26,$D1,$M1
2263         vpandq          $MASK,$D1,$D1
2264         vpaddq          $M1,$D2,$D2             # d1 -> d2
2265
             # the carry out of d4 re-enters d0 multiplied by 5 (c + 4*c)
2266         vpaddq          $M4,$D0,$D0
2267         vpsllq          \$2,$M4,$M4
2268         vpaddq          $M4,$D0,$D0             # d4 -> d0
2269
2270         vpsrlq          \$26,$D2,$M2
2271         vpandq          $MASK,$D2,$D2
2272         vpaddq          $M2,$D3,$D3             # d2 -> d3
2273
2274         vpsrlq          \$26,$D0,$M0
2275         vpandq          $MASK,$D0,$D0
2276         vpaddq          $M0,$D1,$D1             # d0 -> d1
2277
2278         vpsrlq          \$26,$D3,$M3
2279         vpandq          $MASK,$D3,$D3
2280         vpaddq          $M3,$D4,$D4             # d3 -> d4
2281
2282 ___
2283 map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));            # switch to %zmm domain
2284 map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3));
2285 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2286 map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
2287 map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2288 map(s/%y/%z/,($MASK));
2289 $code.=<<___;
2290         ################################################################
2291         # at this point we have 14243444 in $R0-$S4 and 05060708 in
2292         # $D0-$D4, ...
2293
2294         vpunpcklqdq     $T4,$T3,$T0     # transpose input
2295         vpunpckhqdq     $T4,$T3,$T4
2296
2297         # ... since input 64-bit lanes are ordered as 73625140, we could
2298         # "vperm" it to 76543210 (here and in each loop iteration), *or*
2299         # we could just flow along, hence the goal for $R0-$S4 is
2300         # 1858286838784888 ...
2301
2302         vmovdqa32       128(%rcx),$M0           # .Lpermd_avx512
             # 0x7777 selects dwords 0-2 of every group of four, so the
             # masked vpermd below overwrites those and leaves dwords
             # 3, 7, 11 and 15 as produced by the unmasked vpermd
2303         mov             \$0x7777,%eax
2304         kmovw           %eax,%k1
2305
2306         vpermd          $R0,$M0,$R0             # 14243444 -> 1---2---3---4---
2307         vpermd          $R1,$M0,$R1
2308         vpermd          $R2,$M0,$R2
2309         vpermd          $R3,$M0,$R3
2310         vpermd          $R4,$M0,$R4
2311
2312         vpermd          $D0,$M0,${R0}{%k1}      # 05060708 -> 1858286838784888
2313         vpermd          $D1,$M0,${R1}{%k1}
2314         vpermd          $D2,$M0,${R2}{%k1}
2315         vpermd          $D3,$M0,${R3}{%k1}
2316         vpermd          $D4,$M0,${R4}{%k1}
2317
             # rebuild the s-values for the merged table: s = (r << 2) + r
2318         vpslld          \$2,$R1,$S1             # *5
2319         vpslld          \$2,$R2,$S2
2320         vpslld          \$2,$R3,$S3
2321         vpslld          \$2,$R4,$S4
2322         vpaddd          $R1,$S1,$S1
2323         vpaddd          $R2,$S2,$S2
2324         vpaddd          $R3,$S3,$S3
2325         vpaddd          $R4,$S4,$S4
2326
             # widen the 26-bit mask and the pad bit constant to all
             # eight quadword lanes
2327         vpbroadcastq    %x#$MASK,$MASK
2328         vpbroadcastq    32(%rcx),$PADBIT        # .L129
2329
2330         vpsrlq          \$52,$T0,$T2            # splat input
2331         vpsllq          \$12,$T4,$T3
2332         vporq           $T3,$T2,$T2
2333         vpsrlq          \$26,$T0,$T1
2334         vpsrlq          \$14,$T4,$T3
2335         vpsrlq          \$40,$T4,$T4            # 4
2336         vpandq          $MASK,$T2,$T2           # 2
2337         vpandq          $MASK,$T0,$T0           # 0
             # the three steps below are commented out here; they are
             # carried out at the top of .Loop_avx512 instead
2338         #vpandq         $MASK,$T1,$T1           # 1
2339         #vpandq         $MASK,$T3,$T3           # 3
2340         #vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2341
2342         vpaddq          $H2,$T2,$H2             # accumulate input
2343         sub             \$192,$len
2344         jbe             .Ltail_avx512
2345         #jmp            .Loop_avx512
2346
2347 .align  32
2348 .Loop_avx512:
2349         ################################################################
2350         # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2351         # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2352         # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2353         # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2354         # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2355         # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2356         # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2357         # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2358         #   \________/\___________/
2359         ################################################################
2360         #vpaddq         $H2,$T2,$H2             # accumulate input
2361
2362         # d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
2363         # d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
2364         # d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
2365         # d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
2366         # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2367         #
2368         # however, as h2 is "chronologically" first one available pull
2369         # corresponding operations up, so it's
2370         #
2371         # d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0 + h4*5*r4
2372         # d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1 + h4*r0
2373         # d0 = h2*5*r3 + h0*r0 + h1*5*r4         + h3*5*r2 + h4*5*r1
2374         # d1 = h2*5*r4 + h0*r1           + h1*r0 + h3*5*r3 + h4*5*r2
2375         # d2 = h2*r0           + h0*r2   + h1*r1 + h3*5*r4 + h4*5*r3
2376
2377         vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
2378          vpaddq         $H0,$T0,$H0
2379         vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
2380          vpandq         $MASK,$T1,$T1           # 1
2381         vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
2382          vpandq         $MASK,$T3,$T3           # 3
2383         vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
2384          vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2385         vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
2386          vpaddq         $H1,$T1,$H1             # accumulate input
2387          vpaddq         $H3,$T3,$H3
2388          vpaddq         $H4,$T4,$H4
2389
2390           vmovdqu64     16*0($inp),$T3          # load input
2391           vmovdqu64     16*4($inp),$T4
2392           lea           16*8($inp),$inp
2393         vpmuludq        $H0,$R3,$M3
2394         vpmuludq        $H0,$R4,$M4
2395         vpmuludq        $H0,$R0,$M0
2396         vpmuludq        $H0,$R1,$M1
2397         vpaddq          $M3,$D3,$D3             # d3 += h0*r3
2398         vpaddq          $M4,$D4,$D4             # d4 += h0*r4
2399         vpaddq          $M0,$D0,$D0             # d0 += h0*r0
2400         vpaddq          $M1,$D1,$D1             # d1 += h0*r1
2401
2402         vpmuludq        $H1,$R2,$M3
2403         vpmuludq        $H1,$R3,$M4
2404         vpmuludq        $H1,$S4,$M0
2405         vpmuludq        $H0,$R2,$M2
2406         vpaddq          $M3,$D3,$D3             # d3 += h1*r2
2407         vpaddq          $M4,$D4,$D4             # d4 += h1*r3
2408         vpaddq          $M0,$D0,$D0             # d0 += h1*s4
2409         vpaddq          $M2,$D2,$D2             # d2 += h0*r2
2410
2411           vpunpcklqdq   $T4,$T3,$T0             # transpose input
2412           vpunpckhqdq   $T4,$T3,$T4
2413
2414         vpmuludq        $H3,$R0,$M3
2415         vpmuludq        $H3,$R1,$M4
2416         vpmuludq        $H1,$R0,$M1
2417         vpmuludq        $H1,$R1,$M2
2418         vpaddq          $M3,$D3,$D3             # d3 += h3*r0
2419         vpaddq          $M4,$D4,$D4             # d4 += h3*r1
2420         vpaddq          $M1,$D1,$D1             # d1 += h1*r0
2421         vpaddq          $M2,$D2,$D2             # d2 += h1*r1
2422
2423         vpmuludq        $H4,$S4,$M3
2424         vpmuludq        $H4,$R0,$M4
2425         vpmuludq        $H3,$S2,$M0
2426         vpmuludq        $H3,$S3,$M1
2427         vpaddq          $M3,$D3,$D3             # d3 += h4*s4
2428         vpmuludq        $H3,$S4,$M2
2429         vpaddq          $M4,$D4,$D4             # d4 += h4*r0
2430         vpaddq          $M0,$D0,$D0             # d0 += h3*s2
2431         vpaddq          $M1,$D1,$D1             # d1 += h3*s3
2432         vpaddq          $M2,$D2,$D2             # d2 += h3*s4
2433
2434         vpmuludq        $H4,$S1,$M0
2435         vpmuludq        $H4,$S2,$M1
2436         vpmuludq        $H4,$S3,$M2
2437         vpaddq          $M0,$D0,$H0             # h0 = d0 + h4*s1
2438         vpaddq          $M1,$D1,$H1             # h1 = d1 + h4*s2
2439         vpaddq          $M2,$D2,$H2             # h2 = d2 + h4*s3
2440
2441         ################################################################
2442         # lazy reduction (interleaved with input splat)
2443
2444          vpsrlq         \$52,$T0,$T2            # splat input
2445          vpsllq         \$12,$T4,$T3
2446
2447         vpsrlq          \$26,$D3,$H3
2448         vpandq          $MASK,$D3,$D3
2449         vpaddq          $H3,$D4,$H4             # h3 -> h4
2450
2451          vporq          $T3,$T2,$T2
2452
2453         vpsrlq          \$26,$H0,$D0
2454         vpandq          $MASK,$H0,$H0
2455         vpaddq          $D0,$H1,$H1             # h0 -> h1
2456
2457          vpandq         $MASK,$T2,$T2           # 2
2458
2459         vpsrlq          \$26,$H4,$D4
2460         vpandq          $MASK,$H4,$H4
2461
2462         vpsrlq          \$26,$H1,$D1
2463         vpandq          $MASK,$H1,$H1
2464         vpaddq          $D1,$H2,$H2             # h1 -> h2
2465
2466         vpaddq          $D4,$H0,$H0
2467         vpsllq          \$2,$D4,$D4
2468         vpaddq          $D4,$H0,$H0             # h4 -> h0
2469
2470          vpaddq         $T2,$H2,$H2             # modulo-scheduled
2471          vpsrlq         \$26,$T0,$T1
2472
2473         vpsrlq          \$26,$H2,$D2
2474         vpandq          $MASK,$H2,$H2
2475         vpaddq          $D2,$D3,$H3             # h2 -> h3
2476
2477          vpsrlq         \$14,$T4,$T3
2478
2479         vpsrlq          \$26,$H0,$D0
2480         vpandq          $MASK,$H0,$H0
2481         vpaddq          $D0,$H1,$H1             # h0 -> h1
2482
2483          vpsrlq         \$40,$T4,$T4            # 4
2484
2485         vpsrlq          \$26,$H3,$D3
2486         vpandq          $MASK,$H3,$H3
2487         vpaddq          $D3,$H4,$H4             # h3 -> h4
2488
2489          vpandq         $MASK,$T0,$T0           # 0
2490          #vpandq        $MASK,$T1,$T1           # 1
2491          #vpandq        $MASK,$T3,$T3           # 3
2492          #vporq         $PADBIT,$T4,$T4         # padbit, yes, always
2493
2494         sub             \$128,$len
2495         ja              .Loop_avx512
2496
2497 .Ltail_avx512:
2498         ################################################################
2499         # while above multiplications were by r^8 in all lanes, in last
2500         # iteration we multiply least significant lane by r^8 and most
2501         # significant one by r, that's why table gets shifted...
2502
2503         vpsrlq          \$32,$R0,$R0            # 0105020603070408
2504         vpsrlq          \$32,$R1,$R1
2505         vpsrlq          \$32,$R2,$R2
2506         vpsrlq          \$32,$S3,$S3
2507         vpsrlq          \$32,$S4,$S4
2508         vpsrlq          \$32,$R3,$R3
2509         vpsrlq          \$32,$R4,$R4
2510         vpsrlq          \$32,$S1,$S1
2511         vpsrlq          \$32,$S2,$S2
2512
2513         ################################################################
2514         # load either next or last 64 bytes of input
2515         lea             ($inp,$len),$inp
2516
2517         #vpaddq         $H2,$T2,$H2             # accumulate input
2518         vpaddq          $H0,$T0,$H0
2519
2520         vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
2521         vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
2522         vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
2523          vpandq         $MASK,$T1,$T1           # 1
2524         vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
2525          vpandq         $MASK,$T3,$T3           # 3
2526         vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
2527          vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2528          vpaddq         $H1,$T1,$H1             # accumulate input
2529          vpaddq         $H3,$T3,$H3
2530          vpaddq         $H4,$T4,$H4
2531
2532           vmovdqu64     16*0($inp),%x#$T0
2533         vpmuludq        $H0,$R3,$M3
2534         vpmuludq        $H0,$R4,$M4
2535         vpmuludq        $H0,$R0,$M0
2536         vpmuludq        $H0,$R1,$M1
2537         vpaddq          $M3,$D3,$D3             # d3 += h0*r3
2538         vpaddq          $M4,$D4,$D4             # d4 += h0*r4
2539         vpaddq          $M0,$D0,$D0             # d0 += h0*r0
2540         vpaddq          $M1,$D1,$D1             # d1 += h0*r1
2541
2542           vmovdqu64     16*1($inp),%x#$T1
2543         vpmuludq        $H1,$R2,$M3
2544         vpmuludq        $H1,$R3,$M4
2545         vpmuludq        $H1,$S4,$M0
2546         vpmuludq        $H0,$R2,$M2
2547         vpaddq          $M3,$D3,$D3             # d3 += h1*r2
2548         vpaddq          $M4,$D4,$D4             # d4 += h1*r3
2549         vpaddq          $M0,$D0,$D0             # d0 += h1*s4
2550         vpaddq          $M2,$D2,$D2             # d2 += h0*r2
2551
2552           vinserti64x2  \$1,16*2($inp),$T0,$T0
2553         vpmuludq        $H3,$R0,$M3
2554         vpmuludq        $H3,$R1,$M4
2555         vpmuludq        $H1,$R0,$M1
2556         vpmuludq        $H1,$R1,$M2
2557         vpaddq          $M3,$D3,$D3             # d3 += h3*r0
2558         vpaddq          $M4,$D4,$D4             # d4 += h3*r1
2559         vpaddq          $M1,$D1,$D1             # d1 += h1*r0
2560         vpaddq          $M2,$D2,$D2             # d2 += h1*r1
2561
2562           vinserti64x2  \$1,16*3($inp),$T1,$T1
2563         vpmuludq        $H4,$S4,$M3
2564         vpmuludq        $H4,$R0,$M4
2565         vpmuludq        $H3,$S2,$M0
2566         vpmuludq        $H3,$S3,$M1
2567         vpmuludq        $H3,$S4,$M2
2568         vpaddq          $M3,$D3,$H3             # h3 = d3 + h4*s4
2569         vpaddq          $M4,$D4,$D4             # d4 += h4*r0
2570         vpaddq          $M0,$D0,$D0             # d0 += h3*s2
2571         vpaddq          $M1,$D1,$D1             # d1 += h3*s3
2572         vpaddq          $M2,$D2,$D2             # d2 += h3*s4
2573
2574         vpmuludq        $H4,$S1,$M0
2575         vpmuludq        $H4,$S2,$M1
2576         vpmuludq        $H4,$S3,$M2
2577         vpaddq          $M0,$D0,$H0             # h0 = d0 + h4*s1
2578         vpaddq          $M1,$D1,$H1             # h1 = d1 + h4*s2
2579         vpaddq          $M2,$D2,$H2             # h2 = d2 + h4*s3
2580
2581         ################################################################
2582         # horizontal addition
2583
2584         mov             \$1,%eax
2585         vpsrldq         \$8,$H3,$D3
2586         vpsrldq         \$8,$D4,$H4
2587         vpsrldq         \$8,$H0,$D0
2588         vpsrldq         \$8,$H1,$D1
2589         vpsrldq         \$8,$H2,$D2
2590         vpaddq          $D3,$H3,$H3
2591         vpaddq          $D4,$H4,$H4
2592         vpaddq          $D0,$H0,$H0
2593         vpaddq          $D1,$H1,$H1
2594         vpaddq          $D2,$H2,$H2
2595
2596         kmovw           %eax,%k3
2597         vpermq          \$0x2,$H3,$D3
2598         vpermq          \$0x2,$H4,$D4
2599         vpermq          \$0x2,$H0,$D0
2600         vpermq          \$0x2,$H1,$D1
2601         vpermq          \$0x2,$H2,$D2
2602         vpaddq          $D3,$H3,$H3
2603         vpaddq          $D4,$H4,$H4
2604         vpaddq          $D0,$H0,$H0
2605         vpaddq          $D1,$H1,$H1
2606         vpaddq          $D2,$H2,$H2
2607
2608         vextracti64x4   \$0x1,$H3,%y#$D3
2609         vextracti64x4   \$0x1,$H4,%y#$D4
2610         vextracti64x4   \$0x1,$H0,%y#$D0
2611         vextracti64x4   \$0x1,$H1,%y#$D1
2612         vextracti64x4   \$0x1,$H2,%y#$D2
2613         vpaddq          $D3,$H3,${H3}{%k3}{z}   # keep single qword in case
2614         vpaddq          $D4,$H4,${H4}{%k3}{z}   # it's passed to .Ltail_avx2
2615         vpaddq          $D0,$H0,${H0}{%k3}{z}
2616         vpaddq          $D1,$H1,${H1}{%k3}{z}
2617         vpaddq          $D2,$H2,${H2}{%k3}{z}
2618 ___
2619 map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2620 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2621 $code.=<<___;
2622         ################################################################
2623         # lazy reduction (interleaved with input splat)
2624
2625         vpsrlq          \$26,$H3,$D3
2626         vpandq          $MASK,$H3,$H3
2627          vpsrldq        \$6,$T0,$T2             # splat input
2628          vpsrldq        \$6,$T1,$T3
2629          vpunpckhqdq    $T1,$T0,$T4             # 4
2630         vpaddq          $D3,$H4,$H4             # h3 -> h4
2631
2632         vpsrlq          \$26,$H0,$D0
2633         vpandq          $MASK,$H0,$H0
2634          vpunpcklqdq    $T3,$T2,$T2             # 2:3
2635          vpunpcklqdq    $T1,$T0,$T0             # 0:1
2636         vpaddq          $D0,$H1,$H1             # h0 -> h1
2637
2638         vpsrlq          \$26,$H4,$D4
2639         vpandq          $MASK,$H4,$H4
2640
2641         vpsrlq          \$26,$H1,$D1
2642         vpandq          $MASK,$H1,$H1
2643          vpsrlq         \$30,$T2,$T3
2644          vpsrlq         \$4,$T2,$T2
2645         vpaddq          $D1,$H2,$H2             # h1 -> h2
2646
2647         vpaddq          $D4,$H0,$H0
2648         vpsllq          \$2,$D4,$D4
2649          vpsrlq         \$26,$T0,$T1
2650          vpsrlq         \$40,$T4,$T4            # 4
2651         vpaddq          $D4,$H0,$H0             # h4 -> h0
2652
2653         vpsrlq          \$26,$H2,$D2
2654         vpandq          $MASK,$H2,$H2
2655          vpandq         $MASK,$T2,$T2           # 2
2656          vpandq         $MASK,$T0,$T0           # 0
2657         vpaddq          $D2,$H3,$H3             # h2 -> h3
2658
2659         vpsrlq          \$26,$H0,$D0
2660         vpandq          $MASK,$H0,$H0
2661          vpaddq         $H2,$T2,$H2             # accumulate input for .Ltail_avx2
2662          vpandq         $MASK,$T1,$T1           # 1
2663         vpaddq          $D0,$H1,$H1             # h0 -> h1
2664
2665         vpsrlq          \$26,$H3,$D3
2666         vpandq          $MASK,$H3,$H3
2667          vpandq         $MASK,$T3,$T3           # 3
2668          vporq          $PADBIT,$T4,$T4         # padbit, yes, always
2669         vpaddq          $D3,$H4,$H4             # h3 -> h4
2670
2671         lea             0x90(%rsp),%rax         # size optimization for .Ltail_avx2
2672         add             \$64,$len
2673         jnz             .Ltail_avx2
2674
2675         vpsubq          $T2,$H2,$H2             # undo input accumulation
2676         vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2677         vmovd           %x#$H1,`4*1-48-64`($ctx)
2678         vmovd           %x#$H2,`4*2-48-64`($ctx)
2679         vmovd           %x#$H3,`4*3-48-64`($ctx)
2680         vmovd           %x#$H4,`4*4-48-64`($ctx)
2681         vzeroall
2682 ___
2683 $code.=<<___    if ($win64);
2684         movdqa          0x50(%r11),%xmm6
2685         movdqa          0x60(%r11),%xmm7
2686         movdqa          0x70(%r11),%xmm8
2687         movdqa          0x80(%r11),%xmm9
2688         movdqa          0x90(%r11),%xmm10
2689         movdqa          0xa0(%r11),%xmm11
2690         movdqa          0xb0(%r11),%xmm12
2691         movdqa          0xc0(%r11),%xmm13
2692         movdqa          0xd0(%r11),%xmm14
2693         movdqa          0xe0(%r11),%xmm15
2694         lea             0xf8(%r11),%rsp
2695 .Ldo_avx512_epilogue:
2696 ___
2697 $code.=<<___    if (!$win64);
2698         lea             8(%r11),%rsp
2699 .cfi_def_cfa            %rsp,8
2700 ___
2701 $code.=<<___;
2702         ret
2703 .cfi_endproc
2704 .size   poly1305_blocks_avx512,.-poly1305_blocks_avx512
2705 ___
2706 if ($avx>3) {
2707 ########################################################################
2708 # VPMADD52 version using 2^44 radix.
2709 #
2710 # One can argue that base 2^52 would be more natural. Well, even though
2711 # some operations would be more natural, one has to recognize a couple of
2712 # things. First, base 2^52 doesn't provide an advantage over base 2^44 if
2713 # you look at the number of multiply-n-accumulate operations. Secondly, it
2714 # makes it impossible to pre-compute multiples of 5 [referred to as s[]/sN
2715 # in reference implementations], which means that more such operations
2716 # would have to be performed in the inner loop, which in turn makes the
2717 # critical path longer. In other words, even though base 2^44 reduction
2718 # might look less elegant, the overall critical path is actually shorter...
2719
2720 ########################################################################
2721 # Layout of the opaque area is as follows.
2722 #
2723 #       unsigned __int64 h[3];          # current hash value base 2^44
2724 #       unsigned __int64 s[2];          # key value*20 base 2^44
2725 #       unsigned __int64 r[3];          # key value base 2^44
2726 #       struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2727 #                                       # r^n positions reflect
2728 #                                       # placement in register, not
2729 #                                       # memory, R[3] is R[1]*20
2730
2731 $code.=<<___;	# init: reset h[], derive base 2^44 r[]/s[] from the key, publish entry points
2732 .type   poly1305_init_base2_44,\@function,3
2733 .align  32
2734 poly1305_init_base2_44:
2735         xor     %rax,%rax
2736         mov     %rax,0($ctx)            # initialize hash value h[0..2] to zero
2737         mov     %rax,8($ctx)
2738         mov     %rax,16($ctx)
2739
2740 .Linit_base2_44:                        # label sits after the hash reset above
2741         lea     poly1305_blocks_vpmadd52(%rip),%r10     # entry points, stored via rdx below
2742         lea     poly1305_emit_base2_44(%rip),%r11
2743
2744         mov     \$0x0ffffffc0fffffff,%rax               # bit mask for key bytes 0..7
2745         mov     \$0x0ffffffc0ffffffc,%rcx               # bit mask for key bytes 8..15
2746         and     0($inp),%rax            # masked low half of the key
2747         mov     \$0x00000fffffffffff,%r8                # 2^44-1
2748         and     8($inp),%rcx            # masked high half of the key
2749         mov     \$0x00000fffffffffff,%r9                # 2^44-1
2750         and     %rax,%r8                # key bits 0..43
2751         shrd    \$44,%rcx,%rax          # key bits 44..107
2752         mov     %r8,40($ctx)            # r0
2753         and     %r9,%rax                # key bits 44..87
2754         shr     \$24,%rcx               # key bits 88..127
2755         mov     %rax,48($ctx)           # r1
2756         lea     (%rax,%rax,4),%rax      # *5
2757         mov     %rcx,56($ctx)           # r2
2758         shl     \$2,%rax                # <<2, i.e. r1*20 in total
2759         lea     (%rcx,%rcx,4),%rcx      # *5
2760         shl     \$2,%rcx                # <<2, i.e. r2*20 in total
2761         mov     %rax,24($ctx)           # s1
2762         mov     %rcx,32($ctx)           # s2
2763         movq    \$-1,64($ctx)           # write impossible value: key powers R[] not calculated yet
2764 ___
2765 $code.=<<___    if ($flavour !~ /elf32/);	# 64-bit pointers: two qword stores
2766         mov     %r10,0(%rdx)
2767         mov     %r11,8(%rdx)
2768 ___
2769 $code.=<<___    if ($flavour =~ /elf32/);	# 32-bit pointers: two dword stores
2770         mov     %r10d,0(%rdx)
2771         mov     %r11d,4(%rdx)
2772 ___
2773 $code.=<<___;
2774         mov     \$1,%eax                # return 1
2775         ret
2776 .size   poly1305_init_base2_44,.-poly1305_init_base2_44
2777 ___
2778 {	# single-block VPMADD52 code path, hash and key kept in base 2^44
2779 my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2780 my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2781 my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..24));
2782
2783 $code.=<<___;
2784 .type   poly1305_blocks_vpmadd52,\@function,4
2785 .align  32
2786 poly1305_blocks_vpmadd52:
2787         shr     \$4,$len                        # len /= 16, count of whole blocks
2788         jz      .Lno_data_vpmadd52              # too short
2789
2790         shl     \$40,$padbit                    # 2^128 is bit 40 of the 2^88 limb
2791         mov     64($ctx),%r8                    # peek on power of the key
2792
2793         # if powers of the key are not calculated yet, process up to 3
2794         # blocks with this single-block subroutine, otherwise ensure that
2795         # length is divisible by 2 blocks and pass the rest down to next
2796         # subroutine...
2797
2798         mov     \$3,%rax                        # mask for len mod 4
2799         mov     \$1,%r10                        # mask for len mod 2
2800         cmp     \$4,$len                        # is input long
2801         cmovae  %r10,%rax                       # long input: peel off odd block only
2802         test    %r8,%r8                         # is power value impossible?
2803         cmovns  %r10,%rax                       # powers are there: peel off odd block only
2804
2805         and     $len,%rax                       # is input of favourable length?
2806         jz      .Lblocks_vpmadd52_4x
2807
2808         sub             %rax,$len               # rax blocks are handled by the loop below
2809         mov             \$7,%r10d
2810         mov             \$1,%r11d
2811         kmovw           %r10d,%k7               # k7 selects three low qwords
2812         lea             .L2_44_inp_permd(%rip),%r10
2813         kmovw           %r11d,%k1               # k1 selects lowest qword
2814
2815         vmovq           $padbit,%x#$PAD
2816         vmovdqa64       0(%r10),$inp_permd      # .L2_44_inp_permd
2817         vmovdqa64       32(%r10),$inp_shift     # .L2_44_inp_shift
2818         vpermq          \$0xcf,$PAD,$PAD        # padbit goes to qword 2, other qwords are 0
2819         vmovdqa64       64(%r10),$reduc_mask    # .L2_44_mask
2820
2821         vmovdqu64       0($ctx),${Dlo}{%k7}{z}          # load hash value h0,h1,h2
2822         vmovdqu64       40($ctx),${r2r1r0}{%k7}{z}      # load keys: r0,r1,r2
2823         vmovdqu64       32($ctx),${r1r0s2}{%k7}{z}      #            s2,r0,r1
2824         vmovdqu64       24($ctx),${r0s2s1}{%k7}{z}      #            s1,s2,r0
2825
2826         vmovdqa64       96(%r10),$reduc_rght    # .L2_44_shift_rgt
2827         vmovdqa64       128(%r10),$reduc_left   # .L2_44_shift_lft
2828
2829         jmp             .Loop_vpmadd52
2830
2831 .align  32
2832 .Loop_vpmadd52:
2833         vmovdqu32       0($inp),%x#$T0          # load input as ----3210
2834         lea             16($inp),$inp
2835
2836         vpermd          $T0,$inp_permd,$T0      # ----3210 -> --322110
2837         vpsrlvq         $inp_shift,$T0,$T0      # per-lane shift, counts come from table
2838         vpandq          $reduc_mask,$T0,$T0     # trim lanes to limb width
2839         vporq           $PAD,$T0,$T0            # set padbit in top limb
2840
2841         vpaddq          $T0,$Dlo,$Dlo           # accumulate input
2842
2843         vpermq          \$0,$Dlo,${H0}{%k7}{z}  # smash hash value
2844         vpermq          \$0b01010101,$Dlo,${H1}{%k7}{z}
2845         vpermq          \$0b10101010,$Dlo,${H2}{%k7}{z}
2846
2847         vpxord          $Dlo,$Dlo,$Dlo
2848         vpxord          $Dhi,$Dhi,$Dhi
2849
2850         vpmadd52luq     $r2r1r0,$H0,$Dlo        # d0,d1,d2 += h0*r0,h0*r1,h0*r2 (low 52 bits)
2851         vpmadd52huq     $r2r1r0,$H0,$Dhi        # same products, high 52 bits
2852
2853         vpmadd52luq     $r1r0s2,$H1,$Dlo        # d0,d1,d2 += h1*s2,h1*r0,h1*r1 (low 52 bits)
2854         vpmadd52huq     $r1r0s2,$H1,$Dhi        # same products, high 52 bits
2855
2856         vpmadd52luq     $r0s2s1,$H2,$Dlo        # d0,d1,d2 += h2*s1,h2*s2,h2*r0 (low 52 bits)
2857         vpmadd52huq     $r0s2s1,$H2,$Dhi        # same products, high 52 bits
2858
2859         vpsrlvq         $reduc_rght,$Dlo,$T0    # 0 in topmost qword
2860         vpsllvq         $reduc_left,$Dhi,$Dhi   # 0 in topmost qword
2861         vpandq          $reduc_mask,$Dlo,$Dlo
2862
2863         vpaddq          $T0,$Dhi,$Dhi           # per-lane carry into next limb
2864
2865         vpermq          \$0b10010011,$Dhi,$Dhi  # 0 in lowest qword
2866
2867         vpaddq          $Dhi,$Dlo,$Dlo          # note topmost qword :-)
2868
2869         vpsrlvq         $reduc_rght,$Dlo,$T0    # 0 in topmost qword
2870         vpandq          $reduc_mask,$Dlo,$Dlo
2871
2872         vpermq          \$0b10010011,$T0,$T0    # rotate qwords up by one position
2873
2874         vpaddq          $T0,$Dlo,$Dlo
2875
2876         vpermq          \$0b10010011,$Dlo,${T0}{%k1}{z} # topmost qword -> lowest, rest is 0
2877
2878         vpaddq          $T0,$Dlo,$Dlo           # h0 += carry
2879         vpsllq          \$2,$T0,$T0
2880
2881         vpaddq          $T0,$Dlo,$Dlo           # h0 += carry*4, carry*5 in total
2882
2883         dec             %rax                    # len-=16
2884         jnz             .Loop_vpmadd52
2885
2886         vmovdqu64       $Dlo,0($ctx){%k7}       # store hash value
2887
2888         test            $len,$len               # anything left for 4x code?
2889         jnz             .Lblocks_vpmadd52_4x
2890
2891 .Lno_data_vpmadd52:
2892         ret
2893 .size   poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
2894 ___
2895 }
2896 {
2897 ########################################################################
2898 # As implied by its name, the 4x subroutine processes 4 blocks in parallel
2899 # (but also handles lengths of 4*n+2 blocks). It uses key powers up to the
2900 # 4th and operates on 256-bit %ymm registers.
2901
2902 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
2903 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
2904 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
2905
2906 $code.=<<___;
2907 .type   poly1305_blocks_vpmadd52_4x,\@function,4
2908 .align  32
2909 poly1305_blocks_vpmadd52_4x:
2910         shr     \$4,$len
2911         jz      .Lno_data_vpmadd52_4x           # too short
2912
2913         shl     \$40,$padbit
2914         mov     64($ctx),%r8                    # peek on power of the key
2915
2916 .Lblocks_vpmadd52_4x:
2917         vpbroadcastq    $padbit,$PAD
2918
2919         vmovdqa64       .Lx_mask44(%rip),$mask44
2920         mov             \$5,%eax
2921         vmovdqa64       .Lx_mask42(%rip),$mask42
2922         kmovw           %eax,%k1                # used in 2x path
2923
2924         test            %r8,%r8                 # is power value impossible?
2925         js              .Linit_vpmadd52         # if it is, then init R[4]
2926
2927         vmovq           0($ctx),%x#$H0          # load current hash value
2928         vmovq           8($ctx),%x#$H1
2929         vmovq           16($ctx),%x#$H2
2930
2931         test            \$3,$len                # is length 4*n+2?
2932         jnz             .Lblocks_vpmadd52_2x_do
2933
2934 .Lblocks_vpmadd52_4x_do:
2935         vpbroadcastq    64($ctx),$R0            # load 4th power of the key
2936         vpbroadcastq    96($ctx),$R1
2937         vpbroadcastq    128($ctx),$R2
2938         vpbroadcastq    160($ctx),$S1
2939
2940 .Lblocks_vpmadd52_4x_key_loaded:
2941         vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
2942         vpaddq          $R2,$S2,$S2
2943         vpsllq          \$2,$S2,$S2
2944
2945         test            \$7,$len                # is len 8*n?
2946         jz              .Lblocks_vpmadd52_8x
2947
2948         vmovdqu64       16*0($inp),$T2          # load data
2949         vmovdqu64       16*2($inp),$T3
2950         lea             16*4($inp),$inp
2951
2952         vpunpcklqdq     $T3,$T2,$T1             # transpose data
2953         vpunpckhqdq     $T3,$T2,$T3
2954
2955         # at this point 64-bit lanes are ordered as 3-1-2-0
2956
2957         vpsrlq          \$24,$T3,$T2            # splat the data
2958         vporq           $PAD,$T2,$T2
2959          vpaddq         $T2,$H2,$H2             # accumulate input
2960         vpandq          $mask44,$T1,$T0
2961         vpsrlq          \$44,$T1,$T1
2962         vpsllq          \$20,$T3,$T3
2963         vporq           $T3,$T1,$T1
2964         vpandq          $mask44,$T1,$T1
2965
2966         sub             \$4,$len
2967         jz              .Ltail_vpmadd52_4x
2968         jmp             .Loop_vpmadd52_4x
2969         ud2
2970
2971 .align  32
2972 .Linit_vpmadd52:
2973         vmovq           24($ctx),%x#$S1         # load key
2974         vmovq           56($ctx),%x#$H2
2975         vmovq           32($ctx),%x#$S2
2976         vmovq           40($ctx),%x#$R0
2977         vmovq           48($ctx),%x#$R1
2978
2979         vmovdqa         $R0,$H0
2980         vmovdqa         $R1,$H1
2981         vmovdqa         $H2,$R2
2982
2983         mov             \$2,%eax
2984
2985 .Lmul_init_vpmadd52:
2986         vpxorq          $D0lo,$D0lo,$D0lo
2987         vpmadd52luq     $H2,$S1,$D0lo
2988         vpxorq          $D0hi,$D0hi,$D0hi
2989         vpmadd52huq     $H2,$S1,$D0hi
2990         vpxorq          $D1lo,$D1lo,$D1lo
2991         vpmadd52luq     $H2,$S2,$D1lo
2992         vpxorq          $D1hi,$D1hi,$D1hi
2993         vpmadd52huq     $H2,$S2,$D1hi
2994         vpxorq          $D2lo,$D2lo,$D2lo
2995         vpmadd52luq     $H2,$R0,$D2lo
2996         vpxorq          $D2hi,$D2hi,$D2hi
2997         vpmadd52huq     $H2,$R0,$D2hi
2998
2999         vpmadd52luq     $H0,$R0,$D0lo
3000         vpmadd52huq     $H0,$R0,$D0hi
3001         vpmadd52luq     $H0,$R1,$D1lo
3002         vpmadd52huq     $H0,$R1,$D1hi
3003         vpmadd52luq     $H0,$R2,$D2lo
3004         vpmadd52huq     $H0,$R2,$D2hi
3005
3006         vpmadd52luq     $H1,$S2,$D0lo
3007         vpmadd52huq     $H1,$S2,$D0hi
3008         vpmadd52luq     $H1,$R0,$D1lo
3009         vpmadd52huq     $H1,$R0,$D1hi
3010         vpmadd52luq     $H1,$R1,$D2lo
3011         vpmadd52huq     $H1,$R1,$D2hi
3012
3013         ################################################################
3014         # partial reduction
3015         vpsrlq          \$44,$D0lo,$tmp
3016         vpsllq          \$8,$D0hi,$D0hi
3017         vpandq          $mask44,$D0lo,$H0
3018         vpaddq          $tmp,$D0hi,$D0hi
3019
3020         vpaddq          $D0hi,$D1lo,$D1lo
3021
3022         vpsrlq          \$44,$D1lo,$tmp
3023         vpsllq          \$8,$D1hi,$D1hi
3024         vpandq          $mask44,$D1lo,$H1
3025         vpaddq          $tmp,$D1hi,$D1hi
3026
3027         vpaddq          $D1hi,$D2lo,$D2lo
3028
3029         vpsrlq          \$42,$D2lo,$tmp
3030         vpsllq          \$10,$D2hi,$D2hi
3031         vpandq          $mask42,$D2lo,$H2
3032         vpaddq          $tmp,$D2hi,$D2hi
3033
3034         vpaddq          $D2hi,$H0,$H0
3035         vpsllq          \$2,$D2hi,$D2hi
3036
3037         vpaddq          $D2hi,$H0,$H0
3038
3039         vpsrlq          \$44,$H0,$tmp           # additional step
3040         vpandq          $mask44,$H0,$H0
3041
3042         vpaddq          $tmp,$H1,$H1
3043
3044         dec             %eax
3045         jz              .Ldone_init_vpmadd52
3046
3047         vpunpcklqdq     $R1,$H1,$R1             # 1,2
3048         vpbroadcastq    %x#$H1,%x#$H1           # 2,2
3049         vpunpcklqdq     $R2,$H2,$R2
3050         vpbroadcastq    %x#$H2,%x#$H2
3051         vpunpcklqdq     $R0,$H0,$R0
3052         vpbroadcastq    %x#$H0,%x#$H0
3053
3054         vpsllq          \$2,$R1,$S1             # S1 = R1*5*4
3055         vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
3056         vpaddq          $R1,$S1,$S1
3057         vpaddq          $R2,$S2,$S2
3058         vpsllq          \$2,$S1,$S1
3059         vpsllq          \$2,$S2,$S2
3060
3061         jmp             .Lmul_init_vpmadd52
3062         ud2
3063
3064 .align  32
3065 .Ldone_init_vpmadd52:
3066         vinserti128     \$1,%x#$R1,$H1,$R1      # 1,2,3,4
3067         vinserti128     \$1,%x#$R2,$H2,$R2
3068         vinserti128     \$1,%x#$R0,$H0,$R0
3069
3070         vpermq          \$0b11011000,$R1,$R1    # 1,3,2,4
3071         vpermq          \$0b11011000,$R2,$R2
3072         vpermq          \$0b11011000,$R0,$R0
3073
3074         vpsllq          \$2,$R1,$S1             # S1 = R1*5*4
3075         vpaddq          $R1,$S1,$S1
3076         vpsllq          \$2,$S1,$S1
3077
3078         vmovq           0($ctx),%x#$H0          # load current hash value
3079         vmovq           8($ctx),%x#$H1
3080         vmovq           16($ctx),%x#$H2
3081
3082         test            \$3,$len                # is length 4*n+2?
3083         jnz             .Ldone_init_vpmadd52_2x
3084
3085         vmovdqu64       $R0,64($ctx)            # save key powers
3086         vpbroadcastq    %x#$R0,$R0              # broadcast 4th power
3087         vmovdqu64       $R1,96($ctx)
3088         vpbroadcastq    %x#$R1,$R1
3089         vmovdqu64       $R2,128($ctx)
3090         vpbroadcastq    %x#$R2,$R2
3091         vmovdqu64       $S1,160($ctx)
3092         vpbroadcastq    %x#$S1,$S1
3093
3094         jmp             .Lblocks_vpmadd52_4x_key_loaded
3095         ud2
3096
3097 .align  32
3098 .Ldone_init_vpmadd52_2x:
3099         vmovdqu64       $R0,64($ctx)            # save key powers
3100         vpsrldq         \$8,$R0,$R0             # 0-1-0-2
3101         vmovdqu64       $R1,96($ctx)
3102         vpsrldq         \$8,$R1,$R1
3103         vmovdqu64       $R2,128($ctx)
3104         vpsrldq         \$8,$R2,$R2
3105         vmovdqu64       $S1,160($ctx)
3106         vpsrldq         \$8,$S1,$S1
3107         jmp             .Lblocks_vpmadd52_2x_key_loaded
3108         ud2
3109
3110 .align  32
3111 .Lblocks_vpmadd52_2x_do:
3112         vmovdqu64       128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3113         vmovdqu64       160+8($ctx),${S1}{%k1}{z}
3114         vmovdqu64       64+8($ctx),${R0}{%k1}{z}
3115         vmovdqu64       96+8($ctx),${R1}{%k1}{z}
3116
3117 .Lblocks_vpmadd52_2x_key_loaded:
3118         vmovdqu64       16*0($inp),$T2          # load data
3119         vpxorq          $T3,$T3,$T3
3120         lea             16*2($inp),$inp
3121
3122         vpunpcklqdq     $T3,$T2,$T1             # transpose data
3123         vpunpckhqdq     $T3,$T2,$T3
3124
3125         # at this point 64-bit lanes are ordered as x-1-x-0
3126
3127         vpsrlq          \$24,$T3,$T2            # splat the data
3128         vporq           $PAD,$T2,$T2
3129          vpaddq         $T2,$H2,$H2             # accumulate input
3130         vpandq          $mask44,$T1,$T0
3131         vpsrlq          \$44,$T1,$T1
3132         vpsllq          \$20,$T3,$T3
3133         vporq           $T3,$T1,$T1
3134         vpandq          $mask44,$T1,$T1
3135
3136         jmp             .Ltail_vpmadd52_2x
3137         ud2
3138
3139 .align  32
3140 .Loop_vpmadd52_4x:
3141         #vpaddq         $T2,$H2,$H2             # accumulate input
3142         vpaddq          $T0,$H0,$H0
3143         vpaddq          $T1,$H1,$H1
3144
3145         vpxorq          $D0lo,$D0lo,$D0lo
3146         vpmadd52luq     $H2,$S1,$D0lo
3147         vpxorq          $D0hi,$D0hi,$D0hi
3148         vpmadd52huq     $H2,$S1,$D0hi
3149         vpxorq          $D1lo,$D1lo,$D1lo
3150         vpmadd52luq     $H2,$S2,$D1lo
3151         vpxorq          $D1hi,$D1hi,$D1hi
3152         vpmadd52huq     $H2,$S2,$D1hi
3153         vpxorq          $D2lo,$D2lo,$D2lo
3154         vpmadd52luq     $H2,$R0,$D2lo
3155         vpxorq          $D2hi,$D2hi,$D2hi
3156         vpmadd52huq     $H2,$R0,$D2hi
3157
3158          vmovdqu64      16*0($inp),$T2          # load data
3159          vmovdqu64      16*2($inp),$T3
3160          lea            16*4($inp),$inp
3161         vpmadd52luq     $H0,$R0,$D0lo
3162         vpmadd52huq     $H0,$R0,$D0hi
3163         vpmadd52luq     $H0,$R1,$D1lo
3164         vpmadd52huq     $H0,$R1,$D1hi
3165         vpmadd52luq     $H0,$R2,$D2lo
3166         vpmadd52huq     $H0,$R2,$D2hi
3167
3168          vpunpcklqdq    $T3,$T2,$T1             # transpose data
3169          vpunpckhqdq    $T3,$T2,$T3
3170         vpmadd52luq     $H1,$S2,$D0lo
3171         vpmadd52huq     $H1,$S2,$D0hi
3172         vpmadd52luq     $H1,$R0,$D1lo
3173         vpmadd52huq     $H1,$R0,$D1hi
3174         vpmadd52luq     $H1,$R1,$D2lo
3175         vpmadd52huq     $H1,$R1,$D2hi
3176
3177         ################################################################
3178         # partial reduction (interleaved with data splat)
3179         vpsrlq          \$44,$D0lo,$tmp
3180         vpsllq          \$8,$D0hi,$D0hi
3181         vpandq          $mask44,$D0lo,$H0
3182         vpaddq          $tmp,$D0hi,$D0hi
3183
3184          vpsrlq         \$24,$T3,$T2
3185          vporq          $PAD,$T2,$T2
3186         vpaddq          $D0hi,$D1lo,$D1lo
3187
3188         vpsrlq          \$44,$D1lo,$tmp
3189         vpsllq          \$8,$D1hi,$D1hi
3190         vpandq          $mask44,$D1lo,$H1
3191         vpaddq          $tmp,$D1hi,$D1hi
3192
3193          vpandq         $mask44,$T1,$T0
3194          vpsrlq         \$44,$T1,$T1
3195          vpsllq         \$20,$T3,$T3
3196         vpaddq          $D1hi,$D2lo,$D2lo
3197
3198         vpsrlq          \$42,$D2lo,$tmp
3199         vpsllq          \$10,$D2hi,$D2hi
3200         vpandq          $mask42,$D2lo,$H2
3201         vpaddq          $tmp,$D2hi,$D2hi
3202
3203           vpaddq        $T2,$H2,$H2             # accumulate input
3204         vpaddq          $D2hi,$H0,$H0
3205         vpsllq          \$2,$D2hi,$D2hi
3206
3207         vpaddq          $D2hi,$H0,$H0
3208          vporq          $T3,$T1,$T1
3209          vpandq         $mask44,$T1,$T1
3210
3211         vpsrlq          \$44,$H0,$tmp           # additional step
3212         vpandq          $mask44,$H0,$H0
3213
3214         vpaddq          $tmp,$H1,$H1
3215
3216         sub             \$4,$len                # len-=64
3217         jnz             .Loop_vpmadd52_4x
3218
3219 .Ltail_vpmadd52_4x:
3220         vmovdqu64       128($ctx),$R2           # load all key powers
3221         vmovdqu64       160($ctx),$S1
3222         vmovdqu64       64($ctx),$R0
3223         vmovdqu64       96($ctx),$R1
3224
3225 .Ltail_vpmadd52_2x:
3226         vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
3227         vpaddq          $R2,$S2,$S2
3228         vpsllq          \$2,$S2,$S2
3229
3230         #vpaddq         $T2,$H2,$H2             # accumulate input
3231         vpaddq          $T0,$H0,$H0
3232         vpaddq          $T1,$H1,$H1
3233
3234         vpxorq          $D0lo,$D0lo,$D0lo
3235         vpmadd52luq     $H2,$S1,$D0lo
3236         vpxorq          $D0hi,$D0hi,$D0hi
3237         vpmadd52huq     $H2,$S1,$D0hi
3238         vpxorq          $D1lo,$D1lo,$D1lo
3239         vpmadd52luq     $H2,$S2,$D1lo
3240         vpxorq          $D1hi,$D1hi,$D1hi
3241         vpmadd52huq     $H2,$S2,$D1hi
3242         vpxorq          $D2lo,$D2lo,$D2lo
3243         vpmadd52luq     $H2,$R0,$D2lo
3244         vpxorq          $D2hi,$D2hi,$D2hi
3245         vpmadd52huq     $H2,$R0,$D2hi
3246
3247         vpmadd52luq     $H0,$R0,$D0lo
3248         vpmadd52huq     $H0,$R0,$D0hi
3249         vpmadd52luq     $H0,$R1,$D1lo
3250         vpmadd52huq     $H0,$R1,$D1hi
3251         vpmadd52luq     $H0,$R2,$D2lo
3252         vpmadd52huq     $H0,$R2,$D2hi
3253
3254         vpmadd52luq     $H1,$S2,$D0lo
3255         vpmadd52huq     $H1,$S2,$D0hi
3256         vpmadd52luq     $H1,$R0,$D1lo
3257         vpmadd52huq     $H1,$R0,$D1hi
3258         vpmadd52luq     $H1,$R1,$D2lo
3259         vpmadd52huq     $H1,$R1,$D2hi
3260
3261         ################################################################
3262         # horizontal addition
3263
3264         mov             \$1,%eax
3265         kmovw           %eax,%k1
3266         vpsrldq         \$8,$D0lo,$T0
3267         vpsrldq         \$8,$D0hi,$H0
3268         vpsrldq         \$8,$D1lo,$T1
3269         vpsrldq         \$8,$D1hi,$H1
3270         vpaddq          $T0,$D0lo,$D0lo
3271         vpaddq          $H0,$D0hi,$D0hi
3272         vpsrldq         \$8,$D2lo,$T2
3273         vpsrldq         \$8,$D2hi,$H2
3274         vpaddq          $T1,$D1lo,$D1lo
3275         vpaddq          $H1,$D1hi,$D1hi
3276          vpermq         \$0x2,$D0lo,$T0
3277          vpermq         \$0x2,$D0hi,$H0
3278         vpaddq          $T2,$D2lo,$D2lo
3279         vpaddq          $H2,$D2hi,$D2hi
3280
3281         vpermq          \$0x2,$D1lo,$T1
3282         vpermq          \$0x2,$D1hi,$H1
3283         vpaddq          $T0,$D0lo,${D0lo}{%k1}{z}
3284         vpaddq          $H0,$D0hi,${D0hi}{%k1}{z}
3285         vpermq          \$0x2,$D2lo,$T2
3286         vpermq          \$0x2,$D2hi,$H2
3287         vpaddq          $T1,$D1lo,${D1lo}{%k1}{z}
3288         vpaddq          $H1,$D1hi,${D1hi}{%k1}{z}
3289         vpaddq          $T2,$D2lo,${D2lo}{%k1}{z}
3290         vpaddq          $H2,$D2hi,${D2hi}{%k1}{z}
3291
3292         ################################################################
3293         # partial reduction
3294         vpsrlq          \$44,$D0lo,$tmp
3295         vpsllq          \$8,$D0hi,$D0hi
3296         vpandq          $mask44,$D0lo,$H0
3297         vpaddq          $tmp,$D0hi,$D0hi
3298
3299         vpaddq          $D0hi,$D1lo,$D1lo
3300
3301         vpsrlq          \$44,$D1lo,$tmp
3302         vpsllq          \$8,$D1hi,$D1hi
3303         vpandq          $mask44,$D1lo,$H1
3304         vpaddq          $tmp,$D1hi,$D1hi
3305
3306         vpaddq          $D1hi,$D2lo,$D2lo
3307
3308         vpsrlq          \$42,$D2lo,$tmp
3309         vpsllq          \$10,$D2hi,$D2hi
3310         vpandq          $mask42,$D2lo,$H2
3311         vpaddq          $tmp,$D2hi,$D2hi
3312
3313         vpaddq          $D2hi,$H0,$H0
3314         vpsllq          \$2,$D2hi,$D2hi
3315
3316         vpaddq          $D2hi,$H0,$H0
3317
3318         vpsrlq          \$44,$H0,$tmp           # additional step
3319         vpandq          $mask44,$H0,$H0
3320
3321         vpaddq          $tmp,$H1,$H1
3322                                                 # at this point $len is
3323                                                 # either 4*n+2 or 0...
3324         sub             \$2,$len                # len-=32
3325         ja              .Lblocks_vpmadd52_4x_do
3326
3327         vmovq           %x#$H0,0($ctx)
3328         vmovq           %x#$H1,8($ctx)
3329         vmovq           %x#$H2,16($ctx)
3330         vzeroall
3331
3332 .Lno_data_vpmadd52_4x:
3333         ret
3334 .size   poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3335 ___
3336 }
3337 {
3338 ########################################################################
3339 # As implied by its name 8x subroutine processes 8 blocks in parallel...
3340 # This is an intermediate version, as it's used only in cases when input
3341 # length is either 8*n, 8*n+1 or 8*n+2...
3342
3343 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3344 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3345 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3346 my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3347
3348 $code.=<<___;
3349 .type   poly1305_blocks_vpmadd52_8x,\@function,4
3350 .align  32
3351 poly1305_blocks_vpmadd52_8x:
3352         shr     \$4,$len
3353         jz      .Lno_data_vpmadd52_8x           # too short
3354
3355         shl     \$40,$padbit
3356         mov     64($ctx),%r8                    # peek on power of the key
3357
3358         vmovdqa64       .Lx_mask44(%rip),$mask44
3359         vmovdqa64       .Lx_mask42(%rip),$mask42
3360
3361         test    %r8,%r8                         # is power value impossible?
3362         js      .Linit_vpmadd52                 # if it is, then init R[4]
3363
3364         vmovq   0($ctx),%x#$H0                  # load current hash value
3365         vmovq   8($ctx),%x#$H1
3366         vmovq   16($ctx),%x#$H2
3367
3368 .Lblocks_vpmadd52_8x:
3369         ################################################################
3370         # first we calculate more key powers
3371
3372         vmovdqu64       128($ctx),$R2           # load 1-3-2-4 powers
3373         vmovdqu64       160($ctx),$S1
3374         vmovdqu64       64($ctx),$R0
3375         vmovdqu64       96($ctx),$R1
3376
3377         vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
3378         vpaddq          $R2,$S2,$S2
3379         vpsllq          \$2,$S2,$S2
3380
3381         vpbroadcastq    %x#$R2,$RR2             # broadcast 4th power
3382         vpbroadcastq    %x#$R0,$RR0
3383         vpbroadcastq    %x#$R1,$RR1
3384
3385         vpxorq          $D0lo,$D0lo,$D0lo
3386         vpmadd52luq     $RR2,$S1,$D0lo
3387         vpxorq          $D0hi,$D0hi,$D0hi
3388         vpmadd52huq     $RR2,$S1,$D0hi
3389         vpxorq          $D1lo,$D1lo,$D1lo
3390         vpmadd52luq     $RR2,$S2,$D1lo
3391         vpxorq          $D1hi,$D1hi,$D1hi
3392         vpmadd52huq     $RR2,$S2,$D1hi
3393         vpxorq          $D2lo,$D2lo,$D2lo
3394         vpmadd52luq     $RR2,$R0,$D2lo
3395         vpxorq          $D2hi,$D2hi,$D2hi
3396         vpmadd52huq     $RR2,$R0,$D2hi
3397
3398         vpmadd52luq     $RR0,$R0,$D0lo
3399         vpmadd52huq     $RR0,$R0,$D0hi
3400         vpmadd52luq     $RR0,$R1,$D1lo
3401         vpmadd52huq     $RR0,$R1,$D1hi
3402         vpmadd52luq     $RR0,$R2,$D2lo
3403         vpmadd52huq     $RR0,$R2,$D2hi
3404
3405         vpmadd52luq     $RR1,$S2,$D0lo
3406         vpmadd52huq     $RR1,$S2,$D0hi
3407         vpmadd52luq     $RR1,$R0,$D1lo
3408         vpmadd52huq     $RR1,$R0,$D1hi
3409         vpmadd52luq     $RR1,$R1,$D2lo
3410         vpmadd52huq     $RR1,$R1,$D2hi
3411
3412         ################################################################
3413         # partial reduction
3414         vpsrlq          \$44,$D0lo,$tmp
3415         vpsllq          \$8,$D0hi,$D0hi
3416         vpandq          $mask44,$D0lo,$RR0
3417         vpaddq          $tmp,$D0hi,$D0hi
3418
3419         vpaddq          $D0hi,$D1lo,$D1lo
3420
3421         vpsrlq          \$44,$D1lo,$tmp
3422         vpsllq          \$8,$D1hi,$D1hi
3423         vpandq          $mask44,$D1lo,$RR1
3424         vpaddq          $tmp,$D1hi,$D1hi
3425
3426         vpaddq          $D1hi,$D2lo,$D2lo
3427
3428         vpsrlq          \$42,$D2lo,$tmp
3429         vpsllq          \$10,$D2hi,$D2hi
3430         vpandq          $mask42,$D2lo,$RR2
3431         vpaddq          $tmp,$D2hi,$D2hi
3432
3433         vpaddq          $D2hi,$RR0,$RR0
3434         vpsllq          \$2,$D2hi,$D2hi
3435
3436         vpaddq          $D2hi,$RR0,$RR0
3437
3438         vpsrlq          \$44,$RR0,$tmp          # additional step
3439         vpandq          $mask44,$RR0,$RR0
3440
3441         vpaddq          $tmp,$RR1,$RR1
3442
3443         ################################################################
3444         # At this point Rx holds 1324 powers, RRx - 5768, and the goal
3445         # is 15263748, which reflects how data is loaded...
3446
3447         vpunpcklqdq     $R2,$RR2,$T2            # 3748
3448         vpunpckhqdq     $R2,$RR2,$R2            # 1526
3449         vpunpcklqdq     $R0,$RR0,$T0
3450         vpunpckhqdq     $R0,$RR0,$R0
3451         vpunpcklqdq     $R1,$RR1,$T1
3452         vpunpckhqdq     $R1,$RR1,$R1
3453 ___
3454 ######## switch to %zmm
3455 map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3456 map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3457 map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3458 map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3459
3460 $code.=<<___;
3461         vshufi64x2      \$0x44,$R2,$T2,$RR2     # 15263748
3462         vshufi64x2      \$0x44,$R0,$T0,$RR0
3463         vshufi64x2      \$0x44,$R1,$T1,$RR1
3464
3465         vmovdqu64       16*0($inp),$T2          # load data