#! /usr/bin/env perl
-# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Add AVX512F+VL+BW code path.
#
+# November 2017
+#
+# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
+# executed even on Knights Landing. Trigger for modification was
+# observation that AVX512 code paths can negatively affect overall
+# Skylake-X system performance. Since we are likely to suppress
+# AVX512F capability flag [at least on Skylake-X], conversion serves
+# as kind of "investment protection". Note that next *lake processor,
+# Cannolake, has AVX512IFMA code path to execute...
+#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
-# IALU/gcc-4.8(*) AVX(**) AVX2
+# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
# P4 4.46/+120% -
# Core 2 2.41/+90% -
# Westmere 1.88/+120% -
# Sandy Bridge 1.39/+140% 1.10
# Haswell 1.14/+175% 1.11 0.65
-# Skylake 1.13/+120% 0.96 0.51
+# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
# Silvermont 2.83/+95% -
+# Knights L 3.60/? 1.65 1.10 0.41(***)
# Goldmont 1.70/+180% -
# VIA Nano 1.82/+150% -
# Sledgehammer 1.38/+160% -
# Bulldozer 2.30/+130% 0.97
+# Ryzen 1.15/+200% 1.08 1.18
#
# (*) improvement coefficients relative to clang are more modest and
# are ~50% on most processors, in both cases we are comparing to
# Core processors, 50-30%, less newer processor is, but slower on
# contemporary ones, for example almost 2x slower on Atom, and as
# former are naturally disappearing, SSE2 is deemed unnecessary;
+# (***) strangely enough performance seems to vary from core to core,
+# listed result is best case;
$flavour = shift;
$output = shift;
.type poly1305_blocks,\@function,4
.align 32
poly1305_blocks:
+.cfi_startproc
.Lblocks:
shr \$4,$len
jz .Lno_data # too short
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
.Lblocks_body:
mov $len,%r15 # reassign $len
mov $h2,16($ctx)
mov 0(%rsp),%r15
+.cfi_restore %r15
mov 8(%rsp),%r14
+.cfi_restore %r14
mov 16(%rsp),%r13
+.cfi_restore %r13
mov 24(%rsp),%r12
+.cfi_restore %r12
mov 32(%rsp),%rbp
+.cfi_restore %rbp
mov 40(%rsp),%rbx
+.cfi_restore %rbx
lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
ret
+.cfi_endproc
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,\@function,3
mov %r9,%rcx
adc \$0,%r9
adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overfow?
+ shr \$2,%r10 # did 130-bit value overflow?
cmovnz %r8,%rax
cmovnz %r9,%rcx
.type poly1305_blocks_avx,\@function,4
.align 32
poly1305_blocks_avx:
+.cfi_startproc
mov 20($ctx),%r8d # is_base2_26
cmp \$128,$len
jae .Lblocks_avx
jz .Leven_avx
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
.Lblocks_avx_body:
mov $len,%r15 # reassign $len
.align 16
.Ldone_avx:
mov 0(%rsp),%r15
+.cfi_restore %r15
mov 8(%rsp),%r14
+.cfi_restore %r14
mov 16(%rsp),%r13
+.cfi_restore %r13
mov 24(%rsp),%r12
+.cfi_restore %r12
mov 32(%rsp),%rbp
+.cfi_restore %rbp
mov 40(%rsp),%rbx
+.cfi_restore %rbx
lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
.Lno_data_avx:
.Lblocks_avx_epilogue:
ret
+.cfi_endproc
.align 32
.Lbase2_64_avx:
+.cfi_startproc
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
.Lbase2_64_avx_body:
mov $len,%r15 # reassign $len
mov %r15,$len
mov 0(%rsp),%r15
+.cfi_restore %r15
mov 8(%rsp),%r14
+.cfi_restore %r14
mov 16(%rsp),%r13
+.cfi_restore %r13
mov 24(%rsp),%r12
+.cfi_restore %r12
mov 32(%rsp),%rbp
+.cfi_restore %rbp
mov 40(%rsp),%rbx
+.cfi_restore %rbx
lea 48(%rsp),%rax
lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
.Lbase2_64_avx_epilogue:
jmp .Ldo_avx
+.cfi_endproc
.align 32
.Leven_avx:
+.cfi_startproc
vmovd 4*0($ctx),$H0 # load hash value
vmovd 4*1($ctx),$H1
vmovd 4*2($ctx),$H2
___
$code.=<<___ if (!$win64);
lea -0x58(%rsp),%r11
+.cfi_def_cfa %r11,0x60
sub \$0x178,%rsp
___
$code.=<<___ if ($win64);
___
$code.=<<___ if (!$win64);
lea 0x58(%r11),%rsp
+.cfi_def_cfa %rsp,8
___
$code.=<<___;
vzeroupper
ret
+.cfi_endproc
.size poly1305_blocks_avx,.-poly1305_blocks_avx
.type poly1305_emit_avx,\@function,3
mov %r9,%rcx
adc \$0,%r9
adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overfow?
+ shr \$2,%r10 # did 130-bit value overflow?
cmovnz %r8,%rax
cmovnz %r9,%rcx
.type poly1305_blocks_avx2,\@function,4
.align 32
poly1305_blocks_avx2:
+.cfi_startproc
mov 20($ctx),%r8d # is_base2_26
cmp \$128,$len
jae .Lblocks_avx2
jz .Leven_avx2
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
.Lblocks_avx2_body:
mov $len,%r15 # reassign $len
.align 16
.Ldone_avx2:
mov 0(%rsp),%r15
+.cfi_restore %r15
mov 8(%rsp),%r14
+.cfi_restore %r14
mov 16(%rsp),%r13
+.cfi_restore %r13
mov 24(%rsp),%r12
+.cfi_restore %r12
mov 32(%rsp),%rbp
+.cfi_restore %rbp
mov 40(%rsp),%rbx
+.cfi_restore %rbx
lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
ret
+.cfi_endproc
.align 32
.Lbase2_64_avx2:
+.cfi_startproc
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
.Lbase2_64_avx2_body:
mov $len,%r15 # reassign $len
mov \$`(1<<31|1<<30|1<<16)`,%r11d
mov 0(%rsp),%r15
+.cfi_restore %r15
mov 8(%rsp),%r14
+.cfi_restore %r14
mov 16(%rsp),%r13
+.cfi_restore %r13
mov 24(%rsp),%r12
+.cfi_restore %r12
mov 32(%rsp),%rbp
+.cfi_restore %rbp
mov 40(%rsp),%rbx
+.cfi_restore %rbx
lea 48(%rsp),%rax
lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
.Lbase2_64_avx2_epilogue:
jmp .Ldo_avx2
+.cfi_endproc
.align 32
.Leven_avx2:
+.cfi_startproc
mov OPENSSL_ia32cap_P+8(%rip),%r10d
- mov \$`(1<<31|1<<30|1<<16)`,%r11d
vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
vmovd 4*1($ctx),%x#$H1
vmovd 4*2($ctx),%x#$H2
cmp \$512,$len
jb .Lskip_avx512
and %r11d,%r10d
- cmp %r11d,%r10d # check for AVX512F+BW+VL
- je .Lblocks_avx512
+ test \$`1<<16`,%r10d # check for AVX512F
+ jnz .Lblocks_avx512
.Lskip_avx512:
___
$code.=<<___ if (!$win64);
lea -8(%rsp),%r11
+.cfi_def_cfa %r11,16
sub \$0x128,%rsp
___
$code.=<<___ if ($win64);
___
$code.=<<___ if (!$win64);
lea 8(%r11),%rsp
+.cfi_def_cfa %rsp,8
___
$code.=<<___;
vzeroupper
ret
+.cfi_endproc
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
___
#######################################################################
# reason stack layout is kept identical to poly1305_blocks_avx2. If not
# for this tail, we wouldn't have to even allocate stack frame...
-my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%ymm$_",(16..24));
-my ($M0,$M1,$M2,$M3,$M4) = map("%ymm$_",(25..29));
+my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
+my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
my $PADBIT="%zmm30";
-my $GATHER="%ymm31";
+
+map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
+map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
+map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
+map(s/%y/%z/,($MASK));
$code.=<<___;
.type poly1305_blocks_avx512,\@function,4
.align 32
poly1305_blocks_avx512:
+.cfi_startproc
.Lblocks_avx512:
- vzeroupper
+ mov \$15,%eax
+ kmovw %eax,%k2
___
$code.=<<___ if (!$win64);
lea -8(%rsp),%r11
+.cfi_def_cfa %r11,16
sub \$0x128,%rsp
___
$code.=<<___ if ($win64);
$code.=<<___;
lea .Lconst(%rip),%rcx
lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),$T2 # .Lpermd_avx2
+ vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
# expand pre-calculated table
- vmovdqu32 `16*0-64`($ctx),%x#$R0
+ vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
and \$-512,%rsp
- vmovdqu32 `16*1-64`($ctx),%x#$R1
- vmovdqu32 `16*2-64`($ctx),%x#$S1
- vmovdqu32 `16*3-64`($ctx),%x#$R2
- vmovdqu32 `16*4-64`($ctx),%x#$S2
- vmovdqu32 `16*5-64`($ctx),%x#$R3
- vmovdqu32 `16*6-64`($ctx),%x#$S3
- vmovdqu32 `16*7-64`($ctx),%x#$R4
- vmovdqu32 `16*8-64`($ctx),%x#$S4
- vpermd $R0,$T2,$R0 # 00003412 -> 14243444
- vmovdqa64 64(%rcx),$MASK # .Lmask26
- vpermd $R1,$T2,$R1
- vpermd $S1,$T2,$S1
- vpermd $R2,$T2,$R2
- vmovdqa32 $R0,0x00(%rsp) # save in case $len%128 != 0
+ vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
+ mov \$0x20,%rax
+ vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
+ vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
+ vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
+ vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
+ vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
+ vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
+ vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
+ vpermd $D0,$T2,$R0 # 00003412 -> 14243444
+ vpbroadcastq 64(%rcx),$MASK # .Lmask26
+ vpermd $D1,$T2,$R1
+ vpermd $T0,$T2,$S1
+ vpermd $D2,$T2,$R2
+ vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
- vpermd $S2,$T2,$S2
- vmovdqa32 $R1,0x20(%rsp)
+ vpermd $T1,$T2,$S2
+ vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
vpsrlq \$32,$R1,$T1
- vpermd $R3,$T2,$R3
- vmovdqa32 $S1,0x40(%rsp)
- vpermd $S3,$T2,$S3
- vpermd $R4,$T2,$R4
- vmovdqa32 $R2,0x60(%rsp)
- vpermd $S4,$T2,$S4
- vmovdqa32 $S2,0x80(%rsp)
- vmovdqa32 $R3,0xa0(%rsp)
- vmovdqa32 $S3,0xc0(%rsp)
- vmovdqa32 $R4,0xe0(%rsp)
- vmovdqa32 $S4,0x100(%rsp)
+ vpermd $D3,$T2,$R3
+ vmovdqa64 $S1,0x40(%rsp){%k2}
+ vpermd $T3,$T2,$S3
+ vpermd $D4,$T2,$R4
+ vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
+ vpermd $T4,$T2,$S4
+ vmovdqa64 $S2,0x80(%rsp){%k2}
+ vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
+ vmovdqa64 $S3,0xc0(%rsp){%k2}
+ vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
+ vmovdqa64 $S4,0x100(%rsp){%k2}
################################################################
# calculate 5th through 8th powers of the key
vpandq $MASK,$D3,$D3
vpaddq $M3,$D4,$D4 # d3 -> d4
-___
-map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
-map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3));
-map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
-map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
-map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
-map(s/%y/%z/,($MASK));
-$code.=<<___;
################################################################
# at this point we have 14243444 in $R0-$S4 and 05060708 in
# $D0-$D4, ...
# we could just flow along, hence the goal for $R0-$S4 is
# 1858286838784888 ...
- mov \$0b0110011001100110,%eax
- mov \$0b1100110011001100,%r8d
- mov \$0b0101010101010101,%r9d
+ vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
+ mov \$0x7777,%eax
kmovw %eax,%k1
- kmovw %r8d,%k2
- kmovw %r9d,%k3
-
- vpbroadcastq %x#$D0,$M0 # 0808080808080808
- vpbroadcastq %x#$D1,$M1
- vpbroadcastq %x#$D2,$M2
- vpbroadcastq %x#$D3,$M3
- vpbroadcastq %x#$D4,$M4
-
- vpexpandd $D0,${D0}{%k1} # 05060708 -> -05--06--07--08-
- vpexpandd $D1,${D1}{%k1}
- vpexpandd $D2,${D2}{%k1}
- vpexpandd $D3,${D3}{%k1}
- vpexpandd $D4,${D4}{%k1}
-
- vpexpandd $R0,${D0}{%k2} # -05--06--07--08- -> 145-246-347-448-
- vpexpandd $R1,${D1}{%k2}
- vpexpandd $R2,${D2}{%k2}
- vpexpandd $R3,${D3}{%k2}
- vpexpandd $R4,${D4}{%k2}
-
- vpblendmd $M0,$D0,${R0}{%k3} # 1858286838784888
- vpblendmd $M1,$D1,${R1}{%k3}
- vpblendmd $M2,$D2,${R2}{%k3}
- vpblendmd $M3,$D3,${R3}{%k3}
- vpblendmd $M4,$D4,${R4}{%k3}
+
+ vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
+ vpermd $R1,$M0,$R1
+ vpermd $R2,$M0,$R2
+ vpermd $R3,$M0,$R3
+ vpermd $R4,$M0,$R4
+
+ vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
+ vpermd $D1,$M0,${R1}{%k1}
+ vpermd $D2,$M0,${R2}{%k1}
+ vpermd $D3,$M0,${R3}{%k1}
+ vpermd $D4,$M0,${R4}{%k1}
vpslld \$2,$R1,$S1 # *5
vpslld \$2,$R2,$S2
vpaddd $R3,$S3,$S3
vpaddd $R4,$S4,$S4
- vpbroadcastq %x#$MASK,$MASK
vpbroadcastq 32(%rcx),$PADBIT # .L129
vpsrlq \$52,$T0,$T2 # splat input
vpsrlq \$40,$T4,$T4 # 4
vpandq $MASK,$T2,$T2 # 2
vpandq $MASK,$T0,$T0 # 0
- vpandq $MASK,$T1,$T1 # 1
- vpandq $MASK,$T3,$T3 # 3
+ #vpandq $MASK,$T1,$T1 # 1
+ #vpandq $MASK,$T3,$T3 # 3
#vporq $PADBIT,$T4,$T4 # padbit, yes, always
vpaddq $H2,$T2,$H2 # accumulate input
- mov \$0x0f,%eax
sub \$192,$len
jbe .Ltail_avx512
jmp .Loop_avx512
vpmuludq $H2,$R1,$D3 # d3 = h2*r1
vpaddq $H0,$T0,$H0
vpmuludq $H2,$R2,$D4 # d4 = h2*r2
+ vpandq $MASK,$T1,$T1 # 1
vpmuludq $H2,$S3,$D0 # d0 = h2*s3
+ vpandq $MASK,$T3,$T3 # 3
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
vporq $PADBIT,$T4,$T4 # padbit, yes, always
vpmuludq $H2,$R0,$D2 # d2 = h2*r0
vpaddq $D3,$H4,$H4 # h3 -> h4
vpandq $MASK,$T0,$T0 # 0
- vpandq $MASK,$T1,$T1 # 1
- vpandq $MASK,$T3,$T3 # 3
+ #vpandq $MASK,$T1,$T1 # 1
+ #vpandq $MASK,$T3,$T3 # 3
#vporq $PADBIT,$T4,$T4 # padbit, yes, always
sub \$128,$len
vpmuludq $H2,$R1,$D3 # d3 = h2*r1
vpmuludq $H2,$R2,$D4 # d4 = h2*r2
vpmuludq $H2,$S3,$D0 # d0 = h2*s3
+ vpandq $MASK,$T1,$T1 # 1
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+ vpandq $MASK,$T3,$T3 # 3
vpmuludq $H2,$R0,$D2 # d2 = h2*r0
vporq $PADBIT,$T4,$T4 # padbit, yes, always
vpaddq $H1,$T1,$H1 # accumulate input
vpaddq $H3,$T3,$H3
vpaddq $H4,$T4,$H4
- vmovdqu64 16*0($inp),%x#$T0
+ vmovdqu 16*0($inp),%x#$T0
vpmuludq $H0,$R3,$M3
vpmuludq $H0,$R4,$M4
vpmuludq $H0,$R0,$M0
vpaddq $M0,$D0,$D0 # d0 += h0*r0
vpaddq $M1,$D1,$D1 # d1 += h0*r1
- vmovdqu64 16*1($inp),%x#$T1
+ vmovdqu 16*1($inp),%x#$T1
vpmuludq $H1,$R2,$M3
vpmuludq $H1,$R3,$M4
vpmuludq $H1,$S4,$M0
vpaddq $M0,$D0,$D0 # d0 += h1*s4
vpaddq $M2,$D2,$D2 # d2 += h0*r2
- vinserti64x2 \$1,16*2($inp),$T0,$T0
+ vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
vpmuludq $H3,$R0,$M3
vpmuludq $H3,$R1,$M4
vpmuludq $H1,$R0,$M1
vpaddq $M1,$D1,$D1 # d1 += h1*r0
vpaddq $M2,$D2,$D2 # d2 += h1*r1
- vinserti64x2 \$1,16*3($inp),$T1,$T1
+ vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
vpmuludq $H4,$S4,$M3
vpmuludq $H4,$R0,$M4
vpmuludq $H3,$S2,$M0
# horizontal addition
mov \$1,%eax
- vpsrldq \$8,$H3,$D3
- vpsrldq \$8,$D4,$H4
- vpsrldq \$8,$H0,$D0
- vpsrldq \$8,$H1,$D1
- vpsrldq \$8,$H2,$D2
+ vpermq \$0xb1,$H3,$D3
+ vpermq \$0xb1,$D4,$H4
+ vpermq \$0xb1,$H0,$D0
+ vpermq \$0xb1,$H1,$D1
+ vpermq \$0xb1,$H2,$D2
vpaddq $D3,$H3,$H3
vpaddq $D4,$H4,$H4
vpaddq $D0,$H0,$H0
# lazy reduction (interleaved with input splat)
vpsrlq \$26,$H3,$D3
- vpandq $MASK,$H3,$H3
+ vpand $MASK,$H3,$H3
vpsrldq \$6,$T0,$T2 # splat input
vpsrldq \$6,$T1,$T3
vpunpckhqdq $T1,$T0,$T4 # 4
vpaddq $D3,$H4,$H4 # h3 -> h4
vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
+ vpand $MASK,$H0,$H0
vpunpcklqdq $T3,$T2,$T2 # 2:3
vpunpcklqdq $T1,$T0,$T0 # 0:1
vpaddq $D0,$H1,$H1 # h0 -> h1
vpsrlq \$26,$H4,$D4
- vpandq $MASK,$H4,$H4
+ vpand $MASK,$H4,$H4
vpsrlq \$26,$H1,$D1
- vpandq $MASK,$H1,$H1
+ vpand $MASK,$H1,$H1
vpsrlq \$30,$T2,$T3
vpsrlq \$4,$T2,$T2
vpaddq $D1,$H2,$H2 # h1 -> h2
vpaddq $D4,$H0,$H0 # h4 -> h0
vpsrlq \$26,$H2,$D2
- vpandq $MASK,$H2,$H2
- vpandq $MASK,$T2,$T2 # 2
- vpandq $MASK,$T0,$T0 # 0
+ vpand $MASK,$H2,$H2
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T0,$T0 # 0
vpaddq $D2,$H3,$H3 # h2 -> h3
vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
+ vpand $MASK,$H0,$H0
vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
- vpandq $MASK,$T1,$T1 # 1
+ vpand $MASK,$T1,$T1 # 1
vpaddq $D0,$H1,$H1 # h0 -> h1
vpsrlq \$26,$H3,$D3
- vpandq $MASK,$H3,$H3
- vpandq $MASK,$T3,$T3 # 3
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
+ vpand $MASK,$H3,$H3
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
vpaddq $D3,$H4,$H4 # h3 -> h4
lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
___
$code.=<<___ if (!$win64);
lea 8(%r11),%rsp
+.cfi_def_cfa %rsp,8
___
$code.=<<___;
ret
+.cfi_endproc
.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
___
if ($avx>3) {
# path longer. In other words, even though base 2^44 reduction might
# look less elegant, overall critical path is actually shorter...
+########################################################################
+# Layout of opaque area is following.
+#
+# unsigned __int64 h[3]; # current hash value base 2^44
+# unsigned __int64 s[2]; # key value*20 base 2^44
+# unsigned __int64 r[3]; # key value base 2^44
+# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
+# # r^n positions reflect
+# # placement in register, not
+# # memory, R[3] is R[1]*20
+
$code.=<<___;
.type poly1305_init_base2_44,\@function,3
.align 32
shl \$2,%rcx # magic <<2
mov %rax,24($ctx) # s1
mov %rcx,32($ctx) # s2
+ movq \$-1,64($ctx) # write impossible value
___
$code.=<<___ if ($flavour !~ /elf32/);
mov %r10,0(%rdx)
shr \$4,$len
jz .Lno_data_vpmadd52 # too short
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+ # if powers of the key are not calculated yet, process up to 3
+ # blocks with this single-block subroutine, otherwise ensure that
+ # length is divisible by 2 blocks and pass the rest down to next
+ # subroutine...
+
+ mov \$3,%rax
+ mov \$1,%r10
+ cmp \$4,$len # is input long
+ cmovae %r10,%rax
+ test %r8,%r8 # is power value impossible?
+ cmovns %r10,%rax
+
+ and $len,%rax # is input of favourable length?
+ jz .Lblocks_vpmadd52_4x
+
+ sub %rax,$len
mov \$7,%r10d
mov \$1,%r11d
kmovw %r10d,%k7
lea .L2_44_inp_permd(%rip),%r10
- shl \$40,$padbit
kmovw %r11d,%k1
vmovq $padbit,%x#$PAD
vpaddq $T0,$Dlo,$Dlo
- dec $len # len-=16
+ dec %rax # len-=16
jnz .Loop_vpmadd52
vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
+ test $len,$len
+ jnz .Lblocks_vpmadd52_4x
+
.Lno_data_vpmadd52:
ret
.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
___
}
+{
+########################################################################
+# As implied by its name 4x subroutine processes 4 blocks in parallel
+# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
+# and is handled in 256-bit %ymm registers.
+
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52_4x,\@function,4
+.align 32
+poly1305_blocks_vpmadd52_4x:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52_4x # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+.Lblocks_vpmadd52_4x:
+ vpbroadcastq $padbit,$PAD
+
+ vmovdqa64 .Lx_mask44(%rip),$mask44
+ mov \$5,%eax
+ vmovdqa64 .Lx_mask42(%rip),$mask42
+ kmovw %eax,%k1 # used in 2x path
+
+ test %r8,%r8 # is power value impossible?
+ js .Linit_vpmadd52 # if it is, then init R[4]
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+ test \$3,$len # is length 4*n+2?
+ jnz .Lblocks_vpmadd52_2x_do
+
+.Lblocks_vpmadd52_4x_do:
+ vpbroadcastq 64($ctx),$R0 # load 4th power of the key
+ vpbroadcastq 96($ctx),$R1
+ vpbroadcastq 128($ctx),$R2
+ vpbroadcastq 160($ctx),$S1
+
+.Lblocks_vpmadd52_4x_key_loaded:
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ test \$7,$len # is len 8*n?
+ jz .Lblocks_vpmadd52_8x
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*2($inp),$T3
+ lea 16*4($inp),$inp
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as 3-1-2-0
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ sub \$4,$len
+ jz .Ltail_vpmadd52_4x
+ jmp .Loop_vpmadd52_4x
+ ud2
+
+.align 32
+.Linit_vpmadd52:
+ vmovq 24($ctx),%x#$S1 # load key
+ vmovq 56($ctx),%x#$H2
+ vmovq 32($ctx),%x#$S2
+ vmovq 40($ctx),%x#$R0
+ vmovq 48($ctx),%x#$R1
+
+ vmovdqa $R0,$H0
+ vmovdqa $R1,$H1
+ vmovdqa $H2,$R2
+
+ mov \$2,%eax
+
+.Lmul_init_vpmadd52:
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ dec %eax
+ jz .Ldone_init_vpmadd52
+
+ vpunpcklqdq $R1,$H1,$R1 # 1,2
+ vpbroadcastq %x#$H1,%x#$H1 # 2,2
+ vpunpcklqdq $R2,$H2,$R2
+ vpbroadcastq %x#$H2,%x#$H2
+ vpunpcklqdq $R0,$H0,$R0
+ vpbroadcastq %x#$H0,%x#$H0
+
+ vpsllq \$2,$R1,$S1 # S1 = R1*5*4
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R1,$S1,$S1
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S1,$S1
+ vpsllq \$2,$S2,$S2
+
+ jmp .Lmul_init_vpmadd52
+ ud2
+
+.align 32
+.Ldone_init_vpmadd52:
+ vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
+ vinserti128 \$1,%x#$R2,$H2,$R2
+ vinserti128 \$1,%x#$R0,$H0,$R0
+
+ vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
+ vpermq \$0b11011000,$R2,$R2
+ vpermq \$0b11011000,$R0,$R0
+
+ vpsllq \$2,$R1,$S1 # S1 = R1*5*4
+ vpaddq $R1,$S1,$S1
+ vpsllq \$2,$S1,$S1
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+ test \$3,$len # is length 4*n+2?
+ jnz .Ldone_init_vpmadd52_2x
+
+ vmovdqu64 $R0,64($ctx) # save key powers
+ vpbroadcastq %x#$R0,$R0 # broadcast 4th power
+ vmovdqu64 $R1,96($ctx)
+ vpbroadcastq %x#$R1,$R1
+ vmovdqu64 $R2,128($ctx)
+ vpbroadcastq %x#$R2,$R2
+ vmovdqu64 $S1,160($ctx)
+ vpbroadcastq %x#$S1,$S1
+
+ jmp .Lblocks_vpmadd52_4x_key_loaded
+ ud2
+
+.align 32
+.Ldone_init_vpmadd52_2x:
+ vmovdqu64 $R0,64($ctx) # save key powers
+ vpsrldq \$8,$R0,$R0 # 0-1-0-2
+ vmovdqu64 $R1,96($ctx)
+ vpsrldq \$8,$R1,$R1
+ vmovdqu64 $R2,128($ctx)
+ vpsrldq \$8,$R2,$R2
+ vmovdqu64 $S1,160($ctx)
+ vpsrldq \$8,$S1,$S1
+ jmp .Lblocks_vpmadd52_2x_key_loaded
+ ud2
+
+.align 32
+.Lblocks_vpmadd52_2x_do:
+ vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
+ vmovdqu64 160+8($ctx),${S1}{%k1}{z}
+ vmovdqu64 64+8($ctx),${R0}{%k1}{z}
+ vmovdqu64 96+8($ctx),${R1}{%k1}{z}
+
+.Lblocks_vpmadd52_2x_key_loaded:
+ vmovdqu64 16*0($inp),$T2 # load data
+ vpxorq $T3,$T3,$T3
+ lea 16*2($inp),$inp
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as x-1-x-0
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ jmp .Ltail_vpmadd52_2x
+ ud2
+
+.align 32
+.Loop_vpmadd52_4x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*2($inp),$T3
+ lea 16*4($inp),$inp
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction (interleaved with data splat)
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpsrlq \$24,$T3,$T2
+ vporq $PAD,$T2,$T2
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ sub \$4,$len # len-=64
+ jnz .Loop_vpmadd52_4x
+
+.Ltail_vpmadd52_4x:
+ vmovdqu64 128($ctx),$R2 # load all key powers
+ vmovdqu64 160($ctx),$S1
+ vmovdqu64 64($ctx),$R0
+ vmovdqu64 96($ctx),$R1
+
+.Ltail_vpmadd52_2x:
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # horizontal addition
+
+ mov \$1,%eax
+ kmovw %eax,%k1
+ vpsrldq \$8,$D0lo,$T0
+ vpsrldq \$8,$D0hi,$H0
+ vpsrldq \$8,$D1lo,$T1
+ vpsrldq \$8,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpsrldq \$8,$D2lo,$T2
+ vpsrldq \$8,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vpermq \$0x2,$D0lo,$T0
+ vpermq \$0x2,$D0hi,$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vpermq \$0x2,$D1lo,$T1
+ vpermq \$0x2,$D1hi,$H1
+ vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
+ vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
+ vpermq \$0x2,$D2lo,$T2
+ vpermq \$0x2,$D2hi,$H2
+ vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
+ vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
+ vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
+ vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+ # at this point $len is
+ # either 4*n+2 or 0...
+ sub \$2,$len # len-=32
+ ja .Lblocks_vpmadd52_4x_do
+
+ vmovq %x#$H0,0($ctx)
+ vmovq %x#$H1,8($ctx)
+ vmovq %x#$H2,16($ctx)
+ vzeroall
+
+.Lno_data_vpmadd52_4x:
+ ret
+.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
+___
+}
+{
+########################################################################
+# As implied by its name 8x subroutine processes 8 blocks in parallel...
+# This is intermediate version, as it's used only in cases when input
+# length is either 8*n, 8*n+1 or 8*n+2...
+
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52_8x,\@function,4
+.align 32
+poly1305_blocks_vpmadd52_8x:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52_8x # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+ vmovdqa64 .Lx_mask44(%rip),$mask44
+ vmovdqa64 .Lx_mask42(%rip),$mask42
+
+ test %r8,%r8 # is power value impossible?
+ js .Linit_vpmadd52 # if it is, then init R[4]
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+.Lblocks_vpmadd52_8x:
+ ################################################################
+ # fist we calculate more key powers
+
+ vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
+ vmovdqu64 160($ctx),$S1
+ vmovdqu64 64($ctx),$R0
+ vmovdqu64 96($ctx),$R1
+
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
+ vpbroadcastq %x#$R0,$RR0
+ vpbroadcastq %x#$R1,$RR1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $RR2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $RR2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $RR2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $RR2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $RR2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $RR2,$R0,$D2hi
+
+ vpmadd52luq $RR0,$R0,$D0lo
+ vpmadd52huq $RR0,$R0,$D0hi
+ vpmadd52luq $RR0,$R1,$D1lo
+ vpmadd52huq $RR0,$R1,$D1hi
+ vpmadd52luq $RR0,$R2,$D2lo
+ vpmadd52huq $RR0,$R2,$D2hi
+
+ vpmadd52luq $RR1,$S2,$D0lo
+ vpmadd52huq $RR1,$S2,$D0hi
+ vpmadd52luq $RR1,$R0,$D1lo
+ vpmadd52huq $RR1,$R0,$D1hi
+ vpmadd52luq $RR1,$R1,$D2lo
+ vpmadd52huq $RR1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$RR0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$RR1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$RR2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$RR0,$RR0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$RR0,$RR0
+
+ vpsrlq \$44,$RR0,$tmp # additional step
+ vpandq $mask44,$RR0,$RR0
+
+ vpaddq $tmp,$RR1,$RR1
+
+ ################################################################
+ # At this point Rx holds 1324 powers, RRx - 5768, and the goal
+ # is 15263748, which reflects how data is loaded...
+
+ vpunpcklqdq $R2,$RR2,$T2 # 3748
+ vpunpckhqdq $R2,$RR2,$R2 # 1526
+ vpunpcklqdq $R0,$RR0,$T0
+ vpunpckhqdq $R0,$RR0,$R0
+ vpunpcklqdq $R1,$RR1,$T1
+ vpunpckhqdq $R1,$RR1,$R1
+___
+######## switch to %zmm
+map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
+map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
+map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
+map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
+
+$code.=<<___;
+ vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
+ vshufi64x2 \$0x44,$R0,$T0,$RR0
+ vshufi64x2 \$0x44,$R1,$T1,$RR1
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*4($inp),$T3
+ lea 16*8($inp),$inp
+
+ vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
+ vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
+ vpaddq $RR2,$SS2,$SS2
+ vpaddq $RR1,$SS1,$SS1
+ vpsllq \$2,$SS2,$SS2
+ vpsllq \$2,$SS1,$SS1
+
+ vpbroadcastq $padbit,$PAD
+ vpbroadcastq %x#$mask44,$mask44
+ vpbroadcastq %x#$mask42,$mask42
+
+ vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
+ vpbroadcastq %x#$SS2,$S2
+ vpbroadcastq %x#$RR0,$R0
+ vpbroadcastq %x#$RR1,$R1
+ vpbroadcastq %x#$RR2,$R2
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as 73625140
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ sub \$8,$len
+ jz .Ltail_vpmadd52_8x
+ jmp .Loop_vpmadd52_8x
+
+.align 32
+.Loop_vpmadd52_8x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*4($inp),$T3
+ lea 16*8($inp),$inp
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction (interleaved with data splat)
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpsrlq \$24,$T3,$T2
+ vporq $PAD,$T2,$T2
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ sub \$8,$len # len-=128
+ jnz .Loop_vpmadd52_8x
+
+.Ltail_vpmadd52_8x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$SS1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$SS1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$SS2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$SS2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$RR0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$RR0,$D2hi
+
+ vpmadd52luq $H0,$RR0,$D0lo
+ vpmadd52huq $H0,$RR0,$D0hi
+ vpmadd52luq $H0,$RR1,$D1lo
+ vpmadd52huq $H0,$RR1,$D1hi
+ vpmadd52luq $H0,$RR2,$D2lo
+ vpmadd52huq $H0,$RR2,$D2hi
+
+ vpmadd52luq $H1,$SS2,$D0lo
+ vpmadd52huq $H1,$SS2,$D0hi
+ vpmadd52luq $H1,$RR0,$D1lo
+ vpmadd52huq $H1,$RR0,$D1hi
+ vpmadd52luq $H1,$RR1,$D2lo
+ vpmadd52huq $H1,$RR1,$D2hi
+
+ ################################################################
+ # horizontal addition
+
+ mov \$1,%eax
+ kmovw %eax,%k1
+ vpsrldq \$8,$D0lo,$T0
+ vpsrldq \$8,$D0hi,$H0
+ vpsrldq \$8,$D1lo,$T1
+ vpsrldq \$8,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpsrldq \$8,$D2lo,$T2
+ vpsrldq \$8,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vpermq \$0x2,$D0lo,$T0
+ vpermq \$0x2,$D0hi,$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vpermq \$0x2,$D1lo,$T1
+ vpermq \$0x2,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpermq \$0x2,$D2lo,$T2
+ vpermq \$0x2,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vextracti64x4 \$1,$D0lo,%y#$T0
+ vextracti64x4 \$1,$D0hi,%y#$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vextracti64x4 \$1,$D1lo,%y#$T1
+ vextracti64x4 \$1,$D1hi,%y#$H1
+ vextracti64x4 \$1,$D2lo,%y#$T2
+ vextracti64x4 \$1,$D2hi,%y#$H2
+___
+######## switch back to %ymm
+map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
+map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
+map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
+
+$code.=<<___;
+ vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
+ vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
+ vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
+ vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
+ vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
+ vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ ################################################################
+
+ vmovq %x#$H0,0($ctx)
+ vmovq %x#$H1,8($ctx)
+ vmovq %x#$H2,16($ctx)
+ vzeroall
+
+.Lno_data_vpmadd52_8x:
+ ret
+.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
+___
+}
$code.=<<___;
.type poly1305_emit_base2_44,\@function,3
.align 32
mov %r9,%rcx
adc \$0,%r9
adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overfow?
+ shr \$2,%r10 # did 130-bit value overflow?
cmovnz %r8,%rax
cmovnz %r9,%rcx
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long 2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
.L2_44_inp_permd:
.long 0,1,1,2,2,3,7,7
.quad 44,44,42,64
.L2_44_shift_lft:
.quad 8,8,10,64
+
+.align 64
+.Lx_mask44:
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
-
$code.=<<___;
.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
+{ # chacha20-poly1305 helpers
+my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+$code.=<<___;
+.globl xor128_encrypt_n_pad
+.type xor128_encrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_encrypt_n_pad:
+ sub $otp,$inp
+ sub $otp,$out
+ mov $len,%r10 # put len aside
+ shr \$4,$len # len / 16
+ jz .Ltail_enc
+ nop
+.Loop_enc_xmm:
+ movdqu ($inp,$otp),%xmm0
+ pxor ($otp),%xmm0
+ movdqu %xmm0,($out,$otp)
+ movdqa %xmm0,($otp)
+ lea 16($otp),$otp
+ dec $len
+ jnz .Loop_enc_xmm
+
+ and \$15,%r10 # len % 16
+ jz .Ldone_enc
+
+.Ltail_enc:
+ mov \$16,$len
+ sub %r10,$len
+ xor %eax,%eax
+.Loop_enc_byte:
+ mov ($inp,$otp),%al
+ xor ($otp),%al
+ mov %al,($out,$otp)
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec %r10
+ jnz .Loop_enc_byte
+
+ xor %eax,%eax
+.Loop_enc_pad:
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec $len
+ jnz .Loop_enc_pad
+
+.Ldone_enc:
+ mov $otp,%rax
+ ret
+.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
+
+.globl xor128_decrypt_n_pad
+.type xor128_decrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_decrypt_n_pad:
+ sub $otp,$inp
+ sub $otp,$out
+ mov $len,%r10 # put len aside
+ shr \$4,$len # len / 16
+ jz .Ltail_dec
+ nop
+.Loop_dec_xmm:
+ movdqu ($inp,$otp),%xmm0
+ movdqa ($otp),%xmm1
+ pxor %xmm0,%xmm1
+ movdqu %xmm1,($out,$otp)
+ movdqa %xmm0,($otp)
+ lea 16($otp),$otp
+ dec $len
+ jnz .Loop_dec_xmm
+
+ pxor %xmm1,%xmm1
+ and \$15,%r10 # len % 16
+ jz .Ldone_dec
+
+.Ltail_dec:
+ mov \$16,$len
+ sub %r10,$len
+ xor %eax,%eax
+ xor %r11,%r11
+.Loop_dec_byte:
+ mov ($inp,$otp),%r11b
+ mov ($otp),%al
+ xor %r11b,%al
+ mov %al,($out,$otp)
+ mov %r11b,($otp)
+ lea 1($otp),$otp
+ dec %r10
+ jnz .Loop_dec_byte
+
+ xor %eax,%eax
+.Loop_dec_pad:
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec $len
+ jnz .Loop_dec_pad
+
+.Ldone_dec:
+ mov $otp,%rax
+ ret
+.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
+___
+}
+
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {