X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fpoly1305%2Fasm%2Fpoly1305-x86_64.pl;h=0d1c0de6458c7effbee7978cff51a1d2dc56a2b1;hp=ff4efb33e6a2fc6a702b9470e14ee92e02cf40e8;hb=fd38836ba8158cb30f0731f8a61780ed4b5a6825;hpb=fd910ef9593d4e16dabf4686ecabb351830045b6 diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl index ff4efb33e6..0d1c0de645 100755 --- a/crypto/poly1305/asm/poly1305-x86_64.pl +++ b/crypto/poly1305/asm/poly1305-x86_64.pl @@ -24,21 +24,33 @@ # # Add AVX512F+VL+BW code path. # +# November 2017 +# +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be +# executed even on Knights Landing. Trigger for modification was +# observation that AVX512 code paths can negatively affect overall +# Skylake-X system performance. Since we are likely to suppress +# AVX512F capability flag [at least on Skylake-X], conversion serves +# as kind of "investment protection". Note that next *lake processor, +# Cannolake, has AVX512IFMA code path to execute... +# # Numbers are cycles per processed byte with poly1305_blocks alone, # measured with rdtsc at fixed clock frequency. # -# IALU/gcc-4.8(*) AVX(**) AVX2 +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 # P4 4.46/+120% - # Core 2 2.41/+90% - # Westmere 1.88/+120% - # Sandy Bridge 1.39/+140% 1.10 # Haswell 1.14/+175% 1.11 0.65 -# Skylake 1.13/+120% 0.96 0.51 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] # Silvermont 2.83/+95% - +# Knights L 3.60/? 1.65 1.10 0.41(***) # Goldmont 1.70/+180% - # VIA Nano 1.82/+150% - # Sledgehammer 1.38/+160% - # Bulldozer 2.30/+130% 0.97 +# Ryzen 1.15/+200% 1.08 1.18 # # (*) improvement coefficients relative to clang are more modest and # are ~50% on most processors, in both cases we are comparing to @@ -48,6 +60,8 @@ # Core processors, 50-30%, less newer processor is, but slower on # contemporary ones, for example almost 2x slower on Atom, and as # former are naturally disappearing, SSE2 is deemed unnecessary; +# (***) strangely enough performance seems to vary from core to core, +# listed result is best case; $flavour = shift; $output = shift; @@ -210,16 +224,23 @@ $code.=<<___; .type poly1305_blocks,\@function,4 .align 32 poly1305_blocks: +.cfi_startproc .Lblocks: shr \$4,$len jz .Lno_data # too short push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lblocks_body: mov $len,%r15 # reassign $len @@ -255,15 +276,23 @@ $code.=<<___; mov $h2,16($ctx) mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data: .Lblocks_epilogue: ret +.cfi_endproc .size poly1305_blocks,.-poly1305_blocks .type poly1305_emit,\@function,3 @@ -279,7 +308,7 @@ poly1305_emit: mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? + shr \$2,%r10 # did 130-bit value overflow? 
cmovnz %r8,%rax cmovnz %r9,%rcx @@ -484,6 +513,7 @@ __poly1305_init_avx: .type poly1305_blocks_avx,\@function,4 .align 32 poly1305_blocks_avx: +.cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len jae .Lblocks_avx @@ -503,11 +533,17 @@ poly1305_blocks_avx: jz .Leven_avx push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lblocks_avx_body: mov $len,%r15 # reassign $len @@ -610,24 +646,39 @@ poly1305_blocks_avx: .align 16 .Ldone_avx: mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data_avx: .Lblocks_avx_epilogue: ret +.cfi_endproc .align 32 .Lbase2_64_avx: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lbase2_64_avx_body: mov $len,%r15 # reassign $len @@ -687,18 +738,27 @@ poly1305_blocks_avx: mov %r15,$len mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rax lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lbase2_64_avx_epilogue: jmp .Ldo_avx +.cfi_endproc .align 32 .Leven_avx: +.cfi_startproc vmovd 4*0($ctx),$H0 # load hash value vmovd 4*1($ctx),$H1 vmovd 4*2($ctx),$H2 @@ -709,6 +769,7 @@ poly1305_blocks_avx: ___ $code.=<<___ if (!$win64); lea -0x58(%rsp),%r11 +.cfi_def_cfa %r11,0x60 sub \$0x178,%rsp ___ $code.=<<___ if ($win64); @@ -1301,10 +1362,12 @@ $code.=<<___ if ($win64); ___ $code.=<<___ if (!$win64); lea 0x58(%r11),%rsp +.cfi_def_cfa %rsp,8 ___ $code.=<<___; vzeroupper ret +.cfi_endproc .size poly1305_blocks_avx,.-poly1305_blocks_avx .type poly1305_emit_avx,\@function,3 @@ -1350,7 +1413,7 @@ poly1305_emit_avx: mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? + shr \$2,%r10 # did 130-bit value overflow? 
cmovnz %r8,%rax cmovnz %r9,%rcx @@ -1372,6 +1435,7 @@ $code.=<<___; .type poly1305_blocks_avx2,\@function,4 .align 32 poly1305_blocks_avx2: +.cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len jae .Lblocks_avx2 @@ -1391,11 +1455,17 @@ poly1305_blocks_avx2: jz .Leven_avx2 push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lblocks_avx2_body: mov $len,%r15 # reassign $len @@ -1504,24 +1574,39 @@ poly1305_blocks_avx2: .align 16 .Ldone_avx2: mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lno_data_avx2: .Lblocks_avx2_epilogue: ret +.cfi_endproc .align 32 .Lbase2_64_avx2: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lbase2_64_avx2_body: mov $len,%r15 # reassign $len @@ -1588,20 +1673,28 @@ poly1305_blocks_avx2: mov \$`(1<<31|1<<30|1<<16)`,%r11d mov 0(%rsp),%r15 +.cfi_restore %r15 mov 8(%rsp),%r14 +.cfi_restore %r14 mov 16(%rsp),%r13 +.cfi_restore %r13 mov 24(%rsp),%r12 +.cfi_restore %r12 mov 32(%rsp),%rbp +.cfi_restore %rbp mov 40(%rsp),%rbx +.cfi_restore %rbx lea 48(%rsp),%rax lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 .Lbase2_64_avx2_epilogue: jmp .Ldo_avx2 +.cfi_endproc .align 32 .Leven_avx2: +.cfi_startproc mov OPENSSL_ia32cap_P+8(%rip),%r10d - mov \$`(1<<31|1<<30|1<<16)`,%r11d vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 vmovd 4*1($ctx),%x#$H1 vmovd 4*2($ctx),%x#$H2 @@ -1614,12 +1707,13 @@ $code.=<<___ if ($avx>2); cmp \$512,$len jb .Lskip_avx512 and %r11d,%r10d - cmp %r11d,%r10d # check for AVX512F+BW+VL - je .Lblocks_avx512 + test \$`1<<16`,%r10d # check for AVX512F + jnz .Lblocks_avx512 .Lskip_avx512: ___ $code.=<<___ if (!$win64); lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); @@ -2008,10 +2102,12 @@ $code.=<<___ if ($win64); ___ $code.=<<___ if (!$win64); lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 ___ $code.=<<___; vzeroupper ret +.cfi_endproc .size poly1305_blocks_avx2,.-poly1305_blocks_avx2 ___ ####################################################################### @@ -2022,20 +2118,27 @@ if ($avx>2) { # reason stack layout is kept identical to poly1305_blocks_avx2. If not # for this tail, we wouldn't have to even allocate stack frame... 
-my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%ymm$_",(16..24)); -my ($M0,$M1,$M2,$M3,$M4) = map("%ymm$_",(25..29)); +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); my $PADBIT="%zmm30"; -my $GATHER="%ymm31"; + +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); +map(s/%y/%z/,($MASK)); $code.=<<___; .type poly1305_blocks_avx512,\@function,4 .align 32 poly1305_blocks_avx512: +.cfi_startproc .Lblocks_avx512: - vzeroupper + mov \$15,%eax + kmovw %eax,%k2 ___ $code.=<<___ if (!$win64); lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); @@ -2056,40 +2159,41 @@ ___ $code.=<<___; lea .Lconst(%rip),%rcx lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),$T2 # .Lpermd_avx2 + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 # expand pre-calculated table - vmovdqu32 `16*0-64`($ctx),%x#$R0 + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} and \$-512,%rsp - vmovdqu32 `16*1-64`($ctx),%x#$R1 - vmovdqu32 `16*2-64`($ctx),%x#$S1 - vmovdqu32 `16*3-64`($ctx),%x#$R2 - vmovdqu32 `16*4-64`($ctx),%x#$S2 - vmovdqu32 `16*5-64`($ctx),%x#$R3 - vmovdqu32 `16*6-64`($ctx),%x#$S3 - vmovdqu32 `16*7-64`($ctx),%x#$R4 - vmovdqu32 `16*8-64`($ctx),%x#$S4 - vpermd $R0,$T2,$R0 # 00003412 -> 14243444 - vmovdqa64 64(%rcx),$MASK # .Lmask26 - vpermd $R1,$T2,$R1 - vpermd $S1,$T2,$S1 - vpermd $R2,$T2,$R2 - vmovdqa32 $R0,0x00(%rsp) # save in case $len%128 != 0 + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} + mov \$0x20,%rax + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} + vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4} + vpermd $D0,$T2,$R0 # 00003412 -> 14243444 + vpbroadcastq 64(%rcx),$MASK # .Lmask26 + vpermd $D1,$T2,$R1 + vpermd $T0,$T2,$S1 + vpermd $D2,$T2,$R2 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 - vpermd $S2,$T2,$S2 - vmovdqa32 $R1,0x20(%rsp) + vpermd $T1,$T2,$S2 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2} vpsrlq \$32,$R1,$T1 - vpermd $R3,$T2,$R3 - vmovdqa32 $S1,0x40(%rsp) - vpermd $S3,$T2,$S3 - vpermd $R4,$T2,$R4 - vmovdqa32 $R2,0x60(%rsp) - vpermd $S4,$T2,$S4 - vmovdqa32 $S2,0x80(%rsp) - vmovdqa32 $R3,0xa0(%rsp) - vmovdqa32 $S3,0xc0(%rsp) - vmovdqa32 $R4,0xe0(%rsp) - vmovdqa32 $S4,0x100(%rsp) + vpermd $D3,$T2,$R3 + vmovdqa64 $S1,0x40(%rsp){%k2} + vpermd $T3,$T2,$S3 + vpermd $D4,$T2,$R4 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2} + vpermd $T4,$T2,$S4 + vmovdqa64 $S2,0x80(%rsp){%k2} + vmovdqu64 $R3,0x80(%rsp,%rax){%k2} + vmovdqa64 $S3,0xc0(%rsp){%k2} + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} + vmovdqa64 $S4,0x100(%rsp){%k2} ################################################################ # calculate 5th through 8th powers of the key @@ -2193,14 +2297,6 @@ $code.=<<___; vpandq $MASK,$D3,$D3 vpaddq $M3,$D4,$D4 # d3 -> d4 -___ -map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain -map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3)); -map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); -map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4)); -map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); -map(s/%y/%z/,($MASK)); -$code.=<<___; ################################################################ # at this point we have 14243444 in $R0-$S4 and 05060708 in # $D0-$D4, ... 
@@ -2213,36 +2309,21 @@ $code.=<<___; # we could just flow along, hence the goal for $R0-$S4 is # 1858286838784888 ... - mov \$0b0110011001100110,%eax - mov \$0b1100110011001100,%r8d - mov \$0b0101010101010101,%r9d + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: + mov \$0x7777,%eax kmovw %eax,%k1 - kmovw %r8d,%k2 - kmovw %r9d,%k3 - - vpbroadcastq %x#$D0,$M0 # 0808080808080808 - vpbroadcastq %x#$D1,$M1 - vpbroadcastq %x#$D2,$M2 - vpbroadcastq %x#$D3,$M3 - vpbroadcastq %x#$D4,$M4 - - vpexpandd $D0,${D0}{%k1} # 05060708 -> -05--06--07--08- - vpexpandd $D1,${D1}{%k1} - vpexpandd $D2,${D2}{%k1} - vpexpandd $D3,${D3}{%k1} - vpexpandd $D4,${D4}{%k1} - - vpexpandd $R0,${D0}{%k2} # -05--06--07--08- -> 145-246-347-448- - vpexpandd $R1,${D1}{%k2} - vpexpandd $R2,${D2}{%k2} - vpexpandd $R3,${D3}{%k2} - vpexpandd $R4,${D4}{%k2} - - vpblendmd $M0,$D0,${R0}{%k3} # 1858286838784888 - vpblendmd $M1,$D1,${R1}{%k3} - vpblendmd $M2,$D2,${R2}{%k3} - vpblendmd $M3,$D3,${R3}{%k3} - vpblendmd $M4,$D4,${R4}{%k3} + + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- + vpermd $R1,$M0,$R1 + vpermd $R2,$M0,$R2 + vpermd $R3,$M0,$R3 + vpermd $R4,$M0,$R4 + + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 + vpermd $D1,$M0,${R1}{%k1} + vpermd $D2,$M0,${R2}{%k1} + vpermd $D3,$M0,${R3}{%k1} + vpermd $D4,$M0,${R4}{%k1} vpslld \$2,$R1,$S1 # *5 vpslld \$2,$R2,$S2 @@ -2253,7 +2334,6 @@ $code.=<<___; vpaddd $R3,$S3,$S3 vpaddd $R4,$S4,$S4 - vpbroadcastq %x#$MASK,$MASK vpbroadcastq 32(%rcx),$PADBIT # .L129 vpsrlq \$52,$T0,$T2 # splat input @@ -2264,12 +2344,11 @@ $code.=<<___; vpsrlq \$40,$T4,$T4 # 4 vpandq $MASK,$T2,$T2 # 2 vpandq $MASK,$T0,$T0 # 0 - vpandq $MASK,$T1,$T1 # 1 - vpandq $MASK,$T3,$T3 # 3 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 #vporq $PADBIT,$T4,$T4 # padbit, yes, always vpaddq $H2,$T2,$H2 # accumulate input - mov \$0x0f,%eax sub \$192,$len jbe .Ltail_avx512 jmp .Loop_avx512 @@ -2307,7 +2386,9 @@ $code.=<<___; vpmuludq $H2,$R1,$D3 # d3 = h2*r1 vpaddq $H0,$T0,$H0 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpandq $MASK,$T1,$T1 # 1 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T3,$T3 # 3 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 vporq $PADBIT,$T4,$T4 # padbit, yes, always vpmuludq $H2,$R0,$D2 # d2 = h2*r0 @@ -2415,8 +2496,8 @@ $code.=<<___; vpaddq $D3,$H4,$H4 # h3 -> h4 vpandq $MASK,$T0,$T0 # 0 - vpandq $MASK,$T1,$T1 # 1 - vpandq $MASK,$T3,$T3 # 3 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 #vporq $PADBIT,$T4,$T4 # padbit, yes, always sub \$128,$len @@ -2448,14 +2529,16 @@ $code.=<<___; vpmuludq $H2,$R1,$D3 # d3 = h2*r1 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T1,$T1 # 1 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vpandq $MASK,$T3,$T3 # 3 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 vporq $PADBIT,$T4,$T4 # padbit, yes, always vpaddq $H1,$T1,$H1 # accumulate input vpaddq $H3,$T3,$H3 vpaddq $H4,$T4,$H4 - vmovdqu64 16*0($inp),%x#$T0 + vmovdqu 16*0($inp),%x#$T0 vpmuludq $H0,$R3,$M3 vpmuludq $H0,$R4,$M4 vpmuludq $H0,$R0,$M0 @@ -2465,7 +2548,7 @@ $code.=<<___; vpaddq $M0,$D0,$D0 # d0 += h0*r0 vpaddq $M1,$D1,$D1 # d1 += h0*r1 - vmovdqu64 16*1($inp),%x#$T1 + vmovdqu 16*1($inp),%x#$T1 vpmuludq $H1,$R2,$M3 vpmuludq $H1,$R3,$M4 vpmuludq $H1,$S4,$M0 @@ -2475,7 +2558,7 @@ $code.=<<___; vpaddq $M0,$D0,$D0 # d0 += h1*s4 vpaddq $M2,$D2,$D2 # d2 += h0*r2 - vinserti64x2 \$1,16*2($inp),$T0,$T0 + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 vpmuludq $H3,$R0,$M3 vpmuludq $H3,$R1,$M4 vpmuludq $H1,$R0,$M1 @@ -2485,7 +2568,7 @@ $code.=<<___; vpaddq $M1,$D1,$D1 # d1 += h1*r0 vpaddq $M2,$D2,$D2 # d2 += 
h1*r1 - vinserti64x2 \$1,16*3($inp),$T1,$T1 + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 vpmuludq $H4,$S4,$M3 vpmuludq $H4,$R0,$M4 vpmuludq $H3,$S2,$M0 @@ -2508,11 +2591,11 @@ $code.=<<___; # horizontal addition mov \$1,%eax - vpsrldq \$8,$H3,$D3 - vpsrldq \$8,$D4,$H4 - vpsrldq \$8,$H0,$D0 - vpsrldq \$8,$H1,$D1 - vpsrldq \$8,$H2,$D2 + vpermq \$0xb1,$H3,$D3 + vpermq \$0xb1,$D4,$H4 + vpermq \$0xb1,$H0,$D0 + vpermq \$0xb1,$H1,$D1 + vpermq \$0xb1,$H2,$D2 vpaddq $D3,$H3,$H3 vpaddq $D4,$H4,$H4 vpaddq $D0,$H0,$H0 @@ -2549,23 +2632,23 @@ $code.=<<___; # lazy reduction (interleaved with input splat) vpsrlq \$26,$H3,$D3 - vpandq $MASK,$H3,$H3 + vpand $MASK,$H3,$H3 vpsrldq \$6,$T0,$T2 # splat input vpsrldq \$6,$T1,$T3 vpunpckhqdq $T1,$T0,$T4 # 4 vpaddq $D3,$H4,$H4 # h3 -> h4 vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 + vpand $MASK,$H0,$H0 vpunpcklqdq $T3,$T2,$T2 # 2:3 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H4,$D4 - vpandq $MASK,$H4,$H4 + vpand $MASK,$H4,$H4 vpsrlq \$26,$H1,$D1 - vpandq $MASK,$H1,$H1 + vpand $MASK,$H1,$H1 vpsrlq \$30,$T2,$T3 vpsrlq \$4,$T2,$T2 vpaddq $D1,$H2,$H2 # h1 -> h2 @@ -2577,21 +2660,21 @@ $code.=<<___; vpaddq $D4,$H0,$H0 # h4 -> h0 vpsrlq \$26,$H2,$D2 - vpandq $MASK,$H2,$H2 - vpandq $MASK,$T2,$T2 # 2 - vpandq $MASK,$T0,$T0 # 0 + vpand $MASK,$H2,$H2 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 vpaddq $D2,$H3,$H3 # h2 -> h3 vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 + vpand $MASK,$H0,$H0 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 - vpandq $MASK,$T1,$T1 # 1 + vpand $MASK,$T1,$T1 # 1 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H3,$D3 - vpandq $MASK,$H3,$H3 - vpandq $MASK,$T3,$T3 # 3 - vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpand $MASK,$H3,$H3 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always vpaddq $D3,$H4,$H4 # h3 -> h4 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 @@ -2622,9 +2705,11 @@ $code.=<<___ if ($win64); ___ $code.=<<___ if (!$win64); lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 ___ $code.=<<___; ret +.cfi_endproc .size poly1305_blocks_avx512,.-poly1305_blocks_avx512 ___ if ($avx>3) { @@ -2641,6 +2726,17 @@ if ($avx>3) { # path longer. In other words, even though base 2^44 reduction might # look less elegant, overall critical path is actually shorter... +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int64 h[3]; # current hash value base 2^44 +# unsigned __int64 s[2]; # key value*20 base 2^44 +# unsigned __int64 r[3]; # key value base 2^44 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; +# # r^n positions reflect +# # placement in register, not +# # memory, R[3] is R[1]*20 + $code.=<<___; .type poly1305_init_base2_44,\@function,3 .align 32 @@ -2673,6 +2769,7 @@ poly1305_init_base2_44: shl \$2,%rcx # magic <<2 mov %rax,24($ctx) # s1 mov %rcx,32($ctx) # s2 + movq \$-1,64($ctx) # write impossible value ___ $code.=<<___ if ($flavour !~ /elf32/); mov %r10,0(%rdx) @@ -2699,11 +2796,29 @@ poly1305_blocks_vpmadd52: shr \$4,$len jz .Lno_data_vpmadd52 # too short + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + # if powers of the key are not calculated yet, process up to 3 + # blocks with this single-block subroutine, otherwise ensure that + # length is divisible by 2 blocks and pass the rest down to next + # subroutine... + + mov \$3,%rax + mov \$1,%r10 + cmp \$4,$len # is input long + cmovae %r10,%rax + test %r8,%r8 # is power value impossible? 
+ cmovns %r10,%rax + + and $len,%rax # is input of favourable length? + jz .Lblocks_vpmadd52_4x + + sub %rax,$len mov \$7,%r10d mov \$1,%r11d kmovw %r10d,%k7 lea .L2_44_inp_permd(%rip),%r10 - shl \$40,$padbit kmovw %r11d,%k1 vmovq $padbit,%x#$PAD @@ -2774,16 +2889,833 @@ poly1305_blocks_vpmadd52: vpaddq $T0,$Dlo,$Dlo - dec $len # len-=16 + dec %rax # len-=16 jnz .Loop_vpmadd52 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value + test $len,$len + jnz .Lblocks_vpmadd52_4x + .Lno_data_vpmadd52: ret .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 ___ } +{ +######################################################################## +# As implied by its name 4x subroutine processes 4 blocks in parallel +# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power +# and is handled in 256-bit %ymm registers. + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_4x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_4x: + shr \$4,$len + jz .Lno_data_vpmadd52_4x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + +.Lblocks_vpmadd52_4x: + vpbroadcastq $padbit,$PAD + + vmovdqa64 .Lx_mask44(%rip),$mask44 + mov \$5,%eax + vmovdqa64 .Lx_mask42(%rip),$mask42 + kmovw %eax,%k1 # used in 2x path + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Lblocks_vpmadd52_2x_do + +.Lblocks_vpmadd52_4x_do: + vpbroadcastq 64($ctx),$R0 # load 4th power of the key + vpbroadcastq 96($ctx),$R1 + vpbroadcastq 128($ctx),$R2 + vpbroadcastq 160($ctx),$S1 + +.Lblocks_vpmadd52_4x_key_loaded: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + test \$7,$len # is len 8*n? 
+ jz .Lblocks_vpmadd52_8x + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 3-1-2-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$4,$len + jz .Ltail_vpmadd52_4x + jmp .Loop_vpmadd52_4x + ud2 + +.align 32 +.Linit_vpmadd52: + vmovq 24($ctx),%x#$S1 # load key + vmovq 56($ctx),%x#$H2 + vmovq 32($ctx),%x#$S2 + vmovq 40($ctx),%x#$R0 + vmovq 48($ctx),%x#$R1 + + vmovdqa $R0,$H0 + vmovdqa $R1,$H1 + vmovdqa $H2,$R2 + + mov \$2,%eax + +.Lmul_init_vpmadd52: + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + dec %eax + jz .Ldone_init_vpmadd52 + + vpunpcklqdq $R1,$H1,$R1 # 1,2 + vpbroadcastq %x#$H1,%x#$H1 # 2,2 + vpunpcklqdq $R2,$H2,$R2 + vpbroadcastq %x#$H2,%x#$H2 + vpunpcklqdq $R0,$H0,$R0 + vpbroadcastq %x#$H0,%x#$H0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R1,$S1,$S1 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S1,$S1 + vpsllq \$2,$S2,$S2 + + jmp .Lmul_init_vpmadd52 + ud2 + +.align 32 +.Ldone_init_vpmadd52: + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 + vinserti128 \$1,%x#$R2,$H2,$R2 + vinserti128 \$1,%x#$R0,$H0,$R0 + + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 + vpermq \$0b11011000,$R2,$R2 + vpermq \$0b11011000,$R0,$R0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpaddq $R1,$S1,$S1 + vpsllq \$2,$S1,$S1 + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? 
+ jnz .Ldone_init_vpmadd52_2x + + vmovdqu64 $R0,64($ctx) # save key powers + vpbroadcastq %x#$R0,$R0 # broadcast 4th power + vmovdqu64 $R1,96($ctx) + vpbroadcastq %x#$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpbroadcastq %x#$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpbroadcastq %x#$S1,$S1 + + jmp .Lblocks_vpmadd52_4x_key_loaded + ud2 + +.align 32 +.Ldone_init_vpmadd52_2x: + vmovdqu64 $R0,64($ctx) # save key powers + vpsrldq \$8,$R0,$R0 # 0-1-0-2 + vmovdqu64 $R1,96($ctx) + vpsrldq \$8,$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpsrldq \$8,$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpsrldq \$8,$S1,$S1 + jmp .Lblocks_vpmadd52_2x_key_loaded + ud2 + +.align 32 +.Lblocks_vpmadd52_2x_do: + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers + vmovdqu64 160+8($ctx),${S1}{%k1}{z} + vmovdqu64 64+8($ctx),${R0}{%k1}{z} + vmovdqu64 96+8($ctx),${R1}{%k1}{z} + +.Lblocks_vpmadd52_2x_key_loaded: + vmovdqu64 16*0($inp),$T2 # load data + vpxorq $T3,$T3,$T3 + lea 16*2($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as x-1-x-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + jmp .Ltail_vpmadd52_2x + ud2 + +.align 32 +.Loop_vpmadd52_4x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$4,$len # len-=64 + jnz .Loop_vpmadd52_4x + +.Ltail_vpmadd52_4x: + vmovdqu64 128($ctx),$R2 # load all key powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + +.Ltail_vpmadd52_2x: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + 
vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + # at this point $len is + # either 4*n+2 or 0... + sub \$2,$len # len-=32 + ja .Lblocks_vpmadd52_4x_do + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_4x: + ret +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x +___ +} +{ +######################################################################## +# As implied by its name 8x subroutine processes 8 blocks in parallel... +# This is intermediate version, as it's used only in cases when input +# length is either 8*n, 8*n+1 or 8*n+2... + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_8x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_8x: + shr \$4,$len + jz .Lno_data_vpmadd52_8x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + vmovdqa64 .Lx_mask44(%rip),$mask44 + vmovdqa64 .Lx_mask42(%rip),$mask42 + + test %r8,%r8 # is power value impossible? 
+ js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + +.Lblocks_vpmadd52_8x: + ################################################################ + # fist we calculate more key powers + + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power + vpbroadcastq %x#$R0,$RR0 + vpbroadcastq %x#$R1,$RR1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $RR2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $RR2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $RR2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $RR2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $RR2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $RR2,$R0,$D2hi + + vpmadd52luq $RR0,$R0,$D0lo + vpmadd52huq $RR0,$R0,$D0hi + vpmadd52luq $RR0,$R1,$D1lo + vpmadd52huq $RR0,$R1,$D1hi + vpmadd52luq $RR0,$R2,$D2lo + vpmadd52huq $RR0,$R2,$D2hi + + vpmadd52luq $RR1,$S2,$D0lo + vpmadd52huq $RR1,$S2,$D0hi + vpmadd52luq $RR1,$R0,$D1lo + vpmadd52huq $RR1,$R0,$D1hi + vpmadd52luq $RR1,$R1,$D2lo + vpmadd52huq $RR1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$RR0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$RR1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$RR2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + + vpsrlq \$44,$RR0,$tmp # additional step + vpandq $mask44,$RR0,$RR0 + + vpaddq $tmp,$RR1,$RR1 + + ################################################################ + # At this point Rx holds 1324 powers, RRx - 5768, and the goal + # is 15263748, which reflects how data is loaded... 
+ + vpunpcklqdq $R2,$RR2,$T2 # 3748 + vpunpckhqdq $R2,$RR2,$R2 # 1526 + vpunpcklqdq $R0,$RR0,$T0 + vpunpckhqdq $R0,$RR0,$R0 + vpunpcklqdq $R1,$RR1,$T1 + vpunpckhqdq $R1,$RR1,$R1 +___ +######## switch to %zmm +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); + +$code.=<<___; + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 + vshufi64x2 \$0x44,$R0,$T0,$RR0 + vshufi64x2 \$0x44,$R1,$T1,$RR1 + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 + vpaddq $RR2,$SS2,$SS2 + vpaddq $RR1,$SS1,$SS1 + vpsllq \$2,$SS2,$SS2 + vpsllq \$2,$SS1,$SS1 + + vpbroadcastq $padbit,$PAD + vpbroadcastq %x#$mask44,$mask44 + vpbroadcastq %x#$mask42,$mask42 + + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power + vpbroadcastq %x#$SS2,$S2 + vpbroadcastq %x#$RR0,$R0 + vpbroadcastq %x#$RR1,$R1 + vpbroadcastq %x#$RR2,$R2 + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 73625140 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$8,$len + jz .Ltail_vpmadd52_8x + jmp .Loop_vpmadd52_8x + +.align 32 +.Loop_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$8,$len # len-=128 + jnz .Loop_vpmadd52_8x + +.Ltail_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$SS1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$SS1,$D0hi + vpxorq 
$D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$SS2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$SS2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$RR0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$RR0,$D2hi + + vpmadd52luq $H0,$RR0,$D0lo + vpmadd52huq $H0,$RR0,$D0hi + vpmadd52luq $H0,$RR1,$D1lo + vpmadd52huq $H0,$RR1,$D1hi + vpmadd52luq $H0,$RR2,$D2lo + vpmadd52huq $H0,$RR2,$D2hi + + vpmadd52luq $H1,$SS2,$D0lo + vpmadd52huq $H1,$SS2,$D0hi + vpmadd52luq $H1,$RR0,$D1lo + vpmadd52huq $H1,$RR0,$D1hi + vpmadd52luq $H1,$RR1,$D2lo + vpmadd52huq $H1,$RR1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vextracti64x4 \$1,$D0lo,%y#$T0 + vextracti64x4 \$1,$D0hi,%y#$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vextracti64x4 \$1,$D1lo,%y#$T1 + vextracti64x4 \$1,$D1hi,%y#$H1 + vextracti64x4 \$1,$D2lo,%y#$T2 + vextracti64x4 \$1,$D2hi,%y#$H2 +___ +######## switch back to %ymm +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); + +$code.=<<___; + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + ################################################################ + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_8x: + ret +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x +___ +} $code.=<<___; .type poly1305_emit_base2_44,\@function,3 .align 32 @@ -2808,7 +3740,7 @@ poly1305_emit_base2_44: mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overfow? + shr \$2,%r10 # did 130-bit value overflow? 
cmovnz %r8,%rax cmovnz %r9,%rcx @@ -2832,6 +3764,8 @@ $code.=<<___; .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 .Lpermd_avx2: .long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 .L2_44_inp_permd: .long 0,1,1,2,2,3,7,7 @@ -2843,6 +3777,14 @@ $code.=<<___; .quad 44,44,42,64 .L2_44_shift_lft: .quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff ___ }
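
########################################################################
# Illustrative sketch, separate from the patch above: a scalar model of
# the "did 130-bit value overflow?" check whose comment is corrected in
# poly1305_emit (and its _avx/_base2_44 twins).  At emit time the
# accumulator needs at most one subtraction of p = 2^130-5, so the code
# adds 5 and tests bit 130 (the shr \$2,%r10 / cmovnz pair) to select
# either h or h-p without branching.  Sub and variable names below are
# made up for the example; Math::BigInt stands in for the 64-bit limbs.

use strict;
use warnings;
use Math::BigInt;

my $two128 = Math::BigInt->new(1) << 128;
my $two130 = Math::BigInt->new(1) << 130;
my $p      = $two130 - 5;                 # 2^130 - 5

sub poly1305_tag_model {
    my ($h, $s) = @_;                     # $h: accumulator (< 2*p), $s: 128-bit key half
    my $g = $h + 5;                       # h + 5
    $h = $g - $two130 if $g >> 130;       # bit 130 set <=> h >= p, so take h - p (the cmovnz)
    return ($h + $s) % $two128;           # tag = ((h mod p) + s) mod 2^128
}

# quick self-check against direct big-number arithmetic
my $h = $p + 3;                           # a value just above p
my $s = Math::BigInt->from_hex("0x0102030405060708090a0b0c0d0e0f10");
die "mismatch" unless poly1305_tag_model($h, $s) == (($h % $p) + $s) % $two128;
print "final-reduction model OK\n";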
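
########################################################################
# Illustrative sketch, separate from the patch above: a scalar model of
# the base 2^26 multiplication used by the AVX/AVX2/AVX-512 paths, and
# of why the precomputed "S" table holds 5*R (the vpslld \$2 + vpaddd
# pairs above).  Limb products that land at or past bit 130 fold back
# multiplied by 5 because 2^130 = 5 (mod 2^130-5); the vpmadd52 path
# stores R*5*4 = R*20 instead, since its 44/44/42 limb grid overshoots
# bit 130 by two bits.  Names are made up; Math::BigInt keeps the model
# exact, whereas the real code keeps every limb in a 26-bit vector lane.

use strict;
use warnings;
use Math::BigInt;

my $p = (Math::BigInt->new(1) << 130) - 5;

sub to26   { my ($x) = @_; return map { ($x >> (26*$_)) & 0x3ffffff } 0..4; }
sub from26 { my @l = @_; my $x = Math::BigInt->new(0);
             $x = $x + ($l[$_] << (26*$_)) for 0..4; return $x; }

sub mul26 {                                # (h * r) mod 2^130-5, limbs in base 2^26
    my ($h, $r) = @_;
    my @h = @$h;  my @r = @$r;
    my @s = map { 5 * $r[$_] } 0..4;       # s[i] = 5*r[i]; only s[1..4] are used
    my @d;
    $d[0] = $h[0]*$r[0] + $h[1]*$s[4] + $h[2]*$s[3] + $h[3]*$s[2] + $h[4]*$s[1];
    $d[1] = $h[0]*$r[1] + $h[1]*$r[0] + $h[2]*$s[4] + $h[3]*$s[3] + $h[4]*$s[2];
    $d[2] = $h[0]*$r[2] + $h[1]*$r[1] + $h[2]*$r[0] + $h[3]*$s[4] + $h[4]*$s[3];
    $d[3] = $h[0]*$r[3] + $h[1]*$r[2] + $h[2]*$r[1] + $h[3]*$r[0] + $h[4]*$s[4];
    $d[4] = $h[0]*$r[4] + $h[1]*$r[3] + $h[2]*$r[2] + $h[3]*$r[1] + $h[4]*$r[0];
    my $c = Math::BigInt->new(0);          # carry chain; the spill past bit 130 wraps
    for my $i (0..4) {                     # back into d[0] multiplied by 5 again
        $d[$i] = $d[$i] + $c;
        $c     = $d[$i] >> 26;
        $d[$i] = $d[$i] & 0x3ffffff;
    }
    $d[0] = $d[0] + 5*$c;                  # (the real code carries once more, lazily)
    return \@d;
}

# self-check against direct big-number arithmetic
my $a = $p - 1;
my $b = Math::BigInt->from_hex("0x0dead0beef0123456789abcdef0fedcba");
my $got = from26(@{ mul26([to26($a)], [to26($b)]) }) % $p;
die "mismatch" unless $got == ($a * $b) % $p;
print "base 2^26 multiply model OK\n";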
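
########################################################################
# Illustrative sketch, separate from the patch above: how the new
# base 2^44 (vpmadd52) path carves a 16-byte block into 44-, 44- and
# 42-bit limbs.  The block is read as two little-endian 64-bit words
# (lo, hi); the pad bit is pre-shifted by 40 (shl \$40,$padbit) so that
# it lands on bit 128 of the third limb, whose base is bit 88.  The
# masks are the .Lx_mask44/.Lx_mask42 constants added at the end of the
# patch.  Names are made up; assumes a perl built with 64-bit integers.

use strict;
use warnings;
use Math::BigInt;

sub splat44 {
    my ($lo, $hi, $padbit) = @_;
    my $mask44 = 0xfffffffffff;                                    # .Lx_mask44
    my $t0 =   $lo & $mask44;                                      # bits   0.. 43
    my $t1 = (($lo >> 44) | (($hi & 0xffffff) << 20)) & $mask44;   # bits  44.. 87
    my $t2 =  ($hi >> 24) | ($padbit << 40);                       # bits  88..129, <= 42 bits
    return ($t0, $t1, $t2);
}

# round-trip check on an arbitrary block value
my ($lo, $hi) = (0x0123456789abcdef, 0xfedcba9876543210);
my ($t0, $t1, $t2) = splat44($lo, $hi, 1);
my $v = Math::BigInt->from_hex("0x1fedcba98765432100123456789abcdef");  # 2^128 + hi*2^64 + lo
my $w = (Math::BigInt->new($t2) << 88) + (Math::BigInt->new($t1) << 44) + $t0;
die "mismatch" unless $v == $w;
print "44/44/42 splat OK\n";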
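
########################################################################
# Illustrative sketch, separate from the patch above: the idiom behind
# the map(s/%y/%z/,...) lines the patch adds.  The generator keeps
# register names in Perl scalars, so switching a group of registers
# from the %ymm to the %zmm domain (and back, in the 8x tail) is a
# plain in-place substitution; every $code.=<<___ block emitted after
# the switch then refers to the 512-bit registers.  Standalone toy
# example, not taken from the file:

use strict;

my ($H0, $H1, $H2) = map("%ymm$_", (0..2));
my $code = '';

$code .= "\tvpaddq\t$H0,$H1,$H2\n";        # emitted while in the %ymm domain

map(s/%y/%z/, ($H0, $H1, $H2));            # switch those variables to %zmm

$code .= "\tvpaddq\t$H0,$H1,$H2\n";        # same template, now 512-bit

print $code;                               # vpaddq %ymm0,%ymm1,%ymm2
                                           # vpaddq %zmm0,%zmm1,%zmm2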