poly1305/asm/poly1305-x86_64.pl: add poly1305_blocks_vpmadd52_4x.
authorAndy Polyakov <appro@openssl.org>
Sun, 12 Mar 2017 14:37:16 +0000 (15:37 +0100)
committerAndy Polyakov <appro@openssl.org>
Mon, 13 Mar 2017 17:48:34 +0000 (18:48 +0100)
As hinted by its name, the new subroutine processes 4 input blocks in
parallel. It still operates on 256-bit registers and is just
another step toward a full-blown AVX512IFMA procedure.

Reviewed-by: Rich Salz <rsalz@openssl.org>
crypto/poly1305/asm/poly1305-x86_64.pl

index a3970198b714b11d4b6b3d4a5aab00c9c56a7dda..71a9efb994a5994fb631c7b684f08c4d06b6302e 100755 (executable)
@@ -2716,6 +2716,17 @@ if ($avx>3) {
 # path longer. In other words, even though base 2^44 reduction might
 # look less elegant, overall critical path is actually shorter...
 
+########################################################################
+# Layout of opaque area is as follows.
+#
+#      unsigned __int64 h[3];          # current hash value base 2^44
+#      unsigned __int64 s[2];          # key value*20 base 2^44
+#      unsigned __int64 r[3];          # key value base 2^44
+#      struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
+#                                      # r^n positions reflect
+#                                      # placement in register, not
+#                                      # memory, R[3] is R[1]*20
+
 $code.=<<___;
 .type  poly1305_init_base2_44,\@function,3
 .align 32
@@ -2748,6 +2759,7 @@ poly1305_init_base2_44:
        shl     \$2,%rcx                # magic <<2
        mov     %rax,24($ctx)           # s1
        mov     %rcx,32($ctx)           # s2
+       movq    \$-1,64($ctx)           # write impossible value
 ___
 $code.=<<___   if ($flavour !~ /elf32/);
        mov     %r10,0(%rdx)
@@ -2774,11 +2786,29 @@ poly1305_blocks_vpmadd52:
        shr     \$4,$len
        jz      .Lno_data_vpmadd52              # too short
 
+       shl     \$40,$padbit
+       mov     64($ctx),%r8                    # peek on power of the key
+
+       # if powers of the key are not calculated yet, process up to 3
+       # blocks with this single-block subroutine, otherwise ensure that
+       # length is divisible by 2 blocks and pass the rest down to the
+       # next subroutine...
+
+       mov     \$3,%rax
+       mov     \$1,%r10
+       cmp     \$4,$len                        # is input long
+       cmovae  %r10,%rax
+       test    %r8,%r8                         # is power value impossible?
+       cmovns  %r10,%rax
+
+       and     $len,%rax                       # is input of favourable length?
+       jz      .Lblocks_vpmadd52_4x
+
+       sub             %rax,$len
        mov             \$7,%r10d
        mov             \$1,%r11d
        kmovw           %r10d,%k7
        lea             .L2_44_inp_permd(%rip),%r10
-       shl             \$40,$padbit
        kmovw           %r11d,%k1
 
        vmovq           $padbit,%x#$PAD
@@ -2849,16 +2879,451 @@ poly1305_blocks_vpmadd52:
 
        vpaddq          $T0,$Dlo,$Dlo
 
-       dec             $len                    # len-=16
+       dec             %rax                    # len-=16
        jnz             .Loop_vpmadd52
 
        vmovdqu64       $Dlo,0($ctx){%k7}       # store hash value
 
+       test            $len,$len
+       jnz             .Lblocks_vpmadd52_4x
+
 .Lno_data_vpmadd52:
        ret
 .size  poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
 ___
 }
+{
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+
+$code.=<<___;
+.type  poly1305_blocks_vpmadd52_4x,\@function,4
+.align 32
+poly1305_blocks_vpmadd52_4x:
+       shr     \$4,$len
+       jz      .Lno_data_vpmadd52_4x           # too short
+
+       shl     \$40,$padbit
+       mov     64($ctx),%r8                    # peek on power of the key
+
+.Lblocks_vpmadd52_4x:
+       vpbroadcastq    $padbit,$PAD
+
+       vmovdqa64       .Lx_mask44(%rip),$mask44
+       mov             \$5,%eax
+       vmovdqa64       .Lx_mask42(%rip),$mask42
+       kmovw           %eax,%k1                # used in 2x path
+
+       test            %r8,%r8                 # is power value impossible?
+       js              .Linit_vpmadd52         # if it is, then init R[4]
+
+       vmovq           0($ctx),%x#$H0          # load current hash value
+       vmovq           8($ctx),%x#$H1
+       vmovq           16($ctx),%x#$H2
+
+       test            \$3,$len                # is length 4*n+2?
+       jnz             .Lblocks_vpmadd52_2x_do
+
+.Lblocks_vpmadd52_4x_do:
+       vpbroadcastq    64($ctx),$R0            # load 4th power of the key
+       vpbroadcastq    96($ctx),$R1
+       vpbroadcastq    128($ctx),$R2
+       vpbroadcastq    160($ctx),$S1
+
+.Lblocks_vpmadd52_4x_key_loaded:
+       vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
+       vpaddq          $R2,$S2,$S2
+       vpsllq          \$2,$S2,$S2
+
+       vmovdqu64       16*0($inp),$T2          # load data
+       vmovdqu64       16*2($inp),$T3
+       lea             16*4($inp),$inp
+
+       vpunpcklqdq     $T3,$T2,$T1             # transpose data
+       vpunpckhqdq     $T3,$T2,$T3
+
+       # at this point 64-bit lanes are ordered as 3-1-2-0
+
+       vpsrlq          \$24,$T3,$T2            # splat the data
+       vporq           $PAD,$T2,$T2
+        vpaddq         $T2,$H2,$H2             # accumulate input
+       vpandq          $mask44,$T1,$T0
+       vpsrlq          \$44,$T1,$T1
+       vpsllq          \$20,$T3,$T3
+       vporq           $T3,$T1,$T1
+       vpandq          $mask44,$T1,$T1
+
+       sub             \$4,$len
+       jz              .Ltail_vpmadd52_4x
+       jmp             .Loop_vpmadd52_4x
+       ud2
+
+.align 32
+.Linit_vpmadd52:
+       vmovq           24($ctx),%x#$S1         # load key
+       vmovq           56($ctx),%x#$H2
+       vmovq           32($ctx),%x#$S2
+       vmovq           40($ctx),%x#$R0
+       vmovq           48($ctx),%x#$R1
+
+       vmovdqa         $R0,$H0
+       vmovdqa         $R1,$H1
+       vmovdqa         $H2,$R2
+
+       mov             \$2,%eax
+
+.Lmul_init_vpmadd52:
+       vpxorq          $D0lo,$D0lo,$D0lo
+       vpmadd52luq     $H2,$S1,$D0lo
+       vpxorq          $D0hi,$D0hi,$D0hi
+       vpmadd52huq     $H2,$S1,$D0hi
+       vpxorq          $D1lo,$D1lo,$D1lo
+       vpmadd52luq     $H2,$S2,$D1lo
+       vpxorq          $D1hi,$D1hi,$D1hi
+       vpmadd52huq     $H2,$S2,$D1hi
+       vpxorq          $D2lo,$D2lo,$D2lo
+       vpmadd52luq     $H2,$R0,$D2lo
+       vpxorq          $D2hi,$D2hi,$D2hi
+       vpmadd52huq     $H2,$R0,$D2hi
+
+       vpmadd52luq     $H0,$R0,$D0lo
+       vpmadd52huq     $H0,$R0,$D0hi
+       vpmadd52luq     $H0,$R1,$D1lo
+       vpmadd52huq     $H0,$R1,$D1hi
+       vpmadd52luq     $H0,$R2,$D2lo
+       vpmadd52huq     $H0,$R2,$D2hi
+
+       vpmadd52luq     $H1,$S2,$D0lo
+       vpmadd52huq     $H1,$S2,$D0hi
+       vpmadd52luq     $H1,$R0,$D1lo
+       vpmadd52huq     $H1,$R0,$D1hi
+       vpmadd52luq     $H1,$R1,$D2lo
+       vpmadd52huq     $H1,$R1,$D2hi
+
+       ################################################################
+       # partial reduction
+       vpsrlq          \$44,$D0lo,$tmp
+       vpsllq          \$8,$D0hi,$D0hi
+       vpandq          $mask44,$D0lo,$H0
+       vpaddq          $tmp,$D0hi,$D0hi
+
+       vpaddq          $D0hi,$D1lo,$D1lo
+
+       vpsrlq          \$44,$D1lo,$tmp
+       vpsllq          \$8,$D1hi,$D1hi
+       vpandq          $mask44,$D1lo,$H1
+       vpaddq          $tmp,$D1hi,$D1hi
+
+       vpaddq          $D1hi,$D2lo,$D2lo
+
+       vpsrlq          \$42,$D2lo,$tmp
+       vpsllq          \$10,$D2hi,$D2hi
+       vpandq          $mask42,$D2lo,$H2
+       vpaddq          $tmp,$D2hi,$D2hi
+
+       vpaddq          $D2hi,$H0,$H0
+       vpsllq          \$2,$D2hi,$D2hi
+
+       vpaddq          $D2hi,$H0,$H0
+
+       vpsrlq          \$44,$H0,$tmp           # additional step
+       vpandq          $mask44,$H0,$H0
+
+       vpaddq          $tmp,$H1,$H1
+
+       dec             %eax
+       jz              .Ldone_init_vpmadd52
+
+       vpunpcklqdq     $R1,$H1,$R1             # 1,2
+       vpbroadcastq    %x#$H1,%x#$H1           # 2,2
+       vpunpcklqdq     $R2,$H2,$R2
+       vpbroadcastq    %x#$H2,%x#$H2
+       vpunpcklqdq     $R0,$H0,$R0
+       vpbroadcastq    %x#$H0,%x#$H0
+
+       vpsllq          \$2,$R1,$S1             # S1 = R1*5*4
+       vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
+       vpaddq          $R1,$S1,$S1
+       vpaddq          $R2,$S2,$S2
+       vpsllq          \$2,$S1,$S1
+       vpsllq          \$2,$S2,$S2
+
+       jmp             .Lmul_init_vpmadd52
+       ud2
+
+.align 32
+.Ldone_init_vpmadd52:
+       vinserti128     \$1,%x#$R1,$H1,$R1      # 1,2,3,4
+       vinserti128     \$1,%x#$R2,$H2,$R2
+       vinserti128     \$1,%x#$R0,$H0,$R0
+
+       vpermq          \$0b11011000,$R1,$R1    # 1,3,2,4
+       vpermq          \$0b11011000,$R2,$R2
+       vpermq          \$0b11011000,$R0,$R0
+
+       vpsllq          \$2,$R1,$S1             # S1 = R1*5*4
+       vpaddq          $R1,$S1,$S1
+       vpsllq          \$2,$S1,$S1
+
+       vmovq           0($ctx),%x#$H0          # load current hash value
+       vmovq           8($ctx),%x#$H1
+       vmovq           16($ctx),%x#$H2
+
+       test            \$3,$len                # is length 4*n+2?
+       jnz             .Ldone_init_vpmadd52_2x
+
+       vmovdqu64       $R0,64($ctx)            # save key powers
+       vpbroadcastq    %x#$R0,$R0              # broadcast 4th power
+       vmovdqu64       $R1,96($ctx)
+       vpbroadcastq    %x#$R1,$R1
+       vmovdqu64       $R2,128($ctx)
+       vpbroadcastq    %x#$R2,$R2
+       vmovdqu64       $S1,160($ctx)
+       vpbroadcastq    %x#$S1,$S1
+
+       jmp             .Lblocks_vpmadd52_4x_key_loaded
+       ud2
+
+.align 32
+.Ldone_init_vpmadd52_2x:
+       vmovdqu64       $R0,64($ctx)            # save key powers
+       vpsrldq         \$8,$R0,$R0             # 0-1-0-2
+       vmovdqu64       $R1,96($ctx)
+       vpsrldq         \$8,$R1,$R1
+       vmovdqu64       $R2,128($ctx)
+       vpsrldq         \$8,$R2,$R2
+       vmovdqu64       $S1,160($ctx)
+       vpsrldq         \$8,$S1,$S1
+       jmp             .Lblocks_vpmadd52_2x_key_loaded
+       ud2
+
+.align 32
+.Lblocks_vpmadd52_2x_do:
+       vmovdqu64       128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
+       vmovdqu64       160+8($ctx),${S1}{%k1}{z}
+       vmovdqu64       64+8($ctx),${R0}{%k1}{z}
+       vmovdqu64       96+8($ctx),${R1}{%k1}{z}
+
+.Lblocks_vpmadd52_2x_key_loaded:
+       vmovdqu64       16*0($inp),$T2          # load data
+       vpxorq          $T3,$T3,$T3
+       lea             16*2($inp),$inp
+
+       vpunpcklqdq     $T3,$T2,$T1             # transpose data
+       vpunpckhqdq     $T3,$T2,$T3
+
+       # at this point 64-bit lanes are ordered as x-1-x-0
+
+       vpsrlq          \$24,$T3,$T2            # splat the data
+       vporq           $PAD,$T2,$T2
+        vpaddq         $T2,$H2,$H2             # accumulate input
+       vpandq          $mask44,$T1,$T0
+       vpsrlq          \$44,$T1,$T1
+       vpsllq          \$20,$T3,$T3
+       vporq           $T3,$T1,$T1
+       vpandq          $mask44,$T1,$T1
+
+       jmp             .Ltail_vpmadd52_2x
+       ud2
+
+.align 32
+.Loop_vpmadd52_4x:
+       #vpaddq         $T2,$H2,$H2             # accumulate input
+       vpaddq          $T0,$H0,$H0
+       vpaddq          $T1,$H1,$H1
+
+       vpxorq          $D0lo,$D0lo,$D0lo
+       vpmadd52luq     $H2,$S1,$D0lo
+       vpxorq          $D0hi,$D0hi,$D0hi
+       vpmadd52huq     $H2,$S1,$D0hi
+       vpxorq          $D1lo,$D1lo,$D1lo
+       vpmadd52luq     $H2,$S2,$D1lo
+       vpxorq          $D1hi,$D1hi,$D1hi
+       vpmadd52huq     $H2,$S2,$D1hi
+       vpxorq          $D2lo,$D2lo,$D2lo
+       vpmadd52luq     $H2,$R0,$D2lo
+       vpxorq          $D2hi,$D2hi,$D2hi
+       vpmadd52huq     $H2,$R0,$D2hi
+
+        vmovdqu64      16*0($inp),$T2          # load data
+        vmovdqu64      16*2($inp),$T3
+        lea            16*4($inp),$inp
+       vpmadd52luq     $H0,$R0,$D0lo
+       vpmadd52huq     $H0,$R0,$D0hi
+       vpmadd52luq     $H0,$R1,$D1lo
+       vpmadd52huq     $H0,$R1,$D1hi
+       vpmadd52luq     $H0,$R2,$D2lo
+       vpmadd52huq     $H0,$R2,$D2hi
+
+        vpunpcklqdq    $T3,$T2,$T1             # transpose data
+        vpunpckhqdq    $T3,$T2,$T3
+       vpmadd52luq     $H1,$S2,$D0lo
+       vpmadd52huq     $H1,$S2,$D0hi
+       vpmadd52luq     $H1,$R0,$D1lo
+       vpmadd52huq     $H1,$R0,$D1hi
+       vpmadd52luq     $H1,$R1,$D2lo
+       vpmadd52huq     $H1,$R1,$D2hi
+
+       ################################################################
+       # partial reduction (interleaved with data splat)
+       vpsrlq          \$44,$D0lo,$tmp
+       vpsllq          \$8,$D0hi,$D0hi
+       vpandq          $mask44,$D0lo,$H0
+       vpaddq          $tmp,$D0hi,$D0hi
+
+        vpsrlq         \$24,$T3,$T2
+        vporq          $PAD,$T2,$T2
+       vpaddq          $D0hi,$D1lo,$D1lo
+
+       vpsrlq          \$44,$D1lo,$tmp
+       vpsllq          \$8,$D1hi,$D1hi
+       vpandq          $mask44,$D1lo,$H1
+       vpaddq          $tmp,$D1hi,$D1hi
+
+        vpandq         $mask44,$T1,$T0
+        vpsrlq         \$44,$T1,$T1
+        vpsllq         \$20,$T3,$T3
+       vpaddq          $D1hi,$D2lo,$D2lo
+
+       vpsrlq          \$42,$D2lo,$tmp
+       vpsllq          \$10,$D2hi,$D2hi
+       vpandq          $mask42,$D2lo,$H2
+       vpaddq          $tmp,$D2hi,$D2hi
+
+         vpaddq        $T2,$H2,$H2             # accumulate input
+       vpaddq          $D2hi,$H0,$H0
+       vpsllq          \$2,$D2hi,$D2hi
+
+       vpaddq          $D2hi,$H0,$H0
+        vporq          $T3,$T1,$T1
+        vpandq         $mask44,$T1,$T1
+
+       vpsrlq          \$44,$H0,$tmp           # additional step
+       vpandq          $mask44,$H0,$H0
+
+       vpaddq          $tmp,$H1,$H1
+
+       sub             \$4,$len                # len-=64
+       jnz             .Loop_vpmadd52_4x
+
+.Ltail_vpmadd52_4x:
+       vmovdqu64       128($ctx),$R2           # load all key powers
+       vmovdqu64       160($ctx),$S1
+       vmovdqu64       64($ctx),$R0
+       vmovdqu64       96($ctx),$R1
+
+.Ltail_vpmadd52_2x:
+       vpsllq          \$2,$R2,$S2             # S2 = R2*5*4
+       vpaddq          $R2,$S2,$S2
+       vpsllq          \$2,$S2,$S2
+
+       #vpaddq         $T2,$H2,$H2             # accumulate input
+       vpaddq          $T0,$H0,$H0
+       vpaddq          $T1,$H1,$H1
+
+       vpxorq          $D0lo,$D0lo,$D0lo
+       vpmadd52luq     $H2,$S1,$D0lo
+       vpxorq          $D0hi,$D0hi,$D0hi
+       vpmadd52huq     $H2,$S1,$D0hi
+       vpxorq          $D1lo,$D1lo,$D1lo
+       vpmadd52luq     $H2,$S2,$D1lo
+       vpxorq          $D1hi,$D1hi,$D1hi
+       vpmadd52huq     $H2,$S2,$D1hi
+       vpxorq          $D2lo,$D2lo,$D2lo
+       vpmadd52luq     $H2,$R0,$D2lo
+       vpxorq          $D2hi,$D2hi,$D2hi
+       vpmadd52huq     $H2,$R0,$D2hi
+
+       vpmadd52luq     $H0,$R0,$D0lo
+       vpmadd52huq     $H0,$R0,$D0hi
+       vpmadd52luq     $H0,$R1,$D1lo
+       vpmadd52huq     $H0,$R1,$D1hi
+       vpmadd52luq     $H0,$R2,$D2lo
+       vpmadd52huq     $H0,$R2,$D2hi
+
+       vpmadd52luq     $H1,$S2,$D0lo
+       vpmadd52huq     $H1,$S2,$D0hi
+       vpmadd52luq     $H1,$R0,$D1lo
+       vpmadd52huq     $H1,$R0,$D1hi
+       vpmadd52luq     $H1,$R1,$D2lo
+       vpmadd52huq     $H1,$R1,$D2hi
+
+       ################################################################
+       # horizontal addition
+
+       mov             \$1,%eax
+       kmovw           %eax,%k1
+       vpsrldq         \$8,$D0lo,$T0
+       vpsrldq         \$8,$D0hi,$H0
+       vpsrldq         \$8,$D1lo,$T1
+       vpsrldq         \$8,$D1hi,$H1
+       vpaddq          $T0,$D0lo,$D0lo
+       vpaddq          $H0,$D0hi,$D0hi
+       vpsrldq         \$8,$D2lo,$T2
+       vpsrldq         \$8,$D2hi,$H2
+       vpaddq          $T1,$D1lo,$D1lo
+       vpaddq          $H1,$D1hi,$D1hi
+        vpermq         \$0x2,$D0lo,$T0
+        vpermq         \$0x2,$D0hi,$H0
+       vpaddq          $T2,$D2lo,$D2lo
+       vpaddq          $H2,$D2hi,$D2hi
+
+       vpermq          \$0x2,$D1lo,$T1
+       vpermq          \$0x2,$D1hi,$H1
+       vpaddq          $T0,$D0lo,${D0lo}{%k1}{z}
+       vpaddq          $H0,$D0hi,${D0hi}{%k1}{z}
+       vpermq          \$0x2,$D2lo,$T2
+       vpermq          \$0x2,$D2hi,$H2
+       vpaddq          $T1,$D1lo,${D1lo}{%k1}{z}
+       vpaddq          $H1,$D1hi,${D1hi}{%k1}{z}
+       vpaddq          $T2,$D2lo,${D2lo}{%k1}{z}
+       vpaddq          $H2,$D2hi,${D2hi}{%k1}{z}
+
+       ################################################################
+       # partial reduction
+       vpsrlq          \$44,$D0lo,$tmp
+       vpsllq          \$8,$D0hi,$D0hi
+       vpandq          $mask44,$D0lo,$H0
+       vpaddq          $tmp,$D0hi,$D0hi
+
+       vpaddq          $D0hi,$D1lo,$D1lo
+
+       vpsrlq          \$44,$D1lo,$tmp
+       vpsllq          \$8,$D1hi,$D1hi
+       vpandq          $mask44,$D1lo,$H1
+       vpaddq          $tmp,$D1hi,$D1hi
+
+       vpaddq          $D1hi,$D2lo,$D2lo
+
+       vpsrlq          \$42,$D2lo,$tmp
+       vpsllq          \$10,$D2hi,$D2hi
+       vpandq          $mask42,$D2lo,$H2
+       vpaddq          $tmp,$D2hi,$D2hi
+
+       vpaddq          $D2hi,$H0,$H0
+       vpsllq          \$2,$D2hi,$D2hi
+
+       vpaddq          $D2hi,$H0,$H0
+
+       vpsrlq          \$44,$H0,$tmp           # additional step
+       vpandq          $mask44,$H0,$H0
+
+       vpaddq          $tmp,$H1,$H1
+                                               # at this point $len is
+                                               # either 4*n+2 or 0...
+       sub             \$2,$len                # len-=32
+       ja              .Lblocks_vpmadd52_4x_do
+
+       vmovq           %x#$H0,0($ctx)
+       vmovq           %x#$H1,8($ctx)
+       vmovq           %x#$H2,16($ctx)
+
+.Lno_data_vpmadd52_4x:
+       ret
+.size  poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
+___
+}
 $code.=<<___;
 .type  poly1305_emit_base2_44,\@function,3
 .align 32
@@ -2920,6 +3385,13 @@ $code.=<<___;
 .quad  44,44,42,64
 .L2_44_shift_lft:
 .quad  8,8,10,64
+
+.Lx_mask44:
+.quad  0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad  0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad  0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad  0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 ___
 }