poly1305/asm/poly1305-x86_64.pl: switch to vpermd in table expansion.
author  Andy Polyakov <appro@openssl.org>
Sun, 25 Dec 2016 12:10:00 +0000 (13:10 +0100)
committer  Andy Polyakov <appro@openssl.org>
Sat, 25 Feb 2017 17:36:37 +0000 (18:36 +0100)
Effectively it's a minor size optimization, 5-6% per affected subroutine.

Reviewed-by: Rich Salz <rsalz@openssl.org>
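
For reference, the change replaces a vpermq $0x15 / vpshufd $0xc8 pair per table entry with a single vpermd driven by the new .Lpermd_avx2 index vector (2,2,2,3,2,0,2,1). The equivalence of the two sequences can be sanity-checked with a standalone AVX2 intrinsics sketch such as the one below; it is illustrative only and not part of the patch (compile with, e.g., gcc -mavx2 -O2):

    #include <immintrin.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* 16-byte table entry loaded into the low xmm half, upper half zero,
         * as done by the vmovdqu of `16*n-64`($ctx) into %x#$T2 etc. */
        __m256i src = _mm256_setr_epi32(10, 11, 12, 13, 0, 0, 0, 0);

        /* old sequence: vpermq $0x15, then vpshufd $0xc8 */
        __m256i old_seq = _mm256_permute4x64_epi64(src, 0x15);
        old_seq = _mm256_shuffle_epi32(old_seq, 0xc8);

        /* new sequence: single vpermd with the .Lpermd_avx2 indices */
        const __m256i idx = _mm256_setr_epi32(2, 2, 2, 3, 2, 0, 2, 1);
        __m256i new_seq = _mm256_permutevar8x32_epi32(src, idx);

        int a[8], b[8];
        _mm256_storeu_si256((__m256i *)a, old_seq);
        _mm256_storeu_si256((__m256i *)b, new_seq);

        for (int i = 0; i < 8; i++)
            printf("dword %d: old=%d new=%d\n", i, a[i], b[i]);
        printf("%s\n", memcmp(a, b, sizeof(a)) ? "MISMATCH" : "identical");
        return 0;
    }

With the sample values above both variants should produce the same dword layout (12,12,12,13,12,10,12,11), which is why the single index-vector load at the top of each subroutine lets every vpshufd in the expansion be dropped.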
crypto/poly1305/asm/poly1305-x86_64.pl

index f602368ea34213a412a811d79e3d3daba82717e4..c989b3421cbf2cc4865c919d1dee8217ed4cf8e8 100755
@@ -1631,8 +1631,9 @@ $code.=<<___      if ($win64);
 .Ldo_avx2_body:
 ___
 $code.=<<___;
-       lea             48+64($ctx),$ctx        # size optimization
        lea             .Lconst(%rip),%rcx
+       lea             48+64($ctx),$ctx        # size optimization
+       vmovdqa         96(%rcx),$T0            # .Lpermd_avx2
 
        # expand and copy pre-calculated table to stack
        vmovdqu         `16*0-64`($ctx),%x#$T2
@@ -1642,36 +1643,28 @@ $code.=<<___;
        vmovdqu         `16*3-64`($ctx),%x#$D0
        vmovdqu         `16*4-64`($ctx),%x#$D1
        vmovdqu         `16*5-64`($ctx),%x#$D2
+       lea             0x90(%rsp),%rax         # size optimization
        vmovdqu         `16*6-64`($ctx),%x#$D3
-       vpermq          \$0x15,$T2,$T2          # 00003412 -> 12343434
+       vpermd          $T2,$T0,$T2             # 00003412 -> 14243444
        vmovdqu         `16*7-64`($ctx),%x#$D4
-       vpermq          \$0x15,$T3,$T3
-       vpshufd         \$0xc8,$T2,$T2          # 12343434 -> 14243444
+       vpermd          $T3,$T0,$T3
        vmovdqu         `16*8-64`($ctx),%x#$MASK
-       vpermq          \$0x15,$T4,$T4
-       vpshufd         \$0xc8,$T3,$T3
+       vpermd          $T4,$T0,$T4
        vmovdqa         $T2,0x00(%rsp)
-       vpermq          \$0x15,$D0,$D0
-       vpshufd         \$0xc8,$T4,$T4
-       vmovdqa         $T3,0x20(%rsp)
-       vpermq          \$0x15,$D1,$D1
-       vpshufd         \$0xc8,$D0,$D0
-       vmovdqa         $T4,0x40(%rsp)
-       vpermq          \$0x15,$D2,$D2
-       vpshufd         \$0xc8,$D1,$D1
-       vmovdqa         $D0,0x60(%rsp)
-       vpermq          \$0x15,$D3,$D3
-       vpshufd         \$0xc8,$D2,$D2
-       vmovdqa         $D1,0x80(%rsp)
-       vpermq          \$0x15,$D4,$D4
-       vpshufd         \$0xc8,$D3,$D3
-       vmovdqa         $D2,0xa0(%rsp)
-       vpermq          \$0x15,$MASK,$MASK
-       vpshufd         \$0xc8,$D4,$D4
-       vmovdqa         $D3,0xc0(%rsp)
-       vpshufd         \$0xc8,$MASK,$MASK
-       vmovdqa         $D4,0xe0(%rsp)
-       vmovdqa         $MASK,0x100(%rsp)
+       vpermd          $D0,$T0,$D0
+       vmovdqa         $T3,0x20-0x90(%rax)
+       vpermd          $D1,$T0,$D1
+       vmovdqa         $T4,0x40-0x90(%rax)
+       vpermd          $D2,$T0,$D2
+       vmovdqa         $D0,0x60-0x90(%rax)
+       vpermd          $D3,$T0,$D3
+       vmovdqa         $D1,0x80-0x90(%rax)
+       vpermd          $D4,$T0,$D4
+       vmovdqa         $D2,0xa0-0x90(%rax)
+       vpermd          $MASK,$T0,$MASK
+       vmovdqa         $D3,0xc0-0x90(%rax)
+       vmovdqa         $D4,0xe0-0x90(%rax)
+       vmovdqa         $MASK,0x100-0x90(%rax)
        vmovdqa         64(%rcx),$MASK          # .Lmask26
 
        ################################################################
@@ -1698,7 +1691,6 @@ $code.=<<___;
        vpand           $MASK,$T3,$T3           # 3
        vpor            32(%rcx),$T4,$T4        # padbit, yes, always
 
-       lea             0x90(%rsp),%rax         # size optimization
        vpaddq          $H2,$T2,$H2             # accumulate input
        sub             \$64,$len
        jz              .Ltail_avx2
@@ -2055,8 +2047,9 @@ $code.=<<___      if ($win64);
 .Ldo_avx512_body:
 ___
 $code.=<<___;
-       lea             48+64($ctx),$ctx        # size optimization
        lea             .Lconst(%rip),%rcx
+       lea             48+64($ctx),$ctx        # size optimization
+       vmovdqa         96(%rcx),$T2            # .Lpermd_avx2
 
        # expand pre-calculated table
        vmovdqu32       `16*0-64`($ctx),%x#$R0
@@ -2069,33 +2062,23 @@ $code.=<<___;
        vmovdqu32       `16*6-64`($ctx),%x#$S3
        vmovdqu32       `16*7-64`($ctx),%x#$R4
        vmovdqu32       `16*8-64`($ctx),%x#$S4
-       vpermq          \$0x15,$R0,$R0          # 00003412 -> 12343434
+       vpermd          $R0,$T2,$R0             # 00003412 -> 14243444
        vmovdqa64       64(%rcx),$MASK          # .Lmask26
-       vpermq          \$0x15,$R1,$R1
-       vmovdqa32       128(%rcx),$GATHER       # .Lgather
-       vpermq          \$0x15,$S1,$S1
-       vpshufd         \$0xc8,$R0,$R0          # 12343434 -> 14243444
-       vpermq          \$0x15,$R2,$R2
-       vpshufd         \$0xc8,$R1,$R1
+       vpermd          $R1,$T2,$R1
+       vpermd          $S1,$T2,$S1
+       vpermd          $R2,$T2,$R2
        vmovdqa32       $R0,0x00(%rsp)          # save in case $len%128 != 0
         vpsrlq         \$32,$R0,$T0            # 14243444 -> 01020304
-       vpermq          \$0x15,$S2,$S2
-       vpshufd         \$0xc8,$S1,$S1
+       vpermd          $S2,$T2,$S2
        vmovdqa32       $R1,0x20(%rsp)
         vpsrlq         \$32,$R1,$T1
-       vpermq          \$0x15,$R3,$R3
-       vpshufd         \$0xc8,$R2,$R2
+       vpermd          $R3,$T2,$R3
        vmovdqa32       $S1,0x40(%rsp)
-       vpermq          \$0x15,$S3,$S3
-       vpshufd         \$0xc8,$S2,$S2
-       vpermq          \$0x15,$R4,$R4
-       vpshufd         \$0xc8,$R3,$R3
+       vpermd          $S3,$T2,$S3
+       vpermd          $R4,$T2,$R4
        vmovdqa32       $R2,0x60(%rsp)
-       vpermq          \$0x15,$S4,$S4
-       vpshufd         \$0xc8,$S3,$S3
+       vpermd          $S4,$T2,$S4
        vmovdqa32       $S2,0x80(%rsp)
-       vpshufd         \$0xc8,$R4,$R4
-       vpshufd         \$0xc8,$S4,$S4
        vmovdqa32       $R3,0xa0(%rsp)
        vmovdqa32       $S3,0xc0(%rsp)
        vmovdqa32       $R4,0xe0(%rsp)
@@ -2275,14 +2258,16 @@ $code.=<<___;
        vpandq          $MASK,$T2,$T2           # 2
        vpandq          $MASK,$T0,$T0           # 0
        vpandq          $MASK,$T1,$T1           # 1
-       #vpandq         $MASK,$T3,$T3           # 3
+       vpandq          $MASK,$T3,$T3           # 3
        #vporq          $PADBIT,$T4,$T4         # padbit, yes, always
 
        vpaddq          $H2,$T2,$H2             # accumulate input
        mov             \$0x0f,%eax
        sub             \$192,$len
        jbe             .Ltail_avx512
+       jmp             .Loop_avx512

+.align 32
 .Loop_avx512:
        ################################################################
        # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
@@ -2316,7 +2301,6 @@ $code.=<<___;
         vpaddq         $H0,$T0,$H0
        vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
        vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
-        vpandq         $MASK,$T3,$T3           # 3, module-scheduled
        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
         vporq          $PADBIT,$T4,$T4         # padbit, yes, always
        vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
@@ -2425,7 +2409,7 @@ $code.=<<___;
 
         vpandq         $MASK,$T0,$T0           # 0
         vpandq         $MASK,$T1,$T1           # 1
-        #vpandq        $MASK,$T3,$T3           # 3
+        vpandq         $MASK,$T3,$T3           # 3
         #vporq         $PADBIT,$T4,$T4         # padbit, yes, always
 
        sub             \$128,$len
@@ -2459,7 +2443,6 @@ $code.=<<___;
        vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
        vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
-        vpandq         $MASK,$T3,$T3           # 3, module-scheduled
         vporq          $PADBIT,$T4,$T4         # padbit, yes, always
         vpaddq         $H1,$T1,$H1             # accumulate input
         vpaddq         $H3,$T3,$H3
@@ -2647,10 +2630,8 @@ $code.=<<___;
 .long  `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
 .Lmask26:
 .long  0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lfive:
-.long  5,0,5,0,5,0,5,0
-.Lgather:
-.long  0,8, 32,40, 64,72, 96,104
+.Lpermd_avx2:
+.long  2,2,2,3,2,0,2,1
 ___
 }
 