author Andy Polyakov Sun, 25 Dec 2016 12:10:00 +0000 (13:10 +0100) committer Andy Polyakov Sat, 25 Feb 2017 17:36:37 +0000 (18:36 +0100)
Effectively it's a minor size optimization, 5-6% per affected subroutine.

Reviewed-by: Rich Salz <rsalz@openssl.org>

index f602368..c989b34 100755 (executable)
@@ -1631,8 +1631,9 @@ \$code.=<<___      if (\$win64);
.Ldo_avx2_body:
___
\$code.=<<___;
-       lea             48+64(\$ctx),\$ctx        # size optimization
lea             .Lconst(%rip),%rcx
+       lea             48+64(\$ctx),\$ctx        # size optimization
+       vmovdqa         96(%rcx),\$T0            # .Lpermd_avx2

# expand and copy pre-calculated table to stack
vmovdqu         `16*0-64`(\$ctx),%x#\$T2
@@ -1642,36 +1643,28 @@ \$code.=<<___;
vmovdqu         `16*3-64`(\$ctx),%x#\$D0
vmovdqu         `16*4-64`(\$ctx),%x#\$D1
vmovdqu         `16*5-64`(\$ctx),%x#\$D2
+       lea             0x90(%rsp),%rax         # size optimization
vmovdqu         `16*6-64`(\$ctx),%x#\$D3
-       vpermq          \\$0x15,\$T2,\$T2          # 00003412 -> 12343434
+       vpermd          \$T2,\$T0,\$T2             # 00003412 -> 14243444
vmovdqu         `16*7-64`(\$ctx),%x#\$D4
-       vpermq          \\$0x15,\$T3,\$T3
-       vpshufd         \\$0xc8,\$T2,\$T2          # 12343434 -> 14243444
+       vpermd          \$T3,\$T0,\$T3
-       vpermq          \\$0x15,\$T4,\$T4
-       vpshufd         \\$0xc8,\$T3,\$T3
+       vpermd          \$T4,\$T0,\$T4
vmovdqa         \$T2,0x00(%rsp)
-       vpermq          \\$0x15,\$D0,\$D0
-       vpshufd         \\$0xc8,\$T4,\$T4
-       vmovdqa         \$T3,0x20(%rsp)
-       vpermq          \\$0x15,\$D1,\$D1
-       vpshufd         \\$0xc8,\$D0,\$D0
-       vmovdqa         \$T4,0x40(%rsp)
-       vpermq          \\$0x15,\$D2,\$D2
-       vpshufd         \\$0xc8,\$D1,\$D1
-       vmovdqa         \$D0,0x60(%rsp)
-       vpermq          \\$0x15,\$D3,\$D3
-       vpshufd         \\$0xc8,\$D2,\$D2
-       vmovdqa         \$D1,0x80(%rsp)
-       vpermq          \\$0x15,\$D4,\$D4
-       vpshufd         \\$0xc8,\$D3,\$D3
-       vmovdqa         \$D2,0xa0(%rsp)
-       vpshufd         \\$0xc8,\$D4,\$D4
-       vmovdqa         \$D3,0xc0(%rsp)
-       vmovdqa         \$D4,0xe0(%rsp)
+       vpermd          \$D0,\$T0,\$D0
+       vmovdqa         \$T3,0x20-0x90(%rax)
+       vpermd          \$D1,\$T0,\$D1
+       vmovdqa         \$T4,0x40-0x90(%rax)
+       vpermd          \$D2,\$T0,\$D2
+       vmovdqa         \$D0,0x60-0x90(%rax)
+       vpermd          \$D3,\$T0,\$D3
+       vmovdqa         \$D1,0x80-0x90(%rax)
+       vpermd          \$D4,\$T0,\$D4
+       vmovdqa         \$D2,0xa0-0x90(%rax)
+       vmovdqa         \$D3,0xc0-0x90(%rax)
+       vmovdqa         \$D4,0xe0-0x90(%rax)

################################################################
@@ -1698,7 +1691,6 @@ \$code.=<<___;
vpor            32(%rcx),\$T4,\$T4        # padbit, yes, always

-       lea             0x90(%rsp),%rax         # size optimization
sub             \\$64,\$len
jz              .Ltail_avx2
@@ -2055,8 +2047,9 @@ \$code.=<<___      if (\$win64);
.Ldo_avx512_body:
___
\$code.=<<___;
-       lea             48+64(\$ctx),\$ctx        # size optimization
lea             .Lconst(%rip),%rcx
+       lea             48+64(\$ctx),\$ctx        # size optimization
+       vmovdqa         96(%rcx),\$T2            # .Lpermd_avx2

# expand pre-calculated table
vmovdqu32       `16*0-64`(\$ctx),%x#\$R0
@@ -2069,33 +2062,23 @@ \$code.=<<___;
vmovdqu32       `16*6-64`(\$ctx),%x#\$S3
vmovdqu32       `16*7-64`(\$ctx),%x#\$R4
vmovdqu32       `16*8-64`(\$ctx),%x#\$S4
-       vpermq          \\$0x15,\$R0,\$R0          # 00003412 -> 12343434
+       vpermd          \$R0,\$T2,\$R0             # 00003412 -> 14243444
-       vpermq          \\$0x15,\$R1,\$R1
-       vmovdqa32       128(%rcx),\$GATHER       # .Lgather
-       vpermq          \\$0x15,\$S1,\$S1
-       vpshufd         \\$0xc8,\$R0,\$R0          # 12343434 -> 14243444
-       vpermq          \\$0x15,\$R2,\$R2
-       vpshufd         \\$0xc8,\$R1,\$R1
+       vpermd          \$R1,\$T2,\$R1
+       vpermd          \$S1,\$T2,\$S1
+       vpermd          \$R2,\$T2,\$R2
vmovdqa32       \$R0,0x00(%rsp)          # save in case \$len%128 != 0
vpsrlq         \\$32,\$R0,\$T0            # 14243444 -> 01020304
-       vpermq          \\$0x15,\$S2,\$S2
-       vpshufd         \\$0xc8,\$S1,\$S1
+       vpermd          \$S2,\$T2,\$S2
vmovdqa32       \$R1,0x20(%rsp)
vpsrlq         \\$32,\$R1,\$T1
-       vpermq          \\$0x15,\$R3,\$R3
-       vpshufd         \\$0xc8,\$R2,\$R2
+       vpermd          \$R3,\$T2,\$R3
vmovdqa32       \$S1,0x40(%rsp)
-       vpermq          \\$0x15,\$S3,\$S3
-       vpshufd         \\$0xc8,\$S2,\$S2
-       vpermq          \\$0x15,\$R4,\$R4
-       vpshufd         \\$0xc8,\$R3,\$R3
+       vpermd          \$S3,\$T2,\$S3
+       vpermd          \$R4,\$T2,\$R4
vmovdqa32       \$R2,0x60(%rsp)
-       vpermq          \\$0x15,\$S4,\$S4
-       vpshufd         \\$0xc8,\$S3,\$S3
+       vpermd          \$S4,\$T2,\$S4
vmovdqa32       \$S2,0x80(%rsp)
-       vpshufd         \\$0xc8,\$R4,\$R4
-       vpshufd         \\$0xc8,\$S4,\$S4
vmovdqa32       \$R3,0xa0(%rsp)
vmovdqa32       \$S3,0xc0(%rsp)
vmovdqa32       \$R4,0xe0(%rsp)
@@ -2275,14 +2258,16 @@ \$code.=<<___;

mov             \\$0x0f,%eax
sub             \\$192,\$len
jbe             .Ltail_avx512
+       jmp             .Loop_avx512

+.align 32
.Loop_avx512:
################################################################
# ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
@@ -2316,7 +2301,6 @@ \$code.=<<___;
vpmuludq        \$H2,\$R2,\$D4             # d4 = h2*r2
vpmuludq        \$H2,\$S3,\$D0             # d0 = h2*s3
-        vpandq         \$MASK,\$T3,\$T3           # 3, module-scheduled
vpmuludq        \$H2,\$S4,\$D1             # d1 = h2*s4
vpmuludq        \$H2,\$R0,\$D2             # d2 = h2*r0
@@ -2425,7 +2409,7 @@ \$code.=<<___;

sub             \\$128,\$len
@@ -2459,7 +2443,6 @@ \$code.=<<___;
vpmuludq        \$H2,\$S3,\$D0             # d0 = h2*s3
vpmuludq        \$H2,\$S4,\$D1             # d1 = h2*s4
vpmuludq        \$H2,\$R0,\$D2             # d2 = h2*r0
-        vpandq         \$MASK,\$T3,\$T3           # 3, module-scheduled