sha/asm/keccak1600-avx512.pl: improve performance by 17%.
index 70dec4ed980587a7012f51fdfab3f673ac4d6658..2f3215147169b3d74ac2766333e853a82b315820 100755
--- a/crypto/sha/asm/keccak1600-avx512.pl
+++ b/crypto/sha/asm/keccak1600-avx512.pl
 # Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
 # Pretty straightforward, the only "magic" is data layout in registers.
 # It's impossible to have one that is optimal for every step, hence
-# it's changing as algorithm progresses. Data is saved in order that
-# benefits Chi, but at the same time is easily convertible to order
-# that benefits Theta. Conversion from Chi layout to Theta is
-# explicit and reverse one is kind of fused with Pi...
+# it changes as the algorithm progresses. Data is saved in linear
+# order, but the in-register order morphs between rounds: even rounds
+# consume the linear layout, odd rounds the transposed, or
+# "vertically-shaped", one...
 #
 ########################################################################
 # Numbers are cycles per processed byte out of large message.
 #
 #                      r=1088(*)
 #
-# Knights Landing      8.9
-# Skylake-X            6.7
+# Knights Landing      7.6
+# Skylake-X            5.7
 #
 # (*)  Corresponds to SHA3-256.
 
 ########################################################################
-# Coordinates below correspond to those in sha/keccak1600.c. Layout
-# suitable for Chi is one with y coordinates aligned column-wise. Trick
-# is to add regular shift to x coordinate, so that Chi can still be
-# performed with as little as 7 instructions, yet be converted to layout
-# suitable for Theta with intra-register permutations alone. Here is
-# "magic" layout for Chi (with pre-Theta shuffle):
+# The code below is a combination of two ideas. One is taken from the
+# Keccak Code Package, hereafter KCP, and the other from the initial
+# version of this module. Common to both is the observation that Pi's
+# input and output are "mostly transposed", i.e. if the input is
+# aligned by the x coordinate, then the output is [mostly] aligned by
+# y. Both versions, KCP and the predecessor, tried to stick to one of
+# the two layouts from round to round, which resulted in some kind of
+# transposition in each round. This version still transposes the data,
+# but only every second round. Another essential factor is that the
+# KCP transposition has to be performed with instructions that turned
+# out to be rather expensive on Knights Landing, both latency- and
+# throughput-wise, not to mention that some of them depend on each
+# other. The initial version of this module, on the other hand, relied
+# heavily on blend instructions. There were lots of them, resulting in
+# a higher instruction count, yet it performed better on Knights
+# Landing, because the processor can execute a pair of them each cycle
+# and they have minimal latency. This module is an attempt to bring
+# the best parts together:-)
+#
+# Coordinates below correspond to those in sha/keccak1600.c. Input
+# layout is straight linear:
+#
+# [0][4] [0][3] [0][2] [0][1] [0][0]
+# [1][4] [1][3] [1][2] [1][1] [1][0]
+# [2][4] [2][3] [2][2] [2][1] [2][0]
+# [3][4] [3][3] [3][2] [3][1] [3][0]
+# [4][4] [4][3] [4][2] [4][1] [4][0]
+#
+# It's perfect for Theta, while Pi is reduced to intra-register
+# permutations which yield a layout perfect for Chi:
+#
+# [4][0] [3][0] [2][0] [1][0] [0][0]
+# [4][1] [3][1] [2][1] [1][1] [0][1]
+# [4][2] [3][2] [2][2] [1][2] [0][2]
+# [4][3] [3][3] [2][3] [1][3] [0][3]
+# [4][4] [3][4] [2][4] [1][4] [0][4]
+#
+# Now, instead of performing a full transposition and feeding it to
+# the next identical round, we perform a kind of diagonal transposition
+# to the layout from the initial version of this module, which makes
+# it suitable for Theta:
 #
 # [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
 # [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
 # [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
 # [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
 # [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
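+# (the ">a.b.c.d.e>" notation lists, for destination lanes 4 down to
+# 0, the source lane each element is taken from)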
 #
-# Layout suitable to Theta has x coordinates aligned column-wise
-# [it's interleaved with Pi indices transformation for reference]:
+# Intra-register permutations then restore the initial [almost]
+# straight linear layout:
 #
-# [4][4] [3][3] [2][2] [1][1] [0][0]   $A00
+# [4][4] [3][3] [2][2] [1][1] [0][0]
 ##[0][4] [0][3] [0][2] [0][1] [0][0]
-# [3][4] [2][3] [1][2] [0][1] [4][0]   $A01
+# [3][4] [2][3] [1][2] [0][1] [4][0]
 ##[2][3] [2][2] [2][1] [2][0] [2][4]
-# [2][4] [1][3] [0][2] [4][1] [3][0]   $A02
+# [2][4] [1][3] [0][2] [4][1] [3][0]
 ##[4][2] [4][1] [4][0] [4][4] [4][3]
-# [1][4] [0][3] [4][2] [3][1] [2][0]   $A03
+# [1][4] [0][3] [4][2] [3][1] [2][0]
 ##[1][1] [1][0] [1][4] [1][3] [1][2]
-# [0][4] [4][3] [3][2] [2][1] [1][0]   $A04
+# [0][4] [4][3] [3][2] [2][1] [1][0]
 ##[3][0] [3][4] [3][3] [3][2] [3][1]
 #
-# Pi itself is performed by blending above data and finally shuffling it
-# to original Chi layout:
-#
-# [1][1] [2][2] [3][3] [4][4] [0][0]>1.2.3.4.0>[4][4] [3][3] [2][2] [1][1] [0][0]
-# [2][3] [3][4] [4][0] [0][1] [1][2]>2.3.4.0.1>[4][0] [3][4] [2][3] [1][2] [0][1]
-# [3][0] [4][1] [0][2] [1][3] [2][4]>3.4.0.1.2>[4][1] [3][0] [2][4] [1][3] [0][2]
-# [4][2] [0][3] [1][4] [2][0] [3][1]>4.0.1.2.3>[4][2] [3][1] [2][0] [1][4] [0][3]
-# [0][4] [1][0] [2][1] [3][2] [4][3]>0.1.2.3.4>[4][3] [3][2] [2][1] [1][0] [0][4]
+# This means that the odd-round Chi is performed in a less suitable
+# layout, at the cost of a number of additional permutations. But
+# overall it turned out to be a win: the permutations are the fastest
+# possible on Knights Landing and are laid out to be independent of
+# each other. In essence I traded 20 blend instructions for 3
+# permutations. The result is 13% faster than KCP on Skylake-X, and
+# >40% faster on Knights Landing.
 #
-# As implied, data is loaded in Chi layout. Digits in variables' names
-# represent right most coordinates of loaded data chunk:
-
-my ($A00,      # [4][4] [3][3] [2][2] [1][1] [0][0]
-    $A01,      # [4][0] [3][4] [2][3] [1][2] [0][1]
-    $A02,      # [4][1] [3][0] [2][4] [1][3] [0][2]
-    $A03,      # [4][2] [3][1] [2][0] [1][4] [0][3]
-    $A04) =    # [4][3] [3][2] [2][1] [1][0] [0][4]
+# As implied, data is loaded in straight linear order. Digits in the
+# variables' names represent the coordinates of the right-most element
+# of the loaded data chunk:
+
+my ($A00,      # [0][4] [0][3] [0][2] [0][1] [0][0]
+    $A10,      # [1][4] [1][3] [1][2] [1][1] [1][0]
+    $A20,      # [2][4] [2][3] [2][2] [2][1] [2][0]
+    $A30,      # [3][4] [3][3] [3][2] [3][1] [3][0]
+    $A40) =    # [4][4] [4][3] [4][2] [4][1] [4][0]
     map("%zmm$_",(0..4));
 
 # We also need to map the magic order into offsets within structure:
 
-my @A_jagged = ([0,0], [1,0], [2,0], [3,0], [4,0],
-               [4,1], [0,1], [1,1], [2,1], [3,1],
-               [3,2], [4,2], [0,2], [1,2], [2,2],
-               [2,3], [3,3], [4,3], [0,3], [1,3],
-               [1,4], [2,4], [3,4], [4,4], [0,4]);
-   @A_jagged_in  = map(8*($$_[0]*8+$$_[1]), @A_jagged);        # ... and now linear
-   @A_jagged_out = map(8*($$_[0]*5+$$_[1]), @A_jagged);        # ... and now linear
+my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
+               [1,0], [1,1], [1,2], [1,3], [1,4],
+               [2,0], [2,1], [2,2], [2,3], [2,4],
+               [3,0], [3,1], [3,2], [3,3], [3,4],
+               [4,0], [4,1], [4,2], [4,3], [4,4]);
+   @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged);    # ... and now linear
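+# For example, element A[2][3] lands at byte offset 8*(2*8+3) = 152:
+# each five-lane row is padded to the full eight-lane zmm width.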
 
-my @T       = map("%zmm$_",(5..7,16..17));
-my @Chi     = map("%zmm$_",(18..22));
-my @Theta   = map("%zmm$_",(33,23..26));       # invalid @Theta[0] is not typo
-my @Rhotate = map("%zmm$_",(27..31));
+my @T        = map("%zmm$_",(5..12));
+my @Theta    = map("%zmm$_",(33,13..16));      # invalid @Theta[0] is not a typo
+my @Pi0      = map("%zmm$_",(17..21));
+my @Rhotate0 = map("%zmm$_",(22..26));
+my @Rhotate1 = map("%zmm$_",(27..31));
 
 my ($C00,$D00) = @T[0..1];
 my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
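+# (the mask names spell out their bit patterns: $k00001 covers lane 0,
+# the [0][0] word Iota touches, $k00010 lane 1, and so on up to
+# $k11111, which covers all five state lanes)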
@@ -107,82 +138,136 @@ $code.=<<___;
 .align 32
 __KeccakF1600:
        lea             iotas(%rip),%r10
-       mov             \$24,%eax
+       mov             \$12,%eax
        jmp             .Loop_avx512
 
 .align 32
 .Loop_avx512:
-       ######################################### Theta
-       #vpermq         $A00,@Theta[0],$A00     # doesn't actually change order
-       vpermq          $A01,@Theta[1],$A01
-       vpermq          $A02,@Theta[2],$A02
-       vpermq          $A03,@Theta[3],$A03
-       vpermq          $A04,@Theta[4],$A04
-
+       ######################################### Theta, even round
        vmovdqa64       $A00,@T[0]              # put aside original A00
-       vpternlogq      \$0x96,$A02,$A01,$A00   # and use it as "C00"
-       vpternlogq      \$0x96,$A04,$A03,$A00
+       vpternlogq      \$0x96,$A20,$A10,$A00   # and use it as "C00"
+       vpternlogq      \$0x96,$A40,$A30,$A00
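+       # (imm8 0x96 is the truth table of dst^src2^src3, a three-way
+       # XOR, so after these two instructions "C00" holds the Theta
+       # column parities C[x])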
 
        vprolq          \$1,$A00,$D00
        vpermq          $A00,@Theta[1],$A00
        vpermq          $D00,@Theta[4],$D00
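+       # (this realizes Theta's D[x] = C[x-1] ^ ROL64(C[x+1],1): the
+       # two lane rotations above align C and ROL64(C,1) so that each
+       # column sees its x-1 and x+1 parities in the XORs below)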
 
        vpternlogq      \$0x96,$A00,$D00,@T[0]  # T[0] is original A00
-       vpternlogq      \$0x96,$A00,$D00,$A01
-       vpternlogq      \$0x96,$A00,$D00,$A02
-       vpternlogq      \$0x96,$A00,$D00,$A03
-       vpternlogq      \$0x96,$A00,$D00,$A04
+       vpternlogq      \$0x96,$A00,$D00,$A10
+       vpternlogq      \$0x96,$A00,$D00,$A20
+       vpternlogq      \$0x96,$A00,$D00,$A30
+       vpternlogq      \$0x96,$A00,$D00,$A40
 
        ######################################### Rho
-       vprolvq         @Rhotate[0],@T[0],$A00  # T[0] is original A00
-       vprolvq         @Rhotate[1],$A01,$A01
-       vprolvq         @Rhotate[2],$A02,$A02
-       vprolvq         @Rhotate[3],$A03,$A03
-       vprolvq         @Rhotate[4],$A04,$A04
+       vprolvq         @Rhotate0[0],@T[0],$A00 # T[0] is original A00
+       vprolvq         @Rhotate0[1],$A10,$A10
+       vprolvq         @Rhotate0[2],$A20,$A20
+       vprolvq         @Rhotate0[3],$A30,$A30
+       vprolvq         @Rhotate0[4],$A40,$A40
 
        ######################################### Pi
-       vpblendmq       $A02,$A00,@{T[0]}{$k00010}
-       vpblendmq       $A00,$A03,@{T[1]}{$k00010}
-       vpblendmq       $A03,$A01,@{T[2]}{$k00010}
-       vpblendmq       $A01,$A04,@{T[3]}{$k00010}
-       vpblendmq       $A04,$A02,@{T[4]}{$k00010}
-
-       vpblendmq       $A04,@T[0],@{T[0]}{$k00100}
-       vpblendmq       $A02,@T[1],@{T[1]}{$k00100}
-       vpblendmq       $A00,@T[2],@{T[2]}{$k00100}
-       vpblendmq       $A03,@T[3],@{T[3]}{$k00100}
-       vpblendmq       $A01,@T[4],@{T[4]}{$k00100}
-
-       vpblendmq       $A01,@T[0],@{T[0]}{$k01000}
-       vpblendmq       $A04,@T[1],@{T[1]}{$k01000}
-       vpblendmq       $A02,@T[2],@{T[2]}{$k01000}
-       vpblendmq       $A00,@T[3],@{T[3]}{$k01000}
-       vpblendmq       $A03,@T[4],@{T[4]}{$k01000}
-
-       vpblendmq       $A03,@T[0],@{T[0]}{$k10000}
-       vpblendmq       $A01,@T[1],@{T[1]}{$k10000}
-       vpblendmq       $A04,@T[2],@{T[2]}{$k10000}
-       vpblendmq       $A02,@T[3],@{T[3]}{$k10000}
-       vpblendmq       $A00,@T[4],@{T[4]}{$k10000}
-
-       vpermq          @T[0],@Chi[0],$A00
-       vpermq          @T[1],@Chi[1],$A01
-       vpermq          @T[2],@Chi[2],$A02
-       vpermq          @T[3],@Chi[3],$A03
-       vpermq          @T[4],@Chi[4],$A04
+       vpermq          $A00,@Pi0[0],$A00
+       vpermq          $A10,@Pi0[1],$A10
+       vpermq          $A20,@Pi0[2],$A20
+       vpermq          $A30,@Pi0[3],$A30
+       vpermq          $A40,@Pi0[4],$A40
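+       # (one intra-register permutation per row, no cross-register
+       # traffic, which is exactly what the linear even-round layout
+       # was chosen for)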
 
        ######################################### Chi
        vmovdqa64       $A00,@T[0]
-       vpternlogq      \$0xD2,$A02,$A01,$A00
-       vmovdqa64       $A01,@T[1]
-       vpternlogq      \$0xD2,$A03,$A02,$A01
-       vpternlogq      \$0xD2,$A04,$A03,$A02
-       vpternlogq      \$0xD2,@T[0],$A04,$A03
-       vpternlogq      \$0xD2,@T[1],@T[0],$A04
+       vmovdqa64       $A10,@T[1]
+       vpternlogq      \$0xD2,$A20,$A10,$A00
+       vpternlogq      \$0xD2,$A30,$A20,$A10
+       vpternlogq      \$0xD2,$A40,$A30,$A20
+       vpternlogq      \$0xD2,@T[0],$A40,$A30
+       vpternlogq      \$0xD2,@T[1],@T[0],$A40
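+       # (imm8 0xD2 encodes dst ^ (~src2 & src3), i.e. Chi's
+       # A[x] ^= ~A[x+1] & A[x+2]; rows 0 and 1 are saved in @T[0..1]
+       # first because they are overwritten before rows 3 and 4
+       # consume them)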
 
        ######################################### Iota
        vpxorq          (%r10),$A00,${A00}{$k00001}
-       lea             8(%r10),%r10
+       lea             16(%r10),%r10
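+       # (each loop iteration covers two Keccak rounds, hence the loop
+       # counter of 12 and the 16-byte iota stride; the odd round picks
+       # its constant up at -8(%r10) below)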
+
+       ######################################### Harmonize rounds
+       vpblendmq       $A20,$A10,@{T[1]}{$k00010}
+       vpblendmq       $A30,$A20,@{T[2]}{$k00010}
+       vpblendmq       $A40,$A30,@{T[3]}{$k00010}
+        vpblendmq      $A10,$A00,@{T[0]}{$k00010}
+       vpblendmq       $A00,$A40,@{T[4]}{$k00010}
+
+       vpblendmq       $A30,@T[1],@{T[1]}{$k00100}
+       vpblendmq       $A40,@T[2],@{T[2]}{$k00100}
+        vpblendmq      $A20,@T[0],@{T[0]}{$k00100}
+       vpblendmq       $A00,@T[3],@{T[3]}{$k00100}
+       vpblendmq       $A10,@T[4],@{T[4]}{$k00100}
+
+       vpblendmq       $A40,@T[1],@{T[1]}{$k01000}
+        vpblendmq      $A30,@T[0],@{T[0]}{$k01000}
+       vpblendmq       $A00,@T[2],@{T[2]}{$k01000}
+       vpblendmq       $A10,@T[3],@{T[3]}{$k01000}
+       vpblendmq       $A20,@T[4],@{T[4]}{$k01000}
+
+       vpblendmq       $A40,@T[0],@{T[0]}{$k10000}
+       vpblendmq       $A00,@T[1],@{T[1]}{$k10000}
+       vpblendmq       $A10,@T[2],@{T[2]}{$k10000}
+       vpblendmq       $A20,@T[3],@{T[3]}{$k10000}
+       vpblendmq       $A30,@T[4],@{T[4]}{$k10000}
+
+       #vpermq         @T[0],@Theta[0],$A00    # doesn't actually change order
+       vpermq          @T[1],@Theta[1],$A10
+       vpermq          @T[2],@Theta[2],$A20
+       vpermq          @T[3],@Theta[3],$A30
+       vpermq          @T[4],@Theta[4],$A40
+
+       ######################################### Theta, odd round
+       vmovdqa64       $T[0],$A00              # real A00
+       vpternlogq      \$0x96,$A20,$A10,$C00   # C00 is @T[0]'s alias
+       vpternlogq      \$0x96,$A40,$A30,$C00
+
+       vprolq          \$1,$C00,$D00
+       vpermq          $C00,@Theta[1],$C00
+       vpermq          $D00,@Theta[4],$D00
+
+       vpternlogq      \$0x96,$C00,$D00,$A00
+       vpternlogq      \$0x96,$C00,$D00,$A30
+       vpternlogq      \$0x96,$C00,$D00,$A10
+       vpternlogq      \$0x96,$C00,$D00,$A40
+       vpternlogq      \$0x96,$C00,$D00,$A20
+
+       ######################################### Rho
+       vprolvq         @Rhotate1[0],$A00,$A00
+       vprolvq         @Rhotate1[3],$A30,@T[1]
+       vprolvq         @Rhotate1[1],$A10,@T[2]
+       vprolvq         @Rhotate1[4],$A40,@T[3]
+       vprolvq         @Rhotate1[2],$A20,@T[4]
+
+        vpermq         $A00,@Theta[4],@T[5]
+        vpermq         $A00,@Theta[3],@T[6]
+
+       ######################################### Iota
+       vpxorq          -8(%r10),$A00,${A00}{$k00001}
+
+       ######################################### Pi
+       vpermq          @T[1],@Theta[2],$A10
+       vpermq          @T[2],@Theta[4],$A20
+       vpermq          @T[3],@Theta[1],$A30
+       vpermq          @T[4],@Theta[3],$A40
+
+       ######################################### Chi
+       vpternlogq      \$0xD2,@T[6],@T[5],$A00
+
+       vpermq          @T[1],@Theta[1],@T[7]
+       #vpermq         @T[1],@Theta[0],@T[1]
+       vpternlogq      \$0xD2,@T[1],@T[7],$A10
+
+       vpermq          @T[2],@Theta[3],@T[0]
+       vpermq          @T[2],@Theta[2],@T[2]
+       vpternlogq      \$0xD2,@T[2],@T[0],$A20
+
+       #vpermq         @T[3],@Theta[0],@T[3]
+       vpermq          @T[3],@Theta[4],@T[1]
+       vpternlogq      \$0xD2,@T[1],@T[3],$A30
+
+       vpermq          @T[4],@Theta[2],@T[0]
+       vpermq          @T[4],@Theta[1],@T[4]
+       vpternlogq      \$0xD2,@T[4],@T[0],$A40
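+       # (in the transposed odd-round layout each register holds one
+       # whole Chi row, so the x+1 and x+2 neighbours are obtained by
+       # rotating the register itself: these vpermq are the extra
+       # permutations the module header refers to)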
 
        dec             %eax
        jnz             .Loop_avx512
@@ -208,8 +293,6 @@ SHA3_absorb:
        lea     96($inp),$inp
        lea     128(%rsp),%r9
 
-       vzeroupper
-
        lea             theta_perm(%rip),%r8
 
        kxnorw          $k11111,$k11111,$k11111
@@ -226,24 +309,30 @@ SHA3_absorb:
        vmovdqa64       64*3(%r8),@Theta[3]
        vmovdqa64       64*4(%r8),@Theta[4]
 
-       vmovdqa64       64*5(%r8),@Rhotate[0]
-       vmovdqa64       64*6(%r8),@Rhotate[1]
-       vmovdqa64       64*7(%r8),@Rhotate[2]
-       vmovdqa64       64*8(%r8),@Rhotate[3]
-       vmovdqa64       64*9(%r8),@Rhotate[4]
+       vmovdqa64       64*5(%r8),@Rhotate1[0]
+       vmovdqa64       64*6(%r8),@Rhotate1[1]
+       vmovdqa64       64*7(%r8),@Rhotate1[2]
+       vmovdqa64       64*8(%r8),@Rhotate1[3]
+       vmovdqa64       64*9(%r8),@Rhotate1[4]
+
+       vmovdqa64       64*10(%r8),@Rhotate0[0]
+       vmovdqa64       64*11(%r8),@Rhotate0[1]
+       vmovdqa64       64*12(%r8),@Rhotate0[2]
+       vmovdqa64       64*13(%r8),@Rhotate0[3]
+       vmovdqa64       64*14(%r8),@Rhotate0[4]
 
-       vmovdqa64       64*10(%r8),@Chi[0]
-       vmovdqa64       64*11(%r8),@Chi[1]
-       vmovdqa64       64*12(%r8),@Chi[2]
-       vmovdqa64       64*13(%r8),@Chi[3]
-       vmovdqa64       64*14(%r8),@Chi[4]
+       vmovdqa64       64*15(%r8),@Pi0[0]
+       vmovdqa64       64*16(%r8),@Pi0[1]
+       vmovdqa64       64*17(%r8),@Pi0[2]
+       vmovdqa64       64*18(%r8),@Pi0[3]
+       vmovdqa64       64*19(%r8),@Pi0[4]
 
        vmovdqu64       40*0-96($A_flat),${A00}{$k11111}{z}
        vpxorq          @T[0],@T[0],@T[0]
-       vmovdqu64       40*1-96($A_flat),${A01}{$k11111}{z}
-       vmovdqu64       40*2-96($A_flat),${A02}{$k11111}{z}
-       vmovdqu64       40*3-96($A_flat),${A03}{$k11111}{z}
-       vmovdqu64       40*4-96($A_flat),${A04}{$k11111}{z}
+       vmovdqu64       40*1-96($A_flat),${A10}{$k11111}{z}
+       vmovdqu64       40*2-96($A_flat),${A20}{$k11111}{z}
+       vmovdqu64       40*3-96($A_flat),${A30}{$k11111}{z}
+       vmovdqu64       40*4-96($A_flat),${A40}{$k11111}{z}
 
        vmovdqa64       @T[0],0*64-128(%r9)     # zero transfer area on stack
        vmovdqa64       @T[0],1*64-128(%r9)
@@ -263,7 +352,7 @@ ___
 for(my $i=0; $i<25; $i++) {
 $code.=<<___
        mov     8*$i-96($inp),%r8
-       mov     %r8,$A_jagged_in[$i]-128(%r9)
+       mov     %r8,$A_jagged[$i]-128(%r9)
        dec     %eax
        jz      .Labsorved_avx512
 ___
@@ -273,10 +362,10 @@ $code.=<<___;
        lea     ($inp,$bsz),$inp
 
        vpxorq  64*0-128(%r9),$A00,$A00
-       vpxorq  64*1-128(%r9),$A01,$A01
-       vpxorq  64*2-128(%r9),$A02,$A02
-       vpxorq  64*3-128(%r9),$A03,$A03
-       vpxorq  64*4-128(%r9),$A04,$A04
+       vpxorq  64*1-128(%r9),$A10,$A10
+       vpxorq  64*2-128(%r9),$A20,$A20
+       vpxorq  64*3-128(%r9),$A30,$A30
+       vpxorq  64*4-128(%r9),$A40,$A40
 
        call    __KeccakF1600
 
@@ -285,10 +374,10 @@ $code.=<<___;
 .align 32
 .Ldone_absorb_avx512:
        vmovdqu64       $A00,40*0-96($A_flat){$k11111}
-       vmovdqu64       $A01,40*1-96($A_flat){$k11111}
-       vmovdqu64       $A02,40*2-96($A_flat){$k11111}
-       vmovdqu64       $A03,40*3-96($A_flat){$k11111}
-       vmovdqu64       $A04,40*4-96($A_flat){$k11111}
+       vmovdqu64       $A10,40*1-96($A_flat){$k11111}
+       vmovdqu64       $A20,40*2-96($A_flat){$k11111}
+       vmovdqu64       $A30,40*3-96($A_flat){$k11111}
+       vmovdqu64       $A40,40*4-96($A_flat){$k11111}
 
        vzeroupper
 
@@ -307,8 +396,6 @@ SHA3_squeeze:
        cmp     $bsz,$len
        jbe     .Lno_output_extension_avx512
 
-       vzeroupper
-
        lea             theta_perm(%rip),%r8
 
        kxnorw          $k11111,$k11111,$k11111
@@ -325,65 +412,72 @@ SHA3_squeeze:
        vmovdqa64       64*3(%r8),@Theta[3]
        vmovdqa64       64*4(%r8),@Theta[4]
 
-       vmovdqa64       64*5(%r8),@Rhotate[0]
-       vmovdqa64       64*6(%r8),@Rhotate[1]
-       vmovdqa64       64*7(%r8),@Rhotate[2]
-       vmovdqa64       64*8(%r8),@Rhotate[3]
-       vmovdqa64       64*9(%r8),@Rhotate[4]
+       vmovdqa64       64*5(%r8),@Rhotate1[0]
+       vmovdqa64       64*6(%r8),@Rhotate1[1]
+       vmovdqa64       64*7(%r8),@Rhotate1[2]
+       vmovdqa64       64*8(%r8),@Rhotate1[3]
+       vmovdqa64       64*9(%r8),@Rhotate1[4]
+
+       vmovdqa64       64*10(%r8),@Rhotate0[0]
+       vmovdqa64       64*11(%r8),@Rhotate0[1]
+       vmovdqa64       64*12(%r8),@Rhotate0[2]
+       vmovdqa64       64*13(%r8),@Rhotate0[3]
+       vmovdqa64       64*14(%r8),@Rhotate0[4]
 
-       vmovdqa64       64*10(%r8),@Chi[0]
-       vmovdqa64       64*11(%r8),@Chi[1]
-       vmovdqa64       64*12(%r8),@Chi[2]
-       vmovdqa64       64*13(%r8),@Chi[3]
-       vmovdqa64       64*14(%r8),@Chi[4]
+       vmovdqa64       64*15(%r8),@Pi0[0]
+       vmovdqa64       64*16(%r8),@Pi0[1]
+       vmovdqa64       64*17(%r8),@Pi0[2]
+       vmovdqa64       64*18(%r8),@Pi0[3]
+       vmovdqa64       64*19(%r8),@Pi0[4]
 
        vmovdqu64       40*0-96($A_flat),${A00}{$k11111}{z}
-       vmovdqu64       40*1-96($A_flat),${A01}{$k11111}{z}
-       vmovdqu64       40*2-96($A_flat),${A02}{$k11111}{z}
-       vmovdqu64       40*3-96($A_flat),${A03}{$k11111}{z}
-       vmovdqu64       40*4-96($A_flat),${A04}{$k11111}{z}
+       vmovdqu64       40*1-96($A_flat),${A10}{$k11111}{z}
+       vmovdqu64       40*2-96($A_flat),${A20}{$k11111}{z}
+       vmovdqu64       40*3-96($A_flat),${A30}{$k11111}{z}
+       vmovdqu64       40*4-96($A_flat),${A40}{$k11111}{z}
 
 .Lno_output_extension_avx512:
        shr     \$3,$bsz
+       lea     -96($A_flat),%r9
        mov     $bsz,%rax
+       jmp     .Loop_squeeze_avx512
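+       # (the state is stored in straight linear order now, so squeeze
+       # can stream directly out of $A_flat, with %r9 as the running
+       # read pointer and %rax counting down the block)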
 
+.align 32
 .Loop_squeeze_avx512:
-       mov     @A_jagged_out[$i]-96($A_flat),%r8
-___
-for (my $i=0; $i<25; $i++) {
-$code.=<<___;
-       sub     \$8,$len
-       jc      .Ltail_squeeze_avx512
+       cmp     \$8,$len
+       jb      .Ltail_squeeze_avx512
+
+       mov     (%r9),%r8
+       lea     8(%r9),%r9
        mov     %r8,($out)
        lea     8($out),$out
-       je      .Ldone_squeeze_avx512
-       dec     %eax
-       je      .Lextend_output_avx512
-       mov     @A_jagged_out[$i+1]-96($A_flat),%r8
-___
-}
-$code.=<<___;
-.Lextend_output_avx512:
-       call    __KeccakF1600
+       sub     \$8,$len                # len -= 8
+       jz      .Ldone_squeeze_avx512
+
+       sub     \$1,%rax                # bsz--
+       jnz     .Loop_squeeze_avx512
+
+       #vpermq         @Theta[4],@Theta[4],@Theta[3]
+       #vpermq         @Theta[3],@Theta[4],@Theta[2]
+       #vpermq         @Theta[3],@Theta[3],@Theta[1]
+
+       call            __KeccakF1600
 
        vmovdqu64       $A00,40*0-96($A_flat){$k11111}
-       vmovdqu64       $A01,40*1-96($A_flat){$k11111}
-       vmovdqu64       $A02,40*2-96($A_flat){$k11111}
-       vmovdqu64       $A03,40*3-96($A_flat){$k11111}
-       vmovdqu64       $A04,40*4-96($A_flat){$k11111}
+       vmovdqu64       $A10,40*1-96($A_flat){$k11111}
+       vmovdqu64       $A20,40*2-96($A_flat){$k11111}
+       vmovdqu64       $A30,40*3-96($A_flat){$k11111}
+       vmovdqu64       $A40,40*4-96($A_flat){$k11111}
 
+       lea     -96($A_flat),%r9
        mov     $bsz,%rax
        jmp     .Loop_squeeze_avx512
 
-
 .Ltail_squeeze_avx512:
-       add     \$8,$len
-.Loop_tail_avx512:
-       mov     %r8b,($out)
-       lea     1($out),$out
-       shr     \$8,%r8
-       dec     $len
-       jnz     .Loop_tail_avx512
+       mov     %r9,%rsi
+       mov     $out,%rdi
+       mov     $len,%rcx
+       .byte   0xf3,0xa4               # rep movsb
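+       # (copies the remaining $len<8 bytes straight out of the
+       # linearly stored state)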
 
 .Ldone_squeeze_avx512:
        vzeroupper
@@ -400,19 +494,27 @@ theta_perm:
        .quad   2, 3, 4, 0, 1, 5, 6, 7
        .quad   1, 2, 3, 4, 0, 5, 6, 7
 
-rhotates:
+rhotates1:
        .quad   0,  44, 43, 21, 14, 0, 0, 0     # [0][0] [1][1] [2][2] [3][3] [4][4]
        .quad   18, 1,  6,  25, 8,  0, 0, 0     # [4][0] [0][1] [1][2] [2][3] [3][4]
        .quad   41, 2,  62, 55, 39, 0, 0, 0     # [3][0] [4][1] [0][2] [1][3] [2][4]
        .quad   3,  45, 61, 28, 20, 0, 0, 0     # [2][0] [3][1] [4][2] [0][3] [1][4]
        .quad   36, 10, 15, 56, 27, 0, 0, 0     # [1][0] [2][1] [3][2] [4][3] [0][4]
 
-chi_perm:
-       .quad   0, 4, 3, 2, 1, 5, 6, 7
-       .quad   1, 0, 4, 3, 2, 5, 6, 7
-       .quad   2, 1, 0, 4, 3, 5, 6, 7
-       .quad   3, 2, 1, 0, 4, 5, 6, 7
-       .quad   4, 3, 2, 1, 0, 5, 6, 7
+rhotates0:
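+       # (the same rotation constants as rhotates1 above, pre-arranged
+       # for the linear even-round layout: row y holds, in .quad order,
+       # [y][0] [y][1] [y][2] [y][3] [y][4])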
+       .quad    0,  1, 62, 28, 27, 0, 0, 0
+       .quad   36, 44,  6, 55, 20, 0, 0, 0
+       .quad    3, 10, 43, 25, 39, 0, 0, 0
+       .quad   41, 45, 15, 21,  8, 0, 0, 0
+       .quad   18,  2, 61, 56, 14, 0, 0, 0
+
+pi0_perm:
+       .quad   0, 3, 1, 4, 2, 5, 6, 7
+       .quad   1, 4, 2, 0, 3, 5, 6, 7
+       .quad   2, 0, 3, 1, 4, 5, 6, 7
+       .quad   3, 1, 4, 2, 0, 5, 6, 7
+       .quad   4, 2, 0, 3, 1, 5, 6, 7
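+       # (vpermq index vectors: result lane i takes source lane idx[i];
+       # lanes 5-7 map to themselves so the unused upper lanes stay put)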
+
 
 iotas:
        .quad   0x0000000000000001