- ######################################### Pi + pre-Chi shuffle
- vpermq \$0b01110010,$A41,@T[6] # vpermq \$0b00011011,$A41,$A11
- vpermq \$0b00011011,$A21,@T[5] # vpermq \$0b01110010,$A21,$A41
- vpermq \$0b10001101,$A31,@T[4] # vpermq \$0b10001101,$A31,$A21
- vpermq \$0b10001101,$A20,@T[3] # vpermq \$0b01110010,$A20,$A31
- vmovdqa $A01,@T[2]
- vmovdqa $A11,@T[1]
+ vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0] (lanes listed high to low; mask picks qword 1 from @T[6], rest from @T[2])
+ vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1] (qword 1 from @T[2], rest from @T[4])
+ vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4] (qword 1 from @T[4], rest from @T[3])
+ vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0] (qword 1 from @T[3], rest from @T[2])
+ vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0] (mask merges qword 2 from @T[4] into the partial result)
+ vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1] (qword 2 from @T[5])
+ vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4] (qword 2 from @T[2])
+ vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0] (qword 2 from @T[6])
+ vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0] (mask merges qword 3 from @T[5]; all four lanes now filled)
+ vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1] (qword 3 from @T[6])
+ vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4] (qword 3 from @T[6])
+ vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0] (qword 3 from @T[4])
+ vpandn @T[8],$A31,$A31 # $A31 = ~$A31 & @T[8]; targeting [3][1] [1][2] [4][3] [2][4]
+ vpandn @T[7],$A41,$A41 # $A41 = ~$A41 & @T[7]; targeting [3][2] [1][4] [4][1] [2][3]