Add sha/asm/keccak1600-avx512vl.pl.
#!/usr/bin/env perl
# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX512VL.
#
# December 2017.
#
# This is an adaptation of the AVX2 module that reuses the register data
# layout, but utilizes the new 256-bit AVX512VL instructions. See the AVX2
# module for further information on the layout.
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#                       r=1088(*)
#
# Skylake-X             6.4/+47%
#
# (*)   Corresponds to SHA3-256. The percentage after the slash is the
#       improvement over the scalar keccak1600-x86_64.pl module.

# Digits in variables' names denote right-most coordinates:

my ($A00,       # [0][0] [0][0] [0][0] [0][0]           # %ymm0
    $A01,       # [0][4] [0][3] [0][2] [0][1]           # %ymm1
    $A20,       # [3][0] [1][0] [4][0] [2][0]           # %ymm2
    $A31,       # [2][4] [4][3] [1][2] [3][1]           # %ymm3
    $A21,       # [3][4] [1][3] [4][2] [2][1]           # %ymm4
    $A41,       # [1][4] [2][3] [3][2] [4][1]           # %ymm5
    $A11) =     # [4][4] [3][3] [2][2] [1][1]           # %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],      # [0][0..4]
                [2,2], [6,0], [3,1], [4,2], [5,3],      # [1][0..4]
                [2,0], [4,0], [6,1], [5,2], [3,3],      # [2][0..4]
                [2,3], [3,0], [5,1], [6,2], [4,3],      # [3][0..4]
                [2,1], [5,0], [4,1], [3,2], [6,3]);     # [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);     # ... and now linear
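
# A worked example of the mapping above: A[0][1] sits in slot [1,0], i.e.
# lane 0 of $A01 (%ymm1), so its jagged offset is 8*(1*4+0) = 32.  The
# optional self-check below is an illustrative addition to this write-up
# rather than part of the original module; KECCAK1600_SELFCHECK is a
# hypothetical environment variable, and the output goes to STDERR so the
# assembly emitted on STDOUT is unchanged.
if ($ENV{KECCAK1600_SELFCHECK}) {
    for my $row (0..4) {
        for my $col (0..4) {
            my $off  = $A_jagged[$row*5+$col];  # offset used by absorb/squeeze
            my $slot = int($off/32);            # which of the seven %ymm slots
            my $lane = ($off%32)/8;             # 64-bit lane within that slot
            printf STDERR "A[%d][%d] -> slot %d, lane %d (offset %d)\n",
                   $row, $col, $slot, $lane, $off;
        }
    }
}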

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
my ($R20,$R01,$R31,$R21,$R41,$R11) = map("%ymm$_",(16..21));

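# The main AVX512VL gain over the AVX2 code path is vpternlogq, which
# evaluates an arbitrary 3-input Boolean function selected by an 8-bit
# immediate: bit (a<<2 | b<<1 | c) of the immediate gives the result, with
# `a' taken from the destination register.  This module uses 0x96, the
# three-way XOR that fuses the Theta/Iota XOR chains, and 0xC6, which is
# b ^ (~a & c), i.e. the Chi combination with the destination supplying
# the complemented input.  The check below is an illustrative addition for
# this write-up (not part of the original module) and has no effect on the
# generated code:
for my $i (0..7) {
    my ($a,$b,$c) = (($i>>2)&1, ($i>>1)&1, $i&1);
    die "0x96 is not 3-way XOR" if ((0x96>>$i)&1) != ($a^$b^$c);
    die "0xC6 is not b^(~a&c)"  if ((0xC6>>$i)&1) != (($b ^ (~$a & $c)) & 1);
}
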
$code.=<<___;
.text

.type   __KeccakF1600,\@function
.align  32
__KeccakF1600:
        lea             iotas(%rip),%r10
        mov             \$24,%eax
        jmp             .Loop_avx512vl

.align  32
.Loop_avx512vl:
        ######################################### Theta
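        # C00 ends up holding the parity of column 0 broadcast across all
        # four lanes, while C14 holds the parities of columns 1..4; a single
        # 3-way-XOR vpternlogq stands in for a pair of vpxor.  D00 and D14
        # are then the usual D[x] = C[x-1] ^ ROL64(C[x+1],1), assembled with
        # vpermq/vpblendd so that each register sees the D word matching its
        # lane order.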
        vpshufd         \$0b01001110,$A20,$C00
        vpxor           $A31,$A41,$C14
        vpxor           $A11,$A21,@T[2]
        vpternlogq      \$0x96,$A01,@T[2],$C14  # C[1..4]

        vpxor           $A20,$C00,$C00
        vpermq          \$0b01001110,$C00,@T[0]

        vpermq          \$0b10010011,$C14,@T[4]
        vprolq          \$1,$C14,@T[1]          # ROL64(C[1..4],1)

        vpermq          \$0b00111001,@T[1],$D14
        vpxor           @T[4],@T[1],$D00
        vpermq          \$0b00000000,$D00,$D00  # D[0..0] = ROL64(C[1],1) ^ C[4]

        vpternlogq      \$0x96,@T[0],$A00,$C00  # C[0..0]
        vprolq          \$1,$C00,@T[1]          # ROL64(C[0..0],1)

        vpxor           $D00,$A00,$A00          # ^= D[0..0]

        vpblendd        \$0b11000000,@T[1],$D14,$D14
        vpblendd        \$0b00000011,$C00,@T[4],@T[0]

        ######################################### Rho + Pi + pre-Chi shuffle
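        # Rho relies on vprolvq with the per-lane rotation counts preloaded
        # into R20/R01/R31/R21/R41/R11, so the four lanes of each register
        # rotate by different amounts in one instruction, and the Theta XOR
        # with D is folded into the same flow as another 3-way vpternlogq.
        # The interleaved vpermq shuffles begin moving lanes towards the
        # operand order Chi expects (the Pi permutation).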
         vpxor          $D00,$A20,$A20          # ^= D[0..0] from Theta
        vprolvq         $R20,$A20,$A20

         vpternlogq     \$0x96,@T[0],$D14,$A31  # ^= D[1..4] from Theta
        vprolvq         $R31,$A31,$A31

         vpternlogq     \$0x96,@T[0],$D14,$A21  # ^= D[1..4] from Theta
        vprolvq         $R21,$A21,$A21

         vpternlogq     \$0x96,@T[0],$D14,$A41  # ^= D[1..4] from Theta
        vprolvq         $R41,$A41,$A41

         vpermq         \$0b10001101,$A20,@T[3] # $A20 -> future $A31
         vpermq         \$0b10001101,$A31,@T[4] # $A31 -> future $A21
         vpternlogq     \$0x96,@T[0],$D14,$A11  # ^= D[1..4] from Theta
        vprolvq         $R11,$A11,@T[1]         # $A11 -> future $A01

         vpermq         \$0b00011011,$A21,@T[5] # $A21 -> future $A41
         vpermq         \$0b01110010,$A41,@T[6] # $A41 -> future $A11
         vpternlogq     \$0x96,@T[0],$D14,$A01  # ^= D[1..4] from Theta
        vprolvq         $R01,$A01,@T[2]         # $A01 -> future $A20

        ######################################### Chi
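        # Chi needs, for every output lane, the lanes one and two columns to
        # its right.  The vpblendd cascades below assemble those neighbour
        # vectors 64 bits at a time from the rotated registers, and each
        # vpternlogq with immediate 0xC6 then evaluates x ^ (~y & z) for a
        # whole register, so the non-linear step costs one instruction per
        # output register instead of a vpandn/vpxor pair.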
        vpblendd        \$0b00001100,@T[6],@T[2],$A31   #               [4][4] [2][0]
        vpblendd        \$0b00001100,@T[2],@T[4],@T[8]  #               [4][0] [2][1]
         vpblendd       \$0b00001100,@T[4],@T[3],$A41   #               [4][2] [2][4]
         vpblendd       \$0b00001100,@T[3],@T[2],@T[7]  #               [4][3] [2][0]
        vpblendd        \$0b00110000,@T[4],$A31,$A31    #        [1][3] [4][4] [2][0]
        vpblendd        \$0b00110000,@T[5],@T[8],@T[8]  #        [1][4] [4][0] [2][1]
         vpblendd       \$0b00110000,@T[2],$A41,$A41    #        [1][0] [4][2] [2][4]
         vpblendd       \$0b00110000,@T[6],@T[7],@T[7]  #        [1][1] [4][3] [2][0]
        vpblendd        \$0b11000000,@T[5],$A31,$A31    # [3][2] [1][3] [4][4] [2][0]
        vpblendd        \$0b11000000,@T[6],@T[8],@T[8]  # [3][3] [1][4] [4][0] [2][1]
         vpblendd       \$0b11000000,@T[6],$A41,$A41    # [3][3] [1][0] [4][2] [2][4]
         vpblendd       \$0b11000000,@T[4],@T[7],@T[7]  # [3][4] [1][1] [4][3] [2][0]
        vpternlogq      \$0xC6,@T[8],@T[3],$A31         # [3][1] [1][2] [4][3] [2][4]
         vpternlogq     \$0xC6,@T[7],@T[5],$A41         # [3][2] [1][4] [4][1] [2][3]

        vpsrldq         \$8,@T[1],@T[0]
        vpandn          @T[0],@T[1],@T[0]       # targeting  [0][0] [0][0] [0][0] [0][0]

        vpblendd        \$0b00001100,@T[2],@T[5],$A11   #               [4][0] [2][3]
        vpblendd        \$0b00001100,@T[5],@T[3],@T[8]  #               [4][1] [2][4]
        vpblendd        \$0b00110000,@T[3],$A11,$A11    #        [1][2] [4][0] [2][3]
        vpblendd        \$0b00110000,@T[4],@T[8],@T[8]  #        [1][3] [4][1] [2][4]
        vpblendd        \$0b11000000,@T[4],$A11,$A11    # [3][4] [1][2] [4][0] [2][3]
        vpblendd        \$0b11000000,@T[2],@T[8],@T[8]  # [3][0] [1][3] [4][1] [2][4]
        vpternlogq      \$0xC6,@T[8],@T[6],$A11         # [3][3] [1][1] [4][4] [2][2]

          vpermq        \$0b00011110,@T[1],$A21         # [0][1] [0][2] [0][4] [0][3]
          vpblendd      \$0b00110000,$A00,$A21,@T[8]    # [0][1] [0][0] [0][4] [0][3]
          vpermq        \$0b00111001,@T[1],$A01         # [0][1] [0][4] [0][3] [0][2]
          vpblendd      \$0b11000000,$A00,$A01,$A01     # [0][0] [0][4] [0][3] [0][2]

        vpblendd        \$0b00001100,@T[5],@T[4],$A20   #               [4][1] [2][1]
        vpblendd        \$0b00001100,@T[4],@T[6],@T[7]  #               [4][2] [2][2]
        vpblendd        \$0b00110000,@T[6],$A20,$A20    #        [1][1] [4][1] [2][1]
        vpblendd        \$0b00110000,@T[3],@T[7],@T[7]  #        [1][2] [4][2] [2][2]
        vpblendd        \$0b11000000,@T[3],$A20,$A20    # [3][1] [1][1] [4][1] [2][1]
        vpblendd        \$0b11000000,@T[5],@T[7],@T[7]  # [3][2] [1][2] [4][2] [2][2]
        vpternlogq      \$0xC6,@T[7],@T[2],$A20         # [3][0] [1][0] [4][0] [2][0]

         vpermq         \$0b00000000,@T[0],@T[0]        # [0][0] [0][0] [0][0] [0][0]
         vpermq         \$0b00011011,$A31,$A31          # post-Chi shuffle
         vpermq         \$0b10001101,$A41,$A41
         vpermq         \$0b01110010,$A11,$A11

        vpblendd        \$0b00001100,@T[3],@T[6],$A21   #               [4][3] [2][2]
        vpblendd        \$0b00001100,@T[6],@T[5],@T[7]  #               [4][4] [2][3]
        vpblendd        \$0b00110000,@T[5],$A21,$A21    #        [1][4] [4][3] [2][2]
        vpblendd        \$0b00110000,@T[2],@T[7],@T[7]  #        [1][0] [4][4] [2][3]
        vpblendd        \$0b11000000,@T[2],$A21,$A21    # [3][0] [1][4] [4][3] [2][2]
        vpblendd        \$0b11000000,@T[3],@T[7],@T[7]  # [3][1] [1][0] [4][4] [2][3]

        vpternlogq      \$0xC6,@T[8],@T[1],$A01         # [0][4] [0][3] [0][2] [0][1]
        vpternlogq      \$0xC6,@T[7],@T[4],$A21         # [3][4] [1][3] [4][2] [2][1]

        ######################################### Iota
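        # Iota: the round constant, replicated four times in the table below
        # so it can be applied with a plain 256-bit load, is XORed into
        # A[0][0] together with the pending Chi term held in T[0], again as
        # a single vpternlogq.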
        vpternlogq      \$0x96,(%r10),@T[0],$A00
        lea             32(%r10),%r10

        dec             %eax
        jnz             .Loop_avx512vl

        ret
.size   __KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp;         # in squeeze

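# Both entry points follow the C contract of the generic implementation in
# crypto/sha/keccak1600.c:
#
#   size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#                      size_t len, size_t r);
#   void   SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#                       size_t len, size_t r);
#
# so in the x86_64 calling convention $A_flat, $inp/$out, $len and $bsz
# arrive in %rdi, %rsi, %rdx and %rcx.  SHA3_absorb consumes whole r-byte
# blocks and returns the number of leftover bytes (less than r), which the
# caller is expected to buffer until the next call.
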
$code.=<<___;
.globl  SHA3_absorb
.type   SHA3_absorb,\@function
.align  32
SHA3_absorb:
        mov     %rsp,%r11

        lea     -240(%rsp),%rsp
        and     \$-32,%rsp

        lea     96($A_flat),$A_flat
        lea     96($inp),$inp
        lea     96(%rsp),%r10
        lea     rhotates_left(%rip),%r8

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00       # load A[5][5]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        vmovdqa64       0*32(%r8),$R20          # load "rhotate" indices
        vmovdqa64       1*32(%r8),$R01
        vmovdqa64       2*32(%r8),$R31
        vmovdqa64       3*32(%r8),$R21
        vmovdqa64       4*32(%r8),$R41
        vmovdqa64       5*32(%r8),$R11

        vpxor           @T[0],@T[0],@T[0]
        vmovdqa         @T[0],32*2-96(%r10)     # zero transfer area on stack
        vmovdqa         @T[0],32*3-96(%r10)
        vmovdqa         @T[0],32*4-96(%r10)
        vmovdqa         @T[0],32*5-96(%r10)
        vmovdqa         @T[0],32*6-96(%r10)

.Loop_absorb_avx512vl:
        mov             $bsz,%rax
        sub             $bsz,$len
        jc              .Ldone_absorb_avx512vl

        shr             \$3,%eax
        vpbroadcastq    0-96($inp),@T[0]
        vmovdqu         8-96($inp),@T[1]
        sub             \$4,%eax
___
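
# The first five state words of each block were already picked up above
# (lane 0 broadcast in T[0], lanes 1-4 in T[1]); the loop below scatters
# the remaining words of the block, one at a time, into the on-stack
# transfer area at their jagged offsets.  Words beyond the block size are
# left alone (the area was zeroed once before .Loop_absorb_avx512vl), so
# the vpxor block after .Labsorved_avx512vl always folds a full 25-lane
# image into the register state, whatever r is.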
for(my $i=5; $i<25; $i++) {
$code.=<<___
        dec     %eax
        jz      .Labsorved_avx512vl
        mov     8*$i-96($inp),%r8
        mov     %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorved_avx512vl:
        lea     ($inp,$bsz),$inp

        vpxor   @T[0],$A00,$A00
        vpxor   @T[1],$A01,$A01
        vpxor   32*2-96(%r10),$A20,$A20
        vpxor   32*3-96(%r10),$A31,$A31
        vpxor   32*4-96(%r10),$A21,$A21
        vpxor   32*5-96(%r10),$A41,$A41
        vpxor   32*6-96(%r10),$A11,$A11

        call    __KeccakF1600

        lea     96(%rsp),%r10
        jmp     .Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        vzeroupper

        lea     (%r11),%rsp
        lea     ($len,$bsz),%rax                # return value
        ret
.size   SHA3_absorb,.-SHA3_absorb

.globl  SHA3_squeeze
.type   SHA3_squeeze,\@function
.align  32
SHA3_squeeze:
        mov     %rsp,%r11

        lea     96($A_flat),$A_flat
        lea     rhotates_left(%rip),%r8
        shr     \$3,$bsz

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00
        vpxor           @T[0],@T[0],@T[0]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        vmovdqa64       0*32(%r8),$R20          # load "rhotate" indices
        vmovdqa64       1*32(%r8),$R01
        vmovdqa64       2*32(%r8),$R31
        vmovdqa64       3*32(%r8),$R21
        vmovdqa64       4*32(%r8),$R41
        vmovdqa64       5*32(%r8),$R11

        mov     $bsz,%rax

.Loop_squeeze_avx512vl:
        mov     @A_jagged[0]-96($A_flat),%r8
___
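# The unrolled loop below walks the 25 state words in canonical
# A[0][0], A[0][1], ... order, pulling each one from its jagged offset in
# the stored register image (hence the -120 bias: the image keeps A[0][0]
# in its leading 8 bytes, followed by six 32-byte register slots).  When a
# full block has been emitted but more output is wanted,
# .Lextend_output_avx512vl runs the permutation again and the walk restarts.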
for (my $i=0; $i<25; $i++) {
$code.=<<___;
        sub     \$8,$len
        jc      .Ltail_squeeze_avx512vl
        mov     %r8,($out)
        lea     8($out),$out
        je      .Ldone_squeeze_avx512vl
        dec     %eax
        je      .Lextend_output_avx512vl
        mov     @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx512vl:
        call    __KeccakF1600

        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        mov     $bsz,%rax
        jmp     .Loop_squeeze_avx512vl

.Ltail_squeeze_avx512vl:
        add     \$8,$len
.Loop_tail_avx512vl:
        mov     %r8b,($out)
        lea     1($out),$out
        shr     \$8,%r8
        dec     $len
        jnz     .Loop_tail_avx512vl

.Ldone_squeeze_avx512vl:
        vzeroupper

        lea     (%r11),%rsp
        ret
.size   SHA3_squeeze,.-SHA3_squeeze

.align  64
rhotates_left:
        .quad   3,      18,     36,     41      # [2][0] [4][0] [1][0] [3][0]
        .quad   1,      62,     28,     27      # [0][1] [0][2] [0][3] [0][4]
        .quad   45,     6,      56,     39      # [3][1] [1][2] [4][3] [2][4]
        .quad   10,     61,     55,     8       # [2][1] [4][2] [1][3] [3][4]
        .quad   2,      15,     25,     20      # [4][1] [3][2] [2][3] [1][4]
        .quad   44,     43,     21,     14      # [1][1] [2][2] [3][3] [4][4]
iotas:
        .quad   0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
        .quad   0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
        .quad   0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
        .quad   0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
        .quad   0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
        .quad   0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
        .quad   0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
        .quad   0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
        .quad   0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
        .quad   0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
        .quad   0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
        .quad   0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
        .quad   0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
        .quad   0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
        .quad   0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
        .quad   0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
        .quad   0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz  "Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
close STDOUT;