crypto/sha/asm/keccak1600-mmx.pl

   1 #!/usr/bin/env perl
   2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 # ====================================================================
  10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11 # project. The module is, however, dual licensed under OpenSSL and
  12 # CRYPTOGAMS licenses depending on where you obtain it. For further
  13 # details see http://www.openssl.org/~appro/cryptogams/.
  14 # ====================================================================
  15 #
  16 # Keccak-1600 for x86 MMX.
  17 #
  18 # June 2017.
  19 #
  20 # Below code is KECCAK_2X implementation (see sha/keccak1600.c) with
  21 # C[5] held in register bank and D[5] offloaded to memory. Though
  22 # instead of actually unrolling the loop pair-wise I simply flip
  23 # pointers to T[][] and A[][] at the end of round. Since number of
  24 # rounds is even, last round writes to A[][] and everything works out.
  25 # It's argued that MMX is the only code path meaningful to implement
  26 # for x86. This is because non-MMX-capable processors are an extinct
  27 # breed, and they as well can lurk executing compiler-generated code.
  28 # For reference gcc-5.x-generated KECCAK_2X code takes 89 cycles per
  29 # processed byte on Pentium. Which is fair result. But older compilers
  30 # produce worse code. On the other hand one can wonder why not 128-bit
  31 # SSE2? Well, SSE2 won't provide double improvement, rather far from
  32 # that, if any at all on some processors, because it will take extra
  33 # permutations and inter-bank data transfers. Besides, contemporary
  34 # CPUs are better off executing 64-bit code, and it makes lesser sense
  35 # to invest into fancy 32-bit code. And the decision doesn't seem to
  36 # be inadequate, if one compares below results to "64-bit platforms in
  37 # 32-bit mode" SIMD data points available at
  38 # http://keccak.noekeon.org/sw_performance.html.
  39 #
  40 ########################################################################
  41 # Numbers are cycles per processed byte out of large message.
  42 #
  43 #                       r=1088(i)
  44 #
  45 # PIII                  30/+150%
  46 # Pentium M             27/+150%
  47 # P4                    40/+85%
  48 # Core 2                19/+170%
  49 # Sandy Bridge(ii)      18/+140%
  50 # Atom                  33/+180%
  51 # Silvermont(ii)        30/+180%
  52 # VIA Nano(ii)          43/+60%
  53 # Sledgehammer(ii)(iii) 24/+130%
  54 #
  55 # (i)   Corresponds to SHA3-256. Numbers after slash are improvement
  56 #       coefficients over KECCAK_2X [with bit interleave and lane
  57 #       complementing] position-independent *scalar* code generated
  58 #       by gcc-5.x. It's not exactly fair comparison, but it's a
  59 #       datapoint...
  60 # (ii)  64-bit processor executing 32-bit code.
  61 # (iii) Result is considered to be representative even for older AMD
  62 #       processors.
  63
  64 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;     # directory part of this script's path
  65 push(@INC,"${dir}","${dir}../../perlasm");  # look for x86asm.pl there and in ../../perlasm
  66 require "x86asm.pl";
  67
     # The last command-line argument names the output file.  Redirect STDOUT to
     # it using the three-argument form of open (the file name is never parsed
     # for a mode) and abort if the file cannot be created, so that a failed
     # open does not silently send the generated assembly to the original STDOUT.
     # As before, nothing is opened when no (or an empty) argument is given.
  68 $output=pop and (open(STDOUT,">",$output) or die "can't open $output: $!");
  69
     # The first argument is handed to asm_init() as-is; the second argument is
     # true when the last remaining argument is the literal "386".
     # NOTE(review): presumably flavour selection and a 386-only restriction --
     # confirm against x86asm.pl.
  70 &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
  71
  72 my @C = map { "mm$_" } 0..4;           # C[0..4] live in mm0..mm4
  73 my @T = map { "mm$_" } 5..7;           # scratch T[0..2] live in mm5..mm7
  74 my @A = map { my $lane = $_;           # $A[y][x]: byte offset of lane 5*y+x, biased by -100
  75               [ map { 8*($lane+$_)-100 } 0..4 ] } (0,5,10,15,20);
  76 my @D = map { 8*$_+4 } 0..4;           # byte offsets of D[0..4] relative to %esp
  77 my @rhotates = ([  0,  1, 62, 28, 27 ],        # per-lane rotation counts, indexed [y][x]
  78                 [ 36, 44,  6, 55, 20 ],
  79                 [  3, 10, 43, 25, 39 ],
  80                 [ 41, 45, 15, 21,  8 ],
  81                 [ 18,  2, 61, 56, 14 ]);
  82
  83 &static_label("iotas");                # declare the label of the round-constant table
  84
  85 &function_begin_B("_KeccakF1600");
             # Internal Keccak-f[1600] core: emits a single round body that is
             # executed 24 times.  It is entered with a plain `call` from the
             # public entry points in this file, which establish this contract:
             #   esi - current state A[][], pointer biased by +100 bytes
             #   edi - scratch state T[][], pointer biased by +100 bytes
             #   ebx - pointer to the next entry of the iotas table
             #   esp - D[0..4] are spilled at the @D offsets from esp
             # ecx and mm0-mm7 are overwritten.  esi and edi are exchanged at the
             # end of every round; ebx is advanced by 8 per round and rewound
             # before returning.
             # NOTE(review): a one-column extra indent on an instruction appears
             # to mark a second, interleaved dependency chain -- do not reorder.
             #
             # Preload C[x] = A[4][x]; the loop XORs rows 0..3 on top of it.  The
             # last Chi of each round leaves the new row 4 in C[0..4], so this
             # load is needed only once.
  86         &movq   (@C[0],&QWP($A[4][0],"esi"));
  87         &movq   (@C[1],&QWP($A[4][1],"esi"));
  88         &movq   (@C[2],&QWP($A[4][2],"esi"));
  89         &movq   (@C[3],&QWP($A[4][3],"esi"));
  90         &movq   (@C[4],&QWP($A[4][4],"esi"));
  91
  92         &mov    ("ecx",24);                     # loop counter
  93         &jmp    (&label("loop"));               # presumably skips padding inserted by the label's alignment argument
  94
  95     &set_label("loop",16);
  96         ######################################### Theta
             # Accumulate the column parities: C[x] ^= A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x]
  97         &pxor   (@C[0],&QWP($A[0][0],"esi"));
  98         &pxor   (@C[1],&QWP($A[0][1],"esi"));
  99         &pxor   (@C[2],&QWP($A[0][2],"esi"));
 100         &pxor   (@C[3],&QWP($A[0][3],"esi"));
 101         &pxor   (@C[4],&QWP($A[0][4],"esi"));
 102
 103         &pxor   (@C[0],&QWP($A[1][0],"esi"));
 104         &pxor   (@C[1],&QWP($A[1][1],"esi"));
 105         &pxor   (@C[2],&QWP($A[1][2],"esi"));
 106         &pxor   (@C[3],&QWP($A[1][3],"esi"));
 107         &pxor   (@C[4],&QWP($A[1][4],"esi"));
 108
 109         &pxor   (@C[0],&QWP($A[2][0],"esi"));
 110         &pxor   (@C[1],&QWP($A[2][1],"esi"));
 111         &pxor   (@C[2],&QWP($A[2][2],"esi"));
 112         &pxor   (@C[3],&QWP($A[2][3],"esi"));
 113         &pxor   (@C[4],&QWP($A[2][4],"esi"));
 114
 115         &pxor   (@C[2],&QWP($A[3][2],"esi"));
 116         &pxor   (@C[0],&QWP($A[3][0],"esi"));
 117         &pxor   (@C[1],&QWP($A[3][1],"esi"));
 118         &pxor   (@C[3],&QWP($A[3][3],"esi"));
 119          &movq  (@T[0],@C[2]);
 120         &pxor   (@C[4],&QWP($A[3][4],"esi"));
 121
             # Each ROL64(v,1) below is built from a psllq by 1 and a psrlq by 63
             # whose results are XORed together.
 122          &movq  (@T[2],@C[2]);
 123          &psrlq (@T[0],63);
 124         &movq   (@T[1],@C[0]);
 125          &psllq (@T[2],1);
 126          &pxor  (@T[0],@C[0]);
 127         &psrlq  (@C[0],63);
 128          &pxor  (@T[0],@T[2]);
 129         &psllq  (@T[1],1);
 130          &movq  (@T[2],@C[1]);
 131          &movq  (&QWP(@D[1],"esp"),@T[0]);      # D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
 132
 133         &pxor   (@T[1],@C[0]);
 134          &psrlq (@T[2],63);
 135         &pxor   (@T[1],@C[3]);
 136          &movq  (@C[0],@C[1]);
 137         &movq   (&QWP(@D[4],"esp"),@T[1]);      # D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
 138
 139          &psllq (@C[0],1);
 140          &pxor  (@T[2],@C[4]);
 141          &pxor  (@C[0],@T[2]);
 142
 143         &movq   (@T[2],@C[3]);
 144         &psrlq  (@C[3],63);
 145          &movq  (&QWP(@D[0],"esp"),@C[0]);      # D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
 146         &psllq  (@T[2],1);
 147          &movq  (@T[0],@C[4]);
 148          &psrlq (@C[4],63);
 149         &pxor   (@C[1],@C[3]);
 150          &psllq (@T[0],1);
 151         &pxor   (@C[1],@T[2]);
 152          &pxor  (@C[2],@C[4]);
 153         &movq   (&QWP(@D[2],"esp"),@C[1]);      # D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
 154          &pxor  (@C[2],@T[0]);
 155
 156         ######################################### first Rho(0) is special
             # Row 0 of the output takes the diagonal lanes A[i][i].  It is handled
             # inline because most D[] values are still in registers here:
             # C[0]=D[0], C[1]=D[2], C[2]=D[3], T[1]=D[4]; only D[1] is read back.
 157         &movq   (@C[3],&QWP($A[3][3],"esi"));
 158          &movq  (&QWP(@D[3],"esp"),@C[2]);      # D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
 159         &pxor   (@C[3],@C[2]);
 160          &movq  (@C[4],&QWP($A[4][4],"esi"));
 161         &movq   (@T[2],@C[3]);
 162         &psrlq  (@C[3],64-$rhotates[3][3]);
 163          &pxor  (@C[4],@T[1]);
 164         &psllq  (@T[2],$rhotates[3][3]);
 165          &movq  (@T[1],@C[4]);
 166          &psrlq (@C[4],64-$rhotates[4][4]);
 167         &por    (@C[3],@T[2]);          # C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
 168          &psllq (@T[1],$rhotates[4][4]);
 169
 170         &movq   (@C[2],&QWP($A[2][2],"esi"));
 171          &por   (@C[4],@T[1]);          # C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
 172         &pxor   (@C[2],@C[1]);
 173          &movq  (@C[1],&QWP($A[1][1],"esi"));
 174         &movq   (@T[1],@C[2]);
 175         &psrlq  (@C[2],64-$rhotates[2][2]);
 176          &pxor  (@C[1],&QWP(@D[1],"esp"));
 177         &psllq  (@T[1],$rhotates[2][2]);
 178
 179          &movq  (@T[2],@C[1]);
 180          &psrlq (@C[1],64-$rhotates[1][1]);
 181         &por    (@C[2],@T[1]);          # C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
 182          &psllq (@T[2],$rhotates[1][1]);
 183         &pxor   (@C[0],&QWP($A[0][0],"esi")); # /* rotate by 0 */  /* D[0] */
 184          &por   (@C[1],@T[2]);          # C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
 185
 186 sub Chi {                               ######### regular Chi step
         # Emit the Chi step for output row $y.
         #
         #   $y    - output row index; the row is stored through edi at $A[$y][0..4].
         #           For $y == 0 the round constant at [ebx] is XORed into lane 0
         #           and ebx is advanced by 8 to the next constant.
         #   $xrho - optional column index; when defined, T[2] is preloaded with
         #           A[0][$xrho] ^ D[$xrho], which the following &Rho($xrho)
         #           expects to find there.
         #
         # On entry C[0..4] hold the rotated lanes of the row.  C[] and T[] are
         # overwritten; the results live only in memory afterwards.
         # The sub takes arguments, so it is declared without the former empty
         # "()" prototype; callers in this file invoke it with a leading "&".
 187     my ($y,$xrho) = @_;
 188
 189         &movq   (@T[0],@C[1]);
 190          &movq  (@T[1],@C[2]);
 191         &pandn  (@T[0],@C[2]);
 192          &pandn (@C[2],@C[3]);
 193         &pxor   (@T[0],@C[0]);
 194          &pxor  (@C[2],@C[1]);
 195         &pxor   (@T[0],&QWP(0,"ebx"))           if ($y == 0);
 196         &lea    ("ebx",&DWP(8,"ebx"))           if ($y == 0);
 197
 198         &movq   (@T[2],@C[3]);
 199         &movq   (&QWP($A[$y][0],"edi"),@T[0]);  # R[y][0] = C[0] ^ (~C[1] & C[2]) [^ iotas[i] when y == 0];
 200          &movq  (@T[0],@C[4]);
 201         &pandn  (@C[3],@C[4]);
 202          &pandn (@C[4],@C[0]);
 203         &pxor   (@C[3],@T[1]);
 204          &movq  (&QWP($A[$y][1],"edi"),@C[2]);  # R[y][1] = C[1] ^ (~C[2] & C[3]);
 205          &pxor  (@C[4],@T[2]);
 206           &movq (@T[2],&QWP($A[0][$xrho],"esi"))        if (defined($xrho));
 207
 208          &movq  (&QWP($A[$y][2],"edi"),@C[3]);  # R[y][2] = C[2] ^ (~C[3] & C[4]);
 209         &pandn  (@C[0],@C[1]);
 210          &movq  (&QWP($A[$y][3],"edi"),@C[4]);  # R[y][3] = C[3] ^ (~C[4] & C[0]);
 211         &pxor   (@C[0],@T[0]);
 212           &pxor (@T[2],&QWP(@D[$xrho],"esp"))           if (defined($xrho));
 213         &movq   (&QWP($A[$y][4],"edi"),@C[0]);  # R[y][4] = C[4] ^ (~C[0] & C[1]);
 214 }
 215         &Chi    (0, 3);                 # row 0 (adds iota); preloads T[2] for Rho(3)
 216
 217 sub Rho {                               ######### regular Rho step
         # Emit the Theta-apply + Rho step for one output row.
         #
         #   $x - column of row 0 that starts the row; lane i of the result is
         #        C[i] = ROL64(A[i][($x+i)%5] ^ D[($x+i)%5], rhotates[i][($x+i)%5]).
         #
         # Expects T[2] to already hold A[0][$x] ^ D[$x]; that load was moved
         # into the preceding Chi (see the commented-out lines below).  Each
         # rotate is built from a psllq/psrlq pair merged with por.
         # The sub takes an argument, so it is declared without the former empty
         # "()" prototype; callers in this file invoke it with a leading "&".
 218     my $x = shift;
 219
 220         #&movq  (@T[2],&QWP($A[0][$x],"esi"));  # moved to Chi
 221         #&pxor  (@T[2],&QWP(@D[$x],"esp"));     # moved to Chi
 222         &movq   (@C[0],@T[2]);
 223         &psrlq  (@T[2],64-$rhotates[0][$x]);
 224          &movq  (@C[1],&QWP($A[1][($x+1)%5],"esi"));
 225         &psllq  (@C[0],$rhotates[0][$x]);
 226          &pxor  (@C[1],&QWP(@D[($x+1)%5],"esp"));
 227         &por    (@C[0],@T[2]);          # C[0] = ROL64(A[0][x] ^ D[x], rhotates[0][x]);
 228
 229          &movq  (@T[1],@C[1]);
 230          &psrlq (@C[1],64-$rhotates[1][($x+1)%5]);
 231         &movq   (@C[2],&QWP($A[2][($x+2)%5],"esi"));
 232          &psllq (@T[1],$rhotates[1][($x+1)%5]);
 233         &pxor   (@C[2],&QWP(@D[($x+2)%5],"esp"));
 234          &por   (@C[1],@T[1]);          # C[1] = ROL64(A[1][x+1] ^ D[x+1], rhotates[1][x+1]);
 235
 236         &movq   (@T[2],@C[2]);
 237         &psrlq  (@C[2],64-$rhotates[2][($x+2)%5]);
 238          &movq  (@C[3],&QWP($A[3][($x+3)%5],"esi"));
 239         &psllq  (@T[2],$rhotates[2][($x+2)%5]);
 240          &pxor  (@C[3],&QWP(@D[($x+3)%5],"esp"));
 241         &por    (@C[2],@T[2]);          # C[2] = ROL64(A[2][x+2] ^ D[x+2], rhotates[2][x+2]);
 242
 243          &movq  (@T[0],@C[3]);
 244          &psrlq (@C[3],64-$rhotates[3][($x+3)%5]);
 245         &movq   (@C[4],&QWP($A[4][($x+4)%5],"esi"));
 246          &psllq (@T[0],$rhotates[3][($x+3)%5]);
 247         &pxor   (@C[4],&QWP(@D[($x+4)%5],"esp"));
 248          &por   (@C[3],@T[0]);          # C[3] = ROL64(A[3][x+3] ^ D[x+3], rhotates[3][x+3]);
 249
 250         &movq   (@T[1],@C[4]);
 251         &psrlq  (@C[4],64-$rhotates[4][($x+4)%5]);
 252         &psllq  (@T[1],$rhotates[4][($x+4)%5]);
 253         &por    (@C[4],@T[1]);          # C[4] = ROL64(A[4][x+4] ^ D[x+4], rhotates[4][x+4]);
 254 }
 255         &Rho    (3);    &Chi    (1, 1);
             # Remaining rows of the round.  Each Chi(y, x) stores output row y and
             # preloads T[2] for the Rho(x) that follows; Rho(x) in turn produces
             # the inputs of the next Chi.  The chain is
             # Rho(3) Chi(1) Rho(1) Chi(2) Rho(4) Chi(3) Rho(2) Chi(4),
             # with the final Chi(4) open-coded below.
 256         &Rho    (1);    &Chi    (2, 4);
 257         &Rho    (4);    &Chi    (3, 2);
 258         &Rho    (2);    ###&Chi (4);
 259
             # The last Chi writes row 4 through esi *after* esi and edi have been
             # exchanged (three XORs instead of xchg), i.e. into the buffer that
             # becomes the source of the next round.  Unlike the regular Chi it
             # updates C[0..4] in place, so they already hold A[4][0..4] when the
             # loop restarts.  The original C[0] is kept in T[0] and the original
             # C[1] is parked in the D[1] stack slot, which Theta rewrites anyway.
 260         &movq   (@T[0],@C[0]);          ######### last Chi(4) is special
 261          &xor   ("edi","esi");          # &xchg ("esi","edi");
 262         &movq   (&QWP(@D[1],"esp"),@C[1]);
 263          &xor   ("esi","edi");
 264          &xor   ("edi","esi");
 265
 266         &movq   (@T[1],@C[1]);
 267          &movq  (@T[2],@C[2]);
 268         &pandn  (@T[1],@C[2]);
 269          &pandn (@T[2],@C[3]);
 270         &pxor   (@C[0],@T[1]);
 271          &pxor  (@C[1],@T[2]);
 272
 273         &movq   (@T[1],@C[3]);
 274          &movq  (&QWP($A[4][0],"esi"),@C[0]);   # R[4][0] = C[0] ^= (~C[1] & C[2]);
 275         &pandn  (@T[1],@C[4]);
 276          &movq  (&QWP($A[4][1],"esi"),@C[1]);   # R[4][1] = C[1] ^= (~C[2] & C[3]);
 277         &pxor   (@C[2],@T[1]);
 278          &movq  (@T[2],@C[4]);
 279         &movq   (&QWP($A[4][2],"esi"),@C[2]);   # R[4][2] = C[2] ^= (~C[3] & C[4]);
 280
 281         &pandn  (@T[2],@T[0]);          # T[0] is the saved, pre-update C[0]
 282          &pandn (@T[0],&QWP(@D[1],"esp"));      # D[1] slot holds the saved, pre-update C[1]
 283         &pxor   (@C[3],@T[2]);
 284          &pxor  (@C[4],@T[0]);
 285         &movq   (&QWP($A[4][3],"esi"),@C[3]);   # R[4][3] = C[3] ^= (~C[4] & saved C[0]);
 286         &sub    ("ecx",1);                      # one round done; sets ZF for the jnz below
 287          &movq  (&QWP($A[4][4],"esi"),@C[4]);   # R[4][4] = C[4] ^= (~saved C[0] & saved C[1]);
 288         &jnz    (&label("loop"));
 289
 290         &lea    ("ebx",&DWP(-192,"ebx"));       # rewind iotas (24 rounds * 8 bytes)
 291         &ret    ();
 292 &function_end_B("_KeccakF1600");
 293
 294 &function_begin("KeccakF1600");
             # Public entry point: apply the permutation once to the state whose
             # address is the first argument.
             # Stack frame: 240 bytes = 5*8 bytes for D[0..4] followed by 200 bytes
             # for the scratch state T[][]; ebp keeps the original esp so the frame
             # can be dropped after alignment.
             # NOTE(review): function_begin/function_end presumably emit the
             # register-saving prologue/epilogue and the final ret -- see x86asm.pl.
 295         &mov    ("esi",&wparam(0));             # A[][]
 296         &mov    ("ebp","esp");                  # remember esp for the epilogue
 297         &sub    ("esp",240);
 298         &call   (&label("pic_point"));          # call/pop pair: get our own address (position-independent)
 299     &set_label("pic_point");
 300         &blindpop("ebx");
 301         &lea    ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));       # ebx = address of iotas
 302         &and    ("esp",-8);                     # 8-byte align the frame
 303         &lea    ("esi",&DWP(100,"esi"));        # size optimization
 304         &lea    ("edi",&DWP(8*5+100,"esp"));    # size optimization
             # Both pointers are biased by +100 to match the -100 bias in @A; edi
             # points past the 8*5 bytes reserved for D[].
 305
 306         &call   ("_KeccakF1600");
 307
 308         &mov    ("esp","ebp");                  # drop the frame
 309         &emms   ();                             # leave MMX state before returning
 310 &function_end("KeccakF1600");
 311
 312 &function_begin("SHA3_absorb");
             # Absorb input into the state one block at a time.
             #   arg 0 - A[][], the state
             #   arg 1 - inp, input pointer
             #   arg 2 - len, number of input bytes
             #   arg 3 - bsz, block size in bytes
             # While len >= bsz, XORs bsz bytes of input into the start of the state
             # (8 bytes per step) and runs the permutation.  Returns in eax the
             # number of bytes left unprocessed (always < bsz).
             # NOTE(review): assumes bsz is a non-zero multiple of 8 -- the inner
             # loop counts bsz/8 quadwords down to zero; confirm against callers.
             # Frame: the 240 bytes used by the permutation plus 8 bytes, addressed
             # as -4(ebp) (saved bsz) and -8(ebp) (saved len).
 313         &mov    ("esi",&wparam(0));             # A[][]
 314         &mov    ("eax",&wparam(1));             # inp
 315         &mov    ("ecx",&wparam(2));             # len
 316         &mov    ("edx",&wparam(3));             # bsz
 317         &mov    ("ebp","esp");
 318         &sub    ("esp",240+8);
 319         &call   (&label("pic_point"));
 320     &set_label("pic_point");
 321         &blindpop("ebx");
 322         &lea    ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));       # ebx = address of iotas
 323         &and    ("esp",-8);
 324
 325         &mov    ("edi","esi");                  # edi walks the unbiased state while XORing input
 326         &lea    ("esi",&DWP(100,"esi"));        # size optimization
 327         &mov    (&DWP(-4,"ebp"),"edx");         # save bsz
 328         &jmp    (&label("loop"));
 329
 330 &set_label("loop",16);
 331         &cmp    ("ecx","edx");                  # len < bsz?
 332         &jc     (&label("absorbed"));
 333
 334         &shr    ("edx",3);                      # bsz /= 8
 335 &set_label("block");
 336         &movq   ("mm0",&QWP(0,"eax"));
 337         &lea    ("eax",&DWP(8,"eax"));
 338         &pxor   ("mm0",&QWP(0,"edi"));
 339         &lea    ("edi",&DWP(8,"edi"));
 340         &sub    ("ecx",8);                      # len -= 8
 341         &movq   (&QWP(-8,"edi"),"mm0");         # state quadword ^= input quadword
 342         &dec    ("edx");                        # bsz--
 343         &jnz    (&label("block"));
 344
 345         &lea    ("edi",&DWP(8*5+100,"esp"));    # size optimization
 346         &mov    (&DWP(-8,"ebp"),"ecx");         # save len
 347         &call   ("_KeccakF1600");               # overwrites ecx
 348         &mov    ("ecx",&DWP(-8,"ebp"));         # pull len
 349         &mov    ("edx",&DWP(-4,"ebp"));         # pull bsz
 350         &lea    ("edi",&DWP(-100,"esi"));       # edi = start of the state again (esi is biased by +100)
 351         &jmp    (&label("loop"));
 352
 353 &set_label("absorbed",16);
 354         &mov    ("eax","ecx");                  # return value
 355         &mov    ("esp","ebp");
 356         &emms   ();
 357 &function_end("SHA3_absorb");
 358
 359 &function_begin("SHA3_squeeze");
             # Copy output bytes out of the state, permuting between blocks.
             #   arg 0 - A[][], the state
             #   arg 1 - out, output pointer
             #   arg 2 - len, number of bytes to produce
             #   arg 3 - bsz, block size in bytes
             # Copies 8 bytes at a time from the start of the state.  When a whole
             # block (bsz/8 quadwords) has been consumed and more output is still
             # required, the permutation is run and copying restarts at the start
             # of the state.  A final 1..7 byte remainder is copied bytewise.
             # Frame: as in SHA3_absorb, except that -4(ebp) holds bsz/8.
 360         &mov    ("esi",&wparam(0));             # A[][]
 361         &mov    ("eax",&wparam(1));             # out
 362         &mov    ("ecx",&wparam(2));             # len
 363         &mov    ("edx",&wparam(3));             # bsz
 364         &mov    ("ebp","esp");
 365         &sub    ("esp",240+8);
 366         &call   (&label("pic_point"));
 367     &set_label("pic_point");
 368         &blindpop("ebx");
 369         &lea    ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));       # ebx = address of iotas
 370         &and    ("esp",-8);
 371
 372         &shr    ("edx",3);                      # bsz /= 8
 373         &mov    ("edi","esi");                  # edi walks the unbiased state
 374         &lea    ("esi",&DWP(100,"esi"));        # size optimization
 375         &mov    (&DWP(-4,"ebp"),"edx");         # save bsz
 376         &jmp    (&label("loop"));
 377
 378 &set_label("loop",16);
 379         &cmp    ("ecx",8);                      # len < 8?
 380         &jc     (&label("tail"));
 381
 382         &movq   ("mm0",&QWP(0,"edi"));
 383         &lea    ("edi",&DWP(8,"edi"));
 384         &movq   (&QWP(0,"eax"),"mm0");
 385         &lea    ("eax",&DWP(8,"eax"));
 386         &sub    ("ecx",8);                      # len -= 8
 387         &jz     (&label("done"));               # finished exactly: no extra permutation
 388
 389         &dec    ("edx");                        # bsz--
 390         &jnz    (&label("loop"));
 391
 392         &lea    ("edi",&DWP(8*5+100,"esp"));    # size optimization
 393         &mov    (&DWP(-8,"ebp"),"ecx");         # save len
 394         &call   ("_KeccakF1600");               # overwrites ecx
 395         &mov    ("ecx",&DWP(-8,"ebp"));         # pull len
 396         &mov    ("edx",&DWP(-4,"ebp"));         # pull bsz
 397         &lea    ("edi",&DWP(-100,"esi"));       # edi = start of the state again (esi is biased by +100)
 398         &jmp    (&label("loop"));
 399
 400 &set_label("tail",16);
             # Fewer than 8 bytes remain (possibly none): set up source, destination
             # and count (ecx = len) for a byte-wise string copy.
 401         &mov    ("esi","edi");
 402         &mov    ("edi","eax");
 403         &data_word("0xA4F39066");               # rep movsb
             # The word is emitted as raw bytes 66 90 F3 A4: F3 A4 is rep movsb,
             # preceded by the two-byte filler 66 90.
 404
 405 &set_label("done");
 406         &mov    ("esp","ebp");
 407         &emms   ();
 408 &function_end("SHA3_squeeze");
 409
 410 &set_label("iotas",32);
             # The 24 Keccak-f[1600] round constants, one 64-bit value per round,
             # each written as two 32-bit words with the low word first.
             # _KeccakF1600 reads them sequentially through ebx (8 bytes per round)
             # and rewinds ebx by 192 = 24*8 bytes when it is done.
 411         &data_word(0x00000001,0x00000000);
 412         &data_word(0x00008082,0x00000000);
 413         &data_word(0x0000808a,0x80000000);
 414         &data_word(0x80008000,0x80000000);
 415         &data_word(0x0000808b,0x00000000);
 416         &data_word(0x80000001,0x00000000);
 417         &data_word(0x80008081,0x80000000);
 418         &data_word(0x00008009,0x80000000);
 419         &data_word(0x0000008a,0x00000000);
 420         &data_word(0x00000088,0x00000000);
 421         &data_word(0x80008009,0x00000000);
 422         &data_word(0x8000000a,0x00000000);
 423         &data_word(0x8000808b,0x00000000);
 424         &data_word(0x0000008b,0x80000000);
 425         &data_word(0x00008089,0x80000000);
 426         &data_word(0x00008003,0x80000000);
 427         &data_word(0x00008002,0x80000000);
 428         &data_word(0x00000080,0x80000000);
 429         &data_word(0x0000800a,0x00000000);
 430         &data_word(0x8000000a,0x80000000);
 431         &data_word(0x80008081,0x80000000);
 432         &data_word(0x00008080,0x80000000);
 433         &data_word(0x80000001,0x00000000);
 434         &data_word(0x80008008,0x80000000);
     # Identification string embedded in the generated module.
 435 &asciz("Keccak-1600 absorb and squeeze for MMX, CRYPTOGAMS by <appro\@openssl.org>");
 436
 437 &asm_finish();
 438
     # Buffered write errors (e.g. a full disk) only surface when the handle is
     # closed, so treat a failed close as fatal and report the OS error in $!.
 439 close STDOUT or die "error closing STDOUT: $!";