crypto/modes/asm/ghash-x86.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # The module implements "4-bit" Galois field multiplication and
  11 # streamed GHASH function. "4-bit" means that it uses 256 bytes
  12 # per-key table [+128/256 bytes fixed table]. It has two code paths:
  13 # vanilla x86 and vanilla MMX. Former will be executed on 486 and
  14 # Pentium, latter on all others. Performance results are for streamed
  15 # GHASH subroutine and are expressed in cycles per processed byte,
  16 # less is better:
  17 #
  18 #               gcc 2.95.3(*)   MMX assembler   x86 assembler
  19 #
  20 # Pentium       100/112(**)     -               50
  21 # PIII          63 /77          17              24
  22 # P4            96 /122         33              84(***)
  23 # Opteron       50 /71          22              30
  24 # Core2         63 /102         21              28
  25 #
  26 # (*)   gcc 3.4.x was observed to generate a few percent slower code,
  27 #       which is one of the reasons why 2.95.3 results were chosen;
  28 #       another reason is lack of 3.4.x results for older CPUs;
  29 # (**)  second number is result for code compiled with -fPIC flag,
  30 #       which is actually more relevant, because assembler code is
  31 #       position-independent;
  32 # (***) see comment in non-MMX routine for further details;
  33 #
  34 # To summarize, it's 2-3 times faster than gcc-generated code. To
  35 # anchor it to something else, the SHA1 assembler processes a single
  36 # byte in 11-13 cycles.
  37
  38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     # $dir is the directory part of this script's path. It and the shared
     # perlasm directory are added to @INC so that x86asm.pl can be found.
  39 push(@INC,"${dir}","${dir}../../perlasm");
  40 require "x86asm.pl";
  41
     # Start the assembler output. The second argument is this module's
     # file name. $x86only is true when the last command-line argument is
     # "386"; it disables the MMX code paths and the rem_4bit table below.
  42 &asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
  43
     # rem_4bit is only emitted (at the end of this file) for non-386 builds.
  44 &static_label("rem_4bit") if (!$x86only);
  45
     # Register assignment. $Zhh/$Zhl/$Zlh/$Zll carry the 32-bit words found
     # at offsets 0/4/8/12 of the 16-byte value being processed; $inp and
     # $Htbl are the data and Htable pointers.
  46 $Zhh  = "ebp";
  47 $Zhl  = "edx";
  48 $Zlh  = "ecx";
  49 $Zll  = "ebx";
  50 $inp  = "edi";
  51 $Htbl = "esi";
  52
  53 $unroll = 0;    # Affects x86 loop. Folded loop performs ~7% worse
  54                 # than unrolled, which has to be weighed against
  55                 # almost 2x code size reduction. Well, *overall*
  56                 # code size. x86-specific code shrinks by 7.5x...
  57
  58 sub mmx_loop {
     # Emits the MMX inner loop of the "4-bit" multiplication: one 16-byte
     # value is consumed a nibble at a time, from byte 15 down to byte 0.
     #
     # Arguments:
     #   $inp      - register used as the base address of the 16 bytes to
     #               process (bytes 14..0 are fetched via $inp+$cnt);
     #   $rem_4bit - register holding the address of the rem_4bit table.
     #
     # On entry the emitted code expects byte 15 of the value in $Zll and
     # the Htable pointer in $Htbl. On exit $Zll/$Zlh/$Zhl/$Zhh hold the
     # byte-swapped result words that the callers store at offsets
     # 12/8/4/0 of Xi. ebp, edx, ecx, ebx and mm0-mm2 are overwritten.
     #
     # The sub deliberately has no prototype: it takes two arguments and
     # is invoked as &mmx_loop(...), like the other helpers in this file.
     #
  59 # MMX version performs 2.5 times better on P4 (see comment in non-MMX
  60 # routine for further details), 35% better on Opteron and Core2, 40%
  61 # better on PIII... In other words effort is considered to be well
  62 # spent...
  63     my $inp = shift;
  64     my $rem_4bit = shift;
     # The general-purpose registers are reused under loop-local names.
  65     my $cnt = $Zhh;
  66     my $nhi = $Zhl;
  67     my $nlo = $Zlh;
  68     my $rem = $Zll;
  69
  70     my $Zlo = "mm0";
  71     my $Zhi = "mm1";
  72     my $tmp = "mm2";
  73
     # Split byte 15 into two table offsets: $nlo = low nibble << 4,
     # $nhi = high nibble << 4 (Htable entries are 16 bytes each), then
     # seed Z with the Htable entry selected by the low nibble.
  74         &xor    ($nlo,$nlo);    # avoid partial register stalls on PIII
  75         &mov    ($nhi,$Zll);
  76         &mov    (&LB($nlo),&LB($nhi));
  77         &mov    ($cnt,15);
  78         &shl    (&LB($nlo),4);
  79         &and    ($nhi,0xf0);
  80         &movq   ($Zlo,&QWP(8,$Htbl,$nlo));
  81         &movq   ($Zhi,&QWP(0,$Htbl,$nlo));
  82         &movd   ($rem,$Zlo);
  83         &jmp    (&label("mmx_loop"));
  84
     # First half of an iteration: shift Z right by 4 bits, fold the
     # shifted-out nibble back in through rem_4bit (8-byte entries) and
     # add the Htable entry for the current byte's high nibble.
  85     &set_label("mmx_loop",16);
  86         &psrlq  ($Zlo,4);
  87         &and    ($rem,0xf);
  88         &movq   ($tmp,$Zhi);
  89         &psrlq  ($Zhi,4);
  90         &dec    ($cnt);
  91         &pxor   ($Zlo,&QWP(8,$Htbl,$nhi));
  92         &psllq  ($tmp,60);
  93         &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem,8));
  94         &movd   ($rem,$Zlo);
  95         &pxor   ($Zhi,&QWP(0,$Htbl,$nhi));
  96         &pxor   ($Zlo,$tmp);
     # The sign flag still comes from the dec above; the loop ends once
     # $cnt has dropped below zero, i.e. after byte 0 was handled.
  97         &js     (&label("mmx_break"));
  98
     # Second half: fetch the next (lower-addressed) byte and repeat the
     # shift/reduce step with its low nibble; its high nibble is kept in
     # $nhi for the first half of the next iteration.
  99         &movz   ($nhi,&BP(0,$inp,$cnt));
 100         &psrlq  ($Zlo,4);
 101         &mov    (&LB($nlo),&LB($nhi));
 102         &movq   ($tmp,$Zhi);
 103         &shl    (&LB($nlo),4);
 104         &psrlq  ($Zhi,4);
 105         &and    ($rem,0xf);
 106         &pxor   ($Zlo,&QWP(8,$Htbl,$nlo));
 107         &psllq  ($tmp,60);
 108         &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem,8));
 109         &movd   ($rem,$Zlo);
 110         &pxor   ($Zhi,&QWP(0,$Htbl,$nlo));
 111         &pxor   ($Zlo,$tmp);
 112         &and    ($nhi,0xf0);
 113         &jmp    (&label("mmx_loop"));
 114
     # Move the 128-bit result from mm0/mm1 into the four general-purpose
     # registers. $rem is the same register as $Zll, so the low 32 bits of
     # $Zlo are already in place from the last movd.
 115     &set_label("mmx_break",16);
 116         &psrlq  ($Zlo,32);      # lower part of Zlo is already there
 117         &movd   ($Zhl,$Zhi);
 118         &psrlq  ($Zhi,32);
 119         &movd   ($Zlh,$Zlo);
 120         &movd   ($Zhh,$Zhi);
 121
     # Byte-swap each word before the callers store it back to Xi.
 122         &bswap  ($Zll);
 123         &bswap  ($Zhl);
 124         &bswap  ($Zlh);
 125         &bswap  ($Zhh);
 126 }
 127
 128 sub x86_loop {
     # Emits the integer-only (non-MMX) inner loop of the "4-bit"
     # multiplication, either folded ($unroll false) or fully unrolled.
     #
     # Argument:
     #   $off - byte offset added to every esp-relative access. Callers in
     #          this file pass 0 when the code is emitted inline and 4 when
     #          it is emitted as the body of _x86_gmult_4bit_inner, which
     #          is reached through a call instruction.
     #
     # Stack layout assumed by the emitted code (set up by the callers):
     #   esp+$off    .. +15  the 16 bytes to process;
     #   esp+$off+16 .. +79  sixteen 32-bit remainder entries, as written
     #                       by deposit_rem_4bit(16).
     # On entry $Zll must hold (low nibble of byte 15) << 4 and $Htbl the
     # Htable pointer. On exit $Zll/$Zlh/$Zhl/$Zhh hold the byte-swapped
     # result words. eax is overwritten; in the folded form $inp is too.
 129     my $off = shift;
 130     my $rem = "eax";
 131
     # Seed Z with the Htable entry selected by $Zll. $Zll is loaded last
     # because it is also the index register of all four loads.
 132         &mov    ($Zhh,&DWP(4,$Htbl,$Zll));
 133         &mov    ($Zhl,&DWP(0,$Htbl,$Zll));
 134         &mov    ($Zlh,&DWP(12,$Htbl,$Zll));
 135         &mov    ($Zll,&DWP(8,$Htbl,$Zll));
 136         &xor    ($rem,$rem);    # avoid partial register stalls on PIII
 137
 138         # shrd practically kills P4, 2.5x deterioration, but P4 has
 139         # MMX code-path to execute. shrd runs tad faster [than twice
 140         # the shifts, move's and or's] on pre-MMX Pentium (as well as
 141         # PIII and Core2), *but* minimizes code size, spares register
 142         # and thus allows to fold the loop...
 143         if (!$unroll) {
     # Folded form: the register otherwise used for $inp serves as the
     # byte counter, so callers must reload $inp afterwards.
 144         my $cnt = $inp;
 145         &mov    ($cnt,15);
 146         &jmp    (&label("x86_loop"));
 147         &set_label("x86_loop",16);
     # The loop body is generated twice: pass 1 handles the high nibble
     # of byte $cnt and then decrements $cnt (leaving the loop once it
     # goes negative); pass 2 handles the low nibble of the next byte and
     # jumps back to the top.
 148             for($i=1;$i<=2;$i++) {
     # Shift the 128-bit Z right by 4 bits across the four registers and
     # fold the nibble that fell out back in via the on-stack table.
 149                 &mov    (&LB($rem),&LB($Zll));
 150                 &shrd   ($Zll,$Zlh,4);
 151                 &and    (&LB($rem),0xf);
 152                 &shrd   ($Zlh,$Zhl,4);
 153                 &shrd   ($Zhl,$Zhh,4);
 154                 &shr    ($Zhh,4);
 155                 &xor    ($Zhh,&DWP($off+16,"esp",$rem,4));
 156
     # Turn the selected nibble into an Htable byte offset (nibble << 4).
 157                 &mov    (&LB($rem),&BP($off,"esp",$cnt));
 158                 if ($i&1) {
 159                         &and    (&LB($rem),0xf0);
 160                 } else {
 161                         &shl    (&LB($rem),4);
 162                 }
 163
 164                 &xor    ($Zll,&DWP(8,$Htbl,$rem));
 165                 &xor    ($Zlh,&DWP(12,$Htbl,$rem));
 166                 &xor    ($Zhl,&DWP(0,$Htbl,$rem));
 167                 &xor    ($Zhh,&DWP(4,$Htbl,$rem));
 168
 169                 if ($i&1) {
 170                         &dec    ($cnt);
 171                         &js     (&label("x86_break"));
 172                 } else {
 173                         &jmp    (&label("x86_loop"));
 174                 }
 175             }
 176         &set_label("x86_break",16);
 177         } else {
     # Unrolled form: 31 straight-line steps. Step $i reads the byte at
     # index 15-($i>>1); odd steps use its high nibble, even steps its
     # low nibble. Both branches below issue the same load.
 178             for($i=1;$i<32;$i++) {
 179                 &comment($i);
 180                 &mov    (&LB($rem),&LB($Zll));
 181                 &shrd   ($Zll,$Zlh,4);
 182                 &and    (&LB($rem),0xf);
 183                 &shrd   ($Zlh,$Zhl,4);
 184                 &shrd   ($Zhl,$Zhh,4);
 185                 &shr    ($Zhh,4);
 186                 &xor    ($Zhh,&DWP($off+16,"esp",$rem,4));
 187
 188                 if ($i&1) {
 189                         &mov    (&LB($rem),&BP($off+15-($i>>1),"esp"));
 190                         &and    (&LB($rem),0xf0);
 191                 } else {
 192                         &mov    (&LB($rem),&BP($off+15-($i>>1),"esp"));
 193                         &shl    (&LB($rem),4);
 194                 }
 195
 196                 &xor    ($Zll,&DWP(8,$Htbl,$rem));
 197                 &xor    ($Zlh,&DWP(12,$Htbl,$rem));
 198                 &xor    ($Zhl,&DWP(0,$Htbl,$rem));
 199                 &xor    ($Zhh,&DWP(4,$Htbl,$rem));
 200             }
 201         }
     # Byte-swap the result words. In 386-only builds $Zhh (ebp) is
     # swapped through eax instead of directly.
     # NOTE(review): presumably because x86asm.pl emulates bswap on 386
     # with byte-register operations that ebp lacks - confirm there.
 202         &bswap  ($Zll);
 203         &bswap  ($Zlh);
 204         &bswap  ($Zhl);
 205         if (!$x86only) {
 206                 &bswap  ($Zhh);
 207         } else {
 208                 &mov    ("eax",$Zhh);
 209                 &bswap  ("eax");
 210                 &mov    ($Zhh,"eax");
 211         }
 212 }
 213
 214 if ($unroll) {
     # With unrolling enabled the long x86 loop is emitted once, as a
     # private subroutine shared by gcm_gmult_4bit and gcm_ghash_4bit.
     # The offset 4 shifts every esp-relative access, since this body is
     # reached through a call instruction.
 215     &function_begin_B("_x86_gmult_4bit_inner");
 216         &x86_loop(4);
 217         &ret    ();
 218     &function_end_B("_x86_gmult_4bit_inner");
 219 }
 220
 221 &function_begin("gcm_gmult_4bit");
     # gcm_gmult_4bit: multiplies the 16-byte value Xi in place using the
     # per-key table Htable.
     #   wparam(0) - pointer to Xi (read and overwritten with the result);
     #   wparam(1) - pointer to Htable.
     # Unless built for 386 only, the MMX bit of OPENSSL_ia32cap_P is
     # tested at run time and the MMX path is taken when it is set.
 222     if (!$x86only) {
     # call + pop leaves the address of pic_point in eax, which makes the
     # following address calculations position-independent.
 223         &call   (&label("pic_point"));
 224         &set_label("pic_point");
 225         &blindpop("eax");
 226         &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
 227         &bt     (&DWP(0,"ebp"),23);     # check for MMX bit
 228         &jnc    (&label("x86"));
 229
     # eax = address of the rem_4bit table, relative to pic_point.
 230         &lea    ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
 231
 232         &mov    ($inp,&wparam(0));      # load Xi
 233         &mov    ($Htbl,&wparam(1));     # load Htable
 234
     # mmx_loop expects byte 15 of the value in $Zll.
 235         &movz   ($Zll,&BP(15,$inp));
 236
 237         &mmx_loop($inp,"eax");
 238
     # Leave MMX state, then write the result words back to Xi.
 239         &emms   ();
 240         &mov    (&DWP(12,$inp),$Zll);
 241         &mov    (&DWP(4,$inp),$Zhl);
 242         &mov    (&DWP(8,$inp),$Zlh);
 243         &mov    (&DWP(0,$inp),$Zhh);
 244
     # Return from the MMX path; the x86 path below is a separate exit.
 245         &function_end_A();
 246     &set_label("x86",16);
 247     }
     # Integer-only path. Stack frame: 16 bytes for a copy of Xi followed
     # by the 16-entry remainder table, plus one padding dword.
 248         &stack_push(16+4+1);                    # +1 for stack alignment
 249         &mov    ($inp,&wparam(0));              # load Xi
 250         &mov    ($Htbl,&wparam(1));             # load Htable
 251
 252         &mov    ($Zhh,&DWP(0,$inp));            # load Xi[16]
 253         &mov    ($Zhl,&DWP(4,$inp));
 254         &mov    ($Zlh,&DWP(8,$inp));
 255         &mov    ($Zll,&DWP(12,$inp));
 256
     # Place the remainder table at esp+16, right after the Xi copy.
 257         &deposit_rem_4bit(16);
 258
 259         &mov    (&DWP(0,"esp"),$Zhh);           # copy Xi[16] on stack
 260         &mov    (&DWP(4,"esp"),$Zhl);
 261         &mov    (&DWP(8,"esp"),$Zlh);
 262         &mov    (&DWP(12,"esp"),$Zll);
     # $Zll = (low nibble of byte 15) << 4, the index x86_loop starts from.
 263         &shr    ($Zll,20);
 264         &and    ($Zll,0xf0);
 265
 266         if ($unroll) {
 267                 &call   ("_x86_gmult_4bit_inner");
 268         } else {
     # The folded loop uses $inp as its counter, so reload the Xi pointer.
 269                 &x86_loop(0);
 270                 &mov    ($inp,&wparam(0));
 271         }
 272
     # Store the result back to Xi and release the stack frame.
 273         &mov    (&DWP(12,$inp),$Zll);
 274         &mov    (&DWP(8,$inp),$Zlh);
 275         &mov    (&DWP(4,$inp),$Zhl);
 276         &mov    (&DWP(0,$inp),$Zhh);
 277         &stack_pop(16+4+1);
 278 &function_end("gcm_gmult_4bit");
 279
 280 # Streamed version performs 20% better on P4, 7% on Opteron,
 281 # 10% on Core2 and PIII...
 282 &function_begin("gcm_ghash_4bit");
     # gcm_ghash_4bit: for each 16-byte block of input, XORs the block
     # into Xi and multiplies the result using Htable; the final value is
     # written back to Xi.
     #   wparam(0) - input pointer;
     #   wparam(1) - input length in bytes (overwritten with the end
     #               pointer, input + length);
     #   wparam(2) - pointer to Xi;
     #   wparam(3) - pointer to Htable.
     # NOTE(review): both outer loops test the end pointer only after a
     # block has been processed, so at least one block is always consumed;
     # callers presumably pass a non-zero multiple of 16 - confirm.
 283     if (!$x86only) {
     # call + pop leaves the address of pic_point in eax; the MMX bit of
     # OPENSSL_ia32cap_P then selects between the MMX and x86 paths.
 284         &call   (&label("pic_point"));
 285         &set_label("pic_point");
 286         &blindpop("eax");
 287         &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
 288         &bt     (&DWP(0,"ebp"),23);     # check for MMX bit
 289         &jnc    (&label("x86"));
 290
     # eax = address of the rem_4bit table, relative to pic_point.
 291         &lea    ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
 292
 293         &mov    ($inp,&wparam(0));      # load in
 294         &mov    ($Zlh,&wparam(1));      # load len
 295         &mov    ($Zhh,&wparam(2));      # load Xi
 296         &mov    ($Htbl,&wparam(3));     # load Htable
 297         &add    ($Zlh,$inp);
 298         &mov    (&wparam(1),$Zlh);      # len to point at the end of input
 299         &stack_push(4+1);               # +1 for stack alignment
     # $Zhh holds the Xi pointer here, so it is overwritten last.
 300         &mov    ($Zll,&DWP(12,$Zhh));   # load Xi[16]
 301         &mov    ($Zhl,&DWP(4,$Zhh));
 302         &mov    ($Zlh,&DWP(8,$Zhh));
 303         &mov    ($Zhh,&DWP(0,$Zhh));
 304
     # Per block: Xi ^= input block, spill the result to the stack so that
     # mmx_loop can read it byte by byte, then multiply.
 305     &set_label("mmx_outer_loop",16);
 306         &xor    ($Zll,&DWP(12,$inp));
 307         &xor    ($Zhl,&DWP(4,$inp));
 308         &xor    ($Zlh,&DWP(8,$inp));
 309         &xor    ($Zhh,&DWP(0,$inp));
 310         &mov    (&DWP(12,"esp"),$Zll);
 311         &mov    (&DWP(4,"esp"),$Zhl);
 312         &mov    (&DWP(8,"esp"),$Zlh);
 313         &mov    (&DWP(0,"esp"),$Zhh);
 314
     # $Zll = byte 15 of the value, as mmx_loop expects on entry.
 315         &shr    ($Zll,24);
 316
 317         &mmx_loop("esp","eax");
 318
     # Advance to the next block and loop while below the end pointer.
 319         &lea    ($inp,&DWP(16,$inp));
 320         &cmp    ($inp,&wparam(1));
 321         &jb     (&label("mmx_outer_loop"));
 322
 323         &mov    ($inp,&wparam(2));      # load Xi
 324         &emms   ();
 325         &mov    (&DWP(12,$inp),$Zll);
 326         &mov    (&DWP(4,$inp),$Zhl);
 327         &mov    (&DWP(8,$inp),$Zlh);
 328         &mov    (&DWP(0,$inp),$Zhh);
 329
 330         &stack_pop(4+1);
     # Return from the MMX path; the x86 path below is a separate exit.
 331         &function_end_A();
 332     &set_label("x86",16);
 333     }
     # Integer-only path. Stack frame: 16 bytes for the current value
     # followed by the 16-entry remainder table, plus one padding dword.
 334         &stack_push(16+4+1);                    # +1 for 64-bit alignment
 335         &mov    ($inp,&wparam(0));              # load in
 336         &mov    ("ecx",&wparam(1));             # load len
 337         &mov    ($Zll,&wparam(2));              # load Xi
 338         &mov    ($Htbl,&wparam(3));             # load Htable
 339         &add    ("ecx",$inp);
 340         &mov    (&wparam(1),"ecx");
 341
     # $Zll holds the Xi pointer here, so it is overwritten last.
 342         &mov    ($Zhh,&DWP(0,$Zll));            # load Xi[16]
 343         &mov    ($Zhl,&DWP(4,$Zll));
 344         &mov    ($Zlh,&DWP(8,$Zll));
 345         &mov    ($Zll,&DWP(12,$Zll));
 346
     # The remainder table is written once, outside the block loop.
 347         &deposit_rem_4bit(16);
 348
 349     &set_label("x86_outer_loop",16);
 350         &xor    ($Zll,&DWP(12,$inp));           # xor with input
 351         &xor    ($Zlh,&DWP(8,$inp));
 352         &xor    ($Zhl,&DWP(4,$inp));
 353         &xor    ($Zhh,&DWP(0,$inp));
 354         &mov    (&DWP(12,"esp"),$Zll);          # dump it on stack
 355         &mov    (&DWP(8,"esp"),$Zlh);
 356         &mov    (&DWP(4,"esp"),$Zhl);
 357         &mov    (&DWP(0,"esp"),$Zhh);
 358
     # $Zll = (low nibble of byte 15) << 4, the index x86_loop starts from.
 359         &shr    ($Zll,20);
 360         &and    ($Zll,0xf0);
 361
 362         if ($unroll) {
 363                 &call   ("_x86_gmult_4bit_inner");
 364         } else {
     # The folded loop uses $inp as its counter; restore the input pointer
     # from its parameter slot.
 365                 &x86_loop(0);
 366                 &mov    ($inp,&wparam(0));
 367         }
 368         &lea    ($inp,&DWP(16,$inp));
 369         &cmp    ($inp,&wparam(1));
     # In the folded case the advanced pointer is saved back for the next
     # iteration; this mov sits between cmp and jb and leaves the flags
     # untouched.
 370         &mov    (&wparam(0),$inp)       if (!$unroll);
 371         &jb     (&label("x86_outer_loop"));
 372
 373         &mov    ($inp,&wparam(2));      # load Xi
 374         &mov    (&DWP(12,$inp),$Zll);
 375         &mov    (&DWP(8,$inp),$Zlh);
 376         &mov    (&DWP(4,$inp),$Zhl);
 377         &mov    (&DWP(0,$inp),$Zhh);
 378         &stack_pop(16+4+1);
 379 &function_end("gcm_ghash_4bit");
 380
 381 sub deposit_rem_4bit {
     # Emits sixteen stores that write the 4-bit remainder table onto the
     # stack as consecutive 32-bit words starting at esp+$bias.
     #
     # Argument:
     #   $bias - byte offset from esp of the first table entry.
     #
     # Entry i is the 16-bit constant below shifted left by 16; the same
     # sixteen constants, in the same order, appear in the rem_4bit data
     # table at the end of this file. x86_loop indexes this stack copy
     # with a scale of 4.
 382     my $bias = shift;
 383
 384         &mov    (&DWP($bias+0, "esp"),0x0000<<16);
 385         &mov    (&DWP($bias+4, "esp"),0x1C20<<16);
 386         &mov    (&DWP($bias+8, "esp"),0x3840<<16);
 387         &mov    (&DWP($bias+12,"esp"),0x2460<<16);
 388         &mov    (&DWP($bias+16,"esp"),0x7080<<16);
 389         &mov    (&DWP($bias+20,"esp"),0x6CA0<<16);
 390         &mov    (&DWP($bias+24,"esp"),0x48C0<<16);
 391         &mov    (&DWP($bias+28,"esp"),0x54E0<<16);
 392         &mov    (&DWP($bias+32,"esp"),0xE100<<16);
 393         &mov    (&DWP($bias+36,"esp"),0xFD20<<16);
 394         &mov    (&DWP($bias+40,"esp"),0xD940<<16);
 395         &mov    (&DWP($bias+44,"esp"),0xC560<<16);
 396         &mov    (&DWP($bias+48,"esp"),0x9180<<16);
 397         &mov    (&DWP($bias+52,"esp"),0x8DA0<<16);
 398         &mov    (&DWP($bias+56,"esp"),0xA9C0<<16);
 399         &mov    (&DWP($bias+60,"esp"),0xB5E0<<16);
 400 }
 401
 402 if (!$x86only) {
     # Static remainder table used by the MMX code path (mmx_loop indexes
     # it with a scale of 8). Each entry is a pair of 32-bit words: a zero
     # word followed by the 16-bit constant shifted left by 16. The
     # constants match those written by deposit_rem_4bit.
     # NOTE(review): the second argument of set_label is presumably the
     # alignment of the label - confirm in x86asm.pl.
 403 &set_label("rem_4bit",64);
 404         &data_word(0,0x0000<<16,0,0x1C20<<16,0,0x3840<<16,0,0x2460<<16);
 405         &data_word(0,0x7080<<16,0,0x6CA0<<16,0,0x48C0<<16,0,0x54E0<<16);
 406         &data_word(0,0xE100<<16,0,0xFD20<<16,0,0xD940<<16,0,0xC560<<16);
 407         &data_word(0,0x9180<<16,0,0x8DA0<<16,0,0xA9C0<<16,0,0xB5E0<<16);
 408 }
     # Embed an identification string and finish the assembler output.
 409 &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
 410 &asm_finish();