# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# The module implements "4-bit" Galois field multiplication and the
# streamed GHASH function. "4-bit" means that it uses a 256-byte
# per-key table [plus a 128/256-byte fixed table]. It has two code
# paths: vanilla x86 and vanilla MMX. The former is executed on 486
# and Pentium, the latter on everything else. Performance results
# below are for the streamed GHASH subroutine and are expressed in
# cycles per processed byte:
#		gcc 2.95.3(*)	MMX assembler	x86 assembler
#
# Pentium	100/112(**)	-		50
# P4		96 /122		33		84(***)
# Opteron	50 /71		22		30
# (*)	gcc 3.4.x was observed to generate a few percent slower code,
#	which is one of the reasons why the 2.95.3 results were chosen;
#	another reason is the lack of 3.4.x results for older CPUs;
# (**)	second number is the result for code compiled with the -fPIC
#	flag, which is actually more relevant, because the assembler
#	code is position-independent;
# (***)	see comment in the non-MMX routine for further details;
# To summarize, it's 2-3 times faster than gcc-generated code. To
# anchor it to something else, the SHA1 assembler processes a single byte
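# Below is an illustrative reference sketch of the "4-bit" algorithm both
# code paths implement. It is not part of the generated code and the
# helper name is ours; it assumes $Htable->[0..15] hold the 128-bit
# values 0*H..15*H in GHASH's bit-reflected representation, that $Xi is
# a reference to the 16 bytes of Xi, and it omits the byte-order fix-up
# the real code performs when storing the result back. Each nibble of Xi,
# starting with the low nibble of the last byte, selects a table entry;
# between lookups Z is shifted right by 4 bits and the bits that fall off
# are folded back in via the rem_4bit constants.
sub gmult_4bit_ref {
	use bigint;			# 128-bit arithmetic, for the sketch only
	my ($Xi,$Htable)=@_;
	my @rem_4bit = map { $_<<112 }	# same 16-bit constants as rem_4bit below,
	    (0x0000,0x1C20,0x3840,0x2460,0x7080,0x6CA0,0x48C0,0x54E0,	# placed at the top of Z
	     0xE100,0xFD20,0xD940,0xC560,0x9180,0x8DA0,0xA9C0,0xB5E0);
	my $Z = 0;
	for (my $cnt=15; $cnt>=0; $cnt--) {	# last byte of Xi first
	    for my $nibble ($Xi->[$cnt]&0xf, $Xi->[$cnt]>>4) {
		my $rem = $Z & 0xf;		# bits about to be shifted out
		$Z = ($Z>>4) ^ $rem_4bit[$rem] ^ $Htable->[$nibble];
	    }
	}
	return $Z;				# Xi*H
}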
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";	# perlasm back-end, provides &asm_init and friends

&asm_init($ARGV[0],"gcm-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");

&static_label("rem_4bit") if (!$x86only);
$unroll = 0;	# Affects the x86 loop. The folded loop performs ~7% worse
		# than the unrolled one, which has to be weighed against
		# an almost 2x code size reduction. Well, *overall* code
		# size; the x86-specific code shrinks by 7.5x...
# The MMX version performs 2.5 times better on P4 (see comment in the
# non-MMX routine for further details), 35% better on Opteron and Core2,
# 40% better on PIII... In other words the effort is considered to be
# well spent.
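# The MMX inner loop below keeps the whole 128-bit accumulator in the two
# MMX registers $Zlo:$Zhi and walks the 16-byte block a nibble at a time:
# each nibble selects one of the sixteen 16-byte Htbl entries to XOR in,
# and the four bits shifted out of $Zlo on every step are folded back in
# through the rem_4bit reduction table.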
	&xor	($nlo,$nlo);		# avoid partial register stalls on PIII

	&mov	(&LB($nlo),&LB($nhi));

	&movq	($Zlo,&QWP(8,$Htbl,$nlo));	# Z = Htbl[nlo]
	&movq	($Zhi,&QWP(0,$Htbl,$nlo));

	&jmp	(&label("mmx_loop"));

&set_label("mmx_loop",16);
	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));		# Z ^= Htbl[nhi]
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));	# fold shifted-out bits back in
	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));

	&js	(&label("mmx_break"));		# whole block processed?

	&movz	($nhi,&BP(0,$inp,$cnt));	# pick next byte
	&mov	(&LB($nlo),&LB($nhi));

	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));		# Z ^= Htbl[nlo]
	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));	# fold shifted-out bits back in
	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));

	&jmp	(&label("mmx_loop"));

&set_label("mmx_break",16);
	&psrlq	($Zlo,32);		# lower part of Zlo is already there
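# The vanilla x86 inner routine below keeps Z in four 32-bit registers,
# $Zhh:$Zhl:$Zlh:$Zll, reads the block a nibble at a time from the copy
# its caller leaves at the bottom of the stack frame, XORs in the
# corresponding Htbl entries and reduces with the rem_4bit constants
# deposited just above that copy (see deposit_rem_4bit below).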
	&mov	($Zhh,&DWP(4,$Htbl,$Zll));	# Z = Htbl[nibble]
	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
	&mov	($Zll,&DWP(8,$Htbl,$Zll));
	&xor	($rem,$rem);			# avoid partial register stalls on PIII
	# shrd practically kills P4, 2.5x deterioration, but P4 has the
	# MMX code path to execute instead. shrd runs a tad faster [than
	# twice the shifts, moves and ors] on pre-MMX Pentium (as well as
	# on PIII and Core2), *but* it minimizes code size, spares a
	# register and thus allows folding the loop...
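# Purely illustrative sketch of the trade-off discussed above; the sub is
# never called, so it emits nothing. Shifting the 128-bit Z held in
# $Zhh:$Zhl:$Zlh:$Zll right by 4 bits takes one shrd per word plus a
# final shr, whereas composing each word "manually" takes shifts, moves
# and ors plus a scratch register.
sub shrd_shift_sketch {
	&shrd	($Zll,$Zlh,4);		# Zll = low 32 bits of (Zlh:Zll)>>4
	&shrd	($Zlh,$Zhl,4);
	&shrd	($Zhl,$Zhh,4);
	&shr	($Zhh,4);		# nothing above the topmost word
}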
	&jmp	(&label("x86_loop"));

&set_label("x86_loop",16);
    for($i=1;$i<=2;$i++) {	# two passes: one nibble of the current byte each
	&mov	(&LB($rem),&LB($Zll));	# bits about to be shifted out of Z
	&and	(&LB($rem),0xf);

	&xor	($Zhh,&DWP($off+16,"esp",$rem,4));	# reduce via rem_4bit copy on stack

	&mov	(&LB($rem),&BP($off,"esp",$cnt));	# current byte of the block on stack
	&and	(&LB($rem),0xf0);			# high nibble

	&xor	($Zll,&DWP(8,$Htbl,$rem));	# Z ^= Htbl[nibble]
	&xor	($Zlh,&DWP(12,$Htbl,$rem));
	&xor	($Zhl,&DWP(0,$Htbl,$rem));
	&xor	($Zhh,&DWP(4,$Htbl,$rem));

	&js	(&label("x86_break"));		# whole block processed?

	&jmp	(&label("x86_loop"));

&set_label("x86_break",16);
    for($i=1;$i<32;$i++) {	# fully unrolled: two passes per byte, one nibble each

	&mov	(&LB($rem),&LB($Zll));	# bits about to be shifted out of Z

	&and	(&LB($rem),0xf);

	&xor	($Zhh,&DWP($off+16,"esp",$rem,4));	# reduce via rem_4bit copy on stack

	&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));	# current byte of the block on stack
	&and	(&LB($rem),0xf0);
	&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));

	&xor	($Zll,&DWP(8,$Htbl,$rem));	# Z ^= Htbl[nibble]
	&xor	($Zlh,&DWP(12,$Htbl,$rem));
	&xor	($Zhl,&DWP(0,$Htbl,$rem));
	&xor	($Zhh,&DWP(4,$Htbl,$rem));
&function_begin_B("_x86_gmult_4bit_inner");
&function_end_B("_x86_gmult_4bit_inner");
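# _x86_gmult_4bit_inner expects the four words of Xi in $Zhh:$Zhl:$Zlh:$Zll,
# a byte copy of the same block at the bottom of the stack frame and the
# rem_4bit constants deposited right above it (see the callers below for
# the setup), and returns the product in the same four registers.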
&function_begin("gcm_gmult_4bit");
	&call	(&label("pic_point"));
&set_label("pic_point");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&bt	(&DWP(0,"ebp"),23);	# check for MMX bit
	&jnc	(&label("x86"));

	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));

	&mov	($inp,&wparam(0));	# load Xi
	&mov	($Htbl,&wparam(1));	# load Htable

	&movz	($Zll,&BP(15,$inp));	# pick up the last byte of Xi first

	&mmx_loop($inp,"eax");		# Xi *= H

	&mov	(&DWP(12,$inp),$Zll);	# store the result back to Xi
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(0,$inp),$Zhh);

&set_label("x86",16);
	&stack_push(16+4+1);		# +1 for stack alignment
	&mov	($inp,&wparam(0));	# load Xi
	&mov	($Htbl,&wparam(1));	# load Htable

	&mov	($Zhh,&DWP(0,$inp));	# load Xi[16]
	&mov	($Zhl,&DWP(4,$inp));
	&mov	($Zlh,&DWP(8,$inp));
	&mov	($Zll,&DWP(12,$inp));

	&deposit_rem_4bit(16);		# copy rem_4bit constants onto the stack

	&mov	(&DWP(0,"esp"),$Zhh);	# copy Xi[16] on stack
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(12,"esp"),$Zll);

	&call	("_x86_gmult_4bit_inner");

	&mov	($inp,&wparam(0));	# reload Xi

	&mov	(&DWP(12,$inp),$Zll);	# store the result back to Xi
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);

&function_end("gcm_gmult_4bit");
# The streamed version performs 20% better on P4, 7% better on Opteron,
# and 10% better on Core2 and PIII...
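# For every 16-byte block the streamed routine below evaluates the usual
# GHASH recurrence, Xi = (Xi xor block)*H: the block is XORed into Xi
# (see the *_outer_loop bodies) and the result is multiplied by H with
# the same per-block machinery as gcm_gmult_4bit.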
&function_begin("gcm_ghash_4bit");
	&call	(&label("pic_point"));
&set_label("pic_point");
	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
	&bt	(&DWP(0,"ebp"),23);	# check for MMX bit
	&jnc	(&label("x86"));

	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));

	&mov	($inp,&wparam(0));	# load in
	&mov	($Zlh,&wparam(1));	# load len
	&mov	($Zhh,&wparam(2));	# load Xi
	&mov	($Htbl,&wparam(3));	# load Htable

	&mov	(&wparam(1),$Zlh);	# len to point at the end of input
	&stack_push(4+1);		# +1 for stack alignment
	&mov	($Zll,&DWP(12,$Zhh));	# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zhh));
	&mov	($Zlh,&DWP(8,$Zhh));
	&mov	($Zhh,&DWP(0,$Zhh));

&set_label("mmx_outer_loop",16);
	&xor	($Zll,&DWP(12,$inp));	# Xi ^= next input block
	&xor	($Zhl,&DWP(4,$inp));
	&xor	($Zlh,&DWP(8,$inp));
	&xor	($Zhh,&DWP(0,$inp));
	&mov	(&DWP(12,"esp"),$Zll);	# dump it on the stack
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(0,"esp"),$Zhh);

	&mmx_loop("esp","eax");		# multiply Xi by H

	&lea	($inp,&DWP(16,$inp));	# advance to the next block
	&cmp	($inp,&wparam(1));	# end of input reached?
	&jb	(&label("mmx_outer_loop"));

	&mov	($inp,&wparam(2));	# load Xi
	&mov	(&DWP(12,$inp),$Zll);	# store the result back to Xi
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(0,$inp),$Zhh);
&set_label("x86",16);
	&stack_push(16+4+1);		# +1 for 64-bit alignment
	&mov	($inp,&wparam(0));	# load in
	&mov	("ecx",&wparam(1));	# load len
	&mov	($Zll,&wparam(2));	# load Xi
	&mov	($Htbl,&wparam(3));	# load Htable

	&mov	(&wparam(1),"ecx");

	&mov	($Zhh,&DWP(0,$Zll));	# load Xi[16]
	&mov	($Zhl,&DWP(4,$Zll));
	&mov	($Zlh,&DWP(8,$Zll));
	&mov	($Zll,&DWP(12,$Zll));

	&deposit_rem_4bit(16);		# copy rem_4bit constants onto the stack

&set_label("x86_outer_loop",16);
	&xor	($Zll,&DWP(12,$inp));	# xor with input
	&xor	($Zlh,&DWP(8,$inp));
	&xor	($Zhl,&DWP(4,$inp));
	&xor	($Zhh,&DWP(0,$inp));
	&mov	(&DWP(12,"esp"),$Zll);	# dump it on stack
	&mov	(&DWP(8,"esp"),$Zlh);
	&mov	(&DWP(4,"esp"),$Zhl);
	&mov	(&DWP(0,"esp"),$Zhh);

	&call	("_x86_gmult_4bit_inner");

	&mov	($inp,&wparam(0));	# reload in

	&lea	($inp,&DWP(16,$inp));	# advance to the next block
	&cmp	($inp,&wparam(1));	# end of input reached?
	&mov	(&wparam(0),$inp)	if (!$unroll);
	&jb	(&label("x86_outer_loop"));

	&mov	($inp,&wparam(2));	# load Xi
	&mov	(&DWP(12,$inp),$Zll);	# store the result back to Xi
	&mov	(&DWP(8,$inp),$Zlh);
	&mov	(&DWP(4,$inp),$Zhl);
	&mov	(&DWP(0,$inp),$Zhh);

&function_end("gcm_ghash_4bit");
# deposit_rem_4bit copies the rem_4bit reduction constants onto the
# stack for use by the non-MMX code path (only the high 32-bit words
# are needed there); the MMX path reads the static rem_4bit table
# below directly.
sub deposit_rem_4bit {
	&mov	(&DWP($bias+0, "esp"),0x0000<<16);
	&mov	(&DWP($bias+4, "esp"),0x1C20<<16);
	&mov	(&DWP($bias+8, "esp"),0x3840<<16);
	&mov	(&DWP($bias+12,"esp"),0x2460<<16);
	&mov	(&DWP($bias+16,"esp"),0x7080<<16);
	&mov	(&DWP($bias+20,"esp"),0x6CA0<<16);
	&mov	(&DWP($bias+24,"esp"),0x48C0<<16);
	&mov	(&DWP($bias+28,"esp"),0x54E0<<16);
	&mov	(&DWP($bias+32,"esp"),0xE100<<16);
	&mov	(&DWP($bias+36,"esp"),0xFD20<<16);
	&mov	(&DWP($bias+40,"esp"),0xD940<<16);
	&mov	(&DWP($bias+44,"esp"),0xC560<<16);
	&mov	(&DWP($bias+48,"esp"),0x9180<<16);
	&mov	(&DWP($bias+52,"esp"),0x8DA0<<16);
	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
# 64-byte-aligned rem_4bit table used by the MMX code path: sixteen
# 8-byte entries with the 16-bit reduction constant in the top 16 bits.
&set_label("rem_4bit",64);
	&data_word(0,0x0000<<16,0,0x1C20<<16,0,0x3840<<16,0,0x2460<<16);
	&data_word(0,0x7080<<16,0,0x6CA0<<16,0,0x48C0<<16,0,0x54E0<<16);
	&data_word(0,0xE100<<16,0,0xFD20<<16,0,0xD940<<16,0,0xC560<<16);
	&data_word(0,0x9180<<16,0,0x8DA0<<16,0,0xA9C0<<16,0,0xB5E0<<16);
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
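# Illustrative derivation of the rem_4bit constants used above; the
# helper is ours and is not called anywhere. Each entry is the
# contribution of the four bits dropped by one 4-bit right shift of Z,
# i.e. the XOR, for every set bit of the dropped nibble, of the
# reflected reduction polynomial 0xE1 (x^128+x^7+x^2+x+1) placed at the
# matching offset.
sub gen_rem_4bit {
	my @tab;
	for my $i (0..15) {
	    my $v = 0;
	    for my $b (0..3) {
		$v ^= 0xE1<<($b+5)	if (($i>>$b)&1);
	    }
	    push(@tab,$v);
	}
	@tab;	# 0x0000,0x1C20,0x3840,...,0xB5E0 -- the 16-bit constants the
		# code above then places in the upper half of a 32-bit word
}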