Remove filename argument to x86 asm_init.
[openssl.git] / crypto / bn / asm / bn-586.pl
1 #! /usr/bin/env perl
2 # Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
# Locate the directory this script lives in so that the shared perlasm
# helper (x86asm.pl) can be found regardless of the current directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Everything the
# generator emits goes to STDOUT, so STDOUT is re-opened onto that file.
# Use the three-argument form (the file name is never interpreted as a
# mode) and fail loudly: a silent failure here would discard the whole
# generated module.
$output = pop;
if (defined $output) {
        open STDOUT,">",$output or die "can't open $output: $!";
}

&asm_init($ARGV[0]);

# The SSE2 code paths are generated only when the build passes
# -DOPENSSL_IA32_SSE2 among the remaining arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# Buffered write errors only surface when the handle is closed, so the
# result of close must be checked to avoid leaving a truncated file behind.
close STDOUT or die "error closing STDOUT: $!";
35
# bn_mul_add_words($name)
#
# Emits the assembly for routine $name.  The generated routine reads four
# stack arguments -- r, a, num and w -- and for each word performs
# r[i] += a[i]*w with carry propagation, returning the final carry word
# in eax.
#
# When the script-level $sse2 flag is set, a run-time dispatch is emitted
# first: bit 26 of the first dword of OPENSSL_ia32cap_P is tested and, when
# it is set, a pmuludq/paddq based implementation is used; otherwise control
# jumps to the plain integer implementation that follows it.
sub bn_mul_add_words {
    my ($name) = @_;

    my $extern = $sse2 ? "EXTRN\t_OPENSSL_ia32cap_P:DWORD" : "";
    &function_begin_B($name, $extern);

    if ($sse2) {
        # This path works in eax/edx/ecx and the mm registers only and
        # leaves through a bare ret, so no prologue is emitted for it.
        my ($rp, $ap, $cnt) = ("eax", "edx", "ecx");

        &picmeup("eax", "OPENSSL_ia32cap_P");
        &bt(&DWP(0, "eax"), 26);
        &jnc(&label("maw_non_sse2"));

        &mov($rp,  &wparam(0));
        &mov($ap,  &wparam(1));
        &mov($cnt, &wparam(2));
        &movd("mm0", &wparam(3));           # mm0 holds w throughout
        &pxor("mm1", "mm1");                # mm1 is the running carry
        &jmp(&label("maw_sse2_entry"));

        # Eight words per iteration.  The instruction order below is the
        # original hand-scheduled interleaving and must not be rearranged.
        &set_label("maw_sse2_unrolled", 16);
        &movd("mm3", &DWP(0, $rp, "", 0));      # r[0]
        &paddq("mm1", "mm3");                   # carry_in + r[0]
        &movd("mm2", &DWP(0, $ap, "", 0));      # a[0]
        &pmuludq("mm2", "mm0");                 # w*a[0]
        &movd("mm4", &DWP(4, $ap, "", 0));      # a[1]
        &pmuludq("mm4", "mm0");                 # w*a[1]
        &movd("mm6", &DWP(8, $ap, "", 0));      # a[2]
        &pmuludq("mm6", "mm0");                 # w*a[2]
        &movd("mm7", &DWP(12, $ap, "", 0));     # a[3]
        &pmuludq("mm7", "mm0");                 # w*a[3]
        &paddq("mm1", "mm2");                   # carry_in + r[0] + w*a[0]
        &movd("mm3", &DWP(4, $rp, "", 0));      # r[1]
        &paddq("mm3", "mm4");                   # r[1] + w*a[1]
        &movd("mm5", &DWP(8, $rp, "", 0));      # r[2]
        &paddq("mm5", "mm6");                   # r[2] + w*a[2]
        &movd("mm4", &DWP(12, $rp, "", 0));     # r[3]
        &paddq("mm7", "mm4");                   # r[3] + w*a[3]
        &movd(&DWP(0, $rp, "", 0), "mm1");      # store word 0
        &movd("mm2", &DWP(16, $ap, "", 0));     # a[4]
        &pmuludq("mm2", "mm0");                 # w*a[4]
        &psrlq("mm1", 32);                      # carry0
        &movd("mm4", &DWP(20, $ap, "", 0));     # a[5]
        &pmuludq("mm4", "mm0");                 # w*a[5]
        &paddq("mm1", "mm3");                   # carry0 + r[1] + w*a[1]
        &movd("mm6", &DWP(24, $ap, "", 0));     # a[6]
        &pmuludq("mm6", "mm0");                 # w*a[6]
        &movd(&DWP(4, $rp, "", 0), "mm1");      # store word 1
        &psrlq("mm1", 32);                      # carry1
        &movd("mm3", &DWP(28, $ap, "", 0));     # a[7]
        &add($ap, 32);
        &pmuludq("mm3", "mm0");                 # w*a[7]
        &paddq("mm1", "mm5");                   # carry1 + r[2] + w*a[2]
        &movd("mm5", &DWP(16, $rp, "", 0));     # r[4]
        &paddq("mm2", "mm5");                   # r[4] + w*a[4]
        &movd(&DWP(8, $rp, "", 0), "mm1");      # store word 2
        &psrlq("mm1", 32);                      # carry2
        &paddq("mm1", "mm7");                   # carry2 + r[3] + w*a[3]
        &movd("mm5", &DWP(20, $rp, "", 0));     # r[5]
        &paddq("mm4", "mm5");                   # r[5] + w*a[5]
        &movd(&DWP(12, $rp, "", 0), "mm1");     # store word 3
        &psrlq("mm1", 32);                      # carry3
        &paddq("mm1", "mm2");                   # carry3 + r[4] + w*a[4]
        &movd("mm5", &DWP(24, $rp, "", 0));     # r[6]
        &paddq("mm6", "mm5");                   # r[6] + w*a[6]
        &movd(&DWP(16, $rp, "", 0), "mm1");     # store word 4
        &psrlq("mm1", 32);                      # carry4
        &paddq("mm1", "mm4");                   # carry4 + r[5] + w*a[5]
        &movd("mm5", &DWP(28, $rp, "", 0));     # r[7]
        &paddq("mm3", "mm5");                   # r[7] + w*a[7]
        &movd(&DWP(20, $rp, "", 0), "mm1");     # store word 5
        &psrlq("mm1", 32);                      # carry5
        &paddq("mm1", "mm6");                   # carry5 + r[6] + w*a[6]
        &movd(&DWP(24, $rp, "", 0), "mm1");     # store word 6
        &psrlq("mm1", 32);                      # carry6
        &paddq("mm1", "mm3");                   # carry6 + r[7] + w*a[7]
        &movd(&DWP(28, $rp, "", 0), "mm1");     # store word 7
        &lea($rp, &DWP(32, $rp));
        &psrlq("mm1", 32);                      # carry_out

        &sub($cnt, 8);
        &jz(&label("maw_sse2_exit"));

        # Entry point of the dispatch: take the unrolled body while at
        # least eight words remain, else fall into the one-word loop.
        &set_label("maw_sse2_entry");
        &test($cnt, 0xfffffff8);
        &jnz(&label("maw_sse2_unrolled"));

        &set_label("maw_sse2_loop", 4);
        &movd("mm2", &DWP(0, $ap));             # a[i]
        &movd("mm3", &DWP(0, $rp));             # r[i]
        &pmuludq("mm2", "mm0");                 # a[i]*w
        &lea($ap, &DWP(4, $ap));
        &paddq("mm1", "mm3");                   # carry += r[i]
        &paddq("mm1", "mm2");                   # carry += a[i]*w
        &movd(&DWP(0, $rp), "mm1");             # r[i] = low half
        &sub($cnt, 1);
        &psrlq("mm1", 32);                      # carry = high half
        &lea($rp, &DWP(4, $rp));
        &jnz(&label("maw_sse2_loop"));

        &set_label("maw_sse2_exit");
        &movd("eax", "mm1");                    # return carry_out
        &emms();
        &ret();

        &set_label("maw_non_sse2", 16);
    }

    # Integer path.  These four pushes are the hand-written equivalent of
    # the function_begin prologue.
    &push("ebp");
    &push("ebx");
    &push("esi");
    &push("edi");

    &comment("");
    my ($ap, $w, $rp, $carry) = ("ebx", "ebp", "edi", "esi");

    &xor($carry, $carry);               # carry starts at zero
    &mov($rp, &wparam(0));

    &mov("ecx", &wparam(2));
    &mov($ap, &wparam(1));

    &and("ecx", 0xfffffff8);            # num rounded down to a multiple of 8
    &mov($w, &wparam(3));

    &push("ecx");                       # extra stack slot, popped at the end

    # push and mov leave the flags alone, so this tests the 'and' above.
    &jz(&label("maw_finish"));

    &set_label("maw_loop", 16);

    for my $word (0 .. 7) {
        my $off = 4 * $word;
        &comment("Round $off");

        &mov("eax", &DWP($off, $ap));           # a[word]
        &mul($w);                               # edx:eax = a[word] * w
        &add("eax", $carry);                    # low += carry
        &adc("edx", 0);                         # propagate into high
        &add("eax", &DWP($off, $rp));           # low += r[word]
        &adc("edx", 0);                         # propagate into high
        &mov(&DWP($off, $rp), "eax");           # r[word] = low
        &mov($carry, "edx");                    # carry = high
    }

    &comment("");
    &sub("ecx", 8);
    &lea($ap, &DWP(32, $ap));           # lea keeps the flags from 'sub'
    &lea($rp, &DWP(32, $rp));
    &jnz(&label("maw_loop"));

    &set_label("maw_finish", 0);
    &mov("ecx", &wparam(2));            # reload num
    &and("ecx", 7);                     # words left over (0..7)
    &jnz(&label("maw_finish2"));        # helps branch prediction
    &jmp(&label("maw_end"));

    &set_label("maw_finish2", 1);
    for my $word (0 .. 6) {
        my $off     = 4 * $word;
        my $is_last = ($word == 6);     # the 7th tail word needs no count check

        &comment("Tail Round $word");
        &mov("eax", &DWP($off, $ap));           # a[word]
        &mul($w);                               # edx:eax = a[word] * w
        &add("eax", $carry);                    # low += carry
        &adc("edx", 0);                         # propagate into high
        &add("eax", &DWP($off, $rp));           # low += r[word]
        &adc("edx", 0);                         # propagate into high
        &dec("ecx") unless $is_last;
        &mov(&DWP($off, $rp), "eax");           # r[word] = low
        &mov($carry, "edx");                    # carry = high
        &jz(&label("maw_end")) unless $is_last; # flags still from 'dec'
    }
    &set_label("maw_end", 0);
    &mov("eax", $carry);                # return value

    &pop("ecx");                        # drop the extra stack slot

    &function_end($name);
}
221
# bn_mul_words($name)
#
# Emits the assembly for routine $name.  The generated routine reads four
# stack arguments -- r, a, num and w -- stores the low word of
# a[i]*w + carry into r[i] for each word, and returns the final carry word
# in eax.
#
# With the script-level $sse2 flag set, a run-time test of bit 26 of the
# first dword of OPENSSL_ia32cap_P selects between a pmuludq based loop and
# the integer implementation emitted after it.
sub bn_mul_words {
    my ($name) = @_;

    my $extern = $sse2 ? "EXTRN\t_OPENSSL_ia32cap_P:DWORD" : "";
    &function_begin_B($name, $extern);

    if ($sse2) {
        # eax/edx/ecx and mm registers only; leaves through a bare ret.
        my ($rp, $ap, $cnt) = ("eax", "edx", "ecx");

        &picmeup("eax", "OPENSSL_ia32cap_P");
        &bt(&DWP(0, "eax"), 26);
        &jnc(&label("mw_non_sse2"));

        &mov($rp,  &wparam(0));
        &mov($ap,  &wparam(1));
        &mov($cnt, &wparam(2));
        &movd("mm0", &wparam(3));           # mm0 holds w
        &pxor("mm1", "mm1");                # mm1 is the carry, initially 0

        &set_label("mw_sse2_loop", 16);
        &movd("mm2", &DWP(0, $ap));         # a[i]
        &pmuludq("mm2", "mm0");             # a[i]*w
        &lea($ap, &DWP(4, $ap));
        &paddq("mm1", "mm2");               # carry += a[i]*w
        &movd(&DWP(0, $rp), "mm1");         # r[i] = low half
        &sub($cnt, 1);
        &psrlq("mm1", 32);                  # carry = high half
        &lea($rp, &DWP(4, $rp));            # lea keeps the flags from 'sub'
        &jnz(&label("mw_sse2_loop"));

        &movd("eax", "mm1");                # return carry
        &emms();
        &ret();
        &set_label("mw_non_sse2", 16);
    }

    # Integer path.  These four pushes are the hand-written equivalent of
    # the function_begin prologue.
    &push("ebp");
    &push("ebx");
    &push("esi");
    &push("edi");

    &comment("");
    my ($ap, $w, $rp, $carry, $cnt) = ("ebx", "ecx", "edi", "esi", "ebp");

    &xor($carry, $carry);               # carry starts at zero
    &mov($rp,  &wparam(0));
    &mov($ap,  &wparam(1));
    &mov($cnt, &wparam(2));
    &mov($w,   &wparam(3));

    &and($cnt, 0xfffffff8);             # num rounded down to a multiple of 8
    &jz(&label("mw_finish"));

    &set_label("mw_loop", 0);
    for my $word (0 .. 7) {
        my $off = 4 * $word;
        &comment("Round $off");

        &mov("eax", &DWP($off, $ap, "", 0));    # a[word]
        &mul($w);                               # edx:eax = a[word] * w
        &add("eax", $carry);                    # low += carry
        &adc("edx", 0);                         # propagate into high
        &mov(&DWP($off, $rp, "", 0), "eax");    # r[word] = low
        &mov($carry, "edx");                    # carry = high
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 32);
    &sub($cnt, 8);
    &jz(&label("mw_finish"));
    &jmp(&label("mw_loop"));

    &set_label("mw_finish", 0);
    &mov($cnt, &wparam(2));             # reload num
    &and($cnt, 7);                      # words left over (0..7)
    &jnz(&label("mw_finish2"));
    &jmp(&label("mw_end"));

    &set_label("mw_finish2", 1);
    for my $word (0 .. 6) {
        my $off     = 4 * $word;
        my $is_last = ($word == 6);     # the 7th tail word needs no count check

        &comment("Tail Round $word");
        &mov("eax", &DWP($off, $ap, "", 0));    # a[word]
        &mul($w);                               # edx:eax = a[word] * w
        &add("eax", $carry);                    # low += carry
        &adc("edx", 0);                         # propagate into high
        &mov(&DWP($off, $rp, "", 0), "eax");    # r[word] = low
        &mov($carry, "edx");                    # carry = high
        &dec($cnt) unless $is_last;
        &jz(&label("mw_end")) unless $is_last;
    }
    &set_label("mw_end", 0);
    &mov("eax", $carry);                # return value

    &function_end($name);
}
332
# bn_sqr_words($name)
#
# Emits the assembly for routine $name.  The generated routine reads three
# stack arguments -- r, a and num -- and for each input word a[i] stores the
# double-word square a[i]*a[i] into r[2*i] (low) and r[2*i+1] (high).
#
# With the script-level $sse2 flag set, a run-time test of bit 26 of the
# first dword of OPENSSL_ia32cap_P selects between a pmuludq based loop and
# the integer implementation emitted after it.
sub bn_sqr_words {
    my ($name) = @_;

    my $extern = $sse2 ? "EXTRN\t_OPENSSL_ia32cap_P:DWORD" : "";
    &function_begin_B($name, $extern);

    if ($sse2) {
        # eax/edx/ecx and mm0 only; leaves through a bare ret.
        my ($rp, $ap, $cnt) = ("eax", "edx", "ecx");

        &picmeup("eax", "OPENSSL_ia32cap_P");
        &bt(&DWP(0, "eax"), 26);
        &jnc(&label("sqr_non_sse2"));

        &mov($rp,  &wparam(0));
        &mov($ap,  &wparam(1));
        &mov($cnt, &wparam(2));

        &set_label("sqr_sse2_loop", 16);
        &movd("mm0", &DWP(0, $ap));         # a[i]
        &pmuludq("mm0", "mm0");             # a[i]*a[i]
        &lea($ap, &DWP(4, $ap));            # next input word
        &movq(&QWP(0, $rp), "mm0");         # store both result words
        &sub($cnt, 1);
        &lea($rp, &DWP(8, $rp));            # two output words; flags kept
        &jnz(&label("sqr_sse2_loop"));

        &emms();
        &ret();
        &set_label("sqr_non_sse2", 16);
    }

    # Integer path.  These four pushes are the hand-written equivalent of
    # the function_begin prologue.
    &push("ebp");
    &push("ebx");
    &push("esi");
    &push("edi");

    &comment("");
    my ($rp, $ap, $cnt) = ("esi", "edi", "ebx");

    &mov($rp,  &wparam(0));
    &mov($ap,  &wparam(1));
    &mov($cnt, &wparam(2));

    &and($cnt, 0xfffffff8);             # num rounded down to a multiple of 8
    &jz(&label("sw_finish"));

    &set_label("sw_loop", 0);
    for my $word (0 .. 7) {
        my $in  = 4 * $word;            # byte offset into a
        my $out = 2 * $in;              # byte offset into r (two words each)
        &comment("Round $in");
        &mov("eax", &DWP($in, $ap, "", 0));         # a[word]
        &mul("eax");                                # edx:eax = a[word]^2
        &mov(&DWP($out, $rp, "", 0), "eax");        # low word
        &mov(&DWP($out + 4, $rp, "", 0), "edx");    # high word
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 64);
    &sub($cnt, 8);
    &jnz(&label("sw_loop"));

    &set_label("sw_finish", 0);
    &mov($cnt, &wparam(2));             # reload num
    &and($cnt, 7);                      # words left over (0..7)
    &jz(&label("sw_end"));

    for my $word (0 .. 6) {
        my $is_last = ($word == 6);     # the 7th tail word needs no count check

        &comment("Tail Round $word");
        &mov("eax", &DWP($word * 4, $ap, "", 0));       # a[word]
        &mul("eax");                                    # edx:eax = a[word]^2
        &mov(&DWP($word * 8, $rp, "", 0), "eax");       # low word
        &dec($cnt) unless $is_last;
        &mov(&DWP($word * 8 + 4, $rp, "", 0), "edx");   # high word
        &jz(&label("sw_end")) unless $is_last;          # flags from 'dec'
    }
    &set_label("sw_end", 0);

    &function_end($name);
}
421
# bn_div_words($name)
#
# Emits the assembly for routine $name: its three stack arguments are loaded
# into edx, eax and ecx (in that order), a single "div ecx" is executed --
# i.e. edx:eax is divided by ecx -- and the routine returns with the
# quotient that div leaves in eax.  No registers are saved, hence the
# function_begin_B/function_end_B pair.
sub bn_div_words {
    my ($name) = @_;

    &function_begin_B($name, "");

    # Load the arguments: high word, low word, divisor.
    my @arg_regs = ("edx", "eax", "ecx");
    for my $idx (0 .. $#arg_regs) {
        &mov($arg_regs[$idx], &wparam($idx));
    }

    &div("ecx");
    &ret();

    &function_end_B($name);
}
434
# bn_add_words($name)
#
# Emits the assembly for routine $name.  The generated routine reads four
# stack arguments -- r, a, b and num -- stores a[i] + b[i] + carry into r[i]
# for each word, and leaves the final carry (0 or 1) in eax as its return
# value.  Words are processed eight at a time, followed by a tail of up to
# seven single words.
sub bn_add_words {
    my ($name) = @_;

    &function_begin($name, "");

    &comment("");
    my ($ap, $bp, $rp)   = ("esi", "edi", "ebx");
    my ($carry, $cnt)    = ("eax", "ebp");
    my ($sum, $addend)   = ("ecx", "edx");

    # Emits the add-with-carry sequence for the word at byte offset $off.
    # When $countdown is true the remaining-word counter is decremented just
    # before the store and a zero result branches to aw_end (the store in
    # between does not touch the flags).
    my $emit_word = sub {
        my ($off, $countdown) = @_;

        &mov($sum,    &DWP($off, $ap, "", 0));  # a[i]
        &mov($addend, &DWP($off, $bp, "", 0));  # b[i]
        &add($sum, $carry);                     # add incoming carry
        &mov($carry, 0);                        # mov keeps CF intact
        &adc($carry, $carry);                   # carry = CF of first add
        &add($sum, $addend);
        &adc($carry, 0);                        # carry += CF of second add
        &dec($cnt) if $countdown;
        &mov(&DWP($off, $rp, "", 0), $sum);     # r[i]
        &jz(&label("aw_end")) if $countdown;
    };

    &mov($rp,  &wparam(0));             # r
    &mov($ap,  &wparam(1));             # a
    &mov($bp,  &wparam(2));             # b
    &mov($cnt, &wparam(3));             # num
    &xor($carry, $carry);               # carry starts at zero
    &and($cnt, 0xfffffff8);             # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop", 0);
    for my $word (0 .. 7) {
        &comment("Round $word");
        $emit_word->($word * 4, 0);
    }

    &comment("");
    &add($ap, 32);
    &add($bp, 32);
    &add($rp, 32);
    &sub($cnt, 8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish", 0);
    &mov($cnt, &wparam(3));             # reload num
    &and($cnt, 7);                      # words left over (0..7)
    &jz(&label("aw_end"));

    for my $word (0 .. 6) {
        &comment("Tail Round $word");
        # The 7th tail word is necessarily the last: no count check needed.
        $emit_word->($word * 4, $word != 6);
    }
    &set_label("aw_end", 0);

    # The carry already lives in eax, so no explicit move is required.

    &function_end($name);
}
506
# bn_sub_words($name)
#
# Emits the assembly for routine $name.  The generated routine reads four
# stack arguments -- r, a, b and num -- stores a[i] - b[i] - borrow into
# r[i] for each word, and leaves the final borrow (0 or 1) in eax as its
# return value.  Words are processed eight at a time, followed by a tail of
# up to seven single words.  The label names ("aw_...") are the same ones
# bn_add_words uses.
sub bn_sub_words {
    my ($name) = @_;

    &function_begin($name, "");

    &comment("");
    my ($ap, $bp, $rp)   = ("esi", "edi", "ebx");
    my ($borrow, $cnt)   = ("eax", "ebp");
    my ($diff, $subtr)   = ("ecx", "edx");

    # Emits the subtract-with-borrow sequence for the word at byte offset
    # $off.  When $countdown is true the remaining-word counter is
    # decremented just before the store and a zero result branches to
    # aw_end (the store in between does not touch the flags).
    my $emit_word = sub {
        my ($off, $countdown) = @_;

        &mov($diff,  &DWP($off, $ap, "", 0));   # a[i]
        &mov($subtr, &DWP($off, $bp, "", 0));   # b[i]
        &sub($diff, $borrow);                   # take incoming borrow
        &mov($borrow, 0);                       # mov keeps CF intact
        &adc($borrow, $borrow);                 # borrow = CF of first sub
        &sub($diff, $subtr);
        &adc($borrow, 0);                       # borrow += CF of second sub
        &dec($cnt) if $countdown;
        &mov(&DWP($off, $rp, "", 0), $diff);    # r[i]
        &jz(&label("aw_end")) if $countdown;
    };

    &mov($rp,  &wparam(0));             # r
    &mov($ap,  &wparam(1));             # a
    &mov($bp,  &wparam(2));             # b
    &mov($cnt, &wparam(3));             # num
    &xor($borrow, $borrow);             # borrow starts at zero
    &and($cnt, 0xfffffff8);             # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    &set_label("aw_loop", 0);
    for my $word (0 .. 7) {
        &comment("Round $word");
        $emit_word->($word * 4, 0);
    }

    &comment("");
    &add($ap, 32);
    &add($bp, 32);
    &add($rp, 32);
    &sub($cnt, 8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish", 0);
    &mov($cnt, &wparam(3));             # reload num
    &and($cnt, 7);                      # words left over (0..7)
    &jz(&label("aw_end"));

    for my $word (0 .. 6) {
        &comment("Tail Round $word");
        # The 7th tail word is necessarily the last: no count check needed.
        $emit_word->($word * 4, $word != 6);
    }
    &set_label("aw_end", 0);

    # The borrow already lives in eax, so no explicit move is required.

    &function_end($name);
}
578
# bn_sub_part_words($name)
#
# Emits the assembly for routine $name.  The generated routine reads five
# stack arguments -- r, a, b, num and dl -- and works in two phases:
#
#   1. For the first num words, r[i] = a[i] - b[i] - borrow, eight words at
#      a time followed by a tail of up to seven words (the tail advances the
#      a/b/r pointers one word at a time).
#   2. Depending on the sign of dl:
#        dl == 0  nothing further is done;
#        dl <  0  for -dl further words, r[i] = 0 - b[i] - borrow;
#        dl >  0  for dl further words, r[i] = a[i] - borrow; as soon as one
#                 of these subtractions produces no carry the code switches
#                 to the "pw_nc" copy sequence (r[i] = a[i]), which ends by
#                 setting the returned borrow to zero.
#
# The running borrow is kept in eax, which is therefore also the return
# value.  The instruction order and label names are exactly those of the
# original generator.
sub bn_sub_part_words {
    my ($name) = @_;

    &function_begin($name, "");

    &comment("");
    my ($ap, $bp, $rp)   = ("esi", "edi", "ebx");
    my ($borrow, $cnt)   = ("eax", "ebp");
    my ($t1, $t2)        = ("ecx", "edx");

    # With $t1 and $t2 already loaded, emits
    #   $t1 = $t1 - borrow - $t2,  borrow = (either subtraction borrowed).
    # mov does not alter CF, which is what lets the borrow be rebuilt with
    # the mov/adc pair in between the two subtractions.
    my $emit_sub_borrow = sub {
        &sub($t1, $borrow);
        &mov($borrow, 0);
        &adc($borrow, $borrow);
        &sub($t1, $t2);
        &adc($borrow, 0);
    };

    &mov($rp,  &wparam(0));             # r
    &mov($ap,  &wparam(1));             # a
    &mov($bp,  &wparam(2));             # b
    &mov($cnt, &wparam(3));             # num
    &xor($borrow, $borrow);             # borrow starts at zero
    &and($cnt, 0xfffffff8);             # num rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    # ---- phase 1: words present in both a and b -------------------------
    &set_label("aw_loop", 0);
    for my $word (0 .. 7) {
        my $off = $word * 4;
        &comment("Round $word");

        &mov($t1, &DWP($off, $ap, "", 0));      # a[i]
        &mov($t2, &DWP($off, $bp, "", 0));      # b[i]
        $emit_sub_borrow->();
        &mov(&DWP($off, $rp, "", 0), $t1);      # r[i]
    }

    &comment("");
    &add($ap, 32);
    &add($bp, 32);
    &add($rp, 32);
    &sub($cnt, 8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish", 0);
    &mov($cnt, &wparam(3));             # reload num
    &and($cnt, 7);                      # words left over (0..7)
    &jz(&label("aw_end"));

    for my $word (0 .. 6) {
        my $is_last = ($word == 6);

        &comment("Tail Round $word");
        &mov($t1, &DWP(0, $ap, "", 0));         # *a
        &mov($t2, &DWP(0, $bp, "", 0));         # *b
        $emit_sub_borrow->();
        &mov(&DWP(0, $rp, "", 0), $t1);         # *r
        # Unlike the other tails, the pointers themselves are advanced so
        # that phase 2 starts right after the last word handled here.
        &add($ap, 4);
        &add($bp, 4);
        &add($rp, 4);
        &dec($cnt) unless $is_last;
        &jz(&label("aw_end")) unless $is_last;
    }
    &set_label("aw_end", 0);

    # ---- phase 2: dispatch on dl ----------------------------------------
    &cmp(&wparam(4), 0);
    &je(&label("pw_end"));

    &mov($cnt, &wparam(4));             # dl
    &cmp($cnt, 0);
    &je(&label("pw_end"));
    &jge(&label("pw_pos"));

    # dl < 0: cnt = -dl, then r[i] = 0 - b[i] - borrow.
    &comment("pw_neg");
    &mov($t2, 0);
    &sub($t2, $cnt);
    &mov($cnt, $t2);
    &and($cnt, 0xfffffff8);             # -dl rounded down to a multiple of 8
    &jz(&label("pw_neg_finish"));

    &set_label("pw_neg_loop", 0);
    for my $word (0 .. 7) {
        my $off = $word * 4;
        &comment("dl<0 Round $word");

        &mov($t1, 0);
        &mov($t2, &DWP($off, $bp, "", 0));      # b[i]
        $emit_sub_borrow->();
        &mov(&DWP($off, $rp, "", 0), $t1);      # r[i]
    }

    &comment("");
    &add($bp, 32);
    &add($rp, 32);
    &sub($cnt, 8);
    &jnz(&label("pw_neg_loop"));

    &set_label("pw_neg_finish", 0);
    &mov($t2, &wparam(4));              # dl
    &mov($cnt, 0);
    &sub($cnt, $t2);                    # cnt = -dl
    &and($cnt, 7);                      # words left over (0..7)
    &jz(&label("pw_end"));

    for my $word (0 .. 6) {
        my $off     = $word * 4;
        my $is_last = ($word == 6);

        &comment("dl<0 Tail Round $word");
        &mov($t1, 0);
        &mov($t2, &DWP($off, $bp, "", 0));      # b[i]
        $emit_sub_borrow->();
        &dec($cnt) unless $is_last;
        &mov(&DWP($off, $rp, "", 0), $t1);      # r[i]
        &jz(&label("pw_end")) unless $is_last;  # flags from 'dec'
    }

    &jmp(&label("pw_end"));

    # dl > 0: r[i] = a[i] - borrow while the subtraction keeps carrying.
    &set_label("pw_pos", 0);

    &and($cnt, 0xfffffff8);             # dl rounded down to a multiple of 8
    &jz(&label("pw_pos_finish"));

    &set_label("pw_pos_loop", 0);

    for my $word (0 .. 7) {
        my $off = $word * 4;
        &comment("dl>0 Round $word");

        &mov($t1, &DWP($off, $ap, "", 0));      # a[i]
        &sub($t1, $borrow);
        &mov(&DWP($off, $rp, "", 0), $t1);      # r[i]
        &jnc(&label("pw_nc" . $word));          # no carry: plain copy from here
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 32);
    &sub($cnt, 8);
    &jnz(&label("pw_pos_loop"));

    &set_label("pw_pos_finish", 0);
    &mov($cnt, &wparam(4));             # dl
    &and($cnt, 7);                      # words left over (0..7)
    &jz(&label("pw_end"));

    for my $word (0 .. 6) {
        my $off     = $word * 4;
        my $is_last = ($word == 6);

        &comment("dl>0 Tail Round $word");
        &mov($t1, &DWP($off, $ap, "", 0));      # a[i]
        &sub($t1, $borrow);
        &mov(&DWP($off, $rp, "", 0), $t1);      # r[i]
        &jnc(&label("pw_tail_nc" . $word));
        &dec($cnt) unless $is_last;
        &jz(&label("pw_end")) unless $is_last;
    }
    &mov($borrow, 1);
    &jmp(&label("pw_end"));

    # Copy sequence entered from the dl > 0 code above.  Each pw_nc<i>
    # label sits right after the copy of word i, which is the word the
    # jumping code has just stored.
    &set_label("pw_nc_loop", 0);
    for my $word (0 .. 7) {
        my $off = $word * 4;
        &mov($t1, &DWP($off, $ap, "", 0));      # a[i]
        &mov(&DWP($off, $rp, "", 0), $t1);      # r[i]
        &set_label("pw_nc" . $word, 0);
    }

    &comment("");
    &add($ap, 32);
    &add($rp, 32);
    &sub($cnt, 8);
    &jnz(&label("pw_nc_loop"));

    &mov($cnt, &wparam(4));             # dl
    &and($cnt, 7);                      # words left over (0..7)
    &jz(&label("pw_nc_end"));

    for my $word (0 .. 6) {
        my $off     = $word * 4;
        my $is_last = ($word == 6);

        &mov($t1, &DWP($off, $ap, "", 0));      # a[i]
        &mov(&DWP($off, $rp, "", 0), $t1);      # r[i]
        &set_label("pw_tail_nc" . $word, 0);
        &dec($cnt) unless $is_last;
        &jz(&label("pw_nc_end")) unless $is_last;
    }

    &set_label("pw_nc_end", 0);
    &mov($borrow, 0);

    &set_label("pw_end", 0);

    # The borrow already lives in eax, so no explicit move is required.

    &function_end($name);
}