3f34abef9e38c0d52632b86037f24bfe7e15ead5
[openssl.git] / crypto / bn / asm / bn-586.pl
1 #!/usr/local/bin/perl
# Driver for the x86 BIGNUM word primitives generator.
# As read from the statements below: the LAST command-line argument names
# the output file, the first remaining argument is handed to asm_init, and
# any argument containing -DOPENSSL_IA32_SSE2 enables the SSE2 code paths.
# The emitter helpers (asm_init, external_label, asm_finish, ...) are not
# defined in this file; presumably they come from x86asm.pl.
2
# Derive the directory of this script so x86asm.pl can be found both next
# to it and in ../../perlasm relative to it.
3 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
4 push(@INC,"${dir}","${dir}../../perlasm");
5 require "x86asm.pl";
6
# Redirect everything printed from here on into the output file.  The
# 2-arg open form is kept on purpose so special names accepted by the
# original (e.g. "-") keep their meaning; the result is now checked so a
# failed open aborts instead of silently producing no assembler output.
7 $output = pop;
8 open STDOUT,">$output" or die "can't open $output: $!";
9
10 &asm_init($ARGV[0],$0);
11
# $sse2 gates the MMX/SSE2 variants emitted by the bn_mul_add_words,
# bn_mul_words and bn_sqr_words generators below.
12 $sse2=0;
13 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
14
15 &external_label("OPENSSL_ia32cap_P") if ($sse2);
16
# Emit one assembler function per primitive; the argument is the name of
# the generated function.
17 &bn_mul_add_words("bn_mul_add_words");
18 &bn_mul_words("bn_mul_words");
19 &bn_sqr_words("bn_sqr_words");
20 &bn_div_words("bn_div_words");
21 &bn_add_words("bn_add_words");
22 &bn_sub_words("bn_sub_words");
23 &bn_sub_part_words("bn_sub_part_words");
24
25 &asm_finish();
26
# Buffered write errors (disk full, ...) only surface at close time, so
# the close result must be checked to avoid a truncated output file.
27 close STDOUT or die "error closing STDOUT: $!";
28
# bn_mul_add_words($name)
#
# Emits an assembler function called $name.  The generated function reads
# four word parameters via wparam(0..3): r, a, num and w.  Per the inline
# comments it computes r[i] += a[i]*w with carry propagation over num words
# and leaves the final carry in eax.
#
# When $sse2 is set, an MMX/SSE2 variant is emitted first; at run time it is
# taken only if bit 26 of the dword at OPENSSL_ia32cap_P is set (bt/jnc
# below), otherwise control continues at maw_non_sse2 with the integer
# mul/adc variant.
#
# NOTE(review): on the SSE2 path a num of 0 reaches maw_sse2_loop, where
# "sub $c,1" does not hit zero until the counter wraps — callers presumably
# never pass num == 0; confirm.
29 sub bn_mul_add_words
30         {
31         local($name)=@_;
32
33         &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
34
            # Register roles for the SSE2 path only; they are reassigned
            # below for the integer path.
35         $r="eax";
36         $a="edx";
37         $c="ecx";
38
39         if ($sse2) {
40                 &picmeup("eax","OPENSSL_ia32cap_P");
41                 &bt(&DWP(0,"eax"),26);
42                 &jnc(&label("maw_non_sse2"));
43
44                 &mov($r,&wparam(0));
45                 &mov($a,&wparam(1));
46                 &mov($c,&wparam(2));
47                 &movd("mm0",&wparam(3));        # mm0 = w
48                 &pxor("mm1","mm1");             # mm1 = carry_in
49                 &jmp(&label("maw_sse2_entry"));
50                 
            # 8-words-per-iteration body.  Products and r[] loads are
            # interleaved with the carry chain held in mm1; each step stores
            # the low 32 bits of mm1 and shifts the high half down as the
            # next carry.
51         &set_label("maw_sse2_unrolled",16);
52                 &movd("mm3",&DWP(0,$r,"",0));   # mm3 = r[0]
53                 &paddq("mm1","mm3");            # mm1 = carry_in + r[0]
54                 &movd("mm2",&DWP(0,$a,"",0));   # mm2 = a[0]
55                 &pmuludq("mm2","mm0");          # mm2 = w*a[0]
56                 &movd("mm4",&DWP(4,$a,"",0));   # mm4 = a[1]
57                 &pmuludq("mm4","mm0");          # mm4 = w*a[1]
58                 &movd("mm6",&DWP(8,$a,"",0));   # mm6 = a[2]
59                 &pmuludq("mm6","mm0");          # mm6 = w*a[2]
60                 &movd("mm7",&DWP(12,$a,"",0));  # mm7 = a[3]
61                 &pmuludq("mm7","mm0");          # mm7 = w*a[3]
62                 &paddq("mm1","mm2");            # mm1 = carry_in + r[0] + w*a[0]
63                 &movd("mm3",&DWP(4,$r,"",0));   # mm3 = r[1]
64                 &paddq("mm3","mm4");            # mm3 = r[1] + w*a[1]
65                 &movd("mm5",&DWP(8,$r,"",0));   # mm5 = r[2]
66                 &paddq("mm5","mm6");            # mm5 = r[2] + w*a[2]
67                 &movd("mm4",&DWP(12,$r,"",0));  # mm4 = r[3]
68                 &paddq("mm7","mm4");            # mm7 = r[3] + w*a[3]
69                 &movd(&DWP(0,$r,"",0),"mm1");
70                 &movd("mm2",&DWP(16,$a,"",0));  # mm2 = a[4]
71                 &pmuludq("mm2","mm0");          # mm2 = w*a[4]
72                 &psrlq("mm1",32);               # mm1 = carry0
73                 &movd("mm4",&DWP(20,$a,"",0));  # mm4 = a[5]
74                 &pmuludq("mm4","mm0");          # mm4 = w*a[5]
75                 &paddq("mm1","mm3");            # mm1 = carry0 + r[1] + w*a[1]
76                 &movd("mm6",&DWP(24,$a,"",0));  # mm6 = a[6]
77                 &pmuludq("mm6","mm0");          # mm6 = w*a[6]
78                 &movd(&DWP(4,$r,"",0),"mm1");
79                 &psrlq("mm1",32);               # mm1 = carry1
80                 &movd("mm3",&DWP(28,$a,"",0));  # mm3 = a[7]
81                 &add($a,32);
82                 &pmuludq("mm3","mm0");          # mm3 = w*a[7]
83                 &paddq("mm1","mm5");            # mm1 = carry1 + r[2] + w*a[2]
84                 &movd("mm5",&DWP(16,$r,"",0));  # mm5 = r[4]
85                 &paddq("mm2","mm5");            # mm2 = r[4] + w*a[4]
86                 &movd(&DWP(8,$r,"",0),"mm1");
87                 &psrlq("mm1",32);               # mm1 = carry2
88                 &paddq("mm1","mm7");            # mm1 = carry2 + r[3] + w*a[3]
89                 &movd("mm5",&DWP(20,$r,"",0));  # mm5 = r[5]
90                 &paddq("mm4","mm5");            # mm4 = r[5] + w*a[5]
91                 &movd(&DWP(12,$r,"",0),"mm1");
92                 &psrlq("mm1",32);               # mm1 = carry3
93                 &paddq("mm1","mm2");            # mm1 = carry3 + r[4] + w*a[4]
94                 &movd("mm5",&DWP(24,$r,"",0));  # mm5 = r[6]
95                 &paddq("mm6","mm5");            # mm6 = r[6] + w*a[6]
96                 &movd(&DWP(16,$r,"",0),"mm1");
97                 &psrlq("mm1",32);               # mm1 = carry4
98                 &paddq("mm1","mm4");            # mm1 = carry4 + r[5] + w*a[5]
99                 &movd("mm5",&DWP(28,$r,"",0));  # mm5 = r[7]
100                 &paddq("mm3","mm5");            # mm3 = r[7] + w*a[7]
101                 &movd(&DWP(20,$r,"",0),"mm1");
102                 &psrlq("mm1",32);               # mm1 = carry5
103                 &paddq("mm1","mm6");            # mm1 = carry5 + r[6] + w*a[6]
104                 &movd(&DWP(24,$r,"",0),"mm1");
105                 &psrlq("mm1",32);               # mm1 = carry6
106                 &paddq("mm1","mm3");            # mm1 = carry6 + r[7] + w*a[7]
107                 &movd(&DWP(28,$r,"",0),"mm1");
108                 &lea($r,&DWP(32,$r));
109                 &psrlq("mm1",32);               # mm1 = carry_out
110
111                 &sub($c,8);
112                 &jz(&label("maw_sse2_exit"));
            # Entry point: any bit above the low three set in the counter
            # means at least 8 words remain, so run the unrolled body again;
            # otherwise fall through to the one-word-per-iteration loop.
113         &set_label("maw_sse2_entry");
114                 &test($c,0xfffffff8);
115                 &jnz(&label("maw_sse2_unrolled"));
116
117         &set_label("maw_sse2_loop",4);
118                 &movd("mm2",&DWP(0,$a));        # mm2 = a[i]
119                 &movd("mm3",&DWP(0,$r));        # mm3 = r[i]
120                 &pmuludq("mm2","mm0");          # a[i] *= w
121                 &lea($a,&DWP(4,$a));
122                 &paddq("mm1","mm3");            # carry += r[i]
123                 &paddq("mm1","mm2");            # carry += a[i]*w
124                 &movd(&DWP(0,$r),"mm1");        # r[i] = carry_low
125                 &sub($c,1);
126                 &psrlq("mm1",32);               # carry = carry_high
127                 &lea($r,&DWP(4,$r));
                    # the flags tested here are still those of "sub $c,1";
                    # psrlq and lea do not modify EFLAGS
128                 &jnz(&label("maw_sse2_loop"));
129         &set_label("maw_sse2_exit");
130                 &movd("eax","mm1");             # c = carry_out
131                 &emms();
132                 &ret();
133
134         &set_label("maw_non_sse2",16);
135         }
136
            # Integer path.  The callee-saved registers are pushed by hand
            # here because function_begin_B was used above (presumably the
            # _B variant emits no register-saving prologue — confirm in
            # x86asm.pl).
137         # function_begin prologue
138         &push("ebp");
139         &push("ebx");
140         &push("esi");
141         &push("edi");
142
143         &comment("");
144         $Low="eax";
145         $High="edx";
146         $a="ebx";
147         $w="ebp";
148         $r="edi";
149         $c="esi";
150
151         &xor($c,$c);            # clear carry
152         &mov($r,&wparam(0));    #
153
154         &mov("ecx",&wparam(2)); #
155         &mov($a,&wparam(1));    #
156
157         &and("ecx",0xfffffff8); # num / 8
158         &mov($w,&wparam(3));    #
159
            # The pushed slot is never read in this function; it is only
            # popped again before function_end.  The zero flag consumed by the
            # jz below is still the one set by the "and" above (mov and push
            # leave flags alone).
160         &push("ecx");           # Up the stack for a tmp variable
161
162         &jz(&label("maw_finish"));
163
164         &set_label("maw_loop",16);
165
            # Eight rounds per loop iteration; $i is the byte offset of the
            # word being processed.
166         for ($i=0; $i<32; $i+=4)
167                 {
168                 &comment("Round $i");
169
170                  &mov("eax",&DWP($i,$a));       # *a
171                 &mul($w);                       # *a * w
172                 &add("eax",$c);                 # L(t)+= c
173                 &adc("edx",0);                  # H(t)+=carry
174                  &add("eax",&DWP($i,$r));       # L(t)+= *r
175                 &adc("edx",0);                  # H(t)+=carry
176                  &mov(&DWP($i,$r),"eax");       # *r= L(t);
177                 &mov($c,"edx");                 # c=  H(t);
178                 }
179
180         &comment("");
181         &sub("ecx",8);
182         &lea($a,&DWP(32,$a));
183         &lea($r,&DWP(32,$r));
            # lea leaves the flags from "sub ecx,8" intact for this jnz
184         &jnz(&label("maw_loop"));
185
186         &set_label("maw_finish",0);
187         &mov("ecx",&wparam(2)); # get num
188         &and("ecx",7);
189         &jnz(&label("maw_finish2"));    # helps branch prediction
190         &jmp(&label("maw_end"));
191
            # Up to 7 leftover words; every round except the last decrements
            # the remaining count and leaves early when it reaches zero.
192         &set_label("maw_finish2",1);
193         for ($i=0; $i<7; $i++)
194                 {
195                 &comment("Tail Round $i");
196                  &mov("eax",&DWP($i*4,$a));     # *a
197                 &mul($w);                       # *a * w
198                 &add("eax",$c);                 # L(t)+=c
199                 &adc("edx",0);                  # H(t)+=carry
200                  &add("eax",&DWP($i*4,$r));     # L(t)+= *r
201                 &adc("edx",0);                  # H(t)+=carry
202                  &dec("ecx") if ($i != 7-1);
203                 &mov(&DWP($i*4,$r),"eax");      # *r= L(t);
204                  &mov($c,"edx");                # c=  H(t);
205                 &jz(&label("maw_end")) if ($i != 7-1);
206                 }
207         &set_label("maw_end",0);
208         &mov("eax",$c);
209
210         &pop("ecx");    # drop the tmp slot pushed before maw_loop
211
212         &function_end($name);
213         }
214
# bn_mul_words($name)
#
# Emits an assembler function called $name reading four word parameters via
# wparam(0..3): r, a, num and w.  Per the inline comments the generated code
# stores the low word of a[i]*w + carry into r[i] for num words and leaves
# the final carry in eax.
#
# With $sse2 set, an MMX/SSE2 loop is emitted first and chosen at run time
# when bit 26 of the dword at OPENSSL_ia32cap_P is set; otherwise the integer
# mul/adc code after mw_non_sse2 runs.
#
# NOTE(review): the SSE2 loop has no guard for num == 0 (the first
# "sub $c,1" would wrap instead of reaching zero) — callers presumably never
# pass 0; confirm.
215 sub bn_mul_words
216         {
217         local($name)=@_;
218
219         &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
220
             # Register roles for the SSE2 path; reassigned further down.
221         $r="eax";
222         $a="edx";
223         $c="ecx";
224
225         if ($sse2) {
226                 &picmeup("eax","OPENSSL_ia32cap_P");
227                 &bt(&DWP(0,"eax"),26);
228                 &jnc(&label("mw_non_sse2"));
229
230                 &mov($r,&wparam(0));
231                 &mov($a,&wparam(1));
232                 &mov($c,&wparam(2));
233                 &movd("mm0",&wparam(3));        # mm0 = w
234                 &pxor("mm1","mm1");             # mm1 = carry = 0
235
236         &set_label("mw_sse2_loop",16);
237                 &movd("mm2",&DWP(0,$a));        # mm2 = a[i]
238                 &pmuludq("mm2","mm0");          # a[i] *= w
239                 &lea($a,&DWP(4,$a));
240                 &paddq("mm1","mm2");            # carry += a[i]*w
241                 &movd(&DWP(0,$r),"mm1");        # r[i] = carry_low
242                 &sub($c,1);
243                 &psrlq("mm1",32);               # carry = carry_high
244                 &lea($r,&DWP(4,$r));
                     # flags from "sub $c,1" survive psrlq and lea
245                 &jnz(&label("mw_sse2_loop"));
246
247                 &movd("eax","mm1");             # return carry
248                 &emms();
249                 &ret();
250         &set_label("mw_non_sse2",16);
251         }
252
             # Integer path; registers are saved by hand because
             # function_begin_B was used (presumably it emits no prologue —
             # confirm in x86asm.pl).
253         # function_begin prologue
254         &push("ebp");
255         &push("ebx");
256         &push("esi");
257         &push("edi");
258
259         &comment("");
260         $Low="eax";
261         $High="edx";
262         $a="ebx";
263         $w="ecx";
264         $r="edi";
265         $c="esi";
266         $num="ebp";
267
268         &xor($c,$c);            # clear carry
269         &mov($r,&wparam(0));    #
270         &mov($a,&wparam(1));    #
271         &mov($num,&wparam(2));  #
272         &mov($w,&wparam(3));    #
273
274         &and($num,0xfffffff8);  # num / 8
275         &jz(&label("mw_finish"));
276
             # Eight rounds per iteration; $i is the byte offset.
277         &set_label("mw_loop",0);
278         for ($i=0; $i<32; $i+=4)
279                 {
280                 &comment("Round $i");
281
282                  &mov("eax",&DWP($i,$a,"",0));  # *a
283                 &mul($w);                       # *a * w
284                 &add("eax",$c);                 # L(t)+=c
285                  # XXX
286
287                 &adc("edx",0);                  # H(t)+=carry
288                  &mov(&DWP($i,$r,"",0),"eax");  # *r= L(t);
289
290                 &mov($c,"edx");                 # c=  H(t);
291                 }
292
293         &comment("");
294         &add($a,32);
295         &add($r,32);
296         &sub($num,8);
297         &jz(&label("mw_finish"));
298         &jmp(&label("mw_loop"));
299
300         &set_label("mw_finish",0);
301         &mov($num,&wparam(2));  # get num
302         &and($num,7);
303         &jnz(&label("mw_finish2"));
304         &jmp(&label("mw_end"));
305
             # Up to 7 leftover words; all rounds but the last decrement the
             # remaining count and exit early at zero.
306         &set_label("mw_finish2",1);
307         for ($i=0; $i<7; $i++)
308                 {
309                 &comment("Tail Round $i");
310                  &mov("eax",&DWP($i*4,$a,"",0));# *a
311                 &mul($w);                       # *a * w
312                 &add("eax",$c);                 # L(t)+=c
313                  # XXX
314                 &adc("edx",0);                  # H(t)+=carry
315                  &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
316                 &mov($c,"edx");                 # c=  H(t);
317                  &dec($num) if ($i != 7-1);
318                 &jz(&label("mw_end")) if ($i != 7-1);
319                 }
320         &set_label("mw_end",0);
321         &mov("eax",$c);
322
323         &function_end($name);
324         }
325
# bn_sqr_words($name)
#
# Emits an assembler function called $name reading three word parameters via
# wparam(0..2): r, a and num.  For each of the num input words the generated
# code squares a[i] and stores the double-width product as two consecutive
# words of r (low word first in the integer path: eax then edx).
#
# With $sse2 set, an MMX/SSE2 loop is emitted first and chosen at run time
# when bit 26 of the dword at OPENSSL_ia32cap_P is set.
#
# NOTE(review): the SSE2 loop has no guard for num == 0 — callers presumably
# never pass 0; confirm.
326 sub bn_sqr_words
327         {
328         local($name)=@_;
329
330         &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
331
             # Register roles for the SSE2 path; reassigned further down.
332         $r="eax";
333         $a="edx";
334         $c="ecx";
335
336         if ($sse2) {
337                 &picmeup("eax","OPENSSL_ia32cap_P");
338                 &bt(&DWP(0,"eax"),26);
339                 &jnc(&label("sqr_non_sse2"));
340
341                 &mov($r,&wparam(0));
342                 &mov($a,&wparam(1));
343                 &mov($c,&wparam(2));
344
345         &set_label("sqr_sse2_loop",16);
346                 &movd("mm0",&DWP(0,$a));        # mm0 = a[i]
347                 &pmuludq("mm0","mm0");          # a[i] *= a[i]
348                 &lea($a,&DWP(4,$a));            # a++
349                 &movq(&QWP(0,$r),"mm0");        # r[i] = a[i]*a[i]
350                 &sub($c,1);
351                 &lea($r,&DWP(8,$r));            # r += 2
                     # lea keeps the flags of "sub $c,1" for this jnz
352                 &jnz(&label("sqr_sse2_loop"));
353
354                 &emms();
355                 &ret();
356         &set_label("sqr_non_sse2",16);
357         }
358
             # Integer path; registers are saved by hand because
             # function_begin_B was used (presumably it emits no prologue —
             # confirm in x86asm.pl).
359         # function_begin prologue
360         &push("ebp");
361         &push("ebx");
362         &push("esi");
363         &push("edi");
364
365         &comment("");
366         $r="esi";
367         $a="edi";
368         $num="ebx";
369
370         &mov($r,&wparam(0));    #
371         &mov($a,&wparam(1));    #
372         &mov($num,&wparam(2));  #
373
374         &and($num,0xfffffff8);  # num / 8
375         &jz(&label("sw_finish"));
376
             # Eight words per iteration: input offset $i, output offsets
             # $i*2 (low word) and $i*2+4 (high word).
377         &set_label("sw_loop",0);
378         for ($i=0; $i<32; $i+=4)
379                 {
380                 &comment("Round $i");
381                 &mov("eax",&DWP($i,$a,"",0));   # *a
382                  # XXX
383                 &mul("eax");                    # *a * *a
384                 &mov(&DWP($i*2,$r,"",0),"eax"); #
385                  &mov(&DWP($i*2+4,$r,"",0),"edx");#
386                 }
387
388         &comment("");
389         &add($a,32);
390         &add($r,64);
391         &sub($num,8);
392         &jnz(&label("sw_loop"));
393
394         &set_label("sw_finish",0);
395         &mov($num,&wparam(2));  # get num
396         &and($num,7);
397         &jz(&label("sw_end"));
398
             # Up to 7 leftover words; all rounds but the last decrement the
             # remaining count and exit early at zero (the mov between dec and
             # jz does not touch the flags).
399         for ($i=0; $i<7; $i++)
400                 {
401                 &comment("Tail Round $i");
402                 &mov("eax",&DWP($i*4,$a,"",0)); # *a
403                  # XXX
404                 &mul("eax");                    # *a * *a
405                 &mov(&DWP($i*8,$r,"",0),"eax"); #
406                  &dec($num) if ($i != 7-1);
407                 &mov(&DWP($i*8+4,$r,"",0),"edx");
408                  &jz(&label("sw_end")) if ($i != 7-1);
409                 }
410         &set_label("sw_end",0);
411
412         &function_end($name);
413         }
414
# bn_div_words($name)
#
# Emits an assembler function called $name that loads its three word
# parameters into edx (wparam(0)), eax (wparam(1)) and ecx (wparam(2)) and
# executes "div ecx".  On x86 that divides the 64-bit value edx:eax by ecx,
# leaving the quotient in eax and the remainder in edx, so the function
# returns the quotient of (param0:param1) / param2.
#
# NOTE(review): the emitted code has no check for a zero divisor or for a
# quotient that does not fit in 32 bits (both raise a divide fault on x86);
# presumably callers guarantee param0 < param2 — confirm.
415 sub bn_div_words
416         {
417         local($name)=@_;
418
419         &function_begin_B($name,"");
420         &mov("edx",&wparam(0)); # high word of the dividend
421         &mov("eax",&wparam(1)); # low word of the dividend
422         &mov("ecx",&wparam(2)); # divisor
423         &div("ecx");
424         &ret();
425         &function_end_B($name);
426         }
427
# bn_add_words($name)
#
# Emits an assembler function called $name reading four word parameters via
# wparam(0..3): r, a, b and num.  The generated code computes
# r[i] = a[i] + b[i] + carry for num words and leaves the final carry in
# eax ($c is "eax").
#
# Carry idiom used in every round:
#   add tmp1,c   -> CF = carry out of a[i]+c
#   mov c,0      -> clears c without touching CF (an xor would destroy CF)
#   adc c,c      -> c = CF
#   add tmp1,tmp2; adc c,0 -> add b[i] and fold its carry into c
428 sub bn_add_words
429         {
430         local($name)=@_;
431
432         &function_begin($name,"");
433
434         &comment("");
435         $a="esi";
436         $b="edi";
437         $c="eax";
438         $r="ebx";
439         $tmp1="ecx";
440         $tmp2="edx";
441         $num="ebp";
442
443         &mov($r,&wparam(0));    # get r
444          &mov($a,&wparam(1));   # get a
445         &mov($b,&wparam(2));    # get b
446          &mov($num,&wparam(3)); # get num
447         &xor($c,$c);            # clear carry
448          &and($num,0xfffffff8); # num / 8
449
450         &jz(&label("aw_finish"));
451
             # Eight words per loop iteration.
452         &set_label("aw_loop",0);
453         for ($i=0; $i<8; $i++)
454                 {
455                 &comment("Round $i");
456
457                 &mov($tmp1,&DWP($i*4,$a,"",0));         # *a
458                  &mov($tmp2,&DWP($i*4,$b,"",0));        # *b
459                 &add($tmp1,$c);
460                  &mov($c,0);
461                 &adc($c,$c);
462                  &add($tmp1,$tmp2);
463                 &adc($c,0);
464                  &mov(&DWP($i*4,$r,"",0),$tmp1);        # *r
465                 }
466
467         &comment("");
468         &add($a,32);
469          &add($b,32);
470         &add($r,32);
471          &sub($num,8);
472         &jnz(&label("aw_loop"));
473
474         &set_label("aw_finish",0);
475         &mov($num,&wparam(3));  # get num
476         &and($num,7);
477          &jz(&label("aw_end"));
478
             # Up to 7 leftover words; all rounds but the last decrement the
             # remaining count and exit early at zero (the store between dec
             # and jz does not touch the flags).
479         for ($i=0; $i<7; $i++)
480                 {
481                 &comment("Tail Round $i");
482                 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
483                  &mov($tmp2,&DWP($i*4,$b,"",0));# *b
484                 &add($tmp1,$c);
485                  &mov($c,0);
486                 &adc($c,$c);
487                  &add($tmp1,$tmp2);
488                 &adc($c,0);
489                  &dec($num) if ($i != 6);
490                 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
491                  &jz(&label("aw_end")) if ($i != 6);
492                 }
493         &set_label("aw_end",0);
494
495 #       &mov("eax",$c);         # $c is "eax"
496
497         &function_end($name);
498         }
499
# bn_sub_words($name)
#
# Emits an assembler function called $name reading four word parameters via
# wparam(0..3): r, a, b and num.  The generated code computes
# r[i] = a[i] - b[i] - borrow for num words and leaves the final borrow in
# eax ($c is "eax").
#
# Borrow idiom used in every round:
#   sub tmp1,c   -> CF = borrow out of a[i]-c
#   mov c,0      -> clears c without touching CF (an xor would destroy CF)
#   adc c,c      -> c = CF
#   sub tmp1,tmp2; adc c,0 -> subtract b[i] and fold its borrow into c
#
# The aw_* label names are the same ones bn_add_words uses; presumably
# x86asm.pl scopes labels per generated function — confirm.
500 sub bn_sub_words
501         {
502         local($name)=@_;
503
504         &function_begin($name,"");
505
506         &comment("");
507         $a="esi";
508         $b="edi";
509         $c="eax";
510         $r="ebx";
511         $tmp1="ecx";
512         $tmp2="edx";
513         $num="ebp";
514
515         &mov($r,&wparam(0));    # get r
516          &mov($a,&wparam(1));   # get a
517         &mov($b,&wparam(2));    # get b
518          &mov($num,&wparam(3)); # get num
519         &xor($c,$c);            # clear borrow
520          &and($num,0xfffffff8); # num / 8
521
522         &jz(&label("aw_finish"));
523
             # Eight words per loop iteration.
524         &set_label("aw_loop",0);
525         for ($i=0; $i<8; $i++)
526                 {
527                 &comment("Round $i");
528
529                 &mov($tmp1,&DWP($i*4,$a,"",0));         # *a
530                  &mov($tmp2,&DWP($i*4,$b,"",0));        # *b
531                 &sub($tmp1,$c);
532                  &mov($c,0);
533                 &adc($c,$c);
534                  &sub($tmp1,$tmp2);
535                 &adc($c,0);
536                  &mov(&DWP($i*4,$r,"",0),$tmp1);        # *r
537                 }
538
539         &comment("");
540         &add($a,32);
541          &add($b,32);
542         &add($r,32);
543          &sub($num,8);
544         &jnz(&label("aw_loop"));
545
546         &set_label("aw_finish",0);
547         &mov($num,&wparam(3));  # get num
548         &and($num,7);
549          &jz(&label("aw_end"));
550
             # Up to 7 leftover words; all rounds but the last decrement the
             # remaining count and exit early at zero (the store between dec
             # and jz does not touch the flags).
551         for ($i=0; $i<7; $i++)
552                 {
553                 &comment("Tail Round $i");
554                 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
555                  &mov($tmp2,&DWP($i*4,$b,"",0));# *b
556                 &sub($tmp1,$c);
557                  &mov($c,0);
558                 &adc($c,$c);
559                  &sub($tmp1,$tmp2);
560                 &adc($c,0);
561                  &dec($num) if ($i != 6);
562                 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
563                  &jz(&label("aw_end")) if ($i != 6);
564                 }
565         &set_label("aw_end",0);
566
567 #       &mov("eax",$c);         # $c is "eax"
568
569         &function_end($name);
570         }
571
# bn_sub_part_words($name)
#
# Emits an assembler function called $name reading five word parameters via
# wparam(0..4): r, a, b, num and dl.  The generated code works in two phases:
#
#   1. r[i] = a[i] - b[i] - borrow for the first num words (same borrow idiom
#      as bn_sub_words: "mov c,0" clears c without touching CF, "adc c,c"
#      then captures the borrow).  Unlike bn_sub_words, the tail rounds
#      advance a, b and r by 4 after every word so the pointers end up just
#      past the common part.
#   2. Depending on the sign of dl:
#        dl == 0: nothing more to do;
#        dl <  0: r[i] = 0 - b[i] - borrow for -dl further words;
#        dl >  0: r[i] = a[i] - borrow for dl further words; as soon as a
#                 subtraction produces no borrow the code jumps into a plain
#                 copy loop (labels pw_nc<i> / pw_tail_nc<i>) for the
#                 remaining words and finishes with borrow 0.
#
# The borrow is left in eax ($c is "eax").
572 sub bn_sub_part_words
573         {
574         local($name)=@_;
575
576         &function_begin($name,"");
577
578         &comment("");
579         $a="esi";
580         $b="edi";
581         $c="eax";
582         $r="ebx";
583         $tmp1="ecx";
584         $tmp2="edx";
585         $num="ebp";
586
587         &mov($r,&wparam(0));    # get r
588          &mov($a,&wparam(1));   # get a
589         &mov($b,&wparam(2));    # get b
590          &mov($num,&wparam(3)); # get num
591         &xor($c,$c);            # clear borrow
592          &and($num,0xfffffff8); # num / 8
593
594         &jz(&label("aw_finish"));
595
             # Phase 1, eight words per loop iteration.
596         &set_label("aw_loop",0);
597         for ($i=0; $i<8; $i++)
598                 {
599                 &comment("Round $i");
600
601                 &mov($tmp1,&DWP($i*4,$a,"",0));         # *a
602                  &mov($tmp2,&DWP($i*4,$b,"",0));        # *b
603                 &sub($tmp1,$c);
604                  &mov($c,0);
605                 &adc($c,$c);
606                  &sub($tmp1,$tmp2);
607                 &adc($c,0);
608                  &mov(&DWP($i*4,$r,"",0),$tmp1);        # *r
609                 }
610
611         &comment("");
612         &add($a,32);
613          &add($b,32);
614         &add($r,32);
615          &sub($num,8);
616         &jnz(&label("aw_loop"));
617
618         &set_label("aw_finish",0);
619         &mov($num,&wparam(3));  # get num
620         &and($num,7);
621          &jz(&label("aw_end"));
622
             # Phase 1 tail: always offset 0, pointers bumped after each word
             # so that phase 2 continues right after the last word handled.
             # The dec comes after the three adds, so the jz tests the count.
623         for ($i=0; $i<7; $i++)
624                 {
625                 &comment("Tail Round $i");
626                 &mov($tmp1,&DWP(0,$a,"",0));    # *a
627                  &mov($tmp2,&DWP(0,$b,"",0));# *b
628                 &sub($tmp1,$c);
629                  &mov($c,0);
630                 &adc($c,$c);
631                  &sub($tmp1,$tmp2);
632                 &adc($c,0);
633                 &mov(&DWP(0,$r,"",0),$tmp1);    # *r
634                 &add($a, 4);
635                 &add($b, 4);
636                 &add($r, 4);
637                  &dec($num) if ($i != 6);
638                  &jz(&label("aw_end")) if ($i != 6);
639                 }
640         &set_label("aw_end",0);
641
             # Phase 2 dispatch on dl.  dl is compared with zero twice (once in
             # memory, once after loading); the second je repeats the first
             # test and is harmless.
642         &cmp(&wparam(4),0);
643         &je(&label("pw_end"));
644
645         &mov($num,&wparam(4));  # get dl
646         &cmp($num,0);
647         &je(&label("pw_end"));
648         &jge(&label("pw_pos"));
649
             # dl < 0: negate dl to get the word count, then subtract b (and
             # the borrow) from zero.
650         &comment("pw_neg");
651         &mov($tmp2,0);
652         &sub($tmp2,$num);
653         &mov($num,$tmp2);
654         &and($num,0xfffffff8);  # num / 8
655         &jz(&label("pw_neg_finish"));
656
657         &set_label("pw_neg_loop",0);
658         for ($i=0; $i<8; $i++)
659         {
660             &comment("dl<0 Round $i");
661
662             &mov($tmp1,0);
663             &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
664             &sub($tmp1,$c);
665             &mov($c,0);
666             &adc($c,$c);
667             &sub($tmp1,$tmp2);
668             &adc($c,0);
669             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
670         }
671             
672         &comment("");
673         &add($b,32);
674         &add($r,32);
675         &sub($num,8);
676         &jnz(&label("pw_neg_loop"));
677             
678         &set_label("pw_neg_finish",0);
679         &mov($tmp2,&wparam(4)); # get dl
680         &mov($num,0);
681         &sub($num,$tmp2);
682         &and($num,7);
683         &jz(&label("pw_end"));
684             
685         for ($i=0; $i<7; $i++)
686         {
687             &comment("dl<0 Tail Round $i");
688             &mov($tmp1,0);
689             &mov($tmp2,&DWP($i*4,$b,"",0));# *b
690             &sub($tmp1,$c);
691             &mov($c,0);
692             &adc($c,$c);
693             &sub($tmp1,$tmp2);
694             &adc($c,0);
695             &dec($num) if ($i != 6);
696             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
697             &jz(&label("pw_end")) if ($i != 6);
698         }
699
700         &jmp(&label("pw_end"));
701         
             # dl > 0: propagate the borrow through the extra words of a.  $c
             # is not rewritten inside these rounds: while every subtraction
             # borrows it stays as it was; the first one that does not borrow
             # (jnc) switches to the copy code below at the matching position.
702         &set_label("pw_pos",0);
703         
704         &and($num,0xfffffff8);  # num / 8
705         &jz(&label("pw_pos_finish"));
706
707         &set_label("pw_pos_loop",0);
708
709         for ($i=0; $i<8; $i++)
710         {
711             &comment("dl>0 Round $i");
712
713             &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
714             &sub($tmp1,$c);
715             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
716             &jnc(&label("pw_nc".$i));
717         }
718             
719         &comment("");
720         &add($a,32);
721         &add($r,32);
722         &sub($num,8);
723         &jnz(&label("pw_pos_loop"));
724             
725         &set_label("pw_pos_finish",0);
726         &mov($num,&wparam(4));  # get dl
727         &and($num,7);
728         &jz(&label("pw_end"));
729             
730         for ($i=0; $i<7; $i++)
731         {
732             &comment("dl>0 Tail Round $i");
733             &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
734             &sub($tmp1,$c);
735             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
736             &jnc(&label("pw_tail_nc".$i));
737             &dec($num) if ($i != 6);
738             &jz(&label("pw_end")) if ($i != 6);
739         }
             # every tail round borrowed: report a borrow of 1
740         &mov($c,1);
741         &jmp(&label("pw_end"));
742
             # No-borrow continuation: plain copy of a into r.  Each label
             # pw_nc<i> sits AFTER the copy of word i, because the jump source
             # has already stored that word.
743         &set_label("pw_nc_loop",0);
744         for ($i=0; $i<8; $i++)
745         {
746             &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
747             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
748             &set_label("pw_nc".$i,0);
749         }
750             
751         &comment("");
752         &add($a,32);
753         &add($r,32);
754         &sub($num,8);
755         &jnz(&label("pw_nc_loop"));
756             
757         &mov($num,&wparam(4));  # get dl
758         &and($num,7);
759         &jz(&label("pw_nc_end"));
760             
761         for ($i=0; $i<7; $i++)
762         {
763             &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
764             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
765             &set_label("pw_tail_nc".$i,0);
766             &dec($num) if ($i != 6);
767             &jz(&label("pw_nc_end")) if ($i != 6);
768         }
769
770         &set_label("pw_nc_end",0);
771         &mov($c,0);             # copy path always ends without a borrow
772
773         &set_label("pw_end",0);
774
775 #       &mov("eax",$c);         # $c is "eax"
776
777         &function_end($name);
778         }