#!/usr/local/bin/perl
#
# Captured from a gitweb page whose header identified this file as:
#   1468906be1fbc9bce7966665d2c5b9ded7cdd007
#   [openssl.git] / crypto / bn / asm / bn-586.pl
#
# Generates x86 assembler for the low-level bignum word routines by calling
# the perlasm helpers loaded from x86asm.pl.  Run as:
#   perl bn-586.pl <first arg handed to asm_init> [-DOPENSSL_IA32_SSE2 ...]

push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# The SSE2 code paths are emitted only when -DOPENSSL_IA32_SSE2 appears
# somewhere among the command-line arguments.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# The SSE2 paths test OPENSSL_ia32cap_P at run time, so declare it external.
&external_label("OPENSSL_ia32cap_P") if ($sse2);

# One generator call per routine; the argument is the symbol name to emit.
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# bn_mul_add_words($name)
#
# Emits the routine $name(r, a, num, w).  For each of the num words the
# generated code computes a[i]*w + r[i] + carry, stores the low 32 bits to
# r[i] and carries the high 32 bits into the next word.  The final carry is
# returned in eax.
#
# When $sse2 is set, an MMX/SSE2 path is emitted first.  It is taken at run
# time only if bit 26 of the first dword of OPENSSL_ia32cap_P is set;
# otherwise control jumps to the integer path at maw_non_sse2.
#
# NOTE(review): the SSE2 path does not guard against num == 0 (the
# single-word loop decrements the counter before testing it), whereas the
# integer path does.  Callers are presumably expected to pass num > 0 --
# confirm.
sub bn_mul_add_words
        {
        local($name)=@_;

        &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

        # Register assignment for the SSE2 path only.
        $r="eax";
        $a="edx";
        $c="ecx";

        if ($sse2) {
                &picmeup("eax","OPENSSL_ia32cap_P");
                &bt(&DWP(0,"eax"),26);
                &jnc(&label("maw_non_sse2"));

                &mov($r,&wparam(0));
                &mov($a,&wparam(1));
                &mov($c,&wparam(2));
                &movd("mm0",&wparam(3));        # mm0 = w
                &pxor("mm1","mm1");             # mm1 = carry_in
                &jmp(&label("maw_sse2_entry"));

        # Eight words per iteration.  The loads and multiplies for later
        # words are interleaved with the carry chain of earlier words, so
        # the statement order below is significant.
        &set_label("maw_sse2_unrolled",16);
                &movd("mm3",&DWP(0,$r,"",0));   # mm3 = r[0]
                &paddq("mm1","mm3");            # mm1 = carry_in + r[0]
                &movd("mm2",&DWP(0,$a,"",0));   # mm2 = a[0]
                &pmuludq("mm2","mm0");          # mm2 = w*a[0]
                &movd("mm4",&DWP(4,$a,"",0));   # mm4 = a[1]
                &pmuludq("mm4","mm0");          # mm4 = w*a[1]
                &movd("mm6",&DWP(8,$a,"",0));   # mm6 = a[2]
                &pmuludq("mm6","mm0");          # mm6 = w*a[2]
                &movd("mm7",&DWP(12,$a,"",0));  # mm7 = a[3]
                &pmuludq("mm7","mm0");          # mm7 = w*a[3]
                &paddq("mm1","mm2");            # mm1 = carry_in + r[0] + w*a[0]
                &movd("mm3",&DWP(4,$r,"",0));   # mm3 = r[1]
                &paddq("mm3","mm4");            # mm3 = r[1] + w*a[1]
                &movd("mm5",&DWP(8,$r,"",0));   # mm5 = r[2]
                &paddq("mm5","mm6");            # mm5 = r[2] + w*a[2]
                &movd("mm4",&DWP(12,$r,"",0));  # mm4 = r[3]
                &paddq("mm7","mm4");            # mm7 = r[3] + w*a[3]
                &movd(&DWP(0,$r,"",0),"mm1");
                &movd("mm2",&DWP(16,$a,"",0));  # mm2 = a[4]
                &pmuludq("mm2","mm0");          # mm2 = w*a[4]
                &psrlq("mm1",32);               # mm1 = carry0
                &movd("mm4",&DWP(20,$a,"",0));  # mm4 = a[5]
                &pmuludq("mm4","mm0");          # mm4 = w*a[5]
                &paddq("mm1","mm3");            # mm1 = carry0 + r[1] + w*a[1]
                &movd("mm6",&DWP(24,$a,"",0));  # mm6 = a[6]
                &pmuludq("mm6","mm0");          # mm6 = w*a[6]
                &movd(&DWP(4,$r,"",0),"mm1");
                &psrlq("mm1",32);               # mm1 = carry1
                &movd("mm3",&DWP(28,$a,"",0));  # mm3 = a[7]
                &add($a,32);
                &pmuludq("mm3","mm0");          # mm3 = w*a[7]
                &paddq("mm1","mm5");            # mm1 = carry1 + r[2] + w*a[2]
                &movd("mm5",&DWP(16,$r,"",0));  # mm5 = r[4]
                &paddq("mm2","mm5");            # mm2 = r[4] + w*a[4]
                &movd(&DWP(8,$r,"",0),"mm1");
                &psrlq("mm1",32);               # mm1 = carry2
                &paddq("mm1","mm7");            # mm1 = carry2 + r[3] + w*a[3]
                &movd("mm5",&DWP(20,$r,"",0));  # mm5 = r[5]
                &paddq("mm4","mm5");            # mm4 = r[5] + w*a[5]
                &movd(&DWP(12,$r,"",0),"mm1");
                &psrlq("mm1",32);               # mm1 = carry3
                &paddq("mm1","mm2");            # mm1 = carry3 + r[4] + w*a[4]
                &movd("mm5",&DWP(24,$r,"",0));  # mm5 = r[6]
                &paddq("mm6","mm5");            # mm6 = r[6] + w*a[6]
                &movd(&DWP(16,$r,"",0),"mm1");
                &psrlq("mm1",32);               # mm1 = carry4
                &paddq("mm1","mm4");            # mm1 = carry4 + r[5] + w*a[5]
                &movd("mm5",&DWP(28,$r,"",0));  # mm5 = r[7]
                &paddq("mm3","mm5");            # mm3 = r[7] + w*a[7]
                &movd(&DWP(20,$r,"",0),"mm1");
                &psrlq("mm1",32);               # mm1 = carry5
                &paddq("mm1","mm6");            # mm1 = carry5 + r[6] + w*a[6]
                &movd(&DWP(24,$r,"",0),"mm1");
                &psrlq("mm1",32);               # mm1 = carry6
                &paddq("mm1","mm3");            # mm1 = carry6 + r[7] + w*a[7]
                &movd(&DWP(28,$r,"",0),"mm1");
                &lea($r,&DWP(32,$r));
                &psrlq("mm1",32);               # mm1 = carry_out

                &sub($c,8);
                &jz(&label("maw_sse2_exit"));
        &set_label("maw_sse2_entry");
                &test($c,0xfffffff8);           # at least 8 words left?
                &jnz(&label("maw_sse2_unrolled"));

        # Fewer than 8 words remain: handle them one at a time.
        &set_label("maw_sse2_loop",4);
                &movd("mm2",&DWP(0,$a));        # mm2 = a[i]
                &movd("mm3",&DWP(0,$r));        # mm3 = r[i]
                &pmuludq("mm2","mm0");          # a[i] *= w
                &lea($a,&DWP(4,$a));
                &paddq("mm1","mm3");            # carry += r[i]
                &paddq("mm1","mm2");            # carry += a[i]*w
                &movd(&DWP(0,$r),"mm1");        # r[i] = carry_low
                &sub($c,1);
                &psrlq("mm1",32);               # carry = carry_high
                &lea($r,&DWP(4,$r));            # lea/psrlq leave the flags from sub intact
                &jnz(&label("maw_sse2_loop"));
        &set_label("maw_sse2_exit");
                &movd("eax","mm1");             # c = carry_out
                &emms();
                &ret();

        &set_label("maw_non_sse2",16);
        }

        # function_begin prologue
        &push("ebp");
        &push("ebx");
        &push("esi");
        &push("edi");

        &comment("");
        # Register assignment for the integer path.  $Low/$High name the
        # mul result halves; the code below spells them "eax"/"edx".
        $Low="eax";
        $High="edx";
        $a="ebx";
        $w="ebp";
        $r="edi";
        $c="esi";

        &xor($c,$c);            # clear carry
        &mov($r,&wparam(0));    # r

        &mov("ecx",&wparam(2)); # num
        &mov($a,&wparam(1));    # a

        &and("ecx",0xfffffff8); # num / 8
        &mov($w,&wparam(3));    # w

        # The slot pushed here is released by the pop at maw_end; nothing in
        # this routine reads it back.
        # NOTE(review): wparam(2) is used again after this push, which
        # presumably relies on x86asm.pl adjusting parameter offsets for
        # pushes -- confirm.
        &push("ecx");           # Up the stack for a tmp variable

        &jz(&label("maw_finish"));      # flags still come from the and above

        &set_label("maw_loop",16);

        for ($i=0; $i<32; $i+=4)
                {
                &comment("Round $i");

                 &mov("eax",&DWP($i,$a));       # *a
                &mul($w);                       # *a * w
                &add("eax",$c);                 # L(t)+= c
                &adc("edx",0);                  # H(t)+=carry
                 &add("eax",&DWP($i,$r));       # L(t)+= *r
                &adc("edx",0);                  # H(t)+=carry
                 &mov(&DWP($i,$r),"eax");       # *r= L(t);
                &mov($c,"edx");                 # c=  H(t);
                }

        &comment("");
        &sub("ecx",8);
        &lea($a,&DWP(32,$a));   # lea keeps the flags from sub for the jnz
        &lea($r,&DWP(32,$r));
        &jnz(&label("maw_loop"));

        &set_label("maw_finish",0);
        &mov("ecx",&wparam(2)); # get num
        &and("ecx",7);
        &jnz(&label("maw_finish2"));    # helps branch prediction
        &jmp(&label("maw_end"));

        # Up to seven tail words; every round but the last counts ecx down
        # and leaves as soon as it reaches zero.
        &set_label("maw_finish2",1);
        for ($i=0; $i<7; $i++)
                {
                &comment("Tail Round $i");
                 &mov("eax",&DWP($i*4,$a));     # *a
                &mul($w);                       # *a * w
                &add("eax",$c);                 # L(t)+=c
                &adc("edx",0);                  # H(t)+=carry
                 &add("eax",&DWP($i*4,$r));     # L(t)+= *r
                &adc("edx",0);                  # H(t)+=carry
                 &dec("ecx") if ($i != 7-1);
                &mov(&DWP($i*4,$r),"eax");      # *r= L(t);
                 &mov($c,"edx");                # c=  H(t);
                &jz(&label("maw_end")) if ($i != 7-1);
                }
        &set_label("maw_end",0);
        &mov("eax",$c);         # return the final carry

        &pop("ecx");    # clear variable from

        &function_end($name);
        }

# bn_mul_words($name)
#
# Emits the routine $name(r, a, num, w).  For each of the num words the
# generated code computes a[i]*w + carry, stores the low 32 bits to r[i] and
# carries the high 32 bits forward.  The final carry is returned in eax.
#
# When $sse2 is set, an MMX/SSE2 loop is emitted first and is taken at run
# time only if bit 26 of the first dword of OPENSSL_ia32cap_P is set.
#
# NOTE(review): the SSE2 loop decrements the counter before testing it, so
# it does not guard against num == 0; the integer path does.  Callers are
# presumably expected to pass num > 0 -- confirm.
sub bn_mul_words
        {
        local($name)=@_;

        &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

        # Register assignment for the SSE2 path only.
        $r="eax";
        $a="edx";
        $c="ecx";

        if ($sse2) {
                &picmeup("eax","OPENSSL_ia32cap_P");
                &bt(&DWP(0,"eax"),26);
                &jnc(&label("mw_non_sse2"));

                &mov($r,&wparam(0));
                &mov($a,&wparam(1));
                &mov($c,&wparam(2));
                &movd("mm0",&wparam(3));        # mm0 = w
                &pxor("mm1","mm1");             # mm1 = carry = 0

        &set_label("mw_sse2_loop",16);
                &movd("mm2",&DWP(0,$a));        # mm2 = a[i]
                &pmuludq("mm2","mm0");          # a[i] *= w
                &lea($a,&DWP(4,$a));
                &paddq("mm1","mm2");            # carry += a[i]*w
                &movd(&DWP(0,$r),"mm1");        # r[i] = carry_low
                &sub($c,1);
                &psrlq("mm1",32);               # carry = carry_high
                &lea($r,&DWP(4,$r));            # lea/psrlq leave the flags from sub intact
                &jnz(&label("mw_sse2_loop"));

                &movd("eax","mm1");             # return carry
                &emms();
                &ret();
        &set_label("mw_non_sse2",16);
        }

        # function_begin prologue
        &push("ebp");
        &push("ebx");
        &push("esi");
        &push("edi");

        &comment("");
        # Register assignment for the integer path.  $Low/$High name the
        # mul result halves; the code below spells them "eax"/"edx".
        $Low="eax";
        $High="edx";
        $a="ebx";
        $w="ecx";
        $r="edi";
        $c="esi";
        $num="ebp";

        &xor($c,$c);            # clear carry
        &mov($r,&wparam(0));    # r
        &mov($a,&wparam(1));    # a
        &mov($num,&wparam(2));  # num
        &mov($w,&wparam(3));    # w

        &and($num,0xfffffff8);  # num / 8
        &jz(&label("mw_finish"));

        &set_label("mw_loop",0);
        for ($i=0; $i<32; $i+=4)
                {
                &comment("Round $i");

                 &mov("eax",&DWP($i,$a,"",0));  # *a
                &mul($w);                       # *a * w
                &add("eax",$c);                 # L(t)+=c
                 # XXX

                &adc("edx",0);                  # H(t)+=carry
                 &mov(&DWP($i,$r,"",0),"eax");  # *r= L(t);

                &mov($c,"edx");                 # c=  H(t);
                }

        &comment("");
        &add($a,32);
        &add($r,32);
        &sub($num,8);
        &jz(&label("mw_finish"));
        &jmp(&label("mw_loop"));

        &set_label("mw_finish",0);
        &mov($num,&wparam(2));  # get num
        &and($num,7);
        &jnz(&label("mw_finish2"));
        &jmp(&label("mw_end"));

        # Up to seven tail words; every round but the last counts $num down
        # and leaves as soon as it reaches zero.
        &set_label("mw_finish2",1);
        for ($i=0; $i<7; $i++)
                {
                &comment("Tail Round $i");
                 &mov("eax",&DWP($i*4,$a,"",0));# *a
                &mul($w);                       # *a * w
                &add("eax",$c);                 # L(t)+=c
                 # XXX
                &adc("edx",0);                  # H(t)+=carry
                 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
                &mov($c,"edx");                 # c=  H(t);
                 &dec($num) if ($i != 7-1);
                &jz(&label("mw_end")) if ($i != 7-1);
                }
        &set_label("mw_end",0);
        &mov("eax",$c);         # return the final carry

        &function_end($name);
        }

# bn_sqr_words($name)
#
# Emits the routine $name(r, a, num).  For each of the num input words the
# generated code squares a[i] and stores the 64-bit product as two words,
# low half at r[2*i] and high half at r[2*i+1].  No return value is set up
# by the generated code.
#
# When $sse2 is set, an MMX/SSE2 loop is emitted first and is taken at run
# time only if bit 26 of the first dword of OPENSSL_ia32cap_P is set.
#
# NOTE(review): the SSE2 loop decrements the counter before testing it, so
# it does not guard against num == 0; the integer path does.  Callers are
# presumably expected to pass num > 0 -- confirm.
sub bn_sqr_words
        {
        local($name)=@_;

        &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

        # Register assignment for the SSE2 path only.
        $r="eax";
        $a="edx";
        $c="ecx";

        if ($sse2) {
                &picmeup("eax","OPENSSL_ia32cap_P");
                &bt(&DWP(0,"eax"),26);
                &jnc(&label("sqr_non_sse2"));

                &mov($r,&wparam(0));
                &mov($a,&wparam(1));
                &mov($c,&wparam(2));

        &set_label("sqr_sse2_loop",16);
                &movd("mm0",&DWP(0,$a));        # mm0 = a[i]
                &pmuludq("mm0","mm0");          # a[i] *= a[i]
                &lea($a,&DWP(4,$a));            # a++
                &movq(&QWP(0,$r),"mm0");        # r[i] = a[i]*a[i]
                &sub($c,1);
                &lea($r,&DWP(8,$r));            # r += 2 (lea keeps the flags from sub)
                &jnz(&label("sqr_sse2_loop"));

                &emms();
                &ret();
        &set_label("sqr_non_sse2",16);
        }

        # function_begin prologue
        &push("ebp");
        &push("ebx");
        &push("esi");
        &push("edi");

        &comment("");
        # Register assignment for the integer path.
        $r="esi";
        $a="edi";
        $num="ebx";

        &mov($r,&wparam(0));    # r
        &mov($a,&wparam(1));    # a
        &mov($num,&wparam(2));  # num

        &and($num,0xfffffff8);  # num / 8
        &jz(&label("sw_finish"));

        # Eight input words (sixteen output words) per iteration.
        &set_label("sw_loop",0);
        for ($i=0; $i<32; $i+=4)
                {
                &comment("Round $i");
                &mov("eax",&DWP($i,$a,"",0));   # *a
                 # XXX
                &mul("eax");                    # *a * *a
                &mov(&DWP($i*2,$r,"",0),"eax"); # low half
                 &mov(&DWP($i*2+4,$r,"",0),"edx");# high half
                }

        &comment("");
        &add($a,32);
        &add($r,64);
        &sub($num,8);
        &jnz(&label("sw_loop"));

        &set_label("sw_finish",0);
        &mov($num,&wparam(2));  # get num
        &and($num,7);
        &jz(&label("sw_end"));

        # Up to seven tail words; every round but the last counts $num down
        # and leaves as soon as it reaches zero.
        for ($i=0; $i<7; $i++)
                {
                &comment("Tail Round $i");
                &mov("eax",&DWP($i*4,$a,"",0)); # *a
                 # XXX
                &mul("eax");                    # *a * *a
                &mov(&DWP($i*8,$r,"",0),"eax"); # low half
                 &dec($num) if ($i != 7-1);
                &mov(&DWP($i*8+4,$r,"",0),"edx");       # high half (mov keeps the flags from dec)
                 &jz(&label("sw_end")) if ($i != 7-1);
                }
        &set_label("sw_end",0);

        &function_end($name);
        }

# bn_div_words($name)
#
# Emits the routine $name(h, l, d): loads the first argument into edx, the
# second into eax and the third into ecx, then executes a single unsigned
# "div ecx", i.e. divides the 64-bit value edx:eax by ecx.  The quotient is
# left in eax as the return value.
#
# No guard is emitted for d == 0 or for a quotient that does not fit in
# 32 bits; on x86 either case makes div raise a divide error.
# NOTE(review): callers presumably guarantee h < d -- confirm.
sub bn_div_words
        {
        local($name)=@_;

        &function_begin_B($name,"");
        &mov("edx",&wparam(0)); # high word of the dividend
        &mov("eax",&wparam(1)); # low word of the dividend
        &mov("ecx",&wparam(2)); # divisor
        &div("ecx");
        &ret();
        &function_end_B($name);
        }

# bn_add_words($name)
#
# Emits the routine $name(r, a, b, num).  For each of the num words the
# generated code stores a[i] + b[i] + carry to r[i].  The carry is kept in
# eax between words, so the final carry is already in eax on return.
sub bn_add_words
        {
        local($name)=@_;

        &function_begin($name,"");

        &comment("");
        $a="esi";
        $b="edi";
        $c="eax";
        $r="ebx";
        $tmp1="ecx";
        $tmp2="edx";
        $num="ebp";

        &mov($r,&wparam(0));    # get r
         &mov($a,&wparam(1));   # get a
        &mov($b,&wparam(2));    # get b
         &mov($num,&wparam(3)); # get num
        &xor($c,$c);            # clear carry
         &and($num,0xfffffff8); # num / 8

        &jz(&label("aw_finish"));

        # Eight words per iteration.  Each round adds the incoming carry and
        # then b[i]; the carry-out of both additions is collected in $c.
        &set_label("aw_loop",0);
        for ($i=0; $i<8; $i++)
                {
                &comment("Round $i");

                &mov($tmp1,&DWP($i*4,$a,"",0));         # *a
                 &mov($tmp2,&DWP($i*4,$b,"",0));        # *b
                &add($tmp1,$c);                         # *a + carry_in
                 &mov($c,0);                            # mov keeps CF from the add
                &adc($c,$c);                            # c = carry of first add
                 &add($tmp1,$tmp2);                     # + *b
                &adc($c,0);                             # c += carry of second add
                 &mov(&DWP($i*4,$r,"",0),$tmp1);        # *r
                }

        &comment("");
        &add($a,32);
         &add($b,32);
        &add($r,32);
         &sub($num,8);
        &jnz(&label("aw_loop"));

        &set_label("aw_finish",0);
        &mov($num,&wparam(3));  # get num
        &and($num,7);
         &jz(&label("aw_end"));

        # Up to seven tail words; every round but the last counts $num down
        # and leaves as soon as it reaches zero.
        for ($i=0; $i<7; $i++)
                {
                &comment("Tail Round $i");
                &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
                 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
                &add($tmp1,$c);
                 &mov($c,0);
                &adc($c,$c);
                 &add($tmp1,$tmp2);
                &adc($c,0);
                 &dec($num) if ($i != 6);
                &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
                 &jz(&label("aw_end")) if ($i != 6);
                }
        &set_label("aw_end",0);

#       &mov("eax",$c);         # $c is "eax"

        &function_end($name);
        }

# bn_sub_words($name)
#
# Emits the routine $name(r, a, b, num).  For each of the num words the
# generated code stores a[i] - b[i] - borrow to r[i].  The borrow is kept in
# eax between words, so the final borrow is already in eax on return.
#
# The label names reuse the "aw_" prefix from bn_add_words.
# NOTE(review): this presumably relies on x86asm.pl scoping labels per
# generated function -- confirm.
sub bn_sub_words
        {
        local($name)=@_;

        &function_begin($name,"");

        &comment("");
        $a="esi";
        $b="edi";
        $c="eax";
        $r="ebx";
        $tmp1="ecx";
        $tmp2="edx";
        $num="ebp";

        &mov($r,&wparam(0));    # get r
         &mov($a,&wparam(1));   # get a
        &mov($b,&wparam(2));    # get b
         &mov($num,&wparam(3)); # get num
        &xor($c,$c);            # clear borrow
         &and($num,0xfffffff8); # num / 8

        &jz(&label("aw_finish"));

        # Eight words per iteration.  Each round subtracts the incoming
        # borrow and then b[i]; the borrow-out of both is collected in $c.
        &set_label("aw_loop",0);
        for ($i=0; $i<8; $i++)
                {
                &comment("Round $i");

                &mov($tmp1,&DWP($i*4,$a,"",0));         # *a
                 &mov($tmp2,&DWP($i*4,$b,"",0));        # *b
                &sub($tmp1,$c);                         # *a - borrow_in
                 &mov($c,0);                            # mov keeps CF from the sub
                &adc($c,$c);                            # c = borrow of first sub
                 &sub($tmp1,$tmp2);                     # - *b
                &adc($c,0);                             # c += borrow of second sub
                 &mov(&DWP($i*4,$r,"",0),$tmp1);        # *r
                }

        &comment("");
        &add($a,32);
         &add($b,32);
        &add($r,32);
         &sub($num,8);
        &jnz(&label("aw_loop"));

        &set_label("aw_finish",0);
        &mov($num,&wparam(3));  # get num
        &and($num,7);
         &jz(&label("aw_end"));

        # Up to seven tail words; every round but the last counts $num down
        # and leaves as soon as it reaches zero.
        for ($i=0; $i<7; $i++)
                {
                &comment("Tail Round $i");
                &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
                 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
                &sub($tmp1,$c);
                 &mov($c,0);
                &adc($c,$c);
                 &sub($tmp1,$tmp2);
                &adc($c,0);
                 &dec($num) if ($i != 6);
                &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
                 &jz(&label("aw_end")) if ($i != 6);
                }
        &set_label("aw_end",0);

#       &mov("eax",$c);         # $c is "eax"

        &function_end($name);
        }

# bn_sub_part_words($name)
#
# Emits the routine $name(r, a, b, num, dl).
#
# Phase 1 stores a[i] - b[i] - borrow to r[i] for num words, exactly as
# bn_sub_words does, except that the tail rounds advance the a/b/r
# pointers so they point just past the processed words afterwards.
#
# Phase 2 depends on the fifth argument dl:
#   dl == 0  nothing further is done;
#   dl <  0  for -dl more words, r[i] = 0 - b[i] - borrow;
#   dl >  0  for dl more words, r[i] = a[i] - borrow.  As soon as one of
#            these subtractions produces no borrow, control switches to the
#            pw_nc* code, which just copies the remaining words of a to r
#            and finishes with the borrow cleared.
#
# The borrow is kept in eax throughout and is therefore the return value.
sub bn_sub_part_words
        {
        local($name)=@_;

        &function_begin($name,"");

        &comment("");
        $a="esi";
        $b="edi";
        $c="eax";
        $r="ebx";
        $tmp1="ecx";
        $tmp2="edx";
        $num="ebp";

        &mov($r,&wparam(0));    # get r
         &mov($a,&wparam(1));   # get a
        &mov($b,&wparam(2));    # get b
         &mov($num,&wparam(3)); # get num
        &xor($c,$c);            # clear borrow
         &and($num,0xfffffff8); # num / 8

        &jz(&label("aw_finish"));

        # Phase 1, eight words per iteration.
        &set_label("aw_loop",0);
        for ($i=0; $i<8; $i++)
                {
                &comment("Round $i");

                &mov($tmp1,&DWP($i*4,$a,"",0));         # *a
                 &mov($tmp2,&DWP($i*4,$b,"",0));        # *b
                &sub($tmp1,$c);                         # *a - borrow_in
                 &mov($c,0);                            # mov keeps CF from the sub
                &adc($c,$c);                            # c = borrow of first sub
                 &sub($tmp1,$tmp2);                     # - *b
                &adc($c,0);                             # c += borrow of second sub
                 &mov(&DWP($i*4,$r,"",0),$tmp1);        # *r
                }

        &comment("");
        &add($a,32);
         &add($b,32);
        &add($r,32);
         &sub($num,8);
        &jnz(&label("aw_loop"));

        &set_label("aw_finish",0);
        &mov($num,&wparam(3));  # get num
        &and($num,7);
         &jz(&label("aw_end"));

        # Phase 1 tail.  Unlike bn_sub_words, every round uses offset 0 and
        # bumps the pointers, because phase 2 continues from them.
        for ($i=0; $i<7; $i++)
                {
                &comment("Tail Round $i");
                &mov($tmp1,&DWP(0,$a,"",0));    # *a
                 &mov($tmp2,&DWP(0,$b,"",0));# *b
                &sub($tmp1,$c);
                 &mov($c,0);
                &adc($c,$c);
                 &sub($tmp1,$tmp2);
                &adc($c,0);
                &mov(&DWP(0,$r,"",0),$tmp1);    # *r
                &add($a, 4);
                &add($b, 4);
                &add($r, 4);
                 &dec($num) if ($i != 6);
                 &jz(&label("aw_end")) if ($i != 6);
                }
        &set_label("aw_end",0);

        # Phase 2 dispatch on dl.
        &cmp(&wparam(4),0);
        &je(&label("pw_end"));

        &mov($num,&wparam(4));  # get dl
        &cmp($num,0);
        &je(&label("pw_end"));
        &jge(&label("pw_pos"));

        &comment("pw_neg");
        &mov($tmp2,0);
        &sub($tmp2,$num);
        &mov($num,$tmp2);       # num = -dl
        &and($num,0xfffffff8);  # num / 8
        &jz(&label("pw_neg_finish"));

        &set_label("pw_neg_loop",0);
        for ($i=0; $i<8; $i++)
        {
            &comment("dl<0 Round $i");

            &mov($tmp1,0);
            &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
            &sub($tmp1,$c);                     # 0 - borrow_in
            &mov($c,0);
            &adc($c,$c);
            &sub($tmp1,$tmp2);                  # - *b
            &adc($c,0);
            &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
        }

        &comment("");
        &add($b,32);
        &add($r,32);
        &sub($num,8);
        &jnz(&label("pw_neg_loop"));

        &set_label("pw_neg_finish",0);
        &mov($tmp2,&wparam(4)); # get dl
        &mov($num,0);
        &sub($num,$tmp2);       # num = -dl
        &and($num,7);
        &jz(&label("pw_end"));

        for ($i=0; $i<7; $i++)
        {
            &comment("dl<0 Tail Round $i");
            &mov($tmp1,0);
            &mov($tmp2,&DWP($i*4,$b,"",0));# *b
            &sub($tmp1,$c);
            &mov($c,0);
            &adc($c,$c);
            &sub($tmp1,$tmp2);
            &adc($c,0);
            &dec($num) if ($i != 6);
            &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
            &jz(&label("pw_end")) if ($i != 6);
        }

        &jmp(&label("pw_end"));

        &set_label("pw_pos",0);

        &and($num,0xfffffff8);  # num / 8
        &jz(&label("pw_pos_finish"));

        &set_label("pw_pos_loop",0);

        # While the borrow persists, $c is left untouched; the first word
        # that absorbs it jumps into the matching slot of the copy loop.
        for ($i=0; $i<8; $i++)
        {
            &comment("dl>0 Round $i");

            &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
            &sub($tmp1,$c);
            &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
            &jnc(&label("pw_nc".$i));
        }

        &comment("");
        &add($a,32);
        &add($r,32);
        &sub($num,8);
        &jnz(&label("pw_pos_loop"));

        &set_label("pw_pos_finish",0);
        &mov($num,&wparam(4));  # get dl
        &and($num,7);
        &jz(&label("pw_end"));

        for ($i=0; $i<7; $i++)
        {
            &comment("dl>0 Tail Round $i");
            &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
            &sub($tmp1,$c);
            &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
            &jnc(&label("pw_tail_nc".$i));
            &dec($num) if ($i != 6);
            &jz(&label("pw_end")) if ($i != 6);
        }
        &mov($c,1);             # all seven tail words borrowed
        &jmp(&label("pw_end"));

        # Copy path: each pw_nc<i> label sits just after the copy of word i,
        # so a jump there resumes with word i+1.
        &set_label("pw_nc_loop",0);
        for ($i=0; $i<8; $i++)
        {
            &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
            &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
            &set_label("pw_nc".$i,0);
        }

        &comment("");
        &add($a,32);
        &add($r,32);
        &sub($num,8);
        &jnz(&label("pw_nc_loop"));

        &mov($num,&wparam(4));  # get dl
        &and($num,7);
        &jz(&label("pw_nc_end"));

        for ($i=0; $i<7; $i++)
        {
            &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
            &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
            &set_label("pw_tail_nc".$i,0);
            &dec($num) if ($i != 6);
            &jz(&label("pw_nc_end")) if ($i != 6);
        }

        &set_label("pw_nc_end",0);
        &mov($c,0);             # the copy path always ends without a borrow

        &set_label("pw_end",0);

#       &mov("eax",$c);         # $c is "eax"

        &function_end($name);
        }
