287162161c6461e212edb51784e781bf2a1311a6
[openssl.git] / crypto / bn / asm / bn-586.pl
1 #! /usr/bin/env perl
2 # Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
# Work out the directory this script was started from, so that the perlasm
# helper can be found relative to it.  When $0 has no directory component
# the match fails and $dir is set to the empty string explicitly, instead of
# picking up whatever $1 happened to hold.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, when present and true, names the output
# file; STDOUT is redirected to it.  The three-argument form of open keeps
# characters in the file name from being interpreted as an open mode, and a
# failure is reported instead of silently writing to the inherited STDOUT.
$output = pop;
if ($output) {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}

&asm_init($ARGV[0]);

# $sse2 is a package global on purpose: every generator sub below reads it
# to decide whether to emit the MMX/SSE2 code path.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# Errors on buffered output (full disk, broken pipe, ...) only show up when
# the handle is flushed at close; fail loudly rather than leave a truncated
# file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";
34
# bn_mul_add_words($name)
#
# Emits the routine called $name.  As read from the code below: wparam(0)
# is the r[] pointer, wparam(1) the a[] pointer, wparam(2) the word count
# and wparam(3) the multiplier w.  Every r[i] is replaced by the low 32 bits
# of r[i] + a[i]*w + carry, and the final carry ends up in eax.
#
# With $sse2 set, the routine first tests bit 26 of OPENSSL_ia32cap_P and
# uses a pmuludq/paddq path when the bit is set; otherwise (or when $sse2 is
# clear) only the integer mul/adc path is used.
#
# All emitter calls keep the leading '&': several of them (and, xor, sub,
# push, pop, ...) share their names with Perl built-ins.
sub bn_mul_add_words
{
    my ($name) = @_;

    &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

    $r="eax";
    $a="edx";
    $c="ecx";

    if ($sse2) {
        &picmeup("eax","OPENSSL_ia32cap_P");
        &bt(&DWP(0,"eax"),26);
        &jnc(&label("maw_non_sse2"));   # bit clear: integer path

        &mov($r,&wparam(0));
        &mov($a,&wparam(1));
        &mov($c,&wparam(2));
        &movd("mm0",&wparam(3));        # mm0 holds w
        &pxor("mm1","mm1");             # mm1 holds the running carry (0)
        &jmp(&label("maw_sse2_entry"));

        # Eight words per pass.  Loads, multiplies and adds belonging to
        # different words are deliberately interleaved, so the order of the
        # calls in this section must not be changed.
        &set_label("maw_sse2_unrolled",16);
        &movd("mm3",&DWP(0,$r,"",0));   # mm3 <- r[0]
        &paddq("mm1","mm3");            # mm1 <- carry_in + r[0]
        &movd("mm2",&DWP(0,$a,"",0));   # mm2 <- a[0]
        &pmuludq("mm2","mm0");          # mm2 <- w*a[0]
        &movd("mm4",&DWP(4,$a,"",0));   # mm4 <- a[1]
        &pmuludq("mm4","mm0");          # mm4 <- w*a[1]
        &movd("mm6",&DWP(8,$a,"",0));   # mm6 <- a[2]
        &pmuludq("mm6","mm0");          # mm6 <- w*a[2]
        &movd("mm7",&DWP(12,$a,"",0));  # mm7 <- a[3]
        &pmuludq("mm7","mm0");          # mm7 <- w*a[3]
        &paddq("mm1","mm2");            # mm1 <- carry_in + r[0] + w*a[0]
        &movd("mm3",&DWP(4,$r,"",0));   # mm3 <- r[1]
        &paddq("mm3","mm4");            # mm3 <- r[1] + w*a[1]
        &movd("mm5",&DWP(8,$r,"",0));   # mm5 <- r[2]
        &paddq("mm5","mm6");            # mm5 <- r[2] + w*a[2]
        &movd("mm4",&DWP(12,$r,"",0));  # mm4 <- r[3]
        &paddq("mm7","mm4");            # mm7 <- r[3] + w*a[3]
        &movd(&DWP(0,$r,"",0),"mm1");   # store r[0]
        &movd("mm2",&DWP(16,$a,"",0));  # mm2 <- a[4]
        &pmuludq("mm2","mm0");          # mm2 <- w*a[4]
        &psrlq("mm1",32);               # mm1 <- carry0
        &movd("mm4",&DWP(20,$a,"",0));  # mm4 <- a[5]
        &pmuludq("mm4","mm0");          # mm4 <- w*a[5]
        &paddq("mm1","mm3");            # mm1 <- carry0 + r[1] + w*a[1]
        &movd("mm6",&DWP(24,$a,"",0));  # mm6 <- a[6]
        &pmuludq("mm6","mm0");          # mm6 <- w*a[6]
        &movd(&DWP(4,$r,"",0),"mm1");   # store r[1]
        &psrlq("mm1",32);               # mm1 <- carry1
        &movd("mm3",&DWP(28,$a,"",0));  # mm3 <- a[7]
        &add($a,32);                    # a[] pointer moves on 8 words
        &pmuludq("mm3","mm0");          # mm3 <- w*a[7]
        &paddq("mm1","mm5");            # mm1 <- carry1 + r[2] + w*a[2]
        &movd("mm5",&DWP(16,$r,"",0));  # mm5 <- r[4]
        &paddq("mm2","mm5");            # mm2 <- r[4] + w*a[4]
        &movd(&DWP(8,$r,"",0),"mm1");   # store r[2]
        &psrlq("mm1",32);               # mm1 <- carry2
        &paddq("mm1","mm7");            # mm1 <- carry2 + r[3] + w*a[3]
        &movd("mm5",&DWP(20,$r,"",0));  # mm5 <- r[5]
        &paddq("mm4","mm5");            # mm4 <- r[5] + w*a[5]
        &movd(&DWP(12,$r,"",0),"mm1");  # store r[3]
        &psrlq("mm1",32);               # mm1 <- carry3
        &paddq("mm1","mm2");            # mm1 <- carry3 + r[4] + w*a[4]
        &movd("mm5",&DWP(24,$r,"",0));  # mm5 <- r[6]
        &paddq("mm6","mm5");            # mm6 <- r[6] + w*a[6]
        &movd(&DWP(16,$r,"",0),"mm1");  # store r[4]
        &psrlq("mm1",32);               # mm1 <- carry4
        &paddq("mm1","mm4");            # mm1 <- carry4 + r[5] + w*a[5]
        &movd("mm5",&DWP(28,$r,"",0));  # mm5 <- r[7]
        &paddq("mm3","mm5");            # mm3 <- r[7] + w*a[7]
        &movd(&DWP(20,$r,"",0),"mm1");  # store r[5]
        &psrlq("mm1",32);               # mm1 <- carry5
        &paddq("mm1","mm6");            # mm1 <- carry5 + r[6] + w*a[6]
        &movd(&DWP(24,$r,"",0),"mm1");  # store r[6]
        &psrlq("mm1",32);               # mm1 <- carry6
        &paddq("mm1","mm3");            # mm1 <- carry6 + r[7] + w*a[7]
        &movd(&DWP(28,$r,"",0),"mm1");  # store r[7]
        &lea($r,&DWP(32,$r));           # r[] pointer moves on 8 words
        &psrlq("mm1",32);               # mm1 <- carry_out

        &sub($c,8);
        &jz(&label("maw_sse2_exit"));
        &set_label("maw_sse2_entry");
        &test($c,0xfffffff8);           # at least 8 words left?
        &jnz(&label("maw_sse2_unrolled"));

        # One word per pass.
        # NOTE(review): this loop is entered before the count is tested, so
        # a count of 0 is not special-cased on this path -- presumably
        # callers never pass 0; confirm.
        &set_label("maw_sse2_loop",4);
        &movd("mm2",&DWP(0,$a));        # mm2 <- a[i]
        &movd("mm3",&DWP(0,$r));        # mm3 <- r[i]
        &pmuludq("mm2","mm0");          # mm2 <- a[i]*w
        &lea($a,&DWP(4,$a));
        &paddq("mm1","mm3");            # carry += r[i]
        &paddq("mm1","mm2");            # carry += a[i]*w
        &movd(&DWP(0,$r),"mm1");        # r[i] <- low half of carry
        &sub($c,1);                     # sets the flags jnz tests below
        &psrlq("mm1",32);               # carry <- high half
        &lea($r,&DWP(4,$r));
        &jnz(&label("maw_sse2_loop"));
        &set_label("maw_sse2_exit");
        &movd("eax","mm1");             # return value <- carry_out
        &emms();
        &ret();

        &set_label("maw_non_sse2",16);
    }

    # Integer path.  The routine was opened with function_begin_B, so the
    # register saves (noted in the original as the function_begin prologue)
    # are spelled out here.
    &push("ebp");
    &push("ebx");
    &push("esi");
    &push("edi");

    &comment("");
    $Low="eax";
    $High="edx";
    $a="ebx";
    $w="ebp";
    $r="edi";
    $c="esi";

    &xor($c,$c);                # carry starts at 0
    &mov($r,&wparam(0));

    &mov("ecx",&wparam(2));
    &mov($a,&wparam(1));

    &and("ecx",0xfffffff8);     # word count rounded down to a multiple of 8
    &mov($w,&wparam(3));

    # Extra stack slot.  The value is not read back in this routine; the
    # slot is simply popped again before the epilogue.
    &push("ecx");

    # mov and push leave the flags alone, so this still tests the 'and'.
    &jz(&label("maw_finish"));

    &set_label("maw_loop",16);

    # Eight rounds per pass; the emitted comment carries the byte offset.
    foreach my $word (0 .. 7) {
        my $off = 4 * $word;

        &comment("Round $off");

        &mov("eax",&DWP($off,$a));      # eax <- a[word]
        &mul($w);                       # edx:eax <- a[word] * w
        &add("eax",$c);                 # low += carry
        &adc("edx",0);                  # propagate into high
        &add("eax",&DWP($off,$r));      # low += r[word]
        &adc("edx",0);                  # propagate into high
        &mov(&DWP($off,$r),"eax");      # r[word] <- low
        &mov($c,"edx");                 # carry <- high
    }

    &comment("");
    &sub("ecx",8);
    &lea($a,&DWP(32,$a));       # lea keeps the flags from the sub intact
    &lea($r,&DWP(32,$r));
    &jnz(&label("maw_loop"));

    &set_label("maw_finish",0);
    &mov("ecx",&wparam(2));     # reload the word count
    &and("ecx",7);              # words left over after the 8-word passes
    &jnz(&label("maw_finish2"));        # helps branch prediction
    &jmp(&label("maw_end"));

    # Up to seven leftover words.  Every round but the last counts ecx down
    # and leaves as soon as it reaches zero.
    &set_label("maw_finish2",1);
    foreach my $word (0 .. 6) {
        my $off      = 4 * $word;
        my $is_final = ($word == 6);

        &comment("Tail Round $word");
        &mov("eax",&DWP($off,$a));      # eax <- a[word]
        &mul($w);                       # edx:eax <- a[word] * w
        &add("eax",$c);                 # low += carry
        &adc("edx",0);                  # propagate into high
        &add("eax",&DWP($off,$r));      # low += r[word]
        &adc("edx",0);                  # propagate into high
        &dec("ecx") unless $is_final;
        &mov(&DWP($off,$r),"eax");      # r[word] <- low (flags untouched)
        &mov($c,"edx");                 # carry <- high (flags untouched)
        &jz(&label("maw_end")) unless $is_final;
    }
    &set_label("maw_end",0);
    &mov("eax",$c);             # return the carry

    &pop("ecx");                # drop the extra stack slot pushed above

    &function_end($name);
}
220
# bn_mul_words($name)
#
# Emits the routine called $name.  As read from the code below: wparam(0)
# is the r[] pointer, wparam(1) the a[] pointer, wparam(2) the word count
# and wparam(3) the multiplier w.  Every r[i] receives the low 32 bits of
# a[i]*w + carry, and the final carry ends up in eax.
#
# With $sse2 set, bit 26 of OPENSSL_ia32cap_P selects between a
# pmuludq/paddq loop and the integer mul/adc code at run time.
sub bn_mul_words
{
    my ($name) = @_;

    &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

    $r="eax";
    $a="edx";
    $c="ecx";

    if ($sse2) {
        &picmeup("eax","OPENSSL_ia32cap_P");
        &bt(&DWP(0,"eax"),26);
        &jnc(&label("mw_non_sse2"));    # bit clear: integer path

        &mov($r,&wparam(0));
        &mov($a,&wparam(1));
        &mov($c,&wparam(2));
        &movd("mm0",&wparam(3));        # mm0 holds w
        &pxor("mm1","mm1");             # mm1 holds the running carry (0)

        # One word per pass.
        # NOTE(review): the body runs before the count is tested, so a
        # count of 0 is not special-cased on this path -- presumably
        # callers never pass 0; confirm.
        &set_label("mw_sse2_loop",16);
        &movd("mm2",&DWP(0,$a));        # mm2 <- a[i]
        &pmuludq("mm2","mm0");          # mm2 <- a[i]*w
        &lea($a,&DWP(4,$a));
        &paddq("mm1","mm2");            # carry += a[i]*w
        &movd(&DWP(0,$r),"mm1");        # r[i] <- low half of carry
        &sub($c,1);                     # sets the flags jnz tests below
        &psrlq("mm1",32);               # carry <- high half
        &lea($r,&DWP(4,$r));
        &jnz(&label("mw_sse2_loop"));

        &movd("eax","mm1");             # return value <- carry
        &emms();
        &ret();
        &set_label("mw_non_sse2",16);
    }

    # Integer path.  The routine was opened with function_begin_B, so the
    # register saves (noted in the original as the function_begin prologue)
    # are spelled out here.
    &push("ebp");
    &push("ebx");
    &push("esi");
    &push("edi");

    &comment("");
    $Low="eax";
    $High="edx";
    $a="ebx";
    $w="ecx";
    $r="edi";
    $c="esi";
    $num="ebp";

    &xor($c,$c);                # carry starts at 0
    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($num,&wparam(2));
    &mov($w,&wparam(3));

    &and($num,0xfffffff8);      # word count rounded down to a multiple of 8
    &jz(&label("mw_finish"));

    # Eight rounds per pass; the emitted comment carries the byte offset.
    &set_label("mw_loop",0);
    foreach my $word (0 .. 7) {
        my $off = 4 * $word;

        &comment("Round $off");

        &mov("eax",&DWP($off,$a,"",0));         # eax <- a[word]
        &mul($w);                               # edx:eax <- a[word] * w
        &add("eax",$c);                         # low += carry
        &adc("edx",0);                          # propagate into high
        &mov(&DWP($off,$r,"",0),"eax");         # r[word] <- low
        &mov($c,"edx");                         # carry <- high
    }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jz(&label("mw_finish"));
    &jmp(&label("mw_loop"));

    &set_label("mw_finish",0);
    &mov($num,&wparam(2));      # reload the word count
    &and($num,7);               # words left over after the 8-word passes
    &jnz(&label("mw_finish2"));
    &jmp(&label("mw_end"));

    # Up to seven leftover words.  Every round but the last counts $num
    # down and leaves as soon as it reaches zero.
    &set_label("mw_finish2",1);
    foreach my $word (0 .. 6) {
        my $off      = 4 * $word;
        my $is_final = ($word == 6);

        &comment("Tail Round $word");
        &mov("eax",&DWP($off,$a,"",0));         # eax <- a[word]
        &mul($w);                               # edx:eax <- a[word] * w
        &add("eax",$c);                         # low += carry
        &adc("edx",0);                          # propagate into high
        &mov(&DWP($off,$r,"",0),"eax");         # r[word] <- low
        &mov($c,"edx");                         # carry <- high
        &dec($num) unless $is_final;
        &jz(&label("mw_end")) unless $is_final;
    }
    &set_label("mw_end",0);
    &mov("eax",$c);             # return the carry

    &function_end($name);
}
331
# bn_sqr_words($name)
#
# Emits the routine called $name.  As read from the code below: wparam(0)
# is the r[] pointer, wparam(1) the a[] pointer and wparam(2) the word
# count.  For every input word a[i] the 64-bit square is stored as two
# words, r[2*i] (low) and r[2*i+1] (high).
#
# With $sse2 set, bit 26 of OPENSSL_ia32cap_P selects between a pmuludq
# loop and the integer mul code at run time.
sub bn_sqr_words
{
    my ($name) = @_;

    &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

    $r="eax";
    $a="edx";
    $c="ecx";

    if ($sse2) {
        &picmeup("eax","OPENSSL_ia32cap_P");
        &bt(&DWP(0,"eax"),26);
        &jnc(&label("sqr_non_sse2"));   # bit clear: integer path

        &mov($r,&wparam(0));
        &mov($a,&wparam(1));
        &mov($c,&wparam(2));            # ecx is the loop counter here

        # One word per pass.
        # NOTE(review): the body runs before the count is tested, so a
        # count of 0 is not special-cased on this path -- presumably
        # callers never pass 0; confirm.
        &set_label("sqr_sse2_loop",16);
        &movd("mm0",&DWP(0,$a));        # mm0 <- a[i]
        &pmuludq("mm0","mm0");          # mm0 <- a[i]*a[i]
        &lea($a,&DWP(4,$a));            # next input word
        &movq(&QWP(0,$r),"mm0");        # store both result words at once
        &sub($c,1);                     # sets the flags jnz tests below
        &lea($r,&DWP(8,$r));            # next pair of result words
        &jnz(&label("sqr_sse2_loop"));

        &emms();
        &ret();
        &set_label("sqr_non_sse2",16);
    }

    # Integer path.  The routine was opened with function_begin_B, so the
    # register saves (noted in the original as the function_begin prologue)
    # are spelled out here.
    &push("ebp");
    &push("ebx");
    &push("esi");
    &push("edi");

    &comment("");
    $r="esi";
    $a="edi";
    $num="ebx";

    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($num,&wparam(2));

    &and($num,0xfffffff8);      # word count rounded down to a multiple of 8
    &jz(&label("sw_finish"));

    # Eight rounds per pass.  The emitted comment carries the input byte
    # offset; the result offset is twice that.
    &set_label("sw_loop",0);
    foreach my $word (0 .. 7) {
        my $src = 4 * $word;
        my $dst = 8 * $word;

        &comment("Round $src");
        &mov("eax",&DWP($src,$a,"",0));         # eax <- a[word]
        &mul("eax");                            # edx:eax <- a[word]^2
        &mov(&DWP($dst,$r,"",0),"eax");         # low result word
        &mov(&DWP($dst+4,$r,"",0),"edx");       # high result word
    }

    &comment("");
    &add($a,32);
    &add($r,64);
    &sub($num,8);
    &jnz(&label("sw_loop"));

    &set_label("sw_finish",0);
    &mov($num,&wparam(2));      # reload the word count
    &and($num,7);               # words left over after the 8-word passes
    &jz(&label("sw_end"));

    # Up to seven leftover words.  Every round but the last counts $num
    # down and leaves as soon as it reaches zero; the high-word store sits
    # between the dec and the jz and does not touch the flags.
    foreach my $word (0 .. 6) {
        my $src      = 4 * $word;
        my $dst      = 8 * $word;
        my $is_final = ($word == 6);

        &comment("Tail Round $word");
        &mov("eax",&DWP($src,$a,"",0));         # eax <- a[word]
        &mul("eax");                            # edx:eax <- a[word]^2
        &mov(&DWP($dst,$r,"",0),"eax");         # low result word
        &dec($num) unless $is_final;
        &mov(&DWP($dst+4,$r,"",0),"edx");       # high result word
        &jz(&label("sw_end")) unless $is_final;
    }
    &set_label("sw_end",0);

    &function_end($name);
}
420
# bn_div_words($name)
#
# Emits the routine called $name: the three arguments are loaded into edx,
# eax and ecx (in that order) and a single `div ecx` is issued, i.e. the
# 64-bit value edx:eax is divided by ecx and the quotient is left in eax.
# No registers are saved, so the routine is opened and closed with the
# "_B" variants and returns with an explicit ret.
sub bn_div_words
{
    my ($name) = @_;

    # [ destination register, argument index ], in emission order.
    my @loads = (
        [ "edx", 0 ],
        [ "eax", 1 ],
        [ "ecx", 2 ],
    );

    &function_begin_B($name,"");
    foreach my $load (@loads) {
        my ($reg, $arg) = @$load;
        &mov($reg,&wparam($arg));
    }
    &div("ecx");
    &ret();
    &function_end_B($name);
}
433
# bn_add_words($name)
#
# Emits the routine called $name.  As read from the code below: wparam(0)
# is the r[] pointer, wparam(1) the a[] pointer, wparam(2) the b[] pointer
# and wparam(3) the word count.  Every r[i] receives a[i] + b[i] + carry.
# The carry is kept as a 0/1 value in eax rather than in the CPU carry flag
# and, being in eax, is the return value.
sub bn_add_words
{
    my ($name) = @_;

    &function_begin($name,"");

    &comment("");
    $a="esi";
    $b="edi";
    $c="eax";
    $r="ebx";
    $tmp1="ecx";
    $tmp2="edx";
    $num="ebp";

    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($b,&wparam(2));
    &mov($num,&wparam(3));
    &xor($c,$c);                # carry starts at 0
    &and($num,0xfffffff8);      # word count rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    # Eight rounds per pass.
    &set_label("aw_loop",0);
    foreach my $word (0 .. 7) {
        my $off = 4 * $word;

        &comment("Round $word");

        &mov($tmp1,&DWP($off,$a,"",0));         # tmp1 <- a[word]
        &mov($tmp2,&DWP($off,$b,"",0));         # tmp2 <- b[word]
        &add($tmp1,$c);                         # tmp1 += carry in
        &mov($c,0);                             # mov keeps CF intact
        &adc($c,$c);                            # c <- carry of that add
        &add($tmp1,$tmp2);                      # tmp1 += b[word]
        &adc($c,0);                             # c += carry of that add
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
    }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));      # reload the word count
    &and($num,7);               # words left over after the 8-word passes
    &jz(&label("aw_end"));

    # Up to seven leftover words.  Every round but the last counts $num
    # down and leaves as soon as it reaches zero; the store between the dec
    # and the jz does not touch the flags.
    foreach my $word (0 .. 6) {
        my $off      = 4 * $word;
        my $is_final = ($word == 6);

        &comment("Tail Round $word");
        &mov($tmp1,&DWP($off,$a,"",0));         # tmp1 <- a[word]
        &mov($tmp2,&DWP($off,$b,"",0));         # tmp2 <- b[word]
        &add($tmp1,$c);                         # tmp1 += carry in
        &mov($c,0);                             # mov keeps CF intact
        &adc($c,$c);                            # c <- carry of that add
        &add($tmp1,$tmp2);                      # tmp1 += b[word]
        &adc($c,0);                             # c += carry of that add
        &dec($num) unless $is_final;
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
        &jz(&label("aw_end")) unless $is_final;
    }
    &set_label("aw_end",0);

    # No explicit move into eax is needed: $c already is eax.

    &function_end($name);
}
505
# bn_sub_words($name)
#
# Emits the routine called $name.  As read from the code below: wparam(0)
# is the r[] pointer, wparam(1) the a[] pointer, wparam(2) the b[] pointer
# and wparam(3) the word count.  Every r[i] receives a[i] - b[i] - borrow.
# The borrow is kept as a 0/1 value in eax rather than in the CPU carry
# flag and, being in eax, is the return value.
#
# The label names carry the same "aw_" prefix as in bn_add_words; they are
# reproduced here exactly as in the original.
sub bn_sub_words
{
    my ($name) = @_;

    &function_begin($name,"");

    &comment("");
    $a="esi";
    $b="edi";
    $c="eax";
    $r="ebx";
    $tmp1="ecx";
    $tmp2="edx";
    $num="ebp";

    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($b,&wparam(2));
    &mov($num,&wparam(3));
    &xor($c,$c);                # borrow starts at 0
    &and($num,0xfffffff8);      # word count rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    # Eight rounds per pass.
    &set_label("aw_loop",0);
    foreach my $word (0 .. 7) {
        my $off = 4 * $word;

        &comment("Round $word");

        &mov($tmp1,&DWP($off,$a,"",0));         # tmp1 <- a[word]
        &mov($tmp2,&DWP($off,$b,"",0));         # tmp2 <- b[word]
        &sub($tmp1,$c);                         # tmp1 -= borrow in
        &mov($c,0);                             # mov keeps CF intact
        &adc($c,$c);                            # c <- borrow of that sub
        &sub($tmp1,$tmp2);                      # tmp1 -= b[word]
        &adc($c,0);                             # c += borrow of that sub
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
    }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));      # reload the word count
    &and($num,7);               # words left over after the 8-word passes
    &jz(&label("aw_end"));

    # Up to seven leftover words.  Every round but the last counts $num
    # down and leaves as soon as it reaches zero; the store between the dec
    # and the jz does not touch the flags.
    foreach my $word (0 .. 6) {
        my $off      = 4 * $word;
        my $is_final = ($word == 6);

        &comment("Tail Round $word");
        &mov($tmp1,&DWP($off,$a,"",0));         # tmp1 <- a[word]
        &mov($tmp2,&DWP($off,$b,"",0));         # tmp2 <- b[word]
        &sub($tmp1,$c);                         # tmp1 -= borrow in
        &mov($c,0);                             # mov keeps CF intact
        &adc($c,$c);                            # c <- borrow of that sub
        &sub($tmp1,$tmp2);                      # tmp1 -= b[word]
        &adc($c,0);                             # c += borrow of that sub
        &dec($num) unless $is_final;
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
        &jz(&label("aw_end")) unless $is_final;
    }
    &set_label("aw_end",0);

    # No explicit move into eax is needed: $c already is eax.

    &function_end($name);
}
577
# bn_sub_part_words($name)
#
# Emits the routine called $name.  As read from the code below: wparam(0)
# is the r[] pointer, wparam(1) the a[] pointer, wparam(2) the b[] pointer,
# wparam(3) the common word count ("num") and wparam(4) a signed extra
# length ("dl").  The borrow is kept as a 0/1 value in eax, which is also
# the return register.
#
# The emitted code has three stages:
#   1. r[i] = a[i] - b[i] - borrow over the common num words;
#   2. dl < 0: a further -dl words of r[i] = 0 - b[i] - borrow;
#   3. dl > 0: a further dl words of r[i] = a[i] - borrow; as soon as one
#      of these subtractions produces no borrow, control moves into a plain
#      copy of the remaining a[] words, which finishes with eax = 0.
# With dl == 0 only stage 1 runs.
sub bn_sub_part_words
{
    my ($name) = @_;

    &function_begin($name,"");

    &comment("");
    $a="esi";
    $b="edi";
    $c="eax";
    $r="ebx";
    $tmp1="ecx";
    $tmp2="edx";
    $num="ebp";

    &mov($r,&wparam(0));
    &mov($a,&wparam(1));
    &mov($b,&wparam(2));
    &mov($num,&wparam(3));
    &xor($c,$c);                # borrow starts at 0
    &and($num,0xfffffff8);      # word count rounded down to a multiple of 8

    &jz(&label("aw_finish"));

    # ---- Stage 1: common part, eight rounds per pass -------------------
    &set_label("aw_loop",0);
    foreach my $word (0 .. 7) {
        my $off = 4 * $word;

        &comment("Round $word");

        &mov($tmp1,&DWP($off,$a,"",0));         # tmp1 <- a[word]
        &mov($tmp2,&DWP($off,$b,"",0));         # tmp2 <- b[word]
        &sub($tmp1,$c);                         # tmp1 -= borrow in
        &mov($c,0);                             # mov keeps CF intact
        &adc($c,$c);                            # c <- borrow of that sub
        &sub($tmp1,$tmp2);                      # tmp1 -= b[word]
        &adc($c,0);                             # c += borrow of that sub
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
    }

    &comment("");
    &add($a,32);
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("aw_loop"));

    &set_label("aw_finish",0);
    &mov($num,&wparam(3));      # reload num
    &and($num,7);               # words left over after the 8-word passes
    &jz(&label("aw_end"));

    # Leftover words of the common part.  Unlike the other tails in this
    # file, every round works at offset 0 and then steps all three pointers
    # by one word, so they are left just past the last word handled, which
    # is where the dl stages below continue from.
    foreach my $word (0 .. 6) {
        my $is_final = ($word == 6);

        &comment("Tail Round $word");
        &mov($tmp1,&DWP(0,$a,"",0));            # tmp1 <- *a
        &mov($tmp2,&DWP(0,$b,"",0));            # tmp2 <- *b
        &sub($tmp1,$c);                         # tmp1 -= borrow in
        &mov($c,0);                             # mov keeps CF intact
        &adc($c,$c);                            # c <- borrow of that sub
        &sub($tmp1,$tmp2);                      # tmp1 -= *b
        &adc($c,0);                             # c += borrow of that sub
        &mov(&DWP(0,$r,"",0),$tmp1);            # *r <- tmp1
        &add($a, 4);
        &add($b, 4);
        &add($r, 4);
        &dec($num) unless $is_final;            # dec sets the flags for jz
        &jz(&label("aw_end")) unless $is_final;
    }
    &set_label("aw_end",0);

    # ---- Dispatch on dl -------------------------------------------------
    &cmp(&wparam(4),0);
    &je(&label("pw_end"));      # dl == 0: nothing more to do

    &mov($num,&wparam(4));      # num <- dl
    &cmp($num,0);
    &je(&label("pw_end"));
    &jge(&label("pw_pos"));     # dl > 0

    # ---- Stage 2: dl < 0 ------------------------------------------------
    &comment("pw_neg");
    &mov($tmp2,0);
    &sub($tmp2,$num);
    &mov($num,$tmp2);           # num <- -dl
    &and($num,0xfffffff8);      # rounded down to a multiple of 8
    &jz(&label("pw_neg_finish"));

    &set_label("pw_neg_loop",0);
    foreach my $word (0 .. 7) {
        my $off = 4 * $word;

        &comment("dl<0 Round $word");

        &mov($tmp1,0);                          # minuend is 0
        &mov($tmp2,&DWP($off,$b,"",0));         # tmp2 <- b[word]
        &sub($tmp1,$c);                         # tmp1 -= borrow in
        &mov($c,0);                             # mov keeps CF intact
        &adc($c,$c);                            # c <- borrow of that sub
        &sub($tmp1,$tmp2);                      # tmp1 -= b[word]
        &adc($c,0);                             # c += borrow of that sub
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
    }

    &comment("");
    &add($b,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_neg_loop"));

    &set_label("pw_neg_finish",0);
    &mov($tmp2,&wparam(4));     # tmp2 <- dl
    &mov($num,0);
    &sub($num,$tmp2);           # num <- -dl
    &and($num,7);               # words left over after the 8-word passes
    &jz(&label("pw_end"));

    foreach my $word (0 .. 6) {
        my $off      = 4 * $word;
        my $is_final = ($word == 6);

        &comment("dl<0 Tail Round $word");
        &mov($tmp1,0);                          # minuend is 0
        &mov($tmp2,&DWP($off,$b,"",0));         # tmp2 <- b[word]
        &sub($tmp1,$c);                         # tmp1 -= borrow in
        &mov($c,0);                             # mov keeps CF intact
        &adc($c,$c);                            # c <- borrow of that sub
        &sub($tmp1,$tmp2);                      # tmp1 -= b[word]
        &adc($c,0);                             # c += borrow of that sub
        &dec($num) unless $is_final;
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
        &jz(&label("pw_end")) unless $is_final;
    }

    &jmp(&label("pw_end"));

    # ---- Stage 3: dl > 0 ------------------------------------------------
    &set_label("pw_pos",0);

    &and($num,0xfffffff8);      # dl rounded down to a multiple of 8
    &jz(&label("pw_pos_finish"));

    &set_label("pw_pos_loop",0);

    # Each round subtracts the borrow from a[word]; when that produces no
    # borrow the round jumps to the matching pw_nc<word> label inside the
    # copy loop further down.
    foreach my $word (0 .. 7) {
        my $off = 4 * $word;

        &comment("dl>0 Round $word");

        &mov($tmp1,&DWP($off,$a,"",0));         # tmp1 <- a[word]
        &sub($tmp1,$c);                         # tmp1 -= borrow
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
        &jnc(&label("pw_nc$word"));             # tests CF of the sub
    }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_pos_loop"));

    &set_label("pw_pos_finish",0);
    &mov($num,&wparam(4));      # num <- dl
    &and($num,7);               # words left over after the 8-word passes
    &jz(&label("pw_end"));

    foreach my $word (0 .. 6) {
        my $off      = 4 * $word;
        my $is_final = ($word == 6);

        &comment("dl>0 Tail Round $word");
        &mov($tmp1,&DWP($off,$a,"",0));         # tmp1 <- a[word]
        &sub($tmp1,$c);                         # tmp1 -= borrow
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
        &jnc(&label("pw_tail_nc$word"));        # tests CF of the sub
        &dec($num) unless $is_final;
        &jz(&label("pw_end")) unless $is_final;
    }
    &mov($c,1);                 # all seven tail rounds ran and borrowed
    &jmp(&label("pw_end"));

    # ---- Stage 3, copy path: no borrow left, just copy a[] to r[] -------
    # The pw_nc<word> labels sit right after each copy, so a jump from the
    # subtract loop lands after the word it has already stored.
    &set_label("pw_nc_loop",0);
    foreach my $word (0 .. 7) {
        my $off = 4 * $word;

        &mov($tmp1,&DWP($off,$a,"",0));         # tmp1 <- a[word]
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
        &set_label("pw_nc$word",0);
    }

    &comment("");
    &add($a,32);
    &add($r,32);
    &sub($num,8);
    &jnz(&label("pw_nc_loop"));

    &mov($num,&wparam(4));      # num <- dl
    &and($num,7);               # words left over after the 8-word passes
    &jz(&label("pw_nc_end"));

    foreach my $word (0 .. 6) {
        my $off      = 4 * $word;
        my $is_final = ($word == 6);

        &mov($tmp1,&DWP($off,$a,"",0));         # tmp1 <- a[word]
        &mov(&DWP($off,$r,"",0),$tmp1);         # r[word] <- tmp1
        &set_label("pw_tail_nc$word",0);
        &dec($num) unless $is_final;
        &jz(&label("pw_nc_end")) unless $is_final;
    }

    &set_label("pw_nc_end",0);
    &mov($c,0);                 # copy path finishes with no borrow

    &set_label("pw_end",0);

    # No explicit move into eax is needed: $c already is eax.

    &function_end($name);
}