2 * x86_64 BIGNUM accelerator version 0.1, December 2002.
4 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * Rights for redistribution and usage in source and binary forms are
8 * granted according to the OpenSSL license. Warranty of any kind is
11 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
12 * versions, like 1.0...
13 * A. Well, that's because this code is basically a quick-n-dirty
14 * proof-of-concept hack. As you can see it's implemented with
15 * inline assembler, which means that you're bound to GCC and that
 * there must be room for fine-tuning.
18 * Q. Why inline assembler?
19 * A. x86_64 features own ABI I'm not familiar with. Which is why
20 * I decided to let the compiler take care of subroutine
21 * prologue/epilogue as well as register allocation.
23 * Q. How much faster does it get?
 * A. Unfortunately people sitting on x86_64 hardware are prohibited
 * from disclosing the performance numbers, so they (SuSE labs to be
26 * specific) wouldn't tell me. However! Very similar coding technique
27 * (reaching out for 128-bit result from 64x64-bit multiplication)
28 * results in >3 times performance improvement on MIPS and I see no
29 * reason why gain on x86_64 would be so much different:-)
/* One bignum limb: 64 bits on LP64 x86_64 (unsigned long). */
#define BN_ULONG unsigned long
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
36 * "g"(0) let the compiler to decide where does it
37 * want to keep the value of zero;
39 #define mul_add(r,a,word,carry) do { \
40 register BN_ULONG high,low; \
42 : "=a"(low),"=d"(high) \
45 asm ("addq %2,%0; adcq %3,%1" \
46 : "+r"(carry),"+d"(high)\
49 asm ("addq %2,%0; adcq %3,%1" \
50 : "+m"(r),"+d"(high) \
56 #define mul(r,a,word,carry) do { \
57 register BN_ULONG high,low; \
59 : "=a"(low),"=d"(high) \
62 asm ("addq %2,%0; adcq %3,%1" \
63 : "+r"(carry),"+d"(high)\
66 (r)=carry, carry=high; \
69 #define sqr(r0,r1,a) \
75 BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
79 if (num <= 0) return(c1);
83 mul_add(rp[0],ap[0],w,c1);
84 mul_add(rp[1],ap[1],w,c1);
85 mul_add(rp[2],ap[2],w,c1);
86 mul_add(rp[3],ap[3],w,c1);
91 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
92 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
93 mul_add(rp[2],ap[2],w,c1); return c1;
99 BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
103 if (num <= 0) return(c1);
107 mul(rp[0],ap[0],w,c1);
108 mul(rp[1],ap[1],w,c1);
109 mul(rp[2],ap[2],w,c1);
110 mul(rp[3],ap[3],w,c1);
111 ap+=4; rp+=4; num-=4;
115 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
116 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
117 mul(rp[2],ap[2],w,c1);
122 void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
136 sqr(r[0],r[1],a[0]); if (--n == 0) return;
137 sqr(r[2],r[3],a[1]); if (--n == 0) return;
142 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
143 { BN_ULONG ret,waste;
146 : "=a"(ret),"=d"(waste)
147 : "a"(l),"d"(h),"g"(d)
153 BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
156 if (n <= 0) return 0;
161 "1: movq (%4,%2,8),%0 \n"
162 " adcq (%5,%2,8),%0 \n"
163 " movq %0,(%3,%2,8) \n"
167 : "+a"(ret),"+c"(n),"+r"(i)
168 : "r"(rp),"r"(ap),"r"(bp)
176 BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
179 if (n <= 0) return 0;
184 "1: movq (%4,%2,8),%0 \n"
185 " sbbq (%5,%2,8),%0 \n"
186 " movq %0,(%3,%2,8) \n"
190 : "+a"(ret),"+c"(n),"+r"(i)
191 : "r"(rp),"r"(ap),"r"(bp)
198 /* Simics 1.4<7 has buggy sbbq:-( */
199 #define BN_MASK2 0xffffffffffffffffL
200 BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
205 if (n <= 0) return((BN_ULONG)0);
210 r[0]=(t1-t2-c)&BN_MASK2;
211 if (t1 != t2) c=(t1 < t2);
215 r[1]=(t1-t2-c)&BN_MASK2;
216 if (t1 != t2) c=(t1 < t2);
220 r[2]=(t1-t2-c)&BN_MASK2;
221 if (t1 != t2) c=(t1 < t2);
225 r[3]=(t1-t2-c)&BN_MASK2;
226 if (t1 != t2) c=(t1 < t2);
237 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
238 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
239 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
240 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
243 /* original macros are kept for reference purposes */
/*
 * Reference (portable-C) mul_add_c: (c2,c1,c0) += a*b, where t1/t2
 * are scratch limbs declared by the enclosing function and
 * BN_UMULT_HIGH(a,b) yields the high 64 bits of the product.
 * NOTE(review): the low-product line and closing brace were elided in
 * this copy; restored to the canonical form -- verify upstream.
 */
#define mul_add_c(a,b,c0,c1,c2) {       \
        BN_ULONG ta=(a),tb=(b);         \
        t1 = ta * tb;                   \
        t2 = BN_UMULT_HIGH(ta,tb);      \
        c0 += t1; t2 += (c0<t1)?1:0;    \
        c1 += t2; c2 += (c1<t2)?1:0;    \
        }
/*
 * Reference (portable-C) mul_add_c2: (c2,c1,c0) += 2*a*b.  Doubles
 * the high and low product halves separately, folding each carry-out
 * into the next-higher accumulator limb.  Uses scratch t1/t2 from the
 * enclosing function plus a local t0.
 * NOTE(review): the "t0 = ta * tb;" line and closing brace were elided
 * in this copy; restored to the canonical form -- verify upstream.
 */
#define mul_add_c2(a,b,c0,c1,c2) {      \
        BN_ULONG ta=(a),tb=(b),t0;      \
        t1 = BN_UMULT_HIGH(ta,tb);      \
        t0 = ta * tb;                   \
        t2 = t1+t1; c2 += (t2<t1)?1:0;  \
        t1 = t0+t0; t2 += (t1<t0)?1:0;  \
        c0 += t1; t2 += (c0<t1)?1:0;    \
        c1 += t2; c2 += (c1<t2)?1:0;    \
        }
262 #define mul_add_c(a,b,c0,c1,c2) do { \
264 : "=a"(t1),"=d"(t2) \
267 asm ("addq %2,%0; adcq %3,%1" \
268 : "+r"(c0),"+d"(t2) \
271 asm ("addq %2,%0; adcq %3,%1" \
272 : "+r"(c1),"+r"(c2) \
277 #define sqr_add_c(a,i,c0,c1,c2) do { \
279 : "=a"(t1),"=d"(t2) \
282 asm ("addq %2,%0; adcq %3,%1" \
283 : "+r"(c0),"+d"(t2) \
286 asm ("addq %2,%0; adcq %3,%1" \
287 : "+r"(c1),"+r"(c2) \
292 #define mul_add_c2(a,b,c0,c1,c2) do { \
294 : "=a"(t1),"=d"(t2) \
297 asm ("addq %0,%0; adcq %2,%1" \
298 : "+d"(t2),"+r"(c2) \
301 asm ("addq %0,%0; adcq %2,%1" \
302 : "+a"(t1),"+d"(t2) \
305 asm ("addq %2,%0; adcq %3,%1" \
306 : "+r"(c0),"+d"(t2) \
309 asm ("addq %2,%0; adcq %3,%1" \
310 : "+r"(c1),"+r"(c2) \
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- (c2,c1,c0) += 2*a[i]*a[j]; thin
 * forwarder to mul_add_c2 for the cross terms of a square.
 */
#define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
319 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
328 mul_add_c(a[0],b[0],c1,c2,c3);
331 mul_add_c(a[0],b[1],c2,c3,c1);
332 mul_add_c(a[1],b[0],c2,c3,c1);
335 mul_add_c(a[2],b[0],c3,c1,c2);
336 mul_add_c(a[1],b[1],c3,c1,c2);
337 mul_add_c(a[0],b[2],c3,c1,c2);
340 mul_add_c(a[0],b[3],c1,c2,c3);
341 mul_add_c(a[1],b[2],c1,c2,c3);
342 mul_add_c(a[2],b[1],c1,c2,c3);
343 mul_add_c(a[3],b[0],c1,c2,c3);
346 mul_add_c(a[4],b[0],c2,c3,c1);
347 mul_add_c(a[3],b[1],c2,c3,c1);
348 mul_add_c(a[2],b[2],c2,c3,c1);
349 mul_add_c(a[1],b[3],c2,c3,c1);
350 mul_add_c(a[0],b[4],c2,c3,c1);
353 mul_add_c(a[0],b[5],c3,c1,c2);
354 mul_add_c(a[1],b[4],c3,c1,c2);
355 mul_add_c(a[2],b[3],c3,c1,c2);
356 mul_add_c(a[3],b[2],c3,c1,c2);
357 mul_add_c(a[4],b[1],c3,c1,c2);
358 mul_add_c(a[5],b[0],c3,c1,c2);
361 mul_add_c(a[6],b[0],c1,c2,c3);
362 mul_add_c(a[5],b[1],c1,c2,c3);
363 mul_add_c(a[4],b[2],c1,c2,c3);
364 mul_add_c(a[3],b[3],c1,c2,c3);
365 mul_add_c(a[2],b[4],c1,c2,c3);
366 mul_add_c(a[1],b[5],c1,c2,c3);
367 mul_add_c(a[0],b[6],c1,c2,c3);
370 mul_add_c(a[0],b[7],c2,c3,c1);
371 mul_add_c(a[1],b[6],c2,c3,c1);
372 mul_add_c(a[2],b[5],c2,c3,c1);
373 mul_add_c(a[3],b[4],c2,c3,c1);
374 mul_add_c(a[4],b[3],c2,c3,c1);
375 mul_add_c(a[5],b[2],c2,c3,c1);
376 mul_add_c(a[6],b[1],c2,c3,c1);
377 mul_add_c(a[7],b[0],c2,c3,c1);
380 mul_add_c(a[7],b[1],c3,c1,c2);
381 mul_add_c(a[6],b[2],c3,c1,c2);
382 mul_add_c(a[5],b[3],c3,c1,c2);
383 mul_add_c(a[4],b[4],c3,c1,c2);
384 mul_add_c(a[3],b[5],c3,c1,c2);
385 mul_add_c(a[2],b[6],c3,c1,c2);
386 mul_add_c(a[1],b[7],c3,c1,c2);
389 mul_add_c(a[2],b[7],c1,c2,c3);
390 mul_add_c(a[3],b[6],c1,c2,c3);
391 mul_add_c(a[4],b[5],c1,c2,c3);
392 mul_add_c(a[5],b[4],c1,c2,c3);
393 mul_add_c(a[6],b[3],c1,c2,c3);
394 mul_add_c(a[7],b[2],c1,c2,c3);
397 mul_add_c(a[7],b[3],c2,c3,c1);
398 mul_add_c(a[6],b[4],c2,c3,c1);
399 mul_add_c(a[5],b[5],c2,c3,c1);
400 mul_add_c(a[4],b[6],c2,c3,c1);
401 mul_add_c(a[3],b[7],c2,c3,c1);
404 mul_add_c(a[4],b[7],c3,c1,c2);
405 mul_add_c(a[5],b[6],c3,c1,c2);
406 mul_add_c(a[6],b[5],c3,c1,c2);
407 mul_add_c(a[7],b[4],c3,c1,c2);
410 mul_add_c(a[7],b[5],c1,c2,c3);
411 mul_add_c(a[6],b[6],c1,c2,c3);
412 mul_add_c(a[5],b[7],c1,c2,c3);
415 mul_add_c(a[6],b[7],c2,c3,c1);
416 mul_add_c(a[7],b[6],c2,c3,c1);
419 mul_add_c(a[7],b[7],c3,c1,c2);
424 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
433 mul_add_c(a[0],b[0],c1,c2,c3);
436 mul_add_c(a[0],b[1],c2,c3,c1);
437 mul_add_c(a[1],b[0],c2,c3,c1);
440 mul_add_c(a[2],b[0],c3,c1,c2);
441 mul_add_c(a[1],b[1],c3,c1,c2);
442 mul_add_c(a[0],b[2],c3,c1,c2);
445 mul_add_c(a[0],b[3],c1,c2,c3);
446 mul_add_c(a[1],b[2],c1,c2,c3);
447 mul_add_c(a[2],b[1],c1,c2,c3);
448 mul_add_c(a[3],b[0],c1,c2,c3);
451 mul_add_c(a[3],b[1],c2,c3,c1);
452 mul_add_c(a[2],b[2],c2,c3,c1);
453 mul_add_c(a[1],b[3],c2,c3,c1);
456 mul_add_c(a[2],b[3],c3,c1,c2);
457 mul_add_c(a[3],b[2],c3,c1,c2);
460 mul_add_c(a[3],b[3],c1,c2,c3);
465 void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
474 sqr_add_c(a,0,c1,c2,c3);
477 sqr_add_c2(a,1,0,c2,c3,c1);
480 sqr_add_c(a,1,c3,c1,c2);
481 sqr_add_c2(a,2,0,c3,c1,c2);
484 sqr_add_c2(a,3,0,c1,c2,c3);
485 sqr_add_c2(a,2,1,c1,c2,c3);
488 sqr_add_c(a,2,c2,c3,c1);
489 sqr_add_c2(a,3,1,c2,c3,c1);
490 sqr_add_c2(a,4,0,c2,c3,c1);
493 sqr_add_c2(a,5,0,c3,c1,c2);
494 sqr_add_c2(a,4,1,c3,c1,c2);
495 sqr_add_c2(a,3,2,c3,c1,c2);
498 sqr_add_c(a,3,c1,c2,c3);
499 sqr_add_c2(a,4,2,c1,c2,c3);
500 sqr_add_c2(a,5,1,c1,c2,c3);
501 sqr_add_c2(a,6,0,c1,c2,c3);
504 sqr_add_c2(a,7,0,c2,c3,c1);
505 sqr_add_c2(a,6,1,c2,c3,c1);
506 sqr_add_c2(a,5,2,c2,c3,c1);
507 sqr_add_c2(a,4,3,c2,c3,c1);
510 sqr_add_c(a,4,c3,c1,c2);
511 sqr_add_c2(a,5,3,c3,c1,c2);
512 sqr_add_c2(a,6,2,c3,c1,c2);
513 sqr_add_c2(a,7,1,c3,c1,c2);
516 sqr_add_c2(a,7,2,c1,c2,c3);
517 sqr_add_c2(a,6,3,c1,c2,c3);
518 sqr_add_c2(a,5,4,c1,c2,c3);
521 sqr_add_c(a,5,c2,c3,c1);
522 sqr_add_c2(a,6,4,c2,c3,c1);
523 sqr_add_c2(a,7,3,c2,c3,c1);
526 sqr_add_c2(a,7,4,c3,c1,c2);
527 sqr_add_c2(a,6,5,c3,c1,c2);
530 sqr_add_c(a,6,c1,c2,c3);
531 sqr_add_c2(a,7,5,c1,c2,c3);
534 sqr_add_c2(a,7,6,c2,c3,c1);
537 sqr_add_c(a,7,c3,c1,c2);
542 void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
551 sqr_add_c(a,0,c1,c2,c3);
554 sqr_add_c2(a,1,0,c2,c3,c1);
557 sqr_add_c(a,1,c3,c1,c2);
558 sqr_add_c2(a,2,0,c3,c1,c2);
561 sqr_add_c2(a,3,0,c1,c2,c3);
562 sqr_add_c2(a,2,1,c1,c2,c3);
565 sqr_add_c(a,2,c2,c3,c1);
566 sqr_add_c2(a,3,1,c2,c3,c1);
569 sqr_add_c2(a,3,2,c3,c1,c2);
572 sqr_add_c(a,3,c1,c2,c3);