+#if 1
+ B=b->d;
+ /* Check if the previous number needs to be copied */
+ if (B != NULL)
+ {
+#if 0
+ /* This lot is an unrolled loop to copy b->top
+ * BN_ULONGs from B to A
+ */
+/*
+ * I have nothing against unrolling but it's usually done for
+ * several reasons, namely:
+ * - minimize percentage of decision making code, i.e. branches;
+ * - avoid cache trashing;
+ * - make it possible to schedule loads earlier;
+ * Now let's examine the code below. The cornerstone of C is
+ * "programmer is always right" and that's what we love it for:-)
+ * For this very reason C compilers have to be paranoid when it
+ * comes to data aliasing and assume the worst. Yeah, but what
+ * does it mean in real life? This means that loop body below will
+ * be compiled to sequence of loads immediately followed by stores
+ * as compiler assumes the worst, something in A==B+1 style. As a
+ * result CPU pipeline is going to starve for incoming data. Secondly
+ * if A and B happen to share same cache line such code is going to
+ * cause severe cache trashing. Both factors have severe impact on
+ * performance of modern CPUs and this is the reason why this
+ * particular piece of code is #ifdefed away and replaced by more
+ * "friendly" version found in #else section below. This comment
+ * also applies to BN_copy function.
+ *
+ * <appro@fy.chalmers.se>
+ */
+ for (i=b->top&(~7); i>0; i-=8)
+ {
+ A[0]=B[0]; A[1]=B[1]; A[2]=B[2]; A[3]=B[3];
+ A[4]=B[4]; A[5]=B[5]; A[6]=B[6]; A[7]=B[7];
+ A+=8;
+ B+=8;
+ }
+ switch (b->top&7)
+ {
+ case 7:
+ A[6]=B[6];
+ case 6:
+ A[5]=B[5];
+ case 5:
+ A[4]=B[4];
+ case 4:
+ A[3]=B[3];
+ case 3:
+ A[2]=B[2];
+ case 2:
+ A[1]=B[1];
+ case 1:
+ A[0]=B[0];
+ case 0:
+ /* I need the 'case 0' entry for utrix cc.
+ * If the optimizer is turned on, it does the
+ * switch table by doing
+ * a=top&7
+ * a--;
+ * goto jump_table[a];
+ * If top is 0, this makes us jump to 0xffffffc
+ * which is rather bad :-(.
+ * eric 23-Apr-1998
+ */
+ ;
+ }
+#else
+ for (i=b->top>>2; i>0; i--,A+=4,B+=4)
+ {
+ /*
+ * The fact that the loop is unrolled
+ * 4-wise is a tribute to Intel. It's
+ * the one that doesn't have enough
+ * registers to accomodate more data.
+ * I'd unroll it 8-wise otherwise:-)
+ *
+ * <appro@fy.chalmers.se>
+ */
+ BN_ULONG a0,a1,a2,a3;
+ a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
+ A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
+ }
+ switch (b->top&3)
+ {
+ case 3: A[2]=B[2];
+ case 2: A[1]=B[1];
+ case 1: A[0]=B[0];
+ case 0: ; /* ultrix cc workaround, see above */
+ }
+#endif
+ }