bn/asm/x86_64-mont.pl: minor optimization [for Decoded ICache].
author          Andy Polyakov <appro@openssl.org>
                Fri, 25 Oct 2013 08:12:17 +0000 (10:12 +0200)
committer       Andy Polyakov <appro@openssl.org>
                Fri, 25 Oct 2013 08:14:20 +0000 (10:14 +0200)
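
On CPUs with a uop cache (Intel's "Decoded ICache"), decoded instructions are
cached per aligned 32-byte window of code, so the density of a hot loop depends
on where instruction boundaries fall. To steer placement, this change pads
selected spots with architecturally harmless bytes: redundant 0x66, 0x67 and
0x3e instruction prefixes, multi-byte nops, and hand-picked longer disp32
encodings of a few mulx instructions (the .byte sequences below). It also
reorders some loads, stores and carry arithmetic, and swaps equivalent
adc/adcx forms, whose encodings differ in length.

As a minimal illustration of the prefix trick (not part of the patch itself):
a redundant address-size prefix lengthens an instruction without changing its
behaviour, provided the instruction has no memory operand.

        xor     %r10,%r10               # 3 bytes: 4d 31 d2
        .byte   0x67                    # prefix applies to the next insn
        xor     %r10,%r10               # 4 bytes: 67 4d 31 d2, same effect
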
crypto/bn/asm/x86_64-mont.pl

index d268fb0ddf04da9f1d9f5097f7b52eca8ad698b6..b6db337457d715f822dcee4f64600f1da6a1b2f8 100755 (executable)
@@ -1685,6 +1685,7 @@ bn_mulx4x_mont:
        push    %r15
 
        shl     \$3,${num}d             # convert $num to bytes
+       .byte   0x67                    # benign address-size prefix: placement padding for Decoded ICache
        xor     %r10,%r10
        mov     %rsp,%r11               # put aside %rsp
        sub     $num,%r10               # -$num
@@ -1725,15 +1726,14 @@ $code.=<<___;
        mov     ($bp),%rdx              # b[0], $bp==%rdx actually
        lea     64+32(%rsp),$tptr
        mov     %rdx,$bi
-       xor     $zero,$zero             # of=0,cf=0
 
        mulx    0*8($aptr),$mi,%rax     # a[0]*b[0]
        mulx    1*8($aptr),%r11,%r14    # a[1]*b[0]
-       adcx    %rax,%r11
+       add     %rax,%r11
        mov     $bptr,8(%rsp)           # off-load &b[i]
        mulx    2*8($aptr),%r12,%r13    # ...
-       adcx    %r14,%r12
-       adcx    $zero,%r13
+       adc     %r14,%r12
+       adc     \$0,%r13
 
        mov     $mi,$bptr               # borrow $bptr
        imulq   24(%rsp),$mi            # "t[0]"*n0
@@ -1751,13 +1751,12 @@ $code.=<<___;
        mulx    1*8($nptr),%rax,%r11
        adcx    %rax,%r10
        adox    %r12,%r11
-       mulx    2*8($nptr),%rax,%r12
+       .byte   0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00    # mulx  2*8($nptr),%rax,%r12
        mov     48(%rsp),$bptr          # counter value
        mov     %r10,-4*8($tptr)
        adcx    %rax,%r11
        adox    %r13,%r12
        mulx    3*8($nptr),%rax,%r15
-        .byte  0x66,0x66
         mov    $bi,%rdx
        mov     %r11,-3*8($tptr)
        adcx    %rax,%r12
@@ -1765,7 +1764,7 @@ $code.=<<___;
        lea     4*8($nptr),$nptr
        mov     %r12,-2*8($tptr)
 
-       #jmp    .Lmulx4x_1st
+       jmp     .Lmulx4x_1st
 
 .align 32
 .Lmulx4x_1st:
@@ -1863,7 +1862,6 @@ $code.=<<___;
        adox    %r12,%r11
        mulx    2*8($nptr),%rax,%r12
        mov     %r10,-4*8($tptr)
-       mov     0*8($tptr),%r10
        adcx    %rax,%r11
        adox    %r13,%r12
        mulx    3*8($nptr),%rax,%r15
@@ -1872,23 +1870,22 @@ $code.=<<___;
        adcx    %rax,%r12
        adox    $zero,%r15              # of=0
        mov     48(%rsp),$bptr          # counter value
-       .byte   0x66,0x3e
        mov     %r12,-2*8($tptr)
+       .byte   0x66                    # benign 0x66 prefix (ignored with REX.W): padding
        lea     4*8($nptr),$nptr
 
-       jmp     .Lmulx4x_inner
+       #jmp    .Lmulx4x_inner
 
 .align 32
 .Lmulx4x_inner:
        adcx    $zero,%r15              # cf=0, modulo-scheduled
-       adox    %r10,%r14
+       adox    0*8($tptr),%r14
        mulx    0*8($aptr),%r10,%rax    # a[4]*b[i]
-       mov     1*8($tptr),%r13
        adcx    %r14,%r10
        mulx    1*8($aptr),%r11,%r14    # a[5]*b[i]
        adox    %rax,%r11
        mulx    2*8($aptr),%r12,%rax    # ...
-       adcx    %r13,%r11
+       adcx    1*8($tptr),%r11
        adox    %r14,%r12
        mulx    3*8($aptr),%r13,%r14
         mov    $mi,%rdx
@@ -1896,8 +1893,8 @@ $code.=<<___;
        adox    %rax,%r13
        adcx    3*8($tptr),%r13
        adox    $zero,%r14              # of=0
-       .byte   0x48,0x8d,0xb6,0x20,0x00,0x00,0x00      # lea   4*8($aptr),$aptr
-       .byte   0x48,0x8d,0x9b,0x20,0x00,0x00,0x00      # lea   4*8($tptr),$tptr
+       lea     4*8($aptr),$aptr
+       lea     4*8($tptr),$tptr
        adcx    $zero,%r14              # cf=0
 
        adox    %r15,%r10
@@ -1909,7 +1906,6 @@ $code.=<<___;
        adox    %r15,%r12
        mulx    2*8($nptr),%rax,%r15
        mov     %r10,-5*8($tptr)
-       mov     0*8($tptr),%r10
        adcx    %rax,%r12
        adox    %r15,%r13
        mulx    3*8($nptr),%rax,%r15
@@ -1927,7 +1923,7 @@ $code.=<<___;
        mov     0(%rsp),$num            # load num
        mov     8(%rsp),$bptr           # re-load &b[i]
        adc     $zero,%r15              # modulo-scheduled
-       sub     %r10,$zero              # pull top-most carry
+       sub     0*8($tptr),$zero        # pull top-most carry
        adc     %r15,%r14
        sbb     %r15,%r15               # top-most carry
        mov     %r14,-1*8($tptr)
@@ -1936,10 +1932,10 @@ $code.=<<___;
        jne     .Lmulx4x_outer
 
        neg     $num
+       xor     %rdx,%rdx
        mov     32(%rsp),$rptr          # restore rp
        lea     64(%rsp),$tptr
 
-       xor     %rdx,%rdx
        pxor    %xmm0,%xmm0
        mov     0*8($nptr,$num),%r8
        mov     1*8($nptr,$num),%r9
@@ -2022,6 +2018,7 @@ bn_sqrx8x_mont:
        push    %r15
 
        shl     \$3,${num}d             # convert $num to bytes
+       .byte   0x67                    # padding, as in bn_mulx4x_mont
        xor     %r10,%r10
        mov     %rsp,%r11               # put aside %rsp
        sub     $num,%r10               # -$num
@@ -2043,6 +2040,12 @@ bn_sqrx8x_mont:
        movq    %r10, %xmm3             # -$num
        movq    %r11, %xmm4             # save original %rsp
        mov     $n0,  32(%rsp)
+___
+$code.=<<___ if ($win64);
+       jmp     .Lsqrx8x_body
+.align 32
+___
+$code.=<<___;
 .Lsqrx8x_body:
        ##################################################################
        # Squaring part:
@@ -2096,12 +2099,15 @@ $code.=<<___;
        mov     $aaptr,8(%rsp)          # save end of $aptr
        jmp     .Lsqr8x_zero_start
 
+.align 32
+.byte  0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00    # long nop: .Lsqr8x_zero_start stays aligned at 32
 .Lsqrx8x_zero:
+       .byte   0x3e                    # benign DS-override prefix: padding
        movdqa  %xmm0,0*8($tptr)
        movdqa  %xmm0,2*8($tptr)
        movdqa  %xmm0,4*8($tptr)
        movdqa  %xmm0,6*8($tptr)
-.Lsqr8x_zero_start:
+.Lsqr8x_zero_start:                    # aligned at 32
        movdqa  %xmm0,8*8($tptr)
        movdqa  %xmm0,10*8($tptr)
        movdqa  %xmm0,12*8($tptr)
@@ -2111,47 +2117,47 @@ $code.=<<___;
        jnz     .Lsqrx8x_zero
 
        mov     0*8($aptr),%rdx         # a[0], modulo-scheduled
-       xor     %r8,%r8
-       xor     %r9,%r9
+       #xor    %r9,%r9                 # t[1], ex-$num, zero already
        xor     %r10,%r10
        xor     %r11,%r11
        xor     %r12,%r12
        xor     %r13,%r13
        xor     %r14,%r14
+       xor     %r15,%r15
        lea     48(%rsp),$tptr
       xor     $zero,$zero             # cf=0, of=0
        jmp     .Lsqrx8x_outer_loop
 
 .align 32
 .Lsqrx8x_outer_loop:
-       mulx    1*8($aptr),%rax,%rbx    # a[1]*a[0]
-       adcx    %rax,%r8                # a[1]*a[0]+=t[1]
-       adox    %rbx,%r9
-       mulx    2*8($aptr),%rax,%rbx    # a[2]*a[0]
-       adcx    %rax,%r9
-       adox    %rbx,%r10
-       .byte   0xc4,0xe2,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00    # mulx  3*8($aptr),%rax,%rbx    # ...
-       adcx    %rax,%r10
-       adox    %rbx,%r11
-       .byte   0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00    # mulx  4*8($aptr),%rax,%rbx
-       adcx    %rax,%r11
-       adox    %rbx,%r12
-       mulx    5*8($aptr),%rax,%rbx
-       adcx    %rax,%r12
-       adox    %rbx,%r13
-       mulx    6*8($aptr),%rax,%rbx
-       adcx    %rax,%r13
-       adox    %rbx,%r14
-       mulx    7*8($aptr),%rax,%r15
+       mulx    1*8($aptr),%r8,%rax     # a[1]*a[0]
+       adcx    %r9,%r8                 # a[1]*a[0]+=t[1]
+       adox    %rax,%r10
+       mulx    2*8($aptr),%r9,%rax     # a[2]*a[0]
+       adcx    %r10,%r9
+       adox    %rax,%r11
+       .byte   0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00    # mulx  3*8($aptr),%r10,%rax    # ...
+       adcx    %r11,%r10
+       adox    %rax,%r12
+       .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00    # mulx  4*8($aptr),%r11,%rax
+       adcx    %r12,%r11
+       adox    %rax,%r13
+       mulx    5*8($aptr),%r12,%rax
+       adcx    %r13,%r12
+       adox    %rax,%r14
+       mulx    6*8($aptr),%r13,%rax
+       adcx    %r14,%r13
+       adox    %r15,%rax
+       mulx    7*8($aptr),%r14,%r15
         mov    1*8($aptr),%rdx         # a[1]
        adcx    %rax,%r14
        adox    $zero,%r15
        adc     8*8($tptr),%r15
+       mov     %r8,1*8($tptr)          # t[1]
+       mov     %r9,2*8($tptr)          # t[2]
        sbb     $carry,$carry           # mov %cf,$carry
        xor     $zero,$zero             # cf=0, of=0
 
-       mov     %r8,1*8($tptr)          # t[1]
-       mov     %r9,2*8($tptr)          # t[2]
 
        mulx    2*8($aptr),%r8,%rbx     # a[2]*a[1]
        mulx    3*8($aptr),%r9,%rax     # a[3]*a[1]
@@ -2193,13 +2199,12 @@ $code.=<<___;
        adcx    %rbx,%r11
        adox    %rax,%r12
        adcx    %r14,%r12
-       adox    $zero,%r13              # of=0
-       adcx    $zero,%r13              # cf=0
-
        mov     %r8,5*8($tptr)          # t[5]
        mov     %r9,6*8($tptr)          # t[6]
+        mulx   4*8($aptr),%r8,%rax     # a[4]*a[3]
+       adox    $zero,%r13              # of=0
+       adcx    $zero,%r13              # cf=0
 
-       mulx    4*8($aptr),%r8,%rax     # a[4]*a[3]
        mulx    5*8($aptr),%r9,%rbx     # a[5]*a[3]
        adcx    %r10,%r8
        adox    %rax,%r9
@@ -2239,9 +2244,9 @@ $code.=<<___;
        adcx    %r14,%r11
        adox    %rbx,%r12
        adcx    %rax,%r12
-       .byte   0x66,0x66
        adox    $zero,%r13
 
+       .byte   0x67,0x67               # padding
        mulx    %r8,%r8,%r14            # a[7]*a[6]
        adcx    %r8,%r13
        adcx    $zero,%r14
@@ -2250,26 +2255,26 @@ $code.=<<___;
        je      .Lsqrx8x_outer_break
 
        neg     $carry                  # mov $carry,%cf
+       mov     \$-8,%rcx
        mov     $zero,%r15
        mov     8*8($tptr),%r8
-       adc     9*8($tptr),%r9          # +=t[9]
-       adc     10*8($tptr),%r10        # ...
-       adc     11*8($tptr),%r11
+       adcx    9*8($tptr),%r9          # +=t[9]
+       adcx    10*8($tptr),%r10        # ...
+       adcx    11*8($tptr),%r11
        adc     12*8($tptr),%r12
        adc     13*8($tptr),%r13
        adc     14*8($tptr),%r14
        adc     15*8($tptr),%r15
-       lea     8*8($tptr),$tptr
-       sbb     $carry,$carry           # mov %cf,$carry
+       lea     ($aptr),$aaptr
+       lea     2*8*8($tptr),$tptr
+       sbb     %rax,%rax               # mov %cf,$carry
 
        mov     -64($aptr),%rdx         # a[0]
-       lea     ($aptr),$aaptr
-       mov     $carry,16(%rsp)         # offload $carry
+       mov     %rax,16(%rsp)           # offload $carry
        mov     $tptr,24(%rsp)
 
-       lea     8*8($tptr),$tptr
+       #lea    8*8($tptr),$tptr        # see 2*8*8($tptr) above
        xor     %eax,%eax               # cf=0, of=0
-       mov     \$-8,%rcx
        jmp     .Lsqrx8x_loop
 
 .align 32
@@ -2311,17 +2316,20 @@ $code.=<<___;
        adox    %rbx,%r15               # %rbx is 0, of=0
        adcx    %rbx,%r15               # cf=0
 
+       .byte   0x67                    # padding
        inc     %rcx                    # of=0
        jnz     .Lsqrx8x_loop
 
        lea     8*8($aaptr),$aaptr
+       mov     \$-8,%rcx
        cmp     8(%rsp),$aaptr          # done?
        je      .Lsqrx8x_break
 
        sub     16(%rsp),%rbx           # mov 16(%rsp),%cf
+       .byte   0x66                    # padding
        mov     -64($aptr),%rdx
-       adc     0*8($tptr),%r8
-       adc     1*8($tptr),%r9
+       adcx    0*8($tptr),%r8
+       adcx    1*8($tptr),%r9
        adc     2*8($tptr),%r10
        adc     3*8($tptr),%r11
        adc     4*8($tptr),%r12
@@ -2329,35 +2337,37 @@ $code.=<<___;
        adc     6*8($tptr),%r14
        adc     7*8($tptr),%r15
        lea     8*8($tptr),$tptr
-       sbb     %rbx,%rbx               # mov %cf,%rbx
-       xor     %eax,%eax               # cf=0, of=0
-       mov     %rbx,16(%rsp)           # offload carry
-       mov     \$-8,%rcx
+       .byte   0x67                    # padding
+       sbb     %rax,%rax               # mov %cf,%rax
+       xor     %ebx,%ebx               # cf=0, of=0
+       mov     %rax,16(%rsp)           # offload carry
        jmp     .Lsqrx8x_loop
 
 .align 32
 .Lsqrx8x_break:
        sub     16(%rsp),%r8            # consume last carry
-       mov     24(%rsp),$aaptr         # initial $tptr
+       mov     24(%rsp),$carry         # initial $tptr, borrow $carry
        mov     0*8($aptr),%rdx         # a[8], modulo-scheduled
+       xor     %ebp,%ebp               # xor   $zero,$zero
        mov     %r8,0*8($tptr)
-       lea     8*8($aaptr),$aaptr
+       cmp     $carry,$tptr            # cf=0, of=0
+       je      .Lsqrx8x_outer_loop
+
        mov     %r9,1*8($tptr)
-        mov    1*8($aaptr),%r8         # potentially forwarded store
+        mov    1*8($carry),%r9
        mov     %r10,2*8($tptr)
-        mov    2*8($aaptr),%r9         # ...
+        mov    2*8($carry),%r10
        mov     %r11,3*8($tptr)
-        mov    3*8($aaptr),%r10
+        mov    3*8($carry),%r11
        mov     %r12,4*8($tptr)
-        mov    4*8($aaptr),%r11
+        mov    4*8($carry),%r12
        mov     %r13,5*8($tptr)
-        mov    5*8($aaptr),%r12
+        mov    5*8($carry),%r13
        mov     %r14,6*8($tptr)
-        mov    6*8($aaptr),%r13
+        mov    6*8($carry),%r14
        mov     %r15,7*8($tptr)
-        mov    7*8($aaptr),%r14
-       mov     $aaptr,$tptr
-       xor     $zero,$zero             # cf=0, cf=0
+        mov    7*8($carry),%r15
+       mov     $carry,$tptr
        jmp     .Lsqrx8x_outer_loop
 
 .align 32
@@ -2373,13 +2383,12 @@ ___
 }\f{
 my $i="%rcx";
 $code.=<<___;
-       mov     (%rsp),$num             # restore $num
-
        lea     48(%rsp),$tptr
        mov     ($aptr,$i),%rdx         # a[0]
 
        mov     8($tptr),$A0[1]         # t[1]
        xor     $A0[0],$A0[0]           # t[0], of=0, cf=0
+       mov     (%rsp),$num             # restore $num
        adox    $A0[1],$A0[1]
         mov    16($tptr),$A1[0]        # t[2]  # prefetch
         mov    24($tptr),$A1[1]        # t[3]  # prefetch
@@ -2440,9 +2449,9 @@ $code.=<<___;
 .align 32
 .Lsqrx4x_shift_n_add_break:
        adcx    $A1[1],%rbx
-       .byte   0x48,0x89,0x87,0x30,0x00,0x00,0x00      # mov   %rax,48($tptr)
-       .byte   0x48,0x89,0x9f,0x38,0x00,0x00,0x00      # mov   %rbx,56($tptr)
-       .byte   0x48,0x8d,0xbf,0x40,0x00,0x00,0x00      # lea   64($tptr),$tptr
+       mov     %rax,48($tptr)
+       mov     %rbx,56($tptr)
+       lea     64($tptr),$tptr         # end of t[] buffer
 ___
 }\f
 ######################################################################
@@ -2456,17 +2465,16 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
 
 $code.=<<___;
        movq    %xmm2,$nptr
+       xor     %eax,%eax               # initial top-most carry bit
        mov     32(%rsp),%rbx           # n0
        mov     48(%rsp),%rdx           # "%r8", 8*0($tptr)
-       lea     ($nptr,$num),%rax       # end of n[]
+       lea     -64($nptr,$num),%rcx    # end of n[]
        #lea    48(%rsp,$num,2),$tptr   # end of t[] buffer
-       mov     %rax, 0(%rsp)           # save end of n[]
+       mov     %rcx, 0(%rsp)           # save end of n[]
        mov     $tptr,8(%rsp)           # save end of t[]
 
        lea     48(%rsp),$tptr          # initial t[] window
-       xor     %rax,%rax
-       nop
-       #jmp    .Lsqrx8x_reduction_loop
+       jmp     .Lsqrx8x_reduction_loop
 
 .align 32
 .Lsqrx8x_reduction_loop:
@@ -2529,29 +2537,31 @@ $code.=<<___;
        adox    $carry,%r15             # $carry is 0
        adcx    $carry,%r15             # cf=0
 
+       .byte   0x67                    # padding
        inc     %rcx                    # of=0
        jnz     .Lsqrx8x_reduce
 
-       lea     8*8($nptr),$nptr
-       xor     %rax,%rax
+       .byte   0x66,0x67               # padding
+       mov     $carry,%rax             # xor   %rax,%rax
        cmp     0(%rsp),$nptr           # end of n[]?
        jae     .Lsqrx8x_no_tail
 
        mov     48(%rsp),%rdx           # pull n0*a[0]
        add     8*0($tptr),%r8
-       adcx    8*1($tptr),%r9
-       adcx    8*2($tptr),%r10
-       adcx    8*3($tptr),%r11
-       adcx    8*4($tptr),%r12
-       adcx    8*5($tptr),%r13
-       adcx    8*6($tptr),%r14
-       adcx    8*7($tptr),%r15
+       lea     8*8($nptr),$nptr
+       mov     \$-8,%rcx
+       adc     8*1($tptr),%r9
+       adc     8*2($tptr),%r10
+       adc     8*3($tptr),%r11
+       adc     8*4($tptr),%r12
+       adc     8*5($tptr),%r13
+       adc     8*6($tptr),%r14
+       adc     8*7($tptr),%r15
        lea     8*8($tptr),$tptr
-       sbb     $carry,$carry           # top carry
+       sbb     %rax,%rax               # top carry
 
-       mov     \$-8,%rcx
-       mov     $carry,16(%rsp)
        xor     $carry,$carry           # of=0, cf=0
+       mov     %rax,16(%rsp)
        jmp     .Lsqrx8x_tail
 
 .align 32
@@ -2588,7 +2598,7 @@ $code.=<<___;
        mulx    8*7($nptr),%rax,%r15
         mov    48+72(%rsp,%rcx,8),%rdx # pull n0*a[i]
        adcx    %rax,%r14
-       .byte   0x66
+       .byte   0x67                    # padding
        adox    $carry,%r15
         mov    %rbx,($tptr,%rcx,8)     # save result
         mov    %r8,%rbx
@@ -2597,35 +2607,35 @@ $code.=<<___;
        inc     %rcx                    # of=0
        jnz     .Lsqrx8x_tail
 
-       lea     8*8($nptr),$nptr
        cmp     0(%rsp),$nptr           # end of n[]?
        jae     .Lsqrx8x_tail_done      # break out of loop
 
-       sub     16(%rsp),$carry         # neg   $carry
+       sub     16(%rsp),$carry         # mov 16(%rsp),%cf
         mov    48(%rsp),%rdx           # pull n0*a[0]
-       adcx    8*0($tptr),%r8
-       adcx    8*1($tptr),%r9
-       adcx    8*2($tptr),%r10
-       adcx    8*3($tptr),%r11
-       adcx    8*4($tptr),%r12
-       adcx    8*5($tptr),%r13
-       adcx    8*6($tptr),%r14
-       adcx    8*7($tptr),%r15
+        lea    8*8($nptr),$nptr
+       adc     8*0($tptr),%r8
+       adc     8*1($tptr),%r9
+       adc     8*2($tptr),%r10
+       adc     8*3($tptr),%r11
+       adc     8*4($tptr),%r12
+       adc     8*5($tptr),%r13
+       adc     8*6($tptr),%r14
+       adc     8*7($tptr),%r15
        lea     8*8($tptr),$tptr
-       sbb     $carry,$carry
-
        mov     \$-8,%rcx
-       mov     $carry,16(%rsp)
+       sbb     %rax,%rax
+
        xor     $carry,$carry           # of=0, cf=0
+       mov     %rax,16(%rsp)
        jmp     .Lsqrx8x_tail
 
 .align 32
 .Lsqrx8x_tail_done:
        add     24(%rsp),%r8            # can this overflow?
-       xor     %rax,%rax
+       mov     $carry,%rax             # xor   %rax,%rax
 
-       sub     16(%rsp),$carry         # neg $carry
-.Lsqrx8x_no_tail:                      # carry flag is 0
+       sub     16(%rsp),$carry         # mov 16(%rsp),%cf
+.Lsqrx8x_no_tail:                      # %cf is 0 if jumped here
        adc     8*0($tptr),%r8
         movq   %xmm3,%rcx
        adc     8*1($tptr),%r9
@@ -2639,24 +2649,24 @@ $code.=<<___;
        adc     8*7($tptr),%r15
        adc     %rax,%rax               # top-most carry
 
-       cmp     8(%rsp),$carry          # end of t[]?
        mov     32(%rsp),%rbx           # n0
        mov     8*8($tptr,%rcx),%rdx    # modulo-scheduled "%r8"
 
-       lea     8*8($tptr,%rcx),$tptr   # start of current t[] window
-       mov     %r8,-8*8($carry)        # store top 512 bits
-       mov     %r9,-8*7($carry)
-       mov     %r10,-8*6($carry)
-       mov     %r11,-8*5($carry)
-       mov     %r12,-8*4($carry)
-       mov     %r13,-8*3($carry)
-       mov     %r14,-8*2($carry)
-       mov     %r15,-8*1($carry)
+       mov     %r8,8*0($tptr)          # store top 512 bits
+       mov     %r9,8*1($tptr)
+       mov     %r10,8*2($tptr)
+       mov     %r11,8*3($tptr)
+       mov     %r12,8*4($tptr)
+       mov     %r13,8*5($tptr)
+       mov     %r14,8*6($tptr)
+       mov     %r15,8*7($tptr)
 
+       lea     8*8($tptr,%rcx),$tptr   # start of current t[] window
+       cmp     8(%rsp),$carry          # end of t[]?
        jb      .Lsqrx8x_reduction_loop
 
-       mov     %rcx,$num
-       neg     $num                    # restore $num
+       mov     %rcx,%rdx               # -$num
+       jmp     .Lsqrx8x_post
 ___
 }\f
 ##############################################################
@@ -2667,24 +2677,28 @@ my ($rptr,$nptr,$lptr,$i)=($aptr,"%rbp","%rbx","%rcx");
 my @ri=map("%r$_",(10..13));
 my @ni=map("%r$_",(14..15));
 $code.=<<___;
-       lea     ($nptr,$num),$nptr      # end of $nptr
-       lea     48(%rsp,$num),$lptr     # end of lower half of t[2*num]
-       lea     48(%rsp,$num),$tptr
+.align 32
+.Lsqrx8x_post:
+       neg     %rdx                    # restore $num
        neg     %rax                    # top-most carry as mask
+       mov     0*8($nptr),%r8
+       mov     1*8($nptr),%r9
+       lea     ($nptr,%rdx),$nptr      # end of $nptr
+       lea     48(%rsp,%rdx),$lptr     # end of lower half of t[2*num]
+       lea     48(%rsp,%rdx),$tptr
+       .byte   0x67                    # padding
        xor     %rdx,%rdx
        movq    %xmm1,$rptr             # restore $rptr
 
-       mov     0*8($nptr,$i),%r8
-       mov     1*8($nptr,$i),%r9
        neg     %r8
        jmp     .Lsqrx8x_sub_entry
 
-.align 32
+.byte  0x66,0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00      # long nop: .Lsqrx8x_sub_entry stays aligned at 32
 .Lsqrx8x_sub:
        mov     0*8($nptr,$i),%r8
        mov     1*8($nptr,$i),%r9
        not     %r8
-.Lsqrx8x_sub_entry:
+.Lsqrx8x_sub_entry:                    # aligned at 32
        mov     2*8($nptr,$i),%r10
        not     %r9
        and     %rax,%r8
@@ -2709,28 +2723,27 @@ $code.=<<___;
        movdqa  %xmm0,2*8($lptr,$i)
        and     %rax,%r15
 
-       neg     %rdx                    # mov %rdx,%cf
+       neg     %edx                    # mov %edx,%cf
        movdqa  %xmm0,4*8($lptr,$i)
        adc     0*8($tptr),%r8
+       mov     %r8,0*8($rptr)          # result
        adc     1*8($tptr),%r9
        movdqa  %xmm0,6*8($lptr,$i)
        adc     2*8($tptr),%r10
+       mov     %r9,1*8($rptr)
        adc     3*8($tptr),%r11
        movdqa  %xmm0,0*8($tptr)        # zap upper half
        adc     4*8($tptr),%r12
+       mov     %r10,2*8($rptr)
        adc     5*8($tptr),%r13
        movdqa  %xmm0,2*8($tptr)
        adc     6*8($tptr),%r14
+       mov     %r11,3*8($rptr)
        adc     7*8($tptr),%r15
+       sbb     %edx,%edx               # mov %cf,%edx
        movdqa  %xmm0,4*8($tptr)
-       sbb     %rdx,%rdx               # mov %cf,%rdx
        movdqa  %xmm0,6*8($tptr)
        lea     8*8($tptr),$tptr
-
-       mov     %r8,0*8($rptr)
-       mov     %r9,1*8($rptr)
-       mov     %r10,2*8($rptr)
-       mov     %r11,3*8($rptr)
        mov     %r12,4*8($rptr)
        mov     %r13,5*8($rptr)
        mov     %r14,6*8($rptr)