bn/asm/x86_64*: add DWARF CFI directives.
[openssl.git] / crypto / bn / asm / x86_64-mont.pl
index 2989b58f256eaa0fe4315d9970c91161f874481a..cf3daa2bb588a7f6ae5378a346b0d0ccd9b3296f 100755 (executable)
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -50,7 +57,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
 
 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
@@ -68,6 +75,11 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
        $addx = ($1>=12);
 }
 
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
+       my $ver = $2 + $3/100.0;        # 3.1->3.01, 3.10->3.10
+       $addx = ($ver>=3.03);
+}
+
 # int bn_mul_mont(
 $rp="%rdi";    # BN_ULONG *rp,
 $ap="%rsi";    # const BN_ULONG *ap,
@@ -92,6 +104,10 @@ $code=<<___;
 .type  bn_mul_mont,\@function,6
 .align 16
 bn_mul_mont:
+.cfi_startproc
+       mov     ${num}d,${num}d
+       mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
        test    \$3,${num}d
        jnz     .Lmul_enter
        cmp     \$8,${num}d
@@ -110,20 +126,50 @@ $code.=<<___;
 .align 16
 .Lmul_enter:
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
-       mov     ${num}d,${num}d
-       lea     2($num),%r10
+       neg     $num
        mov     %rsp,%r11
-       neg     %r10
-       lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+2))
-       and     \$-1024,%rsp            # minimize TLB usage
+       lea     -16(%rsp,$num,8),%r10   # future alloca(8*(num+2))
+       neg     $num                    # restore $num
+       and     \$-1024,%r10            # minimize TLB usage
+
+       # An OS-agnostic version of __chkstk.
+       #
+       # Some OSes (Windows) insist on the stack being "wired" to
+       # physical memory in a strictly sequential manner: if a stack
+       # allocation spans two pages, a reference to the farther one
+       # can be punished with SEGV. But page walking does good even
+       # on other OSes, because it guarantees that a villain thread
+       # hits the guard page before it can damage an innocent one...
+       sub     %r10,%r11
+       and     \$-4096,%r11
+       lea     (%r10,%r11),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul_page_walk
+       jmp     .Lmul_page_walk_done
 
-       mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.align 16
+.Lmul_page_walk:
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul_page_walk
+.Lmul_page_walk_done:
+
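For readers reconstructing the trick: a minimal C sketch of the same probing idea, assuming 4096-byte pages and a distance already rounded down to a page multiple (the "and $-4096,%r11" above arranges exactly that). It is illustrative only; the real thing has to run on %rsp itself, which compiled C cannot do safely.

    /* Touch one word in every page between the old stack pointer and
     * the new, lower one, so each guard page is hit in order instead
     * of being skipped over.  Mirrors the .Lmul_page_walk loop. */
    static void page_walk(const volatile char *new_sp,
                          const volatile char *old_sp)
    {
        const volatile char *p = old_sp;
        while (p > new_sp) {        /* cmp %r10,%rsp; ja .Lmul_page_walk */
            (void)*p;               /* probe: mov (%rsp),%r11 */
            p -= 4096;              /* lea -4096(%rsp),%rsp */
        }
        (void)*new_sp;              /* final probe at the target page */
    }
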
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.cfi_cfa_expression    %rsp+8,$num,8,mul,plus,deref,+8
 .Lmul_body:
        mov     $bp,%r12                # reassign $bp
 ___
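The .cfi_cfa_expression above is needed because, after the data-dependent alloca, the CFA can no longer be described as register plus constant; it has to be fetched through the saved stack pointer at tp[num+1]. A hedged sketch of what that DWARF expression asks the unwinder to compute (function and parameter names here are illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Evaluate "%rsp+8,$num,8,mul,plus,deref,+8": load the original
     * stack pointer from tp[num+1] = 8(%rsp,$num,8), then add 8 to
     * step over the return address and recover the CFA. */
    static uint64_t cfa_from_expression(const unsigned char *rsp, uint64_t num)
    {
        const unsigned char *slot = rsp + 8 + num * 8; /* %rsp+8,$num,8,mul,plus */
        uint64_t saved_rsp;
        memcpy(&saved_rsp, slot, sizeof(saved_rsp));   /* deref */
        return saved_rsp + 8;                          /* +8 */
    }

The simpler "%rsp+40,deref,+8" used later by the sqr8x and mulx4x frames is the same idea with a fixed slot instead of one indexed by $num.
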
@@ -294,16 +340,25 @@ $code.=<<___;
        jnz     .Lcopy
 
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
+.cfi_def_cfa   %rsi,8
        mov     \$1,%rax
-       mov     (%rsi),%r15
-       mov     8(%rsi),%r14
-       mov     16(%rsi),%r13
-       mov     24(%rsi),%r12
-       mov     32(%rsi),%rbp
-       mov     40(%rsi),%rbx
-       lea     48(%rsi),%rsp
+       mov     -48(%rsi),%r15
+.cfi_restore   %r15
+       mov     -40(%rsi),%r14
+.cfi_restore   %r14
+       mov     -32(%rsi),%r13
+.cfi_restore   %r13
+       mov     -24(%rsi),%r12
+.cfi_restore   %r12
+       mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
+       mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
+       lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lmul_epilogue:
        ret
+.cfi_endproc
 .size  bn_mul_mont,.-bn_mul_mont
 ___
 {{{
@@ -313,6 +368,10 @@ $code.=<<___;
 .type  bn_mul4x_mont,\@function,6
 .align 16
 bn_mul4x_mont:
+.cfi_startproc
+       mov     ${num}d,${num}d
+       mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -322,20 +381,41 @@ $code.=<<___ if ($addx);
 ___
 $code.=<<___;
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 
-       mov     ${num}d,${num}d
-       lea     4($num),%r10
+       neg     $num
        mov     %rsp,%r11
-       neg     %r10
-       lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
-       and     \$-1024,%rsp            # minimize TLB usage
+       lea     -32(%rsp,$num,8),%r10   # future alloca(8*(num+4))
+       neg     $num                    # restore
+       and     \$-1024,%r10            # minimize TLB usage
 
-       mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
+       sub     %r10,%r11
+       and     \$-4096,%r11
+       lea     (%r10,%r11),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul4x_page_walk
+       jmp     .Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r11
+       cmp     %r10,%rsp
+       ja      .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.cfi_cfa_expression    %rsp+8,$num,8,mul,plus,deref,+8
 .Lmul4x_body:
        mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
        mov     %rdx,%r12               # reassign $bp
@@ -642,10 +722,11 @@ ___
 my @ri=("%rax","%rdx",$m0,$m1);
 $code.=<<___;
        mov     16(%rsp,$num,8),$rp     # restore $rp
+       lea     -4($num),$j
        mov     0(%rsp),@ri[0]          # tp[0]
        pxor    %xmm0,%xmm0
        mov     8(%rsp),@ri[1]          # tp[1]
-       shr     \$2,$num                # num/=4
+       shr     \$2,$j                  # j=num/4-1
        lea     (%rsp),$ap              # borrow ap for tp
        xor     $i,$i                   # i=0 and clear CF!
 
@@ -653,7 +734,6 @@ $code.=<<___;
        mov     16($ap),@ri[2]          # tp[2]
        mov     24($ap),@ri[3]          # tp[3]
        sbb     8($np),@ri[1]
-       lea     -1($num),$j             # j=num/4-1
        jmp     .Lsub4x
 .align 16
 .Lsub4x:
@@ -687,8 +767,9 @@ $code.=<<___;
        not     @ri[0]
        mov     $rp,$np
        and     @ri[0],$np
-       lea     -1($num),$j
+       lea     -4($num),$j
        or      $np,$ap                 # ap=borrow?tp:rp
+       shr     \$2,$j                  # j=num/4-1
 
        movdqu  ($ap),%xmm1
        movdqa  %xmm0,(%rsp)
@@ -706,7 +787,6 @@ $code.=<<___;
        dec     $j
        jnz     .Lcopy4x
 
-       shl     \$2,$num
        movdqu  16($ap,$i),%xmm2
        movdqa  %xmm0,16(%rsp,$i)
        movdqu  %xmm2,16($rp,$i)
@@ -714,16 +794,25 @@ ___
 }
 $code.=<<___;
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
+.cfi_def_cfa   %rsi,8
        mov     \$1,%rax
-       mov     (%rsi),%r15
-       mov     8(%rsi),%r14
-       mov     16(%rsi),%r13
-       mov     24(%rsi),%r12
-       mov     32(%rsi),%rbp
-       mov     40(%rsi),%rbx
-       lea     48(%rsi),%rsp
+       mov     -48(%rsi),%r15
+.cfi_restore   %r15
+       mov     -40(%rsi),%r14
+.cfi_restore   %r14
+       mov     -32(%rsi),%r13
+.cfi_restore   %r13
+       mov     -24(%rsi),%r12
+.cfi_restore   %r12
+       mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
+       mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
+       lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lmul4x_epilogue:
        ret
+.cfi_endproc
 .size  bn_mul4x_mont,.-bn_mul4x_mont
 ___
 }}}
@@ -751,14 +840,23 @@ $code.=<<___;
 .type  bn_sqr8x_mont,\@function,6
 .align 32
 bn_sqr8x_mont:
-.Lsqr8x_enter:
+.cfi_startproc
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
+.Lsqr8x_enter:
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
+.Lsqr8x_prologue:
 
        mov     ${num}d,%r10d
        shl     \$3,${num}d             # convert $num to bytes
@@ -770,111 +868,165 @@ bn_sqr8x_mont:
        # 4096. this is done to allow the memory disambiguation logic
        # to do its job.
        #
-       lea     -64(%rsp,$num,4),%r11
+       lea     -64(%rsp,$num,2),%r11
+       mov     %rsp,%rbp
        mov     ($n0),$n0               # *n0
        sub     $aptr,%r11
        and     \$4095,%r11
        cmp     %r11,%r10
        jb      .Lsqr8x_sp_alt
-       sub     %r11,%rsp               # align with $aptr
-       lea     -64(%rsp,$num,4),%rsp   # alloca(frame+4*$num)
+       sub     %r11,%rbp               # align with $aptr
+       lea     -64(%rbp,$num,2),%rbp   # future alloca(frame+2*$num)
        jmp     .Lsqr8x_sp_done
 
 .align 32
 .Lsqr8x_sp_alt:
-       lea     4096-64(,$num,4),%r10   # 4096-frame-4*$num
-       lea     -64(%rsp,$num,4),%rsp   # alloca(frame+4*$num)
+       lea     4096-64(,$num,2),%r10   # 4096-frame-2*$num
+       lea     -64(%rbp,$num,2),%rbp   # future alloca(frame+2*$num)
        sub     %r10,%r11
        mov     \$0,%r10
        cmovc   %r10,%r11
-       sub     %r11,%rsp
+       sub     %r11,%rbp
 .Lsqr8x_sp_done:
-       and     \$-64,%rsp
-       mov     $num,%r10       
+       and     \$-64,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
+       and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lsqr8x_page_walk
+       jmp     .Lsqr8x_page_walk_done
+
+.align 16
+.Lsqr8x_page_walk:
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
+
+       mov     $num,%r10
        neg     $num
 
-       lea     64(%rsp,$num,2),%r11    # copy of modulus
        mov     $n0,  32(%rsp)
        mov     %rax, 40(%rsp)          # save original %rsp
+.cfi_cfa_expression    %rsp+40,deref,+8
 .Lsqr8x_body:
 
-       mov     $num,$i
-       movq    %r11, %xmm2             # save pointer to modulus copy
-       shr     \$3+2,$i
-       mov     OPENSSL_ia32cap_P+8(%rip),%eax
-       jmp     .Lsqr8x_copy_n
-
-.align 32
-.Lsqr8x_copy_n:
-       movq    8*0($nptr),%xmm0
-       movq    8*1($nptr),%xmm1
-       movq    8*2($nptr),%xmm3
-       movq    8*3($nptr),%xmm4
-       lea     8*4($nptr),$nptr
-       movdqa  %xmm0,16*0(%r11)
-       movdqa  %xmm1,16*1(%r11)
-       movdqa  %xmm3,16*2(%r11)
-       movdqa  %xmm4,16*3(%r11)
-       lea     16*4(%r11),%r11
-       dec     $i
-       jnz     .Lsqr8x_copy_n
-
+       movq    $nptr, %xmm2            # save pointer to modulus
        pxor    %xmm0,%xmm0
        movq    $rptr,%xmm1             # save $rptr
        movq    %r10, %xmm3             # -$num
 ___
 $code.=<<___ if ($addx);
+       mov     OPENSSL_ia32cap_P+8(%rip),%eax
        and     \$0x80100,%eax
        cmp     \$0x80100,%eax
        jne     .Lsqr8x_nox
 
        call    bn_sqrx8x_internal      # see x86_64-mont5 module
-
-       pxor    %xmm0,%xmm0
-       lea     48(%rsp),%rax
-       lea     64(%rsp,$num,2),%rdx
-       shr     \$3+2,$num
-       mov     40(%rsp),%rsi           # restore %rsp
-       jmp     .Lsqr8x_zero
+                                       # %rax  top-most carry
+                                       # %rbp  nptr
+                                       # %rcx  -8*num
+                                       # %r8   end of tp[2*num]
+       lea     (%r8,%rcx),%rbx
+       mov     %rcx,$num
+       mov     %rcx,%rdx
+       movq    %xmm1,$rptr
+       sar     \$3+2,%rcx              # %cf=0
+       jmp     .Lsqr8x_sub
 
 .align 32
 .Lsqr8x_nox:
 ___
 $code.=<<___;
        call    bn_sqr8x_internal       # see x86_64-mont5 module
+                                       # %rax  top-most carry
+                                       # %rbp  nptr
+                                       # %r8   -8*num
+                                       # %rdi  end of tp[2*num]
+       lea     (%rdi,$num),%rbx
+       mov     $num,%rcx
+       mov     $num,%rdx
+       movq    %xmm1,$rptr
+       sar     \$3+2,%rcx              # %cf=0
+       jmp     .Lsqr8x_sub
 
+.align 32
+.Lsqr8x_sub:
+       mov     8*0(%rbx),%r12
+       mov     8*1(%rbx),%r13
+       mov     8*2(%rbx),%r14
+       mov     8*3(%rbx),%r15
+       lea     8*4(%rbx),%rbx
+       sbb     8*0(%rbp),%r12
+       sbb     8*1(%rbp),%r13
+       sbb     8*2(%rbp),%r14
+       sbb     8*3(%rbp),%r15
+       lea     8*4(%rbp),%rbp
+       mov     %r12,8*0($rptr)
+       mov     %r13,8*1($rptr)
+       mov     %r14,8*2($rptr)
+       mov     %r15,8*3($rptr)
+       lea     8*4($rptr),$rptr
+       inc     %rcx                    # preserves %cf
+       jnz     .Lsqr8x_sub
+
+       sbb     \$0,%rax                # top-most carry
+       lea     (%rbx,$num),%rbx        # rewind
+       lea     ($rptr,$num),$rptr      # rewind
+
+       movq    %rax,%xmm1
        pxor    %xmm0,%xmm0
-       lea     48(%rsp),%rax
-       lea     64(%rsp,$num,2),%rdx
-       shr     \$3+2,$num
+       pshufd  \$0,%xmm1,%xmm1
        mov     40(%rsp),%rsi           # restore %rsp
-       jmp     .Lsqr8x_zero
+.cfi_def_cfa   %rsi,8
+       jmp     .Lsqr8x_cond_copy
 
 .align 32
-.Lsqr8x_zero:
-       movdqa  %xmm0,16*0(%rax)        # wipe t
-       movdqa  %xmm0,16*1(%rax)
-       movdqa  %xmm0,16*2(%rax)
-       movdqa  %xmm0,16*3(%rax)
-       lea     16*4(%rax),%rax
-       movdqa  %xmm0,16*0(%rdx)        # wipe n
-       movdqa  %xmm0,16*1(%rdx)
-       movdqa  %xmm0,16*2(%rdx)
-       movdqa  %xmm0,16*3(%rdx)
-       lea     16*4(%rdx),%rdx
-       dec     $num
-       jnz     .Lsqr8x_zero
+.Lsqr8x_cond_copy:
+       movdqa  16*0(%rbx),%xmm2
+       movdqa  16*1(%rbx),%xmm3
+       lea     16*2(%rbx),%rbx
+       movdqu  16*0($rptr),%xmm4
+       movdqu  16*1($rptr),%xmm5
+       lea     16*2($rptr),$rptr
+       movdqa  %xmm0,-16*2(%rbx)       # zero tp
+       movdqa  %xmm0,-16*1(%rbx)
+       movdqa  %xmm0,-16*2(%rbx,%rdx)
+       movdqa  %xmm0,-16*1(%rbx,%rdx)
+       pcmpeqd %xmm1,%xmm0
+       pand    %xmm1,%xmm2
+       pand    %xmm1,%xmm3
+       pand    %xmm0,%xmm4
+       pand    %xmm0,%xmm5
+       pxor    %xmm0,%xmm0
+       por     %xmm2,%xmm4
+       por     %xmm3,%xmm5
+       movdqu  %xmm4,-16*2($rptr)
+       movdqu  %xmm5,-16*1($rptr)
+       add     \$32,$num
+       jnz     .Lsqr8x_cond_copy
 
        mov     \$1,%rax
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lsqr8x_epilogue:
        ret
+.cfi_endproc
 .size  bn_sqr8x_mont,.-bn_sqr8x_mont
 ___
 }}}
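The .Lsqr8x_sub/.Lsqr8x_cond_copy pair above replaces the old wipe-only .Lsqr8x_zero loop with a branch-free final reduction: subtract the modulus once, then use the borrow to select between t and t-n with masks, so the choice leaks through neither branches nor store addresses. A hedged C sketch of the idea, with illustrative names rather than OpenSSL API:

    #include <stddef.h>
    #include <stdint.h>

    /* rp already holds t - n from the subtraction loop, tp still holds
     * t, and borrow is 0 or 1.  The mask is all-ones when t - n
     * underflowed, i.e. when the unreduced t must be kept. */
    static void cond_copy(uint64_t *rp, uint64_t *tp, size_t num,
                          uint64_t borrow)
    {
        uint64_t mask = 0 - borrow;                    /* 0 or ~0 */
        for (size_t i = 0; i < num; i++) {
            rp[i] = (tp[i] & mask) | (rp[i] & ~mask);  /* constant-time select */
            tp[i] = 0;          /* wipe scratch, as the movdqa %xmm0 stores do */
        }
    }
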
@@ -886,23 +1038,48 @@ $code.=<<___;
 .type  bn_mulx4x_mont,\@function,6
 .align 32
 bn_mulx4x_mont:
-.Lmulx4x_enter:
+.cfi_startproc
        mov     %rsp,%rax
+.cfi_def_cfa_register  %rax
+.Lmulx4x_enter:
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
+.Lmulx4x_prologue:
 
        shl     \$3,${num}d             # convert $num to bytes
-       .byte   0x67
        xor     %r10,%r10
        sub     $num,%r10               # -$num
        mov     ($n0),$n0               # *n0
-       lea     -72(%rsp,%r10),%rsp     # alloca(frame+$num+8)
+       lea     -72(%rsp,%r10),%rbp     # future alloca(frame+$num+8)
+       and     \$-128,%rbp
+       mov     %rsp,%r11
+       sub     %rbp,%r11
+       and     \$-4096,%r11
+       lea     (%rbp,%r11),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmulx4x_page_walk
+       jmp     .Lmulx4x_page_walk_done
+
+.align 16
+.Lmulx4x_page_walk:
+       lea     -4096(%rsp),%rsp
+       mov     (%rsp),%r10
+       cmp     %rbp,%rsp
+       ja      .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
        lea     ($bp,$num),%r10
-       and     \$-128,%rsp
        ##############################################################
        # Stack layout
        # +0    num
@@ -922,6 +1099,7 @@ bn_mulx4x_mont:
        mov     $n0, 24(%rsp)           # save *n0
        mov     $rp, 32(%rsp)           # save $rp
        mov     %rax,40(%rsp)           # save original %rsp
+.cfi_cfa_expression    %rsp+40,deref,+8
        mov     $num,48(%rsp)           # inner counter
        jmp     .Lmulx4x_body
 
@@ -1042,18 +1220,17 @@ $code.=<<___;
        mulx    2*8($aptr),%r15,%r13    # ...
        adox    -3*8($tptr),%r11
        adcx    %r15,%r12
-       adox    $zero,%r12
+       adox    -2*8($tptr),%r12
        adcx    $zero,%r13
+       adox    $zero,%r13
 
        mov     $bptr,8(%rsp)           # off-load &b[i]
-       .byte   0x67
        mov     $mi,%r15
        imulq   24(%rsp),$mi            # "t[0]"*n0
        xor     %ebp,%ebp               # xor   $zero,$zero     # cf=0, of=0
 
        mulx    3*8($aptr),%rax,%r14
         mov    $mi,%rdx
-       adox    -2*8($tptr),%r12
        adcx    %rax,%r13
        adox    -1*8($tptr),%r13
        adcx    $zero,%r14
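The reshuffled chain works because mulx writes no flags at all, adcx reads and writes only CF, and adox only OF, so two carry chains stay live through the whole loop without flag spills. A hedged sketch of one such step using the ADX/BMI2 intrinsics (compile with -madx -mbmi2; the compiler, not the source, decides which of adcx/adox backs each call):

    #include <immintrin.h>

    /* One multiply-accumulate step with two independent carry chains,
     * in the spirit of the loop above; illustrative only. */
    static void mac_step(unsigned long long a, unsigned long long b,
                         unsigned long long *acc_lo, unsigned long long *acc_hi,
                         unsigned char *cf, unsigned char *of)
    {
        unsigned long long hi;
        unsigned long long lo = _mulx_u64(a, b, &hi);    /* no flags touched */
        *cf = _addcarryx_u64(*cf, *acc_lo, lo, acc_lo);  /* CF chain (adcx) */
        *of = _addcarryx_u64(*of, *acc_hi, hi, acc_hi);  /* OF chain (adox) */
    }
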
@@ -1130,74 +1307,94 @@ $code.=<<___;
        adc     $zero,%r15              # modulo-scheduled
        sub     0*8($tptr),$zero        # pull top-most carry
        adc     %r15,%r14
-       mov     -8($nptr),$mi
        sbb     %r15,%r15               # top-most carry
        mov     %r14,-1*8($tptr)
 
        cmp     16(%rsp),$bptr
        jne     .Lmulx4x_outer
 
-       sub     %r14,$mi                # compare top-most words
-       sbb     $mi,$mi
-       or      $mi,%r15
-
-       neg     $num
-       xor     %rdx,%rdx
+       lea     64(%rsp),$tptr
+       sub     $num,$nptr              # rewind $nptr
+       neg     %r15
+       mov     $num,%rdx
+       shr     \$3+2,$num              # %cf=0
        mov     32(%rsp),$rptr          # restore rp
+       jmp     .Lmulx4x_sub
+
+.align 32
+.Lmulx4x_sub:
+       mov     8*0($tptr),%r11
+       mov     8*1($tptr),%r12
+       mov     8*2($tptr),%r13
+       mov     8*3($tptr),%r14
+       lea     8*4($tptr),$tptr
+       sbb     8*0($nptr),%r11
+       sbb     8*1($nptr),%r12
+       sbb     8*2($nptr),%r13
+       sbb     8*3($nptr),%r14
+       lea     8*4($nptr),$nptr
+       mov     %r11,8*0($rptr)
+       mov     %r12,8*1($rptr)
+       mov     %r13,8*2($rptr)
+       mov     %r14,8*3($rptr)
+       lea     8*4($rptr),$rptr
+       dec     $num                    # preserves %cf
+       jnz     .Lmulx4x_sub
+
+       sbb     \$0,%r15                # top-most carry
        lea     64(%rsp),$tptr
+       sub     %rdx,$rptr              # rewind
 
+       movq    %r15,%xmm1
        pxor    %xmm0,%xmm0
-       mov     0*8($nptr,$num),%r8
-       mov     1*8($nptr,$num),%r9
-       neg     %r8
-       jmp     .Lmulx4x_sub_entry
+       pshufd  \$0,%xmm1,%xmm1
+       mov     40(%rsp),%rsi           # restore %rsp
+.cfi_def_cfa   %rsi,8
+       jmp     .Lmulx4x_cond_copy
 
 .align 32
-.Lmulx4x_sub:
-       mov     0*8($nptr,$num),%r8
-       mov     1*8($nptr,$num),%r9
-       not     %r8
-.Lmulx4x_sub_entry:
-       mov     2*8($nptr,$num),%r10
-       not     %r9
-       and     %r15,%r8
-       mov     3*8($nptr,$num),%r11
-       not     %r10
-       and     %r15,%r9
-       not     %r11
-       and     %r15,%r10
-       and     %r15,%r11
-
-       neg     %rdx                    # mov %rdx,%cf
-       adc     0*8($tptr),%r8
-       adc     1*8($tptr),%r9
-       movdqa  %xmm0,($tptr)
-       adc     2*8($tptr),%r10
-       adc     3*8($tptr),%r11
-       movdqa  %xmm0,16($tptr)
-       lea     4*8($tptr),$tptr
-       sbb     %rdx,%rdx               # mov %cf,%rdx
+.Lmulx4x_cond_copy:
+       movdqa  16*0($tptr),%xmm2
+       movdqa  16*1($tptr),%xmm3
+       lea     16*2($tptr),$tptr
+       movdqu  16*0($rptr),%xmm4
+       movdqu  16*1($rptr),%xmm5
+       lea     16*2($rptr),$rptr
+       movdqa  %xmm0,-16*2($tptr)      # zero tp
+       movdqa  %xmm0,-16*1($tptr)
+       pcmpeqd %xmm1,%xmm0
+       pand    %xmm1,%xmm2
+       pand    %xmm1,%xmm3
+       pand    %xmm0,%xmm4
+       pand    %xmm0,%xmm5
+       pxor    %xmm0,%xmm0
+       por     %xmm2,%xmm4
+       por     %xmm3,%xmm5
+       movdqu  %xmm4,-16*2($rptr)
+       movdqu  %xmm5,-16*1($rptr)
+       sub     \$32,%rdx
+       jnz     .Lmulx4x_cond_copy
 
-       mov     %r8,0*8($rptr)
-       mov     %r9,1*8($rptr)
-       mov     %r10,2*8($rptr)
-       mov     %r11,3*8($rptr)
-       lea     4*8($rptr),$rptr
+       mov     %rdx,($tptr)
 
-       add     \$32,$num
-       jnz     .Lmulx4x_sub
-
-       mov     40(%rsp),%rsi           # restore %rsp
        mov     \$1,%rax
        mov     -48(%rsi),%r15
+.cfi_restore   %r15
        mov     -40(%rsi),%r14
+.cfi_restore   %r14
        mov     -32(%rsi),%r13
+.cfi_restore   %r13
        mov     -24(%rsi),%r12
+.cfi_restore   %r12
        mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
        mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
        lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
 .Lmulx4x_epilogue:
        ret
+.cfi_endproc
 .size  bn_mulx4x_mont,.-bn_mulx4x_mont
 ___
 }}}
@@ -1250,22 +1447,8 @@ mul_handler:
 
        mov     192($context),%r10      # pull $num
        mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
-       lea     48(%rax),%rax
 
-       mov     -8(%rax),%rbx
-       mov     -16(%rax),%rbp
-       mov     -24(%rax),%r12
-       mov     -32(%rax),%r13
-       mov     -40(%rax),%r14
-       mov     -48(%rax),%r15
-       mov     %rbx,144($context)      # restore context->Rbx
-       mov     %rbp,160($context)      # restore context->Rbp
-       mov     %r12,216($context)      # restore context->R12
-       mov     %r13,224($context)      # restore context->R13
-       mov     %r14,232($context)      # restore context->R14
-       mov     %r15,240($context)      # restore context->R15
-
-       jmp     .Lcommon_seh_tail
+       jmp     .Lcommon_pop_regs
 .size  mul_handler,.-mul_handler
 
 .type  sqr_handler,\@abi-omnipotent
@@ -1290,18 +1473,24 @@ sqr_handler:
 
        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # end of prologue label
-       cmp     %r10,%rbx               # context->Rip<.Lsqr_body
+       cmp     %r10,%rbx               # context->Rip<.Lsqr_prologue
        jb      .Lcommon_seh_tail
 
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # body label
+       cmp     %r10,%rbx               # context->Rip<.Lsqr_body
+       jb      .Lcommon_pop_regs
+
        mov     152($context),%rax      # pull context->Rsp
 
-       mov     4(%r11),%r10d           # HandlerData[1]
+       mov     8(%r11),%r10d           # HandlerData[2]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=.Lsqr_epilogue
        jae     .Lcommon_seh_tail
 
        mov     40(%rax),%rax           # pull saved stack pointer
 
+.Lcommon_pop_regs:
        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
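With three HandlerData entries instead of two, the reworked sqr_handler can tell apart faults before the prologue (nothing saved yet), between prologue and body (registers pushed, %rsp not yet redirected) and inside the body (original %rsp stored 40 bytes into the frame). A hedged C sketch of that dispatch, with illustrative names rather than the Win64 SEH types:

    #include <stdint.h>

    typedef enum { FRAME_UNTOUCHED, POP_REGS, FULL_FRAME } unwind_region;

    /* hd[0..2] are the .rva entries below: prologue, body and epilogue
     * labels, relative to the module base. */
    static unwind_region classify(uint64_t rip, uint64_t base,
                                  const uint32_t hd[3])
    {
        if (rip < base + hd[0]) return FRAME_UNTOUCHED; /* .Lcommon_seh_tail */
        if (rip < base + hd[1]) return POP_REGS;        /* .Lcommon_pop_regs */
        if (rip >= base + hd[2]) return FRAME_UNTOUCHED;
        return FULL_FRAME;           /* saved %rsp pulled from 40(%rax) */
    }
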
@@ -1388,13 +1577,15 @@ $code.=<<___;
 .LSEH_info_bn_sqr8x_mont:
        .byte   9,0,0,0
        .rva    sqr_handler
-       .rva    .Lsqr8x_body,.Lsqr8x_epilogue   # HandlerData[]
+       .rva    .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue          # HandlerData[]
+.align 8
 ___
 $code.=<<___ if ($addx);
 .LSEH_info_bn_mulx4x_mont:
        .byte   9,0,0,0
        .rva    sqr_handler
-       .rva    .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+       .rva    .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue       # HandlerData[]
+.align 8
 ___
 }