Revert previous Linux-specific/centric commit#19629. If it really has to
[openssl.git] / crypto / bn / asm / x86_64-mont.pl
index 6b33c7e9ea895a6c4b98698d451e521b6a042fba..3b7a6f243f21e65882daa79e575c4a12ea6dce2c 100755 (executable)
 # respectful 50%. It remains to be seen if loop unrolling and
 # dedicated squaring routine can provide further improvement...
 
-$output=shift;
+$flavour = shift;	# first argument: assembler flavour (optional)
+$output  = shift;	# second argument: output file name
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }	# first argument containing a dot is really the output file
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);	# Win64 build: gates the SEH handler emitted later
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of this script's path
 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or	# translator next to this script ...
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or	# ... or in the shared perlasm directory
 die "can't locate x86_64-xlate.pl";
 
-open STDOUT,"| $^X $xlate $output";
+open STDOUT,"| $^X $xlate $flavour $output";	# everything printed from here on is piped through the translator
 
 # int bn_mul_mont(
 $rp="%rdi";    # BN_ULONG *rp,
@@ -54,13 +58,15 @@ bn_mul_mont:
        push    %r14
        push    %r15
 
-       lea     2($num),%rax
-       mov     %rsp,%rbp
-       neg     %rax
-       lea     (%rsp,%rax,8),%rsp      # tp=alloca(8*(num+2))
+       mov     ${num}d,${num}d         # 32-bit move clears the upper 32 bits of num
+       lea     2($num),%r10            # %r10 = num+2
+       mov     %rsp,%r11               # remember %rsp as it is after the pushes
+       neg     %r10                    # %r10 = -(num+2)
+       lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+2))
        and     \$-1024,%rsp            # minimize TLB usage
 
-       mov     %rbp,8(%rsp,$num,8)     # tp[num+1]=%rsp
+       mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp as it was before the alloca
+.Lprologue:                            # se_handler treats Rip below this label as "still in prologue"
        mov     %rdx,$bp                # $bp reassigned, remember?
 
        mov     ($n0),$n0               # pull n0[0] value
@@ -196,18 +202,129 @@ bn_mul_mont:
        dec     $j
        jge     .Lcopy
 
-       mov     8(%rsp,$num,8),%rsp     # restore %rsp
+       mov     8(%rsp,$num,8),%rsi     # %rsi = saved stack pointer, i.e. tp[num+1]
        mov     \$1,%rax                # return value is 1
+       mov     (%rsi),%r15             # reload the six pushed registers through %rsi ...
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp           # ... then drop all six slots with a single %rsp update
+.Lepilogue:                            # se_handler treats Rip at or past this label as "frame already released"
+       ret
+.size  bn_mul_mont,.-bn_mul_mont
+.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {		# SEH handler and unwind records are emitted for Win64 targets only
+$rec="%rcx";		# arg1: EXCEPTION_RECORD *rec
+$frame="%rdx";		# arg2: ULONG64 frame
+$context="%r8";		# arg3: CONTEXT *context
+$disp="%r9";		# arg4: DISPATCHER_CONTEXT *disp
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind  # import slot, called indirectly at the end of the handler
+.type  se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+       push    %rsi                    # preserve Win64 callee-saved registers
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq                          # preserve flags; the cld below changes DF
+       sub     \$64,%rsp               # 32-byte shadow space + stack slots for arg5..arg8
+
+       mov     120($context),%rax      # pull context->Rax (frame base if still in prologue)
+       mov     248($context),%rbx      # pull context->Rip
+
+       lea     .Lprologue(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<.Lprologue
+       jb      .Lin_prologue
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       lea     .Lepilogue(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip>=.Lepilogue
+       jae     .Lin_prologue
+
+       mov     192($context),%r10      # pull $num
+       mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
+       lea     48(%rax),%rax           # step over the six pushed registers
+
+       mov     -8(%rax),%rbx           # read the pushed register values back ...
+       mov     -16(%rax),%rbp
+       mov     -24(%rax),%r12
+       mov     -32(%rax),%r13
+       mov     -40(%rax),%r14
+       mov     -48(%rax),%r15
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+       mov     %r15,240($context)      # restore context->R15
+
+.Lin_prologue:
+       mov     8(%rax),%rdi            # value for context->Rdi; saved by code outside this view - TODO confirm
+       mov     16(%rax),%rsi           # value for context->Rsi; saved by code outside this view - TODO confirm
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$154,%ecx              # sizeof(CONTEXT) in quadwords
+       .long   0xa548f3fc              # cld; rep movsq (hand-encoded): copy context into disp->ContextRecord
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)   # unwinds one frame, updating the CONTEXT passed as arg5
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp               # release shadow space and stack argument slots
+       popfq
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
+       pop     %rdi
+       pop     %rsi
        ret
-.size  bn_mul_mont,.-bn_mul_mont
-.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.size  se_handler,.-se_handler
+
+.section       .pdata                  # function table: one begin/end/unwind-info triple
+.align 4
+       .rva    .LSEH_begin_bn_mul_mont # label defined outside this view
+       .rva    .LSEH_end_bn_mul_mont   # label defined outside this view
+       .rva    .LSEH_info_bn_mul_mont
+
+.section       .xdata
+.align 8
+.LSEH_info_bn_mul_mont:
+       .byte   9,0,0,0                 # UNWIND_INFO header: version 1 + UNW_FLAG_EHANDLER, no unwind codes
+       .rva    se_handler              # language-specific handler for bn_mul_mont
 ___
+}
 
 print $code;	# write the accumulated assembly text to STDOUT (the pipe into the translator)
 close STDOUT;	# close the pipe