#! /usr/bin/env perl
-# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# the np argument is not just modulus value, but one interleaved
# with 0. This is to optimize post-condition...
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+ or die "can't call $xlate: $!";
*STDOUT=*OUT;
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
.type bn_mul_mont_gather5,\@function,6
.align 64
bn_mul_mont_gather5:
+.cfi_startproc
+ mov ${num}d,${num}d
+ mov %rsp,%rax
+.cfi_def_cfa_register %rax
test \$7,${num}d
jnz .Lmul_enter
___
.align 16
.Lmul_enter:
- mov ${num}d,${num}d
- mov %rsp,%rax
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
- lea .Linc(%rip),%r10
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
- lea 2($num),%r11
- neg %r11
- lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
- and \$-1024,%rsp # minimize TLB usage
+ neg $num
+ mov %rsp,%r11
+ lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8)
+ neg $num # restore $num
+ and \$-1024,%r10 # minimize TLB usage
- mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lmul_body:
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on stack being "wired" to
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
- sub %rsp,%rax
- and \$-4096,%rax
+ sub %r10,%r11
+ and \$-4096,%r11
+ lea (%r10,%r11),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
.Lmul_page_walk:
- mov (%rsp,%rax),%r11
- sub \$4096,%rax
- .byte 0x2e # predict non-taken
- jnc .Lmul_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ lea .Linc(%rip),%r10
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
+.Lmul_body:
lea 128($bp),%r12 # reassign $bp (+size optimization)
___
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
- dec $j # doesnn't affect CF!
+ dec $j # doesn't affect CF!
jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
+ mov \$-1,%rbx
+ xor %rax,%rbx
xor $i,$i
- and %rax,$ap
- not %rax
- mov $rp,$np
- and %rax,$np
mov $num,$j # j=num
- or $np,$ap # ap=borrow?tp:rp
-.align 16
-.Lcopy: # copy or in-place refresh
- mov ($ap,$i,8),%rax
+
+.Lcopy: # conditional copy
+ mov ($rp,$i,8),%rcx
+ mov (%rsp,$i,8),%rdx
+ and %rbx,%rcx
+ and %rax,%rdx
mov $i,(%rsp,$i,8) # zap temporary vector
- mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ or %rcx,%rdx
+ mov %rdx,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
mov 8(%rsp,$num,8),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmul_epilogue:
ret
+.cfi_endproc
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
.type bn_mul4x_mont_gather5,\@function,6
.align 32
bn_mul4x_mont_gather5:
+.cfi_startproc
+ .byte 0x67
+ mov %rsp,%rax
+.cfi_def_cfa_register %rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
je .Lmulx4x_enter
___
$code.=<<___;
- .byte 0x67
- mov %rsp,%rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lmul4x_prologue:
.byte 0x67
shl \$3,${num}d # convert $num to bytes
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmul4xsp_alt
- sub %r11,%rsp # align with $rp
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ sub %r11,%rbp # align with $rp
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lmul4xsp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
.Lmul4x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lmul4x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
neg $num
mov %rax,40(%rsp)
+.cfi_cfa_expression %rsp+40,deref,+8
.Lmul4x_body:
call mul4x_internal
mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmul4x_epilogue:
ret
+.cfi_endproc
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
.type mul4x_internal,\@abi-omnipotent
.align 32
mul4x_internal:
+.cfi_startproc
shl \$5,$num # $num was in bytes
movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
lea .Linc(%rip),%rax
___
}
$code.=<<___;
+.cfi_endproc
.size mul4x_internal,.-mul4x_internal
___
}}}
my $nptr="%rcx"; # const BN_ULONG *nptr,
my $n0 ="%r8"; # const BN_ULONG *n0);
my $num ="%r9"; # int num, has to be divisible by 8
- # int pwr
+ # int pwr
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
.type bn_power5,\@function,6
.align 32
bn_power5:
+.cfi_startproc
+ mov %rsp,%rax
+.cfi_def_cfa_register %rax
___
$code.=<<___ if ($addx);
mov OPENSSL_ia32cap_P+8(%rip),%r11d
je .Lpowerx5_enter
___
$code.=<<___;
- mov %rsp,%rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lpower5_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10d # 3*$num
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwr_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lpwr_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwr_page_walk
+ jmp .Lpwr_page_walk_done
+
.Lpwr_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lpwr_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwr_page_walk
+.Lpwr_page_walk_done:
- mov $num,%r10
+ mov $num,%r10
neg $num
##############################################################
#
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
.Lpower5_body:
movq $rptr,%xmm1 # save $rptr, used in sqr8x
movq $nptr,%xmm2 # save $nptr
call mul4x_internal
mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lpower5_epilogue:
ret
+.cfi_endproc
.size bn_power5,.-bn_power5
.globl bn_sqr8x_internal
.align 32
bn_sqr8x_internal:
__bn_sqr8x_internal:
+.cfi_startproc
##############################################################
# Squaring part:
#
.align 32
.L8x_tail_done:
+ xor %rax,%rax
add (%rdx),%r8 # can this overflow?
adc \$0,%r9
adc \$0,%r10
adc \$0,%r12
adc \$0,%r13
adc \$0,%r14
- adc \$0,%r15 # can't overflow, because we
- # started with "overhung" part
- # of multiplication
- xor %rax,%rax
+ adc \$0,%r15
+ adc \$0,%rax
neg $carry
.L8x_no_tail:
cmp %rdx,$tptr # end of t[]?
jb .L8x_reduction_loop
ret
+.cfi_endproc
.size bn_sqr8x_internal,.-bn_sqr8x_internal
___
}\f
.type __bn_post4x_internal,\@abi-omnipotent
.align 32
__bn_post4x_internal:
+.cfi_startproc
mov 8*0($nptr),%r12
lea (%rdi,$num),$tptr # %rdi was $tptr above
mov $num,%rcx
jnz .Lsqr4x_sub
mov $num,%r10 # prepare for back-to-back call
- neg $num # restore $num
+ neg $num # restore $num
ret
+.cfi_endproc
.size __bn_post4x_internal,.-__bn_post4x_internal
___
}
.type bn_from_montgomery,\@abi-omnipotent
.align 32
bn_from_montgomery:
+.cfi_startproc
testl \$7,`($win64?"48(%rsp)":"%r9d")`
jz bn_from_mont8x
xor %eax,%eax
ret
+.cfi_endproc
.size bn_from_montgomery,.-bn_from_montgomery
.type bn_from_mont8x,\@function,6
.align 32
bn_from_mont8x:
+.cfi_startproc
.byte 0x67
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lfrom_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
# last operation, we use the opportunity to cleanse it.
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lfrom_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lfrom_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lfrom_page_walk
+ jmp .Lfrom_page_walk_done
+
.Lfrom_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lfrom_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lfrom_page_walk
+.Lfrom_page_walk_done:
mov $num,%r10
neg $num
#
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
.Lfrom_body:
mov $num,%r11
lea 48(%rsp),%rax
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
- mov 40(%rsp),%rsi # restore %rsp
jmp .Lfrom_mont_zero
.align 32
pxor %xmm0,%xmm0
lea 48(%rsp),%rax
- mov 40(%rsp),%rsi # restore %rsp
jmp .Lfrom_mont_zero
.align 32
.Lfrom_mont_zero:
+ mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
movdqa %xmm0,16*0(%rax)
movdqa %xmm0,16*1(%rax)
movdqa %xmm0,16*2(%rax)
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lfrom_epilogue:
ret
+.cfi_endproc
.size bn_from_mont8x,.-bn_from_mont8x
___
}
.type bn_mulx4x_mont_gather5,\@function,6
.align 32
bn_mulx4x_mont_gather5:
-.Lmulx4x_enter:
+.cfi_startproc
mov %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmulx4x_enter:
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lmulx4x_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmulx4xsp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lmulx4xsp_done
.Lmulx4xsp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
-.Lmulx4xsp_done:
- and \$-64,%rsp # ensure alignment
- mov %rax,%r11
- sub %rsp,%r11
+ sub %r11,%rbp
+.Lmulx4xsp_done:
+ and \$-64,%rbp # ensure alignment
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
.Lmulx4x_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lmulx4x_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
##############################################################
# Stack layout
#
mov $n0, 32(%rsp) # save *n0
mov %rax,40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
.Lmulx4x_body:
call mulx4x_internal
mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lmulx4x_epilogue:
ret
+.cfi_endproc
.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
.type mulx4x_internal,\@abi-omnipotent
.align 32
mulx4x_internal:
+.cfi_startproc
mov $num,8(%rsp) # save -$num (it was in bytes)
mov $num,%r10
neg $num # restore $num
$code.=<<___;
movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
- lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton)
+ lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization)
lea 128($bp),$bptr # size optimization
pshufd \$0,%xmm5,%xmm5 # broadcast index
mov 8*2(%rbp),%r14
mov 8*3(%rbp),%r15
jmp .Lsqrx4x_sub_entry # common post-condition
+.cfi_endproc
.size mulx4x_internal,.-mulx4x_internal
___
}\f{
.type bn_powerx5,\@function,6
.align 32
bn_powerx5:
-.Lpowerx5_enter:
+.cfi_startproc
mov %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lpowerx5_enter:
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
+.Lpowerx5_prologue:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
+ mov %rsp,%rbp
sub $rptr,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lpwrx_sp_alt
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
- sub %r11,%rsp
+ sub %r11,%rbp
.Lpwrx_sp_done:
- and \$-64,%rsp
- mov %rax,%r11
- sub %rsp,%r11
+ and \$-64,%rbp
+ mov %rsp,%r11
+ sub %rbp,%r11
and \$-4096,%r11
+ lea (%rbp,%r11),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwrx_page_walk
+ jmp .Lpwrx_page_walk_done
+
.Lpwrx_page_walk:
- mov (%rsp,%r11),%r10
- sub \$4096,%r11
- .byte 0x2e # predict non-taken
- jnc .Lpwrx_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r10
+ cmp %rbp,%rsp
+ ja .Lpwrx_page_walk
+.Lpwrx_page_walk_done:
- mov $num,%r10
+ mov $num,%r10
neg $num
##############################################################
movq $bptr,%xmm4
mov $n0, 32(%rsp)
mov %rax, 40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
.Lpowerx5_body:
call __bn_sqrx8x_internal
call mulx4x_internal
mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
+.cfi_restore %r15
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lpowerx5_epilogue:
ret
+.cfi_endproc
.size bn_powerx5,.-bn_powerx5
.globl bn_sqrx8x_internal
.align 32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
+.cfi_startproc
##################################################################
# Squaring part:
#
.align 32
.Lsqrx8x_break:
- sub 16+8(%rsp),%r8 # consume last carry
+ xor $zero,$zero
+ sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
+ adcx $zero,%r8
mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
+ adcx $zero,%r9
mov 0*8($aptr),%rdx # a[8], modulo-scheduled
- xor %ebp,%ebp # xor $zero,$zero
+ adc \$0,%r10
mov %r8,0*8($tptr)
+ adc \$0,%r11
+ adc \$0,%r12
+ adc \$0,%r13
+ adc \$0,%r14
+ adc \$0,%r15
cmp $carry,$tptr # cf=0, of=0
je .Lsqrx8x_outer_loop
.align 32
.Lsqrx8x_tail_done:
+ xor %rax,%rax
add 24+8(%rsp),%r8 # can this overflow?
adc \$0,%r9
adc \$0,%r10
adc \$0,%r12
adc \$0,%r13
adc \$0,%r14
- adc \$0,%r15 # can't overflow, because we
- # started with "overhung" part
- # of multiplication
- mov $carry,%rax # xor %rax,%rax
+ adc \$0,%r15
+ adc \$0,%rax
sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
.Lsqrx8x_no_tail: # %cf is 0 if jumped here
adc 8*5($tptr),%r13
adc 8*6($tptr),%r14
adc 8*7($tptr),%r15
- adc %rax,%rax # top-most carry
+ adc \$0,%rax # top-most carry
mov 32+8(%rsp),%rbx # n0
mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
cmp 8+8(%rsp),%r8 # end of t[]?
jb .Lsqrx8x_reduction_loop
ret
+.cfi_endproc
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
___
}\f
$code.=<<___;
.align 32
__bn_postx4x_internal:
+.cfi_startproc
mov 8*0($nptr),%r12
mov %rcx,%r10 # -$num
mov %rcx,%r9 # -$num
neg %r9 # restore $num
ret
+.cfi_endproc
.size __bn_postx4x_internal,.-__bn_postx4x_internal
___
}
.type bn_get_bits5,\@abi-omnipotent
.align 16
bn_get_bits5:
+.cfi_startproc
lea 0($inp),%r10
lea 1($inp),%r11
mov $num,%ecx
shrl %cl,%eax
and \$31,%eax
ret
+.cfi_endproc
.size bn_get_bits5,.-bn_get_bits5
.globl bn_scatter5
.type bn_scatter5,\@abi-omnipotent
.align 16
bn_scatter5:
+.cfi_startproc
cmp \$0, $num
jz .Lscatter_epilogue
lea ($tbl,$idx,8),$tbl
jnz .Lscatter
.Lscatter_epilogue:
ret
+.cfi_endproc
.size bn_scatter5,.-bn_scatter5
.globl bn_gather5
.align 32
bn_gather5:
.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
+.cfi_startproc
# I can't trust assembler to use specific encoding:-(
.byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
lea (%r10),%rsp
ret
.LSEH_end_bn_gather5:
+.cfi_endproc
.size bn_gather5,.-bn_gather5
___
}
cmp %r10,%rbx # context->Rip<end of prologue label
jb .Lcommon_seh_tail
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # beginning of body label
+ cmp %r10,%rbx # context->Rip<body label
+ jb .Lcommon_pop_regs
+
mov 152($context),%rax # pull context->Rsp
- mov 4(%r11),%r10d # HandlerData[1]
+ mov 8(%r11),%r10d # HandlerData[2]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
- jmp .Lbody_proceed
+ jmp .Lcommon_pop_regs
.Lbody_40:
mov 40(%rax),%rax # pull saved stack pointer
-.Lbody_proceed:
+.Lcommon_pop_regs:
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
.LSEH_info_bn_mul_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+ .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_mul4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+ .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_power5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[]
+ .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_from_mont8x:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
+ .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
___
$code.=<<___ if ($addx);
.align 8
.LSEH_info_bn_mulx4x_mont_gather5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+ .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
.align 8
.LSEH_info_bn_powerx5:
.byte 9,0,0,0
.rva mul_handler
- .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
+ .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
___
$code.=<<___;
.align 8