-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
##############################################################################
# #
# Further optimization by <appro@openssl.org>:
#
-# this/original
-# Opteron +8-33%
-# Bulldozer +10-30%
-# P4 +14-38%
-# Westmere +8-23%
-# Sandy Bridge +8-24%
-# Ivy Bridge +7-25%
-# Haswell +5-25%
-# Atom +10-32%
-# VIA Nano +37-130%
+# this/original with/without -DECP_NISTZ256_ASM(*)
+# Opteron +12-49% +110-150%
+# Bulldozer +14-45% +175-210%
+# P4 +18-46% n/a :-(
+# Westmere +12-34% +80-87%
+# Sandy Bridge +9-35% +110-120%
+# Ivy Bridge +9-35% +110-125%
+# Haswell +8-37% +140-160%
+# Broadwell +18-58% +145-210%
+# Atom +15-50% +130-180%
+# VIA Nano +43-160% +300-480%
+#
+# (*) "without -DECP_NISTZ256_ASM" refers to build with
+# "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark.
+# on benchmark. Lower coefficients are for ECDSA sign, which is relatively
+# the fastest server-side operation. Keep in mind that +100% means a 2x
+# improvement.
$flavour = shift;
$output = shift;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
$addx = ($1>=12);
}
-if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
$avx = ($ver>=3.0) + ($ver>=3.01);
$addx = ($ver>=3.03);
push %r13
mov 8*0($a_ptr), $a0
+ xor $t4,$t4
mov 8*1($a_ptr), $a1
add $a0, $a0 # a0:a3+a0:a3
mov 8*2($a_ptr), $a2
adc $a2, $a2
adc $a3, $a3
mov $a1, $t1
- sbb $t4, $t4
+ adc \$0, $t4
sub 8*0($a_ptr), $a0
mov $a2, $t2
sbb 8*2($a_ptr), $a2
mov $a3, $t3
sbb 8*3($a_ptr), $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
+ cmovc $t0, $a0
+ cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
+ cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
+ cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
sbb \$0, $a2
mov $a3, $t3
sbb .Lpoly+8*3(%rip), $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
- cmovz $t2, $a2
- cmovz $t3, $a3
+ cmovc $t0, $a0
+ cmovc $t1, $a1
+ cmovc $t2, $a2
+ cmovc $t3, $a3
xor $t4, $t4
add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3]
sbb \$0, $a2
mov $a3, $t3
sbb .Lpoly+8*3(%rip), $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
+ cmovc $t0, $a0
+ cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
+ cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
+ cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
sbb 8*2($a_ptr), $a2
mov $a3, $t3
sbb 8*3($a_ptr), $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
+ cmovc $t0, $a0
+ cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
+ cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
+ cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
# and add the result to the acc.
# Due to the special form of p256 we do some optimizations
#
- # acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
- # then we add acc[0] and get acc[0] x 2^64
-
- mulq $poly1
- xor $t0, $t0
- add $acc0, $acc1 # +=acc[0]*2^64
- adc \$0, %rdx
- add %rax, $acc1
- mov $acc0, %rax
-
- # acc[0] x p256[2] = 0
- adc %rdx, $acc2
- adc \$0, $t0
+ # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
+ # then we add acc[0] and get acc[0] x 2^96
+ mov $acc0, $t1
+ shl \$32, $acc0
mulq $poly3
- xor $acc0, $acc0
- add $t0, $acc3
- adc \$0, %rdx
- add %rax, $acc3
+ shr \$32, $t1
+ add $acc0, $acc1 # +=acc[0]<<96
+ adc $t1, $acc2
+ adc %rax, $acc3
mov 8*1($b_ptr), %rax
adc %rdx, $acc4
adc \$0, $acc5
+ xor $acc0, $acc0
########################################################################
# Multiply by b[1]
########################################################################
# Second reduction step
- mulq $poly1
- xor $t0, $t0
- add $acc1, $acc2
- adc \$0, %rdx
- add %rax, $acc2
- mov $acc1, %rax
- adc %rdx, $acc3
- adc \$0, $t0
-
+ mov $acc1, $t1
+ shl \$32, $acc1
mulq $poly3
- xor $acc1, $acc1
- add $t0, $acc4
- adc \$0, %rdx
- add %rax, $acc4
+ shr \$32, $t1
+ add $acc1, $acc2
+ adc $t1, $acc3
+ adc %rax, $acc4
mov 8*2($b_ptr), %rax
adc %rdx, $acc5
adc \$0, $acc0
+ xor $acc1, $acc1
########################################################################
# Multiply by b[2]
########################################################################
# Third reduction step
- mulq $poly1
- xor $t0, $t0
- add $acc2, $acc3
- adc \$0, %rdx
- add %rax, $acc3
- mov $acc2, %rax
- adc %rdx, $acc4
- adc \$0, $t0
-
+ mov $acc2, $t1
+ shl \$32, $acc2
mulq $poly3
- xor $acc2, $acc2
- add $t0, $acc5
- adc \$0, %rdx
- add %rax, $acc5
+ shr \$32, $t1
+ add $acc2, $acc3
+ adc $t1, $acc4
+ adc %rax, $acc5
mov 8*3($b_ptr), %rax
adc %rdx, $acc0
adc \$0, $acc1
+ xor $acc2, $acc2
########################################################################
# Multiply by b[3]
########################################################################
# Final reduction step
- mulq $poly1
- #xor $t0, $t0
- add $acc3, $acc4
- adc \$0, %rdx
- add %rax, $acc4
- mov $acc3, %rax
- adc %rdx, $acc5
- #adc \$0, $t0 # doesn't overflow
-
+ mov $acc3, $t1
+ shl \$32, $acc3
mulq $poly3
- #add $t0, $acc0
- #adc \$0, %rdx
+ shr \$32, $t1
+ add $acc3, $acc4
+ adc $t1, $acc5
mov $acc4, $t0
- add %rax, $acc0
+ adc %rax, $acc0
adc %rdx, $acc1
mov $acc5, $t1
adc \$0, $acc2
sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t3
sbb $poly3, $acc1 # .Lpoly[3]
- neg $acc2
+ sbb \$0, $acc2
- cmovnc $t0, $acc4
- cmovnc $t1, $acc5
+ cmovc $t0, $acc4
+ cmovc $t1, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $t2, $acc0
+ cmovc $t2, $acc0
mov $acc5, 8*1($r_ptr)
- cmovnc $t3, $acc1
+ cmovc $t3, $acc1
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
##########################################
# Now the reduction
# First iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc0, $acc1
- adc \$0, %rdx
- add %rax, $acc1
- mov $acc0, %rax
- adc %rdx, $acc2 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc0, $t0
+ shl \$32, $acc0
mulq $t1
- xor $acc0, $acc0
- #add $t0, $acc3
- #adc \$0, %rdx
- add %rax, $acc3
+ shr \$32, $t0
+ add $acc0, $acc1 # +=acc[0]<<96
+ adc $t0, $acc2
+ adc %rax, $acc3
mov $acc1, %rax
- adc %rdx, $acc4
- adc \$0, $acc0
+ adc \$0, %rdx
##########################################
# Second iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc1, $acc2
- adc \$0, %rdx
- add %rax, $acc2
- mov $acc1, %rax
- adc %rdx, $acc3 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc1, $t0
+ shl \$32, $acc1
+ mov %rdx, $acc0
mulq $t1
- xor $acc1, $acc1
- #add $t0, $acc4
- #adc \$0, %rdx
- add %rax, $acc4
+ shr \$32, $t0
+ add $acc1, $acc2
+ adc $t0, $acc3
+ adc %rax, $acc0
mov $acc2, %rax
- adc %rdx, $acc0
- adc \$0, $acc1
+ adc \$0, %rdx
##########################################
# Third iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc2, $acc3
- adc \$0, %rdx
- add %rax, $acc3
- mov $acc2, %rax
- adc %rdx, $acc4 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc2, $t0
+ shl \$32, $acc2
+ mov %rdx, $acc1
mulq $t1
- xor $acc2, $acc2
- #add $t0, $acc0
- #adc \$0, %rdx
- add %rax, $acc0
+ shr \$32, $t0
+ add $acc2, $acc3
+ adc $t0, $acc0
+ adc %rax, $acc1
mov $acc3, %rax
- adc %rdx, $acc1
- adc \$0, $acc2
+ adc \$0, %rdx
###########################################
# Last iteration
- mulq $a_ptr
- #xor $t0, $t0
- add $acc3, $acc4
- adc \$0, %rdx
- add %rax, $acc4
- mov $acc3, %rax
- adc %rdx, $acc0 # doesn't overflow
- #adc \$0, $t0
-
+ mov $acc3, $t0
+ shl \$32, $acc3
+ mov %rdx, $acc2
mulq $t1
+ shr \$32, $t0
+ add $acc3, $acc0
+ adc $t0, $acc1
+ adc %rax, $acc2
+ adc \$0, %rdx
xor $acc3, $acc3
- #add $t0, $acc1
- #adc \$0, %rdx
- add %rax, $acc1
- adc %rdx, $acc2
- adc \$0, $acc3
############################################
# Add the rest of the acc
- add $acc0, $acc5
+ add $acc0, $acc4
+ adc $acc1, $acc5
mov $acc4, $acc0
- adc $acc1, $acc6
- adc $acc2, $acc7
+ adc $acc2, $acc6
+ adc %rdx, $acc7
mov $acc5, $acc1
adc \$0, $acc3
sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $t0
sbb $t1, $acc7 # .Lpoly[3]
- neg $acc3
+ sbb \$0, $acc3
- cmovnc $acc0, $acc4
- cmovnc $acc1, $acc5
+ cmovc $acc0, $acc4
+ cmovc $acc1, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $acc2, $acc6
+ cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr)
- cmovnc $t0, $acc7
+ cmovc $t0, $acc7
mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr)
########################################################################
# First reduction step
- xor $acc0, $acc0 # $acc0=0,cf=0,of=0
- adox $t1, $acc1
- adox $t0, $acc2
+ add $t1, $acc1
+ adc $t0, $acc2
mulx $poly3, $t0, $t1
mov 8*1($b_ptr), %rdx
- adox $t0, $acc3
- adcx $t1, $acc4
-
- adox $acc0, $acc4
- adcx $acc0, $acc5 # cf=0
- adox $acc0, $acc5 # of=0
+ adc $t0, $acc3
+ adc $t1, $acc4
+ adc \$0, $acc5
+ xor $acc0, $acc0 # $acc0=0,cf=0,of=0
########################################################################
# Multiply by b[1]
########################################################################
# Second reduction step
- xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
- adox $t0, $acc2
- adox $t1, $acc3
+ add $t0, $acc2
+ adc $t1, $acc3
mulx $poly3, $t0, $t1
mov 8*2($b_ptr), %rdx
- adox $t0, $acc4
- adcx $t1, $acc5
-
- adox $acc1, $acc5
- adcx $acc1, $acc0 # cf=0
- adox $acc1, $acc0 # of=0
+ adc $t0, $acc4
+ adc $t1, $acc5
+ adc \$0, $acc0
+ xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
########################################################################
# Multiply by b[2]
########################################################################
# Third reduction step
- xor $acc2, $acc2 # $acc2=0,cf=0,of=0
- adox $t0, $acc3
- adox $t1, $acc4
+ add $t0, $acc3
+ adc $t1, $acc4
mulx $poly3, $t0, $t1
mov 8*3($b_ptr), %rdx
- adox $t0, $acc5
- adcx $t1, $acc0
-
- adox $acc2, $acc0
- adcx $acc2, $acc1 # cf=0
- adox $acc2, $acc1 # of=0
+ adc $t0, $acc5
+ adc $t1, $acc0
+ adc \$0, $acc1
+ xor $acc2, $acc2 # $acc2=0,cf=0,of=0
########################################################################
# Multiply by b[3]
########################################################################
# Fourth reduction step
- xor $acc3, $acc3 # $acc3=0,cf=0,of=0
- adox $t0, $acc4
- adox $t1, $acc5
+ add $t0, $acc4
+ adc $t1, $acc5
mulx $poly3, $t0, $t1
mov $acc4, $t2
mov .Lpoly+8*1(%rip), $poly1
- adcx $t0, $acc0
- adox $t1, $acc1
+ adc $t0, $acc0
mov $acc5, $t3
-
- adcx $acc3, $acc1
- adox $acc3, $acc2
+ adc $t1, $acc1
adc \$0, $acc2
- mov $acc0, $t0
########################################################################
# Branch-less conditional subtraction of P
xor %eax, %eax
+ mov $acc0, $t0
sbb \$-1, $acc4 # .Lpoly[0]
sbb $poly1, $acc5 # .Lpoly[1]
sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t1
sbb $poly3, $acc1 # .Lpoly[3]
+ sbb \$0, $acc2
- bt \$0,$acc2
- cmovnc $t2, $acc4
- cmovnc $t3, $acc5
+ cmovc $t2, $acc4
+ cmovc $t3, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $t0, $acc0
+ cmovc $t0, $acc0
mov $acc5, 8*1($r_ptr)
- cmovnc $t1, $acc1
+ cmovc $t1, $acc1
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
mov .Lpoly+8*3(%rip), $t1
# reduction step 1
- xor $acc0, $acc0
- adcx $t0, $acc1
- adcx $t4, $acc2
+ add $t0, $acc1
+ adc $t4, $acc2
- mulx $t1, $t0, $t4
+ mulx $t1, $t0, $acc0
mov $acc1, %rdx
- adcx $t0, $acc3
+ adc $t0, $acc3
shlx $a_ptr, $acc1, $t0
- adox $t4, $acc0
- shrx $a_ptr, $acc1, $t4
adc \$0, $acc0
+ shrx $a_ptr, $acc1, $t4
# reduction step 2
- xor $acc1, $acc1
- adcx $t0, $acc2
- adcx $t4, $acc3
+ add $t0, $acc2
+ adc $t4, $acc3
- mulx $t1, $t0, $t4
+ mulx $t1, $t0, $acc1
mov $acc2, %rdx
- adcx $t0, $acc0
+ adc $t0, $acc0
shlx $a_ptr, $acc2, $t0
- adox $t4, $acc1
- shrx $a_ptr, $acc2, $t4
adc \$0, $acc1
+ shrx $a_ptr, $acc2, $t4
# reduction step 3
- xor $acc2, $acc2
- adcx $t0, $acc3
- adcx $t4, $acc0
+ add $t0, $acc3
+ adc $t4, $acc0
- mulx $t1, $t0, $t4
+ mulx $t1, $t0, $acc2
mov $acc3, %rdx
- adcx $t0, $acc1
+ adc $t0, $acc1
shlx $a_ptr, $acc3, $t0
- adox $t4, $acc2
- shrx $a_ptr, $acc3, $t4
adc \$0, $acc2
+ shrx $a_ptr, $acc3, $t4
# reduction step 4
- xor $acc3, $acc3
- adcx $t0, $acc0
- adcx $t4, $acc1
+ add $t0, $acc0
+ adc $t4, $acc1
- mulx $t1, $t0, $t4
- adcx $t0, $acc2
- adox $t4, $acc3
+ mulx $t1, $t0, $acc3
+ adc $t0, $acc2
adc \$0, $acc3
xor $t3, $t3 # cf=0
sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $acc3
sbb $t1, $acc7 # .Lpoly[3]
+ sbb \$0, $t3
- bt \$0,$t3
- cmovnc $acc0, $acc4
- cmovnc $acc1, $acc5
+ cmovc $acc0, $acc4
+ cmovc $acc1, $acc5
mov $acc4, 8*0($r_ptr)
- cmovnc $acc2, $acc6
+ cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr)
- cmovnc $acc3, $acc7
+ cmovc $acc3, $acc7
mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr)
}
{
my ($r_ptr,$in_ptr)=("%rdi","%rsi");
-my ($acc0,$acc1,$acc2,$acc3,$acc4)=map("%r$_",(8..12));
-my ($t0,$t1)=("%rcx","%rsi");
+my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
+my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
$code.=<<___;
################################################################################
push %r13
mov 8*0($in_ptr), %rax
+ mov .Lpoly+8*3(%rip), $t2
mov 8*1($in_ptr), $acc1
mov 8*2($in_ptr), $acc2
mov 8*3($in_ptr), $acc3
- lea .Lpoly(%rip), $in_ptr
- xor $acc4, $acc4
mov %rax, $acc0
+ mov .Lpoly+8*1(%rip), $t1
#########################################
# First iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
+ mov %rax, $t0
+ shl \$32, $acc0
+ mulq $t2
+ shr \$32, $t0
add $acc0, $acc1
- adc \$0, %rdx
- add %rax, $acc1
- mov $acc0, %rax
- adc %rdx, $acc2
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- xor $acc0, $acc0
- add $t0, $acc3
- adc \$0, %rdx
- add %rax, $acc3
+ adc $t0, $acc2
+ adc %rax, $acc3
mov $acc1, %rax
- adc %rdx, $acc4
- adc \$0, $acc0
+ adc \$0, %rdx
#########################################
# Second iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
+ mov $acc1, $t0
+ shl \$32, $acc1
+ mov %rdx, $acc0
+ mulq $t2
+ shr \$32, $t0
add $acc1, $acc2
- adc \$0, %rdx
- add %rax, $acc2
- mov $acc1, %rax
- adc %rdx, $acc3
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- xor $acc1, $acc1
- add $t0, $acc4
- adc \$0, %rdx
- add %rax, $acc4
+ adc $t0, $acc3
+ adc %rax, $acc0
mov $acc2, %rax
- adc %rdx, $acc0
- adc \$0, $acc1
+ adc \$0, %rdx
##########################################
# Third iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
+ mov $acc2, $t0
+ shl \$32, $acc2
+ mov %rdx, $acc1
+ mulq $t2
+ shr \$32, $t0
add $acc2, $acc3
- adc \$0, %rdx
- add %rax, $acc3
- mov $acc2, %rax
- adc %rdx, $acc4
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- xor $acc2, $acc2
- add $t0, $acc0
- adc \$0, %rdx
- add %rax, $acc0
+ adc $t0, $acc0
+ adc %rax, $acc1
mov $acc3, %rax
- adc %rdx, $acc1
- adc \$0, $acc2
+ adc \$0, %rdx
###########################################
# Last iteration
- mulq 1*8($in_ptr)
- xor $t0, $t0
- add $acc3, $acc4
- adc \$0, %rdx
- add %rax, $acc4
- mov $acc3, %rax
- adc %rdx, $acc0
- adc \$0, $t0
-
- mulq 3*8($in_ptr)
- add $t0, $acc1
+ mov $acc3, $t0
+ shl \$32, $acc3
+ mov %rdx, $acc2
+ mulq $t2
+ shr \$32, $t0
+ add $acc3, $acc0
+ adc $t0, $acc1
+ mov $acc0, $t0
+ adc %rax, $acc2
+ mov $acc1, $in_ptr
adc \$0, %rdx
- add %rax, $acc1
- adc %rdx, $acc2
- sbb $acc3, $acc3
- mov 0*8($in_ptr), %rax
- mov 1*8($in_ptr), %rdx
- mov 2*8($in_ptr), $t0
- mov 3*8($in_ptr), $t1
-
- and $acc3, %rax
- and $acc3, %rdx
- and $acc3, $t0
- and $acc3, $t1
-
- sub %rax, $acc4
- sbb %rdx, $acc0
- mov $acc4, 8*0($r_ptr)
- sbb $t0, $acc1
- mov $acc0, 8*1($r_ptr)
- sbb $t1, $acc2
- mov $acc1, 8*2($r_ptr)
- mov $acc2, 8*3($r_ptr)
+ ###########################################
+ # Branch-less conditional subtraction
+ sub \$-1, $acc0
+ mov $acc2, %rax
+ sbb $t1, $acc1
+ sbb \$0, $acc2
+ mov %rdx, $acc3
+ sbb $t2, %rdx
+ sbb $t2, $t2
+
+ cmovnz $t0, $acc0
+ cmovnz $in_ptr, $acc1
+ mov $acc0, 8*0($r_ptr)
+ cmovnz %rax, $acc2
+ mov $acc1, 8*1($r_ptr)
+ cmovz %rdx, $acc3
+ mov $acc2, 8*2($r_ptr)
+ mov $acc3, 8*3($r_ptr)
pop %r13
pop %r12
$code.=<<___;
################################################################################
-# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
-.globl ecp_nistz256_select_w5
-.type ecp_nistz256_select_w5,\@abi-omnipotent
+# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_scatter_w5
+.type ecp_nistz256_scatter_w5,\@abi-omnipotent
+.align 32
+ecp_nistz256_scatter_w5:
+ lea -3($index,$index,2), $index
+ movdqa 0x00($in_t), %xmm0
+ shl \$5, $index
+ movdqa 0x10($in_t), %xmm1
+ movdqa 0x20($in_t), %xmm2
+ movdqa 0x30($in_t), %xmm3
+ movdqa 0x40($in_t), %xmm4
+ movdqa 0x50($in_t), %xmm5
+ movdqa %xmm0, 0x00($val,$index)
+ movdqa %xmm1, 0x10($val,$index)
+ movdqa %xmm2, 0x20($val,$index)
+ movdqa %xmm3, 0x30($val,$index)
+ movdqa %xmm4, 0x40($val,$index)
+ movdqa %xmm5, 0x50($val,$index)
+
+ ret
+.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
+
+################################################################################
+# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_gather_w5
+.type ecp_nistz256_gather_w5,\@abi-omnipotent
.align 32
-ecp_nistz256_select_w5:
+ecp_nistz256_gather_w5:
___
$code.=<<___ if ($avx>1);
mov OPENSSL_ia32cap_P+8(%rip), %eax
test \$`1<<5`, %eax
- jnz .Lavx2_select_w5
+ jnz .Lavx2_gather_w5
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
-.LSEH_begin_ecp_nistz256_select_w5:
+.LSEH_begin_ecp_nistz256_gather_w5:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_select_w5:
+.LSEH_end_ecp_nistz256_gather_w5:
___
$code.=<<___;
ret
-.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
+
+################################################################################
+# void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_scatter_w7
+.type ecp_nistz256_scatter_w7,\@abi-omnipotent
+.align 32
+ecp_nistz256_scatter_w7:
+ movdqu 0x00($in_t), %xmm0
+ shl \$6, $index
+ movdqu 0x10($in_t), %xmm1
+ movdqu 0x20($in_t), %xmm2
+ movdqu 0x30($in_t), %xmm3
+ movdqa %xmm0, 0x00($val,$index)
+ movdqa %xmm1, 0x10($val,$index)
+ movdqa %xmm2, 0x20($val,$index)
+ movdqa %xmm3, 0x30($val,$index)
+
+ ret
+.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
################################################################################
-# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
-.globl ecp_nistz256_select_w7
-.type ecp_nistz256_select_w7,\@abi-omnipotent
+# void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_gather_w7
+.type ecp_nistz256_gather_w7,\@abi-omnipotent
.align 32
-ecp_nistz256_select_w7:
+ecp_nistz256_gather_w7:
___
$code.=<<___ if ($avx>1);
mov OPENSSL_ia32cap_P+8(%rip), %eax
test \$`1<<5`, %eax
- jnz .Lavx2_select_w7
+ jnz .Lavx2_gather_w7
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
-.LSEH_begin_ecp_nistz256_select_w7:
+.LSEH_begin_ecp_nistz256_gather_w7:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_select_w7:
+.LSEH_end_ecp_nistz256_gather_w7:
___
$code.=<<___;
ret
-.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
if ($avx>1) {
$code.=<<___;
################################################################################
-# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
-.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent
+# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
+.type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
.align 32
-ecp_nistz256_avx2_select_w5:
-.Lavx2_select_w5:
+ecp_nistz256_avx2_gather_w5:
+.Lavx2_gather_w5:
vzeroupper
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
-.LSEH_begin_ecp_nistz256_avx2_select_w5:
+.LSEH_begin_ecp_nistz256_avx2_gather_w5:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_avx2_select_w5:
+.LSEH_end_ecp_nistz256_avx2_gather_w5:
___
$code.=<<___;
ret
-.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
+.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
___
}
if ($avx>1) {
$code.=<<___;
################################################################################
-# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
-.globl ecp_nistz256_avx2_select_w7
-.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent
+# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
+.globl ecp_nistz256_avx2_gather_w7
+.type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
.align 32
-ecp_nistz256_avx2_select_w7:
-.Lavx2_select_w7:
+ecp_nistz256_avx2_gather_w7:
+.Lavx2_gather_w7:
vzeroupper
___
$code.=<<___ if ($win64);
lea -0x88(%rsp), %rax
-.LSEH_begin_ecp_nistz256_avx2_select_w7:
+.LSEH_begin_ecp_nistz256_avx2_gather_w7:
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
movaps 0x80(%rsp), %xmm14
movaps 0x90(%rsp), %xmm15
lea 0xa8(%rsp), %rsp
-.LSEH_end_ecp_nistz256_avx2_select_w7:
+.LSEH_end_ecp_nistz256_avx2_gather_w7:
___
$code.=<<___;
ret
-.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
___
} else {
$code.=<<___;
-.globl ecp_nistz256_avx2_select_w7
-.type ecp_nistz256_avx2_select_w7,\@function,3
+.globl ecp_nistz256_avx2_gather_w7
+.type ecp_nistz256_avx2_gather_w7,\@function,3
.align 32
-ecp_nistz256_avx2_select_w7:
+ecp_nistz256_avx2_gather_w7:
.byte 0x0f,0x0b # ud2
ret
-.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
+.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
___
}
{{{
.type __ecp_nistz256_add_toq,\@abi-omnipotent
.align 32
__ecp_nistz256_add_toq:
+ xor $t4,$t4
add 8*0($b_ptr), $a0
adc 8*1($b_ptr), $a1
mov $a0, $t0
adc 8*2($b_ptr), $a2
adc 8*3($b_ptr), $a3
mov $a1, $t1
- sbb $t4, $t4
+ adc \$0, $t4
sub \$-1, $a0
mov $a2, $t2
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
+ cmovc $t0, $a0
+ cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
+ cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
+ cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
.align 32
__ecp_nistz256_mul_by_2q:
+ xor $t4, $t4
add $a0, $a0 # a0:a3+a0:a3
adc $a1, $a1
mov $a0, $t0
adc $a2, $a2
adc $a3, $a3
mov $a1, $t1
- sbb $t4, $t4
+ adc \$0, $t4
sub \$-1, $a0
mov $a2, $t2
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $a0
- cmovz $t1, $a1
+ cmovc $t0, $a0
+ cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovz $t2, $a2
+ cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovz $t3, $a3
+ cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
push %r15
sub \$32*5+8, %rsp
+.Lpoint_double_shortcut$x:
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
mov $a_ptr, $b_ptr # backup copy
movdqu 0x10($a_ptr), %xmm1
mov $b_org, $a_ptr # reassign
movdqa %xmm0, $in1_x(%rsp)
movdqa %xmm1, $in1_x+0x10(%rsp)
- por %xmm0, %xmm1
movdqa %xmm2, $in1_y(%rsp)
movdqa %xmm3, $in1_y+0x10(%rsp)
- por %xmm2, %xmm3
movdqa %xmm4, $in1_z(%rsp)
movdqa %xmm5, $in1_z+0x10(%rsp)
- por %xmm1, %xmm3
+ por %xmm4, %xmm5
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
- pshufd \$0xb1, %xmm3, %xmm5
+ pshufd \$0xb1, %xmm5, %xmm3
movdqu 0x10($a_ptr), %xmm1
movdqu 0x20($a_ptr), %xmm2
por %xmm3, %xmm5
movdqa %xmm0, $in2_x(%rsp)
pshufd \$0x1e, %xmm5, %xmm4
movdqa %xmm1, $in2_x+0x10(%rsp)
- por %xmm0, %xmm1
- movq $r_ptr, %xmm0 # save $r_ptr
+ movdqu 0x40($a_ptr),%xmm0 # in2_z again
+ movdqu 0x50($a_ptr),%xmm1
movdqa %xmm2, $in2_y(%rsp)
movdqa %xmm3, $in2_y+0x10(%rsp)
- por %xmm2, %xmm3
por %xmm4, %xmm5
pxor %xmm4, %xmm4
- por %xmm1, %xmm3
+ por %xmm0, %xmm1
+ movq $r_ptr, %xmm0 # save $r_ptr
lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
pcmpeqd %xmm4, %xmm5
- pshufd \$0xb1, %xmm3, %xmm4
- por %xmm3, %xmm4
+ pshufd \$0xb1, %xmm1, %xmm4
+ por %xmm1, %xmm4
pshufd \$0, %xmm5, %xmm5 # in1infty
pshufd \$0x1e, %xmm4, %xmm3
por %xmm3, %xmm4
mov 0x40+8*1($b_ptr), $acc6
mov 0x40+8*2($b_ptr), $acc7
mov 0x40+8*3($b_ptr), $acc0
+ movq $b_ptr, %xmm1
lea 0x40-$bias($b_ptr), $a_ptr
lea $Z1sqr(%rsp), $r_ptr # Z1^2
test $acc0, $acc0
jnz .Ladd_proceed$x # (in1infty || in2infty)?
test $acc1, $acc1
- jz .Ladd_proceed$x # is_equal(S1,S2)?
+ jz .Ladd_double$x # is_equal(S1,S2)?
movq %xmm0, $r_ptr # restore $r_ptr
pxor %xmm0, %xmm0
movdqu %xmm0, 0x50($r_ptr)
jmp .Ladd_done$x
+.align 32
+.Ladd_double$x:
+ movq %xmm1, $a_ptr # restore $a_ptr
+ movq %xmm0, $r_ptr # restore $r_ptr
+ add \$`32*(18-5)`, %rsp # difference in frame sizes
+ jmp .Lpoint_double_shortcut$x
+
.align 32
.Ladd_proceed$x:
`&load_for_sqr("$R(%rsp)", "$src0")`
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
+ xor $t4, $t4
add $acc0, $acc0 # a0:a3+a0:a3
lea $Rsqr(%rsp), $a_ptr
adc $acc1, $acc1
adc $acc2, $acc2
adc $acc3, $acc3
mov $acc1, $t1
- sbb $t4, $t4
+ adc \$0, $t4
sub \$-1, $acc0
mov $acc2, $t2
sbb \$0, $acc2
mov $acc3, $t3
sbb $poly3, $acc3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $acc0
+ cmovc $t0, $acc0
mov 8*0($a_ptr), $t0
- cmovz $t1, $acc1
+ cmovc $t1, $acc1
mov 8*1($a_ptr), $t1
- cmovz $t2, $acc2
+ cmovc $t2, $acc2
mov 8*2($a_ptr), $t2
- cmovz $t3, $acc3
+ cmovc $t3, $acc3
mov 8*3($a_ptr), $t3
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
mov 0x40+8*3($a_ptr), $acc0
movdqa %xmm0, $in1_x(%rsp)
movdqa %xmm1, $in1_x+0x10(%rsp)
- por %xmm0, %xmm1
movdqa %xmm2, $in1_y(%rsp)
movdqa %xmm3, $in1_y+0x10(%rsp)
- por %xmm2, %xmm3
movdqa %xmm4, $in1_z(%rsp)
movdqa %xmm5, $in1_z+0x10(%rsp)
- por %xmm1, %xmm3
+ por %xmm4, %xmm5
movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
- pshufd \$0xb1, %xmm3, %xmm5
+ pshufd \$0xb1, %xmm5, %xmm3
movdqu 0x10($b_ptr), %xmm1
movdqu 0x20($b_ptr), %xmm2
por %xmm3, %xmm5
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
+ xor $t4, $t4
add $acc0, $acc0 # a0:a3+a0:a3
lea $Rsqr(%rsp), $a_ptr
adc $acc1, $acc1
adc $acc2, $acc2
adc $acc3, $acc3
mov $acc1, $t1
- sbb $t4, $t4
+ adc \$0, $t4
sub \$-1, $acc0
mov $acc2, $t2
sbb \$0, $acc2
mov $acc3, $t3
sbb $poly3, $acc3
- test $t4, $t4
+ sbb \$0, $t4
- cmovz $t0, $acc0
+ cmovc $t0, $acc0
mov 8*0($a_ptr), $t0
- cmovz $t1, $acc1
+ cmovc $t1, $acc1
mov 8*1($a_ptr), $t1
- cmovz $t2, $acc2
+ cmovc $t2, $acc2
mov 8*2($a_ptr), $t2
- cmovz $t3, $acc3
+ cmovc $t3, $acc3
mov 8*3($a_ptr), $t3
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
+ sbb \$0, $t4
- bt \$0, $t4
- cmovnc $t0, $a0
- cmovnc $t1, $a1
+ cmovc $t0, $a0
+ cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovnc $t2, $a2
+ cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovnc $t3, $a3
+ cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
sbb \$0, $a2
mov $a3, $t3
sbb $poly3, $a3
+ sbb \$0, $t4
- bt \$0, $t4
- cmovnc $t0, $a0
- cmovnc $t1, $a1
+ cmovc $t0, $a0
+ cmovc $t1, $a1
mov $a0, 8*0($r_ptr)
- cmovnc $t2, $a2
+ cmovc $t2, $a2
mov $a1, 8*1($r_ptr)
- cmovnc $t3, $a3
+ cmovc $t3, $a3
mov $a2, 8*2($r_ptr)
mov $a3, 8*3($r_ptr)
}
}}}
+########################################################################
+# Convert ecp_nistz256_table.c to layout expected by ecp_nistz256_gather_w7
+# The resulting ecp_nistz256_precomputed object is printed directly, not appended to $code.
+open TABLE,"<ecp_nistz256_table.c" or  # first try the current directory,
+open TABLE,"<${dir}../ecp_nistz256_table.c" or  # then one level above ${dir}
+die "failed to open ecp_nistz256_table.c:",$!;
+
+use integer;  # integer arithmetic from here on (lexically scoped pragma)
+
+foreach(<TABLE>) {  # visit every line of the C table source
+ s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;  # per TOBN(a,b): append b, then a, as numbers
+}
+close TABLE;
+
+die "insane number of elements" if ($#arr != 64*16*37-1);  # @arr must hold exactly 64*16*37 values
+
+print <<___;
+.text
+.globl ecp_nistz256_precomputed
+.type ecp_nistz256_precomputed,\@object
+.align 4096
+ecp_nistz256_precomputed:
+___
+while (@line=splice(@arr,0,16)) {  # consume @arr 16 values at a time
+ print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n";  # one .long directive per group
+}
+print <<___;
+.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
+___
+
# Replace every `...` span in the accumulated assembly with the result of
# evaluating its contents as Perl (e.g. constant expressions such as 1<<5).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is aliased to the pipe feeding x86_64-xlate.pl. Buffered-write
# errors and a non-zero exit of the translator only become visible when the
# pipe is closed, so a failed close must abort instead of silently leaving
# truncated or missing output behind.
close STDOUT or die "error closing STDOUT: $! (child status $?)";