# measured with rdtsc at fixed clock frequency.
#
# IALU/gcc-4.8(*) AVX(**) AVX2
-# P4 4.90/+120% -
-# Core 2 2.39/+90% -
-# Westmere 1.86/+120% -
+# P4 4.46/+120% -
+# Core 2 2.41/+90% -
+# Westmere 1.88/+120% -
# Sandy Bridge 1.39/+140% 1.10
-# Haswell 1.10/+175% 1.11 0.65
-# Skylake 1.12/+120% 0.96 0.51
+# Haswell 1.14/+175% 1.11 0.65
+# Skylake 1.13/+120% 0.96 0.51
# Silvermont 2.83/+95% -
# VIA Nano 1.82/+150% -
# Sledgehammer 1.38/+160% -
-# Bulldozer 2.21/+130% 0.97
+# Bulldozer 2.30/+130% 0.97
#
# (*) improvement coefficients relative to clang are more modest and
# are ~50% on most processors, in both cases we are comparing to
add $d3,%rax
add %rax,$h0
adc \$0,$h1
+ adc \$0,$h2
___
}
.extern OPENSSL_ia32cap_P
.globl poly1305_init
+.hidden poly1305_init
.globl poly1305_blocks
+.hidden poly1305_blocks
.globl poly1305_emit
+.hidden poly1305_emit
+
.type poly1305_init,\@function,3
.align 32
poly1305_init:
and 8($inp),%rcx
mov %rax,24($ctx)
mov %rcx,32($ctx)
-
+___
+$code.=<<___ if ($flavour !~ /elf32/);
mov %r10,0(%rdx)
mov %r11,8(%rdx)
-
+___
+$code.=<<___ if ($flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
+$code.=<<___;
mov \$1,%eax
.Lno_key:
ret
.align 32
poly1305_blocks:
.Lblocks:
- sub \$16,$len # too short?
- jc .Lno_data
+ shr \$4,$len
+ jz .Lno_data # too short
push %rbx
push %rbp
&poly1305_iteration();
$code.=<<___;
mov $r1,%rax
- sub \$16,%r15 # len-=16
- jnc .Loop
+ dec %r15 # len--: len counts 16-byte blocks after the shr at .Lblocks
+ jnz .Loop
mov $h0,0($ctx) # store hash value
mov $h1,8($ctx)
################################# base 2^26 -> base 2^64
mov $d1#d,$h0#d
- and \$-1<<31,$d1
+ and \$`-1*(1<<31)`,$d1
mov $d2,$r1 # borrow $r1
mov $d2#d,$h1#d
- and \$-1<<31,$d2
+ and \$`-1*(1<<31)`,$d2
shr \$6,$d1
shl \$52,$r1
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
+ adc \$0,$h2
mov $s1,$r1
mov $s1,%rax
add %rcx,%rax
add %rax,%r8
adc \$0,%r9
+ adc \$0,%r10
mov %r8,%rax
add \$5,%r8 # compare to modulus
################################# base 2^26 -> base 2^64
mov $d1#d,$h0#d
- and \$-1<<31,$d1
+ and \$`-1*(1<<31)`,$d1
mov $d2,$r1 # borrow $r1
mov $d2#d,$h1#d
- and \$-1<<31,$d2
+ and \$`-1*(1<<31)`,$d2
shr \$6,$d1
shl \$52,$r1
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
+ adc \$0,$h2
mov $s1,$r1
mov $s1,%rax
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
-.long 1<<24,0,1<<24,0,1<<24,0,1<<24,0
+.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lfive: