-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# Westmere 4.58/+100% 1.43
# Sandy Bridge 3.90/+100% 1.36
# Haswell 3.88/+70% 1.18 0.72
+# Skylake 3.10/+60% 1.14 0.62
# Silvermont 11.0/+40% 4.80
+# Goldmont 4.10/+200% 2.10
# VIA Nano 6.71/+90% 2.47
# Sledgehammer 3.51/+180% 4.27
# Bulldozer 4.53/+140% 1.31
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"poly1305-x86.pl",$ARGV[$#ARGV] eq "386");
+# The last command-line argument, if present, names the output file; all
+# generated assembly is written to it through STDOUT.  A failed open must
+# abort the run, otherwise the output silently goes to the inherited
+# stdout.  The 2-arg form is kept so that ">-" still means stdout.
+$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");
+
+&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
$sse2=$avx=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
$avx = ($1>=2.09) + ($1>=2.10);
}
- if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ # Probe clang/LLVM only when no earlier check has set $avx.  The version
+ # pattern accepts multi-digit major numbers, so "clang version 10.0.0"
+ # is recognised instead of leaving $avx at 0; versions below 3.0 still
+ # evaluate to 0 through the numeric comparisons.
+ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) {
$avx = ($2>=3.0) + ($2>3.0);
}
}
&adc ("ebx",0);
&adc ("ecx",0);
&adc ("esi",0);
+ &adc ("edi",0);
&cmp ("ebp",&wparam(2)); # done yet?
&jne (&label("loop"));
},"edx");
sub lazy_reduction {
+my $extra = shift;
+
################################################################
# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
# and P. Schwabe
+ #
+ # [(*) see discussion in poly1305-armv4 module]
&movdqa ($T0,$D3);
&pand ($D3,$MASK);
&psrlq ($T0,26);
+ &$extra () if (defined($extra));
&paddq ($T0,$D4); # h3 -> h4
&movdqa ($T1,$D0);
&pand ($D0,$MASK);
# possible, because
# paddq is "broken"
# on Atom
- &pand ($D1,$MASK);
- &paddq ($T1,$D2); # h1 -> h2
&psllq ($T0,2);
+ &paddq ($T1,$D2); # h1 -> h2
+ &paddq ($T0,$D0); # h4 -> h0 (*)
+ &pand ($D1,$MASK);
&movdqa ($D2,$T1);
&psrlq ($T1,26);
- &paddd ($T0,$D0); # h4 -> h0
&pand ($D2,$MASK);
&paddd ($T1,$D3); # h2 -> h3
&movdqa ($D0,$T0);
&movdqa ($T0,$T1); # -> base 2^26 ...
&pand ($T1,$MASK);
- &paddd ($D0,$T1); # ... and accumuate
+ &paddd ($D0,$T1); # ... and accumulate
&movdqa ($T1,$T0);
&psrlq ($T0,26);
&set_label("short_tail");
- &lazy_reduction ();
-
################################################################
# horizontal addition
+ &pshufd ($T1,$D4,0b01001110);
+ &pshufd ($T0,$D3,0b01001110);
+ &paddq ($D4,$T1);
+ &paddq ($D3,$T0);
&pshufd ($T1,$D0,0b01001110);
&pshufd ($T0,$D1,0b01001110);
- &paddd ($D0,$T1);
+ &paddq ($D0,$T1);
+ &paddq ($D1,$T0);
&pshufd ($T1,$D2,0b01001110);
- &paddd ($D1,$T0);
- &pshufd ($T0,$D3,0b01001110);
- &paddd ($D2,$T1);
- &pshufd ($T1,$D4,0b01001110);
- &paddd ($D3,$T0);
- &paddd ($D4,$T1);
+ #&paddq ($D2,$T1); # not dropped: issued inside lazy_reduction via the callback passed below
+
+ &lazy_reduction (sub { &paddq ($D2,$T1) });
&set_label("done");
&movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value
&movd (&DWP(-16*3+4*2,"edi"),$D2);
&movd (&DWP(-16*3+4*3,"edi"),$D3);
&movd (&DWP(-16*3+4*4,"edi"),$D4);
-&set_label("nodata");
&mov ("esp","ebp");
+&set_label("nodata");
&function_end("_poly1305_blocks_sse2");
&align (32);
&shr ("edi",2);
&lea ("ebp",&DWP(0,"edi","edi",4)); # *5
&mov ("edi",&wparam(1)); # output
- add ("eax","ebp");
+ &add ("eax","ebp");
&mov ("ebp",&wparam(2)); # key
- adc ("ebx",0);
- adc ("ecx",0);
- adc ("edx",0);
+ &adc ("ebx",0);
+ &adc ("ecx",0);
+ &adc ("edx",0);
+ &adc ("esi",0);
&movd ($D0,"eax"); # offload original hash value
&add ("eax",5); # compare to modulus
&test ("eax","eax"); # is_base2_26?
&jz (&label("enter_blocks"));
-&set_label("enter_avx2",16);
+&set_label("enter_avx2");
&vzeroupper ();
&call (&label("pic_point"));
&vpsrlq ($T1,$D1,26);
&vpand ($D1,$D1,$MASK);
&vpaddq ($D2,$D2,$T1); # h1 -> h2
- &vpaddd ($D0,$D0,$T0);
+ &vpaddq ($D0,$D0,$T0);
&vpsllq ($T0,$T0,2);
&vpsrlq ($T1,$D2,26);
&vpand ($D2,$D2,$MASK);
- &vpaddd ($D0,$D0,$T0); # h4 -> h0
- &vpaddd ($D3,$D3,$T1); # h2 -> h3
+ &vpaddq ($D0,$D0,$T0); # h4 -> h0
+ &vpaddq ($D3,$D3,$T1); # h2 -> h3
&vpsrlq ($T1,$D3,26);
&vpsrlq ($T0,$D0,26);
&vpand ($D0,$D0,$MASK);
&vpand ($D3,$D3,$MASK);
- &vpaddd ($D1,$D1,$T0); # h0 -> h1
- &vpaddd ($D4,$D4,$T1); # h3 -> h4
+ &vpaddq ($D1,$D1,$T0); # h0 -> h1
+ &vpaddq ($D4,$D4,$T1); # h3 -> h4
}
&vlazy_reduction();
&vpmuladd (sub { my $i=shift; &QWP(4+32*$i-128,"edx"); });
- &vlazy_reduction();
-
################################################################
# horizontal addition
+ &vpsrldq ($T0,$D4,8);
+ &vpsrldq ($T1,$D3,8);
+ &vpaddq ($D4,$D4,$T0);
&vpsrldq ($T0,$D0,8);
+ &vpaddq ($D3,$D3,$T1);
&vpsrldq ($T1,$D1,8);
&vpaddq ($D0,$D0,$T0);
&vpsrldq ($T0,$D2,8);
&vpaddq ($D1,$D1,$T1);
- &vpsrldq ($T1,$D3,8);
+ &vpermq ($T1,$D4,2); # keep folding
&vpaddq ($D2,$D2,$T0);
- &vpsrldq ($T0,$D4,8);
- &vpaddq ($D3,$D3,$T1);
- &vpermq ($T1,$D0,2); # keep folding
- &vpaddq ($D4,$D4,$T0);
+ &vpermq ($T0,$D3,2);
+ &vpaddq ($D4,$D4,$T1);
+ &vpermq ($T1,$D0,2);
+ &vpaddq ($D3,$D3,$T0);
&vpermq ($T0,$D1,2);
&vpaddq ($D0,$D0,$T1);
&vpermq ($T1,$D2,2);
&vpaddq ($D1,$D1,$T0);
- &vpermq ($T0,$D3,2);
&vpaddq ($D2,$D2,$T1);
- &vpermq ($T1,$D4,2);
- &vpaddq ($D3,$D3,$T0);
- &vpaddq ($D4,$D4,$T1);
+
+ &vlazy_reduction();
&cmp ("ecx",0);
&je (&label("done"));
&jmp (&label("even"));
&set_label("done",16);
- &vmovd (&DWP(-16*3+4*0,"edi"),"xmm0"); # store hash value
- &vmovd (&DWP(-16*3+4*1,"edi"),"xmm1");
- &vmovd (&DWP(-16*3+4*2,"edi"),"xmm2");
- &vmovd (&DWP(-16*3+4*3,"edi"),"xmm3");
- &vmovd (&DWP(-16*3+4*4,"edi"),"xmm4");
+ &vmovd (&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
+ &vmovd (&DWP(-16*3+4*1,"edi"),&X($D1));
+ &vmovd (&DWP(-16*3+4*2,"edi"),&X($D2));
+ &vmovd (&DWP(-16*3+4*3,"edi"),&X($D3));
+ &vmovd (&DWP(-16*3+4*4,"edi"),&X($D4));
&vzeroupper ();
-&set_label("nodata");
&mov ("esp","ebp");
+&set_label("nodata");
&function_end("_poly1305_blocks_avx2");
}
&set_label("const_sse2",64);
&align (4);
&asm_finish();
+
+# Buffered write errors (disk full, broken pipe) only surface at close;
+# fail the run rather than leave a truncated assembly file behind.
+close STDOUT or die "error closing STDOUT: $!";