X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fpoly1305%2Fasm%2Fpoly1305-x86.pl;h=9db38b5ecc4b60ce79462ad1a2c070ed50d47dd0;hp=4307c9978a2da3ed303a9a6247825ddbbc789781;hb=a30b0522cb937be54e172c68b0e9f5fa6ec30bf3;hpb=3aa3af68a52ed8e6fba9c5a7659943714593ac88 diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl index 4307c9978a..9db38b5ecc 100755 --- a/crypto/poly1305/asm/poly1305-x86.pl +++ b/crypto/poly1305/asm/poly1305-x86.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov for the OpenSSL @@ -22,7 +29,9 @@ # Westmere 4.58/+100% 1.43 # Sandy Bridge 3.90/+100% 1.36 # Haswell 3.88/+70% 1.18 0.72 +# Skylake 3.10/+60% 1.14 0.62 # Silvermont 11.0/+40% 4.80 +# Goldmont 4.10/+200% 2.10 # VIA Nano 6.71/+90% 2.47 # Sledgehammer 3.51/+180% 4.27 # Bulldozer 4.53/+140% 1.31 @@ -299,6 +308,7 @@ if ($sse2) { &adc ("ebx",0); &adc ("ecx",0); &adc ("esi",0); + &adc ("edi",0); &cmp ("ebp",&wparam(2)); # done yet? &jne (&label("loop")); @@ -544,6 +554,8 @@ my $extra = shift; ################################################################ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein # and P. Schwabe + # + # [(*) see discussion in poly1305-armv4 module] &movdqa ($T0,$D3); &pand ($D3,$MASK); @@ -563,12 +575,12 @@ my $extra = shift; # possible, because # paddq is "broken" # on Atom - &pand ($D1,$MASK); - &paddq ($T1,$D2); # h1 -> h2 &psllq ($T0,2); + &paddq ($T1,$D2); # h1 -> h2 + &paddq ($T0,$D0); # h4 -> h0 (*) + &pand ($D1,$MASK); &movdqa ($D2,$T1); &psrlq ($T1,26); - &paddd ($T0,$D0); # h4 -> h0 &pand ($D2,$MASK); &paddd ($T1,$D3); # h2 -> h3 &movdqa ($D0,$T0); @@ -1165,11 +1177,12 @@ my $addr = shift; &shr ("edi",2); &lea ("ebp",&DWP(0,"edi","edi",4)); # *5 &mov ("edi",&wparam(1)); # output - add ("eax","ebp"); + &add ("eax","ebp"); &mov ("ebp",&wparam(2)); # key - adc ("ebx",0); - adc ("ecx",0); - adc ("edx",0); + &adc ("ebx",0); + &adc ("ecx",0); + &adc ("edx",0); + &adc ("esi",0); &movd ($D0,"eax"); # offload original hash value &add ("eax",5); # compare to modulus @@ -1708,18 +1721,18 @@ sub vlazy_reduction { &vpsrlq ($T1,$D1,26); &vpand ($D1,$D1,$MASK); &vpaddq ($D2,$D2,$T1); # h1 -> h2 - &vpaddd ($D0,$D0,$T0); + &vpaddq ($D0,$D0,$T0); &vpsllq ($T0,$T0,2); &vpsrlq ($T1,$D2,26); &vpand ($D2,$D2,$MASK); - &vpaddd ($D0,$D0,$T0); # h4 -> h0 - &vpaddd ($D3,$D3,$T1); # h2 -> h3 + &vpaddq ($D0,$D0,$T0); # h4 -> h0 + &vpaddq ($D3,$D3,$T1); # h2 -> h3 &vpsrlq ($T1,$D3,26); &vpsrlq ($T0,$D0,26); &vpand ($D0,$D0,$MASK); &vpand ($D3,$D3,$MASK); - &vpaddd ($D1,$D1,$T0); # h0 -> h1 - &vpaddd ($D4,$D4,$T1); # h3 -> h4 + &vpaddq ($D1,$D1,$T0); # h0 -> h1 + &vpaddq ($D4,$D4,$T1); # h3 -> h4 } &vlazy_reduction();