projects
/
openssl.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
94d1f4b
)
aes/asm/bsaes-x86_64.pl: Atom-specific optimization.
author
Andy Polyakov
<appro@openssl.org>
Thu, 24 Apr 2014 08:13:30 +0000
(10:13 +0200)
committer
Andy Polyakov
<appro@openssl.org>
Thu, 24 Apr 2014 08:13:30 +0000
(10:13 +0200)
crypto/aes/asm/bsaes-x86_64.pl
patch
|
blob
|
history
diff --git
a/crypto/aes/asm/bsaes-x86_64.pl
b/crypto/aes/asm/bsaes-x86_64.pl
index 41b90f08443f512bb0079910720936afff275fc7..d2c3978b962e451f0e901f0e578138813c5ef08c 100644
(file)
--- a/
crypto/aes/asm/bsaes-x86_64.pl
+++ b/
crypto/aes/asm/bsaes-x86_64.pl
@@
-38,8
+38,8
@@
# Emilia's this(*) difference
#
# Core 2 9.30 8.69 +7%
# Emilia's this(*) difference
#
# Core 2 9.30 8.69 +7%
-# Nehalem(**) 7.63 6.
98 +9
%
-# Atom 17.1 1
7.4 -2%(***)
+# Nehalem(**) 7.63 6.
88 +11
%
+# Atom 17.1 1
6.4 +4%
#
# (*) Comparison is not completely fair, because "this" is ECB,
# i.e. no extra processing such as counter values calculation
#
# (*) Comparison is not completely fair, because "this" is ECB,
# i.e. no extra processing such as counter values calculation
@@
-50,14
+50,6
@@
# (**) Results were collected on Westmere, which is considered to
# be equivalent to Nehalem for this code.
#
# (**) Results were collected on Westmere, which is considered to
# be equivalent to Nehalem for this code.
#
-# (***) Slowdown on Atom is rather strange per se, because original
-# implementation has a number of 9+-bytes instructions, which
-# are bad for Atom front-end, and which I eliminated completely.
-# In attempt to address deterioration sbox() was tested in FP
-# SIMD "domain" (movaps instead of movdqa, xorps instead of
-# pxor, etc.). While it resulted in nominal 4% improvement on
-# Atom, it hurted Westmere by more than 2x factor.
-#
# As for key schedule conversion subroutine. Interface to OpenSSL
# relies on per-invocation on-the-fly conversion. This naturally
# has impact on performance, especially for short inputs. Conversion
# As for key schedule conversion subroutine. Interface to OpenSSL
# relies on per-invocation on-the-fly conversion. This naturally
# has impact on performance, especially for short inputs. Conversion
@@
-67,7
+59,7
@@
# conversion conversion/8x block
# Core 2 240 0.22
# Nehalem 180 0.20
# conversion conversion/8x block
# Core 2 240 0.22
# Nehalem 180 0.20
-# Atom 430 0.
19
+# Atom 430 0.
20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
@@
-83,9
+75,9
@@
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
-# Core 2 9.
83
-# Nehalem 7.
74
-# Atom 1
9.0
+# Core 2 9.
98
+# Nehalem 7.
80
+# Atom 1
7.9
#
# November 2011.
#
#
# November 2011.
#
@@
-434,21
+426,21
@@
my $mask=pop;
$code.=<<___;
pxor 0x00($key),@x[0]
pxor 0x10($key),@x[1]
$code.=<<___;
pxor 0x00($key),@x[0]
pxor 0x10($key),@x[1]
- pshufb $mask,@x[0]
pxor 0x20($key),@x[2]
pxor 0x20($key),@x[2]
- pshufb $mask,@x[1]
pxor 0x30($key),@x[3]
pxor 0x30($key),@x[3]
- pshufb $mask,@x[2]
+ pshufb $mask,@x[0]
+ pshufb $mask,@x[1]
pxor 0x40($key),@x[4]
pxor 0x40($key),@x[4]
- pshufb $mask,@x[3]
pxor 0x50($key),@x[5]
pxor 0x50($key),@x[5]
- pshufb $mask,@x[4]
+ pshufb $mask,@x[2]
+ pshufb $mask,@x[3]
pxor 0x60($key),@x[6]
pxor 0x60($key),@x[6]
- pshufb $mask,@x[5]
pxor 0x70($key),@x[7]
pxor 0x70($key),@x[7]
+ pshufb $mask,@x[4]
+ pshufb $mask,@x[5]
pshufb $mask,@x[6]
pshufb $mask,@x[6]
- lea 0x80($key),$key
pshufb $mask,@x[7]
pshufb $mask,@x[7]
+ lea 0x80($key),$key
___
}
___
}
@@
-820,18
+812,18
@@
_bsaes_encrypt8:
movdqa 0x50($const), @XMM[8] # .LM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
movdqa 0x50($const), @XMM[8] # .LM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
- pshufb @XMM[8], @XMM[0]
pxor @XMM[9], @XMM[2]
pxor @XMM[9], @XMM[2]
- pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[3]
pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
pxor @XMM[9], @XMM[4]
- pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[5]
pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
pxor @XMM[9], @XMM[6]
- pshufb @XMM[8], @XMM[5]
pxor @XMM[9], @XMM[7]
pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
@@
-884,18
+876,18
@@
_bsaes_decrypt8:
movdqa -0x30($const), @XMM[8] # .LM0ISR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
movdqa -0x30($const), @XMM[8] # .LM0ISR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
- pshufb @XMM[8], @XMM[0]
pxor @XMM[9], @XMM[2]
pxor @XMM[9], @XMM[2]
- pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[3]
pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
pxor @XMM[9], @XMM[4]
- pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[5]
pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
pxor @XMM[9], @XMM[6]
- pshufb @XMM[8], @XMM[5]
pxor @XMM[9], @XMM[7]
pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
___
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
___
@@
-1937,21
+1929,21
@@
$code.=<<___;
movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
- pshufb @XMM[8], @XMM[0]
pxor @XMM[9], @XMM[2]
pxor @XMM[9], @XMM[2]
- pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[3]
pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[0]
+ pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
pxor @XMM[9], @XMM[4]
- pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[5]
pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[2]
+ pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
pxor @XMM[9], @XMM[6]
- pshufb @XMM[8], @XMM[5]
pxor @XMM[9], @XMM[7]
pxor @XMM[9], @XMM[7]
+ pshufb @XMM[8], @XMM[4]
+ pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[6]
- lea .LBS0(%rip), %r11 # constants table
pshufb @XMM[8], @XMM[7]
pshufb @XMM[8], @XMM[7]
+ lea .LBS0(%rip), %r11 # constants table
mov %ebx,%r10d # pass rounds
call _bsaes_encrypt8_bitslice
mov %ebx,%r10d # pass rounds
call _bsaes_encrypt8_bitslice