openssl.git / commitdiff
(from parent 1: cd8d733)
aesni-sha256-x86_64.pl: harmonize with latest sha512-x86_64.pl.
author    Andy Polyakov <appro@openssl.org>  Mon, 10 Jun 2013 20:34:06 +0000 (22:34 +0200)
committer Andy Polyakov <appro@openssl.org>  Mon, 10 Jun 2013 20:34:06 +0000 (22:34 +0200)
crypto/aes/asm/aesni-sha256-x86_64.pl
diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl
index ebce0214527b39318d9b766b0a16df4468b020f9..16fea488173f587adbf8b077de7b4049ae355e45 100644
--- a/crypto/aes/asm/aesni-sha256-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha256-x86_64.pl
@@ -24,6 +24,7 @@
 # AES-128/-192/-256+SHA256 this(**)gain
 # Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
 # Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
+# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
 # Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
 #
 # (*) there are XOP, AVX1 and AVX2 code pathes, meaning that
@@ -970,11 +971,11 @@ $code.=<<___;
 	sub \$-16*$SZ,%r13	# inp++, size optimization
 	mov $SZ*0(%r15),$A
-	xor %r12,%r12		# borrow $a0
+	lea (%rsi,%r13),%r12	# borrow $a0
 	mov $SZ*1(%r15),$B
 	cmp $len,%r13		# $_end
 	mov $SZ*2(%r15),$C
-	sete %r12b
+	cmove %rsp,%r12		# next block or random data
 	mov $SZ*3(%r15),$D
 	mov $SZ*4(%r15),$E
 	mov $SZ*5(%r15),$F
@@ -990,21 +991,18 @@ $code.=<<___;
 	jmp .Loop_avx2
 .align 16
 .Loop_avx2:
-	shl \$`log(16*$SZ)/log(2)`,%r12
 	vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
-	neg %r12
 	vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
-	add %rsi,%r12		# next or same input block
 	vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
 	vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
 	vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
-	vinserti128 \$1,(%r12,%r13),@X[0],@X[0]
-	vinserti128 \$1,16(%r12,%r13),@X[1],@X[1]
+	vinserti128 \$1,(%r12),@X[0],@X[0]
+	vinserti128 \$1,16(%r12),@X[1],@X[1]
 	vpshufb $t3,@X[0],@X[0]
-	vinserti128 \$1,32(%r12,%r13),@X[2],@X[2]
+	vinserti128 \$1,32(%r12),@X[2],@X[2]
 	vpshufb $t3,@X[1],@X[1]
-	vinserti128 \$1,48(%r12,%r13),@X[3],@X[3]
+	vinserti128 \$1,48(%r12),@X[3],@X[3]
 	lea $TABLE(%rip),$Tbl
 	vpshufb $t3,@X[2],@X[2]
@@ -1148,12 +1146,13 @@ $code.=<<___;
 	add $SZ*4(%r15),$E
 	add $SZ*5(%r15),$F
 	add $SZ*6(%r15),$G
-	xor %r12,%r12
+	lea (%rsi,%r13),%r12
 	add $SZ*7(%r15),$H
 	cmp $_end,%r13
 	mov $A,$SZ*0(%r15)
+	cmove %rsp,%r12		# next block or stale data
 	mov $B,$SZ*1(%r15)
 	mov $C,$SZ*2(%r15)
 	mov $D,$SZ*3(%r15)
@@ -1162,7 +1161,6 @@ $code.=<<___;
 	mov $G,$SZ*6(%r15)
 	mov $H,$SZ*7(%r15)
-	sete %r12b
 	jbe .Loop_avx2
 	lea (%rsp),$Tbl
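
For context on the change above: in the AVX2 loop, %r12 supplies the pointer that vinserti128 inserts into the upper 128-bit lane ("next block or random data" per the comments). The old code kept an at-end flag from sete in %r12 and turned it into an offset with shl/neg/add, so the upper lane simply reread the current block once the end of the input was reached; the new code forms the pointer directly with lea and, at the end, redirects it to %rsp with cmove so that lane reads harmless stale stack data instead. A rough C sketch of that branchless selection, with illustrative names that do not appear in the source:

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Sketch only: mirrors the lea/cmove pair added by this commit.
     * Names are hypothetical; they do not appear in aesni-sha256-x86_64.pl.
     */
    static const uint8_t *next_block_or_scratch(const uint8_t *inp,    /* %rsi  */
                                                size_t off,             /* %r13  */
                                                size_t end,             /* $_end */
                                                const uint8_t *scratch) /* %rsp  */
    {
        const uint8_t *p = inp + off;  /* lea (%rsi,%r13),%r12              */
        if (off == end)                /* cmp $_end,%r13 ; cmove %rsp,%r12  */
            p = scratch;               /* past the last block: point at     */
        return p;                      /* stack data instead                */
    }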