From: Andy Polyakov Date: Wed, 14 Jul 2010 08:43:38 +0000 (+0000) Subject: aes-s390x.pl: revisit buffer allocation and add performance data. X-Git-Tag: OpenSSL-fips-2_0-rc1~1048 X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff_plain;h=26064d7f77ac20b61a35d758046eeb2344745e12;ds=sidebyside aes-s390x.pl: revisit buffer allocation and add performance data. --- diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl index 38e18b2452..5ffacb8cf4 100644 --- a/crypto/aes/asm/aes-s390x.pl +++ b/crypto/aes/asm/aes-s390x.pl @@ -44,7 +44,7 @@ # Unlike previous version hardware support detection takes place only # at the moment of key schedule setup, which is denoted in key->rounds. # This is done, because deferred key setup can't be made MT-safe, not -# for key lengthes longer than 128 bits. +# for keys longer than 128 bits. # # Add AES_cbc_encrypt, which gives incredible performance improvement, # it was measured to be ~6.6x. It's less than previously mentioned 8x, @@ -52,7 +52,13 @@ # May 2010. # -# Add AES_ctr32_encrypt. +# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x +# performance improvement over "generic" counter mode routine relying +# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers +# to the fact that exact throughput value depends on current stack +# frame alignment within 4KB page. In worst case you get ~75% of the +# maximum, but *on average* it would be as much as ~98%. Meaning that +# worst case is unlikely, it's like hitting ravine on plateau. 
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -1367,24 +1373,27 @@ $code.=<<___ if (!$softonly); lg $iv0,0($ivp) # load ivec lg $ivp,8($ivp) - # prepare and allocate stack frame - lghi $s0,-272 # guarantee at least 256-bytes buffer + # prepare and allocate stack frame at the top of 4K page + # with 1K reserved for eventual signal handling + lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer lghi $s1,-4096 - lgr $fp,$sp algr $s0,$sp + lgr $fp,$sp ngr $s0,$s1 # align at page boundary - la $sp,0($s0) # alloca - stg $fp,0($s0) # back-chain - - # calculate resultant buffer size - la $s0,16($s0) # buffer starts at offset of 16 - slgr $fp,$s0 - srlg $fp,$fp,4 # $fp is buffer length in blocks, minimum 16 + slgr $fp,$s0 # total buffer size + lgr $s2,$sp + lghi $s1,1024+16 # sl[g]fi is extended-immediate facility + slgr $fp,$s1 # deduct reservation to get usable buffer size + # buffer size is at least 256 and at most 3072+256-16 + + la $sp,1024($s0) # alloca + srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 + stg $s2,0($sp) # back-chain stg $fp,8($sp) slgr $len,$fp brc 1,.Lctr32_hw_loop # not zero, no borrow - algr $fp,$len + algr $fp,$len # input is shorter than allocated buffer lghi $len,0 stg $fp,8($sp)