# Unlike previous version hardware support detection takes place only
# at the moment of key schedule setup, which is denoted in key->rounds.
# This is done, because deferred key setup can't be made MT-safe, not
-# for key lengthes longer than 128 bits.
+# for keys longer than 128 bits.
#
# Add AES_cbc_encrypt, which gives incredible performance improvement,
# it was measured to be ~6.6x. It's less than previously mentioned 8x,
# because software implementation was optimized.
+# May 2010.
+#
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over "generic" counter mode routine relying
+# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that exact throughput value depends on current stack
+# frame alignment within 4KB page. In worst case you get ~75% of the
+# maximum, but *on average* it would be as much as ~98%. Meaning that
+# worst case is unlike, it's like hitting ravine on plateau.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$softonly=0; # allow hardware support
$t0="%r0"; $mask="%r0";
srl %r5,6
ar %r5,%r0
+ larl %r1,OPENSSL_s390xcap_P
+ lg %r0,0(%r1)
+ tmhl %r0,0x4000 # check for message-security assist
+ jz .Lekey_internal
+
lghi %r0,0 # query capability vector
la %r1,16($sp)
.long 0xb92f0042 # kmc %r4,%r2
.size AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
+#void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+# size_t blocks, const AES_KEY *key,
+# const unsigned char *ivec)
+{
+my $inp="%r2";
+my $out="%r3";
+my $len="%r4";
+my $key="%r5"; my $iv0="%r5";
+my $ivp="%r6";
+my $fp ="%r7";
+
+$code.=<<___;
+.globl AES_ctr32_encrypt
+.type AES_ctr32_encrypt,\@function
+.align 16
+AES_ctr32_encrypt:
+___
+$code.=<<___ if (!$softonly);
+ l %r0,240($key)
+ lhi %r1,16
+ clr %r0,%r1
+ jl .Lctr32_software
+
+ stmg %r6,$s3,48($sp)
+
+ slgr $out,$inp
+ la %r1,0($key) # %r1 is permanent copy of $key
+ lg $iv0,0($ivp) # load ivec
+ lg $ivp,8($ivp)
+
+ # prepare and allocate stack frame at the top of 4K page
+ # with 1K reserved for eventual signal handling
+ lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
+ lghi $s1,-4096
+ algr $s0,$sp
+ lgr $fp,$sp
+ ngr $s0,$s1 # align at page boundary
+ slgr $fp,$s0 # total buffer size
+ lgr $s2,$sp
+ lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
+ slgr $fp,$s1 # deduct reservation to get usable buffer size
+ # buffer size is at lest 256 and at most 3072+256-16
+
+ la $sp,1024($s0) # alloca
+ srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
+ stg $s2,0($sp) # back-chain
+ stg $fp,8($sp)
+
+ slgr $len,$fp
+ brc 1,.Lctr32_hw_loop # not zero, no borrow
+ algr $fp,$len # input is shorter than allocated buffer
+ lghi $len,0
+ stg $fp,8($sp)
+
+.Lctr32_hw_loop:
+ la $s2,16($sp)
+ lgr $s3,$fp
+.Lctr32_hw_prepare:
+ stg $iv0,0($s2)
+ stg $ivp,8($s2)
+ la $s2,16($s2)
+ ahi $ivp,1 # 32-bit increment, preserves upper half
+ brct $s3,.Lctr32_hw_prepare
+
+ la $s0,16($sp) # inp
+ sllg $s1,$fp,4 # len
+ la $s2,16($sp) # out
+ .long 0xb92e00a8 # km %r10,%r8
+ brc 1,.-4 # pay attention to "partial completion"
+
+ la $s2,16($sp)
+ lgr $s3,$fp
+ slgr $s2,$inp
+.Lctr32_hw_xor:
+ lg $s0,0($inp)
+ lg $s1,8($inp)
+ xg $s0,0($s2,$inp)
+ xg $s1,8($s2,$inp)
+ stg $s0,0($out,$inp)
+ stg $s1,8($out,$inp)
+ la $inp,16($inp)
+ brct $s3,.Lctr32_hw_xor
+
+ slgr $len,$fp
+ brc 1,.Lctr32_hw_loop # not zero, no borrow
+ algr $fp,$len
+ lghi $len,0
+ brc 4+1,.Lctr32_hw_loop # not zero
+
+ lg $s0,0($sp)
+ lg $s1,8($sp)
+ la $s2,16($sp)
+.Lctr32_hw_zap:
+ stg $s0,0($s2)
+ stg $s0,8($s2)
+ la $s2,16($s2)
+ brct $s1,.Lctr32_hw_zap
+
+ la $sp,0($s0)
+ lmg %r6,$s3,48($sp)
+ br $ra
+.align 16
+.Lctr32_software:
+___
+$code.=<<___;
+ stmg $key,$ra,40($sp)
+ slgr $out,$inp
+ larl $tbl,AES_Te
+ llgf $t1,12($ivp)
+
+.Lctr32_loop:
+ stmg $inp,$len,16($sp)
+ llgf $s0,0($ivp)
+ llgf $s1,4($ivp)
+ llgf $s2,8($ivp)
+ lgr $s3,$t1
+ st $t1,128($sp)
+ lgr %r4,$key
+
+ bras $ra,_s390x_AES_encrypt
+
+ lmg $inp,$ivp,16($sp)
+ llgf $t1,128($sp)
+ x $s0,0($inp)
+ x $s1,4($inp)
+ x $s2,8($inp)
+ x $s3,12($inp)
+ st $s0,0($out,$inp)
+ st $s1,4($out,$inp)
+ st $s2,8($out,$inp)
+ st $s3,12($out,$inp)
+
+ la $inp,16($inp)
+ ahi $t1,1 # 32-bit increment
+ brct $len,.Lctr32_loop
+
+ lmg %r6,$ra,48($sp)
+ br $ra
+.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
+___
+}
$code.=<<___;
+.comm OPENSSL_s390xcap_P,16,8
.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
+close STDOUT; # force flush