-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# minimize/avoid Address Generation Interlock hazard and to favour
# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
# almost 50% on z9. The gain is smaller on z10, because being dual-
-# issue z10 makes it improssible to eliminate the interlock condition:
+# issue z10 makes it impossible to eliminate the interlock condition:
# critical path is not long enough. Yet it spends ~24 cycles per byte
# processed with 128-bit key.
#
# February 2011.
#
-# Add AES_xts_[en|de]crypt. This includes support for z196
-# km-xts-aes instructions, which deliver ~70% improvement at 8KB
-# block size over vanilla km-based code.
+# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
+# instructions, which deliver ~70% improvement at 8KB block size over
+# vanilla km-based code, and 37% at the more likely 512-byte block size.
$flavour = shift;
$g="g";
}
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$softonly=0; # allow hardware support
}
$code=<<___;
+#include "s390x_arch.h"
+
.text
.type AES_Te,\@object
or $s1,$t1
or $t2,$i2
or $t3,$i3
-
+
srlg $i1,$s2,`8-3` # i0
srlg $i2,$s2,`16-3` # i1
nr $i1,$mask
x $s2,24($key)
x $s3,28($key)
- br $ra
+ br $ra
.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
___
x $s2,24($key)
x $s3,28($key)
- br $ra
+ br $ra
.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
___
.type AES_set_encrypt_key,\@function
.align 16
AES_set_encrypt_key:
+_s390x_AES_set_encrypt_key:
lghi $t0,0
cl${g}r $inp,$t0
je .Lminus1
.Lproceed:
___
$code.=<<___ if (!$softonly);
- # convert bits to km code, [128,192,256]->[18,19,20]
+ # convert bits to km(c) code, [128,192,256]->[18,19,20]
lhi %r5,-128
lhi %r0,18
ar %r5,$bits
ar %r5,%r0
larl %r1,OPENSSL_s390xcap_P
- lg %r0,0(%r1)
- tmhl %r0,0x4000 # check for message-security assist
- jz .Lekey_internal
-
- lghi %r0,0 # query capability vector
- la %r1,16($sp)
- .long 0xb92f0042 # kmc %r4,%r2
-
- llihh %r1,0x8000
- srlg %r1,%r1,0(%r5)
- ng %r1,16($sp)
+ llihh %r0,0x8000
+ srlg %r0,%r0,0(%r5)
+ ng %r0,S390X_KM(%r1) # check availability of both km...
+ ng %r0,S390X_KMC(%r1) # ...and kmc support for given key length
jz .Lekey_internal
lmg %r0,%r1,0($inp) # just copy 128 bits...
je 1f
lg %r1,24($inp)
stg %r1,24($key)
-1: st $bits,236($key) # save bits
- st %r5,240($key) # save km code
+1: st $bits,236($key) # save bits [for debugging purposes]
+ lgr $t0,%r5
+ st %r5,240($key) # save km(c) code
lghi %r2,0
br %r14
___
$code.=<<___;
.align 16
.Lekey_internal:
- stm${g} %r6,%r13,6*$SIZE_T($sp) # all non-volatile regs
+ stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
larl $tbl,AES_Te+2048
la $key,16($key) # key+=4
la $t3,4($t3) # i++
brct $rounds,.L128_loop
+ lghi $t0,10
lghi %r2,0
- lm${g} %r6,%r13,6*$SIZE_T($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
st $s2,32($key)
st $s3,36($key)
brct $rounds,.L192_continue
+ lghi $t0,12
lghi %r2,0
- lm${g} %r6,%r13,6*$SIZE_T($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
st $s2,40($key)
st $s3,44($key)
brct $rounds,.L256_continue
+ lghi $t0,14
lghi %r2,0
- lm${g} %r6,%r13,6*$SIZE_T($sp)
+ lm${g} %r4,%r13,4*$SIZE_T($sp)
br $ra
.align 16
.type AES_set_decrypt_key,\@function
.align 16
AES_set_decrypt_key:
- st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
- st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers!
- bras $ra,AES_set_encrypt_key
- l${g} $key,4*$SIZE_T($sp)
+ #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
+ st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
+ bras $ra,_s390x_AES_set_encrypt_key
+ #l${g} $key,4*$SIZE_T($sp)
l${g} $ra,14*$SIZE_T($sp)
ltgr %r2,%r2
bnzr $ra
___
$code.=<<___ if (!$softonly);
- l $t0,240($key)
+ #l $t0,240($key)
lhi $t1,16
cr $t0,$t1
jl .Lgo
oill $t0,0x80 # set "decrypt" bit
st $t0,240($key)
br $ra
-
-.align 16
-.Ldkey_internal:
- st${g} $key,4*$SIZE_T($sp)
- st${g} $ra,14*$SIZE_T($sp)
- bras $ra,.Lekey_internal
- l${g} $key,4*$SIZE_T($sp)
- l${g} $ra,14*$SIZE_T($sp)
___
$code.=<<___;
-
-.Lgo: llgf $rounds,240($key)
+.align 16
+.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
la $i1,0($key)
sllg $i2,$rounds,4
la $i2,0($i2,$key)
.Lcbc_enc_done:
l${g} $ivp,6*$SIZE_T($sp)
st $s0,0($ivp)
- st $s1,4($ivp)
+ st $s1,4($ivp)
st $s2,8($ivp)
st $s3,12($ivp)
clr %r0,%r1
jl .Lctr32_software
- stm${g} %r6,$s3,6*$SIZE_T($sp)
+ st${g} $s2,10*$SIZE_T($sp)
+ st${g} $s3,11*$SIZE_T($sp)
+
+ clr $len,%r1 # does work even in 64-bit mode
+ jle .Lctr32_nokma # kma is slower for <= 16 blocks
+
+ larl %r1,OPENSSL_s390xcap_P
+ lr $s2,%r0
+ llihh $s3,0x8000
+ srlg $s3,$s3,0($s2)
+ ng $s3,S390X_KMA(%r1) # check kma capability vector
+ jz .Lctr32_nokma
+
+ l${g}hi %r1,-$stdframe-112
+ l${g}r $s3,$sp
+ la $sp,0(%r1,$sp) # prepare parameter block
+
+ lhi %r1,0x0600
+ sllg $len,$len,4
+ or %r0,%r1 # set HS and LAAD flags
+
+ st${g} $s3,0($sp) # backchain
+ la %r1,$stdframe($sp)
+
+ lmg $s2,$s3,0($key) # copy key
+ stg $s2,$stdframe+80($sp)
+ stg $s3,$stdframe+88($sp)
+ lmg $s2,$s3,16($key)
+ stg $s2,$stdframe+96($sp)
+ stg $s3,$stdframe+104($sp)
+
+ lmg $s2,$s3,0($ivp) # copy iv
+ stg $s2,$stdframe+64($sp)
+ ahi $s3,-1 # kma requires counter-1
+ stg $s3,$stdframe+72($sp)
+ st $s3,$stdframe+12($sp) # copy counter
+
+ lghi $s2,0 # no AAD
+ lghi $s3,0
+
+ .long 0xb929a042 # kma $out,$s2,$inp
+ brc 1,.-4 # pay attention to "partial completion"
+
+ stg %r0,$stdframe+80($sp) # wipe key
+ stg %r0,$stdframe+88($sp)
+ stg %r0,$stdframe+96($sp)
+ stg %r0,$stdframe+104($sp)
+ la $sp,$stdframe+112($sp)
+
+ lm${g} $s2,$s3,10*$SIZE_T($sp)
+ br $ra
+
+.align 16
+.Lctr32_nokma:
+ stm${g} %r6,$s1,6*$SIZE_T($sp)
slgr $out,$inp
la %r1,0($key) # %r1 is permanent copy of $key
.Lctr32_hw_switch:
___
-$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
- larl $s0,OPENSSL_s390xcap_P
- lg $s0,8($s0)
- tmhh $s0,0x0004 # check for message_security-assist-4
- jz .Lctr32_km_loop
-
+$code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower
llgfr $s0,%r0
lgr $s1,%r1
- lghi %r0,0
- la %r1,16($sp)
- .long 0xb92d2042 # kmctr %r4,%r2,%r2
-
+ larl %r1,OPENSSL_s390xcap_P
llihh %r0,0x8000 # check if kmctr supports the function code
srlg %r0,%r0,0($s0)
- ng %r0,16($sp)
+ ng %r0,S390X_KMCTR(%r1) # check kmctr capability vector
lgr %r0,$s0
lgr %r1,$s1
jz .Lctr32_km_loop
br $ra
.align 16
___
-$code.=<<___;
+$code.=<<___ if (!$softonly);
.Lctr32_km_loop:
la $s2,16($sp)
lgr $s3,$fp
}
########################################################################
-# void AES_xts_encrypt(const char *inp,char *out,size_t len,
-# const AES_KEY *key1, const AES_KEY *key2,u64 secno);
+# void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
+# size_t len, const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
#
{
my $inp="%r2";
.align 16
_s390x_xts_km:
___
-$code.=<<___ if(0);
+$code.=<<___ if(1);
llgfr $s0,%r0 # put aside the function code
lghi $s1,0x7f
nr $s1,%r0
- lghi %r0,0 # query capability vector
- la %r1,2*$SIZE_T($sp)
- .long 0xb92e0042 # km %r4,%r2
- llihh %r1,0x8000
- srlg %r1,%r1,32($s1) # check for 32+function code
- ng %r1,2*$SIZE_T($sp)
+ larl %r1,OPENSSL_s390xcap_P
+ llihh %r0,0x8000
+ srlg %r0,%r0,32($s1) # check for 32+function code
+ ng %r0,S390X_KM(%r1) # check km capability vector
lgr %r0,$s0 # restore the function code
la %r1,0($key1) # restore $key1
jz .Lxts_km_vanilla
lrvg $s0,$tweak+0($sp) # load the last tweak
lrvg $s1,$tweak+8($sp)
- stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key
+ stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
nill %r0,0xffdf # switch back to original function code
la %r1,0($key1) # restore pointer to $key1
llgc $len,2*$SIZE_T-1($sp)
nill $len,0x0f # $len%=16
br $ra
-
+
.align 16
.Lxts_km_vanilla:
___
lghi $i1,0x87
srag $i2,$s1,63 # broadcast upper bit
ngr $i1,$i2 # rem
- srlg $i2,$s0,63 # carry bit from lower half
- sllg $s0,$s0,1
- sllg $s1,$s1,1
+ algr $s0,$s0
+ alcgr $s1,$s1
xgr $s0,$i1
- ogr $s1,$i2
.Lxts_km_start:
lrvgr $i1,$s0 # flip byte order
lrvgr $i2,$s1
lghi $i1,0x87
srag $i2,$s1,63 # broadcast upper bit
ngr $i1,$i2 # rem
- srlg $i2,$s0,63 # carry bit from lower half
- sllg $s0,$s0,1
- sllg $s1,$s1,1
+ algr $s0,$s0
+ alcgr $s1,$s1
xgr $s0,$i1
- ogr $s1,$i2
ltr $len,$len # clear zero flag
br $ra
clr %r0,%r1
jl .Lxts_enc_software
+ st${g} $ra,5*$SIZE_T($sp)
stm${g} %r6,$s3,6*$SIZE_T($sp)
- st${g} $ra,14*$SIZE_T($sp)
sllg $len,$len,4 # $len&=~15
slgr $out,$inp
- lrvg $s0,$stdframe($sp) # load secno
- lghi $s1,0
+ # generate the tweak value
+ l${g} $s3,$stdframe($sp) # pointer to iv
la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
lghi $s3,16
stmg $s0,$s1,0($s2)
la %r1,0($key2) # $key2 is not needed anymore
stg $i2,8($i3)
.Lxts_enc_km_done:
- l${g} $ra,14*$SIZE_T($sp)
- st${g} $sp,$tweak($sp) # wipe tweak
- st${g} $sp,$tweak($sp)
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
lm${g} %r6,$s3,6*$SIZE_T($sp)
br $ra
.align 16
slgr $out,$inp
- xgr $s0,$s0 # clear upper half
- xgr $s1,$s1
- lrv $s0,$stdframe+4($sp) # load secno
- lrv $s1,$stdframe+0($sp)
- xgr $s2,$s2
- xgr $s3,$s3
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
stm${g} %r2,%r5,2*$SIZE_T($sp)
la $key,0($key2)
larl $tbl,AES_Te
lghi %r1,0x87
srag %r0,$s3,63 # broadcast upper bit
ngr %r1,%r0 # rem
- srlg %r0,$s1,63 # carry bit from lower half
- sllg $s1,$s1,1
- sllg $s3,$s3,1
+ algr $s1,$s1
+ alcgr $s3,$s3
xgr $s1,%r1
- ogr $s3,%r0
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
- srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32
lghi %r1,0x87
srag %r0,$s3,63 # broadcast upper bit
ngr %r1,%r0 # rem
- srlg %r0,$s1,63 # carry bit from lower half
- sllg $s1,$s1,1
- sllg $s3,$s3,1
+ algr $s1,$s1
+ alcgr $s3,$s3
xgr $s1,%r1
- ogr $s3,%r0
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
- srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32
br $ra
.size AES_xts_encrypt,.-AES_xts_encrypt
___
-# void AES_xts_decrypt(const char *inp,char *out,size_t len,
-# const AES_KEY *key1, const AES_KEY *key2,u64 secno);
+# void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
+# size_t len, const AES_KEY *key1, const AES_KEY *key2,
+# const unsigned char iv[16]);
#
$code.=<<___;
.globl AES_xts_decrypt
clr %r0,%r1
jl .Lxts_dec_software
+ st${g} $ra,5*$SIZE_T($sp)
stm${g} %r6,$s3,6*$SIZE_T($sp)
- st${g} $ra,14*$SIZE_T($sp)
nill $len,0xfff0 # $len&=~15
slgr $out,$inp
# generate the tweak value
- lrvg $s0,$stdframe($sp) # load secno
- lghi $s1,0
+ l${g} $s3,$stdframe($sp) # pointer to iv
la $s2,$tweak($sp)
+ lmg $s0,$s1,0($s3)
lghi $s3,16
- stg $s0,0($s2)
- stg $s1,8($s2)
+ stmg $s0,$s1,0($s2)
la %r1,0($key2) # $key2 is not needed past this point
.long 0xb92e00aa # km $s2,$s2, generate the tweak
brc 1,.-4 # can this happen?
lghi $i1,0x87
srag $i2,$s1,63 # broadcast upper bit
ngr $i1,$i2 # rem
- srlg $i2,$s0,63 # carry bit from lower half
- sllg $s0,$s0,1
- sllg $s1,$s1,1
+ algr $s0,$s0
+ alcgr $s1,$s1
xgr $s0,$i1
- ogr $s1,$i2
lrvgr $i1,$s0 # flip byte order
lrvgr $i2,$s1
stg $s2,0($i3)
stg $s3,8($i3)
.Lxts_dec_km_done:
- l${g} $ra,14*$SIZE_T($sp)
- st${g} $sp,$tweak($sp) # wipe tweak
- st${g} $sp,$tweak($sp)
+ stg $sp,$tweak+0($sp) # wipe tweak
+ stg $sp,$tweak+8($sp)
+ l${g} $ra,5*$SIZE_T($sp)
lm${g} %r6,$s3,6*$SIZE_T($sp)
br $ra
.align 16
srlg $len,$len,4
slgr $out,$inp
- xgr $s0,$s0 # clear upper half
- xgr $s1,$s1
- lrv $s0,$stdframe+4($sp) # load secno
- lrv $s1,$stdframe+0($sp)
- xgr $s2,$s2
- xgr $s3,$s3
+ l${g} $s3,$stdframe($sp) # ivp
+ llgf $s0,0($s3) # load iv
+ llgf $s1,4($s3)
+ llgf $s2,8($s3)
+ llgf $s3,12($s3)
stm${g} %r2,%r5,2*$SIZE_T($sp)
la $key,0($key2)
larl $tbl,AES_Te
lghi %r1,0x87
srag %r0,$s3,63 # broadcast upper bit
ngr %r1,%r0 # rem
- srlg %r0,$s1,63 # carry bit from lower half
- sllg $s1,$s1,1
- sllg $s3,$s3,1
+ algr $s1,$s1
+ alcgr $s3,$s3
xgr $s1,%r1
- ogr $s3,%r0
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
- srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+ srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32
lghi %r1,0x87
srag %r0,$s3,63 # broadcast upper bit
ngr %r1,%r0 # rem
- srlg %r0,$s1,63 # carry bit from lower half
- sllg $s1,$s1,1
- sllg $s3,$s3,1
+ algr $s1,$s1
+ alcgr $s3,$s3
xgr $s1,%r1
- ogr $s3,%r0
lrvgr $i2,$s1 # flip byte order
lrvgr $i3,$s3
stmg $i2,$i3,$tweak($sp) # save the 1st tweak
lghi %r1,0x87
srag %r0,$s3,63 # broadcast upper bit
ngr %r1,%r0 # rem
- srlg %r0,$s1,63 # carry bit from lower half
- sllg $s1,$s1,1
- sllg $s3,$s3,1
+ algr $s1,$s1
+ alcgr $s3,$s3
xgr $s1,%r1
- ogr $s3,%r0
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
}
$code.=<<___;
.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm OPENSSL_s390xcap_P,16,8
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;