From: Andy Polyakov Date: Mon, 14 Dec 2015 22:23:20 +0000 (+0100) Subject: s390x assembly pack: add ChaCha20 and Poly1305 modules. X-Git-Tag: OpenSSL_1_1_0-pre3~163 X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff_plain;h=592eef5c349b8f8ea6f4b7dba91d700f3a343e84 s390x assembly pack: add ChaCha20 and Poly1305 modules. Reviewed-by: Tim Hudson --- diff --git a/crypto/chacha/Makefile.in b/crypto/chacha/Makefile.in index 0590708262..8987a850f2 100644 --- a/crypto/chacha/Makefile.in +++ b/crypto/chacha/Makefile.in @@ -36,6 +36,8 @@ lib: $(LIBOBJ) $(RANLIB) $(LIB) || echo Never mind. @touch lib +chacha-%.S: asm/chacha-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ + files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO diff --git a/crypto/chacha/asm/chacha-s390x.pl b/crypto/chacha/asm/chacha-s390x.pl new file mode 100755 index 0000000000..8a09706543 --- /dev/null +++ b/crypto/chacha/asm/chacha-s390x.pl @@ -0,0 +1,317 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# December 2015 +# +# ChaCha20 for s390x. +# +# 3 times faster than compiler-generated code. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + $code .= "\t$opcode\t".join(',',@_)."\n"; +} + +my $sp="%r15"; + +my $stdframe=16*$SIZE_T+4*8; +my $frame=$stdframe+4*20; + +my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6)); + +my @x=map("%r$_",(0..7,"x","x","x","x",(10..13))); +my @t=map("%r$_",(8,9)); + +sub ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_)=map("\"$_\"",@t); +my @x=map("\"$_\"",@x); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # 'c' stores and loads in the middle, but none in the beginning + # or end. + + ( + "&alr (@x[$a0],@x[$b0])", # Q1 + "&alr (@x[$a1],@x[$b1])", # Q2 + "&xr (@x[$d0],@x[$a0])", + "&xr (@x[$d1],@x[$a1])", + "&rll (@x[$d0],@x[$d0],16)", + "&rll (@x[$d1],@x[$d1],16)", + + "&alr ($xc,@x[$d0])", + "&alr ($xc_,@x[$d1])", + "&xr (@x[$b0],$xc)", + "&xr (@x[$b1],$xc_)", + "&rll (@x[$b0],@x[$b0],12)", + "&rll (@x[$b1],@x[$b1],12)", + + "&alr (@x[$a0],@x[$b0])", + "&alr (@x[$a1],@x[$b1])", + "&xr (@x[$d0],@x[$a0])", + "&xr (@x[$d1],@x[$a1])", + "&rll (@x[$d0],@x[$d0],8)", + "&rll (@x[$d1],@x[$d1],8)", + + "&alr ($xc,@x[$d0])", + "&alr ($xc_,@x[$d1])", + "&xr (@x[$b0],$xc)", + "&xr (@x[$b1],$xc_)", + "&rll (@x[$b0],@x[$b0],7)", + "&rll (@x[$b1],@x[$b1],7)", + + "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's + "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')", + + "&alr (@x[$a2],@x[$b2])", # Q3 + "&alr (@x[$a3],@x[$b3])", # Q4 + "&xr (@x[$d2],@x[$a2])", + "&xr (@x[$d3],@x[$a3])", + "&rll (@x[$d2],@x[$d2],16)", + "&rll (@x[$d3],@x[$d3],16)", + + "&alr ($xc,@x[$d2])", + "&alr ($xc_,@x[$d3])", + "&xr (@x[$b2],$xc)", + "&xr (@x[$b3],$xc_)", + "&rll (@x[$b2],@x[$b2],12)", + "&rll (@x[$b3],@x[$b3],12)", + + "&alr (@x[$a2],@x[$b2])", + "&alr (@x[$a3],@x[$b3])", + "&xr (@x[$d2],@x[$a2])", + "&xr (@x[$d3],@x[$a3])", + "&rll (@x[$d2],@x[$d2],8)", + "&rll (@x[$d3],@x[$d3],8)", + + "&alr ($xc,@x[$d2])", + "&alr ($xc_,@x[$d3])", + "&xr (@x[$b2],$xc)", + "&xr (@x[$b3],$xc_)", + "&rll (@x[$b2],@x[$b2],7)", + "&rll (@x[$b3],@x[$b3],7)" + ); +} + +$code.=<<___; +.text + +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,\@function +.align 32 +ChaCha20_ctr32: + a${g}hi $len,-64 + l${g}hi %r1,-$frame + stm${g} %r6,%r15,`6*$SIZE_T`($sp) + sl${g}r $out,$inp # difference + la $len,0($inp,$len) # end of input minus 64 + larl %r7,.Lsigma + lgr %r0,$sp + la $sp,0(%r1,$sp) + st${g} %r0,0($sp) + + lmg %r8,%r11,0($key) # load key + lmg %r12,%r13,0($counter) # load counter + lmg %r6,%r7,0(%r7) # load sigma constant + + la %r14,0($inp) + st${g} $out,$frame+3*$SIZE_T($sp) + st${g} $len,$frame+4*$SIZE_T($sp) + stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack + srlg @x[12],%r12,32 # 32-bit counter value + j .Loop_outer + +.align 16 +.Loop_outer: + lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7] + lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11] + lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15] + stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11] + lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9] + st @x[12],$stdframe+4*12($sp) # save counter + st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer + lhi %r14,10 + j .Loop + +.align 4 +.Loop: +___ + foreach (&ROUND(0, 4, 8,12)) { eval; } + foreach (&ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + brct %r14,.Loop + + l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer + stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9] + lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp) + + al @x[0],$stdframe+4*0($sp) # accumulate key schedule + al @x[1],$stdframe+4*1($sp) + al @x[2],$stdframe+4*2($sp) + al @x[3],$stdframe+4*3($sp) + al @x[4],$stdframe+4*4($sp) + al @x[5],$stdframe+4*5($sp) + al @x[6],$stdframe+4*6($sp) + al @x[7],$stdframe+4*7($sp) + lrvr @x[0],@x[0] + lrvr @x[1],@x[1] + lrvr @x[2],@x[2] + lrvr @x[3],@x[3] + lrvr @x[4],@x[4] + lrvr @x[5],@x[5] + lrvr @x[6],@x[6] + lrvr @x[7],@x[7] + al @x[12],$stdframe+4*12($sp) + al @x[13],$stdframe+4*13($sp) + al @x[14],$stdframe+4*14($sp) + al @x[15],$stdframe+4*15($sp) + lrvr @x[12],@x[12] + lrvr @x[13],@x[13] + lrvr @x[14],@x[14] + lrvr @x[15],@x[15] + + la @t[0],0(@t[0],%r14) # reconstruct output pointer + cl${g}r %r14,@t[1] + jh .Ltail + + x @x[0],4*0(%r14) # xor with input + x @x[1],4*1(%r14) + st @x[0],4*0(@t[0]) # store output + x @x[2],4*2(%r14) + st @x[1],4*1(@t[0]) + x @x[3],4*3(%r14) + st @x[2],4*2(@t[0]) + x @x[4],4*4(%r14) + st @x[3],4*3(@t[0]) + lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11] + x @x[5],4*5(%r14) + st @x[4],4*4(@t[0]) + x @x[6],4*6(%r14) + al @x[0],$stdframe+4*8($sp) + st @x[5],4*5(@t[0]) + x @x[7],4*7(%r14) + al @x[1],$stdframe+4*9($sp) + st @x[6],4*6(@t[0]) + x @x[12],4*12(%r14) + al @x[2],$stdframe+4*10($sp) + st @x[7],4*7(@t[0]) + x @x[13],4*13(%r14) + al @x[3],$stdframe+4*11($sp) + st @x[12],4*12(@t[0]) + x @x[14],4*14(%r14) + st @x[13],4*13(@t[0]) + x @x[15],4*15(%r14) + st @x[14],4*14(@t[0]) + lrvr @x[0],@x[0] + st @x[15],4*15(@t[0]) + lrvr @x[1],@x[1] + lrvr @x[2],@x[2] + lrvr @x[3],@x[3] + lhi @x[12],1 + x @x[0],4*8(%r14) + al @x[12],$stdframe+4*12($sp) # increment counter + x @x[1],4*9(%r14) + st @x[0],4*8(@t[0]) + x @x[2],4*10(%r14) + st @x[1],4*9(@t[0]) + x @x[3],4*11(%r14) + st @x[2],4*10(@t[0]) + la %r14,64(%r14) + st @x[3],4*11(@t[0]) + + cl${g}r %r14,@t[1] # done yet? + jle .Loop_outer + +.Ldone: + xgr %r0,%r0 + xgr %r1,%r1 + xgr %r2,%r2 + xgr %r3,%r3 + stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy + stmg %r0,%r3,$stdframe+4*12($sp) + + lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) + br %r14 + +.align 16 +.Ltail: + la @t[1],64($t[1]) + stm @x[0],@x[7],$stdframe+4*0($sp) + sl${g}r @t[1],%r14 + lm @x[0],@x[3],$stdframe+4*8+4*8($sp) + l${g}hi @x[6],0 + stm @x[12],@x[15],$stdframe+4*12($sp) + al @x[0],$stdframe+4*8($sp) + al @x[1],$stdframe+4*9($sp) + al @x[2],$stdframe+4*10($sp) + al @x[3],$stdframe+4*11($sp) + lrvr @x[0],@x[0] + lrvr @x[1],@x[1] + lrvr @x[2],@x[2] + lrvr @x[3],@x[3] + stm @x[0],@x[3],$stdframe+4*8+4*8($sp) + +.Loop_tail: + llgc @x[4],0(@x[6],%r14) + llgc @x[5],$stdframe(@x[6],$sp) + xr @x[5],@x[4] + stc @x[5],0(@x[6],@t[0]) + la @x[6],1(@x[6]) + brct @t[1],.Loop_tail + + j .Ldone +.size ChaCha20_ctr32,.-ChaCha20_ctr32 + +.align 32 +.Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral +.asciz "ChaCha20 for s390x, CRYPTOGAMS by " +.align 4 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + print $_,"\n"; +} +close STDOUT; diff --git a/crypto/poly1305/Makefile.in b/crypto/poly1305/Makefile.in index 708773d9cf..9d74865805 100644 --- a/crypto/poly1305/Makefile.in +++ b/crypto/poly1305/Makefile.in @@ -39,6 +39,8 @@ lib: $(LIBOBJ) poly1305-sparcv9.S: asm/poly1305-sparcv9.pl $(PERL) asm/poly1305-sparcv9.pl > $@ +poly1305-%.S: asm/poly1305-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@ + files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO diff --git a/crypto/poly1305/asm/poly1305-s390x.pl b/crypto/poly1305/asm/poly1305-s390x.pl new file mode 100755 index 0000000000..635db0e686 --- /dev/null +++ b/crypto/poly1305/asm/poly1305-s390x.pl @@ -0,0 +1,216 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for s390x. +# +# June 2015 +# +# ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated +# code. For older compiler improvement coefficient is >3x, because +# then base 2^64 and base 2^32 implementations are compared. +# +# On side note, z13 enables vector base 2^26 implementation... + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$sp="%r15"; + +my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5)); + +$code.=<<___; +.text + +.globl poly1305_init +.type poly1305_init,\@function +.align 16 +poly1305_init: + lghi %r0,0 + lghi %r1,-1 + stg %r0,0($ctx) # zero hash value + stg %r0,8($ctx) + stg %r0,16($ctx) + + cl${g}r $inp,%r0 + je .Lno_key + + lrvg %r4,0($inp) # load little-endian key + lrvg %r5,8($inp) + + nihl %r1,0xffc0 # 0xffffffc0ffffffff + srlg %r0,%r1,4 # 0x0ffffffc0fffffff + srlg %r1,%r1,4 + nill %r1,0xfffc # 0x0ffffffc0ffffffc + + ngr %r4,%r0 + ngr %r5,%r1 + + stg %r4,32($ctx) + stg %r5,40($ctx) + +.Lno_key: + lghi %r2,0 + br %r14 +.size poly1305_init,.-poly1305_init +___ +{ +my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14)); +my ($r0,$r1,$s1) = map("%r$_",(0..2)); + +$code.=<<___; +.globl poly1305_blocks +.type poly1305_blocks,\@function +.align 16 +poly1305_blocks: + srl${g} $len,$len,4 + lghi %r0,0 + cl${g}r $len,%r0 + je .Lno_data + + stm${g} %r6,%r14,`6*$SIZE_T`($sp) + + lg $r0,32($ctx) # load key + lg $r1,40($ctx) + + lg $h0,0($ctx) # load hash value + lg $h1,8($ctx) + lg $h2,16($ctx) + + st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx + srlg $s1,$r1,2 + algr $s1,$r1 # s1 = r1 + r1>>2 + j .Loop + +.align 16 +.Loop: + lrvg $d0lo,0($inp) # load little-endian input + lrvg $d1lo,8($inp) + la $inp,16($inp) + + algr $d0lo,$h0 # accumulate input + alcgr $d1lo,$h1 + + lgr $h0,$d0lo + mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo + lgr $h1,$d1lo + mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo + + mlgr $t0,$r1 # h0*r1 -> $t0:$h0 + mlgr $t1,$r0 # h1*r0 -> $t1:$h1 + alcgr $h2,$padbit + + algr $d0lo,$d1lo + lgr $d1lo,$h2 + alcgr $d0hi,$d1hi + lghi $d1hi,0 + + algr $h1,$h0 + alcgr $t1,$t0 + + msgr $d1lo,$s1 # h2*s1 + msgr $h2,$r0 # h2*r0 + + algr $h1,$d1lo + alcgr $t1,$d1hi # $d1hi is zero + + algr $h1,$d0hi + alcgr $h2,$t1 + + lghi $h0,-4 # final reduction step + ngr $h0,$h2 + srlg $t0,$h2,2 + algr $h0,$t0 + + algr $h0,$d0lo + lghi $t1,3 + alcgr $h1,$d1hi # $d1hi is still zero + ngr $h2,$t1 + + brct$g $len,.Loop + + l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx + + stg $h0,0($ctx) # store hash value + stg $h1,8($ctx) + stg $h2,16($ctx) + + lm${g} %r6,%r14,`6*$SIZE_T`($sp) +.Lno_data: + br %r14 +.size poly1305_blocks,.-poly1305_blocks +___ +} +{ +my ($mac,$nonce)=($inp,$len); +my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9)); + +$code.=<<___; +.globl poly1305_emit +.type poly1305_emit,\@function +.align 16 +poly1305_emit: + stm${g} %r6,%r9,`6*$SIZE_T`($sp) + + lg $h0,0($ctx) + lg $h1,8($ctx) + lg $h2,16($ctx) + + lghi %r0,5 + lghi %r1,0 + lgr $d0,$h0 + lgr $d1,$h1 + + algr $h0,%r0 # compare to modulus + alcgr $h1,%r1 + alcgr $h2,%r1 + + srlg $h2,$h2,2 # did it borrow/carry? + slgr %r1,$h2 # 0-$h2>>2 + lg $h2,0($nonce) # load nonce + lghi %r0,-1 + lg $ctx,8($nonce) + xgr %r0,%r1 # ~%r1 + + ngr $h0,%r1 + ngr $d0,%r0 + ngr $h1,%r1 + ngr $d1,%r0 + ogr $h0,$d0 + rllg $d0,$h2,32 # flip nonce words + ogr $h1,$d1 + rllg $d1,$ctx,32 + + algr $h0,$d0 # accumulate nonce + alcgr $h1,$d1 + + strvg $h0,0($mac) # write little-endian result + strvg $h1,8($mac) + + lm${g} %r6,%r9,`6*$SIZE_T`($sp) + br %r14 +.size poly1305_emit,.-poly1305_emit + +.string "Poly1305 for s390x, CRYPTOGAMS by " +___ +} + +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; +close STDOUT;