s390x assembly pack: add ChaCha20 and Poly1305 modules.
author	Andy Polyakov <appro@openssl.org>
	Mon, 14 Dec 2015 22:23:20 +0000 (23:23 +0100)
committer	Andy Polyakov <appro@openssl.org>
	Tue, 9 Feb 2016 21:33:52 +0000 (22:33 +0100)
Reviewed-by: Tim Hudson <tjh@openssl.org>
crypto/chacha/Makefile.in
crypto/chacha/asm/chacha-s390x.pl [new file with mode: 0755]
crypto/poly1305/Makefile.in
crypto/poly1305/asm/poly1305-s390x.pl [new file with mode: 0755]

diff --git a/crypto/chacha/Makefile.in b/crypto/chacha/Makefile.in
index 0590708262f3c31d241519eebc5ca8f121dd7841..8987a850f2b53e646ae3cc9fecb541612fcd63c6 100644 (file)
@@ -36,6 +36,8 @@ lib:  $(LIBOBJ)
        $(RANLIB) $(LIB) || echo Never mind.
        @touch lib
 
+chacha-%.S:    asm/chacha-%.pl;        $(PERL) $< $(PERLASM_SCHEME) $@
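+# (pattern rule: e.g. chacha-s390x.S is produced by running
+#  "$(PERL) asm/chacha-s390x.pl $(PERLASM_SCHEME) chacha-s390x.S",
+#  where $(PERLASM_SCHEME) is the perlasm flavour set by Configure,
+#  presumably "64" or "31" on s390x)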
+
 files:
        $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 
diff --git a/crypto/chacha/asm/chacha-s390x.pl b/crypto/chacha/asm/chacha-s390x.pl
new file mode 100755 (executable)
index 0000000..8a09706
--- /dev/null
@@ -0,0 +1,317 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2015
+#
+# ChaCha20 for s390x.
+#
+# 3 times faster than compiler-generated code.
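+#
+# The entry point below is assumed to follow OpenSSL's usual
+# ChaCha20_ctr32 convention, i.e. roughly (see the register map below):
+#
+#	void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
+#	                    size_t len, const unsigned int key[8],
+#	                    const unsigned int counter[4]);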
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+       $SIZE_T=4;
+       $g="";
+} else {
+       $SIZE_T=8;
+       $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+sub AUTOLOAD()         # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+    $code .= "\t$opcode\t".join(',',@_)."\n";
+}
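+# For illustration: a call such as &alr("%r0","%r4") has no explicit
+# definition, falls through to AUTOLOAD above and simply appends
+# "\talr\t%r0,%r4\n" to $code.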
+
+my $sp="%r15";
+
+my $stdframe=16*$SIZE_T+4*8;
+my $frame=$stdframe+4*20;
+
+my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
+
+my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
+my @t=map("%r$_",(8,9));
+
+sub ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my ($xc,$xc_)=map("\"$_\"",@t);
+my @x=map("\"$_\"",@x);
+
+	# Consider the order in which the variables are addressed by
+	# their index:
+       #
+       #       a   b   c   d
+       #
+       #       0   4   8  12 < even round
+       #       1   5   9  13
+       #       2   6  10  14
+       #       3   7  11  15
+       #       0   5  10  15 < odd round
+       #       1   6  11  12
+       #       2   7   8  13
+       #       3   4   9  14
+       #
+	# The 'a', 'b' and 'd' variables are permanently allocated in
+	# registers, @x[0..7,12..15], while the 'c' variables are kept in
+	# memory. If you look at the 'c' column, you'll notice that the
+	# pair of 'c's held in registers at the end of one round is
+	# exactly the pair the next round starts with, so the other pair
+	# only has to be reloaded once per round, in the middle. That is
+	# why you'll see 'c' stores and loads in the middle, but none at
+	# the beginning or end.
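+	#
+	# For reference, the interleaved code below is just two (then two
+	# more) copies of the standard ChaCha quarter-round, roughly:
+	#
+	#	a += b; d ^= a; d <<<= 16;
+	#	c += d; b ^= c; b <<<= 12;
+	#	a += b; d ^= a; d <<<=  8;
+	#	c += d; b ^= c; b <<<=  7;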
+
+       (
+       "&alr   (@x[$a0],@x[$b0])",     # Q1
+        "&alr  (@x[$a1],@x[$b1])",     # Q2
+       "&xr    (@x[$d0],@x[$a0])",
+        "&xr   (@x[$d1],@x[$a1])",
+       "&rll   (@x[$d0],@x[$d0],16)",
+        "&rll  (@x[$d1],@x[$d1],16)",
+
+       "&alr   ($xc,@x[$d0])",
+        "&alr  ($xc_,@x[$d1])",
+       "&xr    (@x[$b0],$xc)",
+        "&xr   (@x[$b1],$xc_)",
+       "&rll   (@x[$b0],@x[$b0],12)",
+        "&rll  (@x[$b1],@x[$b1],12)",
+
+       "&alr   (@x[$a0],@x[$b0])",
+        "&alr  (@x[$a1],@x[$b1])",
+       "&xr    (@x[$d0],@x[$a0])",
+        "&xr   (@x[$d1],@x[$a1])",
+       "&rll   (@x[$d0],@x[$d0],8)",
+        "&rll  (@x[$d1],@x[$d1],8)",
+
+       "&alr   ($xc,@x[$d0])",
+        "&alr  ($xc_,@x[$d1])",
+       "&xr    (@x[$b0],$xc)",
+        "&xr   (@x[$b1],$xc_)",
+       "&rll   (@x[$b0],@x[$b0],7)",
+        "&rll  (@x[$b1],@x[$b1],7)",
+
+       "&stm   ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
+       "&lm    ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
+
+       "&alr   (@x[$a2],@x[$b2])",     # Q3
+        "&alr  (@x[$a3],@x[$b3])",     # Q4
+       "&xr    (@x[$d2],@x[$a2])",
+        "&xr   (@x[$d3],@x[$a3])",
+       "&rll   (@x[$d2],@x[$d2],16)",
+        "&rll  (@x[$d3],@x[$d3],16)",
+
+       "&alr   ($xc,@x[$d2])",
+        "&alr  ($xc_,@x[$d3])",
+       "&xr    (@x[$b2],$xc)",
+        "&xr   (@x[$b3],$xc_)",
+       "&rll   (@x[$b2],@x[$b2],12)",
+        "&rll  (@x[$b3],@x[$b3],12)",
+
+       "&alr   (@x[$a2],@x[$b2])",
+        "&alr  (@x[$a3],@x[$b3])",
+       "&xr    (@x[$d2],@x[$a2])",
+        "&xr   (@x[$d3],@x[$a3])",
+       "&rll   (@x[$d2],@x[$d2],8)",
+        "&rll  (@x[$d3],@x[$d3],8)",
+
+       "&alr   ($xc,@x[$d2])",
+        "&alr  ($xc_,@x[$d3])",
+       "&xr    (@x[$b2],$xc)",
+        "&xr   (@x[$b3],$xc_)",
+       "&rll   (@x[$b2],@x[$b2],7)",
+        "&rll  (@x[$b3],@x[$b3],7)"
+       );
+}
+
+$code.=<<___;
+.text
+
+.globl ChaCha20_ctr32
+.type  ChaCha20_ctr32,\@function
+.align 32
+ChaCha20_ctr32:
+       a${g}hi $len,-64
+       l${g}hi %r1,-$frame
+       stm${g} %r6,%r15,`6*$SIZE_T`($sp)
+       sl${g}r $out,$inp                       # difference
+       la      $len,0($inp,$len)               # end of input minus 64
+       larl    %r7,.Lsigma
+       lgr     %r0,$sp
+       la      $sp,0(%r1,$sp)
+       st${g}  %r0,0($sp)
+
+       lmg     %r8,%r11,0($key)                # load key
+       lmg     %r12,%r13,0($counter)           # load counter
+       lmg     %r6,%r7,0(%r7)                  # load sigma constant
+
+       la      %r14,0($inp)
+       st${g}  $out,$frame+3*$SIZE_T($sp)
+       st${g}  $len,$frame+4*$SIZE_T($sp)
+       stmg    %r6,%r13,$stdframe($sp)         # copy key schedule to stack
+       srlg    @x[12],%r12,32                  # 32-bit counter value
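+	# At this point the 16-word ChaCha state sits on the stack right
+	# above the register save area: words 0-3 sigma, 4-11 key,
+	# 12-15 counter and nonce; the 16 bytes after it are used to
+	# spill x[8]-x[11] while the 'c' quarter-round operands rotate
+	# through two registers.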
+       j       .Loop_outer
+
+.align 16
+.Loop_outer:
+       lm      @x[0],@x[7],$stdframe+4*0($sp)          # load x[0]-x[7]
+       lm      @t[0],@t[1],$stdframe+4*10($sp)         # load x[10]-x[11]
+       lm      @x[13],@x[15],$stdframe+4*13($sp)       # load x[13]-x[15]
+       stm     @t[0],@t[1],$stdframe+4*8+4*10($sp)     # offload x[10]-x[11]
+       lm      @t[0],@t[1],$stdframe+4*8($sp)          # load x[8]-x[9]
+       st      @x[12],$stdframe+4*12($sp)              # save counter
+       st${g}  %r14,$frame+2*$SIZE_T($sp)              # save input pointer
+       lhi     %r14,10
+       j       .Loop
+
+.align 4
+.Loop:
+___
+       foreach (&ROUND(0, 4, 8,12)) { eval; }
+       foreach (&ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+       brct    %r14,.Loop
+
+       l${g}   %r14,$frame+2*$SIZE_T($sp)              # pull input pointer
+       stm     @t[0],@t[1],$stdframe+4*8+4*8($sp)      # offload x[8]-x[9]
+       lm${g}  @t[0],@t[1],$frame+3*$SIZE_T($sp)
+
+       al      @x[0],$stdframe+4*0($sp)        # accumulate key schedule
+       al      @x[1],$stdframe+4*1($sp)
+       al      @x[2],$stdframe+4*2($sp)
+       al      @x[3],$stdframe+4*3($sp)
+       al      @x[4],$stdframe+4*4($sp)
+       al      @x[5],$stdframe+4*5($sp)
+       al      @x[6],$stdframe+4*6($sp)
+       al      @x[7],$stdframe+4*7($sp)
+       lrvr    @x[0],@x[0]
+       lrvr    @x[1],@x[1]
+       lrvr    @x[2],@x[2]
+       lrvr    @x[3],@x[3]
+       lrvr    @x[4],@x[4]
+       lrvr    @x[5],@x[5]
+       lrvr    @x[6],@x[6]
+       lrvr    @x[7],@x[7]
+       al      @x[12],$stdframe+4*12($sp)
+       al      @x[13],$stdframe+4*13($sp)
+       al      @x[14],$stdframe+4*14($sp)
+       al      @x[15],$stdframe+4*15($sp)
+       lrvr    @x[12],@x[12]
+       lrvr    @x[13],@x[13]
+       lrvr    @x[14],@x[14]
+       lrvr    @x[15],@x[15]
+
+       la      @t[0],0(@t[0],%r14)             # reconstruct output pointer
+       cl${g}r %r14,@t[1]
+       jh      .Ltail
+
+       x       @x[0],4*0(%r14)                 # xor with input
+       x       @x[1],4*1(%r14)
+       st      @x[0],4*0(@t[0])                # store output
+       x       @x[2],4*2(%r14)
+       st      @x[1],4*1(@t[0])
+       x       @x[3],4*3(%r14)
+       st      @x[2],4*2(@t[0])
+       x       @x[4],4*4(%r14)
+       st      @x[3],4*3(@t[0])
+        lm     @x[0],@x[3],$stdframe+4*8+4*8($sp)      # load x[8]-x[11]
+       x       @x[5],4*5(%r14)
+       st      @x[4],4*4(@t[0])
+       x       @x[6],4*6(%r14)
+        al     @x[0],$stdframe+4*8($sp)
+       st      @x[5],4*5(@t[0])
+       x       @x[7],4*7(%r14)
+        al     @x[1],$stdframe+4*9($sp)
+       st      @x[6],4*6(@t[0])
+       x       @x[12],4*12(%r14)
+        al     @x[2],$stdframe+4*10($sp)
+       st      @x[7],4*7(@t[0])
+       x       @x[13],4*13(%r14)
+        al     @x[3],$stdframe+4*11($sp)
+       st      @x[12],4*12(@t[0])
+       x       @x[14],4*14(%r14)
+       st      @x[13],4*13(@t[0])
+       x       @x[15],4*15(%r14)
+       st      @x[14],4*14(@t[0])
+        lrvr   @x[0],@x[0]
+       st      @x[15],4*15(@t[0])
+        lrvr   @x[1],@x[1]
+        lrvr   @x[2],@x[2]
+        lrvr   @x[3],@x[3]
+       lhi     @x[12],1
+        x      @x[0],4*8(%r14)
+       al      @x[12],$stdframe+4*12($sp)      # increment counter
+        x      @x[1],4*9(%r14)
+        st     @x[0],4*8(@t[0])
+        x      @x[2],4*10(%r14)
+        st     @x[1],4*9(@t[0])
+        x      @x[3],4*11(%r14)
+        st     @x[2],4*10(@t[0])
+       la      %r14,64(%r14)
+        st     @x[3],4*11(@t[0])
+
+       cl${g}r %r14,@t[1]                      # done yet?
+       jle     .Loop_outer
+
+.Ldone:
+       xgr     %r0,%r0
+       xgr     %r1,%r1
+       xgr     %r2,%r2
+       xgr     %r3,%r3
+       stmg    %r0,%r3,$stdframe+4*4($sp)      # wipe key copy
+       stmg    %r0,%r3,$stdframe+4*12($sp)
+
+       lm${g}  %r6,%r15,`$frame+6*$SIZE_T`($sp)
+       br      %r14
+
+.align 16
+.Ltail:
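+	# Partial last block: the full 64-byte keystream block is written
+	# back to the stack (on top of the key-schedule copy, which is no
+	# longer needed) and the remaining input bytes are xor-ed with it
+	# one at a time in .Loop_tail.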
+       la      @t[1],64($t[1])
+       stm     @x[0],@x[7],$stdframe+4*0($sp)
+       sl${g}r @t[1],%r14
+       lm      @x[0],@x[3],$stdframe+4*8+4*8($sp)
+       l${g}hi @x[6],0
+       stm     @x[12],@x[15],$stdframe+4*12($sp)
+       al      @x[0],$stdframe+4*8($sp)
+       al      @x[1],$stdframe+4*9($sp)
+       al      @x[2],$stdframe+4*10($sp)
+       al      @x[3],$stdframe+4*11($sp)
+       lrvr    @x[0],@x[0]
+       lrvr    @x[1],@x[1]
+       lrvr    @x[2],@x[2]
+       lrvr    @x[3],@x[3]
+	stm	@x[0],@x[3],$stdframe+4*8($sp)	# keystream words 8-11
+
+.Loop_tail:
+       llgc    @x[4],0(@x[6],%r14)
+       llgc    @x[5],$stdframe(@x[6],$sp)
+       xr      @x[5],@x[4]
+       stc     @x[5],0(@x[6],@t[0])
+       la      @x[6],1(@x[6])
+       brct    @t[1],.Loop_tail
+
+       j       .Ldone
+.size  ChaCha20_ctr32,.-ChaCha20_ctr32
+
+.align 32
+.Lsigma:
+.long  0x61707865,0x3320646e,0x79622d32,0x6b206574     # endian-neutral
+.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval $1/ge;
+
+       print $_,"\n";
+}
+close STDOUT;
diff --git a/crypto/poly1305/Makefile.in b/crypto/poly1305/Makefile.in
index 708773d9cfe49fb50c16631e7959a90d8e558ea0..9d74865805383ea65da211ce428766b563c8d0ae 100644 (file)
@@ -39,6 +39,8 @@ lib:  $(LIBOBJ)
 poly1305-sparcv9.S:    asm/poly1305-sparcv9.pl
        $(PERL) asm/poly1305-sparcv9.pl > $@
 
+poly1305-%.S:  asm/poly1305-%.pl;      $(PERL) $< $(PERLASM_SCHEME) $@
+
 files:
        $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 
diff --git a/crypto/poly1305/asm/poly1305-s390x.pl b/crypto/poly1305/asm/poly1305-s390x.pl
new file mode 100755 (executable)
index 0000000..635db0e
--- /dev/null
@@ -0,0 +1,216 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for s390x.
+#
+# June 2015
+#
+# ~6.4/2.2 cpb on z10/z196+, a >2x improvement over compiler-generated
+# code. With an older compiler the improvement is >3x, because in that
+# case a base 2^64 implementation is being compared to a base 2^32 one.
+#
+# As a side note, z13 would allow a vector base 2^26 implementation...
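+#
+# For reference, the function computed below is the standard Poly1305
+# MAC; a sketch of the arithmetic (not of the exact limb schedule):
+#
+#	h = 0
+#	for each 16-byte block m (taken as a 128-bit little-endian
+#	    number, plus padbit*2^128):
+#		h = (h + m) * r  mod  2^130-5
+#	tag = (h + nonce) mod 2^128
+#
+# with r clamped in poly1305_init, h kept in three 64-bit limbs by
+# poly1305_blocks, and the final addition of the 128-bit nonce done
+# in poly1305_emit.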
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+       $SIZE_T=4;
+       $g="";
+} else {
+       $SIZE_T=8;
+       $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$sp="%r15";
+
+my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
+
+$code.=<<___;
+.text
+
+.globl poly1305_init
+.type  poly1305_init,\@function
+.align 16
+poly1305_init:
+       lghi    %r0,0
+       lghi    %r1,-1
+       stg     %r0,0($ctx)             # zero hash value
+       stg     %r0,8($ctx)
+       stg     %r0,16($ctx)
+
+       cl${g}r $inp,%r0
+       je      .Lno_key
+
+       lrvg    %r4,0($inp)             # load little-endian key
+       lrvg    %r5,8($inp)
+
+       nihl    %r1,0xffc0              # 0xffffffc0ffffffff
+       srlg    %r0,%r1,4               # 0x0ffffffc0fffffff
+       srlg    %r1,%r1,4
+       nill    %r1,0xfffc              # 0x0ffffffc0ffffffc
+
+       ngr     %r4,%r0
+       ngr     %r5,%r1
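+	# i.e. the standard Poly1305 clamp,
+	# r &= 0x0ffffffc0ffffffc0ffffffc0fffffff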
+
+       stg     %r4,32($ctx)
+       stg     %r5,40($ctx)
+
+.Lno_key:
+       lghi    %r2,0
+       br      %r14
+.size  poly1305_init,.-poly1305_init
+___
+{
+my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
+my ($r0,$r1,$s1) = map("%r$_",(0..2));
+
+$code.=<<___;
+.globl poly1305_blocks
+.type  poly1305_blocks,\@function
+.align 16
+poly1305_blocks:
+       srl${g} $len,$len,4
+       lghi    %r0,0
+       cl${g}r $len,%r0
+       je      .Lno_data
+
+       stm${g} %r6,%r14,`6*$SIZE_T`($sp)
+
+       lg      $r0,32($ctx)            # load key
+       lg      $r1,40($ctx)
+
+       lg      $h0,0($ctx)             # load hash value
+       lg      $h1,8($ctx)
+       lg      $h2,16($ctx)
+
+       st$g    $ctx,`2*$SIZE_T`($sp)   # off-load $ctx
+       srlg    $s1,$r1,2
+       algr    $s1,$r1                 # s1 = r1 + r1>>2
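+	# The loop below multiplies the ~130-bit h (limbs h0,h1,h2) by r
+	# schoolbook-style and reduces on the fly: 2^130 = 5 mod 2^130-5,
+	# so the h1*r1 term, which carries weight 2^128, folds back in as
+	# h1*(5*r1/4) = h1*s1. The division by 4 is exact because the
+	# clamp in poly1305_init clears the two low bits of r1.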
+       j       .Loop
+
+.align 16
+.Loop:
+       lrvg    $d0lo,0($inp)           # load little-endian input
+       lrvg    $d1lo,8($inp)
+       la      $inp,16($inp)
+
+       algr    $d0lo,$h0               # accumulate input
+       alcgr   $d1lo,$h1
+
+       lgr     $h0,$d0lo
+       mlgr    $d0hi,$r0               # h0*r0   -> $d0hi:$d0lo
+       lgr     $h1,$d1lo
+       mlgr    $d1hi,$s1               # h1*5*r1 -> $d1hi:$d1lo
+
+       mlgr    $t0,$r1                 # h0*r1   -> $t0:$h0
+       mlgr    $t1,$r0                 # h1*r0   -> $t1:$h1
+       alcgr   $h2,$padbit
+
+       algr    $d0lo,$d1lo
+       lgr     $d1lo,$h2
+       alcgr   $d0hi,$d1hi
+       lghi    $d1hi,0
+
+       algr    $h1,$h0
+       alcgr   $t1,$t0
+
+       msgr    $d1lo,$s1               # h2*s1
+       msgr    $h2,$r0                 # h2*r0
+
+       algr    $h1,$d1lo
+       alcgr   $t1,$d1hi               # $d1hi is zero
+
+       algr    $h1,$d0hi
+       alcgr   $h2,$t1
+
+       lghi    $h0,-4                  # final reduction step
+       ngr     $h0,$h2
+       srlg    $t0,$h2,2
+       algr    $h0,$t0
+
+       algr    $h0,$d0lo
+       lghi    $t1,3
+       alcgr   $h1,$d1hi               # $d1hi is still zero
+       ngr     $h2,$t1
+
+       brct$g  $len,.Loop
+
+       l$g     $ctx,`2*$SIZE_T`($sp)   # restore $ctx
+
+       stg     $h0,0($ctx)             # store hash value
+       stg     $h1,8($ctx)
+       stg     $h2,16($ctx)
+
+       lm${g}  %r6,%r14,`6*$SIZE_T`($sp)
+.Lno_data:
+       br      %r14
+.size  poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($mac,$nonce)=($inp,$len);
+my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
+
+$code.=<<___;
+.globl poly1305_emit
+.type  poly1305_emit,\@function
+.align 16
+poly1305_emit:
+       stm${g} %r6,%r9,`6*$SIZE_T`($sp)
+
+       lg      $h0,0($ctx)
+       lg      $h1,8($ctx)
+       lg      $h2,16($ctx)
+
+       lghi    %r0,5
+       lghi    %r1,0
+       lgr     $d0,$h0
+       lgr     $d1,$h1
+
+       algr    $h0,%r0                 # compare to modulus
+       alcgr   $h1,%r1
+       alcgr   $h2,%r1
+
+       srlg    $h2,$h2,2               # did it borrow/carry?
+       slgr    %r1,$h2                 # 0-$h2>>2
+       lg      $h2,0($nonce)           # load nonce
+       lghi    %r0,-1
+       lg      $ctx,8($nonce)
+       xgr     %r0,%r1                 # ~%r1
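+	# constant-time select: if h+5 overflowed bit 130 (i.e. the hash
+	# value was >= 2^130-5), keep the reduced h+5 held in h0,h1,
+	# otherwise keep the original value saved in d0,d1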
+
+       ngr     $h0,%r1
+       ngr     $d0,%r0
+       ngr     $h1,%r1
+       ngr     $d1,%r0
+       ogr     $h0,$d0
+       rllg    $d0,$h2,32              # flip nonce words
+       ogr     $h1,$d1
+       rllg    $d1,$ctx,32
+
+       algr    $h0,$d0                 # accumulate nonce
+       alcgr   $h1,$d1
+
+       strvg   $h0,0($mac)             # write little-endian result
+       strvg   $h1,8($mac)
+
+       lm${g}  %r6,%r9,`6*$SIZE_T`($sp)
+       br      %r14
+.size  poly1305_emit,.-poly1305_emit
+
+.string        "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;