Sync ASM/modes to add CCM and XTS modes and assembly language optimisation
author: Dr. Stephen Henson <steve@openssl.org>
Thu, 11 Aug 2011 22:36:19 +0000 (22:36 +0000)
committer: Dr. Stephen Henson <steve@openssl.org>
Thu, 11 Aug 2011 22:36:19 +0000 (22:36 +0000)
(from HEAD, original by Andy).

17 files changed:
crypto/modes/Makefile
crypto/modes/asm/ghash-alpha.pl [new file with mode: 0644]
crypto/modes/asm/ghash-armv4.pl [new file with mode: 0644]
crypto/modes/asm/ghash-ia64.pl [new file with mode: 0755]
crypto/modes/asm/ghash-parisc.pl [new file with mode: 0644]
crypto/modes/asm/ghash-s390x.pl [new file with mode: 0644]
crypto/modes/asm/ghash-sparcv9.pl [new file with mode: 0644]
crypto/modes/asm/ghash-x86.pl [new file with mode: 0644]
crypto/modes/asm/ghash-x86_64.pl [new file with mode: 0644]
crypto/modes/cbc128.c
crypto/modes/ccm128.c [new file with mode: 0644]
crypto/modes/cfb128.c
crypto/modes/ctr128.c
crypto/modes/cts128.c
crypto/modes/modes.h
crypto/modes/ofb128.c
crypto/modes/xts128.c [new file with mode: 0644]

index 007ceff..c825b12 100644 (file)
@@ -10,15 +10,21 @@ CFLAG=-g
 MAKEFILE=      Makefile
 AR=            ar r
 
+MODES_ASM_OBJ=
+
 CFLAGS= $(INCLUDES) $(CFLAG)
+ASFLAGS= $(INCLUDES) $(ASFLAG)
+AFLAGS= $(ASFLAGS)
 
 GENERAL=Makefile
 TEST=
 APPS=
 
 LIB=$(TOP)/libcrypto.a
-LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c
-LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o
+LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
+       ccm128.c xts128.c
+LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \
+       ccm128.o xts128.o $(MODES_ASM_OBJ)
 
 SRC= $(LIBSRC)
 
@@ -38,6 +44,24 @@ lib: $(LIBOBJ)
        $(RANLIB) $(LIB) || echo Never mind.
        @touch lib
 
+ghash-ia64.s:  asm/ghash-ia64.pl
+       $(PERL) asm/ghash-ia64.pl $@ $(CFLAGS)
+ghash-x86.s:   asm/ghash-x86.pl
+       $(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+ghash-x86_64.s:        asm/ghash-x86_64.pl
+       $(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
+ghash-sparcv9.s:       asm/ghash-sparcv9.pl
+       $(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)
+ghash-alpha.s: asm/ghash-alpha.pl
+       $(PERL) $< | $(CC) -E - | tee $@ > /dev/null
+ghash-parisc.s:        asm/ghash-parisc.pl
+       $(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
+
+# GNU make "catch all"
+ghash-%.S:     asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
+
+ghash-armv4.o: ghash-armv4.S
+
 files:
        $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 
@@ -71,12 +95,47 @@ dclean:
        mv -f Makefile.new $(MAKEFILE)
 
 clean:
-       rm -f *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
+       rm -f *.s *.S *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-cbc128.o: cbc128.c modes.h
-cfb128.o: cfb128.c modes.h
-ctr128.o: ctr128.c modes.h
-cts128.o: cts128.c modes.h
-ofb128.o: modes.h ofb128.c
+cbc128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+cbc128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+cbc128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cbc128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+cbc128.o: ../../include/openssl/symhacks.h cbc128.c modes_lcl.h
+ccm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ccm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+ccm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+ccm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+ccm128.o: ../../include/openssl/symhacks.h ccm128.c modes_lcl.h
+cfb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+cfb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+cfb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cfb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+cfb128.o: ../../include/openssl/symhacks.h cfb128.c modes_lcl.h
+ctr128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ctr128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+ctr128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+ctr128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+ctr128.o: ../../include/openssl/symhacks.h ctr128.c modes_lcl.h
+cts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+cts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+cts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+cts128.o: ../../include/openssl/symhacks.h cts128.c modes_lcl.h
+gcm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+gcm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+gcm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+gcm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+gcm128.o: ../../include/openssl/symhacks.h gcm128.c modes_lcl.h
+ofb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c
+xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+xts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+xts128.o: ../../include/openssl/symhacks.h modes_lcl.h xts128.c
diff --git a/crypto/modes/asm/ghash-alpha.pl b/crypto/modes/asm/ghash-alpha.pl
new file mode 100644 (file)
index 0000000..6358b27
--- /dev/null
@@ -0,0 +1,451 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Even though
+# loops are aggressively modulo-scheduled in respect to references to
+# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
+# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
+# scheduling "glitch," because uprofile(1) indicates uniform sample
+# distribution, as if all instruction bundles execute in 1.5 cycles.
+# Meaning that it could have been even faster, yet 12 cycles is ~60%
+# better than gcc-generated code and ~80% better than code generated by
+# vendor compiler.
+
+$cnt="v0";     # $0
+$t0="t0";
+$t1="t1";
+$t2="t2";
+$Thi0="t3";    # $4
+$Tlo0="t4";
+$Thi1="t5";
+$Tlo1="t6";
+$rem="t7";     # $8
+#################
+$Xi="a0";      # $16, input argument block
+$Htbl="a1";
+$inp="a2";
+$len="a3";
+$nlo="a4";     # $20
+$nhi="a5";
+$Zhi="t8";
+$Zlo="t9";
+$Xhi="t10";    # $24
+$Xlo="t11";
+$remp="t12";
+$rem_4bit="AT";        # $28
+
+{ my $N;
+  sub loop() {
+
+       $N++;
+$code.=<<___;
+.align 4
+       extbl   $Xlo,7,$nlo
+       and     $nlo,0xf0,$nhi
+       sll     $nlo,4,$nlo
+       and     $nlo,0xf0,$nlo
+
+       addq    $nlo,$Htbl,$nlo
+       ldq     $Zlo,8($nlo)
+       addq    $nhi,$Htbl,$nhi
+       ldq     $Zhi,0($nlo)
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       lda     $cnt,6(zero)
+       extbl   $Xlo,6,$nlo
+
+       ldq     $Tlo1,8($nhi)
+       s8addq  $remp,$rem_4bit,$remp
+       ldq     $Thi1,0($nhi)
+       srl     $Zlo,4,$Zlo
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $t0,$Zlo,$Zlo
+       and     $nlo,0xf0,$nhi
+
+       xor     $Tlo1,$Zlo,$Zlo
+       sll     $nlo,4,$nlo
+       xor     $Thi1,$Zhi,$Zhi
+       and     $nlo,0xf0,$nlo
+
+       addq    $nlo,$Htbl,$nlo
+       ldq     $Tlo0,8($nlo)
+       addq    $nhi,$Htbl,$nhi
+       ldq     $Thi0,0($nlo)
+
+.Looplo$N:
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       subq    $cnt,1,$cnt
+       srl     $Zlo,4,$Zlo
+
+       ldq     $Tlo1,8($nhi)
+       xor     $rem,$Zhi,$Zhi
+       ldq     $Thi1,0($nhi)
+       s8addq  $remp,$rem_4bit,$remp
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $t0,$Zlo,$Zlo
+       extbl   $Xlo,$cnt,$nlo
+
+       and     $nlo,0xf0,$nhi
+       xor     $Thi0,$Zhi,$Zhi
+       xor     $Tlo0,$Zlo,$Zlo
+       sll     $nlo,4,$nlo
+
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       and     $nlo,0xf0,$nlo
+       srl     $Zlo,4,$Zlo
+
+       s8addq  $remp,$rem_4bit,$remp
+       xor     $rem,$Zhi,$Zhi
+       addq    $nlo,$Htbl,$nlo
+       addq    $nhi,$Htbl,$nhi
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       ldq     $Tlo0,8($nlo)
+       xor     $t0,$Zlo,$Zlo
+
+       xor     $Tlo1,$Zlo,$Zlo
+       xor     $Thi1,$Zhi,$Zhi
+       ldq     $Thi0,0($nlo)
+       bne     $cnt,.Looplo$N
+
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       lda     $cnt,7(zero)
+       srl     $Zlo,4,$Zlo
+
+       ldq     $Tlo1,8($nhi)
+       xor     $rem,$Zhi,$Zhi
+       ldq     $Thi1,0($nhi)
+       s8addq  $remp,$rem_4bit,$remp
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $t0,$Zlo,$Zlo
+       extbl   $Xhi,$cnt,$nlo
+
+       and     $nlo,0xf0,$nhi
+       xor     $Thi0,$Zhi,$Zhi
+       xor     $Tlo0,$Zlo,$Zlo
+       sll     $nlo,4,$nlo
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       and     $nlo,0xf0,$nlo
+       srl     $Zlo,4,$Zlo
+
+       s8addq  $remp,$rem_4bit,$remp
+       xor     $rem,$Zhi,$Zhi
+       addq    $nlo,$Htbl,$nlo
+       addq    $nhi,$Htbl,$nhi
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       ldq     $Tlo0,8($nlo)
+       xor     $t0,$Zlo,$Zlo
+
+       xor     $Tlo1,$Zlo,$Zlo
+       xor     $Thi1,$Zhi,$Zhi
+       ldq     $Thi0,0($nlo)
+       unop
+
+
+.Loophi$N:
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       subq    $cnt,1,$cnt
+       srl     $Zlo,4,$Zlo
+
+       ldq     $Tlo1,8($nhi)
+       xor     $rem,$Zhi,$Zhi
+       ldq     $Thi1,0($nhi)
+       s8addq  $remp,$rem_4bit,$remp
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $t0,$Zlo,$Zlo
+       extbl   $Xhi,$cnt,$nlo
+
+       and     $nlo,0xf0,$nhi
+       xor     $Thi0,$Zhi,$Zhi
+       xor     $Tlo0,$Zlo,$Zlo
+       sll     $nlo,4,$nlo
+
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       and     $nlo,0xf0,$nlo
+       srl     $Zlo,4,$Zlo
+
+       s8addq  $remp,$rem_4bit,$remp
+       xor     $rem,$Zhi,$Zhi
+       addq    $nlo,$Htbl,$nlo
+       addq    $nhi,$Htbl,$nhi
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       ldq     $Tlo0,8($nlo)
+       xor     $t0,$Zlo,$Zlo
+
+       xor     $Tlo1,$Zlo,$Zlo
+       xor     $Thi1,$Zhi,$Zhi
+       ldq     $Thi0,0($nlo)
+       bne     $cnt,.Loophi$N
+
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       srl     $Zlo,4,$Zlo
+
+       ldq     $Tlo1,8($nhi)
+       xor     $rem,$Zhi,$Zhi
+       ldq     $Thi1,0($nhi)
+       s8addq  $remp,$rem_4bit,$remp
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $t0,$Zlo,$Zlo
+
+       xor     $Tlo0,$Zlo,$Zlo
+       xor     $Thi0,$Zhi,$Zhi
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       srl     $Zlo,4,$Zlo
+
+       s8addq  $remp,$rem_4bit,$remp
+       xor     $rem,$Zhi,$Zhi
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $Tlo1,$Zlo,$Zlo
+       xor     $Thi1,$Zhi,$Zhi
+       xor     $t0,$Zlo,$Zlo
+       xor     $rem,$Zhi,$Zhi
+___
+}}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set   noat
+.set   noreorder
+.globl gcm_gmult_4bit
+.align 4
+.ent   gcm_gmult_4bit
+gcm_gmult_4bit:
+       .frame  sp,0,ra
+       .prologue 0
+
+       ldq     $Xlo,8($Xi)
+       ldq     $Xhi,0($Xi)
+
+       br      $rem_4bit,.Lpic1
+.Lpic1:        lda     $rem_4bit,rem_4bit-.Lpic1($rem_4bit)
+___
+
+       &loop();
+
+$code.=<<___;
+       srl     $Zlo,24,$t0     # byte swap
+       srl     $Zlo,8,$t1
+
+       sll     $Zlo,8,$t2
+       sll     $Zlo,24,$Zlo
+       zapnot  $t0,0x11,$t0
+       zapnot  $t1,0x22,$t1
+
+       zapnot  $Zlo,0x88,$Zlo
+       or      $t0,$t1,$t0
+       zapnot  $t2,0x44,$t2
+
+       or      $Zlo,$t0,$Zlo
+       srl     $Zhi,24,$t0
+       srl     $Zhi,8,$t1
+
+       or      $Zlo,$t2,$Zlo
+       sll     $Zhi,8,$t2
+       sll     $Zhi,24,$Zhi
+
+       srl     $Zlo,32,$Xlo
+       sll     $Zlo,32,$Zlo
+
+       zapnot  $t0,0x11,$t0
+       zapnot  $t1,0x22,$t1
+       or      $Zlo,$Xlo,$Xlo
+
+       zapnot  $Zhi,0x88,$Zhi
+       or      $t0,$t1,$t0
+       zapnot  $t2,0x44,$t2
+
+       or      $Zhi,$t0,$Zhi
+       or      $Zhi,$t2,$Zhi
+
+       srl     $Zhi,32,$Xhi
+       sll     $Zhi,32,$Zhi
+
+       or      $Zhi,$Xhi,$Xhi
+       stq     $Xlo,8($Xi)
+       stq     $Xhi,0($Xi)
+
+       ret     (ra)
+.end   gcm_gmult_4bit
+___
+
+$inhi="s0";
+$inlo="s1";
+
+$code.=<<___;
+.globl gcm_ghash_4bit
+.align 4
+.ent   gcm_ghash_4bit
+gcm_ghash_4bit:
+       lda     sp,-32(sp)
+       stq     ra,0(sp)
+       stq     s0,8(sp)
+       stq     s1,16(sp)
+       .mask   0x04000600,-32
+       .frame  sp,32,ra
+       .prologue 0
+
+       ldq_u   $inhi,0($inp)
+       ldq_u   $Thi0,7($inp)
+       ldq_u   $inlo,8($inp)
+       ldq_u   $Tlo0,15($inp)
+       ldq     $Xhi,0($Xi)
+       ldq     $Xlo,8($Xi)
+
+       br      $rem_4bit,.Lpic2
+.Lpic2:        lda     $rem_4bit,rem_4bit-.Lpic2($rem_4bit)
+
+.Louter:
+       extql   $inhi,$inp,$inhi
+       extqh   $Thi0,$inp,$Thi0
+       or      $inhi,$Thi0,$inhi
+       lda     $inp,16($inp)
+
+       extql   $inlo,$inp,$inlo
+       extqh   $Tlo0,$inp,$Tlo0
+       or      $inlo,$Tlo0,$inlo
+       subq    $len,16,$len
+
+       xor     $Xlo,$inlo,$Xlo
+       xor     $Xhi,$inhi,$Xhi
+___
+
+       &loop();
+
+$code.=<<___;
+       srl     $Zlo,24,$t0     # byte swap
+       srl     $Zlo,8,$t1
+
+       sll     $Zlo,8,$t2
+       sll     $Zlo,24,$Zlo
+       zapnot  $t0,0x11,$t0
+       zapnot  $t1,0x22,$t1
+
+       zapnot  $Zlo,0x88,$Zlo
+       or      $t0,$t1,$t0
+       zapnot  $t2,0x44,$t2
+
+       or      $Zlo,$t0,$Zlo
+       srl     $Zhi,24,$t0
+       srl     $Zhi,8,$t1
+
+       or      $Zlo,$t2,$Zlo
+       sll     $Zhi,8,$t2
+       sll     $Zhi,24,$Zhi
+
+       srl     $Zlo,32,$Xlo
+       sll     $Zlo,32,$Zlo
+       beq     $len,.Ldone
+
+       zapnot  $t0,0x11,$t0
+       zapnot  $t1,0x22,$t1
+       or      $Zlo,$Xlo,$Xlo
+       ldq_u   $inhi,0($inp)
+
+       zapnot  $Zhi,0x88,$Zhi
+       or      $t0,$t1,$t0
+       zapnot  $t2,0x44,$t2
+       ldq_u   $Thi0,7($inp)
+
+       or      $Zhi,$t0,$Zhi
+       or      $Zhi,$t2,$Zhi
+       ldq_u   $inlo,8($inp)
+       ldq_u   $Tlo0,15($inp)
+
+       srl     $Zhi,32,$Xhi
+       sll     $Zhi,32,$Zhi
+
+       or      $Zhi,$Xhi,$Xhi
+       br      zero,.Louter
+
+.Ldone:
+       zapnot  $t0,0x11,$t0
+       zapnot  $t1,0x22,$t1
+       or      $Zlo,$Xlo,$Xlo
+
+       zapnot  $Zhi,0x88,$Zhi
+       or      $t0,$t1,$t0
+       zapnot  $t2,0x44,$t2
+
+       or      $Zhi,$t0,$Zhi
+       or      $Zhi,$t2,$Zhi
+
+       srl     $Zhi,32,$Xhi
+       sll     $Zhi,32,$Zhi
+
+       or      $Zhi,$Xhi,$Xhi
+
+       stq     $Xlo,8($Xi)
+       stq     $Xhi,0($Xi)
+
+       .set    noreorder
+       /*ldq   ra,0(sp)*/
+       ldq     s0,8(sp)
+       ldq     s1,16(sp)
+       lda     sp,32(sp)
+       ret     (ra)
+.end   gcm_ghash_4bit
+
+.align 4
+rem_4bit:
+       .quad   0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+       .quad   0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+       .quad   0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+       .quad   0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+
+___
+$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");	# optional output file; abort instead of silently ignoring an open failure
+print $code;
+close STDOUT;	# flush the generated assembly
+
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
new file mode 100644 (file)
index 0000000..d91586e
--- /dev/null
@@ -0,0 +1,429 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+32 bytes shared table]. There is no
+# experimental performance data available yet. The only approximation
+# that can be made at this point is based on code size. Inner loop is
+# 32 instructions long and on single-issue core should execute in <40
+# cycles. Having verified that gcc 3.4 didn't unroll corresponding
+# loop, this assembler loop body was found to be ~3x smaller than
+# compiler-generated one...
+#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# February 2011
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~23.5 cycles per byte.
+#
+# March 2011
+#
+# Add NEON implementation featuring polynomial multiplication, i.e. no
+# lookup tables involved. On Cortex A8 it was measured to process one
+# byte in 15 cycles or 55% faster than integer-only code.
+
+# ====================================================================
+# Note about "528B" variant. In ARM case it makes lesser sense to
+# implement it for following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+#   bit shift operation is neatly fused with 128-bit xor here, and
+#   "528B" variant would eliminate only 4-5 instructions out of 32
+#   in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+#   consumption might be unappreciated (for so little improvement);
+#
+# Byte order [in]dependence. =========================================
+#
+# Caller is expected to maintain specific *dword* order in Htable,
+# namely with *least* significant dword of 128-bit value at *lower*
+# address. This differs completely from C code and has everything to
+# do with ldm instruction and order in which dwords are "consumed" by
+# algorithm. *Byte* order within these dwords in turn is whatever
+# *native* byte order on current platform. See gcm128.c for working
+# example...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$Xi="r0";      # argument block
+$Htbl="r1";
+$inp="r2";
+$len="r3";
+
+$Zll="r4";     # variables
+$Zlh="r5";
+$Zhl="r6";
+$Zhh="r7";
+$Tll="r8";
+$Tlh="r9";
+$Thl="r10";
+$Thh="r11";
+$nlo="r12";
+################# r13 is stack pointer
+$nhi="r14";
+################# r15 is program counter
+
+$rem_4bit=$inp;        # used in gcm_gmult_4bit
+$cnt=$len;
+
+sub Zsmash() {
+  my $i=12;
+  my @args=@_;
+  for ($Zll,$Zlh,$Zhl,$Zhh) {
+    $code.=<<___;
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+       rev     $_,$_
+       str     $_,[$Xi,#$i]
+#elif defined(__ARMEB__)
+       str     $_,[$Xi,#$i]
+#else
+       mov     $Tlh,$_,lsr#8
+       strb    $_,[$Xi,#$i+3]
+       mov     $Thl,$_,lsr#16
+       strb    $Tlh,[$Xi,#$i+2]
+       mov     $Thh,$_,lsr#24
+       strb    $Thl,[$Xi,#$i+1]
+       strb    $Thh,[$Xi,#$i]
+#endif
+___
+    $code.="\t".shift(@args)."\n";
+    $i-=4;
+  }
+}
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code  32
+
+.type  rem_4bit,%object
+.align 5
+rem_4bit:
+.short 0x0000,0x1C20,0x3840,0x2460
+.short 0x7080,0x6CA0,0x48C0,0x54E0
+.short 0xE100,0xFD20,0xD940,0xC560
+.short 0x9180,0x8DA0,0xA9C0,0xB5E0
+.size  rem_4bit,.-rem_4bit
+
+.type  rem_4bit_get,%function
+rem_4bit_get:
+       sub     $rem_4bit,pc,#8
+       sub     $rem_4bit,$rem_4bit,#32 @ &rem_4bit
+       b       .Lrem_4bit_got
+       nop
+.size  rem_4bit_get,.-rem_4bit_get
+
+.global        gcm_ghash_4bit
+.type  gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+       sub     r12,pc,#8
+       add     $len,$inp,$len          @ $len to point at the end
+       stmdb   sp!,{r3-r11,lr}         @ save $len/end too
+       sub     r12,r12,#48             @ &rem_4bit
+
+       ldmia   r12,{r4-r11}            @ copy rem_4bit ...
+       stmdb   sp!,{r4-r11}            @ ... to stack
+
+       ldrb    $nlo,[$inp,#15]
+       ldrb    $nhi,[$Xi,#15]
+.Louter:
+       eor     $nlo,$nlo,$nhi
+       and     $nhi,$nlo,#0xf0
+       and     $nlo,$nlo,#0x0f
+       mov     $cnt,#14
+
+       add     $Zhh,$Htbl,$nlo,lsl#4
+       ldmia   $Zhh,{$Zll-$Zhh}        @ load Htbl[nlo]
+       add     $Thh,$Htbl,$nhi
+       ldrb    $nlo,[$inp,#14]
+
+       and     $nhi,$Zll,#0xf          @ rem
+       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
+       add     $nhi,$nhi,$nhi
+       eor     $Zll,$Tll,$Zll,lsr#4
+       ldrh    $Tll,[sp,$nhi]          @ rem_4bit[rem]
+       eor     $Zll,$Zll,$Zlh,lsl#28
+       ldrb    $nhi,[$Xi,#14]
+       eor     $Zlh,$Tlh,$Zlh,lsr#4
+       eor     $Zlh,$Zlh,$Zhl,lsl#28
+       eor     $Zhl,$Thl,$Zhl,lsr#4
+       eor     $Zhl,$Zhl,$Zhh,lsl#28
+       eor     $Zhh,$Thh,$Zhh,lsr#4
+       eor     $nlo,$nlo,$nhi
+       and     $nhi,$nlo,#0xf0
+       and     $nlo,$nlo,#0x0f
+       eor     $Zhh,$Zhh,$Tll,lsl#16
+
+.Linner:
+       add     $Thh,$Htbl,$nlo,lsl#4
+       and     $nlo,$Zll,#0xf          @ rem
+       subs    $cnt,$cnt,#1
+       add     $nlo,$nlo,$nlo
+       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
+       eor     $Zll,$Tll,$Zll,lsr#4
+       eor     $Zll,$Zll,$Zlh,lsl#28
+       eor     $Zlh,$Tlh,$Zlh,lsr#4
+       eor     $Zlh,$Zlh,$Zhl,lsl#28
+       ldrh    $Tll,[sp,$nlo]          @ rem_4bit[rem]
+       eor     $Zhl,$Thl,$Zhl,lsr#4
+       ldrplb  $nlo,[$inp,$cnt]
+       eor     $Zhl,$Zhl,$Zhh,lsl#28
+       eor     $Zhh,$Thh,$Zhh,lsr#4
+
+       add     $Thh,$Htbl,$nhi
+       and     $nhi,$Zll,#0xf          @ rem
+       eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
+       add     $nhi,$nhi,$nhi
+       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
+       eor     $Zll,$Tll,$Zll,lsr#4
+       ldrplb  $Tll,[$Xi,$cnt]
+       eor     $Zll,$Zll,$Zlh,lsl#28
+       eor     $Zlh,$Tlh,$Zlh,lsr#4
+       ldrh    $Tlh,[sp,$nhi]
+       eor     $Zlh,$Zlh,$Zhl,lsl#28
+       eor     $Zhl,$Thl,$Zhl,lsr#4
+       eor     $Zhl,$Zhl,$Zhh,lsl#28
+       eorpl   $nlo,$nlo,$Tll
+       eor     $Zhh,$Thh,$Zhh,lsr#4
+       andpl   $nhi,$nlo,#0xf0
+       andpl   $nlo,$nlo,#0x0f
+       eor     $Zhh,$Zhh,$Tlh,lsl#16   @ ^= rem_4bit[rem]
+       bpl     .Linner
+
+       ldr     $len,[sp,#32]           @ re-load $len/end
+       add     $inp,$inp,#16
+       mov     $nhi,$Zll
+___
+       &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+$code.=<<___;
+       bne     .Louter
+
+       add     sp,sp,#36
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r11,pc}
+#else
+       ldmia   sp!,{r4-r11,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global        gcm_gmult_4bit
+.type  gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+       stmdb   sp!,{r4-r11,lr}
+       ldrb    $nlo,[$Xi,#15]
+       b       rem_4bit_get
+.Lrem_4bit_got:
+       and     $nhi,$nlo,#0xf0
+       and     $nlo,$nlo,#0x0f
+       mov     $cnt,#14
+
+       add     $Zhh,$Htbl,$nlo,lsl#4
+       ldmia   $Zhh,{$Zll-$Zhh}        @ load Htbl[nlo]
+       ldrb    $nlo,[$Xi,#14]
+
+       add     $Thh,$Htbl,$nhi
+       and     $nhi,$Zll,#0xf          @ rem
+       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
+       add     $nhi,$nhi,$nhi
+       eor     $Zll,$Tll,$Zll,lsr#4
+       ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
+       eor     $Zll,$Zll,$Zlh,lsl#28
+       eor     $Zlh,$Tlh,$Zlh,lsr#4
+       eor     $Zlh,$Zlh,$Zhl,lsl#28
+       eor     $Zhl,$Thl,$Zhl,lsr#4
+       eor     $Zhl,$Zhl,$Zhh,lsl#28
+       eor     $Zhh,$Thh,$Zhh,lsr#4
+       and     $nhi,$nlo,#0xf0
+       eor     $Zhh,$Zhh,$Tll,lsl#16
+       and     $nlo,$nlo,#0x0f
+
+.Loop:
+       add     $Thh,$Htbl,$nlo,lsl#4
+       and     $nlo,$Zll,#0xf          @ rem
+       subs    $cnt,$cnt,#1
+       add     $nlo,$nlo,$nlo
+       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
+       eor     $Zll,$Tll,$Zll,lsr#4
+       eor     $Zll,$Zll,$Zlh,lsl#28
+       eor     $Zlh,$Tlh,$Zlh,lsr#4
+       eor     $Zlh,$Zlh,$Zhl,lsl#28
+       ldrh    $Tll,[$rem_4bit,$nlo]   @ rem_4bit[rem]
+       eor     $Zhl,$Thl,$Zhl,lsr#4
+       ldrplb  $nlo,[$Xi,$cnt]
+       eor     $Zhl,$Zhl,$Zhh,lsl#28
+       eor     $Zhh,$Thh,$Zhh,lsr#4
+
+       add     $Thh,$Htbl,$nhi
+       and     $nhi,$Zll,#0xf          @ rem
+       eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
+       add     $nhi,$nhi,$nhi
+       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
+       eor     $Zll,$Tll,$Zll,lsr#4
+       eor     $Zll,$Zll,$Zlh,lsl#28
+       eor     $Zlh,$Tlh,$Zlh,lsr#4
+       ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
+       eor     $Zlh,$Zlh,$Zhl,lsl#28
+       eor     $Zhl,$Thl,$Zhl,lsr#4
+       eor     $Zhl,$Zhl,$Zhh,lsl#28
+       eor     $Zhh,$Thh,$Zhh,lsr#4
+       andpl   $nhi,$nlo,#0xf0
+       andpl   $nlo,$nlo,#0x0f
+       eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
+       bpl     .Loop
+___
+       &Zsmash();
+$code.=<<___;
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r11,pc}
+#else
+       ldmia   sp!,{r4-r11,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+{
+my $cnt=$Htbl; # $Htbl is used once in the very beginning
+
+my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
+my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
+
+# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
+# in Zo. Or should I say "top bit", because GHASH is specified in
+# reverse bit order? Otherwise straightforward 128-bit H by one input
+# byte multiplication and modulo-reduction, times 16.
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu   neon
+
+.global        gcm_gmult_neon
+.type  gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+       sub             $Htbl,#16               @ point at H in GCM128_CTX
+       vld1.64         `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+       vmov.i32        $mod,#0xe1              @ our irreducible polynomial
+       vld1.64         `&Dlo("$IN")`,[$Xi,:64]!
+       vshr.u64        $mod,#32
+       vldmia          $Htbl,{$Hhi-$Hlo}       @ load H
+       veor            $zero,$zero
+#ifdef __ARMEL__
+       vrev64.8        $IN,$IN
+#endif
+       veor            $Qpost,$Qpost
+       veor            $R,$R
+       mov             $cnt,#16
+       veor            $Z,$Z
+       mov             $len,#16
+       veor            $Zo,$Zo
+       vdup.8          $xi,`&Dlo("$IN")`[0]    @ broadcast lowest byte
+       b               .Linner_neon
+.size  gcm_gmult_neon,.-gcm_gmult_neon
+
+.global        gcm_ghash_neon
+.type  gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+       vld1.64         `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
+       vmov.i32        $mod,#0xe1              @ our irreducible polynomial
+       vld1.64         `&Dlo("$Z")`,[$Xi,:64]!
+       vshr.u64        $mod,#32
+       vldmia          $Xi,{$Hhi-$Hlo}         @ load H
+       veor            $zero,$zero
+       nop
+#ifdef __ARMEL__
+       vrev64.8        $Z,$Z
+#endif
+.Louter_neon:
+       vld1.64         `&Dhi($IN)`,[$inp]!     @ load inp
+       veor            $Qpost,$Qpost
+       vld1.64         `&Dlo($IN)`,[$inp]!
+       veor            $R,$R
+       mov             $cnt,#16
+#ifdef __ARMEL__
+       vrev64.8        $IN,$IN
+#endif
+       veor            $Zo,$Zo
+       veor            $IN,$Z                  @ inp^=Xi
+       veor            $Z,$Z
+       vdup.8          $xi,`&Dlo("$IN")`[0]    @ broadcast lowest byte
+.Linner_neon:
+       subs            $cnt,$cnt,#1
+       vmull.p8        $Qlo,$Hlo,$xi           @ H.lo·Xi[i]
+       vmull.p8        $Qhi,$Hhi,$xi           @ H.hi·Xi[i]
+       vext.8          $IN,$zero,#1            @ IN>>=8
+
+       veor            $Z,$Qpost               @ modulo-scheduled part
+       vshl.i64        `&Dlo("$R")`,#48
+       vdup.8          $xi,`&Dlo("$IN")`[0]    @ broadcast lowest byte
+       veor            $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
+
+       veor            `&Dhi("$Z")`,`&Dlo("$R")`
+       vuzp.8          $Qlo,$Qhi
+       vsli.8          $Zo,$T,#1               @ compose the "carry" byte
+       vext.8          $Z,$zero,#1             @ Z>>=8
+
+       vmull.p8        $R,$Zo,$mod             @ "carry"·0xe1
+       vshr.u8         $Zo,$T,#7               @ save Z's bottom bit
+       vext.8          $Qpost,$Qlo,$zero,#1    @ Qlo>>=8
+       veor            $Z,$Qhi
+       bne             .Linner_neon
+
+       veor            $Z,$Qpost               @ modulo-scheduled artefact
+       vshl.i64        `&Dlo("$R")`,#48
+       veor            `&Dhi("$Z")`,`&Dlo("$R")`
+
+       @ finalization, normalize Z:Zo
+       vand            $Zo,$mod                @ suffices to mask the bit
+       vshr.u64        `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
+       vshl.i64        $Z,#1
+       subs            $len,#16
+       vorr            $Z,`&Q("$Zo")`          @ Z=Z:Zo<<1
+       bne             .Louter_neon
+
+#ifdef __ARMEL__
+       vrev64.8        $Z,$Z
+#endif
+       sub             $Xi,#16 
+       vst1.64         `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
+       vst1.64         `&Dlo("$Z")`,[$Xi,:64]
+
+       bx      lr
+.size  gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush
diff --git a/crypto/modes/asm/ghash-ia64.pl b/crypto/modes/asm/ghash-ia64.pl
new file mode 100755 (executable)
index 0000000..0354c95
--- /dev/null
@@ -0,0 +1,463 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
+# GHASH performance was measured to be 6.67 cycles per processed byte
+# on Itanium 2, which is >90% better than Microsoft compiler generated
+# code. To anchor to something else sha1-ia64.pl module processes one
+# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
+# byte.
+
+# September 2010
+#
+# It was originally thought that it makes lesser sense to implement
+# "528B" variant on Itanium 2 for following reason. Because number of
+# functional units is naturally limited, it appeared impossible to
+# implement "528B" loop in 4 cycles, only in 5. This would mean that
+# theoretically performance improvement couldn't be more than 20%.
+# But occasionally you prove yourself wrong:-) I figured out a way to
+# fold couple of instructions and having freed yet another instruction
+# slot by unrolling the loop... Resulting performance is 4.45 cycles
+# per processed byte and 50% better than "256B" version. On original
+# Itanium performance should remain the same as the "256B" version,
+# i.e. ~8.5 cycles.
+
+# The first argument, when present, names the output file and STDOUT is
+# redirected to it; remaining arguments are scanned as compiler flags.
+$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+
+# $ADDP is the instruction used below to form addresses from the pointer
+# arguments. On HP-UX it is addp4 unless +DD64 or -mlp64 is among the flags.
+# The two spellings are an alternation group; written as a bracketed class
+# they would match any single listed character followed by "64".
+if ($^O eq "hpux") {
+    $ADDP="addp4";
+    for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }
+} else { $ADDP="add"; }
+# -DB_ENDIAN / -DL_ENDIAN select the byte order explicitly; without either
+# flag the byte order of the machine running this script is used.
+for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
+                $big_endian=0 if (/\-DL_ENDIAN/);  }
+if (!defined($big_endian))
+             {  $big_endian=(unpack('L',pack('N',1))==1);  }
+
+# loop(LABEL, MASK_INP) appends the inner loop to $code under the label
+# LABEL. The loop body is predicated on p16..p19 and closed by br.ctop.
+# When MASK_INP is true, the two instructions that reference inp/in[] are
+# predicated on p63 instead of p16/p17.
+# NOTE(review): the empty prototype "()" has no effect on the caller visible
+# in this file, which uses the &loop(...) form.
+sub loop() {
+my $label=shift;
+my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
+
+# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
+# in scalable manner;-) Naturally assuming data in L1 cache...
+# Special note about 'dep' instruction, which is used to construct
+# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
+# bytes boundary and lower 7 bits of its address are guaranteed to
+# be zero.
+$code.=<<___;
+$label:
+{ .mfi;        (p18)   ld8     Hlo=[Hi[1]],-8
+       (p19)   dep     rem=Zlo,rem_4bitp,3,4   }
+{ .mfi;        (p19)   xor     Zhi=Zhi,Hhi
+       ($p17)  xor     xi[1]=xi[1],in[1]       };;
+{ .mfi;        (p18)   ld8     Hhi=[Hi[1]]
+       (p19)   shrp    Zlo=Zhi,Zlo,4           }
+{ .mfi;        (p19)   ld8     rem=[rem]
+       (p18)   and     Hi[1]=mask0xf0,xi[2]    };;
+{ .mmi;        ($p16)  ld1     in[0]=[inp],-1
+       (p18)   xor     Zlo=Zlo,Hlo
+       (p19)   shr.u   Zhi=Zhi,4               }
+{ .mib;        (p19)   xor     Hhi=Hhi,rem
+       (p18)   add     Hi[1]=Htbl,Hi[1]        };;
+
+{ .mfi;        (p18)   ld8     Hlo=[Hi[1]],-8
+       (p18)   dep     rem=Zlo,rem_4bitp,3,4   }
+{ .mfi;        (p17)   shladd  Hi[0]=xi[1],4,r0
+       (p18)   xor     Zhi=Zhi,Hhi             };;
+{ .mfi;        (p18)   ld8     Hhi=[Hi[1]]
+       (p18)   shrp    Zlo=Zhi,Zlo,4           }
+{ .mfi;        (p18)   ld8     rem=[rem]
+       (p17)   and     Hi[0]=mask0xf0,Hi[0]    };;
+{ .mmi;        (p16)   ld1     xi[0]=[Xi],-1
+       (p18)   xor     Zlo=Zlo,Hlo
+       (p18)   shr.u   Zhi=Zhi,4               }
+{ .mib;        (p18)   xor     Hhi=Hhi,rem
+       (p17)   add     Hi[0]=Htbl,Hi[0]
+       br.ctop.sptk    $label                  };;
+___
+}
+
+# gcm_gmult_4bit: $code is started here with the register aliases, the
+# prologue (saving ar.pfs, ar.lc and pr) and the setup that runs before the
+# loop: loading Xi[15] and Xi[14], zeroing Zlo/Zhi and turning the ip value
+# captured in rem_4bitp into the address of rem_4bit.
+$code=<<___;
+.explicit
+.text
+
+prevfs=r2;     prevlc=r3;      prevpr=r8;
+mask0xf0=r21;
+rem=r22;       rem_4bitp=r23;
+Xi=r24;                Htbl=r25;
+inp=r26;       end=r27;
+Hhi=r28;       Hlo=r29;
+Zhi=r30;       Zlo=r31;
+
+.align 128
+.skip  16                                      // aligns loop body
+.global        gcm_gmult_4bit#
+.proc  gcm_gmult_4bit#
+gcm_gmult_4bit:
+       .prologue
+{ .mmi;        .save   ar.pfs,prevfs
+       alloc   prevfs=ar.pfs,2,6,0,8
+       $ADDP   Xi=15,in0                       // &Xi[15]
+       mov     rem_4bitp=ip            }
+{ .mii;        $ADDP   Htbl=8,in1                      // &Htbl[0].lo
+       .save   ar.lc,prevlc
+       mov     prevlc=ar.lc
+       .save   pr,prevpr
+       mov     prevpr=pr               };;
+
+       .body
+       .rotr   in[3],xi[3],Hi[2]
+
+{ .mib;        ld1     xi[2]=[Xi],-1                   // Xi[15]
+       mov     mask0xf0=0xf0
+       brp.loop.imp    .Loop1,.Lend1-16};;
+{ .mmi;        ld1     xi[1]=[Xi],-1                   // Xi[14]
+                                       };;
+{ .mii;        shladd  Hi[1]=xi[2],4,r0
+       mov     pr.rot=0x7<<16
+       mov     ar.lc=13                };;
+{ .mii;        and     Hi[1]=mask0xf0,Hi[1]
+       mov     ar.ec=3
+       xor     Zlo=Zlo,Zlo             };;
+{ .mii;        add     Hi[1]=Htbl,Hi[1]                // &Htbl[nlo].lo
+       add     rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
+       xor     Zhi=Zhi,Zhi             };;
+___
+       # Second argument 1: the loop instructions that reference inp/in[]
+       # are predicated on p63 rather than p16/p17.
+       &loop   (".Loop1",1);
+# Tail after the loop: final xor, mux1 \@rev on both halves, store Zlo/Zhi
+# back through Xi, restore pr and ar.lc, return.
+$code.=<<___;
+.Lend1:
+{ .mib;        xor     Zhi=Zhi,Hhi             };;     // modulo-scheduling artefact
+{ .mib;        mux1    Zlo=Zlo,\@rev           };;
+{ .mib;        mux1    Zhi=Zhi,\@rev           };;
+{ .mmi;        add     Hlo=9,Xi;;                      // ;; is here to prevent
+       add     Hhi=1,Xi                };;     // pipeline flush on Itanium
+{ .mib;        st8     [Hlo]=Zlo
+       mov     pr=prevpr,0x1ffff       };;
+{ .mib;        st8     [Hhi]=Zhi
+       mov     ar.lc=prevlc
+       br.ret.sptk.many        b0      };;
+.endp  gcm_gmult_4bit#
+___
+
+######################################################################
+# "528B" (well, "512B" actually) streamed GHASH
+#
+# Symbolic names for in0..in3 and loc0..loc1 as used by the gcm_ghash_4bit
+# code that follows.
+($Xip,$Htbl,$inp,$len)=("in0","in1","in2","in3");
+($rem_8bit,$mask0xff)=("loc0","loc1");
+# $sum/$rum are the mnemonics emitted for the "go big-endian" and "return
+# to little-endian" steps; a big-endian build emits nop.m for both.
+if ($big_endian)       { $sum="nop.m"; $rum="nop.m"; }
+else                   { $sum="sum";   $rum="rum";   }
+
+# load_htable(LIST) appends eight bundle pairs to $code. They load
+# Htable[0..7] into r16..r31 through r8/r9 and Htable[8..15] into f32..f47
+# through r10/r11. Before each pair is closed, one string is taken from
+# the front of LIST whenever the number of strings left equals the number
+# of iterations left, so a caller passing eight strings gets one extra
+# instruction slotted into every iteration.
+sub load_htable() {
+    foreach my $i (0..7) {
+       $code.=<<___;
+{ .mmi;        ld8     r`16+2*$i+1`=[r8],16            // Htable[$i].hi
+       ld8     r`16+2*$i`=[r9],16      }       // Htable[$i].lo
+{ .mmi;        ldf8    f`32+2*$i+1`=[r10],16           // Htable[`8+$i`].hi
+       ldf8    f`32+2*$i`=[r11],16             // Htable[`8+$i`].lo
+___
+       if (scalar(@_)+$i==8) { $code.=shift; }
+       $code.="\t};;\n";
+    }
+}
+
+# gcm_ghash_4bit prologue: save ar.pfs and sp, capture ip in $rem_8bit and
+# point r8..r11 at $Htbl+0, +8, +128 and +136.
+$code.=<<___;
+prevsp=r3;
+
+.align 32
+.skip  16                                      // aligns loop body
+.global        gcm_ghash_4bit#
+.proc  gcm_ghash_4bit#
+gcm_ghash_4bit:
+       .prologue
+{ .mmi;        .save   ar.pfs,prevfs
+       alloc   prevfs=ar.pfs,4,2,0,0
+       .vframe prevsp
+       mov     prevsp=sp
+       mov     $rem_8bit=ip            };;
+       .body
+{ .mfi;        $ADDP   r8=0+0,$Htbl
+       $ADDP   r9=0+8,$Htbl            }
+{ .mfi;        $ADDP   r10=128+0,$Htbl
+       $ADDP   r11=128+8,$Htbl         };;
+___
+       # Emit the Htable loads; each of the eight strings below is slotted
+       # into one iteration of the load sequence, in this order.
+       &load_htable(
+       "       $ADDP   $Xip=15,$Xip",          # &Xi[15]
+       "       $ADDP   $len=$len,$inp",        # &inp[len]
+       "       $ADDP   $inp=15,$inp",          # &inp[15]
+       "       mov     $mask0xff=0xff",
+       "       add     sp=-512,sp",
+       "       andcm   sp=sp,$mask0xff",       # align stack frame
+       "       add     r14=0,sp",
+       "       add     r15=8,sp");
+# Re-point r8..r11 at sp+256, sp+264, sp+384 and sp+392, and move $len
+# back by 17 bytes from &inp[len].
+$code.=<<___;
+{ .mmi;        $sum    1<<1                            // go big-endian
+       add     r8=256+0,sp
+       add     r9=256+8,sp             }
+{ .mmi;        add     r10=256+128+0,sp
+       add     r11=256+128+8,sp
+       add     $len=-17,$len           };;
+___
+# For i=0..7: store Htable[i] through r8/r9 and Htable[8+i] through
+# r10/r11, then store the register pair for Htable[i], shifted right by 4
+# bits across both halves (shrp/shr.u), through r14/r15.
+for($i=0;$i<8;$i++) {  # generate first half of Hshr4[]
+my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
+$code.=<<___;
+{ .mmi;        st8     [r8]=$rlo,16                    // Htable[$i].lo
+       st8     [r9]=$rhi,16                    // Htable[$i].hi
+       shrp    $rlo=$rhi,$rlo,4        }//;;
+{ .mmi;        stf8    [r10]=f`32+2*$i`,16             // Htable[`8+$i`].lo
+       stf8    [r11]=f`32+2*$i+1`,16           // Htable[`8+$i`].hi
+       shr.u   $rhi=$rhi,4             };;
+{ .mmi;        st8     [r14]=$rlo,16                   // Htable[$i].lo>>4
+       st8     [r15]=$rhi,16           }//;;   // Htable[$i].hi>>4
+___
+}
+# Reload Htable[8] and Htable[9] through r8/r9 and start shifting
+# Htable[8] right by 4.
+$code.=<<___;
+{ .mmi;        ld8     r16=[r8],16                     // Htable[8].lo
+       ld8     r17=[r9],16             };;     // Htable[8].hi
+{ .mmi;        ld8     r18=[r8],16                     // Htable[9].lo
+       ld8     r19=[r9],16             }       // Htable[9].hi
+{ .mmi;        rum     1<<5                            // clear um.mfh
+       shrp    r16=r17,r16,4           };;
+___
+# Six iterations, each loading the entry two ahead, storing the current
+# shifted entry through r14/r15 and starting the shift of the next one.
+for($i=0;$i<6;$i++) {  # generate second half of Hshr4[]
+$code.=<<___;
+{ .mmi;        ld8     r`20+2*$i`=[r8],16              // Htable[`10+$i`].lo
+       ld8     r`20+2*$i+1`=[r9],16            // Htable[`10+$i`].hi
+       shr.u   r`16+2*$i+1`=r`16+2*$i+1`,4     };;
+{ .mmi;        st8     [r14]=r`16+2*$i`,16             // Htable[`8+$i`].lo>>4
+       st8     [r15]=r`16+2*$i+1`,16           // Htable[`8+$i`].hi>>4
+       shrp    r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4    }
+___
+}
+# $i is left at 6 by the loop above and is not reset here: the `...`
+# register and index expressions below are evaluated with that value.
+# This tail also sets $Htbl to sp+256 and adds the rem_8bit offset to the
+# ip value held in $rem_8bit.
+$code.=<<___;
+{ .mmi;        shr.u   r`16+2*$i+1`=r`16+2*$i+1`,4     };;
+{ .mmi;        st8     [r14]=r`16+2*$i`,16             // Htable[`8+$i`].lo>>4
+       st8     [r15]=r`16+2*$i+1`,16           // Htable[`8+$i`].hi>>4
+       shrp    r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4    }
+{ .mmi;        add     $Htbl=256,sp                    // &Htable[0]
+       add     $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
+       shr.u   r`18+2*$i+1`=r`18+2*$i+1`,4     };;
+{ .mmi;        st8     [r14]=r`18+2*$i`                // Htable[`8+$i`].lo>>4
+       st8     [r15]=r`18+2*$i+1`      }       // Htable[`8+$i`].hi>>4
+___
+
+# Register names for the unrolled loop that follows. @xi and @rem hold two
+# names each; the "rotate" steps between stages swap their elements.
+$in   = "r15";
+@xi   = qw(r16 r17);
+@rem  = qw(r18 r19);
+($Alo,$Ahi)   = qw(r20 r21);
+($Blo,$Bhi)   = qw(r22 r23);
+($Zlo,$Zhi)   = qw(r24 r25);
+($Atbl,$Btbl) = qw(r26 r27);
+
+# Main loop of gcm_ghash_4bit, emitted as a sequence of straight-line
+# stages. The "(pNN)" tags in the assembler comments name the stage each
+# instruction belongs to. Between stages the two-element lists @xi and
+# @rem are rotated, so $xi[0]/$xi[1] and $rem[0]/$rem[1] swap registers;
+# the order of the heredocs and rotations therefore matters.
+$code.=<<___;  # (p16)
+{ .mmi;        ld1     $in=[$inp],-1                   //(p16) *inp--
+       ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
+       cmp.eq  p0,p6=r0,r0             };;     //      clear p6
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));        # "rotate" registers
+
+# .LOOP is the re-entry point for every further 16-byte block; the store
+# of $Zhi at its top is predicated on p6.
+$code.=<<___;  # (p16),(p17)
+{ .mmi;        ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
+       xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
+{ .mii;        ld1     $in=[$inp],-1                   //(p16) *inp--
+       dep     $Atbl=$xi[1],$Htbl,4,4          //(p17) &Htable[nlo].lo
+       and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
+.align 32
+.LOOP:
+{ .mmi;
+(p6)   st8     [$Xip]=$Zhi,13
+       xor     $Zlo=$Zlo,$Zlo
+       add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi].lo
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));        # "rotate" registers
+
+$code.=<<___;  # (p16),(p17),(p18)
+{ .mmi;        ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
+       ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+       xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
+{ .mfi;        ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
+       dep     $Atbl=$xi[1],$Htbl,4,4  }       //(p17) &Htable[nlo].lo
+{ .mfi;        shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
+       xor     $Zlo=$Zlo,$Alo          };;     //(p18) Z.lo^=Htable[nlo].lo
+{ .mmi;        ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+       ld1     $in=[$inp],-1           }       //(p16) *inp--
+{ .mmi;        xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
+       mov     $Zhi=$Ahi                       //(p18) Z.hi^=Htable[nlo].hi
+       and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
+{ .mmi;        ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
+       ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
+       shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi;        and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+       add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));        # "rotate" registers
+
+# Full stage containing all four pipeline stages, emitted 13 times
+# (i=1..13) with a register rotation after each copy.
+for ($i=1;$i<14;$i++) {
+# Above and below fragments are derived from this one by removing
+# unsuitable (p??) instructions.
+$code.=<<___;  # (p16),(p17),(p18),(p19)
+{ .mmi;        ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
+       ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+       shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
+{ .mmi;        shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
+       xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
+       xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
+{ .mmi;        ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
+       ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
+       dep     $Atbl=$xi[1],$Htbl,4,4  }       //(p17) &Htable[nlo].lo
+{ .mmi;        shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
+       xor     $Zlo=$Zlo,$Alo                  //(p18) Z.lo^=Htable[nlo].lo
+       xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi;        ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+       ld1     $in=[$inp],-1                   //(p16) *inp--
+       shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
+{ .mmi;        xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
+       xor     $Zhi=$Zhi,$Ahi                  //(p18) Z.hi^=Htable[nlo].hi
+       and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
+{ .mmi;        ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
+       ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
+       shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi;        and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+       xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
+       add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));        # "rotate" registers
+}
+
+# Drain: the following stages drop the (p16), then the (p17), then the
+# (p18) instructions.
+$code.=<<___;  # (p17),(p18),(p19)
+{ .mmi;        ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
+       ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+       shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
+{ .mmi;        shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
+       xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
+       xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
+{ .mmi;        ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
+       ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
+       dep     $Atbl=$xi[1],$Htbl,4,4  };;     //(p17) &Htable[nlo].lo
+{ .mmi;        shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
+       xor     $Zlo=$Zlo,$Alo                  //(p18) Z.lo^=Htable[nlo].lo
+       xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi;        ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+       shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
+{ .mmi;        xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
+       xor     $Zhi=$Zhi,$Ahi                  //(p18) Z.hi^=Htable[nlo].hi
+       and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
+{ .mmi;        ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
+       shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi;        and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+       xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
+       add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));        # "rotate" registers
+
+$code.=<<___;  # (p18),(p19)
+{ .mfi;        ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
+       shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
+{ .mfi;        shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
+       xor     $Zlo=$Zlo,$Blo          };;     //(p19) Z.lo^=Hshr4[nhi].lo
+{ .mfi;        ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
+       xor     $Zlo=$Zlo,$Alo          }       //(p18) Z.lo^=Htable[nlo].lo
+{ .mfi;        ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
+       xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
+{ .mfi;        ld8     $Blo=[$Btbl],8                  //(p18) Htable[nhi].lo,&Htable[nhi].hi
+       shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
+{ .mfi;        shladd  $rem[0]=$Zlo,4,r0               //(p18) Z.lo<<4
+       xor     $Zhi=$Zhi,$Ahi          };;     //(p18) Z.hi^=Htable[nlo].hi
+{ .mfi;        ld8     $Bhi=[$Btbl]                    //(p18) Htable[nhi].hi
+       shrp    $Zlo=$Zhi,$Zlo,4        }       //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
+{ .mfi;        and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+       xor     $Zhi=$Zhi,$rem[1]       };;     //(p19) Z.hi^=rem_8bit[rem]<<48
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));        # "rotate" registers
+
+# Last stage. p6 is set while $inp is below $len; the p6-predicated
+# instructions begin the first stages of the next block before branching
+# back to .LOOP. When p6 is clear, $Zhi is stored, sp is restored from
+# prevsp and the function returns.
+$code.=<<___;  # (p19)
+{ .mmi;        cmp.ltu p6,p0=$inp,$len
+       add     $inp=32,$inp
+       shr.u   $Zhi=$Zhi,4             }       //(p19) Z.hi>>=4
+{ .mmi;        shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
+       xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
+       add     $Xip=9,$Xip             };;     //      &Xi.lo
+{ .mmi;        ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
+(p6)   ld1     $in=[$inp],-1                   //[p16] *inp--
+(p6)   extr.u  $xi[1]=$Zlo,8,8         }       //[p17] Xi[14]
+{ .mmi;        xor     $Zhi=$Zhi,$Bhi                  //(p19) Z.hi^=Hshr4[nhi].hi
+(p6)   and     $xi[0]=$Zlo,$mask0xff   };;     //[p16] Xi[15]
+{ .mmi;        st8     [$Xip]=$Zlo,-8
+(p6)   xor     $xi[0]=$xi[0],$in               //[p17] xi=$xi[i]^inp[i]
+       shl     $rem[1]=$rem[1],48      };;     //(p19) rem_8bit[rem]<<48
+{ .mmi;
+(p6)   ld1     $in=[$inp],-1                   //[p16] *inp--
+       xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
+(p6)   dep     $Atbl=$xi[0],$Htbl,4,4  }       //[p17] &Htable[nlo].lo
+{ .mib;
+(p6)   and     $xi[0]=-16,$xi[0]               //[p17] nhi=xi&0xf0
+(p6)   br.cond.dptk.many       .LOOP   };;
+
+{ .mib;        st8     [$Xip]=$Zhi             };;
+{ .mib;        $rum    1<<1                            // return to little-endian
+       .restore        sp
+       mov     sp=prevsp
+       br.ret.sptk.many        b0      };;
+.endp  gcm_ghash_4bit#
+___
+# Constant tables shared by both functions. rem_4bit holds 16 data8
+# entries (128 bytes) and is aligned at 128 bytes, which the 'dep'-based
+# indexing in the gmult loop relies on. rem_8bit holds 256 two-byte
+# entries (512 bytes) written as data1 pairs. The identification string
+# follows the tables.
+$code.=<<___;
+.align 128
+.type  rem_4bit#,\@object
+rem_4bit:
+        data8  0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+        data8  0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+        data8  0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+        data8  0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.size  rem_4bit#,128
+.type  rem_8bit#,\@object
+rem_8bit:
+       data1   0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
+       data1   0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
+       data1   0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
+       data1   0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
+       data1   0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
+       data1   0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
+       data1   0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
+       data1   0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
+       data1   0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
+       data1   0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
+       data1   0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
+       data1   0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
+       data1   0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
+       data1   0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
+       data1   0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
+       data1   0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
+       data1   0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
+       data1   0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
+       data1   0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
+       data1   0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
+       data1   0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
+       data1   0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
+       data1   0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
+       data1   0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
+       data1   0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
+       data1   0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
+       data1   0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
+       data1   0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
+       data1   0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
+       data1   0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
+       data1   0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
+       data1   0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
+.size  rem_8bit#,512
+stringz        "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Big-endian builds: every "mux1 ... \@rev" in the generated text is
+# replaced by "nop.i 0x0".
+$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
+# Replace every `...` span with the result of evaluating it as Perl.
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+# close flushes the buffered output; if that fails the file on disk is
+# incomplete, so stop with an error instead of returning success.
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/modes/asm/ghash-parisc.pl b/crypto/modes/asm/ghash-parisc.pl
new file mode 100644 (file)
index 0000000..8c7454e
--- /dev/null
@@ -0,0 +1,730 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
+# it processes one byte in 19.6 cycles, which is more than twice as
+# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
+# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
+# processed byte. This is ~2.2x faster than 64-bit code generated by
+# vendor compiler (which used to be very hard to beat:-).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+# Arguments: the flavour (a value containing "64" selects the 64-bit
+# settings below) and, optionally, the output file name.
+$flavour = shift;
+$output = shift;
+# Redirect STDOUT only when an output file was named, and stop with a
+# message if it cannot be opened rather than carrying on silently.
+$output and (open STDOUT,">$output" or die "can't open $output: $!");
+
+# ABI parameters: a flavour containing "64" selects the 2.0W settings with
+# 8-byte registers, anything else the 1.0 settings with 4-byte registers.
+($LEVEL,$SIZE_T,$FRAME_MARKER,$SAVED_RP,$NREGS) =
+       ($flavour =~ /64/) ? ("2.0W",8,80,16,6) : ("1.0",4,48,20,11);
+# Store/load mnemonics matching the register width chosen above.
+($PUSH,$PUSHMA,$POP,$POPMB) =
+       ($SIZE_T==8)    ? ("std","std,ma","ldd","ldd,mb")
+                       : ("stw","stwm","ldw","ldwm");
+
+# Frame size: ten $SIZE_T-sized save slots plus the frame marker.
+$FRAME=$FRAME_MARKER+10*$SIZE_T;
+
+################# volatile registers
+# argument block
+($Xi,$Htbl,$inp,$len)=("%r26","%r25","%r24","%r23");
+# variables; $Hhh uses the same register as $Htbl
+$Hhh=$Htbl;
+($Hll,$Zhh,$Zll,$cnt)=("%r22","%r21","%r20","%r19");
+($rem_4bit,$rem,$mask0xf0)=("%r28","%r29","%r31");
+
+################# preserved registers
+($Thh,$Tll,$nlo,$nhi,$byte)=("%r1","%r2","%r3","%r4","%r5");
+# additional names, assigned only when registers are 4 bytes wide
+($Zhl,$Zlh,$Hhl,$Hlh,$Thl,$Tlh)=("%r6","%r7","%r8","%r9","%r10","%r11")
+       if ($SIZE_T==4);
+$rem2="%r6";   # used in PA-RISC 2.0 code
+
+# gcm_gmult_4bit entry: assembler directives and the standard prologue,
+# which stores %r2 and pushes %r3..%r6 into the new frame.
+$code.=<<___;
+       .LEVEL  $LEVEL
+       .SPACE  \$TEXT\$
+       .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+       .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
+       .ALIGN  64
+gcm_gmult_4bit
+       .PROC
+       .CALLINFO       FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
+       .ENTRY
+       $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
+       $PUSHMA %r3,$FRAME(%sp)
+       $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
+       $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
+       $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+# The 4-byte flavour additionally saves %r7..%r11.
+$code.=<<___ if ($SIZE_T==4);
+       $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
+       $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
+       $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
+       $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
+       $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+# Form the address of L$rem_4bit in $rem_4bit: blr writes a code address,
+# its low two bits are cleared (andcm with 3) and the offset
+# L$rem_4bit-L$pic_gmult is added. Also loads the 0xf0 nibble mask.
+$code.=<<___;
+       blr     %r0,$rem_4bit
+       ldi     3,$rem
+L\$pic_gmult
+       andcm   $rem_4bit,$rem,$rem_4bit
+       addl    $inp,$len,$len
+       ldo     L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
+       ldi     0xf0,$mask0xf0
+___
+# 4-byte flavour only: run-time selection of the L$parisc1_gmult path.
+# NOTE(review): going by the inline remark, the extrd,u,*= presumably
+# skips the following branch on PA-RISC 2.0 hardware -- confirm against
+# the architecture manual.
+$code.=<<___ if ($SIZE_T==4);
+       ldi     31,$rem
+       mtctl   $rem,%cr11
+       extrd,u,*= $rem,%sar,1,$rem     ; executes on PA-RISC 1.0
+       b       L\$parisc1_gmult
+       nop
+___
+\f
+# gcm_gmult_4bit body built from doubleword instructions (ldd, depd,
+# shrpd, extrd, std). Z is kept in $Zhh:$Zll; each step shifts it right by
+# 4 bits, xors in a table entry selected by a nibble of Xi and folds in a
+# rem_4bit entry indexed by the bits shifted out. The loop at
+# L$oop_gmult_pa2 counts $cnt down from 13 and the result is stored back
+# to Xi.
+$code.=<<___;
+       ldb     15($Xi),$nlo
+       ldo     8($Htbl),$Hll
+
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+
+       ldd     $nlo($Hll),$Zll
+       ldd     $nlo($Hhh),$Zhh
+
+       depd,z  $Zll,60,4,$rem
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldb     14($Xi),$nlo
+
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+       b       L\$oop_gmult_pa2
+       ldi     13,$cnt
+
+       .ALIGN  8
+L\$oop_gmult_pa2
+       xor     $rem,$Zhh,$Zhh          ; moved here to work around gas bug
+       depd,z  $Zll,60,4,$rem
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nlo($Hll),$Tll
+       ldd     $nlo($Hhh),$Thh
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+
+       xor     $rem,$Zhh,$Zhh
+       depd,z  $Zll,60,4,$rem
+       ldbx    $cnt($Xi),$nlo
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+       ldd     $rem($rem_4bit),$rem
+
+       xor     $Tll,$Zll,$Zll
+       addib,uv -1,$cnt,L\$oop_gmult_pa2
+       xor     $Thh,$Zhh,$Zhh
+
+       xor     $rem,$Zhh,$Zhh
+       depd,z  $Zll,60,4,$rem
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nlo($Hll),$Tll
+       ldd     $nlo($Hhh),$Thh
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+
+       xor     $rem,$Zhh,$Zhh
+       depd,z  $Zll,60,4,$rem
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+
+       xor     $rem,$Zhh,$Zhh
+       std     $Zll,8($Xi)
+       std     $Zhh,0($Xi)
+___
+\f
+# 4-byte flavour only: branch over the alternative path to L$done_gmult,
+# then the L$parisc1_gmult path itself. It performs the same computation
+# with word instructions (ldwx, zdep, shrpw, extru, stw), keeping Z in
+# four registers ($Zhh,$Zhl,$Zlh,$Zll) and addressing the table through
+# four base registers ($Hhh,$Hhl,$Hlh,$Hll).
+$code.=<<___ if ($SIZE_T==4);
+       b       L\$done_gmult
+       nop
+
+L\$parisc1_gmult
+       ldb     15($Xi),$nlo
+       ldo     12($Htbl),$Hll
+       ldo     8($Htbl),$Hlh
+       ldo     4($Htbl),$Hhl
+
+       and     $mask0xf0,$nlo,$nhi
+       zdep    $nlo,27,4,$nlo
+
+       ldwx    $nlo($Hll),$Zll
+       ldwx    $nlo($Hlh),$Zlh
+       ldwx    $nlo($Hhl),$Zhl
+       ldwx    $nlo($Hhh),$Zhh
+       zdep    $Zll,28,4,$rem
+       ldb     14($Xi),$nlo
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       extru   $Zhh,27,28,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       xor     $rem,$Zhh,$Zhh
+       and     $mask0xf0,$nlo,$nhi
+       zdep    $nlo,27,4,$nlo
+
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nlo($Hll),$Tll
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nlo($Hlh),$Tlh
+       xor     $Thl,$Zhl,$Zhl
+       b       L\$oop_gmult_pa1
+       ldi     13,$cnt
+
+       .ALIGN  8
+L\$oop_gmult_pa1
+       zdep    $Zll,28,4,$rem
+       ldwx    $nlo($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nlo($Hhh),$Thh
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       ldbx    $cnt($Xi),$nlo
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       extru   $Zhh,27,28,$Zhh
+       xor     $Thl,$Zhl,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       xor     $rem,$Zhh,$Zhh
+       zdep    $Zll,28,4,$rem
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       and     $mask0xf0,$nlo,$nhi
+       extru   $Zhh,27,28,$Zhh
+       zdep    $nlo,27,4,$nlo
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nlo($Hll),$Tll
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nlo($Hlh),$Tlh
+       xor     $rem,$Zhh,$Zhh
+       addib,uv -1,$cnt,L\$oop_gmult_pa1
+       xor     $Thl,$Zhl,$Zhl
+
+       zdep    $Zll,28,4,$rem
+       ldwx    $nlo($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nlo($Hhh),$Thh
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       extru   $Zhh,27,28,$Zhh
+       xor     $rem,$Zhh,$Zhh
+       xor     $Thl,$Zhl,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       zdep    $Zll,28,4,$rem
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       extru   $Zhh,27,28,$Zhh
+       xor     $Tll,$Zll,$Zll
+       xor     $Tlh,$Zlh,$Zlh
+       xor     $rem,$Zhh,$Zhh
+       stw     $Zll,12($Xi)
+       xor     $Thl,$Zhl,$Zhl
+       stw     $Zlh,8($Xi)
+       xor     $Thh,$Zhh,$Zhh
+       stw     $Zhl,4($Xi)
+       stw     $Zhh,0($Xi)
+___
+# Common epilogue: reload %r2 and %r4..%r6 from the frame.
+$code.=<<___;
+L\$done_gmult
+       $POP    `-$FRAME-$SAVED_RP`(%sp),%r2            ; standard epilogue
+       $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
+       $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
+       $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+# The 4-byte flavour also reloads %r7..%r11.
+$code.=<<___ if ($SIZE_T==4);
+       $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
+       $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
+       $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
+       $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
+       $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+       bv      (%r2)
+       .EXIT
+       $POPMB  -$FRAME(%sp),%r3
+       .PROCEND
+
+       .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+       .ALIGN  64
+gcm_ghash_4bit
+       .PROC
+       .CALLINFO       FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
+       .ENTRY
+       $PUSH   %r2,-$SAVED_RP(%sp)     ; standard prologue
+       $PUSHMA %r3,$FRAME(%sp)
+       $PUSH   %r4,`-$FRAME+1*$SIZE_T`(%sp)
+       $PUSH   %r5,`-$FRAME+2*$SIZE_T`(%sp)
+       $PUSH   %r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+       $PUSH   %r7,`-$FRAME+4*$SIZE_T`(%sp)
+       $PUSH   %r8,`-$FRAME+5*$SIZE_T`(%sp)
+       $PUSH   %r9,`-$FRAME+6*$SIZE_T`(%sp)
+       $PUSH   %r10,`-$FRAME+7*$SIZE_T`(%sp)
+       $PUSH   %r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+       blr     %r0,$rem_4bit
+       ldi     3,$rem
+L\$pic_ghash
+       andcm   $rem_4bit,$rem,$rem_4bit
+       addl    $inp,$len,$len
+       ldo     L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
+       ldi     0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+       ldi     31,$rem
+       mtctl   $rem,%cr11
+       extrd,u,*= $rem,%sar,1,$rem     ; executes on PA-RISC 1.0
+       b       L\$parisc1_ghash
+       nop
+___
+\f\f
+$code.=<<___;
+       ldb     15($Xi),$nlo
+       ldo     8($Htbl),$Hll
+
+L\$outer_ghash_pa2
+       ldb     15($inp),$nhi
+       xor     $nhi,$nlo,$nlo
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+
+       ldd     $nlo($Hll),$Zll
+       ldd     $nlo($Hhh),$Zhh
+
+       depd,z  $Zll,60,4,$rem
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldb     14($Xi),$nlo
+       ldb     14($inp),$byte
+
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+       xor     $byte,$nlo,$nlo
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+       b       L\$oop_ghash_pa2
+       ldi     13,$cnt
+
+       .ALIGN  8
+L\$oop_ghash_pa2
+       xor     $rem,$Zhh,$Zhh          ; moved here to work around gas bug
+       depd,z  $Zll,60,4,$rem2
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nlo($Hll),$Tll
+       ldd     $nlo($Hhh),$Thh
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldbx    $cnt($Xi),$nlo
+       ldbx    $cnt($inp),$byte
+
+       depd,z  $Zll,60,4,$rem
+       shrpd   $Zhh,$Zll,4,$Zll
+       ldd     $rem2($rem_4bit),$rem2
+
+       xor     $rem2,$Zhh,$Zhh
+       xor     $byte,$nlo,$nlo
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+
+       and     $mask0xf0,$nlo,$nhi
+       depd,z  $nlo,59,4,$nlo
+
+       extrd,u $Zhh,59,60,$Zhh
+       xor     $Tll,$Zll,$Zll
+
+       ldd     $rem($rem_4bit),$rem
+       addib,uv -1,$cnt,L\$oop_ghash_pa2
+       xor     $Thh,$Zhh,$Zhh
+
+       xor     $rem,$Zhh,$Zhh
+       depd,z  $Zll,60,4,$rem2
+
+       shrpd   $Zhh,$Zll,4,$Zll
+       extrd,u $Zhh,59,60,$Zhh
+       ldd     $nlo($Hll),$Tll
+       ldd     $nlo($Hhh),$Thh
+
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+
+       depd,z  $Zll,60,4,$rem
+       shrpd   $Zhh,$Zll,4,$Zll
+       ldd     $rem2($rem_4bit),$rem2
+
+       xor     $rem2,$Zhh,$Zhh
+       ldd     $nhi($Hll),$Tll
+       ldd     $nhi($Hhh),$Thh
+
+       extrd,u $Zhh,59,60,$Zhh
+       xor     $Tll,$Zll,$Zll
+       xor     $Thh,$Zhh,$Zhh
+       ldd     $rem($rem_4bit),$rem
+
+       xor     $rem,$Zhh,$Zhh
+       std     $Zll,8($Xi)
+       ldo     16($inp),$inp
+       std     $Zhh,0($Xi)
+       cmpb,*<> $inp,$len,L\$outer_ghash_pa2
+       copy    $Zll,$nlo
+___
+\f
+$code.=<<___ if ($SIZE_T==4);
+       b       L\$done_ghash
+       nop
+
+L\$parisc1_ghash
+       ldb     15($Xi),$nlo
+       ldo     12($Htbl),$Hll
+       ldo     8($Htbl),$Hlh
+       ldo     4($Htbl),$Hhl
+
+L\$outer_ghash_pa1
+       ldb     15($inp),$byte
+       xor     $byte,$nlo,$nlo
+       and     $mask0xf0,$nlo,$nhi
+       zdep    $nlo,27,4,$nlo
+
+       ldwx    $nlo($Hll),$Zll
+       ldwx    $nlo($Hlh),$Zlh
+       ldwx    $nlo($Hhl),$Zhl
+       ldwx    $nlo($Hhh),$Zhh
+       zdep    $Zll,28,4,$rem
+       ldb     14($Xi),$nlo
+       ldb     14($inp),$byte
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       extru   $Zhh,27,28,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       xor     $byte,$nlo,$nlo
+       xor     $rem,$Zhh,$Zhh
+       and     $mask0xf0,$nlo,$nhi
+       zdep    $nlo,27,4,$nlo
+
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nlo($Hll),$Tll
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nlo($Hlh),$Tlh
+       xor     $Thl,$Zhl,$Zhl
+       b       L\$oop_ghash_pa1
+       ldi     13,$cnt
+
+       .ALIGN  8
+L\$oop_ghash_pa1
+       zdep    $Zll,28,4,$rem
+       ldwx    $nlo($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nlo($Hhh),$Thh
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       ldbx    $cnt($Xi),$nlo
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       ldbx    $cnt($inp),$byte
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       extru   $Zhh,27,28,$Zhh
+       xor     $Thl,$Zhl,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       xor     $rem,$Zhh,$Zhh
+       zdep    $Zll,28,4,$rem
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       xor     $byte,$nlo,$nlo
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       and     $mask0xf0,$nlo,$nhi
+       extru   $Zhh,27,28,$Zhh
+       zdep    $nlo,27,4,$nlo
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nlo($Hll),$Tll
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nlo($Hlh),$Tlh
+       xor     $rem,$Zhh,$Zhh
+       addib,uv -1,$cnt,L\$oop_ghash_pa1
+       xor     $Thl,$Zhl,$Zhl
+
+       zdep    $Zll,28,4,$rem
+       ldwx    $nlo($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       ldwx    $nlo($Hhh),$Thh
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       xor     $Tll,$Zll,$Zll
+       ldwx    $nhi($Hll),$Tll
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       xor     $Tlh,$Zlh,$Zlh
+       ldwx    $nhi($Hlh),$Tlh
+       extru   $Zhh,27,28,$Zhh
+       xor     $rem,$Zhh,$Zhh
+       xor     $Thl,$Zhl,$Zhl
+       ldwx    $nhi($Hhl),$Thl
+       xor     $Thh,$Zhh,$Zhh
+       ldwx    $nhi($Hhh),$Thh
+       zdep    $Zll,28,4,$rem
+       ldwx    $rem($rem_4bit),$rem
+       shrpw   $Zlh,$Zll,4,$Zll
+       shrpw   $Zhl,$Zlh,4,$Zlh
+       shrpw   $Zhh,$Zhl,4,$Zhl
+       extru   $Zhh,27,28,$Zhh
+       xor     $Tll,$Zll,$Zll
+       xor     $Tlh,$Zlh,$Zlh
+       xor     $rem,$Zhh,$Zhh
+       stw     $Zll,12($Xi)
+       xor     $Thl,$Zhl,$Zhl
+       stw     $Zlh,8($Xi)
+       xor     $Thh,$Zhh,$Zhh
+       stw     $Zhl,4($Xi)
+       ldo     16($inp),$inp
+       stw     $Zhh,0($Xi)
+       comb,<> $inp,$len,L\$outer_ghash_pa1
+       copy    $Zll,$nlo
+___
+$code.=<<___;
+L\$done_ghash
+       $POP    `-$FRAME-$SAVED_RP`(%sp),%r2            ; standard epilogue
+       $POP    `-$FRAME+1*$SIZE_T`(%sp),%r4
+       $POP    `-$FRAME+2*$SIZE_T`(%sp),%r5
+       $POP    `-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+       $POP    `-$FRAME+4*$SIZE_T`(%sp),%r7
+       $POP    `-$FRAME+5*$SIZE_T`(%sp),%r8
+       $POP    `-$FRAME+6*$SIZE_T`(%sp),%r9
+       $POP    `-$FRAME+7*$SIZE_T`(%sp),%r10
+       $POP    `-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+# Tail of gcm_ghash_4bit (return through %r2, pop %r3) followed by the
+# L$rem_4bit table: 16 entries of two .WORDs each, with the constant
+# in the upper 16 bits of the first word. Both routines address it through
+# $rem_4bit. The trailing .STRINGZ is the module identification string.
+$code.=<<___;
+       bv      (%r2)
+       .EXIT
+       $POPMB  -$FRAME(%sp),%r3
+       .PROCEND
+
+       .ALIGN  64
+L\$rem_4bit
+       .WORD   `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+       .WORD   `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+       .WORD   `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+       .WORD   `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+       .STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+       .ALIGN  64
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+# Encoder for "ldd". Receives the completer text that directly followed
+# the mnemonic ($mod, possibly empty) and the operand field ($args).
+# Returns either a ".WORD 0x........" line carrying the original
+# instruction as a trailing comment, or the instruction text unchanged
+# when the operands match neither recognised shape.
+my $ldd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "ldd$mod\t$args";
+
+    # indexed form "%rX(%rB),%rT": $2 (base) goes to bit 21, $1 (index) to
+    # bit 16, $3 (target) to the low bits
+    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)                # format 4
+    {  my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    # displacement form "disp(%rB),%rT": low four displacement bits land in
+    # bits 17-20, displacement bit 4 is moved to bit 16
+    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)    # format 5
+    {  my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+       $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);                # encode offset
+       # any ",m..." completer sets bit 5; ",mb" additionally sets bit 13
+       $opcode|=(1<<5)  if ($mod =~ /^,m/);
+       $opcode|=(1<<13) if ($mod =~ /^,mb/);
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+# Encoder for "std %rS,disp(%rB)". Takes the completer text ($mod) and the
+# operand field ($args); returns a ".WORD" line with the original text as
+# a comment, or the instruction unchanged if the operands do not match.
+my $std = sub {
+  my ($mod,$args) = @_;
+  my $orig = "std$mod\t$args";
+
+    # $3 (base) -> bit 21, $1 (source) -> bit 16; displacement bits 3-12 are
+    # shifted up by one and displacement bit 13 is placed in bit 0
+    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+    {  my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+# Encoder for "extrd". Takes the completer text ($mod) and the operand
+# field ($args); returns a ".WORD" line with the original text as a
+# comment, or the instruction unchanged if no operand shape matches.
+my $extrd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "extrd$mod\t$args";
+
+    # I only have ",u" completer, it's implicitly encoded...
+    # immediate-position form "%rS,pos,len,%rT": $1 -> bit 21, $4 -> bit 16
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)    # format 15
+    {  my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+       my $len=32-$3;          # length is stored as 32-len
+       $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);               # encode pos
+       $opcode |= (($len&0x20)<<7)|($len&0x1f);                # encode len
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    # %sar-position form "%rS,%sar,len,%rT": $1 -> bit 21, $3 -> bit 16
+    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)     # format 12
+    {  my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+       my $len=32-$2;
+       $opcode |= (($len&0x20)<<3)|($len&0x1f);                # encode len
+       # a ",=" or ",*=" completer sets bit 13
+       $opcode |= (1<<13) if ($mod =~ /,\**=/);
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+# Encoder for "shrpd". Takes the completer text ($mod) and the operand
+# field ($args); returns a ".WORD" line with the original text as a
+# comment, or the instruction unchanged if no operand shape matches.
+my $shrpd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "shrpd$mod\t$args";
+
+    # immediate shift amount "%r1,%r2,sa,%rT": $2 -> bit 21, $1 -> bit 16,
+    # $4 -> low bits; the amount is stored as 63-sa
+    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)  # format 14
+    {  my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+       my $cpos=63-$3;
+       $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);         # encode sa
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    # shift amount taken from %sar: "%r1,%r2,%sar,%rT"
+    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)   # format 11
+    {  sprintf "\t.WORD\t0x%08x\t; %s",
+               (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+# Encoder for "depd %rS,pos,len,%rT". Takes the completer text ($mod) and
+# the operand field ($args); returns a ".WORD" line with the original text
+# as a comment, or the instruction unchanged if the operands do not match.
+my $depd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "depd$mod\t$args";
+
+    # I only have ",z" completer, it's implicitly encoded...
+    # $4 (target) -> bit 21, $1 (source) -> bit 16
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)    # format 16
+    {  my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
+       my $cpos=63-$2;         # position is stored as 63-pos
+       my $len=32-$3;          # length is stored as 32-len
+       $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);         # encode pos
+       $opcode |= (($len&0x20)<<7)|($len&0x1f);                # encode len
+       sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+# Dispatch one instruction to its hand-written encoder, if any.
+# $mnemonic is looked up as a variable of the same name (e.g. "ldd" ->
+# $ldd); when that variable holds a code reference it is called with the
+# completer and operand text, otherwise the instruction is re-emitted as
+# "\tmnemonic<mod>\targs".
+sub assemble {
+  my ($mnemonic,$mod,$args)=@_;
+  my $encoder = eval("\$$mnemonic");
+
+    return $encoder->($mod,$args) if (ref($encoder) eq 'CODE');
+
+    return "\t$mnemonic$mod\t$args";
+}
+
+# Emit the generated assembly one line at a time.
+foreach (split("\n",$code)) {
+       # replace every `...` span with the value of that Perl expression
+       s/\`([^\`]*)\`/eval $1/ge;
+       if ($SIZE_T==4) {
+               # 4-byte SIZE_T build only: pass each indented instruction
+               # (mnemonic, completer text, operands) through assemble(),
+               # which replaces the ones it has encoders for with .WORDs
+               s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
+               # then rewrite what was left as text: "cmpb,*" becomes
+               # "comb," and any remaining ",*" loses the "*"
+               s/cmpb,\*/comb,/;
+               s/,\*/,/;
+       }
+       print $_,"\n";
+}
+
+close STDOUT;
diff --git a/crypto/modes/asm/ghash-s390x.pl b/crypto/modes/asm/ghash-s390x.pl
new file mode 100644 (file)
index 0000000..48cb08d
--- /dev/null
@@ -0,0 +1,262 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# September 2010.
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Performance
+# was measured to be ~18 cycles per processed byte on z10, which is
+# almost 40% better than gcc-generated code. It should be noted that
+# 18 cycles is worse result than expected: loop is scheduled for 12
+# and the result should be close to 12. In the lack of instruction-
+# level profiling data it's impossible to tell why...
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2.8x better than 32-bit code generated by gcc 4.3.
+
+# March 2011.
+#
+# Support for hardware KIMD-GHASH is verified to produce correct
+# result and therefore is engaged. On z196 it was measured to process
+# 8KB buffer ~7x faster than software implementation. It's not as
+# impressive for smaller buffer sizes and for smallest 16-bytes buffer
+# it's actually almost 2 times slower. Which is the reason why
+# KIMD-GHASH is not used in gcm_gmult_4bit.
+
+# First command-line argument selects the flavour.
+$flavour = shift;
+
+# A flavour containing "31" or "32" selects 4-byte SIZE_T and an empty
+# $g suffix; anything else selects 8-byte SIZE_T and "g". $g is appended
+# to the stm/lm mnemonics below and $SIZE_T scales their stack offsets.
+if ($flavour =~ /3[12]/) {
+       $SIZE_T=4;
+       $g="";
+} else {
+       $SIZE_T=8;
+       $g="g";
+}
+
+# Skip further arguments until one looks like a file name (word
+# characters/dashes, a dot, an extension) and redirect STDOUT to it.
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+# When non-zero, the hardware (KIMD) prologues guarded by !$softonly
+# are left out of the generated code.
+$softonly=0;
+
+$Zhi="%r0";
+$Zlo="%r1";
+
+$Xi="%r2";     # argument block
+$Htbl="%r3";
+$inp="%r4";
+$len="%r5";
+
+$rem0="%r6";   # variables
+$rem1="%r7";
+$nlo="%r8";
+$nhi="%r9";
+$xi="%r10";
+$cnt="%r11";
+$tmp="%r12";
+$x78="%r13";
+$rem_4bit="%r14";
+
+$sp="%r15";
+
+$code.=<<___;
+.text
+
+.globl gcm_gmult_4bit
+.align 32
+gcm_gmult_4bit:
+___
+$code.=<<___ if(!$softonly && 0);      # hardware is slow for single block...
+       larl    %r1,OPENSSL_s390xcap_P
+       lg      %r0,0(%r1)
+       tmhl    %r0,0x4000      # check for message-security-assist
+       jz      .Lsoft_gmult
+       lghi    %r0,0
+       la      %r1,16($sp)
+       .long   0xb93e0004      # kimd %r0,%r4
+       lg      %r1,24($sp)
+       tmhh    %r1,0x4000      # check for function 65
+       jz      .Lsoft_gmult
+       stg     %r0,16($sp)     # arrange 16 bytes of zero input
+       stg     %r0,24($sp)
+       lghi    %r0,65          # function 65
+       la      %r1,0($Xi)      # H lies right after Xi in gcm128_context
+       la      $inp,16($sp)
+       lghi    $len,16
+       .long   0xb93e0004      # kimd %r0,$inp
+       brc     1,.-4           # pay attention to "partial completion"
+       br      %r14
+.align 32
+.Lsoft_gmult:
+___
+$code.=<<___;
+       stm${g} %r6,%r14,6*$SIZE_T($sp)
+
+       aghi    $Xi,-1
+       lghi    $len,1
+       lghi    $x78,`0xf<<3`
+       larl    $rem_4bit,rem_4bit
+
+       lg      $Zlo,8+1($Xi)           # Xi
+       j       .Lgmult_shortcut
+.type  gcm_gmult_4bit,\@function
+.size  gcm_gmult_4bit,(.-gcm_gmult_4bit)
+
+.globl gcm_ghash_4bit
+.align 32
+gcm_ghash_4bit:
+___
+$code.=<<___ if(!$softonly);
+       larl    %r1,OPENSSL_s390xcap_P
+       lg      %r0,0(%r1)
+       tmhl    %r0,0x4000      # check for message-security-assist
+       jz      .Lsoft_ghash
+       lghi    %r0,0
+       la      %r1,16($sp)
+       .long   0xb93e0004      # kimd %r0,%r4
+       lg      %r1,24($sp)
+       tmhh    %r1,0x4000      # check for function 65
+       jz      .Lsoft_ghash
+       lghi    %r0,65          # function 65
+       la      %r1,0($Xi)      # H lies right after Xi in gcm128_context
+       .long   0xb93e0004      # kimd %r0,$inp
+       brc     1,.-4           # pay attention to "partial completion"
+       br      %r14
+.align 32
+.Lsoft_ghash:
+___
+# 31/32-bit flavours only: zero-extend $len to 64 bits (llgfr) before the
+# 64-bit srlg/brctg instructions below operate on it.
+$code.=<<___ if ($flavour =~ /3[12]/);
+       llgfr   $len,$len
+___
+$code.=<<___;
+       stm${g} %r6,%r14,6*$SIZE_T($sp)
+
+       aghi    $Xi,-1
+       srlg    $len,$len,4
+       lghi    $x78,`0xf<<3`
+       larl    $rem_4bit,rem_4bit
+
+       lg      $Zlo,8+1($Xi)           # Xi
+       lg      $Zhi,0+1($Xi)
+       lghi    $tmp,0
+.Louter:
+       xg      $Zhi,0($inp)            # Xi ^= inp 
+       xg      $Zlo,8($inp)
+       xgr     $Zhi,$tmp
+       stg     $Zlo,8+1($Xi)
+       stg     $Zhi,0+1($Xi)
+
+.Lgmult_shortcut:
+       lghi    $tmp,0xf0
+       sllg    $nlo,$Zlo,4
+       srlg    $xi,$Zlo,8              # extract second byte
+       ngr     $nlo,$tmp
+       lgr     $nhi,$Zlo
+       lghi    $cnt,14
+       ngr     $nhi,$tmp
+
+       lg      $Zlo,8($nlo,$Htbl)
+       lg      $Zhi,0($nlo,$Htbl)
+
+       sllg    $nlo,$xi,4
+       sllg    $rem0,$Zlo,3
+       ngr     $nlo,$tmp
+       ngr     $rem0,$x78
+       ngr     $xi,$tmp
+
+       sllg    $tmp,$Zhi,60
+       srlg    $Zlo,$Zlo,4
+       srlg    $Zhi,$Zhi,4
+       xg      $Zlo,8($nhi,$Htbl)
+       xg      $Zhi,0($nhi,$Htbl)
+       lgr     $nhi,$xi
+       sllg    $rem1,$Zlo,3
+       xgr     $Zlo,$tmp
+       ngr     $rem1,$x78
+       j       .Lghash_inner
+.align 16
+.Lghash_inner:
+       srlg    $Zlo,$Zlo,4
+       sllg    $tmp,$Zhi,60
+       xg      $Zlo,8($nlo,$Htbl)
+       srlg    $Zhi,$Zhi,4
+       llgc    $xi,0($cnt,$Xi)
+       xg      $Zhi,0($nlo,$Htbl)
+       sllg    $nlo,$xi,4
+       xg      $Zhi,0($rem0,$rem_4bit)
+       nill    $nlo,0xf0
+       sllg    $rem0,$Zlo,3
+       xgr     $Zlo,$tmp
+       ngr     $rem0,$x78
+       nill    $xi,0xf0
+
+       sllg    $tmp,$Zhi,60
+       srlg    $Zlo,$Zlo,4
+       srlg    $Zhi,$Zhi,4
+       xg      $Zlo,8($nhi,$Htbl)
+       xg      $Zhi,0($nhi,$Htbl)
+       lgr     $nhi,$xi
+       xg      $Zhi,0($rem1,$rem_4bit)
+       sllg    $rem1,$Zlo,3
+       xgr     $Zlo,$tmp
+       ngr     $rem1,$x78
+       brct    $cnt,.Lghash_inner
+
+       sllg    $tmp,$Zhi,60
+       srlg    $Zlo,$Zlo,4
+       srlg    $Zhi,$Zhi,4
+       xg      $Zlo,8($nlo,$Htbl)
+       xg      $Zhi,0($nlo,$Htbl)
+       sllg    $xi,$Zlo,3
+       xg      $Zhi,0($rem0,$rem_4bit)
+       xgr     $Zlo,$tmp
+       ngr     $xi,$x78
+
+       sllg    $tmp,$Zhi,60
+       srlg    $Zlo,$Zlo,4
+       srlg    $Zhi,$Zhi,4
+       xg      $Zlo,8($nhi,$Htbl)
+       xg      $Zhi,0($nhi,$Htbl)
+       xgr     $Zlo,$tmp
+       xg      $Zhi,0($rem1,$rem_4bit)
+
+       lg      $tmp,0($xi,$rem_4bit)
+       la      $inp,16($inp)
+       sllg    $tmp,$tmp,4             # correct last rem_4bit[rem]
+       brctg   $len,.Louter
+
+       xgr     $Zhi,$tmp
+       stg     $Zlo,8+1($Xi)
+       stg     $Zhi,0+1($Xi)
+       lm${g}  %r6,%r14,6*$SIZE_T($sp)
+       br      %r14
+.type  gcm_ghash_4bit,\@function
+.size  gcm_ghash_4bit,(.-gcm_ghash_4bit)
+
+.align 64
+rem_4bit:
+       .long   `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
+       .long   `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
+       .long   `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
+       .long   `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
+.type  rem_4bit,\@object
+.size  rem_4bit,(.-rem_4bit)
+.string        "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Replace every `...` span in the generated text with the value of that
+# Perl expression, then write the result out and close the output.
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/modes/asm/ghash-sparcv9.pl b/crypto/modes/asm/ghash-sparcv9.pl
new file mode 100644 (file)
index 0000000..70e7b04
--- /dev/null
@@ -0,0 +1,330 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Performance
+# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
+# and are expressed in cycles per processed byte, less is better:
+#
+#              gcc 3.3.x       cc 5.2          this assembler
+#
+# 32-bit build 81.4            43.3            12.6    (+546%/+244%)
+# 64-bit build 20.2            21.2            12.6    (+60%/+68%)
+#
+# Here is data collected on UltraSPARC T1 system running Linux:
+#
+#              gcc 4.4.1                       this assembler
+#
+# 32-bit build 566                             50      (+1000%)
+# 64-bit build 56                              50      (+12%)
+#
+# I don't quite understand why difference between 32-bit and 64-bit
+# compiler-generated code is so big. Compilers *were* instructed to
+# generate code for UltraSPARC and should have used 64-bit registers
+# for Z vector (see C code) even in 32-bit build... Oh well, it only
+# means more impressive improvement coefficients for this assembler
+# module;-) Loops are aggressively modulo-scheduled in respect to
+# references to input data and Z.hi updates to achieve 12 cycles
+# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
+# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
+
+# Default to a 32-bit build; any argument containing -m64 or -xarch=v9
+# switches to 64-bit. $frame is the stack frame size given to "save";
+# $bits also picks %xcc vs %icc for the conditional branch below.
+$bits=32;
+for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64)  { $bias=2047; $frame=192; }
+else            { $bias=0;    $frame=112; }
+
+# The first argument is the output file; STDOUT is redirected to it.
+$output=shift;
+open STDOUT,">$output";
+
+$Zhi="%o0";    # 64-bit values
+$Zlo="%o1";
+$Thi="%o2";
+$Tlo="%o3";
+$rem="%o4";
+$tmp="%o5";
+
+$nhi="%l0";    # small values and pointers
+$nlo="%l1";
+$xi0="%l2";
+$xi1="%l3";
+$rem_4bit="%l4";
+$remi="%l5";
+$Htblo="%l6";
+$cnt="%l7";
+
+$Xi="%i0";     # input argument block
+$Htbl="%i1";
+$inp="%i2";
+$len="%i3";
+
+$code.=<<___;
+.section       ".text",#alloc,#execinstr
+
+.align 64
+rem_4bit:
+       .long   `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+       .long   `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+       .long   `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+       .long   `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+.type  rem_4bit,#object
+.size  rem_4bit,(.-rem_4bit)
+
+.globl gcm_ghash_4bit
+.align 32
+gcm_ghash_4bit:
+       save    %sp,-$frame,%sp
+       ldub    [$inp+15],$nlo
+       ldub    [$Xi+15],$xi0
+       ldub    [$Xi+14],$xi1
+       add     $len,$inp,$len
+       add     $Htbl,8,$Htblo
+
+1:     call    .+8
+       add     %o7,rem_4bit-1b,$rem_4bit
+
+.Louter:
+       xor     $xi0,$nlo,$nlo
+       and     $nlo,0xf0,$nhi
+       and     $nlo,0x0f,$nlo
+       sll     $nlo,4,$nlo
+       ldx     [$Htblo+$nlo],$Zlo
+       ldx     [$Htbl+$nlo],$Zhi
+
+       ldub    [$inp+14],$nlo
+
+       ldx     [$Htblo+$nhi],$Tlo
+       and     $Zlo,0xf,$remi
+       ldx     [$Htbl+$nhi],$Thi
+       sll     $remi,3,$remi
+       ldx     [$rem_4bit+$remi],$rem
+       srlx    $Zlo,4,$Zlo
+       mov     13,$cnt
+       sllx    $Zhi,60,$tmp
+       xor     $Tlo,$Zlo,$Zlo
+       srlx    $Zhi,4,$Zhi
+       xor     $Zlo,$tmp,$Zlo
+
+       xor     $xi1,$nlo,$nlo
+       and     $Zlo,0xf,$remi
+       and     $nlo,0xf0,$nhi
+       and     $nlo,0x0f,$nlo
+       ba      .Lghash_inner
+       sll     $nlo,4,$nlo
+.align 32
+.Lghash_inner:
+       ldx     [$Htblo+$nlo],$Tlo
+       sll     $remi,3,$remi
+       xor     $Thi,$Zhi,$Zhi
+       ldx     [$Htbl+$nlo],$Thi
+       srlx    $Zlo,4,$Zlo
+       xor     $rem,$Zhi,$Zhi
+       ldx     [$rem_4bit+$remi],$rem
+       sllx    $Zhi,60,$tmp
+       xor     $Tlo,$Zlo,$Zlo
+       ldub    [$inp+$cnt],$nlo
+       srlx    $Zhi,4,$Zhi
+       xor     $Zlo,$tmp,$Zlo
+       ldub    [$Xi+$cnt],$xi1
+       xor     $Thi,$Zhi,$Zhi
+       and     $Zlo,0xf,$remi
+
+       ldx     [$Htblo+$nhi],$Tlo
+       sll     $remi,3,$remi
+       xor     $rem,$Zhi,$Zhi
+       ldx     [$Htbl+$nhi],$Thi
+       srlx    $Zlo,4,$Zlo
+       ldx     [$rem_4bit+$remi],$rem
+       sllx    $Zhi,60,$tmp
+       xor     $xi1,$nlo,$nlo
+       srlx    $Zhi,4,$Zhi
+       and     $nlo,0xf0,$nhi
+       addcc   $cnt,-1,$cnt
+       xor     $Zlo,$tmp,$Zlo
+       and     $nlo,0x0f,$nlo
+       xor     $Tlo,$Zlo,$Zlo
+       sll     $nlo,4,$nlo
+       blu     .Lghash_inner
+       and     $Zlo,0xf,$remi
+
+       ldx     [$Htblo+$nlo],$Tlo
+       sll     $remi,3,$remi
+       xor     $Thi,$Zhi,$Zhi
+       ldx     [$Htbl+$nlo],$Thi
+       srlx    $Zlo,4,$Zlo
+       xor     $rem,$Zhi,$Zhi
+       ldx     [$rem_4bit+$remi],$rem
+       sllx    $Zhi,60,$tmp
+       xor     $Tlo,$Zlo,$Zlo
+       srlx    $Zhi,4,$Zhi
+       xor     $Zlo,$tmp,$Zlo
+       xor     $Thi,$Zhi,$Zhi
+
+       add     $inp,16,$inp
+       cmp     $inp,$len
+       be,pn   `$bits==64?"%xcc":"%icc"`,.Ldone
+       and     $Zlo,0xf,$remi
+
+       ldx     [$Htblo+$nhi],$Tlo
+       sll     $remi,3,$remi
+       xor     $rem,$Zhi,$Zhi
+       ldx     [$Htbl+$nhi],$Thi
+       srlx    $Zlo,4,$Zlo
+       ldx     [$rem_4bit+$remi],$rem
+       sllx    $Zhi,60,$tmp
+       xor     $Tlo,$Zlo,$Zlo
+       ldub    [$inp+15],$nlo
+       srlx    $Zhi,4,$Zhi
+       xor     $Zlo,$tmp,$Zlo
+       xor     $Thi,$Zhi,$Zhi
+       stx     $Zlo,[$Xi+8]
+       xor     $rem,$Zhi,$Zhi
+       stx     $Zhi,[$Xi]
+       srl     $Zlo,8,$xi1
+       and     $Zlo,0xff,$xi0
+       ba      .Louter
+       and     $xi1,0xff,$xi1
+.align 32
+.Ldone:
+       ldx     [$Htblo+$nhi],$Tlo
+       sll     $remi,3,$remi
+       xor     $rem,$Zhi,$Zhi
+       ldx     [$Htbl+$nhi],$Thi
+       srlx    $Zlo,4,$Zlo
+       ldx     [$rem_4bit+$remi],$rem
+       sllx    $Zhi,60,$tmp
+       xor     $Tlo,$Zlo,$Zlo
+       srlx    $Zhi,4,$Zhi
+       xor     $Zlo,$tmp,$Zlo
+       xor     $Thi,$Zhi,$Zhi
+       stx     $Zlo,[$Xi+8]
+       xor     $rem,$Zhi,$Zhi
+       stx     $Zhi,[$Xi]
+
+       ret
+       restore
+.type  gcm_ghash_4bit,#function
+.size  gcm_ghash_4bit,(.-gcm_ghash_4bit)
+___
+
+# gcm_gmult_4bit below references only $Xi and $Htbl; drop the
+# ghash-only argument names (presumably so that a stray use in the
+# following template would interpolate as empty and be noticed).
+undef $inp;
+undef $len;
+
+$code.=<<___;
+.globl gcm_gmult_4bit
+.align 32
+gcm_gmult_4bit:
+       save    %sp,-$frame,%sp
+       ldub    [$Xi+15],$nlo
+       add     $Htbl,8,$Htblo
+
+1:     call    .+8
+       add     %o7,rem_4bit-1b,$rem_4bit
+
+       and     $nlo,0xf0,$nhi
+       and     $nlo,0x0f,$nlo
+       sll     $nlo,4,$nlo
+       ldx     [$Htblo+$nlo],$Zlo
+       ldx     [$Htbl+$nlo],$Zhi
+
+       ldub    [$Xi+14],$nlo
+
+       ldx     [$Htblo+$nhi],$Tlo
+       and     $Zlo,0xf,$remi
+       ldx     [$Htbl+$nhi],$Thi
+       sll     $remi,3,$remi
+       ldx     [$rem_4bit+$remi],$rem
+       srlx    $Zlo,4,$Zlo
+       mov     13,$cnt
+       sllx    $Zhi,60,$tmp
+       xor     $Tlo,$Zlo,$Zlo
+       srlx    $Zhi,4,$Zhi
+       xor     $Zlo,$tmp,$Zlo
+
+       and     $Zlo,0xf,$remi
+       and     $nlo,0xf0,$nhi
+       and     $nlo,0x0f,$nlo
+       ba      .Lgmult_inner
+       sll     $nlo,4,$nlo
+.align 32
+.Lgmult_inner:
+       ldx     [$Htblo+$nlo],$Tlo
+       sll     $remi,3,$remi
+       xor     $Thi,$Zhi,$Zhi
+       ldx     [$Htbl+$nlo],$Thi
+       srlx    $Zlo,4,$Zlo
+       xor     $rem,$Zhi,$Zhi
+       ldx     [$rem_4bit+$remi],$rem
+       sllx    $Zhi,60,$tmp
+       xor     $Tlo,$Zlo,$Zlo
+       ldub    [$Xi+$cnt],$nlo
+       srlx    $Zhi,4,$Zhi
+       xor     $Zlo,$tmp,$Zlo
+       xor     $Thi,$Zhi,$Zhi
+       and     $Zlo,0xf,$remi
+
+       ldx     [$Htblo+$nhi],$Tlo
+       sll     $remi,3,$remi
+       xor     $rem,$Zhi,$Zhi
+       ldx     [$Htbl+$nhi],$Thi
+       srlx    $Zlo,4,$Zlo
+       ldx     [$rem_4bit+$remi],$rem
+       sllx    $Zhi,60,$tmp
+       srlx    $Zhi,4,$Zhi
+       and     $nlo,0xf0,$nhi
+       addcc   $cnt,-1,$cnt
+       xor     $Zlo,$tmp,$Zlo
+       and     $nlo,0x0f,$nlo
+       xor     $Tlo,$Zlo,$Zlo
+       sll     $nlo,4,$nlo
+       blu     .Lgmult_inner
+       and     $Zlo,0xf,$remi
+
+       ldx     [$Htblo+$nlo],$Tlo
+       sll     $remi,3,$remi
+       xor     $Thi,$Zhi,$Zhi
+       ldx     [$Htbl+$nlo],$Thi
+       srlx    $Zlo,4,$Zlo
+       xor     $rem,$Zhi,$Zhi
+       ldx     [$rem_4bit+$remi],$rem
+       sllx    $Zhi,60,$tmp
+       xor     $Tlo,$Zlo,$Zlo
+       srlx    $Zhi,4,$Zhi
+       xor     $Zlo,$tmp,$Zlo
+       xor     $Thi,$Zhi,$Zhi
+       and     $Zlo,0xf,$remi
+
+       ldx     [$Htblo+$nhi],$Tlo
+       sll     $remi,3,$remi
+       xor     $rem,$Zhi,$Zhi
+       ldx     [$Htbl+$nhi],$Thi
+       srlx    $Zlo,4,$Zlo
+       ldx     [$rem_4bit+$remi],$rem
+       sllx    $Zhi,60,$tmp
+       xor     $Tlo,$Zlo,$Zlo
+       srlx    $Zhi,4,$Zhi
+       xor     $Zlo,$tmp,$Zlo
+       xor     $Thi,$Zhi,$Zhi
+       stx     $Zlo,[$Xi+8]
+       xor     $rem,$Zhi,$Zhi
+       stx     $Zhi,[$Xi]
+
+       ret
+       restore
+.type  gcm_gmult_4bit,#function
+.size  gcm_gmult_4bit,(.-gcm_gmult_4bit)
+.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+# Replace every `...` span in the generated text with the value of that
+# Perl expression, then write the result out and close the output.
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
new file mode 100644 (file)
index 0000000..1b9adfb
--- /dev/null
@@ -0,0 +1,1342 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, May, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
+# code paths: vanilla x86 and vanilla MMX. Former will be executed on
+# 486 and Pentium, latter on all others. MMX GHASH features so called
+# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
+# of per-key storage [+512 bytes shared table]. Performance results
+# are for streamed GHASH subroutine and are expressed in cycles per
+# processed byte, less is better:
+#
+#              gcc 2.95.3(*)   MMX assembler   x86 assembler
+#
+# Pentium      105/111(**)     -               50
+# PIII         68 /75          12.2            24
+# P4           125/125         17.8            84(***)
+# Opteron      66 /70          10.1            30
+# Core2                54 /67          8.4             18
+#
+# (*)  gcc 3.4.x was observed to generate few percent slower code,
+#      which is one of reasons why 2.95.3 results were chosen,
+#      another reason is lack of 3.4.x results for older CPUs;
+#      comparison with MMX results is not completely fair, because C
+#      results are for vanilla "256B" implementation, while
+#      assembler results are for "528B";-)
+# (**) second number is result for code compiled with -fPIC flag,
+#      which is actually more relevant, because assembler code is
+#      position-independent;
+# (***)        see comment in non-MMX routine for further details;
+#
+# To summarize, it's >2-5 times faster than gcc-generated code. To
+# anchor it to something else SHA1 assembler processes one byte in
+# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
+# particular, see comment at the end of the file...
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
+# The question is how close is it to theoretical limit? The pclmulqdq
+# instruction latency appears to be 14 cycles and there can't be more
+# than 2 of them executing at any given time. This means that single
+# Karatsuba multiplication would take 28 cycles *plus* few cycles for
+# pre- and post-processing. Then multiplication has to be followed by
+# modulo-reduction. Given that aggregated reduction method [see
+# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
+# white paper by Intel] allows you to perform reduction only once in
+# a while we can assume that asymptotic performance can be estimated
+# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
+# and Naggr is the aggregation factor.
+#
+# Before we proceed to this implementation let's have closer look at
+# the best-performing code suggested by Intel in their white paper.
+# By tracing inter-register dependencies Tmod is estimated as ~19
+# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
+# processed byte. As implied, this is quite optimistic estimate,
+# because it does not account for Karatsuba pre- and post-processing,
+# which for a single multiplication is ~5 cycles. Unfortunately Intel
+# does not provide performance data for GHASH alone. But benchmarking
+# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+# alone resulted in 2.46 cycles per byte out of 16KB buffer. Note that
+# the result accounts even for pre-computing of degrees of the hash
+# key H, but its portion is negligible at 16KB buffer size.
+#
+# Moving on to the implementation in question. Tmod is estimated as
+# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
+# 2.16. How is it possible that measured performance is better than
+# optimistic theoretical estimate? There is one thing Intel failed
+# to recognize. By serializing GHASH with CTR in same subroutine
+# former's performance is really limited to above (Tmul + Tmod/Naggr)
+# equation. But if GHASH procedure is detached, the modulo-reduction
+# can be interleaved with Naggr-1 multiplications at instruction level
+# and under ideal conditions even disappear from the equation. So that
+# optimistic theoretical estimate for this implementation is ...
+# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
+# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
+# where Tproc is time required for Karatsuba pre- and post-processing,
+# is more realistic estimate. In this case it gives ... 1.91 cycles.
+# Or in other words, depending on how well we can interleave reduction
+# and one of the two multiplications the performance should be between
+# 1.91 and 2.16. As already mentioned, this implementation processes
+# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
+# - in 2.02. x86_64 performance is better, because larger register
+# bank allows to interleave reduction and multiplication better.
+#
+# Does it make sense to increase Naggr? To start with it's virtually
+# impossible in 32-bit mode, because of limited register bank
+# capacity. Otherwise improvement has to be weighed against slower
+# setup, as well as code size and complexity increase. As even
+# optimistic estimate doesn't promise 30% performance improvement,
+# there are currently no plans to increase Naggr.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+# January 2010
+#
+# Tweaked to optimize transitions between integer and FP operations
+# on same XMM register, PCLMULQDQ subroutine was measured to process
+# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
+# The minor regression on Westmere is outweighed by ~15% improvement
+# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
+# similar manner resulted in almost 20% degradation on Sandy Bridge,
+# where original 64-bit code processes one byte in 1.95 cycles.
+
+# Locate the directory this script lives in and make both it and the
+# shared perlasm directory searchable before pulling in x86asm.pl.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+# First argument is handed to asm_init; $x86only becomes true when the
+# *last* command-line argument is literally "386".
+&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+# The PCLMULQDQ code path is generated only if -DOPENSSL_IA32_SSE2
+# appears somewhere among the arguments.
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+# Register allocation shared by the integer and MMX code paths: the
+# 128-bit accumulator Z is kept as four 32-bit words.
+($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
+$inp  = "edi";
+$Htbl = "esi";
+\f
+$unroll = 0;   # Affects x86 loop. Folded loop performs ~7% worse
+               # than unrolled, which has to be weighed against
+               # 2.5x x86-specific code size reduction.
+
+# Emit the integer-only ("vanilla x86") body of one 4-bit GHASH
+# multiplication.
+#
+# Argument:
+#   $off - byte displacement from %esp of the 16-byte block being
+#          multiplied; the rem_4bit table is read from $off+16(%esp).
+#
+# On entry the generated code expects $Zll to hold the byte offset of
+# the first Htable entry to use and $Htbl to point at Htable. On exit
+# $Zhh:$Zhl:$Zlh:$Zll hold the byte-swapped result. %eax is used as
+# scratch. In the folded (!$unroll) variant $inp is reused as the loop
+# counter, so callers have to reload it afterwards.
+sub x86_loop {
+    my $off = shift;
+    my $rem = "eax";
+
+       # Z = Htable[first nibble]
+       &mov    ($Zhh,&DWP(4,$Htbl,$Zll));
+       &mov    ($Zhl,&DWP(0,$Htbl,$Zll));
+       &mov    ($Zlh,&DWP(12,$Htbl,$Zll));
+       &mov    ($Zll,&DWP(8,$Htbl,$Zll));
+       &xor    ($rem,$rem);    # avoid partial register stalls on PIII
+
+       # shrd practically kills P4, 2.5x deterioration, but P4 has
+       # MMX code-path to execute. shrd runs tad faster [than twice
+       # the shifts, move's and or's] on pre-MMX Pentium (as well as
+       # PIII and Core2), *but* minimizes code size, spares register
+       # and thus allows to fold the loop...
+       if (!$unroll) {
+       # Folded variant: two nibble steps per iteration, byte index
+       # counted down from 15 in $cnt (which aliases $inp).
+       my $cnt = $inp;
+       &mov    ($cnt,15);
+       &jmp    (&label("x86_loop"));
+       &set_label("x86_loop",16);
+           for($i=1;$i<=2;$i++) {
+               # rem = Z & 0xf; Z >>= 4; Z.hi ^= rem_4bit[rem]
+               &mov    (&LB($rem),&LB($Zll));
+               &shrd   ($Zll,$Zlh,4);
+               &and    (&LB($rem),0xf);
+               &shrd   ($Zlh,$Zhl,4);
+               &shrd   ($Zhl,$Zhh,4);
+               &shr    ($Zhh,4);
+               &xor    ($Zhh,&DWP($off+16,"esp",$rem,4));
+
+               # Fetch the current byte; the odd step uses its high
+               # nibble, the even step the low nibble (after $cnt
+               # has been decremented), both scaled to a 16-byte
+               # Htable offset.
+               &mov    (&LB($rem),&BP($off,"esp",$cnt));
+               if ($i&1) {
+                       &and    (&LB($rem),0xf0);
+               } else {
+                       &shl    (&LB($rem),4);
+               }
+
+               # Z ^= Htable[nibble]
+               &xor    ($Zll,&DWP(8,$Htbl,$rem));
+               &xor    ($Zlh,&DWP(12,$Htbl,$rem));
+               &xor    ($Zhl,&DWP(0,$Htbl,$rem));
+               &xor    ($Zhh,&DWP(4,$Htbl,$rem));
+
+               if ($i&1) {
+                       # leave once the high nibble of byte 0 is done
+                       &dec    ($cnt);
+                       &js     (&label("x86_break"));
+               } else {
+                       &jmp    (&label("x86_loop"));
+               }
+           }
+       &set_label("x86_break",16);
+       } else {
+           # Fully unrolled variant: the remaining 31 nibble steps are
+           # emitted back to back with constant byte displacements.
+           for($i=1;$i<32;$i++) {
+               &comment($i);
+               &mov    (&LB($rem),&LB($Zll));
+               &shrd   ($Zll,$Zlh,4);
+               &and    (&LB($rem),0xf);
+               &shrd   ($Zlh,$Zhl,4);
+               &shrd   ($Zhl,$Zhh,4);
+               &shr    ($Zhh,4);
+               &xor    ($Zhh,&DWP($off+16,"esp",$rem,4));
+
+               if ($i&1) {
+                       &mov    (&LB($rem),&BP($off+15-($i>>1),"esp"));
+                       &and    (&LB($rem),0xf0);
+               } else {
+                       &mov    (&LB($rem),&BP($off+15-($i>>1),"esp"));
+                       &shl    (&LB($rem),4);
+               }
+
+               &xor    ($Zll,&DWP(8,$Htbl,$rem));
+               &xor    ($Zlh,&DWP(12,$Htbl,$rem));
+               &xor    ($Zhl,&DWP(0,$Htbl,$rem));
+               &xor    ($Zhh,&DWP(4,$Htbl,$rem));
+           }
+       }
+       # Byte-swap the four result words.
+       &bswap  ($Zll);
+       &bswap  ($Zlh);
+       &bswap  ($Zhl);
+       if (!$x86only) {
+               &bswap  ($Zhh);
+       } else {
+               # NOTE(review): in "386" mode $Zhh (ebp) is swapped via
+               # eax, presumably because the bswap substitute in
+               # x86asm.pl cannot handle ebp - confirm there.
+               &mov    ("eax",$Zhh);
+               &bswap  ("eax");
+               &mov    ($Zhh,"eax");
+       }
+}
+
+# With $unroll set the (large) unrolled loop is emitted only once, as a
+# private subroutine that both public entry points call.
+# NOTE(review): the offset of 4 presumably skips the return address
+# pushed by the call instruction - confirm.
+if ($unroll) {
+    &function_begin_B("_x86_gmult_4bit_inner");
+       &x86_loop(4);
+       &ret    ();
+    &function_end_B("_x86_gmult_4bit_inner");
+}
+
+# Emit sixteen 32-bit immediate stores that lay the rem_4bit reduction
+# table out on the stack, one dword per entry, starting at
+# $bias(%esp). Every table constant is placed in the upper 16 bits of
+# its dword.
+sub deposit_rem_4bit {
+    my ($bias) = @_;
+    my @table = (0x0000,0x1C20,0x3840,0x2460,
+                 0x7080,0x6CA0,0x48C0,0x54E0,
+                 0xE100,0xFD20,0xD940,0xC560,
+                 0x9180,0x8DA0,0xA9C0,0xB5E0);
+
+    for my $slot (0..$#table) {
+       &mov    (&DWP($bias+4*$slot,"esp"),$table[$slot]<<16);
+    }
+}
+\f
+# In "386"-only builds the integer routines carry the generic names;
+# otherwise they get an "_x86" suffix.
+$suffix = $x86only ? "" : "_x86";
+
+# gcm_gmult_4bit[_x86](Xi, Htable): multiply the 16-byte value at Xi
+# by the hash key (via Htable) in place, integer-only code path.
+# Stack frame as used below: Xi copy at 0..15(%esp), rem_4bit table at
+# 16..79(%esp).
+&function_begin("gcm_gmult_4bit".$suffix);
+       &stack_push(16+4+1);                    # +1 for stack alignment
+       &mov    ($inp,&wparam(0));              # load Xi
+       &mov    ($Htbl,&wparam(1));             # load Htable
+
+       &mov    ($Zhh,&DWP(0,$inp));            # load Xi[16]
+       &mov    ($Zhl,&DWP(4,$inp));
+       &mov    ($Zlh,&DWP(8,$inp));
+       &mov    ($Zll,&DWP(12,$inp));
+
+       &deposit_rem_4bit(16);
+
+       &mov    (&DWP(0,"esp"),$Zhh);           # copy Xi[16] on stack
+       &mov    (&DWP(4,"esp"),$Zhl);
+       &mov    (&DWP(8,"esp"),$Zlh);
+       &mov    (&DWP(12,"esp"),$Zll);
+       # Zll = (Xi[15] & 0xf) << 4, i.e. Htable offset of first nibble
+       &shr    ($Zll,20);
+       &and    ($Zll,0xf0);
+
+       if ($unroll) {
+               &call   ("_x86_gmult_4bit_inner");
+       } else {
+               &x86_loop(0);
+               # folded loop used $inp as its counter, reload Xi
+               &mov    ($inp,&wparam(0));
+       }
+
+       # store the result back to Xi
+       &mov    (&DWP(12,$inp),$Zll);
+       &mov    (&DWP(8,$inp),$Zlh);
+       &mov    (&DWP(4,$inp),$Zhl);
+       &mov    (&DWP(0,$inp),$Zhh);
+       &stack_pop(16+4+1);
+&function_end("gcm_gmult_4bit".$suffix);
+
+# gcm_ghash_4bit[_x86](Xi, Htable, inp, len): streamed GHASH, integer-
+# only code path. For every 16-byte block of inp: Xi ^= block, then
+# Xi is multiplied by the hash key via Htable.
+# The loop is bottom-tested, so at least one block is always processed.
+# NOTE(review): len is presumably a non-zero multiple of 16 - confirm
+# against the callers.
+&function_begin("gcm_ghash_4bit".$suffix);
+       &stack_push(16+4+1);                    # +1 for 64-bit alignment
+       &mov    ($Zll,&wparam(0));              # load Xi
+       &mov    ($Htbl,&wparam(1));             # load Htable
+       &mov    ($inp,&wparam(2));              # load in
+       &mov    ("ecx",&wparam(3));             # load len
+       # the len argument slot is overwritten with the end pointer
+       &add    ("ecx",$inp);
+       &mov    (&wparam(3),"ecx");
+
+       &mov    ($Zhh,&DWP(0,$Zll));            # load Xi[16]
+       &mov    ($Zhl,&DWP(4,$Zll));
+       &mov    ($Zlh,&DWP(8,$Zll));
+       &mov    ($Zll,&DWP(12,$Zll));
+
+       &deposit_rem_4bit(16);
+
+    &set_label("x86_outer_loop",16);
+       &xor    ($Zll,&DWP(12,$inp));           # xor with input
+       &xor    ($Zlh,&DWP(8,$inp));
+       &xor    ($Zhl,&DWP(4,$inp));
+       &xor    ($Zhh,&DWP(0,$inp));
+       &mov    (&DWP(12,"esp"),$Zll);          # dump it on stack
+       &mov    (&DWP(8,"esp"),$Zlh);
+       &mov    (&DWP(4,"esp"),$Zhl);
+       &mov    (&DWP(0,"esp"),$Zhh);
+
+       # Zll = (low nibble of byte 15) << 4, first Htable offset
+       &shr    ($Zll,20);
+       &and    ($Zll,0xf0);
+
+       if ($unroll) {
+               &call   ("_x86_gmult_4bit_inner");
+       } else {
+               &x86_loop(0);
+               # folded loop used $inp as its counter, reload it
+               &mov    ($inp,&wparam(2));
+       }
+       # advance to the next block and loop while inp < end pointer
+       &lea    ($inp,&DWP(16,$inp));
+       &cmp    ($inp,&wparam(3));
+       &mov    (&wparam(2),$inp)       if (!$unroll);
+       &jb     (&label("x86_outer_loop"));
+
+       &mov    ($inp,&wparam(0));      # load Xi
+       &mov    (&DWP(12,$inp),$Zll);
+       &mov    (&DWP(8,$inp),$Zlh);
+       &mov    (&DWP(4,$inp),$Zhl);
+       &mov    (&DWP(0,$inp),$Zhh);
+       &stack_pop(16+4+1);
+&function_end("gcm_ghash_4bit".$suffix);
+\f
+if (!$x86only) {{{
+
+&static_label("rem_4bit");
+
+if (0) {{      # "May" MMX version is kept for reference...
+
+$S=12;         # shift factor for rem_4bit
+
+# Fully unrolled MMX multiplication used by the "May" entry points
+# (this whole variant sits in an if (0) branch and is not generated).
+# The emitted code expects: $inp -> 16-byte block, $Htbl -> Htable,
+# %eax -> rem_4bit, $Zll = last byte of the block. Result is left
+# byte-swapped in $Zhh,$Zhl,$Zlh,$Zll; $inp is clobbered.
+&function_begin_B("_mmx_gmult_4bit_inner");
+# MMX version performs 3.5 times better on P4 (see comment in non-MMX
+# routine for further details), 100% better on Opteron, ~70% better
+# on Core2 and PIII... In other words effort is considered to be well
+# spent... Since initial release the loop was unrolled in order to
+# "liberate" register previously used as loop counter. Instead it's
+# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
+# The path involves move of Z.lo from MMX to integer register,
+# effective address calculation and finally merge of value to Z.hi.
+# Reference to rem_4bit is scheduled so late that I had to >>4
+# rem_4bit elements. This resulted in 20-45% improvement
+# on contemporary µ-archs.
+{
+    my $cnt;
+    my $rem_4bit = "eax";
+    my @rem = ($Zhh,$Zll);     # two registers alternate as remainder
+    my $nhi = $Zhl;
+    my $nlo = $Zlh;
+
+    my ($Zlo,$Zhi) = ("mm0","mm1");
+    my $tmp = "mm2";
+
+       &xor    ($nlo,$nlo);    # avoid partial register stalls on PIII
+       &mov    ($nhi,$Zll);
+       &mov    (&LB($nlo),&LB($nhi));
+       &shl    (&LB($nlo),4);
+       &and    ($nhi,0xf0);
+       &movq   ($Zlo,&QWP(8,$Htbl,$nlo));
+       &movq   ($Zhi,&QWP(0,$Htbl,$nlo));
+       &movd   ($rem[0],$Zlo);
+
+       # One iteration per remaining nibble step; odd $cnt consumes
+       # the low nibble ($nlo), even $cnt the high nibble ($nhi).
+       # The rem_4bit lookup for a step is issued one iteration late.
+       for ($cnt=28;$cnt>=-2;$cnt--) {
+           my $odd = $cnt&1;
+           my $nix = $odd ? $nlo : $nhi;
+
+               &shl    (&LB($nlo),4)                   if ($odd);
+               &psrlq  ($Zlo,4);
+               &movq   ($tmp,$Zhi);
+               &psrlq  ($Zhi,4);
+               &pxor   ($Zlo,&QWP(8,$Htbl,$nix));
+               &mov    (&LB($nlo),&BP($cnt/2,$inp))    if (!$odd && $cnt>=0);
+               &psllq  ($tmp,60);
+               &and    ($nhi,0xf0)                     if ($odd);
+               &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
+               &and    ($rem[0],0xf);
+               &pxor   ($Zhi,&QWP(0,$Htbl,$nix));
+               &mov    ($nhi,$nlo)                     if (!$odd && $cnt>=0);
+               &movd   ($rem[1],$Zlo);
+               &pxor   ($Zlo,$tmp);
+
+               push    (@rem,shift(@rem));             # "rotate" registers
+       }
+
+       &mov    ($inp,&DWP(4,$rem_4bit,$rem[1],8));     # last rem_4bit[rem]
+
+       &psrlq  ($Zlo,32);      # lower part of Zlo is already there
+       &movd   ($Zhl,$Zhi);
+       &psrlq  ($Zhi,32);
+       &movd   ($Zlh,$Zlo);
+       &movd   ($Zhh,$Zhi);
+       &shl    ($inp,4);       # compensate for rem_4bit[i] being >>4
+
+       &bswap  ($Zll);
+       &bswap  ($Zhl);
+       &bswap  ($Zlh);
+       &xor    ($Zhh,$inp);
+       &bswap  ($Zhh);
+
+       &ret    ();
+}
+&function_end_B("_mmx_gmult_4bit_inner");
+
+# "May" gcm_gmult_4bit_mmx(Xi, Htable): single in-place multiplication
+# built on _mmx_gmult_4bit_inner (disabled, see enclosing if (0)).
+&function_begin("gcm_gmult_4bit_mmx");
+       &mov    ($inp,&wparam(0));      # load Xi
+       &mov    ($Htbl,&wparam(1));     # load Htable
+
+       # position-independent way to get the address of rem_4bit in eax
+       &call   (&label("pic_point"));
+       &set_label("pic_point");
+       &blindpop("eax");
+       &lea    ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+       &movz   ($Zll,&BP(15,$inp));    # Zll = Xi[15]
+
+       &call   ("_mmx_gmult_4bit_inner");
+
+       &mov    ($inp,&wparam(0));      # load Xi
+       &emms   ();
+       &mov    (&DWP(12,$inp),$Zll);
+       &mov    (&DWP(4,$inp),$Zhl);
+       &mov    (&DWP(8,$inp),$Zlh);
+       &mov    (&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+\f
+# "May" gcm_ghash_4bit_mmx(Xi, Htable, inp, len) (disabled, see the
+# enclosing if (0)). For every 16-byte input block: Xi ^= block, the
+# result is dumped on the stack and multiplied by
+# _mmx_gmult_4bit_inner. The loop is bottom-tested.
+# Streamed version performs 20% better on P4, 7% on Opteron,
+# 10% on Core2 and PIII...
+&function_begin("gcm_ghash_4bit_mmx");
+       &mov    ($Zhh,&wparam(0));      # load Xi
+       &mov    ($Htbl,&wparam(1));     # load Htable
+       &mov    ($inp,&wparam(2));      # load in
+       &mov    ($Zlh,&wparam(3));      # load len
+
+       # position-independent way to get the address of rem_4bit in eax
+       &call   (&label("pic_point"));
+       &set_label("pic_point");
+       &blindpop("eax");
+       &lea    ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+       &add    ($Zlh,$inp);
+       &mov    (&wparam(3),$Zlh);      # len to point at the end of input
+       &stack_push(4+1);               # +1 for stack alignment
+
+       &mov    ($Zll,&DWP(12,$Zhh));   # load Xi[16]
+       &mov    ($Zhl,&DWP(4,$Zhh));
+       &mov    ($Zlh,&DWP(8,$Zhh));
+       &mov    ($Zhh,&DWP(0,$Zhh));
+       &jmp    (&label("mmx_outer_loop"));
+
+    &set_label("mmx_outer_loop",16);
+       # Xi ^= input block, keep a copy on the stack for the inner routine
+       &xor    ($Zll,&DWP(12,$inp));
+       &xor    ($Zhl,&DWP(4,$inp));
+       &xor    ($Zlh,&DWP(8,$inp));
+       &xor    ($Zhh,&DWP(0,$inp));
+       &mov    (&wparam(2),$inp);      # inner routine clobbers $inp
+       &mov    (&DWP(12,"esp"),$Zll);
+       &mov    (&DWP(4,"esp"),$Zhl);
+       &mov    (&DWP(8,"esp"),$Zlh);
+       &mov    (&DWP(0,"esp"),$Zhh);
+
+       &mov    ($inp,"esp");           # inner routine reads the stacked copy
+       &shr    ($Zll,24);              # Zll = byte 15 of the block
+
+       &call   ("_mmx_gmult_4bit_inner");
+
+       &mov    ($inp,&wparam(2));
+       &lea    ($inp,&DWP(16,$inp));
+       &cmp    ($inp,&wparam(3));
+       &jb     (&label("mmx_outer_loop"));
+
+       &mov    ($inp,&wparam(0));      # load Xi
+       &emms   ();
+       &mov    (&DWP(12,$inp),$Zll);
+       &mov    (&DWP(4,$inp),$Zhl);
+       &mov    (&DWP(8,$inp),$Zlh);
+       &mov    (&DWP(0,$inp),$Zhh);
+
+       &stack_pop(4+1);
+&function_end("gcm_ghash_4bit_mmx");
+\f
+}} else {{     # "June" MMX version...
+               # ... has slower "April" gcm_gmult_4bit_mmx with folded
+               # loop. This is done to conserve code size...
+$S=16;         # shift factor for rem_4bit
+
+# Emit the folded MMX loop performing one 4-bit GHASH multiplication.
+#
+# Arguments:
+#   $inp      - name of the register pointing at the 16-byte block
+#   $rem_4bit - name of the register holding the address of rem_4bit
+#
+# The emitted code expects $Zll to hold the last byte of the block and
+# $Htbl to point at Htable. All four Z registers are reused as
+# scratch (counter, nibble indices, remainder); on exit they hold the
+# byte-swapped result. mm0-mm2 are clobbered and no emms is issued
+# here.
+#
+# The sub used to be declared with an empty prototype although it
+# takes two arguments; that only worked because every caller uses the
+# prototype-bypassing &mmx_loop(...) form. The prototype is dropped so
+# the declaration matches the actual calling convention.
+sub mmx_loop {
+# MMX version performs 2.8 times better on P4 (see comment in non-MMX
+# routine for further details), 40% better on Opteron and Core2, 50%
+# better on PIII... In other words effort is considered to be well
+# spent...
+    my $inp = shift;
+    my $rem_4bit = shift;
+    my $cnt = $Zhh;
+    my $nhi = $Zhl;
+    my $nlo = $Zlh;
+    my $rem = $Zll;
+
+    my ($Zlo,$Zhi) = ("mm0","mm1");
+    my $tmp = "mm2";
+
+       # split the last byte into nibble offsets, Z = Htable[nlo]
+       &xor    ($nlo,$nlo);    # avoid partial register stalls on PIII
+       &mov    ($nhi,$Zll);
+       &mov    (&LB($nlo),&LB($nhi));
+       &mov    ($cnt,14);      # next byte to fetch, counted down
+       &shl    (&LB($nlo),4);
+       &and    ($nhi,0xf0);
+       &movq   ($Zlo,&QWP(8,$Htbl,$nlo));
+       &movq   ($Zhi,&QWP(0,$Htbl,$nlo));
+       &movd   ($rem,$Zlo);
+       &jmp    (&label("mmx_loop"));
+
+    &set_label("mmx_loop",16);
+       # high-nibble step of the current byte; next byte is prefetched
+       &psrlq  ($Zlo,4);
+       &and    ($rem,0xf);
+       &movq   ($tmp,$Zhi);
+       &psrlq  ($Zhi,4);
+       &pxor   ($Zlo,&QWP(8,$Htbl,$nhi));
+       &mov    (&LB($nlo),&BP(0,$inp,$cnt));
+       &psllq  ($tmp,60);
+       &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+       &dec    ($cnt);
+       &movd   ($rem,$Zlo);
+       &pxor   ($Zhi,&QWP(0,$Htbl,$nhi));
+       &mov    ($nhi,$nlo);
+       &pxor   ($Zlo,$tmp);
+       &js     (&label("mmx_break"));
+
+       # low-nibble step of the byte just fetched
+       &shl    (&LB($nlo),4);
+       &and    ($rem,0xf);
+       &psrlq  ($Zlo,4);
+       &and    ($nhi,0xf0);
+       &movq   ($tmp,$Zhi);
+       &psrlq  ($Zhi,4);
+       &pxor   ($Zlo,&QWP(8,$Htbl,$nlo));
+       &psllq  ($tmp,60);
+       &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+       &movd   ($rem,$Zlo);
+       &pxor   ($Zhi,&QWP(0,$Htbl,$nlo));
+       &pxor   ($Zlo,$tmp);
+       &jmp    (&label("mmx_loop"));
+
+    &set_label("mmx_break",16);
+       # tail: the two nibble steps of byte 0
+       &shl    (&LB($nlo),4);
+       &and    ($rem,0xf);
+       &psrlq  ($Zlo,4);
+       &and    ($nhi,0xf0);
+       &movq   ($tmp,$Zhi);
+       &psrlq  ($Zhi,4);
+       &pxor   ($Zlo,&QWP(8,$Htbl,$nlo));
+       &psllq  ($tmp,60);
+       &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+       &movd   ($rem,$Zlo);
+       &pxor   ($Zhi,&QWP(0,$Htbl,$nlo));
+       &pxor   ($Zlo,$tmp);
+
+       &psrlq  ($Zlo,4);
+       &and    ($rem,0xf);
+       &movq   ($tmp,$Zhi);
+       &psrlq  ($Zhi,4);
+       &pxor   ($Zlo,&QWP(8,$Htbl,$nhi));
+       &psllq  ($tmp,60);
+       &pxor   ($Zhi,&QWP(0,$rem_4bit,$rem,8));
+       &movd   ($rem,$Zlo);
+       &pxor   ($Zhi,&QWP(0,$Htbl,$nhi));
+       &pxor   ($Zlo,$tmp);
+
+       # move the result from mm0/mm1 to the integer registers
+       &psrlq  ($Zlo,32);      # lower part of Zlo is already there
+       &movd   ($Zhl,$Zhi);
+       &psrlq  ($Zhi,32);
+       &movd   ($Zlh,$Zlo);
+       &movd   ($Zhh,$Zhi);
+
+       &bswap  ($Zll);
+       &bswap  ($Zhl);
+       &bswap  ($Zlh);
+       &bswap  ($Zhh);
+}
+
+# gcm_gmult_4bit_mmx(Xi, Htable): single in-place multiplication of
+# the 16-byte value at Xi, using the folded MMX loop.
+&function_begin("gcm_gmult_4bit_mmx");
+       &mov    ($inp,&wparam(0));      # load Xi
+       &mov    ($Htbl,&wparam(1));     # load Htable
+
+       # position-independent way to get the address of rem_4bit in eax
+       &call   (&label("pic_point"));
+       &set_label("pic_point");
+       &blindpop("eax");
+       &lea    ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+       &movz   ($Zll,&BP(15,$inp));    # Zll = Xi[15]
+
+       &mmx_loop($inp,"eax");
+
+       # leave MMX state, then store the result back to Xi
+       &emms   ();
+       &mov    (&DWP(12,$inp),$Zll);
+       &mov    (&DWP(4,$inp),$Zhl);
+       &mov    (&DWP(8,$inp),$Zlh);
+       &mov    (&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+\f
+######################################################################
+# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
+# (see gcm128.c for details). It provides further 20-40% performance
+# improvement over above mentioned "May" version.
+
+&static_label("rem_8bit");
+
+# gcm_ghash_4bit_mmx(Xi, Htable, inp, len): streamed "528B" GHASH.
+#
+# Frame built below, relative to the 64-byte-aligned %esp:
+#     0..15            (u8)(Htable[i].lo<<4), one byte per entry
+#    16..16+127        low halves of Htable[0..15]
+#    16+128..          high halves of Htable[0..15]
+#    16+256..          low halves of Htable[0..15]>>4
+#    16+256+128..      high halves of Htable[0..15]>>4
+#   528..528+15        current inp^Xi block
+#   528+16+0/4/8/12    saved Xi pointer / inp / inp+len / original %esp
+#
+# The outer loop is bottom-tested and terminates on pointer equality.
+# NOTE(review): len is therefore presumably a non-zero multiple of
+# 16 - confirm against the callers.
+&function_begin("gcm_ghash_4bit_mmx");
+{ my ($Zlo,$Zhi) = ("mm7","mm6");
+  my $rem_8bit = "esi";
+  my $Htbl = "ebx";
+
+    # parameter block
+    &mov       ("eax",&wparam(0));             # Xi
+    &mov       ("ebx",&wparam(1));             # Htable
+    &mov       ("ecx",&wparam(2));             # inp
+    &mov       ("edx",&wparam(3));             # len
+    &mov       ("ebp","esp");                  # original %esp
+    &call      (&label("pic_point"));
+    &set_label ("pic_point");
+    &blindpop  ($rem_8bit);
+    &lea       ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
+
+    &sub       ("esp",512+16+16);              # allocate stack frame...
+    &and       ("esp",-64);                    # ...and align it
+    &sub       ("esp",16);                     # place for (u8)(H[]<<4)
+
+    &add       ("edx","ecx");                  # pointer to the end of input
+    &mov       (&DWP(528+16+0,"esp"),"eax");   # save Xi
+    &mov       (&DWP(528+16+8,"esp"),"edx");   # save inp+len
+    &mov       (&DWP(528+16+12,"esp"),"ebp");  # save original %esp
+
+    { my @lo  = ("mm0","mm1","mm2");
+      my @hi  = ("mm3","mm4","mm5");
+      my @tmp = ("mm6","mm7");
+      # All three must be declared in one parenthesised list: the
+      # former "my $off1=0,$off2=0,$i;" made only $off1 lexical and
+      # silently used the package globals $off2 and $i.
+      my ($off1,$off2,$i)=(0,0);
+
+      &add     ($Htbl,128);                    # optimize for size
+      &lea     ("edi",&DWP(16+128,"esp"));
+      &lea     ("ebp",&DWP(16+256+128,"esp"));
+
+      # decompose Htable (low and high parts are kept separately),
+      # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
+      # The loop is software-pipelined over three register sets, hence
+      # 18 iterations for 16 entries and the $i guards on each line.
+      for ($i=0;$i<18;$i++) {
+
+       &mov    ("edx",&DWP(16*$i+8-128,$Htbl))         if ($i<16);
+       &movq   ($lo[0],&QWP(16*$i+8-128,$Htbl))        if ($i<16);
+       &psllq  ($tmp[1],60)                            if ($i>1);
+       &movq   ($hi[0],&QWP(16*$i+0-128,$Htbl))        if ($i<16);
+       &por    ($lo[2],$tmp[1])                        if ($i>1);
+       &movq   (&QWP($off1-128,"edi"),$lo[1])          if ($i>0 && $i<17);
+       &psrlq  ($lo[1],4)                              if ($i>0 && $i<17);
+       &movq   (&QWP($off1,"edi"),$hi[1])              if ($i>0 && $i<17);
+       &movq   ($tmp[0],$hi[1])                        if ($i>0 && $i<17);
+       &movq   (&QWP($off2-128,"ebp"),$lo[2])          if ($i>1);
+       &psrlq  ($hi[1],4)                              if ($i>0 && $i<17);
+       &movq   (&QWP($off2,"ebp"),$hi[2])              if ($i>1);
+       &shl    ("edx",4)                               if ($i<16);
+       &mov    (&BP($i,"esp"),&LB("edx"))              if ($i<16);
+
+       unshift (@lo,pop(@lo));                 # "rotate" registers
+       unshift (@hi,pop(@hi));
+       unshift (@tmp,pop(@tmp));
+       $off1 += 8      if ($i>0);
+       $off2 += 8      if ($i>1);
+      }
+    }
+
+    &movq      ($Zhi,&QWP(0,"eax"));
+    &mov       ("ebx",&DWP(8,"eax"));
+    &mov       ("edx",&DWP(12,"eax"));         # load Xi
+
+&set_label("outer",16);
+  { my $nlo = "eax";
+    my $dat = "edx";
+    my @nhi = ("edi","ebp");
+    my @rem = ("ebx","ecx");
+    my @red = ("mm0","mm1","mm2");
+    my $tmp = "mm3";
+
+    &xor       ($dat,&DWP(12,"ecx"));          # merge input data
+    &xor       ("ebx",&DWP(8,"ecx"));
+    &pxor      ($Zhi,&QWP(0,"ecx"));
+    &lea       ("ecx",&DWP(16,"ecx"));         # inp+=16
+    #&mov      (&DWP(528+12,"esp"),$dat);      # save inp^Xi
+    &mov       (&DWP(528+8,"esp"),"ebx");
+    &movq      (&QWP(528+0,"esp"),$Zhi);
+    &mov       (&DWP(528+16+4,"esp"),"ecx");   # save inp
+
+    # split the last byte of inp^Xi into its two nibbles
+    &xor       ($nlo,$nlo);
+    &rol       ($dat,8);
+    &mov       (&LB($nlo),&LB($dat));
+    &mov       ($nhi[1],$nlo);
+    &and       (&LB($nlo),0x0f);
+    &shr       ($nhi[1],4);
+    &pxor      ($red[0],$red[0]);
+    &rol       ($dat,8);                       # next byte
+    &pxor      ($red[1],$red[1]);
+    &pxor      ($red[2],$red[2]);
+
+    # Just like in "May" version modulo-schedule for critical path in
+    # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
+    # is scheduled so late that rem_8bit[] has to be shifted *right*
+    # by 16, which is why last argument to pinsrw is 2, which
+    # corresponds to <<32=<<48>>16...
+    # $j tracks the byte position so that the next 32-bit word of the
+    # stacked inp^Xi is reloaded into $dat every fourth byte.
+    for ($j=11,$i=0;$i<15;$i++) {
+
+      if ($i>0) {
+       &pxor   ($Zlo,&QWP(16,"esp",$nlo,8));           # Z^=H[nlo]
+       &rol    ($dat,8);                               # next byte
+       &pxor   ($Zhi,&QWP(16+128,"esp",$nlo,8));
+
+       &pxor   ($Zlo,$tmp);
+       &pxor   ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+       &xor    (&LB($rem[1]),&BP(0,"esp",$nhi[0]));    # rem^(H[nhi]<<4)
+      } else {
+       &movq   ($Zlo,&QWP(16,"esp",$nlo,8));
+       &movq   ($Zhi,&QWP(16+128,"esp",$nlo,8));
+      }
+
+       &mov    (&LB($nlo),&LB($dat));
+       &mov    ($dat,&DWP(528+$j,"esp"))               if (--$j%4==0);
+
+       &movd   ($rem[0],$Zlo);
+       &movz   ($rem[1],&LB($rem[1]))                  if ($i>0);
+       &psrlq  ($Zlo,8);                               # Z>>=8
+
+       &movq   ($tmp,$Zhi);
+       &mov    ($nhi[0],$nlo);
+       &psrlq  ($Zhi,8);
+
+       &pxor   ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8));  # Z^=H[nhi]>>4
+       &and    (&LB($nlo),0x0f);
+       &psllq  ($tmp,56);
+
+       &pxor   ($Zhi,$red[1])                          if ($i>1);
+       &shr    ($nhi[0],4);
+       &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2)  if ($i>0);
+
+       unshift (@red,pop(@red));                       # "rotate" registers
+       unshift (@rem,pop(@rem));
+       unshift (@nhi,pop(@nhi));
+    }
+
+    # drain the pipeline: last byte is handled as two 4-bit steps
+    &pxor      ($Zlo,&QWP(16,"esp",$nlo,8));           # Z^=H[nlo]
+    &pxor      ($Zhi,&QWP(16+128,"esp",$nlo,8));
+    &xor       (&LB($rem[1]),&BP(0,"esp",$nhi[0]));    # rem^(H[nhi]<<4)
+
+    &pxor      ($Zlo,$tmp);
+    &pxor      ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+    &movz      ($rem[1],&LB($rem[1]));
+
+    &pxor      ($red[2],$red[2]);                      # clear 2nd word
+    &psllq     ($red[1],4);
+
+    &movd      ($rem[0],$Zlo);
+    &psrlq     ($Zlo,4);                               # Z>>=4
+
+    &movq      ($tmp,$Zhi);
+    &psrlq     ($Zhi,4);
+    &shl       ($rem[0],4);                            # rem<<4
+
+    &pxor      ($Zlo,&QWP(16,"esp",$nhi[1],8));        # Z^=H[nhi]
+    &psllq     ($tmp,60);
+    &movz      ($rem[0],&LB($rem[0]));
+
+    &pxor      ($Zlo,$tmp);
+    &pxor      ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
+
+    &pinsrw    ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
+    &pxor      ($Zhi,$red[1]);
+
+    &movd      ($dat,$Zlo);
+    &pinsrw    ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
+
+    &psllq     ($red[0],12);                           # correct by <<16>>4
+    &pxor      ($Zhi,$red[0]);
+    &psrlq     ($Zlo,32);
+    &pxor      ($Zhi,$red[2]);
+
+    # byte-swap the result: integer halves with bswap, the MMX half
+    # with a shift/or/shuffle sequence
+    &mov       ("ecx",&DWP(528+16+4,"esp"));   # restore inp
+    &movd      ("ebx",$Zlo);
+    &movq      ($tmp,$Zhi);                    # 01234567
+    &psllw     ($Zhi,8);                       # 1.3.5.7.
+    &psrlw     ($tmp,8);                       # .0.2.4.6
+    &por       ($Zhi,$tmp);                    # 10325476
+    &bswap     ($dat);
+    &pshufw    ($Zhi,$Zhi,0b00011011);         # 76543210
+    &bswap     ("ebx");
+
+    &cmp       ("ecx",&DWP(528+16+8,"esp"));   # are we done?
+    &jne       (&label("outer"));
+  }
+
+    &mov       ("eax",&DWP(528+16+0,"esp"));   # restore Xi
+    &mov       (&DWP(12,"eax"),"edx");
+    &mov       (&DWP(8,"eax"),"ebx");
+    &movq      (&QWP(0,"eax"),$Zhi);
+
+    &mov       ("esp",&DWP(528+16+12,"esp"));  # restore original %esp
+    &emms      ();
+}
+&function_end("gcm_ghash_4bit_mmx");
+}}
+\f
+if ($sse2) {{
+######################################################################
+# PCLMULQDQ version.
+
+# Register allocation for the PCLMULQDQ code path. Note that $Htbl
+# and $inp are re-bound here and no longer name the registers used
+# by the integer/MMX code above.
+$Xip="eax";
+$Htbl="edx";
+$const="ecx";
+$inp="esi";
+$len="ebx";
+
+# XMM registers: accumulator (low/high halves of the product), hash
+# key, three temporaries and a second accumulator pair.
+($Xi,$Xhi)=("xmm0","xmm1");    $Hkey="xmm2";
+($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
+($Xn,$Xhn)=("xmm6","xmm7");
+
+&static_label("bswap");
+
+# Emit a carry-less multiplication of $Xi by $Hkey built from three
+# PCLMULQDQ instructions (low*low, high*high and the product of the
+# xor-ed halves). The product is left in $Xhi:$Xi; $Hkey is not
+# modified. Uses the global temporaries $T1 and $T2 only.
+#
+# Arguments: names of the registers to use for the high half of the
+# result, for the multiplicand/low half of the result, and for the key.
+sub clmul64x64_T2 {    # minimal "register" pressure
+my ($Xhi,$Xi,$Hkey)=@_;
+
+       &movdqa         ($Xhi,$Xi);             #
+       # T1 = Xi.hi^Xi.lo, T2 = Hkey.hi^Hkey.lo (in both halves)
+       &pshufd         ($T1,$Xi,0b01001110);
+       &pshufd         ($T2,$Hkey,0b01001110);
+       &pxor           ($T1,$Xi);              #
+       &pxor           ($T2,$Hkey);
+
+       &pclmulqdq      ($Xi,$Hkey,0x00);       #######
+       &pclmulqdq      ($Xhi,$Hkey,0x11);      #######
+       &pclmulqdq      ($T1,$T2,0x00);         #######
+       # middle term = T1 ^ low product ^ high product
+       &xorps          ($T1,$Xi);              #
+       &xorps          ($T1,$Xhi);             #
+
+       # fold the middle term into the 256-bit result at bit 64
+       &movdqa         ($T2,$T1);              #
+       &psrldq         ($T1,8);
+       &pslldq         ($T2,8);                #
+       &pxor           ($Xhi,$T1);
+       &pxor           ($Xi,$T2);              #
+}
+
+# Alternative instruction schedule for the same three-PCLMULQDQ
+# multiplication as clmul64x64_T2: $Xhi:$Xi = $Xi * $Hkey. Takes the
+# same three register-name arguments but needs all three global
+# temporaries $T1, $T2 and $T3.
+sub clmul64x64_T3 {
+# Even though this subroutine offers visually better ILP, it
+# was empirically found to be a tad slower than above version.
+# At least in gcm_ghash_clmul context. But it's just as well,
+# because loop modulo-scheduling is possible only thanks to
+# minimized "register" pressure...
+my ($Xhi,$Xi,$Hkey)=@_;
+
+       &movdqa         ($T1,$Xi);              #
+       &movdqa         ($Xhi,$Xi);
+       &pclmulqdq      ($Xi,$Hkey,0x00);       #######
+       &pclmulqdq      ($Xhi,$Hkey,0x11);      #######
+       # T2 = X.hi^X.lo (from the saved copy), T3 = Hkey.hi^Hkey.lo
+       &pshufd         ($T2,$T1,0b01001110);   #
+       &pshufd         ($T3,$Hkey,0b01001110);
+       &pxor           ($T2,$T1);              #
+       &pxor           ($T3,$Hkey);
+       &pclmulqdq      ($T2,$T3,0x00);         #######
+       # middle term = T2 ^ low product ^ high product
+       &pxor           ($T2,$Xi);              #
+       &pxor           ($T2,$Xhi);             #
+
+       # fold the middle term into the 256-bit result at bit 64
+       &movdqa         ($T3,$T2);              #
+       &psrldq         ($T2,8);
+       &pslldq         ($T3,8);                #
+       &pxor           ($Xhi,$T2);
+       &pxor           ($Xi,$T3);              #
+}
+\f
+if (1) {               # Algorithm 9 with <<1 twist.
+                       # Reduction is shorter and uses only two
+                       # temporary registers, which makes it better
+                       # candidate for interleaving with 64x64
+                       # multiplication. Pre-modulo-scheduled loop
+                       # was found to be ~20% faster than Algorithm 5
+                       # below. Algorithm 9 was therefore chosen for
+                       # further optimization...
+
+# reduction_alg9($Xhi,$Xi): emit the two-phase "Algorithm 9" reduction
+# that folds the 256-bit value in $Xhi:$Xi (as produced by
+# clmul64x64_T2) down to 128 bits.  The reduced result is left in $Xi;
+# $Xhi and the file-level scratch registers $T1 and $T2 are clobbered.
+sub reduction_alg9 {   # 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+       # 1st phase
+       # (each statement is terminated explicitly: without the ';' the
+       # first two calls would be parsed as one bitwise-AND expression)
+       &movdqa         ($T1,$Xi);              #
+       &psllq          ($Xi,1);
+       &pxor           ($Xi,$T1);              #
+       &psllq          ($Xi,5);                #
+       &pxor           ($Xi,$T1);              #
+       &psllq          ($Xi,57);               #
+       &movdqa         ($T2,$Xi);              #
+       &pslldq         ($Xi,8);
+       &psrldq         ($T2,8);                #
+       &pxor           ($Xi,$T1);
+       &pxor           ($Xhi,$T2);             #
+
+       # 2nd phase
+       &movdqa         ($T2,$Xi);
+       &psrlq          ($Xi,5);
+       &pxor           ($Xi,$T2);              #
+       &psrlq          ($Xi,1);                #
+       &pxor           ($Xi,$T2);              #
+       &pxor           ($T2,$Xhi);
+       &psrlq          ($Xi,1);                #
+       &pxor           ($Xi,$T2);              #
+}
+
+# gcm_init_clmul: the first argument is loaded into $Htbl (output
+# table), the second into $Xip (pointer to the 16-byte hash key H).
+# Stores the "<<1 twisted" H at offset 0 and its square at offset 16
+# of the table.  Emitted with function_begin_B/function_end_B and a
+# bare ret.
+&function_begin_B("gcm_init_clmul");
+       &mov            ($Htbl,&wparam(0));
+       &mov            ($Xip,&wparam(1));
+
+       # call/pop idiom: compute the run-time address of the "bswap"
+       # constants relative to the "pic" label
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Hkey,&QWP(0,$Xip));
+       &pshufd         ($Hkey,$Hkey,0b01001110);# dword swap
+
+       # <<1 twist
+       &pshufd         ($T2,$Hkey,0b11111111); # broadcast uppermost dword
+       &movdqa         ($T1,$Hkey);
+       &psllq          ($Hkey,1);
+       &pxor           ($T3,$T3);              #
+       &psrlq          ($T1,63);
+       &pcmpgtd        ($T3,$T2);              # broadcast carry bit
+       &pslldq         ($T1,8);
+       &por            ($Hkey,$T1);            # H<<=1
+
+       # magic reduction
+       &pand           ($T3,&QWP(16,$const));  # 0x1c2_polynomial
+       &pxor           ($Hkey,$T3);            # if(carry) H^=0x1c2_polynomial
+
+       # calculate H^2
+       &movdqa         ($Xi,$Hkey);
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
+       &reduction_alg9 ($Xhi,$Xi);
+
+       &movdqu         (&QWP(0,$Htbl),$Hkey);  # save H
+       &movdqu         (&QWP(16,$Htbl),$Xi);   # save H^2
+
+       &ret            ();
+&function_end_B("gcm_init_clmul");
+
+# gcm_gmult_clmul: the first argument is loaded into $Xip (pointer to
+# the 16-byte hash value Xi, updated in place), the second into $Htbl
+# (table whose first 16 bytes are used as the multiplier).  Performs a
+# single multiply-and-reduce of Xi.
+&function_begin_B("gcm_gmult_clmul");
+       &mov            ($Xip,&wparam(0));
+       &mov            ($Htbl,&wparam(1));
+
+       # call/pop idiom: run-time address of the "bswap" constants
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Xi,&QWP(0,$Xip));
+       &movdqa         ($T3,&QWP(0,$const));   # byte-reversal mask for pshufb
+       &movups         ($Hkey,&QWP(0,$Htbl));
+       &pshufb         ($Xi,$T3);
+
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
+       &reduction_alg9 ($Xhi,$Xi);
+
+       # reverse the bytes back before storing the result
+       &pshufb         ($Xi,$T3);
+       &movdqu         (&QWP(0,$Xip),$Xi);
+
+       &ret    ();
+&function_end_B("gcm_gmult_clmul");
+
+# gcm_ghash_clmul: arguments, in the order loaded below, are the Xi
+# pointer (hash value, updated in place), the Htable pointer (H at
+# offset 0, H^2 at offset 16), the input pointer and the input length
+# in bytes.
+# NOTE(review): the length is assumed to be a non-zero multiple of 16;
+# nothing here checks it - confirm against the callers.
+#
+# Input is consumed two 16-byte blocks per iteration using the
+# aggregated formula quoted below; "even_tail" completes a pending
+# pair and "odd_tail" handles a single remaining block.  Inside
+# "mod_loop" the reduction of the previous result (extra-indented
+# lines) is interleaved by hand with the multiplication of the next
+# block.
+&function_begin("gcm_ghash_clmul");
+       &mov            ($Xip,&wparam(0));
+       &mov            ($Htbl,&wparam(1));
+       &mov            ($inp,&wparam(2));
+       &mov            ($len,&wparam(3));
+
+       # call/pop idiom: run-time address of the "bswap" constants
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Xi,&QWP(0,$Xip));
+       &movdqa         ($T3,&QWP(0,$const));   # byte-reversal mask for pshufb
+       &movdqu         ($Hkey,&QWP(0,$Htbl));
+       &pshufb         ($Xi,$T3);
+
+       &sub            ($len,0x10);
+       &jz             (&label("odd_tail"));   # exactly one block
+
+       #######
+       # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+       #       [(H*Ii+1) + (H*Xi+1)] mod P =
+       #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+       #
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &movdqu         ($Xn,&QWP(16,$inp));    # Ii+1
+       &pshufb         ($T1,$T3);
+       &pshufb         ($Xn,$T3);
+       &pxor           ($Xi,$T1);              # Ii+Xi
+
+       &clmul64x64_T2  ($Xhn,$Xn,$Hkey);       # H*Ii+1
+       &movups         ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+       &lea            ($inp,&DWP(32,$inp));   # i+=2
+       &sub            ($len,0x20);
+       &jbe            (&label("even_tail"));
+
+&set_label("mod_loop");
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H^2*(Ii+Xi)
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &movups         ($Hkey,&QWP(0,$Htbl));  # load H
+
+       &pxor           ($Xi,$Xn);              # (H*Ii+1) + H^2*(Ii+Xi)
+       &pxor           ($Xhi,$Xhn);
+
+       &movdqu         ($Xn,&QWP(16,$inp));    # Ii+1
+       &pshufb         ($T1,$T3);
+       &pshufb         ($Xn,$T3);
+
+       # $T3 is reused as scratch from here on; the byte-reversal mask
+       # is reloaded at the bottom of the loop
+       &movdqa         ($T3,$Xn);              #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
+       &movdqa         ($Xhn,$Xn);
+        &pxor          ($Xhi,$T1);             # "Ii+Xi", consume early
+
+       # (the ';' after the next movdqa is required: without it the
+       # call and the following psllq form one bitwise-AND expression)
+         &movdqa       ($T1,$Xi);              #&reduction_alg9($Xhi,$Xi); 1st phase
+         &psllq        ($Xi,1);
+         &pxor         ($Xi,$T1);              #
+         &psllq        ($Xi,5);                #
+         &pxor         ($Xi,$T1);              #
+       &pclmulqdq      ($Xn,$Hkey,0x00);       #######
+         &psllq        ($Xi,57);               #
+         &movdqa       ($T2,$Xi);              #
+         &pslldq       ($Xi,8);
+         &psrldq       ($T2,8);                #
+         &pxor         ($Xi,$T1);
+       &pshufd         ($T1,$T3,0b01001110);
+         &pxor         ($Xhi,$T2);             #
+       &pxor           ($T1,$T3);
+       &pshufd         ($T3,$Hkey,0b01001110);
+       &pxor           ($T3,$Hkey);            #
+
+       &pclmulqdq      ($Xhn,$Hkey,0x11);      #######
+         &movdqa       ($T2,$Xi);              # 2nd phase
+         &psrlq        ($Xi,5);
+         &pxor         ($Xi,$T2);              #
+         &psrlq        ($Xi,1);                #
+         &pxor         ($Xi,$T2);              #
+         &pxor         ($T2,$Xhi);
+         &psrlq        ($Xi,1);                #
+         &pxor         ($Xi,$T2);              #
+
+       &pclmulqdq      ($T1,$T3,0x00);         #######
+       &movups         ($Hkey,&QWP(16,$Htbl)); # load H^2
+       &xorps          ($T1,$Xn);              #
+       &xorps          ($T1,$Xhn);             #
+
+       &movdqa         ($T3,$T1);              #
+       &psrldq         ($T1,8);
+       &pslldq         ($T3,8);                #
+       &pxor           ($Xhn,$T1);
+       &pxor           ($Xn,$T3);              #
+       &movdqa         ($T3,&QWP(0,$const));   # restore byte-reversal mask
+
+       &lea            ($inp,&DWP(32,$inp));
+       &sub            ($len,0x20);
+       &ja             (&label("mod_loop"));
+
+&set_label("even_tail");
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H^2*(Ii+Xi)
+
+       &pxor           ($Xi,$Xn);              # (H*Ii+1) + H^2*(Ii+Xi)
+       &pxor           ($Xhi,$Xhn);
+
+       &reduction_alg9 ($Xhi,$Xi);
+
+       # $len is zero here only when exactly one more block remains
+       &test           ($len,$len);
+       &jnz            (&label("done"));
+
+       &movups         ($Hkey,&QWP(0,$Htbl));  # load H
+&set_label("odd_tail");
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &pshufb         ($T1,$T3);
+       &pxor           ($Xi,$T1);              # Ii+Xi
+
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H*(Ii+Xi)
+       &reduction_alg9 ($Xhi,$Xi);
+
+&set_label("done");
+       &pshufb         ($Xi,$T3);
+       &movdqu         (&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+\f
+} else {               # Algorithm 5. Kept for reference purposes.
+
+# reduction_alg5($Xhi,$Xi): emit the "Algorithm 5" reduction used by
+# the reference (disabled) code path.  It first shifts the 256-bit
+# value $Xhi:$Xi left by one bit and then folds it in two phases; the
+# reduced result is left in $Xi.  $Xhi and the file-level scratch
+# registers $T1, $T2 and $T3 are clobbered.
+sub reduction_alg5 {   # 19/16 times faster than Intel version
+my ($Xhi,$Xi)=@_;
+
+       # <<1
+       # (done on 32-bit lanes; the bit shifted out of each dword is
+       # carried into the next one via the byte shifts below)
+       &movdqa         ($T1,$Xi);              #
+       &movdqa         ($T2,$Xhi);
+       &pslld          ($Xi,1);
+       &pslld          ($Xhi,1);               #
+       &psrld          ($T1,31);
+       &psrld          ($T2,31);               #
+       &movdqa         ($T3,$T1);
+       &pslldq         ($T1,4);
+       &psrldq         ($T3,12);               #
+       &pslldq         ($T2,4);
+       &por            ($Xhi,$T3);             #
+       &por            ($Xi,$T1);
+       &por            ($Xhi,$T2);             #
+
+       # 1st phase
+       &movdqa         ($T1,$Xi);
+       &movdqa         ($T2,$Xi);
+       &movdqa         ($T3,$Xi);              #
+       &pslld          ($T1,31);
+       &pslld          ($T2,30);
+       &pslld          ($Xi,25);               #
+       &pxor           ($T1,$T2);
+       &pxor           ($T1,$Xi);              #
+       &movdqa         ($T2,$T1);              #
+       &pslldq         ($T1,12);
+       &psrldq         ($T2,4);                #
+       &pxor           ($T3,$T1);
+
+       # 2nd phase
+       &pxor           ($Xhi,$T3);             #
+       &movdqa         ($Xi,$T3);
+       &movdqa         ($T1,$T3);
+       &psrld          ($Xi,1);                #
+       &psrld          ($T1,2);
+       &psrld          ($T3,7);                #
+       &pxor           ($Xi,$T1);
+       &pxor           ($Xhi,$T2);
+       &pxor           ($Xi,$T3);              #
+       &pxor           ($Xi,$Xhi);             #
+}
+
+# gcm_init_clmul, Algorithm 5 reference variant: the first argument is
+# loaded into $Htbl (output table), the second into $Xip (pointer to
+# the 16-byte hash key H).  Stores the dword-swapped H at offset 0 and
+# its square at offset 16 of the table.  Unlike the Algorithm 9
+# variant, H is not pre-shifted here.
+&function_begin_B("gcm_init_clmul");
+       &mov            ($Htbl,&wparam(0));
+       &mov            ($Xip,&wparam(1));
+
+       # call/pop idiom: run-time address of the "bswap" constants
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Hkey,&QWP(0,$Xip));
+       &pshufd         ($Hkey,$Hkey,0b01001110);# dword swap
+
+       # calculate H^2
+       &movdqa         ($Xi,$Hkey);
+       &clmul64x64_T3  ($Xhi,$Xi,$Hkey);
+       &reduction_alg5 ($Xhi,$Xi);
+
+       &movdqu         (&QWP(0,$Htbl),$Hkey);  # save H
+       &movdqu         (&QWP(16,$Htbl),$Xi);   # save H^2
+
+       &ret            ();
+&function_end_B("gcm_init_clmul");
+
+# gcm_gmult_clmul, Algorithm 5 reference variant: the first argument
+# is loaded into $Xip (pointer to the 16-byte hash value Xi, updated
+# in place), the second into $Htbl (table whose first 16 bytes are the
+# multiplier).  Performs a single multiply-and-reduce of Xi.
+&function_begin_B("gcm_gmult_clmul");
+       &mov            ($Xip,&wparam(0));
+       &mov            ($Htbl,&wparam(1));
+
+       # call/pop idiom: run-time address of the "bswap" constants
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       # the byte-reversal mask is kept in $Xn because clmul64x64_T3
+       # and reduction_alg5 clobber $T1-$T3
+       &movdqu         ($Xi,&QWP(0,$Xip));
+       &movdqa         ($Xn,&QWP(0,$const));
+       &movdqu         ($Hkey,&QWP(0,$Htbl));
+       &pshufb         ($Xi,$Xn);
+
+       &clmul64x64_T3  ($Xhi,$Xi,$Hkey);
+       &reduction_alg5 ($Xhi,$Xi);
+
+       &pshufb         ($Xi,$Xn);
+       &movdqu         (&QWP(0,$Xip),$Xi);
+
+       &ret    ();
+&function_end_B("gcm_gmult_clmul");
+
+# gcm_ghash_clmul, Algorithm 5 reference variant: arguments, in the
+# order loaded below, are the Xi pointer (updated in place), the
+# Htable pointer (H at offset 0, H^2 at offset 16), the input pointer
+# and the input length in bytes.  Same two-blocks-per-iteration
+# structure as the Algorithm 9 variant, but without hand interleaving.
+# Because clmul64x64_T3/reduction_alg5 clobber $T3, the byte-reversal
+# mask is reloaded from the constants after each reduction.
+# NOTE(review): the length is assumed to be a non-zero multiple of 16;
+# nothing here checks it - confirm against the callers.
+&function_begin("gcm_ghash_clmul");
+       &mov            ($Xip,&wparam(0));
+       &mov            ($Htbl,&wparam(1));
+       &mov            ($inp,&wparam(2));
+       &mov            ($len,&wparam(3));
+
+       # call/pop idiom: run-time address of the "bswap" constants
+       &call           (&label("pic"));
+&set_label("pic");
+       &blindpop       ($const);
+       &lea            ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+       &movdqu         ($Xi,&QWP(0,$Xip));
+       &movdqa         ($T3,&QWP(0,$const));
+       &movdqu         ($Hkey,&QWP(0,$Htbl));
+       &pshufb         ($Xi,$T3);
+
+       &sub            ($len,0x10);
+       &jz             (&label("odd_tail"));
+
+       #######
+       # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+       #       [(H*Ii+1) + (H*Xi+1)] mod P =
+       #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+       #
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &movdqu         ($Xn,&QWP(16,$inp));    # Ii+1
+       &pshufb         ($T1,$T3);
+       &pshufb         ($Xn,$T3);
+       &pxor           ($Xi,$T1);              # Ii+Xi
+
+       &clmul64x64_T3  ($Xhn,$Xn,$Hkey);       # H*Ii+1
+       &movdqu         ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+       # lea does not modify flags, so jbe still tests the sub result
+       &sub            ($len,0x20);
+       &lea            ($inp,&DWP(32,$inp));   # i+=2
+       &jbe            (&label("even_tail"));
+
+&set_label("mod_loop");
+       &clmul64x64_T3  ($Xhi,$Xi,$Hkey);       # H^2*(Ii+Xi)
+       &movdqu         ($Hkey,&QWP(0,$Htbl));  # load H
+
+       &pxor           ($Xi,$Xn);              # (H*Ii+1) + H^2*(Ii+Xi)
+       &pxor           ($Xhi,$Xhn);
+
+       &reduction_alg5 ($Xhi,$Xi);
+
+       #######
+       &movdqa         ($T3,&QWP(0,$const));
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &movdqu         ($Xn,&QWP(16,$inp));    # Ii+1
+       &pshufb         ($T1,$T3);
+       &pshufb         ($Xn,$T3);
+       &pxor           ($Xi,$T1);              # Ii+Xi
+
+       &clmul64x64_T3  ($Xhn,$Xn,$Hkey);       # H*Ii+1
+       &movdqu         ($Hkey,&QWP(16,$Htbl)); # load H^2
+
+       &sub            ($len,0x20);
+       &lea            ($inp,&DWP(32,$inp));
+       &ja             (&label("mod_loop"));
+
+&set_label("even_tail");
+       &clmul64x64_T3  ($Xhi,$Xi,$Hkey);       # H^2*(Ii+Xi)
+
+       &pxor           ($Xi,$Xn);              # (H*Ii+1) + H^2*(Ii+Xi)
+       &pxor           ($Xhi,$Xhn);
+
+       &reduction_alg5 ($Xhi,$Xi);
+
+       # $len is zero here only when exactly one more block remains
+       &movdqa         ($T3,&QWP(0,$const));
+       &test           ($len,$len);
+       &jnz            (&label("done"));
+
+       &movdqu         ($Hkey,&QWP(0,$Htbl));  # load H
+&set_label("odd_tail");
+       &movdqu         ($T1,&QWP(0,$inp));     # Ii
+       &pshufb         ($T1,$T3);
+       &pxor           ($Xi,$T1);              # Ii+Xi
+
+       &clmul64x64_T3  ($Xhi,$Xi,$Hkey);       # H*(Ii+Xi)
+       &reduction_alg5 ($Xhi,$Xi);
+
+       &movdqa         ($T3,&QWP(0,$const));
+&set_label("done");
+       &pshufb         ($Xi,$T3);
+       &movdqu         (&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+}
+\f
+# Constants for the PCLMULQDQ code, requested with alignment 64.
+# Bytes 0-15: pshufb mask that reverses the byte order of a 16-byte
+# value (loaded from offset 0 of $const).  Bytes 16-31: the constant
+# and-ed with the carry mask in the "<<1 twist" of gcm_init_clmul
+# (loaded from offset 16 of $const).
+&set_label("bswap",64);
+       &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+       &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
+}}     # $sse2
+
+# rem_4bit: 16 entries, each emitted as a pair of words (0, value<<$S).
+# The values are the carry-less multiples of 0x1C20 for indices 0..15
+# (e.g. entry 3 = 0x1C20^0x3840 = 0x2460).
+# NOTE(review): $S and the code that indexes this table are defined
+# earlier in the file, outside this excerpt - presumably the "4-bit"
+# routines; confirm there.
+&set_label("rem_4bit",64);
+       &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+       &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+       &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+       &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
+# rem_8bit: 256 16-bit entries, eight per line.  Entry i is the
+# carry-less multiple of 0x01C2 by i (e.g. entry 2 = 0x0384,
+# entry 3 = 0x01C2^0x0384 = 0x0246, entry 128 = 0xE100).
+# NOTE(review): the code that indexes this table is outside this
+# excerpt - presumably the byte-at-a-time MMX routine; confirm there.
+&set_label("rem_8bit",64);
+       &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
+       &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
+       &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
+       &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
+       &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
+       &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
+       &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
+       &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
+       &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
+       &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
+       &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
+       &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
+       &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
+       &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
+       &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
+       &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
+       &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
+       &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
+       &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
+       &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
+       &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
+       &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
+       &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
+       &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
+       &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
+       &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
+       &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
+       &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
+       &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
+       &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
+       &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
+       &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}}}    # !$x86only
+
+# Final statements of the generator: an identification string followed
+# by &asm_finish().  NOTE(review): both helpers come from the perlasm
+# framework loaded earlier in the file - presumably asciz embeds the
+# string in the object and asm_finish writes out the accumulated
+# output; confirm in x86asm.pl.
+&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&asm_finish();
+
+# A question was raised about choice of vanilla MMX. Or rather why wasn't
+# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
+# CPUs such as PIII, "4-bit" MMX version was observed to provide better
+# performance than *corresponding* SSE2 one even on contemporary CPUs.
+# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
+# implementation featuring full range of lookup-table sizes, but with
+# per-invocation lookup table setup. Latter means that table size is
+# chosen depending on how much data is to be hashed in every given call,
+# more data - larger table. Best reported result for Core2 is ~4 cycles
+# per processed byte out of 64KB block. This number accounts even for
+# 64KB table setup overhead. As discussed in gcm128.c we choose to be
+# more conservative in respect to lookup table sizes, but how do the
+# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
+# on same platform. As also discussed in gcm128.c, next in line "8-bit
+# Shoup's" or "4KB" method should deliver twice the performance of
+# "256B" one, in other words not worse than ~6 cycles per byte. It
+# should also be noted that in SSE2 case improvement can be "super-
+# linear," i.e. more than twice, mostly because >>8 maps to single
+# instruction on SSE2 register. This is unlike "4-bit" case when >>4
+# maps to same amount of instructions in both MMX and SSE2 cases.
+# Bottom line is that switch to SSE2 is considered to be justifiable
+# only in case we choose to implement "8-bit" method...
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
new file mode 100644 (file)
index 0000000..a5ae180
--- /dev/null
@@ -0,0 +1,805 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that
+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+# function features so called "528B" variant utilizing additional
+# 256+16 bytes of per-key storage [+512 bytes shared table].
+# Performance results are for this streamed GHASH subroutine and are
+# expressed in cycles per processed byte, less is better:
+#
+#              gcc 3.4.x(*)    assembler
+#
+# P4           28.6            14.0            +100%
+# Opteron      19.3            7.7             +150%
+# Core2                17.8            8.1(**)         +120%
+#
+# (*)  comparison is not completely fair, because C results are
+#      for vanilla "256B" implementation, while assembler results
+#      are for "528B";-)
+# (**) it's a mystery [to me] why Core2 result is not the same as for
+#      Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+# Command line: [flavour] output.  A first argument containing a dot
+# is taken to be the output file name, in which case no flavour was
+# given.
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+# Win64 argument order is selected by a nasm/masm/mingw64 flavour or
+# by an output file ending in .asm.
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Locate x86_64-xlate.pl next to this script or in ../../perlasm/.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# Everything printed to STDOUT is piped through the translator.  The
+# interpreter and translator paths are quoted so that locations
+# containing spaces work, and a failed pipe open is fatal instead of
+# being silently ignored.
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
+       or die "can't call $xlate: $!";
+
+# Register layout shared by the 4-bit routines: $nlo/$nhi are table
+# indices, $Zhi:$Zlo the accumulator, $tmp scratch and $rem_4bit the
+# address of the remainder table.
+($nlo,$nhi)=("%rax","%rbx");
+($Zlo,$Zhi)=("%r8","%r9");
+($tmp,$rem_4bit)=("%r10","%r11");
+
+# Argument registers: hash value pointer and table pointer.
+($Xi,$Htbl)=("%rdi","%rsi");
+
+# Layout specific to gcm_gmult_4bit: byte counter and remainder index.
+($cnt,$rem)=("%rcx","%rdx");
+
+# LB(reg): return the AT&T name of the low-byte sub-register of a
+# 32- or 64-bit general-purpose register, e.g. %rax/%eax -> %al,
+# %rsi -> %sil, %rbp -> %bpl, %r10/%r10d -> %r10b.  The first pattern
+# that matches wins; input matching none of them is returned
+# unchanged.  Always invoke as &LB(...), which bypasses the empty
+# prototype.  The replacements use ${1} rather than the deprecated \1
+# form.
+sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/  or
+                       $r =~ s/%[er]([sd]i)/%${1}l/    or
+                       $r =~ s/%[er](bp)/%${1}l/       or
+                       $r =~ s/%(r[0-9]+)[d]?/%${1}b/; $r; }
+
+# Catch-all for otherwise undefined subs, so that instructions can be
+# written in 32-bit perlasm style as &mnemonic(dst,...,src).  The sub
+# name becomes the mnemonic, the operands are emitted in reverse
+# (source first) order and a purely numeric last operand is turned
+# into an immediate by prefixing it with '$'.  The line is appended to
+# the global $code.
+sub AUTOLOAD()         # thunk [simplified] 32-bit style perlasm
+{ (my $mnemonic = $AUTOLOAD) =~ s/.*:://;      # drop package qualifier
+  my $last = pop @_;
+  if ($last*1 eq $last) { $last = "\$$last"; } # numeric => immediate
+  my @operands = ($last, reverse @_);
+    $code .= "\t$mnemonic\t".join(',',@operands)."\n";
+}
+\f
+# loop($inp): append to $code the nibble-at-a-time multiplication loop
+# shared by the 4-bit routines.  $inp names the register holding the
+# address of the 16-byte value whose bytes 14..0 are fetched inside
+# the loop; the caller must already have loaded byte 15 into $Zlo and
+# the remainder table address into $rem_4bit (as gcm_gmult_4bit does).
+# The emitted code leaves the byte-swapped result in $Zhi:$Zlo and
+# uses $nlo, $nhi, $tmp, $cnt and $rem as scratch.
+# $N is a private counter that makes the .Loop/.Lbreak labels unique
+# for every expansion.
+{ my $N;
+  sub loop() {
+  my $inp = shift;
+
+       $N++;
+$code.=<<___;
+       xor     $nlo,$nlo
+       xor     $nhi,$nhi
+       mov     `&LB("$Zlo")`,`&LB("$nlo")`
+       mov     `&LB("$Zlo")`,`&LB("$nhi")`
+       shl     \$4,`&LB("$nlo")`
+       mov     \$14,$cnt
+       mov     8($Htbl,$nlo),$Zlo
+       mov     ($Htbl,$nlo),$Zhi
+       and     \$0xf0,`&LB("$nhi")`
+       mov     $Zlo,$rem
+       jmp     .Loop$N
+
+.align 16
+.Loop$N:
+       shr     \$4,$Zlo
+       and     \$0xf,$rem
+       mov     $Zhi,$tmp
+       mov     ($inp,$cnt),`&LB("$nlo")`
+       shr     \$4,$Zhi
+       xor     8($Htbl,$nhi),$Zlo
+       shl     \$60,$tmp
+       xor     ($Htbl,$nhi),$Zhi
+       mov     `&LB("$nlo")`,`&LB("$nhi")`
+       xor     ($rem_4bit,$rem,8),$Zhi
+       mov     $Zlo,$rem
+       shl     \$4,`&LB("$nlo")`
+       xor     $tmp,$Zlo
+       dec     $cnt
+       js      .Lbreak$N
+
+       shr     \$4,$Zlo
+       and     \$0xf,$rem
+       mov     $Zhi,$tmp
+       shr     \$4,$Zhi
+       xor     8($Htbl,$nlo),$Zlo
+       shl     \$60,$tmp
+       xor     ($Htbl,$nlo),$Zhi
+       and     \$0xf0,`&LB("$nhi")`
+       xor     ($rem_4bit,$rem,8),$Zhi
+       mov     $Zlo,$rem
+       xor     $tmp,$Zlo
+       jmp     .Loop$N
+
+.align 16
+.Lbreak$N:
+       shr     \$4,$Zlo
+       and     \$0xf,$rem
+       mov     $Zhi,$tmp
+       shr     \$4,$Zhi
+       xor     8($Htbl,$nlo),$Zlo
+       shl     \$60,$tmp
+       xor     ($Htbl,$nlo),$Zhi
+       and     \$0xf0,`&LB("$nhi")`
+       xor     ($rem_4bit,$rem,8),$Zhi
+       mov     $Zlo,$rem
+       xor     $tmp,$Zlo
+
+       shr     \$4,$Zlo
+       and     \$0xf,$rem
+       mov     $Zhi,$tmp
+       shr     \$4,$Zhi
+       xor     8($Htbl,$nhi),$Zlo
+       shl     \$60,$tmp
+       xor     ($Htbl,$nhi),$Zhi
+       xor     $tmp,$Zlo
+       xor     ($rem_4bit,$rem,8),$Zhi
+
+       bswap   $Zlo
+       bswap   $Zhi
+___
+}}
+
+# gcm_gmult_4bit: declared with two arguments, the hash value pointer
+# in $Xi and the table pointer in $Htbl.  This is the first chunk of
+# output, hence the plain assignment to $code rather than an append.
+# Prologue: push three registers, load byte 15 of Xi and the address
+# of .Lrem_4bit as required by &loop.
+$code=<<___;
+.text
+
+.globl gcm_gmult_4bit
+.type  gcm_gmult_4bit,\@function,2
+.align 16
+gcm_gmult_4bit:
+       push    %rbx
+       push    %rbp            # %rbp and %r12 are pushed exclusively in
+       push    %r12            # order to reuse Win64 exception handler...
+.Lgmult_prologue:
+
+       movzb   15($Xi),$Zlo
+       lea     .Lrem_4bit(%rip),$rem_4bit
+___
+       # multiply; Xi itself is the byte source
+       &loop   ($Xi);
+# Epilogue: store the result back to Xi.  Only %rbx is reloaded from
+# the stack; the code emitted by &loop does not touch %rbp or %r12, so
+# the stack pointer is simply moved past their slots.
+$code.=<<___;
+       mov     $Zlo,8($Xi)
+       mov     $Zhi,($Xi)
+
+       mov     16(%rsp),%rbx
+       lea     24(%rsp),%rsp
+.Lgmult_epilogue:
+       ret
+.size  gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+\f
+# per-function register layout
+# ($inp/$len are the third and fourth argument registers; $rem_8bit
+# shares a register with $rem_4bit)
+$inp="%rdx";
+$len="%rcx";
+$rem_8bit=$rem_4bit;
+
+# gcm_ghash_4bit prologue, declared with four arguments: push six
+# registers, reserve 280 bytes of stack and move the input pointer and
+# length to %r14/%r15, because %rdx and %rcx are reused as working
+# registers by the body generated below.
+$code.=<<___;
+.globl gcm_ghash_4bit
+.type  gcm_ghash_4bit,\@function,4
+.align 16
+gcm_ghash_4bit:
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       sub     \$280,%rsp
+.Lghash_prologue:
+       mov     $inp,%r14               # reassign couple of args
+       mov     $len,%r15
+___
+# Body of gcm_ghash_4bit, generated through the AUTOLOAD thunk.
+# Local layout: $inp/$len are the relocated arguments, $dat holds 32
+# bits of the current block, @nhi and @rem are register pairs that are
+# rotated every step, and $Hshr4 addresses a table on the stack.
+{ my $inp="%r14";
+  my $dat="%edx";
+  my $len="%r15";
+  my @nhi=("%ebx","%ecx");
+  my @rem=("%r12","%r13");
+  my $Hshr4="%rbp";
+
+       # Table setup, software-pipelined over the 16 entries of Htable:
+       # each 128-bit entry is shifted right by 4 bits and stored in
+       # two halves around $Hshr4, while the low byte of each entry,
+       # shifted left by 4, is stored as a byte at 0..15(%rsp).
+       # $Htbl is biased by 128 for the duration of the setup.
+       &sub    ($Htbl,-128);           # size optimization
+       &lea    ($Hshr4,"16+128(%rsp)");
+       { my @lo =($nlo,$nhi);
+          my @hi =($Zlo,$Zhi);
+
+         &xor  ($dat,$dat);
+         for ($i=0,$j=-2;$i<18;$i++,$j++) {
+           &mov        ("$j(%rsp)",&LB($dat))          if ($i>1);
+           &or         ($lo[0],$tmp)                   if ($i>1);
+           &mov        (&LB($dat),&LB($lo[1]))         if ($i>0 && $i<17);
+           &shr        ($lo[1],4)                      if ($i>0 && $i<17);
+           &mov        ($tmp,$hi[1])                   if ($i>0 && $i<17);
+           &shr        ($hi[1],4)                      if ($i>0 && $i<17);
+           &mov        ("8*$j($Hshr4)",$hi[0])         if ($i>1);
+           &mov        ($hi[0],"16*$i+0-128($Htbl)")   if ($i<16);
+           &shl        (&LB($dat),4)                   if ($i>0 && $i<17);
+           &mov        ("8*$j-128($Hshr4)",$lo[0])     if ($i>1);
+           &mov        ($lo[0],"16*$i+8-128($Htbl)")   if ($i<16);
+           &shl        ($tmp,60)                       if ($i>0 && $i<17);
+
+           push        (@lo,shift(@lo));
+           push        (@hi,shift(@hi));
+         }
+       }
+       # undo the bias applied to $Htbl above
+       &add    ($Htbl,-128);
+       &mov    ($Zlo,"8($Xi)");
+       &mov    ($Zhi,"0($Xi)");
+       &add    ($len,$inp);            # pointer to the end of data
+       &lea    ($rem_8bit,".Lrem_8bit(%rip)");
+       &jmp    (".Louter_loop");
+
+# One iteration per 16-byte input block: xor the block into the hash
+# value, write it back to Xi and then walk it a byte at a time (the
+# top 32 bits are taken from %rdx, the rest is reloaded from Xi).
+$code.=".align 16\n.Louter_loop:\n";
+       &xor    ($Zhi,"($inp)");
+       &mov    ("%rdx","8($inp)");
+       &lea    ($inp,"16($inp)");
+       &xor    ("%rdx",$Zlo);
+       &mov    ("($Xi)",$Zhi);
+       &mov    ("8($Xi)","%rdx");
+       &shr    ("%rdx",32);
+
+       &xor    ($nlo,$nlo);
+       &rol    ($dat,8);
+       &mov    (&LB($nlo),&LB($dat));
+       &movz   ($nhi[0],&LB($dat));
+       &shl    (&LB($nlo),4);
+       &shr    ($nhi[0],4);
+
+       # 15 unrolled, software-pipelined byte steps; @nhi and @rem are
+       # rotated at the end of every step, and the next 32 bits of Xi
+       # are loaded into $dat whenever the decremented $j is a multiple
+       # of 4.
+       for ($j=11,$i=0;$i<15;$i++) {
+           &rol        ($dat,8);
+           &xor        ($Zlo,"8($Htbl,$nlo)")                  if ($i>0);
+           &xor        ($Zhi,"($Htbl,$nlo)")                   if ($i>0);
+           &mov        ($Zlo,"8($Htbl,$nlo)")                  if ($i==0);
+           &mov        ($Zhi,"($Htbl,$nlo)")                   if ($i==0);
+
+           &mov        (&LB($nlo),&LB($dat));
+           &xor        ($Zlo,$tmp)                             if ($i>0);
+           &movzw      ($rem[1],"($rem_8bit,$rem[1],2)")       if ($i>0);
+
+           &movz       ($nhi[1],&LB($dat));
+           &shl        (&LB($nlo),4);
+           &movzb      ($rem[0],"(%rsp,$nhi[0])");
+
+           &shr        ($nhi[1],4)                             if ($i<14);
+           &and        ($nhi[1],0xf0)                          if ($i==14);
+           &shl        ($rem[1],48)                            if ($i>0);
+           &xor        ($rem[0],$Zlo);
+
+           &mov        ($tmp,$Zhi);
+           &xor        ($Zhi,$rem[1])                          if ($i>0);
+           &shr        ($Zlo,8);
+
+           &movz       ($rem[0],&LB($rem[0]));
+           &mov        ($dat,"$j($Xi)")                        if (--$j%4==0);
+           &shr        ($Zhi,8);
+
+           &xor        ($Zlo,"-128($Hshr4,$nhi[0],8)");
+           &shl        ($tmp,56);
+           &xor        ($Zhi,"($Hshr4,$nhi[0],8)");
+
+           unshift     (@nhi,pop(@nhi));               # "rotate" registers
+           unshift     (@rem,pop(@rem));
+       }
+       # drain the pipeline: last table lookups, a final 4-bit shift
+       # and the corresponding remainder
+       &movzw  ($rem[1],"($rem_8bit,$rem[1],2)");
+       &xor    ($Zlo,"8($Htbl,$nlo)");
+       &xor    ($Zhi,"($Htbl,$nlo)");
+
+       &shl    ($rem[1],48);
+       &xor    ($Zlo,$tmp);
+
+       &xor    ($Zhi,$rem[1]);
+       &movz   ($rem[0],&LB($Zlo));
+       &shr    ($Zlo,4);
+
+       &mov    ($tmp,$Zhi);
+       &shl    (&LB($rem[0]),4);
+       &shr    ($Zhi,4);
+
+       &xor    ($Zlo,"8($Htbl,$nhi[0])");
+       &movzw  ($rem[0],"($rem_8bit,$rem[0],2)");
+       &shl    ($tmp,60);
+
+       &xor    ($Zhi,"($Htbl,$nhi[0])");
+       &xor    ($Zlo,$tmp);
+       &shl    ($rem[0],48);
+
+       &bswap  ($Zlo);
+       &xor    ($Zhi,$rem[0]);
+
+       # loop until the input pointer reaches the end-of-data pointer
+       &bswap  ($Zhi);
+       &cmp    ($inp,$len);
+       &jb     (".Louter_loop");
+}
+# gcm_ghash_4bit epilogue: store the final hash value to Xi, then
+# reload the six registers pushed by the prologue from above the
+# 280-byte scratch area (in reverse push order) and release the frame.
+$code.=<<___;
+       mov     $Zlo,8($Xi)
+       mov     $Zhi,($Xi)
+
+       lea     280(%rsp),%rsi
+       mov     0(%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lghash_epilogue:
+       ret
+.size  gcm_ghash_4bit,.-gcm_ghash_4bit
+___
+\f
+######################################################################
+# PCLMULQDQ version.
+
+# The PCLMULQDQ routines are declared abi-omnipotent, so they pick
+# their argument registers themselves according to the target ABI.
+if ($win64)    { @_4args=("%rcx","%rdx","%r8", "%r9");  }      # Win64 order
+else           { @_4args=("%rdi","%rsi","%rdx","%rcx"); }      # Unix order
+
+# From here on $Xi names an SSE register: $Xhi:$Xi is the product,
+# $Hkey the multiplier and $T1-$T3 are scratch.
+($Xi,$Xhi,$Hkey)=("%xmm0","%xmm1","%xmm2");
+($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
+
+# clmul64x64_T2($Xhi,$Xi,$Hkey[,$modulo]): append to $code a carry-less
+# multiplication of the 128-bit register $Xi by $Hkey, built from
+# three PCLMULQDQ instructions; the 256-bit product is left in
+# $Xhi:$Xi and the file-level $T1/$T2 registers are clobbered.
+# When $modulo is defined the preparation sequence (copy of $Xi into
+# $Xhi and the low^high values in $T1/$T2) is NOT emitted; the caller
+# is then expected to have scheduled those instructions itself.
+sub clmul64x64_T2 {    # minimal register pressure
+my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+
+$code.=<<___ if (!defined($modulo));
+       movdqa          $Xi,$Xhi                #
+       pshufd          \$0b01001110,$Xi,$T1
+       pshufd          \$0b01001110,$Hkey,$T2
+       pxor            $Xi,$T1                 #
+       pxor            $Hkey,$T2
+___
+$code.=<<___;
+       pclmulqdq       \$0x00,$Hkey,$Xi        #######
+       pclmulqdq       \$0x11,$Hkey,$Xhi       #######
+       pclmulqdq       \$0x00,$T2,$T1          #######
+       pxor            $Xi,$T1                 #
+       pxor            $Xhi,$T1                #
+
+       movdqa          $T1,$T2                 #
+       psrldq          \$8,$T1
+       pslldq          \$8,$T2                 #
+       pxor            $T1,$Xhi
+       pxor            $T2,$Xi                 #
+___
+}
+
+# reduction_alg9($Xhi,$Xi): append to $code the two-phase reduction
+# that folds the 256-bit value in $Xhi:$Xi down to 128 bits.  The
+# reduced result is left in $Xi; $Xhi and the file-level $T1/$T2
+# registers are clobbered.  See ghash-x86.pl for background.
+sub reduction_alg9 {   # 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+       # 1st phase
+       movdqa          $Xi,$T1                 #
+       psllq           \$1,$Xi
+       pxor            $T1,$Xi                 #
+       psllq           \$5,$Xi                 #
+       pxor            $T1,$Xi                 #
+       psllq           \$57,$Xi                #
+       movdqa          $Xi,$T2                 #
+       pslldq          \$8,$Xi
+       psrldq          \$8,$T2                 #       
+       pxor            $T1,$Xi
+       pxor            $T2,$Xhi                #
+
+       # 2nd phase
+       movdqa          $Xi,$T2
+       psrlq           \$5,$Xi
+       pxor            $T2,$Xi                 #
+       psrlq           \$1,$Xi                 #
+       pxor            $T2,$Xi                 #
+       pxor            $Xhi,$T2
+       psrlq           \$1,$Xi                 #
+       pxor            $T2,$Xi                 #
+___
+}
+\f
+# Emit gcm_init_clmul(Htbl, Xi): load 16 bytes from the second argument,
+# swap its dwords, shift the value left by one bit with a conditional xor
+# of .L0x1c2_polynomial, square it with the two helpers, and store the
+# twisted value at 0(Htbl) and its square at 16(Htbl).
+{ my ($Htbl,$Xip)=@_4args;
+
+$code.=<<___;
+.globl gcm_init_clmul
+.type  gcm_init_clmul,\@abi-omnipotent
+.align 16
+gcm_init_clmul:
+       movdqu          ($Xip),$Hkey
+       pshufd          \$0b01001110,$Hkey,$Hkey        # dword swap
+
+       # <<1 twist
+       pshufd          \$0b11111111,$Hkey,$T2  # broadcast uppermost dword
+       movdqa          $Hkey,$T1
+       psllq           \$1,$Hkey
+       pxor            $T3,$T3                 #
+       psrlq           \$63,$T1
+       pcmpgtd         $T2,$T3                 # broadcast carry bit
+       pslldq          \$8,$T1
+       por             $T1,$Hkey               # H<<=1
+
+       # magic reduction
+       pand            .L0x1c2_polynomial(%rip),$T3
+       pxor            $T3,$Hkey               # if(carry) H^=0x1c2_polynomial
+
+       # calculate H^2
+       movdqa          $Hkey,$Xi
+___
+       # $Xi holds a copy of H, so multiplying by $Hkey squares it
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
+       &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+       movdqu          $Hkey,($Htbl)           # save H
+       movdqu          $Xi,16($Htbl)           # save H^2
+       ret
+.size  gcm_init_clmul,.-gcm_init_clmul
+___
+}
+
+# Emit gcm_gmult_clmul(Xi, Htbl): load the 16-byte value at Xi, reverse
+# its bytes with .Lbswap_mask, multiply it by the value at 0(Htbl) using
+# the two helpers, reverse the bytes again and store the result back to
+# Xi.
+{ my ($Xip,$Htbl)=@_4args;
+
+$code.=<<___;
+.globl gcm_gmult_clmul
+.type  gcm_gmult_clmul,\@abi-omnipotent
+.align 16
+gcm_gmult_clmul:
+       movdqu          ($Xip),$Xi
+       movdqa          .Lbswap_mask(%rip),$T3
+       movdqu          ($Htbl),$Hkey
+       pshufb          $T3,$Xi
+___
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
+       &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+       pshufb          $T3,$Xi
+       movdqu          $Xi,($Xip)
+       ret
+.size  gcm_gmult_clmul,.-gcm_gmult_clmul
+___
+}
+\f
+# Emit gcm_ghash_clmul(Xi, Htbl, inp, len): fold len bytes at inp into the
+# 16-byte value at Xi.  Input is consumed two 16-byte blocks at a time
+# using the values stored at 0(Htbl) and 16(Htbl) (H and H^2 in the
+# comments below), with a single-block tail.  Besides the file-level XMM
+# registers the routine uses %xmm6-%xmm10, named here.
+{ my ($Xip,$Htbl,$inp,$len)=@_4args;
+  my $Xn="%xmm6";
+  my $Xhn="%xmm7";
+  my $Hkey2="%xmm8";
+  my $T1n="%xmm9";
+  my $T2n="%xmm10";
+
+$code.=<<___;
+.globl gcm_ghash_clmul
+.type  gcm_ghash_clmul,\@abi-omnipotent
+.align 16
+gcm_ghash_clmul:
+___
+# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm10 there;
+# they are reloaded in the epilogue further down.  The instructions are
+# hand-encoded so that their lengths agree with the offsets listed in
+# .LSEH_info_gcm_ghash_clmul.  Note that the %xmm7 store uses the same
+# 0x0f,0x29 opcode as its neighbours, i.e. it is a movaps even though its
+# annotation reads movdqa.
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_ghash_clmul:
+       # I can't trust assembler to use specific encoding:-(
+       .byte   0x48,0x83,0xec,0x58             #sub    \$0x58,%rsp
+       .byte   0x0f,0x29,0x34,0x24             #movaps %xmm6,(%rsp)
+       .byte   0x0f,0x29,0x7c,0x24,0x10        #movdqa %xmm7,0x10(%rsp)
+       .byte   0x44,0x0f,0x29,0x44,0x24,0x20   #movaps %xmm8,0x20(%rsp)
+       .byte   0x44,0x0f,0x29,0x4c,0x24,0x30   #movaps %xmm9,0x30(%rsp)
+       .byte   0x44,0x0f,0x29,0x54,0x24,0x40   #movaps %xmm10,0x40(%rsp)
+___
+# len is biased by -0x10 right after the loads; a zero result means
+# exactly one block was supplied and control goes straight to .Lodd_tail.
+$code.=<<___;
+       movdqa          .Lbswap_mask(%rip),$T3
+
+       movdqu          ($Xip),$Xi
+       movdqu          ($Htbl),$Hkey
+       pshufb          $T3,$Xi
+
+       sub             \$0x10,$len
+       jz              .Lodd_tail
+
+       movdqu          16($Htbl),$Hkey2
+       #######
+       # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+       #       [(H*Ii+1) + (H*Xi+1)] mod P =
+       #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
+       #
+       movdqu          ($inp),$T1              # Ii
+       movdqu          16($inp),$Xn            # Ii+1
+       pshufb          $T3,$T1
+       pshufb          $T3,$Xn
+       pxor            $T1,$Xi                 # Ii+Xi
+___
+       &clmul64x64_T2  ($Xhn,$Xn,$Hkey);       # H*Ii+1
+# operand set-up for the H^2 multiplication, emitted by hand because the
+# helper is invoked below with $modulo set and therefore skips it
+$code.=<<___;
+       movdqa          $Xi,$Xhi                #
+       pshufd          \$0b01001110,$Xi,$T1
+       pshufd          \$0b01001110,$Hkey2,$T2
+       pxor            $Xi,$T1                 #
+       pxor            $Hkey2,$T2
+
+       lea             32($inp),$inp           # i+=2
+       sub             \$0x20,$len
+       jbe             .Leven_tail
+
+.Lmod_loop:
+___
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey2,1);    # H^2*(Ii+Xi)
+# Loop body: the more deeply indented instructions repeat the two-phase
+# sequence of reduction_alg9 on the current accumulator, interleaved with
+# the three pclmulqdq of the H multiplication for the next odd block
+# (using $Xn/$Xhn/$T1n/$T2n) and with the operand set-up for the next
+# H^2 multiplication.
+$code.=<<___;
+       movdqu          ($inp),$T1              # Ii
+       pxor            $Xn,$Xi                 # (H*Ii+1) + H^2*(Ii+Xi)
+       pxor            $Xhn,$Xhi
+
+       movdqu          16($inp),$Xn            # Ii+1
+       pshufb          $T3,$T1
+       pshufb          $T3,$Xn
+
+       movdqa          $Xn,$Xhn                #
+       pshufd          \$0b01001110,$Xn,$T1n
+       pshufd          \$0b01001110,$Hkey,$T2n
+       pxor            $Xn,$T1n                #
+       pxor            $Hkey,$T2n
+        pxor           $T1,$Xhi                # "Ii+Xi", consume early
+
+         movdqa        $Xi,$T1                 # 1st phase
+         psllq         \$1,$Xi
+         pxor          $T1,$Xi                 #
+         psllq         \$5,$Xi                 #
+         pxor          $T1,$Xi                 #
+       pclmulqdq       \$0x00,$Hkey,$Xn        #######
+         psllq         \$57,$Xi                #
+         movdqa        $Xi,$T2                 #
+         pslldq        \$8,$Xi
+         psrldq        \$8,$T2                 #       
+         pxor          $T1,$Xi
+         pxor          $T2,$Xhi                #
+
+       pclmulqdq       \$0x11,$Hkey,$Xhn       #######
+         movdqa        $Xi,$T2                 # 2nd phase
+         psrlq         \$5,$Xi
+         pxor          $T2,$Xi                 #
+         psrlq         \$1,$Xi                 #
+         pxor          $T2,$Xi                 #
+         pxor          $Xhi,$T2
+         psrlq         \$1,$Xi                 #
+         pxor          $T2,$Xi                 #
+
+       pclmulqdq       \$0x00,$T2n,$T1n        #######
+        movdqa         $Xi,$Xhi                #
+        pshufd         \$0b01001110,$Xi,$T1
+        pshufd         \$0b01001110,$Hkey2,$T2
+        pxor           $Xi,$T1                 #
+        pxor           $Hkey2,$T2
+
+       pxor            $Xn,$T1n                #
+       pxor            $Xhn,$T1n               #
+       movdqa          $T1n,$T2n               #
+       psrldq          \$8,$T1n
+       pslldq          \$8,$T2n                #
+       pxor            $T1n,$Xhn
+       pxor            $T2n,$Xn                #
+
+       lea             32($inp),$inp
+       sub             \$0x20,$len
+       ja              .Lmod_loop
+
+.Leven_tail:
+___
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey2,1);    # H^2*(Ii+Xi)
+$code.=<<___;
+       pxor            $Xn,$Xi                 # (H*Ii+1) + H^2*(Ii+Xi)
+       pxor            $Xhn,$Xhi
+___
+       &reduction_alg9 ($Xhi,$Xi);
+# Because of the -0x10 bias applied on entry, len == 0 at this point
+# means one more 16-byte block is pending (fall through to .Lodd_tail);
+# any other value means the input is exhausted.
+$code.=<<___;
+       test            $len,$len
+       jnz             .Ldone
+
+.Lodd_tail:
+       movdqu          ($inp),$T1              # Ii
+       pshufb          $T3,$T1
+       pxor            $T1,$Xi                 # Ii+Xi
+___
+       &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H*(Ii+Xi)
+       &reduction_alg9 ($Xhi,$Xi);
+$code.=<<___;
+.Ldone:
+       pshufb          $T3,$Xi
+       movdqu          $Xi,($Xip)
+___
+# Win64 only: reload the saved XMM registers and release the stack area
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6
+       movaps  0x10(%rsp),%xmm7
+       movaps  0x20(%rsp),%xmm8
+       movaps  0x30(%rsp),%xmm9
+       movaps  0x40(%rsp),%xmm10
+       add     \$0x58,%rsp
+___
+$code.=<<___;
+       ret
+.LSEH_end_gcm_ghash_clmul:
+.size  gcm_ghash_clmul,.-gcm_ghash_clmul
+___
+}
+
+$code.=<<___;
+.align 64
+.Lbswap_mask:
+       .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+       .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.align 64
+.type  .Lrem_4bit,\@object
+.Lrem_4bit:
+       .long   0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
+       .long   0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
+       .long   0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
+       .long   0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
+.type  .Lrem_8bit,\@object
+.Lrem_8bit:
+       .value  0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+       .value  0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+       .value  0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+       .value  0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+       .value  0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+       .value  0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+       .value  0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+       .value  0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+       .value  0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+       .value  0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+       .value  0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+       .value  0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+       .value  0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+       .value  0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+       .value  0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+       .value  0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+       .value  0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+       .value  0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+       .value  0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+       .value  0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+       .value  0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+       .value  0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+       .value  0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+       .value  0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+       .value  0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+       .value  0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+       .value  0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+       .value  0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+       .value  0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+       .value  0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+       .value  0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+       .value  0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 64
+___
+\f
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+#
+# Win64 only.  se_handler compares context->Rip with the two labels stored
+# as HandlerData (prologue and epilogue).  When Rip lies between them it
+# steps the saved stack pointer up by 24 bytes and copies %rbx, %rbp and
+# %r12 from the stack into the context.  In every case it then restores
+# Rsp/Rsi/Rdi in the context, copies the context to disp->ContextRecord,
+# calls RtlVirtualUnwind and returns ExceptionContinueSearch (1).
+#
+# The .pdata/.xdata sections register this handler for gcm_gmult_4bit and
+# gcm_ghash_4bit; gcm_ghash_clmul is described by raw unwind codes that
+# mirror its hand-encoded prologue.
+#
+# NOTE(review): the gcm_ghash_4bit epilogue in this file reloads six
+# registers from 280(%rsp) and releases 48 more bytes, which does not
+# match the 24-byte / three-register frame unwound here -- confirm
+# against both 4bit prologues.
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+.type  se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lin_prologue
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lin_prologue
+
+       lea     24(%rax),%rax           # adjust "rsp"
+
+       mov     -8(%rax),%rbx
+       mov     -16(%rax),%rbp
+       mov     -24(%rax),%r12
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+
+.Lin_prologue:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  se_handler,.-se_handler
+
+.section       .pdata
+.align 4
+       .rva    .LSEH_begin_gcm_gmult_4bit
+       .rva    .LSEH_end_gcm_gmult_4bit
+       .rva    .LSEH_info_gcm_gmult_4bit
+
+       .rva    .LSEH_begin_gcm_ghash_4bit
+       .rva    .LSEH_end_gcm_ghash_4bit
+       .rva    .LSEH_info_gcm_ghash_4bit
+
+       .rva    .LSEH_begin_gcm_ghash_clmul
+       .rva    .LSEH_end_gcm_ghash_clmul
+       .rva    .LSEH_info_gcm_ghash_clmul
+
+.section       .xdata
+.align 8
+.LSEH_info_gcm_gmult_4bit:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lgmult_prologue,.Lgmult_epilogue       # HandlerData
+.LSEH_info_gcm_ghash_4bit:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lghash_prologue,.Lghash_epilogue       # HandlerData
+.LSEH_info_gcm_ghash_clmul:
+       .byte   0x01,0x1f,0x0b,0x00
+       .byte   0x1f,0xa8,0x04,0x00     #movaps 0x40(rsp),xmm10
+       .byte   0x19,0x98,0x03,0x00     #movaps 0x30(rsp),xmm9
+       .byte   0x13,0x88,0x02,0x00     #movaps 0x20(rsp),xmm8
+       .byte   0x0d,0x78,0x01,0x00     #movaps 0x10(rsp),xmm7
+       .byte   0x08,0x68,0x00,0x00     #movaps (rsp),xmm6
+       .byte   0x04,0xa2,0x00,0x00     #sub    rsp,0x58
+___
+}
+\f
+# Replace every back-quoted expression in the generated text with the
+# result of evaluating it as Perl (e.g. the `0x1C20<<16` table entries
+# and `1232/8`), then write the finished assembly to STDOUT.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
index 8f8bd56..3d3782c 100644 (file)
@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
 #endif
 #include <assert.h>
 
-#define STRICT_ALIGNMENT 1
-#if defined(__i386) || defined(__i386__) || \
-    defined(__x86_64) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
-    defined(__s390__) || defined(__s390x__)
-#  undef STRICT_ALIGNMENT
+#ifndef STRICT_ALIGNMENT
 #  define STRICT_ALIGNMENT 0
 #endif
 
diff --git a/crypto/modes/ccm128.c b/crypto/modes/ccm128.c
new file mode 100644 (file)
index 0000000..c9b35e5
--- /dev/null
@@ -0,0 +1,441 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+#  define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+/* Session set-up: encode the M and L parameters into the flags octet of
+ * the nonce block and remember the key schedule together with the block
+ * function.  Needed once per key. */
+void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
+       unsigned int M,unsigned int L,void *key,block128_f block)
+{
+       u8 lfield = (u8)(L-1)&7;                /* goes to flags bits 0..2 */
+       u8 mfield = (u8)(((M-2)/2)&7);          /* goes to flags bits 3..5 */
+
+       ctx->key = key;
+       ctx->block = block;
+       ctx->blocks = 0;
+
+       memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
+       ctx->nonce.c[0] = lfield | mfield<<3;
+}
+
+/* !!! Following interfaces are to be called *once* per packet !!! */
+
+/* Install the per-message nonce and record the message length.
+ *
+ * L below is the raw value of the three low flag bits
+ * (CRYPTO_ccm128_init stores L-1 there), so 14-L is the number of nonce
+ * octets copied in after the flags octet.  The length is written
+ * big-endian into the tail of the block; the nonce copy may then
+ * overwrite its leading octets.
+ *
+ * Returns 0 on success, -1 if fewer than 14-L nonce octets are supplied. */
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
+       const unsigned char *nonce,size_t nlen,size_t mlen)
+{
+       unsigned int L = ctx->nonce.c[0]&7;     /* the L parameter */
+
+       if (nlen<(14-L)) return -1;             /* nonce is too short */
+
+       if (sizeof(mlen)==8 && L>=3) {
+               /* shift counts are reduced modulo the width of size_t so
+                * the expressions stay valid where size_t is 32 bits */
+               ctx->nonce.c[8]  = (u8)(mlen>>(56%(sizeof(mlen)*8)));
+               ctx->nonce.c[9]  = (u8)(mlen>>(48%(sizeof(mlen)*8)));
+               ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
+               ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
+       }
+       else
+               /* clear octets 8..11 bytewise; storing through a u32
+                * pointer into the u8 array would break the aliasing
+                * rules */
+               memset(&ctx->nonce.c[8],0,4);
+
+       ctx->nonce.c[12] = (u8)(mlen>>24);
+       ctx->nonce.c[13] = (u8)(mlen>>16);
+       ctx->nonce.c[14] = (u8)(mlen>>8);
+       ctx->nonce.c[15] = (u8)mlen;
+
+       ctx->nonce.c[0] &= ~0x40;       /* clear Adata flag */
+       memcpy(&ctx->nonce.c[1],nonce,14-L);
+
+       return 0;
+}
+
+/* Optional step: absorb additional authenticated data into the CBC-MAC.
+ * Sets the Adata flag, encrypts the first MAC block, mixes in the encoded
+ * length of the data (2, 6 or 10 octets depending on its size) followed
+ * by the data itself, encrypting after every 16 octets and once more at
+ * the end.  A zero alen leaves the context untouched. */
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
+       const unsigned char *aad,size_t alen)
+{
+       block128_f      cipher  = ctx->block;
+       void            *ks     = ctx->key;
+       u8              *mac    = ctx->cmac.c;
+       unsigned int    pos;
+
+       if (!alen) return;
+
+       ctx->nonce.c[0] |= 0x40;        /* Adata flag on */
+       (*cipher)(ctx->nonce.c,mac,ks);
+       ctx->blocks++;
+
+       if (alen<(0x10000-0x100)) {
+               /* short form: two length octets */
+               mac[0] ^= (u8)(alen>>8);
+               mac[1] ^= (u8)alen;
+               pos=2;
+       }
+       else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
+               /* 0xFFFF marker followed by eight length octets */
+               mac[0] ^= 0xFF;
+               mac[1] ^= 0xFF;
+               mac[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
+               mac[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
+               mac[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
+               mac[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
+               mac[6] ^= (u8)(alen>>24);
+               mac[7] ^= (u8)(alen>>16);
+               mac[8] ^= (u8)(alen>>8);
+               mac[9] ^= (u8)alen;
+               pos=10;
+       }
+       else {
+               /* 0xFFFE marker followed by four length octets */
+               mac[0] ^= 0xFF;
+               mac[1] ^= 0xFE;
+               mac[2] ^= (u8)(alen>>24);
+               mac[3] ^= (u8)(alen>>16);
+               mac[4] ^= (u8)(alen>>8);
+               mac[5] ^= (u8)alen;
+               pos=6;
+       }
+
+       for (;;) {
+               /* top up the current block with as much data as fits */
+               while (pos<16 && alen) {
+                       mac[pos++] ^= *aad++;
+                       --alen;
+               }
+               (*cipher)(mac,mac,ks);
+               ctx->blocks++;
+               if (!alen) break;
+               pos=0;
+       }
+}
+
+/* Finally you encrypt or decrypt the message */
+
+/* The counter field is at most L*8 bits wide and L never exceeds 8, so
+ * the trailing eight octets of the block are treated as one big-endian
+ * 64-bit counter and bumped by one, wrapping silently. */
+static void ctr64_inc(unsigned char *counter) {
+       unsigned char *first = counter+8;       /* most significant octet */
+       unsigned char *p     = counter+16;      /* one past the last octet */
+
+       while (p != first) {
+               --p;
+               /* a non-zero octet after the bump means no carry is left */
+               if (++*p != 0) break;
+       }
+}
+
+/* Encrypt len octets from inp to out in counter mode while folding the
+ * plaintext into the CBC-MAC kept in ctx->cmac.
+ *
+ * Returns 0 on success, -1 when len differs from the message length read
+ * back from the nonce block, -2 when ctx->blocks exceeds 2^61.
+ *
+ * NOTE(review): on the -1/-2 paths the nonce block has already been
+ * rewritten as a counter block and flags0 is not put back. */
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
+       const unsigned char *inp, unsigned char *out,
+       size_t len)
+{
+       size_t          n;
+       unsigned int    i,L;
+       unsigned char   flags0  = ctx->nonce.c[0];
+       block128_f      block   = ctx->block;
+       void *          key     = ctx->key;
+       union { u64 u[2]; u8 c[16]; } scratch;
+
+       /* Adata flag clear: the first MAC block has not been encrypted
+        * yet, so do it now */
+       if (!(flags0&0x40))
+               (*block)(ctx->nonce.c,ctx->cmac.c,key),
+               ctx->blocks++;
+
+       /* turn the nonce block into the counter block: flags keep only
+        * the L bits, the length field is read back and zeroed, and the
+        * counter starts at 1 */
+       ctx->nonce.c[0] = L = flags0&7;
+       for (n=0,i=15-L;i<15;++i) {
+               n |= ctx->nonce.c[i];
+               ctx->nonce.c[i]=0;
+               n <<= 8;
+       }
+       n |= ctx->nonce.c[15];  /* reconstructed length */
+       ctx->nonce.c[15]=1;
+
+       if (n!=len) return -1;  /* length mismatch */
+
+       /* presumably two cipher calls per 16-octet block plus the final
+        * one for the MAC -- TODO confirm the intended accounting */
+       ctx->blocks += ((len+15)>>3)|1;
+       if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
+
+       while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+               /* go through an aligned temporary instead of dereferencing
+                * inp/out as u64 pointers */
+               union { u64 u[2]; u8 c[16]; } temp;
+
+               memcpy (temp.c,inp,16);
+               ctx->cmac.u[0] ^= temp.u[0];
+               ctx->cmac.u[1] ^= temp.u[1];
+#else
+               ctx->cmac.u[0] ^= ((u64*)inp)[0];
+               ctx->cmac.u[1] ^= ((u64*)inp)[1];
+#endif
+               /* MAC update first, then the key stream for this block */
+               (*block)(ctx->cmac.c,ctx->cmac.c,key);
+               (*block)(ctx->nonce.c,scratch.c,key);
+               ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+               temp.u[0] ^= scratch.u[0];
+               temp.u[1] ^= scratch.u[1];
+               memcpy(out,temp.c,16);
+#else
+               ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
+               ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
+#endif
+               inp += 16;
+               out += 16;
+               len -= 16;
+       }
+
+       /* trailing partial block: the plaintext is absorbed into the MAC
+        * before any output octet is written */
+       if (len) {
+               for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
+               (*block)(ctx->cmac.c,ctx->cmac.c,key);
+               (*block)(ctx->nonce.c,scratch.c,key);
+               for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
+       }
+
+       /* reset the counter field to zero and xor the encryption of that
+        * block into the MAC */
+       for (i=15-L;i<16;++i)
+               ctx->nonce.c[i]=0;
+
+       (*block)(ctx->nonce.c,scratch.c,key);
+       ctx->cmac.u[0] ^= scratch.u[0];
+       ctx->cmac.u[1] ^= scratch.u[1];
+
+       /* put the original flags octet back */
+       ctx->nonce.c[0] = flags0;
+
+       return 0;
+}
+
+/* Decrypt len octets from inp to out in counter mode while folding the
+ * recovered plaintext into the CBC-MAC kept in ctx->cmac.
+ *
+ * Returns 0 on success, -1 when len differs from the message length read
+ * back from the nonce block.  Unlike the encrypt routine this one does
+ * not update ctx->blocks.
+ *
+ * NOTE(review): on the -1 path the nonce block has already been rewritten
+ * as a counter block and flags0 is not put back. */
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
+       const unsigned char *inp, unsigned char *out,
+       size_t len)
+{
+       size_t          n;
+       unsigned int    i,L;
+       unsigned char   flags0  = ctx->nonce.c[0];
+       block128_f      block   = ctx->block;
+       void *          key     = ctx->key;
+       union { u64 u[2]; u8 c[16]; } scratch;
+
+       /* Adata flag clear: the first MAC block has not been encrypted
+        * yet, so do it now */
+       if (!(flags0&0x40))
+               (*block)(ctx->nonce.c,ctx->cmac.c,key);
+
+       /* turn the nonce block into the counter block: flags keep only
+        * the L bits, the length field is read back and zeroed, and the
+        * counter starts at 1 */
+       ctx->nonce.c[0] = L = flags0&7;
+       for (n=0,i=15-L;i<15;++i) {
+               n |= ctx->nonce.c[i];
+               ctx->nonce.c[i]=0;
+               n <<= 8;
+       }
+       n |= ctx->nonce.c[15];  /* reconstructed length */
+       ctx->nonce.c[15]=1;
+
+       if (n!=len) return -1;
+
+       while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+               /* aligned temporary used instead of dereferencing inp as a
+                * u64 pointer */
+               union { u64 u[2]; u8 c[16]; } temp;
+#endif
+               /* key stream first: the MAC is computed over the plaintext,
+                * which only exists after the xor below */
+               (*block)(ctx->nonce.c,scratch.c,key);
+               ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+               memcpy (temp.c,inp,16);
+               ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
+               ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
+               memcpy (out,scratch.c,16);
+#else
+               ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
+               ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
+#endif
+               (*block)(ctx->cmac.c,ctx->cmac.c,key);
+
+               inp += 16;
+               out += 16;
+               len -= 16;
+       }
+
+       /* trailing partial block */
+       if (len) {
+               (*block)(ctx->nonce.c,scratch.c,key);
+               for (i=0; i<len; ++i)
+                       ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
+               (*block)(ctx->cmac.c,ctx->cmac.c,key);
+       }
+
+       /* reset the counter field to zero and xor the encryption of that
+        * block into the MAC */
+       for (i=15-L;i<16;++i)
+               ctx->nonce.c[i]=0;
+
+       (*block)(ctx->nonce.c,scratch.c,key);
+       ctx->cmac.u[0] ^= scratch.u[0];
+       ctx->cmac.u[1] ^= scratch.u[1];
+
+       /* put the original flags octet back */
+       ctx->nonce.c[0] = flags0;
+
+       return 0;
+}
+
+/* Add inc to the big-endian 64-bit counter held in the trailing eight
+ * octets of the block; any carry out of the top octet is dropped. */
+static void ctr64_add (unsigned char *counter,size_t inc)
+{
+       unsigned char *first = counter+8;       /* most significant octet */
+       unsigned char *octet = counter+16;      /* one past the last octet */
+       size_t carry = 0;
+
+       do {
+               --octet;
+               carry += *octet;
+               carry += inc&0xff;
+               *octet = (unsigned char)carry;
+               carry >>= 8;            /* what spills into the next octet */
+               inc >>= 8;
+       } while (octet != first && (inc || carry));
+}
+
+/* Bulk flavour of CCM encryption: all whole 16-octet blocks are handed to
+ * the caller-supplied stream routine together with the counter block and
+ * the running MAC; only a trailing partial block is processed here.
+ *
+ * Returns 0 on success, -1 when len differs from the length read back
+ * from the nonce block, -2 when ctx->blocks exceeds 2^61. */
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
+       const unsigned char *inp, unsigned char *out,
+       size_t len,ccm128_f stream)
+{
+       union { u64 u[2]; u8 c[16]; } pad;
+       block128_f      cipher  = ctx->block;
+       void            *ks     = ctx->key;
+       unsigned char   saved   = ctx->nonce.c[0];
+       unsigned int    lbits   = saved&7;
+       unsigned int    k;
+       size_t          stored  = 0;
+       size_t          whole;
+
+       /* Adata flag clear: the first MAC block is still pending */
+       if ((saved&0x40)==0) {
+               (*cipher)(ctx->nonce.c,ctx->cmac.c,ks);
+               ctx->blocks++;
+       }
+
+       /* read the length field back while clearing it; the block then
+        * becomes the counter block with flags reduced to the L bits and
+        * the counter preset to 1 */
+       ctx->nonce.c[0] = (unsigned char)lbits;
+       for (k=15-lbits;k<=15;++k) {
+               stored = (stored<<8) | ctx->nonce.c[k];
+               ctx->nonce.c[k] = 0;
+       }
+       ctx->nonce.c[15] = 1;
+
+       if (stored!=len) return -1;
+
+       ctx->blocks += ((len+15)>>3)|1;
+       if (ctx->blocks > (U64(1)<<61)) return -2;
+
+       whole = len/16;
+       if (whole) {
+               size_t bytes = whole*16;
+
+               (*stream)(inp,out,whole,ks,ctx->nonce.c,ctx->cmac.c);
+               inp += bytes;
+               out += bytes;
+               len -= bytes;
+               /* the counter is stepped past the streamed blocks only
+                * when a tail still has to be encrypted */
+               if (len) ctr64_add(ctx->nonce.c,whole);
+       }
+
+       if (len) {
+               /* absorb the plaintext before any output is written */
+               for (k=0;k<len;++k)
+                       ctx->cmac.c[k] ^= inp[k];
+               (*cipher)(ctx->cmac.c,ctx->cmac.c,ks);
+               (*cipher)(ctx->nonce.c,pad.c,ks);
+               for (k=0;k<len;++k)
+                       out[k] = pad.c[k]^inp[k];
+       }
+
+       /* counter field back to zero; its encryption is xored into the MAC */
+       for (k=15-lbits;k<16;++k)
+               ctx->nonce.c[k] = 0;
+       (*cipher)(ctx->nonce.c,pad.c,ks);
+       ctx->cmac.u[0] ^= pad.u[0];
+       ctx->cmac.u[1] ^= pad.u[1];
+
+       ctx->nonce.c[0] = saved;
+       return 0;
+}
+
+/* Bulk flavour of CCM decryption: all whole 16-octet blocks are handed to
+ * the caller-supplied stream routine together with the counter block and
+ * the running MAC; only a trailing partial block is processed here.
+ *
+ * Returns 0 on success, -1 when len differs from the length read back
+ * from the nonce block.  ctx->blocks is left alone. */
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
+       const unsigned char *inp, unsigned char *out,
+       size_t len,ccm128_f stream)
+{
+       union { u64 u[2]; u8 c[16]; } pad;
+       block128_f      cipher  = ctx->block;
+       void            *ks     = ctx->key;
+       unsigned char   saved   = ctx->nonce.c[0];
+       unsigned int    lbits   = saved&7;
+       unsigned int    k;
+       size_t          stored  = 0;
+       size_t          whole;
+
+       /* Adata flag clear: the first MAC block is still pending */
+       if ((saved&0x40)==0)
+               (*cipher)(ctx->nonce.c,ctx->cmac.c,ks);
+
+       /* read the length field back while clearing it; the block then
+        * becomes the counter block with flags reduced to the L bits and
+        * the counter preset to 1 */
+       ctx->nonce.c[0] = (unsigned char)lbits;
+       for (k=15-lbits;k<=15;++k) {
+               stored = (stored<<8) | ctx->nonce.c[k];
+               ctx->nonce.c[k] = 0;
+       }
+       ctx->nonce.c[15] = 1;
+
+       if (stored!=len) return -1;
+
+       whole = len/16;
+       if (whole) {
+               size_t bytes = whole*16;
+
+               (*stream)(inp,out,whole,ks,ctx->nonce.c,ctx->cmac.c);
+               inp += bytes;
+               out += bytes;
+               len -= bytes;
+               /* the counter is stepped past the streamed blocks only
+                * when a tail still has to be decrypted */
+               if (len) ctr64_add(ctx->nonce.c,whole);
+       }
+
+       if (len) {
+               (*cipher)(ctx->nonce.c,pad.c,ks);
+               for (k=0;k<len;++k) {
+                       unsigned char plain = pad.c[k]^inp[k];
+
+                       out[k] = plain;
+                       ctx->cmac.c[k] ^= plain;
+               }
+               (*cipher)(ctx->cmac.c,ctx->cmac.c,ks);
+       }
+
+       /* counter field back to zero; its encryption is xored into the MAC */
+       for (k=15-lbits;k<16;++k)
+               ctx->nonce.c[k] = 0;
+       (*cipher)(ctx->nonce.c,pad.c,ks);
+       ctx->cmac.u[0] ^= pad.u[0];
+       ctx->cmac.u[1] ^= pad.u[1];
+
+       ctx->nonce.c[0] = saved;
+       return 0;
+}
+
+/* Copy the authentication tag out of the context.  The tag length is
+ * recovered from flags bits 3..5, which hold (M-2)/2.  Returns the number
+ * of octets written, or 0 (nothing copied) when the buffer of len octets
+ * is too small. */
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
+{
+       unsigned int taglen = 2*((ctx->nonce.c[0]>>3)&7) + 2;
+
+       if (len>=taglen) {
+               memcpy(tag,ctx->cmac.c,taglen);
+               return taglen;
+       }
+       return 0;
+}
index e5938c6..4e6f5d3 100644 (file)
@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
 #endif
 #include <assert.h>
 
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
-    defined(__x86_64) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
-    defined(__s390__) || defined(__s390x__)
-#  undef STRICT_ALIGNMENT
-#endif
-
 /* The input and output encrypted as though 128bit cfb mode is being
  * used.  The extra state information to record how much of the
  * 128bit block we have used is contained in *num;
index 181614a..ee642c5 100644 (file)
@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
 #endif
 #include <assert.h>
 
-typedef unsigned int u32;
-typedef unsigned char u8;
-
-#define STRICT_ALIGNMENT
-#if defined(__i386)    || defined(__i386__)    || \
-    defined(__x86_64)  || defined(__x86_64__)  || \
-    defined(_M_IX86)   || defined(_M_AMD64)    || defined(_M_X64) || \
-    defined(__s390__)  || defined(__s390x__)
-#  undef STRICT_ALIGNMENT
-#endif
-
 /* NOTE: the IV/counter CTR mode is big-endian.  The code itself
  * is endian-neutral. */
 
@@ -183,9 +173,6 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
        *num=n;
 }
 
-#define GETU32(p)      ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
-#define PUTU32(p,v)    ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
-
 /* increment upper 96 bits of 128-bit counter by 1 */
 static void ctr96_inc(unsigned char *counter) {
        u32 n=12;
index e0430f9..c0e1f36 100644 (file)
@@ -5,7 +5,8 @@
  * forms are granted according to the OpenSSL license.
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
@@ -23,8 +24,9 @@
  * deviates from mentioned RFCs. Most notably it allows input to be
  * of block length and it doesn't flip the order of the last two
  * blocks. CTS is being discussed even in ECB context, but it's not
- * adopted for any known application. This implementation complies
- * with mentioned RFCs and [as such] extends CBC mode.
+ * adopted for any known application. This implementation provides
+ * two interfaces: one compliant with above mentioned RFCs and one
+ * compliant with the NIST proposal, both extending CBC mode.
  */
 
 size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
@@ -54,6 +56,34 @@ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
        return len+residue;
 }
 
+size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
+                       size_t len, const void *key,
+                       unsigned char ivec[16], block128_f block)
+{      size_t tail, full, i;
+
+       assert (in && out && key && ivec);
+
+       /* inputs shorter than one 16-byte block are rejected */
+       if (len < 16) return 0;
+
+       tail = len%16;          /* bytes in the trailing partial block */
+       full = len-tail;        /* bytes covered by whole blocks */
+
+       /* the whole blocks are handed to CRYPTO_cbc128_encrypt */
+       CRYPTO_cbc128_encrypt(in,out,full,key,ivec,block);
+
+       if (tail) {
+               /* fold the partial block into ivec and encrypt it in place ... */
+               for (i=0; i<tail; ++i)
+                       ivec[i] ^= in[full+i];
+               (*block)(ivec,ivec,key);
+               /* ... then store it so it overlaps the last 16-tail bytes already written */
+               memcpy(out+full-16+tail,ivec,16);
+       }
+
+       return full+tail;
+}
+
 size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
                        size_t len, const void *key,
                        unsigned char ivec[16], cbc128_f cbc)
@@ -90,6 +120,41 @@ size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
        return len+residue;
 }
 
+size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t len, const void *key,
+                       unsigned char ivec[16], cbc128_f cbc)
+{      size_t residue;
+       union { size_t align; unsigned char c[16]; } tmp;       /* size_t member backs the word-wise stores below */
+       /* Encrypt len bytes through the caller's cbc routine, writing exactly len bytes to out; returns len, or 0 if len<16. */
+       assert (in && out && key && ivec);
+
+       if (len < 16) return 0; /* less than one full block: nothing is written */
+
+       residue=len%16;         /* bytes in the trailing partial block */
+
+       len -= residue;         /* len now counts whole blocks only */
+
+       (*cbc)(in,out,len,key,ivec,1);  /* last argument is 1 in every call here - presumably cbc128_f's encrypt flag (typedef not visible) */
+
+       if (residue==0) return len;     /* block-aligned input: no partial block to handle */
+
+       in  += len;
+       out += len;
+
+#if defined(CBC_HANDLES_TRUNCATED_IO)
+       (*cbc)(in,out-16+residue,residue,key,ivec,1);   /* relies on the callback accepting a length below 16 */
+#else
+       {
+       size_t n;
+       for (n=0; n<16; n+=sizeof(size_t))      /* clear tmp one size_t at a time */
+               *(size_t *)(tmp.c+n) = 0;
+       memcpy(tmp.c,in,residue);       /* tmp = partial block followed by zero bytes */
+       }
+       (*cbc)(tmp.c,out-16+residue,16,key,ivec,1);     /* 16 bytes ending at out+residue, overlapping the previous block's tail */
+#endif
+       return len+residue;     /* equals the length originally passed in */
+}
+
 size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
                        size_t len, const void *key,
                        unsigned char ivec[16], block128_f block)
@@ -125,7 +190,51 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
        for(residue+=16; n<residue; ++n)
                out[n] = tmp.c[n] ^ in[n];
 
-       return len+residue-16;
+       return 16+len+residue;
+}
+
+size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
+                       size_t len, const void *key,
+                       unsigned char ivec[16], block128_f block)
+{      size_t residue, n;
+       union { size_t align; unsigned char c[32]; } tmp;
+       /* Decrypt len bytes laid out as CRYPTO_nistcts128_encrypt_block writes them; returns len, or 0 if len<16. */
+       assert (in && out && key && ivec);
+
+       if (len<16) return 0;
+
+       residue=len%16;         /* bytes in the truncated block */
+
+       if (residue==0) {       /* block-aligned: handled entirely by CRYPTO_cbc128_decrypt */
+               CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
+               return len;
+       }
+
+       len -= 16+residue;      /* bytes that precede the final 16+residue */
+
+       if (len) {
+               CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
+               in  += len;
+               out += len;
+       }
+       /* in[] now holds residue bytes of the truncated block followed by the 16-byte final block */
+       (*block)(in+residue,tmp.c+16,key);
+       /* rebuild the truncated block: its missing tail is taken from the block just decrypted */
+       for (n=0; n<16; n+=sizeof(size_t))
+               *(size_t *)(tmp.c+n) = *(size_t *)(tmp.c+16+n);
+       memcpy(tmp.c,in,residue);
+       (*block)(tmp.c,tmp.c,key);
+
+       for(n=0; n<16; ++n) {
+               unsigned char c = in[n];        /* fetched before out[n] is stored */
+               out[n] = tmp.c[n] ^ ivec[n];
+               ivec[n] = in[n+residue];        /* new ivec = last 16 input bytes */
+               tmp.c[n] = c;   /* keep the raw input byte for the loop below */
+       }
+       for(residue+=16; n<residue; ++n)        /* residue is now 16 + truncated length */
+               out[n] = tmp.c[n] ^ tmp.c[n-16];
+       /* len was cut by 16+truncated length and residue raised by 16, so len+residue is the input length */
+       return len+residue;
 }
 
 size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
@@ -160,7 +269,47 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
        (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
        memcpy(out,tmp.c,16+residue);
 #endif
-       return len+residue;
+       return 16+len+residue;
+}
+
+size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
+                       size_t len, const void *key,
+                       unsigned char ivec[16], cbc128_f cbc)
+{      size_t residue, n;
+       union { size_t align; unsigned char c[32]; } tmp;
+       /* Decrypt len bytes through the caller's cbc routine (last argument 0 in every call); returns len, or 0 if len<16. */
+       assert (in && out && key && ivec);
+
+       if (len<16) return 0;
+
+       residue=len%16;         /* bytes in the truncated block */
+
+       if (residue==0) {       /* block-aligned: a single cbc call covers everything */
+               (*cbc)(in,out,len,key,ivec,0);
+               return len;
+       }
+
+       len -= 16+residue;      /* bytes that precede the final 16+residue */
+
+       if (len) {
+               (*cbc)(in,out,len,key,ivec,0);
+               in  += len;
+               out += len;
+       }
+
+       for (n=16; n<32; n+=sizeof(size_t))     /* zero tmp.c[16..31]; it serves as the iv of the next call */
+               *(size_t *)(tmp.c+n) = 0;
+       /* this leaves the final block in[residue..residue+15] at &tmp.c[16] (assuming cbc stores the last input block into its iv buffer) and its zero-iv decryption at &tmp.c[0] */
+       (*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0);
+
+       memcpy(tmp.c,in,residue);       /* first residue bytes come from the truncated block itself */
+#if defined(CBC_HANDLES_TRUNCATED_IO)
+       (*cbc)(tmp.c,out,16+residue,key,ivec,0);        /* relies on the callback accepting a non-multiple-of-16 length */
+#else
+       (*cbc)(tmp.c,tmp.c,32,key,ivec,0);      /* two blocks in place: rebuilt block, then final block */
+       memcpy(out,tmp.c,16+residue);   /* only 16+residue of the 32 bytes are output */
+#endif
+       return 16+len+residue;  /* residue is unchanged here, so this is the length passed in */
 }
 
 #if defined(SELFTEST)
@@ -200,9 +349,8 @@ static const unsigned char vector_64[64] =
 static AES_KEY encks, decks;
 
 void test_vector(const unsigned char *vector,size_t len)
-{      unsigned char cleartext[64];
-       unsigned char iv[sizeof(test_iv)];
-       unsigned char ciphertext[64];
+{      unsigned char iv[sizeof(test_iv)];
+       unsigned char cleartext[64],ciphertext[64];
        size_t tail;
 
        printf("vector_%d\n",len); fflush(stdout);
@@ -243,7 +391,57 @@ void test_vector(const unsigned char *vector,size_t len)
                fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
 }
 
-main()
+void test_nistvector(const unsigned char *vector,size_t len)
+{      unsigned char iv[sizeof(test_iv)];
+       unsigned char cleartext[64],ciphertext[64],nistvector[64];
+       size_t tail;
+       /* Self-test: build the expected NIST-layout output from vector, then check all four nistcts entry points against it. */
+       printf("nistvector_%d\n",(int)len); fflush(stdout);     /* len is size_t, %d expects int */
+
+       if ((tail=len%16) == 0) tail = 16;
+
+       len -= 16 + tail;       /* bytes ahead of the last two (possibly partial) blocks */
+       memcpy(nistvector,vector,len);
+       /* flip two last blocks */
+       memcpy(nistvector+len,vector+len+16,tail);
+       memcpy(nistvector+len+tail,vector+len,16);
+       len += 16 + tail;       /* back to the full vector length */
+       tail = 16;      /* from here on tail selects the last 16 bytes, compared against iv */
+
+       /* test block-based encryption */
+       memcpy(iv,test_iv,sizeof(test_iv));
+       CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt);
+       if (memcmp(ciphertext,nistvector,len))
+               fprintf(stderr,"output_%d mismatch\n",(int)len), exit(1);
+       if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
+               fprintf(stderr,"iv_%d mismatch\n",(int)len), exit(1);
+
+       /* test block-based decryption */
+       memcpy(iv,test_iv,sizeof(test_iv));
+       CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt);
+       if (memcmp(cleartext,test_input,len))
+               fprintf(stderr,"input_%d mismatch\n",(int)len), exit(2);
+       if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
+               fprintf(stderr,"iv_%d mismatch\n",(int)len), exit(2);
+
+       /* test streamed encryption */
+       memcpy(iv,test_iv,sizeof(test_iv));
+       CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt);
+       if (memcmp(ciphertext,nistvector,len))
+               fprintf(stderr,"output_%d mismatch\n",(int)len), exit(3);
+       if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
+               fprintf(stderr,"iv_%d mismatch\n",(int)len), exit(3);
+
+       /* test streamed decryption */
+       memcpy(iv,test_iv,sizeof(test_iv));
+       CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt);
+       if (memcmp(cleartext,test_input,len))
+               fprintf(stderr,"input_%d mismatch\n",(int)len), exit(4);
+       if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
+               fprintf(stderr,"iv_%d mismatch\n",(int)len), exit(4);
+}
+
+int main()
 {
        AES_set_encrypt_key(test_key,128,&encks);
        AES_set_decrypt_key(test_key,128,&decks);
@@ -254,6 +452,14 @@ main()
        test_vector(vector_47,sizeof(vector_47));
        test_vector(vector_48,sizeof(vector_48));
        test_vector(vector_64,sizeof(vector_64));
-       exit(0);
+
+       test_nistvector(vector_17,sizeof(vector_17));
+       test_nistvector(vector_31,sizeof(vector_31));
+       test_nistvector(vector_32,sizeof(vector_32));
+       test_nistvector(vector_47,sizeof(vector_47));
+       test_nistvector(vector_48,sizeof(vector_48));
+       test_nistvector(vector_64,sizeof(vector_64));
+
+       return 0;
 }
 #endif
index 4a5c6e2..f18215b 100644 (file)
@@ -19,6 +19,10 @@ typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
                        size_t blocks, const void *key,
                        const unsigned char ivec[16]);
 
+typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
+                       size_t blocks, const void *key,
+                       const unsigned char ivec[16],unsigned char cmac[16]);
+
 void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
                        size_t len, const void *key,
                        unsigned char ivec[16], block128_f block);
@@ -67,6 +71,19 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
                        size_t len, const void *key,
                        unsigned char ivec[16], cbc128_f cbc);
 
+size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
+                       size_t len, const void *key,
+                       unsigned char ivec[16], block128_f block);      /* encrypt, driven by a single-block callback; returns 0 if len<16 */
+size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t len, const void *key,
+                       unsigned char ivec[16], cbc128_f cbc);  /* encrypt, driven by a cbc callback; returns 0 if len<16 */
+size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
+                       size_t len, const void *key,
+                       unsigned char ivec[16], block128_f block);      /* decrypt, driven by a single-block callback; returns 0 if len<16 */
+size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
+                       size_t len, const void *key,
+                       unsigned char ivec[16], cbc128_f cbc);  /* decrypt, driven by a cbc callback; returns 0 if len<16 */
+
 typedef struct gcm128_context GCM128_CONTEXT;
 
 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
@@ -91,3 +108,28 @@ int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
                        size_t len);
 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
+
+typedef struct ccm128_context CCM128_CONTEXT;
+/* NOTE(review): of the functions below only CRYPTO_ccm128_tag is documented here; confirm the others against ccm128.c. */
+void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
+       unsigned int M, unsigned int L, void *key,block128_f block);
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
+       const unsigned char *nonce, size_t nlen, size_t mlen);
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
+       const unsigned char *aad, size_t alen);
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
+       const unsigned char *inp, unsigned char *out, size_t len);
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
+       const unsigned char *inp, unsigned char *out, size_t len);
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
+       const unsigned char *inp, unsigned char *out, size_t len,
+       ccm128_f stream);
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
+       const unsigned char *inp, unsigned char *out, size_t len,
+       ccm128_f stream);
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len); /* copies the tag into tag[]; returns its length, or 0 if len is smaller than the tag */
+
+typedef struct xts128_context XTS128_CONTEXT;
+/* One entry point for both directions: enc!=0 encrypts, enc==0 decrypts; returns -1 when len<16, otherwise 0. */
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
+       const unsigned char *inp, unsigned char *out, size_t len, int enc);
index c732e2e..01c0170 100644 (file)
@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
 #endif
 #include <assert.h>
 
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
-    defined(__x86_64) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
-    defined(__s390__) || defined(__s390x__)
-#  undef STRICT_ALIGNMENT
-#endif
-
 /* The input and output encrypted as though 128bit ofb mode is being
  * used.  The extra state information to record how much of the
  * 128bit block we have used is contained in *num;
diff --git a/crypto/modes/xts128.c b/crypto/modes/xts128.c
new file mode 100644 (file)
index 0000000..9cf27a2
--- /dev/null
@@ -0,0 +1,187 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+#  define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
+       const unsigned char *inp, unsigned char *out,
+       size_t len, int enc)
+{      /* Process len bytes of inp into out; enc!=0 takes the encrypt path, 0 the decrypt path. Returns -1 if len<16, else 0. */
+       const union { long one; char little; } is_endian = {1}; /* .little reads non-zero when the low byte of a long comes first */
+       union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch; /* 16 bytes each, viewable as u64, u32 or u8 */
+       unsigned int i;
+
+       if (len<16) return -1;  /* at least one full block is required */
+
+       memcpy(tweak.c, iv, 16);
+       /* initial tweak = iv run through block2 under key2 */
+       (*ctx->block2)(tweak.c,tweak.c,ctx->key2);
+       /* decrypting a non-multiple of 16: keep the last full block out of the loop, it is handled with the partial one below */
+       if (!enc && (len%16)) len-=16;
+
+       while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+               memcpy(scratch.c,inp,16);       /* inp may be unaligned, so copy before word access */
+               scratch.u[0] ^= tweak.u[0];
+               scratch.u[1] ^= tweak.u[1];
+#else
+               scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];       /* direct u64 loads from inp */
+               scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
+#endif
+               (*ctx->block1)(scratch.c,scratch.c,ctx->key1);  /* block1 under key1, in place */
+#if defined(STRICT_ALIGNMENT)
+               scratch.u[0] ^= tweak.u[0];
+               scratch.u[1] ^= tweak.u[1];
+               memcpy(out,scratch.c,16);
+#else
+               ((u64*)out)[0] = scratch.u[0]^=tweak.u[0];      /* scratch keeps the result too; the enc tail below reads it */
+               ((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
+#endif
+               inp += 16;
+               out += 16;
+               len -= 16;
+
+               if (len==0)     return 0;       /* input exhausted on a block boundary */
+               /* advance the tweak: shift the 128-bit value left one bit, XOR 0x87 into the low byte if a bit fell off the top */
+               if (is_endian.little) {
+                       unsigned int carry,res;
+                       /* relies on >> of a negative int replicating the sign bit */
+                       res = 0x87&(((int)tweak.d[3])>>31);     /* 0x87 if the top bit of d[3] is set, else 0 */
+                       carry = (unsigned int)(tweak.u[0]>>63); /* bit moving from u[0] up into u[1] */
+                       tweak.u[0] = (tweak.u[0]<<1)^res;
+                       tweak.u[1] = (tweak.u[1]<<1)|carry;
+               }
+               else {
+                       size_t c;
+                       /* same update done bytewise from c[0] upward, independent of host word order */
+                       for (c=0,i=0;i<16;++i) {
+                               /*+ substitutes for |, because c is 1 bit */ 
+                               c += ((size_t)tweak.c[i])<<1;
+                               tweak.c[i] = (u8)c;
+                               c = c>>8;
+                       }
+                       tweak.c[0] ^= (u8)(0x87&(0-c)); /* c is the bit shifted out of c[15] */
+               }
+       }
+       if (enc) {      /* 1..15 bytes remain; scratch still holds the block written last in the loop */
+               for (i=0;i<len;++i) {
+                       u8 c = inp[i];
+                       out[i] = scratch.c[i];  /* head of the previous output block becomes the short final output */
+                       scratch.c[i] = c;       /* and the remaining input bytes take its place */
+               }
+               scratch.u[0] ^= tweak.u[0];
+               scratch.u[1] ^= tweak.u[1];
+               (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+               scratch.u[0] ^= tweak.u[0];
+               scratch.u[1] ^= tweak.u[1];
+               memcpy(out-16,scratch.c,16);    /* overwrites the previous full output block */
+       }
+       else {
+               union { u64 u[2]; u8 c[16]; } tweak1;   /* tweak advanced one step; tweak itself is left untouched */
+
+               if (is_endian.little) {
+                       unsigned int carry,res;
+                       /* same shift-and-XOR step as in the loop, but stored into tweak1 */
+                       res = 0x87&(((int)tweak.d[3])>>31);
+                       carry = (unsigned int)(tweak.u[0]>>63);
+                       tweak1.u[0] = (tweak.u[0]<<1)^res;
+                       tweak1.u[1] = (tweak.u[1]<<1)|carry;
+               }
+               else {
+                       size_t c;
+
+                       for (c=0,i=0;i<16;++i) {
+                               /*+ substitutes for |, because c is 1 bit */ 
+                               c += ((size_t)tweak.c[i])<<1;
+                               tweak1.c[i] = (u8)c;
+                               c = c>>8;
+                       }
+                       tweak1.c[0] ^= (u8)(0x87&(0-c));
+               }
+#if defined(STRICT_ALIGNMENT)
+               memcpy(scratch.c,inp,16);
+               scratch.u[0] ^= tweak1.u[0];
+               scratch.u[1] ^= tweak1.u[1];
+#else
+               scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];      /* the held-back full block uses the later tweak */
+               scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
+#endif
+               (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+               scratch.u[0] ^= tweak1.u[0];
+               scratch.u[1] ^= tweak1.u[1];
+               /* swap: the first len result bytes go out as the short tail, the trailing input bytes replace them */
+               for (i=0;i<len;++i) {
+                       u8 c = inp[16+i];
+                       out[16+i] = scratch.c[i];
+                       scratch.c[i] = c;
+               }
+               scratch.u[0] ^= tweak.u[0];     /* second pass uses the earlier tweak */
+               scratch.u[1] ^= tweak.u[1];
+               (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+               scratch.u[0] ^= tweak.u[0];
+               scratch.u[1] ^= tweak.u[1];
+               memcpy (out,scratch.c,16);
+#else
+               ((u64*)out)[0] = scratch.u[0]^tweak.u[0];
+               ((u64*)out)[1] = scratch.u[1]^tweak.u[1];
+#endif
+       }
+
+       return 0;
+}