Synchronize message digests in 098-fips with 098.
author    Andy Polyakov <appro@openssl.org>
          Sun, 11 Nov 2007 13:34:08 +0000 (13:34 +0000)
committer Andy Polyakov <appro@openssl.org>
          Sun, 11 Nov 2007 13:34:08 +0000 (13:34 +0000)
20 files changed:
Configure
TABLE
crypto/md32_common.h
crypto/md4/md4_dgst.c
crypto/md4/md4_locl.h
crypto/md5/Makefile
crypto/md5/asm/md5-586.pl
crypto/md5/asm/md5-sparcv9.S [deleted file]
crypto/md5/asm/md5-x86_64.pl
crypto/md5/md5_dgst.c
crypto/md5/md5_locl.h
crypto/ripemd/asm/rmd-586.pl
crypto/ripemd/rmd_dgst.c
crypto/ripemd/rmd_locl.h
crypto/sha/asm/sha1-586.pl
crypto/sha/asm/sha1-ia64.pl
crypto/sha/asm/sha512-ia64.pl
crypto/sha/sha256.c
crypto/sha/sha512.c
crypto/sha/sha_locl.h

diff --git a/Configure b/Configure
index ab88aebe0562b8d26b63f3a404337775ec46274d..c4fb17ef5a908c6fee328ac27327558cc4370f38 100755
--- a/Configure
+++ b/Configure
@@ -201,11 +201,11 @@ my %table=(
 "solaris-sparcv7-gcc","gcc:-O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "solaris-sparcv8-gcc","gcc:-mv8 -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 # -m32 should be safe to add as long as driver recognizes -mcpu=ultrasparc
-"solaris-sparcv9-gcc","gcc:-m32 -mcpu=ultrasparc -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"solaris64-sparcv9-gcc","gcc:-m64 -mcpu=ultrasparc -O3 -Wall -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR:::des_enc-sparc.o fcrypt_b.o:::md5-sparcv9.o::::::dlfcn:solaris-shared:-fPIC:-m64 -shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"solaris-sparcv9-gcc","gcc:-m32 -mcpu=ultrasparc -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"solaris64-sparcv9-gcc","gcc:-m64 -mcpu=ultrasparc -O3 -Wall -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR:::des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-fPIC:-m64 -shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 ####
 "debug-solaris-sparcv8-gcc","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG_ALL -O -g -mv8 -Wall -DB_ENDIAN::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8.o::::::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"debug-solaris-sparcv9-gcc","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG_ALL -DPEDANTIC -O -g -mcpu=ultrasparc -pedantic -ansi -Wall -Wshadow -Wno-long-long -D__EXTENSIONS__ -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"debug-solaris-sparcv9-gcc","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG_ALL -DPEDANTIC -O -g -mcpu=ultrasparc -pedantic -ansi -Wall -Wshadow -Wno-long-long -D__EXTENSIONS__ -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 
 #### SPARC Solaris with Sun C setups
 # SC4.0 doesn't pass 'make test', upgrade to SC5.0 or SC4.2.
@@ -213,11 +213,11 @@ my %table=(
 # SC5.0 note: Compiler common patch 107357-01 or later is required!
 "solaris-sparcv7-cc","cc:-xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR:${no_asm}:dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "solaris-sparcv8-cc","cc:-xarch=v8 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"solaris-sparcv9-cc","cc:-xtarget=ultra -xarch=v8plus -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"solaris64-sparcv9-cc","cc:-xtarget=ultra -xarch=v9 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR:::des_enc-sparc.o fcrypt_b.o:::md5-sparcv9.o::::::dlfcn:solaris-shared:-KPIC:-xarch=v9 -G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):/usr/ccs/bin/ar rs",
+"solaris-sparcv9-cc","cc:-xtarget=ultra -xarch=v8plus -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"solaris64-sparcv9-cc","cc:-xtarget=ultra -xarch=v9 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR:::des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-KPIC:-xarch=v9 -G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):/usr/ccs/bin/ar rs",
 ####
 "debug-solaris-sparcv8-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG_ALL -xarch=v8 -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8.o::::::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"debug-solaris-sparcv9-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG_ALL -xtarget=ultra -xarch=v8plus -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8plus.o::::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", 
+"debug-solaris-sparcv9-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG_ALL -xtarget=ultra -xarch=v8plus -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8plus.o::::::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", 
 
 #### SunOS configs, assuming sparc for the gcc one.
 #"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST::(unknown):SUNOS::DES_UNROLL:${no_asm}::",
@@ -332,9 +332,9 @@ my %table=(
 "linux-sparcv8","gcc:-mv8 -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 # it's a real mess with -mcpu=ultrasparc option under Linux, but
 # -Wa,-Av8plus should do the trick no matter what.
-"linux-sparcv9","gcc:-m32 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -Wa,-Av8plus -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::md5-sparcv8plus.o::::::dlfcn:linux-shared:-fPIC:-m32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux-sparcv9","gcc:-m32 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -Wa,-Av8plus -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:linux-shared:-fPIC:-m32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 # GCC 3.1 is a requirement
-"linux64-sparcv9","gcc:-m64 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT:ULTRASPARC:-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::::::md5-sparcv9.o::::::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux64-sparcv9","gcc:-m64 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT:ULTRASPARC:-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::::::::::::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 #### Alpha Linux with GNU C and Compaq C setups
 # Special notes:
 # - linux-alpha+bwx-gcc is ment to be used from ./config only. If you
@@ -364,7 +364,7 @@ my %table=(
 # -DMD32_REG_T=int doesn't actually belong in sparc64 target, it
 # simply *happens* to work around a compiler bug in gcc 3.3.3,
 # triggered by RIPEMD160 code.
-"BSD-sparc64", "gcc:-DB_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR:::des_enc-sparc.o fcrypt_b.o:::md5-sparcv9.o::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"BSD-sparc64", "gcc:-DB_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR:::des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "BSD-ia64",    "gcc:-DL_ENDIAN -DTERMIOS -O3 -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK:${ia64_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "BSD-x86_64",  "gcc:-DL_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 
diff --git a/TABLE b/TABLE
index 91c5ca47b8ed46e46a0e372d0fd5f92161e6703d..689baf9205a59bdc10a3d2252b1cf00c96c66e93 100644
--- a/TABLE
+++ b/TABLE
@@ -148,7 +148,7 @@ $bn_obj       =
 $des_obj      = des_enc-sparc.o fcrypt_b.o
 $aes_obj      = 
 $bf_obj       = 
-$md5_obj      = md5-sparcv9.o
+$md5_obj      = 
 $sha1_obj     = 
 $cast_obj     = 
 $rc4_obj      = 
@@ -1660,7 +1660,7 @@ $bn_obj       = sparcv8plus.o
 $des_obj      = 
 $aes_obj      = 
 $bf_obj       = 
-$md5_obj      = md5-sparcv8plus.o
+$md5_obj      = 
 $sha1_obj     = 
 $cast_obj     = 
 $rc4_obj      = 
@@ -1687,7 +1687,7 @@ $bn_obj       = sparcv8plus.o
 $des_obj      = des_enc-sparc.o fcrypt_b.o
 $aes_obj      = 
 $bf_obj       = 
-$md5_obj      = md5-sparcv8plus.o
+$md5_obj      = 
 $sha1_obj     = 
 $cast_obj     = 
 $rc4_obj      = 
@@ -2929,7 +2929,7 @@ $bn_obj       = sparcv8plus.o
 $des_obj      = des_enc-sparc.o fcrypt_b.o
 $aes_obj      = 
 $bf_obj       = 
-$md5_obj      = md5-sparcv8plus.o
+$md5_obj      = 
 $sha1_obj     = 
 $cast_obj     = 
 $rc4_obj      = 
@@ -2983,7 +2983,7 @@ $bn_obj       =
 $des_obj      = 
 $aes_obj      = 
 $bf_obj       = 
-$md5_obj      = md5-sparcv9.o
+$md5_obj      = 
 $sha1_obj     = 
 $cast_obj     = 
 $rc4_obj      = 
@@ -3577,7 +3577,7 @@ $bn_obj       = sparcv8plus.o
 $des_obj      = des_enc-sparc.o fcrypt_b.o
 $aes_obj      = 
 $bf_obj       = 
-$md5_obj      = md5-sparcv8plus.o
+$md5_obj      = 
 $sha1_obj     = 
 $cast_obj     = 
 $rc4_obj      = 
@@ -3604,7 +3604,7 @@ $bn_obj       = sparcv8plus.o
 $des_obj      = des_enc-sparc.o fcrypt_b.o
 $aes_obj      = 
 $bf_obj       = 
-$md5_obj      = md5-sparcv8plus.o
+$md5_obj      = 
 $sha1_obj     = 
 $cast_obj     = 
 $rc4_obj      = 
@@ -3685,7 +3685,7 @@ $bn_obj       =
 $des_obj      = des_enc-sparc.o fcrypt_b.o
 $aes_obj      = 
 $bf_obj       = 
-$md5_obj      = md5-sparcv9.o
+$md5_obj      = 
 $sha1_obj     = 
 $cast_obj     = 
 $rc4_obj      = 
@@ -3712,7 +3712,7 @@ $bn_obj       =
 $des_obj      = des_enc-sparc.o fcrypt_b.o
 $aes_obj      = 
 $bf_obj       = 
-$md5_obj      = md5-sparcv9.o
+$md5_obj      = 
 $sha1_obj     = 
 $cast_obj     = 
 $rc4_obj      = 
diff --git a/crypto/md32_common.h b/crypto/md32_common.h
index 0e625a8e55c01c4f7103a43ba77681e13f44e5d1..089c4502905c68d78a5b267bc93a2e124b5a2b73 100644
--- a/crypto/md32_common.h
+++ b/crypto/md32_common.h
@@ -1,6 +1,6 @@
 /* crypto/md32_common.h */
 /* ====================================================================
- * Copyright (c) 1999-2002 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2007 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * OF THE POSSIBILITY OF SUCH DAMAGE.
  * ====================================================================
  *
- * This product includes cryptographic software written by Eric Young
- * (eay@cryptsoft.com).  This product includes software written by Tim
- * Hudson (tjh@cryptsoft.com).
- *
  */
 
 /*
  *             typedef struct {
  *                     ...
  *                     HASH_LONG       Nl,Nh;
+ *                     either {
  *                     HASH_LONG       data[HASH_LBLOCK];
+ *                     unsigned char   data[HASH_CBLOCK];
+ *                     };
  *                     unsigned int    num;
  *                     ...
  *                     } HASH_CTX;
+ *     data[] vector is expected to be zeroed upon first call to
+ *     HASH_UPDATE.
  * HASH_UPDATE
  *     name of "Update" function, implemented here.
  * HASH_TRANSFORM
  *     name of "Transform" function, implemented here.
  * HASH_FINAL
  *     name of "Final" function, implemented here.
- * HASH_BLOCK_HOST_ORDER
- *     name of "block" function treating *aligned* input message
- *     in host byte order, implemented externally.
  * HASH_BLOCK_DATA_ORDER
- *     name of "block" function treating *unaligned* input message
- *     in original (data) byte order, implemented externally (it
- *     actually is optional if data and host are of the same
- *     "endianess").
+ *     name of "block" function capable of treating *unaligned* input
+ *     message in original (data) byte order, implemented externally.
  * HASH_MAKE_STRING
  *     macro convering context variables to an ASCII hash string.
  *
- * Optional macros:
- *
- * B_ENDIAN or L_ENDIAN
- *     defines host byte-order.
- * HASH_LONG_LOG2
- *     defaults to 2 if not states otherwise.
- * HASH_LBLOCK
- *     assumed to be HASH_CBLOCK/4 if not stated otherwise.
- * HASH_BLOCK_DATA_ORDER_ALIGNED
- *     alternative "block" function capable of treating
- *     aligned input message in original (data) order,
- *     implemented externally.
- *
  * MD5 example:
  *
  *     #define DATA_ORDER_IS_LITTLE_ENDIAN
  *     #define HASH_LONG_LOG2          MD5_LONG_LOG2
  *     #define HASH_CTX                MD5_CTX
  *     #define HASH_CBLOCK             MD5_CBLOCK
- *     #define HASH_LBLOCK             MD5_LBLOCK
  *     #define HASH_UPDATE             MD5_Update
  *     #define HASH_TRANSFORM          MD5_Transform
  *     #define HASH_FINAL              MD5_Final
- *     #define HASH_BLOCK_HOST_ORDER   md5_block_host_order
  *     #define HASH_BLOCK_DATA_ORDER   md5_block_data_order
  *
  *                                     <appro@fy.chalmers.se>
 #error "HASH_FINAL must be defined!"
 #endif
 
-#ifndef HASH_BLOCK_HOST_ORDER
-#error "HASH_BLOCK_HOST_ORDER must be defined!"
-#endif
-
-#if 0
-/*
- * Moved below as it's required only if HASH_BLOCK_DATA_ORDER_ALIGNED
- * isn't defined.
- */
 #ifndef HASH_BLOCK_DATA_ORDER
 #error "HASH_BLOCK_DATA_ORDER must be defined!"
 #endif
-#endif
-
-#ifndef HASH_LBLOCK
-#define HASH_LBLOCK    (HASH_CBLOCK/4)
-#endif
-
-#ifndef HASH_LONG_LOG2
-#define HASH_LONG_LOG2 2
-#endif
 
 /*
  * Engage compiler specific rotate intrinsic function if available.
                                : "cc");                \
                           ret;                         \
                        })
-#  elif defined(__powerpc) || defined(__ppc__) || defined(__powerpc64__)
+#  elif defined(_ARCH_PPC) || defined(_ARCH_PPC64) || \
+       defined(__powerpc) || defined(__ppc__) || defined(__powerpc64__)
 #   define ROTATE(a,n) ({ register unsigned int ret;   \
                                asm (                   \
                                "rlwinm %0,%1,%2,0,31"  \
                                : "r"(a), "I"(n));      \
                           ret;                         \
                        })
+#  elif defined(__s390x__)
+#   define ROTATE(a,n) ({ register unsigned int ret;   \
+                               asm ("rll %0,%1,%2"     \
+                               : "=r"(ret)             \
+                               : "r"(a), "I"(n));      \
+                         ret;                          \
+                       })
 #  endif
 # endif
 #endif /* PEDANTIC */
 
-#if HASH_LONG_LOG2==2  /* Engage only if sizeof(HASH_LONG)== 4 */
-/* A nice byte order reversal from Wei Dai <weidai@eskimo.com> */
-#ifdef ROTATE
-/* 5 instructions with rotate instruction, else 9 */
-#define REVERSE_FETCH32(a,l)   (                                       \
-               l=*(const HASH_LONG *)(a),                              \
-               ((ROTATE(l,8)&0x00FF00FF)|(ROTATE((l&0x00FF00FF),24)))  \
-                               )
-#else
-/* 6 instructions with rotate instruction, else 8 */
-#define REVERSE_FETCH32(a,l)   (                               \
-               l=*(const HASH_LONG *)(a),                      \
-               l=(((l>>8)&0x00FF00FF)|((l&0x00FF00FF)<<8)),    \
-               ROTATE(l,16)                                    \
-                               )
-/*
- * Originally the middle line started with l=(((l&0xFF00FF00)>>8)|...
- * It's rewritten as above for two reasons:
- *     - RISCs aren't good at long constants and have to explicitely
- *       compose 'em with several (well, usually 2) instructions in a
- *       register before performing the actual operation and (as you
- *       already realized:-) having same constant should inspire the
- *       compiler to permanently allocate the only register for it;
- *     - most modern CPUs have two ALUs, but usually only one has
- *       circuitry for shifts:-( this minor tweak inspires compiler
- *       to schedule shift instructions in a better way...
- *
- *                             <appro@fy.chalmers.se>
- */
-#endif
-#endif
-
 #ifndef ROTATE
 #define ROTATE(a,n)     (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
 #endif
 
-/*
- * Make some obvious choices. E.g., HASH_BLOCK_DATA_ORDER_ALIGNED
- * and HASH_BLOCK_HOST_ORDER ought to be the same if input data
- * and host are of the same "endianess". It's possible to mask
- * this with blank #define HASH_BLOCK_DATA_ORDER though...
- *
- *                             <appro@fy.chalmers.se>
- */
-#if defined(B_ENDIAN)
-#  if defined(DATA_ORDER_IS_BIG_ENDIAN)
-#    if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
-#      define HASH_BLOCK_DATA_ORDER_ALIGNED    HASH_BLOCK_HOST_ORDER
-#    endif
-#  endif
-#elif defined(L_ENDIAN)
-#  if defined(DATA_ORDER_IS_LITTLE_ENDIAN)
-#    if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
-#      define HASH_BLOCK_DATA_ORDER_ALIGNED    HASH_BLOCK_HOST_ORDER
-#    endif
-#  endif
-#endif
-
-#if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED)
-#ifndef HASH_BLOCK_DATA_ORDER
-#error "HASH_BLOCK_DATA_ORDER must be defined!"
-#endif
-#endif
-
 #if defined(DATA_ORDER_IS_BIG_ENDIAN)
 
 #ifndef PEDANTIC
 # if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
 #  if ((defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)) || \
       (defined(__x86_64) || defined(__x86_64__))
+#   if !defined(B_ENDIAN)
     /*
      * This gives ~30-40% performance improvement in SHA-256 compiled
      * with gcc [on P4]. Well, first macro to be frank. We can pull
 #   define HOST_l2c(l,c)       ({ unsigned int r=(l);                  \
                                   asm ("bswapl %0":"=r"(r):"0"(r));    \
                                   *((unsigned int *)(c))=r; (c)+=4; r; })
+#   endif
 #  endif
 # endif
 #endif
+#if defined(__s390__) || defined(__s390x__)
+# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4, (l))
+# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4, (l))
+#endif
 
 #ifndef HOST_c2l
 #define HOST_c2l(c,l)  (l =(((unsigned long)(*((c)++)))<<24),          \
                         l|=(((unsigned long)(*((c)++)))    ),          \
                         l)
 #endif
-#define HOST_p_c2l(c,l,n)      {                                       \
-                       switch (n) {                                    \
-                       case 0: l =((unsigned long)(*((c)++)))<<24;     \
-                       case 1: l|=((unsigned long)(*((c)++)))<<16;     \
-                       case 2: l|=((unsigned long)(*((c)++)))<< 8;     \
-                       case 3: l|=((unsigned long)(*((c)++)));         \
-                               } }
-#define HOST_p_c2l_p(c,l,sc,len) {                                     \
-                       switch (sc) {                                   \
-                       case 0: l =((unsigned long)(*((c)++)))<<24;     \
-                               if (--len == 0) break;                  \
-                       case 1: l|=((unsigned long)(*((c)++)))<<16;     \
-                               if (--len == 0) break;                  \
-                       case 2: l|=((unsigned long)(*((c)++)))<< 8;     \
-                               } }
-/* NOTE the pointer is not incremented at the end of this */
-#define HOST_c2l_p(c,l,n)      {                                       \
-                       l=0; (c)+=n;                                    \
-                       switch (n) {                                    \
-                       case 3: l =((unsigned long)(*(--(c))))<< 8;     \
-                       case 2: l|=((unsigned long)(*(--(c))))<<16;     \
-                       case 1: l|=((unsigned long)(*(--(c))))<<24;     \
-                               } }
 #ifndef HOST_l2c
 #define HOST_l2c(l,c)  (*((c)++)=(unsigned char)(((l)>>24)&0xff),      \
                         *((c)++)=(unsigned char)(((l)>>16)&0xff),      \
 
 #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
 
+#ifndef PEDANTIC
+# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+#  if defined(__s390x__)
+#   define HOST_c2l(c,l)       ({ asm ("lrv    %0,0(%1)"               \
+                                       :"=r"(l) : "r"(c));             \
+                                  (c)+=4; (l);                         })
+#   define HOST_l2c(l,c)       ({ asm ("strv   %0,0(%1)"               \
+                                       : : "r"(l),"r"(c) : "memory");  \
+                                  (c)+=4; (l);                         })
+#  endif
+# endif
+#endif
 #if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
 # ifndef B_ENDIAN
    /* See comment in DATA_ORDER_IS_BIG_ENDIAN section. */
                         l|=(((unsigned long)(*((c)++)))<<24),          \
                         l)
 #endif
-#define HOST_p_c2l(c,l,n)      {                                       \
-                       switch (n) {                                    \
-                       case 0: l =((unsigned long)(*((c)++)));         \
-                       case 1: l|=((unsigned long)(*((c)++)))<< 8;     \
-                       case 2: l|=((unsigned long)(*((c)++)))<<16;     \
-                       case 3: l|=((unsigned long)(*((c)++)))<<24;     \
-                               } }
-#define HOST_p_c2l_p(c,l,sc,len) {                                     \
-                       switch (sc) {                                   \
-                       case 0: l =((unsigned long)(*((c)++)));         \
-                               if (--len == 0) break;                  \
-                       case 1: l|=((unsigned long)(*((c)++)))<< 8;     \
-                               if (--len == 0) break;                  \
-                       case 2: l|=((unsigned long)(*((c)++)))<<16;     \
-                               } }
-/* NOTE the pointer is not incremented at the end of this */
-#define HOST_c2l_p(c,l,n)      {                                       \
-                       l=0; (c)+=n;                                    \
-                       switch (n) {                                    \
-                       case 3: l =((unsigned long)(*(--(c))))<<16;     \
-                       case 2: l|=((unsigned long)(*(--(c))))<< 8;     \
-                       case 1: l|=((unsigned long)(*(--(c))));         \
-                               } }
 #ifndef HOST_l2c
 #define HOST_l2c(l,c)  (*((c)++)=(unsigned char)(((l)    )&0xff),      \
                         *((c)++)=(unsigned char)(((l)>> 8)&0xff),      \
 int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len)
        {
        const unsigned char *data=data_;
-       register HASH_LONG * p;
-       register HASH_LONG l;
-       size_t sw,sc,ew,ec;
+       unsigned char *p;
+       HASH_LONG l;
+       size_t n;
 
        if (len==0) return 1;
 
@@ -413,101 +296,43 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len)
        c->Nh+=(len>>29);       /* might cause compiler warning on 16-bit */
        c->Nl=l;
 
-       if (c->num != 0)
+       n = c->num;
+       if (n != 0)
                {
-               p=c->data;
-               sw=c->num>>2;
-               sc=c->num&0x03;
+               p=(unsigned char *)c->data;
 
-               if ((c->num+len) >= HASH_CBLOCK)
+               if ((n+len) >= HASH_CBLOCK)
                        {
-                       l=p[sw]; HOST_p_c2l(data,l,sc); p[sw++]=l;
-                       for (; sw<HASH_LBLOCK; sw++)
-                               {
-                               HOST_c2l(data,l); p[sw]=l;
-                               }
-                       HASH_BLOCK_HOST_ORDER (c,p,1);
-                       len-=(HASH_CBLOCK-c->num);
-                       c->num=0;
-                       /* drop through and do the rest */
+                       memcpy (p+n,data,HASH_CBLOCK-n);
+                       HASH_BLOCK_DATA_ORDER (c,p,1);
+                       n      = HASH_CBLOCK-n;
+                       data  += n;
+                       len   -= n;
+                       c->num = 0;
+                       memset (p,0,HASH_CBLOCK);       /* keep it zeroed */
                        }
                else
                        {
-                       c->num+=(unsigned int)len;
-                       if ((sc+len) < 4) /* ugly, add char's to a word */
-                               {
-                               l=p[sw]; HOST_p_c2l_p(data,l,sc,len); p[sw]=l;
-                               }
-                       else
-                               {
-                               ew=(c->num>>2);
-                               ec=(c->num&0x03);
-                               if (sc)
-                                       l=p[sw];
-                               HOST_p_c2l(data,l,sc);
-                               p[sw++]=l;
-                               for (; sw < ew; sw++)
-                                       {
-                                       HOST_c2l(data,l); p[sw]=l;
-                                       }
-                               if (ec)
-                                       {
-                                       HOST_c2l_p(data,l,ec); p[sw]=l;
-                                       }
-                               }
+                       memcpy (p+n,data,len);
+                       c->num += (unsigned int)len;
                        return 1;
                        }
                }
 
-       sw=len/HASH_CBLOCK;
-       if (sw > 0)
+       n = len/HASH_CBLOCK;
+       if (n > 0)
                {
-#if defined(HASH_BLOCK_DATA_ORDER_ALIGNED)
-               /*
-                * Note that HASH_BLOCK_DATA_ORDER_ALIGNED gets defined
-                * only if sizeof(HASH_LONG)==4.
-                */
-               if ((((size_t)data)%4) == 0)
-                       {
-                       /* data is properly aligned so that we can cast it: */
-                       HASH_BLOCK_DATA_ORDER_ALIGNED (c,(const HASH_LONG *)data,sw);
-                       sw*=HASH_CBLOCK;
-                       data+=sw;
-                       len-=sw;
-                       }
-               else
-#if !defined(HASH_BLOCK_DATA_ORDER)
-                       while (sw--)
-                               {
-                               memcpy (p=c->data,data,HASH_CBLOCK);
-                               HASH_BLOCK_DATA_ORDER_ALIGNED(c,p,1);
-                               data+=HASH_CBLOCK;
-                               len-=HASH_CBLOCK;
-                               }
-#endif
-#endif
-#if defined(HASH_BLOCK_DATA_ORDER)
-                       {
-                       HASH_BLOCK_DATA_ORDER(c,data,sw);
-                       sw*=HASH_CBLOCK;
-                       data+=sw;
-                       len-=sw;
-                       }
-#endif
+               HASH_BLOCK_DATA_ORDER (c,data,n);
+               n    *= HASH_CBLOCK;
+               data += n;
+               len  -= n;
                }
 
-       if (len!=0)
+       if (len != 0)
                {
-               p = c->data;
+               p = (unsigned char *)c->data;
                c->num = len;
-               ew=len>>2;      /* words to copy */
-               ec=len&0x03;
-               for (; ew; ew--,p++)
-                       {
-                       HOST_c2l(data,l); *p=l;
-                       }
-               HOST_c2l_p(data,l,ec);
-               *p=l;
+               memcpy (p,data,len);
                }
        return 1;
        }
@@ -515,73 +340,38 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len)
 
 void HASH_TRANSFORM (HASH_CTX *c, const unsigned char *data)
        {
-#if defined(HASH_BLOCK_DATA_ORDER_ALIGNED)
-       if ((((size_t)data)%4) == 0)
-               /* data is properly aligned so that we can cast it: */
-               HASH_BLOCK_DATA_ORDER_ALIGNED (c,(const HASH_LONG *)data,1);
-       else
-#if !defined(HASH_BLOCK_DATA_ORDER)
-               {
-               memcpy (c->data,data,HASH_CBLOCK);
-               HASH_BLOCK_DATA_ORDER_ALIGNED (c,c->data,1);
-               }
-#endif
-#endif
-#if defined(HASH_BLOCK_DATA_ORDER)
        HASH_BLOCK_DATA_ORDER (c,data,1);
-#endif
        }
 
 
 int HASH_FINAL (unsigned char *md, HASH_CTX *c)
        {
-       register HASH_LONG *p;
-       register unsigned long l;
-       register int i,j;
-       static const unsigned char end[4]={0x80,0x00,0x00,0x00};
-       const unsigned char *cp=end;
-
-       /* c->num should definitly have room for at least one more byte. */
-       p=c->data;
-       i=c->num>>2;
-       j=c->num&0x03;
-
-#if 0
-       /* purify often complains about the following line as an
-        * Uninitialized Memory Read.  While this can be true, the
-        * following p_c2l macro will reset l when that case is true.
-        * This is because j&0x03 contains the number of 'valid' bytes
-        * already in p[i].  If and only if j&0x03 == 0, the UMR will
-        * occur but this is also the only time p_c2l will do
-        * l= *(cp++) instead of l|= *(cp++)
-        * Many thanks to Alex Tang <altitude@cic.net> for pickup this
-        * 'potential bug' */
-#ifdef PURIFY
-       if (j==0) p[i]=0; /* Yeah, but that's not the way to fix it:-) */
-#endif
-       l=p[i];
-#else
-       l = (j==0) ? 0 : p[i];
-#endif
-       HOST_p_c2l(cp,l,j); p[i++]=l; /* i is the next 'undefined word' */
+       unsigned char *p = (unsigned char *)c->data;
+       size_t n = c->num;
 
-       if (i>(HASH_LBLOCK-2)) /* save room for Nl and Nh */
+       p[n] = 0x80; /* there is always room for one */
+       n++;
+
+       if (n > (HASH_CBLOCK-8))
                {
-               if (i<HASH_LBLOCK) p[i]=0;
-               HASH_BLOCK_HOST_ORDER (c,p,1);
-               i=0;
+               memset (p+n,0,HASH_CBLOCK-n);
+               n=0;
+               HASH_BLOCK_DATA_ORDER (c,p,1);
                }
-       for (; i<(HASH_LBLOCK-2); i++)
-               p[i]=0;
+       memset (p+n,0,HASH_CBLOCK-8-n);
 
+       p += HASH_CBLOCK-8;
 #if   defined(DATA_ORDER_IS_BIG_ENDIAN)
-       p[HASH_LBLOCK-2]=c->Nh;
-       p[HASH_LBLOCK-1]=c->Nl;
+       (void)HOST_l2c(c->Nh,p);
+       (void)HOST_l2c(c->Nl,p);
 #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
-       p[HASH_LBLOCK-2]=c->Nl;
-       p[HASH_LBLOCK-1]=c->Nh;
+       (void)HOST_l2c(c->Nl,p);
+       (void)HOST_l2c(c->Nh,p);
 #endif
-       HASH_BLOCK_HOST_ORDER (c,p,1);
+       p -= HASH_CBLOCK;
+       HASH_BLOCK_DATA_ORDER (c,p,1);
+       c->num=0;
+       memset (p,0,HASH_CBLOCK);
 
 #ifndef HASH_MAKE_STRING
 #error "HASH_MAKE_STRING must be defined!"
@@ -589,11 +379,6 @@ int HASH_FINAL (unsigned char *md, HASH_CTX *c)
        HASH_MAKE_STRING(c,md);
 #endif
 
-       c->num=0;
-       /* clear stuff, HASH_BLOCK may be leaving some stuff on the stack
-        * but I'm not worried :-)
-       OPENSSL_cleanse((void *)c,sizeof(HASH_CTX));
-        */
        return 1;
        }
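
Taken together, the md32_common.h changes retire the word-oriented buffering (HOST_p_c2l and friends) and the separate aligned/host-order entry points: the context now buffers raw bytes, and everything funnels through the single, unaligned-capable HASH_BLOCK_DATA_ORDER. A self-contained sketch of the new scheme, with a hypothetical toy_block() standing in for the external transform and the Nl/Nh bit-count bookkeeping omitted:

    #include <stddef.h>
    #include <string.h>

    #define CBLOCK 64                        /* HASH_CBLOCK stand-in */

    typedef struct {
        unsigned int  h[4];                  /* chaining state */
        unsigned char data[CBLOCK];          /* byte buffer, zeroed when empty */
        unsigned int  num;                   /* bytes currently buffered */
    } TOY_CTX;

    /* stand-in for HASH_BLOCK_DATA_ORDER: consume n whole blocks at p */
    void toy_block(TOY_CTX *c, const unsigned char *p, size_t n);

    int toy_update(TOY_CTX *c, const void *data_, size_t len)
    {
        const unsigned char *data = data_;
        size_t n = c->num;

        if (n != 0) {                        /* drain a partial block first */
            if (n + len >= CBLOCK) {
                memcpy(c->data + n, data, CBLOCK - n);
                toy_block(c, c->data, 1);
                n = CBLOCK - n;
                data += n;
                len  -= n;
                c->num = 0;
                memset(c->data, 0, CBLOCK);  /* keep it zeroed */
            } else {
                memcpy(c->data + n, data, len);
                c->num += (unsigned int)len;
                return 1;
            }
        }
        n = len / CBLOCK;                    /* whole blocks, straight from input */
        if (n > 0) {
            toy_block(c, data, n);
            data += n * CBLOCK;
            len  -= n * CBLOCK;
        }
        if (len != 0) {                      /* stash the tail */
            memcpy(c->data, data, len);
            c->num = (unsigned int)len;
        }
        return 1;
    }

HASH_FINAL follows the same pattern: append 0x80, zero-fill, spill an extra block if fewer than eight bytes remain, write the bit count with HOST_l2c in the digest's own byte order, and run the transform one last time.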
 
diff --git a/crypto/md4/md4_dgst.c b/crypto/md4/md4_dgst.c
index 86b79b8e4d4525061793acccc936cf0cea6537a7..cfef94af39a0b098519bcfd052f05d231c7432cf 100644
--- a/crypto/md4/md4_dgst.c
+++ b/crypto/md4/md4_dgst.c
@@ -82,79 +82,6 @@ int MD4_Init(MD4_CTX *c)
        return 1;
        }
 
-#ifndef md4_block_host_order
-void md4_block_host_order (MD4_CTX *c, const void *data, size_t num)
-       {
-       const MD4_LONG *X=data;
-       register unsigned MD32_REG_T A,B,C,D;
-
-       A=c->A;
-       B=c->B;
-       C=c->C;
-       D=c->D;
-
-       for (;num--;X+=HASH_LBLOCK)
-               {
-       /* Round 0 */
-       R0(A,B,C,D,X[ 0], 3,0);
-       R0(D,A,B,C,X[ 1], 7,0);
-       R0(C,D,A,B,X[ 2],11,0);
-       R0(B,C,D,A,X[ 3],19,0);
-       R0(A,B,C,D,X[ 4], 3,0);
-       R0(D,A,B,C,X[ 5], 7,0);
-       R0(C,D,A,B,X[ 6],11,0);
-       R0(B,C,D,A,X[ 7],19,0);
-       R0(A,B,C,D,X[ 8], 3,0);
-       R0(D,A,B,C,X[ 9], 7,0);
-       R0(C,D,A,B,X[10],11,0);
-       R0(B,C,D,A,X[11],19,0);
-       R0(A,B,C,D,X[12], 3,0);
-       R0(D,A,B,C,X[13], 7,0);
-       R0(C,D,A,B,X[14],11,0);
-       R0(B,C,D,A,X[15],19,0);
-       /* Round 1 */
-       R1(A,B,C,D,X[ 0], 3,0x5A827999L);
-       R1(D,A,B,C,X[ 4], 5,0x5A827999L);
-       R1(C,D,A,B,X[ 8], 9,0x5A827999L);
-       R1(B,C,D,A,X[12],13,0x5A827999L);
-       R1(A,B,C,D,X[ 1], 3,0x5A827999L);
-       R1(D,A,B,C,X[ 5], 5,0x5A827999L);
-       R1(C,D,A,B,X[ 9], 9,0x5A827999L);
-       R1(B,C,D,A,X[13],13,0x5A827999L);
-       R1(A,B,C,D,X[ 2], 3,0x5A827999L);
-       R1(D,A,B,C,X[ 6], 5,0x5A827999L);
-       R1(C,D,A,B,X[10], 9,0x5A827999L);
-       R1(B,C,D,A,X[14],13,0x5A827999L);
-       R1(A,B,C,D,X[ 3], 3,0x5A827999L);
-       R1(D,A,B,C,X[ 7], 5,0x5A827999L);
-       R1(C,D,A,B,X[11], 9,0x5A827999L);
-       R1(B,C,D,A,X[15],13,0x5A827999L);
-       /* Round 2 */
-       R2(A,B,C,D,X[ 0], 3,0x6ED9EBA1);
-       R2(D,A,B,C,X[ 8], 9,0x6ED9EBA1);
-       R2(C,D,A,B,X[ 4],11,0x6ED9EBA1);
-       R2(B,C,D,A,X[12],15,0x6ED9EBA1);
-       R2(A,B,C,D,X[ 2], 3,0x6ED9EBA1);
-       R2(D,A,B,C,X[10], 9,0x6ED9EBA1);
-       R2(C,D,A,B,X[ 6],11,0x6ED9EBA1);
-       R2(B,C,D,A,X[14],15,0x6ED9EBA1);
-       R2(A,B,C,D,X[ 1], 3,0x6ED9EBA1);
-       R2(D,A,B,C,X[ 9], 9,0x6ED9EBA1);
-       R2(C,D,A,B,X[ 5],11,0x6ED9EBA1);
-       R2(B,C,D,A,X[13],15,0x6ED9EBA1);
-       R2(A,B,C,D,X[ 3], 3,0x6ED9EBA1);
-       R2(D,A,B,C,X[11], 9,0x6ED9EBA1);
-       R2(C,D,A,B,X[ 7],11,0x6ED9EBA1);
-       R2(B,C,D,A,X[15],15,0x6ED9EBA1);
-
-       A = c->A += A;
-       B = c->B += B;
-       C = c->C += C;
-       D = c->D += D;
-               }
-       }
-#endif
-
 #ifndef md4_block_data_order
 #ifdef X
 #undef X
@@ -240,19 +167,3 @@ void md4_block_data_order (MD4_CTX *c, const void *data_, size_t num)
                }
        }
 #endif
-
-#ifdef undef
-int printit(unsigned long *l)
-       {
-       int i,ii;
-
-       for (i=0; i<2; i++)
-               {
-               for (ii=0; ii<8; ii++)
-                       {
-                       fprintf(stderr,"%08lx ",l[i*8+ii]);
-                       }
-               fprintf(stderr,"\n");
-               }
-       }
-#endif
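
The deleted md4_block_host_order ran the identical round schedule as the surviving md4_block_data_order; the only difference was that message words were read directly as aligned host-order words instead of being gathered byte-wise with HOST_c2l. For reference, one round-0 step as the macros above would expand (a sketch; F and ROTATE per the MD4 reference, RFC 1320, not copied from this patch):

    /* MD4 round-0 step: a = (a + F(b,c,d) + X[k] + t) <<< s, with t = 0
     * in round 0.  F selects bits of c or d depending on the bits of b. */
    #define F(b,c,d)          (((b) & (c)) | (~(b) & (d)))
    #define ROTATE(a,n)       (((a) << (n)) | (((a) & 0xffffffff) >> (32 - (n))))
    #define R0(a,b,c,d,x,s,t) { (a) += F((b),(c),(d)) + (x) + (t); \
                                (a)  = ROTATE((a),(s)); }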
diff --git a/crypto/md4/md4_locl.h b/crypto/md4/md4_locl.h
index abc7b9bb84bd60806206461f02e2fd8dee807096..c8085b0eadf9a158d931be8007d474cae1cb333c 100644
--- a/crypto/md4/md4_locl.h
+++ b/crypto/md4/md4_locl.h
 #define MD4_LONG_LOG2 2 /* default to 32 bits */
 #endif
 
-void md4_block_host_order (MD4_CTX *c, const void *p,size_t num);
 void md4_block_data_order (MD4_CTX *c, const void *p,size_t num);
 
-#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__)
-# if !defined(B_ENDIAN)
-/*
- * *_block_host_order is expected to handle aligned data while
- * *_block_data_order - unaligned. As algorithm and host (x86)
- * are in this case of the same "endianness" these two are
- * otherwise indistinguishable. But normally you don't want to
- * call the same function because unaligned access in places
- * where alignment is expected is usually a "Bad Thing". Indeed,
- * on RISCs you get punished with BUS ERROR signal or *severe*
- * performance degradation. Intel CPUs are in turn perfectly
- * capable of loading unaligned data without such drastic side
- * effect. Yes, they say it's slower than aligned load, but no
- * exception is generated and therefore performance degradation
- * is *incomparable* with RISCs. What we should weight here is
- * costs of unaligned access against costs of aligning data.
- * According to my measurements allowing unaligned access results
- * in ~9% performance improvement on Pentium II operating at
- * 266MHz. I won't be surprised if the difference will be higher
- * on faster systems:-)
- *
- *                             <appro@fy.chalmers.se>
- */
-# define md4_block_data_order md4_block_host_order
-# endif
-#endif
-
 #define DATA_ORDER_IS_LITTLE_ENDIAN
 
 #define HASH_LONG              MD4_LONG
-#define HASH_LONG_LOG2         MD4_LONG_LOG2
 #define HASH_CTX               MD4_CTX
 #define HASH_CBLOCK            MD4_CBLOCK
-#define HASH_LBLOCK            MD4_LBLOCK
 #define HASH_UPDATE            MD4_Update
 #define HASH_TRANSFORM         MD4_Transform
 #define HASH_FINAL             MD4_Final
@@ -112,21 +82,7 @@ void md4_block_data_order (MD4_CTX *c, const void *p,size_t num);
        ll=(c)->C; HOST_l2c(ll,(s));    \
        ll=(c)->D; HOST_l2c(ll,(s));    \
        } while (0)
-#define HASH_BLOCK_HOST_ORDER  md4_block_host_order
-#if !defined(L_ENDIAN) || defined(md4_block_data_order)
 #define        HASH_BLOCK_DATA_ORDER   md4_block_data_order
-/*
- * Little-endians (Intel and Alpha) feel better without this.
- * It looks like memcpy does better job than generic
- * md4_block_data_order on copying-n-aligning input data.
- * But frankly speaking I didn't expect such result on Alpha.
- * On the other hand I've got this with egcs-1.0.2 and if
- * program is compiled with another (better?) compiler it
- * might turn out other way around.
- *
- *                             <appro@fy.chalmers.se>
- */
-#endif
 
 #include "md32_common.h"
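
With the x86-only md4_block_data_order = md4_block_host_order aliasing removed, every input word now reaches the rounds through HOST_c2l. Its portable little-endian fallback (visible in the md32_common.h hunk above) assembles each word a byte at a time, so it never performs a misaligned word load; a sketch of that fallback written as a function:

    /* Portable little-endian word gather: alignment-safe on any host,
     * which is why the separate aligned/"host order" entry points
     * could be retired. */
    static unsigned int le_load(const unsigned char *c)
    {
        return  (unsigned int)c[0]
             | ((unsigned int)c[1] <<  8)
             | ((unsigned int)c[2] << 16)
             | ((unsigned int)c[3] << 24);
    }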
 
diff --git a/crypto/md5/Makefile b/crypto/md5/Makefile
index 849a0a5bacd5fe4585b07ee6c356a23c168e5fec..ceb00e8956ad7f1de55cfb38462e505ec271035b 100644
--- a/crypto/md5/Makefile
+++ b/crypto/md5/Makefile
@@ -52,24 +52,6 @@ mx86-cof.s: asm/md5-586.pl ../perlasm/x86asm.pl
 mx86-out.s: asm/md5-586.pl ../perlasm/x86asm.pl
        (cd asm; $(PERL) md5-586.pl a.out $(CFLAGS) > ../$@)
 
-md5-sparcv8plus.o: asm/md5-sparcv9.S
-       $(CC) $(ASFLAGS) -DMD5_BLOCK_DATA_ORDER -c \
-               -o md5-sparcv8plus.o asm/md5-sparcv9.S
-
-# Old GNU assembler doesn't understand V9 instructions, so we
-# hire /usr/ccs/bin/as to do the job. Note that option is called
-# *-gcc27, but even gcc 2>=8 users may experience similar problem
-# if they didn't bother to upgrade GNU assembler. Such users should
-# not choose this option, but be adviced to *remove* GNU assembler
-# or upgrade it.
-md5-sparcv8plus-gcc27.o: asm/md5-sparcv9.S
-       $(CC) $(ASFLAGS) -DMD5_BLOCK_DATA_ORDER -E asm/md5-sparcv9.S | \
-               /usr/ccs/bin/as -xarch=v8plus - -o md5-sparcv8plus-gcc27.o
-
-md5-sparcv9.o: asm/md5-sparcv9.S
-       $(CC) $(ASFLAGS) -DMD5_BLOCK_DATA_ORDER -c \
-               -o md5-sparcv9.o asm/md5-sparcv9.S
-
 md5-x86_64.s:  asm/md5-x86_64.pl;      $(PERL) asm/md5-x86_64.pl $@
 
 files:
diff --git a/crypto/md5/asm/md5-586.pl b/crypto/md5/asm/md5-586.pl
index fa3fa3bed59c1685f6b3005d56bf3ac555d97fc9..76ac235f7d035e388d925847445a2fcbafcd6de6 100644
--- a/crypto/md5/asm/md5-586.pl
+++ b/crypto/md5/asm/md5-586.pl
@@ -29,7 +29,7 @@ $X="esi";
  0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9, # R3
  );
 
-&md5_block("md5_block_asm_host_order");
+&md5_block("md5_block_asm_data_order");
 &asm_finish();
 
 sub Np
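
The only change to the x86 perlasm module is the entry-point name: on little-endian x86 the data byte order is the host byte order, so the generated code is identical and only the symbol becomes md5_block_asm_data_order. A hedged sketch of how the C side would bind to it (mirroring the usual md5_locl.h pattern; the exact guard macro is an assumption, since that hunk is not shown here):

    /* hypothetical binding in md5_locl.h when assembler code is enabled */
    #ifdef MD5_ASM
    # define md5_block_data_order md5_block_asm_data_order
    #endif
    void md5_block_data_order(MD5_CTX *c, const void *p, size_t num);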
diff --git a/crypto/md5/asm/md5-sparcv9.S b/crypto/md5/asm/md5-sparcv9.S
deleted file mode 100644
index db45aa4..0000000
--- a/crypto/md5/asm/md5-sparcv9.S
+++ /dev/null
@@ -1,1031 +0,0 @@
-.ident "md5-sparcv9.S, Version 1.0"
-.ident "SPARC V9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
-.file  "md5-sparcv9.S"
-
-/*
- * ====================================================================
- * Copyright (c) 1999 Andy Polyakov <appro@fy.chalmers.se>.
- *
- * Rights for redistribution and usage in source and binary forms are
- * granted as long as above copyright notices are retained. Warranty
- * of any kind is (of course:-) disclaimed.
- * ====================================================================
- */
-
-/*
- * This is my modest contribution to OpenSSL project (see
- * http://www.openssl.org/ for more information about it) and is an
- * assembler implementation of MD5 block hash function. I've hand-coded
- * this for the sole reason to reach UltraSPARC-specific "load in
- * little-endian byte order" instruction. This gives up to 15%
- * performance improvement for cases when input message is aligned at
- * 32 bits boundary. The module was tested under both 32 *and* 64 bit
- * kernels. For updates see http://fy.chalmers.se/~appro/hpe/.
- *
- * To compile with SC4.x/SC5.x:
- *
- *     cc -xarch=v[9|8plus] -DOPENSSL_SYSNAME_ULTRASPARC -DMD5_BLOCK_DATA_ORDER \
- *             -c md5-sparcv9.S
- *
- * and with gcc:
- *
- *     gcc -mcpu=ultrasparc -DOPENSSL_SYSNAME_ULTRASPARC -DMD5_BLOCK_DATA_ORDER \
- *             -c md5-sparcv9.S
- *
- * or if above fails (it does if you have gas):
- *
- *     gcc -E -DOPENSSL_SYSNAMEULTRASPARC -DMD5_BLOCK_DATA_ORDER md5_block.sparc.S | \
- *             as -xarch=v8plus /dev/fd/0 -o md5-sparcv9.o
- */
-
-#include <openssl/e_os2.h>
-
-#define        A       %o0
-#define B      %o1
-#define        C       %o2
-#define        D       %o3
-#define        T1      %o4
-#define        T2      %o5
-
-#define        R0      %l0
-#define        R1      %l1
-#define        R2      %l2
-#define        R3      %l3
-#define        R4      %l4
-#define        R5      %l5
-#define        R6      %l6
-#define        R7      %l7
-#define        R8      %i3
-#define        R9      %i4
-#define        R10     %i5
-#define        R11     %g1
-#define R12    %g2
-#define        R13     %g3
-#define RX     %g4
-
-#define Aptr   %i0+0
-#define Bptr   %i0+4
-#define Cptr   %i0+8
-#define Dptr   %i0+12
-
-#define Aval   R5      /* those not used at the end of the last round */
-#define Bval   R6
-#define Cval   R7
-#define Dval   R8
-
-#if defined(MD5_BLOCK_DATA_ORDER)
-# if defined(OPENSSL_SYSNAME_ULTRASPARC)
-#  define      LOAD                    lda
-#  define      X(i)                    [%i1+i*4]%asi
-#  define      md5_block               md5_block_asm_data_order_aligned
-#  define      ASI_PRIMARY_LITTLE      0x88
-# else
-#  error "MD5_BLOCK_DATA_ORDER is supported only on UltraSPARC!"
-# endif
-#else
-# define       LOAD                    ld
-# define       X(i)                    [%i1+i*4]
-# define       md5_block               md5_block_asm_host_order
-#endif
-
-.section        ".text",#alloc,#execinstr
-
-#if defined(__SUNPRO_C) && defined(__sparcv9)
-  /* They've said -xarch=v9 at command line */
-  .register    %g2,#scratch
-  .register    %g3,#scratch
-# define       FRAME   -192
-#elif defined(__GNUC__) && defined(__arch64__)
-  /* They've said -m64 at command line */
-  .register     %g2,#scratch
-  .register     %g3,#scratch
-# define        FRAME   -192
-#else
-# define       FRAME   -96
-#endif
-
-.align  32
-
-.global md5_block
-md5_block:
-       save    %sp,FRAME,%sp
-
-       ld      [Dptr],D
-       ld      [Cptr],C
-       ld      [Bptr],B
-       ld      [Aptr],A
-#ifdef ASI_PRIMARY_LITTLE
-       rd      %asi,%o7        ! How dare I? Well, I just do:-)
-       wr      %g0,ASI_PRIMARY_LITTLE,%asi
-#endif
-       LOAD    X(0),R0
-
-.Lmd5_block_loop:
-
-!!!!!!!!Round 0
-
-       xor     C,D,T1
-       sethi   %hi(0xd76aa478),T2
-       and     T1,B,T1
-       or      T2,%lo(0xd76aa478),T2   !=
-       xor     T1,D,T1
-       add     T1,R0,T1
-       LOAD    X(1),R1
-       add     T1,T2,T1                !=
-       add     A,T1,A
-       sll     A,7,T2
-       srl     A,32-7,A
-       or      A,T2,A                  !=
-        xor     B,C,T1
-       add     A,B,A
-
-       sethi   %hi(0xe8c7b756),T2
-       and     T1,A,T1                 !=
-       or      T2,%lo(0xe8c7b756),T2
-       xor     T1,C,T1
-       LOAD    X(2),R2
-       add     T1,R1,T1                !=
-       add     T1,T2,T1
-       add     D,T1,D
-       sll     D,12,T2
-       srl     D,32-12,D               !=
-       or      D,T2,D
-        xor     A,B,T1
-       add     D,A,D
-
-       sethi   %hi(0x242070db),T2      !=
-       and     T1,D,T1
-       or      T2,%lo(0x242070db),T2
-       xor     T1,B,T1
-       add     T1,R2,T1                !=
-       LOAD    X(3),R3
-       add     T1,T2,T1
-       add     C,T1,C
-       sll     C,17,T2                 !=
-       srl     C,32-17,C
-       or      C,T2,C
-        xor     D,A,T1
-       add     C,D,C                   !=
-
-       sethi   %hi(0xc1bdceee),T2
-       and     T1,C,T1
-       or      T2,%lo(0xc1bdceee),T2
-       xor     T1,A,T1                 !=
-       add     T1,R3,T1
-       LOAD    X(4),R4
-       add     T1,T2,T1
-       add     B,T1,B                  !=
-       sll     B,22,T2
-       srl     B,32-22,B
-       or      B,T2,B
-        xor     C,D,T1                 !=
-       add     B,C,B
-
-       sethi   %hi(0xf57c0faf),T2
-       and     T1,B,T1
-       or      T2,%lo(0xf57c0faf),T2   !=
-       xor     T1,D,T1
-       add     T1,R4,T1
-       LOAD    X(5),R5
-       add     T1,T2,T1                !=
-       add     A,T1,A
-       sll     A,7,T2
-       srl     A,32-7,A
-       or      A,T2,A                  !=
-        xor     B,C,T1
-       add     A,B,A
-
-       sethi   %hi(0x4787c62a),T2
-       and     T1,A,T1                 !=
-       or      T2,%lo(0x4787c62a),T2
-       xor     T1,C,T1
-       LOAD    X(6),R6
-       add     T1,R5,T1                !=
-       add     T1,T2,T1
-       add     D,T1,D
-       sll     D,12,T2
-       srl     D,32-12,D               !=
-       or      D,T2,D
-        xor     A,B,T1
-       add     D,A,D
-
-       sethi   %hi(0xa8304613),T2      !=
-       and     T1,D,T1
-       or      T2,%lo(0xa8304613),T2
-       xor     T1,B,T1
-       add     T1,R6,T1                !=
-       LOAD    X(7),R7
-       add     T1,T2,T1
-       add     C,T1,C
-       sll     C,17,T2                 !=
-       srl     C,32-17,C
-       or      C,T2,C
-        xor     D,A,T1
-       add     C,D,C                   !=
-
-       sethi   %hi(0xfd469501),T2
-       and     T1,C,T1
-       or      T2,%lo(0xfd469501),T2
-       xor     T1,A,T1                 !=
-       add     T1,R7,T1
-       LOAD    X(8),R8
-       add     T1,T2,T1
-       add     B,T1,B                  !=
-       sll     B,22,T2
-       srl     B,32-22,B
-       or      B,T2,B
-        xor     C,D,T1                 !=
-       add     B,C,B
-
-       sethi   %hi(0x698098d8),T2
-       and     T1,B,T1
-       or      T2,%lo(0x698098d8),T2   !=
-       xor     T1,D,T1
-       add     T1,R8,T1
-       LOAD    X(9),R9
-       add     T1,T2,T1                !=
-       add     A,T1,A
-       sll     A,7,T2
-       srl     A,32-7,A
-       or      A,T2,A                  !=
-        xor     B,C,T1
-       add     A,B,A
-
-       sethi   %hi(0x8b44f7af),T2
-       and     T1,A,T1                 !=
-       or      T2,%lo(0x8b44f7af),T2
-       xor     T1,C,T1
-       LOAD    X(10),R10
-       add     T1,R9,T1                !=
-       add     T1,T2,T1
-       add     D,T1,D
-       sll     D,12,T2
-       srl     D,32-12,D               !=
-       or      D,T2,D
-        xor     A,B,T1
-       add     D,A,D
-
-       sethi   %hi(0xffff5bb1),T2      !=
-       and     T1,D,T1
-       or      T2,%lo(0xffff5bb1),T2
-       xor     T1,B,T1
-       add     T1,R10,T1               !=
-       LOAD    X(11),R11
-       add     T1,T2,T1
-       add     C,T1,C
-       sll     C,17,T2                 !=
-       srl     C,32-17,C
-       or      C,T2,C
-        xor     D,A,T1
-       add     C,D,C                   !=
-
-       sethi   %hi(0x895cd7be),T2
-       and     T1,C,T1
-       or      T2,%lo(0x895cd7be),T2
-       xor     T1,A,T1                 !=
-       add     T1,R11,T1
-       LOAD    X(12),R12
-       add     T1,T2,T1
-       add     B,T1,B                  !=
-       sll     B,22,T2
-       srl     B,32-22,B
-       or      B,T2,B
-        xor     C,D,T1                 !=
-       add     B,C,B
-
-       sethi   %hi(0x6b901122),T2
-       and     T1,B,T1
-       or      T2,%lo(0x6b901122),T2   !=
-       xor     T1,D,T1
-       add     T1,R12,T1
-       LOAD    X(13),R13
-       add     T1,T2,T1                !=
-       add     A,T1,A
-       sll     A,7,T2
-       srl     A,32-7,A
-       or      A,T2,A                  !=
-        xor     B,C,T1
-       add     A,B,A
-
-       sethi   %hi(0xfd987193),T2
-       and     T1,A,T1                 !=
-       or      T2,%lo(0xfd987193),T2
-       xor     T1,C,T1
-       LOAD    X(14),RX
-       add     T1,R13,T1               !=
-       add     T1,T2,T1
-       add     D,T1,D
-       sll     D,12,T2
-       srl     D,32-12,D               !=
-       or      D,T2,D
-        xor     A,B,T1
-       add     D,A,D
-
-       sethi   %hi(0xa679438e),T2      !=
-       and     T1,D,T1
-       or      T2,%lo(0xa679438e),T2
-       xor     T1,B,T1
-       add     T1,RX,T1                !=
-       LOAD    X(15),RX
-       add     T1,T2,T1
-       add     C,T1,C
-       sll     C,17,T2                 !=
-       srl     C,32-17,C
-       or      C,T2,C
-        xor     D,A,T1
-       add     C,D,C                   !=
-
-       sethi   %hi(0x49b40821),T2
-       and     T1,C,T1
-       or      T2,%lo(0x49b40821),T2
-       xor     T1,A,T1                 !=
-       add     T1,RX,T1
-       !pre-LOADed     X(1),R1
-       add     T1,T2,T1
-       add     B,T1,B
-       sll     B,22,T2                 !=
-       srl     B,32-22,B
-       or      B,T2,B
-       add     B,C,B
-
-!!!!!!!!Round 1
-
-       xor     B,C,T1                  !=
-       sethi   %hi(0xf61e2562),T2
-       and     T1,D,T1
-       or      T2,%lo(0xf61e2562),T2
-       xor     T1,C,T1                 !=
-       add     T1,R1,T1
-       !pre-LOADed     X(6),R6
-       add     T1,T2,T1
-       add     A,T1,A
-       sll     A,5,T2                  !=
-       srl     A,32-5,A
-       or      A,T2,A
-       add     A,B,A
-
-       xor     A,B,T1                  !=
-       sethi   %hi(0xc040b340),T2
-       and     T1,C,T1
-       or      T2,%lo(0xc040b340),T2
-       xor     T1,B,T1                 !=
-       add     T1,R6,T1
-       !pre-LOADed     X(11),R11
-       add     T1,T2,T1
-       add     D,T1,D
-       sll     D,9,T2                  !=
-       srl     D,32-9,D
-       or      D,T2,D
-       add     D,A,D
-
-       xor     D,A,T1                  !=
-       sethi   %hi(0x265e5a51),T2
-       and     T1,B,T1
-       or      T2,%lo(0x265e5a51),T2
-       xor     T1,A,T1                 !=
-       add     T1,R11,T1
-       !pre-LOADed     X(0),R0
-       add     T1,T2,T1
-       add     C,T1,C
-       sll     C,14,T2                 !=
-       srl     C,32-14,C
-       or      C,T2,C
-       add     C,D,C
-
-       xor     C,D,T1                  !=
-       sethi   %hi(0xe9b6c7aa),T2
-       and     T1,A,T1
-       or      T2,%lo(0xe9b6c7aa),T2
-       xor     T1,D,T1                 !=
-       add     T1,R0,T1
-       !pre-LOADed     X(5),R5
-       add     T1,T2,T1
-       add     B,T1,B
-       sll     B,20,T2                 !=
-       srl     B,32-20,B
-       or      B,T2,B
-       add     B,C,B
-
-       xor     B,C,T1                  !=
-       sethi   %hi(0xd62f105d),T2
-       and     T1,D,T1
-       or      T2,%lo(0xd62f105d),T2
-       xor     T1,C,T1                 !=
-       add     T1,R5,T1
-       !pre-LOADed     X(10),R10
-       add     T1,T2,T1
-       add     A,T1,A
-       sll     A,5,T2                  !=
-       srl     A,32-5,A
-       or      A,T2,A
-       add     A,B,A
-
-       xor     A,B,T1                  !=
-       sethi   %hi(0x02441453),T2
-       and     T1,C,T1
-       or      T2,%lo(0x02441453),T2
-       xor     T1,B,T1                 !=
-       add     T1,R10,T1
-       LOAD    X(15),RX
-       add     T1,T2,T1
-       add     D,T1,D                  !=
-       sll     D,9,T2
-       srl     D,32-9,D
-       or      D,T2,D
-       add     D,A,D                   !=
-
-       xor     D,A,T1
-       sethi   %hi(0xd8a1e681),T2
-       and     T1,B,T1
-       or      T2,%lo(0xd8a1e681),T2   !=
-       xor     T1,A,T1
-       add     T1,RX,T1
-       !pre-LOADed     X(4),R4
-       add     T1,T2,T1
-       add     C,T1,C                  !=
-       sll     C,14,T2
-       srl     C,32-14,C
-       or      C,T2,C
-       add     C,D,C                   !=
-
-       xor     C,D,T1
-       sethi   %hi(0xe7d3fbc8),T2
-       and     T1,A,T1
-       or      T2,%lo(0xe7d3fbc8),T2   !=
-       xor     T1,D,T1
-       add     T1,R4,T1
-       !pre-LOADed     X(9),R9
-       add     T1,T2,T1
-       add     B,T1,B                  !=
-       sll     B,20,T2
-       srl     B,32-20,B
-       or      B,T2,B
-       add     B,C,B                   !=
-
-       xor     B,C,T1
-       sethi   %hi(0x21e1cde6),T2
-       and     T1,D,T1
-       or      T2,%lo(0x21e1cde6),T2   !=
-       xor     T1,C,T1
-       add     T1,R9,T1
-       LOAD    X(14),RX
-       add     T1,T2,T1                !=
-       add     A,T1,A
-       sll     A,5,T2
-       srl     A,32-5,A
-       or      A,T2,A                  !=
-       add     A,B,A
-
-       xor     A,B,T1
-       sethi   %hi(0xc33707d6),T2
-       and     T1,C,T1                 !=
-       or      T2,%lo(0xc33707d6),T2
-       xor     T1,B,T1
-       add     T1,RX,T1
-       !pre-LOADed     X(3),R3
-       add     T1,T2,T1                !=
-       add     D,T1,D
-       sll     D,9,T2
-       srl     D,32-9,D
-       or      D,T2,D                  !=
-       add     D,A,D
-
-       xor     D,A,T1
-       sethi   %hi(0xf4d50d87),T2
-       and     T1,B,T1                 !=
-       or      T2,%lo(0xf4d50d87),T2
-       xor     T1,A,T1
-       add     T1,R3,T1
-       !pre-LOADed     X(8),R8
-       add     T1,T2,T1                !=
-       add     C,T1,C
-       sll     C,14,T2
-       srl     C,32-14,C
-       or      C,T2,C                  !=
-       add     C,D,C
-
-       xor     C,D,T1
-       sethi   %hi(0x455a14ed),T2
-       and     T1,A,T1                 !=
-       or      T2,%lo(0x455a14ed),T2
-       xor     T1,D,T1
-       add     T1,R8,T1
-       !pre-LOADed     X(13),R13
-       add     T1,T2,T1                !=
-       add     B,T1,B
-       sll     B,20,T2
-       srl     B,32-20,B
-       or      B,T2,B                  !=
-       add     B,C,B
-
-       xor     B,C,T1
-       sethi   %hi(0xa9e3e905),T2
-       and     T1,D,T1                 !=
-       or      T2,%lo(0xa9e3e905),T2
-       xor     T1,C,T1
-       add     T1,R13,T1
-       !pre-LOADed     X(2),R2
-       add     T1,T2,T1                !=
-       add     A,T1,A
-       sll     A,5,T2
-       srl     A,32-5,A
-       or      A,T2,A                  !=
-       add     A,B,A
-
-       xor     A,B,T1
-       sethi   %hi(0xfcefa3f8),T2
-       and     T1,C,T1                 !=
-       or      T2,%lo(0xfcefa3f8),T2
-       xor     T1,B,T1
-       add     T1,R2,T1
-       !pre-LOADed     X(7),R7
-       add     T1,T2,T1                !=
-       add     D,T1,D
-       sll     D,9,T2
-       srl     D,32-9,D
-       or      D,T2,D                  !=
-       add     D,A,D
-
-       xor     D,A,T1
-       sethi   %hi(0x676f02d9),T2
-       and     T1,B,T1                 !=
-       or      T2,%lo(0x676f02d9),T2
-       xor     T1,A,T1
-       add     T1,R7,T1
-       !pre-LOADed     X(12),R12
-       add     T1,T2,T1                !=
-       add     C,T1,C
-       sll     C,14,T2
-       srl     C,32-14,C
-       or      C,T2,C                  !=
-       add     C,D,C
-
-       xor     C,D,T1
-       sethi   %hi(0x8d2a4c8a),T2
-       and     T1,A,T1                 !=
-       or      T2,%lo(0x8d2a4c8a),T2
-       xor     T1,D,T1
-       add     T1,R12,T1
-       !pre-LOADed     X(5),R5
-       add     T1,T2,T1                !=
-       add     B,T1,B
-       sll     B,20,T2
-       srl     B,32-20,B
-       or      B,T2,B                  !=
-       add     B,C,B
-
-!!!!!!!!Round 2
-
-       xor     B,C,T1
-       sethi   %hi(0xfffa3942),T2
-       xor     T1,D,T1                 !=
-       or      T2,%lo(0xfffa3942),T2
-       add     T1,R5,T1
-       !pre-LOADed     X(8),R8
-       add     T1,T2,T1
-       add     A,T1,A                  !=
-       sll     A,4,T2
-       srl     A,32-4,A
-       or      A,T2,A
-       add     A,B,A                   !=
-
-       xor     A,B,T1
-       sethi   %hi(0x8771f681),T2
-       xor     T1,C,T1
-       or      T2,%lo(0x8771f681),T2   !=
-       add     T1,R8,T1
-       !pre-LOADed     X(11),R11
-       add     T1,T2,T1
-       add     D,T1,D
-       sll     D,11,T2                 !=
-       srl     D,32-11,D
-       or      D,T2,D
-       add     D,A,D
-
-       xor     D,A,T1                  !=
-       sethi   %hi(0x6d9d6122),T2
-       xor     T1,B,T1
-       or      T2,%lo(0x6d9d6122),T2
-       add     T1,R11,T1               !=
-       LOAD    X(14),RX
-       add     T1,T2,T1
-       add     C,T1,C
-       sll     C,16,T2                 !=
-       srl     C,32-16,C
-       or      C,T2,C
-       add     C,D,C
-
-       xor     C,D,T1                  !=
-       sethi   %hi(0xfde5380c),T2
-       xor     T1,A,T1
-       or      T2,%lo(0xfde5380c),T2
-       add     T1,RX,T1                !=
-       !pre-LOADed     X(1),R1
-       add     T1,T2,T1
-       add     B,T1,B
-       sll     B,23,T2
-       srl     B,32-23,B               !=
-       or      B,T2,B
-       add     B,C,B
-
-       xor     B,C,T1
-       sethi   %hi(0xa4beea44),T2      !=
-       xor     T1,D,T1
-       or      T2,%lo(0xa4beea44),T2
-       add     T1,R1,T1
-       !pre-LOADed     X(4),R4
-       add     T1,T2,T1                !=
-       add     A,T1,A
-       sll     A,4,T2
-       srl     A,32-4,A
-       or      A,T2,A                  !=
-       add     A,B,A
-
-       xor     A,B,T1
-       sethi   %hi(0x4bdecfa9),T2
-       xor     T1,C,T1                 !=
-       or      T2,%lo(0x4bdecfa9),T2
-       add     T1,R4,T1
-       !pre-LOADed     X(7),R7
-       add     T1,T2,T1
-       add     D,T1,D                  !=
-       sll     D,11,T2
-       srl     D,32-11,D
-       or      D,T2,D
-       add     D,A,D                   !=
-
-       xor     D,A,T1
-       sethi   %hi(0xf6bb4b60),T2
-       xor     T1,B,T1
-       or      T2,%lo(0xf6bb4b60),T2   !=
-       add     T1,R7,T1
-       !pre-LOADed     X(10),R10
-       add     T1,T2,T1
-       add     C,T1,C
-       sll     C,16,T2                 !=
-       srl     C,32-16,C
-       or      C,T2,C
-       add     C,D,C
-
-       xor     C,D,T1                  !=
-       sethi   %hi(0xbebfbc70),T2
-       xor     T1,A,T1
-       or      T2,%lo(0xbebfbc70),T2
-       add     T1,R10,T1               !=
-       !pre-LOADed     X(13),R13
-       add     T1,T2,T1
-       add     B,T1,B
-       sll     B,23,T2
-       srl     B,32-23,B               !=
-       or      B,T2,B
-       add     B,C,B
-
-       xor     B,C,T1
-       sethi   %hi(0x289b7ec6),T2      !=
-       xor     T1,D,T1
-       or      T2,%lo(0x289b7ec6),T2
-       add     T1,R13,T1
-       !pre-LOADed     X(0),R0
-       add     T1,T2,T1                !=
-       add     A,T1,A
-       sll     A,4,T2
-       srl     A,32-4,A
-       or      A,T2,A                  !=
-       add     A,B,A
-
-       xor     A,B,T1
-       sethi   %hi(0xeaa127fa),T2
-       xor     T1,C,T1                 !=
-       or      T2,%lo(0xeaa127fa),T2
-       add     T1,R0,T1
-       !pre-LOADed     X(3),R3
-       add     T1,T2,T1
-       add     D,T1,D                  !=
-       sll     D,11,T2
-       srl     D,32-11,D
-       or      D,T2,D
-       add     D,A,D                   !=
-
-       xor     D,A,T1
-       sethi   %hi(0xd4ef3085),T2
-       xor     T1,B,T1
-       or      T2,%lo(0xd4ef3085),T2   !=
-       add     T1,R3,T1
-       !pre-LOADed     X(6),R6
-       add     T1,T2,T1
-       add     C,T1,C
-       sll     C,16,T2                 !=
-       srl     C,32-16,C
-       or      C,T2,C
-       add     C,D,C
-
-       xor     C,D,T1                  !=
-       sethi   %hi(0x04881d05),T2
-       xor     T1,A,T1
-       or      T2,%lo(0x04881d05),T2
-       add     T1,R6,T1                !=
-       !pre-LOADed     X(9),R9
-       add     T1,T2,T1
-       add     B,T1,B
-       sll     B,23,T2
-       srl     B,32-23,B               !=
-       or      B,T2,B
-       add     B,C,B
-
-       xor     B,C,T1
-       sethi   %hi(0xd9d4d039),T2      !=
-       xor     T1,D,T1
-       or      T2,%lo(0xd9d4d039),T2
-       add     T1,R9,T1
-       !pre-LOADed     X(12),R12
-       add     T1,T2,T1                !=
-       add     A,T1,A
-       sll     A,4,T2
-       srl     A,32-4,A
-       or      A,T2,A                  !=
-       add     A,B,A
-
-       xor     A,B,T1
-       sethi   %hi(0xe6db99e5),T2
-       xor     T1,C,T1                 !=
-       or      T2,%lo(0xe6db99e5),T2
-       add     T1,R12,T1
-       LOAD    X(15),RX
-       add     T1,T2,T1                !=
-       add     D,T1,D
-       sll     D,11,T2
-       srl     D,32-11,D
-       or      D,T2,D                  !=
-       add     D,A,D
-
-       xor     D,A,T1
-       sethi   %hi(0x1fa27cf8),T2
-       xor     T1,B,T1                 !=
-       or      T2,%lo(0x1fa27cf8),T2
-       add     T1,RX,T1
-       !pre-LOADed     X(2),R2
-       add     T1,T2,T1
-       add     C,T1,C                  !=
-       sll     C,16,T2
-       srl     C,32-16,C
-       or      C,T2,C
-       add     C,D,C                   !=
-
-       xor     C,D,T1
-       sethi   %hi(0xc4ac5665),T2
-       xor     T1,A,T1
-       or      T2,%lo(0xc4ac5665),T2   !=
-       add     T1,R2,T1
-       !pre-LOADed     X(0),R0
-       add     T1,T2,T1
-       add     B,T1,B
-       sll     B,23,T2                 !=
-       srl     B,32-23,B
-       or      B,T2,B
-       add     B,C,B
-
-!!!!!!!!Round 3
-
-       orn     B,D,T1                  !=
-       sethi   %hi(0xf4292244),T2
-       xor     T1,C,T1
-       or      T2,%lo(0xf4292244),T2
-       add     T1,R0,T1                !=
-       !pre-LOADed     X(7),R7
-       add     T1,T2,T1
-       add     A,T1,A
-       sll     A,6,T2
-       srl     A,32-6,A                !=
-       or      A,T2,A
-       add     A,B,A
-
-       orn     A,C,T1
-       sethi   %hi(0x432aff97),T2      !=
-       xor     T1,B,T1
-       or      T2,%lo(0x432aff97),T2
-       LOAD    X(14),RX
-       add     T1,R7,T1                !=
-       add     T1,T2,T1
-       add     D,T1,D
-       sll     D,10,T2
-       srl     D,32-10,D               !=
-       or      D,T2,D
-       add     D,A,D
-
-       orn     D,B,T1
-       sethi   %hi(0xab9423a7),T2      !=
-       xor     T1,A,T1
-       or      T2,%lo(0xab9423a7),T2
-       add     T1,RX,T1
-       !pre-LOADed     X(5),R5
-       add     T1,T2,T1                !=
-       add     C,T1,C
-       sll     C,15,T2
-       srl     C,32-15,C
-       or      C,T2,C                  !=
-       add     C,D,C
-
-       orn     C,A,T1
-       sethi   %hi(0xfc93a039),T2
-       xor     T1,D,T1                 !=
-       or      T2,%lo(0xfc93a039),T2
-       add     T1,R5,T1
-       !pre-LOADed     X(12),R12
-       add     T1,T2,T1
-       add     B,T1,B                  !=
-       sll     B,21,T2
-       srl     B,32-21,B
-       or      B,T2,B
-       add     B,C,B                   !=
-
-       orn     B,D,T1
-       sethi   %hi(0x655b59c3),T2
-       xor     T1,C,T1
-       or      T2,%lo(0x655b59c3),T2   !=
-       add     T1,R12,T1
-       !pre-LOADed     X(3),R3
-       add     T1,T2,T1
-       add     A,T1,A
-       sll     A,6,T2                  !=
-       srl     A,32-6,A
-       or      A,T2,A
-       add     A,B,A
-
-       orn     A,C,T1                  !=
-       sethi   %hi(0x8f0ccc92),T2
-       xor     T1,B,T1
-       or      T2,%lo(0x8f0ccc92),T2
-       add     T1,R3,T1                !=
-       !pre-LOADed     X(10),R10
-       add     T1,T2,T1
-       add     D,T1,D
-       sll     D,10,T2
-       srl     D,32-10,D               !=
-       or      D,T2,D
-       add     D,A,D
-
-       orn     D,B,T1
-       sethi   %hi(0xffeff47d),T2      !=
-       xor     T1,A,T1
-       or      T2,%lo(0xffeff47d),T2
-       add     T1,R10,T1
-       !pre-LOADed     X(1),R1
-       add     T1,T2,T1                !=
-       add     C,T1,C
-       sll     C,15,T2
-       srl     C,32-15,C
-       or      C,T2,C                  !=
-       add     C,D,C
-
-       orn     C,A,T1
-       sethi   %hi(0x85845dd1),T2
-       xor     T1,D,T1                 !=
-       or      T2,%lo(0x85845dd1),T2
-       add     T1,R1,T1
-       !pre-LOADed     X(8),R8
-       add     T1,T2,T1
-       add     B,T1,B                  !=
-       sll     B,21,T2
-       srl     B,32-21,B
-       or      B,T2,B
-       add     B,C,B                   !=
-
-       orn     B,D,T1
-       sethi   %hi(0x6fa87e4f),T2
-       xor     T1,C,T1
-       or      T2,%lo(0x6fa87e4f),T2   !=
-       add     T1,R8,T1
-       LOAD    X(15),RX
-       add     T1,T2,T1
-       add     A,T1,A                  !=
-       sll     A,6,T2
-       srl     A,32-6,A
-       or      A,T2,A
-       add     A,B,A                   !=
-
-       orn     A,C,T1
-       sethi   %hi(0xfe2ce6e0),T2
-       xor     T1,B,T1
-       or      T2,%lo(0xfe2ce6e0),T2   !=
-       add     T1,RX,T1
-       !pre-LOADed     X(6),R6
-       add     T1,T2,T1
-       add     D,T1,D
-       sll     D,10,T2                 !=
-       srl     D,32-10,D
-       or      D,T2,D
-       add     D,A,D
-
-       orn     D,B,T1                  !=
-       sethi   %hi(0xa3014314),T2
-       xor     T1,A,T1
-       or      T2,%lo(0xa3014314),T2
-       add     T1,R6,T1                !=
-       !pre-LOADed     X(13),R13
-       add     T1,T2,T1
-       add     C,T1,C
-       sll     C,15,T2
-       srl     C,32-15,C               !=
-       or      C,T2,C
-       add     C,D,C
-
-       orn     C,A,T1
-       sethi   %hi(0x4e0811a1),T2      !=
-       xor     T1,D,T1
-       or      T2,%lo(0x4e0811a1),T2
-       !pre-LOADed     X(4),R4
-        ld      [Aptr],Aval
-       add     T1,R13,T1               !=
-       add     T1,T2,T1
-       add     B,T1,B
-       sll     B,21,T2
-       srl     B,32-21,B               !=
-       or      B,T2,B
-       add     B,C,B
-
-       orn     B,D,T1
-       sethi   %hi(0xf7537e82),T2      !=
-       xor     T1,C,T1
-       or      T2,%lo(0xf7537e82),T2
-       !pre-LOADed     X(11),R11
-        ld      [Dptr],Dval
-       add     T1,R4,T1                !=
-       add     T1,T2,T1
-       add     A,T1,A
-       sll     A,6,T2
-       srl     A,32-6,A                !=
-       or      A,T2,A
-       add     A,B,A
-
-       orn     A,C,T1
-       sethi   %hi(0xbd3af235),T2      !=
-       xor     T1,B,T1
-       or      T2,%lo(0xbd3af235),T2
-       !pre-LOADed     X(2),R2
-        ld      [Cptr],Cval
-       add     T1,R11,T1               !=
-       add     T1,T2,T1
-       add     D,T1,D
-       sll     D,10,T2
-       srl     D,32-10,D               !=
-       or      D,T2,D
-       add     D,A,D
-
-       orn     D,B,T1
-       sethi   %hi(0x2ad7d2bb),T2      !=
-       xor     T1,A,T1
-       or      T2,%lo(0x2ad7d2bb),T2
-       !pre-LOADed     X(9),R9
-        ld      [Bptr],Bval
-       add     T1,R2,T1                !=
-        add     Aval,A,Aval
-       add     T1,T2,T1
-        st      Aval,[Aptr]
-       add     C,T1,C                  !=
-       sll     C,15,T2
-        add     Dval,D,Dval
-       srl     C,32-15,C
-       or      C,T2,C                  !=
-        st      Dval,[Dptr]
-       add     C,D,C
-
-       orn     C,A,T1
-       sethi   %hi(0xeb86d391),T2      !=
-       xor     T1,D,T1
-       or      T2,%lo(0xeb86d391),T2
-       add     T1,R9,T1
-       !pre-LOADed     X(0),R0
-        mov     Aval,A                 !=
-       add     T1,T2,T1
-        mov     Dval,D
-       add     B,T1,B
-       sll     B,21,T2                 !=
-        add     Cval,C,Cval
-       srl     B,32-21,B
-        st      Cval,[Cptr]
-       or      B,T2,B                  !=
-       add     B,C,B
-
-       deccc   %i2
-       mov     Cval,C
-       add     B,Bval,B                !=
-       inc     64,%i1
-       nop
-       st      B,[Bptr]
-       nop                             !=
-
-#ifdef OPENSSL_SYSNAME_ULTRASPARC
-       bg,a,pt %icc,.Lmd5_block_loop
-#else
-       bg,a    .Lmd5_block_loop
-#endif
-       LOAD    X(0),R0
-
-#ifdef ASI_PRIMARY_LITTLE
-       wr      %g0,%o7,%asi
-#endif
-       ret
-       restore %g0,0,%o0
-
-.type  md5_block,#function
-.size  md5_block,(.-md5_block)
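
The SPARC code removed above spells every MD5 step out in full: each
32-bit round constant is synthesized with a sethi %hi()/or %lo() pair
(SPARC immediates are only 13 bits wide), and the rotate is emulated
with the sll/srl/or triple because SPARCv8 has no rotate instruction.
As a sketch, the 0x02441453 step of round 1 above corresponds to the
following C (helper names are illustrative, not taken from the tree):

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t v, int n)
    {
        return (v << n) | (v >> (32 - n));   /* the sll/srl/or triple */
    }

    /* d += G(a,b,c) + X[10] + K; d = ROTATE(d,9); d += a;
     * where ((a ^ b) & c) ^ b == (a & c) | (b & ~c) == G(a,b,c),
     * exactly the xor/and/xor sequence used in the assembly. */
    static inline uint32_t md5_round1_step(uint32_t a, uint32_t b,
                                           uint32_t c, uint32_t d,
                                           uint32_t x10)
    {
        uint32_t t = ((a ^ b) & c) ^ b;
        d += t + x10 + 0x02441453;           /* sethi %hi(K), or %lo(K) */
        return rotl32(d, 9) + a;
    }
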
index c36a7febf7f685f6817424d63246b1ff4c4a4f88..9a6fa67224ee2032af9e60edb930501305d12526 100755 (executable)
--- a/crypto/md5/asm/md5-x86_64.pl
+++ b/crypto/md5/asm/md5-x86_64.pl
@@ -111,9 +111,9 @@ $code .= <<EOF;
 .text
 .align 16
 
-.globl md5_block_asm_host_order
-.type md5_block_asm_host_order,\@function,3
-md5_block_asm_host_order:
+.globl md5_block_asm_data_order
+.type md5_block_asm_data_order,\@function,3
+md5_block_asm_data_order:
        push    %rbp
        push    %rbx
        push    %r14
@@ -237,7 +237,7 @@ $code .= <<EOF;
        pop     %rbx
        pop     %rbp
        ret
-.size md5_block_asm_host_order,.-md5_block_asm_host_order
+.size md5_block_asm_data_order,.-md5_block_asm_data_order
 EOF
 
 print $code;
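
Only the symbol changes in the x86_64 module: on a little-endian CPU
the raw input byte stream is already in the word order MD5 expects, so
the former "host order" body serves as the "data order" implementation
unmodified. A minimal check of that assumption (hypothetical test, not
part of the source):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        const unsigned char msg[4] = {0x78, 0x56, 0x34, 0x12};
        uint32_t w;
        memcpy(&w, msg, sizeof(w));  /* unaligned-safe word load */
        assert(w == 0x12345678);     /* holds on little-endian hosts */
        return 0;
    }
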
index 953f0496f69cc47dad46ad171ad255c95a38d9e4..b96e332ba41786af1d0e5273a96ba08cd55aaa1e 100644 (file)
--- a/crypto/md5/md5_dgst.c
+++ b/crypto/md5/md5_dgst.c
@@ -82,96 +82,6 @@ int MD5_Init(MD5_CTX *c)
        return 1;
        }
 
-#ifndef md5_block_host_order
-void md5_block_host_order (MD5_CTX *c, const void *data, size_t num)
-       {
-       const MD5_LONG *X=data;
-       register unsigned MD32_REG_T A,B,C,D;
-
-       A=c->A;
-       B=c->B;
-       C=c->C;
-       D=c->D;
-
-       for (;num--;X+=HASH_LBLOCK)
-               {
-       /* Round 0 */
-       R0(A,B,C,D,X[ 0], 7,0xd76aa478L);
-       R0(D,A,B,C,X[ 1],12,0xe8c7b756L);
-       R0(C,D,A,B,X[ 2],17,0x242070dbL);
-       R0(B,C,D,A,X[ 3],22,0xc1bdceeeL);
-       R0(A,B,C,D,X[ 4], 7,0xf57c0fafL);
-       R0(D,A,B,C,X[ 5],12,0x4787c62aL);
-       R0(C,D,A,B,X[ 6],17,0xa8304613L);
-       R0(B,C,D,A,X[ 7],22,0xfd469501L);
-       R0(A,B,C,D,X[ 8], 7,0x698098d8L);
-       R0(D,A,B,C,X[ 9],12,0x8b44f7afL);
-       R0(C,D,A,B,X[10],17,0xffff5bb1L);
-       R0(B,C,D,A,X[11],22,0x895cd7beL);
-       R0(A,B,C,D,X[12], 7,0x6b901122L);
-       R0(D,A,B,C,X[13],12,0xfd987193L);
-       R0(C,D,A,B,X[14],17,0xa679438eL);
-       R0(B,C,D,A,X[15],22,0x49b40821L);
-       /* Round 1 */
-       R1(A,B,C,D,X[ 1], 5,0xf61e2562L);
-       R1(D,A,B,C,X[ 6], 9,0xc040b340L);
-       R1(C,D,A,B,X[11],14,0x265e5a51L);
-       R1(B,C,D,A,X[ 0],20,0xe9b6c7aaL);
-       R1(A,B,C,D,X[ 5], 5,0xd62f105dL);
-       R1(D,A,B,C,X[10], 9,0x02441453L);
-       R1(C,D,A,B,X[15],14,0xd8a1e681L);
-       R1(B,C,D,A,X[ 4],20,0xe7d3fbc8L);
-       R1(A,B,C,D,X[ 9], 5,0x21e1cde6L);
-       R1(D,A,B,C,X[14], 9,0xc33707d6L);
-       R1(C,D,A,B,X[ 3],14,0xf4d50d87L);
-       R1(B,C,D,A,X[ 8],20,0x455a14edL);
-       R1(A,B,C,D,X[13], 5,0xa9e3e905L);
-       R1(D,A,B,C,X[ 2], 9,0xfcefa3f8L);
-       R1(C,D,A,B,X[ 7],14,0x676f02d9L);
-       R1(B,C,D,A,X[12],20,0x8d2a4c8aL);
-       /* Round 2 */
-       R2(A,B,C,D,X[ 5], 4,0xfffa3942L);
-       R2(D,A,B,C,X[ 8],11,0x8771f681L);
-       R2(C,D,A,B,X[11],16,0x6d9d6122L);
-       R2(B,C,D,A,X[14],23,0xfde5380cL);
-       R2(A,B,C,D,X[ 1], 4,0xa4beea44L);
-       R2(D,A,B,C,X[ 4],11,0x4bdecfa9L);
-       R2(C,D,A,B,X[ 7],16,0xf6bb4b60L);
-       R2(B,C,D,A,X[10],23,0xbebfbc70L);
-       R2(A,B,C,D,X[13], 4,0x289b7ec6L);
-       R2(D,A,B,C,X[ 0],11,0xeaa127faL);
-       R2(C,D,A,B,X[ 3],16,0xd4ef3085L);
-       R2(B,C,D,A,X[ 6],23,0x04881d05L);
-       R2(A,B,C,D,X[ 9], 4,0xd9d4d039L);
-       R2(D,A,B,C,X[12],11,0xe6db99e5L);
-       R2(C,D,A,B,X[15],16,0x1fa27cf8L);
-       R2(B,C,D,A,X[ 2],23,0xc4ac5665L);
-       /* Round 3 */
-       R3(A,B,C,D,X[ 0], 6,0xf4292244L);
-       R3(D,A,B,C,X[ 7],10,0x432aff97L);
-       R3(C,D,A,B,X[14],15,0xab9423a7L);
-       R3(B,C,D,A,X[ 5],21,0xfc93a039L);
-       R3(A,B,C,D,X[12], 6,0x655b59c3L);
-       R3(D,A,B,C,X[ 3],10,0x8f0ccc92L);
-       R3(C,D,A,B,X[10],15,0xffeff47dL);
-       R3(B,C,D,A,X[ 1],21,0x85845dd1L);
-       R3(A,B,C,D,X[ 8], 6,0x6fa87e4fL);
-       R3(D,A,B,C,X[15],10,0xfe2ce6e0L);
-       R3(C,D,A,B,X[ 6],15,0xa3014314L);
-       R3(B,C,D,A,X[13],21,0x4e0811a1L);
-       R3(A,B,C,D,X[ 4], 6,0xf7537e82L);
-       R3(D,A,B,C,X[11],10,0xbd3af235L);
-       R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL);
-       R3(B,C,D,A,X[ 9],21,0xeb86d391L);
-
-       A = c->A += A;
-       B = c->B += B;
-       C = c->C += C;
-       D = c->D += D;
-               }
-       }
-#endif
-
 #ifndef md5_block_data_order
 #ifdef X
 #undef X
@@ -274,19 +184,3 @@ void md5_block_data_order (MD5_CTX *c, const void *data_, size_t num)
                }
        }
 #endif
-
-#ifdef undef
-int printit(unsigned long *l)
-       {
-       int i,ii;
-
-       for (i=0; i<2; i++)
-               {
-               for (ii=0; ii<8; ii++)
-                       {
-                       fprintf(stderr,"%08lx ",l[i*8+ii]);
-                       }
-               fprintf(stderr,"\n");
-               }
-       }
-#endif
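
The deleted host-order transform and the surviving data-order one
expand the same R0..R3 macros; per RFC 1321 each is one step of the
form "a += F(b,c,d) + X[i] + K; a = ROTATE(a,s); a += b", with the
boolean function varying per round. A hedged C rendering of the
round-0 case (macro bodies approximated, not quoted from md5_locl.h):

    #define F(b,c,d)    ((((c) ^ (d)) & (b)) ^ (d))  /* (b&c)|(~b&d) */
    #define ROTATE(v,s) (((v) << (s)) | ((v) >> (32 - (s))))
    #define R0(a,b,c,d,x,s,k) do {              \
            (a) += F((b),(c),(d)) + (x) + (k);  \
            (a)  = ROTATE((a),(s));             \
            (a) += (b); } while (0)
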
index 94f395f27a249243d2e5ce7a89a3341d4582a55b..84e81b960de481410fe5e6c862b6c1103d69c004 100644 (file)
--- a/crypto/md5/md5_locl.h
+++ b/crypto/md5/md5_locl.h
 #endif
 
 #ifdef MD5_ASM
-# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__)
-#  if !defined(B_ENDIAN)
-#   define md5_block_host_order md5_block_asm_host_order
-#  endif
-# elif defined(__sparc) && defined(OPENSSL_SYS_ULTRASPARC)
-   void md5_block_asm_data_order_aligned (MD5_CTX *c, const MD5_LONG *p,size_t num);
-#  define HASH_BLOCK_DATA_ORDER_ALIGNED md5_block_asm_data_order_aligned
+# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) || \
+     defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
+#  define md5_block_data_order md5_block_asm_data_order
 # endif
 #endif
 
-void md5_block_host_order (MD5_CTX *c, const void *p,size_t num);
 void md5_block_data_order (MD5_CTX *c, const void *p,size_t num);
 
-#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__)
-# if !defined(B_ENDIAN)
-/*
- * *_block_host_order is expected to handle aligned data while
- * *_block_data_order - unaligned. As algorithm and host (x86)
- * are in this case of the same "endianness" these two are
- * otherwise indistinguishable. But normally you don't want to
- * call the same function because unaligned access in places
- * where alignment is expected is usually a "Bad Thing". Indeed,
- * on RISCs you get punished with BUS ERROR signal or *severe*
- * performance degradation. Intel CPUs are in turn perfectly
- * capable of loading unaligned data without such drastic side
- * effect. Yes, they say it's slower than aligned load, but no
- * exception is generated and therefore performance degradation
- * is *incomparable* with RISCs. What we should weight here is
- * costs of unaligned access against costs of aligning data.
- * According to my measurements allowing unaligned access results
- * in ~9% performance improvement on Pentium II operating at
- * 266MHz. I won't be surprised if the difference will be higher
- * on faster systems:-)
- *
- *                             <appro@fy.chalmers.se>
- */
-# define md5_block_data_order md5_block_host_order
-# endif
-#endif
-
 #define DATA_ORDER_IS_LITTLE_ENDIAN
 
 #define HASH_LONG              MD5_LONG
-#define HASH_LONG_LOG2         MD5_LONG_LOG2
 #define HASH_CTX               MD5_CTX
 #define HASH_CBLOCK            MD5_CBLOCK
-#define HASH_LBLOCK            MD5_LBLOCK
 #define HASH_UPDATE            MD5_Update
 #define HASH_TRANSFORM         MD5_Transform
 #define HASH_FINAL             MD5_Final
@@ -123,21 +89,7 @@ void md5_block_data_order (MD5_CTX *c, const void *p,size_t num);
        ll=(c)->C; HOST_l2c(ll,(s));    \
        ll=(c)->D; HOST_l2c(ll,(s));    \
        } while (0)
-#define HASH_BLOCK_HOST_ORDER  md5_block_host_order
-#if !defined(L_ENDIAN) || defined(md5_block_data_order)
 #define        HASH_BLOCK_DATA_ORDER   md5_block_data_order
-/*
- * Little-endians (Intel and Alpha) feel better without this.
- * It looks like memcpy does better job than generic
- * md5_block_data_order on copying-n-aligning input data.
- * But frankly speaking I didn't expect such result on Alpha.
- * On the other hand I've got this with egcs-1.0.2 and if
- * program is compiled with another (better?) compiler it
- * might turn out other way around.
- *
- *                             <appro@fy.chalmers.se>
- */
-#endif
 
 #include "md32_common.h"
 
index 0ab6f76bfffb4a5ef1fa3d82ad91e2cae9e96ac3..4f3c4c967f5146be91219395b5775d2ae33ccf47 100644 (file)
--- a/crypto/ripemd/asm/rmd-586.pl
+++ b/crypto/ripemd/asm/rmd-586.pl
@@ -1,7 +1,7 @@
 #!/usr/local/bin/perl
 
 # Normal is the
-# ripemd160_block_asm_host_order(RIPEMD160_CTX *c, ULONG *X,int blocks);
+# ripemd160_block_asm_data_order(RIPEMD160_CTX *c, ULONG *X,int blocks);
 
 $normal=0;
 
@@ -56,7 +56,7 @@ $KR3=0x7A6D76E9;
         8, 5,12, 9,12, 5,14, 6, 8,13, 6, 5,15,13,11,11,
        );
 
-&ripemd160_block("ripemd160_block_asm_host_order");
+&ripemd160_block("ripemd160_block_asm_data_order");
 &asm_finish();
 
 sub Xv
index 9608a8fd0e564aef7f6dcebc5edd5bde382bf292..61626284b8fad1c12e7c3215ddb2c1c8d1ab834d 100644 (file)
--- a/crypto/ripemd/rmd_dgst.c
+++ b/crypto/ripemd/rmd_dgst.c
@@ -82,207 +82,6 @@ int RIPEMD160_Init(RIPEMD160_CTX *c)
        return 1;
        }
 
-#ifndef ripemd160_block_host_order
-#ifdef X
-#undef X
-#endif
-#define X(i)   XX[i]
-void ripemd160_block_host_order (RIPEMD160_CTX *ctx, const void *p, size_t num)
-       {
-       const RIPEMD160_LONG *XX=p;
-       register unsigned MD32_REG_T A,B,C,D,E;
-       register unsigned MD32_REG_T a,b,c,d,e;
-
-       for (;num--;XX+=HASH_LBLOCK)
-               {
-
-       A=ctx->A; B=ctx->B; C=ctx->C; D=ctx->D; E=ctx->E;
-
-       RIP1(A,B,C,D,E,WL00,SL00);
-       RIP1(E,A,B,C,D,WL01,SL01);
-       RIP1(D,E,A,B,C,WL02,SL02);
-       RIP1(C,D,E,A,B,WL03,SL03);
-       RIP1(B,C,D,E,A,WL04,SL04);
-       RIP1(A,B,C,D,E,WL05,SL05);
-       RIP1(E,A,B,C,D,WL06,SL06);
-       RIP1(D,E,A,B,C,WL07,SL07);
-       RIP1(C,D,E,A,B,WL08,SL08);
-       RIP1(B,C,D,E,A,WL09,SL09);
-       RIP1(A,B,C,D,E,WL10,SL10);
-       RIP1(E,A,B,C,D,WL11,SL11);
-       RIP1(D,E,A,B,C,WL12,SL12);
-       RIP1(C,D,E,A,B,WL13,SL13);
-       RIP1(B,C,D,E,A,WL14,SL14);
-       RIP1(A,B,C,D,E,WL15,SL15);
-
-       RIP2(E,A,B,C,D,WL16,SL16,KL1);
-       RIP2(D,E,A,B,C,WL17,SL17,KL1);
-       RIP2(C,D,E,A,B,WL18,SL18,KL1);
-       RIP2(B,C,D,E,A,WL19,SL19,KL1);
-       RIP2(A,B,C,D,E,WL20,SL20,KL1);
-       RIP2(E,A,B,C,D,WL21,SL21,KL1);
-       RIP2(D,E,A,B,C,WL22,SL22,KL1);
-       RIP2(C,D,E,A,B,WL23,SL23,KL1);
-       RIP2(B,C,D,E,A,WL24,SL24,KL1);
-       RIP2(A,B,C,D,E,WL25,SL25,KL1);
-       RIP2(E,A,B,C,D,WL26,SL26,KL1);
-       RIP2(D,E,A,B,C,WL27,SL27,KL1);
-       RIP2(C,D,E,A,B,WL28,SL28,KL1);
-       RIP2(B,C,D,E,A,WL29,SL29,KL1);
-       RIP2(A,B,C,D,E,WL30,SL30,KL1);
-       RIP2(E,A,B,C,D,WL31,SL31,KL1);
-
-       RIP3(D,E,A,B,C,WL32,SL32,KL2);
-       RIP3(C,D,E,A,B,WL33,SL33,KL2);
-       RIP3(B,C,D,E,A,WL34,SL34,KL2);
-       RIP3(A,B,C,D,E,WL35,SL35,KL2);
-       RIP3(E,A,B,C,D,WL36,SL36,KL2);
-       RIP3(D,E,A,B,C,WL37,SL37,KL2);
-       RIP3(C,D,E,A,B,WL38,SL38,KL2);
-       RIP3(B,C,D,E,A,WL39,SL39,KL2);
-       RIP3(A,B,C,D,E,WL40,SL40,KL2);
-       RIP3(E,A,B,C,D,WL41,SL41,KL2);
-       RIP3(D,E,A,B,C,WL42,SL42,KL2);
-       RIP3(C,D,E,A,B,WL43,SL43,KL2);
-       RIP3(B,C,D,E,A,WL44,SL44,KL2);
-       RIP3(A,B,C,D,E,WL45,SL45,KL2);
-       RIP3(E,A,B,C,D,WL46,SL46,KL2);
-       RIP3(D,E,A,B,C,WL47,SL47,KL2);
-
-       RIP4(C,D,E,A,B,WL48,SL48,KL3);
-       RIP4(B,C,D,E,A,WL49,SL49,KL3);
-       RIP4(A,B,C,D,E,WL50,SL50,KL3);
-       RIP4(E,A,B,C,D,WL51,SL51,KL3);
-       RIP4(D,E,A,B,C,WL52,SL52,KL3);
-       RIP4(C,D,E,A,B,WL53,SL53,KL3);
-       RIP4(B,C,D,E,A,WL54,SL54,KL3);
-       RIP4(A,B,C,D,E,WL55,SL55,KL3);
-       RIP4(E,A,B,C,D,WL56,SL56,KL3);
-       RIP4(D,E,A,B,C,WL57,SL57,KL3);
-       RIP4(C,D,E,A,B,WL58,SL58,KL3);
-       RIP4(B,C,D,E,A,WL59,SL59,KL3);
-       RIP4(A,B,C,D,E,WL60,SL60,KL3);
-       RIP4(E,A,B,C,D,WL61,SL61,KL3);
-       RIP4(D,E,A,B,C,WL62,SL62,KL3);
-       RIP4(C,D,E,A,B,WL63,SL63,KL3);
-
-       RIP5(B,C,D,E,A,WL64,SL64,KL4);
-       RIP5(A,B,C,D,E,WL65,SL65,KL4);
-       RIP5(E,A,B,C,D,WL66,SL66,KL4);
-       RIP5(D,E,A,B,C,WL67,SL67,KL4);
-       RIP5(C,D,E,A,B,WL68,SL68,KL4);
-       RIP5(B,C,D,E,A,WL69,SL69,KL4);
-       RIP5(A,B,C,D,E,WL70,SL70,KL4);
-       RIP5(E,A,B,C,D,WL71,SL71,KL4);
-       RIP5(D,E,A,B,C,WL72,SL72,KL4);
-       RIP5(C,D,E,A,B,WL73,SL73,KL4);
-       RIP5(B,C,D,E,A,WL74,SL74,KL4);
-       RIP5(A,B,C,D,E,WL75,SL75,KL4);
-       RIP5(E,A,B,C,D,WL76,SL76,KL4);
-       RIP5(D,E,A,B,C,WL77,SL77,KL4);
-       RIP5(C,D,E,A,B,WL78,SL78,KL4);
-       RIP5(B,C,D,E,A,WL79,SL79,KL4);
-
-       a=A; b=B; c=C; d=D; e=E;
-       /* Do other half */
-       A=ctx->A; B=ctx->B; C=ctx->C; D=ctx->D; E=ctx->E;
-
-       RIP5(A,B,C,D,E,WR00,SR00,KR0);
-       RIP5(E,A,B,C,D,WR01,SR01,KR0);
-       RIP5(D,E,A,B,C,WR02,SR02,KR0);
-       RIP5(C,D,E,A,B,WR03,SR03,KR0);
-       RIP5(B,C,D,E,A,WR04,SR04,KR0);
-       RIP5(A,B,C,D,E,WR05,SR05,KR0);
-       RIP5(E,A,B,C,D,WR06,SR06,KR0);
-       RIP5(D,E,A,B,C,WR07,SR07,KR0);
-       RIP5(C,D,E,A,B,WR08,SR08,KR0);
-       RIP5(B,C,D,E,A,WR09,SR09,KR0);
-       RIP5(A,B,C,D,E,WR10,SR10,KR0);
-       RIP5(E,A,B,C,D,WR11,SR11,KR0);
-       RIP5(D,E,A,B,C,WR12,SR12,KR0);
-       RIP5(C,D,E,A,B,WR13,SR13,KR0);
-       RIP5(B,C,D,E,A,WR14,SR14,KR0);
-       RIP5(A,B,C,D,E,WR15,SR15,KR0);
-
-       RIP4(E,A,B,C,D,WR16,SR16,KR1);
-       RIP4(D,E,A,B,C,WR17,SR17,KR1);
-       RIP4(C,D,E,A,B,WR18,SR18,KR1);
-       RIP4(B,C,D,E,A,WR19,SR19,KR1);
-       RIP4(A,B,C,D,E,WR20,SR20,KR1);
-       RIP4(E,A,B,C,D,WR21,SR21,KR1);
-       RIP4(D,E,A,B,C,WR22,SR22,KR1);
-       RIP4(C,D,E,A,B,WR23,SR23,KR1);
-       RIP4(B,C,D,E,A,WR24,SR24,KR1);
-       RIP4(A,B,C,D,E,WR25,SR25,KR1);
-       RIP4(E,A,B,C,D,WR26,SR26,KR1);
-       RIP4(D,E,A,B,C,WR27,SR27,KR1);
-       RIP4(C,D,E,A,B,WR28,SR28,KR1);
-       RIP4(B,C,D,E,A,WR29,SR29,KR1);
-       RIP4(A,B,C,D,E,WR30,SR30,KR1);
-       RIP4(E,A,B,C,D,WR31,SR31,KR1);
-
-       RIP3(D,E,A,B,C,WR32,SR32,KR2);
-       RIP3(C,D,E,A,B,WR33,SR33,KR2);
-       RIP3(B,C,D,E,A,WR34,SR34,KR2);
-       RIP3(A,B,C,D,E,WR35,SR35,KR2);
-       RIP3(E,A,B,C,D,WR36,SR36,KR2);
-       RIP3(D,E,A,B,C,WR37,SR37,KR2);
-       RIP3(C,D,E,A,B,WR38,SR38,KR2);
-       RIP3(B,C,D,E,A,WR39,SR39,KR2);
-       RIP3(A,B,C,D,E,WR40,SR40,KR2);
-       RIP3(E,A,B,C,D,WR41,SR41,KR2);
-       RIP3(D,E,A,B,C,WR42,SR42,KR2);
-       RIP3(C,D,E,A,B,WR43,SR43,KR2);
-       RIP3(B,C,D,E,A,WR44,SR44,KR2);
-       RIP3(A,B,C,D,E,WR45,SR45,KR2);
-       RIP3(E,A,B,C,D,WR46,SR46,KR2);
-       RIP3(D,E,A,B,C,WR47,SR47,KR2);
-
-       RIP2(C,D,E,A,B,WR48,SR48,KR3);
-       RIP2(B,C,D,E,A,WR49,SR49,KR3);
-       RIP2(A,B,C,D,E,WR50,SR50,KR3);
-       RIP2(E,A,B,C,D,WR51,SR51,KR3);
-       RIP2(D,E,A,B,C,WR52,SR52,KR3);
-       RIP2(C,D,E,A,B,WR53,SR53,KR3);
-       RIP2(B,C,D,E,A,WR54,SR54,KR3);
-       RIP2(A,B,C,D,E,WR55,SR55,KR3);
-       RIP2(E,A,B,C,D,WR56,SR56,KR3);
-       RIP2(D,E,A,B,C,WR57,SR57,KR3);
-       RIP2(C,D,E,A,B,WR58,SR58,KR3);
-       RIP2(B,C,D,E,A,WR59,SR59,KR3);
-       RIP2(A,B,C,D,E,WR60,SR60,KR3);
-       RIP2(E,A,B,C,D,WR61,SR61,KR3);
-       RIP2(D,E,A,B,C,WR62,SR62,KR3);
-       RIP2(C,D,E,A,B,WR63,SR63,KR3);
-
-       RIP1(B,C,D,E,A,WR64,SR64);
-       RIP1(A,B,C,D,E,WR65,SR65);
-       RIP1(E,A,B,C,D,WR66,SR66);
-       RIP1(D,E,A,B,C,WR67,SR67);
-       RIP1(C,D,E,A,B,WR68,SR68);
-       RIP1(B,C,D,E,A,WR69,SR69);
-       RIP1(A,B,C,D,E,WR70,SR70);
-       RIP1(E,A,B,C,D,WR71,SR71);
-       RIP1(D,E,A,B,C,WR72,SR72);
-       RIP1(C,D,E,A,B,WR73,SR73);
-       RIP1(B,C,D,E,A,WR74,SR74);
-       RIP1(A,B,C,D,E,WR75,SR75);
-       RIP1(E,A,B,C,D,WR76,SR76);
-       RIP1(D,E,A,B,C,WR77,SR77);
-       RIP1(C,D,E,A,B,WR78,SR78);
-       RIP1(B,C,D,E,A,WR79,SR79);
-
-       D     =ctx->B+c+D;
-       ctx->B=ctx->C+d+E;
-       ctx->C=ctx->D+e+A;
-       ctx->D=ctx->E+a+B;
-       ctx->E=ctx->A+b+C;
-       ctx->A=D;
-
-               }
-       }
-#endif
-
 #ifndef ripemd160_block_data_order
 #ifdef X
 #undef X
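
The function deleted above computes RIPEMD-160's "left" line in A..E,
saves it to a..e, reruns A..E as the "right" line (note the round
functions applied in reverse, RIP5 down to RIP1), then folds both back
into the context with a rotating offset. The same five assignments as
a C sketch, with hypothetical *L/*R names for the two lines:

    #include <stdint.h>

    typedef struct { uint32_t A, B, C, D, E; } RMD160_STATE; /* illustrative */

    static void rmd160_combine(RMD160_STATE *ctx,
                               uint32_t aL, uint32_t bL, uint32_t cL,
                               uint32_t dL, uint32_t eL,
                               uint32_t aR, uint32_t bR, uint32_t cR,
                               uint32_t dR, uint32_t eR)
    {
        uint32_t t = ctx->B + cL + dR;  /* becomes the new A */
        ctx->B     = ctx->C + dL + eR;
        ctx->C     = ctx->D + eL + aR;
        ctx->D     = ctx->E + aL + bR;
        ctx->E     = ctx->A + bL + cR;
        ctx->A     = t;
    }
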
index b52d7861654cd6d9c26899acba33d43249724306..f14b346e662296865442577432678c92afb377f2 100644 (file)
--- a/crypto/ripemd/rmd_locl.h
+++ b/crypto/ripemd/rmd_locl.h
  */
 #ifdef RMD160_ASM
 # if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__)
-#  if !defined(B_ENDIAN)
-#   define ripemd160_block_host_order ripemd160_block_asm_host_order
-#  endif
+#  define ripemd160_block_data_order ripemd160_block_asm_data_order
 # endif
 #endif
 
-void ripemd160_block_host_order (RIPEMD160_CTX *c, const void *p,size_t num);
 void ripemd160_block_data_order (RIPEMD160_CTX *c, const void *p,size_t num);
 
-#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__)
-# if !defined(B_ENDIAN)
-#  define ripemd160_block_data_order ripemd160_block_host_order
-# endif
-#endif
-
 #define DATA_ORDER_IS_LITTLE_ENDIAN
 
 #define HASH_LONG               RIPEMD160_LONG
-#define HASH_LONG_LOG2          RIPEMD160_LONG_LOG2
 #define HASH_CTX                RIPEMD160_CTX
 #define HASH_CBLOCK             RIPEMD160_CBLOCK
-#define HASH_LBLOCK             RIPEMD160_LBLOCK
 #define HASH_UPDATE             RIPEMD160_Update
 #define HASH_TRANSFORM          RIPEMD160_Transform
 #define HASH_FINAL              RIPEMD160_Final
-#define HASH_BLOCK_HOST_ORDER   ripemd160_block_host_order
 #define        HASH_MAKE_STRING(c,s)   do {    \
        unsigned long ll;               \
        ll=(c)->A; HOST_l2c(ll,(s));    \
@@ -106,9 +94,7 @@ void ripemd160_block_data_order (RIPEMD160_CTX *c, const void *p,size_t num);
        ll=(c)->D; HOST_l2c(ll,(s));    \
        ll=(c)->E; HOST_l2c(ll,(s));    \
        } while (0)
-#if !defined(L_ENDIAN) || defined(ripemd160_block_data_order)
 #define HASH_BLOCK_DATA_ORDER   ripemd160_block_data_order
-#endif
 
 #include "md32_common.h"
 
index 4f8521f1e2ca863322160842d9d14a646a50eb34..0b4dab2bd53c1414962d5208963e738938fb84d4 100644 (file)
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@@ -1,4 +1,16 @@
-#!/usr/local/bin/perl
+#!/usr/bin/env perl
+
+# ====================================================================
+# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# "[Re]written" was achieved in two major overhauls. In 2004 BODY_*
+# functions were re-implemented to address P4 performance issue [see
+# commentary below], and in 2006 the rest was rewritten in order to
+# gain freedom to liberate licensing terms.
 
 # It was noted that Intel IA-32 C compiler generates code which
 # performs ~30% *faster* on P4 CPU than original *hand-coded*
 # improvement on P4 outweights the loss and incorporate this
 # re-tuned code to 0.9.7 and later.
 # ----------------------------------------------------------------
-# Those who for any particular reason absolutely must score on
-# Pentium can replace this module with one from 0.9.6 distribution.
-# This "offer" shall be revoked the moment programming interface to
-# this module is changed, in which case this paragraph should be
-# removed.
-# ----------------------------------------------------------------
 #                                      <appro@fy.chalmers.se>
 
-$normal=0;
-
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
 &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
 
 $A="eax";
-$B="ecx";
-$C="ebx";
+$B="ebx";
+$C="ecx";
 $D="edx";
 $E="edi";
 $T="esi";
 $tmp1="ebp";
 
-$off=9*4;
-
-@K=(0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6);
-
-&sha1_block_data("sha1_block_asm_data_order");
-
-&asm_finish();
-
-sub Nn
-       {
-       local($p)=@_;
-       local(%n)=($A,$T,$B,$A,$C,$B,$D,$C,$E,$D,$T,$E);
-       return($n{$p});
-       }
-
-sub Np
-       {
-       local($p)=@_;
-       local(%n)=($A,$T,$B,$A,$C,$B,$D,$C,$E,$D,$T,$E);
-       local(%n)=($A,$B,$B,$C,$C,$D,$D,$E,$E,$T,$T,$A);
-       return($n{$p});
-       }
-
-sub Na
-       {
-       local($n)=@_;
-       return( (($n   )&0x0f),
-               (($n+ 2)&0x0f),
-               (($n+ 8)&0x0f),
-               (($n+13)&0x0f),
-               (($n+ 1)&0x0f));
-       }
-
-sub X_expand
-       {
-       local($in)=@_;
-
-       &comment("First, load the words onto the stack in network byte order");
-       for ($i=0; $i<16; $i+=2)
-               {
-               &mov($A,&DWP(($i+0)*4,$in,"",0));# unless $i == 0;
-                &mov($B,&DWP(($i+1)*4,$in,"",0));
-               &bswap($A);
-                &bswap($B);
-               &mov(&swtmp($i+0),$A);
-                &mov(&swtmp($i+1),$B);
-               }
-
-       &comment("We now have the X array on the stack");
-       &comment("starting at sp-4");
-       }
-
-# Rules of engagement
-# F is always trashable at the start, the running total.
-# E becomes the next F so it can be trashed after it has been 'accumulated'
-# F becomes A in the next round.  We don't need to access it much.
-# During the X update part, the result ends up in $X[$n0].
+@V=($A,$B,$C,$D,$E,$T);
 
 sub BODY_00_15
        {
-       local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_;
+       local($n,$a,$b,$c,$d,$e,$f)=@_;
 
        &comment("00_15 $n");
 
@@ -109,37 +58,37 @@ sub BODY_00_15
         else        { &mov($a,$tmp1); }
        &rotl($tmp1,5);                 # tmp1=ROTATE(a,5)
         &xor($f,$d);
-       &and($f,$b);
-        &add($tmp1,$e);                # tmp1+=e;
-       &mov($e,&swtmp($n));            # e becomes volatile and
-                                       # is loaded with xi
+       &add($tmp1,$e);                 # tmp1+=e;
+        &and($f,$b);
+       &mov($e,&swtmp($n%16));         # e becomes volatile and is loaded
+                                       # with xi, also note that e becomes
+                                       # f in next round...
         &xor($f,$d);                   # f holds F_00_19(b,c,d)
        &rotr($b,2);                    # b=ROTATE(b,30)
-        &lea($tmp1,&DWP($K,$tmp1,$e,1));# tmp1+=K_00_19+xi
+        &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi
 
        if ($n==15) { &add($f,$tmp1); } # f+=tmp1
-       else        { &add($tmp1,$f); }
+       else        { &add($tmp1,$f); } # f becomes a in next round
        }
 
 sub BODY_16_19
        {
-       local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_;
-       local($n0,$n1,$n2,$n3,$np)=&Na($n);
+       local($n,$a,$b,$c,$d,$e,$f)=@_;
 
        &comment("16_19 $n");
 
-       &mov($f,&swtmp($n1));           # f to hold Xupdate(xi,xa,xb,xc,xd)
+       &mov($f,&swtmp($n%16));         # f to hold Xupdate(xi,xa,xb,xc,xd)
         &mov($tmp1,$c);                # tmp1 to hold F_00_19(b,c,d)
-       &xor($f,&swtmp($n0));
+       &xor($f,&swtmp(($n+2)%16));
         &xor($tmp1,$d);
-       &xor($f,&swtmp($n2));
+       &xor($f,&swtmp(($n+8)%16));
         &and($tmp1,$b);                # tmp1 holds F_00_19(b,c,d)
        &rotr($b,2);                    # b=ROTATE(b,30)
-        &xor($f,&swtmp($n3));          # f holds xa^xb^xc^xd
-       &rotl($f,1);                    # f=ROATE(f,1)
+        &xor($f,&swtmp(($n+13)%16));   # f holds xa^xb^xc^xd
+       &rotl($f,1);                    # f=ROTATE(f,1)
         &xor($tmp1,$d);                # tmp1=F_00_19(b,c,d)
-       &mov(&swtmp($n0),$f);           # xi=f
-       &lea($f,&DWP($K,$f,$e,1));      # f+=K_00_19+e
+       &mov(&swtmp($n%16),$f);         # xi=f
+       &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
         &mov($e,$a);                   # e becomes volatile
        &rotl($e,5);                    # e=ROTATE(a,5)
         &add($f,$tmp1);                # f+=F_00_19(b,c,d)
@@ -148,48 +97,47 @@ sub BODY_16_19
 
 sub BODY_20_39
        {
-       local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_;
+       local($n,$a,$b,$c,$d,$e,$f)=@_;
+       local $K=($n<40)?0x6ed9eba1:0xca62c1d6;
 
        &comment("20_39 $n");
-       local($n0,$n1,$n2,$n3,$np)=&Na($n);
 
        &mov($tmp1,$b);                 # tmp1 to hold F_20_39(b,c,d)
-        &mov($f,&swtmp($n0));          # f to hold Xupdate(xi,xa,xb,xc,xd)
+        &mov($f,&swtmp($n%16));        # f to hold Xupdate(xi,xa,xb,xc,xd)
        &rotr($b,2);                    # b=ROTATE(b,30)
-        &xor($f,&swtmp($n1));
+        &xor($f,&swtmp(($n+2)%16));
        &xor($tmp1,$c);
-        &xor($f,&swtmp($n2));
+        &xor($f,&swtmp(($n+8)%16));
        &xor($tmp1,$d);                 # tmp1 holds F_20_39(b,c,d)
-        &xor($f,&swtmp($n3));          # f holds xa^xb^xc^xd
+        &xor($f,&swtmp(($n+13)%16));   # f holds xa^xb^xc^xd
        &rotl($f,1);                    # f=ROTATE(f,1)
         &add($tmp1,$e);
-       &mov(&swtmp($n0),$f);           # xi=f
+       &mov(&swtmp($n%16),$f);         # xi=f
         &mov($e,$a);                   # e becomes volatile
        &rotl($e,5);                    # e=ROTATE(a,5)
-        &lea($f,&DWP($K,$f,$tmp1,1));  # f+=K_20_39+e
+        &lea($f,&DWP($K,$f,$tmp1));    # f+=K_20_39+e
        &add($f,$e);                    # f+=ROTATE(a,5)
        }
 
 sub BODY_40_59
        {
-       local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_;
+       local($n,$a,$b,$c,$d,$e,$f)=@_;
 
        &comment("40_59 $n");
-       local($n0,$n1,$n2,$n3,$np)=&Na($n);
 
-       &mov($f,&swtmp($n0));           # f to hold Xupdate(xi,xa,xb,xc,xd)
-        &mov($tmp1,&swtmp($n1));
+       &mov($f,&swtmp($n%16));         # f to hold Xupdate(xi,xa,xb,xc,xd)
+        &mov($tmp1,&swtmp(($n+2)%16));
        &xor($f,$tmp1);
-        &mov($tmp1,&swtmp($n2));
+        &mov($tmp1,&swtmp(($n+8)%16));
        &xor($f,$tmp1);
-        &mov($tmp1,&swtmp($n3));
+        &mov($tmp1,&swtmp(($n+13)%16));
        &xor($f,$tmp1);                 # f holds xa^xb^xc^xd
         &mov($tmp1,$b);                # tmp1 to hold F_40_59(b,c,d)
        &rotl($f,1);                    # f=ROTATE(f,1)
         &or($tmp1,$c);
-       &mov(&swtmp($n0),$f);           # xi=f
+       &mov(&swtmp($n%16),$f);         # xi=f
         &and($tmp1,$d);
-       &lea($f,&DWP($K,$f,$e,1));      # f+=K_40_59+e
+       &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
         &mov($e,$b);                   # e becomes volatile and is used
                                        # to calculate F_40_59(b,c,d)
        &rotr($b,2);                    # b=ROTATE(b,30)
@@ -201,230 +149,71 @@ sub BODY_40_59
        &add($f,$e);                    # f+=ROTATE(a,5)
        }
 
-sub BODY_60_79
-       {
-       &BODY_20_39(@_);
-       }
-
-sub sha1_block_host
-       {
-       local($name, $sclabel)=@_;
-
-       &function_begin_B($name,"");
-
-       # parameter 1 is the MD5_CTX structure.
-       # A     0
-       # B     4
-       # C     8
-       # D     12
-       # E     16
-
-       &mov("ecx",     &wparam(2));
-        &push("esi");
-       &shl("ecx",6);
-        &mov("esi",    &wparam(1));
-       &push("ebp");
-        &add("ecx","esi");     # offset to leave on
-       &push("ebx");
-        &mov("ebp",    &wparam(0));
-       &push("edi");
-        &mov($D,       &DWP(12,"ebp","",0));
-       &stack_push(18+9);
-        &mov($E,       &DWP(16,"ebp","",0));
-       &mov($C,        &DWP( 8,"ebp","",0));
-        &mov(&swtmp(17),"ecx");
+&function_begin("sha1_block_data_order",16);
+       &mov($tmp1,&wparam(0)); # SHA_CTX *c
+       &mov($T,&wparam(1));    # const void *input
+       &mov($A,&wparam(2));    # size_t num
+       &stack_push(16);        # allocate X[16]
+       &shl($A,6);
+       &add($A,$T);
+       &mov(&wparam(2),$A);    # pointer beyond the end of input
+       &mov($E,&DWP(16,$tmp1));# pre-load E
 
-       &comment("First we need to setup the X array");
+       &set_label("loop",16);
 
-       for ($i=0; $i<16; $i+=2)
+       # copy input chunk to X, but reversing byte order!
+       for ($i=0; $i<16; $i+=4)
                {
-               &mov($A,&DWP(($i+0)*4,"esi","",0));# unless $i == 0;
-                &mov($B,&DWP(($i+1)*4,"esi","",0));
+               &mov($A,&DWP(4*($i+0),$T));
+               &mov($B,&DWP(4*($i+1),$T));
+               &mov($C,&DWP(4*($i+2),$T));
+               &mov($D,&DWP(4*($i+3),$T));
+               &bswap($A);
+               &bswap($B);
+               &bswap($C);
+               &bswap($D);
                &mov(&swtmp($i+0),$A);
-                &mov(&swtmp($i+1),$B);
+               &mov(&swtmp($i+1),$B);
+               &mov(&swtmp($i+2),$C);
+               &mov(&swtmp($i+3),$D);
                }
-       &jmp($sclabel);
-       &function_end_B($name);
-       }
-
-
-sub sha1_block_data
-       {
-       local($name)=@_;
-
-       &function_begin_B($name,"");
-
-       # parameter 1 is the MD5_CTX structure.
-       # A     0
-       # B     4
-       # C     8
-       # D     12
-       # E     16
-
-       &mov("ecx",     &wparam(2));
-        &push("esi");
-       &shl("ecx",6);
-        &mov("esi",    &wparam(1));
-       &push("ebp");
-        &add("ecx","esi");     # offset to leave on
-       &push("ebx");
-        &mov("ebp",    &wparam(0));
-       &push("edi");
-        &mov($D,       &DWP(12,"ebp","",0));
-       &stack_push(18+9);
-        &mov($E,       &DWP(16,"ebp","",0));
-       &mov($C,        &DWP( 8,"ebp","",0));
-        &mov(&swtmp(17),"ecx");
-
-       &comment("First we need to setup the X array");
-
-       &set_label("start") unless $normal;
-
-       &X_expand("esi");
-        &mov(&wparam(1),"esi");
-
-       &set_label("shortcut", 0, 1);
-       &comment("");
-       &comment("Start processing");
-
-       # odd start
-       &mov($A,        &DWP( 0,"ebp","",0));
-        &mov($B,       &DWP( 4,"ebp","",0));
-       $X="esp";
-       &BODY_00_15(-2,$K[0],$X, 0,$A,$B,$C,$D,$E,$T);
-       &BODY_00_15( 0,$K[0],$X, 1,$T,$A,$B,$C,$D,$E);
-       &BODY_00_15( 0,$K[0],$X, 2,$E,$T,$A,$B,$C,$D);
-       &BODY_00_15( 0,$K[0],$X, 3,$D,$E,$T,$A,$B,$C);
-       &BODY_00_15( 0,$K[0],$X, 4,$C,$D,$E,$T,$A,$B);
-       &BODY_00_15( 0,$K[0],$X, 5,$B,$C,$D,$E,$T,$A);
-       &BODY_00_15( 0,$K[0],$X, 6,$A,$B,$C,$D,$E,$T);
-       &BODY_00_15( 0,$K[0],$X, 7,$T,$A,$B,$C,$D,$E);
-       &BODY_00_15( 0,$K[0],$X, 8,$E,$T,$A,$B,$C,$D);
-       &BODY_00_15( 0,$K[0],$X, 9,$D,$E,$T,$A,$B,$C);
-       &BODY_00_15( 0,$K[0],$X,10,$C,$D,$E,$T,$A,$B);
-       &BODY_00_15( 0,$K[0],$X,11,$B,$C,$D,$E,$T,$A);
-       &BODY_00_15( 0,$K[0],$X,12,$A,$B,$C,$D,$E,$T);
-       &BODY_00_15( 0,$K[0],$X,13,$T,$A,$B,$C,$D,$E);
-       &BODY_00_15( 0,$K[0],$X,14,$E,$T,$A,$B,$C,$D);
-       &BODY_00_15( 1,$K[0],$X,15,$D,$E,$T,$A,$B,$C);
-       &BODY_16_19(-1,$K[0],$X,16,$C,$D,$E,$T,$A,$B);
-       &BODY_16_19( 0,$K[0],$X,17,$B,$C,$D,$E,$T,$A);
-       &BODY_16_19( 0,$K[0],$X,18,$A,$B,$C,$D,$E,$T);
-       &BODY_16_19( 1,$K[0],$X,19,$T,$A,$B,$C,$D,$E);
-
-       &BODY_20_39(-1,$K[1],$X,20,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39( 0,$K[1],$X,21,$D,$E,$T,$A,$B,$C);
-       &BODY_20_39( 0,$K[1],$X,22,$C,$D,$E,$T,$A,$B);
-       &BODY_20_39( 0,$K[1],$X,23,$B,$C,$D,$E,$T,$A);
-       &BODY_20_39( 0,$K[1],$X,24,$A,$B,$C,$D,$E,$T);
-       &BODY_20_39( 0,$K[1],$X,25,$T,$A,$B,$C,$D,$E);
-       &BODY_20_39( 0,$K[1],$X,26,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39( 0,$K[1],$X,27,$D,$E,$T,$A,$B,$C);
-       &BODY_20_39( 0,$K[1],$X,28,$C,$D,$E,$T,$A,$B);
-       &BODY_20_39( 0,$K[1],$X,29,$B,$C,$D,$E,$T,$A);
-       &BODY_20_39( 0,$K[1],$X,30,$A,$B,$C,$D,$E,$T);
-       &BODY_20_39( 0,$K[1],$X,31,$T,$A,$B,$C,$D,$E);
-       &BODY_20_39( 0,$K[1],$X,32,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39( 0,$K[1],$X,33,$D,$E,$T,$A,$B,$C);
-       &BODY_20_39( 0,$K[1],$X,34,$C,$D,$E,$T,$A,$B);
-       &BODY_20_39( 0,$K[1],$X,35,$B,$C,$D,$E,$T,$A);
-       &BODY_20_39( 0,$K[1],$X,36,$A,$B,$C,$D,$E,$T);
-       &BODY_20_39( 0,$K[1],$X,37,$T,$A,$B,$C,$D,$E);
-       &BODY_20_39( 0,$K[1],$X,38,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39( 1,$K[1],$X,39,$D,$E,$T,$A,$B,$C);
-
-       &BODY_40_59(-1,$K[2],$X,40,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59( 0,$K[2],$X,41,$B,$C,$D,$E,$T,$A);
-       &BODY_40_59( 0,$K[2],$X,42,$A,$B,$C,$D,$E,$T);
-       &BODY_40_59( 0,$K[2],$X,43,$T,$A,$B,$C,$D,$E);
-       &BODY_40_59( 0,$K[2],$X,44,$E,$T,$A,$B,$C,$D);
-       &BODY_40_59( 0,$K[2],$X,45,$D,$E,$T,$A,$B,$C);
-       &BODY_40_59( 0,$K[2],$X,46,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59( 0,$K[2],$X,47,$B,$C,$D,$E,$T,$A);
-       &BODY_40_59( 0,$K[2],$X,48,$A,$B,$C,$D,$E,$T);
-       &BODY_40_59( 0,$K[2],$X,49,$T,$A,$B,$C,$D,$E);
-       &BODY_40_59( 0,$K[2],$X,50,$E,$T,$A,$B,$C,$D);
-       &BODY_40_59( 0,$K[2],$X,51,$D,$E,$T,$A,$B,$C);
-       &BODY_40_59( 0,$K[2],$X,52,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59( 0,$K[2],$X,53,$B,$C,$D,$E,$T,$A);
-       &BODY_40_59( 0,$K[2],$X,54,$A,$B,$C,$D,$E,$T);
-       &BODY_40_59( 0,$K[2],$X,55,$T,$A,$B,$C,$D,$E);
-       &BODY_40_59( 0,$K[2],$X,56,$E,$T,$A,$B,$C,$D);
-       &BODY_40_59( 0,$K[2],$X,57,$D,$E,$T,$A,$B,$C);
-       &BODY_40_59( 0,$K[2],$X,58,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59( 1,$K[2],$X,59,$B,$C,$D,$E,$T,$A);
-
-       &BODY_60_79(-1,$K[3],$X,60,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79( 0,$K[3],$X,61,$T,$A,$B,$C,$D,$E);
-       &BODY_60_79( 0,$K[3],$X,62,$E,$T,$A,$B,$C,$D);
-       &BODY_60_79( 0,$K[3],$X,63,$D,$E,$T,$A,$B,$C);
-       &BODY_60_79( 0,$K[3],$X,64,$C,$D,$E,$T,$A,$B);
-       &BODY_60_79( 0,$K[3],$X,65,$B,$C,$D,$E,$T,$A);
-       &BODY_60_79( 0,$K[3],$X,66,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79( 0,$K[3],$X,67,$T,$A,$B,$C,$D,$E);
-       &BODY_60_79( 0,$K[3],$X,68,$E,$T,$A,$B,$C,$D);
-       &BODY_60_79( 0,$K[3],$X,69,$D,$E,$T,$A,$B,$C);
-       &BODY_60_79( 0,$K[3],$X,70,$C,$D,$E,$T,$A,$B);
-       &BODY_60_79( 0,$K[3],$X,71,$B,$C,$D,$E,$T,$A);
-       &BODY_60_79( 0,$K[3],$X,72,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79( 0,$K[3],$X,73,$T,$A,$B,$C,$D,$E);
-       &BODY_60_79( 0,$K[3],$X,74,$E,$T,$A,$B,$C,$D);
-       &BODY_60_79( 0,$K[3],$X,75,$D,$E,$T,$A,$B,$C);
-       &BODY_60_79( 0,$K[3],$X,76,$C,$D,$E,$T,$A,$B);
-       &BODY_60_79( 0,$K[3],$X,77,$B,$C,$D,$E,$T,$A);
-       &BODY_60_79( 0,$K[3],$X,78,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79( 2,$K[3],$X,79,$T,$A,$B,$C,$D,$E);
-
-       &comment("End processing");
-       &comment("");
-       # D is the tmp value
-
-       # E -> A
-       # T -> B
-       # A -> C
-       # B -> D
-       # C -> E
-       # D -> T
-
-       &mov($tmp1,&wparam(0));
-
-        &mov($D,       &DWP(12,$tmp1,"",0));
-       &add($D,$B);
-        &mov($B,       &DWP( 4,$tmp1,"",0));
-       &add($B,$T);
-        &mov($T,       $A);
-       &mov($A,        &DWP( 0,$tmp1,"",0));
-        &mov(&DWP(12,$tmp1,"",0),$D);
-
-       &add($A,$E);
-        &mov($E,       &DWP(16,$tmp1,"",0));
-       &add($E,$C);
-        &mov($C,       &DWP( 8,$tmp1,"",0));
-       &add($C,$T);
-
-        &mov(&DWP( 0,$tmp1,"",0),$A);
-       &mov("esi",&wparam(1));
-        &mov(&DWP( 8,$tmp1,"",0),$C);
-       &add("esi",64);
-        &mov("eax",&swtmp(17));
-       &mov(&DWP(16,$tmp1,"",0),$E);
-        &cmp("esi","eax");
-       &mov(&DWP( 4,$tmp1,"",0),$B);
-        &jb(&label("start"));
-
-       &stack_pop(18+9);
-        &pop("edi");
-       &pop("ebx");
-        &pop("ebp");
-       &pop("esi");
-        &ret();
-
-       # keep a note of shortcut label so it can be used outside
-       # block.
-       my $sclabel = &label("shortcut");
-
-       &function_end_B($name);
-       # Putting this here avoids problems with MASM in debugging mode
-       &sha1_block_host("sha1_block_asm_host_order", $sclabel);
-       }
+       &mov(&wparam(1),$T);    # redundant in 1st spin
+
+       &mov($A,&DWP(0,$tmp1)); # load SHA_CTX
+       &mov($B,&DWP(4,$tmp1));
+       &mov($C,&DWP(8,$tmp1));
+       &mov($D,&DWP(12,$tmp1));
+       # E is pre-loaded
+
+       for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+       for(;$i<20;$i++)        { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
+       for(;$i<40;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+       for(;$i<60;$i++)        { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+       for(;$i<80;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+       (($V[5] eq $D) and ($V[0] eq $E)) or die;       # double-check
+
+       &mov($tmp1,&wparam(0)); # re-load SHA_CTX*
+       &mov($D,&wparam(1));    # D is last "T" and is discarded
+
+       &add($E,&DWP(0,$tmp1)); # E is last "A"...
+       &add($T,&DWP(4,$tmp1));
+       &add($A,&DWP(8,$tmp1));
+       &add($B,&DWP(12,$tmp1));
+       &add($C,&DWP(16,$tmp1));
+
+       &mov(&DWP(0,$tmp1),$E); # update SHA_CTX
+        &add($D,64);           # advance input pointer
+       &mov(&DWP(4,$tmp1),$T);
+        &cmp($D,&wparam(2));   # have we reached the end yet?
+       &mov(&DWP(8,$tmp1),$A);
+        &mov($E,$C);           # C is last "E" which needs to be "pre-loaded"
+       &mov(&DWP(12,$tmp1),$B);
+        &mov($T,$D);           # input pointer
+       &mov(&DWP(16,$tmp1),$C);
+       &jb(&label("loop"));
+
+       &stack_pop(16);
+&function_end("sha1_block_data_order");
 
+&asm_finish();
index 9478f5dd5d10d4e2d6e93c07ccc997e77f1c3a35..aa18c1089b289a922a9d6416f7e6d2d6ea128c58 100644 (file)
--- a/crypto/sha/asm/sha1-ia64.pl
+++ b/crypto/sha/asm/sha1-ia64.pl
@@ -2,8 +2,9 @@
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # Eternal question is what's wrong with compiler generated code? The
 # to perform rotations by maintaining copy of 32-bit value in upper
 # bits of 64-bit register. Just follow mux2 and shrp instructions...
 # Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which
-# is >50% better than HP C and >2x better than gcc. As of this moment
-# performance under little-endian OS such as Linux and Windows will be
-# a bit lower, because data has to be picked in reverse byte-order.
-# It's possible to resolve this issue by implementing third function,
-# sha1_block_asm_data_order_aligned, which would temporarily flip
-# BE field in User Mask register...
+# is >50% better than HP C and >2x better than gcc.
 
 $code=<<___;
-.ident  \"sha1-ia64.s, version 1.0\"
+.ident  \"sha1-ia64.s, version 1.2\"
 .ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
 .explicit
 
@@ -55,63 +51,55 @@ else {
 
 sub BODY_00_15 {
 local  *code=shift;
-local  ($i,$a,$b,$c,$d,$e,$f,$unaligned)=@_;
+local  ($i,$a,$b,$c,$d,$e,$f)=@_;
 
-if ($unaligned) {
-       $code.=<<___;
-{ .mmi;        ld1     tmp0=[inp],2                // MSB
-       ld1     tmp1=[tmp3],2           };;
-{ .mmi;        ld1     tmp2=[inp],2
-       ld1     $X[$i&0xf]=[tmp3],2         // LSB
-       dep     tmp1=tmp0,tmp1,8,8      };;
-{ .mii;        cmp.ne  p16,p0=r0,r0                // no misaligned prefetch
-       dep     $X[$i&0xf]=tmp2,$X[$i&0xf],8,8;;
-       dep     $X[$i&0xf]=tmp1,$X[$i&0xf],16,16        };;
-{ .mmi;        nop.m   0
-___
-       }
-elsif ($i<15) {
-       $code.=<<___;
-{ .mmi;        ld4     $X[($i+1)&0xf]=[inp],4  // prefetch
-___
-       }
-else   {
-       $code.=<<___;
-{ .mmi;        nop.m   0
+$code.=<<___ if ($i==0);
+{ .mmi;        ld1     $X[$i&0xf]=[inp],2          // MSB
+       ld1     tmp2=[tmp3],2           };;
+{ .mmi;        ld1     tmp0=[inp],2
+       ld1     tmp4=[tmp3],2               // LSB
+       dep     $X[$i&0xf]=$X[$i&0xf],tmp2,8,8  };;
 ___
-       }
 if ($i<15) {
        $code.=<<___;
-       and     tmp0=$c,$b
-       dep.z   tmp5=$a,5,27            }   // a<<5
+{ .mmi;        ld1     $X[($i+1)&0xf]=[inp],2      // +1
+       dep     tmp1=tmp0,tmp4,8,8      };;
+{ .mmi;        ld1     tmp2=[tmp3],2               // +1
+       and     tmp4=$c,$b
+       dep     $X[$i&0xf]=$X[$i&0xf],tmp1,16,16        } //;;
 { .mmi;        andcm   tmp1=$d,$b
-       add     tmp4=$e,$K_00_19        };;
-{ .mmi;        or      tmp0=tmp0,tmp1              // F_00_19(b,c,d)=(b&c)|(~b&d)
-       add     $f=tmp4,$X[$i&0xf]          // f=xi+e+K_00_19
+       add     tmp0=$e,$K_00_19
+       dep.z   tmp5=$a,5,27            };; // a<<5
+{ .mmi;        or      tmp4=tmp4,tmp1              // F_00_19(b,c,d)=(b&c)|(~b&d)
+       add     $f=tmp0,$X[$i&0xf]          // f=xi+e+K_00_19
        extr.u  tmp1=$a,27,5            };; // a>>27
-{ .mib;        add     $f=$f,tmp0                  // f+=F_00_19(b,c,d)
+{ .mmi;        ld1     tmp0=[inp],2                // +1
+       add     $f=$f,tmp4                  // f+=F_00_19(b,c,d)
        shrp    $b=tmp6,tmp6,2          }   // b=ROTATE(b,30)
-{ .mib;        or      tmp1=tmp1,tmp5              // ROTATE(a,5)
+{ .mmi;        ld1     tmp4=[tmp3],2               // +1
+       or      tmp5=tmp1,tmp5              // ROTATE(a,5)
        mux2    tmp6=$a,0x44            };; // see b in next iteration
-{ .mii;        add     $f=$f,tmp1                  // f+=ROTATE(a,5)
-       mux2    $X[$i&0xf]=$X[$i&0xf],0x44
-       nop.i   0                       };;
+{ .mii;        add     $f=$f,tmp5                  // f+=ROTATE(a,5)
+       dep     $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8  // +1
+       mux2    $X[$i&0xf]=$X[$i&0xf],0x44      } //;;
 
 ___
        }
 else   {
        $code.=<<___;
-       and     tmp0=$c,$b
-       dep.z   tmp5=$a,5,27            }   // a<<5 ;;?
+{ .mii;        and     tmp3=$c,$b
+       dep     tmp1=tmp0,tmp4,8,8;;
+       dep     $X[$i&0xf]=$X[$i&0xf],tmp1,16,16        } //;;
 { .mmi;        andcm   tmp1=$d,$b
-       add     tmp4=$e,$K_00_19        };;
-{ .mmi;        or      tmp0=tmp0,tmp1              // F_00_19(b,c,d)=(b&c)|(~b&d)
-       add     $f=tmp4,$X[$i&0xf]          // f=xi+e+K_00_19
+       add     tmp0=$e,$K_00_19
+       dep.z   tmp5=$a,5,27            };; // a<<5
+{ .mmi;        or      tmp4=tmp3,tmp1              // F_00_19(b,c,d)=(b&c)|(~b&d)
+       add     $f=tmp0,$X[$i&0xf]          // f=xi+e+K_00_19
        extr.u  tmp1=$a,27,5            }   // a>>27
 { .mmi;        xor     tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]  // +1
        xor     tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
        nop.i   0                       };;
-{ .mmi;        add     $f=$f,tmp0                  // f+=F_00_19(b,c,d)
+{ .mmi;        add     $f=$f,tmp4                  // f+=F_00_19(b,c,d)
        xor     tmp2=tmp2,tmp3              // +1
        shrp    $b=tmp6,tmp6,2          }   // b=ROTATE(b,30)
 { .mmi; or     tmp1=tmp1,tmp5              // ROTATE(a,5)
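
With the separate unaligned path gone, the IA-64 BODY_00_15 now always
assembles each big-endian input word from single bytes, interleaving
the ld1 loads and dep merges for word i+1 with the round computation
for word i. The byte assembly itself, in portable C terms:

    #include <stdint.h>

    static uint32_t be32_load(const unsigned char *p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
    }
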
@@ -190,9 +178,7 @@ $code.=<<___;
        extr.u  tmp1=$a,27,5            }   // a>>27
 { .mib;        add     $f=$f,tmp4                  // f+=e+K_20_39
        add     $h1=$h1,$a              };; // wrap up
-{ .mmi;
-(p16)  ld4.s   $X[0]=[inp],4               // non-faulting prefetch
-       add     $f=$f,tmp0                  // f+=F_20_39(b,c,d)
+{ .mmi;        add     $f=$f,tmp0                  // f+=F_20_39(b,c,d)
        shrp    $b=tmp6,tmp6,2          }   // b=ROTATE(b,30) ;;?
 { .mmi;        or      tmp1=tmp1,tmp5              // ROTATE(a,5)
        add     $h3=$h3,$c              };; // wrap up
@@ -245,164 +231,11 @@ tmp3=r11;
 ctx=r32;       // in0
 inp=r33;       // in1
 
-// void sha1_block_asm_host_order(SHA_CTX *c,const void *p,size_t num);
-.global        sha1_block_asm_host_order#
-.proc  sha1_block_asm_host_order#
+// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num);
+.global        sha1_block_data_order#
+.proc  sha1_block_data_order#
 .align 32
-sha1_block_asm_host_order:
-       .prologue
-{ .mmi;        alloc   tmp1=ar.pfs,3,15,0,0
-       $ADDP   tmp0=4,ctx
-       .save   ar.lc,r3
-       mov     r3=ar.lc                }
-{ .mmi;        $ADDP   ctx=0,ctx
-       $ADDP   inp=0,inp
-       mov     r2=pr                   };;
-tmp4=in2;
-tmp5=loc13;
-tmp6=loc14;
-       .body
-{ .mlx;        ld4     $h0=[ctx],8
-       movl    $K_00_19=0x5a827999     }
-{ .mlx;        ld4     $h1=[tmp0],8
-       movl    $K_20_39=0x6ed9eba1     };;
-{ .mlx;        ld4     $h2=[ctx],8
-       movl    $K_40_59=0x8f1bbcdc     }
-{ .mlx;        ld4     $h3=[tmp0]
-       movl    $K_60_79=0xca62c1d6     };;
-{ .mmi;        ld4     $h4=[ctx],-16
-       add     in2=-1,in2                  // adjust num for ar.lc
-       mov     ar.ec=1                 };;
-{ .mmi;        ld4     $X[0]=[inp],4               // prefetch
-       cmp.ne  p16,p0=r0,in2               // prefecth at loop end
-       mov     ar.lc=in2               };; // brp.loop.imp: too far
-
-.Lhtop:
-{ .mmi;        mov     $A=$h0
-       mov     $B=$h1
-       mux2    tmp6=$h1,0x44           }
-{ .mmi;        mov     $C=$h2
-       mov     $D=$h3
-       mov     $E=$h4                  };;
-
-___
-
-       &BODY_00_15(\$code, 0,$A,$B,$C,$D,$E,$T);
-       &BODY_00_15(\$code, 1,$T,$A,$B,$C,$D,$E);
-       &BODY_00_15(\$code, 2,$E,$T,$A,$B,$C,$D);
-       &BODY_00_15(\$code, 3,$D,$E,$T,$A,$B,$C);
-       &BODY_00_15(\$code, 4,$C,$D,$E,$T,$A,$B);
-       &BODY_00_15(\$code, 5,$B,$C,$D,$E,$T,$A);
-       &BODY_00_15(\$code, 6,$A,$B,$C,$D,$E,$T);
-       &BODY_00_15(\$code, 7,$T,$A,$B,$C,$D,$E);
-       &BODY_00_15(\$code, 8,$E,$T,$A,$B,$C,$D);
-       &BODY_00_15(\$code, 9,$D,$E,$T,$A,$B,$C);
-       &BODY_00_15(\$code,10,$C,$D,$E,$T,$A,$B);
-       &BODY_00_15(\$code,11,$B,$C,$D,$E,$T,$A);
-       &BODY_00_15(\$code,12,$A,$B,$C,$D,$E,$T);
-       &BODY_00_15(\$code,13,$T,$A,$B,$C,$D,$E);
-       &BODY_00_15(\$code,14,$E,$T,$A,$B,$C,$D);
-       &BODY_00_15(\$code,15,$D,$E,$T,$A,$B,$C);
-
-       &BODY_16_19(\$code,16,$C,$D,$E,$T,$A,$B);
-       &BODY_16_19(\$code,17,$B,$C,$D,$E,$T,$A);
-       &BODY_16_19(\$code,18,$A,$B,$C,$D,$E,$T);
-       &BODY_16_19(\$code,19,$T,$A,$B,$C,$D,$E);
-
-       &BODY_20_39(\$code,20,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39(\$code,21,$D,$E,$T,$A,$B,$C);
-       &BODY_20_39(\$code,22,$C,$D,$E,$T,$A,$B);
-       &BODY_20_39(\$code,23,$B,$C,$D,$E,$T,$A);
-       &BODY_20_39(\$code,24,$A,$B,$C,$D,$E,$T);
-       &BODY_20_39(\$code,25,$T,$A,$B,$C,$D,$E);
-       &BODY_20_39(\$code,26,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39(\$code,27,$D,$E,$T,$A,$B,$C);
-       &BODY_20_39(\$code,28,$C,$D,$E,$T,$A,$B);
-       &BODY_20_39(\$code,29,$B,$C,$D,$E,$T,$A);
-       &BODY_20_39(\$code,30,$A,$B,$C,$D,$E,$T);
-       &BODY_20_39(\$code,31,$T,$A,$B,$C,$D,$E);
-       &BODY_20_39(\$code,32,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39(\$code,33,$D,$E,$T,$A,$B,$C);
-       &BODY_20_39(\$code,34,$C,$D,$E,$T,$A,$B);
-       &BODY_20_39(\$code,35,$B,$C,$D,$E,$T,$A);
-       &BODY_20_39(\$code,36,$A,$B,$C,$D,$E,$T);
-       &BODY_20_39(\$code,37,$T,$A,$B,$C,$D,$E);
-       &BODY_20_39(\$code,38,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39(\$code,39,$D,$E,$T,$A,$B,$C);
-
-       &BODY_40_59(\$code,40,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59(\$code,41,$B,$C,$D,$E,$T,$A);
-       &BODY_40_59(\$code,42,$A,$B,$C,$D,$E,$T);
-       &BODY_40_59(\$code,43,$T,$A,$B,$C,$D,$E);
-       &BODY_40_59(\$code,44,$E,$T,$A,$B,$C,$D);
-       &BODY_40_59(\$code,45,$D,$E,$T,$A,$B,$C);
-       &BODY_40_59(\$code,46,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59(\$code,47,$B,$C,$D,$E,$T,$A);
-       &BODY_40_59(\$code,48,$A,$B,$C,$D,$E,$T);
-       &BODY_40_59(\$code,49,$T,$A,$B,$C,$D,$E);
-       &BODY_40_59(\$code,50,$E,$T,$A,$B,$C,$D);
-       &BODY_40_59(\$code,51,$D,$E,$T,$A,$B,$C);
-       &BODY_40_59(\$code,52,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59(\$code,53,$B,$C,$D,$E,$T,$A);
-       &BODY_40_59(\$code,54,$A,$B,$C,$D,$E,$T);
-       &BODY_40_59(\$code,55,$T,$A,$B,$C,$D,$E);
-       &BODY_40_59(\$code,56,$E,$T,$A,$B,$C,$D);
-       &BODY_40_59(\$code,57,$D,$E,$T,$A,$B,$C);
-       &BODY_40_59(\$code,58,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59(\$code,59,$B,$C,$D,$E,$T,$A);
-
-       &BODY_60_79(\$code,60,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79(\$code,61,$T,$A,$B,$C,$D,$E);
-       &BODY_60_79(\$code,62,$E,$T,$A,$B,$C,$D);
-       &BODY_60_79(\$code,63,$D,$E,$T,$A,$B,$C);
-       &BODY_60_79(\$code,64,$C,$D,$E,$T,$A,$B);
-       &BODY_60_79(\$code,65,$B,$C,$D,$E,$T,$A);
-       &BODY_60_79(\$code,66,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79(\$code,67,$T,$A,$B,$C,$D,$E);
-       &BODY_60_79(\$code,68,$E,$T,$A,$B,$C,$D);
-       &BODY_60_79(\$code,69,$D,$E,$T,$A,$B,$C);
-       &BODY_60_79(\$code,70,$C,$D,$E,$T,$A,$B);
-       &BODY_60_79(\$code,71,$B,$C,$D,$E,$T,$A);
-       &BODY_60_79(\$code,72,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79(\$code,73,$T,$A,$B,$C,$D,$E);
-       &BODY_60_79(\$code,74,$E,$T,$A,$B,$C,$D);
-       &BODY_60_79(\$code,75,$D,$E,$T,$A,$B,$C);
-       &BODY_60_79(\$code,76,$C,$D,$E,$T,$A,$B);
-       &BODY_60_79(\$code,77,$B,$C,$D,$E,$T,$A);
-       &BODY_60_79(\$code,78,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79(\$code,79,$T,$A,$B,$C,$D,$E);
-
-$code.=<<___;
-{ .mmb;        add     $h0=$h0,$E
-       nop.m   0
-       br.ctop.dptk.many       .Lhtop  };;
-.Lhend:
-{ .mmi;        add     tmp0=4,ctx
-       mov     ar.lc=r3                };;
-{ .mmi;        st4     [ctx]=$h0,8
-       st4     [tmp0]=$h1,8            };;
-{ .mmi;        st4     [ctx]=$h2,8
-       st4     [tmp0]=$h3              };;
-{ .mib;        st4     [ctx]=$h4,-16
-       mov     pr=r2,0x1ffff
-       br.ret.sptk.many        b0      };;
-.endp  sha1_block_asm_host_order#
-___
-
-
-$code.=<<___;
-// void sha1_block_asm_data_order(SHA_CTX *c,const void *p,size_t num);
-.global        sha1_block_asm_data_order#
-.proc  sha1_block_asm_data_order#
-.align 32
-sha1_block_asm_data_order:
-___
-$code.=<<___ if ($big_endian);
-{ .mmi;        and     r2=3,inp                                };;
-{ .mib;        cmp.eq  p6,p0=r0,r2
-(p6)   br.dptk.many    sha1_block_asm_host_order       };;
-___
-$code.=<<___;
+sha1_block_data_order:
        .prologue
 { .mmi;        alloc   tmp1=ar.pfs,3,15,0,0
        $ADDP   tmp0=4,ctx
@@ -440,90 +273,16 @@ tmp6=loc14;
 
 ___
 
-       &BODY_00_15(\$code, 0,$A,$B,$C,$D,$E,$T,1);
-       &BODY_00_15(\$code, 1,$T,$A,$B,$C,$D,$E,1);
-       &BODY_00_15(\$code, 2,$E,$T,$A,$B,$C,$D,1);
-       &BODY_00_15(\$code, 3,$D,$E,$T,$A,$B,$C,1);
-       &BODY_00_15(\$code, 4,$C,$D,$E,$T,$A,$B,1);
-       &BODY_00_15(\$code, 5,$B,$C,$D,$E,$T,$A,1);
-       &BODY_00_15(\$code, 6,$A,$B,$C,$D,$E,$T,1);
-       &BODY_00_15(\$code, 7,$T,$A,$B,$C,$D,$E,1);
-       &BODY_00_15(\$code, 8,$E,$T,$A,$B,$C,$D,1);
-       &BODY_00_15(\$code, 9,$D,$E,$T,$A,$B,$C,1);
-       &BODY_00_15(\$code,10,$C,$D,$E,$T,$A,$B,1);
-       &BODY_00_15(\$code,11,$B,$C,$D,$E,$T,$A,1);
-       &BODY_00_15(\$code,12,$A,$B,$C,$D,$E,$T,1);
-       &BODY_00_15(\$code,13,$T,$A,$B,$C,$D,$E,1);
-       &BODY_00_15(\$code,14,$E,$T,$A,$B,$C,$D,1);
-       &BODY_00_15(\$code,15,$D,$E,$T,$A,$B,$C,1);
-
-       &BODY_16_19(\$code,16,$C,$D,$E,$T,$A,$B);
-       &BODY_16_19(\$code,17,$B,$C,$D,$E,$T,$A);
-       &BODY_16_19(\$code,18,$A,$B,$C,$D,$E,$T);
-       &BODY_16_19(\$code,19,$T,$A,$B,$C,$D,$E);
+{ my $i,@V=($A,$B,$C,$D,$E,$T);
 
-       &BODY_20_39(\$code,20,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39(\$code,21,$D,$E,$T,$A,$B,$C);
-       &BODY_20_39(\$code,22,$C,$D,$E,$T,$A,$B);
-       &BODY_20_39(\$code,23,$B,$C,$D,$E,$T,$A);
-       &BODY_20_39(\$code,24,$A,$B,$C,$D,$E,$T);
-       &BODY_20_39(\$code,25,$T,$A,$B,$C,$D,$E);
-       &BODY_20_39(\$code,26,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39(\$code,27,$D,$E,$T,$A,$B,$C);
-       &BODY_20_39(\$code,28,$C,$D,$E,$T,$A,$B);
-       &BODY_20_39(\$code,29,$B,$C,$D,$E,$T,$A);
-       &BODY_20_39(\$code,30,$A,$B,$C,$D,$E,$T);
-       &BODY_20_39(\$code,31,$T,$A,$B,$C,$D,$E);
-       &BODY_20_39(\$code,32,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39(\$code,33,$D,$E,$T,$A,$B,$C);
-       &BODY_20_39(\$code,34,$C,$D,$E,$T,$A,$B);
-       &BODY_20_39(\$code,35,$B,$C,$D,$E,$T,$A);
-       &BODY_20_39(\$code,36,$A,$B,$C,$D,$E,$T);
-       &BODY_20_39(\$code,37,$T,$A,$B,$C,$D,$E);
-       &BODY_20_39(\$code,38,$E,$T,$A,$B,$C,$D);
-       &BODY_20_39(\$code,39,$D,$E,$T,$A,$B,$C);
+       for($i=0;$i<16;$i++)    { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
+       for(;$i<20;$i++)        { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
+       for(;$i<40;$i++)        { &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); }
+       for(;$i<60;$i++)        { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
+       for(;$i<80;$i++)        { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
 
-       &BODY_40_59(\$code,40,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59(\$code,41,$B,$C,$D,$E,$T,$A);
-       &BODY_40_59(\$code,42,$A,$B,$C,$D,$E,$T);
-       &BODY_40_59(\$code,43,$T,$A,$B,$C,$D,$E);
-       &BODY_40_59(\$code,44,$E,$T,$A,$B,$C,$D);
-       &BODY_40_59(\$code,45,$D,$E,$T,$A,$B,$C);
-       &BODY_40_59(\$code,46,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59(\$code,47,$B,$C,$D,$E,$T,$A);
-       &BODY_40_59(\$code,48,$A,$B,$C,$D,$E,$T);
-       &BODY_40_59(\$code,49,$T,$A,$B,$C,$D,$E);
-       &BODY_40_59(\$code,50,$E,$T,$A,$B,$C,$D);
-       &BODY_40_59(\$code,51,$D,$E,$T,$A,$B,$C);
-       &BODY_40_59(\$code,52,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59(\$code,53,$B,$C,$D,$E,$T,$A);
-       &BODY_40_59(\$code,54,$A,$B,$C,$D,$E,$T);
-       &BODY_40_59(\$code,55,$T,$A,$B,$C,$D,$E);
-       &BODY_40_59(\$code,56,$E,$T,$A,$B,$C,$D);
-       &BODY_40_59(\$code,57,$D,$E,$T,$A,$B,$C);
-       &BODY_40_59(\$code,58,$C,$D,$E,$T,$A,$B);
-       &BODY_40_59(\$code,59,$B,$C,$D,$E,$T,$A);
-
-       &BODY_60_79(\$code,60,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79(\$code,61,$T,$A,$B,$C,$D,$E);
-       &BODY_60_79(\$code,62,$E,$T,$A,$B,$C,$D);
-       &BODY_60_79(\$code,63,$D,$E,$T,$A,$B,$C);
-       &BODY_60_79(\$code,64,$C,$D,$E,$T,$A,$B);
-       &BODY_60_79(\$code,65,$B,$C,$D,$E,$T,$A);
-       &BODY_60_79(\$code,66,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79(\$code,67,$T,$A,$B,$C,$D,$E);
-       &BODY_60_79(\$code,68,$E,$T,$A,$B,$C,$D);
-       &BODY_60_79(\$code,69,$D,$E,$T,$A,$B,$C);
-       &BODY_60_79(\$code,70,$C,$D,$E,$T,$A,$B);
-       &BODY_60_79(\$code,71,$B,$C,$D,$E,$T,$A);
-       &BODY_60_79(\$code,72,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79(\$code,73,$T,$A,$B,$C,$D,$E);
-       &BODY_60_79(\$code,74,$E,$T,$A,$B,$C,$D);
-       &BODY_60_79(\$code,75,$D,$E,$T,$A,$B,$C);
-       &BODY_60_79(\$code,76,$C,$D,$E,$T,$A,$B);
-       &BODY_60_79(\$code,77,$B,$C,$D,$E,$T,$A);
-       &BODY_60_79(\$code,78,$A,$B,$C,$D,$E,$T);
-       &BODY_60_79(\$code,79,$T,$A,$B,$C,$D,$E);
+       (($V[5] eq $D) and ($V[0] eq $E)) or die;       # double-check
+}
 
 $code.=<<___;
 { .mmb;        add     $h0=$h0,$E
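
The `unshift(@V,pop(@V))` idiom in the loops above rotates the register-name list one position to the right after every round, so a single BODY_xx template replaces the eighty hand-written calls deleted here; the `die` line double-checks that after 80 rotations the names line up as the wrap-up code expects. The rotation itself, mirrored in C over the same six names (a standalone illustration, not the generator):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *V[6] = { "A", "B", "C", "D", "E", "T" };
        for (int i = 0; i < 4; i++) {                 /* first few rounds */
            printf("round %d: a=%s b=%s c=%s d=%s e=%s f=%s\n",
                   i, V[0], V[1], V[2], V[3], V[4], V[5]);
            const char *last = V[5];                  /* pop(@V)          */
            memmove(&V[1], &V[0], 5 * sizeof(V[0]));  /* shift right by 1 */
            V[0] = last;                              /* unshift(@V, ...) */
        }
        return 0;
    }

Round 0 prints A,B,C,D,E,T and round 1 prints T,A,B,C,D,E, matching the argument order of the unrolled calls this change removes.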
@@ -539,7 +298,8 @@ $code.=<<___;
 { .mib;        st4     [ctx]=$h4,-16
        mov     pr=r2,0x1ffff
        br.ret.sptk.many        b0      };;
-.endp  sha1_block_asm_data_order#
+.endp  sha1_block_data_order#
+stringz        "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
 print $code;
index 9de91749923f91a8b0e185da498e20adcfa7eadc..1c6ce56522ed2d0c7b4942ed1d573f3e84d42af0 100755 (executable)
--- a/crypto/sha/asm/sha512-ia64.pl
+++ b/crypto/sha/asm/sha512-ia64.pl
@@ -2,8 +2,9 @@
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
 # SHA256/512_Transform for Itanium.
@@ -71,7 +72,7 @@ if ($output =~ /512.*\.[s|asm]/) {
        $ADD="add";
        $SHRU="shr.u";
        $TABLE="K512";
-       $func="sha512_block";
+       $func="sha512_block_data_order";
        @Sigma0=(28,34,39);
        @Sigma1=(14,18,41);
        @sigma0=(1,  8, 7);
@@ -85,7 +86,7 @@ if ($output =~ /512.*\.[s|asm]/) {
        $ADD="padd4";
        $SHRU="pshr4.u";
        $TABLE="K256";
-       $func="sha256_block";
+       $func="sha256_block_data_order";
        @Sigma0=( 2,13,22);
        @Sigma1=( 6,11,25);
        @sigma0=( 7,18, 3);
@@ -105,11 +106,13 @@ if (!defined($big_endian))
              { $big_endian=(unpack('L',pack('N',1))==1);  }
 
 $code=<<___;
-.ident  \"$output, version 1.0\"
+.ident  \"$output, version 1.1\"
 .ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
 .explicit
 .text
 
+pfssave=r2;
+lcsave=r3;
 prsave=r14;
 K=r15;
 A=r16; B=r17;  C=r18;  D=r19;
@@ -121,6 +124,8 @@ ctx=r31;    // 1st arg
 input=r48;     // 2nd arg
 num=r49;       // 3rd arg
 sgm0=r50;      sgm1=r51;       // small constants
+A_=r54;        B_=r55; C_=r56; D_=r57;
+E_=r58;        F_=r59; G_=r60; H_=r61;
 
 // void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
 .global        $func#
@@ -128,81 +133,319 @@ sgm0=r50;        sgm1=r51;       // small constants
 .align 32
 $func:
        .prologue
-       .save   ar.pfs,r2
-{ .mmi;        alloc   r2=ar.pfs,3,17,0,16
+       .save   ar.pfs,pfssave
+{ .mmi;        alloc   pfssave=ar.pfs,3,27,0,16
        $ADDP   ctx=0,r32               // 1st arg
-       .save   ar.lc,r3
-       mov     r3=ar.lc        }
+       .save   ar.lc,lcsave
+       mov     lcsave=ar.lc    }
 { .mmi;        $ADDP   input=0,r33             // 2nd arg
-       addl    Ktbl=\@ltoff($TABLE#),gp
+       mov     num=r34                 // 3rd arg
        .save   pr,prsave
        mov     prsave=pr       };;
 
        .body
-{ .mii;        ld8     Ktbl=[Ktbl]
-       mov     num=r34         };;     // 3rd arg
-
 { .mib;        add     r8=0*$SZ,ctx
        add     r9=1*$SZ,ctx
-       brp.loop.imp    .L_first16,.L_first16_ctop
-                               }
+       brp.loop.imp    .L_first16,.L_first16_end-16    }
 { .mib;        add     r10=2*$SZ,ctx
        add     r11=3*$SZ,ctx
-       brp.loop.imp    .L_rest,.L_rest_ctop
-                               };;
-// load A-H
-{ .mmi;        $LDW    A=[r8],4*$SZ
-       $LDW    B=[r9],4*$SZ
-       mov     sgm0=$sigma0[2] }
-{ .mmi;        $LDW    C=[r10],4*$SZ
-       $LDW    D=[r11],4*$SZ
-       mov     sgm1=$sigma1[2] };;
-{ .mmi;        $LDW    E=[r8]
-       $LDW    F=[r9]          }
-{ .mmi;        $LDW    G=[r10]
-       $LDW    H=[r11]
-       cmp.ne  p15,p14=0,r35   };;     // used in sha256_block
+       brp.loop.imp    .L_rest,.L_rest_end-16          };;
 
+// load A-H
+.Lpic_point:
+{ .mmi;        $LDW    A_=[r8],4*$SZ
+       $LDW    B_=[r9],4*$SZ
+       mov     Ktbl=ip         }
+{ .mmi;        $LDW    C_=[r10],4*$SZ
+       $LDW    D_=[r11],4*$SZ
+       mov     sgm0=$sigma0[2] };;
+{ .mmi;        $LDW    E_=[r8]
+       $LDW    F_=[r9]
+       add     Ktbl=($TABLE#-.Lpic_point),Ktbl         }
+{ .mmi;        $LDW    G_=[r10]
+       $LDW    H_=[r11]
+       cmp.ne  p0,p16=0,r0     };;     // used in sha256_block
+___
+$code.=<<___ if ($BITS==64);
+{ .mii;        and     r8=7,input
+       and     input=~7,input;;
+       cmp.eq  p9,p0=1,r8      }
+{ .mmi;        cmp.eq  p10,p0=2,r8
+       cmp.eq  p11,p0=3,r8
+       cmp.eq  p12,p0=4,r8     }
+{ .mmi;        cmp.eq  p13,p0=5,r8
+       cmp.eq  p14,p0=6,r8
+       cmp.eq  p15,p0=7,r8     };;
+___
+$code.=<<___;
 .L_outer:
-{ .mii;        mov     ar.lc=15
-       mov     ar.ec=1         };;
-.align 32
-.L_first16:
 .rotr  X[16]
+{ .mmi;        mov     A=A_
+       mov     B=B_
+       mov     ar.lc=14        }
+{ .mmi;        mov     C=C_
+       mov     D=D_
+       mov     E=E_            }
+{ .mmi;        mov     F=F_
+       mov     G=G_
+       mov     ar.ec=2         }
+{ .mmi;        ld1     X[15]=[input],$SZ               // eliminated in 64-bit
+       mov     H=H_
+       mov     sgm1=$sigma1[2] };;
+
 ___
 $t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
-{ .mib;        (p14)   add     r9=1,input
-       (p14)   add     r10=2,input     }
-{ .mib;        (p14)   add     r11=3,input
-       (p15)   br.dptk.few     .L_host };;
-{ .mmi;        (p14)   ld1     r8=[input],$SZ
-       (p14)   ld1     r9=[r9]         }
-{ .mmi;        (p14)   ld1     r10=[r10]
-       (p14)   ld1     r11=[r11]       };;
-{ .mii;        (p14)   dep     r9=r8,r9,8,8
-       (p14)   dep     r11=r10,r11,8,8 };;
-{ .mib;        (p14)   dep     X[15]=r9,r11,16,16 };;
-.L_host:
-{ .mib;        (p15)   $LDW    X[15]=[input],$SZ       // X[i]=*input++
+.align 32
+.L_first16:
+{ .mmi;                add     r9=1-$SZ,input
+               add     r10=2-$SZ,input
+               add     r11=3-$SZ,input };;
+{ .mmi;                ld1     r9=[r9]
+               ld1     r10=[r10]
                dep.z   $t1=E,32,32     }
-{ .mib;                $LDW    K=[Ktbl],$SZ
+{ .mmi;                $LDW    K=[Ktbl],$SZ
+               ld1     r11=[r11]
                zxt4    E=E             };;
-{ .mmi;                or      $t1=$t1,E
-               and     T1=F,E
-               and     T2=A,B          }
+{ .mii;                or      $t1=$t1,E
+               dep     X[15]=X[15],r9,8,8
+               dep     r11=r10,r11,8,8 };;
+{ .mmi;                and     T1=F,E
+               and     T2=A,B
+               dep     X[15]=X[15],r11,16,16   }
 { .mmi;                andcm   r8=G,E
                and     r9=A,C
                mux2    $t0=A,0x44      };;     // copy lower half to upper
-{ .mib;                xor     T1=T1,r8                // T1=((e & f) ^ (~e & g))
+{ .mmi;        (p16)   ld1     X[15-1]=[input],$SZ     // prefetch
+               xor     T1=T1,r8                // T1=((e & f) ^ (~e & g))
                _rotr   r11=$t1,$Sigma1[0] }    // ROTR(e,14)
 { .mib;                and     r10=B,C
                xor     T2=T2,r9        };;
 ___
 $t0="A", $t1="E", $code.=<<___ if ($BITS==64);
-{ .mmi;                $LDW    X[15]=[input],$SZ       // X[i]=*input++
+// in 64-bit mode I load whole X[16] at once and take care of alignment...
+{ .mmi;        add     r8=1*$SZ,input
+       add     r9=2*$SZ,input
+       add     r10=3*$SZ,input         };;
+{ .mmb;        $LDW    X[15]=[input],4*$SZ
+       $LDW    X[14]=[r8],4*$SZ
+(p9)   br.cond.dpnt.many       .L1byte };;
+{ .mmb;        $LDW    X[13]=[r9],4*$SZ
+       $LDW    X[12]=[r10],4*$SZ
+(p10)  br.cond.dpnt.many       .L2byte };;
+{ .mmb;        $LDW    X[11]=[input],4*$SZ
+       $LDW    X[10]=[r8],4*$SZ
+(p11)  br.cond.dpnt.many       .L3byte };;
+{ .mmb;        $LDW    X[ 9]=[r9],4*$SZ
+       $LDW    X[ 8]=[r10],4*$SZ
+(p12)  br.cond.dpnt.many       .L4byte };;
+{ .mmb;        $LDW    X[ 7]=[input],4*$SZ
+       $LDW    X[ 6]=[r8],4*$SZ
+(p13)  br.cond.dpnt.many       .L5byte };;
+{ .mmb;        $LDW    X[ 5]=[r9],4*$SZ
+       $LDW    X[ 4]=[r10],4*$SZ
+(p14)  br.cond.dpnt.many       .L6byte };;
+{ .mmb;        $LDW    X[ 3]=[input],4*$SZ
+       $LDW    X[ 2]=[r8],4*$SZ
+(p15)  br.cond.dpnt.many       .L7byte };;
+{ .mmb;        $LDW    X[ 1]=[r9],4*$SZ
+       $LDW    X[ 0]=[r10],4*$SZ
+       br.many .L_first16              };;
+.L1byte:
+{ .mmi;        $LDW    X[13]=[r9],4*$SZ
+       $LDW    X[12]=[r10],4*$SZ
+       shrp    X[15]=X[15],X[14],56    };;
+{ .mmi;        $LDW    X[11]=[input],4*$SZ
+       $LDW    X[10]=[r8],4*$SZ
+       shrp    X[14]=X[14],X[13],56    }
+{ .mmi;        $LDW    X[ 9]=[r9],4*$SZ
+       $LDW    X[ 8]=[r10],4*$SZ
+       shrp    X[13]=X[13],X[12],56    };;
+{ .mmi;        $LDW    X[ 7]=[input],4*$SZ
+       $LDW    X[ 6]=[r8],4*$SZ
+       shrp    X[12]=X[12],X[11],56    }
+{ .mmi;        $LDW    X[ 5]=[r9],4*$SZ
+       $LDW    X[ 4]=[r10],4*$SZ
+       shrp    X[11]=X[11],X[10],56    };;
+{ .mmi;        $LDW    X[ 3]=[input],4*$SZ
+       $LDW    X[ 2]=[r8],4*$SZ
+       shrp    X[10]=X[10],X[ 9],56    }
+{ .mmi;        $LDW    X[ 1]=[r9],4*$SZ
+       $LDW    X[ 0]=[r10],4*$SZ
+       shrp    X[ 9]=X[ 9],X[ 8],56    };;
+{ .mii;        $LDW    T1=[input]
+       shrp    X[ 8]=X[ 8],X[ 7],56
+       shrp    X[ 7]=X[ 7],X[ 6],56    }
+{ .mii;        shrp    X[ 6]=X[ 6],X[ 5],56
+       shrp    X[ 5]=X[ 5],X[ 4],56    };;
+{ .mii;        shrp    X[ 4]=X[ 4],X[ 3],56
+       shrp    X[ 3]=X[ 3],X[ 2],56    }
+{ .mii;        shrp    X[ 2]=X[ 2],X[ 1],56
+       shrp    X[ 1]=X[ 1],X[ 0],56    }
+{ .mib;        shrp    X[ 0]=X[ 0],T1,56
+       br.many .L_first16              };;
+.L2byte:
+{ .mmi;        $LDW    X[11]=[input],4*$SZ
+       $LDW    X[10]=[r8],4*$SZ
+       shrp    X[15]=X[15],X[14],48    }
+{ .mmi;        $LDW    X[ 9]=[r9],4*$SZ
+       $LDW    X[ 8]=[r10],4*$SZ
+       shrp    X[14]=X[14],X[13],48    };;
+{ .mmi;        $LDW    X[ 7]=[input],4*$SZ
+       $LDW    X[ 6]=[r8],4*$SZ
+       shrp    X[13]=X[13],X[12],48    }
+{ .mmi;        $LDW    X[ 5]=[r9],4*$SZ
+       $LDW    X[ 4]=[r10],4*$SZ
+       shrp    X[12]=X[12],X[11],48    };;
+{ .mmi;        $LDW    X[ 3]=[input],4*$SZ
+       $LDW    X[ 2]=[r8],4*$SZ
+       shrp    X[11]=X[11],X[10],48    }
+{ .mmi;        $LDW    X[ 1]=[r9],4*$SZ
+       $LDW    X[ 0]=[r10],4*$SZ
+       shrp    X[10]=X[10],X[ 9],48    };;
+{ .mii;        $LDW    T1=[input]
+       shrp    X[ 9]=X[ 9],X[ 8],48
+       shrp    X[ 8]=X[ 8],X[ 7],48    }
+{ .mii;        shrp    X[ 7]=X[ 7],X[ 6],48
+       shrp    X[ 6]=X[ 6],X[ 5],48    };;
+{ .mii;        shrp    X[ 5]=X[ 5],X[ 4],48
+       shrp    X[ 4]=X[ 4],X[ 3],48    }
+{ .mii;        shrp    X[ 3]=X[ 3],X[ 2],48
+       shrp    X[ 2]=X[ 2],X[ 1],48    }
+{ .mii;        shrp    X[ 1]=X[ 1],X[ 0],48
+       shrp    X[ 0]=X[ 0],T1,48       }
+{ .mfb;        br.many .L_first16              };;
+.L3byte:
+{ .mmi;        $LDW    X[ 9]=[r9],4*$SZ
+       $LDW    X[ 8]=[r10],4*$SZ
+       shrp    X[15]=X[15],X[14],40    };;
+{ .mmi;        $LDW    X[ 7]=[input],4*$SZ
+       $LDW    X[ 6]=[r8],4*$SZ
+       shrp    X[14]=X[14],X[13],40    }
+{ .mmi;        $LDW    X[ 5]=[r9],4*$SZ
+       $LDW    X[ 4]=[r10],4*$SZ
+       shrp    X[13]=X[13],X[12],40    };;
+{ .mmi;        $LDW    X[ 3]=[input],4*$SZ
+       $LDW    X[ 2]=[r8],4*$SZ
+       shrp    X[12]=X[12],X[11],40    }
+{ .mmi;        $LDW    X[ 1]=[r9],4*$SZ
+       $LDW    X[ 0]=[r10],4*$SZ
+       shrp    X[11]=X[11],X[10],40    };;
+{ .mii;        $LDW    T1=[input]
+       shrp    X[10]=X[10],X[ 9],40
+       shrp    X[ 9]=X[ 9],X[ 8],40    }
+{ .mii;        shrp    X[ 8]=X[ 8],X[ 7],40
+       shrp    X[ 7]=X[ 7],X[ 6],40    };;
+{ .mii;        shrp    X[ 6]=X[ 6],X[ 5],40
+       shrp    X[ 5]=X[ 5],X[ 4],40    }
+{ .mii;        shrp    X[ 4]=X[ 4],X[ 3],40
+       shrp    X[ 3]=X[ 3],X[ 2],40    }
+{ .mii;        shrp    X[ 2]=X[ 2],X[ 1],40
+       shrp    X[ 1]=X[ 1],X[ 0],40    }
+{ .mib;        shrp    X[ 0]=X[ 0],T1,40
+       br.many .L_first16              };;
+.L4byte:
+{ .mmi;        $LDW    X[ 7]=[input],4*$SZ
+       $LDW    X[ 6]=[r8],4*$SZ
+       shrp    X[15]=X[15],X[14],32    }
+{ .mmi;        $LDW    X[ 5]=[r9],4*$SZ
+       $LDW    X[ 4]=[r10],4*$SZ
+       shrp    X[14]=X[14],X[13],32    };;
+{ .mmi;        $LDW    X[ 3]=[input],4*$SZ
+       $LDW    X[ 2]=[r8],4*$SZ
+       shrp    X[13]=X[13],X[12],32    }
+{ .mmi;        $LDW    X[ 1]=[r9],4*$SZ
+       $LDW    X[ 0]=[r10],4*$SZ
+       shrp    X[12]=X[12],X[11],32    };;
+{ .mii;        $LDW    T1=[input]
+       shrp    X[11]=X[11],X[10],32
+       shrp    X[10]=X[10],X[ 9],32    }
+{ .mii;        shrp    X[ 9]=X[ 9],X[ 8],32
+       shrp    X[ 8]=X[ 8],X[ 7],32    };;
+{ .mii;        shrp    X[ 7]=X[ 7],X[ 6],32
+       shrp    X[ 6]=X[ 6],X[ 5],32    }
+{ .mii;        shrp    X[ 5]=X[ 5],X[ 4],32
+       shrp    X[ 4]=X[ 4],X[ 3],32    }
+{ .mii;        shrp    X[ 3]=X[ 3],X[ 2],32
+       shrp    X[ 2]=X[ 2],X[ 1],32    }
+{ .mii;        shrp    X[ 1]=X[ 1],X[ 0],32
+       shrp    X[ 0]=X[ 0],T1,32       }
+{ .mfb;        br.many .L_first16              };;
+.L5byte:
+{ .mmi;        $LDW    X[ 5]=[r9],4*$SZ
+       $LDW    X[ 4]=[r10],4*$SZ
+       shrp    X[15]=X[15],X[14],24    };;
+{ .mmi;        $LDW    X[ 3]=[input],4*$SZ
+       $LDW    X[ 2]=[r8],4*$SZ
+       shrp    X[14]=X[14],X[13],24    }
+{ .mmi;        $LDW    X[ 1]=[r9],4*$SZ
+       $LDW    X[ 0]=[r10],4*$SZ
+       shrp    X[13]=X[13],X[12],24    };;
+{ .mii;        $LDW    T1=[input]
+       shrp    X[12]=X[12],X[11],24
+       shrp    X[11]=X[11],X[10],24    }
+{ .mii;        shrp    X[10]=X[10],X[ 9],24
+       shrp    X[ 9]=X[ 9],X[ 8],24    };;
+{ .mii;        shrp    X[ 8]=X[ 8],X[ 7],24
+       shrp    X[ 7]=X[ 7],X[ 6],24    }
+{ .mii;        shrp    X[ 6]=X[ 6],X[ 5],24
+       shrp    X[ 5]=X[ 5],X[ 4],24    }
+{ .mii;        shrp    X[ 4]=X[ 4],X[ 3],24
+       shrp    X[ 3]=X[ 3],X[ 2],24    }
+{ .mii;        shrp    X[ 2]=X[ 2],X[ 1],24
+       shrp    X[ 1]=X[ 1],X[ 0],24    }
+{ .mib;        shrp    X[ 0]=X[ 0],T1,24
+       br.many .L_first16              };;
+.L6byte:
+{ .mmi;        $LDW    X[ 3]=[input],4*$SZ
+       $LDW    X[ 2]=[r8],4*$SZ
+       shrp    X[15]=X[15],X[14],16    }
+{ .mmi;        $LDW    X[ 1]=[r9],4*$SZ
+       $LDW    X[ 0]=[r10],4*$SZ
+       shrp    X[14]=X[14],X[13],16    };;
+{ .mii;        $LDW    T1=[input]
+       shrp    X[13]=X[13],X[12],16
+       shrp    X[12]=X[12],X[11],16    }
+{ .mii;        shrp    X[11]=X[11],X[10],16
+       shrp    X[10]=X[10],X[ 9],16    };;
+{ .mii;        shrp    X[ 9]=X[ 9],X[ 8],16
+       shrp    X[ 8]=X[ 8],X[ 7],16    }
+{ .mii;        shrp    X[ 7]=X[ 7],X[ 6],16
+       shrp    X[ 6]=X[ 6],X[ 5],16    }
+{ .mii;        shrp    X[ 5]=X[ 5],X[ 4],16
+       shrp    X[ 4]=X[ 4],X[ 3],16    }
+{ .mii;        shrp    X[ 3]=X[ 3],X[ 2],16
+       shrp    X[ 2]=X[ 2],X[ 1],16    }
+{ .mii;        shrp    X[ 1]=X[ 1],X[ 0],16
+       shrp    X[ 0]=X[ 0],T1,16       }
+{ .mfb;        br.many .L_first16              };;
+.L7byte:
+{ .mmi;        $LDW    X[ 1]=[r9],4*$SZ
+       $LDW    X[ 0]=[r10],4*$SZ
+       shrp    X[15]=X[15],X[14],8     };;
+{ .mii;        $LDW    T1=[input]
+       shrp    X[14]=X[14],X[13],8
+       shrp    X[13]=X[13],X[12],8     }
+{ .mii;        shrp    X[12]=X[12],X[11],8
+       shrp    X[11]=X[11],X[10],8     };;
+{ .mii;        shrp    X[10]=X[10],X[ 9],8
+       shrp    X[ 9]=X[ 9],X[ 8],8     }
+{ .mii;        shrp    X[ 8]=X[ 8],X[ 7],8
+       shrp    X[ 7]=X[ 7],X[ 6],8     }
+{ .mii;        shrp    X[ 6]=X[ 6],X[ 5],8
+       shrp    X[ 5]=X[ 5],X[ 4],8     }
+{ .mii;        shrp    X[ 4]=X[ 4],X[ 3],8
+       shrp    X[ 3]=X[ 3],X[ 2],8     }
+{ .mii;        shrp    X[ 2]=X[ 2],X[ 1],8
+       shrp    X[ 1]=X[ 1],X[ 0],8     }
+{ .mib;        shrp    X[ 0]=X[ 0],T1,8
+       br.many .L_first16              };;
+
+.align 32
+.L_first16:
+{ .mmi;                $LDW    K=[Ktbl],$SZ
                and     T1=F,E
                and     T2=A,B          }
-{ .mmi;                $LDW    K=[Ktbl],$SZ
+{ .mmi;                //$LDW  X[15]=[input],$SZ       // X[i]=*input++
                andcm   r8=G,E
                and     r9=A,C          };;
 { .mmi;                xor     T1=T1,r8                //T1=((e & f) ^ (~e & g))
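
The .L1byte through .L7byte ladders above are the alignment fix-up for the 64-bit path: the input pointer is rounded down to an 8-byte boundary, whole words are loaded, and each schedule word is spliced from two neighbouring loads with shrp (shift right pair). In C terms, for a big-endian host like the assembly's big-endian path, the splice looks roughly like this (a hedged sketch, not the module; note that, like the assembly, it reads one word past the block when the offset is non-zero):

    #include <stdint.h>
    #include <stddef.h>

    /* Assemble the 16 big-endian 64-bit schedule words from a possibly
     * misaligned buffer using only aligned loads. */
    static void load_be64_misaligned(const unsigned char *p, uint64_t X[16])
    {
        size_t off = (uintptr_t)p & 7;                   /* 0..7          */
        const uint64_t *w = (const uint64_t *)((uintptr_t)p & ~(uintptr_t)7);
        int i;

        if (off == 0) {
            for (i = 0; i < 16; i++) X[i] = w[i];        /* aligned case  */
            return;
        }
        /* C analogue of "shrp X[i]=w[i],w[i+1],8*(8-off)": the low 8-off
         * bytes of w[i] followed by the top off bytes of w[i+1]. */
        for (i = 0; i < 16; i++)
            X[i] = (w[i] << (8 * off)) | (w[i + 1] >> (8 * (8 - off)));
    }

The predicated cmp.eq/br.cond chain ahead of the ladders is an eight-way dispatch on the same offset value, with one unrolled ladder per shift count.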
@@ -235,13 +478,14 @@ $code.=<<___;
 { .mmi;                xor     r10=r8,r10              // r10=Sigma0(a)
                mov     B=A
                add     A=T1,T2         };;
-.L_first16_ctop:
 { .mib;                add     E=E,T1
                add     A=A,r10                 // T2=Maj(a,b,c)+Sigma0(a)
        br.ctop.sptk    .L_first16      };;
+.L_first16_end:
+
+{ .mii;        mov     ar.lc=$rounds-17
+       mov     ar.ec=1                 };;
 
-{ .mib;        mov     ar.lc=$rounds-17        }
-{ .mib;        mov     ar.ec=1                 };;
 .align 32
 .L_rest:
 .rotr  X[16]
@@ -310,46 +554,38 @@ $code.=<<___;
 { .mmi;                xor     r10=r8,r10              // r10=Sigma0(a)
                mov     B=A
                add     A=T1,T2         };;
-.L_rest_ctop:
 { .mib;                add     E=E,T1
                add     A=A,r10                 // T2=Maj(a,b,c)+Sigma0(a)
        br.ctop.sptk    .L_rest };;
+.L_rest_end:
+
+{ .mmi;        add     A_=A_,A
+       add     B_=B_,B
+       add     C_=C_,C                 }
+{ .mmi;        add     D_=D_,D
+       add     E_=E_,E
+       cmp.ltu p16,p0=1,num            };;
+{ .mmi;        add     F_=F_,F
+       add     G_=G_,G
+       add     H_=H_,H                 }
+{ .mmb;        add     Ktbl=-$SZ*$rounds,Ktbl
+(p16)  add     num=-1,num
+(p16)  br.dptk.many    .L_outer        };;
 
 { .mib;        add     r8=0*$SZ,ctx
        add     r9=1*$SZ,ctx            }
 { .mib;        add     r10=2*$SZ,ctx
        add     r11=3*$SZ,ctx           };;
-{ .mmi;        $LDW    r32=[r8],4*$SZ
-       $LDW    r33=[r9],4*$SZ          }
-{ .mmi;        $LDW    r34=[r10],4*$SZ
-       $LDW    r35=[r11],4*$SZ
-       cmp.ltu p6,p7=1,num             };;
-{ .mmi;        $LDW    r36=[r8],-4*$SZ
-       $LDW    r37=[r9],-4*$SZ
-(p6)   add     Ktbl=-$SZ*$rounds,Ktbl  }
-{ .mmi;        $LDW    r38=[r10],-4*$SZ
-       $LDW    r39=[r11],-4*$SZ
-(p7)   mov     ar.lc=r3                };;
-{ .mmi;        add     A=A,r32
-       add     B=B,r33
-       add     C=C,r34                 }
-{ .mmi;        add     D=D,r35
-       add     E=E,r36
-       add     F=F,r37                 };;
-{ .mmi;        $STW    [r8]=A,4*$SZ
-       $STW    [r9]=B,4*$SZ
-       add     G=G,r38                 }
-{ .mmi;        $STW    [r10]=C,4*$SZ
-       $STW    [r11]=D,4*$SZ
-       add     H=H,r39                 };;
-{ .mmi;        $STW    [r8]=E
-       $STW    [r9]=F
-(p6)   add     num=-1,num              }
-{ .mmb;        $STW    [r10]=G
-       $STW    [r11]=H
-(p6)   br.dptk.many    .L_outer        };;
-
-{ .mib;        mov     pr=prsave,0x1ffff
+{ .mmi;        $STW    [r8]=A_,4*$SZ
+       $STW    [r9]=B_,4*$SZ
+       mov     ar.lc=lcsave            }
+{ .mmi;        $STW    [r10]=C_,4*$SZ
+       $STW    [r11]=D_,4*$SZ
+       mov     pr=prsave,0x1ffff       };;
+{ .mmb;        $STW    [r8]=E_
+       $STW    [r9]=F_                 }
+{ .mmb;        $STW    [r10]=G_
+       $STW    [r11]=H_
        br.ret.sptk.many        b0      };;
 .endp  $func#
 ___
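
The new A_..H_ registers let the whole multi-block loop keep the running hash in registers: ctx is read once on entry, each block's result is folded in with the eight adds above, and the stores at the end run once per call instead of once per block, as the deleted load/store sequence did. Schematically, in C (an illustrative sketch; the real arithmetic lives in the 80 rounds that update the working copy):

    #include <stdint.h>
    #include <stddef.h>

    static void sha512_blocks_sketch(uint64_t h[8], const unsigned char *in,
                                     size_t num)
    {
        uint64_t s[8], w[8];
        int i;

        for (i = 0; i < 8; i++) s[i] = h[i];      /* load A_..H_ once     */
        while (num--) {
            for (i = 0; i < 8; i++) w[i] = s[i];  /* working copy A..H    */
            /* ... 80 SHA-512 rounds update w[0..7] from the block ... */
            for (i = 0; i < 8; i++) s[i] += w[i]; /* A_+=A, ..., H_+=H    */
            in += 128;
        }
        for (i = 0; i < 8; i++) h[i] = s[i];      /* store once, on exit  */
    }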
@@ -358,7 +594,10 @@ $code =~ s/\`([^\`]*)\`/eval $1/gem;
 $code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
 if ($BITS==64) {
     $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm;
-    $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
+    $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm     if ($big_endian);
+    $code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
+                                               if (!$big_endian);
+    $code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
 }
 
 print $code;
@@ -383,6 +622,7 @@ K256:       data4   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        data4   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        data4   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .size  K256#,$SZ*$rounds
+stringz        "SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 print<<___ if ($BITS==64);
 .align 64
@@ -428,4 +668,5 @@ K512:       data8   0x428a2f98d728ae22,0x7137449123ef65cd
        data8   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
        data8   0x5fcb6fab3ad6faec,0x6c44198c4a475817
 .size  K512#,$SZ*$rounds
+stringz        "SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
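
The K256 and K512 tables these hunks close out hold, per FIPS 180-2, the first 32 (respectively 64) bits of the fractional parts of the cube roots of the first 64 (respectively 80) primes. The 32-bit table is easy to spot-check (a throwaway sketch; double precision is accurate enough for these leading entries):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* frac(cbrt(p)) * 2^32 for the first primes: expect 0x428a2f98,
         * 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, ... as in K256 above. */
        const int primes[] = { 2, 3, 5, 7, 11, 13, 17, 19 };
        int i;
        for (i = 0; i < 8; i++) {
            double r = cbrt((double)primes[i]);
            printf("0x%08x\n", (unsigned)((r - floor(r)) * 4294967296.0));
        }
        return 0;
    }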
index 05ae9445db11e8a81e5b85a5f28eeb97355c0ec2..5481261ac2fa2ed0b2f45ab99d410c79a3571dd2 100644 (file)
--- a/crypto/sha/sha256.c
+++ b/crypto/sha/sha256.c
@@ -69,17 +69,11 @@ int SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
 int SHA224_Final (unsigned char *md, SHA256_CTX *c)
 {   return SHA256_Final (md,c);   }
 
-#ifndef        SHA_LONG_LOG2
-#define        SHA_LONG_LOG2   2       /* default to 32 bits */
-#endif
-
 #define        DATA_ORDER_IS_BIG_ENDIAN
 
 #define        HASH_LONG               SHA_LONG
-#define        HASH_LONG_LOG2          SHA_LONG_LOG2
 #define        HASH_CTX                SHA256_CTX
 #define        HASH_CBLOCK             SHA_CBLOCK
-#define        HASH_LBLOCK             SHA_LBLOCK
 /*
  * Note that FIPS180-2 discusses "Truncation of the Hash Function Output."
  * default: case below covers for it. It's not clear however if it's
@@ -112,16 +106,15 @@ int SHA224_Final (unsigned char *md, SHA256_CTX *c)
 #define        HASH_UPDATE             SHA256_Update
 #define        HASH_TRANSFORM          SHA256_Transform
 #define        HASH_FINAL              SHA256_Final
-#define        HASH_BLOCK_HOST_ORDER   sha256_block_host_order
 #define        HASH_BLOCK_DATA_ORDER   sha256_block_data_order
-void sha256_block_host_order (SHA256_CTX *ctx, const void *in, size_t num);
+#ifndef SHA256_ASM
+static
+#endif
 void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num);
 
 #include "md32_common.h"
 
-#ifdef SHA256_ASM
-void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host);
-#else
+#ifndef SHA256_ASM
 static const SHA_LONG K256[64] = {
        0x428a2f98UL,0x71374491UL,0xb5c0fbcfUL,0xe9b5dba5UL,
        0x3956c25bUL,0x59f111f1UL,0x923f82a4UL,0xab1c5ed5UL,
@@ -155,10 +148,10 @@ static const SHA_LONG K256[64] = {
 
 #ifdef OPENSSL_SMALL_FOOTPRINT
 
-static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host)
+static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num)
        {
        unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1,T2;
-       SHA_LONG        X[16];
+       SHA_LONG        X[16],l;
        int i;
        const unsigned char *data=in;
 
@@ -167,33 +160,13 @@ static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host)
        a = ctx->h[0];  b = ctx->h[1];  c = ctx->h[2];  d = ctx->h[3];
        e = ctx->h[4];  f = ctx->h[5];  g = ctx->h[6];  h = ctx->h[7];
 
-       if (host)
-               {
-               const SHA_LONG *W=(const SHA_LONG *)data;
-
-               for (i=0;i<16;i++)
-                       {
-                       T1 = X[i] = W[i];
-                       T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
-                       T2 = Sigma0(a) + Maj(a,b,c);
-                       h = g;  g = f;  f = e;  e = d + T1;
-                       d = c;  c = b;  b = a;  a = T1 + T2;
-                       }
-
-               data += SHA256_CBLOCK;
-               }
-       else
+       for (i=0;i<16;i++)
                {
-               SHA_LONG l;
-
-               for (i=0;i<16;i++)
-                       {
-                       HOST_c2l(data,l); T1 = X[i] = l;
-                       T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
-                       T2 = Sigma0(a) + Maj(a,b,c);
-                       h = g;  g = f;  f = e;  e = d + T1;
-                       d = c;  c = b;  b = a;  a = T1 + T2;
-                       }
+               HOST_c2l(data,l); T1 = X[i] = l;
+               T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
+               T2 = Sigma0(a) + Maj(a,b,c);
+               h = g;  g = f;  f = e;  e = d + T1;
+               d = c;  c = b;  b = a;  a = T1 + T2;
                }
 
        for (;i<64;i++)
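
HOST_c2l, which the unified loop above now uses unconditionally in place of the deleted host-order branch, comes from md32_common.h: under DATA_ORDER_IS_BIG_ENDIAN it assembles the next 32-bit word MSB-first from the byte stream and advances the cursor. A functional equivalent (a sketch of the macro's behaviour, not its definition):

    #include <stdint.h>

    /* Consume four bytes MSB-first, as HOST_c2l(data,l) does for the
     * big-endian data order, and advance the pointer. */
    static uint32_t host_c2l(const unsigned char **p)
    {
        const unsigned char *c = *p;
        uint32_t l = ((uint32_t)c[0] << 24) | ((uint32_t)c[1] << 16) |
                     ((uint32_t)c[2] <<  8) |  (uint32_t)c[3];
        *p = c + 4;
        return l;
    }

Because the input is treated as a byte stream, the result is the same on any host, which is what lets a single data-order routine replace the old host-order/data-order pair.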
@@ -227,19 +200,20 @@ static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host)
        T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f];    \
        ROUND_00_15(i,a,b,c,d,e,f,g,h);         } while (0)
 
-static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host)
+static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num)
        {
        unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1;
        SHA_LONG        X[16];
        int i;
        const unsigned char *data=in;
+       const union { long one; char little; } is_endian = {1};
 
                        while (num--) {
 
        a = ctx->h[0];  b = ctx->h[1];  c = ctx->h[2];  d = ctx->h[3];
        e = ctx->h[4];  f = ctx->h[5];  g = ctx->h[6];  h = ctx->h[7];
 
-       if (host)
+       if (!is_endian.little && sizeof(SHA_LONG)==4 && ((size_t)in%4)==0)
                {
                const SHA_LONG *W=(const SHA_LONG *)data;
 
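
The schedule update spliced into the round macro at the top of this hunk expands the message schedule in place, in a 16-word ring: per FIPS 180-2, s0 and s1 are sigma0/sigma1 of the words 1 and 14 positions back. Written out in C (a sketch; ROTR32 is an assumed 32-bit rotate):

    #include <stdint.h>

    #define ROTR32(x,n) (((x) >> (n)) | ((x) << (32 - (n))))

    /* One schedule step of the in-place 16-word ring, for i >= 16:
     * X[i%16] += sigma0(X[(i+1)%16]) + X[(i+9)%16] + sigma1(X[(i+14)%16]) */
    static uint32_t expand(uint32_t X[16], int i)
    {
        uint32_t s0 = X[(i + 1) & 0x0f];
        uint32_t s1 = X[(i + 14) & 0x0f];
        s0 = ROTR32(s0, 7) ^ ROTR32(s0, 18) ^ (s0 >> 3);   /* sigma0 */
        s1 = ROTR32(s1, 17) ^ ROTR32(s1, 19) ^ (s1 >> 10); /* sigma1 */
        return X[i & 0x0f] += s0 + s1 + X[(i + 9) & 0x0f];
    }

Keeping only 16 live words is what makes the small-footprint variant small: the full 64-word schedule never exists in memory.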
@@ -305,15 +279,4 @@ static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host)
 #endif
 #endif /* SHA256_ASM */
 
-/*
- * Idea is to trade couple of cycles for some space. On IA-32 we save
- * about 4K in "big footprint" case. In "small footprint" case any gain
- * is appreciated:-)
- */
-void HASH_BLOCK_HOST_ORDER (SHA256_CTX *ctx, const void *in, size_t num)
-{   sha256_block (ctx,in,num,1);   }
-
-void HASH_BLOCK_DATA_ORDER (SHA256_CTX *ctx, const void *in, size_t num)
-{   sha256_block (ctx,in,num,0);   }
-
 #endif /* OPENSSL_NO_SHA256 */
index 39d18b8fb46ec2310fa2ec8fc5d6e253d2d68436..c58b843ad01f0a68b223d6fd265133f5095dc218 100644 (file)
--- a/crypto/sha/sha512.c
+++ b/crypto/sha/sha512.c
 
 const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
 
-#if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386) || defined(__x86_64)
+#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
+    defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) || \
+    defined(__s390__) || defined(__s390x__) || \
+    defined(SHA512_ASM)
 #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
 #endif
 
@@ -89,7 +92,7 @@ int SHA512_Init (SHA512_CTX *c)
 #ifndef SHA512_ASM
 static
 #endif
-void sha512_block (SHA512_CTX *ctx, const void *in, size_t num);
+void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num);
 
 int SHA512_Final (unsigned char *md, SHA512_CTX *c)
        {
@@ -100,7 +103,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
        n++;
        if (n > (sizeof(c->u)-16))
                memset (p+n,0,sizeof(c->u)-n), n=0,
-               sha512_block (c,p,1);
+               sha512_block_data_order (c,p,1);
 
        memset (p+n,0,sizeof(c->u)-16-n);
 #ifdef B_ENDIAN
@@ -125,7 +128,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
        p[sizeof(c->u)-16] = (unsigned char)(c->Nh>>56);
 #endif
 
-       sha512_block (c,p,1);
+       sha512_block_data_order (c,p,1);
 
        if (md==0) return 0;
 
@@ -197,7 +200,7 @@ int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len)
                else    {
                        memcpy (p+c->num,data,n), c->num = 0;
                        len-=n, data+=n;
-                       sha512_block (c,p,1);
+                       sha512_block_data_order (c,p,1);
                        }
                }
 
@@ -207,12 +210,12 @@ int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len)
                if ((size_t)data%sizeof(c->u.d[0]) != 0)
                        while (len >= sizeof(c->u))
                                memcpy (p,data,sizeof(c->u)),
-                               sha512_block (c,p,1),
+                               sha512_block_data_order (c,p,1),
                                len  -= sizeof(c->u),
                                data += sizeof(c->u);
                else
 #endif
-                       sha512_block (c,data,len/sizeof(c->u)),
+                       sha512_block_data_order (c,data,len/sizeof(c->u)),
                        data += len,
                        len  %= sizeof(c->u),
                        data -= len;
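
The comma-chained statements above implement a simple policy: input that is not aligned for 64-bit loads is staged through the context's block buffer one block at a time, while aligned input is handed to sha512_block_data_order in one call for the whole run. Unfolded into plain statements (an equivalent sketch under hypothetical names; BLK stands in for sizeof(c->u), process() for sha512_block_data_order with the context bound):

    #include <string.h>
    #include <stddef.h>

    #define BLK 128  /* one SHA-512 block, sizeof(c->u) */

    /* Returns the number of leftover tail bytes (< BLK). */
    static size_t bulk_update(unsigned char *p /* ctx block buffer */,
                              const unsigned char *data, size_t len,
                              void (*process)(const unsigned char *, size_t))
    {
        if ((size_t)data % 8 != 0) {         /* (size_t)data%sizeof(c->u.d[0]) */
            while (len >= BLK) {
                memcpy(p, data, BLK);        /* stage through c->u     */
                process(p, 1);
                len -= BLK; data += BLK;
            }
        } else {
            process(data, len / BLK);        /* whole aligned run      */
            data += len; len %= BLK; data -= len;
        }
        return len;
    }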
@@ -227,7 +230,7 @@ int SHA384_Update (SHA512_CTX *c, const void *data, size_t len)
 {   return SHA512_Update (c,data,len);   }
 
 void SHA512_Transform (SHA512_CTX *c, const unsigned char *data)
-{   sha512_block (c,data,1);  }
+{   sha512_block_data_order (c,data,1);  }
 
 unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
        {
@@ -301,40 +304,78 @@ static const SHA_LONG64 K512[80] = {
 #ifndef PEDANTIC
 # if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
 #  if defined(__x86_64) || defined(__x86_64__)
-#   define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x)));  \
-                               asm ("bswapq    %0"             \
-                               : "=r"(ret)                     \
-                               : "0"(ret)); ret;               })
-#  endif
-# endif
-#endif
-
-#ifndef PULL64
-#define B(x,j)    (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
-#define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))
-#endif
-
-#ifndef PEDANTIC
-# if defined(_MSC_VER)
-#  if defined(_WIN64)  /* applies to both IA-64 and AMD64 */
-#   define ROTR(a,n)   _rotr64((a),n)
-#  endif
-# elif defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
-#  if defined(__x86_64) || defined(__x86_64__)
 #   define ROTR(a,n)   ({ unsigned long ret;           \
                                asm ("rorq %1,%0"       \
                                : "=r"(ret)             \
                                : "J"(n),"0"(a)         \
                                : "cc"); ret;           })
-#  elif defined(_ARCH_PPC) && defined(__64BIT__)
+#   if !defined(B_ENDIAN)
+#    define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \
+                               asm ("bswapq    %0"             \
+                               : "=r"(ret)                     \
+                               : "0"(ret)); ret;               })
+#   endif
+#  elif (defined(__i386) || defined(__i386__)) && !defined(B_ENDIAN)
+#   if defined(I386_ONLY)
+#    define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
+                       unsigned int hi,lo;                     \
+                               asm("xchgb %%ah,%%al;xchgb %%dh,%%dl;"\
+                                   "roll $16,%%eax; roll $16,%%edx; "\
+                                   "xchgb %%ah,%%al;xchgb %%dh,%%dl;" \
+                               : "=a"(lo),"=d"(hi)             \
+                               : "0"(p[1]),"1"(p[0]) : "cc");  \
+                               ((SHA_LONG64)hi)<<32|lo;        })
+#   else
+#    define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
+                       unsigned int hi,lo;                     \
+                               asm ("bswapl %0; bswapl %1;"    \
+                               : "=r"(lo),"=r"(hi)             \
+                               : "0"(p[1]),"1"(p[0]));         \
+                               ((SHA_LONG64)hi)<<32|lo;        })
+#   endif
+#  elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
 #   define ROTR(a,n)   ({ unsigned long ret;           \
                                asm ("rotrdi %0,%1,%2"  \
                                : "=r"(ret)             \
                                : "r"(a),"K"(n)); ret;  })
 #  endif
+# elif defined(_MSC_VER)
+#  if defined(_WIN64)  /* applies to both IA-64 and AMD64 */
+#   define ROTR(a,n)   _rotr64((a),n)
+#  endif
+#  if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+#   if defined(I386_ONLY)
+    static SHA_LONG64 __fastcall __pull64be(const void *x)
+    {  _asm    mov     edx, [ecx + 0]
+       _asm    mov     eax, [ecx + 4]
+       _asm    xchg    dh,dl
+       _asm    xchg    ah,al
+       _asm    rol     edx,16
+       _asm    rol     eax,16
+       _asm    xchg    dh,dl
+       _asm    xchg    ah,al
+    }
+#   else
+    static SHA_LONG64 __fastcall __pull64be(const void *x)
+    {  _asm    mov     edx, [ecx + 0]
+       _asm    mov     eax, [ecx + 4]
+       _asm    bswap   edx
+       _asm    bswap   eax
+    }
+#   endif
+#   define PULL64(x) __pull64be(&(x))
+#   if _MSC_VER<=1200
+#    pragma inline_depth(0)
+#   endif
+#  endif
 # endif
 #endif
 
+#ifndef PULL64
+#define B(x,j)    (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
+#define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))
+#endif
+
 #ifndef ROTR
 #define ROTR(x,s)      (((x)>>s) | (x)<<(64-s))
 #endif
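
All of the PULL64 variants above, inline asm included, compute the same thing: a 64-bit big-endian load. The portable byte-OR fallback can be sanity-checked on any host (a throwaway sketch reusing the macros as defined above):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t SHA_LONG64;
    #define B(x,j)    (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
    #define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))

    int main(void)
    {
        unsigned char buf[8] = {0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef};
        /* A big-endian read must print 0123456789abcdef on any host. */
        printf("%016llx\n", (unsigned long long)PULL64(*buf));
        return 0;
    }

The bswap/rorq paths and the __pull64be helper are just faster spellings of the same read; the !defined(B_ENDIAN) guards restrict the byte-swapping variants to little-endian hosts, where a swap is actually needed.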
@@ -357,7 +398,7 @@ static const SHA_LONG64 K512[80] = {
 
 #ifdef OPENSSL_SMALL_FOOTPRINT
 
-static void sha512_block (SHA512_CTX *ctx, const void *in, size_t num)
+static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
        {
        const SHA_LONG64 *W=in;
        SHA_LONG64      a,b,c,d,e,f,g,h,s0,s1,T1,T2;
@@ -418,7 +459,7 @@ static void sha512_block (SHA512_CTX *ctx, const void *in, size_t num)
        T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f];    \
        ROUND_00_15(i,a,b,c,d,e,f,g,h);         } while (0)
 
-static void sha512_block (SHA512_CTX *ctx, const void *in, size_t num)
+static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num)
        {
        const SHA_LONG64 *W=in;
        SHA_LONG64      a,b,c,d,e,f,g,h,s0,s1,T1;
index 6281313a455680e84c5a3b8bebeb7e1549571d87..e37e5726e33dcb08ade15fa31eaa91c09bda40d0 100644 (file)
--- a/crypto/sha/sha_locl.h
+++ b/crypto/sha/sha_locl.h
 #include <openssl/opensslconf.h>
 #include <openssl/sha.h>
 
-#ifndef SHA_LONG_LOG2
-#define SHA_LONG_LOG2  2       /* default to 32 bits */
-#endif
-
 #define DATA_ORDER_IS_BIG_ENDIAN
 
 #define HASH_LONG               SHA_LONG
-#define HASH_LONG_LOG2          SHA_LONG_LOG2
 #define HASH_CTX                SHA_CTX
 #define HASH_CBLOCK             SHA_CBLOCK
-#define HASH_LBLOCK             SHA_LBLOCK
 #define HASH_MAKE_STRING(c,s)   do {   \
        unsigned long ll;               \
        ll=(c)->h0; HOST_l2c(ll,(s));   \
 # define HASH_TRANSFORM                SHA_Transform
 # define HASH_FINAL                    SHA_Final
 # define HASH_INIT                     SHA_Init
-# define HASH_BLOCK_HOST_ORDER         sha_block_host_order
 # define HASH_BLOCK_DATA_ORDER         sha_block_data_order
 # define Xupdate(a,ix,ia,ib,ic,id)     (ix=(a)=(ia^ib^ic^id))
 
-  void sha_block_host_order (SHA_CTX *c, const void *p,size_t num);
-  void sha_block_data_order (SHA_CTX *c, const void *p,size_t num);
+static void sha_block_data_order (SHA_CTX *c, const void *p,size_t num);
 
 #elif defined(SHA_1)
 
 # define HASH_TRANSFORM                SHA1_Transform
 # define HASH_FINAL                    SHA1_Final
 # define HASH_INIT                     SHA1_Init
-# define HASH_BLOCK_HOST_ORDER         sha1_block_host_order
 # define HASH_BLOCK_DATA_ORDER         sha1_block_data_order
 # if defined(__MWERKS__) && defined(__MC68K__)
    /* Metrowerks for Motorola fails otherwise:-( <appro@fy.chalmers.se> */
                                        )
 # endif
 
-# ifdef SHA1_ASM
-#  if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__)
-#   if !defined(B_ENDIAN)
-#    define sha1_block_host_order              sha1_block_asm_host_order
-#    define DONT_IMPLEMENT_BLOCK_HOST_ORDER
-#    define sha1_block_data_order              sha1_block_asm_data_order
-#    define DONT_IMPLEMENT_BLOCK_DATA_ORDER
-#    define HASH_BLOCK_DATA_ORDER_ALIGNED      sha1_block_asm_data_order
-#   endif
-#  elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
-#   define sha1_block_host_order               sha1_block_asm_host_order
-#   define DONT_IMPLEMENT_BLOCK_HOST_ORDER
-#   define sha1_block_data_order               sha1_block_asm_data_order
-#   define DONT_IMPLEMENT_BLOCK_DATA_ORDER
-#  endif
-# endif
-  void sha1_block_host_order (SHA_CTX *c, const void *p,size_t num);
-  void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num);
+#ifndef SHA1_ASM
+static
+#endif
+void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num);
 
 #else
 # error "Either SHA_0 or SHA_1 must be defined."
@@ -229,133 +206,8 @@ int HASH_INIT (SHA_CTX *c)
 # define X(i)  XX[i]
 #endif
 
-#ifndef DONT_IMPLEMENT_BLOCK_HOST_ORDER
-void HASH_BLOCK_HOST_ORDER (SHA_CTX *c, const void *d, size_t num)
-       {
-       const SHA_LONG *W=d;
-       register unsigned MD32_REG_T A,B,C,D,E,T;
-#ifndef MD32_XARRAY
-       unsigned MD32_REG_T     XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7,
-                               XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15;
-#else
-       SHA_LONG        XX[16];
-#endif
-
-       A=c->h0;
-       B=c->h1;
-       C=c->h2;
-       D=c->h3;
-       E=c->h4;
-
-       for (;;)
-               {
-       BODY_00_15( 0,A,B,C,D,E,T,W[ 0]);
-       BODY_00_15( 1,T,A,B,C,D,E,W[ 1]);
-       BODY_00_15( 2,E,T,A,B,C,D,W[ 2]);
-       BODY_00_15( 3,D,E,T,A,B,C,W[ 3]);
-       BODY_00_15( 4,C,D,E,T,A,B,W[ 4]);
-       BODY_00_15( 5,B,C,D,E,T,A,W[ 5]);
-       BODY_00_15( 6,A,B,C,D,E,T,W[ 6]);
-       BODY_00_15( 7,T,A,B,C,D,E,W[ 7]);
-       BODY_00_15( 8,E,T,A,B,C,D,W[ 8]);
-       BODY_00_15( 9,D,E,T,A,B,C,W[ 9]);
-       BODY_00_15(10,C,D,E,T,A,B,W[10]);
-       BODY_00_15(11,B,C,D,E,T,A,W[11]);
-       BODY_00_15(12,A,B,C,D,E,T,W[12]);
-       BODY_00_15(13,T,A,B,C,D,E,W[13]);
-       BODY_00_15(14,E,T,A,B,C,D,W[14]);
-       BODY_00_15(15,D,E,T,A,B,C,W[15]);
-
-       BODY_16_19(16,C,D,E,T,A,B,X( 0),W[ 0],W[ 2],W[ 8],W[13]);
-       BODY_16_19(17,B,C,D,E,T,A,X( 1),W[ 1],W[ 3],W[ 9],W[14]);
-       BODY_16_19(18,A,B,C,D,E,T,X( 2),W[ 2],W[ 4],W[10],W[15]);
-       BODY_16_19(19,T,A,B,C,D,E,X( 3),W[ 3],W[ 5],W[11],X( 0));
-
-       BODY_20_31(20,E,T,A,B,C,D,X( 4),W[ 4],W[ 6],W[12],X( 1));
-       BODY_20_31(21,D,E,T,A,B,C,X( 5),W[ 5],W[ 7],W[13],X( 2));
-       BODY_20_31(22,C,D,E,T,A,B,X( 6),W[ 6],W[ 8],W[14],X( 3));
-       BODY_20_31(23,B,C,D,E,T,A,X( 7),W[ 7],W[ 9],W[15],X( 4));
-       BODY_20_31(24,A,B,C,D,E,T,X( 8),W[ 8],W[10],X( 0),X( 5));
-       BODY_20_31(25,T,A,B,C,D,E,X( 9),W[ 9],W[11],X( 1),X( 6));
-       BODY_20_31(26,E,T,A,B,C,D,X(10),W[10],W[12],X( 2),X( 7));
-       BODY_20_31(27,D,E,T,A,B,C,X(11),W[11],W[13],X( 3),X( 8));
-       BODY_20_31(28,C,D,E,T,A,B,X(12),W[12],W[14],X( 4),X( 9));
-       BODY_20_31(29,B,C,D,E,T,A,X(13),W[13],W[15],X( 5),X(10));
-       BODY_20_31(30,A,B,C,D,E,T,X(14),W[14],X( 0),X( 6),X(11));
-       BODY_20_31(31,T,A,B,C,D,E,X(15),W[15],X( 1),X( 7),X(12));
-
-       BODY_32_39(32,E,T,A,B,C,D,X( 0),X( 2),X( 8),X(13));
-       BODY_32_39(33,D,E,T,A,B,C,X( 1),X( 3),X( 9),X(14));
-       BODY_32_39(34,C,D,E,T,A,B,X( 2),X( 4),X(10),X(15));
-       BODY_32_39(35,B,C,D,E,T,A,X( 3),X( 5),X(11),X( 0));
-       BODY_32_39(36,A,B,C,D,E,T,X( 4),X( 6),X(12),X( 1));
-       BODY_32_39(37,T,A,B,C,D,E,X( 5),X( 7),X(13),X( 2));
-       BODY_32_39(38,E,T,A,B,C,D,X( 6),X( 8),X(14),X( 3));
-       BODY_32_39(39,D,E,T,A,B,C,X( 7),X( 9),X(15),X( 4));
-
-       BODY_40_59(40,C,D,E,T,A,B,X( 8),X(10),X( 0),X( 5));
-       BODY_40_59(41,B,C,D,E,T,A,X( 9),X(11),X( 1),X( 6));
-       BODY_40_59(42,A,B,C,D,E,T,X(10),X(12),X( 2),X( 7));
-       BODY_40_59(43,T,A,B,C,D,E,X(11),X(13),X( 3),X( 8));
-       BODY_40_59(44,E,T,A,B,C,D,X(12),X(14),X( 4),X( 9));
-       BODY_40_59(45,D,E,T,A,B,C,X(13),X(15),X( 5),X(10));
-       BODY_40_59(46,C,D,E,T,A,B,X(14),X( 0),X( 6),X(11));
-       BODY_40_59(47,B,C,D,E,T,A,X(15),X( 1),X( 7),X(12));
-       BODY_40_59(48,A,B,C,D,E,T,X( 0),X( 2),X( 8),X(13));
-       BODY_40_59(49,T,A,B,C,D,E,X( 1),X( 3),X( 9),X(14));
-       BODY_40_59(50,E,T,A,B,C,D,X( 2),X( 4),X(10),X(15));
-       BODY_40_59(51,D,E,T,A,B,C,X( 3),X( 5),X(11),X( 0));
-       BODY_40_59(52,C,D,E,T,A,B,X( 4),X( 6),X(12),X( 1));
-       BODY_40_59(53,B,C,D,E,T,A,X( 5),X( 7),X(13),X( 2));
-       BODY_40_59(54,A,B,C,D,E,T,X( 6),X( 8),X(14),X( 3));
-       BODY_40_59(55,T,A,B,C,D,E,X( 7),X( 9),X(15),X( 4));
-       BODY_40_59(56,E,T,A,B,C,D,X( 8),X(10),X( 0),X( 5));
-       BODY_40_59(57,D,E,T,A,B,C,X( 9),X(11),X( 1),X( 6));
-       BODY_40_59(58,C,D,E,T,A,B,X(10),X(12),X( 2),X( 7));
-       BODY_40_59(59,B,C,D,E,T,A,X(11),X(13),X( 3),X( 8));
-
-       BODY_60_79(60,A,B,C,D,E,T,X(12),X(14),X( 4),X( 9));
-       BODY_60_79(61,T,A,B,C,D,E,X(13),X(15),X( 5),X(10));
-       BODY_60_79(62,E,T,A,B,C,D,X(14),X( 0),X( 6),X(11));
-       BODY_60_79(63,D,E,T,A,B,C,X(15),X( 1),X( 7),X(12));
-       BODY_60_79(64,C,D,E,T,A,B,X( 0),X( 2),X( 8),X(13));
-       BODY_60_79(65,B,C,D,E,T,A,X( 1),X( 3),X( 9),X(14));
-       BODY_60_79(66,A,B,C,D,E,T,X( 2),X( 4),X(10),X(15));
-       BODY_60_79(67,T,A,B,C,D,E,X( 3),X( 5),X(11),X( 0));
-       BODY_60_79(68,E,T,A,B,C,D,X( 4),X( 6),X(12),X( 1));
-       BODY_60_79(69,D,E,T,A,B,C,X( 5),X( 7),X(13),X( 2));
-       BODY_60_79(70,C,D,E,T,A,B,X( 6),X( 8),X(14),X( 3));
-       BODY_60_79(71,B,C,D,E,T,A,X( 7),X( 9),X(15),X( 4));
-       BODY_60_79(72,A,B,C,D,E,T,X( 8),X(10),X( 0),X( 5));
-       BODY_60_79(73,T,A,B,C,D,E,X( 9),X(11),X( 1),X( 6));
-       BODY_60_79(74,E,T,A,B,C,D,X(10),X(12),X( 2),X( 7));
-       BODY_60_79(75,D,E,T,A,B,C,X(11),X(13),X( 3),X( 8));
-       BODY_60_79(76,C,D,E,T,A,B,X(12),X(14),X( 4),X( 9));
-       BODY_60_79(77,B,C,D,E,T,A,X(13),X(15),X( 5),X(10));
-       BODY_60_79(78,A,B,C,D,E,T,X(14),X( 0),X( 6),X(11));
-       BODY_60_79(79,T,A,B,C,D,E,X(15),X( 1),X( 7),X(12));
-       
-       c->h0=(c->h0+E)&0xffffffffL; 
-       c->h1=(c->h1+T)&0xffffffffL;
-       c->h2=(c->h2+A)&0xffffffffL;
-       c->h3=(c->h3+B)&0xffffffffL;
-       c->h4=(c->h4+C)&0xffffffffL;
-
-       if (--num == 0) break;
-
-       A=c->h0;
-       B=c->h1;
-       C=c->h2;
-       D=c->h3;
-       E=c->h4;
-
-       W+=SHA_LBLOCK;
-               }
-       }
-#endif
-
-#ifndef DONT_IMPLEMENT_BLOCK_DATA_ORDER
-void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
+#if !defined(SHA_1) || !defined(SHA1_ASM)
+static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
        {
        const unsigned char *data=p;
        register unsigned MD32_REG_T A,B,C,D,E,T,l;
@@ -373,25 +225,53 @@ void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
        E=c->h4;
 
        for (;;)
-               {
+                       {
+       const union { long one; char little; } is_endian = {1};
 
-       HOST_c2l(data,l); X( 0)=l;              HOST_c2l(data,l); X( 1)=l;
-       BODY_00_15( 0,A,B,C,D,E,T,X( 0));       HOST_c2l(data,l); X( 2)=l;
-       BODY_00_15( 1,T,A,B,C,D,E,X( 1));       HOST_c2l(data,l); X( 3)=l;
-       BODY_00_15( 2,E,T,A,B,C,D,X( 2));       HOST_c2l(data,l); X( 4)=l;
-       BODY_00_15( 3,D,E,T,A,B,C,X( 3));       HOST_c2l(data,l); X( 5)=l;
-       BODY_00_15( 4,C,D,E,T,A,B,X( 4));       HOST_c2l(data,l); X( 6)=l;
-       BODY_00_15( 5,B,C,D,E,T,A,X( 5));       HOST_c2l(data,l); X( 7)=l;
-       BODY_00_15( 6,A,B,C,D,E,T,X( 6));       HOST_c2l(data,l); X( 8)=l;
-       BODY_00_15( 7,T,A,B,C,D,E,X( 7));       HOST_c2l(data,l); X( 9)=l;
-       BODY_00_15( 8,E,T,A,B,C,D,X( 8));       HOST_c2l(data,l); X(10)=l;
-       BODY_00_15( 9,D,E,T,A,B,C,X( 9));       HOST_c2l(data,l); X(11)=l;
-       BODY_00_15(10,C,D,E,T,A,B,X(10));       HOST_c2l(data,l); X(12)=l;
-       BODY_00_15(11,B,C,D,E,T,A,X(11));       HOST_c2l(data,l); X(13)=l;
-       BODY_00_15(12,A,B,C,D,E,T,X(12));       HOST_c2l(data,l); X(14)=l;
-       BODY_00_15(13,T,A,B,C,D,E,X(13));       HOST_c2l(data,l); X(15)=l;
-       BODY_00_15(14,E,T,A,B,C,D,X(14));
-       BODY_00_15(15,D,E,T,A,B,C,X(15));
+       if (!is_endian.little && sizeof(SHA_LONG)==4 && ((size_t)p%4)==0)
+               {
+               const SHA_LONG *W=(const SHA_LONG *)data;
+
+               X( 0) = W[0];                           X( 1) = W[ 1];
+               BODY_00_15( 0,A,B,C,D,E,T,X( 0));       X( 2) = W[ 2];
+               BODY_00_15( 1,T,A,B,C,D,E,X( 1));       X( 3) = W[ 3];
+               BODY_00_15( 2,E,T,A,B,C,D,X( 2));       X( 4) = W[ 4];
+               BODY_00_15( 3,D,E,T,A,B,C,X( 3));       X( 5) = W[ 5];
+               BODY_00_15( 4,C,D,E,T,A,B,X( 4));       X( 6) = W[ 6];
+               BODY_00_15( 5,B,C,D,E,T,A,X( 5));       X( 7) = W[ 7];
+               BODY_00_15( 6,A,B,C,D,E,T,X( 6));       X( 8) = W[ 8];
+               BODY_00_15( 7,T,A,B,C,D,E,X( 7));       X( 9) = W[ 9];
+               BODY_00_15( 8,E,T,A,B,C,D,X( 8));       X(10) = W[10];
+               BODY_00_15( 9,D,E,T,A,B,C,X( 9));       X(11) = W[11];
+               BODY_00_15(10,C,D,E,T,A,B,X(10));       X(12) = W[12];
+               BODY_00_15(11,B,C,D,E,T,A,X(11));       X(13) = W[13];
+               BODY_00_15(12,A,B,C,D,E,T,X(12));       X(14) = W[14];
+               BODY_00_15(13,T,A,B,C,D,E,X(13));       X(15) = W[15];
+               BODY_00_15(14,E,T,A,B,C,D,X(14));
+               BODY_00_15(15,D,E,T,A,B,C,X(15));
+
+               data += SHA_CBLOCK;
+               }
+       else
+               {
+               HOST_c2l(data,l); X( 0)=l;              HOST_c2l(data,l); X( 1)=l;
+               BODY_00_15( 0,A,B,C,D,E,T,X( 0));       HOST_c2l(data,l); X( 2)=l;
+               BODY_00_15( 1,T,A,B,C,D,E,X( 1));       HOST_c2l(data,l); X( 3)=l;
+               BODY_00_15( 2,E,T,A,B,C,D,X( 2));       HOST_c2l(data,l); X( 4)=l;
+               BODY_00_15( 3,D,E,T,A,B,C,X( 3));       HOST_c2l(data,l); X( 5)=l;
+               BODY_00_15( 4,C,D,E,T,A,B,X( 4));       HOST_c2l(data,l); X( 6)=l;
+               BODY_00_15( 5,B,C,D,E,T,A,X( 5));       HOST_c2l(data,l); X( 7)=l;
+               BODY_00_15( 6,A,B,C,D,E,T,X( 6));       HOST_c2l(data,l); X( 8)=l;
+               BODY_00_15( 7,T,A,B,C,D,E,X( 7));       HOST_c2l(data,l); X( 9)=l;
+               BODY_00_15( 8,E,T,A,B,C,D,X( 8));       HOST_c2l(data,l); X(10)=l;
+               BODY_00_15( 9,D,E,T,A,B,C,X( 9));       HOST_c2l(data,l); X(11)=l;
+               BODY_00_15(10,C,D,E,T,A,B,X(10));       HOST_c2l(data,l); X(12)=l;
+               BODY_00_15(11,B,C,D,E,T,A,X(11));       HOST_c2l(data,l); X(13)=l;
+               BODY_00_15(12,A,B,C,D,E,T,X(12));       HOST_c2l(data,l); X(14)=l;
+               BODY_00_15(13,T,A,B,C,D,E,X(13));       HOST_c2l(data,l); X(15)=l;
+               BODY_00_15(14,E,T,A,B,C,D,X(14));
+               BODY_00_15(15,D,E,T,A,B,C,X(15));
+               }
 
        BODY_16_19(16,C,D,E,T,A,B,X( 0),X( 0),X( 2),X( 8),X(13));
        BODY_16_19(17,B,C,D,E,T,A,X( 1),X( 1),X( 3),X( 9),X(14));
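
The is_endian union introduced above is the runtime endianness probe that lets one routine serve both byte orders: .little aliases the lowest-addressed byte of the long 1, so it reads 1 exactly on little-endian hosts, and any optimizing compiler folds the test to a constant and deletes the dead branch. Standalone (a sketch of the same trick):

    #include <stdio.h>

    int main(void)
    {
        /* .little overlays the first byte of .one, so it is 1 iff the
         * host stores the least significant byte first. */
        const union { long one; char little; } is_endian = { 1 };

        if (is_endian.little)
            printf("little-endian host: byte-order conversion needed\n");
        else
            printf("big-endian host: aligned words can be used directly\n");
        return 0;
    }

With the test constant-folded, the aligned big-endian fast path above costs nothing on little-endian builds.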
@@ -476,7 +356,7 @@ void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
        D=c->h3;
        E=c->h4;
 
-               }
+                       }
        }
 #endif
 
@@ -511,54 +391,8 @@ void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
        E=D, D=C, C=ROTATE(B,30), B=A;  \
        A=ROTATE(A,5)+T+xa;         } while(0)
 
-#ifndef DONT_IMPLEMENT_BLOCK_HOST_ORDER
-void HASH_BLOCK_HOST_ORDER (SHA_CTX *c, const void *d, size_t num)
-       {
-       const SHA_LONG *W=d;
-       register unsigned MD32_REG_T A,B,C,D,E,T;
-       int i;
-       SHA_LONG        X[16];
-
-       A=c->h0;
-       B=c->h1;
-       C=c->h2;
-       D=c->h3;
-       E=c->h4;
-
-       for (;;)
-               {
-       for (i=0;i<16;i++)
-       { X[i]=W[i]; BODY_00_15(X[i]); }
-       for (i=0;i<4;i++)
-       { BODY_16_19(X[i],       X[i+2],      X[i+8],     X[(i+13)&15]); }
-       for (;i<24;i++)
-       { BODY_20_39(X[i&15],    X[(i+2)&15], X[(i+8)&15],X[(i+13)&15]); }
-       for (i=0;i<20;i++)
-       { BODY_40_59(X[(i+8)&15],X[(i+10)&15],X[i&15],    X[(i+5)&15]);  }
-       for (i=4;i<24;i++)
-       { BODY_60_79(X[(i+8)&15],X[(i+10)&15],X[i&15],    X[(i+5)&15]);  }
-       
-       c->h0=(c->h0+A)&0xffffffffL; 
-       c->h1=(c->h1+B)&0xffffffffL;
-       c->h2=(c->h2+C)&0xffffffffL;
-       c->h3=(c->h3+D)&0xffffffffL;
-       c->h4=(c->h4+E)&0xffffffffL;
-
-       if (--num == 0) break;
-
-       A=c->h0;
-       B=c->h1;
-       C=c->h2;
-       D=c->h3;
-       E=c->h4;
-
-       W+=SHA_LBLOCK;
-               }
-       }
-#endif
-
-#ifndef DONT_IMPLEMENT_BLOCK_DATA_ORDER
-void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
+#if !defined(SHA_1) || !defined(SHA1_ASM)
+static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num)
        {
        const unsigned char *data=p;
        register unsigned MD32_REG_T A,B,C,D,E,T,l;