Reorganize and speed up MD5.
author Ulf Möller <ulf@openssl.org>
Thu, 13 May 1999 13:16:42 +0000 (13:16 +0000)
committer Ulf Möller <ulf@openssl.org>
Thu, 13 May 1999 13:16:42 +0000 (13:16 +0000)
Submitted by: Andy Polyakov <appro@fy.chalmers.se>

CHANGES
Configure
crypto/md32_common.h [new file with mode: 0644]
crypto/md5/Makefile.ssl
crypto/md5/asm/md5-586.pl
crypto/md5/asm/md5-sparcv9.S [new file with mode: 0644]
crypto/md5/md5.h
crypto/md5/md5_dgst.c
crypto/md5/md5_locl.h
crypto/md5/md5_one.c

diff --git a/CHANGES b/CHANGES
index 770bd19..a3a90dd 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -5,6 +5,9 @@
 
  Changes between 0.9.2b and 0.9.3
 
+  *) Reorganize and speed up MD5.
+     [Andy Polyakov <appro@fy.chalmers.se>]
+
   *) VMS support.
      [Richard Levitte <richard@levitte.org>]
 
index b7f910f..948da07 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -106,7 +106,7 @@ my %table=(
 # Solaris setups
 "solaris-x86-gcc","gcc:-O3 -fomit-frame-pointer -m486 -Wall -DL_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG $x86_gcc_des $x86_gcc_opts:$x86_sol_asm",
 "solaris-sparc-gcc","gcc:-O3 -fomit-frame-pointer -mv8 -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8.o::",
-"solaris-usparc-gcc","gcc:-O3 -fomit-frame-pointer -mcpu=ultrasparc -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o::",
+"solaris-usparc-gcc","gcc:-O3 -fomit-frame-pointer -mcpu=ultrasparc -Wall -DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o:::asm/md5-sparcv8plus.o:",
 "debug-solaris-sparc-gcc","gcc:-O3 -g -mv8 -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:::",
 "debug-solaris-usparc-gcc","gcc:-O3 -g -mcpu=ultrasparc -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o::",
 
@@ -115,12 +115,11 @@ my %table=(
 # SC4 is ok, better than gcc even on bn as long as you tell it -xarch=v8
 # -fast slows things like DES down quite a lot
 # Don't use -xtarget=ultra with SC4.2. It is broken, and will break exptest.
-# SC5.0 with the compiler common patch works.
 "solaris-sparc-sc4","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
 "solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::",
 # SC5.0 note: Compiler common patch 107357-01 or later is required!
-"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::",
-"solaris64-usparc-sc5","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR:::",
+"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:",
+"solaris64-usparc-sc5","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:",
 
 # Sunos configs, assuming sparc for the gcc one.
 ##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST:(unknown)::DES_UNROLL:::",
diff --git a/crypto/md32_common.h b/crypto/md32_common.h
new file mode 100644 (file)
index 0000000..977ea8e
--- /dev/null
@@ -0,0 +1,592 @@
+/* crypto/md32_common.h */
+/* ====================================================================
+ * Copyright (c) 1999 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+/*
+ * This is a generic 32 bit "collector" for message digest algorithms.
+ * Whenever needed it collects input character stream into chunks of
+ * 32 bit values and invokes a block function that performs actual hash
+ * calculations.
+ *
+ * Porting guide.
+ *
+ * Obligatory macros:
+ *
+ * DATA_ORDER_IS_BIG_ENDIAN or DATA_ORDER_IS_LITTLE_ENDIAN
+ *     this macro defines byte order of input stream.
+ * HASH_CBLOCK
+ *     size of a unit chunk HASH_BLOCK operates on.
+ * HASH_LONG
+ *     has to be at least 32 bit wide, if it's wider, then
+ *     HASH_LONG_LOG2 *has to* be defined along
+ * HASH_CTX
+ *     context structure that at least contains following
+ *     members:
+ *             typedef struct {
+ *                     ...
+ *                     HASH_LONG       Nl,Nh;
+ *                     HASH_LONG       data[HASH_LBLOCK];
+ *                     int             num;
+ *                     ...
+ *                     } HASH_CTX;
+ * HASH_UPDATE
+ *     name of "Update" function, implemented here.
+ * HASH_TRANSFORM
+ *     name of "Transform" function, implemented here.
+ * HASH_FINAL
+ *     name of "Final" function, implemented here.
+ * HASH_BLOCK_HOST_ORDER
+ *     name of "block" function treating *aligned* input message
+ *     in host byte order, implemented externally.
+ * HASH_BLOCK_DATA_ORDER
+ *     name of "block" function treating *unaligned* input message
+ *     in original (data) byte order, implemented externally (it
+ *     actually is optional if data and host are of the same
+ *     "endianness").
+ *
+ * Optional macros:
+ *
+ * B_ENDIAN or L_ENDIAN
+ *     defines host byte-order.
+ * HASH_LONG_LOG2
+ *     defaults to 2 if not stated otherwise.
+ * HASH_LBLOCK
+ *     assumed to be HASH_CBLOCK/4 if not stated otherwise.
+ * HASH_BLOCK_DATA_ORDER_ALIGNED
+ *     alternative "block" function capable of treating
+ *     aligned input message in original (data) order,
+ *     implemented externally.
+ *
+ * MD5 example:
+ *
+ *     #define DATA_ORDER_IS_LITTLE_ENDIAN
+ *
+ *     #define HASH_LONG               MD5_LONG
+ *     #define HASH_LONG_LOG2          MD5_LONG_LOG2
+ *     #define HASH_CTX                MD5_CTX
+ *     #define HASH_CBLOCK             MD5_CBLOCK
+ *     #define HASH_LBLOCK             MD5_LBLOCK
+ *     #define HASH_UPDATE             MD5_Update
+ *     #define HASH_TRANSFORM          MD5_Transform
+ *     #define HASH_FINAL              MD5_Final
+ *     #define HASH_BLOCK_HOST_ORDER   md5_block_host_order
+ *     #define HASH_BLOCK_DATA_ORDER   md5_block_data_order
+ *
+ *                                     <appro@fy.chalmers.se>
+ */
+
+#if !defined(DATA_ORDER_IS_BIG_ENDIAN) && !defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+#error "DATA_ORDER must be defined!"
+#endif
+/* Obligatory: block size in bytes, word type and context structure. */
+#ifndef HASH_CBLOCK
+#error "HASH_CBLOCK must be defined!"
+#endif
+#ifndef HASH_LONG
+#error "HASH_LONG must be defined!"
+#endif
+#ifndef HASH_CTX
+#error "HASH_CTX must be defined!"
+#endif
+/* Obligatory: names of the three functions implemented by this header. */
+#ifndef HASH_UPDATE
+#error "HASH_UPDATE must be defined!"
+#endif
+#ifndef HASH_TRANSFORM
+#error "HASH_TRANSFORM must be defined!"
+#endif
+#ifndef HASH_FINAL
+#error "HASH_FINAL must be defined!"
+#endif
+/* Obligatory: name of the externally implemented host-order block function. */
+#ifndef HASH_BLOCK_HOST_ORDER
+#error "HASH_BLOCK_HOST_ORDER must be defined!"
+#endif
+
+#if 0
+/*
+ * Moved below as it's required only if HASH_BLOCK_DATA_ORDER_ALIGNED
+ * isn't defined.
+ */
+#ifndef HASH_BLOCK_DATA_ORDER
+#error "HASH_BLOCK_DATA_ORDER must be defined!"
+#endif
+#endif /* 0 */
+/* Defaults for the optional parameters. */
+#ifndef HASH_LBLOCK
+#define HASH_LBLOCK    (HASH_CBLOCK/4) /* block length counted in 4 byte words */
+#endif
+
+#ifndef HASH_LONG_LOG2
+#define HASH_LONG_LOG2 2       /* 2 means sizeof(HASH_LONG)==4 */
+#endif
+
+/*
+ * Engage compiler specific rotate intrinsic function if available.
+ */
+#undef ROTATE
+#ifndef PEDANTIC
+# if defined(_MSC_VER)
+#  define ROTATE(a,n)     _lrotl(a,n)
+# elif defined(__GNUC__) && __GNUC__>=2
+  /*
+   * Some GNU C inline assembler templates. Note that these are
+   * rotates by *constant* number of bits! But that's exactly
+   * what we need here...
+   * (n is bound with the "I", i.e. immediate, operand constraint.)
+   *                                   <appro@fy.chalmers.se>
+   */
+#  if defined(__i386)
+#   define ROTATE(a,n) ({ register unsigned int ret;   \
+                               asm volatile (          \
+                               "roll %1,%0"            \
+                               : "=r"(ret)             \
+                               : "I"(n), "0"(a)        \
+                               : "cc");                \
+                          ret;                         \
+                       })
+#  elif defined(__powerpc)
+#   define ROTATE(a,n) ({ register unsigned int ret;   \
+                               asm volatile (          \
+                               "rlwinm %0,%1,%2,0,31"  \
+                               : "=r"(ret)             \
+                               : "r"(a), "I"(n));      \
+                          ret;                         \
+                       })
+#  endif
+# endif
+
+/*
+ * Engage compiler specific "fetch in reverse byte order" intrinsic if
+ * available. NB: BE_FETCH32 takes a value, LE_FETCH32 takes an address.
+ */
+# if defined(__GNUC__) && __GNUC__>=2
+  /* some GNU C inline assembler templates by <appro@fy.chalmers.se> */
+#  if defined(__i386) && !defined(I386_ONLY)
+#   define BE_FETCH32(a)       ({ register unsigned int l=(a);\
+                               asm volatile (          \
+                               "bswapl %0"             \
+                               : "=r"(l) : "0"(l));    \
+                         l;                            \
+                       })
+#  elif defined(__powerpc)
+#   define LE_FETCH32(a)       ({ register unsigned int l;     \
+                               asm volatile (          \
+                               "lwbrx %0,0,%1"         \
+                               : "=r"(l)               \
+                               : "r"(a));              \
+                          l;                           \
+                       })
+
+#  elif defined(__sparc) && defined(ULTRASPARC)
+#  define LE_FETCH32(a)        ({ register unsigned int l;             \
+                               asm volatile (                  \
+                               "lda [%1]#ASI_PRIMARY_LITTLE,%0"\
+                               : "=r"(l)                       \
+                               : "r"(a));                      \
+                          l;                                   \
+                       })
+#  endif
+# endif
+#endif /* PEDANTIC */
+
+#if HASH_LONG_LOG2==2  /* Engage only if sizeof(HASH_LONG)== 4 */
+/* A nice byte order reversal from Wei Dai <weidai@eskimo.com> */
+#ifdef ROTATE
+/* 5 instructions with rotate instruction, else 9 */
+#define REVERSE_FETCH32(a,l)   (                                       \
+               l=*(const HASH_LONG *)(a),                              \
+               ((ROTATE(l,8)&0x00FF00FF)|(ROTATE((l&0x00FF00FF),24)))  \
+                               )
+#else
+/* 6 instructions with rotate instruction, else 8 */
+#define REVERSE_FETCH32(a,l)   (                               \
+               l=*(const HASH_LONG *)(a),                      \
+               l=(((l>>8)&0x00FF00FF)|((l&0x00FF00FF)<<8)),    \
+               ROTATE(l,16)                                    \
+                               )
+/*
+ * Originally the middle line started with l=(((l&0xFF00FF00)>>8)|...
+ * It's rewritten as above for two reasons:
+ *     - RISCs aren't good at long constants and have to explicitly
+ *       compose 'em with several (well, usually 2) instructions in a
+ *       register before performing the actual operation and (as you
+ *       already realized:-) having same constant should inspire the
+ *       compiler to permanently allocate the only register for it;
+ *     - most modern CPUs have two ALUs, but usually only one has
+ *       circuitry for shifts:-( this minor tweak inspires compiler
+ *       to schedule shift instructions in a better way...
+ *
+ *                             <appro@fy.chalmers.se>
+ */
+#endif /* ROTATE */
+#endif /* HASH_LONG_LOG2==2 */
+
+#ifndef ROTATE /* no intrinsic was selected above: portable 32 bit rotate left */
+#define ROTATE(a,n)     (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
+#endif
+
+/*
+ * Make some obvious choices. E.g., HASH_BLOCK_DATA_ORDER_ALIGNED
+ * and HASH_BLOCK_HOST_ORDER ought to be the same if input data
+ * and host are of the same "endianness". It's possible to mask
+ * this with blank #define HASH_BLOCK_DATA_ORDER though...
+ * Otherwise HOST_FETCH32(p,l) becomes a byte-reversing 32 bit load.
+ *                             <appro@fy.chalmers.se>
+ */
+#if defined(B_ENDIAN)
+#  if defined(DATA_ORDER_IS_BIG_ENDIAN)
+#    if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
+#      define HASH_BLOCK_DATA_ORDER_ALIGNED    HASH_BLOCK_HOST_ORDER
+#    endif
+#  elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+#    ifndef HOST_FETCH32
+#      ifdef LE_FETCH32
+#        define HOST_FETCH32(p,l)      LE_FETCH32(p)   /* LE_FETCH32 loads from the address itself */
+#      elif defined(REVERSE_FETCH32)
+#        define HOST_FETCH32(p,l)      REVERSE_FETCH32(p,l)
+#      endif
+#    endif
+#  endif
+#elif defined(L_ENDIAN)
+#  if defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+#    if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
+#      define HASH_BLOCK_DATA_ORDER_ALIGNED    HASH_BLOCK_HOST_ORDER
+#    endif
+#  elif defined(DATA_ORDER_IS_BIG_ENDIAN)
+#    ifndef HOST_FETCH32
+#      ifdef BE_FETCH32
+#        define HOST_FETCH32(p,l)      BE_FETCH32(*((const HASH_LONG *)(p)))   /* BE_FETCH32 swaps a value, so load *p first */
+#      elif defined(REVERSE_FETCH32)
+#        define HOST_FETCH32(p,l)      REVERSE_FETCH32(p,l)
+#      endif
+#    endif
+#  endif
+#endif
+
+#if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1
+#ifndef HASH_BLOCK_DATA_ORDER
+#error "HASH_BLOCK_DATA_ORDER must be defined!"
+#endif /* the data-order block function is mandatory when no aligned variant exists */
+#endif /* NOTE(review): an undefined macro evaluates to 0 in #if, so the !=1 operand is always true here */
+
+#if defined(DATA_ORDER_IS_BIG_ENDIAN) /* first input byte is the most significant one */
+/* HOST_c2l/HOST_l2c convert 4 bytes <-> one word; the _p variants handle partial words. */
+#define HOST_c2l(c,l)  (l =(((unsigned long)(*((c)++)))<<24),          \
+                        l|=(((unsigned long)(*((c)++)))<<16),          \
+                        l|=(((unsigned long)(*((c)++)))<< 8),          \
+                        l|=(((unsigned long)(*((c)++)))    ),          \
+                        l)
+#define HOST_p_c2l(c,l,n)      {                                       \
+                       switch (n) {                                    \
+                       case 0: l =((unsigned long)(*((c)++)))<<24;     \
+                       case 1: l|=((unsigned long)(*((c)++)))<<16;     \
+                       case 2: l|=((unsigned long)(*((c)++)))<< 8;     \
+                       case 3: l|=((unsigned long)(*((c)++)));         \
+                               } }
+#define HOST_p_c2l_p(c,l,sc,len) {                                     \
+                       switch (sc) {                                   \
+                       case 0: l =((unsigned long)(*((c)++)))<<24;     \
+                               if (--len == 0) break;                  \
+                       case 1: l|=((unsigned long)(*((c)++)))<<16;     \
+                               if (--len == 0) break;                  \
+                       case 2: l|=((unsigned long)(*((c)++)))<< 8;     \
+                               } }
+/* NOTE: c is advanced by n and then stepped back n times, so it ends up unchanged */
+#define HOST_c2l_p(c,l,n)      {                                       \
+                       l=0; (c)+=n;                                    \
+                       switch (n) {                                    \
+                       case 3: l =((unsigned long)(*(--(c))))<< 8;     \
+                       case 2: l|=((unsigned long)(*(--(c))))<<16;     \
+                       case 1: l|=((unsigned long)(*(--(c))))<<24;     \
+                               } }
+#define HOST_l2c(l,c)  (*((c)++)=(unsigned char)(((l)>>24)&0xff),      \
+                        *((c)++)=(unsigned char)(((l)>>16)&0xff),      \
+                        *((c)++)=(unsigned char)(((l)>> 8)&0xff),      \
+                        *((c)++)=(unsigned char)(((l)    )&0xff),      \
+                        l)
+
+#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN) /* first input byte is the least significant one */
+
+#define HOST_c2l(c,l)  (l =(((unsigned long)(*((c)++)))    ),          \
+                        l|=(((unsigned long)(*((c)++)))<< 8),          \
+                        l|=(((unsigned long)(*((c)++)))<<16),          \
+                        l|=(((unsigned long)(*((c)++)))<<24),          \
+                        l)
+#define HOST_p_c2l(c,l,n)      {                                       \
+                       switch (n) {                                    \
+                       case 0: l =((unsigned long)(*((c)++)));         \
+                       case 1: l|=((unsigned long)(*((c)++)))<< 8;     \
+                       case 2: l|=((unsigned long)(*((c)++)))<<16;     \
+                       case 3: l|=((unsigned long)(*((c)++)))<<24;     \
+                               } }
+#define HOST_p_c2l_p(c,l,sc,len) {                                     \
+                       switch (sc) {                                   \
+                       case 0: l =((unsigned long)(*((c)++)));         \
+                               if (--len == 0) break;                  \
+                       case 1: l|=((unsigned long)(*((c)++)))<< 8;     \
+                               if (--len == 0) break;                  \
+                       case 2: l|=((unsigned long)(*((c)++)))<<16;     \
+                               } }
+/* NOTE: c is advanced by n and then stepped back n times, so it ends up unchanged */
+#define HOST_c2l_p(c,l,n)      {                                       \
+                       l=0; (c)+=n;                                    \
+                       switch (n) {                                    \
+                       case 3: l =((unsigned long)(*(--(c))))<<16;     \
+                       case 2: l|=((unsigned long)(*(--(c))))<< 8;     \
+                       case 1: l|=((unsigned long)(*(--(c))));         \
+                               } }
+#define HOST_l2c(l,c)  (*((c)++)=(unsigned char)(((l)    )&0xff),      \
+                        *((c)++)=(unsigned char)(((l)>> 8)&0xff),      \
+                        *((c)++)=(unsigned char)(((l)>>16)&0xff),      \
+                        *((c)++)=(unsigned char)(((l)>>24)&0xff),      \
+                        l)
+
+#endif /* DATA_ORDER_IS_BIG_ENDIAN / DATA_ORDER_IS_LITTLE_ENDIAN */
+
+/*
+ * Time for some action:-)
+ */
+
+void HASH_UPDATE (HASH_CTX *c, const unsigned char *data, unsigned long len)
+       {       /* absorb len bytes at data into the hash context c */
+       register HASH_LONG * p;
+       register unsigned long l;
+       int sw,sc,ew,ec;
+       /* sw/sc: start word/byte within c->data, ew/ec: end word/byte */
+       if (len==0) return;
+       /* running message length in bits: low 32 bits in Nl, carry and the rest in Nh */
+       l=(c->Nl+(len<<3))&0xffffffffL;
+       /* 95-05-24 eay Fixed a bug with the overflow handling, thanks to
+        * Wei Dai <weidai@eskimo.com> for pointing it out. */
+       if (l < c->Nl) /* overflow */
+               c->Nh++;
+       c->Nh+=(len>>29);
+       c->Nl=l;
+       /* c->num bytes are already buffered in c->data: top the buffer up first */
+       if (c->num != 0)
+               {
+               p=c->data;
+               sw=c->num>>2;
+               sc=c->num&0x03;
+
+               if ((c->num+len) >= HASH_CBLOCK)
+                       {
+                       l=p[sw]; HOST_p_c2l(data,l,sc); p[sw++]=l;
+                       for (; sw<HASH_LBLOCK; sw++)
+                               {
+                               HOST_c2l(data,l); p[sw]=l;
+                               }
+                       HASH_BLOCK_HOST_ORDER (c,p,1);
+                       len-=(HASH_CBLOCK-c->num);
+                       c->num=0;
+                       /* drop through and do the rest */
+                       }
+               else
+                       {
+                       c->num+=len;
+                       if ((sc+len) < 4) /* ugly, add char's to a word */
+                               {
+                               l=p[sw]; HOST_p_c2l_p(data,l,sc,len); p[sw]=l;
+                               }
+                       else
+                               {
+                               ew=(c->num>>2);
+                               ec=(c->num&0x03);
+                               l=p[sw]; HOST_p_c2l(data,l,sc); p[sw++]=l;
+                               for (; sw < ew; sw++)
+                                       {
+                                       HOST_c2l(data,l); p[sw]=l;
+                                       }
+                               if (ec)
+                                       {
+                                       HOST_c2l_p(data,l,ec); p[sw]=l;
+                                       }
+                               }
+                       return;
+                       }
+               }
+       /* hash all complete blocks directly from the caller's buffer */
+       sw=len/HASH_CBLOCK;
+       if (sw > 0)
+               {
+#if defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1
+               /*
+                * Note that HASH_BLOCK_DATA_ORDER_ALIGNED gets defined
+                * only if sizeof(HASH_LONG)==4.
+                */
+               if ((((unsigned long)data)%4) == 0)
+                       {
+                       HASH_BLOCK_DATA_ORDER_ALIGNED (c,(HASH_LONG *)data,sw);
+                       sw*=HASH_CBLOCK;
+                       data+=sw;
+                       len-=sw;
+                       }
+               else
+#if !defined(HASH_BLOCK_DATA_ORDER)
+                       while (sw--)
+                               {
+                               memcpy (p=c->data,data,HASH_CBLOCK);
+                               HASH_BLOCK_DATA_ORDER_ALIGNED(c,p,1);
+                               data+=HASH_CBLOCK;
+                               len-=HASH_CBLOCK;
+                               }
+#endif
+#endif
+#if defined(HASH_BLOCK_DATA_ORDER)
+                       {
+                       HASH_BLOCK_DATA_ORDER (c,(HASH_LONG *)data,sw);
+                       sw*=HASH_CBLOCK;
+                       data+=sw;
+                       len-=sw;
+                       }
+#endif
+               }
+       /* buffer whatever is left in c->data for the next call */
+       if (len!=0)
+               {
+               p = c->data;
+               c->num = len;
+               ew=len>>2;      /* words to copy */
+               ec=len&0x03;
+               for (; ew; ew--,p++)
+                       {
+                       HOST_c2l(data,l); *p=l;
+                       }
+               HOST_c2l_p(data,l,ec);
+               *p=l;
+               }
+       }
+
+
+void HASH_TRANSFORM (HASH_CTX *c, unsigned char *data)
+       {       /* run the block function once on the HASH_CBLOCK bytes at data */
+#if defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1
+       if ((((unsigned long)data)%4) == 0)
+               HASH_BLOCK_DATA_ORDER_ALIGNED (c,(HASH_LONG *)data,1);
+       else    /* data is not 4 byte aligned */
+#if !defined(HASH_BLOCK_DATA_ORDER)
+               {
+               memcpy (c->data,data,HASH_CBLOCK);      /* NB: overwrites whatever was in c->data */
+               HASH_BLOCK_DATA_ORDER_ALIGNED (c,c->data,1);
+               }
+#endif
+#endif
+#if defined(HASH_BLOCK_DATA_ORDER)
+       HASH_BLOCK_DATA_ORDER (c,(HASH_LONG *)data,1);
+#endif
+       }
+
+
+void HASH_FINAL (unsigned char *md, HASH_CTX *c)
+       {       /* pad the buffered input, append the bit count, write the digest to md */
+       register HASH_LONG *p;
+       register unsigned long l;
+       register int i,j;
+       static const unsigned char end[4]={0x80,0x00,0x00,0x00};
+       const unsigned char *cp=end;
+       /* the padding is taken from end[]: one 0x80 byte, then zero bytes */
+       /* c->num should definitely have room for at least one more byte. */
+       p=c->data;
+       i=c->num>>2;
+       j=c->num&0x03;
+
+#if 0
+       /* purify often complains about the following line as an
+        * Uninitialized Memory Read.  While this can be true, the
+        * following p_c2l macro will reset l when that case is true.
+        * This is because j&0x03 contains the number of 'valid' bytes
+        * already in p[i].  If and only if j&0x03 == 0, the UMR will
+        * occur but this is also the only time p_c2l will do
+        * l= *(cp++) instead of l|= *(cp++)
+        * Many thanks to Alex Tang <altitude@cic.net> for pickup this
+        * 'potential bug' */
+#ifdef PURIFY
+       if (j==0) p[i]=0; /* Yeah, but that's not the way to fix it:-) */
+#endif
+       l=p[i];
+#else
+       l = (j==0) ? 0 : p[i];
+#endif
+       HOST_p_c2l(cp,l,j); p[i++]=l; /* i is the next 'undefined word' */
+       /* no room left for the two length words: hash this block and start a new one */
+       if (i>(HASH_LBLOCK-2)) /* save room for Nl and Nh */
+               {
+               if (i<HASH_LBLOCK) p[i]=0;
+               HASH_BLOCK_HOST_ORDER (c,p,1);
+               i=0;
+               }
+       for (; i<(HASH_LBLOCK-2); i++)
+               p[i]=0;
+       /* the last two words carry the bit count, word order follows the data order */
+#if   defined(DATA_ORDER_IS_BIG_ENDIAN)
+       p[HASH_LBLOCK-2]=c->Nh;
+       p[HASH_LBLOCK-1]=c->Nl;
+#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+       p[HASH_LBLOCK-2]=c->Nl;
+       p[HASH_LBLOCK-1]=c->Nh;
+#endif
+       HASH_BLOCK_HOST_ORDER (c,p,1);
+       /* NOTE(review): reads c->A..c->D, members the porting guide does not list as required; looks MD5 specific - confirm */
+       l=c->A; HOST_l2c(l,md);
+       l=c->B; HOST_l2c(l,md);
+       l=c->C; HOST_l2c(l,md);
+       l=c->D; HOST_l2c(l,md);
+
+       c->num=0;
+       /* clear stuff, HASH_BLOCK may be leaving some stuff on the stack
+        * but I'm not worried :-)
+       memset((void *)c,0,sizeof(HASH_CTX));
+        */
+       }
index e27cfca..f8eaf62 100644 (file)
@@ -66,6 +66,14 @@ asm/mx86bsdi.o: asm/mx86unix.cpp
 asm/mx86unix.cpp: asm/md5-586.pl
        (cd asm; $(PERL) md5-586.pl cpp >mx86unix.cpp)
 
+# works for both SC and gcc
+asm/md5-sparcv8plus.o: asm/md5-sparcv9.S
+       $(CPP) -DULTRASPARC -DMD5_BLOCK_DATA_ORDER asm/md5-sparcv9.S | as -xarch=v8plus /dev/fd/0 -o asm/md5-sparcv8plus.o
+
+asm/md5-sparcv9.o: asm/md5-sparcv9.S
+       $(CC) -xarch=v9 -DULTRASPARC -DMD5_BLOCK_DATA_ORDER -c asm/md5-sparcv9.S -o asm/md5-sparcv9.o
+
+
 files:
        $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO
 
@@ -103,5 +111,5 @@ clean:
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
 md5_dgst.o: ../../include/openssl/md5.h ../../include/openssl/opensslv.h
-md5_dgst.o: md5_locl.h
+md5_dgst.o: ../md32_common.h md5_locl.h
 md5_one.o: ../../include/openssl/md5.h md5_locl.h
index 0249e10..5fc6a20 100644 (file)
@@ -29,7 +29,7 @@ $X="esi";
  0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9, # R3
  );
 
-&md5_block("md5_block_x86");
+&md5_block("md5_block_asm_host_order");
 &asm_finish();
 
 sub Np
@@ -183,6 +183,7 @@ sub md5_block
         &mov($X,       &wparam(1)); # esi
        &mov($C,        &wparam(2));
         &push("ebp");
+       &shl($C,        6);
        &push("ebx");
         &add($C,       $X); # offset we end at
        &sub($C,        64);
diff --git a/crypto/md5/asm/md5-sparcv9.S b/crypto/md5/asm/md5-sparcv9.S
new file mode 100644 (file)
index 0000000..955b368
--- /dev/null
@@ -0,0 +1,1035 @@
+.ident "md5-sparcv9.S, Version 1.0"
+.ident "SPARC V9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
+.file  "md5-sparcv9.S"
+
+/*
+ * ====================================================================
+ * Copyright (c) 1999 Andy Polyakov <appro@fy.chalmers.se>.
+ *
+ * Rights for redistribution and usage in source and binary forms are
+ * granted as long as above copyright notices are retained. Warranty
+ * of any kind is (of course:-) disclaimed.
+ * ====================================================================
+ */
+
+/*
+ * This is my modest contribution to the OpenSSL project (see
+ * http://www.openssl.org/ for more information about it) and is an
+ * assembler implementation of the MD5 block hash function. I've
+ * hand-coded this for the sole purpose of reaching the UltraSPARC-specific
+ * "load in little-endian byte order" instruction. This gives up to a 15%
+ * performance improvement in cases where the input message is aligned on
+ * a 32-bit boundary. The module was tested under both 32 *and* 64 bit
+ * kernels. For updates see http://fy.chalmers.se/~appro/hpe/.
+ *
+ * To compile with SC4.x/SC5.x:
+ *
+ *     cc -xarch=v[9|8plus] -DULTRASPARC -DMD5_BLOCK_DATA_ORDER \
+ *             -c md5-sparcv9.S
+ *
+ * and with gcc:
+ *
+ *     gcc -mcpu=ultrasparc -DULTRASPARC -DMD5_BLOCK_DATA_ORDER \
+ *             -c md5-sparcv9.S
+ *
+ * or if above fails (it does if you have gas):
+ *
+ *     gcc -E -DULTRASPARC -DMD5_BLOCK_DATA_ORDER md5-sparcv9.S | \
+ *             as -xarch=v8plus /dev/fd/0 -o md5-sparcv9.o
+ */
+
+#define        A       %o0     /* A..D: MD5 state words the rounds operate on */
+#define B      %o1
+#define        C       %o2
+#define        D       %o3
+#define        T1      %o4     /* scratch: round function + message word + constant */
+#define        T2      %o5     /* scratch: round constant, then shifted half of the rotate */
+
+#define        R0      %l0     /* R0..R13 receive input words X(0)..X(13) */
+#define        R1      %l1
+#define        R2      %l2
+#define        R3      %l3
+#define        R4      %l4
+#define        R5      %l5
+#define        R6      %l6
+#define        R7      %l7
+#define        R8      %i3
+#define        R9      %i4
+#define        R10     %i5
+#define        R11     %g1
+#define R12    %g2
+#define        R13     %g3
+#define RX     %g4     /* shared by X(14) and X(15); reloaded before each use */
+
+#define Aptr   %i0+0   /* addresses of the four stored state words, 4 bytes apart */
+#define Bptr   %i0+4   /* NOTE(review): %i0 is presumably the MD5_CTX pointer argument - confirm */
+#define Cptr   %i0+8
+#define Dptr   %i0+12
+
+#define Aval   R5      /* aliases of R5..R8, which are not used at the end of the last round */
+#define Bval   R6      /* Aval..Dval hold the stored state re-read from Aptr..Dptr, */
+#define Cval   R7      /* which is added to the freshly computed A..D before storing back */
+#define Dval   R8
+
+#if defined(MD5_BLOCK_DATA_ORDER)      /* variant reading the input via an alternate address space */
+# if defined(ULTRASPARC)
+#  define      LOAD                    lda     /* load through the ASI held in %asi */
+#  define      X(i)                    [%i1+i*4]%asi   /* i-th 32-bit input word, base %i1 */
+#  define      md5_block               md5_block_asm_data_order_aligned
+#  define      ASI_PRIMARY_LITTLE      0x88    /* value put in %asi; also enables the %asi save/restore code */
+# else
+#  error "MD5_BLOCK_DATA_ORDER is supported only on UltraSPARC!"
+# endif
+#else  /* host-order variant: plain loads, %asi left untouched */
+# define       LOAD                    ld
+# define       X(i)                    [%i1+i*4]       /* i-th 32-bit input word, base %i1 */
+# define       md5_block               md5_block_asm_host_order
+#endif /* MD5_BLOCK_DATA_ORDER */
+
+.section        ".text",#alloc,#execinstr
+#if defined(__SUNPRO_C) && defined(__sparcv9)
+  /* They've said -xarch=v9 at command line */
+  .register    %g2,#scratch
+  .register    %g3,#scratch
+# define       FRAME   -192
+#else
+# define       FRAME   -96
+#endif
+
+.align  32
+
+.global md5_block
+md5_block:
+       save    %sp,FRAME,%sp
+
+       ld      [Dptr],D
+#ifdef ASI_PRIMARY_LITTLE
+       mov     %asi,%o7        ! How dare I? Well, I just do:-)
+#else
+       nop
+#endif
+       ld      [Cptr],C
+#ifdef ASI_PRIMARY_LITTLE
+       mov     ASI_PRIMARY_LITTLE,%asi
+#else
+       nop
+#endif
+       ld      [Bptr],B
+       nop
+       ld      [Aptr],A
+       nop
+       LOAD    X(0),R0
+       nop
+       ba      .Lmd5_block_loop
+       nop
+
+.align 32
+.Lmd5_block_loop:
+
+!!!!!!!!Round 0: F(b,c,d) = ((c ^ d) & b) ^ d, rotates 7/12/17/22
+
+       xor     C,D,T1
+       sethi   %hi(0xd76aa478),T2
+       and     T1,B,T1
+       or      T2,%lo(0xd76aa478),T2   !=
+       xor     T1,D,T1
+       add     T1,R0,T1
+       LOAD    X(1),R1
+       add     T1,T2,T1                !=
+       add     A,T1,A
+       sll     A,7,T2
+       srl     A,32-7,A
+       or      A,T2,A                  !=
+        xor     B,C,T1
+       add     A,B,A
+
+       sethi   %hi(0xe8c7b756),T2
+       and     T1,A,T1                 !=
+       or      T2,%lo(0xe8c7b756),T2
+       xor     T1,C,T1
+       LOAD    X(2),R2
+       add     T1,R1,T1                !=
+       add     T1,T2,T1
+       add     D,T1,D
+       sll     D,12,T2
+       srl     D,32-12,D               !=
+       or      D,T2,D
+        xor     A,B,T1
+       add     D,A,D
+
+       sethi   %hi(0x242070db),T2      !=
+       and     T1,D,T1
+       or      T2,%lo(0x242070db),T2
+       xor     T1,B,T1
+       add     T1,R2,T1                !=
+       LOAD    X(3),R3
+       add     T1,T2,T1
+       add     C,T1,C
+       sll     C,17,T2                 !=
+       srl     C,32-17,C
+       or      C,T2,C
+        xor     D,A,T1
+       add     C,D,C                   !=
+
+       sethi   %hi(0xc1bdceee),T2
+       and     T1,C,T1
+       or      T2,%lo(0xc1bdceee),T2
+       xor     T1,A,T1                 !=
+       add     T1,R3,T1
+       LOAD    X(4),R4
+       add     T1,T2,T1
+       add     B,T1,B                  !=
+       sll     B,22,T2
+       srl     B,32-22,B
+       or      B,T2,B
+        xor     C,D,T1                 !=
+       add     B,C,B
+
+       sethi   %hi(0xf57c0faf),T2
+       and     T1,B,T1
+       or      T2,%lo(0xf57c0faf),T2   !=
+       xor     T1,D,T1
+       add     T1,R4,T1
+       LOAD    X(5),R5
+       add     T1,T2,T1                !=
+       add     A,T1,A
+       sll     A,7,T2
+       srl     A,32-7,A
+       or      A,T2,A                  !=
+        xor     B,C,T1
+       add     A,B,A
+
+       sethi   %hi(0x4787c62a),T2
+       and     T1,A,T1                 !=
+       or      T2,%lo(0x4787c62a),T2
+       xor     T1,C,T1
+       LOAD    X(6),R6
+       add     T1,R5,T1                !=
+       add     T1,T2,T1
+       add     D,T1,D
+       sll     D,12,T2
+       srl     D,32-12,D               !=
+       or      D,T2,D
+        xor     A,B,T1
+       add     D,A,D
+
+       sethi   %hi(0xa8304613),T2      !=
+       and     T1,D,T1
+       or      T2,%lo(0xa8304613),T2
+       xor     T1,B,T1
+       add     T1,R6,T1                !=
+       LOAD    X(7),R7
+       add     T1,T2,T1
+       add     C,T1,C
+       sll     C,17,T2                 !=
+       srl     C,32-17,C
+       or      C,T2,C
+        xor     D,A,T1
+       add     C,D,C                   !=
+
+       sethi   %hi(0xfd469501),T2
+       and     T1,C,T1
+       or      T2,%lo(0xfd469501),T2
+       xor     T1,A,T1                 !=
+       add     T1,R7,T1
+       LOAD    X(8),R8
+       add     T1,T2,T1
+       add     B,T1,B                  !=
+       sll     B,22,T2
+       srl     B,32-22,B
+       or      B,T2,B
+        xor     C,D,T1                 !=
+       add     B,C,B
+
+       sethi   %hi(0x698098d8),T2
+       and     T1,B,T1
+       or      T2,%lo(0x698098d8),T2   !=
+       xor     T1,D,T1
+       add     T1,R8,T1
+       LOAD    X(9),R9
+       add     T1,T2,T1                !=
+       add     A,T1,A
+       sll     A,7,T2
+       srl     A,32-7,A
+       or      A,T2,A                  !=
+        xor     B,C,T1
+       add     A,B,A
+
+       sethi   %hi(0x8b44f7af),T2
+       and     T1,A,T1                 !=
+       or      T2,%lo(0x8b44f7af),T2
+       xor     T1,C,T1
+       LOAD    X(10),R10
+       add     T1,R9,T1                !=
+       add     T1,T2,T1
+       add     D,T1,D
+       sll     D,12,T2
+       srl     D,32-12,D               !=
+       or      D,T2,D
+        xor     A,B,T1
+       add     D,A,D
+
+       sethi   %hi(0xffff5bb1),T2      !=
+       and     T1,D,T1
+       or      T2,%lo(0xffff5bb1),T2
+       xor     T1,B,T1
+       add     T1,R10,T1               !=
+       LOAD    X(11),R11
+       add     T1,T2,T1
+       add     C,T1,C
+       sll     C,17,T2                 !=
+       srl     C,32-17,C
+       or      C,T2,C
+        xor     D,A,T1
+       add     C,D,C                   !=
+
+       sethi   %hi(0x895cd7be),T2
+       and     T1,C,T1
+       or      T2,%lo(0x895cd7be),T2
+       xor     T1,A,T1                 !=
+       add     T1,R11,T1
+       LOAD    X(12),R12
+       add     T1,T2,T1
+       add     B,T1,B                  !=
+       sll     B,22,T2
+       srl     B,32-22,B
+       or      B,T2,B
+        xor     C,D,T1                 !=
+       add     B,C,B
+
+       sethi   %hi(0x6b901122),T2
+       and     T1,B,T1
+       or      T2,%lo(0x6b901122),T2   !=
+       xor     T1,D,T1
+       add     T1,R12,T1
+       LOAD    X(13),R13
+       add     T1,T2,T1                !=
+       add     A,T1,A
+       sll     A,7,T2
+       srl     A,32-7,A
+       or      A,T2,A                  !=
+        xor     B,C,T1
+       add     A,B,A
+
+       sethi   %hi(0xfd987193),T2
+       and     T1,A,T1                 !=
+       or      T2,%lo(0xfd987193),T2
+       xor     T1,C,T1
+       LOAD    X(14),RX
+       add     T1,R13,T1               !=
+       add     T1,T2,T1
+       add     D,T1,D
+       sll     D,12,T2
+       srl     D,32-12,D               !=
+       or      D,T2,D
+        xor     A,B,T1
+       add     D,A,D
+
+       sethi   %hi(0xa679438e),T2      !=
+       and     T1,D,T1
+       or      T2,%lo(0xa679438e),T2
+       xor     T1,B,T1
+       add     T1,RX,T1                !=
+       LOAD    X(15),RX
+       add     T1,T2,T1
+       add     C,T1,C
+       sll     C,17,T2                 !=
+       srl     C,32-17,C
+       or      C,T2,C
+        xor     D,A,T1
+       add     C,D,C                   !=
+
+       sethi   %hi(0x49b40821),T2
+       and     T1,C,T1
+       or      T2,%lo(0x49b40821),T2
+       xor     T1,A,T1                 !=
+       add     T1,RX,T1
+       !pre-LOADed     X(1),R1
+       add     T1,T2,T1
+       add     B,T1,B
+       sll     B,22,T2                 !=
+       srl     B,32-22,B
+       or      B,T2,B
+       add     B,C,B
+
+!!!!!!!!Round 1: G(b,c,d) = ((b ^ c) & d) ^ c, rotates 5/9/14/20
+
+       xor     B,C,T1                  !=
+       sethi   %hi(0xf61e2562),T2
+       and     T1,D,T1
+       or      T2,%lo(0xf61e2562),T2
+       xor     T1,C,T1                 !=
+       add     T1,R1,T1
+       !pre-LOADed     X(6),R6
+       add     T1,T2,T1
+       add     A,T1,A
+       sll     A,5,T2                  !=
+       srl     A,32-5,A
+       or      A,T2,A
+       add     A,B,A
+
+       xor     A,B,T1                  !=
+       sethi   %hi(0xc040b340),T2
+       and     T1,C,T1
+       or      T2,%lo(0xc040b340),T2
+       xor     T1,B,T1                 !=
+       add     T1,R6,T1
+       !pre-LOADed     X(11),R11
+       add     T1,T2,T1
+       add     D,T1,D
+       sll     D,9,T2                  !=
+       srl     D,32-9,D
+       or      D,T2,D
+       add     D,A,D
+
+       xor     D,A,T1                  !=
+       sethi   %hi(0x265e5a51),T2
+       and     T1,B,T1
+       or      T2,%lo(0x265e5a51),T2
+       xor     T1,A,T1                 !=
+       add     T1,R11,T1
+       !pre-LOADed     X(0),R0
+       add     T1,T2,T1
+       add     C,T1,C
+       sll     C,14,T2                 !=
+       srl     C,32-14,C
+       or      C,T2,C
+       add     C,D,C
+
+       xor     C,D,T1                  !=
+       sethi   %hi(0xe9b6c7aa),T2
+       and     T1,A,T1
+       or      T2,%lo(0xe9b6c7aa),T2
+       xor     T1,D,T1                 !=
+       add     T1,R0,T1
+       !pre-LOADed     X(5),R5
+       add     T1,T2,T1
+       add     B,T1,B
+       sll     B,20,T2                 !=
+       srl     B,32-20,B
+       or      B,T2,B
+       add     B,C,B
+
+       xor     B,C,T1                  !=
+       sethi   %hi(0xd62f105d),T2
+       and     T1,D,T1
+       or      T2,%lo(0xd62f105d),T2
+       xor     T1,C,T1                 !=
+       add     T1,R5,T1
+       !pre-LOADed     X(10),R10
+       add     T1,T2,T1
+       add     A,T1,A
+       sll     A,5,T2                  !=
+       srl     A,32-5,A
+       or      A,T2,A
+       add     A,B,A
+
+       xor     A,B,T1                  !=
+       sethi   %hi(0x02441453),T2
+       and     T1,C,T1
+       or      T2,%lo(0x02441453),T2
+       xor     T1,B,T1                 !=
+       add     T1,R10,T1
+       LOAD    X(15),RX
+       add     T1,T2,T1
+       add     D,T1,D                  !=
+       sll     D,9,T2
+       srl     D,32-9,D
+       or      D,T2,D
+       add     D,A,D                   !=
+
+       xor     D,A,T1
+       sethi   %hi(0xd8a1e681),T2
+       and     T1,B,T1
+       or      T2,%lo(0xd8a1e681),T2   !=
+       xor     T1,A,T1
+       add     T1,RX,T1
+       !pre-LOADed     X(4),R4
+       add     T1,T2,T1
+       add     C,T1,C                  !=
+       sll     C,14,T2
+       srl     C,32-14,C
+       or      C,T2,C
+       add     C,D,C                   !=
+
+       xor     C,D,T1
+       sethi   %hi(0xe7d3fbc8),T2
+       and     T1,A,T1
+       or      T2,%lo(0xe7d3fbc8),T2   !=
+       xor     T1,D,T1
+       add     T1,R4,T1
+       !pre-LOADed     X(9),R9
+       add     T1,T2,T1
+       add     B,T1,B                  !=
+       sll     B,20,T2
+       srl     B,32-20,B
+       or      B,T2,B
+       add     B,C,B                   !=
+
+       xor     B,C,T1
+       sethi   %hi(0x21e1cde6),T2
+       and     T1,D,T1
+       or      T2,%lo(0x21e1cde6),T2   !=
+       xor     T1,C,T1
+       add     T1,R9,T1
+       LOAD    X(14),RX
+       add     T1,T2,T1                !=
+       add     A,T1,A
+       sll     A,5,T2
+       srl     A,32-5,A
+       or      A,T2,A                  !=
+       add     A,B,A
+
+       xor     A,B,T1
+       sethi   %hi(0xc33707d6),T2
+       and     T1,C,T1                 !=
+       or      T2,%lo(0xc33707d6),T2
+       xor     T1,B,T1
+       add     T1,RX,T1
+       !pre-LOADed     X(3),R3
+       add     T1,T2,T1                !=
+       add     D,T1,D
+       sll     D,9,T2
+       srl     D,32-9,D
+       or      D,T2,D                  !=
+       add     D,A,D
+
+       xor     D,A,T1
+       sethi   %hi(0xf4d50d87),T2
+       and     T1,B,T1                 !=
+       or      T2,%lo(0xf4d50d87),T2
+       xor     T1,A,T1
+       add     T1,R3,T1
+       !pre-LOADed     X(8),R8
+       add     T1,T2,T1                !=
+       add     C,T1,C
+       sll     C,14,T2
+       srl     C,32-14,C
+       or      C,T2,C                  !=
+       add     C,D,C
+
+       xor     C,D,T1
+       sethi   %hi(0x455a14ed),T2
+       and     T1,A,T1                 !=
+       or      T2,%lo(0x455a14ed),T2
+       xor     T1,D,T1
+       add     T1,R8,T1
+       !pre-LOADed     X(13),R13
+       add     T1,T2,T1                !=
+       add     B,T1,B
+       sll     B,20,T2
+       srl     B,32-20,B
+       or      B,T2,B                  !=
+       add     B,C,B
+
+       xor     B,C,T1
+       sethi   %hi(0xa9e3e905),T2
+       and     T1,D,T1                 !=
+       or      T2,%lo(0xa9e3e905),T2
+       xor     T1,C,T1
+       add     T1,R13,T1
+       !pre-LOADed     X(2),R2
+       add     T1,T2,T1                !=
+       add     A,T1,A
+       sll     A,5,T2
+       srl     A,32-5,A
+       or      A,T2,A                  !=
+       add     A,B,A
+
+       xor     A,B,T1
+       sethi   %hi(0xfcefa3f8),T2
+       and     T1,C,T1                 !=
+       or      T2,%lo(0xfcefa3f8),T2
+       xor     T1,B,T1
+       add     T1,R2,T1
+       !pre-LOADed     X(7),R7
+       add     T1,T2,T1                !=
+       add     D,T1,D
+       sll     D,9,T2
+       srl     D,32-9,D
+       or      D,T2,D                  !=
+       add     D,A,D
+
+       xor     D,A,T1
+       sethi   %hi(0x676f02d9),T2
+       and     T1,B,T1                 !=
+       or      T2,%lo(0x676f02d9),T2
+       xor     T1,A,T1
+       add     T1,R7,T1
+       !pre-LOADed     X(12),R12
+       add     T1,T2,T1                !=
+       add     C,T1,C
+       sll     C,14,T2
+       srl     C,32-14,C
+       or      C,T2,C                  !=
+       add     C,D,C
+
+       xor     C,D,T1
+       sethi   %hi(0x8d2a4c8a),T2
+       and     T1,A,T1                 !=
+       or      T2,%lo(0x8d2a4c8a),T2
+       xor     T1,D,T1
+       add     T1,R12,T1
+       !pre-LOADed     X(5),R5
+       add     T1,T2,T1                !=
+       add     B,T1,B
+       sll     B,20,T2
+       srl     B,32-20,B
+       or      B,T2,B                  !=
+       add     B,C,B
+
+!!!!!!!!Round 2: H(b,c,d) = b ^ c ^ d, rotates 4/11/16/23
+
+       xor     B,C,T1
+       sethi   %hi(0xfffa3942),T2
+       xor     T1,D,T1                 !=
+       or      T2,%lo(0xfffa3942),T2
+       add     T1,R5,T1
+       !pre-LOADed     X(8),R8
+       add     T1,T2,T1
+       add     A,T1,A                  !=
+       sll     A,4,T2
+       srl     A,32-4,A
+       or      A,T2,A
+       add     A,B,A                   !=
+
+       xor     A,B,T1
+       sethi   %hi(0x8771f681),T2
+       xor     T1,C,T1
+       or      T2,%lo(0x8771f681),T2   !=
+       add     T1,R8,T1
+       !pre-LOADed     X(11),R11
+       add     T1,T2,T1
+       add     D,T1,D
+       sll     D,11,T2                 !=
+       srl     D,32-11,D
+       or      D,T2,D
+       add     D,A,D
+
+       xor     D,A,T1                  !=
+       sethi   %hi(0x6d9d6122),T2
+       xor     T1,B,T1
+       or      T2,%lo(0x6d9d6122),T2
+       add     T1,R11,T1               !=
+       LOAD    X(14),RX
+       add     T1,T2,T1
+       add     C,T1,C
+       sll     C,16,T2                 !=
+       srl     C,32-16,C
+       or      C,T2,C
+       add     C,D,C
+
+       xor     C,D,T1                  !=
+       sethi   %hi(0xfde5380c),T2
+       xor     T1,A,T1
+       or      T2,%lo(0xfde5380c),T2
+       add     T1,RX,T1                !=
+       !pre-LOADed     X(1),R1
+       add     T1,T2,T1
+       add     B,T1,B
+       sll     B,23,T2
+       srl     B,32-23,B               !=
+       or      B,T2,B
+       add     B,C,B
+
+       xor     B,C,T1
+       sethi   %hi(0xa4beea44),T2      !=
+       xor     T1,D,T1
+       or      T2,%lo(0xa4beea44),T2
+       add     T1,R1,T1
+       !pre-LOADed     X(4),R4
+       add     T1,T2,T1                !=
+       add     A,T1,A
+       sll     A,4,T2
+       srl     A,32-4,A
+       or      A,T2,A                  !=
+       add     A,B,A
+
+       xor     A,B,T1
+       sethi   %hi(0x4bdecfa9),T2
+       xor     T1,C,T1                 !=
+       or      T2,%lo(0x4bdecfa9),T2
+       add     T1,R4,T1
+       !pre-LOADed     X(7),R7
+       add     T1,T2,T1
+       add     D,T1,D                  !=
+       sll     D,11,T2
+       srl     D,32-11,D
+       or      D,T2,D
+       add     D,A,D                   !=
+
+       xor     D,A,T1
+       sethi   %hi(0xf6bb4b60),T2
+       xor     T1,B,T1
+       or      T2,%lo(0xf6bb4b60),T2   !=
+       add     T1,R7,T1
+       !pre-LOADed     X(10),R10
+       add     T1,T2,T1
+       add     C,T1,C
+       sll     C,16,T2                 !=
+       srl     C,32-16,C
+       or      C,T2,C
+       add     C,D,C
+
+       xor     C,D,T1                  !=
+       sethi   %hi(0xbebfbc70),T2
+       xor     T1,A,T1
+       or      T2,%lo(0xbebfbc70),T2
+       add     T1,R10,T1               !=
+       !pre-LOADed     X(13),R13
+       add     T1,T2,T1
+       add     B,T1,B
+       sll     B,23,T2
+       srl     B,32-23,B               !=
+       or      B,T2,B
+       add     B,C,B
+
+       xor     B,C,T1
+       sethi   %hi(0x289b7ec6),T2      !=
+       xor     T1,D,T1
+       or      T2,%lo(0x289b7ec6),T2
+       add     T1,R13,T1
+       !pre-LOADed     X(0),R0
+       add     T1,T2,T1                !=
+       add     A,T1,A
+       sll     A,4,T2
+       srl     A,32-4,A
+       or      A,T2,A                  !=
+       add     A,B,A
+
+       xor     A,B,T1
+       sethi   %hi(0xeaa127fa),T2
+       xor     T1,C,T1                 !=
+       or      T2,%lo(0xeaa127fa),T2
+       add     T1,R0,T1
+       !pre-LOADed     X(3),R3
+       add     T1,T2,T1
+       add     D,T1,D                  !=
+       sll     D,11,T2
+       srl     D,32-11,D
+       or      D,T2,D
+       add     D,A,D                   !=
+
+       xor     D,A,T1
+       sethi   %hi(0xd4ef3085),T2
+       xor     T1,B,T1
+       or      T2,%lo(0xd4ef3085),T2   !=
+       add     T1,R3,T1
+       !pre-LOADed     X(6),R6
+       add     T1,T2,T1
+       add     C,T1,C
+       sll     C,16,T2                 !=
+       srl     C,32-16,C
+       or      C,T2,C
+       add     C,D,C
+
+       xor     C,D,T1                  !=
+       sethi   %hi(0x04881d05),T2
+       xor     T1,A,T1
+       or      T2,%lo(0x04881d05),T2
+       add     T1,R6,T1                !=
+       !pre-LOADed     X(9),R9
+       add     T1,T2,T1
+       add     B,T1,B
+       sll     B,23,T2
+       srl     B,32-23,B               !=
+       or      B,T2,B
+       add     B,C,B
+
+       xor     B,C,T1
+       sethi   %hi(0xd9d4d039),T2      !=
+       xor     T1,D,T1
+       or      T2,%lo(0xd9d4d039),T2
+       add     T1,R9,T1
+       !pre-LOADed     X(12),R12
+       add     T1,T2,T1                !=
+       add     A,T1,A
+       sll     A,4,T2
+       srl     A,32-4,A
+       or      A,T2,A                  !=
+       add     A,B,A
+
+       xor     A,B,T1
+       sethi   %hi(0xe6db99e5),T2
+       xor     T1,C,T1                 !=
+       or      T2,%lo(0xe6db99e5),T2
+       add     T1,R12,T1
+       LOAD    X(15),RX
+       add     T1,T2,T1                !=
+       add     D,T1,D
+       sll     D,11,T2
+       srl     D,32-11,D
+       or      D,T2,D                  !=
+       add     D,A,D
+
+       xor     D,A,T1
+       sethi   %hi(0x1fa27cf8),T2
+       xor     T1,B,T1                 !=
+       or      T2,%lo(0x1fa27cf8),T2
+       add     T1,RX,T1
+       !pre-LOADed     X(2),R2
+       add     T1,T2,T1
+       add     C,T1,C                  !=
+       sll     C,16,T2
+       srl     C,32-16,C
+       or      C,T2,C
+       add     C,D,C                   !=
+
+       xor     C,D,T1
+       sethi   %hi(0xc4ac5665),T2
+       xor     T1,A,T1
+       or      T2,%lo(0xc4ac5665),T2   !=
+       add     T1,R2,T1
+       !pre-LOADed     X(0),R0
+       add     T1,T2,T1
+       add     B,T1,B
+       sll     B,23,T2                 !=
+       srl     B,32-23,B
+       or      B,T2,B
+       add     B,C,B
+
+!!!!!!!!Round 3: I(b,c,d) = (b | ~d) ^ c, rotates 6/10/15/21
+
+       orn     B,D,T1                  !=
+       sethi   %hi(0xf4292244),T2
+       xor     T1,C,T1
+       or      T2,%lo(0xf4292244),T2
+       add     T1,R0,T1                !=
+       !pre-LOADed     X(7),R7
+       add     T1,T2,T1
+       add     A,T1,A
+       sll     A,6,T2
+       srl     A,32-6,A                !=
+       or      A,T2,A
+       add     A,B,A
+
+       orn     A,C,T1
+       sethi   %hi(0x432aff97),T2      !=
+       xor     T1,B,T1
+       or      T2,%lo(0x432aff97),T2
+       LOAD    X(14),RX
+       add     T1,R7,T1                !=
+       add     T1,T2,T1
+       add     D,T1,D
+       sll     D,10,T2
+       srl     D,32-10,D               !=
+       or      D,T2,D
+       add     D,A,D
+
+       orn     D,B,T1
+       sethi   %hi(0xab9423a7),T2      !=
+       xor     T1,A,T1
+       or      T2,%lo(0xab9423a7),T2
+       add     T1,RX,T1
+       !pre-LOADed     X(5),R5
+       add     T1,T2,T1                !=
+       add     C,T1,C
+       sll     C,15,T2
+       srl     C,32-15,C
+       or      C,T2,C                  !=
+       add     C,D,C
+
+       orn     C,A,T1
+       sethi   %hi(0xfc93a039),T2
+       xor     T1,D,T1                 !=
+       or      T2,%lo(0xfc93a039),T2
+       add     T1,R5,T1
+       !pre-LOADed     X(12),R12
+       add     T1,T2,T1
+       add     B,T1,B                  !=
+       sll     B,21,T2
+       srl     B,32-21,B
+       or      B,T2,B
+       add     B,C,B                   !=
+
+       orn     B,D,T1
+       sethi   %hi(0x655b59c3),T2
+       xor     T1,C,T1
+       or      T2,%lo(0x655b59c3),T2   !=
+       add     T1,R12,T1
+       !pre-LOADed     X(3),R3
+       add     T1,T2,T1
+       add     A,T1,A
+       sll     A,6,T2                  !=
+       srl     A,32-6,A
+       or      A,T2,A
+       add     A,B,A
+
+       orn     A,C,T1                  !=
+       sethi   %hi(0x8f0ccc92),T2
+       xor     T1,B,T1
+       or      T2,%lo(0x8f0ccc92),T2
+       add     T1,R3,T1                !=
+       !pre-LOADed     X(10),R10
+       add     T1,T2,T1
+       add     D,T1,D
+       sll     D,10,T2
+       srl     D,32-10,D               !=
+       or      D,T2,D
+       add     D,A,D
+
+       orn     D,B,T1
+       sethi   %hi(0xffeff47d),T2      !=
+       xor     T1,A,T1
+       or      T2,%lo(0xffeff47d),T2
+       add     T1,R10,T1
+       !pre-LOADed     X(1),R1
+       add     T1,T2,T1                !=
+       add     C,T1,C
+       sll     C,15,T2
+       srl     C,32-15,C
+       or      C,T2,C                  !=
+       add     C,D,C
+
+       orn     C,A,T1
+       sethi   %hi(0x85845dd1),T2
+       xor     T1,D,T1                 !=
+       or      T2,%lo(0x85845dd1),T2
+       add     T1,R1,T1
+       !pre-LOADed     X(8),R8
+       add     T1,T2,T1
+       add     B,T1,B                  !=
+       sll     B,21,T2
+       srl     B,32-21,B
+       or      B,T2,B
+       add     B,C,B                   !=
+
+       orn     B,D,T1
+       sethi   %hi(0x6fa87e4f),T2
+       xor     T1,C,T1
+       or      T2,%lo(0x6fa87e4f),T2   !=
+       add     T1,R8,T1
+       LOAD    X(15),RX
+       add     T1,T2,T1
+       add     A,T1,A                  !=
+       sll     A,6,T2
+       srl     A,32-6,A
+       or      A,T2,A
+       add     A,B,A                   !=
+
+       orn     A,C,T1
+       sethi   %hi(0xfe2ce6e0),T2
+       xor     T1,B,T1
+       or      T2,%lo(0xfe2ce6e0),T2   !=
+       add     T1,RX,T1
+       !pre-LOADed     X(6),R6
+       add     T1,T2,T1
+       add     D,T1,D
+       sll     D,10,T2                 !=
+       srl     D,32-10,D
+       or      D,T2,D
+       add     D,A,D
+
+       orn     D,B,T1                  !=
+       sethi   %hi(0xa3014314),T2
+       xor     T1,A,T1
+       or      T2,%lo(0xa3014314),T2
+       add     T1,R6,T1                !=
+       !pre-LOADed     X(13),R13
+       add     T1,T2,T1
+       add     C,T1,C
+       sll     C,15,T2
+       srl     C,32-15,C               !=
+       or      C,T2,C
+       add     C,D,C
+
+       orn     C,A,T1
+       sethi   %hi(0x4e0811a1),T2      !=
+       xor     T1,D,T1
+       or      T2,%lo(0x4e0811a1),T2
+       !pre-LOADed     X(4),R4
+        ld      [Aptr],Aval
+       add     T1,R13,T1               !=
+       add     T1,T2,T1
+       add     B,T1,B
+       sll     B,21,T2
+       srl     B,32-21,B               !=
+       or      B,T2,B
+       add     B,C,B
+
+       orn     B,D,T1
+       sethi   %hi(0xf7537e82),T2      !=
+       xor     T1,C,T1
+       or      T2,%lo(0xf7537e82),T2
+       !pre-LOADed     X(11),R11
+        ld      [Dptr],Dval
+       add     T1,R4,T1                !=
+       add     T1,T2,T1
+       add     A,T1,A
+       sll     A,6,T2
+       srl     A,32-6,A                !=
+       or      A,T2,A
+       add     A,B,A
+
+       orn     A,C,T1
+       sethi   %hi(0xbd3af235),T2      !=
+       xor     T1,B,T1
+       or      T2,%lo(0xbd3af235),T2
+       !pre-LOADed     X(2),R2
+        ld      [Cptr],Cval
+       add     T1,R11,T1               !=
+       add     T1,T2,T1
+       add     D,T1,D
+       sll     D,10,T2
+       srl     D,32-10,D               !=
+       or      D,T2,D
+       add     D,A,D
+
+       orn     D,B,T1
+       sethi   %hi(0x2ad7d2bb),T2      !=
+       xor     T1,A,T1
+       or      T2,%lo(0x2ad7d2bb),T2
+       !pre-LOADed     X(9),R9
+        ld      [Bptr],Bval
+       add     T1,R2,T1                !=
+        add     Aval,A,Aval
+       add     T1,T2,T1
+        st      Aval,[Aptr]
+       add     C,T1,C                  !=
+       sll     C,15,T2
+        add     Dval,D,Dval
+       srl     C,32-15,C
+       or      C,T2,C                  !=
+        st      Dval,[Dptr]
+       add     C,D,C
+
+       orn     C,A,T1
+       sethi   %hi(0xeb86d391),T2      !=
+       xor     T1,D,T1
+       or      T2,%lo(0xeb86d391),T2
+       add     T1,R9,T1
+       !pre-LOADed     X(0),R0
+        mov     Aval,A                 !=
+       add     T1,T2,T1
+        mov     Dval,D
+       add     B,T1,B
+       sll     B,21,T2                 !=
+        add     Cval,C,Cval
+       srl     B,32-21,B
+        st      Cval,[Cptr]
+       or      B,T2,B                  !=
+       add     B,C,B
+
+       deccc   %i2
+       mov     Cval,C
+       add     B,Bval,B                !=
+       inc     64,%i1
+       nop
+       st      B,[Bptr]
+       nop                             !=
+
+#ifdef ULTRASPARC
+       bg,a,pt %icc,.Lmd5_block_loop
+#else
+       bg,a    .Lmd5_block_loop
+#endif
+       LOAD    X(0),R0
+
+#ifdef ASI_PRIMARY_LITTLE
+       mov     %o7,%asi
+#endif
+       ret
+       restore %g0,0,%o0
+
+.type  md5_block,#function
+.size  md5_block,(.-md5_block)
index 6e97fe1..148fb96 100644 (file)
@@ -67,23 +67,43 @@ extern "C" {
 #error MD5 is disabled.
 #endif
 
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ * ! MD5_LONG has to be at least 32 bits wide. If it's wider, then !
+ * ! MD5_LONG_LOG2 has to be defined as well.                      !
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#if defined(WIN16) || defined(__LP32__)
+#define MD5_LONG unsigned long
+#elif defined(_CRAY) || defined(__ILP64__)
+#define MD5_LONG unsigned long
+#define MD5_LONG_LOG2 3
+/*
+ * _CRAY note. I could declare short, but I have no idea what impact
+ * it has on performance on non-T3E machines. I could declare
+ * int, but at least on C90 sizeof(int) can be chosen at compile time.
+ * So I've chosen long...
+ *                                     <appro@fy.chalmers.se>
+ */
+#else
+#define MD5_LONG unsigned int
+#endif
+
 #define MD5_CBLOCK     64
-#define MD5_LBLOCK     16
-#define MD5_BLOCK      16
-#define MD5_LAST_BLOCK  56
-#define MD5_LENGTH_BLOCK 8
+#define MD5_LBLOCK     (MD5_CBLOCK/4)
 #define MD5_DIGEST_LENGTH 16
 
 typedef struct MD5state_st
        {
-       unsigned long A,B,C,D;
-       unsigned long Nl,Nh;
-       unsigned long data[MD5_LBLOCK];
+       MD5_LONG A,B,C,D;
+       MD5_LONG Nl,Nh;
+       MD5_LONG data[MD5_LBLOCK];
        int num;
        } MD5_CTX;
 
 void MD5_Init(MD5_CTX *c);
-void MD5_Update(MD5_CTX *c, const void *data, unsigned long len);
+void MD5_Update(MD5_CTX *c, const unsigned char *data, unsigned long len);
 void MD5_Final(unsigned char *md, MD5_CTX *c);
 unsigned char *MD5(unsigned char *d, unsigned long n, unsigned char *md);
 void MD5_Transform(MD5_CTX *c, unsigned char *b);
index 2671b00..830e04d 100644 (file)
@@ -70,12 +70,6 @@ char *MD5_version="MD5" OPENSSL_VERSION_PTEXT;
 #define INIT_DATA_C (unsigned long)0x98badcfeL
 #define INIT_DATA_D (unsigned long)0x10325476L
 
-#  ifdef MD5_ASM
-     void md5_block_x86(MD5_CTX *c, unsigned long *p,int num);
-#    define md5_block md5_block_x86
-#  else
-     static void md5_block(MD5_CTX *c, unsigned long *p,int num);
-#  endif
 void MD5_Init(MD5_CTX *c)
        {
        c->A=INIT_DATA_A;
@@ -87,183 +81,31 @@ void MD5_Init(MD5_CTX *c)
        c->num=0;
        }
 
-void MD5_Update(MD5_CTX *c, const void *_data, unsigned long len)
+#ifndef md5_block_host_order
+void md5_block_host_order (MD5_CTX *c, const MD5_LONG *X, int num)
        {
-       register const unsigned char *data=_data;
-       register ULONG *p;
-       int sw,sc;
-       ULONG l;
-
-       if (len == 0) return;
-
-       l=(c->Nl+(len<<3))&0xffffffffL;
-       /* 95-05-24 eay Fixed a bug with the overflow handling, thanks to
-        * Wei Dai <weidai@eskimo.com> for pointing it out. */
-       if (l < c->Nl) /* overflow */
-               c->Nh++;
-       c->Nh+=(len>>29);
-       c->Nl=l;
-
-       if (c->num != 0)
-               {
-               p=c->data;
-               sw=c->num>>2;
-               sc=c->num&0x03;
-
-               if ((c->num+len) >= MD5_CBLOCK)
-                       {
-                       l= p[sw];
-                       p_c2l(data,l,sc);
-                       p[sw++]=l;
-                       for (; sw<MD5_LBLOCK; sw++)
-                               {
-                               c2l(data,l);
-                               p[sw]=l;
-                               }
-                       len-=(MD5_CBLOCK-c->num);
-
-                       md5_block(c,p,64);
-                       c->num=0;
-                       /* drop through and do the rest */
-                       }
-               else
-                       {
-                       int ew,ec;
-
-                       c->num+=(int)len;
-                       if ((sc+len) < 4) /* ugly, add char's to a word */
-                               {
-                               l= p[sw];
-                               p_c2l_p(data,l,sc,len);
-                               p[sw]=l;
-                               }
-                       else
-                               {
-                               ew=(c->num>>2);
-                               ec=(c->num&0x03);
-                               l= p[sw];
-                               p_c2l(data,l,sc);
-                               p[sw++]=l;
-                               for (; sw < ew; sw++)
-                                       { c2l(data,l); p[sw]=l; }
-                               if (ec)
-                                       {
-                                       c2l_p(data,l,ec);
-                                       p[sw]=l;
-                                       }
-                               }
-                       return;
-                       }
-               }
-       /* we now can process the input data in blocks of MD5_CBLOCK
-        * chars and save the leftovers to c->data. */
-#ifdef L_ENDIAN
-       if ((((unsigned long)data)%sizeof(ULONG)) == 0)
-               {
-               sw=(int)len/MD5_CBLOCK;
-               if (sw > 0)
-                       {
-                       sw*=MD5_CBLOCK;
-                       md5_block(c,(ULONG *)data,sw);
-                       data+=sw;
-                       len-=sw;
-                       }
-               }
-#endif
-       p=c->data;
-       while (len >= MD5_CBLOCK)
-               {
-#if defined(L_ENDIAN) || defined(B_ENDIAN)
-               if (p != (unsigned long *)data)
-                       memcpy(p,data,MD5_CBLOCK);
-               data+=MD5_CBLOCK;
-#ifdef B_ENDIAN
-               for (sw=(MD5_LBLOCK/4); sw; sw--)
-                       {
-                       Endian_Reverse32(p[0]);
-                       Endian_Reverse32(p[1]);
-                       Endian_Reverse32(p[2]);
-                       Endian_Reverse32(p[3]);
-                       p+=4;
-                       }
-#endif
-#else
-               for (sw=(MD5_LBLOCK/4); sw; sw--)
-                       {
-                       c2l(data,l); *(p++)=l;
-                       c2l(data,l); *(p++)=l;
-                       c2l(data,l); *(p++)=l;
-                       c2l(data,l); *(p++)=l; 
-                       } 
-#endif
-               p=c->data;
-               md5_block(c,p,64);
-               len-=MD5_CBLOCK;
-               }
-       sc=(int)len;
-       c->num=sc;
-       if (sc)
-               {
-               sw=sc>>2;       /* words to copy */
-#ifdef L_ENDIAN
-               p[sw]=0;
-               memcpy(p,data,sc);
-#else
-               sc&=0x03;
-               for ( ; sw; sw--)
-                       { c2l(data,l); *(p++)=l; }
-               c2l_p(data,l,sc);
-               *p=l;
-#endif
-               }
-       }
-
-void MD5_Transform(MD5_CTX *c, unsigned char *b)
-       {
-       ULONG p[16];
-#if !defined(L_ENDIAN)
-       ULONG *q;
-       int i;
-#endif
-
-#if defined(B_ENDIAN) || defined(L_ENDIAN)
-       memcpy(p,b,64);
-#ifdef B_ENDIAN
-       q=p;
-       for (i=(MD5_LBLOCK/4); i; i--)
-               {
-               Endian_Reverse32(q[0]);
-               Endian_Reverse32(q[1]);
-               Endian_Reverse32(q[2]);
-               Endian_Reverse32(q[3]);
-               q+=4;
-               }
-#endif
-#else
-       q=p;
-       for (i=(MD5_LBLOCK/4); i; i--)
-               {
-               ULONG l;
-               c2l(b,l); *(q++)=l;
-               c2l(b,l); *(q++)=l;
-               c2l(b,l); *(q++)=l;
-               c2l(b,l); *(q++)=l; 
-               } 
-#endif
-       md5_block(c,p,64);
-       }
-
-#ifndef MD5_ASM
-
-static void md5_block(MD5_CTX *c, register ULONG *X, int num)
-       {
-       register ULONG A,B,C,D;
+       register unsigned long A,B,C,D;
+       /*
+        * In case you wonder why A-D are declared as long and not
+        * as MD5_LONG. Doing so results in slight performance
+        * boost on LP64 architectures. The catch is we don't
+        * really care if 32 MSBs of a 64-bit register get polluted
+        * with eventual overflows as we *save* only 32 LSBs in
+        * *either* case. Now declaring 'em long excuses the compiler
+        * from keeping 32 MSBs zeroed resulting in 13% performance
+        * improvement under SPARC Solaris7/64 and 5% under AlphaLinux.
+        * Well, to be honest it should say that this *prevents* 
+        * performance degradation.
+        *
+        *                              <appro@fy.chalmers.se>
+        */
 
        A=c->A;
        B=c->B;
        C=c->C;
        D=c->D;
-       for (;;)
+
+       for (;num--;X+=HASH_LBLOCK)
                {
        /* Round 0 */
        R0(A,B,C,D,X[ 0], 7,0xd76aa478L);
@@ -334,74 +176,127 @@ static void md5_block(MD5_CTX *c, register ULONG *X, int num)
        R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL);
        R3(B,C,D,A,X[ 9],21,0xeb86d391L);
 
-       A+=c->A&0xffffffffL;
-       B+=c->B&0xffffffffL;
-       c->A=A;
-       c->B=B;
-       C+=c->C&0xffffffffL;
-       D+=c->D&0xffffffffL;
-       c->C=C;
-       c->D=D;
-       X+=16;
-       num-=64;
-       if (num <= 0) break;
+       A = c->A += A;
+       B = c->B += B;
+       C = c->C += C;
+       D = c->D += D;
                }
        }
 #endif
 
-void MD5_Final(unsigned char *md, MD5_CTX *c)
+#ifndef md5_block_data_order
+void md5_block_data_order (MD5_CTX *c, const unsigned char *data, int num)
        {
-       register int i,j;
-       register ULONG l;
-       register ULONG *p;
-       static unsigned char end[4]={0x80,0x00,0x00,0x00};
-       unsigned char *cp=end;
+       register unsigned long A,B,C,D,l;
+       /*
+        * In case you wonder why A-D are declared as long and not
+        * as MD5_LONG. Doing so results in slight performance
+        * boost on LP64 architectures. The catch is we don't
+        * really care if 32 MSBs of a 64-bit register get polluted
+        * with eventual overflows as we *save* only 32 LSBs in
+        * *either* case. Now declaring 'em long excuses the compiler
+        * from keeping 32 MSBs zeroed resulting in 13% performance
+        * improvement under SPARC Solaris7/64 and 5% under AlphaLinux.
+        * Well, to be honest it should say that this *prevents* 
+        * performance degradation.
+        *
+        *                              <appro@fy.chalmers.se>
+        */
+       MD5_LONG X[MD5_LBLOCK];
+       /*
+        * In case you wonder why I don't use c->data for this.
+        * RISCs usually have plenty of registers and if X is
+        * declared as an automatic array, a good optimizing compiler
+        * will accommodate at least part of it in the register bank
+        * instead of memory.
+        *
+        *                              <appro@fy.chalmers.se>
+        */
 
-       /* c->num should definitly have room for at least one more byte. */
-       p=c->data;
-       j=c->num;
-       i=j>>2;
+       A=c->A;
+       B=c->B;
+       C=c->C;
+       D=c->D;
 
-       /* purify often complains about the following line as an
-        * Uninitialized Memory Read.  While this can be true, the
-        * following p_c2l macro will reset l when that case is true.
-        * This is because j&0x03 contains the number of 'valid' bytes
-        * already in p[i].  If and only if j&0x03 == 0, the UMR will
-        * occur but this is also the only time p_c2l will do
-        * l= *(cp++) instead of l|= *(cp++)
-        * Many thanks to Alex Tang <altitude@cic.net> for pickup this
-        * 'potential bug' */
-#ifdef PURIFY
-       if ((j&0x03) == 0) p[i]=0;
-#endif
-       l=p[i];
-       p_c2l(cp,l,j&0x03);
-       p[i]=l;
-       i++;
-       /* i is the next 'undefined word' */
-       if (c->num >= MD5_LAST_BLOCK)
+       for (;num--;)
                {
-               for (; i<MD5_LBLOCK; i++)
-                       p[i]=0;
-               md5_block(c,p,64);
-               i=0;
-               }
-       for (; i<(MD5_LBLOCK-2); i++)
-               p[i]=0;
-       p[MD5_LBLOCK-2]=c->Nl;
-       p[MD5_LBLOCK-1]=c->Nh;
-       md5_block(c,p,64);
-       cp=md;
-       l=c->A; l2c(l,cp);
-       l=c->B; l2c(l,cp);
-       l=c->C; l2c(l,cp);
-       l=c->D; l2c(l,cp);
+       HOST_c2l(data,l); X[ 0]=l;              HOST_c2l(data,l); X[ 1]=l;
+       /* Round 0 */
+       R0(A,B,C,D,X[ 0], 7,0xd76aa478L);       HOST_c2l(data,l); X[ 2]=l;
+       R0(D,A,B,C,X[ 1],12,0xe8c7b756L);       HOST_c2l(data,l); X[ 3]=l;
+       R0(C,D,A,B,X[ 2],17,0x242070dbL);       HOST_c2l(data,l); X[ 4]=l;
+       R0(B,C,D,A,X[ 3],22,0xc1bdceeeL);       HOST_c2l(data,l); X[ 5]=l;
+       R0(A,B,C,D,X[ 4], 7,0xf57c0fafL);       HOST_c2l(data,l); X[ 6]=l;
+       R0(D,A,B,C,X[ 5],12,0x4787c62aL);       HOST_c2l(data,l); X[ 7]=l;
+       R0(C,D,A,B,X[ 6],17,0xa8304613L);       HOST_c2l(data,l); X[ 8]=l;
+       R0(B,C,D,A,X[ 7],22,0xfd469501L);       HOST_c2l(data,l); X[ 9]=l;
+       R0(A,B,C,D,X[ 8], 7,0x698098d8L);       HOST_c2l(data,l); X[10]=l;
+       R0(D,A,B,C,X[ 9],12,0x8b44f7afL);       HOST_c2l(data,l); X[11]=l;
+       R0(C,D,A,B,X[10],17,0xffff5bb1L);       HOST_c2l(data,l); X[12]=l;
+       R0(B,C,D,A,X[11],22,0x895cd7beL);       HOST_c2l(data,l); X[13]=l;
+       R0(A,B,C,D,X[12], 7,0x6b901122L);       HOST_c2l(data,l); X[14]=l;
+       R0(D,A,B,C,X[13],12,0xfd987193L);       HOST_c2l(data,l); X[15]=l;
+       R0(C,D,A,B,X[14],17,0xa679438eL);
+       R0(B,C,D,A,X[15],22,0x49b40821L);
+       /* Round 1 */
+       R1(A,B,C,D,X[ 1], 5,0xf61e2562L);
+       R1(D,A,B,C,X[ 6], 9,0xc040b340L);
+       R1(C,D,A,B,X[11],14,0x265e5a51L);
+       R1(B,C,D,A,X[ 0],20,0xe9b6c7aaL);
+       R1(A,B,C,D,X[ 5], 5,0xd62f105dL);
+       R1(D,A,B,C,X[10], 9,0x02441453L);
+       R1(C,D,A,B,X[15],14,0xd8a1e681L);
+       R1(B,C,D,A,X[ 4],20,0xe7d3fbc8L);
+       R1(A,B,C,D,X[ 9], 5,0x21e1cde6L);
+       R1(D,A,B,C,X[14], 9,0xc33707d6L);
+       R1(C,D,A,B,X[ 3],14,0xf4d50d87L);
+       R1(B,C,D,A,X[ 8],20,0x455a14edL);
+       R1(A,B,C,D,X[13], 5,0xa9e3e905L);
+       R1(D,A,B,C,X[ 2], 9,0xfcefa3f8L);
+       R1(C,D,A,B,X[ 7],14,0x676f02d9L);
+       R1(B,C,D,A,X[12],20,0x8d2a4c8aL);
+       /* Round 2 */
+       R2(A,B,C,D,X[ 5], 4,0xfffa3942L);
+       R2(D,A,B,C,X[ 8],11,0x8771f681L);
+       R2(C,D,A,B,X[11],16,0x6d9d6122L);
+       R2(B,C,D,A,X[14],23,0xfde5380cL);
+       R2(A,B,C,D,X[ 1], 4,0xa4beea44L);
+       R2(D,A,B,C,X[ 4],11,0x4bdecfa9L);
+       R2(C,D,A,B,X[ 7],16,0xf6bb4b60L);
+       R2(B,C,D,A,X[10],23,0xbebfbc70L);
+       R2(A,B,C,D,X[13], 4,0x289b7ec6L);
+       R2(D,A,B,C,X[ 0],11,0xeaa127faL);
+       R2(C,D,A,B,X[ 3],16,0xd4ef3085L);
+       R2(B,C,D,A,X[ 6],23,0x04881d05L);
+       R2(A,B,C,D,X[ 9], 4,0xd9d4d039L);
+       R2(D,A,B,C,X[12],11,0xe6db99e5L);
+       R2(C,D,A,B,X[15],16,0x1fa27cf8L);
+       R2(B,C,D,A,X[ 2],23,0xc4ac5665L);
+       /* Round 3 */
+       R3(A,B,C,D,X[ 0], 6,0xf4292244L);
+       R3(D,A,B,C,X[ 7],10,0x432aff97L);
+       R3(C,D,A,B,X[14],15,0xab9423a7L);
+       R3(B,C,D,A,X[ 5],21,0xfc93a039L);
+       R3(A,B,C,D,X[12], 6,0x655b59c3L);
+       R3(D,A,B,C,X[ 3],10,0x8f0ccc92L);
+       R3(C,D,A,B,X[10],15,0xffeff47dL);
+       R3(B,C,D,A,X[ 1],21,0x85845dd1L);
+       R3(A,B,C,D,X[ 8], 6,0x6fa87e4fL);
+       R3(D,A,B,C,X[15],10,0xfe2ce6e0L);
+       R3(C,D,A,B,X[ 6],15,0xa3014314L);
+       R3(B,C,D,A,X[13],21,0x4e0811a1L);
+       R3(A,B,C,D,X[ 4], 6,0xf7537e82L);
+       R3(D,A,B,C,X[11],10,0xbd3af235L);
+       R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL);
+       R3(B,C,D,A,X[ 9],21,0xeb86d391L);
 
-       /* clear stuff, md5_block may be leaving some stuff on the stack
-        * but I'm not worried :-) */
-       c->num=0;
-/*     memset((char *)&c,0,sizeof(c));*/
+       A = c->A += A;
+       B = c->B += B;
+       C = c->C += C;
+       D = c->D += D;
+               }
        }
+#endif
 
 #ifdef undef
 int printit(unsigned long *l)
index fe7397a..56708ba 100644 (file)
  * [including the GNU Public Licence.]
  */
 
-/* On sparc, this actually slows things down :-( */
-#if defined(sun)
-#undef B_ENDIAN
-#endif
-
 #include <stdlib.h>
 #include <string.h>
 #include <openssl/md5.h>
 
-#define ULONG  unsigned long
-#define UCHAR  unsigned char
-#define UINT   unsigned int
-
-#undef c2l
-#define c2l(c,l)       (l = ((unsigned long)(*((c)++)))     , \
-                        l|=(((unsigned long)(*((c)++)))<< 8), \
-                        l|=(((unsigned long)(*((c)++)))<<16), \
-                        l|=(((unsigned long)(*((c)++)))<<24))
+#ifndef MD5_LONG_LOG2
+#define MD5_LONG_LOG2 2 /* default to 32 bits */
+#endif
 
-#undef p_c2l
-#define p_c2l(c,l,n)   { \
-                       switch (n) { \
-                       case 0: l =((unsigned long)(*((c)++))); \
-                       case 1: l|=((unsigned long)(*((c)++)))<< 8; \
-                       case 2: l|=((unsigned long)(*((c)++)))<<16; \
-                       case 3: l|=((unsigned long)(*((c)++)))<<24; \
-                               } \
-                       }
+#ifdef MD5_ASM
+# if defined(__i386) || defined(WIN32)
+#  define md5_block_host_order md5_block_asm_host_order
+# elif defined(__sparc) && defined(ULTRASPARC)
+   void md5_block_asm_data_order_aligned (MD5_CTX *c, const MD5_LONG *p,int num);
+#  define HASH_BLOCK_DATA_ORDER_ALIGNED md5_block_asm_data_order_aligned
+# endif
+#endif
 
-/* NOTE the pointer is not incremented at the end of this */
-#undef c2l_p
-#define c2l_p(c,l,n)   { \
-                       l=0; \
-                       (c)+=n; \
-                       switch (n) { \
-                       case 3: l =((unsigned long)(*(--(c))))<<16; \
-                       case 2: l|=((unsigned long)(*(--(c))))<< 8; \
-                       case 1: l|=((unsigned long)(*(--(c))))    ; \
-                               } \
-                       }
+void md5_block_host_order (MD5_CTX *c, const MD5_LONG *p,int num);
+void md5_block_data_order (MD5_CTX *c, const unsigned char *p,int num);
 
-#undef p_c2l_p
-#define p_c2l_p(c,l,sc,len) { \
-                       switch (sc) \
-                               { \
-                       case 0: l =((unsigned long)(*((c)++))); \
-                               if (--len == 0) break; \
-                       case 1: l|=((unsigned long)(*((c)++)))<< 8; \
-                               if (--len == 0) break; \
-                       case 2: l|=((unsigned long)(*((c)++)))<<16; \
-                               } \
-                       }
+#if defined(__i386)
+/*
+ * *_block_host_order is expected to handle aligned data while
+ * *_block_data_order - unaligned. As algorithm and host (x86)
+ * are in this case of the same "endianness" these two are
+ * otherwise indistinguishable. But normally you don't want to
+ * call the same function because unaligned access in places
+ * where alignment is expected is usually a "Bad Thing". Indeed,
+ * on RISCs you get punished with BUS ERROR signal or *severe*
+ * performance degradation. Intel CPUs are in turn perfectly
+ * capable of loading unaligned data without such drastic side
+ * effect. Yes, they say it's slower than aligned load, but no
+ * exception is generated and therefore performance degradation
+ * is *incomparable* with RISCs. What we should weigh here is
+ * the cost of unaligned access against the cost of aligning data.
+ * According to my measurements allowing unaligned access results
+ * in ~9% performance improvement on Pentium II operating at
+ * 266MHz. I won't be surprised if the difference will be higher
+ * on faster systems:-)
+ *
+ *                             <appro@fy.chalmers.se>
+ */
+#define md5_block_data_order   md5_block_host_order
+#endif
 
-#undef l2c
-#define l2c(l,c)       (*((c)++)=(unsigned char)(((l)    )&0xff), \
-                        *((c)++)=(unsigned char)(((l)>> 8)&0xff), \
-                        *((c)++)=(unsigned char)(((l)>>16)&0xff), \
-                        *((c)++)=(unsigned char)(((l)>>24)&0xff))
+#define DATA_ORDER_IS_LITTLE_ENDIAN
+
+#define HASH_LONG              MD5_LONG
+#define HASH_LONG_LOG2         MD5_LONG_LOG2
+#define HASH_CTX               MD5_CTX
+#define HASH_CBLOCK            MD5_CBLOCK
+#define HASH_LBLOCK            MD5_LBLOCK
+#define HASH_UPDATE            MD5_Update
+#define HASH_TRANSFORM         MD5_Transform
+#define HASH_FINAL             MD5_Final
+#define HASH_BLOCK_HOST_ORDER  md5_block_host_order
+#if defined(B_ENDIAN) || defined(md5_block_data_order)
+#define        HASH_BLOCK_DATA_ORDER   md5_block_data_order
+/*
+ * Little-endians (Intel and Alpha) feel better without this.
+ * It looks like memcpy does a better job than generic
+ * md5_block_data_order on copying-n-aligning input data.
+ * But frankly speaking I didn't expect such a result on Alpha.
+ * On the other hand I've got this with egcs-1.0.2 and if
+ * program is compiled with another (better?) compiler it
+ * might turn out other way around.
+ *
+ *                             <appro@fy.chalmers.se>
+ */
+#endif
 
-/* NOTE - c is not incremented as per l2c */
-#undef l2cn
-#define l2cn(l1,l2,c,n)        { \
-                       c+=n; \
-                       switch (n) { \
-                       case 8: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \
-                       case 7: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \
-                       case 6: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \
-                       case 5: *(--(c))=(unsigned char)(((l2)    )&0xff); \
-                       case 4: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \
-                       case 3: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \
-                       case 2: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \
-                       case 1: *(--(c))=(unsigned char)(((l1)    )&0xff); \
-                               } \
-                       }
+#include "../md32_common.h"
 
-/* A nice byte order reversal from Wei Dai <weidai@eskimo.com> */
-#if defined(WIN32)
-/* 5 instructions with rotate instruction, else 9 */
-#define Endian_Reverse32(a) \
-       { \
-       unsigned long l=(a); \
-       (a)=((ROTATE(l,8)&0x00FF00FF)|(ROTATE(l,24)&0xFF00FF00)); \
-       }
-#else
-/* 6 instructions with rotate instruction, else 8 */
-#define Endian_Reverse32(a) \
-       { \
-       unsigned long l=(a); \
-       l=(((l&0xFF00FF00)>>8L)|((l&0x00FF00FF)<<8L)); \
-       (a)=ROTATE(l,16L); \
-       }
-#endif
 /*
 #define        F(x,y,z)        (((x) & (y))  |  ((~(x)) & (z)))
 #define        G(x,y,z)        (((x) & (z))  |  ((y) & (~(z))))
 #define        H(b,c,d)        ((b) ^ (c) ^ (d))
 #define        I(b,c,d)        (((~(d)) | (b)) ^ (c))
 
-#undef ROTATE
-#if defined(WIN32)
-#define ROTATE(a,n)     _lrotl(a,n)
-#else
-#define ROTATE(a,n)     (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
-#endif
-
-
 #define R0(a,b,c,d,k,s,t) { \
        a+=((k)+(t)+F((b),(c),(d))); \
        a=ROTATE(a,s); \
index c761401..c98721f 100644 (file)
@@ -57,7 +57,8 @@
  */
 
 #include <stdio.h>
-#include "md5_locl.h"
+#include <string.h>
+#include <openssl/md5.h>
 
 unsigned char *MD5(unsigned char *d, unsigned long n, unsigned char *md)
        {