Kill unused macro and reimplement it for that single context it can
actually be used, namely x86* platforms [because they don't bomb on
unaligned access]. This resulted in 30-40% [depending on message
length] improvement for SHA-256 compiled with gcc and running on P4.
In the absence of an assembler implementation I give the compiler all
the help it can possibly get:-)

author     Andy Polyakov <appro@openssl.org>   Mon, 31 May 2004 12:06:27 +0000 (12:06 +0000)
committer  Andy Polyakov <appro@openssl.org>   Mon, 31 May 2004 12:06:27 +0000 (12:06 +0000)
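
The gist of the change, as a minimal standalone sketch (illustration only,
not OpenSSL code; the be_fetch32_* names are invented here, while the real
macro is HOST_c2l in crypto/md32_common.h): on x86 a big-endian 32-bit word
can be fetched with one possibly-unaligned load plus bswapl instead of four
byte loads and shifts.

#include <stdio.h>

/* Portable byte-by-byte fetch, equivalent to the generic HOST_c2l. */
static unsigned int be_fetch32_portable(const unsigned char *c)
{
    return ((unsigned int)c[0] << 24) | ((unsigned int)c[1] << 16) |
           ((unsigned int)c[2] <<  8) |  (unsigned int)c[3];
}

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
/* x86 shortcut: a single 32-bit load (unaligned access is tolerated)
 * followed by bswapl, mirroring the HOST_c2l added by this commit. */
static unsigned int be_fetch32_bswap(const unsigned char *c)
{
    unsigned int l = *(const unsigned int *)c;
    asm("bswapl %0" : "=r"(l) : "0"(l));
    return l;
}
#endif

int main(void)
{
    static const unsigned char buf[5] = { 0x80, 0x01, 0x02, 0x03, 0x04 };

    /* Read at an offset of 1 into buf; both variants yield 0x01020304
     * regardless of how buf happens to be aligned. */
    printf("portable: %08x\n", be_fetch32_portable(buf + 1));
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    printf("bswap:    %08x\n", be_fetch32_bswap(buf + 1));
#endif
    return 0;
}

The SHA-256 block function performs sixteen such fetches per 64-byte
block, which is presumably where the reported 30-40% comes from.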

crypto/md32_common.h
crypto/sha/asm/sha512-sse2.pl

crypto/md32_common.h
index 535ea85d16de5a0a886570338a1a129a52b42e72..53db17e1bceb77b7f8863f518ed01e4ecd58f763 100644
    * Some GNU C inline assembler templates. Note that these are
    * rotates by *constant* number of bits! But that's exactly
    * what we need here...
-   *
    *                                   <appro@fy.chalmers.se>
    */
 #  if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
                        })
 #  endif
 # endif
-
-/*
- * Engage compiler specific "fetch in reverse byte order"
- * intrinsic function if available.
- */
-# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
-  /* some GNU C inline assembler templates by <appro@fy.chalmers.se> */
-#  if (defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)) && !defined(I386_ONLY)
-#   define BE_FETCH32(a)       ({ register unsigned int l=(a);\
-                               asm (                   \
-                               "bswapl %0"             \
-                               : "=r"(l) : "0"(l));    \
-                         l;                            \
-                       })
-#  elif defined(__powerpc)
-#   define LE_FETCH32(a)       ({ register unsigned int l;     \
-                               asm (                   \
-                               "lwbrx %0,0,%1"         \
-                               : "=r"(l)               \
-                               : "r"(a));              \
-                          l;                           \
-                       })
-
-#  elif defined(__sparc) && defined(OPENSSL_SYS_ULTRASPARC)
-#  define LE_FETCH32(a)        ({ register unsigned int l;             \
-                               asm (                           \
-                               "lda [%1]#ASI_PRIMARY_LITTLE,%0"\
-                               : "=r"(l)                       \
-                               : "r"(a));                      \
-                          l;                                   \
-                       })
-#  endif
-# endif
 #endif /* PEDANTIC */
 
 #if HASH_LONG_LOG2==2  /* Engage only if sizeof(HASH_LONG)== 4 */
 #    if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
 #      define HASH_BLOCK_DATA_ORDER_ALIGNED    HASH_BLOCK_HOST_ORDER
 #    endif
-#  elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
-#    ifndef HOST_FETCH32
-#      ifdef LE_FETCH32
-#        define HOST_FETCH32(p,l)      LE_FETCH32(p)
-#      elif defined(REVERSE_FETCH32)
-#        define HOST_FETCH32(p,l)      REVERSE_FETCH32(p,l)
-#      endif
-#    endif
 #  endif
 #elif defined(L_ENDIAN)
 #  if defined(DATA_ORDER_IS_LITTLE_ENDIAN)
 #    if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
 #      define HASH_BLOCK_DATA_ORDER_ALIGNED    HASH_BLOCK_HOST_ORDER
 #    endif
-#  elif defined(DATA_ORDER_IS_BIG_ENDIAN)
-#    ifndef HOST_FETCH32
-#      ifdef BE_FETCH32
-#        define HOST_FETCH32(p,l)      BE_FETCH32(p)
-#      elif defined(REVERSE_FETCH32)
-#        define HOST_FETCH32(p,l)      REVERSE_FETCH32(p,l)
-#      endif
-#    endif
 #  endif
 #endif
 
 
 #if defined(DATA_ORDER_IS_BIG_ENDIAN)
 
+#ifndef PEDANTIC
+# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+#  if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
+    /*
+     * This gives ~30-40% performance improvement in SHA-256 compiled
+     * with gcc [on P4]. Well, the first macro does, to be frank. We can pull
+     * this trick on x86* platforms only, because these CPUs can fetch
+     * unaligned data without raising an exception.
+     */
+#   define HOST_c2l(c,l)       ({ (l)=*((const unsigned int *)(c));    \
+                                  asm ("bswapl %0":"=r"(l):"0"(l));    \
+                                  (c)+=4; (l);                         })
+#   define HOST_l2c(l,c)       ({ unsigned int r=(l);                  \
+                                  asm ("bswapl %0":"=r"(r):"0"(r));    \
+                                  *((unsigned int *)(c))=r; (c)+=4; r; })
+#  endif
+# endif
+#endif
+
+#ifndef HOST_c2l
 #define HOST_c2l(c,l)  (l =(((unsigned long)(*((c)++)))<<24),          \
                         l|=(((unsigned long)(*((c)++)))<<16),          \
                         l|=(((unsigned long)(*((c)++)))<< 8),          \
                         l|=(((unsigned long)(*((c)++)))    ),          \
                         l)
+#endif
 #define HOST_p_c2l(c,l,n)      {                                       \
                        switch (n) {                                    \
                        case 0: l =((unsigned long)(*((c)++)))<<24;     \
                        case 2: l|=((unsigned long)(*(--(c))))<<16;     \
                        case 1: l|=((unsigned long)(*(--(c))))<<24;     \
                                } }
+#ifndef HOST_l2c
 #define HOST_l2c(l,c)  (*((c)++)=(unsigned char)(((l)>>24)&0xff),      \
                         *((c)++)=(unsigned char)(((l)>>16)&0xff),      \
                         *((c)++)=(unsigned char)(((l)>> 8)&0xff),      \
                         *((c)++)=(unsigned char)(((l)    )&0xff),      \
                         l)
+#endif
 
 #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
 
+#if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
+  /* See comment in DATA_ORDER_IS_BIG_ENDIAN section. */
+# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4, l)
+# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4, l)
+#endif
+
+#ifndef HOST_c2l
 #define HOST_c2l(c,l)  (l =(((unsigned long)(*((c)++)))    ),          \
                         l|=(((unsigned long)(*((c)++)))<< 8),          \
                         l|=(((unsigned long)(*((c)++)))<<16),          \
                         l|=(((unsigned long)(*((c)++)))<<24),          \
                         l)
+#endif
 #define HOST_p_c2l(c,l,n)      {                                       \
                        switch (n) {                                    \
                        case 0: l =((unsigned long)(*((c)++)));         \
                        case 2: l|=((unsigned long)(*(--(c))))<< 8;     \
                        case 1: l|=((unsigned long)(*(--(c))));         \
                                } }
+#ifndef HOST_l2c
 #define HOST_l2c(l,c)  (*((c)++)=(unsigned char)(((l)    )&0xff),      \
                         *((c)++)=(unsigned char)(((l)>> 8)&0xff),      \
                         *((c)++)=(unsigned char)(((l)>>16)&0xff),      \
                         *((c)++)=(unsigned char)(((l)>>24)&0xff),      \
                         l)
+#endif
 
 #endif
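
The DATA_ORDER_IS_LITTLE_ENDIAN branch above needs no byte swap on a
little-endian x86 host, so there HOST_c2l collapses to a plain 32-bit load
that may be unaligned. A standalone sketch of that case (again illustrative
only, with invented le_fetch32_* names):

#include <stdio.h>

/* Portable byte-by-byte fetch, equivalent to the generic
 * little-endian HOST_c2l. */
static unsigned int le_fetch32_portable(const unsigned char *c)
{
    return  (unsigned int)c[0]        | ((unsigned int)c[1] <<  8) |
           ((unsigned int)c[2] << 16) | ((unsigned int)c[3] << 24);
}

#if defined(__i386__) || defined(__x86_64__)
/* x86 shortcut: one load, no swap needed, unaligned access tolerated. */
static unsigned int le_fetch32_direct(const unsigned char *c)
{
    return *(const unsigned int *)c;
}
#endif

int main(void)
{
    static const unsigned char buf[5] = { 0xff, 0x04, 0x03, 0x02, 0x01 };

    printf("portable: %08x\n", le_fetch32_portable(buf + 1));
#if defined(__i386__) || defined(__x86_64__)
    printf("direct:   %08x\n", le_fetch32_direct(buf + 1));   /* same value */
#endif
    return 0;
}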
 
@@ -415,7 +398,7 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len)
        const unsigned char *data=data_;
        register HASH_LONG * p;
        register HASH_LONG l;
-       int sw,sc,ew,ec;
+       unsigned int sw,sc,ew,ec;
 
        if (len==0) return 1;
 
@@ -481,7 +464,7 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len)
                 * Note that HASH_BLOCK_DATA_ORDER_ALIGNED gets defined
                 * only if sizeof(HASH_LONG)==4.
                 */
-               if ((((unsigned long)data)%4) == 0)
+               if ((((size_t)data)%4) == 0)
                        {
                        /* data is properly aligned so that we can cast it: */
                        HASH_BLOCK_DATA_ORDER_ALIGNED (c,(const HASH_LONG *)data,sw);
@@ -530,7 +513,7 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len)
 void HASH_TRANSFORM (HASH_CTX *c, const unsigned char *data)
        {
 #if defined(HASH_BLOCK_DATA_ORDER_ALIGNED)
-       if ((((unsigned long)data)%4) == 0)
+       if ((((size_t)data)%4) == 0)
                /* data is properly aligned so that we can cast it: */
                HASH_BLOCK_DATA_ORDER_ALIGNED (c,(const HASH_LONG *)data,1);
        else
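
The HASH_UPDATE/HASH_TRANSFORM hunks above also change the pointer cast in
the alignment test from unsigned long to size_t, presumably so the cast
stays lossless on targets where unsigned long is narrower than a pointer
(e.g. LLP64/Win64). A minimal sketch of that test (the aligned4 helper is
invented for illustration):

#include <stdio.h>
#include <stddef.h>

/* Same test HASH_UPDATE performs on "data" before calling
 * HASH_BLOCK_DATA_ORDER_ALIGNED: is the pointer 4-byte aligned? */
static int aligned4(const void *p)
{
    return ((size_t)p % 4) == 0;
}

int main(void)
{
    unsigned char buf[8];

    /* buf and buf+1 can never both be 4-byte aligned. */
    printf("buf:   %d\n", aligned4(buf));
    printf("buf+1: %d\n", aligned4(buf + 1));
    return 0;
}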
crypto/sha/asm/sha512-sse2.pl
index 4235dbae826b42eeeda7a061a6c2daeb27d0fe8f..797aedacd72eae2d3e44cf7bad58ba03dc9a3a34 100644
 # Throughput performance in MBps (larger is better):
 #
 #              2.4GHz P4       1.4GHz AMD32    1.4GHz AMD64(*)
-# SHA256/gcc(*)        39              42              59
+# SHA256/gcc(*)        54              43              59
 # SHA512/gcc   17              23              92
 # SHA512/sse2  54(**)          55(**)
 # SHA512/icc   26              28
-# SHA256/icc(*)        64              54
+# SHA256/icc(*)        65              54
 #
 # (*)  AMD64 and SHA256 numbers are presented mostly for amusement or
 #      reference purposes.