Kill unused macro and reimplement it for that single context it can
actually be used, namely x86* platforms [because they don't bomb on
unaligned access]. This resulted in 30-40% [depending on message
length] improvement for SHA-256 compiled with gcc and running on P4.
In the absence of an assembler implementation I give the compiler all
the help it can possibly get:-)

author     Andy Polyakov <appro@openssl.org>   Mon, 31 May 2004 12:06:27 +0000 (12:06 +0000)
committer  Andy Polyakov <appro@openssl.org>   Mon, 31 May 2004 12:06:27 +0000 (12:06 +0000)
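
The gist of the change, as a minimal standalone sketch (illustration only,
not OpenSSL code; the be_fetch32_* names are invented here, while the real
macro is HOST_c2l in crypto/md32_common.h): on x86 a big-endian 32-bit word
can be fetched with one possibly-unaligned load plus bswapl instead of four
byte loads and shifts.

#include <stdio.h>

/* Portable byte-by-byte fetch, equivalent to the generic HOST_c2l. */
static unsigned int be_fetch32_portable(const unsigned char *c)
{
    return ((unsigned int)c[0] << 24) | ((unsigned int)c[1] << 16) |
           ((unsigned int)c[2] <<  8) |  (unsigned int)c[3];
}

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
/* x86 shortcut: a single 32-bit load (unaligned access is tolerated)
 * followed by bswapl, mirroring the HOST_c2l added by this commit. */
static unsigned int be_fetch32_bswap(const unsigned char *c)
{
    unsigned int l = *(const unsigned int *)c;
    asm("bswapl %0" : "=r"(l) : "0"(l));
    return l;
}
#endif

int main(void)
{
    static const unsigned char buf[5] = { 0x80, 0x01, 0x02, 0x03, 0x04 };

    /* Read at an offset of 1 into buf; both variants yield 0x01020304
     * regardless of how buf happens to be aligned. */
    printf("portable: %08x\n", be_fetch32_portable(buf + 1));
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    printf("bswap:    %08x\n", be_fetch32_bswap(buf + 1));
#endif
    return 0;
}

The SHA-256 block function performs sixteen such fetches per 64-byte
block, which is presumably where the reported 30-40% comes from.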

crypto/md32_common.h
crypto/sha/asm/sha512-sse2.pl

crypto/md32_common.h
index 535ea85d16de5a0a886570338a1a129a52b42e72..53db17e1bceb77b7f8863f518ed01e4ecd58f763 100644
    * Some GNU C inline assembler templates. Note that these are
    * rotates by *constant* number of bits! But that's exactly
    * what we need here...
-   *
    *                                   <appro@fy.chalmers.se>
    */
 #  if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
                        })
 #  endif
 # endif
-
-/*
- * Engage compiler specific "fetch in reverse byte order"
- * intrinsic function if available.
- */
-# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
-  /* some GNU C inline assembler templates by <appro@fy.chalmers.se> */
-#  if (defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)) && !defined(I386_ONLY)
-#   define BE_FETCH32(a)       ({ register unsigned int l=(a);\
-                               asm (                   \
-                               "bswapl %0"             \
-                               : "=r"(l) : "0"(l));    \
-                         l;                            \
-                       })
-#  elif defined(__powerpc)
-#   define LE_FETCH32(a)       ({ register unsigned int l;     \
-                               asm (                   \
-                               "lwbrx %0,0,%1"         \
-                               : "=r"(l)               \
-                               : "r"(a));              \
-                          l;                           \
-                       })
-
-#  elif defined(__sparc) && defined(OPENSSL_SYS_ULTRASPARC)
-#  define LE_FETCH32(a)        ({ register unsigned int l;             \
-                               asm (                           \
-                               "lda [%1]#ASI_PRIMARY_LITTLE,%0"\
-                               : "=r"(l)                       \
-                               : "r"(a));                      \
-                          l;                                   \
-                       })
-#  endif
-# endif
 #endif /* PEDANTIC */
 
 #if HASH_LONG_LOG2==2  /* Engage only if sizeof(HASH_LONG)== 4 */
 #    if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
 #      define HASH_BLOCK_DATA_ORDER_ALIGNED    HASH_BLOCK_HOST_ORDER
 #    endif
-#  elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
-#    ifndef HOST_FETCH32
-#      ifdef LE_FETCH32
-#        define HOST_FETCH32(p,l)      LE_FETCH32(p)
-#      elif defined(REVERSE_FETCH32)
-#        define HOST_FETCH32(p,l)      REVERSE_FETCH32(p,l)
-#      endif
-#    endif
 #  endif
 #elif defined(L_ENDIAN)
 #  if defined(DATA_ORDER_IS_LITTLE_ENDIAN)
 #    if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
 #      define HASH_BLOCK_DATA_ORDER_ALIGNED    HASH_BLOCK_HOST_ORDER
 #    endif
-#  elif defined(DATA_ORDER_IS_BIG_ENDIAN)
-#    ifndef HOST_FETCH32
-#      ifdef BE_FETCH32
-#        define HOST_FETCH32(p,l)      BE_FETCH32(p)
-#      elif defined(REVERSE_FETCH32)
-#        define HOST_FETCH32(p,l)      REVERSE_FETCH32(p,l)
-#      endif
-#    endif
 #  endif
 #endif
 
 
 #if defined(DATA_ORDER_IS_BIG_ENDIAN)
 
+#ifndef PEDANTIC
+# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+#  if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
+    /*
+     * This gives ~30-40% performance improvement in SHA-256 compiled
+     * with gcc [on P4]. Well, the first macro does, to be frank. We can pull
+     * this trick on x86* platforms only, because these CPUs can fetch
+     * unaligned data without raising an exception.
+     */
+#   define HOST_c2l(c,l)       ({ (l)=*((const unsigned int *)(c));    \
+                                  asm ("bswapl %0":"=r"(l):"0"(l));    \
+                                  (c)+=4; (l);                         })
+#   define HOST_l2c(l,c)       ({ unsigned int r=(l);                  \
+                                  asm ("bswapl %0":"=r"(r):"0"(r));    \
+                                  *((unsigned int *)(c))=r; (c)+=4; r; })
+#  endif
+# endif
+#endif
+
+#ifndef HOST_c2l
 #define HOST_c2l(c,l)  (l =(((unsigned long)(*((c)++)))<<24),          \
                         l|=(((unsigned long)(*((c)++)))<<16),          \
                         l|=(((unsigned long)(*((c)++)))<< 8),          \
                         l|=(((unsigned long)(*((c)++)))    ),          \
                         l)
+#endif
 #define HOST_p_c2l(c,l,n)      {                                       \
                        switch (n) {                                    \
                        case 0: l =((unsigned long)(*((c)++)))<<24;     \
                        case 2: l|=((unsigned long)(*(--(c))))<<16;     \
                        case 1: l|=((unsigned long)(*(--(c))))<<24;     \
                                } }
+#ifndef HOST_l2c
 #define HOST_l2c(l,c)  (*((c)++)=(unsigned char)(((l)>>24)&0xff),      \
                         *((c)++)=(unsigned char)(((l)>>16)&0xff),      \
                         *((c)++)=(unsigned char)(((l)>> 8)&0xff),      \
                         *((c)++)=(unsigned char)(((l)    )&0xff),      \
                         l)
+#endif
 
 #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
 
+#if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
+  /* See comment in DATA_ORDER_IS_BIG_ENDIAN section. */
+# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4, l)
+# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4, l)
+#endif
+
+#ifndef HOST_c2l
 #define HOST_c2l(c,l)  (l =(((unsigned long)(*((c)++)))    ),          \
                         l|=(((unsigned long)(*((c)++)))<< 8),          \
                         l|=(((unsigned long)(*((c)++)))<<16),          \
                         l|=(((unsigned long)(*((c)++)))<<24),          \
                         l)
+#endif
 #define HOST_p_c2l(c,l,n)      {                                       \
                        switch (n) {                                    \
                        case 0: l =((unsigned long)(*((c)++)));         \
                        case 2: l|=((unsigned long)(*(--(c))))<< 8;     \
                        case 1: l|=((unsigned long)(*(--(c))));         \
                                } }
+#ifndef HOST_l2c
 #define HOST_l2c(l,c)  (*((c)++)=(unsigned char)(((l)    )&0xff),      \
                         *((c)++)=(unsigned char)(((l)>> 8)&0xff),      \
                         *((c)++)=(unsigned char)(((l)>>16)&0xff),      \
                         *((c)++)=(unsigned char)(((l)>>24)&0xff),      \
                         l)
+#endif
 
 #endif
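
The DATA_ORDER_IS_LITTLE_ENDIAN branch above needs no byte swap on a
little-endian x86 host, so there HOST_c2l collapses to a plain 32-bit load
that may be unaligned. A standalone sketch of that case (again illustrative
only, with invented le_fetch32_* names):

#include <stdio.h>

/* Portable byte-by-byte fetch, equivalent to the generic
 * little-endian HOST_c2l. */
static unsigned int le_fetch32_portable(const unsigned char *c)
{
    return  (unsigned int)c[0]        | ((unsigned int)c[1] <<  8) |
           ((unsigned int)c[2] << 16) | ((unsigned int)c[3] << 24);
}

#if defined(__i386__) || defined(__x86_64__)
/* x86 shortcut: one load, no swap needed, unaligned access tolerated. */
static unsigned int le_fetch32_direct(const unsigned char *c)
{
    return *(const unsigned int *)c;
}
#endif

int main(void)
{
    static const unsigned char buf[5] = { 0xff, 0x04, 0x03, 0x02, 0x01 };

    printf("portable: %08x\n", le_fetch32_portable(buf + 1));
#if defined(__i386__) || defined(__x86_64__)
    printf("direct:   %08x\n", le_fetch32_direct(buf + 1));   /* same value */
#endif
    return 0;
}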
 
@@ -415,7 +398,7 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len)
        const unsigned char *data=data_;
        register HASH_LONG * p;
        register HASH_LONG l;
-       int sw,sc,ew,ec;
+       unsigned int sw,sc,ew,ec;
 
        if (len==0) return 1;
 
@@ -481,7 +464,7 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len)
                 * Note that HASH_BLOCK_DATA_ORDER_ALIGNED gets defined
                 * only if sizeof(HASH_LONG)==4.
                 */
-               if ((((unsigned long)data)%4) == 0)
+               if ((((size_t)data)%4) == 0)
                        {
                        /* data is properly aligned so that we can cast it: */
                        HASH_BLOCK_DATA_ORDER_ALIGNED (c,(const HASH_LONG *)data,sw);
@@ -530,7 +513,7 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len)
 void HASH_TRANSFORM (HASH_CTX *c, const unsigned char *data)
        {
 #if defined(HASH_BLOCK_DATA_ORDER_ALIGNED)
-       if ((((unsigned long)data)%4) == 0)
+       if ((((size_t)data)%4) == 0)
                /* data is properly aligned so that we can cast it: */
                HASH_BLOCK_DATA_ORDER_ALIGNED (c,(const HASH_LONG *)data,1);
        else
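
The HASH_UPDATE/HASH_TRANSFORM hunks above also change the pointer cast in
the alignment test from unsigned long to size_t, presumably so the cast
stays lossless on targets where unsigned long is narrower than a pointer
(e.g. LLP64/Win64). A minimal sketch of that test (the aligned4 helper is
invented for illustration):

#include <stdio.h>
#include <stddef.h>

/* Same test HASH_UPDATE performs on "data" before calling
 * HASH_BLOCK_DATA_ORDER_ALIGNED: is the pointer 4-byte aligned? */
static int aligned4(const void *p)
{
    return ((size_t)p % 4) == 0;
}

int main(void)
{
    unsigned char buf[8];

    /* buf and buf+1 can never both be 4-byte aligned. */
    printf("buf:   %d\n", aligned4(buf));
    printf("buf+1: %d\n", aligned4(buf + 1));
    return 0;
}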
crypto/sha/asm/sha512-sse2.pl
index 4235dbae826b42eeeda7a061a6c2daeb27d0fe8f..797aedacd72eae2d3e44cf7bad58ba03dc9a3a34 100644
 # Throughput performance in MBps (larger is better):
 #
 #              2.4GHz P4       1.4GHz AMD32    1.4GHz AMD64(*)
-# SHA256/gcc(*)        39              42              59
+# SHA256/gcc(*)        54              43              59
 # SHA512/gcc   17              23              92
 # SHA512/sse2  54(**)          55(**)
 # SHA512/icc   26              28
-# SHA256/icc(*)        64              54
+# SHA256/icc(*)        65              54
 #
 # (*)  AMD64 and SHA256 numbers are presented mostly for amusement or
 #      reference purposes.