# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# Note about "528B" variant. In ARM case it makes lesser sense to
+# implement it for following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+# bit shift operation is neatly fused with 128-bit xor here, and
+#   "528B" variant would eliminate only 4-5 instructions out of 32
+# in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+# consumption might be unappreciated (for so little improvement);
+#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# *native* byte order on current platform. See gcm128.c for working
# example...
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
$Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$rem_4bit=$inp; # used in gcm_gmult_4bit
$cnt=$len;
-$output=shift;
-open STDOUT,">$output";
-
sub Zsmash() {
my $i=12;
my @args=@_;
add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
+ add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,#14]
- add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
- mov $nhi,$nhi,lsl#1
+ add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eor $nlo,$nlo,$nhi
- eor $Zhh,$Zhh,$Tll,lsl#16
and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16
.Loop:
add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
and $nlo,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
add $nlo,$nlo,$nlo
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
- ldrplb $nhi,[$Xi,$cnt]
eor $Zlh,$Tlh,$Zlh,lsr#4
+ ldrplb $nhi,[$Xi,$cnt]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
- eor $Zhh,$Thh,$Zhh,lsr#4
eorpl $nlo,$nlo,$nhi
- eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
+ eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
ldr $len,[sp,#32] @ re-load $len/end
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
- mov $nhi,$nhi,lsl#1
+ add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
.Loop2:
add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
and $nlo,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
add $nlo,$nlo,$nlo
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
- ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
and $nhi,$Zll,#0xf @ rem
+ ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
- eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
andpl $nlo,$nlo,#0x0f
+ eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop2
___
&Zsmash();