crypto/modes/asm/ghash-armv4.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # April 2010
  11 #
  12 # The module implements "4-bit" GCM GHASH function and underlying
  13 # single multiplication operation in GF(2^128). "4-bit" means that it
  14 # uses 256 bytes per-key table [+32 bytes shared table]. There is no
  15 # experimental performance data available yet. The only approximation
  16 # that can be made at this point is based on code size. Inner loop is
  17 # 32 instructions long and on single-issue core should execute in <40
  18 # cycles. Having verified that gcc 3.4 didn't unroll corresponding
  19 # loop, this assembler loop body was found to be ~3x smaller than
  20 # compiler-generated one...
  21 #
  22 # July 2010
  23 #
  24 # Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
  25 # Cortex A8 core and ~25 cycles per processed byte (which was observed
  26 # to be ~3 times faster than gcc-generated code:-)
  27 #
  28 # Note about the "528B" variant. In the ARM case it makes less sense to
  29 # implement it, for the following reasons:
  30 #
  31 # - performance improvement won't be anywhere near 50%, because the 128-
  32 #   bit shift operation is neatly fused with the 128-bit xor here, and the
  33 #   "528B" variant would eliminate only 4-5 instructions out of 32
  34 #   in the inner loop (meaning that estimated improvement is ~15%);
  35 # - ARM-based systems are often embedded ones and extra memory
  36 #   consumption might be unappreciated (for so little improvement);
  37 #
  38 # Byte order [in]dependence. =========================================
  39 #
  40 # The caller is expected to maintain a specific *dword* order in Htable,
  41 # namely with the *least* significant dword of each 128-bit value at the
  42 # *lower* address. This differs completely from the C code and has
  43 # everything to do with the ldm instruction and the order in which dwords
  44 # are "consumed" by the algorithm. *Byte* order within these dwords is in
  45 # turn whatever the *native* byte order of the current platform is. See
  46 # gcm128.c for a working example...
  47
  # Pick the output file: command-line arguments are consumed until one looks
  # like a plain file name with an extension ("name.ext"); everything before
  # it is skipped.  $output is left false when no such argument exists.
  48 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  # Redirect STDOUT only when a file name was found (otherwise the inherited
  # STDOUT is kept instead of being closed by a failing open), and stop with a
  # diagnostic if the file cannot be created.  The three-argument form keeps
  # the name from being parsed as part of the open mode.
  49 if ($output) { open(STDOUT,">",$output) or die "can't open $output: $!"; }
  50
  51 # Register allocation for the generated code.
  52 #
  53 # r0-r3: argument block
  54 ($Xi,$Htbl,$inp,$len)=qw(r0 r1 r2 r3);
  55
  56 # r4-r12 and r14: variables.  The Z registers are filled from the table
  57 # by ldmia and written back to Xi by Zsmash; the T registers receive the
  58 # table entry that is xor-ed into Z, and three of them serve as scratch
  59 # registers in Zsmash.
  60 ($Zll,$Zlh,$Zhl,$Zhh)=qw(r4 r5 r6 r7);
  61 ($Tll,$Tlh,$Thl,$Thh)=qw(r8 r9 r10 r11);
  62
  63 # r13 is stack pointer and r15 is program counter, which is why the
  64 # second nibble register skips from r12 to r14.
  65 ($nlo,$nhi)=qw(r12 r14);
  66
  67 # Aliases: $rem_4bit shares r2 with $inp (used in gcm_gmult_4bit) and
  68 # $cnt shares r3 with $len.
  69 ($rem_4bit,$cnt)=($inp,$len);
  70
  # Zsmash(@insns) - append to $code the instructions that store the 128-bit
  # value held in ($Zll,$Zlh,$Zhl,$Zhh) to the 16 bytes at [$Xi].
  #
  # Each 32-bit register is written byte by byte with its most significant
  # byte at the lowest address: $Zll goes to offsets 12..15, $Zlh to 8..11,
  # $Zhl to 4..7 and $Zhh to 0..3.  $Tlh, $Thl and $Thh are used as scratch.
  #
  # @insns holds up to four instructions (no leading tab, no newline); the
  # k-th one is emitted right after the stores of the k-th register.  When
  # fewer than four are supplied, a line holding only a tab is emitted for
  # each missing one, so the generated text does not depend on how the
  # arguments run out.
  #
  # No prototype is declared because the sub consumes @_; callers using the
  # "&Zsmash(...)" form and plain "Zsmash(...)" calls both work.
  71 sub Zsmash {
  72   my $i=12;
  73   my @args=@_;
  74   for ($Zll,$Zlh,$Zhl,$Zhh) {
  75     # can be reduced to single "str $_,[$Xi,$i]" on big-endian platforms
  76     $code.=<<___;
  77         mov     $Tlh,$_,lsr#8
  78         strb    $_,[$Xi,#$i+3]
  79         mov     $Thl,$_,lsr#16
  80         strb    $Tlh,[$Xi,#$i+2]
  81         mov     $Thh,$_,lsr#24
  82         strb    $Thl,[$Xi,#$i+1]
  83         strb    $Thh,[$Xi,#$i]
  84 ___
  85     $code.="\t".(@args ? shift(@args) : "")."\n";  # never concatenate undef
  86     $i-=4;
  87   }
  88 }
  89
  # Generated assembly, part 1: the rem_4bit table, the rem_4bit_get helper
  # and the body of gcm_ghash_4bit up to the point where Z is stored to Xi.
  #
  # rem_4bit is a table of 16 halfwords (32 bytes).  The code indexes it with
  # twice the low four bits of $Zll (the bits about to be shifted out) and
  # xors the loaded halfword, shifted left by 16, into $Zhh.
  #
  # rem_4bit_get is branched to from gcm_gmult_4bit; it leaves the address of
  # rem_4bit in $rem_4bit and branches back to .Lrem_4bit_got.
  #
  # NOTE: rem_4bit_get and gcm_ghash_4bit both locate the table by taking
  # their own entry address from pc ("sub ...,pc,#8"; pc reads as the current
  # instruction plus 8 in ARM state) and subtracting a fixed distance, 32 and
  # 48 bytes respectively.  Nothing may be inserted between rem_4bit and
  # gcm_ghash_4bit without adjusting those constants.
  #
  # gcm_ghash_4bit takes $Xi, $Htbl, $inp and $len.  The prologue turns $len
  # into an end pointer, pushes it together with r4-r11 and lr, and copies the
  # 32-byte rem_4bit table to the stack, where the loops address it relative
  # to sp.  .Louter runs once per 16 bytes of input: bytes are consumed from
  # offset 15 down to 0 ($cnt counts 14..0 in .Loop), each one xor-ed with the
  # byte of Xi at the same offset and split into a low nibble ($nlo, scaled by
  # 16 when added to $Htbl) and a high nibble ($nhi, already a multiple of 16
  # because of the 0xf0 mask).  Every nibble step shifts Z right by four bits
  # across its four registers while xor-ing in the selected 16-byte table
  # entry.  The "pl"-conditional instructions are skipped on the final pass,
  # when "subs" makes $cnt negative.
  #
  # After the loop the end pointer is re-loaded from [sp,#32] (just above the
  # table copy), $inp advances by 16, and $nhi receives $Zll so that the next
  # pass of .Louter can xor it with the next block's byte 15 instead of
  # re-reading Xi[15]; the 0xf0/0x0f masks discard the upper bits.
  90 $code=<<___;
  91 .text
  92 .code   32
  93
  94 .type   rem_4bit,%object
  95 .align  5
  96 rem_4bit:
  97 .short  0x0000,0x1C20,0x3840,0x2460
  98 .short  0x7080,0x6CA0,0x48C0,0x54E0
  99 .short  0xE100,0xFD20,0xD940,0xC560
 100 .short  0x9180,0x8DA0,0xA9C0,0xB5E0
 101 .size   rem_4bit,.-rem_4bit
 102
 103 .type   rem_4bit_get,%function
 104 rem_4bit_get:
 105         sub     $rem_4bit,pc,#8
 106         sub     $rem_4bit,$rem_4bit,#32 @ &rem_4bit
 107         b       .Lrem_4bit_got
 108         nop
 109 .size   rem_4bit_get,.-rem_4bit_get
 110
 111 .global gcm_ghash_4bit
 112 .type   gcm_ghash_4bit,%function
 113 gcm_ghash_4bit:
 114         sub     r12,pc,#8
 115         add     $len,$inp,$len          @ $len to point at the end
 116         stmdb   sp!,{r3-r11,lr}         @ save $len/end too
 117         sub     r12,r12,#48             @ &rem_4bit
 118
 119         ldmia   r12,{r4-r11}            @ copy rem_4bit ...
 120         stmdb   sp!,{r4-r11}            @ ... to stack
 121
 122         ldrb    $nlo,[$inp,#15]
 123         ldrb    $nhi,[$Xi,#15]
 124 .Louter:
 125         eor     $nlo,$nlo,$nhi
 126         and     $nhi,$nlo,#0xf0
 127         and     $nlo,$nlo,#0x0f
 128         mov     $cnt,#14
 129
 130         add     $Zhh,$Htbl,$nlo,lsl#4
 131         ldmia   $Zhh,{$Zll-$Zhh}        @ load Htbl[nlo]
 132         add     $Thh,$Htbl,$nhi
 133         ldrb    $nlo,[$inp,#14]
 134
 135         and     $nhi,$Zll,#0xf          @ rem
 136         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 137         add     $nhi,$nhi,$nhi
 138         eor     $Zll,$Tll,$Zll,lsr#4
 139         ldrh    $Tll,[sp,$nhi]          @ rem_4bit[rem]
 140         eor     $Zll,$Zll,$Zlh,lsl#28
 141         ldrb    $nhi,[$Xi,#14]
 142         eor     $Zlh,$Tlh,$Zlh,lsr#4
 143         eor     $Zlh,$Zlh,$Zhl,lsl#28
 144         eor     $Zhl,$Thl,$Zhl,lsr#4
 145         eor     $Zhl,$Zhl,$Zhh,lsl#28
 146         eor     $Zhh,$Thh,$Zhh,lsr#4
 147         eor     $nlo,$nlo,$nhi
 148         and     $nhi,$nlo,#0xf0
 149         and     $nlo,$nlo,#0x0f
 150         eor     $Zhh,$Zhh,$Tll,lsl#16
 151
 152 .Loop:
 153         add     $Thh,$Htbl,$nlo,lsl#4
 154         subs    $cnt,$cnt,#1
 155         and     $nlo,$Zll,#0xf          @ rem
 156         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
 157         add     $nlo,$nlo,$nlo
 158         eor     $Zll,$Tll,$Zll,lsr#4
 159         ldrh    $Tll,[sp,$nlo]          @ rem_4bit[rem]
 160         eor     $Zll,$Zll,$Zlh,lsl#28
 161         eor     $Zlh,$Tlh,$Zlh,lsr#4
 162         eor     $Zlh,$Zlh,$Zhl,lsl#28
 163         eor     $Zhl,$Thl,$Zhl,lsr#4
 164         eor     $Zhl,$Zhl,$Zhh,lsl#28
 165         eor     $Zhh,$Thh,$Zhh,lsr#4
 166         ldrplb  $nlo,[$inp,$cnt]
 167
 168         add     $Thh,$Htbl,$nhi
 169         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 170         and     $nhi,$Zll,#0xf          @ rem
 171         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 172         add     $nhi,$nhi,$nhi
 173         eor     $Zll,$Tll,$Zll,lsr#4
 174         ldrh    $Tll,[sp,$nhi]          @ rem_4bit[rem]
 175         eor     $Zll,$Zll,$Zlh,lsl#28
 176         eor     $Zlh,$Tlh,$Zlh,lsr#4
 177         ldrplb  $nhi,[$Xi,$cnt]
 178         eor     $Zlh,$Zlh,$Zhl,lsl#28
 179         eor     $Zhl,$Thl,$Zhl,lsr#4
 180         eor     $Zhl,$Zhl,$Zhh,lsl#28
 181         eorpl   $nlo,$nlo,$nhi
 182         eor     $Zhh,$Thh,$Zhh,lsr#4
 183         andpl   $nhi,$nlo,#0xf0
 184         andpl   $nlo,$nlo,#0x0f
 185         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 186         bpl     .Loop
 187
 188         ldr     $len,[sp,#32]           @ re-load $len/end
 189         add     $inp,$inp,#16
 190         mov     $nhi,$Zll
 191 ___
             # Store Z to Xi.  The end-of-input compare and the conditional
             # load of the next block's byte 15 are passed to Zsmash, which
             # emits them after the first and second group of byte stores.
 192         &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
 # Generated assembly, part 2: the tail of gcm_ghash_4bit and the body of
 # gcm_gmult_4bit.
 #
 # "bne .Louter" acts on the flags of the "cmp $inp,$len" that was emitted
 # through Zsmash just before this text.  On exit 36 bytes are dropped from
 # the stack (the 32-byte rem_4bit copy plus the saved end pointer) before
 # r4-r11 and lr are restored.  The return sequence is "moveq pc,lr" when
 # bit 0 of lr is clear and "bx lr" otherwise; the post-processing at the
 # end of this script rewrites "bx lr" as a .word.
 #
 # gcm_gmult_4bit reads only $Xi and $Htbl from its arguments; r2 and r3 are
 # reused as $rem_4bit and $cnt.  It obtains the address of rem_4bit through
 # rem_4bit_get (resuming at .Lrem_4bit_got) and indexes the table in place
 # rather than through a stack copy.  The nibble processing mirrors
 # gcm_ghash_4bit, except that the bytes come from Xi alone: offset 15 first,
 # then offsets 14..0 in .Loop2.
 193 $code.=<<___;
 194         bne     .Louter
 195
 196         add     sp,sp,#36
 197         ldmia   sp!,{r4-r11,lr}
 198         tst     lr,#1
 199         moveq   pc,lr                   @ be binary compatible with V4, yet
 200         bx      lr                      @ interoperable with Thumb ISA:-)
 201 .size   gcm_ghash_4bit,.-gcm_ghash_4bit
 202
 203 .global gcm_gmult_4bit
 204 .type   gcm_gmult_4bit,%function
 205 gcm_gmult_4bit:
 206         stmdb   sp!,{r4-r11,lr}
 207         ldrb    $nlo,[$Xi,#15]
 208         b       rem_4bit_get
 209 .Lrem_4bit_got:
 210         and     $nhi,$nlo,#0xf0
 211         and     $nlo,$nlo,#0x0f
 212         mov     $cnt,#14
 213
 214         add     $Zhh,$Htbl,$nlo,lsl#4
 215         ldmia   $Zhh,{$Zll-$Zhh}        @ load Htbl[nlo]
 216         ldrb    $nlo,[$Xi,#14]
 217
 218         add     $Thh,$Htbl,$nhi
 219         and     $nhi,$Zll,#0xf          @ rem
 220         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 221         add     $nhi,$nhi,$nhi
 222         eor     $Zll,$Tll,$Zll,lsr#4
 223         ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
 224         eor     $Zll,$Zll,$Zlh,lsl#28
 225         eor     $Zlh,$Tlh,$Zlh,lsr#4
 226         eor     $Zlh,$Zlh,$Zhl,lsl#28
 227         eor     $Zhl,$Thl,$Zhl,lsr#4
 228         eor     $Zhl,$Zhl,$Zhh,lsl#28
 229         eor     $Zhh,$Thh,$Zhh,lsr#4
 230         and     $nhi,$nlo,#0xf0
 231         eor     $Zhh,$Zhh,$Tll,lsl#16
 232         and     $nlo,$nlo,#0x0f
 233
 234 .Loop2:
 235         add     $Thh,$Htbl,$nlo,lsl#4
 236         subs    $cnt,$cnt,#1
 237         and     $nlo,$Zll,#0xf          @ rem
 238         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
 239         add     $nlo,$nlo,$nlo
 240         eor     $Zll,$Tll,$Zll,lsr#4
 241         ldrh    $Tll,[$rem_4bit,$nlo]   @ rem_4bit[rem]
 242         eor     $Zll,$Zll,$Zlh,lsl#28
 243         eor     $Zlh,$Tlh,$Zlh,lsr#4
 244         eor     $Zlh,$Zlh,$Zhl,lsl#28
 245         eor     $Zhl,$Thl,$Zhl,lsr#4
 246         eor     $Zhl,$Zhl,$Zhh,lsl#28
 247         eor     $Zhh,$Thh,$Zhh,lsr#4
 248         ldrplb  $nlo,[$Xi,$cnt]
 249
 250         add     $Thh,$Htbl,$nhi
 251         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 252         and     $nhi,$Zll,#0xf          @ rem
 253         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 254         add     $nhi,$nhi,$nhi
 255         eor     $Zll,$Tll,$Zll,lsr#4
 256         ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
 257         eor     $Zll,$Zll,$Zlh,lsl#28
 258         eor     $Zlh,$Tlh,$Zlh,lsr#4
 259         eor     $Zlh,$Zlh,$Zhl,lsl#28
 260         eor     $Zhl,$Thl,$Zhl,lsr#4
 261         eor     $Zhl,$Zhl,$Zhh,lsl#28
 262         eor     $Zhh,$Thh,$Zhh,lsr#4
 263         andpl   $nhi,$nlo,#0xf0
 264         andpl   $nlo,$nlo,#0x0f
 265         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 266         bpl     .Loop2
 267 ___
             # Store Z back to Xi; no filler instructions are passed here.
 268         &Zsmash();
 # Generated assembly, part 3: epilogue of gcm_gmult_4bit (restore r4-r11 and
 # lr, then the same lr-bit-0 return sequence used by gcm_ghash_4bit) and the
 # identification string.  The "\@" keeps Perl from treating "@openssl" as an
 # array to interpolate inside the here-document.
 269 $code.=<<___;
 270         ldmia   sp!,{r4-r11,lr}
 271         tst     lr,#1
 272         moveq   pc,lr                   @ be binary compatible with V4, yet
 273         bx      lr                      @ interoperable with Thumb ISA:-)
 274 .size   gcm_gmult_4bit,.-gcm_gmult_4bit
 275 .asciz  "GHASH for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 276 .align  2
 277 ___
 278
 # Post-process the generated text and write it out:
 #  - every expression enclosed in backticks is replaced by its evaluated
 #    value;
 #  - "bx lr" is replaced by its hand-assembled encoding so that assemblers
 #    targeting plain ARMv4 accept the file.
 # Output is buffered, so a failed write may only be reported when the handle
 # is closed; both print and close are therefore checked and a failure ends
 # the script with a non-zero status instead of leaving a truncated file
 # behind unnoticed.
 279 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 280 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
 281 print $code or die "error writing output: $!";
 282 close STDOUT or die "error closing STDOUT: $!"; # enforce flush