crypto/modes/asm/ghash-armv4.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # April 2010
  11 #
  12 # The module implements "4-bit" GCM GHASH function and underlying
  13 # single multiplication operation in GF(2^128). "4-bit" means that it
  14 # uses 256 bytes per-key table [+32 bytes shared table]. There is no
  15 # experimental performance data available yet. The only approximation
  16 # that can be made at this point is based on code size. Inner loop is
  17 # 32 instructions long and on single-issue core should execute in <40
  18 # cycles. Having verified that gcc 3.4 didn't unroll corresponding
  19 # loop, this assembler loop body was found to be ~3x smaller than
  20 # compiler-generated one...
  21 #
  22 # Note about the "528B" variant. On ARM it makes less sense to
  23 # implement it, for the following reasons:
  24 #
  25 # - performance improvement won't be anywhere near 50%, because 128-
  26 #   bit shift operation is neatly fused with 128-bit xor here, and
  27 #   "528B" variant would eliminate only 4-5 instructions out of 32
  28 #   in the inner loop (meaning that estimated improvement is ~15%);
  29 # - ARM-based systems are often embedded ones and extra memory
  30 #   consumption might be unappreciated (for so little improvement);
  31 #
  32 # Byte order [in]dependence. =========================================
  33 #
  34 # Caller is expected to maintain specific *dword* order in Htable,
  35 # namely with *least* significant dword of 128-bit value at *lower*
  36 # address. This differs completely from C code and has everything to
  37 # do with ldm instruction and order in which dwords are "consumed" by
  38 # algorithm. *Byte* order within these dwords in turn is whatever
  39 # *native* byte order on current platform. See gcm128.c for working
  40 # example...
  41
  42 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  43 open STDOUT,">$output";
  44
  45 $Xi="r0";       # argument block
  46 $Htbl="r1";
  47 $inp="r2";
  48 $len="r3";
  49 $Zll="r4";      # variables
  50 $Zlh="r5";
  51 $Zhl="r6";
  52 $Zhh="r7";
  53 $Tll="r8";
  54 $Tlh="r9";
  55 $Thl="r10";
  56 $Thh="r11";
  57 $nlo="r12";
  58 ################# r13 is stack pointer
  59 $nhi="r14";
  60 ################# r15 is program counter
  61
  62 $rem_4bit=$inp; # used in gcm_gmult_4bit
  63 $cnt=$len;
  64
  65 sub Zsmash() {
  66   my $i=12;
  67   my @args=@_;
  68   for ($Zll,$Zlh,$Zhl,$Zhh) {
  69     # can be reduced to single "str $_,[$Xi,$i]" on big-endian platforms
  70     $code.=<<___;
  71         mov     $Tlh,$_,lsr#8
  72         strb    $_,[$Xi,#$i+3]
  73         mov     $Thl,$_,lsr#16
  74         strb    $Tlh,[$Xi,#$i+2]
  75         mov     $Thh,$_,lsr#24
  76         strb    $Thl,[$Xi,#$i+1]
  77         strb    $Thh,[$Xi,#$i]
  78 ___
  79     $code.="\t".shift(@args)."\n";
  80     $i-=4;
  81   }
  82 }
  83
     # Start of the generated assembly (.text, ARM mode). This template emits:
     #  - rem_4bit: a table of sixteen 16-bit constants (32 bytes), indexed
     #    by the low 4 bits of the accumulator before each 4-bit shift;
     #  - rem_4bit_get: a helper that forms &rem_4bit from pc in $rem_4bit
     #    and branches to .Lrem_4bit_got inside gcm_gmult_4bit;
     #  - gcm_ghash_4bit($Xi,$Htbl,$inp,$len): turns $len into an end pointer,
     #    copies rem_4bit onto the stack and, for every 16-byte block of
     #    input, XORs the block into Xi one byte at a time (byte 15 down to
     #    byte 0) while looking up both nibbles of each byte in Htbl.
     # The pc-relative offsets #32 and #48 depend on the layout below:
     # rem_4bit is 32 bytes and rem_4bit_get is four instructions long.
     # Everything between the heredoc markers is emitted text, so nothing in
     # it can be annotated without changing the output.
  84 $code=<<___;
  85 .text
  86 .code   32
  87
  88 .type   rem_4bit,%object
  89 .align  5
  90 rem_4bit:
  91 .short  0x0000,0x1C20,0x3840,0x2460
  92 .short  0x7080,0x6CA0,0x48C0,0x54E0
  93 .short  0xE100,0xFD20,0xD940,0xC560
  94 .short  0x9180,0x8DA0,0xA9C0,0xB5E0
  95 .size   rem_4bit,.-rem_4bit
  96
  97 .type   rem_4bit_get,%function
  98 rem_4bit_get:
  99         sub     $rem_4bit,pc,#8
 100         sub     $rem_4bit,$rem_4bit,#32 @ &rem_4bit
 101         b       .Lrem_4bit_got
 102         nop
 103 .size   rem_4bit_get,.-rem_4bit_get
 104
 105 .global gcm_ghash_4bit
 106 .type   gcm_ghash_4bit,%function
 107 gcm_ghash_4bit:
 108         sub     r12,pc,#8
 109         add     $len,$inp,$len          @ $len to point at the end
 110         stmdb   sp!,{r3-r11,lr}         @ save $len/end too
 111         sub     r12,r12,#48             @ &rem_4bit
 112
 113         ldmia   r12,{r4-r11}            @ copy rem_4bit ...
 114         stmdb   sp!,{r4-r11}            @ ... to stack
 115
 116         ldrb    $nlo,[$inp,#15]
 117         ldrb    $nhi,[$Xi,#15]
 118 .Louter:
 119         eor     $nlo,$nlo,$nhi
 120         and     $nhi,$nlo,#0xf0
 121         and     $nlo,$nlo,#0x0f
 122         mov     $cnt,#14
 123
 124         add     $Zhh,$Htbl,$nlo,lsl#4
 125         ldmia   $Zhh,{$Zll-$Zhh}        @ load Htbl[nlo]
 126         ldrb    $nlo,[$inp,#14]
 127
 128         add     $Thh,$Htbl,$nhi
 129         and     $nhi,$Zll,#0xf          @ rem
 130         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 131         mov     $nhi,$nhi,lsl#1
 132         eor     $Zll,$Tll,$Zll,lsr#4
 133         ldrh    $Tll,[sp,$nhi]          @ rem_4bit[rem]
 134         eor     $Zll,$Zll,$Zlh,lsl#28
 135         ldrb    $nhi,[$Xi,#14]
 136         eor     $Zlh,$Tlh,$Zlh,lsr#4
 137         eor     $Zlh,$Zlh,$Zhl,lsl#28
 138         eor     $Zhl,$Thl,$Zhl,lsr#4
 139         eor     $Zhl,$Zhl,$Zhh,lsl#28
 140         eor     $Zhh,$Thh,$Zhh,lsr#4
 141         eor     $nlo,$nlo,$nhi
 142         eor     $Zhh,$Zhh,$Tll,lsl#16
 143         and     $nhi,$nlo,#0xf0
 144         and     $nlo,$nlo,#0x0f
 145
 146 .Loop:
 147         add     $Thh,$Htbl,$nlo,lsl#4
 148         subs    $cnt,$cnt,#1
 149         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
 150         and     $nlo,$Zll,#0xf          @ rem
 151         add     $nlo,$nlo,$nlo
 152         eor     $Zll,$Tll,$Zll,lsr#4
 153         ldrh    $Tll,[sp,$nlo]          @ rem_4bit[rem]
 154         eor     $Zll,$Zll,$Zlh,lsl#28
 155         eor     $Zlh,$Tlh,$Zlh,lsr#4
 156         eor     $Zlh,$Zlh,$Zhl,lsl#28
 157         eor     $Zhl,$Thl,$Zhl,lsr#4
 158         eor     $Zhl,$Zhl,$Zhh,lsl#28
 159         eor     $Zhh,$Thh,$Zhh,lsr#4
 160         ldrplb  $nlo,[$inp,$cnt]
 161
 162         add     $Thh,$Htbl,$nhi
 163         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 164         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 165         and     $nhi,$Zll,#0xf          @ rem
 166         add     $nhi,$nhi,$nhi
 167         eor     $Zll,$Tll,$Zll,lsr#4
 168         ldrh    $Tll,[sp,$nhi]          @ rem_4bit[rem]
 169         eor     $Zll,$Zll,$Zlh,lsl#28
 170         ldrplb  $nhi,[$Xi,$cnt]
 171         eor     $Zlh,$Tlh,$Zlh,lsr#4
 172         eor     $Zlh,$Zlh,$Zhl,lsl#28
 173         eor     $Zhl,$Thl,$Zhl,lsr#4
 174         eor     $Zhl,$Zhl,$Zhh,lsl#28
 175         eor     $Zhh,$Thh,$Zhh,lsr#4
 176         eorpl   $nlo,$nlo,$nhi
 177         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 178         andpl   $nhi,$nlo,#0xf0
 179         andpl   $nlo,$nlo,#0x0f
 180         bpl     .Loop
 181
 182         ldr     $len,[sp,#32]           @ re-load $len/end
 183         add     $inp,$inp,#16
 184         mov     $nhi,$Zll
 185 ___
     # Store the result back to Xi. The end-of-input comparison and the
     # conditional load of the next block's last byte are interleaved with
     # the byte stores; the "bne .Louter" emitted right after this call
     # relies on the flags set by that "cmp".
 186         &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
     # End of gcm_ghash_4bit: loop back while input remains, then drop the
     # stack copy of rem_4bit together with the saved end pointer (36 bytes),
     # restore r4-r11 and lr, and return.
     # gcm_gmult_4bit($Xi,$Htbl) follows. It runs the same nibble-by-nibble
     # sequence once, reading its bytes from Xi alone and indexing rem_4bit
     # through $rem_4bit (set up by rem_4bit_get) instead of a stack copy.
     # Everything between the heredoc markers is emitted text.
 187 $code.=<<___;
 188         bne     .Louter
 189
 190         add     sp,sp,#36
 191         ldmia   sp!,{r4-r11,lr}
 192         tst     lr,#1
 193         moveq   pc,lr                   @ be binary compatible with V4, yet
 194         bx      lr                      @ interoperable with Thumb ISA:-)
 195 .size   gcm_ghash_4bit,.-gcm_ghash_4bit
 196
 197 .global gcm_gmult_4bit
 198 .type   gcm_gmult_4bit,%function
 199 gcm_gmult_4bit:
 200         stmdb   sp!,{r4-r11,lr}
 201         ldrb    $nlo,[$Xi,#15]
 202         b       rem_4bit_get
 203 .Lrem_4bit_got:
 204         and     $nhi,$nlo,#0xf0
 205         and     $nlo,$nlo,#0x0f
 206         mov     $cnt,#14
 207
 208         add     $Zhh,$Htbl,$nlo,lsl#4
 209         ldmia   $Zhh,{$Zll-$Zhh}        @ load Htbl[nlo]
 210         ldrb    $nlo,[$Xi,#14]
 211
 212         add     $Thh,$Htbl,$nhi
 213         and     $nhi,$Zll,#0xf          @ rem
 214         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 215         mov     $nhi,$nhi,lsl#1
 216         eor     $Zll,$Tll,$Zll,lsr#4
 217         ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
 218         eor     $Zll,$Zll,$Zlh,lsl#28
 219         eor     $Zlh,$Tlh,$Zlh,lsr#4
 220         eor     $Zlh,$Zlh,$Zhl,lsl#28
 221         eor     $Zhl,$Thl,$Zhl,lsr#4
 222         eor     $Zhl,$Zhl,$Zhh,lsl#28
 223         eor     $Zhh,$Thh,$Zhh,lsr#4
 224         and     $nhi,$nlo,#0xf0
 225         eor     $Zhh,$Zhh,$Tll,lsl#16
 226         and     $nlo,$nlo,#0x0f
 227
 228 .Loop2:
 229         add     $Thh,$Htbl,$nlo,lsl#4
 230         subs    $cnt,$cnt,#1
 231         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
 232         and     $nlo,$Zll,#0xf          @ rem
 233         add     $nlo,$nlo,$nlo
 234         eor     $Zll,$Tll,$Zll,lsr#4
 235         ldrh    $Tll,[$rem_4bit,$nlo]   @ rem_4bit[rem]
 236         eor     $Zll,$Zll,$Zlh,lsl#28
 237         eor     $Zlh,$Tlh,$Zlh,lsr#4
 238         eor     $Zlh,$Zlh,$Zhl,lsl#28
 239         eor     $Zhl,$Thl,$Zhl,lsr#4
 240         eor     $Zhl,$Zhl,$Zhh,lsl#28
 241         eor     $Zhh,$Thh,$Zhh,lsr#4
 242         ldrplb  $nlo,[$Xi,$cnt]
 243
 244         add     $Thh,$Htbl,$nhi
 245         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 246         ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
 247         and     $nhi,$Zll,#0xf          @ rem
 248         add     $nhi,$nhi,$nhi
 249         eor     $Zll,$Tll,$Zll,lsr#4
 250         ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
 251         eor     $Zll,$Zll,$Zlh,lsl#28
 252         eor     $Zlh,$Tlh,$Zlh,lsr#4
 253         eor     $Zlh,$Zlh,$Zhl,lsl#28
 254         eor     $Zhl,$Thl,$Zhl,lsr#4
 255         eor     $Zhl,$Zhl,$Zhh,lsl#28
 256         eor     $Zhh,$Thh,$Zhh,lsr#4
 257         andpl   $nhi,$nlo,#0xf0
 258         eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
 259         andpl   $nlo,$nlo,#0x0f
 260         bpl     .Loop2
 261 ___
     # Store the product back to Xi; there is nothing to interleave here, so
     # no extra instructions are passed.
 262         &Zsmash();
     # Return sequence of gcm_gmult_4bit: restore r4-r11 and lr, then return
     # with "moveq pc,lr" when bit 0 of lr is clear and with "bx lr"
     # otherwise. The identification string and final alignment follow.
 263 $code.=<<___;
 264         ldmia   sp!,{r4-r11,lr}
 265         tst     lr,#1
 266         moveq   pc,lr                   @ be binary compatible with V4, yet
 267         bx      lr                      @ interoperable with Thumb ISA:-)
 268 .size   gcm_gmult_4bit,.-gcm_gmult_4bit
 269 .asciz  "GHASH for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 270 .align  2
 271 ___
 272
 273 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 274 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
 275 print $code;
 276 close STDOUT; # enforce flush