crypto/bn/asm/sparcv9-gf2m.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # October 2012
  11 #
  12 # The module implements the bn_GF2m_mul_2x2 polynomial multiplication used
  13 # in bn_gf2m.c. For the time being it is a fairly mechanical, low-hanging
  14 # port from C, except that it has two code paths: one suitable
  15 # for all SPARCv9 processors and one for VIS3-capable ones. The former
  16 # delivers ~25-45% more, more for longer keys, heaviest DH and DSA
  17 # verify operations on venerable UltraSPARC II. On T4 VIS3 code is
  18 # ~100-230% faster than gcc-generated code and ~35-90% faster than
  19 # the pure SPARCv9 code path.
  20 # Generator setup: target ABI, table size, file preamble, register roles.
  21 # Any -m64 or -xarch=v9 argument selects the 64-bit ABI; 32-bit otherwise.
  22 $bits = (grep { /\-m64/ || /\-xarch\=v9/ } @ARGV) ? 64 : 32;
  23 # Stack bias and frame size used by the emitted save/add instructions.
  24 ($bias,$frame) = ($bits==64) ? (2047,192) : (0,112);
  25
  26 $locals = 8*16;                 # bytes reserved for the 16-entry table
  27
  28 $code .= <<___;
  29 #include <sparc_arch.h>
  30
  31 .section        ".text",#alloc,#execinstr
  32 ___
  33 if ($bits==64) { $code .= <<___; }
  34 .register       %g2,#scratch
  35 .register       %g3,#scratch
  36 ___
  37
  38 $tab = "%l0";                   # base address of the lookup table
  39 @T = map { '%g'.$_ } (2,3);     # temporaries for shifted values
  40 @i = map { '%g'.$_ } (4,5);     # table offsets, then the loaded entries
  41
  42 # %o0-%o5 hold masks and shifted/combined copies of a while the table is built.
  43 ($a1,$a2,$a4,$a8,$a12,$a48) = map { '%o'.$_ } (0..5);
  44 $lo = "%g1"; $hi = $a8; $b = "%o7"; $a = $lo;   # $hi shares %o3, $a shares %g1
  45 # Emit bn_GF2m_mul_2x2. Everything up to the closing ___ is assembly text
     # appended to $code, so it cannot carry Perl comments; the notes below
     # describe it from top to bottom.
     #
     # Inputs as used by the code: %o0 is the result pointer, %o1:%o2 are the
     # high:low 32-bit halves of a and %o3:%o4 those of b (presumably the
     # (r,a1,a0,b1,b0) arguments of the C caller -- confirm in bn_gf2m.c).
     # The 128-bit carry-less product is stored as four 32-bit words at
     # r+0 .. r+12, least significant word first.
     #
     # 1. Dispatch: OPENSSL_sparcv9cap_P[0] is tested against SPARCV9_VIS3.
     #    If the bit is set, a and b are each packed into one 64-bit register
     #    and multiplied with xmulx/xmulxhi, which are emitted as raw .word
     #    encodings (presumably so assemblers without VIS3 support can still
     #    build the file -- confirm). Otherwise control goes to .Lsoftware.
     # 2. .Lsoftware opens a register window with $frame+$locals bytes of
     #    stack and points $tab at the $locals area (%sp+$bias+$frame).
     # 3. Bits 63, 62 and 61 of a are broadcast with srax and ANDed with b;
     #    these three terms are later shifted into $lo/$hi by 63, 62 and 61.
     #    The remaining 61 bits of a are kept as a1, a2=a1<<1, a4=a1<<2 and
     #    a8=a1<<3, all of which fit in 64 bits.
     # 4. tab[k], k=0..15, is filled with the XOR of a1/a2/a4/a8 selected by
     #    the bits of k; the stores are interleaved with the XORs that
     #    prepare later entries.
     # 5. b is consumed four bits at a time. The nibble at bit offset 4n is
     #    turned into a byte offset by shifting b right by 4n-3 and masking
     #    with 0xf<<3; the entry loaded from tab is XORed into $lo shifted
     #    left by 4n and into $hi shifted right by 64-4n. This first chunk
     #    folds nibble 0, loads the entry for nibble 1 and prepares the table
     #    offset for nibble 2.
     #
     # NOTE(review): instructions indented by one extra space look like a
     # second, interleaved dependency chain -- confirm the convention.
  46 $code.=<<___;
  47 #ifdef __PIC__
  48 SPARC_PIC_THUNK(%g1)
  49 #endif
  50
  51 .globl  bn_GF2m_mul_2x2
  52 .align  16
  53 bn_GF2m_mul_2x2:
  54         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  55         ld      [%g1+0],%g1                     ! OPENSSL_sparcv9cap_P[0]
  56
  57         andcc   %g1, SPARCV9_VIS3, %g0
  58         bz,pn   %icc,.Lsoftware
  59         nop
  60
  61         sllx    %o1, 32, %o1
  62         sllx    %o3, 32, %o3
  63         or      %o2, %o1, %o1
  64         or      %o4, %o3, %o3
  65         .word   0x95b262ab                      ! xmulx   %o1, %o3, %o2
  66         .word   0x99b262cb                      ! xmulxhi %o1, %o3, %o4
  67         srlx    %o2, 32, %o1                    ! 13 cycles later
  68         st      %o2, [%o0+0]
  69         st      %o1, [%o0+4]
  70         srlx    %o4, 32, %o3
  71         st      %o4, [%o0+8]
  72         retl
  73         st      %o3, [%o0+12]
  74
  75 .align  16
  76 .Lsoftware:
  77         save    %sp,-$frame-$locals,%sp
  78
  79         sllx    %i1,32,$a
  80         mov     -1,$a12
  81         sllx    %i3,32,$b
  82         or      %i2,$a,$a
  83         srlx    $a12,1,$a48                     ! 0x7fff...
  84         or      %i4,$b,$b
  85         srlx    $a12,2,$a12                     ! 0x3fff...
  86         add     %sp,$bias+$frame,$tab
  87
  88         sllx    $a,2,$a4
  89         mov     $a,$a1
  90         sllx    $a,1,$a2
  91
  92         srax    $a4,63,@i[1]                    ! broadcast 61st bit
  93         and     $a48,$a4,$a4                    ! (a<<2)&0x7fff...
  94         srlx    $a48,2,$a48
  95         srax    $a2,63,@i[0]                    ! broadcast 62nd bit
  96         and     $a12,$a2,$a2                    ! (a<<1)&0x3fff...
  97         srax    $a1,63,$lo                      ! broadcast 63rd bit
  98         and     $a48,$a1,$a1                    ! (a<<0)&0x1fff...
  99
 100         sllx    $a1,3,$a8
 101         and     $b,$lo,$lo
 102         and     $b,@i[0],@i[0]
 103         and     $b,@i[1],@i[1]
 104
 105         stx     %g0,[$tab+0*8]                  ! tab[0]=0
 106         xor     $a1,$a2,$a12
 107         stx     $a1,[$tab+1*8]                  ! tab[1]=a1
 108         stx     $a2,[$tab+2*8]                  ! tab[2]=a2
 109          xor    $a4,$a8,$a48
 110         stx     $a12,[$tab+3*8]                 ! tab[3]=a1^a2
 111          xor    $a4,$a1,$a1
 112
 113         stx     $a4,[$tab+4*8]                  ! tab[4]=a4
 114         xor     $a4,$a2,$a2
 115         stx     $a1,[$tab+5*8]                  ! tab[5]=a1^a4
 116         xor     $a4,$a12,$a12
 117         stx     $a2,[$tab+6*8]                  ! tab[6]=a2^a4
 118          xor    $a48,$a1,$a1
 119         stx     $a12,[$tab+7*8]                 ! tab[7]=a1^a2^a4
 120          xor    $a48,$a2,$a2
 121
 122         stx     $a8,[$tab+8*8]                  ! tab[8]=a8
 123         xor     $a48,$a12,$a12
 124         stx     $a1,[$tab+9*8]                  ! tab[9]=a1^a8
 125          xor    $a4,$a1,$a1
 126         stx     $a2,[$tab+10*8]                 ! tab[10]=a2^a8
 127          xor    $a4,$a2,$a2
 128         stx     $a12,[$tab+11*8]                ! tab[11]=a1^a2^a8
 129
 130         xor     $a4,$a12,$a12
 131         stx     $a48,[$tab+12*8]                ! tab[12]=a4^a8
 132          srlx   $lo,1,$hi
 133         stx     $a1,[$tab+13*8]                 ! tab[13]=a1^a4^a8
 134          sllx   $lo,63,$lo
 135         stx     $a2,[$tab+14*8]                 ! tab[14]=a2^a4^a8
 136          srlx   @i[0],2,@T[0]
 137         stx     $a12,[$tab+15*8]                ! tab[15]=a1^a2^a4^a8
 138
 139         sllx    @i[0],62,$a1
 140          sllx   $b,3,@i[0]
 141         srlx    @i[1],3,@T[1]
 142          and    @i[0],`0xf<<3`,@i[0]
 143         sllx    @i[1],61,$a2
 144          ldx    [$tab+@i[0]],@i[0]
 145          srlx   $b,4-3,@i[1]
 146         xor     @T[0],$hi,$hi
 147          and    @i[1],`0xf<<3`,@i[1]
 148         xor     $a1,$lo,$lo
 149          ldx    [$tab+@i[1]],@i[1]
 150         xor     @T[1],$hi,$hi
 151
 152         xor     @i[0],$lo,$lo
 153         srlx    $b,8-3,@i[0]
 154          xor    $a2,$lo,$lo
 155         and     @i[0],`0xf<<3`,@i[0]
 156 ___
     # Stanzas for n = 1..13, fully unrolled at generation time. Each one
     # folds the table entry already loaded for nibble n into $lo/$hi, issues
     # the load for nibble n+1 and computes the table offset for nibble n+2.
     # The backtick expressions are turned into numbers by the substitution
     # that runs just before $code is printed.
 157 for($n=1;$n<14;$n++) {
 158 $code.=<<___;
 159         sllx    @i[1],`$n*4`,@T[0]
 160         ldx     [$tab+@i[0]],@i[0]
 161         srlx    @i[1],`64-$n*4`,@T[1]
 162         xor     @T[0],$lo,$lo
 163         srlx    $b,`($n+2)*4`-3,@i[1]
 164         xor     @T[1],$hi,$hi
 165         and     @i[1],`0xf<<3`,@i[1]
 166 ___
     # Swap the two registers in @i and in @T so that the next stanza reads
     # the entry just loaded and writes its new offset to the freed register.
 167         push(@i,shift(@i)); push(@T,shift(@T));
 168 }
     # Tail: the loop leaves $n == 14. Fold nibble 14, load and fold nibble 15
     # (shift counts $n*4 and ($n+1)*4), then store $lo and $hi as four 32-bit
     # words through %i0 and return with ret/restore.
 169 $code.=<<___;
 170         sllx    @i[1],`$n*4`,@T[0]
 171         ldx     [$tab+@i[0]],@i[0]
 172         srlx    @i[1],`64-$n*4`,@T[1]
 173         xor     @T[0],$lo,$lo
 174
 175         sllx    @i[0],`($n+1)*4`,@T[0]
 176          xor    @T[1],$hi,$hi
 177         srlx    @i[0],`64-($n+1)*4`,@T[1]
 178         xor     @T[0],$lo,$lo
 179         xor     @T[1],$hi,$hi
 180
 181         srlx    $lo,32,%i1
 182         st      $lo,[%i0+0]
 183         st      %i1,[%i0+4]
 184         srlx    $hi,32,%i2
 185         st      $hi,[%i0+8]
 186         st      %i2,[%i0+12]
 187
 188         ret
 189         restore
 190 .type   bn_GF2m_mul_2x2,#function
 191 .size   bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
 192 .asciz  "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
 193 .align  4
 194 ___
 195 # Post-process and emit the generated assembly. Every `expr` placeholder in
     # $code is replaced by the value of eval(expr), e.g. `0xf<<3` becomes 120,
     # and the result is written to STDOUT.
 196 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 197 print $code;
     # Buffered write errors (full disk, closed pipe) only surface when the
     # handle is closed; fail loudly instead of leaving truncated assembly
     # behind with a zero exit status.
 198 close STDOUT or die "error closing STDOUT: $!";