crypto/bn/asm/c64xplus-gf2m.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # February 2012
  11 #
  12 # The module implements bn_GF2m_mul_2x2 polynomial multiplication
  13 # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
  14 # C for the time being... The subroutine runs in 37 cycles, which is
  15 # 4.5x faster than compiler-generated code. Though the comparison is
  16 # totally unfair, because this module utilizes the Galois Field
  17 # Multiply instruction.
  18
  19 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
     # The loop above consumes command-line arguments until one ends in
     # something that looks like a file name (name, dot, extension) or the
     # arguments run out. Redirect STDOUT only if such a name was found;
     # otherwise keep writing to the inherited STDOUT instead of losing it
     # to a failed open. The three-argument form keeps the name from being
     # parsed for mode characters, and failure to create the file is fatal
     # rather than silently ignored.
  20 if (defined($output) && $output ne "") {
         open(STDOUT,">",$output) or die "can't open $output: $!";
     }
  21
     # Registers in which the five arguments (result pointer, a1, a0, b1, b0)
     # are expected on entry to the generated function.
  22 ($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");   # argument vector
  23
     # Scratch registers used by the mul_1x1_* templates: the two halfwords
     # of the A operand and their four partial products each (A16..A20 and
     # B16..B20), followed by the four bytes of the B operand.
  24 ($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
  25 ($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
  26 ($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
     # Operand registers for the second and third 1x1 products; they reuse
     # the registers named by $Alo and $B_1.
  27 ($A,$B)=($Alo,$B_1);
     # Register that the generated code loads with the 0xFF byte mask.
  28 $xFF="B1";
  29
  30 sub mul_1x1_upper {
     # Emit the first half of a 32x32-bit polynomial (GF(2)) multiplication
     # of $A by $B: $B is split into four bytes ($B_0..$B_3) and $A into two
     # halfwords ($Alo,$Ahi), then eight XORMPY instructions produce the
     # 16x8-bit partial products $Alox0..$Alox3 and $Ahix0..$Ahix3.
     # Nothing is combined here; mul_1x1_merged or mul_1x1_lower emits the
     # code that folds those partial products into a 64-bit result.
     #
     # $A, $B - names of the registers holding the two 32-bit operands.
     # The generated text is appended to the global $code.
     #
     # Note that the lexicals below shadow the file-level $A/$B inside the
     # template, while $Alo, $B_0, $xFF etc. are the file-level names.
  31 my ($A,$B)=@_;
  32 $code.=<<___;
  33         EXTU    $B,8,24,$B_2            ; smash $B to 4 bytes
  34 ||      AND     $B,$xFF,$B_0
  35 ||      SHRU    $B,24,$B_3
  36         SHRU    $A,16,   $Ahi           ; smash $A to two halfwords
  37 ||      EXTU    $A,16,16,$Alo
  38
  39         XORMPY  $Alo,$B_2,$Alox2        ; 16x8 bits muliplication
  40 ||      XORMPY  $Ahi,$B_2,$Ahix2
  41 ||      EXTU    $B,16,24,$B_1
  42         XORMPY  $Alo,$B_0,$Alox0
  43 ||      XORMPY  $Ahi,$B_0,$Ahix0
  44         XORMPY  $Alo,$B_3,$Alox3
  45 ||      XORMPY  $Ahi,$B_3,$Ahix3
  46         XORMPY  $Alo,$B_1,$Alox1
  47 ||      XORMPY  $Ahi,$B_1,$Ahix1
  48 ___
  49 }
  50 sub mul_1x1_merged {
     # Emit code that overlaps two multiplications: the instructions that
     # fold the partial products of the PREVIOUS multiplication into
     # $OUTlo/$OUThi (the same sequence mul_1x1_lower emits) are interleaved
     # with the split-and-XORMPY sequence for the NEXT multiplication of $A
     # by $B (the same sequence mul_1x1_upper emits). In the template below
     # the lines indented by one extra space belong to the next
     # multiplication.
     #
     # $OUTlo,$OUThi - registers receiving the low/high 32 bits of the
     #                 previous product.
     # $A,$B         - registers holding the operands of the next product.
     #
     # The new $Alo*$B_0 product is written to A1 and copied to $Alox0 only
     # in the last execute packet, because the previous $Alox0 is still read
     # by the folding code after that XORMPY has been issued.
     # The generated text is appended to the global $code.
  51 my ($OUTlo,$OUThi,$A,$B)=@_;
  52 $code.=<<___;
  53          EXTU   $B,8,24,$B_2            ; smash $B to 4 bytes
  54 ||       AND    $B,$xFF,$B_0
  55 ||       SHRU   $B,24,$B_3
  56          SHRU   $A,16,   $Ahi           ; smash $A to two halfwords
  57 ||       EXTU   $A,16,16,$Alo
  58
  59         XOR     $Ahix0,$Alox2,$Ahix0
  60 ||      MV      $Ahix2,$OUThi
  61 ||       XORMPY $Alo,$B_2,$Alox2
  62          XORMPY $Ahi,$B_2,$Ahix2
  63 ||       EXTU   $B,16,24,$B_1
  64 ||       XORMPY $Alo,$B_0,A1            ; $Alox0
  65         XOR     $Ahix1,$Alox3,$Ahix1
  66 ||      SHL     $Ahix0,16,$OUTlo
  67 ||      SHRU    $Ahix0,16,$Ahix0
  68         XOR     $Alox0,$OUTlo,$OUTlo
  69 ||      XOR     $Ahix0,$OUThi,$OUThi
  70 ||       XORMPY $Ahi,$B_0,$Ahix0
  71 ||       XORMPY $Alo,$B_3,$Alox3
  72 ||      SHL     $Alox1,8,$Alox1
  73 ||      SHL     $Ahix3,8,$Ahix3
  74         XOR     $Alox1,$OUTlo,$OUTlo
  75 ||      XOR     $Ahix3,$OUThi,$OUThi
  76 ||       XORMPY $Ahi,$B_3,$Ahix3
  77 ||      SHL     $Ahix1,24,$Alox1
  78 ||      SHRU    $Ahix1,8, $Ahix1
  79         XOR     $Alox1,$OUTlo,$OUTlo
  80 ||      XOR     $Ahix1,$OUThi,$OUThi
  81 ||       XORMPY $Alo,$B_1,$Alox1
  82 ||       XORMPY $Ahi,$B_1,$Ahix1
  83 ||       MV     A1,$Alox0
  84 ___
  85 }
  86 sub mul_1x1_lower {
     # Emit the final folding step for a multiplication whose partial
     # products were started by mul_1x1_upper or mul_1x1_merged: the eight
     # partial products are shifted to their bit positions and XORed
     # together, i.e.
     #   hi:lo = Alox0 ^ Alox1<<8 ^ (Alox2^Ahix0)<<16 ^ (Alox3^Ahix1)<<24
     #           ^ Ahix2<<32 ^ Ahix3<<40
     # with the low word left in $OUTlo and the high word in $OUThi.
     # $Ahix0, $Ahix1, $Alox1 and $Ahix3 are overwritten in the process.
     #
     # $OUTlo,$OUThi - registers receiving the low/high 32 bits of the
     #                 product.
     # The generated text is appended to the global $code.
  87 my ($OUTlo,$OUThi)=@_;
  88 $code.=<<___;
  89         ;NOP
  90         XOR     $Ahix0,$Alox2,$Ahix0
  91 ||      MV      $Ahix2,$OUThi
  92         NOP
  93         XOR     $Ahix1,$Alox3,$Ahix1
  94 ||      SHL     $Ahix0,16,$OUTlo
  95 ||      SHRU    $Ahix0,16,$Ahix0
  96         XOR     $Alox0,$OUTlo,$OUTlo
  97 ||      XOR     $Ahix0,$OUThi,$OUThi
  98 ||      SHL     $Alox1,8,$Alox1
  99 ||      SHL     $Ahix3,8,$Ahix3
 100         XOR     $Alox1,$OUTlo,$OUTlo
 101 ||      XOR     $Ahix3,$OUThi,$OUThi
 102 ||      SHL     $Ahix1,24,$Alox1
 103 ||      SHRU    $Ahix1,8, $Ahix1
 104         XOR     $Alox1,$OUTlo,$OUTlo
 105 ||      XOR     $Ahix1,$OUThi,$OUThi
 106 ___
 107 }
 108 $code.=<<___;
 109         .text
 110
 111         .if     .ASSEMBLER_VERSION<7000000
 112         .asg    0,__TI_EABI__
 113         .endif
 114         .if     __TI_EABI__
 115         .asg    bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
 116         .endif
 117
 118         .global _bn_GF2m_mul_2x2
 119 _bn_GF2m_mul_2x2:
 120         .asmfunc
 121         MVK     0xFF,$xFF
 122 ___
     # The template above emits the prologue: the .text section, the .asg
     # directives (default __TI_EABI__ to 0 when .ASSEMBLER_VERSION is below
     # 7000000; under EABI substitute bn_GF2m_mul_2x2 for _bn_GF2m_mul_2x2),
     # the global entry label, and the load of the 0xFF byte mask into $xFF.
     #
     # The body is a Karatsuba-style 2x2 multiplication built from three
     # 1x1 products: a0*b0, a1*b1 and (a0+a1)*(b0+b1), where "+" is XOR.
     # Each "||" line that opens a template below joins the last execute
     # packet emitted by the preceding mul_1x1_* call.
 123         &mul_1x1_upper($a0,$b0);                # a0·b0
 124 $code.=<<___;
 125 ||      MV      $b1,$B
 126         MV      $a1,$A
 127 ___
     # a1/b1 are now in $A/$B: fold a0*b0 into A28 (low word) and B28 (high
     # word) while the partial products of a1*b1 are being started.
 128         &mul_1x1_merged("A28","B28",$A,$B);     # a0·b0/a1·b1
 129 $code.=<<___;
 130 ||      XOR     $b0,$b1,$B
 131         XOR     $a0,$a1,$A
 132 ___
     # a0^a1 and b0^b1 are now in $A/$B: fold a1*b1 into A31/B31 while the
     # partial products of the third multiplication are being started.
 133         &mul_1x1_merged("A31","B31",$A,$B);     # a1·b1/(a0+a1)·(b0+b1)
 134 $code.=<<___;
 135         XOR     A28,A31,A29
 136 ||      XOR     B28,B31,B29                     ; a0·b0+a1·b1
 137 ___
     # A29/B29 hold a0*b0 ^ a1*b1; fold the third product into A30/B30.
 138         &mul_1x1_lower("A30","B30");            # (a0+a1)·(b0+b1)
     # Epilogue: BNOP B3 is issued in parallel with the last folding packet
     # (presumably the return branch, with the packets after it filling its
     # delay slots -- confirm against the C64x+ instruction set reference).
     # The middle term is XORed into result words 1 and 2, and the four
     # result words are stored to the array addressed by $rp.
 139 $code.=<<___;
 140 ||      BNOP    B3
 141         XOR     A29,A30,A30
 142 ||      XOR     B29,B30,B30                     ; (a0+a1)·(b0+b1)-a0·b0-a1·b1
 143         XOR     B28,A30,A30
 144 ||      STW     A28,*${rp}[0]
 145         XOR     B30,A31,A31
 146 ||      STW     A30,*${rp}[1]
 147         STW     A31,*${rp}[2]
 148         STW     B31,*${rp}[3]
 149         .endasmfunc
 150 ___
 151
 152 print $code;
     # Buffered write errors (disk full, I/O error) only surface when the
     # handle is closed, so a failed close must abort the build rather than
     # leave a truncated assembly file behind unnoticed.
 153 close STDOUT or die "error closing STDOUT: $!";