crypto/bn/asm/c64xplus-gf2m.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # February 2012
  18 #
  19 # The module implements bn_GF2m_mul_2x2 polynomial multiplication
   20 # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
  21 # C for the time being... The subroutine runs in 37 cycles, which is
  22 # 4.5x faster than compiler-generated code. Though comparison is
  23 # totally unfair, because this module utilizes Galois Field Multiply
  24 # instruction.
  25
  26 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
  27 open STDOUT,">$output";
  28
  29 ($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");   # argument vector
  30
  31 ($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
  32 ($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
  33 ($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
  34 ($A,$B)=($Alo,$B_1);
  35 $xFF="B1";
  36
   37 sub mul_1x1_upper {
      # Append to $code the opening half of one 1x1-word carry-less
      # multiplication: operand splitting plus the eight XORMPY issues.
      #
      # Arguments (register names, interpolated into the template):
      #   $A - register holding the first operand
      #   $B - register holding the second operand
      #
      # Emitted code splits $B into four bytes ($B_0..$B_3; byte 0 is taken
      # by AND-ing with $xFF, which the entry code loads with 0xFF) and $A
      # into two halfwords ($Ahi = upper 16 bits, $Alo = lower 16 bits), then
      # multiplies every halfword by every byte.  The eight partial products
      # are left in $Alox0..$Alox3 and $Ahix0..$Ahix3; nothing is combined
      # here - that is done by the code from mul_1x1_merged/mul_1x1_lower.
      #
      # The lexical $A/$B below shadow the file-level $A/$B for the duration
      # of the template.  Returns nothing useful; the effect is on $code.
   38 my ($A,$B)=@_;
   39 $code.=<<___;
   40         EXTU    $B,8,24,$B_2            ; smash $B to 4 bytes
   41 ||      AND     $B,$xFF,$B_0
   42 ||      SHRU    $B,24,$B_3
   43         SHRU    $A,16,   $Ahi           ; smash $A to two halfwords
   44 ||      EXTU    $A,16,16,$Alo
   45
   46         XORMPY  $Alo,$B_2,$Alox2        ; 16x8 bits multiplication
   47 ||      XORMPY  $Ahi,$B_2,$Ahix2
   48 ||      EXTU    $B,16,24,$B_1
   49         XORMPY  $Alo,$B_0,$Alox0
   50 ||      XORMPY  $Ahi,$B_0,$Ahix0
   51         XORMPY  $Alo,$B_3,$Alox3
   52 ||      XORMPY  $Ahi,$B_3,$Ahix3
   53         XORMPY  $Alo,$B_1,$Alox1
   54 ||      XORMPY  $Ahi,$B_1,$Ahix1
   55 ___
   56 }
   57 sub mul_1x1_merged {
      # Append to $code one overlapped stage: the combining instructions of
      # mul_1x1_lower for the multiplication issued previously, interleaved
      # with the splitting/XORMPY instructions of mul_1x1_upper for the next
      # multiplication.  In the template the instructions belonging to the
      # next multiplication are the ones indented by one extra column.
      #
      # Arguments (register names, interpolated into the template):
      #   $OUTlo, $OUThi - registers receiving the combined result of the
      #                    previous multiplication (low/high, per the names)
      #   $A, $B         - registers holding the operands of the next one
      #
      # The new $Alo x $B_0 product is written to the hard-coded register A1
      # and copied into $Alox0 by the final MV, because the previous value of
      # $Alox0 is still read by an XOR that appears after that XORMPY.
      #
      # Instruction order and the `||` pairing are part of the emitted text;
      # do not reflow the template.
   58 my ($OUTlo,$OUThi,$A,$B)=@_;
   59 $code.=<<___;
   60          EXTU   $B,8,24,$B_2            ; smash $B to 4 bytes
   61 ||       AND    $B,$xFF,$B_0
   62 ||       SHRU   $B,24,$B_3
   63          SHRU   $A,16,   $Ahi           ; smash $A to two halfwords
   64 ||       EXTU   $A,16,16,$Alo
   65
   66         XOR     $Ahix0,$Alox2,$Ahix0
   67 ||      MV      $Ahix2,$OUThi
   68 ||       XORMPY $Alo,$B_2,$Alox2
   69          XORMPY $Ahi,$B_2,$Ahix2
   70 ||       EXTU   $B,16,24,$B_1
   71 ||       XORMPY $Alo,$B_0,A1            ; $Alox0
   72         XOR     $Ahix1,$Alox3,$Ahix1
   73 ||      SHL     $Ahix0,16,$OUTlo
   74 ||      SHRU    $Ahix0,16,$Ahix0
   75         XOR     $Alox0,$OUTlo,$OUTlo
   76 ||      XOR     $Ahix0,$OUThi,$OUThi
   77 ||       XORMPY $Ahi,$B_0,$Ahix0
   78 ||       XORMPY $Alo,$B_3,$Alox3
   79 ||      SHL     $Alox1,8,$Alox1
   80 ||      SHL     $Ahix3,8,$Ahix3
   81         XOR     $Alox1,$OUTlo,$OUTlo
   82 ||      XOR     $Ahix3,$OUThi,$OUThi
   83 ||       XORMPY $Ahi,$B_3,$Ahix3
   84 ||      SHL     $Ahix1,24,$Alox1
   85 ||      SHRU    $Ahix1,8, $Ahix1
   86         XOR     $Alox1,$OUTlo,$OUTlo
   87 ||      XOR     $Ahix1,$OUThi,$OUThi
   88 ||       XORMPY $Alo,$B_1,$Alox1
   89 ||       XORMPY $Ahi,$B_1,$Ahix1
   90 ||       MV     A1,$Alox0
   91 ___
   92 }
   93 sub mul_1x1_lower {
      # Append to $code the closing half of one 1x1-word carry-less
      # multiplication: only the shift/XOR sequence that folds the eight
      # partial products found in $Alox0..$Alox3 and $Ahix0..$Ahix3 into a
      # two-register result.  No operands are split and no XORMPY is issued.
      #
      # Arguments (register names, interpolated into the template):
      #   $OUTlo, $OUThi - registers receiving the result (low/high, per the
      #                    names)
      #
      # The emitted sequence overwrites $Ahix0, $Ahix1, $Ahix3 and $Alox1 as
      # scratch.  The leading ";NOP" line is an assembler comment (a disabled
      # NOP); the plain NOP after the first packet is emitted.
   94 my ($OUTlo,$OUThi)=@_;
   95 $code.=<<___;
   96         ;NOP
   97         XOR     $Ahix0,$Alox2,$Ahix0
   98 ||      MV      $Ahix2,$OUThi
   99         NOP
  100         XOR     $Ahix1,$Alox3,$Ahix1
  101 ||      SHL     $Ahix0,16,$OUTlo
  102 ||      SHRU    $Ahix0,16,$Ahix0
  103         XOR     $Alox0,$OUTlo,$OUTlo
  104 ||      XOR     $Ahix0,$OUThi,$OUThi
  105 ||      SHL     $Alox1,8,$Alox1
  106 ||      SHL     $Ahix3,8,$Ahix3
  107         XOR     $Alox1,$OUTlo,$OUTlo
  108 ||      XOR     $Ahix3,$OUThi,$OUThi
  109 ||      SHL     $Ahix1,24,$Alox1
  110 ||      SHRU    $Ahix1,8, $Ahix1
  111         XOR     $Alox1,$OUTlo,$OUTlo
  112 ||      XOR     $Ahix1,$OUThi,$OUThi
  113 ___
  114 }
  115 $code.=<<___;   # prologue: section, symbol aliasing, entry label, 0xFF byte mask
  116         .text
  117
  118         .if     .ASSEMBLER_VERSION<7000000
  119         .asg    0,__TI_EABI__
  120         .endif
  121         .if     __TI_EABI__
  122         .asg    bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
  123         .endif
  124
  125         .global _bn_GF2m_mul_2x2
  126 _bn_GF2m_mul_2x2:
  127         .asmfunc
  128         MVK     0xFF,$xFF
  129 ___
      # The 2x2 product is assembled Karatsuba-style from three 1x1 products:
      # a0·b0, a1·b1 and (a0+a1)·(b0+b1), where "+" is XOR.  The stages are
      # overlapped: each mul_1x1_merged call finishes the previous product
      # while starting the next one.  Every heredoc fragment that follows a
      # stage begins with a `||` line, i.e. its first instruction is paired
      # with the last packet emitted by that stage, so the fragments and calls
      # must stay in this exact order.
      #
      # Start a0·b0; meanwhile copy a1/b1 into the operand registers $A/$B.
  130         &mul_1x1_upper($a0,$b0);                # a0·b0
  131 $code.=<<___;
  132 ||      MV      $b1,$B
  133         MV      $a1,$A
  134 ___
      # Finish a0·b0 into A28 (low) / B28 (high) and start a1·b1; meanwhile
      # form a0+a1 and b0+b1 in $A/$B for the third product.
  135         &mul_1x1_merged("A28","B28",$A,$B);     # a0·b0/a1·b1
  136 $code.=<<___;
  137 ||      XOR     $b0,$b1,$B
  138         XOR     $a0,$a1,$A
  139 ___
      # Finish a1·b1 into A31/B31 and start (a0+a1)·(b0+b1); A29/B29 then get
      # a0·b0+a1·b1.
  140         &mul_1x1_merged("A31","B31",$A,$B);     # a1·b1/(a0+a1)·(b0+b1)
  141 $code.=<<___;
  142         XOR     A28,A31,A29
  143 ||      XOR     B28,B31,B29                     ; a0·b0+a1·b1
  144 ___
      # Finish (a0+a1)·(b0+b1) into A30/B30.  The tail below issues the branch
      # through B3 (presumably the return address - confirm against the C6000
      # calling convention), reduces A30/B30 to the middle term, folds it into
      # the two inner result words and stores A28, A30, A31, B31 to rp[0..3].
  145         &mul_1x1_lower("A30","B30");            # (a0+a1)·(b0+b1)
  146 $code.=<<___;
  147 ||      BNOP    B3
  148         XOR     A29,A30,A30
  149 ||      XOR     B29,B30,B30                     ; (a0+a1)·(b0+b1)-a0·b0-a1·b1
  150         XOR     B28,A30,A30
  151 ||      STW     A28,*${rp}[0]
  152         XOR     B30,A31,A31
  153 ||      STW     A30,*${rp}[1]
  154         STW     A31,*${rp}[2]
  155         STW     B31,*${rp}[3]
  156         .endasmfunc
  157 ___
 158
 159 print $code;
 160 close STDOUT;