# C64x+ assembly pack: improve EABI support.
# [openssl.git] crypto/bn/asm/c64xplus-gf2m.pl
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... The subroutine runs in 37 cycles, which is
# 4.5x faster than compiler-generated code. Though comparison is
# totally unfair, because this module utilizes Galois Field Multiply
# instruction.
use strict;
use warnings;

# Perlasm convention: the last command-line argument that looks like a
# filename (word characters, then ".ext") names the output file; any
# flavour/flag arguments before it are skipped.
my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
if (defined $output) {
	# Three-arg open with an error check: a silently failed redirect
	# would dump the assembly to the inherited STDOUT unnoticed.
	open STDOUT, '>', $output or die "can't open $output: $!";
}

my $code = "";				# accumulated assembly text

# Register assignments (TI C64x+ calling convention: A4/B4/A6/B6/A8
# carry the first five arguments, A4 doubles as return value).
my ($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");	# argument vector

my ($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));	# A-half partials
my ($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));	# B-half partials
my ($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");		# smashed bytes of B
my ($A,$B)=($Alo,$B_1);
my $xFF="B1";				# holds the 0xFF byte mask
# Emit the opening phase of one 32x32-bit carry-less (GF(2)[x])
# multiplication: split operand $B into four bytes and $A into two
# halfwords, then launch the eight 16x8-bit XORMPY (Galois Field
# multiply) instructions.  The partial products land in
# $Alox0..3/$Ahix0..3; no reduction is emitted here — it is deferred to
# mul_1x1_merged/mul_1x1_lower so the XORMPY latency can be hidden.
sub mul_1x1_upper {
my ($A,$B)=@_;		# registers holding the two 32-bit operands
$code.=<<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16,	 $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits muliplication
||	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
	XORMPY	$Alo,$B_0,$Alox0
||	XORMPY	$Ahi,$B_0,$Ahix0
	XORMPY	$Alo,$B_3,$Alox3
||	XORMPY	$Ahi,$B_3,$Ahix3
	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
___
}
# Emit the reduction phase of the PREVIOUS 1x1 multiplication merged
# (software-pipelined) with the opening phase of the NEXT one: while the
# previous call's partial products are shifted/XORed into the 64-bit
# result $OUTlo:$OUThi, operands $A/$B are already being smashed and
# their XORMPYs issued.  A1 stages the new $Alox0 because the old value
# is still being consumed, and is moved into place in the final packet.
# (Instructions belonging to the next multiplication are indented by one
# extra space inside the parallel packets.)
sub mul_1x1_merged {
my ($OUTlo,$OUThi,$A,$B)=@_;	# result regs for previous product; operands of next
$code.=<<___;
	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	 AND	$B,$xFF,$B_0
||	 SHRU	$B,24,$B_3
	 SHRU	$A,16,	 $Ahi		; smash $A to two halfwords
||	 EXTU	$A,16,16,$Alo

	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
||	 XORMPY	$Alo,$B_2,$Alox2
	 XORMPY	$Ahi,$B_2,$Ahix2
||	 EXTU	$B,16,24,$B_1
||	 XORMPY	$Alo,$B_0,A1		; $Alox0
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_0,$Ahix0
||	 XORMPY	$Alo,$B_3,$Alox3
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_3,$Ahix3
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
||	 XORMPY	$Alo,$B_1,$Alox1
||	 XORMPY	$Ahi,$B_1,$Ahix1
||	 MV	A1,$Alox0
___
}
# Emit only the reduction phase for the FINAL 1x1 multiplication: fold
# the partial products left behind by the last smash/XORMPY phase into
# the 64-bit result $OUTlo:$OUThi.  This is the reduction half of
# mul_1x1_merged with the interleaved next-iteration instructions
# removed (the NOPs cover the now-unhidden XORMPY latency).
sub mul_1x1_lower {
my ($OUTlo,$OUThi)=@_;	# destination registers for the 64-bit product
$code.=<<___;
	;NOP
	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
	NOP
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
___
}
# bn_GF2m_mul_2x2(r[4], a1, a0, b1, b0): 64x64-bit polynomial (GF(2)[x])
# multiplication via Karatsuba — three 32x32 multiplications instead of
# four: a0·b0, a1·b1 and (a0^a1)·(b0^b1), combined with XORs (addition
# and subtraction coincide in GF(2)).  The three multiplications are
# software-pipelined through mul_1x1_upper/merged/lower.
$code.=<<___;
	.text
	.if	__TI_EABI__
	.asg	bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
	.endif

	.global	_bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
	.asmfunc
	MVK	0xFF,$xFF
___
	&mul_1x1_upper($a0,$b0);		# a0·b0
# Stage the operands of the next multiplication while the first one is
# in flight (the leading || packs the MV into the previous packet).
$code.=<<___;
||	MV	$b1,$B
	MV	$a1,$A
___
	&mul_1x1_merged("A28","B28",$A,$B);	# a0·b0/a1·b1
# Operands of the third, Karatsuba cross-term multiplication.
$code.=<<___;
||	XOR	$b0,$b1,$B
	XOR	$a0,$a1,$A
___
	&mul_1x1_merged("A31","B31",$A,$B);	# a1·b1/(a0+a1)·(b0+b1)
$code.=<<___;
	XOR	A28,A31,A29
||	XOR	B28,B31,B29			; a0·b0+a1·b1
___
	&mul_1x1_lower("A30","B30");		# (a0+a1)·(b0+b1)
# Branch to the return address (B3, per TI convention) early; the
# remaining combine/store instructions execute in the delay slots.
$code.=<<___;
||	BNOP	B3
	XOR	A29,A30,A30
||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
	XOR	B28,A30,A30
||	STW	A28,*${rp}[0]
	XOR	B30,A31,A31
||	STW	A30,*${rp}[1]
	STW	A31,*${rp}[2]
	STW	B31,*${rp}[3]
	.endasmfunc
___

# Emit the accumulated assembly and verify it actually reached the
# output file: errors on a buffered write handle only surface at close.
print $code;
close STDOUT or die "error closing STDOUT: $!";