# C64x+ assembler pack. linux-c64xplus build is *not* tested nor can it be
# [openssl.git] / crypto / modes / asm / ghash-c64xplus.pl
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2011
#
# The module implements GCM GHASH function and underlying single
# multiplication operation in GF(2^128). Even though subroutines
# have _4bit suffix, they are not using any tables, but rely on
# hardware Galois Field Multiply support. Streamed GHASH processes
# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
# comparing apples vs. oranges, but compiler surely could have done
# better, because theoretical [though not necessarily achievable]
# estimate for "4-bit" table-driven implementation is ~12 cycles.

# Usage: perl ghash-c64xplus.pl [flags...] output.s
# Arguments are scanned until the first token that looks like a file
# name; the generated assembly is written there.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output" or die "can't open $output: $!";

# Register allocation. A4/B4/A6/B6 are the C64x+ calling-convention
# argument registers; the Perl variables below are interpolated into
# the assembly heredocs so the code reads symbolically.
($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");	# arguments

($Z0,$Z1,$Z2,$Z3,	$H0, $H1, $H2, $H3,
			$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
($H01u,$H01y,$H2u,$H3u,	$H0y,$H1y,$H2y,$H3y,
			$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
($FF000000,$E10000)=("B30","B31");
($xip,$x0,$x1,$xib)=map("B$_",(6..9));	# $xip zaps $len
 $xia="A9";
($rem,$res)=("B4","B5");		# $rem zaps $Htable

$code.=<<___;
	.text

	.asg	B3,RA

	.if	0
	.global	_gcm_gmult_1bit
_gcm_gmult_1bit:
	ADDAD	$Htable,2,$Htable
	.endif
	.global	_gcm_gmult_4bit
_gcm_gmult_4bit:
	.asmfunc
	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
||	MV	$Xip,${xip}		; reassign Xi
||	MVK	15,B1			; SPLOOPD constant

	MVK	0xE1,$E10000
||	LDBU	*++${xip}[15],$x1	; Xi[15]
	MVK	0xFF,$FF000000
||	LDBU	*--${xip},$x0		; Xi[14]
	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
	SHL	$FF000000,24,$FF000000	; upper byte mask
||	BNOP	ghash_loop?
||	MVK	1,B0			; take a single spin

	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
	AND	$H2,$FF000000,$H2u	; H2's upper byte
	AND	$H3,$FF000000,$H3u	; H3's upper byte
||	SHRU	$H2u,8,$H2u
	SHRU	$H3u,8,$H3u
||	ZERO	$Z1:$Z0
	SHRU2	$xia,8,$H01u
||	ZERO	$Z3:$Z2
	.endasmfunc

	.global	_gcm_ghash_4bit
_gcm_ghash_4bit:
	.asmfunc
	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
||	SHRU	$len,4,B0		; reassign len
	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
||	MV	$Xip,${xip}		; reassign Xi
||	MVK	15,B1			; SPLOOPD constant

	MVK	0xE1,$E10000
|| [B0]	LDNDW	*${inp}[1],$H1x:$H0x
	MVK	0xFF,$FF000000
|| [B0]	LDNDW	*${inp}++[2],$H3x:$H2x
	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
||	LDDW	*${xip}[1],$Z1:$Z0
	SHL	$FF000000,24,$FF000000	; upper byte mask
||	LDDW	*${xip}[0],$Z3:$Z2

	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
	AND	$H2,$FF000000,$H2u	; H2's upper byte
	AND	$H3,$FF000000,$H3u	; H3's upper byte
||	SHRU	$H2u,8,$H2u
	SHRU	$H3u,8,$H3u
	SHRU2	$xia,8,$H01u

|| [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
	.if	.LITTLE_ENDIAN
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.else
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z0,8,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.endif
	STDW	$Z3:$Z2,*${xip}[0]
|| [B0]	ZERO	$Z3:$Z2
|| [B0]	MV	$xia,$x1
   [B0]	ADDK	14,${xip}

ghash_loop?:
	SPLOOPD	6			; 6*16+7
||	MVC	B1,ILC
|| [B0]	SUB	B0,1,B0
||	ZERO	A0
||	ADD	$x1,$x1,$xib		; SHL	$x1,1,$xib
||	SHL	$x1,1,$xia
___

# Software-pipeline schedule of the 6-cycle SPLOOP kernel below:
# rows are cycles, columns are functional units, "|" marks where the
# next loop iteration overlaps the current one.
########____________________________
#  0    D2.     M1          M2      |
#  1            M1                  |
#  2            M1          M2      |
#  3        D1. M1          M2      |
#  4        S1. L1                  |
#  5    S2  S1x L1          D2  L2  |____________________________
#  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
#  7/1          L1  S1  D1x S2  M2  |        M1                  |
#  8/2              S1  L1x S2      |        M1          M2      |
#  9/3              S1  L1x         |    D1. M1          M2      |
# 10/4                  D1x         |    S1. L1                  |
# 11/5                              |S2  S1x L1          D2  L2  |____________
# 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
#    7/1                                     L1  S1  D1x S2  M2  |        ....
#    8/2                                         S1  L1x S2      |        ....
#####...                                         ................|............
$code.=<<___;
	XORMPY	$H0,$xia,$H0x		; 0	; H·Xi[i]
||	XORMPY	$H01u,$xib,$H01y
|| [A0]	LDBU	*--${xip},$x0
	XORMPY	$H1,$xia,$H1x		; 1
	XORMPY	$H2,$xia,$H2x		; 2
||	XORMPY	$H2u,$xib,$H2y
	XORMPY	$H3,$xia,$H3x		; 3
||	XORMPY	$H3u,$xib,$H3y
||[!A0]	MVK.D	15,A0				; *--${xip} counter
	XOR.L	$H0x,$Z0,$Z0		; 4	; Z^=H·Xi[i]
|| [A0]	SUB.S	A0,1,A0
	XOR.L	$H1x,$Z1,$Z1		; 5
||	AND.D	$H01y,$FF000000,$H0z
||	SWAP2.L	$H01y,$H1y		;	; SHL	$H01y,16,$H1y
||	SHL	$x0,1,$xib
||	SHL	$x0,1,$xia

	XOR.L	$H2x,$Z2,$Z2		; 6/0	; [0,0] in epilogue
||	SHL	$Z0,1,$rem		;	; rem=Z<<1
||	SHRMB.S	$Z1,$Z0,$Z0		;	; Z>>=8
||	AND.L	$H1y,$FF000000,$H1z
	XOR.L	$H3x,$Z3,$Z3		; 7/1
||	SHRMB.S	$Z2,$Z1,$Z1
||	XOR.D	$H0z,$Z0,$Z0			; merge upper byte products
||	AND.S	$H2y,$FF000000,$H2z
||	XORMPY	$E10000,$rem,$res	;	; implicit rem&0x1FE
	XOR.L	$H1z,$Z1,$Z1		; 8/2
||	SHRMB.S	$Z3,$Z2,$Z2
||	AND.S	$H3y,$FF000000,$H3z
	XOR.L	$H2z,$Z2,$Z2		; 9/3
||	SHRU	$Z3,8,$Z3
	XOR.D	$H3z,$Z3,$Z3		; 10/4
	NOP				; 11/5

	SPKERNEL 0,2
||	XOR.D	$res,$Z3,$Z3		; 12/6/0; Z^=res

	; input pre-fetch is possible where D1 slot is available...
   [B0]	LDNDW	*${inp}[1],$H1x:$H0x	; 8/-
   [B0]	LDNDW	*${inp}++[2],$H3x:$H2x	; 9/-
	NOP				; 10/-
	.if	.LITTLE_ENDIAN
	SWAP2	$Z0,$Z1			; 11/-
||	SWAP4	$Z1,$Z0
	SWAP4	$Z1,$Z1			; 12/-
||	SWAP2	$Z0,$Z0
	SWAP2	$Z2,$Z3
||	SWAP4	$Z3,$Z2
||[!B0]	BNOP	RA
	SWAP4	$Z3,$Z3
||	SWAP2	$Z2,$Z2
|| [B0]	BNOP	ghash_loop?
   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.else
  [!B0]	BNOP	RA			; 11/-
   [B0]	BNOP	ghash_loop?		; 12/-
   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
|| [B0]	XOR	$H1x,$Z1,$Z1
   [B0]	XOR	$H2x,$Z2,$Z2
|| [B0]	XOR	$H3x,$Z3,$Z3
|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
	STDW	$Z1:$Z0,*${xip}[1]
|| [B0]	SHRU	$Z0,8,$x0		; Xi[14]
|| [B0]	ZERO	$Z1:$Z0
	.endif
	STDW	$Z3:$Z2,*${xip}[0]
|| [B0]	ZERO	$Z3:$Z2
|| [B0]	MV	$xia,$x1
   [B0]	ADDK	14,${xip}
	.endasmfunc

	.sect	.const
	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

print $code;
close STDOUT;