81e75f71a80033869ab5ec26444f7205eab81fb1
[openssl.git] / crypto / modes / asm / ghash-ia64.pl
1 #! /usr/bin/env perl
2 # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # March 2010
18 #
19 # The module implements "4-bit" GCM GHASH function and underlying
20 # single multiplication operation in GF(2^128). "4-bit" means that it
21 # uses 256 bytes per-key table [+128 bytes shared table]. Streamed
22 # GHASH performance was measured to be 6.67 cycles per processed byte
23 # on Itanium 2, which is >90% better than Microsoft compiler generated
24 # code. For comparison, the sha1-ia64.pl module processes one
25 # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
26 # byte.
27
28 # September 2010
29 #
30 # It was originally thought that it made less sense to implement
31 # the "528B" variant on Itanium 2, for the following reason. Because the
32 # number of functional units is naturally limited, it appeared impossible
33 # to implement the "528B" loop in 4 cycles, only in 5. This would mean that
34 # theoretically the performance improvement couldn't be more than 20%.
35 # But occasionally you prove yourself wrong:-) I figured out a way to
36 # fold a couple of instructions and to free yet another instruction
37 # slot by unrolling the loop... The resulting performance is 4.45 cycles
38 # per processed byte, 50% better than the "256B" version. On the original
39 # Itanium, performance should remain the same as with the "256B" version,
40 # i.e. ~8.5 cycles.
41
42 $output=pop and (open STDOUT,">",$output or die "can't open $output: $!");   # last argument, if any, names the output file
43 # Choose the pointer-add mnemonic ($ADDP) and the target byte order ($big_endian) from the compiler flags left in @ARGV.
44 if ($^O eq "hpux") {
45     $ADDP="addp4";      # HP-UX default; switched to plain add when a 64-bit data model flag is present
46     for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }    # match +DD64 or -mlp64 as alternatives, not as a character class
47 } else { $ADDP="add"; }
48 for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
49                 $big_endian=0 if (/\-DL_ENDIAN/);  }
50 if (!defined($big_endian))      # neither flag given: fall back to the byte order of the machine running this script
51              {  $big_endian=(unpack('L',pack('N',1))==1);  }
52
53 sub loop() {    # &loop(LABEL,MASK_INP): append one copy of the modulo-scheduled loop body to $code
54 my $label=shift;        # assembler label emitted at the loop head and used as the br.ctop target
55 my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp: a true 2nd argument guards the ld1 from [inp] and the xor with in[1] by p63 instead of p16/p17
56 # NOTE(review): the empty prototype does not describe the two arguments taken here; the visible call site uses the &loop(...) form, which bypasses prototypes.
57 # Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
58 # in scalable manner;-) Naturally assuming data in L1 cache...
59 # Special note about 'dep' instruction, which is used to construct
60 # &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
61 # bytes boundary and lower 7 bits of its address are guaranteed to
62 # be zero.
63 $code.=<<___;
64 $label:
65 { .mfi; (p18)   ld8     Hlo=[Hi[1]],-8
66         (p19)   dep     rem=Zlo,rem_4bitp,3,4   }
67 { .mfi; (p19)   xor     Zhi=Zhi,Hhi
68         ($p17)  xor     xi[1]=xi[1],in[1]       };;
69 { .mfi; (p18)   ld8     Hhi=[Hi[1]]
70         (p19)   shrp    Zlo=Zhi,Zlo,4           }
71 { .mfi; (p19)   ld8     rem=[rem]
72         (p18)   and     Hi[1]=mask0xf0,xi[2]    };;
73 { .mmi; ($p16)  ld1     in[0]=[inp],-1
74         (p18)   xor     Zlo=Zlo,Hlo
75         (p19)   shr.u   Zhi=Zhi,4               }
76 { .mib; (p19)   xor     Hhi=Hhi,rem
77         (p18)   add     Hi[1]=Htbl,Hi[1]        };;
78
79 { .mfi; (p18)   ld8     Hlo=[Hi[1]],-8
80         (p18)   dep     rem=Zlo,rem_4bitp,3,4   }
81 { .mfi; (p17)   shladd  Hi[0]=xi[1],4,r0
82         (p18)   xor     Zhi=Zhi,Hhi             };;
83 { .mfi; (p18)   ld8     Hhi=[Hi[1]]
84         (p18)   shrp    Zlo=Zhi,Zlo,4           }
85 { .mfi; (p18)   ld8     rem=[rem]
86         (p17)   and     Hi[0]=mask0xf0,Hi[0]    };;
87 { .mmi; (p16)   ld1     xi[0]=[Xi],-1
88         (p18)   xor     Zlo=Zlo,Hlo
89         (p18)   shr.u   Zhi=Zhi,4               }
90 { .mib; (p18)   xor     Hhi=Hhi,rem
91         (p17)   add     Hi[0]=Htbl,Hi[0]
92         br.ctop.sptk    $label                  };;
93 ___
94 }
95
96 $code=<<___;    # start of the generated file: register aliases, then the gcm_gmult_4bit prologue and pipeline setup
97 .explicit
98 .text
99
100 prevfs=r2;      prevlc=r3;      prevpr=r8;
101 mask0xf0=r21;
102 rem=r22;        rem_4bitp=r23;
103 Xi=r24;         Htbl=r25;
104 inp=r26;        end=r27;
105 Hhi=r28;        Hlo=r29;
106 Zhi=r30;        Zlo=r31;
107
108 .align  128
109 .skip   16                                      // aligns loop body
110 .global gcm_gmult_4bit#
111 .proc   gcm_gmult_4bit#
112 gcm_gmult_4bit:
113         .prologue
114 { .mmi; .save   ar.pfs,prevfs
115         alloc   prevfs=ar.pfs,2,6,0,8
116         $ADDP   Xi=15,in0                       // &Xi[15]
117         mov     rem_4bitp=ip            }
118 { .mii; $ADDP   Htbl=8,in1                      // &Htbl[0].lo
119         .save   ar.lc,prevlc
120         mov     prevlc=ar.lc
121         .save   pr,prevpr
122         mov     prevpr=pr               };;
123
124         .body
125         .rotr   in[3],xi[3],Hi[2]
126
127 { .mib; ld1     xi[2]=[Xi],-1                   // Xi[15]
128         mov     mask0xf0=0xf0
129         brp.loop.imp    .Loop1,.Lend1-16};;
130 { .mmi; ld1     xi[1]=[Xi],-1                   // Xi[14]
131                                         };;
132 { .mii; shladd  Hi[1]=xi[2],4,r0
133         mov     pr.rot=0x7<<16
134         mov     ar.lc=13                };;
135 { .mii; and     Hi[1]=mask0xf0,Hi[1]
136         mov     ar.ec=3
137         xor     Zlo=Zlo,Zlo             };;
138 { .mii; add     Hi[1]=Htbl,Hi[1]                // &Htbl[nlo].lo
139         add     rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
140         xor     Zhi=Zhi,Zhi             };;
141 ___
142         &loop   (".Loop1",1);   # second argument 1: gcm_gmult_4bit never sets up inp, so the inp references in the loop are masked
143 $code.=<<___;   # epilogue: final xor, mux1 \@rev on Zlo/Zhi, store both halves through pointers derived from Xi, restore pr and ar.lc, return
144 .Lend1:
145 { .mib; xor     Zhi=Zhi,Hhi             };;     // modulo-scheduling artefact
146 { .mib; mux1    Zlo=Zlo,\@rev           };;
147 { .mib; mux1    Zhi=Zhi,\@rev           };;
148 { .mmi; add     Hlo=9,Xi;;                      // ;; is here to prevent
149         add     Hhi=1,Xi                };;     // pipeline flush on Itanium
150 { .mib; st8     [Hlo]=Zlo
151         mov     pr=prevpr,0x1ffff       };;
152 { .mib; st8     [Hhi]=Zhi
153         mov     ar.lc=prevlc
154         br.ret.sptk.many        b0      };;
155 .endp   gcm_gmult_4bit#
156 ___
157
158 ######################################################################
159 # "528B" (well, "512B" actually) streamed GHASH
160 #
161 $Xip="in0";             # input register 0; the templates below advance it to &Xi[15]
162 $Htbl="in1";            # input register 1; base address used for the Htable loads
163 $inp="in2";             # input register 2; the templates below advance it to &inp[15]
164 $len="in3";             # input register 3; the templates below turn it into &inp[len]
165 $rem_8bit="loc0";       # local register that receives the address of the rem_8bit table
166 $mask0xff="loc1";       # local register loaded with the constant 0xff
167 ($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");  # on big-endian targets the sum/rum instructions in the templates are replaced by nop.m
168
169 sub load_htable() {     # &load_htable(INSN,...): emit 8 pairs of load bundles; NOTE(review): the empty prototype does not describe the argument list, the visible call site uses &load_htable(...) which bypasses it
170     for (my $i=0;$i<8;$i++) {       # one pass per Htable entry pair: integer loads via r8/r9, ldf8 loads via r10/r11
171         $code.=<<___;       # the second bundle is deliberately left open so one more instruction can follow
172 { .mmi; ld8     r`16+2*$i+1`=[r8],16            // Htable[$i].hi
173         ld8     r`16+2*$i`=[r9],16      }       // Htable[$i].lo
174 { .mmi; ldf8    f`32+2*$i+1`=[r10],16           // Htable[`8+$i`].hi
175         ldf8    f`32+2*$i`=[r11],16             // Htable[`8+$i`].lo
176 ___
177         $code.=shift    if (($i+$#_)==7);       # true once as many arguments remain as iterations are left, so the last N iterations each consume one of N (at most 8) arguments
178         $code.="\t};;\n"                        # close the second bundle and end the instruction group
179     }
180 }
181
182 $code.=<<___;   # gcm_ghash_4bit entry: save sp in prevsp, capture ip, set r8/r9 to Htbl+0/+8 and r10/r11 to Htbl+128/+136
183 prevsp=r3;
184
185 .align  32
186 .skip   16                                      // aligns loop body
187 .global gcm_ghash_4bit#
188 .proc   gcm_ghash_4bit#
189 gcm_ghash_4bit:
190         .prologue
191 { .mmi; .save   ar.pfs,prevfs
192         alloc   prevfs=ar.pfs,4,2,0,0
193         .vframe prevsp
194         mov     prevsp=sp
195         mov     $rem_8bit=ip            };;
196         .body
197 { .mfi; $ADDP   r8=0+0,$Htbl
198         $ADDP   r9=0+8,$Htbl            }
199 { .mfi; $ADDP   r10=128+0,$Htbl
200         $ADDP   r11=128+8,$Htbl         };;
201 ___
202         &load_htable(           # the eight instructions below are appended one per load iteration by load_htable
203         "       $ADDP   $Xip=15,$Xip",          # &Xi[15]
204         "       $ADDP   $len=$len,$inp",        # &inp[len]
205         "       $ADDP   $inp=15,$inp",          # &inp[15]
206         "       mov     $mask0xff=0xff",
207         "       add     sp=-512,sp",            # move sp down by 512 bytes
208         "       andcm   sp=sp,$mask0xff",       # align stack frame
209         "       add     r14=0,sp",              # r14/r15: store pointers at sp+0 and sp+8
210         "       add     r15=8,sp");
211 $code.=<<___;   # r8/r9 -> sp+256/+264, r10/r11 -> sp+384/+392; len (already &inp[len]) is lowered by 17
212 { .mmi; $sum    1<<1                            // go big-endian
213         add     r8=256+0,sp
214         add     r9=256+8,sp             }
215 { .mmi; add     r10=256+128+0,sp
216         add     r11=256+128+8,sp
217         add     $len=-17,$len           };;
218 ___
219 for($i=0;$i<8;$i++) {   # generate first half of Hshr4[]: copy Htable[i] and Htable[8+i] to the frame, store Htable[i]>>4 via r14/r15
220 my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));     # names of the integer registers used for Htable[$i].lo and .hi in the template below
221 $code.=<<___;
222 { .mmi; st8     [r8]=$rlo,16                    // Htable[$i].lo
223         st8     [r9]=$rhi,16                    // Htable[$i].hi
224         shrp    $rlo=$rhi,$rlo,4        }//;;
225 { .mmi; stf8    [r10]=f`32+2*$i`,16             // Htable[`8+$i`].lo
226         stf8    [r11]=f`32+2*$i+1`,16           // Htable[`8+$i`].hi
227         shr.u   $rhi=$rhi,4             };;
228 { .mmi; st8     [r14]=$rlo,16                   // Htable[$i].lo>>4
229         st8     [r15]=$rhi,16           }//;;   // Htable[$i].hi>>4
230 ___
231 }
232 $code.=<<___;   # reload Htable[8] and Htable[9] through r8/r9 and start shifting Htable[8]
233 { .mmi; ld8     r16=[r8],16                     // Htable[8].lo
234         ld8     r17=[r9],16             };;     // Htable[8].hi
235 { .mmi; ld8     r18=[r8],16                     // Htable[9].lo
236         ld8     r19=[r9],16             }       // Htable[9].hi
237 { .mmi; rum     1<<5                            // clear um.mfh
238         shrp    r16=r17,r16,4           };;
239 ___
240 for($i=0;$i<6;$i++) {   # generate second half of Hshr4[]: each pass loads Htable[10+i], stores Htable[8+i]>>4 and begins shifting the next entry
241 $code.=<<___;
242 { .mmi; ld8     r`20+2*$i`=[r8],16              // Htable[`10+$i`].lo
243         ld8     r`20+2*$i+1`=[r9],16            // Htable[`10+$i`].hi
244         shr.u   r`16+2*$i+1`=r`16+2*$i+1`,4     };;
245 { .mmi; st8     [r14]=r`16+2*$i`,16             // Htable[`8+$i`].lo>>4
246         st8     [r15]=r`16+2*$i+1`,16           // Htable[`8+$i`].hi>>4
247         shrp    r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4    }
248 ___
249 }
250 $code.=<<___;   # tail of the loop above: \$i is a global and is still 6 here, which the backtick expressions below rely on
251 { .mmi; shr.u   r`16+2*$i+1`=r`16+2*$i+1`,4     };;
252 { .mmi; st8     [r14]=r`16+2*$i`,16             // Htable[`8+$i`].lo>>4
253         st8     [r15]=r`16+2*$i+1`,16           // Htable[`8+$i`].hi>>4
254         shrp    r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4    }
255 { .mmi; add     $Htbl=256,sp                    // &Htable[0]
256         add     $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
257         shr.u   r`18+2*$i+1`=r`18+2*$i+1`,4     };;
258 { .mmi; st8     [r14]=r`18+2*$i`                // Htable[`8+$i`].lo>>4
259         st8     [r15]=r`18+2*$i+1`      }       // Htable[`8+$i`].hi>>4
260 ___
261
262 $in="r15";                      # register receiving each input byte loaded from [$inp]
263 @xi=("r16","r17");              # pair of register names swapped between fragments by the push/shift "rotation" below
264 @rem=("r18","r19");             # second pair of register names, rotated together with @xi
265 ($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");  # A and B hold table entries loaded through $Atbl/$Btbl, Z is the accumulator
266 ($Atbl,$Btbl)=("r26","r27");    # table entry pointers
267
268 $code.=<<___;   # (p16): first byte loads only; also clears p6
269 { .mmi; ld1     $in=[$inp],-1                   //(p16) *inp--
270         ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
271         cmp.eq  p0,p6=r0,r0             };;     //      clear p6
272 ___
273 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers: swap the two names in @xi and in @rem for the next fragment
274
275 $code.=<<___;   # (p16),(p17); the .LOOP label is emitted inside this fragment
276 { .mmi; ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
277         xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
278 { .mii; ld1     $in=[$inp],-1                   //(p16) *inp--
279         dep     $Atbl=$xi[1],$Htbl,4,4          //(p17) &Htable[nlo].lo
280         and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
281 .align  32
282 .LOOP:
283 { .mmi;
284 (p6)    st8     [$Xip]=$Zhi,13
285         xor     $Zlo=$Zlo,$Zlo
286         add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi].lo
287 ___
288 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
289
290 $code.=<<___;   # (p16),(p17),(p18)
291 { .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
292         ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
293         xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
294 { .mfi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
295         dep     $Atbl=$xi[1],$Htbl,4,4  }       //(p17) &Htable[nlo].lo
296 { .mfi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
297         xor     $Zlo=$Zlo,$Alo          };;     //(p18) Z.lo^=Htable[nlo].lo
298 { .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
299         ld1     $in=[$inp],-1           }       //(p16) *inp--
300 { .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
301         mov     $Zhi=$Ahi                       //(p18) Z.hi^=Htable[nlo].hi
302         and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
303 { .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
304         ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
305         shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
306 { .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
307         add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
308 ___
309 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
310
311 for ($i=1;$i<14;$i++) {         # 13 unrolled copies of the full step, with the register names rotated after each copy
312 # Above and below fragments are derived from this one by removing
313 # unsuitable (p??) instructions.
314 $code.=<<___;   # (p16),(p17),(p18),(p19)
315 { .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
316         ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
317         shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
318 { .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
319         xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
320         xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
321 { .mmi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
322         ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
323         dep     $Atbl=$xi[1],$Htbl,4,4  }       //(p17) &Htable[nlo].lo
324 { .mmi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
325         xor     $Zlo=$Zlo,$Alo                  //(p18) Z.lo^=Htable[nlo].lo
326         xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
327 { .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
328         ld1     $in=[$inp],-1                   //(p16) *inp--
329         shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
330 { .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
331         xor     $Zhi=$Zhi,$Ahi                  //(p18) Z.hi^=Htable[nlo].hi
332         and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
333 { .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
334         ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
335         shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
336 { .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
337         xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
338         add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
339 ___
340 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
341 }
342
343 $code.=<<___;   # (p17),(p18),(p19): same step without the (p16) byte loads
344 { .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
345         ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
346         shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
347 { .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
348         xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
349         xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
350 { .mmi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
351         ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
352         dep     $Atbl=$xi[1],$Htbl,4,4  };;     //(p17) &Htable[nlo].lo
353 { .mmi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
354         xor     $Zlo=$Zlo,$Alo                  //(p18) Z.lo^=Htable[nlo].lo
355         xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
356 { .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
357         shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
358 { .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
359         xor     $Zhi=$Zhi,$Ahi                  //(p18) Z.hi^=Htable[nlo].hi
360         and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
361 { .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
362         shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
363 { .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
364         xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
365         add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
366 ___
367 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
368
369 $code.=<<___;   # (p18),(p19): last table step; Z is shifted by 4 here rather than 8 and \$Btbl is not moved to Hshr4
370 { .mfi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
371         shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
372 { .mfi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
373         xor     $Zlo=$Zlo,$Blo          };;     //(p19) Z.lo^=Hshr4[nhi].lo
374 { .mfi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
375         xor     $Zlo=$Zlo,$Alo          }       //(p18) Z.lo^=Htable[nlo].lo
376 { .mfi; ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
377         xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
378 { .mfi; ld8     $Blo=[$Btbl],8                  //(p18) Htable[nhi].lo,&Htable[nhi].hi
379         shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
380 { .mfi; shladd  $rem[0]=$Zlo,4,r0               //(p18) Z.lo<<4
381         xor     $Zhi=$Zhi,$Ahi          };;     //(p18) Z.hi^=Htable[nlo].hi
382 { .mfi; ld8     $Bhi=[$Btbl]                    //(p18) Htable[nhi].hi
383         shrp    $Zlo=$Zhi,$Zlo,4        }       //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
384 { .mfi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
385         xor     $Zhi=$Zhi,$rem[1]       };;     //(p19) Z.hi^=rem_8bit[rem]<<48
386 ___
387 push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
388
389 $code.=<<___;   # (p19): finish Z, store it through \$Xip, and branch back to .LOOP while p6 (inp below len) is set; otherwise restore sp and return
390 { .mmi; cmp.ltu p6,p0=$inp,$len
391         add     $inp=32,$inp
392         shr.u   $Zhi=$Zhi,4             }       //(p19) Z.hi>>=4
393 { .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
394         xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
395         add     $Xip=9,$Xip             };;     //      &Xi.lo
396 { .mmi; ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
397 (p6)    ld1     $in=[$inp],-1                   //[p16] *inp--
398 (p6)    extr.u  $xi[1]=$Zlo,8,8         }       //[p17] Xi[14]
399 { .mmi; xor     $Zhi=$Zhi,$Bhi                  //(p19) Z.hi^=Hshr4[nhi].hi
400 (p6)    and     $xi[0]=$Zlo,$mask0xff   };;     //[p16] Xi[15]
401 { .mmi; st8     [$Xip]=$Zlo,-8
402 (p6)    xor     $xi[0]=$xi[0],$in               //[p17] xi=$xi[i]^inp[i]
403         shl     $rem[1]=$rem[1],48      };;     //(p19) rem_8bit[rem]<<48
404 { .mmi;
405 (p6)    ld1     $in=[$inp],-1                   //[p16] *inp--
406         xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
407 (p6)    dep     $Atbl=$xi[0],$Htbl,4,4  }       //[p17] &Htable[nlo].lo
408 { .mib;
409 (p6)    and     $xi[0]=-16,$xi[0]               //[p17] nhi=xi&0xf0
410 (p6)    br.cond.dptk.many       .LOOP   };;
411
412 { .mib; st8     [$Xip]=$Zhi             };;
413 { .mib; $rum    1<<1                            // return to little-endian
414         .restore        sp
415         mov     sp=prevsp
416         br.ret.sptk.many        b0      };;
417 .endp   gcm_ghash_4bit#
418 ___
419 $code.=<<___;   # constant data: rem_4bit (16 data8 entries, 128 bytes, 128-byte aligned), rem_8bit (256 two-byte entries, 512 bytes) and the identification string
420 .align  128
421 .type   rem_4bit#,\@object
422 rem_4bit:
423         data8   0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
424         data8   0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
425         data8   0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
426         data8   0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
427 .size   rem_4bit#,128
428 .type   rem_8bit#,\@object
429 rem_8bit:
430         data1   0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
431         data1   0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
432         data1   0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
433         data1   0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
434         data1   0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
435         data1   0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
436         data1   0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
437         data1   0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
438         data1   0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
439         data1   0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
440         data1   0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
441         data1   0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
442         data1   0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
443         data1   0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
444         data1   0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
445         data1   0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
446         data1   0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
447         data1   0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
448         data1   0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
449         data1   0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
450         data1   0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
451         data1   0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
452         data1   0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
453         data1   0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
454         data1   0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
455         data1   0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
456         data1   0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
457         data1   0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
458         data1   0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
459         data1   0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
460         data1   0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
461         data1   0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
462 .size   rem_8bit#,512
463 stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
464 ___
465
466 $code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);   # big-endian target: every "mux1 ...\@rev" becomes "nop.i 0x0"
467 $code =~ s/\`([^\`]*)\`/eval $1/gem;        # replace each `...` span in the templates with the value of the Perl expression inside it
468 # Write the finished assembly; a failed close is fatal so that a truncated output file cannot pass unnoticed.
469 print $code;
470 close STDOUT or die "error closing STDOUT: $!";