Unified - adapt the generation of modes assembler to use GENERATE
[openssl.git] / crypto / modes / asm / ghash-ia64.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # March 2010
11 #
12 # The module implements "4-bit" GCM GHASH function and underlying
13 # single multiplication operation in GF(2^128). "4-bit" means that it
14 # uses 256 bytes per-key table [+128 bytes shared table]. Streamed
15 # GHASH performance was measured to be 6.67 cycles per processed byte
16 # on Itanium 2, which is >90% better than Microsoft compiler generated
17 # code. To anchor to something else sha1-ia64.pl module processes one
18 # byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
19 # byte.
20
21 # September 2010
22 #
23 # It was originally thought that it makes lesser sense to implement
24 # "528B" variant on Itanium 2 for following reason. Because number of
25 # functional units is naturally limited, it appeared impossible to
26 # implement "528B" loop in 4 cycles, only in 5. This would mean that
27 # theoretically performance improvement couldn't be more than 20%.
28 # But occasionally you prove yourself wrong:-) I figured out a way to
29 # fold couple of instructions and having freed yet another instruction
30 # slot by unrolling the loop... Resulting performance is 4.45 cycles
31 # per processed byte and 50% better than "256B" version. On original
32 # Itanium performance should remain the same as the "256B" version,
33 # i.e. ~8.5 cycles.
34
# Output file is the last command-line argument; when absent, emit to the
# inherited stdout.  Three-arg open avoids mode injection via the filename.
$output=pop and (open STDOUT,'>',$output or die "can't open $output: $!");

# On HP-UX the 32-bit ABI needs addp4 for pointer arithmetic; 64-bit builds
# (+DD64 with the HP compiler, -mlp64 with gcc) use plain add.
if ($^O eq "hpux") {
    $ADDP="addp4";
    # NOTE: upstream used /[\+DD|\-mlp]64/, a character class by mistake,
    # which matched any of "+D|-mlp" followed by "64"; use real alternation.
    for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }
} else { $ADDP="add"; }

# Endianness may be forced with -DB_ENDIAN/-DL_ENDIAN; otherwise probe the
# build host (unpack('L',pack('N',1))==1 is true only on big-endian).
for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
                $big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
             {  $big_endian=(unpack('L',pack('N',1))==1);  }
45
# Emit one software-pipelined iteration pair of the "256B" GHASH inner loop
# and append it to the global $code.
#
# Arguments:
#   $label - assembler label placed at the top of the loop body
#   flag   - when true, instructions touching the input pointer are
#            predicated on p63 (never true in this rotation), effectively
#            removing them; used for the gcm_gmult flavour which walks Xi only
#
# The original declared this as "sub loop()" with an empty prototype while
# taking two arguments; it only worked because callers invoke it as &loop(),
# which bypasses prototype checking.  The bogus prototype is dropped.
sub loop {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
$code.=<<___;
$label:
{ .mfi; (p18)   ld8     Hlo=[Hi[1]],-8
        (p19)   dep     rem=Zlo,rem_4bitp,3,4   }
{ .mfi; (p19)   xor     Zhi=Zhi,Hhi
        ($p17)  xor     xi[1]=xi[1],in[1]       };;
{ .mfi; (p18)   ld8     Hhi=[Hi[1]]
        (p19)   shrp    Zlo=Zhi,Zlo,4           }
{ .mfi; (p19)   ld8     rem=[rem]
        (p18)   and     Hi[1]=mask0xf0,xi[2]    };;
{ .mmi; ($p16)  ld1     in[0]=[inp],-1
        (p18)   xor     Zlo=Zlo,Hlo
        (p19)   shr.u   Zhi=Zhi,4               }
{ .mib; (p19)   xor     Hhi=Hhi,rem
        (p18)   add     Hi[1]=Htbl,Hi[1]        };;

{ .mfi; (p18)   ld8     Hlo=[Hi[1]],-8
        (p18)   dep     rem=Zlo,rem_4bitp,3,4   }
{ .mfi; (p17)   shladd  Hi[0]=xi[1],4,r0
        (p18)   xor     Zhi=Zhi,Hhi             };;
{ .mfi; (p18)   ld8     Hhi=[Hi[1]]
        (p18)   shrp    Zlo=Zhi,Zlo,4           }
{ .mfi; (p18)   ld8     rem=[rem]
        (p17)   and     Hi[0]=mask0xf0,Hi[0]    };;
{ .mmi; (p16)   ld1     xi[0]=[Xi],-1
        (p18)   xor     Zlo=Zlo,Hlo
        (p18)   shr.u   Zhi=Zhi,4               }
{ .mib; (p18)   xor     Hhi=Hhi,rem
        (p17)   add     Hi[0]=Htbl,Hi[0]
        br.ctop.sptk    $label                  };;
___
}
88
# Emit the module preamble (symbolic register aliases) and the body of
# gcm_gmult_4bit(u64 Xi[2], u128 Htable[16]): a single GF(2^128)
# multiplication using the 256-byte per-key table plus shared rem_4bit.
$code=<<___;
.explicit
.text

prevfs=r2;      prevlc=r3;      prevpr=r8;
mask0xf0=r21;
rem=r22;        rem_4bitp=r23;
Xi=r24;         Htbl=r25;
inp=r26;        end=r27;
Hhi=r28;        Hlo=r29;
Zhi=r30;        Zlo=r31;

.align  128
.skip   16                                      // aligns loop body
.global gcm_gmult_4bit#
.proc   gcm_gmult_4bit#
gcm_gmult_4bit:
        .prologue
{ .mmi; .save   ar.pfs,prevfs
        alloc   prevfs=ar.pfs,2,6,0,8
        $ADDP   Xi=15,in0                       // &Xi[15]
        mov     rem_4bitp=ip            }
{ .mii; $ADDP   Htbl=8,in1                      // &Htbl[0].lo
        .save   ar.lc,prevlc
        mov     prevlc=ar.lc
        .save   pr,prevpr
        mov     prevpr=pr               };;

        .body
        .rotr   in[3],xi[3],Hi[2]

{ .mib; ld1     xi[2]=[Xi],-1                   // Xi[15]
        mov     mask0xf0=0xf0
        brp.loop.imp    .Loop1,.Lend1-16};;
{ .mmi; ld1     xi[1]=[Xi],-1                   // Xi[14]
                                        };;
{ .mii; shladd  Hi[1]=xi[2],4,r0
        mov     pr.rot=0x7<<16
        mov     ar.lc=13                };;
{ .mii; and     Hi[1]=mask0xf0,Hi[1]
        mov     ar.ec=3
        xor     Zlo=Zlo,Zlo             };;
{ .mii; add     Hi[1]=Htbl,Hi[1]                // &Htbl[nlo].lo
        add     rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
        xor     Zhi=Zhi,Zhi             };;
___
        # Second argument 1 masks all inp references: gcm_gmult walks Xi only.
        &loop   (".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib; xor     Zhi=Zhi,Hhi             };;     // modulo-scheduling artefact
{ .mib; mux1    Zlo=Zlo,\@rev           };;
{ .mib; mux1    Zhi=Zhi,\@rev           };;
{ .mmi; add     Hlo=9,Xi;;                      // ;; is here to prevent
        add     Hhi=1,Xi                };;     // pipeline flush on Itanium
{ .mib; st8     [Hlo]=Zlo
        mov     pr=prevpr,0x1ffff       };;
{ .mib; st8     [Hhi]=Zhi
        mov     ar.lc=prevlc
        br.ret.sptk.many        b0      };;
.endp   gcm_gmult_4bit#
___
150
######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
# Perl-level aliases for gcm_ghash_4bit's arguments and locals.
$Xip="in0";             # Xi, first argument
$Htbl="in1";            # Htable, second argument
$inp="in2";             # input buffer, third argument
$len="in3";             # input length, fourth argument
$rem_8bit="loc0";       # &rem_8bit[], materialized relative to ip
$mask0xff="loc1";
# On big-endian hosts no user-mask BE-bit flipping is needed, so the
# sum/rum instructions degenerate to nops.
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
161
# Emit code loading all 16 Htable entries: entries 0-7 into integer
# registers r16-r31 (via [r8]/[r9]), entries 8-15 into FP registers
# f32-f47 (via [r10]/[r11]).  Any extra argument strings are interleaved
# one per iteration into the spare trailing bundle slots; the
# (($i+$#_)==7) test right-justifies them, i.e. the last extra string is
# always emitted on the final iteration.
sub load_htable() {
    for (my $i=0;$i<8;$i++) {
        $code.=<<___;
{ .mmi; ld8     r`16+2*$i+1`=[r8],16            // Htable[$i].hi
        ld8     r`16+2*$i`=[r9],16      }       // Htable[$i].lo
{ .mmi; ldf8    f`32+2*$i+1`=[r10],16           // Htable[`8+$i`].hi
        ldf8    f`32+2*$i`=[r11],16             // Htable[`8+$i`].lo
___
        $code.=shift    if (($i+$#_)==7);       # interleave caller's code
        $code.="\t};;\n"
    }
}
174
# gcm_ghash_4bit(u64 Xi[2], u128 Htable[16], u8 *inp, size_t len):
# prologue, Htable load, and set-up of the stack-local copies
# (Htable itself plus the Hshr4[] table of entries shifted right by 4).
$code.=<<___;
prevsp=r3;

.align  32
.skip   16                                      // aligns loop body
.global gcm_ghash_4bit#
.proc   gcm_ghash_4bit#
gcm_ghash_4bit:
        .prologue
{ .mmi; .save   ar.pfs,prevfs
        alloc   prevfs=ar.pfs,4,2,0,0
        .vframe prevsp
        mov     prevsp=sp
        mov     $rem_8bit=ip            };;
        .body
{ .mfi; $ADDP   r8=0+0,$Htbl
        $ADDP   r9=0+8,$Htbl            }
{ .mfi; $ADDP   r10=128+0,$Htbl
        $ADDP   r11=128+8,$Htbl         };;
___
        # Load all 16 Htable entries, interleaving argument fix-ups and the
        # 512-byte stack frame set-up into the spare bundle slots.
        &load_htable(
        "       $ADDP   $Xip=15,$Xip",          # &Xi[15]
        "       $ADDP   $len=$len,$inp",        # &inp[len]
        "       $ADDP   $inp=15,$inp",          # &inp[15]
        "       mov     $mask0xff=0xff",
        "       add     sp=-512,sp",
        "       andcm   sp=sp,$mask0xff",       # align stack frame
        "       add     r14=0,sp",
        "       add     r15=8,sp");
$code.=<<___;
{ .mmi; $sum    1<<1                            // go big-endian
        add     r8=256+0,sp
        add     r9=256+8,sp             }
{ .mmi; add     r10=256+128+0,sp
        add     r11=256+128+8,sp
        add     $len=-17,$len           };;
___
# Store Htable[0..7] (and FP-held Htable[8..15]) into the on-stack copy at
# sp+256, and simultaneously compute Htable[0..7]>>4 into Hshr4[] at sp+0.
for($i=0;$i<8;$i++) {   # generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi; st8     [r8]=$rlo,16                    // Htable[$i].lo
        st8     [r9]=$rhi,16                    // Htable[$i].hi
        shrp    $rlo=$rhi,$rlo,4        }//;;
{ .mmi; stf8    [r10]=f`32+2*$i`,16             // Htable[`8+$i`].lo
        stf8    [r11]=f`32+2*$i+1`,16           // Htable[`8+$i`].hi
        shr.u   $rhi=$rhi,4             };;
{ .mmi; st8     [r14]=$rlo,16                   // Htable[$i].lo>>4
        st8     [r15]=$rhi,16           }//;;   // Htable[$i].hi>>4
___
}
# Reload Htable[8..15] from the on-stack copy and produce the second half
# of Hshr4[].  Note: the C-style for loop below deliberately uses the
# package-level $i (no "my"), so after it finishes $i==6 and the trailing
# heredoc reuses that value for the final two table entries.
$code.=<<___;
{ .mmi; ld8     r16=[r8],16                     // Htable[8].lo
        ld8     r17=[r9],16             };;     // Htable[8].hi
{ .mmi; ld8     r18=[r8],16                     // Htable[9].lo
        ld8     r19=[r9],16             }       // Htable[9].hi
{ .mmi; rum     1<<5                            // clear um.mfh
        shrp    r16=r17,r16,4           };;
___
for($i=0;$i<6;$i++) {   # generate second half of Hshr4[]
$code.=<<___;
{ .mmi; ld8     r`20+2*$i`=[r8],16              // Htable[`10+$i`].lo
        ld8     r`20+2*$i+1`=[r9],16            // Htable[`10+$i`].hi
        shr.u   r`16+2*$i+1`=r`16+2*$i+1`,4     };;
{ .mmi; st8     [r14]=r`16+2*$i`,16             // Htable[`8+$i`].lo>>4
        st8     [r15]=r`16+2*$i+1`,16           // Htable[`8+$i`].hi>>4
        shrp    r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4    }
___
}
# Here $i==6 (leaked from the loop above): flush the last two Hshr4 entries
# and point $Htbl at the stack copy, $rem_8bit at the shared table.
$code.=<<___;
{ .mmi; shr.u   r`16+2*$i+1`=r`16+2*$i+1`,4     };;
{ .mmi; st8     [r14]=r`16+2*$i`,16             // Htable[`8+$i`].lo>>4
        st8     [r15]=r`16+2*$i+1`,16           // Htable[`8+$i`].hi>>4
        shrp    r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4    }
{ .mmi; add     $Htbl=256,sp                    // &Htable[0]
        add     $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
        shr.u   r`18+2*$i+1`=r`18+2*$i+1`,4     };;
{ .mmi; st8     [r14]=r`18+2*$i`                // Htable[`8+$i`].lo>>4
        st8     [r15]=r`18+2*$i+1`      }       // Htable[`8+$i`].hi>>4
___
254
# Register assignments for the modulo-scheduled streamed-GHASH loop.
# @xi and @rem are two-deep rotation queues; the push/shift pairs below
# emulate register rotation across the emitted pipeline stages.
$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

# Pipeline fill, stage (p16): first byte loads only.
$code.=<<___;   # (p16)
{ .mmi; ld1     $in=[$inp],-1                   //(p16) *inp--
        ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        cmp.eq  p0,p6=r0,r0             };;     //      clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
267
# Pipeline fill, stages (p16)+(p17), ending at the .LOOP top; p6 selects
# the Xi store on the second and later passes over a 16-byte block.
$code.=<<___;   # (p16),(p17)
{ .mmi; ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mii; ld1     $in=[$inp],-1                   //(p16) *inp--
        dep     $Atbl=$xi[1],$Htbl,4,4          //(p17) &Htable[nlo].lo
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
.align  32
.LOOP:
{ .mmi;
(p6)    st8     [$Xip]=$Zhi,13
        xor     $Zlo=$Zlo,$Zlo
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
282
# Pipeline fill, stages (p16)+(p17)+(p18): first table lookups join in.
$code.=<<___;   # (p16),(p17),(p18)
{ .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mfi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        dep     $Atbl=$xi[1],$Htbl,4,4  }       //(p17) &Htable[nlo].lo
{ .mfi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
        xor     $Zlo=$Zlo,$Alo          };;     //(p18) Z.lo^=Htable[nlo].lo
{ .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
        ld1     $in=[$inp],-1           }       //(p16) *inp--
{ .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
        mov     $Zhi=$Ahi                       //(p18) Z.hi^=Htable[nlo].hi
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
{ .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
        ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
303
# Steady state: all four pipeline stages (p16)-(p19) active, unrolled for
# the 13 remaining bytes of a 16-byte block.
for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___;   # (p16),(p17),(p18),(p19)
{ .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
        shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
{ .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
        dep     $Atbl=$xi[1],$Htbl,4,4  }       //(p17) &Htable[nlo].lo
{ .mmi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
        xor     $Zlo=$Zlo,$Alo                  //(p18) Z.lo^=Htable[nlo].lo
        xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
        ld1     $in=[$inp],-1                   //(p16) *inp--
        shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
{ .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
        xor     $Zhi=$Zhi,$Ahi                  //(p18) Z.hi^=Htable[nlo].hi
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
{ .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
        ld1     $xi[0]=[$Xip],-1                //(p16) *Xi--
        shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
}
335
# Pipeline drain, stages (p17)+(p18)+(p19): no more input-byte loads.
$code.=<<___;   # (p17),(p18),(p19)
{ .mmi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        ld8     $rem[0]=[$Btbl],-256            //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
        shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
{ .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
        xor     $xi[1]=$xi[1],$in       };;     //(p17) xi=$xi[i]^inp[i]
{ .mmi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
        dep     $Atbl=$xi[1],$Htbl,4,4  };;     //(p17) &Htable[nlo].lo
{ .mmi; shladd  $rem[0]=$rem[0],4,r0            //(p18) Htable[nhi].lo<<4
        xor     $Zlo=$Zlo,$Alo                  //(p18) Z.lo^=Htable[nlo].lo
        xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi; ld8     $Blo=[$Btbl],8                  //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
        shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
{ .mmi; xor     $rem[0]=$rem[0],$Zlo            //(p18) Z.lo^(Htable[nhi].lo<<4)
        xor     $Zhi=$Zhi,$Ahi                  //(p18) Z.hi^=Htable[nlo].hi
        and     $xi[1]=-16,$xi[1]       };;     //(p17) nhi=xi&0xf0
{ .mmi; ld8     $Bhi=[$Btbl]                    //(p18) Hshr4[nhi].hi
        shrp    $Zlo=$Zhi,$Zlo,8        }       //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
        add     $Btbl=$xi[1],$Htbl      };;     //(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
361
# Pipeline drain, stages (p18)+(p19): last nibble uses 4-bit shifts.
$code.=<<___;   # (p18),(p19)
{ .mfi; ld8     $Alo=[$Atbl],8                  //(p18) Htable[nlo].lo,&Htable[nlo].hi
        shr.u   $Zhi=$Zhi,8             }       //(p19) Z.hi>>=8
{ .mfi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo          };;     //(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi; ld8     $Ahi=[$Atbl]                    //(p18) Htable[nlo].hi
        xor     $Zlo=$Zlo,$Alo          }       //(p18) Z.lo^=Htable[nlo].lo
{ .mfi; ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
        xor     $Zhi=$Zhi,$Bhi          };;     //(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi; ld8     $Blo=[$Btbl],8                  //(p18) Htable[nhi].lo,&Htable[nhi].hi
        shl     $rem[1]=$rem[1],48      }       //(p19) rem_8bit[rem]<<48
{ .mfi; shladd  $rem[0]=$Zlo,4,r0               //(p18) Z.lo<<4
        xor     $Zhi=$Zhi,$Ahi          };;     //(p18) Z.hi^=Htable[nlo].hi
{ .mfi; ld8     $Bhi=[$Btbl]                    //(p18) Htable[nhi].hi
        shrp    $Zlo=$Zhi,$Zlo,4        }       //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi; and     $rem[0]=$rem[0],$mask0xff       //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
        xor     $Zhi=$Zhi,$rem[1]       };;     //(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
381
# Final stage (p19): finish the block, store Xi, and either loop back for
# the next 16 input bytes (p6 true) or fall through to the return.
$code.=<<___;   # (p19)
{ .mmi; cmp.ltu p6,p0=$inp,$len
        add     $inp=32,$inp
        shr.u   $Zhi=$Zhi,4             }       //(p19) Z.hi>>=4
{ .mmi; shladd  $rem[1]=$rem[1],1,$rem_8bit     //(p19) &rem_8bit[rem]
        xor     $Zlo=$Zlo,$Blo                  //(p19) Z.lo^=Hshr4[nhi].lo
        add     $Xip=9,$Xip             };;     //      &Xi.lo
{ .mmi; ld2     $rem[1]=[$rem[1]]               //(p19) rem_8bit[rem]
(p6)    ld1     $in=[$inp],-1                   //[p16] *inp--
(p6)    extr.u  $xi[1]=$Zlo,8,8         }       //[p17] Xi[14]
{ .mmi; xor     $Zhi=$Zhi,$Bhi                  //(p19) Z.hi^=Hshr4[nhi].hi
(p6)    and     $xi[0]=$Zlo,$mask0xff   };;     //[p16] Xi[15]
{ .mmi; st8     [$Xip]=$Zlo,-8
(p6)    xor     $xi[0]=$xi[0],$in               //[p17] xi=$xi[i]^inp[i]
        shl     $rem[1]=$rem[1],48      };;     //(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)    ld1     $in=[$inp],-1                   //[p16] *inp--
        xor     $Zhi=$Zhi,$rem[1]               //(p19) Z.hi^=rem_8bit[rem]<<48
(p6)    dep     $Atbl=$xi[0],$Htbl,4,4  }       //[p17] &Htable[nlo].lo
{ .mib;
(p6)    and     $xi[0]=-16,$xi[0]               //[p17] nhi=xi&0xf0
(p6)    br.cond.dptk.many       .LOOP   };;

{ .mib; st8     [$Xip]=$Zhi             };;
{ .mib; $rum    1<<1                            // return to little-endian
        .restore        sp
        mov     sp=prevsp
        br.ret.sptk.many        b0      };;
.endp   gcm_ghash_4bit#
___
# Constant tables: rem_4bit (reduction constants, pre-shifted into the top
# 16 bits of each 64-bit word) and rem_8bit, plus the identification string.
# rem_4bit MUST stay 128-byte aligned - the dep-based addressing in the
# gmult loop depends on it.
$code.=<<___;
.align  128
.type   rem_4bit#,\@object
rem_4bit:
        data8   0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
        data8   0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
        data8   0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
        data8   0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size   rem_4bit#,128
.type   rem_8bit#,\@object
rem_8bit:
        data1   0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
        data1   0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
        data1   0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
        data1   0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
        data1   0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
        data1   0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
        data1   0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
        data1   0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
        data1   0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
        data1   0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
        data1   0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
        data1   0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
        data1   0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
        data1   0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
        data1   0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
        data1   0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
        data1   0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
        data1   0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
        data1   0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
        data1   0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
        data1   0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
        data1   0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
        data1   0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
        data1   0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
        data1   0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
        data1   0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
        data1   0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
        data1   0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
        data1   0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
        data1   0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
        data1   0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
        data1   0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size   rem_8bit#,512
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
458
# On big-endian targets the byte-reversing mux1 is unnecessary; turn it
# into a nop of matching template slot.
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
# Expand `...` compile-time arithmetic (register numbers etc.) in place.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# STDOUT may be redirected to the output file: a buffered write failure
# (e.g. full disk) only surfaces at close, so it must be checked.
close STDOUT or die "error closing STDOUT: $!";