#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+ a 128-byte shared
# table]. Performance results are for the streamed GHASH subroutine
# and are expressed in cycles per processed byte, less is better:
#
#               gcc 3.4.x       assembler
#
# Opteron       18.5            10.2            +80%
# Core2         17.5            11.0            +59%
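#
# (The 256-byte per-key table is the Htable argument: 16 entries of
# 16 bytes, indexed by a 4-bit nibble; the shared 128-byte table is
# .Lrem_4bit near the bottom of this file, 16 qwords of pre-computed
# reduction remainders. Editorial reference sketches of both are
# included further down.)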

# May 2010
#
# Add PCLMULQDQ version performing at 2.07 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# common register layout
$nlo="%rax";
$nhi="%rbx";
$Zlo="%r8";
$Zhi="%r9";
$tmp="%r10";
$rem_4bit = "%r11";

$Xi="%rdi";
$Htbl="%rsi";

# per-function register layout
$cnt="%rcx";
$rem="%rdx";

sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
                        $r =~ s/%[er]([sd]i)/%${1}l/;
                        $r =~ s/%(r[0-9]+)[d]?/%${1}b/;   $r; }
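# e.g. &lo("%rax") gives "%al", &lo("%rbx") gives "%bl",
#      &lo("%r8") gives "%r8b"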

{ my $N;
  sub loop() {
  my $inp = shift;

        $N++;
$code.=<<___;
        xor     $nlo,$nlo
        xor     $nhi,$nhi
        mov     `&lo("$Zlo")`,`&lo("$nlo")`
        mov     `&lo("$Zlo")`,`&lo("$nhi")`
        shl     \$4,`&lo("$nlo")`
        mov     \$14,$cnt
        mov     8($Htbl,$nlo),$Zlo
        mov     ($Htbl,$nlo),$Zhi
        and     \$0xf0,`&lo("$nhi")`
        mov     $Zlo,$rem
        jmp     .Loop$N

.align  16
.Loop$N:
        shr     \$4,$Zlo
        and     \$0xf,$rem
        mov     $Zhi,$tmp
        mov     ($inp,$cnt),`&lo("$nlo")`
        shr     \$4,$Zhi
        xor     8($Htbl,$nhi),$Zlo
        shl     \$60,$tmp
        xor     ($Htbl,$nhi),$Zhi
        mov     `&lo("$nlo")`,`&lo("$nhi")`
        xor     ($rem_4bit,$rem,8),$Zhi
        mov     $Zlo,$rem
        shl     \$4,`&lo("$nlo")`
        xor     $tmp,$Zlo
        dec     $cnt
        js      .Lbreak$N

        shr     \$4,$Zlo
        and     \$0xf,$rem
        mov     $Zhi,$tmp
        shr     \$4,$Zhi
        xor     8($Htbl,$nlo),$Zlo
        shl     \$60,$tmp
        xor     ($Htbl,$nlo),$Zhi
        and     \$0xf0,`&lo("$nhi")`
        xor     ($rem_4bit,$rem,8),$Zhi
        mov     $Zlo,$rem
        xor     $tmp,$Zlo
        jmp     .Loop$N

.align  16
.Lbreak$N:
        shr     \$4,$Zlo
        and     \$0xf,$rem
        mov     $Zhi,$tmp
        shr     \$4,$Zhi
        xor     8($Htbl,$nlo),$Zlo
        shl     \$60,$tmp
        xor     ($Htbl,$nlo),$Zhi
        and     \$0xf0,`&lo("$nhi")`
        xor     ($rem_4bit,$rem,8),$Zhi
        mov     $Zlo,$rem
        xor     $tmp,$Zlo

        shr     \$4,$Zlo
        and     \$0xf,$rem
        mov     $Zhi,$tmp
        shr     \$4,$Zhi
        xor     8($Htbl,$nhi),$Zlo
        shl     \$60,$tmp
        xor     ($Htbl,$nhi),$Zhi
        xor     $tmp,$Zlo
        xor     ($rem_4bit,$rem,8),$Zhi

        bswap   $Zlo
        bswap   $Zhi
___
}}
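
# Editorial reference model (a plain-Perl sketch, not used when generating
# the assembly): the per-block 4-bit pass emitted by &loop() above, minus
# the instruction scheduling.  Assumes a 64-bit perl and the Htable layout
# of crypto/modes/gcm128.c, i.e. $htable is a ref to 16 [hi,lo] 64-bit
# pairs (entry i being the hash key multiplied by nibble value i in the
# bit-reflected field representation), $rem4bit is a ref to the 16
# .Lrem_4bit qwords, and $xi is the current 16-byte hash value as a
# string.  Returns the updated value as (high,low) 64-bit halves, which
# the assembly byte-swaps and stores back into Xi.
sub ref_gmult_4bit {
	my ($xi,$htable,$rem4bit) = @_;
	my $nlo = ord(substr($xi,15,1));
	my $nhi = $nlo>>4;
	   $nlo &= 0xf;
	my ($Zhi,$Zlo) = @{$htable->[$nlo]};
	my $cnt = 15;

	while (1) {
	    my $rem = $Zlo & 0xf;			# low nibble falls off...
	    $Zlo = (($Zhi&0xf)<<60) | ($Zlo>>4);	# 128-bit Z >>= 4
	    $Zhi >>= 4;
	    $Zhi ^= $rem4bit->[$rem];			# ...and is folded back in
	    $Zhi ^= $htable->[$nhi][0];			# Z ^= Htable[nibble]
	    $Zlo ^= $htable->[$nhi][1];

	    last if --$cnt < 0;

	    $nlo = ord(substr($xi,$cnt,1));
	    $nhi = $nlo>>4;
	    $nlo &= 0xf;

	    $rem = $Zlo & 0xf;
	    $Zlo = (($Zhi&0xf)<<60) | ($Zlo>>4);
	    $Zhi >>= 4;
	    $Zhi ^= $rem4bit->[$rem];
	    $Zhi ^= $htable->[$nlo][0];
	    $Zlo ^= $htable->[$nlo][1];
	}
	($Zhi,$Zlo);
}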

$code=<<___;
.text

.globl  gcm_gmult_4bit
.type   gcm_gmult_4bit,\@function,2
.align  16
gcm_gmult_4bit:
        push    %rbx
        push    %rbp            # %rbp and %r12 are pushed exclusively in
        push    %r12            # order to reuse Win64 exception handler...
.Lgmult_prologue:

        movzb   15($Xi),$Zlo
        lea     .Lrem_4bit(%rip),$rem_4bit
___
        &loop   ($Xi);
$code.=<<___;
        mov     $Zlo,8($Xi)
        mov     $Zhi,($Xi)

        mov     16(%rsp),%rbx
        lea     24(%rsp),%rsp
.Lgmult_epilogue:
        ret
.size   gcm_gmult_4bit,.-gcm_gmult_4bit
___

# per-function register layout
$inp="%rdx";
$len="%rcx";

$cnt="%rbp";
$rem="%r12";

$code.=<<___;
.globl  gcm_ghash_4bit
.type   gcm_ghash_4bit,\@function,4
.align  16
gcm_ghash_4bit:
        push    %rbx
        push    %rbp
        push    %r12
.Lghash_prologue:

        mov     8($Xi),$Zlo
        mov     ($Xi),$Zhi
        add     $inp,$len
        lea     .Lrem_4bit(%rip),$rem_4bit
.align  4
.Louter_loop:
        xor     8($inp),$Zlo
        xor     ($inp),$Zhi
        lea     16($inp),$inp
        mov     $Zlo,8($Xi)
        mov     $Zhi,($Xi)
        shr     \$56,$Zlo
___
        &loop   ($Xi);
$code.=<<___;
        cmp     $len,$inp
        jb      .Louter_loop

        mov     $Zlo,8($Xi)
        mov     $Zhi,($Xi)

        mov     0(%rsp),%r12
        mov     8(%rsp),%rbp
        mov     16(%rsp),%rbx
        lea     24(%rsp),%rsp
.Lghash_epilogue:
        ret
.size   gcm_ghash_4bit,.-gcm_ghash_4bit
___

######################################################################
# PCLMULQDQ version.

@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                ("%rdi","%rsi","%rdx","%rcx");  # Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");   $Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

sub clmul64x64_T2 {     # minimal register pressure
my ($Xhi,$Xi,$Hkey,$modulo)=@_;

$code.=<<___ if (!defined($modulo));
        movdqa          $Xi,$Xhi                #
        pshufd          \$0b01001110,$Xi,$T1
        pshufd          \$0b01001110,$Hkey,$T2
        pxor            $Xi,$T1                 #
        pxor            $Hkey,$T2
___
$code.=<<___;
        pclmulqdq       \$0x00,$Hkey,$Xi        #######
        pclmulqdq       \$0x11,$Hkey,$Xhi       #######
        pclmulqdq       \$0x00,$T2,$T1          #######
        pxor            $Xi,$T1                 #
        pxor            $Xhi,$T1                #

        movdqa          $T1,$T2                 #
        psrldq          \$8,$T1
        pslldq          \$8,$T2                 #
        pxor            $T1,$Xhi
        pxor            $T2,$Xi                 #
___
}
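
# Editorial reference model (a sketch, not used when generating the
# assembly): what the three PCLMULQDQs above compute.  The routine takes
# the 256-bit carry-less product of two 128-bit inputs with only three
# 64x64 multiplications, Karatsuba-style: the middle term a1*b0+a0*b1 is
# obtained as (a0+a1)*(b0+b1)+a0*b0+a1*b1, addition being XOR in GF(2).
# Assumes a 64-bit perl; operands are given as (hi,lo) 64-bit halves.
sub ref_clmul64 {			# carry-less 64x64 -> (hi,lo)
	my ($a,$b) = @_;
	my ($hi,$lo) = (0,0);
	for my $i (0..63) {
	    next unless ($b>>$i)&1;
	    $lo ^= $a<<$i;			# low 64 bits of a*x^i
	    $hi ^= $a>>(64-$i) if ($i);		# bits pushed out of the low half
	}
	($hi,$lo);
}

sub ref_clmul64x64 {			# carry-less 128x128 -> 256 bits
	my ($a1,$a0,$b1,$b0) = @_;
	my @lo  = ref_clmul64($a0,$b0);		# the 0x00 pclmulqdq above
	my @hi  = ref_clmul64($a1,$b1);		# the 0x11 pclmulqdq
	my @mid = ref_clmul64($a0^$a1,$b0^$b1);	# the third pclmulqdq, on folded halves
	$mid[0] ^= $lo[0]^$hi[0];		# now mid = a1*b0 + a0*b1
	$mid[1] ^= $lo[1]^$hi[1];
	# 256-bit product as four 64-bit words, most significant first;
	# the psrldq/pslldq pair above performs the same middle-term split.
	($hi[0], $hi[1]^$mid[0], $lo[0]^$mid[1], $lo[1]);
}
# e.g. ref_clmul64x64(0,2,0,3) returns (0,0,0,6): (x)*(x+1) = x^2+x.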

sub reduction_alg9 {    # 17/13 times faster than Intel version
my ($Xhi,$Xi) = @_;
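# (Folds the 256-bit carry-less product held in $Xhi:$Xi back to 128 bits.
# The modulus is the GHASH polynomial x^128+x^7+x^2+x+1, which in the
# bit-reflected, <<1-twisted representation used by these routines shows
# up as the 0x1c2 constant, cf. .L0x1c2_polynomial below.)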

$code.=<<___;
        # 1st phase
        movdqa          $Xi,$T1                 #
        psllq           \$1,$Xi
        pxor            $T1,$Xi                 #
        psllq           \$5,$Xi                 #
        pxor            $T1,$Xi                 #
        psllq           \$57,$Xi                #
        movdqa          $Xi,$T2                 #
        pslldq          \$8,$Xi
        psrldq          \$8,$T2                 #
        pxor            $T1,$Xi
        pxor            $T2,$Xhi                #

        # 2nd phase
        movdqa          $Xi,$T2
        psrlq           \$5,$Xi
        pxor            $T2,$Xi                 #
        psrlq           \$1,$Xi                 #
        pxor            $T2,$Xi                 #
        pxor            $Xhi,$T2
        psrlq           \$1,$Xi                 #
        pxor            $T2,$Xi                 #
___
}

{ my ($Htbl,$Xip)=@_4args;

$code.=<<___;
.globl  gcm_init_clmul
.type   gcm_init_clmul,\@abi-omnipotent
.align  16
gcm_init_clmul:
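        # $Xip points at the hash key H; the code below saves the
        # "<<1-twisted" H at ($Htbl) and its square at 16($Htbl), letting
        # gcm_ghash_clmul process two blocks per reduction.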
        movdqu          ($Xip),$Hkey
        pshufd          \$0b01001110,$Hkey,$Hkey        # dword swap

        # <<1 twist
        pshufd          \$0b11111111,$Hkey,$T2  # broadcast uppermost dword
        movdqa          $Hkey,$T1
        psllq           \$1,$Hkey
        pxor            $T3,$T3                 #
        psrlq           \$63,$T1
        pcmpgtd         $T2,$T3                 # broadcast carry bit
        pslldq          \$8,$T1
        por             $T1,$Hkey               # H<<=1

        # magic reduction
        pand            .L0x1c2_polynomial(%rip),$T3
        pxor            $T3,$Hkey               # if(carry) H^=0x1c2_polynomial

        # calculate H^2
        movdqa          $Hkey,$Xi
___
        &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
        &reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
        movdqu          $Hkey,($Htbl)           # save H
        movdqu          $Xi,16($Htbl)           # save H^2
        ret
.size   gcm_init_clmul,.-gcm_init_clmul
___
}

{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl  gcm_gmult_clmul
.type   gcm_gmult_clmul,\@abi-omnipotent
.align  16
gcm_gmult_clmul:
        movdqu          ($Xip),$Xi
        movdqa          .Lbswap_mask(%rip),$T3
        movdqu          ($Htbl),$Hkey
        pshufb          $T3,$Xi
___
        &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
        &reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
        pshufb          $T3,$Xi
        movdqu          $Xi,($Xip)
        ret
.size   gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my $Xn="%xmm6";
  my $Xhn="%xmm7";
  my $Hkey2="%xmm8";
  my $T1n="%xmm9";
  my $T2n="%xmm10";

$code.=<<___;
.globl  gcm_ghash_clmul
.type   gcm_ghash_clmul,\@abi-omnipotent
.align  16
gcm_ghash_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_ghash_clmul:
        # I can't trust assembler to use specific encoding:-(
        .byte   0x48,0x83,0xec,0x58             #sub    \$0x58,%rsp
        .byte   0x0f,0x29,0x34,0x24             #movaps %xmm6,(%rsp)
        .byte   0x0f,0x29,0x7c,0x24,0x10        #movaps %xmm7,0x10(%rsp)
        .byte   0x44,0x0f,0x29,0x44,0x24,0x20   #movaps %xmm8,0x20(%rsp)
        .byte   0x44,0x0f,0x29,0x4c,0x24,0x30   #movaps %xmm9,0x30(%rsp)
        .byte   0x44,0x0f,0x29,0x54,0x24,0x40   #movaps %xmm10,0x40(%rsp)
___
$code.=<<___;
        movdqa          .Lbswap_mask(%rip),$T3

        movdqu          ($Xip),$Xi
        movdqu          ($Htbl),$Hkey
        pshufb          $T3,$Xi

        sub             \$0x10,$len
        jz              .Lodd_tail

        movdqu          16($Htbl),$Hkey2
        #######
        # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
        #       [(H*Ii+1) + (H*Xi+1)] mod P =
        #       [(H*Ii+1) + H^2*(Ii+Xi)] mod P
        #
        movdqu          ($inp),$T1              # Ii
        movdqu          16($inp),$Xn            # Ii+1
        pshufb          $T3,$T1
        pshufb          $T3,$Xn
        pxor            $T1,$Xi                 # Ii+Xi
___
        &clmul64x64_T2  ($Xhn,$Xn,$Hkey);       # H*Ii+1
$code.=<<___;
        movdqa          $Xi,$Xhi                #
        pshufd          \$0b01001110,$Xi,$T1
        pshufd          \$0b01001110,$Hkey2,$T2
        pxor            $Xi,$T1                 #
        pxor            $Hkey2,$T2

        lea             32($inp),$inp           # i+=2
        sub             \$0x20,$len
        jbe             .Leven_tail

.Lmod_loop:
___
        &clmul64x64_T2  ($Xhi,$Xi,$Hkey2,1);    # H^2*(Ii+Xi)
$code.=<<___;
        movdqu          ($inp),$T1              # Ii
        pxor            $Xn,$Xi                 # (H*Ii+1) + H^2*(Ii+Xi)
        pxor            $Xhn,$Xhi

        movdqu          16($inp),$Xn            # Ii+1
        pshufb          $T3,$T1
        pshufb          $T3,$Xn

        movdqa          $Xn,$Xhn                #
        pshufd          \$0b01001110,$Xn,$T1n
        pshufd          \$0b01001110,$Hkey,$T2n
        pxor            $Xn,$T1n                #
        pxor            $Hkey,$T2n
         pxor           $T1,$Xhi                # "Ii+Xi", consume early

          movdqa        $Xi,$T1                 # 1st phase
          psllq         \$1,$Xi
          pxor          $T1,$Xi                 #
          psllq         \$5,$Xi                 #
          pxor          $T1,$Xi                 #
        pclmulqdq       \$0x00,$Hkey,$Xn        #######
          psllq         \$57,$Xi                #
          movdqa        $Xi,$T2                 #
          pslldq        \$8,$Xi
          psrldq        \$8,$T2                 #
          pxor          $T1,$Xi
          pxor          $T2,$Xhi                #

        pclmulqdq       \$0x11,$Hkey,$Xhn       #######
          movdqa        $Xi,$T2                 # 2nd phase
          psrlq         \$5,$Xi
          pxor          $T2,$Xi                 #
          psrlq         \$1,$Xi                 #
          pxor          $T2,$Xi                 #
          pxor          $Xhi,$T2
          psrlq         \$1,$Xi                 #
          pxor          $T2,$Xi                 #

        pclmulqdq       \$0x00,$T2n,$T1n        #######
         movdqa         $Xi,$Xhi                #
         pshufd         \$0b01001110,$Xi,$T1
         pshufd         \$0b01001110,$Hkey2,$T2
         pxor           $Xi,$T1                 #
         pxor           $Hkey2,$T2

        pxor            $Xn,$T1n                #
        pxor            $Xhn,$T1n               #
        movdqa          $T1n,$T2n               #
        psrldq          \$8,$T1n
        pslldq          \$8,$T2n                #
        pxor            $T1n,$Xhn
        pxor            $T2n,$Xn                #

        lea             32($inp),$inp
        sub             \$0x20,$len
        ja              .Lmod_loop

.Leven_tail:
___
        &clmul64x64_T2  ($Xhi,$Xi,$Hkey2,1);    # H^2*(Ii+Xi)
$code.=<<___;
        pxor            $Xn,$Xi                 # (H*Ii+1) + H^2*(Ii+Xi)
        pxor            $Xhn,$Xhi
___
        &reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
        test            $len,$len
        jnz             .Ldone

.Lodd_tail:
        movdqu          ($inp),$T1              # Ii
        pshufb          $T3,$T1
        pxor            $T1,$Xi                 # Ii+Xi
___
        &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H*(Ii+Xi)
        &reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
.Ldone:
        pshufb          $T3,$Xi
        movdqu          $Xi,($Xip)
___
$code.=<<___ if ($win64);
        movaps  (%rsp),%xmm6
        movaps  0x10(%rsp),%xmm7
        movaps  0x20(%rsp),%xmm8
        movaps  0x30(%rsp),%xmm9
        movaps  0x40(%rsp),%xmm10
        add     \$0x58,%rsp
___
$code.=<<___;
        ret
.LSEH_end_gcm_ghash_clmul:
.size   gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

$code.=<<___;
.align  64
.Lbswap_mask:
        .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
        .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.align  64
.type   .Lrem_4bit,\@object
.Lrem_4bit:
        .long   0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
        .long   0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
        .long   0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
        .long   0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
.asciz  "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align  64
___
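
# Editorial reference sketch (not used when generating the assembly):
# generator for the .Lrem_4bit qwords above, which are also the $rem4bit
# values expected by ref_gmult_4bit earlier in this file.  Entry i is the
# carry-less product of the nibble i and 0x1c2, positioned so that it can
# be XORed straight into the top of Z when a low nibble is shifted out.
sub ref_rem_4bit {
	my @tbl;
	for my $i (0..15) {
	    my $v = 0;
	    for my $b (0..3) { $v ^= 0x1c2<<$b if (($i>>$b)&1); }
	    push @tbl, $v<<4;	# 0x0000,0x1C20,0x3840,0x2460,...
	}
	map { $_<<48 } @tbl;	# qwords as laid out by the .long pairs above
}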

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type   se_handler,\@abi-omnipotent
.align  16
se_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # prologue label
        cmp     %r10,%rbx               # context->Rip<prologue label
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lin_prologue

        lea     24(%rax),%rax           # adjust "rsp"

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12

.Lin_prologue:
        mov     8(%rax),%rdi
        mov     16(%rax),%rsi
        mov     %rax,152($context)      # restore context->Rsp
        mov     %rsi,168($context)      # restore context->Rsi
        mov     %rdi,176($context)      # restore context->Rdi

        mov     40($disp),%rdi          # disp->ContextRecord
        mov     $context,%rsi           # context
        mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
        .long   0xa548f3fc              # cld; rep movsq

        mov     $disp,%rsi
        xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
        mov     0(%rsi),%r8             # arg3, disp->ControlPc
        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
        mov     40(%rsi),%r10           # disp->ContextRecord
        lea     56(%rsi),%r11           # &disp->HandlerData
        lea     24(%rsi),%r12           # &disp->EstablisherFrame
        mov     %r10,32(%rsp)           # arg5
        mov     %r11,40(%rsp)           # arg6
        mov     %r12,48(%rsp)           # arg7
        mov     %rcx,56(%rsp)           # arg8, (NULL)
        call    *__imp_RtlVirtualUnwind(%rip)

        mov     \$1,%eax                # ExceptionContinueSearch
        add     \$64,%rsp
        popfq
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        pop     %rdi
        pop     %rsi
        ret
.size   se_handler,.-se_handler

.section        .pdata
.align  4
        .rva    .LSEH_begin_gcm_gmult_4bit
        .rva    .LSEH_end_gcm_gmult_4bit
        .rva    .LSEH_info_gcm_gmult_4bit

        .rva    .LSEH_begin_gcm_ghash_4bit
        .rva    .LSEH_end_gcm_ghash_4bit
        .rva    .LSEH_info_gcm_ghash_4bit

        .rva    .LSEH_begin_gcm_ghash_clmul
        .rva    .LSEH_end_gcm_ghash_clmul
        .rva    .LSEH_info_gcm_ghash_clmul

.section        .xdata
.align  8
.LSEH_info_gcm_gmult_4bit:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lgmult_prologue,.Lgmult_epilogue       # HandlerData
.LSEH_info_gcm_ghash_4bit:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lghash_prologue,.Lghash_epilogue       # HandlerData
.LSEH_info_gcm_ghash_clmul:
        .byte   0x01,0x1f,0x0b,0x00
        .byte   0x1f,0xa8,0x04,0x00     #movaps 0x40(rsp),xmm10
        .byte   0x19,0x98,0x03,0x00     #movaps 0x30(rsp),xmm9
        .byte   0x13,0x88,0x02,0x00     #movaps 0x20(rsp),xmm8
        .byte   0x0d,0x78,0x01,0x00     #movaps 0x10(rsp),xmm7
        .byte   0x08,0x68,0x00,0x00     #movaps (rsp),xmm6
        .byte   0x04,0xa2,0x00,0x00     #sub    rsp,0x58
___
}

sub rex {
 local *opcode=shift;
 my ($dst,$src)=@_;

   if ($dst>=8 || $src>=8) {
        $rex=0x40;
        $rex|=0x04 if($dst>=8);
        $rex|=0x01 if($src>=8);
        push @opcode,$rex;
   }
}

sub pclmulqdq {
  my $arg=shift;
  my @opcode=(0x66);

    if ($arg=~/\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        rex(\@opcode,$3,$2);
        push @opcode,0x0f,0x3a,0x44;
        push @opcode,0xc0|($2&7)|(($3&7)<<3);   # ModR/M
        my $c=$1;
        push @opcode,$c=~/^0/?oct($c):$c;
        return ".byte\t".join(',',@opcode);
    }
    return "pclmulqdq\t".$arg;
}
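# e.g. the "pclmulqdq $0x00,%xmm10,%xmm9" emitted above is rewritten by the
# substitution below into ".byte 102,69,15,58,68,202,0", i.e. the
# 66 45 0F 3A 44 CA 00 encoding, so that assemblers which do not know the
# instruction can still be used.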

$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\bpclmulqdq\s+(\$.*%xmm[0-9]+).*$/pclmulqdq($1)/gem;

print $code;

close STDOUT;