1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # Multi-buffer SHA1 procedure processes n buffers in parallel by
18 # placing each buffer's data into a designated lane of a SIMD
19 # register. n is naturally limited to 4 on pre-AVX2 processors and
20 # to 8 on AVX2-capable processors such as Haswell.
21 #
22 #               this    +aesni(i)       sha1    aesni-sha1      gain(iv)
23 # -------------------------------------------------------------------
24 # Westmere(ii)  10.7/n  +1.28=3.96(n=4) 5.30    6.66            +68%
25 # Atom(ii)      18.1/n  +3.93=8.46(n=4) 9.37    12.8            +51%
26 # Sandy Bridge  (8.16   +5.15=13.3)/n   4.99    5.98            +80%
27 # Ivy Bridge    (8.08   +5.14=13.2)/n   4.60    5.54            +68%
28 # Haswell(iii)  (8.96   +5.00=14.0)/n   3.57    4.55            +160%
29 # Skylake       (8.70   +5.00=13.7)/n   3.64    4.20            +145%
30 # Bulldozer     (9.76   +5.76=15.5)/n   5.95    6.37            +64%
31 #
32 # (i)   multi-block CBC encrypt with 128-bit key;
33 # (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and to Atom,
34 #       because of lower AES-NI instruction throughput;
35 # (iii) "this" is for n=8, when we gather twice as much data, result
36 #       for n=4 is 8.00+4.44=12.4;
37 # (iv)  presented improvement coefficients are asymptotic limits;
38 #       in real-life applications they are somewhat lower, e.g. for
39 #       2KB fragments they range from 30% to 100% (on Haswell);
40
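# As a worked reading of the table (a sanity check of the gain column,
# not an additional measurement): on Haswell with n=8 this module costs
# (8.96+5.00)/8 = 1.75 cycles per byte per buffer versus 4.55 for
# stitched aesni-sha1, i.e. 4.55/1.75 ~= 2.6x, the quoted +160%;
# likewise on Westmere 3.96 vs. 6.66 gives ~+68%.
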
41 $flavour = shift;
42 $output  = shift;
43 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
44
45 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
46
47 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
48 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
49 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
50 die "can't locate x86_64-xlate.pl";
51
52 $avx=0;
53
54 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
55                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
56         $avx = ($1>=2.19) + ($1>=2.22);
57 }
58
59 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
60            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
61         $avx = ($1>=2.09) + ($1>=2.10);
62 }
63
64 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
65            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
66         $avx = ($1>=10) + ($1>=11);
67 }
68
69 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
70         $avx = ($2>=3.0) + ($2>3.0);
71 }
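# Resulting $avx: 0 emits only the SSSE3 sha1_multi_block (and SHAEXT)
# code paths, 1 additionally emits sha1_multi_block_avx, and 2 also
# emits sha1_multi_block_avx2 (see the "$avx>1" guards below).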
72
73 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
74 *STDOUT=*OUT;
75
76 # void sha1_multi_block (
77 #     struct {  unsigned int A[8];
78 #               unsigned int B[8];
79 #               unsigned int C[8];
80 #               unsigned int D[8];
81 #               unsigned int E[8];      } *ctx,
82 #     struct {  void *ptr; int blocks;  } inp[8],
83 #     int num);         /* 1 or 2 */
84 #
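# A minimal caller-side sketch, assuming the hypothetical names
# SHA1_MB_CTX_SKETCH, sha1_mb_input and sha1_4way (this is not an
# OpenSSL API; it only illustrates the layout above).  Lane i of A..E
# holds the running state of buffer i, inp[i].ptr points at buffer i's
# data, inp[i].blocks is its length in whole 64-byte blocks (the
# routine does no padding/finalization), and num selects how many
# groups of four lanes are processed (1 or 2):
#
#     typedef struct { unsigned int A[8], B[8], C[8], D[8], E[8]; }
#             SHA1_MB_CTX_SKETCH;                     /* hypothetical */
#     typedef struct { const void *ptr; int blocks; } sha1_mb_input;
#
#     void sha1_multi_block(SHA1_MB_CTX_SKETCH *ctx,
#                           const sha1_mb_input inp[8], int num);
#
#     /* hash four pre-padded, equal-length buffers in parallel */
#     static void sha1_4way(SHA1_MB_CTX_SKETCH *ctx,
#                           const unsigned char *data[4], int blocks)
#     {
#         sha1_mb_input inp[8] = {{ 0 }};
#         for (int i = 0; i < 4; i++) {
#             ctx->A[i] = 0x67452301; ctx->B[i] = 0xefcdab89;
#             ctx->C[i] = 0x98badcfe; ctx->D[i] = 0x10325476;
#             ctx->E[i] = 0xc3d2e1f0;             /* SHA-1 IV per lane */
#             inp[i].ptr = data[i]; inp[i].blocks = blocks;
#         }
#         sha1_multi_block(ctx, inp, 1);  /* num==1 => lanes 0..3 only */
#     }
#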
85 $ctx="%rdi";    # 1st arg
86 $inp="%rsi";    # 2nd arg
87 $num="%edx";
88 @ptr=map("%r$_",(8..11));
89 $Tbl="%rbp";
90
91 @V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4));
92 ($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9));
93 @Xi=map("%xmm$_",(10..14));
94 $K="%xmm15";
95
96 if (1) {
97     # Atom-specific optimization aiming to eliminate pshufb with high
98     # registers [and thus get rid of 48 cycles of accumulated penalty]
99     @Xi=map("%xmm$_",(0..4));
100     ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
101     @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
102 }
103
104 $REG_SZ=16;
105
106 sub Xi_off {
107 my $off = shift;
108
109     $off %= 16; $off *= $REG_SZ;
110     $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
111 }
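
# The 16 message words circulate through a 16-slot ring buffer on the
# stack, one $REG_SZ-byte slot per word, so Xi_off(0) and Xi_off(16)
# name the same slot; e.g. Xi_off(0) -> "0-128(%rax)" and Xi_off(17) ->
# "16-128(%rax)".  %rax is biased 128 bytes into the buffer, presumably
# so every slot is reachable with a one-byte displacement; in the AVX2
# build ($REG_SZ==32) slots 8..15 fall beyond the 256-byte window and
# are addressed off %rbx, which that code points at %rax+256.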
112
113 sub BODY_00_19 {
114 my ($i,$a,$b,$c,$d,$e)=@_;
115 my $j=$i+1;
116 my $k=$i+2;
117
118 # Loads are performed 2+3/4 iterations in advance. 3/4 means that out
119 # of the 4 words you would expect to be loaded in a given iteration,
120 # one is spilled into the next iteration. In other words, indices in
121 # the four input streams are distributed as follows:
122 #
123 # $i==0:        0,0,0,0,1,1,1,1,2,2,2,
124 # $i==1:        2,3,3,3,
125 # $i==2:        3,4,4,4,
126 # ...
127 # $i==13:       14,15,15,15,
128 # $i==14:       15
129 #
130 # Then at $i==15 Xupdate is applied one iteration in advance...
131 $code.=<<___ if ($i==0);
132         movd            (@ptr[0]),@Xi[0]
133          lea            `16*4`(@ptr[0]),@ptr[0]
134         movd            (@ptr[1]),@Xi[2]        # borrow @Xi[2]
135          lea            `16*4`(@ptr[1]),@ptr[1]
136         movd            (@ptr[2]),@Xi[3]        # borrow @Xi[3]
137          lea            `16*4`(@ptr[2]),@ptr[2]
138         movd            (@ptr[3]),@Xi[4]        # borrow @Xi[4]
139          lea            `16*4`(@ptr[3]),@ptr[3]
140         punpckldq       @Xi[3],@Xi[0]
141          movd           `4*$j-16*4`(@ptr[0]),@Xi[1]
142         punpckldq       @Xi[4],@Xi[2]
143          movd           `4*$j-16*4`(@ptr[1]),$t3
144         punpckldq       @Xi[2],@Xi[0]
145          movd           `4*$j-16*4`(@ptr[2]),$t2
146         pshufb          $tx,@Xi[0]
147 ___
148 $code.=<<___ if ($i<14);                        # just load input
149          movd           `4*$j-16*4`(@ptr[3]),$t1
150          punpckldq      $t2,@Xi[1]
151         movdqa  $a,$t2
152         paddd   $K,$e                           # e+=K_00_19
153          punpckldq      $t1,$t3
154         movdqa  $b,$t1
155         movdqa  $b,$t0
156         pslld   \$5,$t2
157         pandn   $d,$t1
158         pand    $c,$t0
159          punpckldq      $t3,@Xi[1]
160         movdqa  $a,$t3
161
162         movdqa  @Xi[0],`&Xi_off($i)`
163         paddd   @Xi[0],$e                       # e+=X[i]
164          movd           `4*$k-16*4`(@ptr[0]),@Xi[2]
165         psrld   \$27,$t3
166         pxor    $t1,$t0                         # Ch(b,c,d)
167         movdqa  $b,$t1
168
169         por     $t3,$t2                         # rol(a,5)
170          movd           `4*$k-16*4`(@ptr[1]),$t3
171         pslld   \$30,$t1
172         paddd   $t0,$e                          # e+=Ch(b,c,d)
173
174         psrld   \$2,$b
175         paddd   $t2,$e                          # e+=rol(a,5)
176          pshufb $tx,@Xi[1]
177          movd           `4*$k-16*4`(@ptr[2]),$t2
178         por     $t1,$b                          # b=rol(b,30)
179 ___
180 $code.=<<___ if ($i==14);                       # just load input
181          movd           `4*$j-16*4`(@ptr[3]),$t1
182          punpckldq      $t2,@Xi[1]
183         movdqa  $a,$t2
184         paddd   $K,$e                           # e+=K_00_19
185          punpckldq      $t1,$t3
186         movdqa  $b,$t1
187         movdqa  $b,$t0
188         pslld   \$5,$t2
189          prefetcht0     63(@ptr[0])
190         pandn   $d,$t1
191         pand    $c,$t0
192          punpckldq      $t3,@Xi[1]
193         movdqa  $a,$t3
194
195         movdqa  @Xi[0],`&Xi_off($i)`
196         paddd   @Xi[0],$e                       # e+=X[i]
197         psrld   \$27,$t3
198         pxor    $t1,$t0                         # Ch(b,c,d)
199         movdqa  $b,$t1
200          prefetcht0     63(@ptr[1])
201
202         por     $t3,$t2                         # rol(a,5)
203         pslld   \$30,$t1
204         paddd   $t0,$e                          # e+=Ch(b,c,d)
205          prefetcht0     63(@ptr[2])
206
207         psrld   \$2,$b
208         paddd   $t2,$e                          # e+=rol(a,5)
209          pshufb $tx,@Xi[1]
210          prefetcht0     63(@ptr[3])
211         por     $t1,$b                          # b=rol(b,30)
212 ___
213 $code.=<<___ if ($i>=13 && $i<15);
214         movdqa  `&Xi_off($j+2)`,@Xi[3]          # preload "X[2]"
215 ___
216 $code.=<<___ if ($i>=15);                       # apply Xupdate
217         pxor    @Xi[-2],@Xi[1]                  # "X[13]"
218         movdqa  `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
219
220         movdqa  $a,$t2
221          pxor   `&Xi_off($j+8)`,@Xi[1]
222         paddd   $K,$e                           # e+=K_00_19
223         movdqa  $b,$t1
224         pslld   \$5,$t2
225          pxor   @Xi[3],@Xi[1]
226         movdqa  $b,$t0
227         pandn   $d,$t1
228          movdqa @Xi[1],$tx
229         pand    $c,$t0
230         movdqa  $a,$t3
231          psrld  \$31,$tx
232          paddd  @Xi[1],@Xi[1]
233
234         movdqa  @Xi[0],`&Xi_off($i)`
235         paddd   @Xi[0],$e                       # e+=X[i]
236         psrld   \$27,$t3
237         pxor    $t1,$t0                         # Ch(b,c,d)
238
239         movdqa  $b,$t1
240         por     $t3,$t2                         # rol(a,5)
241         pslld   \$30,$t1
242         paddd   $t0,$e                          # e+=Ch(b,c,d)
243
244         psrld   \$2,$b
245         paddd   $t2,$e                          # e+=rol(a,5)
246          por    $tx,@Xi[1]                      # rol   \$1,@Xi[1]
247         por     $t1,$b                          # b=rol(b,30)
248 ___
249 push(@Xi,shift(@Xi));
250 }
251
252 sub BODY_20_39 {
253 my ($i,$a,$b,$c,$d,$e)=@_;
254 my $j=$i+1;
255
256 $code.=<<___ if ($i<79);
257         pxor    @Xi[-2],@Xi[1]                  # "X[13]"
258         movdqa  `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
259
260         movdqa  $a,$t2
261         movdqa  $d,$t0
262          pxor   `&Xi_off($j+8)`,@Xi[1]
263         paddd   $K,$e                           # e+=K_20_39
264         pslld   \$5,$t2
265         pxor    $b,$t0
266
267         movdqa  $a,$t3
268 ___
269 $code.=<<___ if ($i<72);
270         movdqa  @Xi[0],`&Xi_off($i)`
271 ___
272 $code.=<<___ if ($i<79);
273         paddd   @Xi[0],$e                       # e+=X[i]
274          pxor   @Xi[3],@Xi[1]
275         psrld   \$27,$t3
276         pxor    $c,$t0                          # Parity(b,c,d)
277         movdqa  $b,$t1
278
279         pslld   \$30,$t1
280          movdqa @Xi[1],$tx
281         por     $t3,$t2                         # rol(a,5)
282          psrld  \$31,$tx
283         paddd   $t0,$e                          # e+=Parity(b,c,d)
284          paddd  @Xi[1],@Xi[1]
285
286         psrld   \$2,$b
287         paddd   $t2,$e                          # e+=rol(a,5)
288          por    $tx,@Xi[1]                      # rol(@Xi[1],1)
289         por     $t1,$b                          # b=rol(b,30)
290 ___
291 $code.=<<___ if ($i==79);
292         movdqa  $a,$t2
293         paddd   $K,$e                           # e+=K_20_39
294         movdqa  $d,$t0
295         pslld   \$5,$t2
296         pxor    $b,$t0
297
298         movdqa  $a,$t3
299         paddd   @Xi[0],$e                       # e+=X[i]
300         psrld   \$27,$t3
301         movdqa  $b,$t1
302         pxor    $c,$t0                          # Parity(b,c,d)
303
304         pslld   \$30,$t1
305         por     $t3,$t2                         # rol(a,5)
306         paddd   $t0,$e                          # e+=Parity(b,c,d)
307
308         psrld   \$2,$b
309         paddd   $t2,$e                          # e+=rol(a,5)
310         por     $t1,$b                          # b=rol(b,30)
311 ___
312 push(@Xi,shift(@Xi));
313 }
314
315 sub BODY_40_59 {
316 my ($i,$a,$b,$c,$d,$e)=@_;
317 my $j=$i+1;
318
319 $code.=<<___;
320         pxor    @Xi[-2],@Xi[1]                  # "X[13]"
321         movdqa  `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
322
323         movdqa  $a,$t2
324         movdqa  $d,$t1
325          pxor   `&Xi_off($j+8)`,@Xi[1]
326         pxor    @Xi[3],@Xi[1]
327         paddd   $K,$e                           # e+=K_40_59
328         pslld   \$5,$t2
329         movdqa  $a,$t3
330         pand    $c,$t1
331
332         movdqa  $d,$t0
333          movdqa @Xi[1],$tx
334         psrld   \$27,$t3
335         paddd   $t1,$e
336         pxor    $c,$t0
337
338         movdqa  @Xi[0],`&Xi_off($i)`
339         paddd   @Xi[0],$e                       # e+=X[i]
340         por     $t3,$t2                         # rol(a,5)
341          psrld  \$31,$tx
342         pand    $b,$t0
343         movdqa  $b,$t1
344
345         pslld   \$30,$t1
346          paddd  @Xi[1],@Xi[1]
347         paddd   $t0,$e                          # e+=Maj(b,d,c)
348
349         psrld   \$2,$b
350         paddd   $t2,$e                          # e+=rol(a,5)
351          por    $tx,@Xi[1]                      # rol(@X[1],1)
352         por     $t1,$b                          # b=rol(b,30)
353 ___
354 push(@Xi,shift(@Xi));
355 }
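
# For reference, one scalar SHA-1 compression as the three BODY_* subs
# above implement it, four (or eight) lanes at a time -- a minimal C
# sketch of the textbook algorithm, not code taken from this module:
#
#     static unsigned int rol32(unsigned int x, int n)
#     {   return (x << n) | (x >> (32 - n));   }
#
#     /* one 64-byte block; X[] is the same 16-word ring kept on the
#        stack here, already loaded big-endian */
#     static void sha1_block_ref(unsigned int h[5], unsigned int X[16])
#     {
#         unsigned int a = h[0], b = h[1], c = h[2], d = h[3], e = h[4], t;
#         for (int i = 0; i < 80; i++) {
#             unsigned int f, k;
#             if (i >= 16)
#                 X[i & 15] = rol32(X[(i-3)&15] ^ X[(i-8)&15] ^
#                                   X[(i-14)&15] ^ X[(i-16)&15], 1);
#             if (i < 20)      { f = (b & c) ^ (~b & d);          k = 0x5a827999; }
#             else if (i < 40) { f = b ^ c ^ d;                   k = 0x6ed9eba1; }
#             else if (i < 60) { f = (b & c) ^ (b & d) ^ (c & d); k = 0x8f1bbcdc; }
#             else             { f = b ^ c ^ d;                   k = 0xca62c1d6; }
#             e += rol32(a, 5) + f + k + X[i & 15];
#             b = rol32(b, 30);
#             t = e; e = d; d = c; c = b; b = a; a = t;  /* unshift(@V,pop(@V)) */
#         }
#         h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
#     }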
356
357 $code.=<<___;
358 .text
359
360 .extern OPENSSL_ia32cap_P
361
362 .globl  sha1_multi_block
363 .type   sha1_multi_block,\@function,3
364 .align  32
365 sha1_multi_block:
366 .cfi_startproc
367         mov     OPENSSL_ia32cap_P+4(%rip),%rcx
368         bt      \$61,%rcx                       # check SHA bit
369         jc      _shaext_shortcut
370 ___
371 $code.=<<___ if ($avx);
372         test    \$`1<<28`,%ecx
373         jnz     _avx_shortcut
374 ___
375 $code.=<<___;
376         mov     %rsp,%rax
377 .cfi_def_cfa_register   %rax
378         push    %rbx
379 .cfi_push       %rbx
380         push    %rbp
381 .cfi_push       %rbp
382 ___
383 $code.=<<___ if ($win64);
384         lea     -0xa8(%rsp),%rsp
385         movaps  %xmm6,(%rsp)
386         movaps  %xmm7,0x10(%rsp)
387         movaps  %xmm8,0x20(%rsp)
388         movaps  %xmm9,0x30(%rsp)
389         movaps  %xmm10,-0x78(%rax)
390         movaps  %xmm11,-0x68(%rax)
391         movaps  %xmm12,-0x58(%rax)
392         movaps  %xmm13,-0x48(%rax)
393         movaps  %xmm14,-0x38(%rax)
394         movaps  %xmm15,-0x28(%rax)
395 ___
396 $code.=<<___;
397         sub     \$`$REG_SZ*18`,%rsp
398         and     \$-256,%rsp
399         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
400 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
401 .Lbody:
402         lea     K_XX_XX(%rip),$Tbl
403         lea     `$REG_SZ*16`(%rsp),%rbx
404
405 .Loop_grande:
406         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
407         xor     $num,$num
408 ___
409 for($i=0;$i<4;$i++) {
410     $code.=<<___;
411         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
412         mov     `16*$i+8`($inp),%ecx            # number of blocks
413         cmp     $num,%ecx
414         cmovg   %ecx,$num                       # find maximum
415         test    %ecx,%ecx
416         mov     %ecx,`4*$i`(%rbx)               # initialize counters
417         cmovle  $Tbl,@ptr[$i]                   # cancel input
418 ___
419 }
420 $code.=<<___;
421         test    $num,$num
422         jz      .Ldone
423
424         movdqu  0x00($ctx),$A                   # load context
425          lea    128(%rsp),%rax
426         movdqu  0x20($ctx),$B
427         movdqu  0x40($ctx),$C
428         movdqu  0x60($ctx),$D
429         movdqu  0x80($ctx),$E
430         movdqa  0x60($Tbl),$tx                  # pbswap_mask
431         movdqa  -0x20($Tbl),$K                  # K_00_19
432         jmp     .Loop
433
434 .align  32
435 .Loop:
436 ___
437 for($i=0;$i<20;$i++)    { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
438 $code.="        movdqa  0x00($Tbl),$K\n";       # K_20_39
439 for(;$i<40;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
440 $code.="        movdqa  0x20($Tbl),$K\n";       # K_40_59
441 for(;$i<60;$i++)        { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
442 $code.="        movdqa  0x40($Tbl),$K\n";       # K_60_79
443 for(;$i<80;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
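# Tail of .Loop below: the per-lane block counters kept at (%rbx) are
# decremented through a pcmpgtd-generated mask; the same mask zeroes
# the working state of lanes whose counter already reached zero, so
# those lanes write their context back unchanged, while lanes that just
# consumed their last block get their input pointer redirected at $Tbl
# so that the next iteration's loads stay harmless.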
444 $code.=<<___;
445         movdqa  (%rbx),@Xi[0]                   # pull counters
446         mov     \$1,%ecx
447         cmp     4*0(%rbx),%ecx                  # examine counters
448         pxor    $t2,$t2
449         cmovge  $Tbl,@ptr[0]                    # cancel input
450         cmp     4*1(%rbx),%ecx
451         movdqa  @Xi[0],@Xi[1]
452         cmovge  $Tbl,@ptr[1]
453         cmp     4*2(%rbx),%ecx
454         pcmpgtd $t2,@Xi[1]                      # mask value
455         cmovge  $Tbl,@ptr[2]
456         cmp     4*3(%rbx),%ecx
457         paddd   @Xi[1],@Xi[0]                   # counters--
458         cmovge  $Tbl,@ptr[3]
459
460         movdqu  0x00($ctx),$t0
461         pand    @Xi[1],$A
462         movdqu  0x20($ctx),$t1
463         pand    @Xi[1],$B
464         paddd   $t0,$A
465         movdqu  0x40($ctx),$t2
466         pand    @Xi[1],$C
467         paddd   $t1,$B
468         movdqu  0x60($ctx),$t3
469         pand    @Xi[1],$D
470         paddd   $t2,$C
471         movdqu  0x80($ctx),$tx
472         pand    @Xi[1],$E
473         movdqu  $A,0x00($ctx)
474         paddd   $t3,$D
475         movdqu  $B,0x20($ctx)
476         paddd   $tx,$E
477         movdqu  $C,0x40($ctx)
478         movdqu  $D,0x60($ctx)
479         movdqu  $E,0x80($ctx)
480
481         movdqa  @Xi[0],(%rbx)                   # save counters
482         movdqa  0x60($Tbl),$tx                  # pbswap_mask
483         movdqa  -0x20($Tbl),$K                  # K_00_19
484         dec     $num
485         jnz     .Loop
486
487         mov     `$REG_SZ*17+8`(%rsp),$num
488         lea     $REG_SZ($ctx),$ctx
489         lea     `16*$REG_SZ/4`($inp),$inp
490         dec     $num
491         jnz     .Loop_grande
492
493 .Ldone:
494         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
495 .cfi_def_cfa    %rax,8
496 ___
497 $code.=<<___ if ($win64);
498         movaps  -0xb8(%rax),%xmm6
499         movaps  -0xa8(%rax),%xmm7
500         movaps  -0x98(%rax),%xmm8
501         movaps  -0x88(%rax),%xmm9
502         movaps  -0x78(%rax),%xmm10
503         movaps  -0x68(%rax),%xmm11
504         movaps  -0x58(%rax),%xmm12
505         movaps  -0x48(%rax),%xmm13
506         movaps  -0x38(%rax),%xmm14
507         movaps  -0x28(%rax),%xmm15
508 ___
509 $code.=<<___;
510         mov     -16(%rax),%rbp
511 .cfi_restore    %rbp
512         mov     -8(%rax),%rbx
513 .cfi_restore    %rbx
514         lea     (%rax),%rsp
515 .cfi_def_cfa_register   %rsp
516 .Lepilogue:
517         ret
518 .cfi_endproc
519 .size   sha1_multi_block,.-sha1_multi_block
520 ___
521                                                 {{{
522 my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
523 my @MSG0=map("%xmm$_",(4..7));
524 my @MSG1=map("%xmm$_",(11..14));
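# The SHAEXT path handles two independent streams per .Loop_shaext
# iteration: $ABCD0/$E0/@MSG0 carry stream 0 and $ABCD1/$E1/@MSG1 carry
# stream 1, with the two instruction streams interleaved (the extra-
# indented instructions belong to stream 1), presumably to hide
# sha1rnds4 latency.  This is also why $num is doubled below with
# "shl \$1,$num".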
525
526 $code.=<<___;
527 .type   sha1_multi_block_shaext,\@function,3
528 .align  32
529 sha1_multi_block_shaext:
530 .cfi_startproc
531 _shaext_shortcut:
532         mov     %rsp,%rax
533 .cfi_def_cfa_register   %rax
534         push    %rbx
535 .cfi_push       %rbx
536         push    %rbp
537 .cfi_push       %rbp
538 ___
539 $code.=<<___ if ($win64);
540         lea     -0xa8(%rsp),%rsp
541         movaps  %xmm6,(%rsp)
542         movaps  %xmm7,0x10(%rsp)
543         movaps  %xmm8,0x20(%rsp)
544         movaps  %xmm9,0x30(%rsp)
545         movaps  %xmm10,-0x78(%rax)
546         movaps  %xmm11,-0x68(%rax)
547         movaps  %xmm12,-0x58(%rax)
548         movaps  %xmm13,-0x48(%rax)
549         movaps  %xmm14,-0x38(%rax)
550         movaps  %xmm15,-0x28(%rax)
551 ___
552 $code.=<<___;
553         sub     \$`$REG_SZ*18`,%rsp
554         shl     \$1,$num                        # we process a pair at a time
555         and     \$-256,%rsp
556         lea     0x40($ctx),$ctx                 # size optimization
557         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
558 .Lbody_shaext:
559         lea     `$REG_SZ*16`(%rsp),%rbx
560         movdqa  K_XX_XX+0x80(%rip),$BSWAP       # byte-n-word swap
561
562 .Loop_grande_shaext:
563         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
564         xor     $num,$num
565 ___
566 for($i=0;$i<2;$i++) {
567     $code.=<<___;
568         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
569         mov     `16*$i+8`($inp),%ecx            # number of blocks
570         cmp     $num,%ecx
571         cmovg   %ecx,$num                       # find maximum
572         test    %ecx,%ecx
573         mov     %ecx,`4*$i`(%rbx)               # initialize counters
574         cmovle  %rsp,@ptr[$i]                   # cancel input
575 ___
576 }
577 $code.=<<___;
578         test    $num,$num
579         jz      .Ldone_shaext
580
581         movq            0x00-0x40($ctx),$ABCD0  # a1.a0
582         movq            0x20-0x40($ctx),@MSG0[0]# b1.b0
583         movq            0x40-0x40($ctx),@MSG0[1]# c1.c0
584         movq            0x60-0x40($ctx),@MSG0[2]# d1.d0
585         movq            0x80-0x40($ctx),@MSG0[3]# e1.e0
586
587         punpckldq       @MSG0[0],$ABCD0         # b1.a1.b0.a0
588         punpckldq       @MSG0[2],@MSG0[1]       # d1.c1.d0.c0
589
590         movdqa          $ABCD0,$ABCD1
591         punpcklqdq      @MSG0[1],$ABCD0         # d0.c0.b0.a0
592         punpckhqdq      @MSG0[1],$ABCD1         # d1.c1.b1.a1
593
594         pshufd          \$0b00111111,@MSG0[3],$E0
595         pshufd          \$0b01111111,@MSG0[3],$E1
596         pshufd          \$0b00011011,$ABCD0,$ABCD0
597         pshufd          \$0b00011011,$ABCD1,$ABCD1
598         jmp             .Loop_shaext
599
600 .align  32
601 .Loop_shaext:
602         movdqu          0x00(@ptr[0]),@MSG0[0]
603          movdqu         0x00(@ptr[1]),@MSG1[0]
604         movdqu          0x10(@ptr[0]),@MSG0[1]
605          movdqu         0x10(@ptr[1]),@MSG1[1]
606         movdqu          0x20(@ptr[0]),@MSG0[2]
607         pshufb          $BSWAP,@MSG0[0]
608          movdqu         0x20(@ptr[1]),@MSG1[2]
609          pshufb         $BSWAP,@MSG1[0]
610         movdqu          0x30(@ptr[0]),@MSG0[3]
611         lea             0x40(@ptr[0]),@ptr[0]
612         pshufb          $BSWAP,@MSG0[1]
613          movdqu         0x30(@ptr[1]),@MSG1[3]
614          lea            0x40(@ptr[1]),@ptr[1]
615          pshufb         $BSWAP,@MSG1[1]
616
617         movdqa          $E0,0x50(%rsp)          # offload
618         paddd           @MSG0[0],$E0
619          movdqa         $E1,0x70(%rsp)
620          paddd          @MSG1[0],$E1
621         movdqa          $ABCD0,0x40(%rsp)       # offload
622         movdqa          $ABCD0,$E0_
623          movdqa         $ABCD1,0x60(%rsp)
624          movdqa         $ABCD1,$E1_
625         sha1rnds4       \$0,$E0,$ABCD0          # 0-3
626         sha1nexte       @MSG0[1],$E0_
627          sha1rnds4      \$0,$E1,$ABCD1          # 0-3
628          sha1nexte      @MSG1[1],$E1_
629         pshufb          $BSWAP,@MSG0[2]
630         prefetcht0      127(@ptr[0])
631         sha1msg1        @MSG0[1],@MSG0[0]
632          pshufb         $BSWAP,@MSG1[2]
633          prefetcht0     127(@ptr[1])
634          sha1msg1       @MSG1[1],@MSG1[0]
635
636         pshufb          $BSWAP,@MSG0[3]
637         movdqa          $ABCD0,$E0
638          pshufb         $BSWAP,@MSG1[3]
639          movdqa         $ABCD1,$E1
640         sha1rnds4       \$0,$E0_,$ABCD0         # 4-7
641         sha1nexte       @MSG0[2],$E0
642          sha1rnds4      \$0,$E1_,$ABCD1         # 4-7
643          sha1nexte      @MSG1[2],$E1
644         pxor            @MSG0[2],@MSG0[0]
645         sha1msg1        @MSG0[2],@MSG0[1]
646          pxor           @MSG1[2],@MSG1[0]
647          sha1msg1       @MSG1[2],@MSG1[1]
648 ___
649 for($i=2;$i<20-4;$i++) {
650 $code.=<<___;
651         movdqa          $ABCD0,$E0_
652          movdqa         $ABCD1,$E1_
653         sha1rnds4       \$`int($i/5)`,$E0,$ABCD0        # 8-11
654         sha1nexte       @MSG0[3],$E0_
655          sha1rnds4      \$`int($i/5)`,$E1,$ABCD1        # 8-11
656          sha1nexte      @MSG1[3],$E1_
657         sha1msg2        @MSG0[3],@MSG0[0]
658          sha1msg2       @MSG1[3],@MSG1[0]
659         pxor            @MSG0[3],@MSG0[1]
660         sha1msg1        @MSG0[3],@MSG0[2]
661          pxor           @MSG1[3],@MSG1[1]
662          sha1msg1       @MSG1[3],@MSG1[2]
663 ___
664         ($E0,$E0_)=($E0_,$E0);          ($E1,$E1_)=($E1_,$E1);
665         push(@MSG0,shift(@MSG0));       push(@MSG1,shift(@MSG1));
666 }
667 $code.=<<___;
668         movdqa          $ABCD0,$E0_
669          movdqa         $ABCD1,$E1_
670         sha1rnds4       \$3,$E0,$ABCD0          # 64-67
671         sha1nexte       @MSG0[3],$E0_
672          sha1rnds4      \$3,$E1,$ABCD1          # 64-67
673          sha1nexte      @MSG1[3],$E1_
674         sha1msg2        @MSG0[3],@MSG0[0]
675          sha1msg2       @MSG1[3],@MSG1[0]
676         pxor            @MSG0[3],@MSG0[1]
677          pxor           @MSG1[3],@MSG1[1]
678
679         mov             \$1,%ecx
680         pxor            @MSG0[2],@MSG0[2]       # zero
681         cmp             4*0(%rbx),%ecx          # examine counters
682         cmovge          %rsp,@ptr[0]            # cancel input
683
684         movdqa          $ABCD0,$E0
685          movdqa         $ABCD1,$E1
686         sha1rnds4       \$3,$E0_,$ABCD0         # 68-71
687         sha1nexte       @MSG0[0],$E0
688          sha1rnds4      \$3,$E1_,$ABCD1         # 68-71
689          sha1nexte      @MSG1[0],$E1
690         sha1msg2        @MSG0[0],@MSG0[1]
691          sha1msg2       @MSG1[0],@MSG1[1]
692
693         cmp             4*1(%rbx),%ecx
694         cmovge          %rsp,@ptr[1]
695         movq            (%rbx),@MSG0[0]         # pull counters
696
697         movdqa          $ABCD0,$E0_
698          movdqa         $ABCD1,$E1_
699         sha1rnds4       \$3,$E0,$ABCD0          # 72-75
700         sha1nexte       @MSG0[1],$E0_
701          sha1rnds4      \$3,$E1,$ABCD1          # 72-75
702          sha1nexte      @MSG1[1],$E1_
703
704         pshufd          \$0x00,@MSG0[0],@MSG1[2]
705         pshufd          \$0x55,@MSG0[0],@MSG1[3]
706         movdqa          @MSG0[0],@MSG0[1]
707         pcmpgtd         @MSG0[2],@MSG1[2]
708         pcmpgtd         @MSG0[2],@MSG1[3]
709
710         movdqa          $ABCD0,$E0
711          movdqa         $ABCD1,$E1
712         sha1rnds4       \$3,$E0_,$ABCD0         # 76-79
713         sha1nexte       $MSG0[2],$E0
714          sha1rnds4      \$3,$E1_,$ABCD1         # 76-79
715          sha1nexte      $MSG0[2],$E1
716
717         pcmpgtd         @MSG0[2],@MSG0[1]       # counter mask
718         pand            @MSG1[2],$ABCD0
719         pand            @MSG1[2],$E0
720          pand           @MSG1[3],$ABCD1
721          pand           @MSG1[3],$E1
722         paddd           @MSG0[1],@MSG0[0]       # counters--
723
724         paddd           0x40(%rsp),$ABCD0
725         paddd           0x50(%rsp),$E0
726          paddd          0x60(%rsp),$ABCD1
727          paddd          0x70(%rsp),$E1
728
729         movq            @MSG0[0],(%rbx)         # save counters
730         dec             $num
731         jnz             .Loop_shaext
732
733         mov             `$REG_SZ*17+8`(%rsp),$num
734
735         pshufd          \$0b00011011,$ABCD0,$ABCD0
736         pshufd          \$0b00011011,$ABCD1,$ABCD1
737
738         movdqa          $ABCD0,@MSG0[0]
739         punpckldq       $ABCD1,$ABCD0           # b1.b0.a1.a0
740         punpckhdq       $ABCD1,@MSG0[0]         # d1.d0.c1.c0
741         punpckhdq       $E1,$E0                 # e1.e0.xx.xx
742         movq            $ABCD0,0x00-0x40($ctx)  # a1.a0
743         psrldq          \$8,$ABCD0
744         movq            @MSG0[0],0x40-0x40($ctx)# c1.c0
745         psrldq          \$8,@MSG0[0]
746         movq            $ABCD0,0x20-0x40($ctx)  # b1.b0
747         psrldq          \$8,$E0
748         movq            @MSG0[0],0x60-0x40($ctx)# d1.d0
749         movq            $E0,0x80-0x40($ctx)     # e1.e0
750
751         lea     `$REG_SZ/2`($ctx),$ctx
752         lea     `16*2`($inp),$inp
753         dec     $num
754         jnz     .Loop_grande_shaext
755
756 .Ldone_shaext:
757         #mov    `$REG_SZ*17`(%rsp),%rax         # original %rsp
758 ___
759 $code.=<<___ if ($win64);
760         movaps  -0xb8(%rax),%xmm6
761         movaps  -0xa8(%rax),%xmm7
762         movaps  -0x98(%rax),%xmm8
763         movaps  -0x88(%rax),%xmm9
764         movaps  -0x78(%rax),%xmm10
765         movaps  -0x68(%rax),%xmm11
766         movaps  -0x58(%rax),%xmm12
767         movaps  -0x48(%rax),%xmm13
768         movaps  -0x38(%rax),%xmm14
769         movaps  -0x28(%rax),%xmm15
770 ___
771 $code.=<<___;
772         mov     -16(%rax),%rbp
773 .cfi_restore    %rbp
774         mov     -8(%rax),%rbx
775 .cfi_restore    %rbx
776         lea     (%rax),%rsp
777 .cfi_def_cfa_register   %rsp
778 .Lepilogue_shaext:
779         ret
780 .cfi_endproc
781 .size   sha1_multi_block_shaext,.-sha1_multi_block_shaext
782 ___
783                                                 }}}
784
785                                                 if ($avx) {{{
786 sub BODY_00_19_avx {
787 my ($i,$a,$b,$c,$d,$e)=@_;
788 my $j=$i+1;
789 my $k=$i+2;
790 my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
791 my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4];
792
793 $code.=<<___ if ($i==0 && $REG_SZ==16);
794         vmovd           (@ptr[0]),@Xi[0]
795          lea            `16*4`(@ptr[0]),@ptr[0]
796         vmovd           (@ptr[1]),@Xi[2]        # borrow Xi[2]
797          lea            `16*4`(@ptr[1]),@ptr[1]
798         vpinsrd         \$1,(@ptr[2]),@Xi[0],@Xi[0]
799          lea            `16*4`(@ptr[2]),@ptr[2]
800         vpinsrd         \$1,(@ptr[3]),@Xi[2],@Xi[2]
801          lea            `16*4`(@ptr[3]),@ptr[3]
802          vmovd          `4*$j-16*4`(@ptr[0]),@Xi[1]
803         vpunpckldq      @Xi[2],@Xi[0],@Xi[0]
804          vmovd          `4*$j-16*4`($ptr_n),$t3
805         vpshufb         $tx,@Xi[0],@Xi[0]
806 ___
807 $code.=<<___ if ($i<15 && $REG_SZ==16);         # just load input
808          vpinsrd        \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
809          vpinsrd        \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
810 ___
811 $code.=<<___ if ($i==0 && $REG_SZ==32);
812         vmovd           (@ptr[0]),@Xi[0]
813          lea            `16*4`(@ptr[0]),@ptr[0]
814         vmovd           (@ptr[4]),@Xi[2]        # borrow Xi[2]
815          lea            `16*4`(@ptr[4]),@ptr[4]
816         vmovd           (@ptr[1]),$t2
817          lea            `16*4`(@ptr[1]),@ptr[1]
818         vmovd           (@ptr[5]),$t1
819          lea            `16*4`(@ptr[5]),@ptr[5]
820         vpinsrd         \$1,(@ptr[2]),@Xi[0],@Xi[0]
821          lea            `16*4`(@ptr[2]),@ptr[2]
822         vpinsrd         \$1,(@ptr[6]),@Xi[2],@Xi[2]
823          lea            `16*4`(@ptr[6]),@ptr[6]
824         vpinsrd         \$1,(@ptr[3]),$t2,$t2
825          lea            `16*4`(@ptr[3]),@ptr[3]
826         vpunpckldq      $t2,@Xi[0],@Xi[0]
827         vpinsrd         \$1,(@ptr[7]),$t1,$t1
828          lea            `16*4`(@ptr[7]),@ptr[7]
829         vpunpckldq      $t1,@Xi[2],@Xi[2]
830          vmovd          `4*$j-16*4`(@ptr[0]),@Xi[1]
831         vinserti128     @Xi[2],@Xi[0],@Xi[0]
832          vmovd          `4*$j-16*4`($ptr_n),$t3
833         vpshufb         $tx,@Xi[0],@Xi[0]
834 ___
835 $code.=<<___ if ($i<15 && $REG_SZ==32);         # just load input
836          vmovd          `4*$j-16*4`(@ptr[1]),$t2
837          vmovd          `4*$j-16*4`(@ptr[5]),$t1
838          vpinsrd        \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
839          vpinsrd        \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
840          vpinsrd        \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
841          vpunpckldq     $t2,@Xi[1],@Xi[1]
842          vpinsrd        \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
843          vpunpckldq     $t1,$t3,$t3
844 ___
845 $code.=<<___ if ($i<14);
846         vpaddd  $K,$e,$e                        # e+=K_00_19
847         vpslld  \$5,$a,$t2
848         vpandn  $d,$b,$t1
849         vpand   $c,$b,$t0
850
851         vmovdqa @Xi[0],`&Xi_off($i)`
852         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
853          $vpack         $t3,@Xi[1],@Xi[1]
854         vpsrld  \$27,$a,$t3
855         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
856          vmovd          `4*$k-16*4`(@ptr[0]),@Xi[2]
857
858         vpslld  \$30,$b,$t1
859         vpor    $t3,$t2,$t2                     # rol(a,5)
860          vmovd          `4*$k-16*4`($ptr_n),$t3
861         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
862
863         vpsrld  \$2,$b,$b
864         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
865          vpshufb        $tx,@Xi[1],@Xi[1]
866         vpor    $t1,$b,$b                       # b=rol(b,30)
867 ___
868 $code.=<<___ if ($i==14);
869         vpaddd  $K,$e,$e                        # e+=K_00_19
870          prefetcht0     63(@ptr[0])
871         vpslld  \$5,$a,$t2
872         vpandn  $d,$b,$t1
873         vpand   $c,$b,$t0
874
875         vmovdqa @Xi[0],`&Xi_off($i)`
876         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
877          $vpack         $t3,@Xi[1],@Xi[1]
878         vpsrld  \$27,$a,$t3
879          prefetcht0     63(@ptr[1])
880         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
881
882         vpslld  \$30,$b,$t1
883         vpor    $t3,$t2,$t2                     # rol(a,5)
884          prefetcht0     63(@ptr[2])
885         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
886
887         vpsrld  \$2,$b,$b
888         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
889          prefetcht0     63(@ptr[3])
890          vpshufb        $tx,@Xi[1],@Xi[1]
891         vpor    $t1,$b,$b                       # b=rol(b,30)
892 ___
893 $code.=<<___ if ($i>=13 && $i<15);
894         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # preload "X[2]"
895 ___
896 $code.=<<___ if ($i>=15);                       # apply Xupdate
897         vpxor   @Xi[-2],@Xi[1],@Xi[1]           # "X[13]"
898         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
899
900         vpaddd  $K,$e,$e                        # e+=K_00_19
901         vpslld  \$5,$a,$t2
902         vpandn  $d,$b,$t1
903          `"prefetcht0   63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
904         vpand   $c,$b,$t0
905
906         vmovdqa @Xi[0],`&Xi_off($i)`
907         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
908          vpxor  `&Xi_off($j+8)`,@Xi[1],@Xi[1]
909         vpsrld  \$27,$a,$t3
910         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
911          vpxor  @Xi[3],@Xi[1],@Xi[1]
912          `"prefetcht0   63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
913
914         vpslld  \$30,$b,$t1
915         vpor    $t3,$t2,$t2                     # rol(a,5)
916         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
917          `"prefetcht0   63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
918          vpsrld \$31,@Xi[1],$tx
919          vpaddd @Xi[1],@Xi[1],@Xi[1]
920
921         vpsrld  \$2,$b,$b
922          `"prefetcht0   63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
923         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
924          vpor   $tx,@Xi[1],@Xi[1]               # rol   \$1,@Xi[1]
925         vpor    $t1,$b,$b                       # b=rol(b,30)
926 ___
927 push(@Xi,shift(@Xi));
928 }
929
930 sub BODY_20_39_avx {
931 my ($i,$a,$b,$c,$d,$e)=@_;
932 my $j=$i+1;
933
934 $code.=<<___ if ($i<79);
935         vpxor   @Xi[-2],@Xi[1],@Xi[1]           # "X[13]"
936         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
937
938         vpslld  \$5,$a,$t2
939         vpaddd  $K,$e,$e                        # e+=K_20_39
940         vpxor   $b,$d,$t0
941 ___
942 $code.=<<___ if ($i<72);
943         vmovdqa @Xi[0],`&Xi_off($i)`
944 ___
945 $code.=<<___ if ($i<79);
946         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
947          vpxor  `&Xi_off($j+8)`,@Xi[1],@Xi[1]
948         vpsrld  \$27,$a,$t3
949         vpxor   $c,$t0,$t0                      # Parity(b,c,d)
950          vpxor  @Xi[3],@Xi[1],@Xi[1]
951
952         vpslld  \$30,$b,$t1
953         vpor    $t3,$t2,$t2                     # rol(a,5)
954         vpaddd  $t0,$e,$e                       # e+=Parity(b,c,d)
955          vpsrld \$31,@Xi[1],$tx
956          vpaddd @Xi[1],@Xi[1],@Xi[1]
957
958         vpsrld  \$2,$b,$b
959         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
960          vpor   $tx,@Xi[1],@Xi[1]               # rol(@Xi[1],1)
961         vpor    $t1,$b,$b                       # b=rol(b,30)
962 ___
963 $code.=<<___ if ($i==79);
964         vpslld  \$5,$a,$t2
965         vpaddd  $K,$e,$e                        # e+=K_20_39
966         vpxor   $b,$d,$t0
967
968         vpsrld  \$27,$a,$t3
969         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
970         vpxor   $c,$t0,$t0                      # Parity(b,c,d)
971
972         vpslld  \$30,$b,$t1
973         vpor    $t3,$t2,$t2                     # rol(a,5)
974         vpaddd  $t0,$e,$e                       # e+=Parity(b,c,d)
975
976         vpsrld  \$2,$b,$b
977         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
978         vpor    $t1,$b,$b                       # b=rol(b,30)
979 ___
980 push(@Xi,shift(@Xi));
981 }
982
983 sub BODY_40_59_avx {
984 my ($i,$a,$b,$c,$d,$e)=@_;
985 my $j=$i+1;
986
987 $code.=<<___;
988         vpxor   @Xi[-2],@Xi[1],@Xi[1]           # "X[13]"
989         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
990
991         vpaddd  $K,$e,$e                        # e+=K_40_59
992         vpslld  \$5,$a,$t2
993         vpand   $c,$d,$t1
994          vpxor  `&Xi_off($j+8)`,@Xi[1],@Xi[1]
995
996         vpaddd  $t1,$e,$e
997         vpsrld  \$27,$a,$t3
998         vpxor   $c,$d,$t0
999          vpxor  @Xi[3],@Xi[1],@Xi[1]
1000
1001         vmovdqu @Xi[0],`&Xi_off($i)`
1002         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
1003         vpor    $t3,$t2,$t2                     # rol(a,5)
1004          vpsrld \$31,@Xi[1],$tx
1005         vpand   $b,$t0,$t0
1006          vpaddd @Xi[1],@Xi[1],@Xi[1]
1007
1008         vpslld  \$30,$b,$t1
1009         vpaddd  $t0,$e,$e                       # e+=Maj(b,d,c)
1010
1011         vpsrld  \$2,$b,$b
1012         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
1013          vpor   $tx,@Xi[1],@Xi[1]               # rol(@X[1],1)
1014         vpor    $t1,$b,$b                       # b=rol(b,30)
1015 ___
1016 push(@Xi,shift(@Xi));
1017 }
1018
1019 $code.=<<___;
1020 .type   sha1_multi_block_avx,\@function,3
1021 .align  32
1022 sha1_multi_block_avx:
1023 .cfi_startproc
1024 _avx_shortcut:
1025 ___
1026 $code.=<<___ if ($avx>1);
1027         shr     \$32,%rcx
1028         cmp     \$2,$num
1029         jb      .Lavx
1030         test    \$`1<<5`,%ecx
1031         jnz     _avx2_shortcut
1032         jmp     .Lavx
1033 .align  32
1034 .Lavx:
1035 ___
1036 $code.=<<___;
1037         mov     %rsp,%rax
1038 .cfi_def_cfa_register   %rax
1039         push    %rbx
1040 .cfi_push       %rbx
1041         push    %rbp
1042 .cfi_push       %rbp
1043 ___
1044 $code.=<<___ if ($win64);
1045         lea     -0xa8(%rsp),%rsp
1046         movaps  %xmm6,(%rsp)
1047         movaps  %xmm7,0x10(%rsp)
1048         movaps  %xmm8,0x20(%rsp)
1049         movaps  %xmm9,0x30(%rsp)
1050         movaps  %xmm10,-0x78(%rax)
1051         movaps  %xmm11,-0x68(%rax)
1052         movaps  %xmm12,-0x58(%rax)
1053         movaps  %xmm13,-0x48(%rax)
1054         movaps  %xmm14,-0x38(%rax)
1055         movaps  %xmm15,-0x28(%rax)
1056 ___
1057 $code.=<<___;
1058         sub     \$`$REG_SZ*18`, %rsp
1059         and     \$-256,%rsp
1060         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
1061 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
1062 .Lbody_avx:
1063         lea     K_XX_XX(%rip),$Tbl
1064         lea     `$REG_SZ*16`(%rsp),%rbx
1065
1066         vzeroupper
1067 .Loop_grande_avx:
1068         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
1069         xor     $num,$num
1070 ___
1071 for($i=0;$i<4;$i++) {
1072     $code.=<<___;
1073         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
1074         mov     `16*$i+8`($inp),%ecx            # number of blocks
1075         cmp     $num,%ecx
1076         cmovg   %ecx,$num                       # find maximum
1077         test    %ecx,%ecx
1078         mov     %ecx,`4*$i`(%rbx)               # initialize counters
1079         cmovle  $Tbl,@ptr[$i]                   # cancel input
1080 ___
1081 }
1082 $code.=<<___;
1083         test    $num,$num
1084         jz      .Ldone_avx
1085
1086         vmovdqu 0x00($ctx),$A                   # load context
1087          lea    128(%rsp),%rax
1088         vmovdqu 0x20($ctx),$B
1089         vmovdqu 0x40($ctx),$C
1090         vmovdqu 0x60($ctx),$D
1091         vmovdqu 0x80($ctx),$E
1092         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1093         jmp     .Loop_avx
1094
1095 .align  32
1096 .Loop_avx:
1097 ___
1098 $code.="        vmovdqa -0x20($Tbl),$K\n";      # K_00_19
1099 for($i=0;$i<20;$i++)    { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1100 $code.="        vmovdqa 0x00($Tbl),$K\n";       # K_20_39
1101 for(;$i<40;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1102 $code.="        vmovdqa 0x20($Tbl),$K\n";       # K_40_59
1103 for(;$i<60;$i++)        { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1104 $code.="        vmovdqa 0x40($Tbl),$K\n";       # K_60_79
1105 for(;$i<80;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1106 $code.=<<___;
1107         mov     \$1,%ecx
1108 ___
1109 for($i=0;$i<4;$i++) {
1110     $code.=<<___;
1111         cmp     `4*$i`(%rbx),%ecx               # examine counters
1112         cmovge  $Tbl,@ptr[$i]                   # cancel input
1113 ___
1114 }
1115 $code.=<<___;
1116         vmovdqu (%rbx),$t0                      # pull counters
1117         vpxor   $t2,$t2,$t2
1118         vmovdqa $t0,$t1
1119         vpcmpgtd $t2,$t1,$t1                    # mask value
1120         vpaddd  $t1,$t0,$t0                     # counters--
1121
1122         vpand   $t1,$A,$A
1123         vpand   $t1,$B,$B
1124         vpaddd  0x00($ctx),$A,$A
1125         vpand   $t1,$C,$C
1126         vpaddd  0x20($ctx),$B,$B
1127         vpand   $t1,$D,$D
1128         vpaddd  0x40($ctx),$C,$C
1129         vpand   $t1,$E,$E
1130         vpaddd  0x60($ctx),$D,$D
1131         vpaddd  0x80($ctx),$E,$E
1132         vmovdqu $A,0x00($ctx)
1133         vmovdqu $B,0x20($ctx)
1134         vmovdqu $C,0x40($ctx)
1135         vmovdqu $D,0x60($ctx)
1136         vmovdqu $E,0x80($ctx)
1137
1138         vmovdqu $t0,(%rbx)                      # save counters
1139         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1140         dec     $num
1141         jnz     .Loop_avx
1142
1143         mov     `$REG_SZ*17+8`(%rsp),$num
1144         lea     $REG_SZ($ctx),$ctx
1145         lea     `16*$REG_SZ/4`($inp),$inp
1146         dec     $num
1147         jnz     .Loop_grande_avx
1148
1149 .Ldone_avx:
1150         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
1151 .cfi_def_cfa    %rax,8
1152         vzeroupper
1153 ___
1154 $code.=<<___ if ($win64);
1155         movaps  -0xb8(%rax),%xmm6
1156         movaps  -0xa8(%rax),%xmm7
1157         movaps  -0x98(%rax),%xmm8
1158         movaps  -0x88(%rax),%xmm9
1159         movaps  -0x78(%rax),%xmm10
1160         movaps  -0x68(%rax),%xmm11
1161         movaps  -0x58(%rax),%xmm12
1162         movaps  -0x48(%rax),%xmm13
1163         movaps  -0x38(%rax),%xmm14
1164         movaps  -0x28(%rax),%xmm15
1165 ___
1166 $code.=<<___;
1167         mov     -16(%rax),%rbp
1168 .cfi_restore    %rbp
1169         mov     -8(%rax),%rbx
1170 .cfi_restore    %rbx
1171         lea     (%rax),%rsp
1172 .cfi_def_cfa_register   %rsp
1173 .Lepilogue_avx:
1174         ret
1175 .cfi_endproc
1176 .size   sha1_multi_block_avx,.-sha1_multi_block_avx
1177 ___
1178
1179                                                 if ($avx>1) {
1180 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1181
1182 $REG_SZ=32;
1183
1184 @ptr=map("%r$_",(12..15,8..11));
1185
1186 @V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
1187 ($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
1188 @Xi=map("%ymm$_",(10..14));
1189 $K="%ymm15";
1190
1191 $code.=<<___;
1192 .type   sha1_multi_block_avx2,\@function,3
1193 .align  32
1194 sha1_multi_block_avx2:
1195 .cfi_startproc
1196 _avx2_shortcut:
1197         mov     %rsp,%rax
1198 .cfi_def_cfa_register   %rax
1199         push    %rbx
1200 .cfi_push       %rbx
1201         push    %rbp
1202 .cfi_push       %rbp
1203         push    %r12
1204 .cfi_push       %r12
1205         push    %r13
1206 .cfi_push       %r13
1207         push    %r14
1208 .cfi_push       %r14
1209         push    %r15
1210 .cfi_push       %r15
1211 ___
1212 $code.=<<___ if ($win64);
1213         lea     -0xa8(%rsp),%rsp
1214         movaps  %xmm6,(%rsp)
1215         movaps  %xmm7,0x10(%rsp)
1216         movaps  %xmm8,0x20(%rsp)
1217         movaps  %xmm9,0x30(%rsp)
1218         movaps  %xmm10,0x40(%rsp)
1219         movaps  %xmm11,0x50(%rsp)
1220         movaps  %xmm12,-0x78(%rax)
1221         movaps  %xmm13,-0x68(%rax)
1222         movaps  %xmm14,-0x58(%rax)
1223         movaps  %xmm15,-0x48(%rax)
1224 ___
1225 $code.=<<___;
1226         sub     \$`$REG_SZ*18`, %rsp
1227         and     \$-256,%rsp
1228         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
1229 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
1230 .Lbody_avx2:
1231         lea     K_XX_XX(%rip),$Tbl
1232         shr     \$1,$num
1233
1234         vzeroupper
1235 .Loop_grande_avx2:
1236         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
1237         xor     $num,$num
1238         lea     `$REG_SZ*16`(%rsp),%rbx
1239 ___
1240 for($i=0;$i<8;$i++) {
1241     $code.=<<___;
1242         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
1243         mov     `16*$i+8`($inp),%ecx            # number of blocks
1244         cmp     $num,%ecx
1245         cmovg   %ecx,$num                       # find maximum
1246         test    %ecx,%ecx
1247         mov     %ecx,`4*$i`(%rbx)               # initialize counters
1248         cmovle  $Tbl,@ptr[$i]                   # cancel input
1249 ___
1250 }
1251 $code.=<<___;
1252         vmovdqu 0x00($ctx),$A                   # load context
1253          lea    128(%rsp),%rax
1254         vmovdqu 0x20($ctx),$B
1255          lea    256+128(%rsp),%rbx
1256         vmovdqu 0x40($ctx),$C
1257         vmovdqu 0x60($ctx),$D
1258         vmovdqu 0x80($ctx),$E
1259         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1260         jmp     .Loop_avx2
1261
1262 .align  32
1263 .Loop_avx2:
1264 ___
1265 $code.="        vmovdqa -0x20($Tbl),$K\n";      # K_00_19
1266 for($i=0;$i<20;$i++)    { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1267 $code.="        vmovdqa 0x00($Tbl),$K\n";       # K_20_39
1268 for(;$i<40;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1269 $code.="        vmovdqa 0x20($Tbl),$K\n";       # K_40_59
1270 for(;$i<60;$i++)        { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1271 $code.="        vmovdqa 0x40($Tbl),$K\n";       # K_60_79
1272 for(;$i<80;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1273 $code.=<<___;
1274         mov     \$1,%ecx
1275         lea     `$REG_SZ*16`(%rsp),%rbx
1276 ___
1277 for($i=0;$i<8;$i++) {
1278     $code.=<<___;
1279         cmp     `4*$i`(%rbx),%ecx               # examine counters
1280         cmovge  $Tbl,@ptr[$i]                   # cancel input
1281 ___
1282 }
1283 $code.=<<___;
1284         vmovdqu (%rbx),$t0              # pull counters
1285         vpxor   $t2,$t2,$t2
1286         vmovdqa $t0,$t1
1287         vpcmpgtd $t2,$t1,$t1                    # mask value
1288         vpaddd  $t1,$t0,$t0                     # counters--
1289
1290         vpand   $t1,$A,$A
1291         vpand   $t1,$B,$B
1292         vpaddd  0x00($ctx),$A,$A
1293         vpand   $t1,$C,$C
1294         vpaddd  0x20($ctx),$B,$B
1295         vpand   $t1,$D,$D
1296         vpaddd  0x40($ctx),$C,$C
1297         vpand   $t1,$E,$E
1298         vpaddd  0x60($ctx),$D,$D
1299         vpaddd  0x80($ctx),$E,$E
1300         vmovdqu $A,0x00($ctx)
1301         vmovdqu $B,0x20($ctx)
1302         vmovdqu $C,0x40($ctx)
1303         vmovdqu $D,0x60($ctx)
1304         vmovdqu $E,0x80($ctx)
1305
1306         vmovdqu $t0,(%rbx)                      # save counters
1307         lea     256+128(%rsp),%rbx
1308         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1309         dec     $num
1310         jnz     .Loop_avx2
1311
1312         #mov    `$REG_SZ*17+8`(%rsp),$num
1313         #lea    $REG_SZ($ctx),$ctx
1314         #lea    `16*$REG_SZ/4`($inp),$inp
1315         #dec    $num
1316         #jnz    .Loop_grande_avx2
1317
1318 .Ldone_avx2:
1319         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
1320 .cfi_def_cfa    %rax,8
1321         vzeroupper
1322 ___
1323 $code.=<<___ if ($win64);
1324         movaps  -0xd8(%rax),%xmm6
1325         movaps  -0xc8(%rax),%xmm7
1326         movaps  -0xb8(%rax),%xmm8
1327         movaps  -0xa8(%rax),%xmm9
1328         movaps  -0x98(%rax),%xmm10
1329         movaps  -0x88(%rax),%xmm11
1330         movaps  -0x78(%rax),%xmm12
1331         movaps  -0x68(%rax),%xmm13
1332         movaps  -0x58(%rax),%xmm14
1333         movaps  -0x48(%rax),%xmm15
1334 ___
1335 $code.=<<___;
1336         mov     -48(%rax),%r15
1337 .cfi_restore    %r15
1338         mov     -40(%rax),%r14
1339 .cfi_restore    %r14
1340         mov     -32(%rax),%r13
1341 .cfi_restore    %r13
1342         mov     -24(%rax),%r12
1343 .cfi_restore    %r12
1344         mov     -16(%rax),%rbp
1345 .cfi_restore    %rbp
1346         mov     -8(%rax),%rbx
1347 .cfi_restore    %rbx
1348         lea     (%rax),%rsp
1349 .cfi_def_cfa_register   %rsp
1350 .Lepilogue_avx2:
1351         ret
1352 .cfi_endproc
1353 .size   sha1_multi_block_avx2,.-sha1_multi_block_avx2
1354 ___
1355                                                 }       }}}
1356 $code.=<<___;
1357
1358 .align  256
1359         .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1360         .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1361 K_XX_XX:
1362         .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1363         .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1364         .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1365         .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1366         .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1367         .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1368         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
1369         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
1370         .byte   0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
1371         .asciz  "SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1372 ___
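# Note the placement of K_XX_XX: the label points at the K_20_39 rows,
# so the code above reaches K_00_19 at -0x20($Tbl), K_40_59 at
# 0x20($Tbl), K_60_79 at 0x40($Tbl), the per-dword pbswap mask at
# 0x60($Tbl), and the 16-byte reversal mask used by the SHAEXT path at
# K_XX_XX+0x80.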
1373
1374 if ($win64) {
1375 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1376 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1377 $rec="%rcx";
1378 $frame="%rdx";
1379 $context="%r8";
1380 $disp="%r9";
1381
1382 $code.=<<___;
1383 .extern __imp_RtlVirtualUnwind
1384 .type   se_handler,\@abi-omnipotent
1385 .align  16
1386 se_handler:
1387         push    %rsi
1388         push    %rdi
1389         push    %rbx
1390         push    %rbp
1391         push    %r12
1392         push    %r13
1393         push    %r14
1394         push    %r15
1395         pushfq
1396         sub     \$64,%rsp
1397
1398         mov     120($context),%rax      # pull context->Rax
1399         mov     248($context),%rbx      # pull context->Rip
1400
1401         mov     8($disp),%rsi           # disp->ImageBase
1402         mov     56($disp),%r11          # disp->HandlerData
1403
1404         mov     0(%r11),%r10d           # HandlerData[0]
1405         lea     (%rsi,%r10),%r10        # end of prologue label
1406         cmp     %r10,%rbx               # context->Rip<.Lbody
1407         jb      .Lin_prologue
1408
1409         mov     152($context),%rax      # pull context->Rsp
1410
1411         mov     4(%r11),%r10d           # HandlerData[1]
1412         lea     (%rsi,%r10),%r10        # epilogue label
1413         cmp     %r10,%rbx               # context->Rip>=.Lepilogue
1414         jae     .Lin_prologue
1415
1416         mov     `16*17`(%rax),%rax      # pull saved stack pointer
1417
1418         mov     -8(%rax),%rbx
1419         mov     -16(%rax),%rbp
1420         mov     %rbx,144($context)      # restore context->Rbx
1421         mov     %rbp,160($context)      # restore context->Rbp
1422
1423         lea     -24-10*16(%rax),%rsi
1424         lea     512($context),%rdi      # &context.Xmm6
1425         mov     \$20,%ecx
1426         .long   0xa548f3fc              # cld; rep movsq
1427
1428 .Lin_prologue:
1429         mov     8(%rax),%rdi
1430         mov     16(%rax),%rsi
1431         mov     %rax,152($context)      # restore context->Rsp
1432         mov     %rsi,168($context)      # restore context->Rsi
1433         mov     %rdi,176($context)      # restore context->Rdi
1434
1435         mov     40($disp),%rdi          # disp->ContextRecord
1436         mov     $context,%rsi           # context
1437         mov     \$154,%ecx              # sizeof(CONTEXT)
1438         .long   0xa548f3fc              # cld; rep movsq
1439
1440         mov     $disp,%rsi
1441         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1442         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1443         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1444         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1445         mov     40(%rsi),%r10           # disp->ContextRecord
1446         lea     56(%rsi),%r11           # &disp->HandlerData
1447         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1448         mov     %r10,32(%rsp)           # arg5
1449         mov     %r11,40(%rsp)           # arg6
1450         mov     %r12,48(%rsp)           # arg7
1451         mov     %rcx,56(%rsp)           # arg8, (NULL)
1452         call    *__imp_RtlVirtualUnwind(%rip)
1453
1454         mov     \$1,%eax                # ExceptionContinueSearch
1455         add     \$64,%rsp
1456         popfq
1457         pop     %r15
1458         pop     %r14
1459         pop     %r13
1460         pop     %r12
1461         pop     %rbp
1462         pop     %rbx
1463         pop     %rdi
1464         pop     %rsi
1465         ret
1466 .size   se_handler,.-se_handler
1467 ___
1468 $code.=<<___ if ($avx>1);
1469 .type   avx2_handler,\@abi-omnipotent
1470 .align  16
1471 avx2_handler:
1472         push    %rsi
1473         push    %rdi
1474         push    %rbx
1475         push    %rbp
1476         push    %r12
1477         push    %r13
1478         push    %r14
1479         push    %r15
1480         pushfq
1481         sub     \$64,%rsp
1482
1483         mov     120($context),%rax      # pull context->Rax
1484         mov     248($context),%rbx      # pull context->Rip
1485
1486         mov     8($disp),%rsi           # disp->ImageBase
1487         mov     56($disp),%r11          # disp->HandlerData
1488
1489         mov     0(%r11),%r10d           # HandlerData[0]
1490         lea     (%rsi,%r10),%r10        # end of prologue label
1491         cmp     %r10,%rbx               # context->Rip<body label
1492         jb      .Lin_prologue
1493
1494         mov     152($context),%rax      # pull context->Rsp
1495
1496         mov     4(%r11),%r10d           # HandlerData[1]
1497         lea     (%rsi,%r10),%r10        # epilogue label
1498         cmp     %r10,%rbx               # context->Rip>=epilogue label
1499         jae     .Lin_prologue
1500
1501         mov     `32*17`(%rax),%rax      # pull saved stack pointer
1502
1503         mov     -8(%rax),%rbx
1504         mov     -16(%rax),%rbp
1505         mov     -24(%rax),%r12
1506         mov     -32(%rax),%r13
1507         mov     -40(%rax),%r14
1508         mov     -48(%rax),%r15
1509         mov     %rbx,144($context)      # restore context->Rbx
1510         mov     %rbp,160($context)      # restore context->Rbp
1511         mov     %r12,216($context)      # restore context->R12
1512         mov     %r13,224($context)      # restore context->R13
1513         mov     %r14,232($context)      # restore context->R14
1514         mov     %r15,240($context)      # restore context->R15
1515
1516         lea     -56-10*16(%rax),%rsi
1517         lea     512($context),%rdi      # &context.Xmm6
1518         mov     \$20,%ecx
1519         .long   0xa548f3fc              # cld; rep movsq
1520
1521         jmp     .Lin_prologue
1522 .size   avx2_handler,.-avx2_handler
1523 ___
1524 $code.=<<___;
1525 .section        .pdata
1526 .align  4
1527         .rva    .LSEH_begin_sha1_multi_block
1528         .rva    .LSEH_end_sha1_multi_block
1529         .rva    .LSEH_info_sha1_multi_block
1530         .rva    .LSEH_begin_sha1_multi_block_shaext
1531         .rva    .LSEH_end_sha1_multi_block_shaext
1532         .rva    .LSEH_info_sha1_multi_block_shaext
1533 ___
1534 $code.=<<___ if ($avx);
1535         .rva    .LSEH_begin_sha1_multi_block_avx
1536         .rva    .LSEH_end_sha1_multi_block_avx
1537         .rva    .LSEH_info_sha1_multi_block_avx
1538 ___
1539 $code.=<<___ if ($avx>1);
1540         .rva    .LSEH_begin_sha1_multi_block_avx2
1541         .rva    .LSEH_end_sha1_multi_block_avx2
1542         .rva    .LSEH_info_sha1_multi_block_avx2
1543 ___
1544 $code.=<<___;
1545 .section        .xdata
1546 .align  8
1547 .LSEH_info_sha1_multi_block:
1548         .byte   9,0,0,0
1549         .rva    se_handler
1550         .rva    .Lbody,.Lepilogue                       # HandlerData[]
1551 .LSEH_info_sha1_multi_block_shaext:
1552         .byte   9,0,0,0
1553         .rva    se_handler
1554         .rva    .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
1555 ___
1556 $code.=<<___ if ($avx);
1557 .LSEH_info_sha1_multi_block_avx:
1558         .byte   9,0,0,0
1559         .rva    se_handler
1560         .rva    .Lbody_avx,.Lepilogue_avx               # HandlerData[]
1561 ___
1562 $code.=<<___ if ($avx>1);
1563 .LSEH_info_sha1_multi_block_avx2:
1564         .byte   9,0,0,0
1565         .rva    avx2_handler
1566         .rva    .Lbody_avx2,.Lepilogue_avx2             # HandlerData[]
1567 ___
1568 }
1569 ####################################################################
1570
1571 sub rex {
1572   local *opcode=shift;
1573   my ($dst,$src)=@_;
1574   my $rex=0;
1575
1576     $rex|=0x04                  if ($dst>=8);
1577     $rex|=0x01                  if ($src>=8);
1578     unshift @opcode,$rex|0x40   if ($rex);
1579 }
1580
1581 sub sha1rnds4 {
1582     if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1583       my @opcode=(0x0f,0x3a,0xcc);
1584         rex(\@opcode,$3,$2);
1585         push @opcode,0xc0|($2&7)|(($3&7)<<3);           # ModR/M
1586         my $c=$1;
1587         push @opcode,$c=~/^0/?oct($c):$c;
1588         return ".byte\t".join(',',@opcode);
1589     } else {
1590         return "sha1rnds4\t".@_[0];
1591     }
1592 }
1593
1594 sub sha1op38 {
1595     my $instr = shift;
1596     my %opcodelet = (
1597                 "sha1nexte" => 0xc8,
1598                 "sha1msg1"  => 0xc9,
1599                 "sha1msg2"  => 0xca     );
1600
1601     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1602       my @opcode=(0x0f,0x38);
1603         rex(\@opcode,$2,$1);
1604         push @opcode,$opcodelet{$instr};
1605         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1606         return ".byte\t".join(',',@opcode);
1607     } else {
1608         return $instr."\t".@_[0];
1609     }
1610 }
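# These two helpers let the module assemble even with tools that lack
# the SHA-NI mnemonics, by emitting the raw opcode bytes instead.  Two
# worked encodings, for illustration:
#
#   sha1rnds4 $3,%xmm1,%xmm0   ->  .byte 0x0f,0x3a,0xcc,0xc1,3
#   sha1nexte %xmm4,%xmm8      ->  .byte 0x44,0x0f,0x38,0xc8,0xc4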
1611
1612 foreach (split("\n",$code)) {
1613         s/\`([^\`]*)\`/eval($1)/ge;
1614
1615         s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo                or
1616         s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo             or
1617
1618         s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go          or
1619         s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go         or
1620         s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go    or
1621         s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go        or
1622         s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go            or
1623         s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
1624
1625         print $_,"\n";
1626 }
1627
1628 close STDOUT or die "error closing STDOUT: $!";