3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Performance in cycles per byte out of large buffer.
20 # Core2 9.56/+89% 4.83
21 # Westmere 9.50/+45% 3.35
22 # Sandy Bridge 10.5/+47% 3.20
23 # Haswell 8.15/+50% 2.83
24 # Silvermont 17.4/+36% 8.35
25 # Sledgehammer 10.2/+54%
26 # Bulldozer 13.4/+50% 4.38(*)
28 # (*) Bulldozer actually executes 4xXOP code path that delivers 3.55;
# Split $0 at its last path separator; $dir keeps the leading directory part
# (separator included) and is used to extend @INC with the perlasm directory.
30 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
31 push(@INC,"${dir}","${dir}../../perlasm");
# The third argument is true when the last command-line argument is "386".
34 &asm_init($ARGV[0],"chacha-x86.pl",$ARGV[$#ARGV] eq "386");
# $xmm is switched on when -DOPENSSL_IA32_SSE2 is among the arguments.
37 for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
# NOTE(review): the opening line of the statement that the next three lines
# belong to is not visible in this excerpt. What is visible: the output of
# "$ENV{CC} -Wa,-v" is matched for a GNU assembler version, which must be
# at least 2.19.
# NOTE(review): here and below the captured "major.minor" text is compared
# as a decimal number, so e.g. "2.9" would also satisfy ">=2.19".
40 `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
41 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
42 $1>=2.19); # first version supporting AVX
# The remaining probes only run while $xmm is set and $ymm is still unset.
# win32n flavour: require NASM 2.03 or later.
44 $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
45 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
46 $1>=2.03); # first version supporting AVX
# win32 flavour: require ml major version 10 or later.
48 $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" &&
49 `ml 2>&1` =~ /Version ([0-9]+)\./ &&
50 $1>=10); # first version supporting AVX
# Any flavour: accept clang / LLVM-based compilers reporting 3.0 or later.
# The version is the second capture group, hence $2.
52 $ymm=1 if ($xmm && !$ymm &&
53 `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ &&
54 $2>=3.0); # first version supporting AVX
# Each of the b, c and d working values has two registers: the plain name
# and an alternate whose name carries a trailing underscore.
# NOTE(review): the assignment of $a is not visible in this excerpt.
57 ($b,$b_)=("ebx","ebp");
58 ($c,$c_)=("ecx","esi");
59 ($d,$d_)=("edx","edi");
# NOTE(review): the "sub" line opening this routine and its closing brace are
# not visible in this excerpt; judging by the five-argument calls
# &QUARTERROUND(...) further down, this is presumably its body -- confirm.
# Arguments: the state-word indices used for a, b, c, d and the sequence
# number $i of the call within one loop iteration (0..7 at the call sites).
62 my ($ai,$bi,$ci,$di,$i)=@_;
# "next"/"previous" indices: ($_&~3) keeps the group of four the index
# belongs to, the low two bits are stepped by +1 / -1 modulo 4.
63 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
64 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
68 # 0 4 8 12 < even round
72 # 0 5 10 15 < odd round
# NOTE(review): the conditions (and the initialisation of $j) guarding the
# four adjustments below are not visible in this excerpt. Note that $j is
# post-decremented / post-incremented once per mapped element.
79 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
82 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
85 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
88 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
# NOTE(review): the arithmetic instructions of the quarter-round itself are
# not visible in this excerpt; what remains is the interleaved register
# spill/reload traffic against the working state at esp.
91 #&add ($a,$b); # see elsewhere
# Store the alternate c register into the "previous" c slot; only emitted
# when $ai is 1 or 2.
93 &mov (&DWP(4*$cp,"esp"),$c_) if ($ai>0 && $ai<3);
# Store the alternate b register into the "previous" b slot (skipped on the
# first call of an iteration).
95 &mov (&DWP(4*$bp,"esp"),$b_) if ($i!=0);
# Preload the "next" c, d and b words into the alternate registers.
97 &mov ($c_,&DWP(4*$cn,"esp")) if ($ai>0 && $ai<3);
99 &mov ($d_,&DWP(4*$dn,"esp")) if ($di!=$dn);
101 &mov ($b_,&DWP(4*$bn,"esp")) if ($i<7);
# On the last call the alternate b register instead receives the dword at
# esp+128, where the loop head saves its counter.
102 &mov ($b_,&DWP(128,"esp")) if ($i==7); # loop counter
# Write back this call's a word and fetch the next one.
105 &mov (&DWP(4*$ai,"esp"),$a);
107 &mov ($a,&DWP(4*$an,"esp"));
# d is either stored to its slot or, when the next d index equals the current
# one, handed over directly in the alternate register.
109 &mov (&DWP(4*$di,"esp"),$d) if ($di!=$dn);
110 &mov ($d_,$d) if ($di==$dn);
# First addition of the following call, issued early (all calls but the last).
112 &add ($a,$b_) if ($i<7); # elsewhere
# Labels that are referenced from more than one function in this file.
120 &static_label("ssse3_shortcut");
121 &static_label("xop_shortcut");
122 &static_label("ssse3_data");
123 &static_label("pic_point");
# ChaCha20_ctr32 -- code path built on general-purpose registers.
# Stack arguments, as annotated where they are read below:
#   wparam(0) output, wparam(1) input, wparam(2) len,
#   wparam(3) key, wparam(4) counter and nonce.
# NOTE(review): this excerpt omits a number of lines of the function (labels,
# stack set-up, flag-setting instructions); comments below describe only the
# lines that are visible.
125 &function_begin("ChaCha20_ctr32");
# NOTE(review): the instruction that sets eax before this compare is not
# visible in this excerpt.
127 &cmp ("eax",&wparam(2)); # len==0?
128 &je (&label("no_data"));
# NOTE(review): looks like the call-to-next-label idiom used to address
# OPENSSL_ia32cap_P position-independently; the line between the label and
# picmeup is not visible here -- confirm.
130 &call (&label("pic_point"));
131 &set_label("pic_point");
133 &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
# Capability tests on the first two dwords at ebp. The conditional branches
# acting on them are not visible in this excerpt; only the final jump to the
# SSSE3 entry point is.
134 &test (&DWP(0,"ebp"),1<<24); # test FXSR bit
136 &test (&DWP(4,"ebp"),1<<9); # test SSSE3 bit
138 &jmp (&label("ssse3_shortcut"));
# NOTE(review): presumably the target of the branches not shown above.
141 &mov ("esi",&wparam(3)); # key
142 &mov ("edi",&wparam(4)); # counter and nonce
# Build a master copy of the input state at esp+64: the eight key words go
# to word positions 4..11, the four counter/nonce words to positions 12..15.
146 &mov ("eax",&DWP(4*0,"esi")); # copy key
147 &mov ("ebx",&DWP(4*1,"esi"));
148 &mov ("ecx",&DWP(4*2,"esi"));
149 &mov ("edx",&DWP(4*3,"esi"));
150 &mov (&DWP(64+4*4,"esp"),"eax");
151 &mov (&DWP(64+4*5,"esp"),"ebx");
152 &mov (&DWP(64+4*6,"esp"),"ecx");
153 &mov (&DWP(64+4*7,"esp"),"edx");
154 &mov ("eax",&DWP(4*4,"esi"));
155 &mov ("ebx",&DWP(4*5,"esi"));
156 &mov ("ecx",&DWP(4*6,"esi"));
157 &mov ("edx",&DWP(4*7,"esi"));
158 &mov (&DWP(64+4*8,"esp"),"eax");
159 &mov (&DWP(64+4*9,"esp"),"ebx");
160 &mov (&DWP(64+4*10,"esp"),"ecx");
161 &mov (&DWP(64+4*11,"esp"),"edx");
162 &mov ("eax",&DWP(4*0,"edi")); # copy counter and nonce
163 &mov ("ebx",&DWP(4*1,"edi"));
164 &mov ("ecx",&DWP(4*2,"edi"));
165 &mov ("edx",&DWP(4*3,"edi"));
167 &mov (&DWP(64+4*12,"esp"),"eax");
168 &mov (&DWP(64+4*13,"esp"),"ebx");
169 &mov (&DWP(64+4*14,"esp"),"ecx");
170 &mov (&DWP(64+4*15,"esp"),"edx");
171 &jmp (&label("entry"));
# One pass of outer_loop per 64-byte block (both pointers advance by 4*16
# further down). The updated pointers and length are parked in the argument
# slots because the registers are needed for the rounds.
173 &set_label("outer_loop",16);
174 &mov (&wparam(1),$b); # save input
175 &mov (&wparam(0),$a); # save output
176 &mov (&wparam(2),$c); # save len
# The four constant words; read as little-endian bytes they spell
# "expand 32-byte k". Word 0 stays in $a, words 1..3 go to the working
# state at esp.
178 &mov ($a,0x61707865);
179 &mov (&DWP(4*1,"esp"),0x3320646e);
180 &mov (&DWP(4*2,"esp"),0x79622d32);
181 &mov (&DWP(4*3,"esp"),0x6b206574);
# Refresh the working state (esp+0..) from the master copy (esp+64..).
# Words 5, 6, 10, 11, 13, 14 are written to memory here ...
183 &mov ($b, &DWP(64+4*5,"esp")); # copy key material
184 &mov ($b_,&DWP(64+4*6,"esp"));
185 &mov ($c, &DWP(64+4*10,"esp"));
186 &mov ($c_,&DWP(64+4*11,"esp"));
187 &mov ($d, &DWP(64+4*13,"esp"));
188 &mov ($d_,&DWP(64+4*14,"esp"));
189 &mov (&DWP(4*5,"esp"),$b);
190 &mov (&DWP(4*6,"esp"),$b_);
191 &mov (&DWP(4*10,"esp"),$c);
192 &mov (&DWP(4*11,"esp"),$c_);
193 &mov (&DWP(4*13,"esp"),$d);
194 &mov (&DWP(4*14,"esp"),$d_);
# ... words 7 and 15 follow below, while words 4, 8, 9 and 12 are left in
# $b_, $c, $c_ and $d for the first quarter-round.
196 &mov ($b, &DWP(64+4*7,"esp"));
197 &mov ($d_,&DWP(64+4*15,"esp"));
198 &mov ($d, &DWP(64+4*12,"esp"));
199 &mov ($b_,&DWP(64+4*4,"esp"));
200 &mov ($c, &DWP(64+4*8,"esp"));
201 &mov ($c_,&DWP(64+4*9,"esp"));
# The block counter is incremented here and the new value is written back
# to the master copy, so the master copy always holds the counter of the
# block currently being processed.
202 &add ($d,1); # counter value
203 &mov (&DWP(4*7,"esp"),$b);
204 &mov (&DWP(4*15,"esp"),$d_);
205 &mov (&DWP(64+4*12,"esp"),$d); # save counter value
# Ten iterations of eight quarter-rounds each.
207 &mov ($b,10); # loop counter
208 &jmp (&label("loop"));
210 &set_label("loop",16);
211 &add ($a,$b_); # elsewhere
212 &mov (&DWP(128,"esp"),$b); # save loop counter
# Calls 0..3 take the index pattern the routine labels "even round",
# calls 4..7 the one it labels "odd round".
214 &QUARTERROUND(0, 4, 8, 12, 0);
215 &QUARTERROUND(1, 5, 9, 13, 1);
216 &QUARTERROUND(2, 6,10, 14, 2);
217 &QUARTERROUND(3, 7,11, 15, 3);
218 &QUARTERROUND(0, 5,10, 15, 4);
219 &QUARTERROUND(1, 6,11, 12, 5);
220 &QUARTERROUND(2, 7, 8, 13, 6);
221 &QUARTERROUND(3, 4, 9, 14, 7);
# NOTE(review): the instruction that updates the loop counter and sets the
# flags for this branch is not visible in this excerpt.
223 &jnz (&label("loop"));
225 &mov ($b,&wparam(2)); # load len
# Add the input state to the words still held in registers (0, 4, 8, 9).
227 &add ($a,0x61707865); # accumulate key material
228 &add ($b_,&DWP(64+4*4,"esp"));
229 &add ($c, &DWP(64+4*8,"esp"));
230 &add ($c_,&DWP(64+4*9,"esp"));
# NOTE(review): the compare feeding this branch is not visible in this
# excerpt.
233 &jb (&label("tail"));
# Full-block path. Output is produced in three groups because only a few
# registers are free: words 0/4/8/9/12/14, then 1/2/3/5/6, then
# 7/10/11/13/15. Word 0 is parked at esp+0 and written last.
235 &mov ($b,&wparam(1)); # load input pointer
236 &add ($d, &DWP(64+4*12,"esp"));
237 &add ($d_,&DWP(64+4*14,"esp"));
239 &xor ($a, &DWP(4*0,$b)); # xor with input
240 &xor ($b_,&DWP(4*4,$b));
241 &mov (&DWP(4*0,"esp"),$a);
242 &mov ($a,&wparam(0)); # load output pointer
243 &xor ($c, &DWP(4*8,$b));
244 &xor ($c_,&DWP(4*9,$b));
245 &xor ($d, &DWP(4*12,$b));
246 &xor ($d_,&DWP(4*14,$b));
247 &mov (&DWP(4*4,$a),$b_); # write output
248 &mov (&DWP(4*8,$a),$c);
249 &mov (&DWP(4*9,$a),$c_);
250 &mov (&DWP(4*12,$a),$d);
251 &mov (&DWP(4*14,$a),$d_);
# Second group: words 1, 2, 3, 5, 6.
253 &mov ($b_,&DWP(4*1,"esp"));
254 &mov ($c, &DWP(4*2,"esp"));
255 &mov ($c_,&DWP(4*3,"esp"));
256 &mov ($d, &DWP(4*5,"esp"));
257 &mov ($d_,&DWP(4*6,"esp"));
258 &add ($b_,0x3320646e); # accumulate key material
259 &add ($c, 0x79622d32);
260 &add ($c_,0x6b206574);
261 &add ($d, &DWP(64+4*5,"esp"));
262 &add ($d_,&DWP(64+4*6,"esp"));
263 &xor ($b_,&DWP(4*1,$b));
264 &xor ($c, &DWP(4*2,$b));
265 &xor ($c_,&DWP(4*3,$b));
266 &xor ($d, &DWP(4*5,$b));
267 &xor ($d_,&DWP(4*6,$b));
268 &mov (&DWP(4*1,$a),$b_);
269 &mov (&DWP(4*2,$a),$c);
270 &mov (&DWP(4*3,$a),$c_);
271 &mov (&DWP(4*5,$a),$d);
272 &mov (&DWP(4*6,$a),$d_);
# Third group: words 7, 10, 11, 13, 15.
274 &mov ($b_,&DWP(4*7,"esp"));
275 &mov ($c, &DWP(4*10,"esp"));
276 &mov ($c_,&DWP(4*11,"esp"));
277 &mov ($d, &DWP(4*13,"esp"));
278 &mov ($d_,&DWP(4*15,"esp"));
279 &add ($b_,&DWP(64+4*7,"esp"));
280 &add ($c, &DWP(64+4*10,"esp"));
281 &add ($c_,&DWP(64+4*11,"esp"));
282 &add ($d, &DWP(64+4*13,"esp"));
283 &add ($d_,&DWP(64+4*15,"esp"));
284 &xor ($b_,&DWP(4*7,$b));
285 &xor ($c, &DWP(4*10,$b));
286 &xor ($c_,&DWP(4*11,$b));
287 &xor ($d, &DWP(4*13,$b));
288 &xor ($d_,&DWP(4*15,$b));
# Advance the input pointer by one 64-byte block.
289 &lea ($b,&DWP(4*16,$b));
290 &mov (&DWP(4*7,$a),$b_);
# Fetch word 0, parked at esp+0 earlier, so it can be written out below.
291 &mov ($b_,&DWP(4*0,"esp"));
292 &mov (&DWP(4*10,$a),$c);
293 &mov ($c,&wparam(2)); # len
294 &mov (&DWP(4*11,$a),$c_);
295 &mov (&DWP(4*13,$a),$d);
296 &mov (&DWP(4*15,$a),$d_);
297 &mov (&DWP(4*0,$a),$b_);
# Advance the output pointer by one 64-byte block.
298 &lea ($a,&DWP(4*16,$a));
# NOTE(review): the instruction adjusting len and setting the flags for this
# branch is not visible in this excerpt.
300 &jnz (&label("outer_loop"));
302 &jmp (&label("done"));
# NOTE(review): the label that starts this section is not visible in this
# excerpt; presumably the "tail" target branched to above. Unlike the
# full-block path, the finished key-stream words are stored back to the
# working state at esp instead of being xored with input directly.
305 &add ($d, &DWP(64+4*12,"esp"));
306 &add ($d_,&DWP(64+4*14,"esp"));
307 &mov (&DWP(4*0,"esp"),$a);
308 &mov (&DWP(4*4,"esp"),$b_);
309 &mov (&DWP(4*8,"esp"),$c);
310 &mov (&DWP(4*9,"esp"),$c_);
311 &mov (&DWP(4*12,"esp"),$d);
312 &mov (&DWP(4*14,"esp"),$d_);
314 &mov ($b_,&DWP(4*1,"esp"));
315 &mov ($c, &DWP(4*2,"esp"));
316 &mov ($c_,&DWP(4*3,"esp"));
317 &mov ($d, &DWP(4*5,"esp"));
318 &mov ($d_,&DWP(4*6,"esp"));
319 &add ($b_,0x3320646e); # accumulate key material
320 &add ($c, 0x79622d32);
321 &add ($c_,0x6b206574);
322 &add ($d, &DWP(64+4*5,"esp"));
323 &add ($d_,&DWP(64+4*6,"esp"));
324 &mov (&DWP(4*1,"esp"),$b_);
325 &mov (&DWP(4*2,"esp"),$c);
326 &mov (&DWP(4*3,"esp"),$c_);
327 &mov (&DWP(4*5,"esp"),$d);
328 &mov (&DWP(4*6,"esp"),$d_);
330 &mov ($b_,&DWP(4*7,"esp"));
331 &mov ($c, &DWP(4*10,"esp"));
332 &mov ($c_,&DWP(4*11,"esp"));
333 &mov ($d, &DWP(4*13,"esp"));
334 &mov ($d_,&DWP(4*15,"esp"));
335 &add ($b_,&DWP(64+4*7,"esp"));
336 &add ($c, &DWP(64+4*10,"esp"));
337 &add ($c_,&DWP(64+4*11,"esp"));
338 &add ($d, &DWP(64+4*13,"esp"));
339 &add ($d_,&DWP(64+4*15,"esp"));
340 &mov (&DWP(4*7,"esp"),$b_);
341 &mov ($b_,&wparam(1)); # load input
342 &mov (&DWP(4*10,"esp"),$c);
343 &mov ($c,&wparam(0)); # load output
344 &mov (&DWP(4*11,"esp"),$c_);
346 &mov (&DWP(4*13,"esp"),$d);
347 &mov (&DWP(4*15,"esp"),$d_);
# Byte-wise loop over the remaining input: $c_ is the running byte index,
# $b_ the input pointer and $c the output pointer. "al" receives an input
# byte and "dl" the byte of the stored block at the same index.
# NOTE(review): the initialisation of $c_, the instruction combining al with
# dl and the instruction setting the flags for jnz are not visible in this
# excerpt.
351 &set_label("tail_loop");
352 &movb ("al",&BP(0,$c_,$b_));
353 &movb ("dl",&BP(0,"esp",$c_));
354 &lea ($c_,&DWP(1,$c_));
# The index was already advanced, hence the -1 displacement.
356 &mov (&BP(-1,$c,$c_),"al");
358 &jnz (&label("tail_loop"));
362 &set_label("no_data");
363 &function_end("ChaCha20_ctr32");
# Register naming for the 4x SSSE3 path: xmm0..xmm7 form four
# primary/alternate pairs (a, b, c, d), mirroring the scalar path.
366 my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
367 my ($out,$inp,$len)=("edi","esi","ecx");
# QUARTERROUND_SSSE3(ai, bi, ci, di, i)
#   ai..di  state-row indices used for a, b, c, d
#   i       sequence number of the call within one loop iteration (0..7)
# State rows are addressed as 16*index-128 off ebx.
# NOTE(review): several instruction lines of the body and the closing brace
# of this sub are not visible in this excerpt; comments cover visible lines
# only.
369 sub QUARTERROUND_SSSE3 {
370 my ($ai,$bi,$ci,$di,$i)=@_;
# "next"/"previous" indices: same group of four ($_&~3), low two bits stepped
# by +1 / -1 modulo 4.
371 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
372 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
376 # 0 4 8 12 < even round
380 # 0 5 10 15 < odd round
# NOTE(review): the conditions (and the initialisation of $j) guarding the
# four adjustments below are not visible in this excerpt.
387 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
390 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
393 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
396 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
399 #&paddd ($xa,$xb); # see elsewhere
400 #&pxor ($xd,$xa); # see elsewhere
# Spill the alternate c register to the "previous" c row ($ai of 1 or 2 only).
401 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3);
# Byte shuffle with the first 16-byte entry at eax.
402 &pshufb ($xd,&QWP(0,"eax")); # rot16
403 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0);
# Preload the "next" c and b rows into the alternate registers.
405 &movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3);
407 &movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7);
408 &movdqa ($xa_,$xb); # borrow as temporary
# The alternate a register is free again here and receives the "next" a row.
412 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
414 &movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn);
416 &movdqa (&QWP(16*$ai-128,"ebx"),$xa);
# Byte shuffle with the second 16-byte entry at eax.
417 &pshufb ($xd,&QWP(16,"eax")); # rot8
# d is either stored to its row or, when the next d index equals the current
# one, handed over in the alternate register.
419 &movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn);
420 &movdqa ($xd_,$xd) if ($di==$dn);
# First add/xor of the following call, issued early (all calls but the last).
422 &paddd ($xa_,$xb_) if ($i<7); # elsewhere
423 &movdqa ($xa,$xb); # borrow as temporary
426 &pxor ($xd_,$xa_) if ($i<7); # elsewhere
# Swap the Perl-level names of each primary/alternate pair, so the next call
# treats the registers preloaded above as its primary operands.
429 ($xa,$xa_)=($xa_,$xa);
430 ($xb,$xb_)=($xb_,$xb);
431 ($xc,$xc_)=($xc_,$xc);
432 ($xd,$xd_)=($xd_,$xd);
# ChaCha20_ssse3 -- SSE/SSSE3 code path. It has a 4x section (outer_loop,
# 64*4 bytes per pass) and a 1x section (outer1x/loop1x, 64 bytes per pass).
# NOTE(review): this excerpt omits a number of lines of the function (stack
# set-up, labels, flag-setting instructions, xor instructions); comments
# below describe only the lines that are visible.
435 &function_begin("ChaCha20_ssse3");
# Entry point jumped to from ChaCha20_ctr32; ebp is used below as the
# capability-vector pointer.
436 &set_label("ssse3_shortcut");
437 &test (&DWP(4,"ebp"),1<<11); # test XOP bit
438 &jnz (&label("xop_shortcut"));
440 &mov ($out,&wparam(0));
441 &mov ($inp,&wparam(1));
442 &mov ($len,&wparam(2));
443 &mov ("edx",&wparam(3)); # key
444 &mov ("ebx",&wparam(4)); # counter and nonce
# The dword stored at 512(esp) is what gets loaded back into esp at the end
# of the function.
449 &mov (&DWP(512,"esp"),"ebp");
# Turn eax into a pointer to the constant table at ssse3_data (relative to
# pic_point).
451 &lea ("eax",&DWP(&label("ssse3_data")."-".
452 &label("pic_point"),"eax"));
453 &movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce
458 &mov (&DWP(512+4,"esp"),"edx"); # offload pointers
459 &mov (&DWP(512+8,"esp"),"ebx");
# len is kept biased by 64*4 (the amount handled per 4x pass).
460 &sub ($len,64*4); # bias len
# ebp -> esp+256+128; with the -128 displacement used below, row n of the
# broadcast input state lives at esp+256+16*n.
461 &lea ("ebp",&DWP(256+128,"esp")); # size optimization
# Broadcast every 32-bit word of counter/nonce and key into its own register
# (pshufd 0x00/0x55/0xaa/0xff replicate dword 0/1/2/3 across all lanes).
463 &movdqu ("xmm7",&QWP(0,"edx")); # key
464 &pshufd ("xmm0","xmm3",0x00);
465 &pshufd ("xmm1","xmm3",0x55);
466 &pshufd ("xmm2","xmm3",0xaa);
467 &pshufd ("xmm3","xmm3",0xff);
# NOTE(review): table entries 16*3 and 16*4 are not visible in this excerpt;
# per the existing comment they adjust the broadcast counter lanes.
468 &paddd ("xmm0",&QWP(16*3,"eax")); # fix counters
469 &pshufd ("xmm4","xmm7",0x00);
470 &pshufd ("xmm5","xmm7",0x55);
471 &psubd ("xmm0",&QWP(16*4,"eax"));
472 &pshufd ("xmm6","xmm7",0xaa);
473 &pshufd ("xmm7","xmm7",0xff);
# Rows 12..15: counter and nonce.
474 &movdqa (&QWP(16*12-128,"ebp"),"xmm0");
475 &movdqa (&QWP(16*13-128,"ebp"),"xmm1");
476 &movdqa (&QWP(16*14-128,"ebp"),"xmm2");
477 &movdqa (&QWP(16*15-128,"ebp"),"xmm3");
478 &movdqu ("xmm3",&QWP(16,"edx")); # key
# Rows 4..7: first half of the key.
479 &movdqa (&QWP(16*4-128,"ebp"),"xmm4");
480 &movdqa (&QWP(16*5-128,"ebp"),"xmm5");
481 &movdqa (&QWP(16*6-128,"ebp"),"xmm6");
482 &movdqa (&QWP(16*7-128,"ebp"),"xmm7");
483 &movdqa ("xmm7",&QWP(16*2,"eax")); # sigma
# ebx -> esp+128; with the -128 displacement, row n of the working state
# lives at esp+16*n.
484 &lea ("ebx",&DWP(128,"esp")); # size optimization
486 &pshufd ("xmm0","xmm3",0x00);
487 &pshufd ("xmm1","xmm3",0x55);
488 &pshufd ("xmm2","xmm3",0xaa);
489 &pshufd ("xmm3","xmm3",0xff);
490 &pshufd ("xmm4","xmm7",0x00);
491 &pshufd ("xmm5","xmm7",0x55);
492 &pshufd ("xmm6","xmm7",0xaa);
493 &pshufd ("xmm7","xmm7",0xff);
# Rows 8..11: second half of the key; rows 0..3: sigma.
494 &movdqa (&QWP(16*8-128,"ebp"),"xmm0");
495 &movdqa (&QWP(16*9-128,"ebp"),"xmm1");
496 &movdqa (&QWP(16*10-128,"ebp"),"xmm2");
497 &movdqa (&QWP(16*11-128,"ebp"),"xmm3");
498 &movdqa (&QWP(16*0-128,"ebp"),"xmm4");
499 &movdqa (&QWP(16*1-128,"ebp"),"xmm5");
500 &movdqa (&QWP(16*2-128,"ebp"),"xmm6");
501 &movdqa (&QWP(16*3-128,"ebp"),"xmm7");
# Both data pointers are biased by +128; the 4x section compensates with
# -128 displacements.
503 &lea ($inp,&DWP(128,$inp)); # size optimization
504 &lea ($out,&DWP(128,$out)); # size optimization
505 &jmp (&label("outer_loop"));
# 4x section: copy the broadcast input state (ebp) to the working state
# (ebx). Rows 0, 4, 8, 9 are not copied through memory (commented-out lines);
# they are loaded straight into the quarter-round registers below.
507 &set_label("outer_loop",16);
508 #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material
509 &movdqa ("xmm1",&QWP(16*1-128,"ebp"));
510 &movdqa ("xmm2",&QWP(16*2-128,"ebp"));
511 &movdqa ("xmm3",&QWP(16*3-128,"ebp"));
512 #&movdqa ("xmm4",&QWP(16*4-128,"ebp"));
513 &movdqa ("xmm5",&QWP(16*5-128,"ebp"));
514 &movdqa ("xmm6",&QWP(16*6-128,"ebp"));
515 &movdqa ("xmm7",&QWP(16*7-128,"ebp"));
516 #&movdqa (&QWP(16*0-128,"ebx"),"xmm0");
517 &movdqa (&QWP(16*1-128,"ebx"),"xmm1");
518 &movdqa (&QWP(16*2-128,"ebx"),"xmm2");
519 &movdqa (&QWP(16*3-128,"ebx"),"xmm3");
520 #&movdqa (&QWP(16*4-128,"ebx"),"xmm4");
521 &movdqa (&QWP(16*5-128,"ebx"),"xmm5");
522 &movdqa (&QWP(16*6-128,"ebx"),"xmm6");
523 &movdqa (&QWP(16*7-128,"ebx"),"xmm7");
524 #&movdqa ("xmm0",&QWP(16*8-128,"ebp"));
525 #&movdqa ("xmm1",&QWP(16*9-128,"ebp"));
526 &movdqa ("xmm2",&QWP(16*10-128,"ebp"));
527 &movdqa ("xmm3",&QWP(16*11-128,"ebp"));
528 &movdqa ("xmm4",&QWP(16*12-128,"ebp"));
529 &movdqa ("xmm5",&QWP(16*13-128,"ebp"));
530 &movdqa ("xmm6",&QWP(16*14-128,"ebp"));
531 &movdqa ("xmm7",&QWP(16*15-128,"ebp"));
# Advance the counter row by table entry 16*4 and store the result in both
# the working copy and the saved input state.
532 &paddd ("xmm4",&QWP(16*4,"eax")); # counter value
533 #&movdqa (&QWP(16*8-128,"ebx"),"xmm0");
534 #&movdqa (&QWP(16*9-128,"ebx"),"xmm1");
535 &movdqa (&QWP(16*10-128,"ebx"),"xmm2");
536 &movdqa (&QWP(16*11-128,"ebx"),"xmm3");
537 &movdqa (&QWP(16*12-128,"ebx"),"xmm4");
538 &movdqa (&QWP(16*13-128,"ebx"),"xmm5");
539 &movdqa (&QWP(16*14-128,"ebx"),"xmm6");
540 &movdqa (&QWP(16*15-128,"ebx"),"xmm7");
541 &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value
# Operands of the first quarter-round (rows 0, 12, 4, 8) plus row 9.
543 &movdqa ($xa, &QWP(16*0-128,"ebp"));
544 &movdqa ($xd, "xmm4");
545 &movdqa ($xb_,&QWP(16*4-128,"ebp"));
546 &movdqa ($xc, &QWP(16*8-128,"ebp"));
547 &movdqa ($xc_,&QWP(16*9-128,"ebp"));
# Ten iterations of eight quarter-rounds each.
549 &mov ("edx",10); # loop counter
552 &set_label("loop",16);
553 &paddd ($xa,$xb_); # elsewhere
555 &pxor ($xd,$xa); # elsewhere
556 &QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
557 &QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
558 &QUARTERROUND_SSSE3(2, 6,10, 14, 2);
559 &QUARTERROUND_SSSE3(3, 7,11, 15, 3);
560 &QUARTERROUND_SSSE3(0, 5,10, 15, 4);
561 &QUARTERROUND_SSSE3(1, 6,11, 12, 5);
562 &QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
563 &QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
# NOTE(review): the instruction that updates edx and sets the flags for this
# branch is not visible in this excerpt.
565 &jnz (&label("loop"));
# Flush the rows still held in registers back to the working state.
567 &movdqa (&QWP(16*4-128,"ebx"),$xb_);
568 &movdqa (&QWP(16*8-128,"ebx"),$xc);
569 &movdqa (&QWP(16*9-128,"ebx"),$xc_);
570 &movdqa (&QWP(16*12-128,"ebx"),$xd);
571 &movdqa (&QWP(16*14-128,"ebx"),$xd_);
# New names for the same eight registers, used by the output stage.
573 my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
575 #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there
576 &movdqa ($xa1,&QWP(16*1-128,"ebx"));
577 &movdqa ($xa2,&QWP(16*2-128,"ebx"));
578 &movdqa ($xa3,&QWP(16*3-128,"ebx"));
# Perl-level loop: the sequence below is emitted four times, for
# $i = 0, 64, 128, 192, i.e. once per group of four state rows.
# NOTE(review): the closing brace of this loop, some unpack/move steps and
# the xor-with-input instructions are not visible in this excerpt.
580 for($i=0;$i<256;$i+=64) {
581 &paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material
582 &paddd ($xa1,&QWP($i+16*1-128,"ebp"));
583 &paddd ($xa2,&QWP($i+16*2-128,"ebp"));
584 &paddd ($xa3,&QWP($i+16*3-128,"ebp"));
586 &movdqa ($xt2,$xa0); # "de-interlace" data
587 &punpckldq ($xa0,$xa1);
589 &punpckldq ($xa2,$xa3);
590 &punpckhdq ($xt2,$xa1);
591 &punpckhdq ($xt3,$xa3);
593 &punpcklqdq ($xa0,$xa2); # "a0"
595 &punpcklqdq ($xt2,$xt3); # "a2"
596 &punpckhqdq ($xa1,$xa2); # "a1"
597 &punpckhqdq ($xa3,$xt3); # "a3"
599 #($xa2,$xt2)=($xt2,$xa2);
# The four loads are 64 bytes apart: the same 16-byte slice of four
# consecutive 64-byte blocks.
601 &movdqu ($xt0,&QWP(64*0-128,$inp)); # load input
602 &movdqu ($xt1,&QWP(64*1-128,$inp));
603 &movdqu ($xa2,&QWP(64*2-128,$inp));
604 &movdqu ($xt3,&QWP(64*3-128,$inp));
# Step to the next 16-byte slice; on the last pass add 64*4-16*3 instead so
# that the pointer has moved by 64*4 in total.
605 &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
# Preload the next group of four rows (not needed after the last pass).
607 &movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192);
609 &movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192);
611 &movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192);
613 &movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192);
614 &movdqu (&QWP(64*0-128,$out),$xt0); # store output
615 &movdqu (&QWP(64*1-128,$out),$xt1);
616 &movdqu (&QWP(64*2-128,$out),$xt2);
617 &movdqu (&QWP(64*3-128,$out),$xt3);
618 &lea ($out,&QWP($i<192?16:(64*4-16*3),$out));
# NOTE(review): the instructions that adjust len and set the flags for the
# next two branches are not visible in this excerpt.
621 &jnc (&label("outer_loop"));
624 &jz (&label("done"));
# 1x section set-up: recover the key and counter pointers and undo the +128
# bias on the data pointers.
626 &mov ("ebx",&DWP(512+8,"esp")); # restore pointers
627 &lea ($inp,&DWP(-128,$inp));
628 &mov ("edx",&DWP(512+4,"esp"));
629 &lea ($out,&DWP(-128,$out));
# Build the counter/nonce vector for the 1x section: take the low dword of
# the saved counter row, add table entry 16*6, clear the first dword of the
# caller's counter/nonce with the mask at 16*7 and merge the two.
631 &movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value
632 &movdqu ("xmm3",&QWP(0,"ebx"));
633 &paddd ("xmm2",&QWP(16*6,"eax")); # +four
634 &pand ("xmm3",&QWP(16*7,"eax"));
635 &por ("xmm3","xmm2"); # counter value
# Register names for the 1x section: one register per state row plus two
# temporaries and the two shuffle masks.
637 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
# NOTE(review): the body of SSSE3ROUND and the places it is invoked from are
# not visible in this excerpt.
639 sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
664 &movdqa ($a,&QWP(16*2,"eax")); # sigma
665 &movdqu ($b,&QWP(0,"edx"));
666 &movdqu ($c,&QWP(16,"edx"));
667 #&movdqu ($d,&QWP(0,"ebx")); # already loaded
668 &movdqa ($rot16,&QWP(0,"eax"));
669 &movdqa ($rot24,&QWP(16,"eax"));
# NOTE(review): in the visible lines this dword is overwritten by the 16-byte
# store of $d to the same address a few lines below, before anything reads it.
670 &mov (&DWP(16*3,"esp"),"ebp");
# Keep the input state for this block in the four 16-byte slots at esp.
672 &movdqa (&QWP(16*0,"esp"),$a);
673 &movdqa (&QWP(16*1,"esp"),$b);
674 &movdqa (&QWP(16*2,"esp"),$c);
675 &movdqa (&QWP(16*3,"esp"),$d);
677 &jmp (&label("loop1x"));
# Subsequent blocks: reload the input state and add table entry 16*5 to the
# counter row, storing the new value back.
679 &set_label("outer1x",16);
680 &movdqa ($d,&QWP(16*5,"eax")); # one
681 &movdqa ($a,&QWP(16*0,"esp"));
682 &movdqa ($b,&QWP(16*1,"esp"));
683 &movdqa ($c,&QWP(16*2,"esp"));
684 &paddd ($d,&QWP(16*3,"esp"));
686 &movdqa (&QWP(16*3,"esp"),$d);
687 &jmp (&label("loop1x"));
689 &set_label("loop1x",16);
# Lane rotations: c by two positions (swap of the 64-bit halves), b and d by
# one position in opposite directions.
691 &pshufd ($c,$c,0b01001110);
692 &pshufd ($b,$b,0b00111001);
693 &pshufd ($d,$d,0b10010011);
# Inverse lane rotations (the b and d immediates are exchanged).
697 &pshufd ($c,$c,0b01001110);
698 &pshufd ($b,$b,0b10010011);
699 &pshufd ($d,$d,0b00111001);
# NOTE(review): the instruction setting the flags for this branch is not
# visible in this excerpt.
702 &jnz (&label("loop1x"));
# Add the saved input state to the result of the rounds.
704 &paddd ($a,&QWP(16*0,"esp"));
705 &paddd ($b,&QWP(16*1,"esp"));
706 &paddd ($c,&QWP(16*2,"esp"));
707 &paddd ($d,&QWP(16*3,"esp"));
# NOTE(review): the compare feeding this branch is not visible in this
# excerpt.
710 &jb (&label("tail"));
# NOTE(review): only the xor of $a is visible; the xors for the other three
# rows are not part of this excerpt.
712 &movdqu ($t,&QWP(16*0,$inp));
713 &movdqu ($t1,&QWP(16*1,$inp));
714 &pxor ($a,$t); # xor with input
715 &movdqu ($t,&QWP(16*2,$inp));
717 &movdqu ($t1,&QWP(16*3,$inp));
720 &lea ($inp,&DWP(16*4,$inp)); # inp+=64
722 &movdqu (&QWP(16*0,$out),$a); # write output
723 &movdqu (&QWP(16*1,$out),$b);
724 &movdqu (&QWP(16*2,$out),$c);
725 &movdqu (&QWP(16*3,$out),$d);
726 &lea ($out,&DWP(16*4,$out)); # out+=64
729 &jnz (&label("outer1x"));
731 &jmp (&label("done"));
# NOTE(review): the label that starts this section is not visible in this
# excerpt; presumably the "tail" target. The finished block is stored to the
# four slots at esp so it can be consumed byte by byte.
734 &movdqa (&QWP(16*0,"esp"),$a);
735 &movdqa (&QWP(16*1,"esp"),$b);
736 &movdqa (&QWP(16*2,"esp"),$c);
737 &movdqa (&QWP(16*3,"esp"),$d);
# Byte-wise loop with ebp as the running index: "al" receives the stored
# block byte, "dl" the input byte at the same index.
# NOTE(review): the initialisation of ebp, the instruction combining al with
# dl and the instruction setting the flags for jnz are not visible in this
# excerpt.
743 &set_label("tail_loop");
744 &movb ("al",&BP(0,"esp","ebp"));
745 &movb ("dl",&BP(0,$inp,"ebp"));
746 &lea ("ebp",&DWP(1,"ebp"));
# The index was already advanced, hence the -1 displacement.
748 &movb (&BP(-1,$out,"ebp"),"al");
750 &jnz (&label("tail_loop"));
# Reload the stack pointer from the slot written near the function entry.
753 &mov ("esp",&DWP(512,"esp"));
754 &function_end("ChaCha20_ssse3");
# Constant table addressed through eax in the SIMD code paths, in 16-byte
# entries.
# NOTE(review): several entries between the sigma row and the final mask row
# are not visible in this excerpt.
757 &set_label("ssse3_data");
# Entry 0: pshufb mask that moves bytes 2,3,0,1 of every dword to positions
# 0..3 (annotated "rot16" at its points of use).
758 &data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
# Entry 1: pshufb mask that moves bytes 3,0,1,2 of every dword to positions
# 0..3 (annotated "rot8" in the 4x quarter-round, loaded as $rot24 in the 1x
# sections).
759 &data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
# Entry 2: the four constant words ("sigma"); as little-endian bytes they
# spell "expand 32-byte k".
760 &data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
# Mask with the first dword clear and the other three set; and-ed with the
# caller's counter/nonce when the 1x counter is built.
765 &data_word(0,-1,-1,-1);
768 &asciz ("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");
# Register naming for the 4x XOP path: xmm0..xmm7 form four
# primary/alternate pairs (a, b, c, d).
771 my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
772 my ($out,$inp,$len)=("edi","esi","ecx");
# QUARTERROUND_XOP(ai, bi, ci, di, i)
#   ai..di  state-row indices used for a, b, c, d
#   i       sequence number of the call within one loop iteration (0..7)
# State rows are addressed as 16*index-128 off ebx. Rotations use vprotd
# with an immediate count instead of byte shuffles and shift pairs.
# NOTE(review): a few instruction lines of the body and the closing brace of
# this sub are not visible in this excerpt.
774 sub QUARTERROUND_XOP {
775 my ($ai,$bi,$ci,$di,$i)=@_;
# "next"/"previous" indices: same group of four ($_&~3), low two bits stepped
# by +1 / -1 modulo 4.
776 my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next
777 my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous
781 # 0 4 8 12 < even round
785 # 0 5 10 15 < odd round
# NOTE(review): the conditions (and the initialisation of $j) guarding the
# four adjustments below are not visible in this excerpt.
792 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
795 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
798 ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
801 ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
804 #&vpaddd ($xa,$xa,$xb); # see elsewhere
805 #&vpxor ($xd,$xd,$xa); # see elsewhere
# Arithmetic on the primary registers is interleaved with spills of the
# previous call's rows and preloads of the next call's rows.
806 &vmovdqa (&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3);
807 &vprotd ($xd,$xd,16);
808 &vmovdqa (&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0);
809 &vpaddd ($xc,$xc,$xd);
810 &vmovdqa ($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3);
# On the first call of an iteration the b operand is read from the alternate
# register; the three-operand form moves it into the primary one.
811 &vpxor ($xb,$i!=0?$xb:$xb_,$xc);
812 &vmovdqa ($xa_,&QWP(16*$an-128,"ebx"));
813 &vprotd ($xb,$xb,12);
814 &vmovdqa ($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7);
815 &vpaddd ($xa,$xa,$xb);
816 &vmovdqa ($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn);
817 &vpxor ($xd,$xd,$xa);
# First addition of the following call, issued early (all calls but the last).
818 &vpaddd ($xa_,$xa_,$xb_) if ($i<7); # elsewhere
820 &vmovdqa (&QWP(16*$ai-128,"ebx"),$xa);
821 &vpaddd ($xc,$xc,$xd);
822 &vmovdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn);
823 &vpxor ($xb,$xb,$xc);
# First xor of the following call; when the next d index equals the current
# one, the source is this call's d register rather than a preloaded row.
824 &vpxor ($xd_,$di==$dn?$xd:$xd_,$xa_) if ($i<7); # elsewhere
# Swap the Perl-level names of each primary/alternate pair, so the next call
# treats the registers preloaded above as its primary operands.
827 ($xa,$xa_)=($xa_,$xa);
828 ($xb,$xb_)=($xb_,$xb);
829 ($xc,$xc_)=($xc_,$xc);
830 ($xd,$xd_)=($xd_,$xd);
# ChaCha20_xop -- XOP code path. Same layout as ChaCha20_ssse3: a 4x section
# (outer_loop, 64*4 bytes per pass) and a 1x section (outer1x/loop1x),
# written with VEX three-operand instructions.
# NOTE(review): this excerpt omits a number of lines of the function (stack
# set-up, labels, flag-setting instructions, the 1x round code); comments
# below describe only the lines that are visible.
833 &function_begin("ChaCha20_xop");
# Entry point jumped to from ChaCha20_ssse3 when the XOP bit is set.
834 &set_label("xop_shortcut");
835 &mov ($out,&wparam(0));
836 &mov ($inp,&wparam(1));
837 &mov ($len,&wparam(2));
838 &mov ("edx",&wparam(3)); # key
839 &mov ("ebx",&wparam(4)); # counter and nonce
# The dword stored at 512(esp) is what gets loaded back into esp at the end
# of the function.
845 &mov (&DWP(512,"esp"),"ebp");
# Turn eax into a pointer to the constant table at ssse3_data (relative to
# pic_point).
847 &lea ("eax",&DWP(&label("ssse3_data")."-".
848 &label("pic_point"),"eax"));
849 &vmovdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce
854 &mov (&DWP(512+4,"esp"),"edx"); # offload pointers
855 &mov (&DWP(512+8,"esp"),"ebx");
# len is kept biased by 64*4 (the amount handled per 4x pass).
856 &sub ($len,64*4); # bias len
# ebp -> esp+256+128; with the -128 displacement used below, row n of the
# broadcast input state lives at esp+256+16*n.
857 &lea ("ebp",&DWP(256+128,"esp")); # size optimization
# Broadcast every 32-bit word of counter/nonce and key into its own register
# (vpshufd 0x00/0x55/0xaa/0xff replicate dword 0/1/2/3 across all lanes).
859 &vmovdqu ("xmm7",&QWP(0,"edx")); # key
860 &vpshufd ("xmm0","xmm3",0x00);
861 &vpshufd ("xmm1","xmm3",0x55);
862 &vpshufd ("xmm2","xmm3",0xaa);
863 &vpshufd ("xmm3","xmm3",0xff);
# NOTE(review): table entries 16*3 and 16*4 are not visible in this excerpt;
# per the existing comment they adjust the broadcast counter lanes.
864 &vpaddd ("xmm0","xmm0",&QWP(16*3,"eax")); # fix counters
865 &vpshufd ("xmm4","xmm7",0x00);
866 &vpshufd ("xmm5","xmm7",0x55);
867 &vpsubd ("xmm0","xmm0",&QWP(16*4,"eax"));
868 &vpshufd ("xmm6","xmm7",0xaa);
869 &vpshufd ("xmm7","xmm7",0xff);
# Rows 12..15: counter and nonce.
870 &vmovdqa (&QWP(16*12-128,"ebp"),"xmm0");
871 &vmovdqa (&QWP(16*13-128,"ebp"),"xmm1");
872 &vmovdqa (&QWP(16*14-128,"ebp"),"xmm2");
873 &vmovdqa (&QWP(16*15-128,"ebp"),"xmm3");
874 &vmovdqu ("xmm3",&QWP(16,"edx")); # key
# Rows 4..7: first half of the key.
875 &vmovdqa (&QWP(16*4-128,"ebp"),"xmm4");
876 &vmovdqa (&QWP(16*5-128,"ebp"),"xmm5");
877 &vmovdqa (&QWP(16*6-128,"ebp"),"xmm6");
878 &vmovdqa (&QWP(16*7-128,"ebp"),"xmm7");
879 &vmovdqa ("xmm7",&QWP(16*2,"eax")); # sigma
# ebx -> esp+128; with the -128 displacement, row n of the working state
# lives at esp+16*n.
880 &lea ("ebx",&DWP(128,"esp")); # size optimization
882 &vpshufd ("xmm0","xmm3",0x00);
883 &vpshufd ("xmm1","xmm3",0x55);
884 &vpshufd ("xmm2","xmm3",0xaa);
885 &vpshufd ("xmm3","xmm3",0xff);
886 &vpshufd ("xmm4","xmm7",0x00);
887 &vpshufd ("xmm5","xmm7",0x55);
888 &vpshufd ("xmm6","xmm7",0xaa);
889 &vpshufd ("xmm7","xmm7",0xff);
# Rows 8..11: second half of the key; rows 0..3: sigma.
890 &vmovdqa (&QWP(16*8-128,"ebp"),"xmm0");
891 &vmovdqa (&QWP(16*9-128,"ebp"),"xmm1");
892 &vmovdqa (&QWP(16*10-128,"ebp"),"xmm2");
893 &vmovdqa (&QWP(16*11-128,"ebp"),"xmm3");
894 &vmovdqa (&QWP(16*0-128,"ebp"),"xmm4");
895 &vmovdqa (&QWP(16*1-128,"ebp"),"xmm5");
896 &vmovdqa (&QWP(16*2-128,"ebp"),"xmm6");
897 &vmovdqa (&QWP(16*3-128,"ebp"),"xmm7");
# Both data pointers are biased by +128; the 4x section compensates with
# -128 displacements.
899 &lea ($inp,&DWP(128,$inp)); # size optimization
900 &lea ($out,&DWP(128,$out)); # size optimization
901 &jmp (&label("outer_loop"));
# 4x section: copy the broadcast input state (ebp) to the working state
# (ebx). Rows 0, 4, 8, 9 are not copied through memory (commented-out lines);
# they are loaded straight into the quarter-round registers below.
903 &set_label("outer_loop",32);
904 #&vmovdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material
905 &vmovdqa ("xmm1",&QWP(16*1-128,"ebp"));
906 &vmovdqa ("xmm2",&QWP(16*2-128,"ebp"));
907 &vmovdqa ("xmm3",&QWP(16*3-128,"ebp"));
908 #&vmovdqa ("xmm4",&QWP(16*4-128,"ebp"));
909 &vmovdqa ("xmm5",&QWP(16*5-128,"ebp"));
910 &vmovdqa ("xmm6",&QWP(16*6-128,"ebp"));
911 &vmovdqa ("xmm7",&QWP(16*7-128,"ebp"));
912 #&vmovdqa (&QWP(16*0-128,"ebx"),"xmm0");
913 &vmovdqa (&QWP(16*1-128,"ebx"),"xmm1");
914 &vmovdqa (&QWP(16*2-128,"ebx"),"xmm2");
915 &vmovdqa (&QWP(16*3-128,"ebx"),"xmm3");
916 #&vmovdqa (&QWP(16*4-128,"ebx"),"xmm4");
917 &vmovdqa (&QWP(16*5-128,"ebx"),"xmm5");
918 &vmovdqa (&QWP(16*6-128,"ebx"),"xmm6");
919 &vmovdqa (&QWP(16*7-128,"ebx"),"xmm7");
920 #&vmovdqa ("xmm0",&QWP(16*8-128,"ebp"));
921 #&vmovdqa ("xmm1",&QWP(16*9-128,"ebp"));
922 &vmovdqa ("xmm2",&QWP(16*10-128,"ebp"));
923 &vmovdqa ("xmm3",&QWP(16*11-128,"ebp"));
924 &vmovdqa ("xmm4",&QWP(16*12-128,"ebp"));
925 &vmovdqa ("xmm5",&QWP(16*13-128,"ebp"));
926 &vmovdqa ("xmm6",&QWP(16*14-128,"ebp"));
927 &vmovdqa ("xmm7",&QWP(16*15-128,"ebp"));
# Advance the counter row by table entry 16*4 and store the result in both
# the working copy and the saved input state.
928 &vpaddd ("xmm4","xmm4",&QWP(16*4,"eax")); # counter value
929 #&vmovdqa (&QWP(16*8-128,"ebx"),"xmm0");
930 #&vmovdqa (&QWP(16*9-128,"ebx"),"xmm1");
931 &vmovdqa (&QWP(16*10-128,"ebx"),"xmm2");
932 &vmovdqa (&QWP(16*11-128,"ebx"),"xmm3");
933 &vmovdqa (&QWP(16*12-128,"ebx"),"xmm4");
934 &vmovdqa (&QWP(16*13-128,"ebx"),"xmm5");
935 &vmovdqa (&QWP(16*14-128,"ebx"),"xmm6");
936 &vmovdqa (&QWP(16*15-128,"ebx"),"xmm7");
937 &vmovdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value
# Operands of the first quarter-round (rows 0, 12, 4, 8) plus row 9.
939 &vmovdqa ($xa, &QWP(16*0-128,"ebp"));
940 &vmovdqa ($xd, "xmm4");
941 &vmovdqa ($xb_,&QWP(16*4-128,"ebp"));
942 &vmovdqa ($xc, &QWP(16*8-128,"ebp"));
943 &vmovdqa ($xc_,&QWP(16*9-128,"ebp"));
# Ten iterations of eight quarter-rounds each.
945 &mov ("edx",10); # loop counter
948 &set_label("loop",32);
949 &vpaddd ($xa,$xa,$xb_); # elsewhere
950 &vpxor ($xd,$xd,$xa); # elsewhere
951 &QUARTERROUND_XOP(0, 4, 8, 12, 0);
952 &QUARTERROUND_XOP(1, 5, 9, 13, 1);
953 &QUARTERROUND_XOP(2, 6,10, 14, 2);
954 &QUARTERROUND_XOP(3, 7,11, 15, 3);
955 &QUARTERROUND_XOP(0, 5,10, 15, 4);
956 &QUARTERROUND_XOP(1, 6,11, 12, 5);
957 &QUARTERROUND_XOP(2, 7, 8, 13, 6);
958 &QUARTERROUND_XOP(3, 4, 9, 14, 7);
# NOTE(review): the instruction that updates edx and sets the flags for this
# branch is not visible in this excerpt.
960 &jnz (&label("loop"));
# Flush the rows still held in registers back to the working state.
962 &vmovdqa (&QWP(16*4-128,"ebx"),$xb_);
963 &vmovdqa (&QWP(16*8-128,"ebx"),$xc);
964 &vmovdqa (&QWP(16*9-128,"ebx"),$xc_);
965 &vmovdqa (&QWP(16*12-128,"ebx"),$xd);
966 &vmovdqa (&QWP(16*14-128,"ebx"),$xd_);
# New names for the same eight registers, used by the output stage.
968 my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
970 #&vmovdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there
971 &vmovdqa ($xa1,&QWP(16*1-128,"ebx"));
972 &vmovdqa ($xa2,&QWP(16*2-128,"ebx"));
973 &vmovdqa ($xa3,&QWP(16*3-128,"ebx"));
# Perl-level loop: the sequence below is emitted four times, for
# $i = 0, 64, 128, 192, i.e. once per group of four state rows.
# NOTE(review): the closing brace of this loop is not visible in this
# excerpt.
975 for($i=0;$i<256;$i+=64) {
976 &vpaddd ($xa0,$xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material
977 &vpaddd ($xa1,$xa1,&QWP($i+16*1-128,"ebp"));
978 &vpaddd ($xa2,$xa2,&QWP($i+16*2-128,"ebp"));
979 &vpaddd ($xa3,$xa3,&QWP($i+16*3-128,"ebp"));
981 &vpunpckldq ($xt2,$xa0,$xa1); # "de-interlace" data
982 &vpunpckldq ($xt3,$xa2,$xa3);
983 &vpunpckhdq ($xa0,$xa0,$xa1);
984 &vpunpckhdq ($xa2,$xa2,$xa3);
985 &vpunpcklqdq ($xa1,$xt2,$xt3); # "a0"
986 &vpunpckhqdq ($xt2,$xt2,$xt3); # "a1"
987 &vpunpcklqdq ($xt3,$xa0,$xa2); # "a2"
988 &vpunpckhqdq ($xa3,$xa0,$xa2); # "a3"
# Xor with input; the four memory operands are 64 bytes apart, i.e. the same
# 16-byte slice of four consecutive 64-byte blocks.
990 &vpxor ($xt0,$xa1,&QWP(64*0-128,$inp));
991 &vpxor ($xt1,$xt2,&QWP(64*1-128,$inp));
992 &vpxor ($xt2,$xt3,&QWP(64*2-128,$inp));
993 &vpxor ($xt3,$xa3,&QWP(64*3-128,$inp));
# Step to the next 16-byte slice; on the last pass add 64*4-16*3 instead so
# that the pointer has moved by 64*4 in total.
994 &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp));
# Preload the next group of four rows (not needed after the last pass).
995 &vmovdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192);
996 &vmovdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192);
997 &vmovdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192);
998 &vmovdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192);
999 &vmovdqu (&QWP(64*0-128,$out),$xt0); # store output
1000 &vmovdqu (&QWP(64*1-128,$out),$xt1);
1001 &vmovdqu (&QWP(64*2-128,$out),$xt2);
1002 &vmovdqu (&QWP(64*3-128,$out),$xt3);
1003 &lea ($out,&QWP($i<192?16:(64*4-16*3),$out));
# NOTE(review): the instructions that adjust len and set the flags for the
# next two branches are not visible in this excerpt.
1006 &jnc (&label("outer_loop"));
1009 &jz (&label("done"));
# 1x section set-up: recover the key and counter pointers and undo the +128
# bias on the data pointers.
1011 &mov ("ebx",&DWP(512+8,"esp")); # restore pointers
1012 &lea ($inp,&DWP(-128,$inp));
1013 &mov ("edx",&DWP(512+4,"esp"));
1014 &lea ($out,&DWP(-128,$out));
# Build the counter/nonce vector for the 1x section: take the low dword of
# the saved counter row, add table entry 16*6, clear the first dword of the
# caller's counter/nonce with the mask at 16*7 and merge the two.
1016 &vmovd ("xmm2",&DWP(16*12-128,"ebp")); # counter value
1017 &vmovdqu ("xmm3",&QWP(0,"ebx"));
1018 &vpaddd ("xmm2","xmm2",&QWP(16*6,"eax"));# +four
1019 &vpand ("xmm3","xmm3",&QWP(16*7,"eax"));
1020 &vpor ("xmm3","xmm3","xmm2"); # counter value
# Register names for the 1x section.
# NOTE(review): the round code of the 1x section is not visible in this
# excerpt.
1022 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
1043 &vmovdqa ($a,&QWP(16*2,"eax")); # sigma
1044 &vmovdqu ($b,&QWP(0,"edx"));
1045 &vmovdqu ($c,&QWP(16,"edx"));
1046 #&vmovdqu ($d,&QWP(0,"ebx")); # already loaded
1047 &vmovdqa ($rot16,&QWP(0,"eax"));
1048 &vmovdqa ($rot24,&QWP(16,"eax"));
# NOTE(review): in the visible lines this dword is overwritten by the 16-byte
# store of $d to the same address a few lines below, before anything reads it.
1049 &mov (&DWP(16*3,"esp"),"ebp");
# Keep the input state for this block in the four 16-byte slots at esp.
1051 &vmovdqa (&QWP(16*0,"esp"),$a);
1052 &vmovdqa (&QWP(16*1,"esp"),$b);
1053 &vmovdqa (&QWP(16*2,"esp"),$c);
1054 &vmovdqa (&QWP(16*3,"esp"),$d);
1056 &jmp (&label("loop1x"));
# Subsequent blocks: reload the input state and add table entry 16*5 to the
# counter row, storing the new value back.
1058 &set_label("outer1x",16);
1059 &vmovdqa ($d,&QWP(16*5,"eax")); # one
1060 &vmovdqa ($a,&QWP(16*0,"esp"));
1061 &vmovdqa ($b,&QWP(16*1,"esp"));
1062 &vmovdqa ($c,&QWP(16*2,"esp"));
1063 &vpaddd ($d,$d,&QWP(16*3,"esp"));
1065 &vmovdqa (&QWP(16*3,"esp"),$d);
1066 &jmp (&label("loop1x"));
1068 &set_label("loop1x",16);
# Lane rotations: c by two positions (swap of the 64-bit halves), b and d by
# one position in opposite directions.
1070 &vpshufd ($c,$c,0b01001110);
1071 &vpshufd ($b,$b,0b00111001);
1072 &vpshufd ($d,$d,0b10010011);
# Inverse lane rotations (the b and d immediates are exchanged).
1075 &vpshufd ($c,$c,0b01001110);
1076 &vpshufd ($b,$b,0b10010011);
1077 &vpshufd ($d,$d,0b00111001);
# NOTE(review): the instruction setting the flags for this branch is not
# visible in this excerpt.
1080 &jnz (&label("loop1x"));
# Add the saved input state to the result of the rounds.
1082 &vpaddd ($a,$a,&QWP(16*0,"esp"));
1083 &vpaddd ($b,$b,&QWP(16*1,"esp"));
1084 &vpaddd ($c,$c,&QWP(16*2,"esp"));
1085 &vpaddd ($d,$d,&QWP(16*3,"esp"));
# NOTE(review): the compare feeding this branch is not visible in this
# excerpt.
1088 &jb (&label("tail"));
1090 &vpxor ($a,$a,&QWP(16*0,$inp)); # xor with input
1091 &vpxor ($b,$b,&QWP(16*1,$inp));
1092 &vpxor ($c,$c,&QWP(16*2,$inp));
1093 &vpxor ($d,$d,&QWP(16*3,$inp));
1094 &lea ($inp,&DWP(16*4,$inp)); # inp+=64
1096 &vmovdqu (&QWP(16*0,$out),$a); # write output
1097 &vmovdqu (&QWP(16*1,$out),$b);
1098 &vmovdqu (&QWP(16*2,$out),$c);
1099 &vmovdqu (&QWP(16*3,$out),$d);
1100 &lea ($out,&DWP(16*4,$out)); # out+=64
1103 &jnz (&label("outer1x"));
1105 &jmp (&label("done"));
# NOTE(review): the label that starts this section is not visible in this
# excerpt; presumably the "tail" target. The finished block is stored to the
# four slots at esp so it can be consumed byte by byte.
1108 &vmovdqa (&QWP(16*0,"esp"),$a);
1109 &vmovdqa (&QWP(16*1,"esp"),$b);
1110 &vmovdqa (&QWP(16*2,"esp"),$c);
1111 &vmovdqa (&QWP(16*3,"esp"),$d);
# Byte-wise loop with ebp as the running index: "al" receives the stored
# block byte, "dl" the input byte at the same index.
# NOTE(review): the initialisation of ebp, the instruction combining al with
# dl and the instruction setting the flags for jnz are not visible in this
# excerpt.
1117 &set_label("tail_loop");
1118 &movb ("al",&BP(0,"esp","ebp"));
1119 &movb ("dl",&BP(0,$inp,"ebp"));
1120 &lea ("ebp",&DWP(1,"ebp"));
# The index was already advanced, hence the -1 displacement.
1122 &movb (&BP(-1,$out,"ebp"),"al");
1124 &jnz (&label("tail_loop"));
# Reload the stack pointer from the slot written near the function entry.
1128 &mov ("esp",&DWP(512,"esp"));
1129 &function_end("ChaCha20_xop");