bf6b312cd1b824cde1ac60906ff915248940554a
[openssl.git] / engines / asm / e_padlock-x86.pl
1 #! /usr/bin/env perl
2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # September 2011
18 #
19 # Assembler helpers for Padlock engine. Compared to original engine
20 # version relying on inline assembler and compiled with gcc 3.4.6 it
21 # was measured to provide ~100% improvement on misaligned data in ECB
22 # mode and ~75% in CBC mode. For aligned data improvement can be
23 # observed for short inputs only, e.g. 45% for 64-byte messages in
24 # ECB mode, 20% in CBC. Difference in performance for aligned vs.
25 # misaligned data depends on misalignment and is either ~1.8x or 2.9x.
26 # These are approximately same factors as for hardware support, so
27 # there is little reason to rely on the latter. On the contrary, it
28 # might actually hurt performance in mixture of aligned and misaligned
29 # buffers, because a) if you choose to flip 'align' flag in control
30 # word on per-buffer basis, then you'd have to reload key context,
31 # which incurs penalty; b) if you choose to set 'align' flag
32 # permanently, it limits performance even for aligned data to ~1/2.
33 # All above mentioned results were collected on 1.5GHz C7. Nano on the
34 # other hand handles unaligned data more gracefully. Depending on
35 # algorithm and how unaligned data is, hardware can be up to 70% more
36 # efficient than below software alignment procedures, nor does 'align'
37 # flag have effect on aligned performance [if it has any meaning at all].
38 # Therefore suggestion is to unconditionally set 'align' flag on Nano
39 # for optimal performance.
40
# Locate the perlasm helpers relative to this script and load the x86
# assembler front end.
41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42 push(@INC,"${dir}","${dir}../../crypto/perlasm");
43 require "x86asm.pl";
44
# The last command-line argument names the output file. Use the
# three-argument form of open so the name is never interpreted as a
# mode, and fail loudly instead of silently emitting to the inherited
# STDOUT. With no (or an empty) argument STDOUT is left untouched, as
# before.
45 $output=pop;
46 open(STDOUT,">",$output) or die "can't open $output: $!" if (defined($output) && $output ne "");
47
48 &asm_init($ARGV[0],$0);
49
# Per-mode byte counts used by generate_mode in its page-boundary
# checks, and the maximum number of bytes processed per loop iteration.
50 %PADLOCK_PREFETCH=(ecb=>128, cbc=>64);  # prefetch errata
51 $PADLOCK_CHUNK=512;     # Must be a power of 2 larger than 16
52
# Register aliases shared by the AES routines below.
53 $ctx="edx";
54 $out="edi";
55 $inp="esi";
56 $len="ecx";
57 $chunk="ebx";
58
# padlock_capability: takes no arguments, result in eax.
#
# Returns 0 when any of the following holds:
#   - bit 21 of EFLAGS cannot be toggled (cpuid is not usable);
#   - cpuid leaf 0 does not report the vendor string "CentaurHauls";
#   - cpuid leaf 0xC0000000 reports a maximum leaf below 0xC0000001.
# Otherwise returns edx of cpuid leaf 0xC0000001 with bit#4 replaced by
# the result of the Nano check performed on cpuid leaf 1.
59 &function_begin_B("padlock_capability");
60         &push   ("ebx");
61         &pushf  ();
62         &pop    ("eax");
63         &mov    ("ecx","eax");
64         &xor    ("eax",1<<21);
65         &push   ("eax");
66         &popf   ();
67         &pushf  ();
68         &pop    ("eax");
69         &xor    ("ecx","eax");
70         &xor    ("eax","eax");
71         &bt     ("ecx",21);
72         &jnc    (&label("noluck"));
73         &cpuid  ();
74         &xor    ("eax","eax");
           # the literals are spelled backwards because unpack("H*")
           # yields the dword as it reads in little-endian memory
75         &cmp    ("ebx","0x".unpack("H*",'tneC'));
76         &jne    (&label("noluck"));
77         &cmp    ("edx","0x".unpack("H*",'Hrua'));
78         &jne    (&label("noluck"));
79         &cmp    ("ecx","0x".unpack("H*",'slua'));
80         &jne    (&label("noluck"));
81         &mov    ("eax",0xC0000000);
82         &cpuid  ();
83         &mov    ("edx","eax");
84         &xor    ("eax","eax");
85         &cmp    ("edx",0xC0000001);
86         &jb     (&label("noluck"));
87         &mov    ("eax",1);
88         &cpuid  ();
89         &or     ("eax",0x0f);
90         &xor    ("ebx","ebx");
91         &and    ("eax",0x0fff);
92         &cmp    ("eax",0x06ff);         # check for Nano
93         &sete   ("bl");
94         &mov    ("eax",0xC0000001);
95         &push   ("ebx");
96         &cpuid  ();
97         &pop    ("ebx");
98         &mov    ("eax","edx");
99         &shl    ("ebx",4);              # bit#4 denotes Nano
100         &and    ("eax",0xffffffef);
            # terminate the statement: without the semicolon the next
            # call was parsed as the right operand of a binary '&'
101         &or     ("eax","ebx");
102 &set_label("noluck");
103         &pop    ("ebx");
104         &ret    ();
105 &function_end_B("padlock_capability");
106
# padlock_key_bswap: one pointer argument.
#
# Byte-swaps, in place, consecutive 32-bit words starting at the pointer
# passed as the first argument. The number of words is read from offset
# 240 of that same structure.
# NOTE(review): the loop decrements before testing, so a count of 0 at
# offset 240 would not terminate after zero iterations - presumably
# callers never pass that; confirm against the C caller.
107 &function_begin_B("padlock_key_bswap");
108         &mov    ("edx",&wparam(0));
109         &mov    ("ecx",&DWP(240,"edx"));
110 &set_label("bswap_loop");
111         &mov    ("eax",&DWP(0,"edx"));
112         &bswap  ("eax");
113         &mov    (&DWP(0,"edx"),"eax");
114         &lea    ("edx",&DWP(4,"edx"));  # advance without touching flags
115         &sub    ("ecx",1);
116         &jnz    (&label("bswap_loop"));
117         &ret    ();
118 &function_end_B("padlock_key_bswap");
119
120 # This is heuristic key context tracing. At first one
121 # believes that one should use atomic swap instructions,
122 # but it's not actually necessary. Point is that if
123 # padlock_saved_context was changed by another thread
124 # after we've read it and before we compare it with ctx,
125 # our key *shall* be reloaded upon thread context switch
126 # and we are therefore set in either case...
127 &static_label("padlock_saved_context");
128
# padlock_verify_context: one pointer argument (the cipher context).
#
# Public wrapper around _padlock_verify_ctx. It sets up what the helper
# expects: the context pointer in $ctx, a reference to
# padlock_saved_context in eax and a copy of eflags on the stack.
# On win32/coff eax holds the address of padlock_saved_context itself;
# elsewhere it holds the distance from verify_pic_point, which the
# helper turns into an address by adding its own return address.
129 &function_begin_B("padlock_verify_context");
130         &mov    ($ctx,&wparam(0));
131         &lea    ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
132                        &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
133         &pushf  ();
134         &call   ("_padlock_verify_ctx");
135 &set_label("verify_pic_point");
            # discard the eflags copy pushed above without reloading eflags
136         &lea    ("esp",&DWP(4,"esp"));
137         &ret    ();
138 &function_end_B("padlock_verify_context");
139
# _padlock_verify_ctx: internal helper with a register-based interface.
#
# On entry:
#   $ctx       context pointer to verify;
#   eax        address of padlock_saved_context (win32/coff) or its
#              offset from the caller's return address (everything else);
#   4(%esp)    eflags image pushed by the caller just before the call.
# If bit 30 of that eflags image is set and $ctx differs from the saved
# context, the pushf/popf pair is executed - the same sequence that
# padlock_reload_key consists of. In every case $ctx is then recorded in
# padlock_saved_context.
# NOTE(review): bit 30 is presumably the PadLock indicator that a key is
# currently loaded - confirm against the PadLock programming guide.
140 &function_begin_B("_padlock_verify_ctx");
141         &add    ("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));# &padlock_saved_context
142         &bt     (&DWP(4,"esp"),30);             # eflags
143         &jnc    (&label("verified"));
144         &cmp    ($ctx,&DWP(0,"eax"));
145         &je     (&label("verified"));
146         &pushf  ();
147         &popf   ();
148 &set_label("verified");
149         &mov    (&DWP(0,"eax"),$ctx);
150         &ret    ();
151 &function_end_B("_padlock_verify_ctx");
152
# padlock_reload_key: no arguments, no result.
#
# Consists solely of a pushf/popf pair, i.e. eflags is rewritten with
# its own value. This is the same sequence _padlock_verify_ctx executes
# when it finds that a different context was used last.
153 &function_begin_B("padlock_reload_key");
154         &pushf  ();
155         &popf   ();
156         &ret    ();
157 &function_end_B("padlock_reload_key");
158
# padlock_aes_block: three pointer arguments (out, inp, ctx).
#
# Processes exactly one block ($len is set to 1) with "rep xcryptecb".
# The control word is taken from offset 16 of ctx and the key from
# offset 32. Nothing is loaded into eax, so no result is returned.
159 &function_begin_B("padlock_aes_block");
160         &push   ("edi");
161         &push   ("esi");
162         &push   ("ebx");
163         &mov    ($out,&wparam(0));              # must be 16-byte aligned
164         &mov    ($inp,&wparam(1));              # must be 16-byte aligned
165         &mov    ($ctx,&wparam(2));
166         &mov    ($len,1);
167         &lea    ("ebx",&DWP(32,$ctx));          # key
168         &lea    ($ctx,&DWP(16,$ctx));           # control word
169         &data_byte(0xf3,0x0f,0xa7,0xc8);        # rep xcryptecb
170         &pop    ("ebx");
171         &pop    ("esi");
172         &pop    ("edi");
173         &ret    ();
174 &function_end_B("padlock_aes_block");
175
# generate_mode($mode,$opcode) emits the function padlock_${mode}_encrypt.
#
#   $mode    name of the mode; it selects the function and label names and
#            the mode-specific paths below ("ctr32", "ecb", and the extra
#            page-boundary handling for modes listed in %PADLOCK_PREFETCH);
#   $opcode  last byte of the four-byte "rep xcrypt*" instruction.
#
# The generated function returns 1 in eax on success. When ctx is not
# 16-byte aligned or len is not a multiple of 16 it leaves through the
# ${mode}_abort label without processing anything and returns 0.
#
# Stack frame used on the unaligned path (relative to the aligned ebp):
#   0 out, 4 inp, 8 len, 12 chunk, 16 original frame pointer value;
# the bounce buffer, when one is needed, lies between esp and ebp.
176 sub generate_mode {
177 my ($mode,$opcode) = @_;
178 # int padlock_$mode_encrypt(void *out, const void *inp,
179 #               struct padlock_cipher_data *ctx, size_t len);
180 &function_begin("padlock_${mode}_encrypt");
181         &mov    ($out,&wparam(0));
182         &mov    ($inp,&wparam(1));
183         &mov    ($ctx,&wparam(2));
184         &mov    ($len,&wparam(3));
            # nothing else writes eax on the abort path, so give the
            # function a defined result (0) there instead of whatever
            # the caller happened to leave in the register
            &xor    ("eax","eax");
185         &test   ($ctx,15);
186         &jnz    (&label("${mode}_abort"));
187         &test   ($len,15);
188         &jnz    (&label("${mode}_abort"));
189         &lea    ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
190                        &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
191         &pushf  ();
192         &cld    ();
193         &call   ("_padlock_verify_ctx");
194 &set_label("${mode}_pic_point");
195         &lea    ($ctx,&DWP(16,$ctx));   # control word
196         &xor    ("eax","eax");
197                                         if ($mode eq "ctr32") {
198         &movq   ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
199                                         } else {
200         &xor    ("ebx","ebx");
201         &test   (&DWP(0,$ctx),1<<5);    # align bit in control word
202         &jnz    (&label("${mode}_aligned"));
203         &test   ($out,0x0f);
204         &setz   ("al");                 # !out_misaligned
205         &test   ($inp,0x0f);
206         &setz   ("bl");                 # !inp_misaligned
207         &test   ("eax","ebx");
208         &jnz    (&label("${mode}_aligned"));
209         &neg    ("eax");
210                                         }
211         &mov    ($chunk,$PADLOCK_CHUNK);
212         &not    ("eax");                # out_misaligned?-1:0
213         &lea    ("ebp",&DWP(-24,"esp"));
214         &cmp    ($len,$chunk);
215         &cmovc  ($chunk,$len);          # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
216         &and    ("eax",$chunk);         # out_misaligned?chunk:0
217         &mov    ($chunk,$len);
218         &neg    ("eax");
219         &and    ($chunk,$PADLOCK_CHUNK-1);      # chunk=len%PADLOCK_CHUNK
220         &lea    ("esp",&DWP(0,"eax","ebp"));    # alloca
221         &mov    ("eax",$PADLOCK_CHUNK);
222         &cmovz  ($chunk,"eax");                 # chunk=chunk?:PADLOCK_CHUNK
223         &mov    ("eax","ebp");
224         &and    ("ebp",-16);
225         &and    ("esp",-16);
226         &mov    (&DWP(16,"ebp"),"eax");
227     if ($PADLOCK_PREFETCH{$mode}) {
228         &cmp    ($len,$chunk);
229         &ja     (&label("${mode}_loop"));
230         &mov    ("eax",$inp);           # check if prefetch crosses page
231         &cmp    ("ebp","esp");
232         &cmove  ("eax",$out);
233         &add    ("eax",$len);
234         &neg    ("eax");
235         &and    ("eax",0xfff);          # distance to page boundary
236         &cmp    ("eax",$PADLOCK_PREFETCH{$mode});
237         &mov    ("eax",-$PADLOCK_PREFETCH{$mode});
238         &cmovae ("eax",$chunk);         # mask=distance<prefetch?-prefetch:-1
239         &and    ($chunk,"eax");
240         &jz     (&label("${mode}_unaligned_tail"));
241     }
242         &jmp    (&label("${mode}_loop"));
243
244 &set_label("${mode}_loop",16);
245         &mov    (&DWP(0,"ebp"),$out);           # save parameters
246         &mov    (&DWP(4,"ebp"),$inp);
247         &mov    (&DWP(8,"ebp"),$len);
248         &mov    ($len,$chunk);
249         &mov    (&DWP(12,"ebp"),$chunk);        # chunk
250                                                 if ($mode eq "ctr32") {
251         &mov    ("ecx",&DWP(-4,$ctx));
252         &xor    ($out,$out);
253         &mov    ("eax",&DWP(-8,$ctx));          # borrow $len
254 &set_label("${mode}_prepare");
255         &mov    (&DWP(12,"esp",$out),"ecx");
256         &bswap  ("ecx");
257         &movq   (&QWP(0,"esp",$out),"mm0");
258         &inc    ("ecx");
259         &mov    (&DWP(8,"esp",$out),"eax");
260         &bswap  ("ecx");
261         &lea    ($out,&DWP(16,$out));
262         &cmp    ($out,$chunk);
263         &jb     (&label("${mode}_prepare"));
264
265         &mov    (&DWP(-4,$ctx),"ecx");
266         &lea    ($inp,&DWP(0,"esp"));
267         &lea    ($out,&DWP(0,"esp"));
268         &mov    ($len,$chunk);
269                                                 } else {
270         &test   ($out,0x0f);                    # out_misaligned
271         &cmovnz ($out,"esp");
272         &test   ($inp,0x0f);                    # inp_misaligned
273         &jz     (&label("${mode}_inp_aligned"));
274         &shr    ($len,2);
275         &data_byte(0xf3,0xa5);                  # rep movsl
276         &sub    ($out,$chunk);
277         &mov    ($len,$chunk);
278         &mov    ($inp,$out);
279 &set_label("${mode}_inp_aligned");
280                                                 }
281         &lea    ("eax",&DWP(-16,$ctx));         # ivp
282         &lea    ("ebx",&DWP(16,$ctx));          # key
283         &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
284         &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
285                                                 if ($mode !~ /ecb|ctr/) {
286         &movaps ("xmm0",&QWP(0,"eax"));
287         &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
288                                                 }
289         &mov    ($out,&DWP(0,"ebp"));           # restore parameters
290         &mov    ($chunk,&DWP(12,"ebp"));
291                                                 if ($mode eq "ctr32") {
292         &mov    ($inp,&DWP(4,"ebp"));
293         &xor    ($len,$len);
294 &set_label("${mode}_xor");
295         &movups ("xmm1",&QWP(0,$inp,$len));
296         &lea    ($len,&DWP(16,$len));
297         &pxor   ("xmm1",&QWP(-16,"esp",$len));
298         &movups (&QWP(-16,$out,$len),"xmm1");
299         &cmp    ($len,$chunk);
300         &jb     (&label("${mode}_xor"));
301                                                 } else {
302         &test   ($out,0x0f);
303         &jz     (&label("${mode}_out_aligned"));
304         &mov    ($len,$chunk);
305         &lea    ($inp,&DWP(0,"esp"));
306         &shr    ($len,2);
307         &data_byte(0xf3,0xa5);                  # rep movsl
308         &sub    ($out,$chunk);
309 &set_label("${mode}_out_aligned");
310         &mov    ($inp,&DWP(4,"ebp"));
311                                                 }
312         &mov    ($len,&DWP(8,"ebp"));
313         &add    ($out,$chunk);
314         &add    ($inp,$chunk);
315         &sub    ($len,$chunk);
316         &mov    ($chunk,$PADLOCK_CHUNK);
317     if (!$PADLOCK_PREFETCH{$mode}) {
318         &jnz    (&label("${mode}_loop"));
319     } else {
320         &jz     (&label("${mode}_break"));
321         &cmp    ($len,$chunk);
322         &jae    (&label("${mode}_loop"));
323
324 &set_label("${mode}_unaligned_tail");
325         &xor    ("eax","eax");
326         &cmp    ("esp","ebp");
327         &cmove  ("eax",$len);
328         &sub    ("esp","eax");                  # alloca
329         &mov    ("eax", $out);                  # save parameters
330         &mov    ($chunk,$len);
331         &shr    ($len,2);
332         &lea    ($out,&DWP(0,"esp"));
333         &data_byte(0xf3,0xa5);                  # rep movsl
334         &mov    ($inp,"esp");
335         &mov    ($out,"eax");                   # restore parameters
336         &mov    ($len,$chunk);
337         &jmp    (&label("${mode}_loop"));
338
339 &set_label("${mode}_break",16);
340     }
341                                                 if ($mode ne "ctr32") {
342         &cmp    ("esp","ebp");
343         &je     (&label("${mode}_done"));
344                                                 }
            # wipe the bounce buffer between esp and ebp before leaving
345         &pxor   ("xmm0","xmm0");
346         &lea    ("eax",&DWP(0,"esp"));
347 &set_label("${mode}_bzero");
348         &movaps (&QWP(0,"eax"),"xmm0");
349         &lea    ("eax",&DWP(16,"eax"));
350         &cmp    ("ebp","eax");
351         &ja     (&label("${mode}_bzero"));
352
353 &set_label("${mode}_done");
354         &mov    ("ebp",&DWP(16,"ebp"));
355         &lea    ("esp",&DWP(24,"ebp"));
356                                                 if ($mode ne "ctr32") {
357         &jmp    (&label("${mode}_exit"));
358
359 &set_label("${mode}_aligned",16);
360     if ($PADLOCK_PREFETCH{$mode}) {
361         &lea    ("ebp",&DWP(0,$inp,$len));
362         &neg    ("ebp");
363         &and    ("ebp",0xfff);                  # distance to page boundary
364         &xor    ("eax","eax");
365         &cmp    ("ebp",$PADLOCK_PREFETCH{$mode});
366         &mov    ("ebp",$PADLOCK_PREFETCH{$mode}-1);
367         &cmovae ("ebp","eax");
368         &and    ("ebp",$len);                   # remainder
369         &sub    ($len,"ebp");
370         &jz     (&label("${mode}_aligned_tail"));
371     }
372         &lea    ("eax",&DWP(-16,$ctx));         # ivp
373         &lea    ("ebx",&DWP(16,$ctx));          # key
374         &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
375         &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
376                                                 if ($mode ne "ecb") {
377         &movaps ("xmm0",&QWP(0,"eax"));
378         &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
379                                                 }
380     if ($PADLOCK_PREFETCH{$mode}) {
381         &test   ("ebp","ebp");
382         &jz     (&label("${mode}_exit"));
383
384 &set_label("${mode}_aligned_tail");
385         &mov    ($len,"ebp");
386         &lea    ("ebp",&DWP(-24,"esp"));
387         &mov    ("esp","ebp");
388         &mov    ("eax","ebp");
389         &sub    ("esp",$len);
390         &and    ("ebp",-16);
391         &and    ("esp",-16);
392         &mov    (&DWP(16,"ebp"),"eax");
393         &mov    ("eax", $out);                  # save parameters
394         &mov    ($chunk,$len);
395         &shr    ($len,2);
396         &lea    ($out,&DWP(0,"esp"));
397         &data_byte(0xf3,0xa5);                  # rep movsl
398         &mov    ($inp,"esp");
399         &mov    ($out,"eax");                   # restore parameters
400         &mov    ($len,$chunk);
401         &jmp    (&label("${mode}_loop"));
402     }
403 &set_label("${mode}_exit");                     }
404         &mov    ("eax",1);
405         &lea    ("esp",&DWP(4,"esp"));          # popf
406         &emms   ()                              if ($mode eq "ctr32");
407 &set_label("${mode}_abort");
408 &function_end("padlock_${mode}_encrypt");
409 }
410
# One padlock_<mode>_encrypt function is emitted per table entry, in
# table order. The number is the last byte of the "rep xcrypt*" opcode
# handed to generate_mode for that mode.
# ctr32 implements its own CTR with the ECB opcode, because hardware CTR
# was introduced later and even has errata on certain C7 stepping. The
# own implementation *always* works, though ~15% slower than dedicated
# hardware...
411 foreach my $spec (
412         ["ecb",  0xc8],
413         ["cbc",  0xd0],
414         ["cfb",  0xe0],
415         ["ofb",  0xe8],
416         ["ctr32",0xc8],         # yes, the ECB opcode, see above
417 ) {
418         &generate_mode(@$spec);
419 }
420
# padlock_xstore: two arguments.
#
# Loads the first argument into edi and the second into edx, then issues
# a single xstore instruction. eax is not touched by this code, so the
# result is whatever the instruction itself leaves there.
# NOTE(review): presumably edi is the destination buffer and edx the
# xstore control value - confirm against the C prototype.
421 &function_begin_B("padlock_xstore");
422         &push   ("edi");
423         &mov    ("edi",&wparam(0));
424         &mov    ("edx",&wparam(1));
425         &data_byte(0x0f,0xa7,0xc0);             # xstore
426         &pop    ("edi");
427         &ret    ();
428 &function_end_B("padlock_xstore");
429
# _win32_segv_handler: exception handler installed by the *_oneshot
# SHA routines on win32/coff.
#
# For an exception code of 0xC0000005 it adds 4 to the dword at offset
# 184 of the context record - the "rep xsha*" instructions emitted below
# are four bytes long - and returns 0 (ExceptionContinueExecution).
# For any other code it returns 1 (ExceptionContinueSearch).
# NOTE(review): offset 184 is presumably the saved instruction pointer
# of the win32 CONTEXT structure - confirm.
430 &function_begin_B("_win32_segv_handler");
431         &mov    ("eax",1);                      # ExceptionContinueSearch
432         &mov    ("edx",&wparam(0));             # *ExceptionRecord
433         &mov    ("ecx",&wparam(2));             # *ContextRecord
            # the statement terminator was missing here, which made the
            # following &jne the right operand of a binary '&'
434         &cmp    (&DWP(0,"edx"),0xC0000005);     # ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
435         &jne    (&label("ret"));
436         &add    (&DWP(184,"ecx"),4);            # skip over rep sha*
437         &mov    ("eax",0);                      # ExceptionContinueExecution
438 &set_label("ret");
439         &ret    ();
440 &function_end_B("_win32_segv_handler");
441 &safeseh("_win32_segv_handler")                 if ($::win32);
442
# padlock_sha1_oneshot: three arguments (context, input, count).
#
# Copies the first 20 bytes of the context to a 16-byte aligned scratch
# area on the stack, runs "rep xsha1" with eax=0, esi=input, ecx=count
# and edi=scratch, then copies the 20 bytes back to the context.
# On win32/coff an exception registration record pointing at
# _win32_segv_handler is installed around the instruction.
443 &function_begin_B("padlock_sha1_oneshot");
444         &push   ("edi");
445         &push   ("esi");
446         &xor    ("eax","eax");
447         &mov    ("edi",&wparam(0));
448         &mov    ("esi",&wparam(1));
449         &mov    ("ecx",&wparam(2));
450     if ($::win32 or $::coff) {
451         &push   (&::islabel("_win32_segv_handler"));
452         &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
453         &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
454     }
455         &mov    ("edx","esp");                  # put aside %esp
456         &add    ("esp",-128);                   # 32 is enough but spec says 128
457         &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
458         &and    ("esp",-16);
459         &mov    ("eax",&DWP(16,"edi"));
460         &movaps (&QWP(0,"esp"),"xmm0");
461         &mov    ("edi","esp");
462         &mov    (&DWP(16,"esp"),"eax");
463         &xor    ("eax","eax");
464         &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
465         &movaps ("xmm0",&QWP(0,"esp"));
466         &mov    ("eax",&DWP(16,"esp"));
467         &mov    ("esp","edx");                  # restore %esp
468     if ($::win32 or $::coff) {
469         &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
            # Discard the handler address with a real &pop, mirroring
            # the &push that put it there. wparam() offsets follow
            # &push/&pop (see their use after the pushes above), so
            # adjusting esp with lea left wparam(0) below pointing one
            # slot too far. esi is free here, it is restored further down.
470         &pop    ("esi");
471     }
472         &mov    ("edi",&wparam(0));
473         &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
474         &mov    (&DWP(16,"edi"),"eax");
475         &pop    ("esi");
476         &pop    ("edi");
477         &ret    ();
478 &function_end_B("padlock_sha1_oneshot");
479
# padlock_sha1_blocks: three arguments (context, input, count).
#
# Same data movement as padlock_sha1_oneshot - 20 bytes of context are
# copied to a 16-byte aligned scratch area, "rep xsha1" is run, and the
# 20 bytes are copied back - but eax is set to -1 before the instruction
# and no exception handler is installed.
480 &function_begin_B("padlock_sha1_blocks");
481         &push   ("edi");
482         &push   ("esi");
483         &mov    ("edi",&wparam(0));
484         &mov    ("esi",&wparam(1));
485         &mov    ("edx","esp");                  # put aside %esp
486         &mov    ("ecx",&wparam(2));
487         &add    ("esp",-128);
488         &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
489         &and    ("esp",-16);
490         &mov    ("eax",&DWP(16,"edi"));
491         &movaps (&QWP(0,"esp"),"xmm0");
492         &mov    ("edi","esp");
493         &mov    (&DWP(16,"esp"),"eax");
494         &mov    ("eax",-1);
495         &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
496         &movaps ("xmm0",&QWP(0,"esp"));
497         &mov    ("eax",&DWP(16,"esp"));
498         &mov    ("esp","edx");                  # restore %esp
499         &mov    ("edi",&wparam(0));
500         &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
501         &mov    (&DWP(16,"edi"),"eax");
502         &pop    ("esi");
503         &pop    ("edi");
504         &ret    ();
505 &function_end_B("padlock_sha1_blocks");
506
# padlock_sha256_oneshot: three arguments (context, input, count).
#
# Copies the first 32 bytes of the context to a 16-byte aligned scratch
# area on the stack, runs "rep xsha256" with eax=0, esi=input, ecx=count
# and edi=scratch, then copies the 32 bytes back to the context.
# On win32/coff an exception registration record pointing at
# _win32_segv_handler is installed around the instruction.
507 &function_begin_B("padlock_sha256_oneshot");
508         &push   ("edi");
509         &push   ("esi");
510         &xor    ("eax","eax");
511         &mov    ("edi",&wparam(0));
512         &mov    ("esi",&wparam(1));
513         &mov    ("ecx",&wparam(2));
514     if ($::win32 or $::coff) {
515         &push   (&::islabel("_win32_segv_handler"));
516         &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
517         &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
518     }
519         &mov    ("edx","esp");                  # put aside %esp
520         &add    ("esp",-128);
521         &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
522         &and    ("esp",-16);
523         &movups ("xmm1",&QWP(16,"edi"));
524         &movaps (&QWP(0,"esp"),"xmm0");
525         &mov    ("edi","esp");
526         &movaps (&QWP(16,"esp"),"xmm1");
527         &xor    ("eax","eax");
528         &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
529         &movaps ("xmm0",&QWP(0,"esp"));
530         &movaps ("xmm1",&QWP(16,"esp"));
531         &mov    ("esp","edx");                  # restore %esp
532     if ($::win32 or $::coff) {
533         &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
            # Discard the handler address with a real &pop, mirroring
            # the &push that put it there. wparam() offsets follow
            # &push/&pop (see their use after the pushes above), so
            # adjusting esp with lea left wparam(0) below pointing one
            # slot too far. esi is free here, it is restored further down.
534         &pop    ("esi");
535     }
536         &mov    ("edi",&wparam(0));
537         &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
538         &movups (&QWP(16,"edi"),"xmm1");
539         &pop    ("esi");
540         &pop    ("edi");
541         &ret    ();
542 &function_end_B("padlock_sha256_oneshot");
543
# padlock_sha256_blocks: three arguments (context, input, count).
#
# Same data movement as padlock_sha256_oneshot - 32 bytes of context are
# copied to a 16-byte aligned scratch area, "rep xsha256" is run, and
# the 32 bytes are copied back - but eax is set to -1 before the
# instruction and no exception handler is installed.
544 &function_begin_B("padlock_sha256_blocks");
545         &push   ("edi");
546         &push   ("esi");
547         &mov    ("edi",&wparam(0));
548         &mov    ("esi",&wparam(1));
549         &mov    ("ecx",&wparam(2));
550         &mov    ("edx","esp");                  # put aside %esp
551         &add    ("esp",-128);
552         &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
553         &and    ("esp",-16);
554         &movups ("xmm1",&QWP(16,"edi"));
555         &movaps (&QWP(0,"esp"),"xmm0");
556         &mov    ("edi","esp");
557         &movaps (&QWP(16,"esp"),"xmm1");
558         &mov    ("eax",-1);
559         &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
560         &movaps ("xmm0",&QWP(0,"esp"));
561         &movaps ("xmm1",&QWP(16,"esp"));
562         &mov    ("esp","edx");                  # restore %esp
563         &mov    ("edi",&wparam(0));
564         &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
565         &movups (&QWP(16,"edi"),"xmm1");
566         &pop    ("esi");
567         &pop    ("edi");
568         &ret    ();
569 &function_end_B("padlock_sha256_blocks");
570
# padlock_sha512_blocks: three arguments (context, input, count).
#
# Copies the first 64 bytes of the context to a 16-byte aligned scratch
# area on the stack, runs "rep xsha512" with esi=input, ecx=count and
# edi=scratch, then copies the 64 bytes back to the context.
# Unlike the SHA-1/SHA-256 routines, eax is not loaded before the
# instruction is issued.
571 &function_begin_B("padlock_sha512_blocks");
572         &push   ("edi");
573         &push   ("esi");
574         &mov    ("edi",&wparam(0));
575         &mov    ("esi",&wparam(1));
576         &mov    ("ecx",&wparam(2));
577         &mov    ("edx","esp");                  # put aside %esp
578         &add    ("esp",-128);
579         &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
580         &and    ("esp",-16);
581         &movups ("xmm1",&QWP(16,"edi"));
582         &movups ("xmm2",&QWP(32,"edi"));
583         &movups ("xmm3",&QWP(48,"edi"));
584         &movaps (&QWP(0,"esp"),"xmm0");
585         &mov    ("edi","esp");
586         &movaps (&QWP(16,"esp"),"xmm1");
587         &movaps (&QWP(32,"esp"),"xmm2");
588         &movaps (&QWP(48,"esp"),"xmm3");
589         &data_byte(0xf3,0x0f,0xa6,0xe0);        # rep xsha512
590         &movaps ("xmm0",&QWP(0,"esp"));
591         &movaps ("xmm1",&QWP(16,"esp"));
592         &movaps ("xmm2",&QWP(32,"esp"));
593         &movaps ("xmm3",&QWP(48,"esp"));
594         &mov    ("esp","edx");                  # restore %esp
595         &mov    ("edi",&wparam(0));
596         &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
597         &movups (&QWP(16,"edi"),"xmm1");
598         &movups (&QWP(32,"edi"),"xmm2");
599         &movups (&QWP(48,"edi"),"xmm3");
600         &pop    ("esi");
601         &pop    ("edi");
602         &ret    ();
603 &function_end_B("padlock_sha512_blocks");
604
605 &asciz  ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
606 &align  (16);
607
608 &dataseg();
609 # Essentially this variable belongs in thread local storage.
610 # Having this variable global on the other hand can only cause
611 # few bogus key reloads [if any at all on single-CPU system],
612 # so we accept the penalty...
613 &set_label("padlock_saved_context",4);
614 &data_word(0);
615
616 &asm_finish();
617
# Buffered write errors (full disk, broken pipe) only surface when the
# handle is closed, so a failure here must abort the build step rather
# than leave a truncated assembly file behind.
618 close STDOUT or die "error closing STDOUT: $!";