e_padlock-x86*.pl: Nano-related update.
[openssl.git] / engines / asm / e_padlock-x86.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # September 2011
11 #
12 # Assembler helpers for Padlock engine. Compared to original engine
13 # version relying on inline assembler and compiled with gcc 3.4.6 it
14 # was measured to provide ~100% improvement on misaligned data in ECB
15 # mode and ~75% in CBC mode. For aligned data improvement can be
16 # observed for short inputs only, e.g. 45% for 64-byte messages in
17 # ECB mode, 20% in CBC. Difference in performance for aligned vs.
18 # misaligned data depends on misalignment and is either ~1.8x or 2.9x.
19 # These are approximately same factors as for hardware support, so
20 # there is little reason to rely on the latter. On the contrary, it
21 # might actually hurt performance in mixture of aligned and misaligned
22 # buffers, because a) if you choose to flip 'align' flag in control
23 # word on per-buffer basis, then you'd have to reload key context,
24 # which incurs penalty; b) if you choose to set 'align' flag
25 # permanently, it limits performance even for aligned data to ~1/2.
26 # All above mentioned results were collected on 1.5GHz C7. Nano on the
27 # other hand handles unaligned data more gracefully. Depending on
28 # algorithm and how unaligned data is, hardware can be up to 70% more
29 # efficient than below software alignment procedures, nor does 'align'
30 # flag have any effect on aligned performance [if it has any meaning at all].
31 # Therefore suggestion is to unconditionally set 'align' flag on Nano
32 # for optimal performance.
33
34 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;       # $dir = this script's path up to and including the last / or \
35 push(@INC,"${dir}","${dir}../../crypto/perlasm");    # let require find x86asm.pl beside the script or in ../../crypto/perlasm
36 require "x86asm.pl";
37
38 &asm_init($ARGV[0],$0);      # hand the first command-line argument and the script name to x86asm.pl
39
40 $PADLOCK_CHUNK=512;     # Must be a power of 2 larger than 16
41
42 $ctx="edx";     # register aliases shared by the routines below
43 $out="edi";     # destination pointer (also the destination of rep movsl/stosl)
44 $inp="esi";     # source pointer (also the source of rep movsl)
45 $len="ecx";     # byte count, turned into a block/dword count right before each rep instruction
46 $chunk="ebx";   # bytes handled per loop pass; ebx is reloaded with the key pointer around xcrypt
47
48 &function_begin_B("padlock_capability");     # no arguments; returns 0 in eax unless a "CentaurHauls" CPU exposing CPUID leaf 0xC0000001 is found
49         &push   ("ebx");                # ebx is overwritten below; restored at "noluck"
50         &pushf  ();
51         &pop    ("eax");                # eax = current EFLAGS
52         &mov    ("ecx","eax");
53         &xor    ("eax",1<<21);          # try to flip EFLAGS bit 21
54         &push   ("eax");
55         &popf   ();
56         &pushf  ();
57         &pop    ("eax");                # read EFLAGS back
58         &xor    ("ecx","eax");          # ecx = bits that really changed
59         &xor    ("eax","eax");          # return value on every "noluck" path is 0
60         &bt     ("ecx",21);
61         &jnc    (&label("noluck"));     # bit 21 cannot be toggled: CPUID is not usable
62         &cpuid  ();                     # leaf 0 (eax was just cleared)
63         &xor    ("eax","eax");
64         &cmp    ("ebx","0x".unpack("H*",'tneC'));       # reversed literals: as little-endian dwords they read "Cent",
65         &jne    (&label("noluck"));
66         &cmp    ("edx","0x".unpack("H*",'Hrua'));       # "aurH"
67         &jne    (&label("noluck"));
68         &cmp    ("ecx","0x".unpack("H*",'slua'));       # and "auls"
69         &jne    (&label("noluck"));
70         &mov    ("eax",0xC0000000);
71         &cpuid  ();
72         &mov    ("edx","eax");          # edx = value reported in eax for leaf 0xC0000000
73         &xor    ("eax","eax");
74         &cmp    ("edx",0xC0000001);
75         &jb     (&label("noluck"));     # leaf 0xC0000001 is not available
76         &mov    ("eax",1);
77         &cpuid  ();
78         &or     ("eax",0x0f);           # force the low nibble so it does not take part in the compare
79         &xor    ("ebx","ebx");
80         &and    ("eax",0x0fff);
81         &cmp    ("eax",0x06ff);         # check for Nano
82         &sete   ("bl");                 # ebx = 1 on Nano, else 0
83         &mov    ("eax",0xC0000001);
84         &push   ("ebx");                # keep the Nano flag across cpuid
85         &cpuid  ();
86         &pop    ("ebx");
87         &mov    ("eax","edx");          # result = edx of leaf 0xC0000001 ...
88         &shl    ("ebx",4);              # bit#4 denotes Nano
89         &and    ("eax",0xffffffef);     # ... with bit 4 cleared ...
90         &or     ("eax","ebx");          # ... and replaced by the Nano flag
91 &set_label("noluck");
92         &pop    ("ebx");
93         &ret    ();
94 &function_end_B("padlock_capability");
95
96 &function_begin_B("padlock_key_bswap");      # byte-swaps, in place, the 32-bit words starting at the pointer passed as the only argument
97         &mov    ("edx",&wparam(0));     # edx = first word to swap
98         &mov    ("ecx",&DWP(240,"edx"));        # ecx = number of words, read from offset 240 of the same structure
99 &set_label("bswap_loop");
100         &mov    ("eax",&DWP(0,"edx"));
101         &bswap  ("eax");
102         &mov    (&DWP(0,"edx"),"eax");
103         &lea    ("edx",&DWP(4,"edx"));  # step to the next word
104         &sub    ("ecx",1);
105         &jnz    (&label("bswap_loop")); # NOTE(review): a stored count of 0 would wrap around instead of exiting
106         &ret    ();
107 &function_end_B("padlock_key_bswap");
108
109 # This is heuristic key context tracing. At first one
110 # believes that one should use atomic swap instructions,
111 # but it's not actually necessary. Point is that if
112 # padlock_saved_context was changed by another thread
113 # after we've read it and before we compare it with ctx,
114 # our key *shall* be reloaded upon thread context switch
115 # and we are therefore set in either case...
116 &static_label("padlock_saved_context");     # the label itself is defined in the data segment at the end of the file
117
118 &function_begin_B("padlock_verify_context"); # takes the context pointer as its only argument and runs _padlock_verify_ctx on it
119         &mov    ($ctx,&wparam(0));
120         &lea    ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :  # win32/coff: absolute address of the variable;
121                        &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));   # otherwise: its offset from verify_pic_point
122         &pushf  ();                     # the callee inspects these saved EFLAGS at 4(esp)
123         &call   ("_padlock_verify_ctx");
124 &set_label("verify_pic_point");         # return address of the call above
125         &lea    ("esp",&DWP(4,"esp"));  # drop the saved EFLAGS without restoring them
126         &ret    ();
127 &function_end_B("padlock_verify_context");
128
129 &function_begin_B("_padlock_verify_ctx");   # in: $ctx = context, eax = address (or pic offset) of padlock_saved_context, 4(esp) = caller's saved EFLAGS
130         &add    ("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));# &padlock_saved_context
131         &bt     (&DWP(4,"esp"),30);             # eflags
132         &jnc    (&label("verified"));   # bit 30 clear: skip the pushf/popf below
133         &cmp    ($ctx,&DWP(0,"eax"));
134         &je     (&label("verified"));   # same context as last time: skip it as well
135         &pushf  ();                     # same pushf/popf pair as padlock_reload_key
136         &popf   ();
137 &set_label("verified");
138         &mov    (&DWP(0,"eax"),$ctx);   # remember this context for the next call
139         &ret    ();
140 &function_end_B("_padlock_verify_ctx");
141
142 &function_begin_B("padlock_reload_key");     # no arguments, no return value
143         &pushf  ();                     # rewrite EFLAGS with its own value;
144         &popf   ();                     # presumably this is what makes the unit reload the key - confirm against the PadLock documentation
145         &ret    ();
146 &function_end_B("padlock_reload_key");
147
148 &function_begin_B("padlock_aes_block");      # arguments: out, inp, ctx; runs rep xcryptecb with a count of 1
149         &push   ("edi");
150         &push   ("esi");
151         &push   ("ebx");
152         &mov    ($out,&wparam(0));              # must be 16-byte aligned
153         &mov    ($inp,&wparam(1));              # must be 16-byte aligned
154         &mov    ($ctx,&wparam(2));
155         &mov    ($len,1);                       # one block
156         &lea    ("ebx",&DWP(32,$ctx));          # key
157         &lea    ($ctx,&DWP(16,$ctx));           # control word
158         &data_byte(0xf3,0x0f,0xa7,0xc8);        # rep xcryptecb
159         &pop    ("ebx");
160         &pop    ("esi");
161         &pop    ("edi");
162         &ret    ();
163 &function_end_B("padlock_aes_block");
164
165 sub generate_mode {         # emits padlock_${mode}_encrypt; $opcode is the last byte of its rep xcrypt* encoding
166 my ($mode,$opcode) = @_;    # the generated routine returns 1, or 0 when ctx is not 16-byte aligned or len is not a multiple of 16
167 # int padlock_$mode_encrypt(void *out, const void *inp,
168 #               struct padlock_cipher_data *ctx, size_t len);
169 &function_begin("padlock_${mode}_encrypt");
170         &mov    ($out,&wparam(0));
171         &mov    ($inp,&wparam(1));
172         &mov    ($ctx,&wparam(2));
173         &mov    ($len,&wparam(3));
            &xor    ("eax","eax");          # defined return value (0) for the abort path
174         &test   ($ctx,15);
175         &jnz    (&label("${mode}_abort"));      # ctx must be 16-byte aligned
176         &test   ($len,15);
177         &jnz    (&label("${mode}_abort"));      # len must be a multiple of 16
178         &lea    ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
179                        &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
180         &pushf  ();                     # stays on the stack until the "popf" lea near the end
181         &cld    ();                     # rep movsl/stosl below must count upwards
182         &call   ("_padlock_verify_ctx");
183 &set_label("${mode}_pic_point");
184         &lea    ($ctx,&DWP(16,$ctx));   # control word
185         &xor    ("eax","eax");
186                                         if ($mode eq "ctr16") {
187         &movq   ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
188                                         } else {
189         &xor    ("ebx","ebx");
190         &test   (&DWP(0,$ctx),1<<5);    # align bit in control word
191         &jnz    (&label("${mode}_aligned"));    # hardware copes with misalignment itself
192         &test   ($out,0x0f);
193         &setz   ("al");                 # !out_misaligned
194         &test   ($inp,0x0f);
195         &setz   ("bl");                 # !inp_misaligned
196         &test   ("eax","ebx");
197         &jnz    (&label("${mode}_aligned"));    # both buffers aligned: single-shot path
198         &neg    ("eax");                # out aligned ? -1 : 0
199                                         }
200         &mov    ($chunk,$PADLOCK_CHUNK);
201         &not    ("eax");                # out_misaligned?-1:0 (always -1 for ctr16)
202         &lea    ("ebp",&DWP(-24,"esp"));        # ebp = frame holding the saved loop parameters
203         &cmp    ($len,$chunk);
204         &cmovc  ($chunk,$len);          # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
205         &and    ("eax",$chunk);         # out_misaligned?chunk:0
206         &mov    ($chunk,$len);
207         &neg    ("eax");
208         &and    ($chunk,$PADLOCK_CHUNK-1);      # chunk=len%PADLOCK_CHUNK
209         &lea    ("esp",&DWP(0,"eax","ebp"));    # alloca
210         &and    ("esp",-16);
            # A remainder of 0 used to make the first pass an empty one; in
            # ctr16 the do-while prepare/xor loops still ran once there and
            # advanced the stored counter.  Start with a full chunk instead.
            # len==0 keeps chunk==0 exactly as before.
            &mov    ("eax",$PADLOCK_CHUNK);         # eax is dead from here until the loop reloads it
            &cmp    ($len,"eax");
            &cmovc  ("eax",$len);                   # eax=min(len,PADLOCK_CHUNK)
            &test   ($chunk,$chunk);
            &cmovz  ($chunk,"eax");                 # chunk=chunk?chunk:min(len,PADLOCK_CHUNK)
211         &jmp    (&label("${mode}_loop"));
212
213 &set_label("${mode}_loop",16);
214         &mov    (&DWP(0,"ebp"),$out);           # save parameters
215         &mov    (&DWP(4,"ebp"),$inp);
216         &mov    (&DWP(8,"ebp"),$len);
217         &mov    ($len,$chunk);
218         &mov    (&DWP(12,"ebp"),$chunk);        # chunk
219                                                 if ($mode eq "ctr16") {
220         &mov    ("ecx",&DWP(-4,$ctx));          # last dword of the counter block
221         &xor    ($out,$out);                    # $out doubles as byte offset into the scratch buffer
222         &mov    ("eax",&DWP(-8,$ctx));          # borrow $len
223 &set_label("${mode}_prepare");                  # fill the scratch buffer with consecutive counter blocks
224         &mov    (&DWP(12,"esp",$out),"ecx");
225         &bswap  ("ecx");
226         &movq   (&QWP(0,"esp",$out),"mm0");
227         &inc    ("ecx");                        # increment is done on the byte-swapped value
228         &mov    (&DWP(8,"esp",$out),"eax");
229         &bswap  ("ecx");
230         &lea    ($out,&DWP(16,$out));
231         &cmp    ($out,$chunk);
232         &jb     (&label("${mode}_prepare"));
233
234         &mov    (&DWP(-4,$ctx),"ecx");          # store the advanced counter back
235         &lea    ($inp,&DWP(0,"esp"));           # encrypt the counter blocks in place
236         &lea    ($out,&DWP(0,"esp"));
237         &mov    ($len,$chunk);
238                                                 } else {
239         &test   ($out,0x0f);                    # out_misaligned
240         &cmovnz ($out,"esp");                   # misaligned output goes through the scratch buffer
241         &test   ($inp,0x0f);                    # inp_misaligned
242         &jz     (&label("${mode}_inp_aligned"));
243         &shr    ($len,2);
244         &data_byte(0xf3,0xa5);                  # rep movsl
245         &sub    ($out,$chunk);                  # rewind $out after the copy
246         &mov    ($len,$chunk);
247         &mov    ($inp,$out);                    # process the aligned copy in place
248 &set_label("${mode}_inp_aligned");
249                                                 }
250         &lea    ("eax",&DWP(-16,$ctx));         # ivp
251         &lea    ("ebx",&DWP(16,$ctx));          # key
252         &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
253         &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
254                                                 if ($mode !~ /ecb|ctr/) {
255         &movaps ("xmm0",&QWP(0,"eax"));
256         &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
257                                                 }
258         &mov    ($out,&DWP(0,"ebp"));           # restore parameters
259         &mov    ($chunk,&DWP(12,"ebp"));
260                                                 if ($mode eq "ctr16") {
261         &mov    ($inp,&DWP(4,"ebp"));
262         &xor    ($len,$len);                    # $len doubles as byte offset
263 &set_label("${mode}_xor");                      # out = inp ^ encrypted counter blocks
264         &movups ("xmm1",&QWP(0,$inp,$len));
265         &lea    ($len,&DWP(16,$len));
266         &pxor   ("xmm1",&QWP(-16,"esp",$len));
267         &movups (&QWP(-16,$out,$len),"xmm1");
268         &cmp    ($len,$chunk);
269         &jb     (&label("${mode}_xor"));
270                                                 } else {
271         &test   ($out,0x0f);
272         &jz     (&label("${mode}_out_aligned"));
273         &mov    ($len,$chunk);
274         &shr    ($len,2);
275         &lea    ($inp,&DWP(0,"esp"));
276         &data_byte(0xf3,0xa5);                  # rep movsl
277         &sub    ($out,$chunk);                  # rewind $out after the copy
278 &set_label("${mode}_out_aligned");
279         &mov    ($inp,&DWP(4,"ebp"));
280                                                 }
281         &mov    ($len,&DWP(8,"ebp"));
282         &add    ($out,$chunk);
283         &add    ($inp,$chunk);
284         &sub    ($len,$chunk);
285         &mov    ($chunk,$PADLOCK_CHUNK);        # every further pass is a full chunk (mov keeps the flags of the sub)
286         &jnz    (&label("${mode}_loop"));
287                                                 if ($mode ne "ctr16") {
288         &test   ($out,0x0f);                    # out_misaligned
289         &jz     (&label("${mode}_done"));       # scratch buffer was not used for output: nothing to wipe
290                                                 }
291         &mov    ($len,"ebp");
292         &mov    ($out,"esp");
293         &sub    ($len,"esp");                   # bytes between esp and ebp
294         &xor    ("eax","eax");
295         &shr    ($len,2);
296         &data_byte(0xf3,0xab);                  # rep stosl
297 &set_label("${mode}_done");
298         &lea    ("esp",&DWP(24,"ebp"));         # undo the -24 frame and the alloca
299                                                 if ($mode ne "ctr16") {
300         &jmp    (&label("${mode}_exit"));
301
302 &set_label("${mode}_aligned",16);               # whole request in one rep xcrypt*, no copying
303         &lea    ("eax",&DWP(-16,$ctx));         # ivp
304         &lea    ("ebx",&DWP(16,$ctx));          # key
305         &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
306         &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
307                                                 if ($mode ne "ecb") {
308         &movaps ("xmm0",&QWP(0,"eax"));
309         &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
310                                                 }
311 &set_label("${mode}_exit");                     }
312         &mov    ("eax",1);                      # success
313         &lea    ("esp",&DWP(4,"esp"));          # popf
314         &emms   ()                              if ($mode eq "ctr16");
315 &set_label("${mode}_abort");
316 &function_end("padlock_${mode}_encrypt");
317 }
318
319 &generate_mode("ecb",0xc8);     # second argument is the final opcode byte after f3 0f a7
320 &generate_mode("cbc",0xd0);
321 &generate_mode("cfb",0xe0);
322 &generate_mode("ofb",0xe8);
323 &generate_mode("ctr16",0xc8);   # yes, it implements own ctr with ecb opcode,
324                                 # because hardware ctr was introduced later
325                                 # and even has errata on certain CPU stepping.
326                                 # own implementation *always* works...
327
328 &function_begin_B("padlock_xstore");         # two arguments, loaded into edi and edx for the xstore instruction
329         &push   ("edi");
330         &mov    ("edi",&wparam(0));     # presumably the output buffer - confirm against the PadLock documentation
331         &mov    ("edx",&wparam(1));     # second argument is passed through in edx unchanged
332         &data_byte(0x0f,0xa7,0xc0);             # xstore
333         &pop    ("edi");                # eax is left as the instruction produced it
334         &ret    ();
335 &function_end_B("padlock_xstore");
336
337 &function_begin_B("_win32_segv_handler");    # exception handler installed by the *_oneshot routines on win32/coff
338         &mov    ("eax",1);                      # ExceptionContinueSearch
339         &mov    ("edx",&wparam(0));             # *ExceptionRecord
340         &mov    ("ecx",&wparam(2));             # *ContextRecord
341         &cmp    (&DWP(0,"edx"),0xC0000005);     # ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
342         &jne    (&label("ret"));                # anything else is left to the next handler
343         &add    (&DWP(184,"ecx"),4);            # skip over rep sha*
344         &mov    ("eax",0);                      # ExceptionContinueExecution
345 &set_label("ret");
346         &ret    ();
347 &function_end_B("_win32_segv_handler");
348 &safeseh("_win32_segv_handler")                 if ($::win32);
349
350 &function_begin_B("padlock_sha1_oneshot");   # three arguments go to edi, esi, ecx; runs rep xsha1 with eax=0
351         &push   ("edi");
352         &push   ("esi");
353         &xor    ("eax","eax");          # eax=0 for rep xsha1; also the fs: offset used below
354         &mov    ("edi",&wparam(0));     # arguments are fetched before the handler frame is built: the push
355         &mov    ("esi",&wparam(1));     # emitted through data_byte() moves esp behind perlasm's back, so
356         &mov    ("ecx",&wparam(2));     # wparam() offsets taken after it would be one slot off
357     if ($::win32 or $::coff) {
358         &push   (&::islabel("_win32_segv_handler"));    # handler address
359         &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
360         &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
361     }
362         &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
363     if ($::win32 or $::coff) {
364         &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
365         &lea    ("esp",&DWP(4,"esp"));  # drop the handler address
366     }
367         &pop    ("esi");
368         &pop    ("edi");
369         &ret    ();
370 &function_end_B("padlock_sha1_oneshot");
371
372 &function_begin_B("padlock_sha1_blocks");    # three arguments go to edi, esi, ecx; runs rep xsha1 with eax=-1
373         &push   ("edi");
374         &push   ("esi");
375         &mov    ("eax",-1);             # differs from the one-shot variant, which passes 0 - presumably selects no finalisation; confirm
376         &mov    ("edi",&wparam(0));
377         &mov    ("esi",&wparam(1));
378         &mov    ("ecx",&wparam(2));
379         &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
380         &pop    ("esi");
381         &pop    ("edi");
382         &ret    ();
383 &function_end_B("padlock_sha1_blocks");
384
385 &function_begin_B("padlock_sha256_oneshot"); # three arguments go to edi, esi, ecx; runs rep xsha256 with eax=0
386         &push   ("edi");
387         &push   ("esi");
388         &xor    ("eax","eax");          # eax=0 for rep xsha256; also the fs: offset used below
389         &mov    ("edi",&wparam(0));     # arguments are fetched before the handler frame is built: the push
390         &mov    ("esi",&wparam(1));     # emitted through data_byte() moves esp behind perlasm's back, so
391         &mov    ("ecx",&wparam(2));     # wparam() offsets taken after it would be one slot off
392     if ($::win32 or $::coff) {
393         &push   (&::islabel("_win32_segv_handler"));    # handler address
394         &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
395         &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
396     }
397         &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
398     if ($::win32 or $::coff) {
399         &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
400         &lea    ("esp",&DWP(4,"esp"));  # drop the handler address
401     }
402         &pop    ("esi");
403         &pop    ("edi");
404         &ret    ();
405 &function_end_B("padlock_sha256_oneshot");
406
407 &function_begin_B("padlock_sha256_blocks");  # three arguments go to edi, esi, ecx; runs rep xsha256 with eax=-1
408         &push   ("edi");
409         &push   ("esi");
410         &mov    ("eax",-1);             # differs from the one-shot variant, which passes 0 - presumably selects no finalisation; confirm
411         &mov    ("edi",&wparam(0));
412         &mov    ("esi",&wparam(1));
413         &mov    ("ecx",&wparam(2));
414         &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
415         &pop    ("esi");
416         &pop    ("edi");
417         &ret    ();
418 &function_end_B("padlock_sha256_blocks");
419
420 &function_begin_B("padlock_sha512_blocks");  # three arguments go to edi, esi, ecx; runs rep xsha512
421         &push   ("edi");
422         &push   ("esi");
423         &mov    ("edi",&wparam(0));     # NOTE(review): unlike the sha1/sha256 variants, eax is not set before the instruction
424         &mov    ("esi",&wparam(1));
425         &mov    ("ecx",&wparam(2));
426         &data_byte(0xf3,0x0f,0xa6,0xe0);        # rep xsha512
427         &pop    ("esi");
428         &pop    ("edi");
429         &ret    ();
430 &function_end_B("padlock_sha512_blocks");
431
432 &asciz  ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");     # identification string placed after the code
433 &align  (16);
434
435 &dataseg();
436 # Essentially this variable belongs in thread local storage.
437 # Having this variable global on the other hand can only cause
438 # few bogus key reloads [if any at all on single-CPU system],
439 # so we accept the penalty...
440 &set_label("padlock_saved_context",4);  # read and updated by _padlock_verify_ctx
441 &data_word(0);                          # starts out as 0, i.e. no context recorded yet
442
443 &asm_finish();