672d91257a363a010fc2ae7e10bea1db03bfe690
[openssl.git] / engines / asm / e_padlock-x86.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # September 2011
11 #
12 # Assembler helpers for Padlock engine. Compared to original engine
13 # version relying on inline assembler and compiled with gcc 3.4.6 it
14 # was measured to provide ~100% improvement on misaligned data in ECB
15 # mode and ~75% in CBC mode. For aligned data improvement can be
16 # observed for short inputs only, e.g. 45% for 64-byte messages in
17 # ECB mode, 20% in CBC. Difference in performance for aligned vs.
18 # misaligned data depends on misalignment and is either ~1.8x or
19 # ~2.9x. These are approximately same factors as for hardware support,
20 # so there is little reason to rely on the latter. It might actually
21 # hurt performance in mixture of aligned and misaligned buffers,
22 # because a) if you choose to flip 'align' flag on per-buffer basis,
23 # then you'd have to reload key context; b) if you choose to set
24 # 'align' flag permanently, it limits performance for aligned data
25 # to ~1/2. All results were collected on 1.5GHz C7.
26
# Work out the directory this script was started from and make both it
# and the perlasm directory (relative to it) searchable, then pull in
# the x86 perlasm framework.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
push(@INC, "${dir}", "${dir}../../crypto/perlasm");
require "x86asm.pl";

# Hand the first command-line argument and the script name to perlasm.
&asm_init($ARGV[0], $0);

$PADLOCK_CHUNK = 512;   # Must be a power of 2 larger than 16

# Register roles shared by the code generators below.
($ctx, $out, $inp, $len, $chunk) = ("edx", "edi", "esi", "ecx", "ebx");
40
# padlock_capability: takes no stack arguments and returns a capability
# mask in eax.
#
# Returns 0 when any of the following checks fails:
#   - bit 21 of EFLAGS cannot be toggled (cpuid not usable),
#   - cpuid leaf 0 does not report the "CentaurHauls" vendor string,
#   - cpuid leaf 0xC0000000 reports a maximum leaf below 0xC0000001.
# Otherwise returns edx of cpuid leaf 0xC0000001 with bit#4 replaced by
# a flag that is set when leaf 1 identifies the CPU as family 6,
# model 0xF (any stepping) -- "Nano" per the comment below.
&function_begin_B("padlock_capability");
        &push   ("ebx");                        # cpuid overwrites ebx
        &pushf  ();
        &pop    ("eax");
        &mov    ("ecx","eax");                  # keep original eflags
        &xor    ("eax",1<<21);                  # try to flip bit 21
        &push   ("eax");
        &popf   ();
        &pushf  ();
        &pop    ("eax");
        &xor    ("ecx","eax");                  # which bits did change?
        &xor    ("eax","eax");                  # eax=0: cpuid leaf / "no luck" result
        &bt     ("ecx",21);
        &jnc    (&label("noluck"));
        &cpuid  ();
        &xor    ("eax","eax");
        &cmp    ("ebx","0x".unpack("H*",'tneC'));       # "Cent"
        &jne    (&label("noluck"));
        &cmp    ("edx","0x".unpack("H*",'Hrua'));       # "aurH"
        &jne    (&label("noluck"));
        &cmp    ("ecx","0x".unpack("H*",'slua'));       # "auls"
        &jne    (&label("noluck"));
        &mov    ("eax",0xC0000000);
        &cpuid  ();
        &mov    ("edx","eax");
        &xor    ("eax","eax");
        &cmp    ("edx",0xC0000001);
        &jb     (&label("noluck"));
        &mov    ("eax",1);
        &cpuid  ();
        &or     ("eax",0x0f);                   # ignore the low 4 bits
        &xor    ("ebx","ebx");
        &and    ("eax",0x0fff);
        &cmp    ("eax",0x06ff);         # check for Nano
        &sete   ("bl");
        &mov    ("eax",0xC0000001);
        &push   ("ebx");                        # cpuid would destroy the flag
        &cpuid  ();
        &pop    ("ebx");
        &mov    ("eax","edx");
        &shl    ("ebx",4);              # bit#4 denotes Nano
        &and    ("eax",0xffffffef);
        # The terminating semicolons on the next statement and on
        # function_end_B are required: without them Perl parses the
        # statement and the call that follows as one bitwise-AND expression.
        &or     ("eax","ebx");
&set_label("noluck");
        &pop    ("ebx");
        &ret    ();
&function_end_B("padlock_capability");
88
# padlock_key_bswap(key): byte-swap, in place, a run of 32-bit words
# starting at the address passed as the first argument.  The number of
# words processed is the 32-bit value stored at byte offset 240 from
# that address.
# NOTE(review): offset 240 is presumably the rounds field of the key
# structure -- confirm that this count covers the whole key schedule,
# and that it can never be zero (a zero count would wrap the loop).
&function_begin_B("padlock_key_bswap");
    {
        my ($ptr, $count, $word) = ("edx", "ecx", "eax");

        &mov    ($ptr, &wparam(0));
        &mov    ($count, &DWP(240, $ptr));
&set_label("bswap_loop");
        &mov    ($word, &DWP(0, $ptr));         # load, swap, store back
        &bswap  ($word);
        &mov    (&DWP(0, $ptr), $word);
        &lea    ($ptr, &DWP(4, $ptr));          # advance to the next word
        &sub    ($count, 1);
        &jnz    (&label("bswap_loop"));
        &ret    ();
    }
&function_end_B("padlock_key_bswap");
101
102 # This is heuristic key context tracing. At first one
103 # believes that one should use atomic swap instructions,
104 # but it's not actually necessary. Point is that if
105 # padlock_saved_context was changed by another thread
106 # after we've read it and before we compare it with ctx,
107 # our key *shall* be reloaded upon thread context switch
108 # and we are therefore set in either case...
&static_label("padlock_saved_context");

# padlock_verify_context(ctx): public wrapper around _padlock_verify_ctx.
# Loads ctx from the first argument, puts the address of
# padlock_saved_context in eax (absolute on win32/coff, otherwise as an
# offset from verify_pic_point that the callee completes using its
# return address), pushes eflags for the callee to inspect and finally
# drops that pushed word again.
&function_begin_B("padlock_verify_context");
        &mov    ($ctx, &wparam(0));
    if ($::win32 or $::coff) {
        &lea    ("eax", &DWP(&label("padlock_saved_context")));
    } else {
        &lea    ("eax", &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
    }
        &pushf  ();
        &call   ("_padlock_verify_ctx");
&set_label("verify_pic_point");
        &lea    ("esp", &DWP(4, "esp"));        # discard saved eflags
        &ret    ();
&function_end_B("padlock_verify_context");
121
# _padlock_verify_ctx: internal helper with a register-based contract.
#   in:  $ctx (edx) = key context about to be used
#        eax        = address of padlock_saved_context, or on
#                     non-win32/coff targets its offset from the
#                     caller's return address
#        4(%esp)    = eflags image pushed by the caller
# If bit 30 of that eflags image is set and $ctx differs from the saved
# context, a pushf/popf pair is executed (the same sequence
# padlock_reload_key uses).  In every case $ctx is recorded as the new
# saved context.
# NOTE(review): bit 30 is presumably the PadLock "key already loaded"
# indicator -- confirm against the VIA documentation.
&function_begin_B("_padlock_verify_ctx");
    unless ($::win32 or $::coff) {
        &add    ("eax", &DWP(0, "esp"));        # &padlock_saved_context
    }
        &bt     (&DWP(4, "esp"), 30);           # eflags
        &jnc    (&label("verified"));
        &cmp    ($ctx, &DWP(0, "eax"));
        &je     (&label("verified"));
        &pushf  ();
        &popf   ();
&set_label("verified");
        &mov    (&DWP(0, "eax"), $ctx);
        &ret    ();
&function_end_B("_padlock_verify_ctx");
134
# padlock_reload_key: takes no arguments; simply saves eflags to the
# stack and restores them again before returning.
&function_begin_B("padlock_reload_key");
        &pushf();
        &popf();
        &ret();
&function_end_B("padlock_reload_key");
140
# padlock_aes_block(out, inp, ctx): issue `rep xcryptecb` with a block
# count of one.  The control word is taken from ctx+16 and the key from
# ctx+32; edi, esi and ebx are preserved for the caller.
&function_begin_B("padlock_aes_block");
    foreach my $reg ("edi", "esi", "ebx") {
        &push   ($reg);
    }
        &mov    ($out, &wparam(0));             # must be 16-byte aligned
        &mov    ($inp, &wparam(1));             # must be 16-byte aligned
        &mov    ($ctx, &wparam(2));
        &mov    ($len, 1);                      # exactly one block
        &lea    ("ebx", &DWP(32, $ctx));        # key
        &lea    ($ctx, &DWP(16, $ctx));         # control word
        &data_byte(0xf3, 0x0f, 0xa7, 0xc8);     # rep xcryptecb
    foreach my $reg ("ebx", "esi", "edi") {
        &pop    ($reg);
    }
        &ret    ();
&function_end_B("padlock_aes_block");
157
# generate_mode($mode, $opcode)
#
# Emits the function padlock_${mode}_encrypt.
#   $mode   - name of the mode ("ecb", "cbc", "cfb", "ofb" or "ctr16");
#             selects the mode-specific code paths below.
#   $opcode - last byte of the `rep xcrypt*` instruction encoding.
#
# The generated function returns 1 in eax on success and 0 when ctx or
# len is not a multiple of 16 (nothing is processed in that case).  A
# zero len is accepted and is a no-op.
#
# Data is processed in chunks of at most $PADLOCK_CHUNK bytes.  The
# first chunk is len % $PADLOCK_CHUNK, or a full $PADLOCK_CHUNK when
# that remainder is zero; all further chunks are $PADLOCK_CHUNK.  A
# zero-sized first chunk must never reach the loop: the ctr16
# prepare/xor loops test their condition at the bottom and would
# process one bogus block (advancing the counter and touching output).
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#               struct padlock_cipher_data *ctx, size_t len);
&function_begin("padlock_${mode}_encrypt");
        &mov    ($out,&wparam(0));
        &mov    ($inp,&wparam(1));
        &mov    ($ctx,&wparam(2));
        &mov    ($len,&wparam(3));
        &xor    ("eax","eax");          # well-defined 0 result on abort
        &test   ($ctx,15);
        &jnz    (&label("${mode}_abort"));
        &test   ($len,15);
        &jnz    (&label("${mode}_abort"));
        &lea    ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
                       &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
        &pushf  ();
        &cld    ();
        &call   ("_padlock_verify_ctx");
&set_label("${mode}_pic_point");
        &test   ($len,$len);
        &jz     (&label("${mode}_exit"));       # nothing to process
        &lea    ($ctx,&DWP(16,$ctx));   # control word
        &xor    ("eax","eax");
                                        if ($mode eq "ctr16") {
        &movq   ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
                                        } else {
        &xor    ("ebx","ebx");
        &test   (&DWP(0,$ctx),1<<5);    # align bit in control word
        &jnz    (&label("${mode}_aligned"));
        &test   ($out,0x0f);
        &setz   ("al");                 # !out_misaligned
        &test   ($inp,0x0f);
        &setz   ("bl");                 # !inp_misaligned
        &test   ("eax","ebx");
        &jnz    (&label("${mode}_aligned"));
        &neg    ("eax");
                                        }
        &mov    ($chunk,$PADLOCK_CHUNK);
        &not    ("eax");                # out_misaligned?-1:0
        &lea    ("ebp",&DWP(-24,"esp"));
        &cmp    ($len,$chunk);
        &cmovc  ($chunk,$len);          # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
        &and    ("eax",$chunk);         # out_misaligned?chunk:0
        &mov    ($chunk,$len);
        &neg    ("eax");
        &and    ($chunk,$PADLOCK_CHUNK-1);      # chunk=len%PADLOCK_CHUNK
        &lea    ("esp",&DWP(0,"eax","ebp"));    # alloca [flags untouched]
        &mov    ("eax",$PADLOCK_CHUNK);         # eax is free from here on
        &cmovz  ($chunk,"eax");                 # chunk=chunk?:PADLOCK_CHUNK
        &and    ("esp",-16);
        &jmp    (&label("${mode}_loop"));

&set_label("${mode}_loop",16);
        &mov    (&DWP(0,"ebp"),$out);           # save parameters
        &mov    (&DWP(4,"ebp"),$inp);
        &mov    (&DWP(8,"ebp"),$len);
        &mov    ($len,$chunk);
        &mov    (&DWP(12,"ebp"),$chunk);        # chunk
                                                if ($mode eq "ctr16") {
        # Fill the stack buffer with consecutive counter blocks; the
        # last 32-bit word is incremented as a big-endian number.
        &mov    ("ecx",&DWP(-4,$ctx));
        &xor    ($out,$out);
        &mov    ("eax",&DWP(-8,$ctx));          # borrow $len
&set_label("${mode}_prepare");
        &mov    (&DWP(12,"esp",$out),"ecx");
        &bswap  ("ecx");
        &movq   (&QWP(0,"esp",$out),"mm0");
        &inc    ("ecx");
        &mov    (&DWP(8,"esp",$out),"eax");
        &bswap  ("ecx");
        &lea    ($out,&DWP(16,$out));
        &cmp    ($out,$chunk);
        &jb     (&label("${mode}_prepare"));

        &mov    (&DWP(-4,$ctx),"ecx");          # write counter back
        &lea    ($inp,&DWP(0,"esp"));           # encrypt counters in place
        &lea    ($out,&DWP(0,"esp"));
        &mov    ($len,$chunk);
                                                } else {
        &test   ($out,0x0f);                    # out_misaligned
        &cmovnz ($out,"esp");
        &test   ($inp,0x0f);                    # inp_misaligned
        &jz     (&label("${mode}_inp_aligned"));
        &shr    ($len,2);
        &data_byte(0xf3,0xa5);                  # rep movsl
        &sub    ($out,$chunk);
        &mov    ($len,$chunk);
        &mov    ($inp,$out);
&set_label("${mode}_inp_aligned");
                                                }
        &lea    ("eax",&DWP(-16,$ctx));         # ivp
        &lea    ("ebx",&DWP(16,$ctx));          # key
        &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
        &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
                                                if ($mode !~ /ecb|ctr/) {
        &movaps ("xmm0",&QWP(0,"eax"));
        &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
                                                }
        &mov    ($out,&DWP(0,"ebp"));           # restore parameters
        &mov    ($chunk,&DWP(12,"ebp"));
                                                if ($mode eq "ctr16") {
        # XOR the encrypted counter blocks into the caller's data.
        &mov    ($inp,&DWP(4,"ebp"));
        &xor    ($len,$len);
&set_label("${mode}_xor");
        &movups ("xmm1",&QWP(0,$inp,$len));
        &lea    ($len,&DWP(16,$len));
        &pxor   ("xmm1",&QWP(-16,"esp",$len));
        &movups (&QWP(-16,$out,$len),"xmm1");
        &cmp    ($len,$chunk);
        &jb     (&label("${mode}_xor"));
                                                } else {
        &test   ($out,0x0f);
        &jz     (&label("${mode}_out_aligned"));
        &mov    ($len,$chunk);
        &shr    ($len,2);
        &lea    ($inp,&DWP(0,"esp"));
        &data_byte(0xf3,0xa5);                  # rep movsl
        &sub    ($out,$chunk);
&set_label("${mode}_out_aligned");
        &mov    ($inp,&DWP(4,"ebp"));
                                                }
        &mov    ($len,&DWP(8,"ebp"));
        &add    ($out,$chunk);
        &add    ($inp,$chunk);
        &sub    ($len,$chunk);
        &mov    ($chunk,$PADLOCK_CHUNK);
        &jnz    (&label("${mode}_loop"));
                                                if ($mode ne "ctr16") {
        &test   ($out,0x0f);                    # out_misaligned
        &jz     (&label("${mode}_done"));
                                                }
        # Wipe the temporary stack buffer before releasing it.
        &mov    ($len,"ebp");
        &mov    ($out,"esp");
        &sub    ($len,"esp");
        &xor    ("eax","eax");
        &shr    ($len,2);
        &data_byte(0xf3,0xab);                  # rep stosl
&set_label("${mode}_done");
        &lea    ("esp",&DWP(24,"ebp"));
                                                if ($mode ne "ctr16") {
        &jmp    (&label("${mode}_exit"));

&set_label("${mode}_aligned",16);
        &lea    ("eax",&DWP(-16,$ctx));         # ivp
        &lea    ("ebx",&DWP(16,$ctx));          # key
        &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
        &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
                                                if ($mode ne "ecb") {
        &movaps ("xmm0",&QWP(0,"eax"));
        &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
                                                }
                                                }
&set_label("${mode}_exit");                     # emitted for every mode
        &mov    ("eax",1);
        &lea    ("esp",&DWP(4,"esp"));          # popf
        &emms   ()                              if ($mode eq "ctr16");
&set_label("${mode}_abort");
&function_end("padlock_${mode}_encrypt");
}
311
# Instantiate one padlock_<mode>_encrypt function per table entry; the
# second element is the final opcode byte of the matching rep xcrypt*.
foreach my $variant (
        ["ecb",   0xc8],
        ["cbc",   0xd0],
        ["cfb",   0xe0],
        ["ofb",   0xe8],
        ["ctr16", 0xc8],        # yes, it implements own ctr with ecb opcode,
                                # because hardware ctr was introduced later
                                # and even has errata on certain CPU stepping.
                                # own implementation *always* works...
) {
        &generate_mode(@$variant);
}
320
# padlock_xstore(arg0, arg1): load the first argument into edi and the
# second into edx, execute xstore and return.  eax is not modified by
# this wrapper itself, so the caller sees whatever the instruction left
# there.  edi is preserved.
&function_begin_B("padlock_xstore");
    {
        my $dst = "edi";

        &push   ($dst);
        &mov    ($dst, &wparam(0));
        &mov    ("edx", &wparam(1));
        &data_byte(0x0f, 0xa7, 0xc0);           # xstore
        &pop    ($dst);
        &ret    ();
    }
&function_end_B("padlock_xstore");
329
# _win32_segv_handler(ExceptionRecord, <unused>, ContextRecord, ...)
#
# Exception handler installed by the *_oneshot functions on win32/coff.
# If the exception code is STATUS_ACCESS_VIOLATION it adds 4 to the
# dword at offset 184 of the context record (stepping over the 4-byte
# rep xsha* instruction) and returns 0 (ExceptionContinueExecution).
# Any other exception returns 1 (ExceptionContinueSearch).
# NOTE(review): offset 184 is presumably CONTEXT.Eip -- confirm against
# the Windows x86 CONTEXT layout.
&function_begin_B("_win32_segv_handler");
        &mov    ("eax",1);                      # ExceptionContinueSearch
        &mov    ("edx",&wparam(0));             # *ExceptionRecord
        &mov    ("ecx",&wparam(2));             # *ContextRecord
        # The semicolon below is required: without it this statement and
        # the following &jne are parsed as one bitwise-AND expression.
        &cmp    (&DWP(0,"edx"),0xC0000005);     # ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
        &jne    (&label("ret"));
        &add    (&DWP(184,"ecx"),4);            # skip over rep sha*
        &mov    ("eax",0);                      # ExceptionContinueExecution
&set_label("ret");
        &ret    ();
&function_end_B("_win32_segv_handler");
&safeseh("_win32_segv_handler")                 if ($::win32);
342
# padlock_sha1_oneshot(arg0, arg1, arg2): run `rep xsha1` with eax=0,
# edi=arg0, esi=arg1 and ecx=arg2.  edi and esi are preserved.
# NOTE(review): presumably arg0 is the hash state, arg1 the input and
# arg2 its length -- confirm against the C caller.
#
# On win32/coff a structured-exception frame pointing at
# _win32_segv_handler is linked in around the instruction.  The
# arguments must be loaded BEFORE that frame is built: the link word is
# pushed with a raw data_byte sequence which perlasm's stack tracking
# cannot see, so any &wparam() evaluated afterwards would be 4 bytes
# short of the real argument location.
&function_begin_B("padlock_sha1_oneshot");
        &push   ("edi");
        &push   ("esi");
        &xor    ("eax","eax");
        &mov    ("edi",&wparam(0));
        &mov    ("esi",&wparam(1));
        &mov    ("ecx",&wparam(2));
    if ($::win32 or $::coff) {
        &push   (&::islabel("_win32_segv_handler"));
        &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
        &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
    }
        &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
    if ($::win32 or $::coff) {
        &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
        &lea    ("esp",&DWP(4,"esp"));          # drop handler address
    }
        &pop    ("esi");
        &pop    ("edi");
        &ret    ();
&function_end_B("padlock_sha1_oneshot");
364
# padlock_sha1(arg0, arg1, arg2): run `rep xsha1` with eax=-1,
# edi=arg0, esi=arg1 and ecx=arg2.  edi and esi are preserved.
&function_begin_B("padlock_sha1");
    foreach my $reg ("edi", "esi") {
        &push   ($reg);
    }
        &mov    ("eax", -1);
        &mov    ("edi", &wparam(0));
        &mov    ("esi", &wparam(1));
        &mov    ("ecx", &wparam(2));
        &data_byte(0xf3, 0x0f, 0xa6, 0xc8);     # rep xsha1
    foreach my $reg ("esi", "edi") {
        &pop    ($reg);
    }
        &ret    ();
&function_end_B("padlock_sha1");
377
# padlock_sha256_oneshot(arg0, arg1, arg2): run `rep xsha256` with
# eax=0, edi=arg0, esi=arg1 and ecx=arg2.  edi and esi are preserved.
# NOTE(review): presumably arg0 is the hash state, arg1 the input and
# arg2 its length -- confirm against the C caller.
#
# On win32/coff a structured-exception frame pointing at
# _win32_segv_handler is linked in around the instruction.  The
# arguments must be loaded BEFORE that frame is built: the link word is
# pushed with a raw data_byte sequence which perlasm's stack tracking
# cannot see, so any &wparam() evaluated afterwards would be 4 bytes
# short of the real argument location.
&function_begin_B("padlock_sha256_oneshot");
        &push   ("edi");
        &push   ("esi");
        &xor    ("eax","eax");
        &mov    ("edi",&wparam(0));
        &mov    ("esi",&wparam(1));
        &mov    ("ecx",&wparam(2));
    if ($::win32 or $::coff) {
        &push   (&::islabel("_win32_segv_handler"));
        &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
        &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
    }
        &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
    if ($::win32 or $::coff) {
        &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
        &lea    ("esp",&DWP(4,"esp"));          # drop handler address
    }
        &pop    ("esi");
        &pop    ("edi");
        &ret    ();
&function_end_B("padlock_sha256_oneshot");
399
# padlock_sha256(arg0, arg1, arg2): run `rep xsha256` with eax=-1,
# edi=arg0, esi=arg1 and ecx=arg2.  edi and esi are preserved.
&function_begin_B("padlock_sha256");
    foreach my $reg ("edi", "esi") {
        &push   ($reg);
    }
        &mov    ("eax", -1);
        &mov    ("edi", &wparam(0));
        &mov    ("esi", &wparam(1));
        &mov    ("ecx", &wparam(2));
        &data_byte(0xf3, 0x0f, 0xa6, 0xd0);     # rep xsha256
    foreach my $reg ("esi", "edi") {
        &pop    ($reg);
    }
        &ret    ();
&function_end_B("padlock_sha256");
412
# Identification string embedded after the code, then pad to 16 bytes.
&asciz("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
&align(16);

&dataseg();
# Strictly speaking this variable belongs in thread-local storage.
# Keeping it global, on the other hand, can only cause a few bogus key
# reloads [if any at all on a single-CPU system], so we accept the
# penalty...
&set_label("padlock_saved_context", 4);
&data_word(0);

&asm_finish();