Padlock engine: make it independent of inline assembler.
[openssl.git] / engines / asm / e_padlock-x86.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # September 2011
11 #
12 # Assembler helpers for Padlock engine. Compared to original engine
13 # version relying on inline assembler and compiled with gcc 3.4.6 it
14 # was measured to provide ~100% improvement on misaligned data in ECB
15 # mode and ~75% in CBC mode. For aligned data improvement can be
16 # observed for short inputs only, e.g. 45% for 64-byte messages in
17 # ECB mode, 20% in CBC. Difference in performance for aligned vs.
18 # misaligned data depends on misalignment and is either ~1.8x or
19 # ~2.9x. These are approximately same factors as for hardware support,
20 # so there is little reason to rely on the latter. It might actually
21 # hurt performance in mixture of aligned and misaligned buffers,
22 # because a) if you choose to flip 'align' flag on per-buffer basis,
23 # then you'd have to reload key context; b) if you choose to set
24 # 'align' flag permanently, it limits performance for aligned data
25 # to ~1/2. All results were collected on 1.5GHz C7.
26
# Derive the directory this script lives in from $0 and make it, plus the
# perlasm directory relative to it, searchable for the require below.
27 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
28 push(@INC,"${dir}","${dir}../../crypto/perlasm");
29 require "x86asm.pl";
30
# Start the generator; it is handed the first command-line argument and the
# script name.
31 &asm_init($ARGV[0],$0);
32
# Upper bound, in bytes, on the amount of data handled per loop iteration by
# the routines emitted from generate_mode().
33 $PADLOCK_CHUNK=512;     # Must be a power of 2 larger than 16
34
# Register roles used by the generated code.
35 $ctx="edx";
36 $out="edi";
37 $inp="esi";
38 $len="ecx";
39 $chunk="ebx";
40
# unsigned int padlock_capability(void)
# Returns 0 in eax unless all of the following hold: EFLAGS bit 21 can be
# toggled (i.e. CPUID is usable), the CPUID leaf 0 vendor registers match the
# three constants compared below (the bytes of "CentaurHauls"), and leaf
# 0xC0000000 reports a maximum leaf of at least 0xC0000001.
# Otherwise returns edx of leaf 0xC0000001 with bit 4 replaced by the result
# of the "Nano" family/model comparison.
# ebx is preserved for the caller.
41 &function_begin_B("padlock_capability");
42         &push   ("ebx");
43         &pushf  ();
44         &pop    ("eax");
45         &mov    ("ecx","eax");
46         &xor    ("eax",1<<21);
47         &push   ("eax");
48         &popf   ();
49         &pushf  ();
50         &pop    ("eax");
51         &xor    ("ecx","eax");          # bit 21 set only if the flag actually flipped
52         &xor    ("eax","eax");
53         &bt     ("ecx",21);
54         &jnc    (&label("noluck"));
55         &cpuid  ();
56         &xor    ("eax","eax");          # keep the failure return value ready
57         &cmp    ("ebx","0x".unpack("H*",'tneC'));
58         &jne    (&label("noluck"));
59         &cmp    ("edx","0x".unpack("H*",'Hrua'));
60         &jne    (&label("noluck"));
61         &cmp    ("ecx","0x".unpack("H*",'slua'));
62         &jne    (&label("noluck"));
63         &mov    ("eax",0xC0000000);
64         &cpuid  ();
65         &mov    ("edx","eax");
66         &xor    ("eax","eax");
67         &cmp    ("edx",0xC0000001);
68         &jb     (&label("noluck"));
69         &mov    ("eax",1);
70         &cpuid  ();
71         &or     ("eax",0x0f);
72         &xor    ("ebx","ebx");
73         &and    ("eax",0x0fff);
74         &cmp    ("eax",0x06ff);         # check for Nano
75         &sete   ("bl");
76         &mov    ("eax",0xC0000001);
77         &push   ("ebx");                # cpuid overwrites ebx; keep the Nano flag
78         &cpuid  ();
79         &pop    ("ebx");
80         &mov    ("eax","edx");
81         &shl    ("ebx",4);              # bit#4 denotes Nano
82         &and    ("eax",0xffffffef);
83         &or     ("eax","ebx");
84 &set_label("noluck");
85         &pop    ("ebx");
86         &ret    ();
87 &function_end_B("padlock_capability");
88
# void padlock_key_bswap(key)
# Byte-swaps consecutive 32-bit words in place, starting at the pointer
# passed as the first argument. The number of words processed is the dword
# stored at offset 240 from that pointer.
# NOTE(review): exactly as many words are swapped as the value read from
# offset 240; if that field holds a round count rather than a word count,
# part of the key schedule is left unswapped - confirm against the caller's
# structure layout.
89 &function_begin_B("padlock_key_bswap");
90         &mov    ("edx",&wparam(0));
91         &mov    ("ecx",&DWP(240,"edx"));        # loop counter
92 &set_label("bswap_loop");
93         &mov    ("eax",&DWP(0,"edx"));
94         &bswap  ("eax");
95         &mov    (&DWP(0,"edx"),"eax");
96         &lea    ("edx",&DWP(4,"edx"));          # advance without touching flags
97         &sub    ("ecx",1);
98         &jnz    (&label("bswap_loop"));
99         &ret    ();
100 &function_end_B("padlock_key_bswap");
101
102 # This is heuristic key context tracing. At first one might
103 # believe that atomic swap instructions are required, but
104 # they are not actually necessary. The point is that if
105 # padlock_saved_context was changed by another thread
106 # after we've read it and before we compare it with ctx,
107 # our key *shall* be reloaded upon thread context switch
108 # and we are therefore set in either case...
109 &static_label("padlock_saved_context");
110
# void padlock_verify_context(ctx)
# Public wrapper around _padlock_verify_ctx: loads ctx from the first
# argument, puts the offset of padlock_saved_context relative to
# verify_pic_point into eax, saves eflags on the stack and calls the helper.
111 &function_begin_B("padlock_verify_context");
112         &mov    ($ctx,&wparam(0));
113         &lea    ("eax",&DWP("padlock_saved_context-".&label("verify_pic_point")));
114         &pushf  ();
115         &call   ("_padlock_verify_ctx");
116 &set_label("verify_pic_point");
117         &lea    ("esp",&DWP(4,"esp"));          # drop the saved eflags
118         &ret    ();
119 &function_end_B("padlock_verify_context");
120
# Internal helper with a private calling convention:
#   eax     - offset of padlock_saved_context relative to the caller's
#             return address (the label placed right after the call)
#   $ctx    - context pointer to compare and record
#   4(esp)  - eflags pushed by the caller immediately before the call
# If bit 30 of the saved eflags is set and $ctx differs from the recorded
# context, a pushf/popf pair is executed (the same sequence that
# padlock_reload_key consists of). In every case $ctx is then recorded in
# padlock_saved_context.
121 &function_begin_B("_padlock_verify_ctx");
122         &add    ("eax",&DWP(0,"esp"));          # &padlock_saved_context
123         &bt     (&DWP(4,"esp"),30);             # eflags
124         &jnc    (&label("verified"));
125         &cmp    ($ctx,&DWP(0,"eax"));
126         &je     (&label("verified"));
127         &pushf  ();
128         &popf   ();
129 &set_label("verified");
130         &mov    (&DWP(0,"eax"),$ctx);
131         &ret    ();
132 &function_end_B("_padlock_verify_ctx");
133
# void padlock_reload_key(void)
# Consists solely of a pushf/popf pair.
# NOTE(review): presumably rewriting eflags is what makes the unit reload its
# key material - confirm against the PadLock documentation.
134 &function_begin_B("padlock_reload_key");
135         &pushf  ();
136         &popf   ();
137         &ret    ();
138 &function_end_B("padlock_reload_key");
139
# void padlock_aes_block(out, inp, ctx)
# Runs "rep xcryptecb" with a count of one. The control word is taken from
# ctx+16 and the key from ctx+32. edi, esi and ebx are preserved for the
# caller; eax is not set.
140 &function_begin_B("padlock_aes_block");
141         &push   ("edi");
142         &push   ("esi");
143         &push   ("ebx");
144         &mov    ($out,&wparam(0));              # must be 16-byte aligned
145         &mov    ($inp,&wparam(1));              # must be 16-byte aligned
146         &mov    ($ctx,&wparam(2));
147         &mov    ($len,1);
148         &lea    ("ebx",&DWP(32,$ctx));          # key
149         &lea    ($ctx,&DWP(16,$ctx));           # control word
150         &data_byte(0xf3,0x0f,0xa7,0xc8);        # rep xcryptecb
151         &pop    ("ebx");
152         &pop    ("esi");
153         &pop    ("edi");
154         &ret    ();
155 &function_end_B("padlock_aes_block");
156
# generate_mode($mode,$opcode)
# Emits the routine padlock_${mode}_encrypt (C prototype below).
#   $mode   - routine name infix; "ctr16" selects the software counter path,
#             and "ecb"/"ctr16" skip the IV write-back after "rep xcrypt*".
#   $opcode - final byte of the 4-byte "rep xcrypt*" instruction.
# Generated routine: if ctx or len has any of its low four bits set it jumps
# straight to the epilogue without setting eax; otherwise it ends with eax=1.
# Data is processed in pieces of at most $PADLOCK_CHUNK bytes. When out is
# misaligned (always for ctr16, where it holds the counter blocks) a 16-byte
# aligned scratch buffer is carved from the stack and zeroed before return.
# An empty request (len==0) returns 1 without touching any data. A length
# that is an exact multiple of $PADLOCK_CHUNK starts with a full chunk rather
# than an empty one: the ctr16 prepare/xor loops test at the bottom, so an
# empty first chunk would still advance the counter and emit one block of
# unencrypted counter data.
157 sub generate_mode {
158 my ($mode,$opcode) = @_;
159 # int padlock_$mode_encrypt(void *out, const void *inp,
160 #               struct padlock_cipher_data *ctx, size_t len);
161 &function_begin("padlock_${mode}_encrypt");
162         &mov    ($out,&wparam(0));
163         &mov    ($inp,&wparam(1));
164         &mov    ($ctx,&wparam(2));
165         &mov    ($len,&wparam(3));
166         &test   ($ctx,15);
167         &jnz    (&label("${mode}_abort"));
168         &test   ($len,15);
169         &jnz    (&label("${mode}_abort"));
170         &lea    ("eax",&DWP("padlock_saved_context-".&label("${mode}_pic_point")));
171         &pushf  ();
172         &cld    ();
173         &call   ("_padlock_verify_ctx");
174 &set_label("${mode}_pic_point");
        &test   ($len,$len);            # empty input: report success, touch nothing
        &jz     (&label("${mode}_empty"));
175         &lea    ($ctx,&DWP(16,$ctx));   # control word
176         &xor    ("eax","eax");
177                                         if ($mode eq "ctr16") {
178         &movdqa ("xmm0",&QWP(-16,$ctx));# load iv
179                                         } else {
180         &xor    ("ebx","ebx");
181         &test   (&DWP(0,$ctx),1<<5);    # align bit in control word
182         &jnz    (&label("${mode}_aligned"));
183         &test   ($out,0x0f);
184         &setz   ("al");                 # !out_misaligned
185         &test   ($inp,0x0f);
186         &setz   ("bl");                 # !inp_misaligned
187         &test   ("eax","ebx");
188         &jnz    (&label("${mode}_aligned"));
189         &neg    ("eax");
190                                         }
191         &mov    ($chunk,$PADLOCK_CHUNK);
192         &not    ("eax");                # out_misaligned?-1:0
193         &lea    ("ebp",&DWP(-24,"esp"));
194         &cmp    ($len,$chunk);
195         &cmovc  ($chunk,$len);          # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
196         &and    ("eax",$chunk);         # out_misaligned?chunk:0
197         &mov    ($chunk,$len);
198         &neg    ("eax");
199         &and    ($chunk,$PADLOCK_CHUNK-1);      # chunk=len%PADLOCK_CHUNK
200         &lea    ("esp",&DWP(0,"eax","ebp"));    # alloca
        &mov    ("eax",$PADLOCK_CHUNK);         # eax is free again; flags still from the and above
        &cmovz  ($chunk,"eax");                 # chunk=chunk?:PADLOCK_CHUNK
201         &and    ("esp",-16);
202         &jmp    (&label("${mode}_loop"));
203
204 &set_label("${mode}_loop",16);
205         &mov    (&DWP(0,"ebp"),$out);           # save parameters
206         &mov    (&DWP(4,"ebp"),$inp);
207         &mov    (&DWP(8,"ebp"),$len);
208         &mov    ($len,$chunk);
209         &mov    (&DWP(12,"ebp"),$chunk);        # chunk
210                                                 if ($mode eq "ctr16") {
211         &pextrw ("ecx","xmm0",7);               # borrow $len
212         &mov    ($inp,1);
213         &xor    ($out,$out);
214         &xchg   ("ch","cl");
215 &set_label("${mode}_prepare");
216         &movdqa (&QWP(0,"esp",$out),"xmm0");
217         &lea    ("eax",&DWP(0,"ecx",$inp));
218         &xchg   ("ah","al");
219         &lea    ($out,&DWP(16,$out));
220         &pinsrw ("xmm0","eax",7);
221         &lea    ($inp,&DWP(1,$inp));
222         &cmp    ($out,$chunk);
223         &jb     (&label("${mode}_prepare"));
224
225         &lea    ($inp,&DWP(0,"esp"));
226         &lea    ($out,&DWP(0,"esp"));
227         &mov    ($len,$chunk);
228                                                 } else {
229         &test   ($out,0x0f);                    # out_misaligned
230         &cmovnz ($out,"esp");
231         &test   ($inp,0x0f);                    # inp_misaligned
232         &jz     (&label("${mode}_inp_aligned"));
233         &shr    ($len,2);
234         &data_byte(0xf3,0xa5);                  # rep movsl
235         &sub    ($out,$chunk);
236         &mov    ($len,$chunk);
237         &mov    ($inp,$out);
238 &set_label("${mode}_inp_aligned");
239                                                 }
240         &lea    ("eax",&DWP(-16,$ctx));         # ivp
241         &lea    ("ebx",&DWP(16,$ctx));          # key
242         &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
243         &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
244                                                 if ($mode !~ /ecb|ctr/) {
245         &movdqa ("xmm0",&QWP(0,"eax"));
246         &movdqa (&DWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
247                                                 }
248         &mov    ($out,&DWP(0,"ebp"));           # restore parameters
249         &mov    ($chunk,&DWP(12,"ebp"));
250                                                 if ($mode eq "ctr16") {
251         &mov    ($inp,&DWP(4,"ebp"));
252         &xor    ($len,$len);
253 &set_label("${mode}_xor");
254         &movdqu ("xmm1",&QWP(0,$inp,$len));
255         &lea    ($len,&DWP(16,$len));
256         &pxor   ("xmm1",&QWP(-16,"esp",$len));
257         &movdqu (&QWP(-16,$out,$len),"xmm1");
258         &cmp    ($len,$chunk);
259         &jb     (&label("${mode}_xor"));
260                                                 } else {
261         &test   ($out,0x0f);
262         &jz     (&label("${mode}_out_aligned"));
263         &mov    ($len,$chunk);
264         &shr    ($len,2);
265         &lea    ($inp,&DWP(0,"esp"));
266         &data_byte(0xf3,0xa5);                  # rep movsl
267         &sub    ($out,$chunk);
268 &set_label("${mode}_out_aligned");
269         &mov    ($inp,&DWP(4,"ebp"));
270                                                 }
271         &mov    ($len,&DWP(8,"ebp"));
272         &add    ($out,$chunk);
273         &add    ($inp,$chunk);
274         &sub    ($len,$chunk);
275         &mov    ($chunk,$PADLOCK_CHUNK);
276         &jnz    (&label("${mode}_loop"));
277                                                 if ($mode eq "ctr16") {
278         &movdqa (&QWP(-16,$ctx),"xmm0");        # write out iv
279         &pxor   ("xmm0","xmm0");
280         &pxor   ("xmm1","xmm1");
281                                                 } else {
282         &test   ($out,0x0f);                    # out_misaligned
283         &jz     (&label("${mode}_done"));
284                                                 }
285         &mov    ($len,"ebp");
286         &mov    ($out,"esp");
287         &sub    ($len,"esp");
288         &xor    ("eax","eax");
289         &shr    ($len,2);
290         &data_byte(0xf3,0xab);                  # rep stosl
291 &set_label("${mode}_done");
292         &lea    ("esp",&DWP(24,"ebp"));
293                                                 if ($mode ne "ctr16") {
294         &jmp    (&label("${mode}_exit"));
295
296 &set_label("${mode}_aligned",16);
297         &lea    ("eax",&DWP(-16,$ctx));         # ivp
298         &lea    ("ebx",&DWP(16,$ctx));          # key
299         &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
300         &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
301                                                 if ($mode ne "ecb") {
302         &movdqa ("xmm0",&QWP(0,"eax"));
303         &movdqa (&DWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
304                                                 }
305 &set_label("${mode}_exit");                     }
&set_label("${mode}_empty");                    # len==0 lands here, saved eflags still on the stack
306         &mov    ("eax",1);
307         &lea    ("esp",&DWP(4,"esp"));          # popf
308 &set_label("${mode}_abort");
309 &function_end("padlock_${mode}_encrypt");
310 }
311
# Instantiate one padlock_<mode>_encrypt routine per mode; the second
# argument is the final opcode byte of the "rep xcrypt*" instruction used.
312 &generate_mode("ecb",0xc8);
313 &generate_mode("cbc",0xd0);
314 &generate_mode("cfb",0xe0);
315 &generate_mode("ofb",0xe8);
316 &generate_mode("ctr16",0xc8);   # yes, it implements own ctr with ecb opcode,
317                                 # because hardware ctr was introduced later
318                                 # and even has errata on certain CPU stepping.
319                                 # own implementation *always* works...
320
# padlock_xstore(arg0, arg1)
# Executes the xstore instruction with edi loaded from the first argument
# and edx from the second. edi is preserved for the caller; eax is left as
# the instruction leaves it.
321 &function_begin_B("padlock_xstore");
322         &push   ("edi");
323         &mov    ("edi",&wparam(0));
324         &mov    ("edx",&wparam(1));
325         &data_byte(0x0f,0xa7,0xc0);             # xstore
326         &pop    ("edi");
327         &ret    ();
328 &function_end_B("padlock_xstore");
329
# Exception handler installed by the one-shot SHA routines on win32/coff
# builds. Returns 1 in eax unless the dword at the first argument equals
# 0xC0000005; in that case it adds 4 (the length of the "rep xsha*"
# encoding emitted by those routines) to the dword at offset 184 of the
# third argument and returns 0.
330 &function_begin_B("_win32_segv_handler");
331         &mov    ("eax",1);                      # ExceptionContinueSearch
332         &mov    ("edx",&wparam(0));             # *ExceptionRecord
333         &mov    ("ecx",&wparam(2));             # *ContextRecord
334         &cmp    (&DWP(0,"edx"),0xC0000005);     # ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
335         &jne    (&label("ret"));
336         &add    (&DWP(184,"ecx"),4);            # skip over rep sha*
337         &mov    ("eax",0);                      # ExceptionContinueExecution
338 &set_label("ret");
339         &ret    ();
340 &function_end_B("_win32_segv_handler");
341 &safeseh("_win32_segv_handler")                 if ($::win32);
342
# padlock_sha1_oneshot(arg0, arg1, arg2)
# Executes "rep xsha1" with eax=0, edi/esi/ecx loaded from the three
# arguments. On win32/coff builds an exception registration record pointing
# at _win32_segv_handler is linked in at %fs:0 around the instruction and
# removed afterwards. edi and esi are preserved for the caller.
343 &function_begin_B("padlock_sha1_oneshot");
344         &push   ("edi");
345         &push   ("esi");
346         &xor    ("eax","eax");
# Fetch the arguments before the exception frame is built: part of that
# frame is pushed with raw data_byte opcodes, which move esp without
# &wparam() being able to account for it.
347         &mov    ("edi",&wparam(0));
348         &mov    ("esi",&wparam(1));
349         &mov    ("ecx",&wparam(2));
350     if ($::win32 or $::coff) {
351         &push   (&::islabel("_win32_segv_handler"));
352         &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
353         &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
354     }
355         &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
356     if ($::win32 or $::coff) {
357         &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
358         &lea    ("esp",&DWP(4,"esp"));          # drop the handler address
359     }
360         &pop    ("esi");
361         &pop    ("edi");
362         &ret    ();
363 &function_end_B("padlock_sha1_oneshot");
364
# padlock_sha1(arg0, arg1, arg2)
# Executes "rep xsha1" with eax preset to -1 (the one-shot variant uses 0)
# and edi/esi/ecx loaded from the three arguments. No exception frame is
# installed. edi and esi are preserved for the caller.
365 &function_begin_B("padlock_sha1");
366         &push   ("edi");
367         &push   ("esi");
368         &mov    ("eax",-1);
369         &mov    ("edi",&wparam(0));
370         &mov    ("esi",&wparam(1));
371         &mov    ("ecx",&wparam(2));
372         &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
373         &pop    ("esi");
374         &pop    ("edi");
375         &ret    ();
376 &function_end_B("padlock_sha1");
377
# padlock_sha256_oneshot(arg0, arg1, arg2)
# Executes "rep xsha256" with eax=0, edi/esi/ecx loaded from the three
# arguments. On win32/coff builds an exception registration record pointing
# at _win32_segv_handler is linked in at %fs:0 around the instruction and
# removed afterwards. edi and esi are preserved for the caller.
378 &function_begin_B("padlock_sha256_oneshot");
379         &push   ("edi");
380         &push   ("esi");
381         &xor    ("eax","eax");
# Fetch the arguments before the exception frame is built: part of that
# frame is pushed with raw data_byte opcodes, which move esp without
# &wparam() being able to account for it.
382         &mov    ("edi",&wparam(0));
383         &mov    ("esi",&wparam(1));
384         &mov    ("ecx",&wparam(2));
385     if ($::win32 or $::coff) {
386         &push   (&::islabel("_win32_segv_handler"));
387         &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
388         &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
389     }
390         &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
391     if ($::win32 or $::coff) {
392         &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
393         &lea    ("esp",&DWP(4,"esp"));          # drop the handler address
394     }
395         &pop    ("esi");
396         &pop    ("edi");
397         &ret    ();
398 &function_end_B("padlock_sha256_oneshot");
399
# padlock_sha256(arg0, arg1, arg2)
# Executes "rep xsha256" with eax preset to -1 (the one-shot variant uses 0)
# and edi/esi/ecx loaded from the three arguments. No exception frame is
# installed. edi and esi are preserved for the caller.
400 &function_begin_B("padlock_sha256");
401         &push   ("edi");
402         &push   ("esi");
403         &mov    ("eax",-1);
404         &mov    ("edi",&wparam(0));
405         &mov    ("esi",&wparam(1));
406         &mov    ("ecx",&wparam(2));
407         &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
408         &pop    ("esi");
409         &pop    ("edi");
410         &ret    ();
411 &function_end_B("padlock_sha256");
412
# Identification string embedded in the generated module, padded to a
# 16-byte boundary.
413 &asciz  ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
414 &align  (16);
415
416 &dataseg();
417 # Essentially this variable belongs in thread local storage.
418 # Having this variable global on the other hand can only cause
419 # few bogus key reloads [if any at all on single-CPU system],
420 # so we accept the penalty...
421 &set_label("padlock_saved_context",4);
422 &data_word(0);
423
424 &asm_finish();
423
424 &asm_finish();