e_padlock-x86[_64].pl: SHA fixes, comply with specification and fix bug.
[openssl.git] / engines / asm / e_padlock-x86.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # September 2011
11 #
12 # Assembler helpers for Padlock engine. Compared to original engine
13 # version relying on inline assembler and compiled with gcc 3.4.6 it
14 # was measured to provide ~100% improvement on misaligned data in ECB
15 # mode and ~75% in CBC mode. For aligned data improvement can be
16 # observed for short inputs only, e.g. 45% for 64-byte messages in
17 # ECB mode, 20% in CBC. Difference in performance for aligned vs.
18 # misaligned data depends on misalignment and is either ~1.8x or 2.9x.
19 # These are approximately same factors as for hardware support, so
20 # there is little reason to rely on the latter. On the contrary, it
21 # might actually hurt performance in mixture of aligned and misaligned
22 # buffers, because a) if you choose to flip 'align' flag in control
23 # word on per-buffer basis, then you'd have to reload key context,
24 # which incurs penalty; b) if you choose to set 'align' flag
25 # permanently, it limits performance even for aligned data to ~1/2.
26 # All above mentioned results were collected on 1.5GHz C7. Nano on the
27 # other hand handles unaligned data more gracefully. Depending on
28 # algorithm and how unaligned data is, hardware can be up to 70% more
29 # efficient than below software alignment procedures, nor does 'align'
30 # flag have an effect on aligned performance [if it has any meaning at all].
31 # Therefore suggestion is to unconditionally set 'align' flag on Nano
32 # for optimal performance.
33
# Split the directory part off the script path ($0) and add it, together
# with the perlasm directory two levels up, to @INC so that x86asm.pl
# can be loaded with a plain require.
34 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35 push(@INC,"${dir}","${dir}../../crypto/perlasm");
36 require "x86asm.pl";
37
# The first command-line argument and the script name are passed on to
# asm_init (provided by x86asm.pl).
38 &asm_init($ARGV[0],$0);
39
# Upper bound on the number of bytes generate_mode()'s loop handles per
# iteration; also used there as a mask ($PADLOCK_CHUNK-1).
40 $PADLOCK_CHUNK=512;     # Must be a power of 2 larger than 16
41
# Symbolic names for the registers used by the routines below.
42 $ctx="edx";
43 $out="edi";
44 $inp="esi";
45 $len="ecx";
46 $chunk="ebx";
47
# padlock_capability: probe the CPU and report the PadLock feature word.
# Takes no stack arguments. Result is left in eax:
#   0        - the EFLAGS bit 21 toggle test fails (no CPUID), the CPUID
#              vendor string is not "CentaurHauls", or the highest
#              extended leaf reported by 0xC0000000 is below 0xC0000001;
#   non-zero - edx as returned by CPUID leaf 0xC0000001, with bit 4
#              replaced by the result of the "Nano" signature check.
# ebx is preserved for the caller; ecx and edx are clobbered.
48 &function_begin_B("padlock_capability");
49         &push   ("ebx");                # cpuid overwrites ebx, keep caller's copy
# Try to flip bit 21 of EFLAGS; if the bit does not stick, bail out.
50         &pushf  ();
51         &pop    ("eax");
52         &mov    ("ecx","eax");
53         &xor    ("eax",1<<21);
54         &push   ("eax");
55         &popf   ();
56         &pushf  ();
57         &pop    ("eax");
58         &xor    ("ecx","eax");          # ecx = bits that actually changed
59         &xor    ("eax","eax");          # leaf 0, and return value on failure
60         &bt     ("ecx",21);
61         &jnc    (&label("noluck"));
62         &cpuid  ();
63         &xor    ("eax","eax");
# The three constants are the vendor string "CentaurHauls" as the
# little-endian dwords cpuid leaves in ebx, edx and ecx.
64         &cmp    ("ebx","0x".unpack("H*",'tneC'));
65         &jne    (&label("noluck"));
66         &cmp    ("edx","0x".unpack("H*",'Hrua'));
67         &jne    (&label("noluck"));
68         &cmp    ("ecx","0x".unpack("H*",'slua'));
69         &jne    (&label("noluck"));
70         &mov    ("eax",0xC0000000);
71         &cpuid  ();
72         &mov    ("edx","eax");          # highest supported 0xC000xxxx leaf
73         &xor    ("eax","eax");
74         &cmp    ("edx",0xC0000001);
75         &jb     (&label("noluck"));
76         &mov    ("eax",1);
77         &cpuid  ();
78         &or     ("eax",0x0f);           # ignore the low four bits of the signature
79         &xor    ("ebx","ebx");
80         &and    ("eax",0x0fff);
81         &cmp    ("eax",0x06ff);         # check for Nano
82         &sete   ("bl");
83         &mov    ("eax",0xC0000001);
84         &push   ("ebx");                # keep the Nano flag across cpuid
85         &cpuid  ();
86         &pop    ("ebx");
87         &mov    ("eax","edx");
88         &shl    ("ebx",4);              # bit#4 denotes Nano
89         &and    ("eax",0xffffffef);
# The statement terminators on the next line and on function_end_B were
# missing; without them Perl parsed "&or(...) &set_label(...)" as one
# bitwise-and expression and only produced the right output by accident.
90         &or     ("eax","ebx");
91 &set_label("noluck");
92         &pop    ("ebx");
93         &ret    ();
94 &function_end_B("padlock_capability");
95
# padlock_key_bswap: byte-swap consecutive 32-bit words in place.
# Stack argument 0 is a pointer to the data; the dword stored at offset
# 240 from that pointer is used as the number of words to swap, starting
# at offset 0. Clobbers eax, ecx and edx.
# NOTE(review): the count is only tested after the first swap, so a
# count of zero would wrap around rather than skip the loop -- confirm
# callers never pass one.
96 &function_begin_B("padlock_key_bswap");
97         &mov    ("edx",&wparam(0));
98         &mov    ("ecx",&DWP(240,"edx"));        # word count
99 &set_label("bswap_loop");
100         &mov    ("eax",&DWP(0,"edx"));
101         &bswap  ("eax");
102         &mov    (&DWP(0,"edx"),"eax");
103         &lea    ("edx",&DWP(4,"edx"));          # advance without touching flags
104         &sub    ("ecx",1);
105         &jnz    (&label("bswap_loop"));
106         &ret    ();
107 &function_end_B("padlock_key_bswap");
108
109 # This is heuristic key context tracing. At first one
110 # believes that one should use atomic swap instructions,
111 # but it's not actually necessary. Point is that if
112 # padlock_saved_context was changed by another thread
113 # after we've read it and before we compare it with ctx,
114 # our key *shall* be reloaded upon thread context switch
115 # and we are therefore set in either case...
116 &static_label("padlock_saved_context");
117
# padlock_verify_context: public wrapper around _padlock_verify_ctx.
# Stack argument 0 (the context pointer) is loaded into $ctx. eax gets
# the address of padlock_saved_context on win32/coff, otherwise its
# offset relative to verify_pic_point, which the callee turns into an
# address by adding its own return address. A copy of EFLAGS is pushed
# for the callee to inspect and discarded afterwards with lea, i.e.
# without executing popf.
118 &function_begin_B("padlock_verify_context");
119         &mov    ($ctx,&wparam(0));
120         &lea    ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
121                        &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
122         &pushf  ();
123         &call   ("_padlock_verify_ctx");
124 &set_label("verify_pic_point");
125         &lea    ("esp",&DWP(4,"esp"));          # drop the pushed EFLAGS copy
126         &ret    ();
127 &function_end_B("padlock_verify_context");
128
# _padlock_verify_ctx: internal helper with a register-based interface.
# On entry:
#   $ctx     - context pointer to compare and record
#   eax      - address of padlock_saved_context (win32/coff), otherwise
#              its offset from the caller's *_pic_point label
#   0(%esp)  - return address, which is that *_pic_point label
#   4(%esp)  - EFLAGS copy pushed by the caller
# If bit 30 of the EFLAGS copy is set and $ctx differs from the recorded
# context, a pushf/popf pair is executed (the same sequence as
# padlock_reload_key). In every case $ctx is then stored as the new
# recorded context.
129 &function_begin_B("_padlock_verify_ctx");
130         &add    ("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));# &padlock_saved_context
131         &bt     (&DWP(4,"esp"),30);             # eflags
132         &jnc    (&label("verified"));
133         &cmp    ($ctx,&DWP(0,"eax"));
134         &je     (&label("verified"));
135         &pushf  ();
136         &popf   ();
137 &set_label("verified");
138         &mov    (&DWP(0,"eax"),$ctx);
139         &ret    ();
140 &function_end_B("_padlock_verify_ctx");
141
# padlock_reload_key: takes no arguments and executes pushf followed by
# popf, i.e. rewrites EFLAGS with its own value.
# NOTE(review): presumably this EFLAGS write is what makes the PadLock
# unit reload its key on the next xcrypt -- confirm against the VIA
# programming guide.
142 &function_begin_B("padlock_reload_key");
143         &pushf  ();
144         &popf   ();
145         &ret    ();
146 &function_end_B("padlock_reload_key");
147
# padlock_aes_block: run "rep xcryptecb" over exactly one block.
# Stack arguments: 0 = out, 1 = inp, 2 = ctx. The control word is taken
# from ctx+16 and the key from ctx+32. edi, esi and ebx are saved and
# restored around the operation.
148 &function_begin_B("padlock_aes_block");
149         &push   ("edi");
150         &push   ("esi");
151         &push   ("ebx");
152         &mov    ($out,&wparam(0));              # must be 16-byte aligned
153         &mov    ($inp,&wparam(1));              # must be 16-byte aligned
154         &mov    ($ctx,&wparam(2));
155         &mov    ($len,1);
156         &lea    ("ebx",&DWP(32,$ctx));          # key
157         &lea    ($ctx,&DWP(16,$ctx));           # control word
158         &data_byte(0xf3,0x0f,0xa7,0xc8);        # rep xcryptecb
159         &pop    ("ebx");
160         &pop    ("esi");
161         &pop    ("edi");
162         &ret    ();
163 &function_end_B("padlock_aes_block");
164
# generate_mode($mode,$opcode): emit padlock_${mode}_encrypt.
#   $mode   - name used in the function name and in all local labels
#   $opcode - last byte of the "rep xcrypt*" instruction that is emitted
#
# The generated routine returns 0 in eax when ctx is not 16-byte aligned
# or len is not a multiple of 16, and 1 otherwise (a zero len is a no-op
# that returns 1).
#
# For every mode except "ctr32": if the align bit in the control word is
# set, or both buffers are 16-byte aligned, the whole request is handed
# to a single rep xcrypt*. Otherwise data is processed in chunks of at
# most $PADLOCK_CHUNK bytes, using a 16-byte aligned scratch area on the
# stack whenever the output is misaligned; that area is zeroed before
# returning.
# For "ctr32": counter blocks are built in the stack scratch area,
# encrypted in place with the given opcode and xor-ed with the input.
#
# Fixes relative to the previous revision:
#   * eax is cleared before the alignment checks, so the abort path
#     returns 0 instead of whatever eax happened to contain;
#   * len==0 returns early, before any stack frame is set up;
#   * when len is a multiple of $PADLOCK_CHUNK the first chunk is now
#     $PADLOCK_CHUNK rather than 0. A zero chunk made the ctr32
#     prepare/xor loops (which test their bound only after the first
#     pass) advance the counter and write one bogus block.
165 sub generate_mode {
166 my ($mode,$opcode) = @_;
167 # int padlock_$mode_encrypt(void *out, const void *inp,
168 #               struct padlock_cipher_data *ctx, size_t len);
169 &function_begin("padlock_${mode}_encrypt");
170         &mov    ($out,&wparam(0));
171         &mov    ($inp,&wparam(1));
172         &mov    ($ctx,&wparam(2));
173         &mov    ($len,&wparam(3));
            &xor    ("eax","eax");          # return value for the abort path
174         &test   ($ctx,15);
175         &jnz    (&label("${mode}_abort"));
176         &test   ($len,15);
177         &jnz    (&label("${mode}_abort"));
            &mov    ("eax",1);              # empty input: nothing to do, report success
            &test   ($len,$len);
            &jz     (&label("${mode}_abort"));
178         &lea    ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
179                        &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
180         &pushf  ();
181         &cld    ();
182         &call   ("_padlock_verify_ctx");
183 &set_label("${mode}_pic_point");
184         &lea    ($ctx,&DWP(16,$ctx));   # control word
185         &xor    ("eax","eax");
186                                         if ($mode eq "ctr32") {
187         &movq   ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
188                                         } else {
189         &xor    ("ebx","ebx");
190         &test   (&DWP(0,$ctx),1<<5);    # align bit in control word
191         &jnz    (&label("${mode}_aligned"));
192         &test   ($out,0x0f);
193         &setz   ("al");                 # !out_misaligned
194         &test   ($inp,0x0f);
195         &setz   ("bl");                 # !inp_misaligned
196         &test   ("eax","ebx");
197         &jnz    (&label("${mode}_aligned"));
198         &neg    ("eax");
199                                         }
200         &mov    ($chunk,$PADLOCK_CHUNK);
201         &not    ("eax");                # out_misaligned?-1:0
202         &lea    ("ebp",&DWP(-24,"esp"));
203         &cmp    ($len,$chunk);
204         &cmovc  ($chunk,$len);          # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
205         &and    ("eax",$chunk);         # out_misaligned?chunk:0
206         &mov    ($chunk,$len);
207         &neg    ("eax");
208         &and    ($chunk,$PADLOCK_CHUNK-1);      # chunk=len%PADLOCK_CHUNK
209         &lea    ("esp",&DWP(0,"eax","ebp"));    # alloca
# ZF still reflects the "and" above (lea and mov leave flags alone):
# never enter the loop with an empty first chunk.
            &mov    ("eax",$PADLOCK_CHUNK);
            &cmovz  ($chunk,"eax");                 # chunk=chunk?:PADLOCK_CHUNK
210         &and    ("esp",-16);
211         &jmp    (&label("${mode}_loop"));
212
213 &set_label("${mode}_loop",16);
214         &mov    (&DWP(0,"ebp"),$out);           # save parameters
215         &mov    (&DWP(4,"ebp"),$inp);
216         &mov    (&DWP(8,"ebp"),$len);
217         &mov    ($len,$chunk);
218         &mov    (&DWP(12,"ebp"),$chunk);        # chunk
219                                                 if ($mode eq "ctr32") {
220         &mov    ("ecx",&DWP(-4,$ctx));
221         &xor    ($out,$out);
222         &mov    ("eax",&DWP(-8,$ctx));          # borrow $len
223 &set_label("${mode}_prepare");
224         &mov    (&DWP(12,"esp",$out),"ecx");
225         &bswap  ("ecx");
226         &movq   (&QWP(0,"esp",$out),"mm0");
227         &inc    ("ecx");
228         &mov    (&DWP(8,"esp",$out),"eax");
229         &bswap  ("ecx");
230         &lea    ($out,&DWP(16,$out));
231         &cmp    ($out,$chunk);
232         &jb     (&label("${mode}_prepare"));
233
234         &mov    (&DWP(-4,$ctx),"ecx");
235         &lea    ($inp,&DWP(0,"esp"));
236         &lea    ($out,&DWP(0,"esp"));
237         &mov    ($len,$chunk);
238                                                 } else {
239         &test   ($out,0x0f);                    # out_misaligned
240         &cmovnz ($out,"esp");
241         &test   ($inp,0x0f);                    # inp_misaligned
242         &jz     (&label("${mode}_inp_aligned"));
243         &shr    ($len,2);
244         &data_byte(0xf3,0xa5);                  # rep movsl
245         &sub    ($out,$chunk);
246         &mov    ($len,$chunk);
247         &mov    ($inp,$out);
248 &set_label("${mode}_inp_aligned");
249                                                 }
250         &lea    ("eax",&DWP(-16,$ctx));         # ivp
251         &lea    ("ebx",&DWP(16,$ctx));          # key
252         &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
253         &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
254                                                 if ($mode !~ /ecb|ctr/) {
255         &movaps ("xmm0",&QWP(0,"eax"));
256         &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
257                                                 }
258         &mov    ($out,&DWP(0,"ebp"));           # restore parameters
259         &mov    ($chunk,&DWP(12,"ebp"));
260                                                 if ($mode eq "ctr32") {
261         &mov    ($inp,&DWP(4,"ebp"));
262         &xor    ($len,$len);
263 &set_label("${mode}_xor");
264         &movups ("xmm1",&QWP(0,$inp,$len));
265         &lea    ($len,&DWP(16,$len));
266         &pxor   ("xmm1",&QWP(-16,"esp",$len));
267         &movups (&QWP(-16,$out,$len),"xmm1");
268         &cmp    ($len,$chunk);
269         &jb     (&label("${mode}_xor"));
270                                                 } else {
271         &test   ($out,0x0f);
272         &jz     (&label("${mode}_out_aligned"));
273         &mov    ($len,$chunk);
274         &shr    ($len,2);
275         &lea    ($inp,&DWP(0,"esp"));
276         &data_byte(0xf3,0xa5);                  # rep movsl
277         &sub    ($out,$chunk);
278 &set_label("${mode}_out_aligned");
279         &mov    ($inp,&DWP(4,"ebp"));
280                                                 }
281         &mov    ($len,&DWP(8,"ebp"));
282         &add    ($out,$chunk);
283         &add    ($inp,$chunk);
284         &sub    ($len,$chunk);
285         &mov    ($chunk,$PADLOCK_CHUNK);
286         &jnz    (&label("${mode}_loop"));
287                                                 if ($mode ne "ctr32") {
288         &test   ($out,0x0f);                    # out_misaligned
289         &jz     (&label("${mode}_done"));
290                                                 }
# Zero the stack scratch area between esp and ebp before releasing it.
291         &mov    ($len,"ebp");
292         &mov    ($out,"esp");
293         &sub    ($len,"esp");
294         &xor    ("eax","eax");
295         &shr    ($len,2);
296         &data_byte(0xf3,0xab);                  # rep stosl
297 &set_label("${mode}_done");
298         &lea    ("esp",&DWP(24,"ebp"));
299                                                 if ($mode ne "ctr32") {
300         &jmp    (&label("${mode}_exit"));
301
302 &set_label("${mode}_aligned",16);
303         &lea    ("eax",&DWP(-16,$ctx));         # ivp
304         &lea    ("ebx",&DWP(16,$ctx));          # key
305         &shr    ($len,4);                       # len/=AES_BLOCK_SIZE
306         &data_byte(0xf3,0x0f,0xa7,$opcode);     # rep xcrypt*
307                                                 if ($mode ne "ecb") {
308         &movaps ("xmm0",&QWP(0,"eax"));
309         &movaps (&QWP(-16,$ctx),"xmm0");        # copy [or refresh] iv
310                                                 }
311 &set_label("${mode}_exit");                     }
312         &mov    ("eax",1);
313         &lea    ("esp",&DWP(4,"esp"));          # popf
314         &emms   ()                              if ($mode eq "ctr32");
# Early exits land here with eax already holding the return value and
# nothing extra on the stack.
315 &set_label("${mode}_abort");
316 &function_end("padlock_${mode}_encrypt");
317 }
318
# Instantiate padlock_{ecb,cbc,cfb,ofb,ctr32}_encrypt. The second
# argument is the final opcode byte of the rep xcrypt* instruction.
319 &generate_mode("ecb",0xc8);
320 &generate_mode("cbc",0xd0);
321 &generate_mode("cfb",0xe0);
322 &generate_mode("ofb",0xe8);
323 &generate_mode("ctr32",0xc8);   # yes, it implements own CTR with ECB opcode,
324                                 # because hardware CTR was introduced later
325                                 # and even has errata on certain C7 stepping.
326                                 # own implementation *always* works, though
327                                 # ~15% slower than dedicated hardware...
328
# padlock_xstore: load stack argument 0 into edi and stack argument 1
# into edx, then execute the xstore instruction. edi is saved and
# restored; eax is returned as xstore leaves it.
# NOTE(review): presumably argument 0 is the output buffer and
# argument 1 the xstore control value -- confirm against the caller.
329 &function_begin_B("padlock_xstore");
330         &push   ("edi");
331         &mov    ("edi",&wparam(0));
332         &mov    ("edx",&wparam(1));
333         &data_byte(0x0f,0xa7,0xc0);             # xstore
334         &pop    ("edi");
335         &ret    ();
336 &function_end_B("padlock_xstore");
337
# _win32_segv_handler: exception handler installed by the *_oneshot SHA
# routines on win32/coff builds.
# Stack argument 0 is the exception record and stack argument 2 the
# context record. If the exception code is STATUS_ACCESS_VIOLATION the
# dword at offset 184 of the context record is advanced by 4 (the length
# of the rep xsha* instruction) and 0 is returned; for any other code
# the handler returns 1 and leaves the context untouched.
338 &function_begin_B("_win32_segv_handler");
339         &mov    ("eax",1);                      # ExceptionContinueSearch
340         &mov    ("edx",&wparam(0));             # *ExceptionRecord
341         &mov    ("ecx",&wparam(2));             # *ContextRecord
# The terminating ';' was missing here, which made Perl combine this
# call and the following &jne into a single bitwise-and expression.
342         &cmp    (&DWP(0,"edx"),0xC0000005);     # ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
343         &jne    (&label("ret"));
344         &add    (&DWP(184,"ecx"),4);            # skip over rep sha*
345         &mov    ("eax",0);                      # ExceptionContinueExecution
346 &set_label("ret");
347         &ret    ();
348 &function_end_B("_win32_segv_handler");
349 &safeseh("_win32_segv_handler")                 if ($::win32);
350
# padlock_sha1_oneshot: run "rep xsha1" with eax=0 over a buffer.
# Stack arguments: 0 = pointer to the 20-byte state, 1 = input pointer,
# 2 = count loaded into ecx for the instruction.
# The state is copied into a 16-byte aligned 128-byte scratch area on
# the stack, xsha1 operates on that copy, and the first 20 bytes are
# copied back to argument 0. On win32/coff builds _win32_segv_handler is
# installed through %fs:0 for the duration of the call.
# edi and esi are preserved.
#
# Fix: on win32/coff the handler address was pushed with &push but
# discarded with a bare lea, so the perlasm stack-depth bookkeeping
# stayed 4 bytes ahead and the final &wparam(0) addressed argument 1
# (the input) instead of argument 0. Discarding the slot with &pop keeps
# the bookkeeping and the real stack in step.
# NOTE(review): this relies on x86asm.pl adjusting the &wparam
# displacement on every &push/&pop, which the other uses of &wparam
# after pushes in this file already require.
351 &function_begin_B("padlock_sha1_oneshot");
352         &push   ("edi");
353         &push   ("esi");
354         &xor    ("eax","eax");
355         &mov    ("edi",&wparam(0));
356         &mov    ("esi",&wparam(1));
357         &mov    ("ecx",&wparam(2));
358     if ($::win32 or $::coff) {
359         &push   (&::islabel("_win32_segv_handler"));
360         &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
361         &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
362     }
363         &mov    ("edx","esp");                  # put aside %esp
364         &add    ("esp",-128);                   # 32 is enough but spec says 128
365         &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
366         &and    ("esp",-16);
367         &mov    ("eax",&DWP(16,"edi"));
368         &movaps (&QWP(0,"esp"),"xmm0");
369         &mov    ("edi","esp");
370         &mov    (&DWP(16,"esp"),"eax");
371         &xor    ("eax","eax");
372         &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
373         &movaps ("xmm0",&QWP(0,"esp"));
374         &mov    ("eax",&DWP(16,"esp"));
375         &mov    ("esp","edx");                  # restore %esp
376     if ($::win32 or $::coff) {
377         &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
378         &pop    ("ecx");                        # drop handler address, ecx is dead here
379     }
380         &mov    ("edi",&wparam(0));
381         &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
382         &mov    (&DWP(16,"edi"),"eax");
383         &pop    ("esi");
384         &pop    ("edi");
385         &ret    ();
386 &function_end_B("padlock_sha1_oneshot");
387
# padlock_sha1_blocks: run "rep xsha1" with eax=-1 over a buffer.
# Stack arguments: 0 = pointer to the 20-byte state, 1 = input pointer,
# 2 = count loaded into ecx for the instruction.
# The state is copied into a 16-byte aligned 128-byte scratch area on
# the stack, processed there and copied back to argument 0. edi and esi
# are preserved. Unlike padlock_sha1_oneshot, no exception handler is
# installed.
# NOTE(review): presumably eax=-1 selects plain block processing as
# opposed to eax=0 in the oneshot variant -- confirm against the VIA
# programming guide.
388 &function_begin_B("padlock_sha1_blocks");
389         &push   ("edi");
390         &push   ("esi");
391         &mov    ("edi",&wparam(0));
392         &mov    ("esi",&wparam(1));
393         &mov    ("edx","esp");                  # put aside %esp
394         &mov    ("ecx",&wparam(2));
395         &add    ("esp",-128);
396         &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
397         &and    ("esp",-16);
398         &mov    ("eax",&DWP(16,"edi"));
399         &movaps (&QWP(0,"esp"),"xmm0");
400         &mov    ("edi","esp");
401         &mov    (&DWP(16,"esp"),"eax");
402         &mov    ("eax",-1);
403         &data_byte(0xf3,0x0f,0xa6,0xc8);        # rep xsha1
404         &movaps ("xmm0",&QWP(0,"esp"));
405         &mov    ("eax",&DWP(16,"esp"));
406         &mov    ("esp","edx");                  # restore %esp
407         &mov    ("edi",&wparam(0));
408         &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
409         &mov    (&DWP(16,"edi"),"eax");
410         &pop    ("esi");
411         &pop    ("edi");
412         &ret    ();
413 &function_end_B("padlock_sha1_blocks");
414
# padlock_sha256_oneshot: run "rep xsha256" with eax=0 over a buffer.
# Stack arguments: 0 = pointer to the 32-byte state, 1 = input pointer,
# 2 = count loaded into ecx for the instruction.
# The state is copied into a 16-byte aligned 128-byte scratch area on
# the stack, xsha256 operates on that copy, and the 32 bytes are copied
# back to argument 0. On win32/coff builds _win32_segv_handler is
# installed through %fs:0 for the duration of the call.
# edi and esi are preserved.
#
# Fix: on win32/coff the handler address was pushed with &push but
# discarded with a bare lea, so the perlasm stack-depth bookkeeping
# stayed 4 bytes ahead and the final &wparam(0) addressed argument 1
# (the input) instead of argument 0. Discarding the slot with &pop keeps
# the bookkeeping and the real stack in step.
# NOTE(review): this relies on x86asm.pl adjusting the &wparam
# displacement on every &push/&pop, which the other uses of &wparam
# after pushes in this file already require.
415 &function_begin_B("padlock_sha256_oneshot");
416         &push   ("edi");
417         &push   ("esi");
418         &xor    ("eax","eax");
419         &mov    ("edi",&wparam(0));
420         &mov    ("esi",&wparam(1));
421         &mov    ("ecx",&wparam(2));
422     if ($::win32 or $::coff) {
423         &push   (&::islabel("_win32_segv_handler"));
424         &data_byte(0x64,0xff,0x30);             # push  %fs:(%eax)
425         &data_byte(0x64,0x89,0x20);             # mov   %esp,%fs:(%eax)
426     }
427         &mov    ("edx","esp");                  # put aside %esp
428         &add    ("esp",-128);
429         &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
430         &and    ("esp",-16);
431         &movups ("xmm1",&QWP(16,"edi"));
432         &movaps (&QWP(0,"esp"),"xmm0");
433         &mov    ("edi","esp");
434         &movaps (&QWP(16,"esp"),"xmm1");
435         &xor    ("eax","eax");
436         &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
437         &movaps ("xmm0",&QWP(0,"esp"));
438         &movaps ("xmm1",&QWP(16,"esp"));
439         &mov    ("esp","edx");                  # restore %esp
440     if ($::win32 or $::coff) {
441         &data_byte(0x64,0x8f,0x05,0,0,0,0);     # pop   %fs:0
442         &pop    ("ecx");                        # drop handler address, ecx is dead here
443     }
444         &mov    ("edi",&wparam(0));
445         &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
446         &movups (&QWP(16,"edi"),"xmm1");
447         &pop    ("esi");
448         &pop    ("edi");
449         &ret    ();
450 &function_end_B("padlock_sha256_oneshot");
451
# padlock_sha256_blocks: run "rep xsha256" with eax=-1 over a buffer.
# Stack arguments: 0 = pointer to the 32-byte state, 1 = input pointer,
# 2 = count loaded into ecx for the instruction.
# The state is copied into a 16-byte aligned 128-byte scratch area on
# the stack, processed there and copied back to argument 0. edi and esi
# are preserved. No exception handler is installed.
452 &function_begin_B("padlock_sha256_blocks");
453         &push   ("edi");
454         &push   ("esi");
455         &mov    ("edi",&wparam(0));
456         &mov    ("esi",&wparam(1));
457         &mov    ("ecx",&wparam(2));
458         &mov    ("edx","esp");                  # put aside %esp
459         &add    ("esp",-128);
460         &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
461         &and    ("esp",-16);
462         &movups ("xmm1",&QWP(16,"edi"));
463         &movaps (&QWP(0,"esp"),"xmm0");
464         &mov    ("edi","esp");
465         &movaps (&QWP(16,"esp"),"xmm1");
466         &mov    ("eax",-1);
467         &data_byte(0xf3,0x0f,0xa6,0xd0);        # rep xsha256
468         &movaps ("xmm0",&QWP(0,"esp"));
469         &movaps ("xmm1",&QWP(16,"esp"));
470         &mov    ("esp","edx");                  # restore %esp
471         &mov    ("edi",&wparam(0));
472         &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
473         &movups (&QWP(16,"edi"),"xmm1");
474         &pop    ("esi");
475         &pop    ("edi");
476         &ret    ();
477 &function_end_B("padlock_sha256_blocks");
478
# padlock_sha512_blocks: run "rep xsha512" over a buffer.
# Stack arguments: 0 = pointer to the 64-byte state, 1 = input pointer,
# 2 = count loaded into ecx for the instruction.
# The state is copied into a 16-byte aligned 128-byte scratch area on
# the stack, processed there and copied back to argument 0. edi and esi
# are preserved. Unlike the SHA-1/SHA-256 routines, eax is not loaded
# before the instruction is issued.
479 &function_begin_B("padlock_sha512_blocks");
480         &push   ("edi");
481         &push   ("esi");
482         &mov    ("edi",&wparam(0));
483         &mov    ("esi",&wparam(1));
484         &mov    ("ecx",&wparam(2));
485         &mov    ("edx","esp");                  # put aside %esp
486         &add    ("esp",-128);
487         &movups ("xmm0",&QWP(0,"edi"));         # copy-in context
488         &and    ("esp",-16);
489         &movups ("xmm1",&QWP(16,"edi"));
490         &movups ("xmm2",&QWP(32,"edi"));
491         &movups ("xmm3",&QWP(48,"edi"));
492         &movaps (&QWP(0,"esp"),"xmm0");
493         &mov    ("edi","esp");
494         &movaps (&QWP(16,"esp"),"xmm1");
495         &movaps (&QWP(32,"esp"),"xmm2");
496         &movaps (&QWP(48,"esp"),"xmm3");
497         &data_byte(0xf3,0x0f,0xa6,0xe0);        # rep xsha512
498         &movaps ("xmm0",&QWP(0,"esp"));
499         &movaps ("xmm1",&QWP(16,"esp"));
500         &movaps ("xmm2",&QWP(32,"esp"));
501         &movaps ("xmm3",&QWP(48,"esp"));
502         &mov    ("esp","edx");                  # restore %esp
503         &mov    ("edi",&wparam(0));
504         &movups (&QWP(0,"edi"),"xmm0");         # copy-out context
505         &movups (&QWP(16,"edi"),"xmm1");
506         &movups (&QWP(32,"edi"),"xmm2");
507         &movups (&QWP(48,"edi"),"xmm3");
508         &pop    ("esi");
509         &pop    ("edi");
510         &ret    ();
511 &function_end_B("padlock_sha512_blocks");
512
# Identification string, followed by alignment to a 16-byte boundary.
513 &asciz  ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
514 &align  (16);
515
516 &dataseg();
517 # Essentially this variable belongs in thread local storage.
518 # Having this variable global on the other hand can only cause
519 # few bogus key reloads [if any at all on single-CPU system],
520 # so we accept the penalty...
# One zero-initialised word: the context pointer last recorded by
# _padlock_verify_ctx.
521 &set_label("padlock_saved_context",4);
522 &data_word(0);
523
# Finish the assembler output started by asm_init.
524 &asm_finish();