2 # Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# NOTE(review): each line below carries a fused numeric prefix that looks
# like line numbers from a bad extraction of the upstream perlasm script;
# restore the clean upstream source before attempting to run this file.
22 # $output is the last argument if it looks like a file (it has an extension)
23 # $flavour is the first argument if it doesn't look like a file
# Pop the trailing "*.ext" argument as the output file, when present.
24 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
# Shift the leading flavour token (elf, macosx, mingw64, nasm, ...) when present.
25 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Select the Win64 calling convention when the flavour or output name asks for it.
27 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator: first next to this script,
# then in the crypto/perlasm directory of an OpenSSL source tree.
29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
30 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
31 ( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
32 die "can't locate x86_64-xlate.pl";
# Pipe everything printed to OUT through the translator into $output.
34 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
35 or die "can't call $xlate: $!";
# Per-mode prefetch distances working around PadLock prefetch errata.
40 %PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata
41 $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
# First four integer argument registers: Win64 vs. System V AMD64 order.
49 ($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
50 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
53 .globl padlock_capability
54 .type padlock_capability,\@abi-omnipotent
61 cmp \$`"0x".unpack("H*",'tneC')`,%ebx
63 cmp \$`"0x".unpack("H*",'Hrua')`,%edx
65 cmp \$`"0x".unpack("H*",'slua')`,%ecx
69 cmp \$`"0x".unpack("H*",'hS ')`,%ebx
71 cmp \$`"0x".unpack("H*",'hgna')`,%edx
73 cmp \$`"0x".unpack("H*",' ia')`,%ecx
86 or \$0x10,%eax # set Nano bit#4
90 .size padlock_capability,.-padlock_capability
92 .globl padlock_key_bswap
93 .type padlock_key_bswap,\@abi-omnipotent,0
107 .size padlock_key_bswap,.-padlock_key_bswap
109 .globl padlock_verify_context
110 .type padlock_verify_context,\@abi-omnipotent
112 padlock_verify_context:
115 lea .Lpadlock_saved_context(%rip),%rax
116 call _padlock_verify_ctx
119 .size padlock_verify_context,.-padlock_verify_context
121 .type _padlock_verify_ctx,\@abi-omnipotent
134 .size _padlock_verify_ctx,.-_padlock_verify_ctx
136 .globl padlock_reload_key
137 .type padlock_reload_key,\@abi-omnipotent
143 .size padlock_reload_key,.-padlock_reload_key
145 .globl padlock_aes_block
146 .type padlock_aes_block,\@function,3
151 lea 32($ctx),%rbx # key
152 lea 16($ctx),$ctx # control word
153 .byte 0xf3,0x0f,0xa7,0xc8 # rep xcryptecb
156 .size padlock_aes_block,.-padlock_aes_block
158 .globl padlock_xstore
159 .type padlock_xstore,\@function,2
163 .byte 0x0f,0xa7,0xc0 # xstore
165 .size padlock_xstore,.-padlock_xstore
167 .globl padlock_sha1_oneshot
168 .type padlock_sha1_oneshot,\@function,3
170 padlock_sha1_oneshot:
172 mov %rdi,%rdx # put aside %rdi
173 movups (%rdi),%xmm0 # copy-in context
180 .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
184 movups %xmm0,(%rdx) # copy-out context
187 .size padlock_sha1_oneshot,.-padlock_sha1_oneshot
189 .globl padlock_sha1_blocks
190 .type padlock_sha1_blocks,\@function,3
194 mov %rdi,%rdx # put aside %rdi
195 movups (%rdi),%xmm0 # copy-in context
202 .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
206 movups %xmm0,(%rdx) # copy-out context
209 .size padlock_sha1_blocks,.-padlock_sha1_blocks
211 .globl padlock_sha256_oneshot
212 .type padlock_sha256_oneshot,\@function,3
214 padlock_sha256_oneshot:
216 mov %rdi,%rdx # put aside %rdi
217 movups (%rdi),%xmm0 # copy-in context
219 movups 16(%rdi),%xmm1
222 movaps %xmm1,16(%rsp)
224 .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
226 movaps 16(%rsp),%xmm1
228 movups %xmm0,(%rdx) # copy-out context
229 movups %xmm1,16(%rdx)
231 .size padlock_sha256_oneshot,.-padlock_sha256_oneshot
233 .globl padlock_sha256_blocks
234 .type padlock_sha256_blocks,\@function,3
236 padlock_sha256_blocks:
238 mov %rdi,%rdx # put aside %rdi
239 movups (%rdi),%xmm0 # copy-in context
241 movups 16(%rdi),%xmm1
244 movaps %xmm1,16(%rsp)
246 .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
248 movaps 16(%rsp),%xmm1
250 movups %xmm0,(%rdx) # copy-out context
251 movups %xmm1,16(%rdx)
253 .size padlock_sha256_blocks,.-padlock_sha256_blocks
255 .globl padlock_sha512_blocks
256 .type padlock_sha512_blocks,\@function,3
258 padlock_sha512_blocks:
260 mov %rdi,%rdx # put aside %rdi
261 movups (%rdi),%xmm0 # copy-in context
263 movups 16(%rdi),%xmm1
264 movups 32(%rdi),%xmm2
265 movups 48(%rdi),%xmm3
268 movaps %xmm1,16(%rsp)
269 movaps %xmm2,32(%rsp)
270 movaps %xmm3,48(%rsp)
271 .byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512
273 movaps 16(%rsp),%xmm1
274 movaps 32(%rsp),%xmm2
275 movaps 48(%rsp),%xmm3
277 movups %xmm0,(%rdx) # copy-out context
278 movups %xmm1,16(%rdx)
279 movups %xmm2,32(%rdx)
280 movups %xmm3,48(%rdx)
282 .size padlock_sha512_blocks,.-padlock_sha512_blocks
286 my ($mode,$opcode) = @_;
287 # int padlock_$mode_encrypt(void *out, const void *inp,
288 # struct padlock_cipher_data *ctx, size_t len);
290 .globl padlock_${mode}_encrypt
291 .type padlock_${mode}_encrypt,\@function,4
293 padlock_${mode}_encrypt:
302 lea .Lpadlock_saved_context(%rip),%rax
305 call _padlock_verify_ctx
306 lea 16($ctx),$ctx # control word
309 testl \$`1<<5`,($ctx) # align bit in control word
310 jnz .L${mode}_aligned
312 setz %al # !out_misaligned
314 setz %bl # !inp_misaligned
316 jnz .L${mode}_aligned
318 mov \$$PADLOCK_CHUNK,$chunk
319 not %rax # out_misaligned?-1:0
322 cmovc $len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
323 and $chunk,%rax # out_misaligned?chunk:0
326 and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
328 mov \$$PADLOCK_CHUNK,%rax
329 cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK
331 $code.=<<___ if ($mode eq "ctr32");
333 mov -4($ctx),%eax # pull 32-bit counter
336 and \$`$PADLOCK_CHUNK/16-1`,%eax
337 mov \$$PADLOCK_CHUNK,$chunk
341 cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
344 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
347 mov $inp,%rax # check if prefetch crosses page
352 and \$0xfff,%rax # distance to page boundary
353 cmp \$$PADLOCK_PREFETCH{$mode},%rax
354 mov \$-$PADLOCK_PREFETCH{$mode},%rax
355 cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1
357 jz .L${mode}_unaligned_tail
363 cmp $len,$chunk # ctr32 artefact
364 cmova $len,$chunk # ctr32 artefact
365 mov $out,%r8 # save parameters
370 test \$0x0f,$out # out_misaligned
372 test \$0x0f,$inp # inp_misaligned
373 jz .L${mode}_inp_aligned
375 .byte 0xf3,0x48,0xa5 # rep movsq
379 .L${mode}_inp_aligned:
380 lea -16($ctx),%rax # ivp
381 lea 16($ctx),%rbx # key
383 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
385 $code.=<<___ if ($mode !~ /ecb|ctr/);
387 movdqa %xmm0,-16($ctx) # copy [or refresh] iv
389 $code.=<<___ if ($mode eq "ctr32");
390 mov -4($ctx),%eax # pull 32-bit counter
391 test \$0xffff0000,%eax
392 jnz .L${mode}_no_carry
400 mov %r8,$out # restore parameters
403 jz .L${mode}_out_aligned
407 .byte 0xf3,0x48,0xa5 # rep movsq
409 .L${mode}_out_aligned:
415 mov \$$PADLOCK_CHUNK,$chunk
417 if (!$PADLOCK_PREFETCH{$mode}) {
427 $code.=<<___ if ($mode eq "ctr32");
429 mov $inp,%rax # check if prefetch crosses page
434 and \$0xfff,%rax # distance to page boundary
435 cmp \$$PADLOCK_PREFETCH{$mode},%rax
436 mov \$-$PADLOCK_PREFETCH{$mode},%rax
442 .L${mode}_unaligned_tail:
446 mov $out,%r8 # save parameters
448 sub %rax,%rsp # alloca
451 .byte 0xf3,0x48,0xa5 # rep movsq
453 mov %r8, $out # restore parameters
479 $code.=<<___ if ($mode eq "ctr32");
480 mov -4($ctx),%eax # pull 32-bit counter
484 mov \$`16*0x10000`,$chunk
488 cmova %rax,$chunk # don't let counter cross 2^16
490 jbe .L${mode}_aligned_skip
492 .L${mode}_aligned_loop:
493 mov $len,%r10 # save parameters
497 lea -16($ctx),%rax # ivp
498 lea 16($ctx),%rbx # key
499 shr \$4,$len # len/=AES_BLOCK_SIZE
500 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
502 mov -4($ctx),%eax # pull 32-bit counter
508 mov %r10,$len # restore parameters
510 mov \$`16*0x10000`,$chunk
513 jae .L${mode}_aligned_loop
515 .L${mode}_aligned_skip:
517 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
520 and \$0xfff,%rbp # distance to page boundary
522 cmp \$$PADLOCK_PREFETCH{$mode},%rbp
523 mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp
525 and $len,%rbp # remainder
527 jz .L${mode}_aligned_tail
530 lea -16($ctx),%rax # ivp
531 lea 16($ctx),%rbx # key
532 shr \$4,$len # len/=AES_BLOCK_SIZE
533 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
535 $code.=<<___ if ($mode !~ /ecb|ctr/);
537 movdqa %xmm0,-16($ctx) # copy [or refresh] iv
539 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
540 test %rbp,%rbp # check remainder
543 .L${mode}_aligned_tail:
551 .byte 0xf3,0x48,0xa5 # rep movsq
565 .size padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
# Emit one padlock_<mode>_encrypt routine per supported xcrypt mode;
# the second argument is the opcode byte selecting the xcrypt variant.
569 &generate_mode("ecb",0xc8);
570 &generate_mode("cbc",0xd0);
571 &generate_mode("cfb",0xe0);
572 &generate_mode("ofb",0xe8);
573 &generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
576 .asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
580 .Lpadlock_saved_context:
583 $code =~ s/\`([^\`]*)\`/eval($1)/gem;