Also check for errors in x86_64-xlate.pl.
[openssl.git] / crypto / camellia / asm / cmll-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12 #
13 # This module may be used under the terms of either the GNU General
14 # Public License version 2 or later, the GNU Lesser General Public
15 # License version 2.1 or later, the Mozilla Public License version
16 # 1.1 or the BSD License. The exact terms of either license are
17 # distributed along with this module. For further details see
18 # http://www.openssl.org/~appro/camellia/.
19 # ====================================================================
20
21 # Performance in cycles per processed byte (less is better) in
22 # 'openssl speed ...' benchmark:
23 #
24 #                       AMD64   Core2   EM64T
25 # -evp camellia-128-ecb 16.7    21.0    22.7
26 # + over gcc 3.4.6      +25%    +5%     0%
27 #
28 # camellia-128-cbc      15.7    20.4    21.1
29 #
30 # 128-bit key setup     128     216     205     cycles/key
31 # + over gcc 3.4.6      +54%    +39%    +15%
32 #
33 # Numbers in "+" rows represent performance improvement over compiler
34 # generated code. Key setup timings are impressive on AMD and Core2
35 # thanks to 64-bit operations being covertly deployed. Improvement on
36 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37 # apparently emulates some of 64-bit operations in [32-bit] microcode.
38
# Command line: [flavour] [output-file]
# $output is taken from the end of @ARGV when that argument looks like a
# file (it has an extension); $flavour is taken from the front of @ARGV
# when that argument does not look like a file (it contains no dot).
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : undef;

# Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an
# output file ending in .asm.
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Locate x86_64-xlate.pl: first in the directory of this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
$xlate = "${dir}x86_64-xlate.pl";
unless (-f $xlate) {
    $xlate = "${dir}../../perlasm/x86_64-xlate.pl";
    -f $xlate or die "can't locate x86_64-xlate.pl";
}

# From here on STDOUT is an alias for a pipe into the translator script.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
54
# hi(REG): rewrite the first %eax/%rax-style name (registers a-d) in REG to
# its high-byte form, e.g. "%eax" -> "%ah".  Anything else is returned as is.
# Declared without a prototype because it takes one argument.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;    $r; }

# lo(REG): rewrite a register name in REG to its low-byte form:
#   %eax/%rax-style (a-d)  -> %al ...
#   %esi/%edi/%rsi/%rdi    -> %sil/%dil
#   %rN or %rNd            -> %rNb
# Anything else is returned as is.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
                      $r =~ s/%[er]([sd]i)/%${1}l/;
                      $r =~ s/%(r[0-9]+)[d]?/%${1}b/;   $r; }
59
# Register names substituted into the code templates.
($t0,$t1,$t2,$t3) = qw(%eax %ebx %ecx %edx);    # temporaries
@S = qw(%r8d %r9d %r10d %r11d);                 # the four 32-bit state words
($i0,$i1) = qw(%esi %edi);                      # table indices
$Tbl = "%rbp";                                  # size optimization
($inp,$out,$key,$keyend) = qw(%r12 %r13 %r14 %r15);

# 32-bit name of the register carrying the first argument.
$arg0d = "%edi";
$arg0d = "%ecx" if $win64;

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte offsets from the start of the table.
($SBOX1_1110,$SBOX4_4404) = (0,4);              # Camellia_SBOX[0], [1]
($SBOX2_0222,$SBOX3_3033) = (2048,2052);        # Camellia_SBOX[2], [3]
78
# Camellia_Feistel(I [,SEED])
#
# Appends the code for one Feistel round to $code.
#   I     round index; its parity selects which pair of @S is the round
#         input ($s0,$s1) and which pair is updated ($s2,$s3), and it
#         determines the displacement of the key words prefetched for the
#         next round.
#   SEED  optional byte displacement added to the key prefetch (default 0).
#         A negative SEED makes the prefetch step through the key in
#         8-byte steps downwards instead of upwards.
# The round expects the current key words to be in $t0/$t1 already and
# leaves the next ones there.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=map { $S[($j+$_)%4] } 0..3;

$code.=<<___;
        xor     $s0,$t0                         # t0^=key[0]
        xor     $s1,$t1                         # t1^=key[1]
        movz    `&hi("$t0")`,$i0                # (t0>>8)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>0)&0xff
        mov     $SBOX3_3033($Tbl,$i0,8),$t3     # t3=SBOX3_3033[0]
        mov     $SBOX1_1110($Tbl,$i1,8),$t2     # t2=SBOX1_1110[1]
        movz    `&lo("$t0")`,$i0                # (t0>>0)&0xff
        shr     \$16,$t0
        movz    `&hi("$t1")`,$i1                # (t1>>8)&0xff
        xor     $SBOX4_4404($Tbl,$i0,8),$t3     # t3^=SBOX4_4404[0]
        shr     \$16,$t1
        xor     $SBOX4_4404($Tbl,$i1,8),$t2     # t2^=SBOX4_4404[1]
        movz    `&hi("$t0")`,$i0                # (t0>>24)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>16)&0xff
        xor     $SBOX1_1110($Tbl,$i0,8),$t3     # t3^=SBOX1_1110[0]
        xor     $SBOX3_3033($Tbl,$i1,8),$t2     # t2^=SBOX3_3033[1]
        movz    `&lo("$t0")`,$i0                # (t0>>16)&0xff
        movz    `&hi("$t1")`,$i1                # (t1>>24)&0xff
        xor     $SBOX2_0222($Tbl,$i0,8),$t3     # t3^=SBOX2_0222[0]
        xor     $SBOX2_0222($Tbl,$i1,8),$t2     # t2^=SBOX2_0222[1]
        mov     `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
        mov     `$seed+($i+1)*$scale+4`($key),$t0
        xor     $t3,$t2                         # t2^=t3
        ror     \$8,$t3                         # t3=RightRotate(t3,8)
        xor     $t2,$s2
        xor     $t2,$s3
        xor     $t3,$s3
___
}
116
117 # void Camellia_EncryptBlock_Rounds(
118 #               int grandRounds,
119 #               const Byte plaintext[],
120 #               const KEY_TABLE_TYPE keyTable,
121 #               Byte ciphertext[])
122 $code=<<___;
123 .text
124
125 # V1.x API
126 .globl  Camellia_EncryptBlock
127 .type   Camellia_EncryptBlock,\@abi-omnipotent
128 .align  16
129 Camellia_EncryptBlock:
130 .cfi_startproc
131         movl    \$128,%eax
132         subl    $arg0d,%eax
133         movl    \$3,$arg0d
134         adcl    \$0,$arg0d      # keyBitLength==128?3:4
135         jmp     .Lenc_rounds
136 .cfi_endproc
137 .size   Camellia_EncryptBlock,.-Camellia_EncryptBlock
138 # V2
139 .globl  Camellia_EncryptBlock_Rounds
140 .type   Camellia_EncryptBlock_Rounds,\@function,4
141 .align  16
142 .Lenc_rounds:
143 Camellia_EncryptBlock_Rounds:
144 .cfi_startproc
145         push    %rbx
146 .cfi_push       %rbx
147         push    %rbp
148 .cfi_push       %rbp
149         push    %r13
150 .cfi_push       %r13
151         push    %r14
152 .cfi_push       %r14
153         push    %r15
154 .cfi_push       %r15
155 .Lenc_prologue:
156
157         #mov    %rsi,$inp               # put away arguments
158         mov     %rcx,$out
159         mov     %rdx,$key
160
161         shl     \$6,%edi                # process grandRounds
162         lea     .LCamellia_SBOX(%rip),$Tbl
163         lea     ($key,%rdi),$keyend
164
165         mov     0(%rsi),@S[0]           # load plaintext
166         mov     4(%rsi),@S[1]
167         mov     8(%rsi),@S[2]
168         bswap   @S[0]
169         mov     12(%rsi),@S[3]
170         bswap   @S[1]
171         bswap   @S[2]
172         bswap   @S[3]
173
174         call    _x86_64_Camellia_encrypt
175
176         bswap   @S[0]
177         bswap   @S[1]
178         bswap   @S[2]
179         mov     @S[0],0($out)
180         bswap   @S[3]
181         mov     @S[1],4($out)
182         mov     @S[2],8($out)
183         mov     @S[3],12($out)
184
185         mov     0(%rsp),%r15
186 .cfi_restore    %r15
187         mov     8(%rsp),%r14
188 .cfi_restore    %r14
189         mov     16(%rsp),%r13
190 .cfi_restore    %r13
191         mov     24(%rsp),%rbp
192 .cfi_restore    %rbp
193         mov     32(%rsp),%rbx
194 .cfi_restore    %rbx
195         lea     40(%rsp),%rsp
196 .cfi_adjust_cfa_offset  -40
197 .Lenc_epilogue:
198         ret
199 .cfi_endproc
200 .size   Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
201
202 .type   _x86_64_Camellia_encrypt,\@abi-omnipotent
203 .align  16
204 _x86_64_Camellia_encrypt:
205 .cfi_startproc
206         xor     0($key),@S[1]
207         xor     4($key),@S[0]           # ^=key[0-3]
208         xor     8($key),@S[3]
209         xor     12($key),@S[2]
210 .align  16
211 .Leloop:
212         mov     16($key),$t1            # prefetch key[4-5]
213         mov     20($key),$t0
214
215 ___
216         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
217 $code.=<<___;
218         lea     16*4($key),$key
219         cmp     $keyend,$key
220         mov     8($key),$t3             # prefetch key[2-3]
221         mov     12($key),$t2
222         je      .Ledone
223
224         and     @S[0],$t0
225         or      @S[3],$t3
226         rol     \$1,$t0
227         xor     $t3,@S[2]               # s2^=s3|key[3];
228         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
229         and     @S[2],$t2
230         or      @S[1],$t1
231         rol     \$1,$t2
232         xor     $t1,@S[0]               # s0^=s1|key[1];
233         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
234         jmp     .Leloop
235
236 .align  16
237 .Ledone:
238         xor     @S[2],$t0               # SwapHalf
239         xor     @S[3],$t1
240         xor     @S[0],$t2
241         xor     @S[1],$t3
242
243         mov     $t0,@S[0]
244         mov     $t1,@S[1]
245         mov     $t2,@S[2]
246         mov     $t3,@S[3]
247
248         .byte   0xf3,0xc3               # rep ret
249 .cfi_endproc
250 .size   _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
251
252 # V1.x API
253 .globl  Camellia_DecryptBlock
254 .type   Camellia_DecryptBlock,\@abi-omnipotent
255 .align  16
256 Camellia_DecryptBlock:
257 .cfi_startproc
258         movl    \$128,%eax
259         subl    $arg0d,%eax
260         movl    \$3,$arg0d
261         adcl    \$0,$arg0d      # keyBitLength==128?3:4
262         jmp     .Ldec_rounds
263 .cfi_endproc
264 .size   Camellia_DecryptBlock,.-Camellia_DecryptBlock
265 # V2
266 .globl  Camellia_DecryptBlock_Rounds
267 .type   Camellia_DecryptBlock_Rounds,\@function,4
268 .align  16
269 .Ldec_rounds:
270 Camellia_DecryptBlock_Rounds:
271 .cfi_startproc
272         push    %rbx
273 .cfi_push       %rbx
274         push    %rbp
275 .cfi_push       %rbp
276         push    %r13
277 .cfi_push       %r13
278         push    %r14
279 .cfi_push       %r14
280         push    %r15
281 .cfi_push       %r15
282 .Ldec_prologue:
283
284         #mov    %rsi,$inp               # put away arguments
285         mov     %rcx,$out
286         mov     %rdx,$keyend
287
288         shl     \$6,%edi                # process grandRounds
289         lea     .LCamellia_SBOX(%rip),$Tbl
290         lea     ($keyend,%rdi),$key
291
292         mov     0(%rsi),@S[0]           # load plaintext
293         mov     4(%rsi),@S[1]
294         mov     8(%rsi),@S[2]
295         bswap   @S[0]
296         mov     12(%rsi),@S[3]
297         bswap   @S[1]
298         bswap   @S[2]
299         bswap   @S[3]
300
301         call    _x86_64_Camellia_decrypt
302
303         bswap   @S[0]
304         bswap   @S[1]
305         bswap   @S[2]
306         mov     @S[0],0($out)
307         bswap   @S[3]
308         mov     @S[1],4($out)
309         mov     @S[2],8($out)
310         mov     @S[3],12($out)
311
312         mov     0(%rsp),%r15
313 .cfi_restore    %r15
314         mov     8(%rsp),%r14
315 .cfi_restore    %r14
316         mov     16(%rsp),%r13
317 .cfi_restore    %r13
318         mov     24(%rsp),%rbp
319 .cfi_restore    %rbp
320         mov     32(%rsp),%rbx
321 .cfi_restore    %rbx
322         lea     40(%rsp),%rsp
323 .cfi_adjust_cfa_offset  -40
324 .Ldec_epilogue:
325         ret
326 .cfi_endproc
327 .size   Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
328
329 .type   _x86_64_Camellia_decrypt,\@abi-omnipotent
330 .align  16
331 _x86_64_Camellia_decrypt:
332 .cfi_startproc
333         xor     0($key),@S[1]
334         xor     4($key),@S[0]           # ^=key[0-3]
335         xor     8($key),@S[3]
336         xor     12($key),@S[2]
337 .align  16
338 .Ldloop:
339         mov     -8($key),$t1            # prefetch key[4-5]
340         mov     -4($key),$t0
341
342 ___
343         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
344 $code.=<<___;
345         lea     -16*4($key),$key
346         cmp     $keyend,$key
347         mov     0($key),$t3             # prefetch key[2-3]
348         mov     4($key),$t2
349         je      .Lddone
350
351         and     @S[0],$t0
352         or      @S[3],$t3
353         rol     \$1,$t0
354         xor     $t3,@S[2]               # s2^=s3|key[3];
355         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
356         and     @S[2],$t2
357         or      @S[1],$t1
358         rol     \$1,$t2
359         xor     $t1,@S[0]               # s0^=s1|key[1];
360         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
361
362         jmp     .Ldloop
363
364 .align  16
365 .Lddone:
366         xor     @S[2],$t2
367         xor     @S[3],$t3
368         xor     @S[0],$t0
369         xor     @S[1],$t1
370
371         mov     $t2,@S[0]               # SwapHalf
372         mov     $t3,@S[1]
373         mov     $t0,@S[2]
374         mov     $t1,@S[3]
375
376         .byte   0xf3,0xc3               # rep ret
377 .cfi_endproc
378 .size   _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
379 ___
380
# _saveround(RND,KEY[,BIAS],REG...)
#
# Appends stores of the given registers to $code, addressing the slot
# BIAS+RND*8 relative to the register named by KEY.
#   BIAS  optional leading integer (may be 0 or negative); default 0.
#   REG   four 32-bit registers: stored pairwise swapped at offsets
#         +0,+4,+8,+12 (REG[1],REG[0],REG[3],REG[2]);
#         otherwise one register stored at +0, plus a second one stored
#         at +8 if given.
sub _saveround {
my ($rnd,$key,@T)=@_;
# Recognize the bias by its form, so that an explicit 0 is accepted too
# and register names are never fed to a numeric conversion.
my $bias=(defined($T[0]) && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

    if ($#T==3) {
        $code.=<<___;
        mov     $T[1],`$bias+$rnd*8+0`($key)
        mov     $T[0],`$bias+$rnd*8+4`($key)
        mov     $T[3],`$bias+$rnd*8+8`($key)
        mov     $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
        $code.="        mov     $T[0],`$bias+$rnd*8+0`($key)\n";
        $code.="        mov     $T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
    }
}
397
# _loadround(RND,KEY[,BIAS],REG[,REG])
#
# Appends loads into the given registers to $code, reading the slot
# BIAS+RND*8 relative to the register named by KEY: the first register
# from offset +0 and, if given, the second one from offset +8.
#   BIAS  optional leading integer (may be 0 or negative); default 0.
sub _loadround {
my ($rnd,$key,@T)=@_;
# Recognize the bias by its form, so that an explicit 0 is accepted too
# and register names are never fed to a numeric conversion.
my $bias=(defined($T[0]) && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

$code.="        mov     `$bias+$rnd*8+0`($key),$T[0]\n";
$code.="        mov     `$bias+$rnd*8+8`($key),$T[1]\n" if ($#T>=1);
}
405
# __rotl128(R0,R1,ROT): append code that rotates the 128-bit value held in
# the register pair R0 (upper half) and R1 (lower half) left by ROT bits,
# using shld. %r11 is used as scratch. Nothing is emitted when ROT is 0.
#
# shld is very slow on the Intel EM64T family. Even on AMD it limits the
# instruction decode rate [because it's VectorPath] and consequently
# performance...
sub __rotl128 {
my ($r0,$r1,$bits)=@_;

    return unless ($bits);

    $code.=<<___;
        mov     $r0,%r11
        shld    \$$bits,$r1,$r0
        shld    \$$bits,%r11,$r1
___
}
420
# _rotl128(R0,R1,ROT): append code that rotates the 128-bit value held in
# the register pair R0 (upper half) and R1 (lower half) left by ROT bits,
# using plain shifts. %r9 and %r11 are used as scratch, so they cannot be
# passed as R0/R1. Nothing is emitted when ROT is 0.
#
# ... Implementing the 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
sub _rotl128 {
my ($r0,$r1,$bits)=@_;

    return unless ($bits);

    # complementary shift count, emitted as a backtick expression
    my $rest="`64-$bits`";

    $code.=<<___;
        mov     $r0,%r11
        shl     \$$bits,$r0
        mov     $r1,%r9
        shr     \$$rest,%r9
        shr     \$$rest,%r11
        or      %r9,$r0
        shl     \$$bits,$r1
        or      %r11,$r1
___
}
440
441 { my $step=0;
442
443 $code.=<<___;
444 .globl  Camellia_Ekeygen
445 .type   Camellia_Ekeygen,\@function,3
446 .align  16
447 Camellia_Ekeygen:
448 .cfi_startproc
449         push    %rbx
450 .cfi_push       %rbx
451         push    %rbp
452 .cfi_push       %rbp
453         push    %r13
454 .cfi_push       %r13
455         push    %r14
456 .cfi_push       %r14
457         push    %r15
458 .cfi_push       %r15
459 .Lkey_prologue:
460
461         mov     %edi,${keyend}d         # put away arguments, keyBitLength
462         mov     %rdx,$out               # keyTable
463
464         mov     0(%rsi),@S[0]           # load 0-127 bits
465         mov     4(%rsi),@S[1]
466         mov     8(%rsi),@S[2]
467         mov     12(%rsi),@S[3]
468
469         bswap   @S[0]
470         bswap   @S[1]
471         bswap   @S[2]
472         bswap   @S[3]
473 ___
474         &_saveround     (0,$out,@S);    # KL<<<0
475 $code.=<<___;
476         cmp     \$128,$keyend           # check keyBitLength
477         je      .L1st128
478
479         mov     16(%rsi),@S[0]          # load 128-191 bits
480         mov     20(%rsi),@S[1]
481         cmp     \$192,$keyend
482         je      .L1st192
483         mov     24(%rsi),@S[2]          # load 192-255 bits
484         mov     28(%rsi),@S[3]
485         jmp     .L1st256
486 .L1st192:
487         mov     @S[0],@S[2]
488         mov     @S[1],@S[3]
489         not     @S[2]
490         not     @S[3]
491 .L1st256:
492         bswap   @S[0]
493         bswap   @S[1]
494         bswap   @S[2]
495         bswap   @S[3]
496 ___
497         &_saveround     (4,$out,@S);    # temp storage for KR!
498 $code.=<<___;
499         xor     0($out),@S[1]           # KR^KL
500         xor     4($out),@S[0]
501         xor     8($out),@S[3]
502         xor     12($out),@S[2]
503
504 .L1st128:
505         lea     .LCamellia_SIGMA(%rip),$key
506         lea     .LCamellia_SBOX(%rip),$Tbl
507
508         mov     0($key),$t1
509         mov     4($key),$t0
510 ___
511         &Camellia_Feistel($step++);
512         &Camellia_Feistel($step++);
513 $code.=<<___;
514         xor     0($out),@S[1]           # ^KL
515         xor     4($out),@S[0]
516         xor     8($out),@S[3]
517         xor     12($out),@S[2]
518 ___
519         &Camellia_Feistel($step++);
520         &Camellia_Feistel($step++);
521 $code.=<<___;
522         cmp     \$128,$keyend
523         jne     .L2nd256
524
525         lea     128($out),$out          # size optimization
526         shl     \$32,%r8                # @S[0]||
527         shl     \$32,%r10               # @S[2]||
528         or      %r9,%r8                 # ||@S[1]
529         or      %r11,%r10               # ||@S[3]
530 ___
531         &_loadround     (0,$out,-128,"%rax","%rbx");    # KL
532         &_saveround     (2,$out,-128,"%r8","%r10");     # KA<<<0
533         &_rotl128       ("%rax","%rbx",15);
534         &_saveround     (4,$out,-128,"%rax","%rbx");    # KL<<<15
535         &_rotl128       ("%r8","%r10",15);
536         &_saveround     (6,$out,-128,"%r8","%r10");     # KA<<<15
537         &_rotl128       ("%r8","%r10",15);              # 15+15=30
538         &_saveround     (8,$out,-128,"%r8","%r10");     # KA<<<30
539         &_rotl128       ("%rax","%rbx",30);             # 15+30=45
540         &_saveround     (10,$out,-128,"%rax","%rbx");   # KL<<<45
541         &_rotl128       ("%r8","%r10",15);              # 30+15=45
542         &_saveround     (12,$out,-128,"%r8");           # KA<<<45
543         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
544         &_saveround     (13,$out,-128,"%rbx");          # KL<<<60
545         &_rotl128       ("%r8","%r10",15);              # 45+15=60
546         &_saveround     (14,$out,-128,"%r8","%r10");    # KA<<<60
547         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
548         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<77
549         &_rotl128       ("%rax","%rbx",17);             # 77+17=94
550         &_saveround     (18,$out,-128,"%rax","%rbx");   # KL<<<94
551         &_rotl128       ("%r8","%r10",34);              # 60+34=94
552         &_saveround     (20,$out,-128,"%r8","%r10");    # KA<<<94
553         &_rotl128       ("%rax","%rbx",17);             # 94+17=111
554         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<111
555         &_rotl128       ("%r8","%r10",17);              # 94+17=111
556         &_saveround     (24,$out,-128,"%r8","%r10");    # KA<<<111
557 $code.=<<___;
558         mov     \$3,%eax
559         jmp     .Ldone
560 .align  16
561 .L2nd256:
562 ___
563         &_saveround     (6,$out,@S);    # temp storage for KA!
564 $code.=<<___;
565         xor     `4*8+0`($out),@S[1]     # KA^KR
566         xor     `4*8+4`($out),@S[0]
567         xor     `5*8+0`($out),@S[3]
568         xor     `5*8+4`($out),@S[2]
569 ___
570         &Camellia_Feistel($step++);
571         &Camellia_Feistel($step++);
572
573         &_loadround     (0,$out,"%rax","%rbx"); # KL
574         &_loadround     (4,$out,"%rcx","%rdx"); # KR
575         &_loadround     (6,$out,"%r14","%r15"); # KA
576 $code.=<<___;
577         lea     128($out),$out          # size optimization
578         shl     \$32,%r8                # @S[0]||
579         shl     \$32,%r10               # @S[2]||
580         or      %r9,%r8                 # ||@S[1]
581         or      %r11,%r10               # ||@S[3]
582 ___
583         &_saveround     (2,$out,-128,"%r8","%r10");     # KB<<<0
584         &_rotl128       ("%rcx","%rdx",15);
585         &_saveround     (4,$out,-128,"%rcx","%rdx");    # KR<<<15
586         &_rotl128       ("%r14","%r15",15);
587         &_saveround     (6,$out,-128,"%r14","%r15");    # KA<<<15
588         &_rotl128       ("%rcx","%rdx",15);             # 15+15=30
589         &_saveround     (8,$out,-128,"%rcx","%rdx");    # KR<<<30
590         &_rotl128       ("%r8","%r10",30);
591         &_saveround     (10,$out,-128,"%r8","%r10");    # KB<<<30
592         &_rotl128       ("%rax","%rbx",45);
593         &_saveround     (12,$out,-128,"%rax","%rbx");   # KL<<<45
594         &_rotl128       ("%r14","%r15",30);             # 15+30=45
595         &_saveround     (14,$out,-128,"%r14","%r15");   # KA<<<45
596         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
597         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<60
598         &_rotl128       ("%rcx","%rdx",30);             # 30+30=60
599         &_saveround     (18,$out,-128,"%rcx","%rdx");   # KR<<<60
600         &_rotl128       ("%r8","%r10",30);              # 30+30=60
601         &_saveround     (20,$out,-128,"%r8","%r10");    # KB<<<60
602         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
603         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<77
604         &_rotl128       ("%r14","%r15",32);             # 45+32=77
605         &_saveround     (24,$out,-128,"%r14","%r15");   # KA<<<77
606         &_rotl128       ("%rcx","%rdx",34);             # 60+34=94
607         &_saveround     (26,$out,-128,"%rcx","%rdx");   # KR<<<94
608         &_rotl128       ("%r14","%r15",17);             # 77+17=94
609         &_saveround     (28,$out,-128,"%r14","%r15");   # KA<<<94
610         &_rotl128       ("%rax","%rbx",34);             # 77+34=111
611         &_saveround     (30,$out,-128,"%rax","%rbx");   # KL<<<111
612         &_rotl128       ("%r8","%r10",51);              # 60+51=111
613         &_saveround     (32,$out,-128,"%r8","%r10");    # KB<<<111
614 $code.=<<___;
615         mov     \$4,%eax
616 .Ldone:
617         mov     0(%rsp),%r15
618 .cfi_restore    %r15
619         mov     8(%rsp),%r14
620 .cfi_restore    %r14
621         mov     16(%rsp),%r13
622 .cfi_restore    %r13
623         mov     24(%rsp),%rbp
624 .cfi_restore    %rbp
625         mov     32(%rsp),%rbx
626 .cfi_restore    %rbx
627         lea     40(%rsp),%rsp
628 .cfi_adjust_cfa_offset  -40
629 .Lkey_epilogue:
630         ret
631 .cfi_endproc
632 .size   Camellia_Ekeygen,.-Camellia_Ekeygen
633 ___
634 }
635
636 @SBOX=(
637 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
638  35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
639 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
640 166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
641 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
642 223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
643  20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
644 254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
645 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
646  16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
647 135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
648  82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
649 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
650 120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
651 114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
652  64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
653
# Table entry generators. Each takes an index 0..255 and returns one 32-bit
# word formatted as "0x%08x". With s1 = SBOX[i], the digits in the name
# tell, from the most to the least significant byte, which value fills it
# (0 = zero byte):
#   1: s1
#   2: s1 rotated left by 1 bit (8-bit rotate)
#   3: s1 rotated right by 1 bit (8-bit rotate)
#   4: SBOX[i rotated left by 1 bit]
sub S1110 {
    my $i=shift;
    my $s1=$SBOX[$i];
    sprintf("0x%08x",$s1<<24|$s1<<16|$s1<<8);
}
sub S4404 {
    my $i=shift;
    my $s4=$SBOX[($i<<1|$i>>7)&0xff];
    sprintf("0x%08x",$s4<<24|$s4<<16|$s4);
}
sub S0222 {
    my $i=shift;
    my $s1=$SBOX[$i];
    my $s2=($s1<<1|$s1>>7)&0xff;
    sprintf("0x%08x",$s2<<16|$s2<<8|$s2);
}
sub S3033 {
    my $i=shift;
    my $s1=$SBOX[$i];
    my $s3=($s1>>1|$s1<<7)&0xff;
    sprintf("0x%08x",$s3<<24|$s3<<8|$s3);
}
658
659 $code.=<<___;
660 .align  64
661 .LCamellia_SIGMA:
662 .long   0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
663 .long   0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
664 .long   0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
665 .long   0,          0,          0,          0
666 .LCamellia_SBOX:
667 ___
# tables are interleaved, remember?
# data_word(WORD...): append one ".long" directive listing the given
# words, comma-separated, to $code.
sub data_word {
    my @words=@_;
    $code.=sprintf(".long\t%s\n",join(',',@words));
}
# Emit the table body: 256 interleaved (S1110,S4404) pairs followed by
# 256 interleaved (S0222,S3033) pairs, one .long directive per pair.
foreach my $pair ([\&S1110,\&S4404],[\&S0222,\&S3033]) {
    my ($even,$odd)=@$pair;
    for ($i=0;$i<256;$i++) { &data_word($even->($i),$odd->($i)); }
}
672
673 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
674 #                       size_t length, const CAMELLIA_KEY *key,
675 #                       unsigned char *ivp,const int enc);
676 {
677 $_key="0(%rsp)";
678 $_end="8(%rsp)";        # inp+len&~15
679 $_res="16(%rsp)";       # len&15
680 $ivec="24(%rsp)";
681 $_ivp="40(%rsp)";
682 $_rsp="48(%rsp)";
683
684 $code.=<<___;
685 .globl  Camellia_cbc_encrypt
686 .type   Camellia_cbc_encrypt,\@function,6
687 .align  16
688 Camellia_cbc_encrypt:
689 .cfi_startproc
690         endbranch
691         cmp     \$0,%rdx
692         je      .Lcbc_abort
693         push    %rbx
694 .cfi_push       %rbx
695         push    %rbp
696 .cfi_push       %rbp
697         push    %r12
698 .cfi_push       %r12
699         push    %r13
700 .cfi_push       %r13
701         push    %r14
702 .cfi_push       %r14
703         push    %r15
704 .cfi_push       %r15
705 .Lcbc_prologue:
706
707         mov     %rsp,%rbp
708 .cfi_def_cfa_register   %rbp
709         sub     \$64,%rsp
710         and     \$-64,%rsp
711
712         # place stack frame just "above mod 1024" the key schedule,
713         # this ensures that cache associativity suffices
714         lea     -64-63(%rcx),%r10
715         sub     %rsp,%r10
716         neg     %r10
717         and     \$0x3C0,%r10
718         sub     %r10,%rsp
719         #add    \$8,%rsp                # 8 is reserved for callee's ra
720
721         mov     %rdi,$inp               # inp argument
722         mov     %rsi,$out               # out argument
723         mov     %r8,%rbx                # ivp argument
724         mov     %rcx,$key               # key argument
725         mov     272(%rcx),${keyend}d    # grandRounds
726
727         mov     %r8,$_ivp
728         mov     %rbp,$_rsp
729 .cfi_cfa_expression     $_rsp,deref,+56
730
731 .Lcbc_body:
732         lea     .LCamellia_SBOX(%rip),$Tbl
733
734         mov     \$32,%ecx
735 .align  4
736 .Lcbc_prefetch_sbox:
737         mov     0($Tbl),%rax
738         mov     32($Tbl),%rsi
739         mov     64($Tbl),%rdi
740         mov     96($Tbl),%r11
741         lea     128($Tbl),$Tbl
742         loop    .Lcbc_prefetch_sbox
743         sub     \$4096,$Tbl
744         shl     \$6,$keyend
745         mov     %rdx,%rcx               # len argument
746         lea     ($key,$keyend),$keyend
747
748         cmp     \$0,%r9d                # enc argument
749         je      .LCBC_DECRYPT
750
751         and     \$-16,%rdx
752         and     \$15,%rcx               # length residue
753         lea     ($inp,%rdx),%rdx
754         mov     $key,$_key
755         mov     %rdx,$_end
756         mov     %rcx,$_res
757
758         cmp     $inp,%rdx
759         mov     0(%rbx),@S[0]           # load IV
760         mov     4(%rbx),@S[1]
761         mov     8(%rbx),@S[2]
762         mov     12(%rbx),@S[3]
763         je      .Lcbc_enc_tail
764         jmp     .Lcbc_eloop
765
766 .align  16
767 .Lcbc_eloop:
768         xor     0($inp),@S[0]
769         xor     4($inp),@S[1]
770         xor     8($inp),@S[2]
771         bswap   @S[0]
772         xor     12($inp),@S[3]
773         bswap   @S[1]
774         bswap   @S[2]
775         bswap   @S[3]
776
777         call    _x86_64_Camellia_encrypt
778
779         mov     $_key,$key              # "rewind" the key
780         bswap   @S[0]
781         mov     $_end,%rdx
782         bswap   @S[1]
783         mov     $_res,%rcx
784         bswap   @S[2]
785         mov     @S[0],0($out)
786         bswap   @S[3]
787         mov     @S[1],4($out)
788         mov     @S[2],8($out)
789         lea     16($inp),$inp
790         mov     @S[3],12($out)
791         cmp     %rdx,$inp
792         lea     16($out),$out
793         jne     .Lcbc_eloop
794
795         cmp     \$0,%rcx
796         jne     .Lcbc_enc_tail
797
798         mov     $_ivp,$out
799         mov     @S[0],0($out)           # write out IV residue
800         mov     @S[1],4($out)
801         mov     @S[2],8($out)
802         mov     @S[3],12($out)
803         jmp     .Lcbc_done
804
805 .align  16
806 .Lcbc_enc_tail:
807         xor     %rax,%rax
808         mov     %rax,0+$ivec
809         mov     %rax,8+$ivec
810         mov     %rax,$_res
811
812 .Lcbc_enc_pushf:
813         pushfq
814         cld
815         mov     $inp,%rsi
816         lea     8+$ivec,%rdi
817         .long   0x9066A4F3              # rep movsb
818         popfq
819 .Lcbc_enc_popf:
820
821         lea     $ivec,$inp
822         lea     16+$ivec,%rax
823         mov     %rax,$_end
824         jmp     .Lcbc_eloop             # one more time
825
826 .align  16
827 .LCBC_DECRYPT:
828         xchg    $key,$keyend
829         add     \$15,%rdx
830         and     \$15,%rcx               # length residue
831         and     \$-16,%rdx
832         mov     $key,$_key
833         lea     ($inp,%rdx),%rdx
834         mov     %rdx,$_end
835         mov     %rcx,$_res
836
837         mov     (%rbx),%rax             # load IV
838         mov     8(%rbx),%rbx
839         jmp     .Lcbc_dloop
840 .align  16
841 .Lcbc_dloop:
842         mov     0($inp),@S[0]
843         mov     4($inp),@S[1]
844         mov     8($inp),@S[2]
845         bswap   @S[0]
846         mov     12($inp),@S[3]
847         bswap   @S[1]
848         mov     %rax,0+$ivec            # save IV to temporary storage
849         bswap   @S[2]
850         mov     %rbx,8+$ivec
851         bswap   @S[3]
852
853         call    _x86_64_Camellia_decrypt
854
855         mov     $_key,$key              # "rewind" the key
856         mov     $_end,%rdx
857         mov     $_res,%rcx
858
859         bswap   @S[0]
860         mov     ($inp),%rax             # load IV for next iteration
861         bswap   @S[1]
862         mov     8($inp),%rbx
863         bswap   @S[2]
864         xor     0+$ivec,@S[0]
865         bswap   @S[3]
866         xor     4+$ivec,@S[1]
867         xor     8+$ivec,@S[2]
868         lea     16($inp),$inp
869         xor     12+$ivec,@S[3]
870         cmp     %rdx,$inp
871         je      .Lcbc_ddone
872
873         mov     @S[0],0($out)
874         mov     @S[1],4($out)
875         mov     @S[2],8($out)
876         mov     @S[3],12($out)
877
878         lea     16($out),$out
879         jmp     .Lcbc_dloop
880
881 .align  16
882 .Lcbc_ddone:
883         mov     $_ivp,%rdx
884         cmp     \$0,%rcx
885         jne     .Lcbc_dec_tail
886
887         mov     @S[0],0($out)
888         mov     @S[1],4($out)
889         mov     @S[2],8($out)
890         mov     @S[3],12($out)
891
892         mov     %rax,(%rdx)             # write out IV residue
893         mov     %rbx,8(%rdx)
894         jmp     .Lcbc_done
895 .align  16
896 .Lcbc_dec_tail:
897         mov     @S[0],0+$ivec
898         mov     @S[1],4+$ivec
899         mov     @S[2],8+$ivec
900         mov     @S[3],12+$ivec
901
902 .Lcbc_dec_pushf:
903         pushfq
904         cld
905         lea     8+$ivec,%rsi
906         lea     ($out),%rdi
907         .long   0x9066A4F3              # rep movsb
908         popfq
909 .Lcbc_dec_popf:
910
911         mov     %rax,(%rdx)             # write out IV residue
912         mov     %rbx,8(%rdx)
913         jmp     .Lcbc_done
914
915 .align  16
916 .Lcbc_done:
917         mov     $_rsp,%rcx
918 .cfi_def_cfa    %rcx,56
919         mov     0(%rcx),%r15
920 .cfi_restore    %r15
921         mov     8(%rcx),%r14
922 .cfi_restore    %r14
923         mov     16(%rcx),%r13
924 .cfi_restore    %r13
925         mov     24(%rcx),%r12
926 .cfi_restore    %r12
927         mov     32(%rcx),%rbp
928 .cfi_restore    %rbp
929         mov     40(%rcx),%rbx
930 .cfi_restore    %rbx
931         lea     48(%rcx),%rsp
932 .cfi_def_cfa    %rsp,8
933 .Lcbc_abort:
934         ret
935 .cfi_endproc
936 .size   Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
937
938 .asciz  "Camellia for x86_64 by <appro\@openssl.org>"
939 ___
940 }
941
# Win64 structured-exception-handling (SEH) support, emitted only when
# $win64 is set.  The here-document below appends to $code:
#   - common_se_handler: referenced from the .xdata entries of
#     Camellia_EncryptBlock_Rounds, Camellia_DecryptBlock_Rounds and
#     Camellia_Ekeygen.  HandlerData[] supplies two RVAs (prologue and
#     epilogue labels); when context->Rip lies between them the handler
#     reloads Rbx, Rbp, R13, R14 and R15 from the stack into the context.
#   - cbc_se_handler: referenced from the .xdata entry of
#     Camellia_cbc_encrypt.  It compares context->Rip against the
#     .Lcbc_* labels, compensates for the 8 bytes that pushfq leaves on
#     the stack between the *_pushf and *_popf labels, then reloads
#     Rbx, Rbp and R12-R15 from the frame found through the saved
#     stack pointer.
#   - .Lcommon_seh_exit: tail shared by both handlers (it sits inside
#     cbc_se_handler's .size range).  It copies the adjusted CONTEXT to
#     disp->ContextRecord, calls RtlVirtualUnwind and returns 1
#     (ExceptionContinueSearch).
#   - .pdata/.xdata records for the four functions named above.
# Both handlers also write Rsp, Rsi and Rdi back into the context.
942 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
943 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
944 if ($win64) {
945 $rec="%rcx";        # register holding the handler's "rec" argument
946 $frame="%rdx";      # register holding the handler's "frame" argument
947 $context="%r8";     # register holding the handler's "context" argument
948 $disp="%r9";        # register holding the handler's "disp" argument
949
# Numeric displacements applied to $context and $disp below are byte
# offsets of the fields named in the per-line comments.
950 $code.=<<___;
951 .extern __imp_RtlVirtualUnwind
952 .type   common_se_handler,\@abi-omnipotent
953 .align  16
954 common_se_handler:
955         push    %rsi
956         push    %rdi
957         push    %rbx
958         push    %rbp
959         push    %r12
960         push    %r13
961         push    %r14
962         push    %r15
963         pushfq
964         lea     -64(%rsp),%rsp
965
966         mov     120($context),%rax      # pull context->Rax
967         mov     248($context),%rbx      # pull context->Rip
968
969         mov     8($disp),%rsi           # disp->ImageBase
970         mov     56($disp),%r11          # disp->HandlerData
971
972         mov     0(%r11),%r10d           # HandlerData[0]
973         lea     (%rsi,%r10),%r10        # prologue label
974         cmp     %r10,%rbx               # context->Rip<prologue label
975         jb      .Lin_prologue
976
977         mov     152($context),%rax      # pull context->Rsp
978
979         mov     4(%r11),%r10d           # HandlerData[1]
980         lea     (%rsi,%r10),%r10        # epilogue label
981         cmp     %r10,%rbx               # context->Rip>=epilogue label
982         jae     .Lin_prologue
983
984         lea     40(%rax),%rax
985         mov     -8(%rax),%rbx
986         mov     -16(%rax),%rbp
987         mov     -24(%rax),%r13
988         mov     -32(%rax),%r14
989         mov     -40(%rax),%r15
990         mov     %rbx,144($context)      # restore context->Rbx
991         mov     %rbp,160($context)      # restore context->Rbp
992         mov     %r13,224($context)      # restore context->R13
993         mov     %r14,232($context)      # restore context->R14
994         mov     %r15,240($context)      # restore context->R15
995
996 .Lin_prologue:
997         mov     8(%rax),%rdi
998         mov     16(%rax),%rsi
999         mov     %rax,152($context)      # restore context->Rsp
1000         mov     %rsi,168($context)      # restore context->Rsi
1001         mov     %rdi,176($context)      # restore context->Rdi
1002
1003         jmp     .Lcommon_seh_exit
1004 .size   common_se_handler,.-common_se_handler
1005
1006 .type   cbc_se_handler,\@abi-omnipotent
1007 .align  16
1008 cbc_se_handler:
1009         push    %rsi
1010         push    %rdi
1011         push    %rbx
1012         push    %rbp
1013         push    %r12
1014         push    %r13
1015         push    %r14
1016         push    %r15
1017         pushfq
1018         lea     -64(%rsp),%rsp
1019
1020         mov     120($context),%rax      # pull context->Rax
1021         mov     248($context),%rbx      # pull context->Rip
1022
1023         lea     .Lcbc_prologue(%rip),%r10
1024         cmp     %r10,%rbx               # context->Rip<.Lcbc_prologue
1025         jb      .Lin_cbc_prologue
1026
1027         lea     .Lcbc_body(%rip),%r10
1028         cmp     %r10,%rbx               # context->Rip<.Lcbc_body
1029         jb      .Lin_cbc_frame_setup
1030
1031         mov     152($context),%rax      # pull context->Rsp
1032
1033         lea     .Lcbc_abort(%rip),%r10
1034         cmp     %r10,%rbx               # context->Rip>=.Lcbc_abort
1035         jae     .Lin_cbc_prologue
1036
1037         # handle pushf/popf in Camellia_cbc_encrypt
1038         lea     .Lcbc_enc_pushf(%rip),%r10
1039         cmp     %r10,%rbx               # context->Rip<=.Lcbc_enc_pushf
1040         jbe     .Lin_cbc_no_flag
1041         lea     8(%rax),%rax
1042         lea     .Lcbc_enc_popf(%rip),%r10
1043         cmp     %r10,%rbx               # context->Rip<.Lcbc_enc_popf
1044         jb      .Lin_cbc_no_flag
1045         lea     -8(%rax),%rax
1046         lea     .Lcbc_dec_pushf(%rip),%r10
1047         cmp     %r10,%rbx               # context->Rip<=.Lcbc_dec_pushf
1048         jbe     .Lin_cbc_no_flag
1049         lea     8(%rax),%rax
1050         lea     .Lcbc_dec_popf(%rip),%r10
1051         cmp     %r10,%rbx               # context->Rip<.Lcbc_dec_popf
1052         jb      .Lin_cbc_no_flag
1053         lea     -8(%rax),%rax
1054
1055 .Lin_cbc_no_flag:
1056         mov     48(%rax),%rax           # $_rsp
1057         lea     48(%rax),%rax
1058
1059 .Lin_cbc_frame_setup:
1060         mov     -8(%rax),%rbx
1061         mov     -16(%rax),%rbp
1062         mov     -24(%rax),%r12
1063         mov     -32(%rax),%r13
1064         mov     -40(%rax),%r14
1065         mov     -48(%rax),%r15
1066         mov     %rbx,144($context)      # restore context->Rbx
1067         mov     %rbp,160($context)      # restore context->Rbp
1068         mov     %r12,216($context)      # restore context->R12
1069         mov     %r13,224($context)      # restore context->R13
1070         mov     %r14,232($context)      # restore context->R14
1071         mov     %r15,240($context)      # restore context->R15
1072
1073 .Lin_cbc_prologue:
1074         mov     8(%rax),%rdi
1075         mov     16(%rax),%rsi
1076         mov     %rax,152($context)      # restore context->Rsp
1077         mov     %rsi,168($context)      # restore context->Rsi
1078         mov     %rdi,176($context)      # restore context->Rdi
1079
1080 .align  4
1081 .Lcommon_seh_exit:
1082
1083         mov     40($disp),%rdi          # disp->ContextRecord
1084         mov     $context,%rsi           # context
1085         mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
1086         .long   0xa548f3fc              # cld; rep movsq
1087
1088         mov     $disp,%rsi
1089         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1090         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1091         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1092         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1093         mov     40(%rsi),%r10           # disp->ContextRecord
1094         lea     56(%rsi),%r11           # &disp->HandlerData
1095         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1096         mov     %r10,32(%rsp)           # arg5
1097         mov     %r11,40(%rsp)           # arg6
1098         mov     %r12,48(%rsp)           # arg7
1099         mov     %rcx,56(%rsp)           # arg8, (NULL)
1100         call    *__imp_RtlVirtualUnwind(%rip)
1101
1102         mov     \$1,%eax                # ExceptionContinueSearch
1103         lea     64(%rsp),%rsp
1104         popfq
1105         pop     %r15
1106         pop     %r14
1107         pop     %r13
1108         pop     %r12
1109         pop     %rbp
1110         pop     %rbx
1111         pop     %rdi
1112         pop     %rsi
1113         ret
1114 .size   cbc_se_handler,.-cbc_se_handler
1115
1116 .section        .pdata
1117 .align  4
1118         .rva    .LSEH_begin_Camellia_EncryptBlock_Rounds
1119         .rva    .LSEH_end_Camellia_EncryptBlock_Rounds
1120         .rva    .LSEH_info_Camellia_EncryptBlock_Rounds
1121
1122         .rva    .LSEH_begin_Camellia_DecryptBlock_Rounds
1123         .rva    .LSEH_end_Camellia_DecryptBlock_Rounds
1124         .rva    .LSEH_info_Camellia_DecryptBlock_Rounds
1125
1126         .rva    .LSEH_begin_Camellia_Ekeygen
1127         .rva    .LSEH_end_Camellia_Ekeygen
1128         .rva    .LSEH_info_Camellia_Ekeygen
1129
1130         .rva    .LSEH_begin_Camellia_cbc_encrypt
1131         .rva    .LSEH_end_Camellia_cbc_encrypt
1132         .rva    .LSEH_info_Camellia_cbc_encrypt
1133
1134 .section        .xdata
1135 .align  8
1136 .LSEH_info_Camellia_EncryptBlock_Rounds:
1137         .byte   9,0,0,0
1138         .rva    common_se_handler
1139         .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
1140 .LSEH_info_Camellia_DecryptBlock_Rounds:
1141         .byte   9,0,0,0
1142         .rva    common_se_handler
1143         .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
1144 .LSEH_info_Camellia_Ekeygen:
1145         .byte   9,0,0,0
1146         .rva    common_se_handler
1147         .rva    .Lkey_prologue,.Lkey_epilogue   # HandlerData[]
1148 .LSEH_info_Camellia_cbc_encrypt:
1149         .byte   9,0,0,0
1150         .rva    cbc_se_handler
1151 ___
1152 }
1153
# Final emission step.
# 1. Every backtick-quoted span in $code is handed to eval as a Perl
#    expression and replaced by the result, so constants written as
#    arithmetic (e.g. `1232/8` in the SEH exit path) become plain numbers.
# 2. The resulting text is printed to the currently selected output handle.
# 3. STDOUT is closed explicitly and the script dies if that close fails,
#    so a failed close is reported rather than silently ignored.
1154 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1155 print $code;
1156 close STDOUT or die "error closing STDOUT: $!";