02c52c3efe47ddf35cbf59cac458abcf269cd73a
[openssl.git] / crypto / camellia / asm / cmll-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12 #
13 # This module may be used under the terms of either the GNU General
14 # Public License version 2 or later, the GNU Lesser General Public
15 # License version 2.1 or later, the Mozilla Public License version
16 # 1.1 or the BSD License. The exact terms of either license are
17 # distributed along with this module. For further details see
18 # http://www.openssl.org/~appro/camellia/.
19 # ====================================================================
20
21 # Performance in cycles per processed byte (less is better) in
22 # 'openssl speed ...' benchmark:
23 #
24 #                       AMD64   Core2   EM64T
25 # -evp camellia-128-ecb 16.7    21.0    22.7
26 # + over gcc 3.4.6      +25%    +5%     0%
27 #
28 # camellia-128-cbc      15.7    20.4    21.1
29 #
30 # 128-bit key setup     128     216     205     cycles/key
31 # + over gcc 3.4.6      +54%    +39%    +15%
32 #
33 # Numbers in "+" rows represent performance improvement over compiler
34 # generated code. Key setup timings are impressive on AMD and Core2
35 # thanks to 64-bit operations being covertly deployed. Improvement on
36 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37 # apparently emulates some of 64-bit operations in [32-bit] microcode.
38
# Command line: [flavour] output.  A single argument containing a dot is
# taken to be the output file name, with no flavour given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by flavour or by a .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# Fail loudly if the translator cannot be started instead of silently
# producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
52
# hi(REG): map %eax/%rax .. %edx/%rdx to the high-byte alias (%ah .. %dh).
# Any other operand is returned unchanged.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;    $r; }
# lo(REG): map a register name to its low-byte alias: %eax/%rax -> %al,
# %esi/%rdi style names -> %sil/%dil, %rN/%rNd -> %rNb.
# Any other operand is returned unchanged.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
                      $r =~ s/%[er]([sd]i)/%${1}l/;
                      $r =~ s/%(r[0-9]+)[d]?/%${1}b/;   $r; }
57
# Register allocation shared by all code generators in this file.
# $t0..$t3: 32-bit scratch registers; the round function also addresses
# their low and high bytes through &lo() and &hi().
58 $t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
# @S: the four 32-bit words of the cipher state.
59 @S=("%r8d","%r9d","%r10d","%r11d");
# $i0/$i1: receive zero-extended bytes (movz) and index the lookup tables.
60 $i0="%esi";
61 $i1="%edi";
# $Tbl: loaded with the address of .LCamellia_SBOX before any round is run.
62 $Tbl="%rbp";    # size optimization
# $inp/$out: input and output pointers.  $key/$keyend: current position in
# the key schedule and the position at which the round loops stop.
63 $inp="%r12";
64 $out="%r13";
65 $key="%r14";
66 $keyend="%r15";
# 32-bit register the V1.x entry points read keyBitLength from: %ecx when
# generating for Win64, %edi otherwise.
67 $arg0d=$win64?"%ecx":"%edi";
68
69 # const unsigned int Camellia_SBOX[4][256];
70 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
71 # and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte offsets from .LCamellia_SBOX.  Lookups use an
# index scale of 8 because two 32-bit tables share each 8-byte slot; the
# second pair of tables starts 2048 bytes (256 slots) into the data.
72 $SBOX1_1110=0;          # Camellia_SBOX[0]
73 $SBOX4_4404=4;          # Camellia_SBOX[1]
74 $SBOX2_0222=2048;       # Camellia_SBOX[2]
75 $SBOX3_3033=2052;       # Camellia_SBOX[3]
76
# Camellia_Feistel(ROUND [,SEED])
#
# Appends the code for one Feistel round to $code.
#   ROUND - round index.  Its parity selects which half of @S is mixed into
#           $t0/$t1 and which half receives the result, and it determines
#           the offset of the key words preloaded for the following round.
#   SEED  - byte offset, relative to $key, of the key word for round 0
#           (default 0).  A negative SEED makes the preload step through
#           the key schedule downwards in 8-byte strides instead of upwards.
# On entry to the emitted code $t0/$t1 must already hold this round's key
# words; on exit they hold the key words for round ROUND+1.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=($S[($j)%4],$S[($j+1)%4],$S[($j+2)%4],$S[($j+3)%4]);

$code.=<<___;
        xor     $s0,$t0                         # t0^=key[0]
        xor     $s1,$t1                         # t1^=key[1]
        movz    `&hi("$t0")`,$i0                # (t0>>8)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>0)&0xff
        mov     $SBOX3_3033($Tbl,$i0,8),$t3     # t3=SBOX3_3033[0]
        mov     $SBOX1_1110($Tbl,$i1,8),$t2     # t2=SBOX1_1110[1]
        movz    `&lo("$t0")`,$i0                # (t0>>0)&0xff
        shr     \$16,$t0
        movz    `&hi("$t1")`,$i1                # (t1>>8)&0xff
        xor     $SBOX4_4404($Tbl,$i0,8),$t3     # t3^=SBOX4_4404[0]
        shr     \$16,$t1
        xor     $SBOX4_4404($Tbl,$i1,8),$t2     # t2^=SBOX4_4404[1]
        movz    `&hi("$t0")`,$i0                # (t0>>24)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>16)&0xff
        xor     $SBOX1_1110($Tbl,$i0,8),$t3     # t3^=SBOX1_1110[0]
        xor     $SBOX3_3033($Tbl,$i1,8),$t2     # t2^=SBOX3_3033[1]
        movz    `&lo("$t0")`,$i0                # (t0>>16)&0xff
        movz    `&hi("$t1")`,$i1                # (t1>>24)&0xff
        xor     $SBOX2_0222($Tbl,$i0,8),$t3     # t3^=SBOX2_0222[0]
        xor     $SBOX2_0222($Tbl,$i1,8),$t2     # t2^=SBOX2_0222[1]
        mov     `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
        mov     `$seed+($i+1)*$scale+4`($key),$t0
        xor     $t3,$t2                         # t2^=t3
        ror     \$8,$t3                         # t3=RightRotate(t3,8)
        xor     $t2,$s2
        xor     $t2,$s3
        xor     $t3,$s3
___
}
114
115 # void Camellia_EncryptBlock_Rounds(
116 #               int grandRounds,
117 #               const Byte plaintext[],
118 #               const KEY_TABLE_TYPE keyTable,
119 #               Byte ciphertext[])
#
# The heredoc below starts $code and emits, in order:
# - Camellia_EncryptBlock (V1.x API): reads keyBitLength from $arg0d,
#   replaces it with 3 plus the borrow of 128-keyBitLength (so 4 only when
#   keyBitLength exceeds 128) and jumps to .Lenc_rounds.
# - Camellia_EncryptBlock_Rounds (V2 API): saves %rbx, %rbp and %r13-%r15,
#   copies %rcx to $out and %rdx to $key, sets $keyend=$key+grandRounds*64,
#   loads four 32-bit words through %rsi (the copy into $inp is commented
#   out), byte-swaps them into @S, calls _x86_64_Camellia_encrypt, then
#   byte-swaps @S again and stores it at $out.  The epilogue reloads the
#   five saved registers and releases their 40 bytes of stack.
# - the head of _x86_64_Camellia_encrypt: XORs the first 16 bytes at $key
#   into @S (crosswise within each pair: offset 0 goes to @S[1], 4 to @S[0],
#   8 to @S[3], 12 to @S[2]) and, at .Leloop, preloads the key words at
#   offsets 16 and 20 into $t1/$t0 for the first Feistel round.
120 $code=<<___;
121 .text
122
123 # V1.x API
124 .globl  Camellia_EncryptBlock
125 .type   Camellia_EncryptBlock,\@abi-omnipotent
126 .align  16
127 Camellia_EncryptBlock:
128         movl    \$128,%eax
129         subl    $arg0d,%eax
130         movl    \$3,$arg0d
131         adcl    \$0,$arg0d      # keyBitLength==128?3:4
132         jmp     .Lenc_rounds
133 .size   Camellia_EncryptBlock,.-Camellia_EncryptBlock
134 # V2
135 .globl  Camellia_EncryptBlock_Rounds
136 .type   Camellia_EncryptBlock_Rounds,\@function,4
137 .align  16
138 .Lenc_rounds:
139 Camellia_EncryptBlock_Rounds:
140 .cfi_startproc
141         push    %rbx
142 .cfi_push       %rbx
143         push    %rbp
144 .cfi_push       %rbp
145         push    %r13
146 .cfi_push       %r13
147         push    %r14
148 .cfi_push       %r14
149         push    %r15
150 .cfi_push       %r15
151 .Lenc_prologue:
152
153         #mov    %rsi,$inp               # put away arguments
154         mov     %rcx,$out
155         mov     %rdx,$key
156
157         shl     \$6,%edi                # process grandRounds
158         lea     .LCamellia_SBOX(%rip),$Tbl
159         lea     ($key,%rdi),$keyend
160
161         mov     0(%rsi),@S[0]           # load plaintext
162         mov     4(%rsi),@S[1]
163         mov     8(%rsi),@S[2]
164         bswap   @S[0]
165         mov     12(%rsi),@S[3]
166         bswap   @S[1]
167         bswap   @S[2]
168         bswap   @S[3]
169
170         call    _x86_64_Camellia_encrypt
171
172         bswap   @S[0]
173         bswap   @S[1]
174         bswap   @S[2]
175         mov     @S[0],0($out)
176         bswap   @S[3]
177         mov     @S[1],4($out)
178         mov     @S[2],8($out)
179         mov     @S[3],12($out)
180
181         mov     0(%rsp),%r15
182 .cfi_restore    %r15
183         mov     8(%rsp),%r14
184 .cfi_restore    %r14
185         mov     16(%rsp),%r13
186 .cfi_restore    %r13
187         mov     24(%rsp),%rbp
188 .cfi_restore    %rbp
189         mov     32(%rsp),%rbx
190 .cfi_restore    %rbx
191         lea     40(%rsp),%rsp
192 .cfi_adjust_cfa_offset  -40
193 .Lenc_epilogue:
194         ret
195 .cfi_endproc
196 .size   Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
197
198 .type   _x86_64_Camellia_encrypt,\@abi-omnipotent
199 .align  16
200 _x86_64_Camellia_encrypt:
201         xor     0($key),@S[1]
202         xor     4($key),@S[0]           # ^=key[0-3]
203         xor     8($key),@S[3]
204         xor     12($key),@S[2]
205 .align  16
206 .Leloop:
207         mov     16($key),$t1            # prefetch key[4-5]
208         mov     20($key),$t0
209
210 ___
# Six unrolled Feistel rounds per .Leloop iteration.  With a seed of 16,
# round $i preloads the key words for round $i+1 from offset 16+($i+1)*8.
211         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
# Tail of _x86_64_Camellia_encrypt, followed by the decryption entry points.
# - After the six rounds $key advances by 64 bytes.  When it equals $keyend
#   control goes to .Ledone; otherwise the and/rol and or/xor mixing step
#   annotated in the assembly is applied (presumably Camellia's FL/FL^-1
#   layer -- confirm against the specification) and the loop repeats.
# - .Ledone XORs the four key words preloaded last into the state, swaps
#   the two 64-bit halves and returns.
# - Camellia_DecryptBlock and Camellia_DecryptBlock_Rounds mirror the
#   encryption entry points, except that %rdx (the start of the key table)
#   goes to $keyend and $key starts at $keyend+grandRounds*64, so the key
#   schedule is walked backwards.
# - The head of _x86_64_Camellia_decrypt XORs 16 bytes at $key into @S and,
#   at .Ldloop, preloads the key words at offsets -8 and -4.
212 $code.=<<___;
213         lea     16*4($key),$key
214         cmp     $keyend,$key
215         mov     8($key),$t3             # prefetch key[2-3]
216         mov     12($key),$t2
217         je      .Ledone
218
219         and     @S[0],$t0
220         or      @S[3],$t3
221         rol     \$1,$t0
222         xor     $t3,@S[2]               # s2^=s3|key[3];
223         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
224         and     @S[2],$t2
225         or      @S[1],$t1
226         rol     \$1,$t2
227         xor     $t1,@S[0]               # s0^=s1|key[1];
228         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
229         jmp     .Leloop
230
231 .align  16
232 .Ledone:
233         xor     @S[2],$t0               # SwapHalf
234         xor     @S[3],$t1
235         xor     @S[0],$t2
236         xor     @S[1],$t3
237
238         mov     $t0,@S[0]
239         mov     $t1,@S[1]
240         mov     $t2,@S[2]
241         mov     $t3,@S[3]
242
243         .byte   0xf3,0xc3               # rep ret
244 .size   _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
245
246 # V1.x API
247 .globl  Camellia_DecryptBlock
248 .type   Camellia_DecryptBlock,\@abi-omnipotent
249 .align  16
250 Camellia_DecryptBlock:
251         movl    \$128,%eax
252         subl    $arg0d,%eax
253         movl    \$3,$arg0d
254         adcl    \$0,$arg0d      # keyBitLength==128?3:4
255         jmp     .Ldec_rounds
256 .size   Camellia_DecryptBlock,.-Camellia_DecryptBlock
257 # V2
258 .globl  Camellia_DecryptBlock_Rounds
259 .type   Camellia_DecryptBlock_Rounds,\@function,4
260 .align  16
261 .Ldec_rounds:
262 Camellia_DecryptBlock_Rounds:
263 .cfi_startproc
264         push    %rbx
265 .cfi_push       %rbx
266         push    %rbp
267 .cfi_push       %rbp
268         push    %r13
269 .cfi_push       %r13
270         push    %r14
271 .cfi_push       %r14
272         push    %r15
273 .cfi_push       %r15
274 .Ldec_prologue:
275
276         #mov    %rsi,$inp               # put away arguments
277         mov     %rcx,$out
278         mov     %rdx,$keyend
279
280         shl     \$6,%edi                # process grandRounds
281         lea     .LCamellia_SBOX(%rip),$Tbl
282         lea     ($keyend,%rdi),$key
283
284         mov     0(%rsi),@S[0]           # load plaintext
285         mov     4(%rsi),@S[1]
286         mov     8(%rsi),@S[2]
287         bswap   @S[0]
288         mov     12(%rsi),@S[3]
289         bswap   @S[1]
290         bswap   @S[2]
291         bswap   @S[3]
292
293         call    _x86_64_Camellia_decrypt
294
295         bswap   @S[0]
296         bswap   @S[1]
297         bswap   @S[2]
298         mov     @S[0],0($out)
299         bswap   @S[3]
300         mov     @S[1],4($out)
301         mov     @S[2],8($out)
302         mov     @S[3],12($out)
303
304         mov     0(%rsp),%r15
305 .cfi_restore    %r15
306         mov     8(%rsp),%r14
307 .cfi_restore    %r14
308         mov     16(%rsp),%r13
309 .cfi_restore    %r13
310         mov     24(%rsp),%rbp
311 .cfi_restore    %rbp
312         mov     32(%rsp),%rbx
313 .cfi_restore    %rbx
314         lea     40(%rsp),%rsp
315 .cfi_adjust_cfa_offset  -40
316 .Ldec_epilogue:
317         ret
318 .cfi_endproc
319 .size   Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
320
321 .type   _x86_64_Camellia_decrypt,\@abi-omnipotent
322 .align  16
323 _x86_64_Camellia_decrypt:
324         xor     0($key),@S[1]
325         xor     4($key),@S[0]           # ^=key[0-3]
326         xor     8($key),@S[3]
327         xor     12($key),@S[2]
328 .align  16
329 .Ldloop:
330         mov     -8($key),$t1            # prefetch key[4-5]
331         mov     -4($key),$t0
332
333 ___
# Six unrolled Feistel rounds per .Ldloop iteration.  The negative seed
# makes Camellia_Feistel step through the keys downwards: round $i preloads
# the key words for round $i+1 from offset -8-($i+1)*8.
334         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
# Tail of _x86_64_Camellia_decrypt.  $key steps back by 64 bytes per loop
# iteration until it equals $keyend.  The mixing step between groups of six
# rounds and the final key XOR plus half swap at .Lddone parallel the
# encryption routine, with the extra key words fetched from offsets 0 and 4
# of the new $key.
335 $code.=<<___;
336         lea     -16*4($key),$key
337         cmp     $keyend,$key
338         mov     0($key),$t3             # prefetch key[2-3]
339         mov     4($key),$t2
340         je      .Lddone
341
342         and     @S[0],$t0
343         or      @S[3],$t3
344         rol     \$1,$t0
345         xor     $t3,@S[2]               # s2^=s3|key[3];
346         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
347         and     @S[2],$t2
348         or      @S[1],$t1
349         rol     \$1,$t2
350         xor     $t1,@S[0]               # s0^=s1|key[1];
351         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
352
353         jmp     .Ldloop
354
355 .align  16
356 .Lddone:
357         xor     @S[2],$t2
358         xor     @S[3],$t3
359         xor     @S[0],$t0
360         xor     @S[1],$t1
361
362         mov     $t2,@S[0]               # SwapHalf
363         mov     $t3,@S[1]
364         mov     $t0,@S[2]
365         mov     $t1,@S[3]
366
367         .byte   0xf3,0xc3               # rep ret
368 .size   _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
369 ___
370
# _saveround(RND, KEY, [BIAS,] REG...)
#
# Appends stores of key-schedule registers to $code.
#   RND  - slot index; the target offset is BIAS+RND*8.
#   KEY  - base register of the key table.
#   BIAS - optional integer displacement (callers pass -128 after moving
#          the base pointer forward by 128); recognised by being a plain
#          integer rather than a register name, so an explicit 0 works too.
#   REG  - four 32-bit registers (stored with the words of each pair
#          swapped: REG[1] at +0, REG[0] at +4, REG[3] at +8, REG[2] at
#          +12), or one or two registers stored at +0 and +8.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

    if (@T==4) {
        $code.=<<___;
        mov     $T[1],`$bias+$rnd*8+0`($key)
        mov     $T[0],`$bias+$rnd*8+4`($key)
        mov     $T[3],`$bias+$rnd*8+8`($key)
        mov     $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
        $code.="        mov     $T[0],`$bias+$rnd*8+0`($key)\n";
        $code.="        mov     $T[1],`$bias+$rnd*8+8`($key)\n" if (@T>=2);
    }
}
387
# _loadround(RND, KEY, [BIAS,] REG [,REG])
#
# Appends loads of one or two registers from the key table to $code: the
# first register from offset BIAS+RND*8, the second (if given) from
# BIAS+RND*8+8.  BIAS is an optional integer displacement, recognised by
# being a plain integer rather than a register name (default 0).
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0] =~ /^[-+]?\d+$/)?shift(@T):0;

$code.="        mov     `$bias+$rnd*8+0`($key),$T[0]\n";
$code.="        mov     `$bias+$rnd*8+8`($key),$T[1]\n" if (@T>=2);
}
395
396 # shld is very slow on Intel EM64T family. Even on AMD it limits
397 # instruction decode rate [because it's VectorPath] and consequently
398 # performance...
#
# __rotl128(I0, I1, ROT): appends a left rotation by ROT bits of the 128-bit
# value held in the 64-bit register pair I0:I1, built from two shld
# instructions with %r11 as scratch.  Nothing is emitted when ROT is 0.
# No caller is visible in this part of the file; it appears to be kept as
# the reference variant for the remark above.
399 sub __rotl128 {
400 my ($i0,$i1,$rot)=@_;
401
402     if ($rot) {
403         $code.=<<___;
404         mov     $i0,%r11
405         shld    \$$rot,$i1,$i0
406         shld    \$$rot,%r11,$i1
407 ___
408     }
409 }
410
411 # ... Implementing 128-bit rotate without shld gives 80% better
412 # performance on EM64T, +15% on AMD64 and only ~7% degradation on
413 # Core2. This is therefore preferred.
#
# _rotl128(I0, I1, ROT): same contract as __rotl128, implemented with
# shl/shr/or.  The emitted code clobbers %r9 and %r11, so neither may be
# passed as I0 or I1.  ROT is also used as 64-ROT for the right shifts;
# callers in this file pass values between 15 and 51.
414 sub _rotl128 {
415 my ($i0,$i1,$rot)=@_;
416
417     if ($rot) {
418         $code.=<<___;
419         mov     $i0,%r11
420         shl     \$$rot,$i0
421         mov     $i1,%r9
422         shr     \$`64-$rot`,%r9
423         shr     \$`64-$rot`,%r11
424         or      %r9,$i0
425         shl     \$$rot,$i1
426         or      %r11,$i1
427 ___
428     }
429 }
430
# Generator for Camellia_Ekeygen, the key schedule set-up routine.
# Judging by the register use below, its arguments are keyBitLength (%edi),
# the raw key (%rsi) and the key table to fill (%rdx); it returns 3 in %eax
# for 128-bit keys and 4 otherwise.
# $step counts the Feistel rounds emitted so far.  It is passed to
# Camellia_Feistel as the round index, which selects the preload offset
# into .LCamellia_SIGMA and which half of @S is updated.
431 { my $step=0;
432
# Prologue: save callee-saved registers, keep keyBitLength in ${keyend}d
# and the key table pointer in $out, then load and byte-swap the first
# 128 bits of the raw key into @S.
433 $code.=<<___;
434 .globl  Camellia_Ekeygen
435 .type   Camellia_Ekeygen,\@function,3
436 .align  16
437 Camellia_Ekeygen:
438 .cfi_startproc
439         push    %rbx
440 .cfi_push       %rbx
441         push    %rbp
442 .cfi_push       %rbp
443         push    %r13
444 .cfi_push       %r13
445         push    %r14
446 .cfi_push       %r14
447         push    %r15
448 .cfi_push       %r15
449 .Lkey_prologue:
450
451         mov     %edi,${keyend}d         # put away arguments, keyBitLength
452         mov     %rdx,$out               # keyTable
453
454         mov     0(%rsi),@S[0]           # load 0-127 bits
455         mov     4(%rsi),@S[1]
456         mov     8(%rsi),@S[2]
457         mov     12(%rsi),@S[3]
458
459         bswap   @S[0]
460         bswap   @S[1]
461         bswap   @S[2]
462         bswap   @S[3]
463 ___
464         &_saveround     (0,$out,@S);    # KL<<<0
# Key-length dispatch: 128-bit keys skip straight to .L1st128.  Otherwise
# bits 128-191 are loaded; for 192-bit keys the remaining 64 bits are
# formed as the bitwise complement of those, for 256-bit keys they are
# loaded from the raw key.
465 $code.=<<___;
466         cmp     \$128,$keyend           # check keyBitLength
467         je      .L1st128
468
469         mov     16(%rsi),@S[0]          # load 128-191 bits
470         mov     20(%rsi),@S[1]
471         cmp     \$192,$keyend
472         je      .L1st192
473         mov     24(%rsi),@S[2]          # load 192-255 bits
474         mov     28(%rsi),@S[3]
475         jmp     .L1st256
476 .L1st192:
477         mov     @S[0],@S[2]
478         mov     @S[1],@S[3]
479         not     @S[2]
480         not     @S[3]
481 .L1st256:
482         bswap   @S[0]
483         bswap   @S[1]
484         bswap   @S[2]
485         bswap   @S[3]
486 ___
487         &_saveround     (4,$out,@S);    # temp storage for KR!
# From .L1st128 on, $key points at .LCamellia_SIGMA, so the Feistel rounds
# below take their round keys from that constant table.
488 $code.=<<___;
489         xor     0($out),@S[1]           # KR^KL
490         xor     4($out),@S[0]
491         xor     8($out),@S[3]
492         xor     12($out),@S[2]
493
494 .L1st128:
495         lea     .LCamellia_SIGMA(%rip),$key
496         lea     .LCamellia_SBOX(%rip),$Tbl
497
498         mov     0($key),$t1
499         mov     4($key),$t0
500 ___
# Two rounds, XOR with KL, two more rounds: @S then holds what the code
# below stores as KA.
501         &Camellia_Feistel($step++);
502         &Camellia_Feistel($step++);
503 $code.=<<___;
504         xor     0($out),@S[1]           # ^KL
505         xor     4($out),@S[0]
506         xor     8($out),@S[3]
507         xor     12($out),@S[2]
508 ___
509         &Camellia_Feistel($step++);
510         &Camellia_Feistel($step++);
# 128-bit path: advance $out by 128 so every slot is reachable with a short
# displacement, and pack the four 32-bit words of KA into %r8 and %r10.
511 $code.=<<___;
512         cmp     \$128,$keyend
513         jne     .L2nd256
514
515         lea     128($out),$out          # size optimization
516         shl     \$32,%r8                # @S[0]||
517         shl     \$32,%r10               # @S[2]||
518         or      %r9,%r8                 # ||@S[1]
519         or      %r11,%r10               # ||@S[3]
520 ___
# Subkeys for 128-bit keys: KL lives in %rax:%rbx, KA in %r8:%r10.  Each is
# rotated incrementally; the trailing comments track the cumulative amount.
521         &_loadround     (0,$out,-128,"%rax","%rbx");    # KL
522         &_saveround     (2,$out,-128,"%r8","%r10");     # KA<<<0
523         &_rotl128       ("%rax","%rbx",15);
524         &_saveround     (4,$out,-128,"%rax","%rbx");    # KL<<<15
525         &_rotl128       ("%r8","%r10",15);
526         &_saveround     (6,$out,-128,"%r8","%r10");     # KA<<<15
527         &_rotl128       ("%r8","%r10",15);              # 15+15=30
528         &_saveround     (8,$out,-128,"%r8","%r10");     # KA<<<30
529         &_rotl128       ("%rax","%rbx",30);             # 15+30=45
530         &_saveround     (10,$out,-128,"%rax","%rbx");   # KL<<<45
531         &_rotl128       ("%r8","%r10",15);              # 30+15=45
532         &_saveround     (12,$out,-128,"%r8");           # KA<<<45
533         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
534         &_saveround     (13,$out,-128,"%rbx");          # KL<<<60
535         &_rotl128       ("%r8","%r10",15);              # 45+15=60
536         &_saveround     (14,$out,-128,"%r8","%r10");    # KA<<<60
537         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
538         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<77
539         &_rotl128       ("%rax","%rbx",17);             # 77+17=94
540         &_saveround     (18,$out,-128,"%rax","%rbx");   # KL<<<94
541         &_rotl128       ("%r8","%r10",34);              # 60+34=94
542         &_saveround     (20,$out,-128,"%r8","%r10");    # KA<<<94
543         &_rotl128       ("%rax","%rbx",17);             # 94+17=111
544         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<111
545         &_rotl128       ("%r8","%r10",17);              # 94+17=111
546         &_saveround     (24,$out,-128,"%r8","%r10");    # KA<<<111
# Return value for 128-bit keys.
547 $code.=<<___;
548         mov     \$3,%eax
549         jmp     .Ldone
550 .align  16
551 .L2nd256:
552 ___
# 192/256-bit path: park KA in slot 6, XOR the state with KR (slots 4-5)
# and run two more rounds to obtain what is stored below as KB.
553         &_saveround     (6,$out,@S);    # temp storage for KA!
554 $code.=<<___;
555         xor     `4*8+0`($out),@S[1]     # KA^KR
556         xor     `4*8+4`($out),@S[0]
557         xor     `5*8+0`($out),@S[3]
558         xor     `5*8+4`($out),@S[2]
559 ___
560         &Camellia_Feistel($step++);
561         &Camellia_Feistel($step++);
562
# %r14 and %r15 ($key and $keyend) are reused as data registers from here
# on; neither the SIGMA pointer nor keyBitLength is read again on this path.
563         &_loadround     (0,$out,"%rax","%rbx"); # KL
564         &_loadround     (4,$out,"%rcx","%rdx"); # KR
565         &_loadround     (6,$out,"%r14","%r15"); # KA
566 $code.=<<___;
567         lea     128($out),$out          # size optimization
568         shl     \$32,%r8                # @S[0]||
569         shl     \$32,%r10               # @S[2]||
570         or      %r9,%r8                 # ||@S[1]
571         or      %r11,%r10               # ||@S[3]
572 ___
# Subkeys for 192/256-bit keys: KL in %rax:%rbx, KR in %rcx:%rdx, KA in
# %r14:%r15 and KB in %r8:%r10, again rotated incrementally.
573         &_saveround     (2,$out,-128,"%r8","%r10");     # KB<<<0
574         &_rotl128       ("%rcx","%rdx",15);
575         &_saveround     (4,$out,-128,"%rcx","%rdx");    # KR<<<15
576         &_rotl128       ("%r14","%r15",15);
577         &_saveround     (6,$out,-128,"%r14","%r15");    # KA<<<15
578         &_rotl128       ("%rcx","%rdx",15);             # 15+15=30
579         &_saveround     (8,$out,-128,"%rcx","%rdx");    # KR<<<30
580         &_rotl128       ("%r8","%r10",30);
581         &_saveround     (10,$out,-128,"%r8","%r10");    # KB<<<30
582         &_rotl128       ("%rax","%rbx",45);
583         &_saveround     (12,$out,-128,"%rax","%rbx");   # KL<<<45
584         &_rotl128       ("%r14","%r15",30);             # 15+30=45
585         &_saveround     (14,$out,-128,"%r14","%r15");   # KA<<<45
586         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
587         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<60
588         &_rotl128       ("%rcx","%rdx",30);             # 30+30=60
589         &_saveround     (18,$out,-128,"%rcx","%rdx");   # KR<<<60
590         &_rotl128       ("%r8","%r10",30);              # 30+30=60
591         &_saveround     (20,$out,-128,"%r8","%r10");    # KB<<<60
592         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
593         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<77
594         &_rotl128       ("%r14","%r15",32);             # 45+32=77
595         &_saveround     (24,$out,-128,"%r14","%r15");   # KA<<<77
596         &_rotl128       ("%rcx","%rdx",34);             # 60+34=94
597         &_saveround     (26,$out,-128,"%rcx","%rdx");   # KR<<<94
598         &_rotl128       ("%r14","%r15",17);             # 77+17=94
599         &_saveround     (28,$out,-128,"%r14","%r15");   # KA<<<94
600         &_rotl128       ("%rax","%rbx",34);             # 77+34=111
601         &_saveround     (30,$out,-128,"%rax","%rbx");   # KL<<<111
602         &_rotl128       ("%r8","%r10",51);              # 60+51=111
603         &_saveround     (32,$out,-128,"%r8","%r10");    # KB<<<111
# Return value for 192/256-bit keys, then the epilogue shared by both paths:
# reload the five saved registers and release their 40 bytes of stack.
604 $code.=<<___;
605         mov     \$4,%eax
606 .Ldone:
607         mov     0(%rsp),%r15
608 .cfi_restore    %r15
609         mov     8(%rsp),%r14
610 .cfi_restore    %r14
611         mov     16(%rsp),%r13
612 .cfi_restore    %r13
613         mov     24(%rsp),%rbp
614 .cfi_restore    %rbp
615         mov     32(%rsp),%rbx
616 .cfi_restore    %rbx
617         lea     40(%rsp),%rsp
618 .cfi_adjust_cfa_offset  -40
619 .Lkey_epilogue:
620         ret
621 .cfi_endproc
622 .size   Camellia_Ekeygen,.-Camellia_Ekeygen
623 ___
624 }
625
# 256-entry byte substitution table.  The four 32-bit lookup tables emitted
# at .LCamellia_SBOX are derived from it by the S1110/S4404/S0222/S3033
# helpers (index or value rotation plus byte replication).
626 @SBOX=(
627 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
628  35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
629 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
630 166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
631 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
632 223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
633  20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
634 254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
635 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
636  16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
637 135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
638  82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
639 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
640 120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
641 114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
642  64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
643
# Table-entry generators.  Each takes an index 0..255 and returns the 32-bit
# table word as a "0x%08x" string.  The digits in the name show which byte
# lanes (most significant first) carry the substituted value; 0 marks a
# zero lane.
# S1110: SBOX[i] replicated into the three upper bytes.
sub S1110 { my $i=shift; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
# S4404: SBOX[i rotated left by 1 bit] in bytes 3, 2 and 0.
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
# S0222: SBOX[i] rotated left by 1 bit, in the three lower bytes.
sub S0222 { my $i=shift; $i=$SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
# S3033: SBOX[i] rotated right by 1 bit, in bytes 3, 1 and 0.
sub S3033 { my $i=shift; $i=$SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }
648
# Constant data.  .LCamellia_SIGMA holds three rows of constants that
# Camellia_Ekeygen feeds to the Feistel rounds as round keys, followed by a
# row of zeros; .LCamellia_SBOX follows immediately and is filled in by the
# loops below.
649 $code.=<<___;
650 .align  64
651 .LCamellia_SIGMA:
652 .long   0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
653 .long   0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
654 .long   0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
655 .long   0,          0,          0,          0
656 .LCamellia_SBOX:
657 ___
658 # tables are interleaved, remember?
# data_word(LIST): append one ".long" directive with the values joined by
# commas.  Each loop below emits 256 eight-byte slots (2048 bytes): first
# the S1110/S4404 pairs, then the S0222/S3033 pairs, matching the
# $SBOX*_* offsets defined near the top of the file.
659 sub data_word { $code.=".long\t".join(',',@_)."\n"; }
660 for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
661 for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
662
663 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
664 #                       size_t length, const CAMELLIA_KEY *key,
665 #                       unsigned char *ivp,const int enc);
666 {
# Stack frame slots, addressed off the realigned %rsp:
#   $_key - key schedule position to "rewind" to after every block
#   $_end - input address at which the block loop stops
#   $_res - number of bytes left over after the last full block
#   $ivec - 16-byte scratch block (saved IV, or the padded / partial tail)
#   $_ivp - caller's ivp argument
#   $_rsp - stack pointer as it was right after the register pushes
667 $_key="0(%rsp)";
668 $_end="8(%rsp)";        # inp+len&~15
669 $_res="16(%rsp)";       # len&15
670 $ivec="24(%rsp)";
671 $_ivp="40(%rsp)";
672 $_rsp="48(%rsp)";
673
# Camellia_cbc_encrypt is generated as a single heredoc:
# - it returns immediately (.Lcbc_abort) when the length in %rdx is zero;
# - it saves %rbx, %rbp and %r12-%r15, keeps the resulting %rsp in %rbp,
#   carves out a 64-byte-aligned frame and then lowers it by up to 0x3C0
#   bytes depending on the key schedule address (see the cache
#   associativity note in the assembly);
# - grandRounds is read from offset 272 of the key structure and turned
#   into $keyend=$key+grandRounds*64;
# - .Lcbc_prefetch_sbox reads through the 4096-byte S-box table once
#   (32 iterations of 128 bytes);
# - enc!=0: .Lcbc_eloop XORs each input block into the running IV held in
#   @S, encrypts it and stores it.  A length residue is handled at
#   .Lcbc_enc_tail by copying the remaining bytes into a zeroed $ivec and
#   running the loop once more on that buffer;
# - enc==0: .LCBC_DECRYPT swaps $key and $keyend, keeps the current
#   ciphertext block in %rax:%rbx as the next IV and, for a residue, copies
#   only the remaining bytes out of $ivec (.Lcbc_dec_tail);
# - the "8+$ivec" operands between pushfq and popfq compensate for the
#   8 bytes pushfq takes off %rsp;
# - .Lcbc_done reloads the saved registers through the pointer kept in
#   $_rsp and releases the frame.
674 $code.=<<___;
675 .globl  Camellia_cbc_encrypt
676 .type   Camellia_cbc_encrypt,\@function,6
677 .align  16
678 Camellia_cbc_encrypt:
679 .cfi_startproc
680         cmp     \$0,%rdx
681         je      .Lcbc_abort
682         push    %rbx
683 .cfi_push       %rbx
684         push    %rbp
685 .cfi_push       %rbp
686         push    %r12
687 .cfi_push       %r12
688         push    %r13
689 .cfi_push       %r13
690         push    %r14
691 .cfi_push       %r14
692         push    %r15
693 .cfi_push       %r15
694 .Lcbc_prologue:
695
696         mov     %rsp,%rbp
697 .cfi_def_cfa_register   %rbp
698         sub     \$64,%rsp
699         and     \$-64,%rsp
700
701         # place stack frame just "above mod 1024" the key schedule,
702         # this ensures that cache associativity suffices
703         lea     -64-63(%rcx),%r10
704         sub     %rsp,%r10
705         neg     %r10
706         and     \$0x3C0,%r10
707         sub     %r10,%rsp
708         #add    \$8,%rsp                # 8 is reserved for callee's ra
709
710         mov     %rdi,$inp               # inp argument
711         mov     %rsi,$out               # out argument
712         mov     %r8,%rbx                # ivp argument
713         mov     %rcx,$key               # key argument
714         mov     272(%rcx),${keyend}d    # grandRounds
715
716         mov     %r8,$_ivp
717         mov     %rbp,$_rsp
718 .cfi_cfa_expression     $_rsp,deref,+56
719
720 .Lcbc_body:
721         lea     .LCamellia_SBOX(%rip),$Tbl
722
723         mov     \$32,%ecx
724 .align  4
725 .Lcbc_prefetch_sbox:
726         mov     0($Tbl),%rax
727         mov     32($Tbl),%rsi
728         mov     64($Tbl),%rdi
729         mov     96($Tbl),%r11
730         lea     128($Tbl),$Tbl
731         loop    .Lcbc_prefetch_sbox
732         sub     \$4096,$Tbl
733         shl     \$6,$keyend
734         mov     %rdx,%rcx               # len argument
735         lea     ($key,$keyend),$keyend
736
737         cmp     \$0,%r9d                # enc argument
738         je      .LCBC_DECRYPT
739
740         and     \$-16,%rdx
741         and     \$15,%rcx               # length residue
742         lea     ($inp,%rdx),%rdx
743         mov     $key,$_key
744         mov     %rdx,$_end
745         mov     %rcx,$_res
746
747         cmp     $inp,%rdx
748         mov     0(%rbx),@S[0]           # load IV
749         mov     4(%rbx),@S[1]
750         mov     8(%rbx),@S[2]
751         mov     12(%rbx),@S[3]
752         je      .Lcbc_enc_tail
753         jmp     .Lcbc_eloop
754
755 .align  16
756 .Lcbc_eloop:
757         xor     0($inp),@S[0]
758         xor     4($inp),@S[1]
759         xor     8($inp),@S[2]
760         bswap   @S[0]
761         xor     12($inp),@S[3]
762         bswap   @S[1]
763         bswap   @S[2]
764         bswap   @S[3]
765
766         call    _x86_64_Camellia_encrypt
767
768         mov     $_key,$key              # "rewind" the key
769         bswap   @S[0]
770         mov     $_end,%rdx
771         bswap   @S[1]
772         mov     $_res,%rcx
773         bswap   @S[2]
774         mov     @S[0],0($out)
775         bswap   @S[3]
776         mov     @S[1],4($out)
777         mov     @S[2],8($out)
778         lea     16($inp),$inp
779         mov     @S[3],12($out)
780         cmp     %rdx,$inp
781         lea     16($out),$out
782         jne     .Lcbc_eloop
783
784         cmp     \$0,%rcx
785         jne     .Lcbc_enc_tail
786
787         mov     $_ivp,$out
788         mov     @S[0],0($out)           # write out IV residue
789         mov     @S[1],4($out)
790         mov     @S[2],8($out)
791         mov     @S[3],12($out)
792         jmp     .Lcbc_done
793
794 .align  16
795 .Lcbc_enc_tail:
796         xor     %rax,%rax
797         mov     %rax,0+$ivec
798         mov     %rax,8+$ivec
799         mov     %rax,$_res
800
801 .Lcbc_enc_pushf:
802         pushfq
803         cld
804         mov     $inp,%rsi
805         lea     8+$ivec,%rdi
806         .long   0x9066A4F3              # rep movsb
807         popfq
808 .Lcbc_enc_popf:
809
810         lea     $ivec,$inp
811         lea     16+$ivec,%rax
812         mov     %rax,$_end
813         jmp     .Lcbc_eloop             # one more time
814
815 .align  16
816 .LCBC_DECRYPT:
817         xchg    $key,$keyend
818         add     \$15,%rdx
819         and     \$15,%rcx               # length residue
820         and     \$-16,%rdx
821         mov     $key,$_key
822         lea     ($inp,%rdx),%rdx
823         mov     %rdx,$_end
824         mov     %rcx,$_res
825
826         mov     (%rbx),%rax             # load IV
827         mov     8(%rbx),%rbx
828         jmp     .Lcbc_dloop
829 .align  16
830 .Lcbc_dloop:
831         mov     0($inp),@S[0]
832         mov     4($inp),@S[1]
833         mov     8($inp),@S[2]
834         bswap   @S[0]
835         mov     12($inp),@S[3]
836         bswap   @S[1]
837         mov     %rax,0+$ivec            # save IV to temporary storage
838         bswap   @S[2]
839         mov     %rbx,8+$ivec
840         bswap   @S[3]
841
842         call    _x86_64_Camellia_decrypt
843
844         mov     $_key,$key              # "rewind" the key
845         mov     $_end,%rdx
846         mov     $_res,%rcx
847
848         bswap   @S[0]
849         mov     ($inp),%rax             # load IV for next iteration
850         bswap   @S[1]
851         mov     8($inp),%rbx
852         bswap   @S[2]
853         xor     0+$ivec,@S[0]
854         bswap   @S[3]
855         xor     4+$ivec,@S[1]
856         xor     8+$ivec,@S[2]
857         lea     16($inp),$inp
858         xor     12+$ivec,@S[3]
859         cmp     %rdx,$inp
860         je      .Lcbc_ddone
861
862         mov     @S[0],0($out)
863         mov     @S[1],4($out)
864         mov     @S[2],8($out)
865         mov     @S[3],12($out)
866
867         lea     16($out),$out
868         jmp     .Lcbc_dloop
869
870 .align  16
871 .Lcbc_ddone:
872         mov     $_ivp,%rdx
873         cmp     \$0,%rcx
874         jne     .Lcbc_dec_tail
875
876         mov     @S[0],0($out)
877         mov     @S[1],4($out)
878         mov     @S[2],8($out)
879         mov     @S[3],12($out)
880
881         mov     %rax,(%rdx)             # write out IV residue
882         mov     %rbx,8(%rdx)
883         jmp     .Lcbc_done
884 .align  16
885 .Lcbc_dec_tail:
886         mov     @S[0],0+$ivec
887         mov     @S[1],4+$ivec
888         mov     @S[2],8+$ivec
889         mov     @S[3],12+$ivec
890
891 .Lcbc_dec_pushf:
892         pushfq
893         cld
894         lea     8+$ivec,%rsi
895         lea     ($out),%rdi
896         .long   0x9066A4F3              # rep movsb
897         popfq
898 .Lcbc_dec_popf:
899
900         mov     %rax,(%rdx)             # write out IV residue
901         mov     %rbx,8(%rdx)
902         jmp     .Lcbc_done
903
904 .align  16
905 .Lcbc_done:
906         mov     $_rsp,%rcx
907 .cfi_def_cfa    %rcx,56
908         mov     0(%rcx),%r15
909 .cfi_restore    %r15
910         mov     8(%rcx),%r14
911 .cfi_restore    %r14
912         mov     16(%rcx),%r13
913 .cfi_restore    %r13
914         mov     24(%rcx),%r12
915 .cfi_restore    %r12
916         mov     32(%rcx),%rbp
917 .cfi_restore    %rbp
918         mov     40(%rcx),%rbx
919 .cfi_restore    %rbx
920         lea     48(%rcx),%rsp
921 .cfi_def_cfa    %rsp,8
922 .Lcbc_abort:
923         ret
924 .cfi_endproc
925 .size   Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
926
927 .asciz  "Camellia for x86_64 by <appro\@openssl.org>"
928 ___
929 }
930
931 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
932 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Everything below is generated only when $win64 is set.  Two handlers
# with the prototype shown above are emitted; their four arguments are
# taken from %rcx, %rdx, %r8 and %r9 respectively.
#
#   common_se_handler - referenced from the .xdata entries of
#       Camellia_EncryptBlock_Rounds, Camellia_DecryptBlock_Rounds and
#       Camellia_Ekeygen.  HandlerData[] holds the image-relative
#       addresses of a prologue and an epilogue label.  When context->Rip
#       lies between the two, the handler takes context->Rsp, steps 40
#       bytes up and reloads %rbx,%rbp,%r13,%r14,%r15 from the slots just
#       below that address.  Otherwise those registers are left alone and
#       the frame value is context->Rax (Rip before the prologue label) or
#       context->Rsp (Rip at or past the epilogue label).  In every case
#       Rdi/Rsi are reloaded from offsets 8/16 of the frame value, which
#       is then stored as context->Rsp.
#
#   cbc_se_handler - referenced from the .xdata entry of
#       Camellia_cbc_encrypt.  It compares context->Rip against
#       .Lcbc_prologue, .Lcbc_body and .Lcbc_abort, adds 8 to the stack
#       pointer while Rip is between a .Lcbc_{enc,dec}_pushf label and the
#       matching .Lcbc_{enc,dec}_popf label, loads the pointer kept in the
#       $_rsp slot at offset 48, and reloads %rbx,%rbp,%r12-%r15 from the
#       six 8-byte slots that start at that pointer.
#
# Both handlers finish at .Lcommon_seh_exit, which is located inside
# cbc_se_handler: the patched CONTEXT is copied to disp->ContextRecord,
# RtlVirtualUnwind is called, and 1 (ExceptionContinueSearch) is returned.
#
# The .pdata section lists begin/end/info addresses for the four
# functions, and .xdata binds each of them to its handler.
933 if ($win64) {
934 $rec="%rcx";        # 1st handler argument (rec); not referenced in the code below
935 $frame="%rdx";      # 2nd handler argument (frame); not referenced in the code below
936 $context="%r8";     # 3rd handler argument (context)
937 $disp="%r9";        # 4th handler argument (disp)
938
939 $code.=<<___;
940 .extern __imp_RtlVirtualUnwind
941 .type   common_se_handler,\@abi-omnipotent
942 .align  16
943 common_se_handler:
944         push    %rsi
945         push    %rdi
946         push    %rbx
947         push    %rbp
948         push    %r12
949         push    %r13
950         push    %r14
951         push    %r15
952         pushfq
953         lea     -64(%rsp),%rsp
954
955         mov     120($context),%rax      # pull context->Rax
956         mov     248($context),%rbx      # pull context->Rip
957
958         mov     8($disp),%rsi           # disp->ImageBase
959         mov     56($disp),%r11          # disp->HandlerData
960
961         mov     0(%r11),%r10d           # HandlerData[0]
962         lea     (%rsi,%r10),%r10        # prologue label
963         cmp     %r10,%rbx               # context->Rip<prologue label
964         jb      .Lin_prologue
965
966         mov     152($context),%rax      # pull context->Rsp
967
968         mov     4(%r11),%r10d           # HandlerData[1]
969         lea     (%rsi,%r10),%r10        # epilogue label
970         cmp     %r10,%rbx               # context->Rip>=epilogue label
971         jae     .Lin_prologue
972
973         lea     40(%rax),%rax
974         mov     -8(%rax),%rbx
975         mov     -16(%rax),%rbp
976         mov     -24(%rax),%r13
977         mov     -32(%rax),%r14
978         mov     -40(%rax),%r15
979         mov     %rbx,144($context)      # restore context->Rbx
980         mov     %rbp,160($context)      # restore context->Rbp
981         mov     %r13,224($context)      # restore context->R13
982         mov     %r14,232($context)      # restore context->R14
983         mov     %r15,240($context)      # restore context->R15
984
985 .Lin_prologue:
986         mov     8(%rax),%rdi
987         mov     16(%rax),%rsi
988         mov     %rax,152($context)      # restore context->Rsp
989         mov     %rsi,168($context)      # restore context->Rsi
990         mov     %rdi,176($context)      # restore context->Rdi
991
992         jmp     .Lcommon_seh_exit
993 .size   common_se_handler,.-common_se_handler
994
995 .type   cbc_se_handler,\@abi-omnipotent
996 .align  16
997 cbc_se_handler:
998         push    %rsi
999         push    %rdi
1000         push    %rbx
1001         push    %rbp
1002         push    %r12
1003         push    %r13
1004         push    %r14
1005         push    %r15
1006         pushfq
1007         lea     -64(%rsp),%rsp
1008
1009         mov     120($context),%rax      # pull context->Rax
1010         mov     248($context),%rbx      # pull context->Rip
1011
1012         lea     .Lcbc_prologue(%rip),%r10
1013         cmp     %r10,%rbx               # context->Rip<.Lcbc_prologue
1014         jb      .Lin_cbc_prologue
1015
1016         lea     .Lcbc_body(%rip),%r10
1017         cmp     %r10,%rbx               # context->Rip<.Lcbc_body
1018         jb      .Lin_cbc_frame_setup
1019
1020         mov     152($context),%rax      # pull context->Rsp
1021
1022         lea     .Lcbc_abort(%rip),%r10
1023         cmp     %r10,%rbx               # context->Rip>=.Lcbc_abort
1024         jae     .Lin_cbc_prologue
1025
1026         # handle pushf/popf in Camellia_cbc_encrypt
1027         lea     .Lcbc_enc_pushf(%rip),%r10
1028         cmp     %r10,%rbx               # context->Rip<=.Lcbc_enc_pushf
1029         jbe     .Lin_cbc_no_flag
1030         lea     8(%rax),%rax
1031         lea     .Lcbc_enc_popf(%rip),%r10
1032         cmp     %r10,%rbx               # context->Rip<.Lcbc_enc_popf
1033         jb      .Lin_cbc_no_flag
1034         lea     -8(%rax),%rax
1035         lea     .Lcbc_dec_pushf(%rip),%r10
1036         cmp     %r10,%rbx               # context->Rip<=.Lcbc_dec_pushf
1037         jbe     .Lin_cbc_no_flag
1038         lea     8(%rax),%rax
1039         lea     .Lcbc_dec_popf(%rip),%r10
1040         cmp     %r10,%rbx               # context->Rip<.Lcbc_dec_popf
1041         jb      .Lin_cbc_no_flag
1042         lea     -8(%rax),%rax
1043
1044 .Lin_cbc_no_flag:
1045         mov     48(%rax),%rax           # $_rsp
1046         lea     48(%rax),%rax
1047
1048 .Lin_cbc_frame_setup:
1049         mov     -8(%rax),%rbx
1050         mov     -16(%rax),%rbp
1051         mov     -24(%rax),%r12
1052         mov     -32(%rax),%r13
1053         mov     -40(%rax),%r14
1054         mov     -48(%rax),%r15
1055         mov     %rbx,144($context)      # restore context->Rbx
1056         mov     %rbp,160($context)      # restore context->Rbp
1057         mov     %r12,216($context)      # restore context->R12
1058         mov     %r13,224($context)      # restore context->R13
1059         mov     %r14,232($context)      # restore context->R14
1060         mov     %r15,240($context)      # restore context->R15
1061
1062 .Lin_cbc_prologue:
1063         mov     8(%rax),%rdi
1064         mov     16(%rax),%rsi
1065         mov     %rax,152($context)      # restore context->Rsp
1066         mov     %rsi,168($context)      # restore context->Rsi
1067         mov     %rdi,176($context)      # restore context->Rdi
1068
1069 .align  4
1070 .Lcommon_seh_exit:
1071
1072         mov     40($disp),%rdi          # disp->ContextRecord
1073         mov     $context,%rsi           # context
1074         mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
1075         .long   0xa548f3fc              # cld; rep movsq
1076
1077         mov     $disp,%rsi
1078         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1079         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1080         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1081         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1082         mov     40(%rsi),%r10           # disp->ContextRecord
1083         lea     56(%rsi),%r11           # &disp->HandlerData
1084         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1085         mov     %r10,32(%rsp)           # arg5
1086         mov     %r11,40(%rsp)           # arg6
1087         mov     %r12,48(%rsp)           # arg7
1088         mov     %rcx,56(%rsp)           # arg8, (NULL)
1089         call    *__imp_RtlVirtualUnwind(%rip)
1090
1091         mov     \$1,%eax                # ExceptionContinueSearch
1092         lea     64(%rsp),%rsp
1093         popfq
1094         pop     %r15
1095         pop     %r14
1096         pop     %r13
1097         pop     %r12
1098         pop     %rbp
1099         pop     %rbx
1100         pop     %rdi
1101         pop     %rsi
1102         ret
1103 .size   cbc_se_handler,.-cbc_se_handler
1104
1105 .section        .pdata
1106 .align  4
1107         .rva    .LSEH_begin_Camellia_EncryptBlock_Rounds
1108         .rva    .LSEH_end_Camellia_EncryptBlock_Rounds
1109         .rva    .LSEH_info_Camellia_EncryptBlock_Rounds
1110
1111         .rva    .LSEH_begin_Camellia_DecryptBlock_Rounds
1112         .rva    .LSEH_end_Camellia_DecryptBlock_Rounds
1113         .rva    .LSEH_info_Camellia_DecryptBlock_Rounds
1114
1115         .rva    .LSEH_begin_Camellia_Ekeygen
1116         .rva    .LSEH_end_Camellia_Ekeygen
1117         .rva    .LSEH_info_Camellia_Ekeygen
1118
1119         .rva    .LSEH_begin_Camellia_cbc_encrypt
1120         .rva    .LSEH_end_Camellia_cbc_encrypt
1121         .rva    .LSEH_info_Camellia_cbc_encrypt
1122
1123 .section        .xdata
1124 .align  8
1125 .LSEH_info_Camellia_EncryptBlock_Rounds:
1126         .byte   9,0,0,0
1127         .rva    common_se_handler
1128         .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
1129 .LSEH_info_Camellia_DecryptBlock_Rounds:
1130         .byte   9,0,0,0
1131         .rva    common_se_handler
1132         .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
1133 .LSEH_info_Camellia_Ekeygen:
1134         .byte   9,0,0,0
1135         .rva    common_se_handler
1136         .rva    .Lkey_prologue,.Lkey_epilogue   # HandlerData[]
1137 .LSEH_info_Camellia_cbc_encrypt:
1138         .byte   9,0,0,0
1139         .rva    cbc_se_handler
1140 ___
1141 }       # end of if ($win64)
1142
# Final pass: every back-quoted span in the accumulated text is replaced by
# the value of evaluating it as Perl (e.g. `1232/8` becomes 154), and the
# result is written to STDOUT.
1143 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1144 print $code;
# Buffered write errors (and the exit status of an output filter, if STDOUT
# is a pipe) only surface at close.  Fail loudly rather than let a build
# continue with truncated or missing assembly output.
1145 close STDOUT or die "error closing STDOUT: $!";