Missed some copyright merge
[openssl.git] / crypto / camellia / asm / cmll-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
12 #
13 # This module may be used under the terms of either the GNU General
14 # Public License version 2 or later, the GNU Lesser General Public
15 # License version 2.1 or later, the Mozilla Public License version
16 # 1.1 or the BSD License. The exact terms of either license are
17 # distributed along with this module. For further details see
18 # http://www.openssl.org/~appro/camellia/.
19 # ====================================================================
20
21 # Performance in cycles per processed byte (less is better) in
22 # 'openssl speed ...' benchmark:
23 #
24 #                       AMD64   Core2   EM64T
25 # -evp camellia-128-ecb 16.7    21.0    22.7
26 # + over gcc 3.4.6      +25%    +5%     0%
27 #
28 # camellia-128-cbc      15.7    20.4    21.1
29 #
30 # 128-bit key setup     128     216     205     cycles/key
31 # + over gcc 3.4.6      +54%    +39%    +15%
32 #
33 # Numbers in "+" rows represent performance improvement over compiler
34 # generated code. Key setup timings are impressive on AMD and Core2
35 # thanks to 64-bit operations being covertly deployed. Improvement on
36 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
37 # apparently emulates some of 64-bit operations in [32-bit] microcode.
38
# Command line: [flavour] output
#   flavour - target selector, handed on unchanged to x86_64-xlate.pl
#   output  - output file name, also handed on to x86_64-xlate.pl
# If the first argument contains a dot it is taken to be the output name
# and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected either by flavour or by a ".asm" output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on everything printed to STDOUT is piped through the translator.
# A piped open can fail (e.g. fork failure); without the check the script
# would carry on and silently produce nothing.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
52
# hi(REG): return the high-byte alias (%ah, %bh, %ch, %dh) of a register
# written as %eax..%edx or %rax..%rdx; any other name is returned unchanged.
# Declared without the former empty prototype, which contradicted the one
# argument the sub reads, and using $1 instead of the sed-style \1 in the
# replacement. Existing &hi(...) call sites are unaffected.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%$1h/;      $r; }
# lo(REG): return the low-byte alias of a register name:
#   %eax/%rax .. %edx/%rdx          -> %al .. %dl
#   %esi/%rsi, %edi/%rdi            -> %sil, %dil
#   %rN, optionally with "d" suffix -> %rNb
# Any other name is returned unchanged.
# Declared without the former empty prototype, which contradicted the one
# argument the sub reads, and using $1 instead of the sed-style \1 in the
# replacements. Existing &lo(...) call sites are unaffected.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%$1l/;
                      $r =~ s/%[er]([sd]i)/%$1l/;
                      $r =~ s/%(r[0-9]+)[d]?/%$1b/;   $r; }
57
# Temporaries used inside the Feistel round.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");
# The four 32-bit state words.
@S=map { "%r${_}d" } (8..11);
# Table index registers.
($i0,$i1)=("%esi","%edi");
$Tbl="%rbp";    # size optimization
($inp,$out,$key,$keyend)=("%r12","%r13","%r14","%r15");
# 32-bit register holding the first argument: %ecx on Win64, else %edi.
$arg0d="%edi";
$arg0d="%ecx" if ($win64);

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte offsets from the start of the table.
($SBOX1_1110,$SBOX4_4404)=(0,4);        # Camellia_SBOX[0], Camellia_SBOX[1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);  # Camellia_SBOX[2], Camellia_SBOX[3]
76
# Camellia_Feistel(i [, seed]): append the code for one Feistel round to
# $code.
#   i    - round index; its parity selects which two words of @S are mixed
#          with the round key and which two receive the result, and it
#          scales the address of the key words prefetched for round i+1
#   seed - displacement added to that prefetch address (default 0); a
#          negative seed switches the per-round step from +8 to -8
# The round key is taken from $t0/$t1, which are reloaded at the end with
# the prefetched words.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 if (!defined($seed));
my $scale=8;
$scale=-8 if ($seed<0);
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=map { $S[($j+$_)%4] } (0..3);

$code.=<<___;
        xor     $s0,$t0                         # t0^=key[0]
        xor     $s1,$t1                         # t1^=key[1]
        movz    `&hi("$t0")`,$i0                # (t0>>8)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>0)&0xff
        mov     $SBOX3_3033($Tbl,$i0,8),$t3     # t3=SBOX3_3033[0]
        mov     $SBOX1_1110($Tbl,$i1,8),$t2     # t2=SBOX1_1110[1]
        movz    `&lo("$t0")`,$i0                # (t0>>0)&0xff
        shr     \$16,$t0
        movz    `&hi("$t1")`,$i1                # (t1>>8)&0xff
        xor     $SBOX4_4404($Tbl,$i0,8),$t3     # t3^=SBOX4_4404[0]
        shr     \$16,$t1
        xor     $SBOX4_4404($Tbl,$i1,8),$t2     # t2^=SBOX4_4404[1]
        movz    `&hi("$t0")`,$i0                # (t0>>24)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>16)&0xff
        xor     $SBOX1_1110($Tbl,$i0,8),$t3     # t3^=SBOX1_1110[0]
        xor     $SBOX3_3033($Tbl,$i1,8),$t2     # t2^=SBOX3_3033[1]
        movz    `&lo("$t0")`,$i0                # (t0>>16)&0xff
        movz    `&hi("$t1")`,$i1                # (t1>>24)&0xff
        xor     $SBOX2_0222($Tbl,$i0,8),$t3     # t3^=SBOX2_0222[0]
        xor     $SBOX2_0222($Tbl,$i1,8),$t2     # t2^=SBOX2_0222[1]
        mov     `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
        mov     `$seed+($i+1)*$scale+4`($key),$t0
        xor     $t3,$t2                         # t2^=t3
        ror     \$8,$t3                         # t3=RightRotate(t3,8)
        xor     $t2,$s2
        xor     $t2,$s3
        xor     $t3,$s3
___
}
114
# Emits the encryption entry points: the V1.x Camellia_EncryptBlock shim,
# which turns keyBitLength into a grandRounds count (3 for 128, else 4) and
# jumps into the V2 routine; Camellia_EncryptBlock_Rounds itself, which
# byte-swaps the four input words, calls the internal routine and stores
# the byte-swapped result; and the head of _x86_64_Camellia_encrypt.
#
115 # void Camellia_EncryptBlock_Rounds(
116 #               int grandRounds,
117 #               const Byte plaintext[],
118 #               const KEY_TABLE_TYPE keyTable,
119 #               Byte ciphertext[])
120 $code=<<___;
121 .text
122
123 # V1.x API
124 .globl  Camellia_EncryptBlock
125 .type   Camellia_EncryptBlock,\@abi-omnipotent
126 .align  16
127 Camellia_EncryptBlock:
128         movl    \$128,%eax
129         subl    $arg0d,%eax
130         movl    \$3,$arg0d
131         adcl    \$0,$arg0d      # keyBitLength==128?3:4
132         jmp     .Lenc_rounds
133 .size   Camellia_EncryptBlock,.-Camellia_EncryptBlock
134 # V2
135 .globl  Camellia_EncryptBlock_Rounds
136 .type   Camellia_EncryptBlock_Rounds,\@function,4
137 .align  16
138 .Lenc_rounds:
139 Camellia_EncryptBlock_Rounds:
140         push    %rbx
141         push    %rbp
142         push    %r13
143         push    %r14
144         push    %r15
145 .Lenc_prologue:
146
147         #mov    %rsi,$inp               # put away arguments
148         mov     %rcx,$out
149         mov     %rdx,$key
150
151         shl     \$6,%edi                # process grandRounds
152         lea     .LCamellia_SBOX(%rip),$Tbl
153         lea     ($key,%rdi),$keyend
154
155         mov     0(%rsi),@S[0]           # load plaintext
156         mov     4(%rsi),@S[1]
157         mov     8(%rsi),@S[2]
158         bswap   @S[0]
159         mov     12(%rsi),@S[3]
160         bswap   @S[1]
161         bswap   @S[2]
162         bswap   @S[3]
163
164         call    _x86_64_Camellia_encrypt
165
166         bswap   @S[0]
167         bswap   @S[1]
168         bswap   @S[2]
169         mov     @S[0],0($out)
170         bswap   @S[3]
171         mov     @S[1],4($out)
172         mov     @S[2],8($out)
173         mov     @S[3],12($out)
174
175         mov     0(%rsp),%r15
176         mov     8(%rsp),%r14
177         mov     16(%rsp),%r13
178         mov     24(%rsp),%rbp
179         mov     32(%rsp),%rbx
180         lea     40(%rsp),%rsp
181 .Lenc_epilogue:
182         ret
183 .size   Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
184
185 .type   _x86_64_Camellia_encrypt,\@abi-omnipotent
186 .align  16
187 _x86_64_Camellia_encrypt:
188         xor     0($key),@S[1]
189         xor     4($key),@S[0]           # ^=key[0-3]
190         xor     8($key),@S[3]
191         xor     12($key),@S[2]
192 .align  16
193 .Leloop:
194         mov     16($key),$t1            # prefetch key[4-5]
195         mov     20($key),$t0
196
197 ___
# Six Feistel rounds per pass of .Leloop; with seed 16, round i prefetches
# the key words for the next round from 16+(i+1)*8 relative to $key.
198         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
# Emits the tail of .Leloop (advance $key by 64 bytes, leave through
# .Ledone once it equals $keyend, otherwise apply the and/or/rotate mixing
# step and loop again), followed by the decryption entry points. The
# decryption wrappers mirror the encryption ones, except that $keyend is
# set to the start of the key table and $key to its end.
199 $code.=<<___;
200         lea     16*4($key),$key
201         cmp     $keyend,$key
202         mov     8($key),$t3             # prefetch key[2-3]
203         mov     12($key),$t2
204         je      .Ledone
205
206         and     @S[0],$t0
207         or      @S[3],$t3
208         rol     \$1,$t0
209         xor     $t3,@S[2]               # s2^=s3|key[3];
210         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
211         and     @S[2],$t2
212         or      @S[1],$t1
213         rol     \$1,$t2
214         xor     $t1,@S[0]               # s0^=s1|key[1];
215         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
216         jmp     .Leloop
217
218 .align  16
219 .Ledone:
220         xor     @S[2],$t0               # SwapHalf
221         xor     @S[3],$t1
222         xor     @S[0],$t2
223         xor     @S[1],$t3
224
225         mov     $t0,@S[0]
226         mov     $t1,@S[1]
227         mov     $t2,@S[2]
228         mov     $t3,@S[3]
229
230         .byte   0xf3,0xc3               # rep ret
231 .size   _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
232
233 # V1.x API
234 .globl  Camellia_DecryptBlock
235 .type   Camellia_DecryptBlock,\@abi-omnipotent
236 .align  16
237 Camellia_DecryptBlock:
238         movl    \$128,%eax
239         subl    $arg0d,%eax
240         movl    \$3,$arg0d
241         adcl    \$0,$arg0d      # keyBitLength==128?3:4
242         jmp     .Ldec_rounds
243 .size   Camellia_DecryptBlock,.-Camellia_DecryptBlock
244 # V2
245 .globl  Camellia_DecryptBlock_Rounds
246 .type   Camellia_DecryptBlock_Rounds,\@function,4
247 .align  16
248 .Ldec_rounds:
249 Camellia_DecryptBlock_Rounds:
250         push    %rbx
251         push    %rbp
252         push    %r13
253         push    %r14
254         push    %r15
255 .Ldec_prologue:
256
257         #mov    %rsi,$inp               # put away arguments
258         mov     %rcx,$out
259         mov     %rdx,$keyend
260
261         shl     \$6,%edi                # process grandRounds
262         lea     .LCamellia_SBOX(%rip),$Tbl
263         lea     ($keyend,%rdi),$key
264
265         mov     0(%rsi),@S[0]           # load plaintext
266         mov     4(%rsi),@S[1]
267         mov     8(%rsi),@S[2]
268         bswap   @S[0]
269         mov     12(%rsi),@S[3]
270         bswap   @S[1]
271         bswap   @S[2]
272         bswap   @S[3]
273
274         call    _x86_64_Camellia_decrypt
275
276         bswap   @S[0]
277         bswap   @S[1]
278         bswap   @S[2]
279         mov     @S[0],0($out)
280         bswap   @S[3]
281         mov     @S[1],4($out)
282         mov     @S[2],8($out)
283         mov     @S[3],12($out)
284
285         mov     0(%rsp),%r15
286         mov     8(%rsp),%r14
287         mov     16(%rsp),%r13
288         mov     24(%rsp),%rbp
289         mov     32(%rsp),%rbx
290         lea     40(%rsp),%rsp
291 .Ldec_epilogue:
292         ret
293 .size   Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
294
295 .type   _x86_64_Camellia_decrypt,\@abi-omnipotent
296 .align  16
297 _x86_64_Camellia_decrypt:
298         xor     0($key),@S[1]
299         xor     4($key),@S[0]           # ^=key[0-3]
300         xor     8($key),@S[3]
301         xor     12($key),@S[2]
302 .align  16
303 .Ldloop:
304         mov     -8($key),$t1            # prefetch key[4-5]
305         mov     -4($key),$t0
306
307 ___
# Six Feistel rounds per pass of .Ldloop; the negative seed makes round i
# prefetch from -8-(i+1)*8 relative to $key, i.e. walking downwards.
308         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
# Emits the tail of .Ldloop: step $key back by 64 bytes, leave through
# .Lddone once it equals $keyend, otherwise apply the and/or/rotate mixing
# step and loop again. .Lddone xors in the last key words and swaps the
# two halves of the state.
309 $code.=<<___;
310         lea     -16*4($key),$key
311         cmp     $keyend,$key
312         mov     0($key),$t3             # prefetch key[2-3]
313         mov     4($key),$t2
314         je      .Lddone
315
316         and     @S[0],$t0
317         or      @S[3],$t3
318         rol     \$1,$t0
319         xor     $t3,@S[2]               # s2^=s3|key[3];
320         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
321         and     @S[2],$t2
322         or      @S[1],$t1
323         rol     \$1,$t2
324         xor     $t1,@S[0]               # s0^=s1|key[1];
325         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
326
327         jmp     .Ldloop
328
329 .align  16
330 .Lddone:
331         xor     @S[2],$t2
332         xor     @S[3],$t3
333         xor     @S[0],$t0
334         xor     @S[1],$t1
335
336         mov     $t2,@S[0]               # SwapHalf
337         mov     $t3,@S[1]
338         mov     $t0,@S[2]
339         mov     $t1,@S[3]
340
341         .byte   0xf3,0xc3               # rep ret
342 .size   _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
343 ___
344
# _saveround(rnd, key, [bias,] regs...): append store instructions to $code
# that write key material at byte offset bias+rnd*8 from the address held
# in register key.
#   bias - optional displacement; recognised as such when the first extra
#          argument has a non-zero integer value
#   regs - either four 32-bit registers, stored in the order
#          regs[1],regs[0],regs[3],regs[2] at offsets 0,4,8,12, or one or
#          two registers stored at offsets 0 and 8
# The offsets are written as backtick expressions in the generated text.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=0;
$bias=shift(@T) if (int($T[0]));

    if (scalar(@T)==4) {
        my @order=(1,0,3,2);
        my $text="";
        foreach my $n (0..3) {
            my $reg=$T[$order[$n]];
            my $off=4*$n;
            $text.="        mov     $reg,`$bias+$rnd*8+$off`($key)\n";
        }
        $code.=$text;
    } else {
        my @lines=("        mov     $T[0],`$bias+$rnd*8+0`($key)\n");
        push(@lines,"        mov     $T[1],`$bias+$rnd*8+8`($key)\n") if (@T>=2);
        $code.=join("",@lines);
    }
}
361
# _loadround(rnd, key, [bias,] reg0 [, reg1]): append load instructions to
# $code that read from byte offset bias+rnd*8 (and +8 for reg1) relative to
# the address held in register key.
#   bias - optional displacement; recognised as such when the first extra
#          argument has a non-zero integer value
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=0;
$bias=shift(@T) if (int($T[0]));

my @lines=("        mov     `$bias+$rnd*8+0`($key),$T[0]\n");
push(@lines,"        mov     `$bias+$rnd*8+8`($key),$T[1]\n") if (@T>=2);
$code.=join("",@lines);
}
369
370 # shld is very slow on Intel EM64T family. Even on AMD it limits
371 # instruction decode rate [because it's VectorPath] and consequently
372 # performance...
# __rotl128(i0, i1, rot): append a shld-based left rotation by rot bits of
# the 128-bit value held in the register pair i0:i1, using %r11 as scratch.
# Nothing is emitted when rot is zero.
sub __rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
        $code.=join("",
            "        mov     $i0,%r11\n",
            "        shld    \$$rot,$i1,$i0\n",
            "        shld    \$$rot,%r11,$i1\n");
    }
}
384
385 # ... Implementing 128-bit rotate without shld gives 80% better
386 # performance on EM64T, +15% on AMD64 and only ~7% degradation on
387 # Core2. This is therefore preferred.
# _rotl128(i0, i1, rot): append a left rotation by rot bits of the 128-bit
# value held in the register pair i0:i1, built from plain shifts and ors.
# %r9 and %r11 are used as scratch registers. The complementary shift
# count is written as a backtick expression (`64-rot`) in the generated
# text. Nothing is emitted when rot is zero.
sub _rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
        my $left="\$$rot";
        my $right="\$`64-$rot`";
        $code.=join("",
            "        mov     $i0,%r11\n",
            "        shl     $left,$i0\n",
            "        mov     $i1,%r9\n",
            "        shr     $right,%r9\n",
            "        shr     $right,%r11\n",
            "        or      %r9,$i0\n",
            "        shl     $left,$i1\n",
            "        or      %r11,$i1\n");
    }
}
404
# Emits Camellia_Ekeygen. The generated routine takes keyBitLength in the
# first argument, reads the raw key through the second and writes the key
# table through the third; it returns 3 in %eax for 128-bit keys and 4
# otherwise. $step counts the Feistel rounds emitted so far, so that each
# Camellia_Feistel call prefetches the next pair of words from
# .LCamellia_SIGMA.
405 { my $step=0;
406
407 $code.=<<___;
408 .globl  Camellia_Ekeygen
409 .type   Camellia_Ekeygen,\@function,3
410 .align  16
411 Camellia_Ekeygen:
412         push    %rbx
413         push    %rbp
414         push    %r13
415         push    %r14
416         push    %r15
417 .Lkey_prologue:
418
419         mov     %edi,${keyend}d         # put away arguments, keyBitLength
420         mov     %rdx,$out               # keyTable
421
422         mov     0(%rsi),@S[0]           # load 0-127 bits
423         mov     4(%rsi),@S[1]
424         mov     8(%rsi),@S[2]
425         mov     12(%rsi),@S[3]
426
427         bswap   @S[0]
428         bswap   @S[1]
429         bswap   @S[2]
430         bswap   @S[3]
431 ___
432         &_saveround     (0,$out,@S);    # KL<<<0
433 $code.=<<___;
434         cmp     \$128,$keyend           # check keyBitLength
435         je      .L1st128
436
437         mov     16(%rsi),@S[0]          # load 128-191 bits
438         mov     20(%rsi),@S[1]
439         cmp     \$192,$keyend
440         je      .L1st192
441         mov     24(%rsi),@S[2]          # load 192-255 bits
442         mov     28(%rsi),@S[3]
443         jmp     .L1st256
444 .L1st192:
445         mov     @S[0],@S[2]
446         mov     @S[1],@S[3]
447         not     @S[2]
448         not     @S[3]
449 .L1st256:
450         bswap   @S[0]
451         bswap   @S[1]
452         bswap   @S[2]
453         bswap   @S[3]
454 ___
455         &_saveround     (4,$out,@S);    # temp storage for KR!
456 $code.=<<___;
457         xor     0($out),@S[1]           # KR^KL
458         xor     4($out),@S[0]
459         xor     8($out),@S[3]
460         xor     12($out),@S[2]
461
462 .L1st128:
463         lea     .LCamellia_SIGMA(%rip),$key
464         lea     .LCamellia_SBOX(%rip),$Tbl
465
466         mov     0($key),$t1
467         mov     4($key),$t0
468 ___
469         &Camellia_Feistel($step++);
470         &Camellia_Feistel($step++);
471 $code.=<<___;
472         xor     0($out),@S[1]           # ^KL
473         xor     4($out),@S[0]
474         xor     8($out),@S[3]
475         xor     12($out),@S[2]
476 ___
477         &Camellia_Feistel($step++);
478         &Camellia_Feistel($step++);
479 $code.=<<___;
480         cmp     \$128,$keyend
481         jne     .L2nd256
482
483         lea     128($out),$out          # size optimization
484         shl     \$32,%r8                # @S[0]||
485         shl     \$32,%r10               # @S[2]||
486         or      %r9,%r8                 # ||@S[1]
487         or      %r11,%r10               # ||@S[3]
488 ___
# 128-bit key path. $out was advanced by 128 just above, hence the -128
# bias handed to the helpers. The trailing comments track the cumulative
# rotation applied to each 128-bit value.
489         &_loadround     (0,$out,-128,"%rax","%rbx");    # KL
490         &_saveround     (2,$out,-128,"%r8","%r10");     # KA<<<0
491         &_rotl128       ("%rax","%rbx",15);
492         &_saveround     (4,$out,-128,"%rax","%rbx");    # KL<<<15
493         &_rotl128       ("%r8","%r10",15);
494         &_saveround     (6,$out,-128,"%r8","%r10");     # KA<<<15
495         &_rotl128       ("%r8","%r10",15);              # 15+15=30
496         &_saveround     (8,$out,-128,"%r8","%r10");     # KA<<<30
497         &_rotl128       ("%rax","%rbx",30);             # 15+30=45
498         &_saveround     (10,$out,-128,"%rax","%rbx");   # KL<<<45
499         &_rotl128       ("%r8","%r10",15);              # 30+15=45
500         &_saveround     (12,$out,-128,"%r8");           # KA<<<45
501         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
502         &_saveround     (13,$out,-128,"%rbx");          # KL<<<60
503         &_rotl128       ("%r8","%r10",15);              # 45+15=60
504         &_saveround     (14,$out,-128,"%r8","%r10");    # KA<<<60
505         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
506         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<77
507         &_rotl128       ("%rax","%rbx",17);             # 77+17=94
508         &_saveround     (18,$out,-128,"%rax","%rbx");   # KL<<<94
509         &_rotl128       ("%r8","%r10",34);              # 60+34=94
510         &_saveround     (20,$out,-128,"%r8","%r10");    # KA<<<94
511         &_rotl128       ("%rax","%rbx",17);             # 94+17=111
512         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<111
513         &_rotl128       ("%r8","%r10",17);              # 94+17=111
514         &_saveround     (24,$out,-128,"%r8","%r10");    # KA<<<111
515 $code.=<<___;
516         mov     \$3,%eax
517         jmp     .Ldone
518 .align  16
519 .L2nd256:
520 ___
521         &_saveround     (6,$out,@S);    # temp storage for KA!
522 $code.=<<___;
523         xor     `4*8+0`($out),@S[1]     # KA^KR
524         xor     `4*8+4`($out),@S[0]
525         xor     `5*8+0`($out),@S[3]
526         xor     `5*8+4`($out),@S[2]
527 ___
528         &Camellia_Feistel($step++);
529         &Camellia_Feistel($step++);
530
# 192/256-bit key path: reload the values parked in the key table before
# $out is biased by +128.
531         &_loadround     (0,$out,"%rax","%rbx"); # KL
532         &_loadround     (4,$out,"%rcx","%rdx"); # KR
533         &_loadround     (6,$out,"%r14","%r15"); # KA
534 $code.=<<___;
535         lea     128($out),$out          # size optimization
536         shl     \$32,%r8                # @S[0]||
537         shl     \$32,%r10               # @S[2]||
538         or      %r9,%r8                 # ||@S[1]
539         or      %r11,%r10               # ||@S[3]
540 ___
541         &_saveround     (2,$out,-128,"%r8","%r10");     # KB<<<0
542         &_rotl128       ("%rcx","%rdx",15);
543         &_saveround     (4,$out,-128,"%rcx","%rdx");    # KR<<<15
544         &_rotl128       ("%r14","%r15",15);
545         &_saveround     (6,$out,-128,"%r14","%r15");    # KA<<<15
546         &_rotl128       ("%rcx","%rdx",15);             # 15+15=30
547         &_saveround     (8,$out,-128,"%rcx","%rdx");    # KR<<<30
548         &_rotl128       ("%r8","%r10",30);
549         &_saveround     (10,$out,-128,"%r8","%r10");    # KB<<<30
550         &_rotl128       ("%rax","%rbx",45);
551         &_saveround     (12,$out,-128,"%rax","%rbx");   # KL<<<45
552         &_rotl128       ("%r14","%r15",30);             # 15+30=45
553         &_saveround     (14,$out,-128,"%r14","%r15");   # KA<<<45
554         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
555         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<60
556         &_rotl128       ("%rcx","%rdx",30);             # 30+30=60
557         &_saveround     (18,$out,-128,"%rcx","%rdx");   # KR<<<60
558         &_rotl128       ("%r8","%r10",30);              # 30+30=60
559         &_saveround     (20,$out,-128,"%r8","%r10");    # KB<<<60
560         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
561         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<77
562         &_rotl128       ("%r14","%r15",32);             # 45+32=77
563         &_saveround     (24,$out,-128,"%r14","%r15");   # KA<<<77
564         &_rotl128       ("%rcx","%rdx",34);             # 60+34=94
565         &_saveround     (26,$out,-128,"%rcx","%rdx");   # KR<<<94
566         &_rotl128       ("%r14","%r15",17);             # 77+17=94
567         &_saveround     (28,$out,-128,"%r14","%r15");   # KA<<<94
568         &_rotl128       ("%rax","%rbx",34);             # 77+34=111
569         &_saveround     (30,$out,-128,"%rax","%rbx");   # KL<<<111
570         &_rotl128       ("%r8","%r10",51);              # 60+51=111
571         &_saveround     (32,$out,-128,"%r8","%r10");    # KB<<<111
572 $code.=<<___;
573         mov     \$4,%eax
574 .Ldone:
575         mov     0(%rsp),%r15
576         mov     8(%rsp),%r14
577         mov     16(%rsp),%r13
578         mov     24(%rsp),%rbp
579         mov     32(%rsp),%rbx
580         lea     40(%rsp),%rsp
581 .Lkey_epilogue:
582         ret
583 .size   Camellia_Ekeygen,.-Camellia_Ekeygen
584 ___
585 }
586
# 256-entry byte substitution table. The 32-bit lookup-table words emitted
# into the assembly are derived from it by S1110, S4404, S0222 and S3033.
587 @SBOX=(
588 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
589  35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
590 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
591 166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
592 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
593 223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
594  20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
595 254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
596 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
597  16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
598 135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
599  82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
600 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
601 120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
602 114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
603  64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
604
# S1110(i): table word carrying SBOX[i] in its three most significant bytes
# and zero in the least significant one, formatted as "0x%08x".
sub S1110 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    return sprintf("0x%08x",($v<<24)|($v<<16)|($v<<8));
}
# S4404(i): rotates the 8-bit index left by one bit, looks the result up in
# SBOX and returns a word carrying that byte in bytes 3, 2 and 0 (byte 1 is
# zero), formatted as "0x%08x".
sub S4404 {
    my ($idx)=@_;
    my $rot=(($idx<<1)|($idx>>7))&0xff;
    my $v=$SBOX[$rot];
    return sprintf("0x%08x",($v<<24)|($v<<16)|$v);
}
# S0222(i): takes SBOX[i], rotates that byte left by one bit and returns a
# word carrying it in the three least significant bytes (byte 3 is zero),
# formatted as "0x%08x".
sub S0222 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    $v=(($v<<1)|($v>>7))&0xff;
    return sprintf("0x%08x",($v<<16)|($v<<8)|$v);
}
# S3033(i): takes SBOX[i], rotates that byte right by one bit and returns a
# word carrying it in bytes 3, 1 and 0 (byte 2 is zero), formatted as
# "0x%08x".
sub S3033 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    $v=(($v>>1)|($v<<7))&0xff;
    return sprintf("0x%08x",($v<<24)|($v<<8)|$v);
}
609
# Emits the 64-byte-aligned constant area: the .LCamellia_SIGMA words that
# Camellia_Ekeygen reads as round keys, one row of zero padding, and the
# .LCamellia_SBOX label. The table words themselves are appended after
# this statement via data_word().
610 $code.=<<___;
611 .align  64
612 .LCamellia_SIGMA:
613 .long   0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
614 .long   0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
615 .long   0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
616 .long   0,          0,          0,          0
617 .LCamellia_SBOX:
618 ___
619 # tables are interleaved, remember?
# data_word(words...): append one ".long" directive listing the given
# words, comma-separated, to $code.
sub data_word {
    $code.=sprintf(".long\t%s\n",join(',',@_));
}
# First table: S1110/S4404 word pairs for every index; second table:
# S0222/S3033 word pairs. The global $i is used as the counter and is left
# at 256 afterwards, as before.
$i=0; while ($i<256) { &data_word(&S1110($i),&S4404($i)); $i++; }
$i=0; while ($i<256) { &data_word(&S0222($i),&S3033($i)); $i++; }
623
# Emits Camellia_cbc_encrypt. The generated routine returns immediately
# when length is zero, builds a 64-byte-aligned stack frame, touches the
# whole S-box table, and then runs either the encrypt loop (.Lcbc_eloop) or
# the decrypt loop (.Lcbc_dloop) depending on the enc argument. A trailing
# partial block is handled through the $ivec scratch buffer.
#
624 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
625 #                       size_t length, const CAMELLIA_KEY *key,
626 #                       unsigned char *ivp,const int enc);
627 {
628 $_key="0(%rsp)";        # saved $key value, reloaded after every block
629 $_end="8(%rsp)";        # inp+len&~15
630 $_res="16(%rsp)";       # len&15
631 $ivec="24(%rsp)";       # 16-byte scratch block
632 $_ivp="40(%rsp)";       # saved ivp argument
633 $_rsp="48(%rsp)";       # stack pointer as it was right after the pushes
634
635 $code.=<<___;
636 .globl  Camellia_cbc_encrypt
637 .type   Camellia_cbc_encrypt,\@function,6
638 .align  16
639 Camellia_cbc_encrypt:
640         cmp     \$0,%rdx
641         je      .Lcbc_abort
642         push    %rbx
643         push    %rbp
644         push    %r12
645         push    %r13
646         push    %r14
647         push    %r15
648 .Lcbc_prologue:
649
650         mov     %rsp,%rbp
651         sub     \$64,%rsp
652         and     \$-64,%rsp
653
654         # place stack frame just "above mod 1024" the key schedule,
655         # this ensures that cache associativity suffices
656         lea     -64-63(%rcx),%r10
657         sub     %rsp,%r10
658         neg     %r10
659         and     \$0x3C0,%r10
660         sub     %r10,%rsp
661         #add    \$8,%rsp                # 8 is reserved for callee's ra
662
663         mov     %rdi,$inp               # inp argument
664         mov     %rsi,$out               # out argument
665         mov     %r8,%rbx                # ivp argument
666         mov     %rcx,$key               # key argument
667         mov     272(%rcx),${keyend}d    # grandRounds
668
669         mov     %r8,$_ivp
670         mov     %rbp,$_rsp
671
672 .Lcbc_body:
673         lea     .LCamellia_SBOX(%rip),$Tbl
674
675         mov     \$32,%ecx
676 .align  4
677 .Lcbc_prefetch_sbox:
678         mov     0($Tbl),%rax
679         mov     32($Tbl),%rsi
680         mov     64($Tbl),%rdi
681         mov     96($Tbl),%r11
682         lea     128($Tbl),$Tbl
683         loop    .Lcbc_prefetch_sbox
684         sub     \$4096,$Tbl
685         shl     \$6,$keyend
686         mov     %rdx,%rcx               # len argument
687         lea     ($key,$keyend),$keyend
688
689         cmp     \$0,%r9d                # enc argument
690         je      .LCBC_DECRYPT
691
692         and     \$-16,%rdx
693         and     \$15,%rcx               # length residue
694         lea     ($inp,%rdx),%rdx
695         mov     $key,$_key
696         mov     %rdx,$_end
697         mov     %rcx,$_res
698
699         cmp     $inp,%rdx
700         mov     0(%rbx),@S[0]           # load IV
701         mov     4(%rbx),@S[1]
702         mov     8(%rbx),@S[2]
703         mov     12(%rbx),@S[3]
704         je      .Lcbc_enc_tail
705         jmp     .Lcbc_eloop
706
707 .align  16
708 .Lcbc_eloop:
709         xor     0($inp),@S[0]
710         xor     4($inp),@S[1]
711         xor     8($inp),@S[2]
712         bswap   @S[0]
713         xor     12($inp),@S[3]
714         bswap   @S[1]
715         bswap   @S[2]
716         bswap   @S[3]
717
718         call    _x86_64_Camellia_encrypt
719
720         mov     $_key,$key              # "rewind" the key
721         bswap   @S[0]
722         mov     $_end,%rdx
723         bswap   @S[1]
724         mov     $_res,%rcx
725         bswap   @S[2]
726         mov     @S[0],0($out)
727         bswap   @S[3]
728         mov     @S[1],4($out)
729         mov     @S[2],8($out)
730         lea     16($inp),$inp
731         mov     @S[3],12($out)
732         cmp     %rdx,$inp
733         lea     16($out),$out
734         jne     .Lcbc_eloop
735
736         cmp     \$0,%rcx
737         jne     .Lcbc_enc_tail
738
739         mov     $_ivp,$out
740         mov     @S[0],0($out)           # write out IV residue
741         mov     @S[1],4($out)
742         mov     @S[2],8($out)
743         mov     @S[3],12($out)
744         jmp     .Lcbc_done
745
746 .align  16
747 .Lcbc_enc_tail:
748         xor     %rax,%rax
749         mov     %rax,0+$ivec
750         mov     %rax,8+$ivec
751         mov     %rax,$_res
752
753 .Lcbc_enc_pushf:
754         pushfq
755         cld
756         mov     $inp,%rsi
757         lea     8+$ivec,%rdi
758         .long   0x9066A4F3              # rep movsb
759         popfq
760 .Lcbc_enc_popf:
761
762         lea     $ivec,$inp
763         lea     16+$ivec,%rax
764         mov     %rax,$_end
765         jmp     .Lcbc_eloop             # one more time
766
767 .align  16
768 .LCBC_DECRYPT:
769         xchg    $key,$keyend
770         add     \$15,%rdx
771         and     \$15,%rcx               # length residue
772         and     \$-16,%rdx
773         mov     $key,$_key
774         lea     ($inp,%rdx),%rdx
775         mov     %rdx,$_end
776         mov     %rcx,$_res
777
778         mov     (%rbx),%rax             # load IV
779         mov     8(%rbx),%rbx
780         jmp     .Lcbc_dloop
781 .align  16
782 .Lcbc_dloop:
783         mov     0($inp),@S[0]
784         mov     4($inp),@S[1]
785         mov     8($inp),@S[2]
786         bswap   @S[0]
787         mov     12($inp),@S[3]
788         bswap   @S[1]
789         mov     %rax,0+$ivec            # save IV to temporary storage
790         bswap   @S[2]
791         mov     %rbx,8+$ivec
792         bswap   @S[3]
793
794         call    _x86_64_Camellia_decrypt
795
796         mov     $_key,$key              # "rewind" the key
797         mov     $_end,%rdx
798         mov     $_res,%rcx
799
800         bswap   @S[0]
801         mov     ($inp),%rax             # load IV for next iteration
802         bswap   @S[1]
803         mov     8($inp),%rbx
804         bswap   @S[2]
805         xor     0+$ivec,@S[0]
806         bswap   @S[3]
807         xor     4+$ivec,@S[1]
808         xor     8+$ivec,@S[2]
809         lea     16($inp),$inp
810         xor     12+$ivec,@S[3]
811         cmp     %rdx,$inp
812         je      .Lcbc_ddone
813
814         mov     @S[0],0($out)
815         mov     @S[1],4($out)
816         mov     @S[2],8($out)
817         mov     @S[3],12($out)
818
819         lea     16($out),$out
820         jmp     .Lcbc_dloop
821
822 .align  16
823 .Lcbc_ddone:
824         mov     $_ivp,%rdx
825         cmp     \$0,%rcx
826         jne     .Lcbc_dec_tail
827
828         mov     @S[0],0($out)
829         mov     @S[1],4($out)
830         mov     @S[2],8($out)
831         mov     @S[3],12($out)
832
833         mov     %rax,(%rdx)             # write out IV residue
834         mov     %rbx,8(%rdx)
835         jmp     .Lcbc_done
836 .align  16
837 .Lcbc_dec_tail:
838         mov     @S[0],0+$ivec
839         mov     @S[1],4+$ivec
840         mov     @S[2],8+$ivec
841         mov     @S[3],12+$ivec
842
843 .Lcbc_dec_pushf:
844         pushfq
845         cld
846         lea     8+$ivec,%rsi
847         lea     ($out),%rdi
848         .long   0x9066A4F3              # rep movsb
849         popfq
850 .Lcbc_dec_popf:
851
852         mov     %rax,(%rdx)             # write out IV residue
853         mov     %rbx,8(%rdx)
854         jmp     .Lcbc_done
855
856 .align  16
857 .Lcbc_done:
858         mov     $_rsp,%rcx
859         mov     0(%rcx),%r15
860         mov     8(%rcx),%r14
861         mov     16(%rcx),%r13
862         mov     24(%rcx),%r12
863         mov     32(%rcx),%rbp
864         mov     40(%rcx),%rbx
865         lea     48(%rcx),%rsp
866 .Lcbc_abort:
867         ret
868 .size   Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
869
870 .asciz  "Camellia for x86_64 by <appro\@openssl.org>"
871 ___
872 }
873
# Win64 structured-exception-handling (SEH) support.  This text is appended
# to $code only when $win64 is set.  It defines two handlers and the
# .pdata/.xdata entries that reference them:
#
#   common_se_handler - referenced by the .xdata entries for
#                       Camellia_EncryptBlock_Rounds,
#                       Camellia_DecryptBlock_Rounds and Camellia_Ekeygen.
#                       It reads the prologue/epilogue label offsets from
#                       HandlerData[] and, when context->Rip lies between
#                       them, reloads rbx, rbp, r13, r14 and r15 from the
#                       stack frame into the CONTEXT.
#   cbc_se_handler    - referenced by the .xdata entry for
#                       Camellia_cbc_encrypt.  It compares context->Rip
#                       against the .Lcbc_* labels, adjusts the stack pointer
#                       by 8 while Rip is between a pushf/popf label pair,
#                       and reloads rbx, rbp and r12-r15 via the saved
#                       stack pointer slot (commented "$_rsp" below).
#
# Both handlers then store Rsp, Rsi and Rdi back into the CONTEXT, copy the
# CONTEXT to disp->ContextRecord, call RtlVirtualUnwind and return 1
# (ExceptionContinueSearch).  common_se_handler reaches that shared tail by
# jumping to .Lcommon_seh_exit, which sits inside cbc_se_handler.
#
# Signature shared by both handlers:
874 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
875 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
876 if ($win64) {
# Registers carrying the four handler arguments, in signature order.
# Only $context and $disp are referenced by the text below.
877 $rec="%rcx";
878 $frame="%rdx";
879 $context="%r8";
880 $disp="%r9";
881
# Everything up to the terminating ___ is emitted verbatim, apart from the
# interpolation of $context and $disp.
882 $code.=<<___;
883 .extern __imp_RtlVirtualUnwind
884 .type   common_se_handler,\@abi-omnipotent
885 .align  16
886 common_se_handler:
887         push    %rsi
888         push    %rdi
889         push    %rbx
890         push    %rbp
891         push    %r12
892         push    %r13
893         push    %r14
894         push    %r15
895         pushfq
896         lea     -64(%rsp),%rsp
897
898         mov     120($context),%rax      # pull context->Rax
899         mov     248($context),%rbx      # pull context->Rip
900
901         mov     8($disp),%rsi           # disp->ImageBase
902         mov     56($disp),%r11          # disp->HandlerData
903
904         mov     0(%r11),%r10d           # HandlerData[0]
905         lea     (%rsi,%r10),%r10        # prologue label
906         cmp     %r10,%rbx               # context->Rip<prologue label
907         jb      .Lin_prologue
908
909         mov     152($context),%rax      # pull context->Rsp
910
911         mov     4(%r11),%r10d           # HandlerData[1]
912         lea     (%rsi,%r10),%r10        # epilogue label
913         cmp     %r10,%rbx               # context->Rip>=epilogue label
914         jae     .Lin_prologue
915
916         lea     40(%rax),%rax
917         mov     -8(%rax),%rbx
918         mov     -16(%rax),%rbp
919         mov     -24(%rax),%r13
920         mov     -32(%rax),%r14
921         mov     -40(%rax),%r15
922         mov     %rbx,144($context)      # restore context->Rbx
923         mov     %rbp,160($context)      # restore context->Rbp
924         mov     %r13,224($context)      # restore context->R13
925         mov     %r14,232($context)      # restore context->R14
926         mov     %r15,240($context)      # restore context->R15
927
928 .Lin_prologue:
929         mov     8(%rax),%rdi
930         mov     16(%rax),%rsi
931         mov     %rax,152($context)      # restore context->Rsp
932         mov     %rsi,168($context)      # restore context->Rsi
933         mov     %rdi,176($context)      # restore context->Rdi
934
935         jmp     .Lcommon_seh_exit
936 .size   common_se_handler,.-common_se_handler
937
938 .type   cbc_se_handler,\@abi-omnipotent
939 .align  16
940 cbc_se_handler:
941         push    %rsi
942         push    %rdi
943         push    %rbx
944         push    %rbp
945         push    %r12
946         push    %r13
947         push    %r14
948         push    %r15
949         pushfq
950         lea     -64(%rsp),%rsp
951
952         mov     120($context),%rax      # pull context->Rax
953         mov     248($context),%rbx      # pull context->Rip
954
955         lea     .Lcbc_prologue(%rip),%r10
956         cmp     %r10,%rbx               # context->Rip<.Lcbc_prologue
957         jb      .Lin_cbc_prologue
958
959         lea     .Lcbc_body(%rip),%r10
960         cmp     %r10,%rbx               # context->Rip<.Lcbc_body
961         jb      .Lin_cbc_frame_setup
962
963         mov     152($context),%rax      # pull context->Rsp
964
965         lea     .Lcbc_abort(%rip),%r10
966         cmp     %r10,%rbx               # context->Rip>=.Lcbc_abort
967         jae     .Lin_cbc_prologue
968
969         # handle pushf/popf in Camellia_cbc_encrypt
970         lea     .Lcbc_enc_pushf(%rip),%r10
971         cmp     %r10,%rbx               # context->Rip<=.Lcbc_enc_pushf
972         jbe     .Lin_cbc_no_flag
973         lea     8(%rax),%rax
974         lea     .Lcbc_enc_popf(%rip),%r10
975         cmp     %r10,%rbx               # context->Rip<.Lcbc_enc_popf
976         jb      .Lin_cbc_no_flag
977         lea     -8(%rax),%rax
978         lea     .Lcbc_dec_pushf(%rip),%r10
979         cmp     %r10,%rbx               # context->Rip<=.Lcbc_dec_pushf
980         jbe     .Lin_cbc_no_flag
981         lea     8(%rax),%rax
982         lea     .Lcbc_dec_popf(%rip),%r10
983         cmp     %r10,%rbx               # context->Rip<.Lcbc_dec_popf
984         jb      .Lin_cbc_no_flag
985         lea     -8(%rax),%rax
986
987 .Lin_cbc_no_flag:
988         mov     48(%rax),%rax           # $_rsp
989         lea     48(%rax),%rax
990
991 .Lin_cbc_frame_setup:
992         mov     -8(%rax),%rbx
993         mov     -16(%rax),%rbp
994         mov     -24(%rax),%r12
995         mov     -32(%rax),%r13
996         mov     -40(%rax),%r14
997         mov     -48(%rax),%r15
998         mov     %rbx,144($context)      # restore context->Rbx
999         mov     %rbp,160($context)      # restore context->Rbp
1000         mov     %r12,216($context)      # restore context->R12
1001         mov     %r13,224($context)      # restore context->R13
1002         mov     %r14,232($context)      # restore context->R14
1003         mov     %r15,240($context)      # restore context->R15
1004
1005 .Lin_cbc_prologue:
1006         mov     8(%rax),%rdi
1007         mov     16(%rax),%rsi
1008         mov     %rax,152($context)      # restore context->Rsp
1009         mov     %rsi,168($context)      # restore context->Rsi
1010         mov     %rdi,176($context)      # restore context->Rdi
1011
1012 .align  4
1013 .Lcommon_seh_exit:
1014
1015         mov     40($disp),%rdi          # disp->ContextRecord
1016         mov     $context,%rsi           # context
1017         mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
1018         .long   0xa548f3fc              # cld; rep movsq
1019
1020         mov     $disp,%rsi
1021         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1022         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1023         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1024         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1025         mov     40(%rsi),%r10           # disp->ContextRecord
1026         lea     56(%rsi),%r11           # &disp->HandlerData
1027         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1028         mov     %r10,32(%rsp)           # arg5
1029         mov     %r11,40(%rsp)           # arg6
1030         mov     %r12,48(%rsp)           # arg7
1031         mov     %rcx,56(%rsp)           # arg8, (NULL)
1032         call    *__imp_RtlVirtualUnwind(%rip)
1033
1034         mov     \$1,%eax                # ExceptionContinueSearch
1035         lea     64(%rsp),%rsp
1036         popfq
1037         pop     %r15
1038         pop     %r14
1039         pop     %r13
1040         pop     %r12
1041         pop     %rbp
1042         pop     %rbx
1043         pop     %rdi
1044         pop     %rsi
1045         ret
1046 .size   cbc_se_handler,.-cbc_se_handler
1047
1048 .section        .pdata
1049 .align  4
1050         .rva    .LSEH_begin_Camellia_EncryptBlock_Rounds
1051         .rva    .LSEH_end_Camellia_EncryptBlock_Rounds
1052         .rva    .LSEH_info_Camellia_EncryptBlock_Rounds
1053
1054         .rva    .LSEH_begin_Camellia_DecryptBlock_Rounds
1055         .rva    .LSEH_end_Camellia_DecryptBlock_Rounds
1056         .rva    .LSEH_info_Camellia_DecryptBlock_Rounds
1057
1058         .rva    .LSEH_begin_Camellia_Ekeygen
1059         .rva    .LSEH_end_Camellia_Ekeygen
1060         .rva    .LSEH_info_Camellia_Ekeygen
1061
1062         .rva    .LSEH_begin_Camellia_cbc_encrypt
1063         .rva    .LSEH_end_Camellia_cbc_encrypt
1064         .rva    .LSEH_info_Camellia_cbc_encrypt
1065
1066 .section        .xdata
1067 .align  8
1068 .LSEH_info_Camellia_EncryptBlock_Rounds:
1069         .byte   9,0,0,0
1070         .rva    common_se_handler
1071         .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
1072 .LSEH_info_Camellia_DecryptBlock_Rounds:
1073         .byte   9,0,0,0
1074         .rva    common_se_handler
1075         .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
1076 .LSEH_info_Camellia_Ekeygen:
1077         .byte   9,0,0,0
1078         .rva    common_se_handler
1079         .rva    .Lkey_prologue,.Lkey_epilogue   # HandlerData[]
1080 .LSEH_info_Camellia_cbc_encrypt:
1081         .byte   9,0,0,0
1082         .rva    cbc_se_handler
1083 ___
1084 }
1085
# Final output stage: expand every `...` fragment in the accumulated text by
# evaluating it as a Perl expression (for example `1232/8`), then write the
# result to STDOUT.
1086 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1087 print $code;
# Buffered write errors are only reported when the handle is closed, so the
# result of close must be checked; otherwise truncated output would still
# leave the script exiting successfully.
# NOTE(review): if STDOUT was reopened as a pipe earlier in the file (not
# visible from here), a failing reader is also reported by this close.
1088 close STDOUT or die "error closing STDOUT: $!";