Add Camellia assembler x86 and x86_64 modules.
[openssl.git] / crypto / camellia / asm / cmll-x86_64.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5 #
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
13
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
16 #
17 #                       AMD64   Core2   EM64T
18 # -evp camellia-128-ecb 16.7    21.0    22.7
19 # + over gcc 3.4.6      +25%    +5%     0%
20 #
21 # camellia-128-cbc      15.7    20.4    21.1
22 #
23 # 128-bit key setup     128     216     205     cycles/key
24 # + over gcc 3.4.6      +54%    +39%    +15%
25 #
26 # Numbers in "+" rows represent performance improvement over compiler
27 # generated code. Key setup timings are impressive on AMD and Core2
28 # thanks to 64-bit operations being covertly deployed. Improvement on
29 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30 # apparently emulates some of 64-bit operations in [32-bit] microcode.
31
# Command line: [flavour] [output].  A first argument that contains a dot
# is taken to be the output file name, i.e. the flavour was omitted.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by flavour or by an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script, then under ../../perlasm/.
# When $0 has no directory part the prefix is empty; $1 is consulted only
# after a successful match so a stale capture can never leak in.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  The interpreter and file names are quoted so that paths
# containing blanks survive the shell, absent arguments are simply left
# out, and a failure to start the pipe is fatal instead of silent.
{
    my @cmd = ("\"$^X\"", "\"$xlate\"");
    push @cmd, $flavour      if (defined($flavour) && length($flavour));
    push @cmd, "\"$output\"" if (defined($output)  && length($output));
    open STDOUT, "| @cmd" or die "can't call $xlate: $!";
}
44
# hi(REG): map %eax/%rax ... %edx/%rdx to the matching high-byte register
# (%ah ... %dh).  Any other register name is returned unchanged.
# Declared without a prototype so that it can be called both as hi(...)
# and as &hi(...).
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;    $r; }
# lo(REG): map a 32/64-bit register name to its low-byte alias:
# %eax -> %al, %esi/%edi -> %sil/%dil, %r8[d] -> %r8b.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
                      $r =~ s/%[er]([sd]i)/%${1}l/;
                      $r =~ s/%(r[0-9]+)[d]?/%${1}b/;   $r; }
49
# Register allocation shared by all code generators below.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");        # temporaries
@S=map { "%r${_}d" } (8..11);                           # four state words
($i0,$i1)=("%esi","%edi");                              # table indices
$Tbl="%rbp";    # size optimization
($inp,$out)=("%r12","%r13");
($key,$keyend)=("%r14","%r15");
$arg0d=$win64?"%ecx":"%edi";                            # 1st argument, 32-bit

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte offsets from the start of the table.
($SBOX1_1110,$SBOX4_4404)=(0,4);                # Camellia_SBOX[0], [1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);          # Camellia_SBOX[2], [3]
68
# Camellia_Feistel($i [,$seed])
#
# Append the code for one Feistel round to $code.
#
#   $i    - round index.  Its parity selects which half of @S is the
#           round input: even rounds read @S[0,1] and update @S[2,3],
#           odd rounds read @S[2,3] and update @S[0,1].
#   $seed - optional byte displacement of the subkeys relative to $key
#           (default 0).  A negative seed makes the generated code step
#           through the subkeys with a stride of -8 instead of 8.
#
# The generated code expects the current subkey in $t1/$t0 on entry and
# leaves the subkey for round $i+1 there on exit.  $t2, $t3, $i0 and $i1
# are used as scratch.  The `...` fragments are left in the text as is;
# nothing in this sub evaluates them.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# One list assignment so that all four names are lexical; chaining them
# with commas after a single "my" would make only the first one private.
my ($s0,$s1,$s2,$s3)=@S[($j)%4,($j+1)%4,($j+2)%4,($j+3)%4];

$code.=<<___;
        xor     $s0,$t0                         # t0^=key[0]
        xor     $s1,$t1                         # t1^=key[1]
        movz    `&hi("$t0")`,$i0                # (t0>>8)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>0)&0xff
        mov     $SBOX3_3033($Tbl,$i0,8),$t3     # t3=SBOX3_3033[0]
        mov     $SBOX1_1110($Tbl,$i1,8),$t2     # t2=SBOX1_1110[1]
        movz    `&lo("$t0")`,$i0                # (t0>>0)&0xff
        shr     \$16,$t0
        movz    `&hi("$t1")`,$i1                # (t1>>8)&0xff
        xor     $SBOX4_4404($Tbl,$i0,8),$t3     # t3^=SBOX4_4404[0]
        shr     \$16,$t1
        xor     $SBOX4_4404($Tbl,$i1,8),$t2     # t2^=SBOX4_4404[1]
        movz    `&hi("$t0")`,$i0                # (t0>>24)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>16)&0xff
        xor     $SBOX1_1110($Tbl,$i0,8),$t3     # t3^=SBOX1_1110[0]
        xor     $SBOX3_3033($Tbl,$i1,8),$t2     # t2^=SBOX3_3033[1]
        movz    `&lo("$t0")`,$i0                # (t0>>16)&0xff
        movz    `&hi("$t1")`,$i1                # (t1>>24)&0xff
        xor     $SBOX2_0222($Tbl,$i0,8),$t3     # t3^=SBOX2_0222[0]
        xor     $SBOX2_0222($Tbl,$i1,8),$t2     # t2^=SBOX2_0222[1]
        mov     `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
        mov     `$seed+($i+1)*$scale+4`($key),$t0
        xor     $t3,$t2                         # t2^=t3
        ror     \$8,$t3                         # t3=RightRotate(t3,8)
        xor     $t2,$s2
        xor     $t2,$s3
        xor     $t3,$s3
___
}
106
107 # void Camellia_EncryptBlock_Rounds(
108 #               int grandRounds,
109 #               const Byte plaintext[],
110 #               const KEY_TABLE_TYPE keyTable,
111 #               Byte ciphertext[])
# First chunk of generated text (note the plain assignment: whatever $code
# held before is replaced).  It contains
#   - Camellia_EncryptBlock, the V1.x wrapper, which turns keyBitLength in
#     the first argument into a grandRounds value of 3 or 4 and jumps to
#     .Lenc_rounds;
#   - Camellia_EncryptBlock_Rounds, which loads and byte-swaps the 16-byte
#     input block into @S, calls _x86_64_Camellia_encrypt and stores the
#     byte-swapped result;
#   - the head of _x86_64_Camellia_encrypt: the initial key xor and the
#     top of .Leloop with the first subkey prefetched into $t1/$t0.
112 $code=<<___;
113 .text
114
115 # V1.x API
116 .globl  Camellia_EncryptBlock
117 .type   Camellia_EncryptBlock,\@abi-omnipotent
118 .align  16
119 Camellia_EncryptBlock:
120         movl    \$128,%eax
121         subl    $arg0d,%eax
122         movl    \$3,$arg0d
123         adcl    \$0,$arg0d      # keyBitLength==128?3:4
124         jmp     .Lenc_rounds
125 .size   Camellia_EncryptBlock,.-Camellia_EncryptBlock
126 # V2
127 .globl  Camellia_EncryptBlock_Rounds
128 .type   Camellia_EncryptBlock_Rounds,\@function,4
129 .align  16
130 .Lenc_rounds:
131 Camellia_EncryptBlock_Rounds:
132         push    %rbx
133         push    %rbp
134         push    %r13
135         push    %r14
136         push    %r15
137 .Lenc_prologue:
138
139         #mov    %rsi,$inp               # put away arguments
140         mov     %rcx,$out
141         mov     %rdx,$key
142
143         shl     \$6,%edi                # process grandRounds
144         lea     .LCamellia_SBOX(%rip),$Tbl
145         lea     ($key,%rdi),$keyend
146
147         mov     0(%rsi),@S[0]           # load plaintext
148         mov     4(%rsi),@S[1]
149         mov     8(%rsi),@S[2]
150         bswap   @S[0]
151         mov     12(%rsi),@S[3]
152         bswap   @S[1]
153         bswap   @S[2]
154         bswap   @S[3]
155
156         call    _x86_64_Camellia_encrypt
157
158         bswap   @S[0]
159         bswap   @S[1]
160         bswap   @S[2]
161         mov     @S[0],0($out)
162         bswap   @S[3]
163         mov     @S[1],4($out)
164         mov     @S[2],8($out)
165         mov     @S[3],12($out)
166
167         mov     0(%rsp),%r15
168         mov     8(%rsp),%r14
169         mov     16(%rsp),%r13
170         mov     24(%rsp),%rbp
171         mov     32(%rsp),%rbx
172         lea     40(%rsp),%rsp
173 .Lenc_epilogue:
174         ret
175 .size   Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
176
177 .type   _x86_64_Camellia_encrypt,\@abi-omnipotent
178 .align  16
179 _x86_64_Camellia_encrypt:
180         xor     0($key),@S[1]
181         xor     4($key),@S[0]           # ^=key[0-3]
182         xor     8($key),@S[3]
183         xor     12($key),@S[2]
184 .align  16
185 .Leloop:
186         mov     16($key),$t1            # prefetch key[4-5]
187         mov     20($key),$t0
188
189 ___
# Body of .Leloop: six Feistel rounds.  With seed 16 round $i prefetches
# the subkey for the following round from 16+($i+1)*8 bytes past $key.
190         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
# Tail of _x86_64_Camellia_encrypt followed by the decryption entry points.
#   - After the six rounds $key is advanced by 64 bytes; the loop ends at
#     .Ledone once it reaches $keyend, otherwise the state words are mixed
#     with the four subkeys already held in $t0..$t3 and the loop repeats.
#   - .Ledone xors the state with $t0..$t3 and swaps the two halves.
#   - Camellia_DecryptBlock / Camellia_DecryptBlock_Rounds mirror the
#     encryption wrappers, except that $key starts at the far end of the
#     schedule and $keyend is its beginning.
#   - The head of _x86_64_Camellia_decrypt: initial key xor and the top of
#     .Ldloop, prefetching the subkey that sits 8 bytes below $key.
191 $code.=<<___;
192         lea     16*4($key),$key
193         cmp     $keyend,$key
194         mov     8($key),$t3             # prefetch key[2-3]
195         mov     12($key),$t2
196         je      .Ledone
197
198         and     @S[0],$t0
199         or      @S[3],$t3
200         rol     \$1,$t0
201         xor     $t3,@S[2]               # s2^=s3|key[3];
202         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
203         and     @S[2],$t2
204         or      @S[1],$t1
205         rol     \$1,$t2
206         xor     $t1,@S[0]               # s0^=s1|key[1];
207         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
208         jmp     .Leloop
209
210 .align  16
211 .Ledone:
212         xor     @S[2],$t0               # SwapHalf
213         xor     @S[3],$t1
214         xor     @S[0],$t2
215         xor     @S[1],$t3
216
217         mov     $t0,@S[0]
218         mov     $t1,@S[1]
219         mov     $t2,@S[2]
220         mov     $t3,@S[3]
221
222         .byte   0xf3,0xc3               # rep ret
223 .size   _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
224
225 # V1.x API
226 .globl  Camellia_DecryptBlock
227 .type   Camellia_DecryptBlock,\@abi-omnipotent
228 .align  16
229 Camellia_DecryptBlock:
230         movl    \$128,%eax
231         subl    $arg0d,%eax
232         movl    \$3,$arg0d
233         adcl    \$0,$arg0d      # keyBitLength==128?3:4
234         jmp     .Ldec_rounds
235 .size   Camellia_DecryptBlock,.-Camellia_DecryptBlock
236 # V2
237 .globl  Camellia_DecryptBlock_Rounds
238 .type   Camellia_DecryptBlock_Rounds,\@function,4
239 .align  16
240 .Ldec_rounds:
241 Camellia_DecryptBlock_Rounds:
242         push    %rbx
243         push    %rbp
244         push    %r13
245         push    %r14
246         push    %r15
247 .Ldec_prologue:
248
249         #mov    %rsi,$inp               # put away arguments
250         mov     %rcx,$out
251         mov     %rdx,$keyend
252
253         shl     \$6,%edi                # process grandRounds
254         lea     .LCamellia_SBOX(%rip),$Tbl
255         lea     ($keyend,%rdi),$key
256
257         mov     0(%rsi),@S[0]           # load plaintext
258         mov     4(%rsi),@S[1]
259         mov     8(%rsi),@S[2]
260         bswap   @S[0]
261         mov     12(%rsi),@S[3]
262         bswap   @S[1]
263         bswap   @S[2]
264         bswap   @S[3]
265
266         call    _x86_64_Camellia_decrypt
267
268         bswap   @S[0]
269         bswap   @S[1]
270         bswap   @S[2]
271         mov     @S[0],0($out)
272         bswap   @S[3]
273         mov     @S[1],4($out)
274         mov     @S[2],8($out)
275         mov     @S[3],12($out)
276
277         mov     0(%rsp),%r15
278         mov     8(%rsp),%r14
279         mov     16(%rsp),%r13
280         mov     24(%rsp),%rbp
281         mov     32(%rsp),%rbx
282         lea     40(%rsp),%rsp
283 .Ldec_epilogue:
284         ret
285 .size   Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
286
287 .type   _x86_64_Camellia_decrypt,\@abi-omnipotent
288 .align  16
289 _x86_64_Camellia_decrypt:
290         xor     0($key),@S[1]
291         xor     4($key),@S[0]           # ^=key[0-3]
292         xor     8($key),@S[3]
293         xor     12($key),@S[2]
294 .align  16
295 .Ldloop:
296         mov     -8($key),$t1            # prefetch key[4-5]
297         mov     -4($key),$t0
298
299 ___
# Body of .Ldloop: six Feistel rounds walking the key schedule backwards.
# The negative seed selects a stride of -8, so round $i prefetches its
# next subkey from -8-($i+1)*8 bytes relative to $key.
300         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
# Tail of _x86_64_Camellia_decrypt: step $key back by 64 bytes, leave the
# loop at .Lddone once $keyend is reached, otherwise mix the state with
# the four subkeys held in $t0..$t3 and iterate.  .Lddone applies the
# final key xor and swaps the two halves of the state.
301 $code.=<<___;
302         lea     -16*4($key),$key
303         cmp     $keyend,$key
304         mov     0($key),$t3             # prefetch key[2-3]
305         mov     4($key),$t2
306         je      .Lddone
307
308         and     @S[0],$t0
309         or      @S[3],$t3
310         rol     \$1,$t0
311         xor     $t3,@S[2]               # s2^=s3|key[3];
312         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
313         and     @S[2],$t2
314         or      @S[1],$t1
315         rol     \$1,$t2
316         xor     $t1,@S[0]               # s0^=s1|key[1];
317         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
318
319         jmp     .Ldloop
320
321 .align  16
322 .Lddone:
323         xor     @S[2],$t2
324         xor     @S[3],$t3
325         xor     @S[0],$t0
326         xor     @S[1],$t1
327
328         mov     $t2,@S[0]               # SwapHalf
329         mov     $t3,@S[1]
330         mov     $t0,@S[2]
331         mov     $t1,@S[3]
332
333         .byte   0xf3,0xc3               # rep ret
334 .size   _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
335 ___
336
# _saveround($rnd, $key, [$bias,] @regs)
#
# Append stores of key material to $code.
#
#   $rnd  - index of the first 8-byte slot to write; the byte offset is
#           $bias+$rnd*8
#   $key  - register holding the base address
#   $bias - optional integer displacement (default 0).  It is recognised
#           by looking like an integer, so an explicit 0 works as well
#           and register names are never numified.
#   @regs - either four registers, which are stored as 32-bit words in the
#           order 1,0,3,2 at offsets +0,+4,+8,+12, or one or two registers
#           stored at offsets +0 and +8.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0]=~/^[+-]?\d+$/)?shift(@T):0;

    if ($#T==3) {
        $code.=<<___;
        mov     $T[1],`$bias+$rnd*8+0`($key)
        mov     $T[0],`$bias+$rnd*8+4`($key)
        mov     $T[3],`$bias+$rnd*8+8`($key)
        mov     $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
        $code.="        mov     $T[0],`$bias+$rnd*8+0`($key)\n";
        $code.="        mov     $T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
    }
}
353
# _loadround($rnd, $key, [$bias,] @regs)
#
# Append loads of key material to $code: the first register is loaded
# from $bias+$rnd*8($key) and, when given, the second one from 8 bytes
# further on.  $bias is optional (default 0) and is recognised by looking
# like an integer, so an explicit 0 is accepted and register names are
# never numified.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=(@T && $T[0]=~/^[+-]?\d+$/)?shift(@T):0;

$code.="        mov     `$bias+$rnd*8+0`($key),$T[0]\n";
$code.="        mov     `$bias+$rnd*8+8`($key),$T[1]\n" if ($#T>=1);
}
361
# 128-bit left rotate of a register pair built on shld, with %r11 as
# scratch.  shld is very slow on the Intel EM64T family, and even on AMD
# it limits the instruction decode rate [because it's VectorPath] and
# consequently performance...
# A zero rotate count emits nothing.
sub __rotl128 {
my ($upper,$lower,$rot)=@_;

    if ($rot) {
        $code.=join("\n",
                "        mov     $upper,%r11",
                "        shld    \$$rot,$lower,$upper",
                "        shld    \$$rot,%r11,$lower",
                "");
    }
}
376
# ... Implementing the 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
#
# _rotl128($i0,$i1,$rot) appends code that rotates the 128-bit value held
# in the register pair $i0:$i1 left by $rot bits, using %r9 and %r11 as
# scratch.  The complementary count is emitted as a `64-$rot` expression
# and left unevaluated here.  A zero rotate count emits nothing.
sub _rotl128 {
my ($upper,$lower,$rot)=@_;

    if ($rot) {
        my $left ="\$$rot";
        my $right="\$`64-$rot`";
        foreach my $insn (
                ["mov",$upper,"%r11"],
                ["shl",$left, $upper],
                ["mov",$lower,"%r9"],
                ["shr",$right,"%r9"],
                ["shr",$right,"%r11"],
                ["or", "%r9", $upper],
                ["shl",$left, $lower],
                ["or", "%r11",$lower]) {
            $code.=sprintf("        %-8s%s,%s\n",@$insn);
        }
    }
}
396
# Generator for Camellia_Ekeygen, the key-schedule routine.  Per the
# generated prologue its arguments are keyBitLength (%rdi), the raw key
# (%rsi) and the key table to fill (%rdx); it returns 3 for 128-bit keys
# and 4 otherwise in %eax.
# $step counts the Feistel rounds emitted so far.  Camellia_Feistel is
# called without a seed here, so round $step prefetches the constant for
# the next round from ($step+1)*8 bytes past $key, which points at
# .LCamellia_SIGMA.
397 { my $step=0;
398
399 $code.=<<___;
400 .globl  Camellia_Ekeygen
401 .type   Camellia_Ekeygen,\@function,3
402 .align  16
403 Camellia_Ekeygen:
404         push    %rbx
405         push    %rbp
406         push    %r13
407         push    %r14
408         push    %r15
409 .Lkey_prologue:
410
411         mov     %rdi,$keyend            # put away arguments, keyBitLength
412         mov     %rdx,$out               # keyTable
413
414         mov     0(%rsi),@S[0]           # load 0-127 bits
415         mov     4(%rsi),@S[1]
416         mov     8(%rsi),@S[2]
417         mov     12(%rsi),@S[3]
418
419         bswap   @S[0]
420         bswap   @S[1]
421         bswap   @S[2]
422         bswap   @S[3]
423 ___
424         &_saveround     (0,$out,@S);    # KL<<<0
425 $code.=<<___;
426         cmp     \$128,$keyend           # check keyBitLength
427         je      .L1st128
428
429         mov     16(%rsi),@S[0]          # load 128-191 bits
430         mov     20(%rsi),@S[1]
431         cmp     \$192,$keyend
432         je      .L1st192
433         mov     24(%rsi),@S[2]          # load 192-255 bits
434         mov     28(%rsi),@S[3]
435         jmp     .L1st256
436 .L1st192:
437         mov     @S[0],@S[2]
438         mov     @S[1],@S[3]
439         not     @S[2]
440         not     @S[3]
441 .L1st256:
442         bswap   @S[0]
443         bswap   @S[1]
444         bswap   @S[2]
445         bswap   @S[3]
446 ___
447         &_saveround     (4,$out,@S);    # temp storage for KR!
448 $code.=<<___;
449         xor     0($out),@S[1]           # KR^KL
450         xor     4($out),@S[0]
451         xor     8($out),@S[3]
452         xor     12($out),@S[2]
453
454 .L1st128:
455         lea     .LCamellia_SIGMA(%rip),$key
456         lea     .LCamellia_SBOX(%rip),$Tbl
457
458         mov     0($key),$t1
459         mov     4($key),$t0
460 ___
461         &Camellia_Feistel($step++);
462         &Camellia_Feistel($step++);
463 $code.=<<___;
464         xor     0($out),@S[1]           # ^KL
465         xor     4($out),@S[0]
466         xor     8($out),@S[3]
467         xor     12($out),@S[2]
468 ___
469         &Camellia_Feistel($step++);
470         &Camellia_Feistel($step++);
471 $code.=<<___;
472         cmp     \$128,$keyend
473         jne     .L2nd256
474
475         lea     128($out),$out          # size optimization
476         shl     \$32,%r8                # @S[0]||
477         shl     \$32,%r10               # @S[2]||
478         or      %r9,%r8                 # ||@S[1]
479         or      %r11,%r10               # ||@S[3]
480 ___
# 128-bit key path.  The four state words have just been packed into
# %r8/%r10, which leaves %r9 and %r11 free to serve as the scratch
# registers of _rotl128.  $out was advanced by 128, hence the -128 bias on
# every load and store below.  Rotations are cumulative: each comment
# gives previous+increment=total.
481         &_loadround     (0,$out,-128,"%rax","%rbx");    # KL
482         &_saveround     (2,$out,-128,"%r8","%r10");     # KA<<<0
483         &_rotl128       ("%rax","%rbx",15);
484         &_saveround     (4,$out,-128,"%rax","%rbx");    # KL<<<15
485         &_rotl128       ("%r8","%r10",15);
486         &_saveround     (6,$out,-128,"%r8","%r10");     # KA<<<15
487         &_rotl128       ("%r8","%r10",15);              # 15+15=30
488         &_saveround     (8,$out,-128,"%r8","%r10");     # KA<<<30
489         &_rotl128       ("%rax","%rbx",30);             # 15+30=45
490         &_saveround     (10,$out,-128,"%rax","%rbx");   # KL<<<45
491         &_rotl128       ("%r8","%r10",15);              # 30+15=45
492         &_saveround     (12,$out,-128,"%r8");           # KA<<<45
493         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
494         &_saveround     (13,$out,-128,"%rbx");          # KL<<<60
495         &_rotl128       ("%r8","%r10",15);              # 45+15=60
496         &_saveround     (14,$out,-128,"%r8","%r10");    # KA<<<60
497         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
498         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<77
499         &_rotl128       ("%rax","%rbx",17);             # 77+17=94
500         &_saveround     (18,$out,-128,"%rax","%rbx");   # KL<<<94
501         &_rotl128       ("%r8","%r10",34);              # 60+34=94
502         &_saveround     (20,$out,-128,"%r8","%r10");    # KA<<<94
503         &_rotl128       ("%rax","%rbx",17);             # 94+17=111
504         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<111
505         &_rotl128       ("%r8","%r10",17);              # 94+17=111
506         &_saveround     (24,$out,-128,"%r8","%r10");    # KA<<<111
507 $code.=<<___;
508         mov     \$3,%eax
509         jmp     .Ldone
510 .align  16
511 .L2nd256:
512 ___
513         &_saveround     (6,$out,@S);    # temp storage for KA!
514 $code.=<<___;
515         xor     `4*8+0`($out),@S[1]     # KA^KR
516         xor     `4*8+4`($out),@S[0]
517         xor     `5*8+0`($out),@S[3]
518         xor     `5*8+4`($out),@S[2]
519 ___
520         &Camellia_Feistel($step++);
521         &Camellia_Feistel($step++);
522
# 192/256-bit key path.  No Feistel round is emitted after this point, so
# %r14/%r15 ($key/$keyend) are reused to hold KA; KL goes to %rax/%rbx
# and KR to %rcx/%rdx.
523         &_loadround     (0,$out,"%rax","%rbx"); # KL
524         &_loadround     (4,$out,"%rcx","%rdx"); # KR
525         &_loadround     (6,$out,"%r14","%r15"); # KA
526 $code.=<<___;
527         lea     128($out),$out          # size optimization
528         shl     \$32,%r8                # @S[0]||
529         shl     \$32,%r10               # @S[2]||
530         or      %r9,%r8                 # ||@S[1]
531         or      %r11,%r10               # ||@S[3]
532 ___
533         &_saveround     (2,$out,-128,"%r8","%r10");     # KB<<<0
534         &_rotl128       ("%rcx","%rdx",15);
535         &_saveround     (4,$out,-128,"%rcx","%rdx");    # KR<<<15
536         &_rotl128       ("%r14","%r15",15);
537         &_saveround     (6,$out,-128,"%r14","%r15");    # KA<<<15
538         &_rotl128       ("%rcx","%rdx",15);             # 15+15=30
539         &_saveround     (8,$out,-128,"%rcx","%rdx");    # KR<<<30
540         &_rotl128       ("%r8","%r10",30);
541         &_saveround     (10,$out,-128,"%r8","%r10");    # KB<<<30
542         &_rotl128       ("%rax","%rbx",45);
543         &_saveround     (12,$out,-128,"%rax","%rbx");   # KL<<<45
544         &_rotl128       ("%r14","%r15",30);             # 15+30=45
545         &_saveround     (14,$out,-128,"%r14","%r15");   # KA<<<45
546         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
547         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<60
548         &_rotl128       ("%rcx","%rdx",30);             # 30+30=60
549         &_saveround     (18,$out,-128,"%rcx","%rdx");   # KR<<<60
550         &_rotl128       ("%r8","%r10",30);              # 30+30=60
551         &_saveround     (20,$out,-128,"%r8","%r10");    # KB<<<60
552         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
553         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<77
554         &_rotl128       ("%r14","%r15",32);             # 45+32=77
555         &_saveround     (24,$out,-128,"%r14","%r15");   # KA<<<77
556         &_rotl128       ("%rcx","%rdx",34);             # 60+34=94
557         &_saveround     (26,$out,-128,"%rcx","%rdx");   # KR<<<94
558         &_rotl128       ("%r14","%r15",17);             # 77+17=94
559         &_saveround     (28,$out,-128,"%r14","%r15");   # KA<<<94
560         &_rotl128       ("%rax","%rbx",34);             # 77+34=111
561         &_saveround     (30,$out,-128,"%rax","%rbx");   # KL<<<111
562         &_rotl128       ("%r8","%r10",51);              # 60+51=111
563         &_saveround     (32,$out,-128,"%r8","%r10");    # KB<<<111
564 $code.=<<___;
565         mov     \$4,%eax
566 .Ldone:
567         mov     0(%rsp),%r15
568         mov     8(%rsp),%r14
569         mov     16(%rsp),%r13
570         mov     24(%rsp),%rbp
571         mov     32(%rsp),%rbx
572         lea     40(%rsp),%rsp
573 .Lkey_epilogue:
574         ret
575 .size   Camellia_Ekeygen,.-Camellia_Ekeygen
576 ___
577 }
578
# 256-entry byte substitution table.  It is never emitted as is: the
# S1110/S4404/S0222/S3033 helpers read it to build the 32-bit entries of
# the interleaved lookup tables placed under .LCamellia_SBOX.
579 @SBOX=(
580 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
581  35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
582 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
583 166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
584 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
585 223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
586  20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
587 254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
588 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
589  16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
590 135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
591  82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
592 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
593 120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
594 114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
595  64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
596
# Table-entry builders.  Each takes an index 0..255 and returns one 32-bit
# entry formatted as "0x%08x".  The digits in the name give the byte
# layout, most significant byte first: 0 is a zero byte, any other digit
# a copy of the substituted byte.

# @SBOX[i] replicated into the three upper bytes.
sub S1110 {
    my ($idx)=@_;
    my $s=$SBOX[$idx];
    return sprintf("0x%08x",($s<<24)|($s<<16)|($s<<8));
}

# The index is rotated left by one bit before the lookup.
sub S4404 {
    my ($idx)=@_;
    my $s=$SBOX[(($idx<<1)|($idx>>7))&0xff];
    return sprintf("0x%08x",($s<<24)|($s<<16)|$s);
}

# The looked-up byte is rotated left by one bit.
sub S0222 {
    my ($idx)=@_;
    my $s=$SBOX[$idx];
    $s=(($s<<1)|($s>>7))&0xff;
    return sprintf("0x%08x",($s<<16)|($s<<8)|$s);
}

# The looked-up byte is rotated right by one bit.
sub S3033 {
    my ($idx)=@_;
    my $s=$SBOX[$idx];
    $s=(($s>>1)|($s<<7))&0xff;
    return sprintf("0x%08x",($s<<24)|($s<<8)|$s);
}
601
# Constant data: the SIGMA constants used by Camellia_Ekeygen, one row of
# zero padding, then the label under which the lookup tables are emitted.
$code.=<<___;
.align  64
.LCamellia_SIGMA:
.long   0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long   0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long   0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long   0,          0,          0,          0
.LCamellia_SBOX:
___
# Append one ".long" directive listing all arguments.
sub data_word { $code.=sprintf(".long\t%s\n",join(',',@_)); }

# The tables are interleaved pairwise: every row carries the entries of
# two tables for the same index.
$i=0; while ($i<256) { data_word(S1110($i),S4404($i)); $i++; }
$i=0; while ($i<256) { data_word(S0222($i),S3033($i)); $i++; }
615
616 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
617 #                       size_t length, const CAMELLIA_KEY *key,
618 #                       unsigned char *ivp,const int enc);
# Generator for the CBC routine.  The generated code returns immediately
# for a zero length, builds a 64-byte aligned stack frame, touches the
# whole lookup table once, and then runs either the encrypt loop
# (.Lcbc_eloop) or the decrypt loop (.LCBC_DECRYPT/.Lcbc_dloop) depending
# on the enc argument.  A trailing partial block is handled through the
# 16-byte $ivec buffer in .Lcbc_enc_tail and .Lcbc_dec_tail.
619 {
# Stack-frame slots, addressed relative to the realigned %rsp.
620 $_key="0(%rsp)";
621 $_end="8(%rsp)";        # inp+len&~15
622 $_res="16(%rsp)";       # len&15
623 $ivec="24(%rsp)";
624 $_ivp="40(%rsp)";
625 $_rsp="48(%rsp)";
626
627 $code.=<<___;
628 .globl  Camellia_cbc_encrypt
629 .type   Camellia_cbc_encrypt,\@function,6
630 .align  16
631 Camellia_cbc_encrypt:
632         cmp     \$0,%rdx
633         je      .Lcbc_abort
634         push    %rbx
635         push    %rbp
636         push    %r12
637         push    %r13
638         push    %r14
639         push    %r15
640 .Lcbc_prologue:
641
642         mov     %rsp,%rbp
643         sub     \$64,%rsp
644         and     \$-64,%rsp
645
646         # place stack frame just "above mod 1024" the key schedule,
647         # this ensures that cache associativity suffices
648         lea     -64-63(%rcx),%r10
649         sub     %rsp,%r10
650         neg     %r10
651         and     \$0x3C0,%r10
652         sub     %r10,%rsp
653         add     \$8,%rsp                # 8 is reserved for callee's ra
654
655         mov     %rdi,$inp               # inp argument
656         mov     %rsi,$out               # out argument
657         mov     %r8,%rbx                # ivp argument
658         mov     %rcx,$key               # key argument
659         mov     272(%rcx),$keyend       # grandRounds
660
661         mov     %r8,$_ivp
662         mov     %rbp,$_rsp
663
664 .Lcbc_body:
665         lea     .LCamellia_SBOX(%rip),$Tbl
666
667         mov     \$32,%ecx
668 .align  4
669 .Lcbc_prefetch_sbox:
670         mov     0($Tbl),%rax
671         mov     32($Tbl),%rsi
672         mov     64($Tbl),%rdi
673         mov     96($Tbl),%r11
674         lea     128($Tbl),$Tbl
675         loop    .Lcbc_prefetch_sbox
676         sub     \$4096,$Tbl
677         shl     \$6,$keyend
678         mov     %rdx,%rcx               # len argument
679         lea     ($key,$keyend),$keyend
680
681         cmp     \$0,%r9d                # enc argument
682         je      .LCBC_DECRYPT
683
684         and     \$-16,%rdx
685         and     \$15,%rcx               # length residue
686         lea     ($inp,%rdx),%rdx
687         mov     $key,$_key
688         mov     %rdx,$_end
689         mov     %rcx,$_res
690
691         cmp     $inp,%rdx
692         mov     0(%rbx),@S[0]           # load IV
693         mov     4(%rbx),@S[1]
694         mov     8(%rbx),@S[2]
695         mov     12(%rbx),@S[3]
696         je      .Lcbc_enc_tail
697         jmp     .Lcbc_eloop
698
699 .align  16
700 .Lcbc_eloop:
701         xor     0($inp),@S[0]
702         xor     4($inp),@S[1]
703         xor     8($inp),@S[2]
704         bswap   @S[0]
705         xor     12($inp),@S[3]
706         bswap   @S[1]
707         bswap   @S[2]
708         bswap   @S[3]
709
710         call    _x86_64_Camellia_encrypt
711
712         mov     $_key,$key              # "rewind" the key
713         bswap   @S[0]
714         mov     $_end,%rdx
715         bswap   @S[1]
716         mov     $_res,%rcx
717         bswap   @S[2]
718         mov     @S[0],0($out)
719         bswap   @S[3]
720         mov     @S[1],4($out)
721         mov     @S[2],8($out)
722         lea     16($inp),$inp
723         mov     @S[3],12($out)
724         cmp     %rdx,$inp
725         lea     16($out),$out
726         jne     .Lcbc_eloop
727
728         cmp     \$0,%rcx
729         jne     .Lcbc_enc_tail
730
731         mov     $_ivp,$out
732         mov     @S[0],0($out)           # write out IV residue
733         mov     @S[1],4($out)
734         mov     @S[2],8($out)
735         mov     @S[3],12($out)
736         jmp     .Lcbc_done
737
738 .align  16
739 .Lcbc_enc_tail:
740         xor     %rax,%rax
741         mov     %rax,0+$ivec
742         mov     %rax,8+$ivec
743         mov     %rax,$_res
744
745         pushfq
746         cld
747         mov     $inp,%rsi
748         lea     $ivec,%rdi
749         .long   0x9066A4F3              # rep movsb
750         popfq
751
752         lea     $ivec,$inp
753         lea     16+$ivec,%rax
754         mov     %rax,$_end
755         jmp     .Lcbc_eloop             # one more time
756
757 .align  16
758 .LCBC_DECRYPT:
759         xchg    $key,$keyend
760         add     \$15,%rdx
761         and     \$15,%rcx               # length residue
762         and     \$-16,%rdx
763         mov     $key,$_key
764         lea     ($inp,%rdx),%rdx
765         mov     %rdx,$_end
766         mov     %rcx,$_res
767
768         mov     (%rbx),%rax             # load IV
769         mov     8(%rbx),%rbx
770         jmp     .Lcbc_dloop
771 .align  16
772 .Lcbc_dloop:
773         mov     0($inp),@S[0]
774         mov     4($inp),@S[1]
775         mov     8($inp),@S[2]
776         bswap   @S[0]
777         mov     12($inp),@S[3]
778         bswap   @S[1]
779         mov     %rax,0+$ivec            # save IV to temporary storage
780         bswap   @S[2]
781         mov     %rbx,8+$ivec
782         bswap   @S[3]
783
784         call    _x86_64_Camellia_decrypt
785
786         mov     $_key,$key              # "rewind" the key
787         mov     $_end,%rdx
788         mov     $_res,%rcx
789
790         bswap   @S[0]
791         mov     ($inp),%rax             # load IV for next iteration
792         bswap   @S[1]
793         mov     8($inp),%rbx
794         bswap   @S[2]
795         xor     0+$ivec,@S[0]
796         bswap   @S[3]
797         xor     4+$ivec,@S[1]
798         xor     8+$ivec,@S[2]
799         lea     16($inp),$inp
800         xor     12+$ivec,@S[3]
801         cmp     %rdx,$inp
802         je      .Lcbc_ddone
803
804         mov     @S[0],0($out)
805         mov     @S[1],4($out)
806         mov     @S[2],8($out)
807         mov     @S[3],12($out)
808
809         lea     16($out),$out
810         jmp     .Lcbc_dloop
811
812 .align  16
813 .Lcbc_ddone:
814         mov     $_ivp,%rdx
815         cmp     \$0,%rcx
816         jne     .Lcbc_dec_tail
817
818         mov     @S[0],0($out)
819         mov     @S[1],4($out)
820         mov     @S[2],8($out)
821         mov     @S[3],12($out)
822
823         mov     %rax,(%rdx)             # write out IV residue
824         mov     %rbx,8(%rdx)
825         jmp     .Lcbc_done
826 .align  16
827 .Lcbc_dec_tail:
828         mov     @S[0],0+$ivec
829         mov     @S[1],4+$ivec
830         mov     @S[2],8+$ivec
831         mov     @S[3],12+$ivec
832
833         pushfq
834         cld
835         lea     $ivec,%rsi
836         lea     ($out),%rdi
837         .long   0x9066A4F3              # rep movsb
838         popfq
839
840         mov     %rax,(%rdx)             # write out IV residue
841         mov     %rbx,8(%rdx)
842         jmp     .Lcbc_done
843
844 .align  16
845 .Lcbc_done:
846         mov     $_rsp,%rcx
847         mov     0(%rcx),%r15
848         mov     8(%rcx),%r14
849         mov     16(%rcx),%r13
850         mov     24(%rcx),%r12
851         mov     32(%rcx),%rbp
852         mov     40(%rcx),%rbx
853         lea     48(%rcx),%rsp
854 .Lcbc_abort:
855         ret
856 .size   Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
857
858 .asciz  "Camellia for x86_64 by <appro@openssl.org>"
859 ___
860 }
861
862 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
863 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# Win64 structured exception handling support.
#
# The language-specific handler is entered as
# handler(rec, frame, context, disp); the four arguments arrive in the
# registers named below.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# Two handlers are emitted:
#
#   common_se_handler - shared by the routines whose .xdata entry carries
#                       a HandlerData[] pair of prologue/epilogue label
#                       RVAs (encrypt/decrypt block, key generation);
#   cbc_se_handler    - dedicated to Camellia_cbc_encrypt; it compares
#                       context->Rip against the .Lcbc_* labels directly.
#
# Both decide from context->Rip how far the faulting routine had got,
# write the recovered non-volatile registers, Rsp, Rsi and Rdi back into
# the CONTEXT record, copy that record to disp->ContextRecord, call
# RtlVirtualUnwind and return ExceptionContinueSearch.
#
# Both handlers use %r12 as scratch while marshalling arg7 for
# RtlVirtualUnwind. %r12 is non-volatile in the Win64 ABI, so each handler
# must save and restore it. With eight register pushes plus pushfq the
# scratch area is 64 bytes: that keeps %rsp 16-byte aligned at the call
# and covers the 32-byte home area plus the stack slots for arg5..arg8.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	common_se_handler,\@abi-omnipotent
.align	16
common_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12			# used as scratch for arg7 below
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	40(%rax),%rax		# step over the five saved registers
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r13
	mov	-32(%rax),%r14
	mov	-40(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	lea	64(%rsp),%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	common_se_handler,.-common_se_handler

.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_prologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
	jb	.Lin_cbc_prologue

	lea	.Lcbc_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
	jb	.Lin_cbc_frame_setup

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_abort(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
	jae	.Lin_cbc_prologue

	mov	48(%rax),%rax		# $_rsp
	lea	48(%rax),%rax		# step over the six saved registers

.Lin_cbc_frame_setup:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_cbc_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	lea	64(%rsp),%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_Ekeygen
	.rva	.LSEH_end_Camellia_Ekeygen
	.rva	.LSEH_info_Camellia_Ekeygen

	.rva	.LSEH_begin_Camellia_cbc_encrypt
	.rva	.LSEH_end_Camellia_cbc_encrypt
	.rva	.LSEH_info_Camellia_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_Camellia_EncryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
.LSEH_info_Camellia_DecryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
.LSEH_info_Camellia_Ekeygen:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
.LSEH_info_Camellia_cbc_encrypt:
	.byte	9,0,0,0
	.rva	cbc_se_handler
___
}
1079
# Evaluate every `...` span embedded in the generated text as a Perl
# expression (used for constant arithmetic) and splice in the result.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe into the x86_64-xlate.pl translator, so close() is the
# point where a write error or a non-zero translator exit status becomes
# visible. Fail loudly rather than leave a truncated output file behind.
# For a pure exit-status failure close() leaves $! at 0, hence the
# fallback to $?.
close STDOUT
    or die "error closing STDOUT: ", ($! ? "$!" : "translator exit status $?");