Camellia update: make it respect NO_[INLINE_]ASM and typo in assembler.
[openssl.git] / crypto / camellia / asm / cmll-x86_64.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5 #
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
13
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
16 #
17 #                       AMD64   Core2   EM64T
18 # -evp camellia-128-ecb 16.7    21.0    22.7
19 # + over gcc 3.4.6      +25%    +5%     0%
20 #
21 # camellia-128-cbc      15.7    20.4    21.1
22 #
23 # 128-bit key setup     128     216     205     cycles/key
24 # + over gcc 3.4.6      +54%    +39%    +15%
25 #
26 # Numbers in "+" rows represent performance improvement over compiler
27 # generated code. Key setup timings are impressive on AMD and Core2
28 # thanks to 64-bit operations being covertly deployed. Improvement on
29 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30 # apparently emulates some of 64-bit operations in [32-bit] microcode.
31
# Usage: cmll-x86_64.pl [flavour] output
# If the first argument contains a dot it is taken to be the output file
# name and no flavour is assumed.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets are recognised either by flavour (nasm/masm/mingw64) or
# by a .asm output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator: first next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  The interpreter and script paths are quoted so that
# directories containing blanks work, optional arguments are only passed
# when they are actually set, and a failure to start the pipe is fatal
# instead of silently producing no output.
{
    my $cmd = "\"$^X\" \"$xlate\"";
    $cmd .= " $flavour"      if (defined($flavour) && length($flavour));
    $cmd .= " \"$output\""   if (defined($output)  && length($output));
    open STDOUT,"| $cmd" or die "can't call $xlate: $!";
}
44
# hi(REG): rewrite %eax/%rax .. %edx/%rdx into the corresponding
# bits 15:8 byte register (%ah .. %dh).  Operands that do not match are
# returned unchanged.  No prototype: the sub takes one argument.
sub hi {
    my $r = shift;
    $r =~ s/%[er]([a-d])x/%${1}h/;
    return $r;
}
# lo(REG): rewrite a 32/64-bit register name into its low byte register:
#   %eax/%rax .. %edx/%rdx -> %al .. %dl
#   %esi/%rsi, %edi/%rdi   -> %sil, %dil
#   %rN / %rNd             -> %rNb
# Operands that do not match are returned unchanged.  No prototype: the
# sub takes one argument.
sub lo {
    my $r = shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    return $r;
}
49
# Register allocation shared by all code generators below.
($t0,$t1,$t2,$t3)=("%eax","%ebx","%ecx","%edx");        # scratch
@S=qw(%r8d %r9d %r10d %r11d);                           # 128-bit cipher state
($i0,$i1)=("%esi","%edi");                              # table indices
$Tbl="%rbp";    # size optimization
($inp,$out)=("%r12","%r13");
($key,$keyend)=("%r14","%r15");
# first integer argument as seen by \@abi-omnipotent entry points
if ($win64)     { $arg0d="%ecx"; }
else            { $arg0d="%edi"; }

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte offsets from $Tbl; entries are addressed
# with a scale of 8, i.e. two interleaved 32-bit words per index.
($SBOX1_1110,$SBOX4_4404)=(0,4);                # Camellia_SBOX[0], [1]
($SBOX2_0222,$SBOX3_3033)=(2048,2052);          # Camellia_SBOX[2], [3]
68
# Camellia_Feistel(ROUND [, SEED])
#
# Appends the assembly for one Feistel round to $code.
#   ROUND - round index; its parity selects which half of @S is xored
#           with the round key (s0,s1) and which half receives the
#           result (s2,s3).
#   SEED  - optional byte offset into the key schedule addressed by
#           $key (default 0).  A negative seed makes the round-key
#           prefetch step downwards through the schedule (scale -8
#           instead of 8).
# On entry the generated code expects the current round key in $t0/$t1;
# before finishing it loads the next round key into $t1/$t0.
# The state registers are declared as proper lexicals; they must not
# leak into package globals.
sub Camellia_Feistel {
my $i=$_[0];
my $seed=defined($_[1])?$_[1]:0;
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($s0,$s1,$s2,$s3)=map { $S[($j+$_)%4] } (0..3);

$code.=<<___;
        xor     $s0,$t0                         # t0^=key[0]
        xor     $s1,$t1                         # t1^=key[1]
        movz    `&hi("$t0")`,$i0                # (t0>>8)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>0)&0xff
        mov     $SBOX3_3033($Tbl,$i0,8),$t3     # t3=SBOX3_3033[0]
        mov     $SBOX1_1110($Tbl,$i1,8),$t2     # t2=SBOX1_1110[1]
        movz    `&lo("$t0")`,$i0                # (t0>>0)&0xff
        shr     \$16,$t0
        movz    `&hi("$t1")`,$i1                # (t1>>8)&0xff
        xor     $SBOX4_4404($Tbl,$i0,8),$t3     # t3^=SBOX4_4404[0]
        shr     \$16,$t1
        xor     $SBOX4_4404($Tbl,$i1,8),$t2     # t2^=SBOX4_4404[1]
        movz    `&hi("$t0")`,$i0                # (t0>>24)&0xff
        movz    `&lo("$t1")`,$i1                # (t1>>16)&0xff
        xor     $SBOX1_1110($Tbl,$i0,8),$t3     # t3^=SBOX1_1110[0]
        xor     $SBOX3_3033($Tbl,$i1,8),$t2     # t2^=SBOX3_3033[1]
        movz    `&lo("$t0")`,$i0                # (t0>>16)&0xff
        movz    `&hi("$t1")`,$i1                # (t1>>24)&0xff
        xor     $SBOX2_0222($Tbl,$i0,8),$t3     # t3^=SBOX2_0222[0]
        xor     $SBOX2_0222($Tbl,$i1,8),$t2     # t2^=SBOX2_0222[1]
        mov     `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
        mov     `$seed+($i+1)*$scale+4`($key),$t0
        xor     $t3,$t2                         # t2^=t3
        ror     \$8,$t3                         # t3=RightRotate(t3,8)
        xor     $t2,$s2
        xor     $t2,$s3
        xor     $t3,$s3
___
}
106
# void Camellia_EncryptBlock_Rounds(
#               int grandRounds,
#               const Byte plaintext[],
#               const KEY_TABLE_TYPE keyTable,
#               Byte ciphertext[])
#
# The heredocs below start $code with the single-block entry points:
#   Camellia_EncryptBlock / Camellia_DecryptBlock
#       V1.x API wrappers; they turn the first argument (key length in
#       bits) into a grandRounds count of 3 or 4 and jump into the
#       corresponding *_Rounds function.
#   Camellia_EncryptBlock_Rounds / Camellia_DecryptBlock_Rounds
#       load one 16-byte block, byte-swap it into @S, call the private
#       worker and store the byte-swapped result.
#   _x86_64_Camellia_encrypt / _x86_64_Camellia_decrypt
#       private workers operating on @S with $key/$keyend delimiting
#       the key schedule; decrypt walks the schedule backwards.
# The assembly text is emitted verbatim and is order-sensitive; only the
# Feistel rounds are generated, by Camellia_Feistel().
$code=<<___;
.text

# V1.x API
.globl  Camellia_EncryptBlock
.type   Camellia_EncryptBlock,\@abi-omnipotent
.align  16
Camellia_EncryptBlock:
        movl    \$128,%eax
        subl    $arg0d,%eax
        movl    \$3,$arg0d
        adcl    \$0,$arg0d      # keyBitLength==128?3:4
        jmp     .Lenc_rounds
.size   Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl  Camellia_EncryptBlock_Rounds
.type   Camellia_EncryptBlock_Rounds,\@function,4
.align  16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
        push    %rbx
        push    %rbp
        push    %r13
        push    %r14
        push    %r15
.Lenc_prologue:

        #mov    %rsi,$inp               # put away arguments
        mov     %rcx,$out
        mov     %rdx,$key

        shl     \$6,%edi                # process grandRounds
        lea     .LCamellia_SBOX(%rip),$Tbl
        lea     ($key,%rdi),$keyend

        mov     0(%rsi),@S[0]           # load plaintext
        mov     4(%rsi),@S[1]
        mov     8(%rsi),@S[2]
        bswap   @S[0]
        mov     12(%rsi),@S[3]
        bswap   @S[1]
        bswap   @S[2]
        bswap   @S[3]

        call    _x86_64_Camellia_encrypt

        bswap   @S[0]
        bswap   @S[1]
        bswap   @S[2]
        mov     @S[0],0($out)
        bswap   @S[3]
        mov     @S[1],4($out)
        mov     @S[2],8($out)
        mov     @S[3],12($out)

        mov     0(%rsp),%r15
        mov     8(%rsp),%r14
        mov     16(%rsp),%r13
        mov     24(%rsp),%rbp
        mov     32(%rsp),%rbx
        lea     40(%rsp),%rsp
.Lenc_epilogue:
        ret
.size   Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds

.type   _x86_64_Camellia_encrypt,\@abi-omnipotent
.align  16
_x86_64_Camellia_encrypt:
        xor     0($key),@S[1]
        xor     4($key),@S[0]           # ^=key[0-3]
        xor     8($key),@S[3]
        xor     12($key),@S[2]
.align  16
.Leloop:
        mov     16($key),$t1            # prefetch key[4-5]
        mov     20($key),$t0

___
        # Six Feistel rounds per pass through .Leloop.  Seed 16 places
        # the round-key prefetches after the 16 bytes of key material
        # xored into the state on entry.
        for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
$code.=<<___;
        lea     16*4($key),$key
        cmp     $keyend,$key
        mov     8($key),$t3             # prefetch key[2-3]
        mov     12($key),$t2
        je      .Ledone

        and     @S[0],$t0
        or      @S[3],$t3
        rol     \$1,$t0
        xor     $t3,@S[2]               # s2^=s3|key[3];
        xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
        and     @S[2],$t2
        or      @S[1],$t1
        rol     \$1,$t2
        xor     $t1,@S[0]               # s0^=s1|key[1];
        xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
        jmp     .Leloop

.align  16
.Ledone:
        xor     @S[2],$t0               # SwapHalf
        xor     @S[3],$t1
        xor     @S[0],$t2
        xor     @S[1],$t3

        mov     $t0,@S[0]
        mov     $t1,@S[1]
        mov     $t2,@S[2]
        mov     $t3,@S[3]

        .byte   0xf3,0xc3               # rep ret
.size   _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt

# V1.x API
.globl  Camellia_DecryptBlock
.type   Camellia_DecryptBlock,\@abi-omnipotent
.align  16
Camellia_DecryptBlock:
        movl    \$128,%eax
        subl    $arg0d,%eax
        movl    \$3,$arg0d
        adcl    \$0,$arg0d      # keyBitLength==128?3:4
        jmp     .Ldec_rounds
.size   Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl  Camellia_DecryptBlock_Rounds
.type   Camellia_DecryptBlock_Rounds,\@function,4
.align  16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
        push    %rbx
        push    %rbp
        push    %r13
        push    %r14
        push    %r15
.Ldec_prologue:

        #mov    %rsi,$inp               # put away arguments
        mov     %rcx,$out
        mov     %rdx,$keyend

        shl     \$6,%edi                # process grandRounds
        lea     .LCamellia_SBOX(%rip),$Tbl
        lea     ($keyend,%rdi),$key

        mov     0(%rsi),@S[0]           # load plaintext
        mov     4(%rsi),@S[1]
        mov     8(%rsi),@S[2]
        bswap   @S[0]
        mov     12(%rsi),@S[3]
        bswap   @S[1]
        bswap   @S[2]
        bswap   @S[3]

        call    _x86_64_Camellia_decrypt

        bswap   @S[0]
        bswap   @S[1]
        bswap   @S[2]
        mov     @S[0],0($out)
        bswap   @S[3]
        mov     @S[1],4($out)
        mov     @S[2],8($out)
        mov     @S[3],12($out)

        mov     0(%rsp),%r15
        mov     8(%rsp),%r14
        mov     16(%rsp),%r13
        mov     24(%rsp),%rbp
        mov     32(%rsp),%rbx
        lea     40(%rsp),%rsp
.Ldec_epilogue:
        ret
.size   Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds

.type   _x86_64_Camellia_decrypt,\@abi-omnipotent
.align  16
_x86_64_Camellia_decrypt:
        xor     0($key),@S[1]
        xor     4($key),@S[0]           # ^=key[0-3]
        xor     8($key),@S[3]
        xor     12($key),@S[2]
.align  16
.Ldloop:
        mov     -8($key),$t1            # prefetch key[4-5]
        mov     -4($key),$t0

___
        # Same six rounds for decryption; the negative seed makes
        # Camellia_Feistel() step the round-key prefetch downwards.
        for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
$code.=<<___;
        lea     -16*4($key),$key
        cmp     $keyend,$key
        mov     0($key),$t3             # prefetch key[2-3]
        mov     4($key),$t2
        je      .Lddone

        and     @S[0],$t0
        or      @S[3],$t3
        rol     \$1,$t0
        xor     $t3,@S[2]               # s2^=s3|key[3];
        xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
        and     @S[2],$t2
        or      @S[1],$t1
        rol     \$1,$t2
        xor     $t1,@S[0]               # s0^=s1|key[1];
        xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);

        jmp     .Ldloop

.align  16
.Lddone:
        xor     @S[2],$t2
        xor     @S[3],$t3
        xor     @S[0],$t0
        xor     @S[1],$t1

        mov     $t2,@S[0]               # SwapHalf
        mov     $t3,@S[1]
        mov     $t0,@S[2]
        mov     $t1,@S[3]

        .byte   0xf3,0xc3               # rep ret
.size   _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___
336
# _saveround(RND, KEY, [BIAS,] REG...)
#
# Appends stores of key material to $code, targeting byte offset
# BIAS+RND*8 from the register named by KEY.
#   - with four registers (32-bit words) the words are stored pairwise
#     swapped: REG[1],REG[0],REG[3],REG[2] at +0,+4,+8,+12;
#   - otherwise REG[0] is stored at +0 and, if present, REG[1] at +8.
# BIAS is optional and recognised by being a plain (optionally signed)
# integer rather than a register name; this also accepts an explicit
# bias of 0, which a truth test on the value would mistake for "absent".
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=0;
$bias=shift(@T) if (@T && $T[0] =~ /^[-+]?\d+$/);

    if ($#T==3) {
        $code.=<<___;
        mov     $T[1],`$bias+$rnd*8+0`($key)
        mov     $T[0],`$bias+$rnd*8+4`($key)
        mov     $T[3],`$bias+$rnd*8+8`($key)
        mov     $T[2],`$bias+$rnd*8+12`($key)
___
    } else {
        $code.="        mov     $T[0],`$bias+$rnd*8+0`($key)\n";
        $code.="        mov     $T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
    }
}
353
# _loadround(RND, KEY, [BIAS,] REG [, REG])
#
# Appends loads to $code: REG[0] from byte offset BIAS+RND*8 of the
# register named by KEY and, if a second register is given, REG[1] from
# 8 bytes further on.  BIAS is optional and recognised by being a plain
# (optionally signed) integer, so an explicit bias of 0 is handled too.
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=0;
$bias=shift(@T) if (@T && $T[0] =~ /^[-+]?\d+$/);

$code.="        mov     `$bias+$rnd*8+0`($key),$T[0]\n";
$code.="        mov     `$bias+$rnd*8+8`($key),$T[1]\n" if ($#T>=1);
}
361
362 # shld is very slow on Intel EM64T family. Even on AMD it limits
363 # instruction decode rate [because it's VectorPath] and consequently
364 # performance...
sub __rotl128 {
# shld-based 128-bit left rotate of the register pair HI:LO by BITS,
# using %r11 to hold the original HI.  Emits nothing when BITS is 0.
my ($hi64,$lo64,$bits)=@_;

    return unless ($bits);

    $code.="        mov     $hi64,%r11\n".
           "        shld    \$$bits,$lo64,$hi64\n".
           "        shld    \$$bits,%r11,$lo64\n";
}
376
# ... Implementing 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
sub _rotl128 {
# 128-bit left rotate of the register pair HI:LO by BITS built from
# plain shifts; %r9 and %r11 are used as scratch.  Emits nothing when
# BITS is 0.
my ($hi64,$lo64,$bits)=@_;

    return unless ($bits);

    # complementary shift count, left for the final backtick evaluation
    my $back="`64-$bits`";

    $code.=<<___;
        mov     $hi64,%r11
        shl     \$$bits,$hi64
        mov     $lo64,%r9
        shr     \$$back,%r9
        shr     \$$back,%r11
        or      %r9,$hi64
        shl     \$$bits,$lo64
        or      %r11,$lo64
___
}
396
# Generator for Camellia_Ekeygen, declared below as a three-argument
# function.  As used by the emitted code: the first argument is the key
# length in bits (compared against 128 and 192), the second points at
# the raw key bytes and the third at the key table being filled in.
# %eax is set to 3 on the 128-bit path and to 4 otherwise before the
# common epilogue.
# $step numbers the Feistel rounds run over .LCamellia_SIGMA; the
# _rotl128/_saveround sequences then store the rotated KL/KA (and, for
# longer keys, KR/KB) values.  After "lea 128($out),$out" all stores use
# a bias of -128 to keep displacements short.
{ my $step=0;

$code.=<<___;
.globl  Camellia_Ekeygen
.type   Camellia_Ekeygen,\@function,3
.align  16
Camellia_Ekeygen:
        push    %rbx
        push    %rbp
        push    %r13
        push    %r14
        push    %r15
.Lkey_prologue:

        mov     %rdi,$keyend            # put away arguments, keyBitLength
        mov     %rdx,$out               # keyTable

        mov     0(%rsi),@S[0]           # load 0-127 bits
        mov     4(%rsi),@S[1]
        mov     8(%rsi),@S[2]
        mov     12(%rsi),@S[3]

        bswap   @S[0]
        bswap   @S[1]
        bswap   @S[2]
        bswap   @S[3]
___
        &_saveround     (0,$out,@S);    # KL<<<0
$code.=<<___;
        cmp     \$128,$keyend           # check keyBitLength
        je      .L1st128

        mov     16(%rsi),@S[0]          # load 128-191 bits
        mov     20(%rsi),@S[1]
        cmp     \$192,$keyend
        je      .L1st192
        mov     24(%rsi),@S[2]          # load 192-255 bits
        mov     28(%rsi),@S[3]
        jmp     .L1st256
.L1st192:
        mov     @S[0],@S[2]
        mov     @S[1],@S[3]
        not     @S[2]
        not     @S[3]
.L1st256:
        bswap   @S[0]
        bswap   @S[1]
        bswap   @S[2]
        bswap   @S[3]
___
        &_saveround     (4,$out,@S);    # temp storage for KR!
$code.=<<___;
        xor     0($out),@S[1]           # KR^KL
        xor     4($out),@S[0]
        xor     8($out),@S[3]
        xor     12($out),@S[2]

.L1st128:
        lea     .LCamellia_SIGMA(%rip),$key
        lea     .LCamellia_SBOX(%rip),$Tbl

        mov     0($key),$t1
        mov     4($key),$t0
___
        &Camellia_Feistel($step++);
        &Camellia_Feistel($step++);
$code.=<<___;
        xor     0($out),@S[1]           # ^KL
        xor     4($out),@S[0]
        xor     8($out),@S[3]
        xor     12($out),@S[2]
___
        &Camellia_Feistel($step++);
        &Camellia_Feistel($step++);
$code.=<<___;
        cmp     \$128,$keyend
        jne     .L2nd256

        lea     128($out),$out          # size optimization
        shl     \$32,%r8                # @S[0]||
        shl     \$32,%r10               # @S[2]||
        or      %r9,%r8                 # ||@S[1]
        or      %r11,%r10               # ||@S[3]
___
        &_loadround     (0,$out,-128,"%rax","%rbx");    # KL
        &_saveround     (2,$out,-128,"%r8","%r10");     # KA<<<0
        &_rotl128       ("%rax","%rbx",15);
        &_saveround     (4,$out,-128,"%rax","%rbx");    # KL<<<15
        &_rotl128       ("%r8","%r10",15);
        &_saveround     (6,$out,-128,"%r8","%r10");     # KA<<<15
        &_rotl128       ("%r8","%r10",15);              # 15+15=30
        &_saveround     (8,$out,-128,"%r8","%r10");     # KA<<<30
        &_rotl128       ("%rax","%rbx",30);             # 15+30=45
        &_saveround     (10,$out,-128,"%rax","%rbx");   # KL<<<45
        &_rotl128       ("%r8","%r10",15);              # 30+15=45
        &_saveround     (12,$out,-128,"%r8");           # KA<<<45
        &_rotl128       ("%rax","%rbx",15);             # 45+15=60
        &_saveround     (13,$out,-128,"%rbx");          # KL<<<60
        &_rotl128       ("%r8","%r10",15);              # 45+15=60
        &_saveround     (14,$out,-128,"%r8","%r10");    # KA<<<60
        &_rotl128       ("%rax","%rbx",17);             # 60+17=77
        &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<77
        &_rotl128       ("%rax","%rbx",17);             # 77+17=94
        &_saveround     (18,$out,-128,"%rax","%rbx");   # KL<<<94
        &_rotl128       ("%r8","%r10",34);              # 60+34=94
        &_saveround     (20,$out,-128,"%r8","%r10");    # KA<<<94
        &_rotl128       ("%rax","%rbx",17);             # 94+17=111
        &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<111
        &_rotl128       ("%r8","%r10",17);              # 94+17=111
        &_saveround     (24,$out,-128,"%r8","%r10");    # KA<<<111
$code.=<<___;
        mov     \$3,%eax
        jmp     .Ldone
.align  16
.L2nd256:
___
        &_saveround     (6,$out,@S);    # temp storage for KA!
$code.=<<___;
        xor     `4*8+0`($out),@S[1]     # KA^KR
        xor     `4*8+4`($out),@S[0]
        xor     `5*8+0`($out),@S[3]
        xor     `5*8+4`($out),@S[2]
___
        &Camellia_Feistel($step++);
        &Camellia_Feistel($step++);

        &_loadround     (0,$out,"%rax","%rbx"); # KL
        &_loadround     (4,$out,"%rcx","%rdx"); # KR
        &_loadround     (6,$out,"%r14","%r15"); # KA
$code.=<<___;
        lea     128($out),$out          # size optimization
        shl     \$32,%r8                # @S[0]||
        shl     \$32,%r10               # @S[2]||
        or      %r9,%r8                 # ||@S[1]
        or      %r11,%r10               # ||@S[3]
___
        &_saveround     (2,$out,-128,"%r8","%r10");     # KB<<<0
        &_rotl128       ("%rcx","%rdx",15);
        &_saveround     (4,$out,-128,"%rcx","%rdx");    # KR<<<15
        &_rotl128       ("%r14","%r15",15);
        &_saveround     (6,$out,-128,"%r14","%r15");    # KA<<<15
        &_rotl128       ("%rcx","%rdx",15);             # 15+15=30
        &_saveround     (8,$out,-128,"%rcx","%rdx");    # KR<<<30
        &_rotl128       ("%r8","%r10",30);
        &_saveround     (10,$out,-128,"%r8","%r10");    # KB<<<30
        &_rotl128       ("%rax","%rbx",45);
        &_saveround     (12,$out,-128,"%rax","%rbx");   # KL<<<45
        &_rotl128       ("%r14","%r15",30);             # 15+30=45
        &_saveround     (14,$out,-128,"%r14","%r15");   # KA<<<45
        &_rotl128       ("%rax","%rbx",15);             # 45+15=60
        &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<60
        &_rotl128       ("%rcx","%rdx",30);             # 30+30=60
        &_saveround     (18,$out,-128,"%rcx","%rdx");   # KR<<<60
        &_rotl128       ("%r8","%r10",30);              # 30+30=60
        &_saveround     (20,$out,-128,"%r8","%r10");    # KB<<<60
        &_rotl128       ("%rax","%rbx",17);             # 60+17=77
        &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<77
        &_rotl128       ("%r14","%r15",32);             # 45+32=77
        &_saveround     (24,$out,-128,"%r14","%r15");   # KA<<<77
        &_rotl128       ("%rcx","%rdx",34);             # 60+34=94
        &_saveround     (26,$out,-128,"%rcx","%rdx");   # KR<<<94
        &_rotl128       ("%r14","%r15",17);             # 77+17=94
        &_saveround     (28,$out,-128,"%r14","%r15");   # KA<<<94
        &_rotl128       ("%rax","%rbx",34);             # 77+34=111
        &_saveround     (30,$out,-128,"%rax","%rbx");   # KL<<<111
        &_rotl128       ("%r8","%r10",51);              # 60+51=111
        &_saveround     (32,$out,-128,"%r8","%r10");    # KB<<<111
$code.=<<___;
        mov     \$4,%eax
.Ldone:
        mov     0(%rsp),%r15
        mov     8(%rsp),%r14
        mov     16(%rsp),%r13
        mov     24(%rsp),%rbp
        mov     32(%rsp),%rbx
        lea     40(%rsp),%rsp
.Lkey_epilogue:
        ret
.size   Camellia_Ekeygen,.-Camellia_Ekeygen
___
}
578
# Base 8-bit substitution table; the four 32-bit lookup tables emitted
# into the assembly are derived from it by the S* helpers below.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper takes a table index 0..255 and returns the table word as
# a "0x%08x" string.  The name spells out which byte lanes (most
# significant first) carry the substituted value; 0 is an empty lane.

# SBOX[i] in the three upper byte lanes.
sub S1110 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    return sprintf("0x%08x",($v<<24)|($v<<16)|($v<<8));
}

# SBOX[i rotated left by one bit] in lanes 3, 2 and 0.
sub S4404 {
    my ($idx)=@_;
    my $v=$SBOX[(($idx<<1)|($idx>>7))&0xff];
    return sprintf("0x%08x",($v<<24)|($v<<16)|$v);
}

# SBOX[i], rotated left by one bit, in the three lower byte lanes.
sub S0222 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    $v=(($v<<1)|($v>>7))&0xff;
    return sprintf("0x%08x",($v<<16)|($v<<8)|$v);
}

# SBOX[i], rotated right by one bit, in lanes 3, 1 and 0.
sub S3033 {
    my ($idx)=@_;
    my $v=$SBOX[$idx];
    $v=(($v>>1)|($v<<7))&0xff;
    return sprintf("0x%08x",($v<<24)|($v<<8)|$v);
}
601
# Constant data: the SIGMA words consumed by the key schedule, padded
# with one row of zeros, immediately followed by the lookup tables.
$code.=<<___;
.align  64
.LCamellia_SIGMA:
.long   0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long   0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long   0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long   0,          0,          0,          0
.LCamellia_SBOX:
___
# tables are interleaved, remember?
# data_word(WORD...): append one .long directive listing the given
# words, comma separated, to $code.
sub data_word {
    $code.=sprintf(".long\t%s\n",join(',',@_));
}
# first 2048 bytes: Camellia_SBOX[0] interleaved with [1]
for ($i=0;$i<256;$i++) {
    &data_word(&S1110($i),&S4404($i));
}
# next 2048 bytes: Camellia_SBOX[2] interleaved with [3]
for ($i=0;$i<256;$i++) {
    &data_word(&S0222($i),&S3033($i));
}
615
# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#                       size_t length, const CAMELLIA_KEY *key,
#                       unsigned char *ivp,const int enc);
# Generator for Camellia_cbc_encrypt.  The emitted function keeps its
# working state in an aligned stack frame addressed off %rsp:
#   $_key   saved key schedule pointer, reloaded after every block
#   $_end   input pointer at which the block loop stops
#   $_res   length modulo 16 (bytes left over after whole blocks)
#   $ivec   16-byte scratch buffer (IV copy / partial block)
#   $_ivp   caller's IV pointer, written back on exit
#   $_rsp   frame base from which callee-saved registers are restored
# The enc argument (%r9d) selects between the encrypt loop and
# .LCBC_DECRYPT; a zero length returns immediately via .Lcbc_abort.
# Between pushfq and popfq %rsp is 8 lower, which is why the tail code
# addresses the scratch buffer as 8+$ivec there.
{
$_key="0(%rsp)";
$_end="8(%rsp)";        # inp+len&~15
$_res="16(%rsp)";       # len&15
$ivec="24(%rsp)";
$_ivp="40(%rsp)";
$_rsp="48(%rsp)";

$code.=<<___;
.globl  Camellia_cbc_encrypt
.type   Camellia_cbc_encrypt,\@function,6
.align  16
Camellia_cbc_encrypt:
        cmp     \$0,%rdx
        je      .Lcbc_abort
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
.Lcbc_prologue:

        mov     %rsp,%rbp
        sub     \$64,%rsp
        and     \$-64,%rsp

        # place stack frame just "above mod 1024" the key schedule,
        # this ensures that cache associativity suffices
        lea     -64-63(%rcx),%r10
        sub     %rsp,%r10
        neg     %r10
        and     \$0x3C0,%r10
        sub     %r10,%rsp
        #add    \$8,%rsp                # 8 is reserved for callee's ra

        mov     %rdi,$inp               # inp argument
        mov     %rsi,$out               # out argument
        mov     %r8,%rbx                # ivp argument
        mov     %rcx,$key               # key argument
        mov     272(%rcx),$keyend       # grandRounds

        mov     %r8,$_ivp
        mov     %rbp,$_rsp

.Lcbc_body:
        lea     .LCamellia_SBOX(%rip),$Tbl

        mov     \$32,%ecx
.align  4
.Lcbc_prefetch_sbox:
        mov     0($Tbl),%rax
        mov     32($Tbl),%rsi
        mov     64($Tbl),%rdi
        mov     96($Tbl),%r11
        lea     128($Tbl),$Tbl
        loop    .Lcbc_prefetch_sbox
        sub     \$4096,$Tbl
        shl     \$6,$keyend
        mov     %rdx,%rcx               # len argument
        lea     ($key,$keyend),$keyend

        cmp     \$0,%r9d                # enc argument
        je      .LCBC_DECRYPT

        and     \$-16,%rdx
        and     \$15,%rcx               # length residue
        lea     ($inp,%rdx),%rdx
        mov     $key,$_key
        mov     %rdx,$_end
        mov     %rcx,$_res

        cmp     $inp,%rdx
        mov     0(%rbx),@S[0]           # load IV
        mov     4(%rbx),@S[1]
        mov     8(%rbx),@S[2]
        mov     12(%rbx),@S[3]
        je      .Lcbc_enc_tail
        jmp     .Lcbc_eloop

.align  16
.Lcbc_eloop:
        xor     0($inp),@S[0]
        xor     4($inp),@S[1]
        xor     8($inp),@S[2]
        bswap   @S[0]
        xor     12($inp),@S[3]
        bswap   @S[1]
        bswap   @S[2]
        bswap   @S[3]

        call    _x86_64_Camellia_encrypt

        mov     $_key,$key              # "rewind" the key
        bswap   @S[0]
        mov     $_end,%rdx
        bswap   @S[1]
        mov     $_res,%rcx
        bswap   @S[2]
        mov     @S[0],0($out)
        bswap   @S[3]
        mov     @S[1],4($out)
        mov     @S[2],8($out)
        lea     16($inp),$inp
        mov     @S[3],12($out)
        cmp     %rdx,$inp
        lea     16($out),$out
        jne     .Lcbc_eloop

        cmp     \$0,%rcx
        jne     .Lcbc_enc_tail

        mov     $_ivp,$out
        mov     @S[0],0($out)           # write out IV residue
        mov     @S[1],4($out)
        mov     @S[2],8($out)
        mov     @S[3],12($out)
        jmp     .Lcbc_done

.align  16
.Lcbc_enc_tail:
        xor     %rax,%rax
        mov     %rax,0+$ivec
        mov     %rax,8+$ivec
        mov     %rax,$_res

.Lcbc_enc_pushf:
        pushfq
        cld
        mov     $inp,%rsi
        lea     8+$ivec,%rdi
        .long   0x9066A4F3              # rep movsb
        popfq
.Lcbc_enc_popf:

        lea     $ivec,$inp
        lea     16+$ivec,%rax
        mov     %rax,$_end
        jmp     .Lcbc_eloop             # one more time

.align  16
.LCBC_DECRYPT:
        xchg    $key,$keyend
        add     \$15,%rdx
        and     \$15,%rcx               # length residue
        and     \$-16,%rdx
        mov     $key,$_key
        lea     ($inp,%rdx),%rdx
        mov     %rdx,$_end
        mov     %rcx,$_res

        mov     (%rbx),%rax             # load IV
        mov     8(%rbx),%rbx
        jmp     .Lcbc_dloop
.align  16
.Lcbc_dloop:
        mov     0($inp),@S[0]
        mov     4($inp),@S[1]
        mov     8($inp),@S[2]
        bswap   @S[0]
        mov     12($inp),@S[3]
        bswap   @S[1]
        mov     %rax,0+$ivec            # save IV to temporary storage
        bswap   @S[2]
        mov     %rbx,8+$ivec
        bswap   @S[3]

        call    _x86_64_Camellia_decrypt

        mov     $_key,$key              # "rewind" the key
        mov     $_end,%rdx
        mov     $_res,%rcx

        bswap   @S[0]
        mov     ($inp),%rax             # load IV for next iteration
        bswap   @S[1]
        mov     8($inp),%rbx
        bswap   @S[2]
        xor     0+$ivec,@S[0]
        bswap   @S[3]
        xor     4+$ivec,@S[1]
        xor     8+$ivec,@S[2]
        lea     16($inp),$inp
        xor     12+$ivec,@S[3]
        cmp     %rdx,$inp
        je      .Lcbc_ddone

        mov     @S[0],0($out)
        mov     @S[1],4($out)
        mov     @S[2],8($out)
        mov     @S[3],12($out)

        lea     16($out),$out
        jmp     .Lcbc_dloop

.align  16
.Lcbc_ddone:
        mov     $_ivp,%rdx
        cmp     \$0,%rcx
        jne     .Lcbc_dec_tail

        mov     @S[0],0($out)
        mov     @S[1],4($out)
        mov     @S[2],8($out)
        mov     @S[3],12($out)

        mov     %rax,(%rdx)             # write out IV residue
        mov     %rbx,8(%rdx)
        jmp     .Lcbc_done
.align  16
.Lcbc_dec_tail:
        mov     @S[0],0+$ivec
        mov     @S[1],4+$ivec
        mov     @S[2],8+$ivec
        mov     @S[3],12+$ivec

.Lcbc_dec_pushf:
        pushfq
        cld
        lea     8+$ivec,%rsi
        lea     ($out),%rdi
        .long   0x9066A4F3              # rep movsb
        popfq
.Lcbc_dec_popf:

        mov     %rax,(%rdx)             # write out IV residue
        mov     %rbx,8(%rdx)
        jmp     .Lcbc_done

.align  16
.Lcbc_done:
        mov     $_rsp,%rcx
        mov     0(%rcx),%r15
        mov     8(%rcx),%r14
        mov     16(%rcx),%r13
        mov     24(%rcx),%r12
        mov     32(%rcx),%rbp
        mov     40(%rcx),%rbx
        lea     48(%rcx),%rsp
.Lcbc_abort:
        ret
.size   Camellia_cbc_encrypt,.-Camellia_cbc_encrypt

.asciz  "Camellia for x86_64 by <appro\@openssl.org>"
___
}
865
866 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
867 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 only: append to $code two exception handlers with the prototype
# above, followed by the .pdata/.xdata tables that reference them.
#
# Both handlers follow the same outline: save rsi, rdi, rbx, rbp, r12-r15
# and the flags, reserve 64 bytes of stack, rebuild the interrupted
# function's saved-register state inside *context, and then run the
# shared tail at .Lcommon_seh_exit.  That tail lives inside
# cbc_se_handler; common_se_handler jumps to it.
#
# common_se_handler
#   Referenced by the .xdata entries for Camellia_EncryptBlock_Rounds,
#   Camellia_DecryptBlock_Rounds and Camellia_Ekeygen.  HandlerData[]
#   holds two 32-bit RVAs (prologue label, epilogue label); each is added
#   to disp->ImageBase and compared with context->Rip:
#     Rip <  prologue label : frame base = context->Rax
#     Rip >= epilogue label : frame base = context->Rsp
#     otherwise             : frame base = context->Rsp+40, and Rbx, Rbp,
#                             R13, R14, R15 are reloaded into *context
#                             from the five quadwords just below it
#   In every case Rdi and Rsi are reloaded from 8 and 16 bytes above the
#   frame base, and the frame base itself is stored as context->Rsp.
#   NOTE(review): using context->Rax as the frame base presumably relies
#   on the Win64 entry sequence generated by x86_64-xlate.pl leaving the
#   incoming stack pointer in %rax - confirm against that script.
#
# cbc_se_handler
#   Referenced by the .xdata entry for Camellia_cbc_encrypt.  It compares
#   context->Rip with fixed labels instead of HandlerData[]:
#     Rip <  .Lcbc_prologue : only Rsp/Rsi/Rdi are fixed up, relative to
#                             context->Rax
#     Rip <  .Lcbc_body     : Rbx, Rbp, R12-R15 are additionally reloaded
#                             from the six quadwords below context->Rax
#     Rip >= .Lcbc_abort    : only Rsp/Rsi/Rdi are fixed up, relative to
#                             context->Rsp
#     otherwise             : the quadword at 48 bytes above the stack
#                             pointer (the \$_rsp slot) is loaded, 48 is
#                             added to it, and the six registers are
#                             reloaded relative to that value
#   For the last case the stack pointer used to locate the \$_rsp slot is
#   context->Rsp, advanced by 8 while Rip lies strictly between
#   .Lcbc_enc_pushf and .Lcbc_enc_popf, or strictly between
#   .Lcbc_dec_pushf and .Lcbc_dec_popf (the function has a pushfq
#   outstanding there).
#
# Shared tail (.Lcommon_seh_exit)
#   Copies 1232/8 quadwords from *context to disp->ContextRecord, calls
#   RtlVirtualUnwind with arg1 = 0 and the ImageBase, ControlPc,
#   FunctionEntry, ContextRecord, &HandlerData and &EstablisherFrame
#   fields of *disp (args 5-8 are written at 32..56(%rsp)), then returns 1
#   (ExceptionContinueSearch) after restoring the saved registers.  The
#   `1232/8` text is a Perl expression that is evaluated by the backtick
#   substitution applied to $code at the end of this script.
#
# Tables
#   .pdata carries one begin/end/info RVA triplet per function.  .xdata
#   carries, per function, the bytes 9,0,0,0, the handler RVA and - for
#   users of common_se_handler - the HandlerData[] label pair.
#   NOTE(review): 9 is presumably UNWIND_INFO version 1 combined with
#   UNW_FLAG_EHANDLER - confirm against the x64 exception-handling
#   documentation.
868 if ($win64) {
# Registers that carry the four handler arguments.  Only $context and
# $disp are referenced by the assembly text below.
869 $rec="%rcx";
870 $frame="%rdx";
871 $context="%r8";
872 $disp="%r9";
873
874 $code.=<<___;
875 .extern __imp_RtlVirtualUnwind
876 .type   common_se_handler,\@abi-omnipotent
877 .align  16
878 common_se_handler:
879         push    %rsi
880         push    %rdi
881         push    %rbx
882         push    %rbp
883         push    %r12
884         push    %r13
885         push    %r14
886         push    %r15
887         pushfq
888         lea     -64(%rsp),%rsp
889
890         mov     120($context),%rax      # pull context->Rax
891         mov     248($context),%rbx      # pull context->Rip
892
893         mov     8($disp),%rsi           # disp->ImageBase
894         mov     56($disp),%r11          # disp->HandlerData
895
896         mov     0(%r11),%r10d           # HandlerData[0]
897         lea     (%rsi,%r10),%r10        # prologue label
898         cmp     %r10,%rbx               # context->Rip<prologue label
899         jb      .Lin_prologue
900
901         mov     152($context),%rax      # pull context->Rsp
902
903         mov     4(%r11),%r10d           # HandlerData[1]
904         lea     (%rsi,%r10),%r10        # epilogue label
905         cmp     %r10,%rbx               # context->Rip>=epilogue label
906         jae     .Lin_prologue
907
908         lea     40(%rax),%rax
909         mov     -8(%rax),%rbx
910         mov     -16(%rax),%rbp
911         mov     -24(%rax),%r13
912         mov     -32(%rax),%r14
913         mov     -40(%rax),%r15
914         mov     %rbx,144($context)      # restore context->Rbx
915         mov     %rbp,160($context)      # restore context->Rbp
916         mov     %r13,224($context)      # restore context->R13
917         mov     %r14,232($context)      # restore context->R14
918         mov     %r15,240($context)      # restore context->R15
919
920 .Lin_prologue:
921         mov     8(%rax),%rdi
922         mov     16(%rax),%rsi
923         mov     %rax,152($context)      # restore context->Rsp
924         mov     %rsi,168($context)      # restore context->Rsi
925         mov     %rdi,176($context)      # restore context->Rdi
926
927         jmp     .Lcommon_seh_exit
928 .size   common_se_handler,.-common_se_handler
929
930 .type   cbc_se_handler,\@abi-omnipotent
931 .align  16
932 cbc_se_handler:
933         push    %rsi
934         push    %rdi
935         push    %rbx
936         push    %rbp
937         push    %r12
938         push    %r13
939         push    %r14
940         push    %r15
941         pushfq
942         lea     -64(%rsp),%rsp
943
944         mov     120($context),%rax      # pull context->Rax
945         mov     248($context),%rbx      # pull context->Rip
946
947         lea     .Lcbc_prologue(%rip),%r10
948         cmp     %r10,%rbx               # context->Rip<.Lcbc_prologue
949         jb      .Lin_cbc_prologue
950
951         lea     .Lcbc_body(%rip),%r10
952         cmp     %r10,%rbx               # context->Rip<.Lcbc_body
953         jb      .Lin_cbc_frame_setup
954
955         mov     152($context),%rax      # pull context->Rsp
956
957         lea     .Lcbc_abort(%rip),%r10
958         cmp     %r10,%rbx               # context->Rip>=.Lcbc_abort
959         jae     .Lin_cbc_prologue
960
961         # handle pushf/popf in Camellia_cbc_encrypt
962         lea     .Lcbc_enc_pushf(%rip),%r10
963         cmp     %r10,%rbx               # context->Rip<=.Lcbc_enc_pushf
964         jbe     .Lin_cbc_no_flag
965         lea     8(%rax),%rax
966         lea     .Lcbc_enc_popf(%rip),%r10
967         cmp     %r10,%rbx               # context->Rip<.Lcbc_enc_popf
968         jb      .Lin_cbc_no_flag
969         lea     -8(%rax),%rax
970         lea     .Lcbc_dec_pushf(%rip),%r10
971         cmp     %r10,%rbx               # context->Rip<=.Lcbc_dec_pushf
972         jbe     .Lin_cbc_no_flag
973         lea     8(%rax),%rax
974         lea     .Lcbc_dec_popf(%rip),%r10
975         cmp     %r10,%rbx               # context->Rip<.Lcbc_dec_popf
976         jb      .Lin_cbc_no_flag
977         lea     -8(%rax),%rax
978
979 .Lin_cbc_no_flag:
980         mov     48(%rax),%rax           # $_rsp
981         lea     48(%rax),%rax
982
983 .Lin_cbc_frame_setup:
984         mov     -8(%rax),%rbx
985         mov     -16(%rax),%rbp
986         mov     -24(%rax),%r12
987         mov     -32(%rax),%r13
988         mov     -40(%rax),%r14
989         mov     -48(%rax),%r15
990         mov     %rbx,144($context)      # restore context->Rbx
991         mov     %rbp,160($context)      # restore context->Rbp
992         mov     %r12,216($context)      # restore context->R12
993         mov     %r13,224($context)      # restore context->R13
994         mov     %r14,232($context)      # restore context->R14
995         mov     %r15,240($context)      # restore context->R15
996
997 .Lin_cbc_prologue:
998         mov     8(%rax),%rdi
999         mov     16(%rax),%rsi
1000         mov     %rax,152($context)      # restore context->Rsp
1001         mov     %rsi,168($context)      # restore context->Rsi
1002         mov     %rdi,176($context)      # restore context->Rdi
1003
1004 .align  4
1005 .Lcommon_seh_exit:
1006
1007         mov     40($disp),%rdi          # disp->ContextRecord
1008         mov     $context,%rsi           # context
1009         mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
1010         .long   0xa548f3fc              # cld; rep movsq
1011
1012         mov     $disp,%rsi
1013         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1014         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1015         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1016         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1017         mov     40(%rsi),%r10           # disp->ContextRecord
1018         lea     56(%rsi),%r11           # &disp->HandlerData
1019         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1020         mov     %r10,32(%rsp)           # arg5
1021         mov     %r11,40(%rsp)           # arg6
1022         mov     %r12,48(%rsp)           # arg7
1023         mov     %rcx,56(%rsp)           # arg8, (NULL)
1024         call    *__imp_RtlVirtualUnwind(%rip)
1025
1026         mov     \$1,%eax                # ExceptionContinueSearch
1027         lea     64(%rsp),%rsp
1028         popfq
1029         pop     %r15
1030         pop     %r14
1031         pop     %r13
1032         pop     %r12
1033         pop     %rbp
1034         pop     %rbx
1035         pop     %rdi
1036         pop     %rsi
1037         ret
1038 .size   cbc_se_handler,.-cbc_se_handler
1039
1040 .section        .pdata
1041 .align  4
1042         .rva    .LSEH_begin_Camellia_EncryptBlock_Rounds
1043         .rva    .LSEH_end_Camellia_EncryptBlock_Rounds
1044         .rva    .LSEH_info_Camellia_EncryptBlock_Rounds
1045
1046         .rva    .LSEH_begin_Camellia_DecryptBlock_Rounds
1047         .rva    .LSEH_end_Camellia_DecryptBlock_Rounds
1048         .rva    .LSEH_info_Camellia_DecryptBlock_Rounds
1049
1050         .rva    .LSEH_begin_Camellia_Ekeygen
1051         .rva    .LSEH_end_Camellia_Ekeygen
1052         .rva    .LSEH_info_Camellia_Ekeygen
1053
1054         .rva    .LSEH_begin_Camellia_cbc_encrypt
1055         .rva    .LSEH_end_Camellia_cbc_encrypt
1056         .rva    .LSEH_info_Camellia_cbc_encrypt
1057
1058 .section        .xdata
1059 .align  8
1060 .LSEH_info_Camellia_EncryptBlock_Rounds:
1061         .byte   9,0,0,0
1062         .rva    common_se_handler
1063         .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
1064 .LSEH_info_Camellia_DecryptBlock_Rounds:
1065         .byte   9,0,0,0
1066         .rva    common_se_handler
1067         .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
1068 .LSEH_info_Camellia_Ekeygen:
1069         .byte   9,0,0,0
1070         .rva    common_se_handler
1071         .rva    .Lkey_prologue,.Lkey_epilogue   # HandlerData[]
1072 .LSEH_info_Camellia_cbc_encrypt:
1073         .byte   9,0,0,0
1074         .rva    cbc_se_handler
1075 ___
1076 }
1077
# Finalise the generated source: replace every `...` span in $code with the
# value of that span evaluated as a Perl expression (for example `1232/8`
# in the Win64 SEH code becomes 154), then write the text to STDOUT.
1078 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1079 print $code;
# STDOUT is a pipe into x86_64-xlate.pl (opened near the top of this script),
# so close() is where buffered write errors and a non-zero exit status of the
# translator surface.  Abort on failure instead of reporting success.
1080 close STDOUT or die "error closing STDOUT: $! (child status $?)";