199165f95f6f5dd1f344a05cb0d33cf3f086fea4
[openssl.git] / crypto / camellia / asm / cmll-x86_64.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
5 #
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
13
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
16 #
17 #                       AMD64   Core2   EM64T
18 # -evp camellia-128-ecb 16.7    21.0    22.7
19 # + over gcc 3.4.6      +25%    +5%     0%
20 #
21 # camellia-128-cbc      15.7    20.4    21.1
22 #
23 # 128-bit key setup     128     216     205     cycles/key
24 # + over gcc 3.4.6      +54%    +39%    +15%
25 #
26 # Numbers in "+" rows represent performance improvement over compiler
27 # generated code. Key setup timings are impressive on AMD and Core2
28 # thanks to 64-bit operations being covertly deployed. Improvement on
29 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
30 # apparently emulates some of 64-bit operations in [32-bit] microcode.
31
# Command line: [flavour] output.  A first argument that contains a dot is
# taken to be the output file name, in which case no flavour is set.
32 $flavour = shift;
33 $output  = shift;
34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35
# The Win64 calling convention is selected for nasm/masm/mingw64 flavours
# or when the output file name ends in .asm.
36 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
37
# Look for the x86_64-xlate.pl translator next to this script first, then
# in ../../perlasm relative to it; give up if neither file exists.
38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41 die "can't locate x86_64-xlate.pl";
42
# Everything printed to STDOUT from here on is piped through the translator.
# Die if the pipe cannot be opened rather than carrying on without it.
43 open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
44
# Map an a/b/c/d register name (%eax, %rbx, ...) to its second-byte alias
# (%ah, %bh, ...); any other string is returned unchanged.
# Takes exactly one argument, hence no empty prototype; the capture is
# referenced as ${1} rather than the sed-style \1.
45 sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;    $r; }
# Map a register name to its low-byte alias: %eax -> %al, %esi -> %sil,
# %r8d or %r8 -> %r8b.  Strings matching none of the patterns are returned
# unchanged.  Takes exactly one argument, hence no empty prototype; the
# capture is referenced as ${1} rather than the sed-style \1.
46 sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
47                       $r =~ s/%[er]([sd]i)/%${1}l/;
48                       $r =~ s/%(r[0-9]+)[d]?/%${1}b/; $r; }
49
# Register allocation shared by the code generators below.
# $t0-$t3 are the 32-bit temporaries used inside a Feistel round.
50 $t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
# @S names the four 32-bit words of the cipher state.
51 @S=("%r8d","%r9d","%r10d","%r11d");
# $i0/$i1 receive the zero-extended byte used to index the S-box tables.
52 $i0="%esi";
53 $i1="%edi";
54 $Tbl="%rbp";    # size optimization
55 $inp="%r12";
56 $out="%r13";
57 $key="%r14";
58 $keyend="%r15";
# 32-bit name of the first integer argument register: %ecx when generating
# for Win64, %edi otherwise.
59 $arg0d=$win64?"%ecx":"%edi";
60
61 # const unsigned int Camellia_SBOX[4][256];
62 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
63 # and [2][] - with [3][]. This is done to minimize code size.
# The values below are byte displacements from $Tbl; lookups scale the
# index by 8, so the two tables of a pair sit 4 bytes apart in each row.
64 $SBOX1_1110=0;          # Camellia_SBOX[0]
65 $SBOX4_4404=4;          # Camellia_SBOX[1]
66 $SBOX2_0222=2048;       # Camellia_SBOX[2]
67 $SBOX3_3033=2052;       # Camellia_SBOX[3]
68
# Append one Feistel round to $code.
#   $_[0] - round index $i; its parity selects which half of @S is the
#           round input (s0,s1) and which half is updated (s2,s3).
#   $_[1] - optional base displacement ("seed") of the round keys relative
#           to $key, default 0.  A negative seed makes the key prefetch
#           step downwards (-8 per round) instead of upwards (+8).
# On entry $t0/$t1 must already hold the key words for this round; the
# emitted code prefetches the key words for round $i+1 into them.
69 sub Camellia_Feistel {
70 my $i=$_[0];
71 my $seed=defined($_[1])?$_[1]:0;
72 my $scale=$seed<0?-8:8;
73 my $j=($i&1)*2;
# Declare all four names lexically; "my $s0=...,$s1=..." would scope only
# $s0 and leave the others as package globals.
74 my ($s0,$s1,$s2,$s3)=($S[($j)%4],$S[($j+1)%4],$S[($j+2)%4],$S[($j+3)%4]);
75
76 $code.=<<___;
77         xor     $s0,$t0                         # t0^=key[0]
78         xor     $s1,$t1                         # t1^=key[1]
79         movz    `&hi("$t0")`,$i0                # (t0>>8)&0xff
80         movz    `&lo("$t1")`,$i1                # (t1>>0)&0xff
81         mov     $SBOX3_3033($Tbl,$i0,8),$t3     # t3=SBOX3_3033[0]
82         mov     $SBOX1_1110($Tbl,$i1,8),$t2     # t2=SBOX1_1110[1]
83         movz    `&lo("$t0")`,$i0                # (t0>>0)&0xff
84         shr     \$16,$t0
85         movz    `&hi("$t1")`,$i1                # (t1>>8)&0xff
86         xor     $SBOX4_4404($Tbl,$i0,8),$t3     # t3^=SBOX4_4404[0]
87         shr     \$16,$t1
88         xor     $SBOX4_4404($Tbl,$i1,8),$t2     # t2^=SBOX4_4404[1]
89         movz    `&hi("$t0")`,$i0                # (t0>>24)&0xff
90         movz    `&lo("$t1")`,$i1                # (t1>>16)&0xff
91         xor     $SBOX1_1110($Tbl,$i0,8),$t3     # t3^=SBOX1_1110[0]
92         xor     $SBOX3_3033($Tbl,$i1,8),$t2     # t2^=SBOX3_3033[1]
93         movz    `&lo("$t0")`,$i0                # (t0>>16)&0xff
94         movz    `&hi("$t1")`,$i1                # (t1>>24)&0xff
95         xor     $SBOX2_0222($Tbl,$i0,8),$t3     # t3^=SBOX2_0222[0]
96         xor     $SBOX2_0222($Tbl,$i1,8),$t2     # t2^=SBOX2_0222[1]
97         mov     `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
98         mov     `$seed+($i+1)*$scale+4`($key),$t0
99         xor     $t3,$t2                         # t2^=t3
100         ror     \$8,$t3                         # t3=RightRotate(t3,8)
101         xor     $t2,$s2
102         xor     $t2,$s3
103         xor     $t3,$s3
104 ___
105 }
106
107 # void Camellia_EncryptBlock_Rounds(
108 #               int grandRounds,
109 #               const Byte plaintext[],
110 #               const KEY_TABLE_TYPE keyTable,
111 #               Byte ciphertext[])
# Start of generated code: the single-block entry points and the internal
# round loops.  Camellia_EncryptBlock/Camellia_DecryptBlock (V1.x API)
# replace their first argument (keyBitLength) with a round-group count of
# 3 or 4 and jump to a local label placed just before the matching *_Rounds
# function.  The *_Rounds functions byte-swap the four input words into @S,
# call the internal routine and byte-swap the result back out.
112 $code=<<___;
113 .text
114
115 # V1.x API
116 .globl  Camellia_EncryptBlock
117 .type   Camellia_EncryptBlock,\@abi-omnipotent
118 .align  16
119 Camellia_EncryptBlock:
120         movl    \$128,%eax
121         subl    $arg0d,%eax
122         movl    \$3,$arg0d
123         adcl    \$0,$arg0d      # keyBitLength==128?3:4
124         jmp     .Lenc_rounds
125 .size   Camellia_EncryptBlock,.-Camellia_EncryptBlock
126 # V2
127 .globl  Camellia_EncryptBlock_Rounds
128 .type   Camellia_EncryptBlock_Rounds,\@function,4
129 .align  16
130 .Lenc_rounds:
131 Camellia_EncryptBlock_Rounds:
132         push    %rbx
133         push    %rbp
134         push    %r13
135         push    %r14
136         push    %r15
137 .Lenc_prologue:
138
139         #mov    %rsi,$inp               # put away arguments
140         mov     %rcx,$out
141         mov     %rdx,$key
142
143         shl     \$6,%edi                # process grandRounds
144         lea     .LCamellia_SBOX(%rip),$Tbl
145         lea     ($key,%rdi),$keyend
146
147         mov     0(%rsi),@S[0]           # load plaintext
148         mov     4(%rsi),@S[1]
149         mov     8(%rsi),@S[2]
150         bswap   @S[0]
151         mov     12(%rsi),@S[3]
152         bswap   @S[1]
153         bswap   @S[2]
154         bswap   @S[3]
155
156         call    _x86_64_Camellia_encrypt
157
158         bswap   @S[0]
159         bswap   @S[1]
160         bswap   @S[2]
161         mov     @S[0],0($out)
162         bswap   @S[3]
163         mov     @S[1],4($out)
164         mov     @S[2],8($out)
165         mov     @S[3],12($out)
166
167         mov     0(%rsp),%r15
168         mov     8(%rsp),%r14
169         mov     16(%rsp),%r13
170         mov     24(%rsp),%rbp
171         mov     32(%rsp),%rbx
172         lea     40(%rsp),%rsp
173 .Lenc_epilogue:
174         ret
175 .size   Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
176
177 .type   _x86_64_Camellia_encrypt,\@abi-omnipotent
178 .align  16
179 _x86_64_Camellia_encrypt:
180         xor     0($key),@S[1]
181         xor     4($key),@S[0]           # ^=key[0-3]
182         xor     8($key),@S[3]
183         xor     12($key),@S[2]
184 .align  16
185 .Leloop:
186         mov     16($key),$t1            # prefetch key[4-5]
187         mov     20($key),$t0
188
189 ___
# Six Feistel rounds per loop iteration; the round keys are read upwards
# starting at displacement 16 from $key.
190         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
191 $code.=<<___;
192         lea     16*4($key),$key
193         cmp     $keyend,$key
194         mov     8($key),$t3             # prefetch key[2-3]
195         mov     12($key),$t2
196         je      .Ledone
197
198         and     @S[0],$t0
199         or      @S[3],$t3
200         rol     \$1,$t0
201         xor     $t3,@S[2]               # s2^=s3|key[3];
202         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
203         and     @S[2],$t2
204         or      @S[1],$t1
205         rol     \$1,$t2
206         xor     $t1,@S[0]               # s0^=s1|key[1];
207         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
208         jmp     .Leloop
209
210 .align  16
211 .Ledone:
212         xor     @S[2],$t0               # SwapHalf
213         xor     @S[3],$t1
214         xor     @S[0],$t2
215         xor     @S[1],$t3
216
217         mov     $t0,@S[0]
218         mov     $t1,@S[1]
219         mov     $t2,@S[2]
220         mov     $t3,@S[3]
221
222         .byte   0xf3,0xc3               # rep ret
223 .size   _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
224
225 # V1.x API
226 .globl  Camellia_DecryptBlock
227 .type   Camellia_DecryptBlock,\@abi-omnipotent
228 .align  16
229 Camellia_DecryptBlock:
230         movl    \$128,%eax
231         subl    $arg0d,%eax
232         movl    \$3,$arg0d
233         adcl    \$0,$arg0d      # keyBitLength==128?3:4
234         jmp     .Ldec_rounds
235 .size   Camellia_DecryptBlock,.-Camellia_DecryptBlock
236 # V2
237 .globl  Camellia_DecryptBlock_Rounds
238 .type   Camellia_DecryptBlock_Rounds,\@function,4
239 .align  16
240 .Ldec_rounds:
241 Camellia_DecryptBlock_Rounds:
242         push    %rbx
243         push    %rbp
244         push    %r13
245         push    %r14
246         push    %r15
247 .Ldec_prologue:
248
249         #mov    %rsi,$inp               # put away arguments
250         mov     %rcx,$out
251         mov     %rdx,$keyend
252
253         shl     \$6,%edi                # process grandRounds
254         lea     .LCamellia_SBOX(%rip),$Tbl
255         lea     ($keyend,%rdi),$key
256
257         mov     0(%rsi),@S[0]           # load plaintext
258         mov     4(%rsi),@S[1]
259         mov     8(%rsi),@S[2]
260         bswap   @S[0]
261         mov     12(%rsi),@S[3]
262         bswap   @S[1]
263         bswap   @S[2]
264         bswap   @S[3]
265
266         call    _x86_64_Camellia_decrypt
267
268         bswap   @S[0]
269         bswap   @S[1]
270         bswap   @S[2]
271         mov     @S[0],0($out)
272         bswap   @S[3]
273         mov     @S[1],4($out)
274         mov     @S[2],8($out)
275         mov     @S[3],12($out)
276
277         mov     0(%rsp),%r15
278         mov     8(%rsp),%r14
279         mov     16(%rsp),%r13
280         mov     24(%rsp),%rbp
281         mov     32(%rsp),%rbx
282         lea     40(%rsp),%rsp
283 .Ldec_epilogue:
284         ret
285 .size   Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
286
287 .type   _x86_64_Camellia_decrypt,\@abi-omnipotent
288 .align  16
289 _x86_64_Camellia_decrypt:
290         xor     0($key),@S[1]
291         xor     4($key),@S[0]           # ^=key[0-3]
292         xor     8($key),@S[3]
293         xor     12($key),@S[2]
294 .align  16
295 .Ldloop:
296         mov     -8($key),$t1            # prefetch key[4-5]
297         mov     -4($key),$t0
298
299 ___
# Decryption emits the same six rounds, but with a negative seed so that
# the round keys are read downwards starting at displacement -8 from $key.
300         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
301 $code.=<<___;
302         lea     -16*4($key),$key
303         cmp     $keyend,$key
304         mov     0($key),$t3             # prefetch key[2-3]
305         mov     4($key),$t2
306         je      .Lddone
307
308         and     @S[0],$t0
309         or      @S[3],$t3
310         rol     \$1,$t0
311         xor     $t3,@S[2]               # s2^=s3|key[3];
312         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
313         and     @S[2],$t2
314         or      @S[1],$t1
315         rol     \$1,$t2
316         xor     $t1,@S[0]               # s0^=s1|key[1];
317         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
318
319         jmp     .Ldloop
320
321 .align  16
322 .Lddone:
323         xor     @S[2],$t2
324         xor     @S[3],$t3
325         xor     @S[0],$t0
326         xor     @S[1],$t1
327
328         mov     $t2,@S[0]               # SwapHalf
329         mov     $t3,@S[1]
330         mov     $t0,@S[2]
331         mov     $t1,@S[3]
332
333         .byte   0xf3,0xc3               # rep ret
334 .size   _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
335 ___
336
# Append code that stores registers into the key table.
#   $rnd - slot index; the byte displacement is $bias+$rnd*8
#   $key - base register of the key table
#   @T   - optional leading integer bias, followed by the registers
# With four registers they are stored as 32-bit words at 4-byte steps in
# the order T[1],T[0],T[3],T[2]; otherwise the first register goes to the
# slot itself and a second one, if present, 8 bytes further on.
337 sub _saveround {
338 my ($rnd,$key,@T)=@_;
# Recognise the bias by its shape rather than its truth value, so that an
# explicit bias of 0 is consumed instead of being emitted as a register.
339 my $bias=($T[0]=~/^-?\d+$/)?shift(@T):0;
340
341     if ($#T==3) {
342         $code.=<<___;
343         mov     @T[1],`$bias+$rnd*8+0`($key)
344         mov     @T[0],`$bias+$rnd*8+4`($key)
345         mov     @T[3],`$bias+$rnd*8+8`($key)
346         mov     @T[2],`$bias+$rnd*8+12`($key)
347 ___
348     } else {
349         $code.="        mov     @T[0],`$bias+$rnd*8+0`($key)\n";
350         $code.="        mov     @T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
351     }
352 }
353
# Append code that loads one or two registers from the key table.
#   $rnd - slot index; the byte displacement is $bias+$rnd*8
#   $key - base register of the key table
#   @T   - optional leading integer bias, followed by the registers
# The first register is loaded from the slot itself, a second one, if
# present, from 8 bytes further on.
354 sub _loadround {
355 my ($rnd,$key,@T)=@_;
# Recognise the bias by its shape rather than its truth value, so that an
# explicit bias of 0 is consumed instead of being emitted as a register.
356 my $bias=($T[0]=~/^-?\d+$/)?shift(@T):0;
357
358 $code.="        mov     `$bias+$rnd*8+0`($key),@T[0]\n";
359 $code.="        mov     `$bias+$rnd*8+8`($key),@T[1]\n" if ($#T>=1);
360 }
361
362 # shld is very slow on Intel EM64T family. Even on AMD it limits
363 # instruction decode rate [because it's VectorPath] and consequently
364 # performance...
# shld-based 128-bit left rotate of the register pair ($w0,$w1) by $n bits.
# %r11 is used as scratch to keep the original $w0.  Nothing is emitted
# when $n is zero.
365 sub __rotl128 {
366 my ($w0,$w1,$n)=@_;
367
368     if ($n) {
369         $code.=<<___;
370         mov     $w0,%r11
371         shld    \$$n,$w1,$w0
372         shld    \$$n,%r11,$w1
373 ___
374     }
375 }
376
377 # ... Implementing 128-bit rotate without shld gives 80% better
378 # performance on EM64T, +15% on AMD64 and only ~7% degradation on
379 # Core2. This is therefore preferred.
# 128-bit left rotate of the register pair ($w0,$w1) by $n bits using plain
# shifts and ors.  %r9 and %r11 are used as scratch.  Nothing is emitted
# when $n is zero.
380 sub _rotl128 {
381 my ($w0,$w1,$n)=@_;
382
383     if ($n) {
384         $code.=<<___;
385         mov     $w0,%r11
386         shl     \$$n,$w0
387         mov     $w1,%r9
388         shr     \$`64-$n`,%r9
389         shr     \$`64-$n`,%r11
390         or      %r9,$w0
391         shl     \$$n,$w1
392         or      %r11,$w1
393 ___
394     }
395 }
396
# Key schedule generator.  Emits Camellia_Ekeygen, which takes
# keyBitLength, the raw key and the key table as its three arguments and
# returns 3 in %eax for 128-bit keys and 4 otherwise.  $step counts the
# Feistel rounds emitted so far so that each round picks up the next pair
# of SIGMA words.
397 { my $step=0;
398
399 $code.=<<___;
400 .globl  Camellia_Ekeygen
401 .type   Camellia_Ekeygen,\@function,3
402 .align  16
403 Camellia_Ekeygen:
404         push    %rbx
405         push    %rbp
406         push    %r13
407         push    %r14
408         push    %r15
409 .Lkey_prologue:
410
411         mov     %edi,${keyend}d         # put away arguments, keyBitLength
412         mov     %rdx,$out               # keyTable
413
414         mov     0(%rsi),@S[0]           # load 0-127 bits
415         mov     4(%rsi),@S[1]
416         mov     8(%rsi),@S[2]
417         mov     12(%rsi),@S[3]
418
419         bswap   @S[0]
420         bswap   @S[1]
421         bswap   @S[2]
422         bswap   @S[3]
423 ___
424         &_saveround     (0,$out,@S);    # KL<<<0
425 $code.=<<___;
426         cmp     \$128,$keyend           # check keyBitLength
427         je      .L1st128
428
429         mov     16(%rsi),@S[0]          # load 128-191 bits
430         mov     20(%rsi),@S[1]
431         cmp     \$192,$keyend
432         je      .L1st192
433         mov     24(%rsi),@S[2]          # load 192-255 bits
434         mov     28(%rsi),@S[3]
435         jmp     .L1st256
436 .L1st192:
437         mov     @S[0],@S[2]
438         mov     @S[1],@S[3]
439         not     @S[2]
440         not     @S[3]
441 .L1st256:
442         bswap   @S[0]
443         bswap   @S[1]
444         bswap   @S[2]
445         bswap   @S[3]
446 ___
447         &_saveround     (4,$out,@S);    # temp storage for KR!
448 $code.=<<___;
449         xor     0($out),@S[1]           # KR^KL
450         xor     4($out),@S[0]
451         xor     8($out),@S[3]
452         xor     12($out),@S[2]
453
454 .L1st128:
455         lea     .LCamellia_SIGMA(%rip),$key
456         lea     .LCamellia_SBOX(%rip),$Tbl
457
458         mov     0($key),$t1
459         mov     4($key),$t0
460 ___
461         &Camellia_Feistel($step++);
462         &Camellia_Feistel($step++);
463 $code.=<<___;
464         xor     0($out),@S[1]           # ^KL
465         xor     4($out),@S[0]
466         xor     8($out),@S[3]
467         xor     12($out),@S[2]
468 ___
469         &Camellia_Feistel($step++);
470         &Camellia_Feistel($step++);
471 $code.=<<___;
472         cmp     \$128,$keyend
473         jne     .L2nd256
474
475         lea     128($out),$out          # size optimization
476         shl     \$32,%r8                # @S[0]||
477         shl     \$32,%r10               # @S[2]||
478         or      %r9,%r8                 # ||@S[1]
479         or      %r11,%r10               # ||@S[3]
480 ___
481         &_loadround     (0,$out,-128,"%rax","%rbx");    # KL
482         &_saveround     (2,$out,-128,"%r8","%r10");     # KA<<<0
483         &_rotl128       ("%rax","%rbx",15);
484         &_saveround     (4,$out,-128,"%rax","%rbx");    # KL<<<15
485         &_rotl128       ("%r8","%r10",15);
486         &_saveround     (6,$out,-128,"%r8","%r10");     # KA<<<15
487         &_rotl128       ("%r8","%r10",15);              # 15+15=30
488         &_saveround     (8,$out,-128,"%r8","%r10");     # KA<<<30
489         &_rotl128       ("%rax","%rbx",30);             # 15+30=45
490         &_saveround     (10,$out,-128,"%rax","%rbx");   # KL<<<45
491         &_rotl128       ("%r8","%r10",15);              # 30+15=45
492         &_saveround     (12,$out,-128,"%r8");           # KA<<<45
493         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
494         &_saveround     (13,$out,-128,"%rbx");          # KL<<<60
495         &_rotl128       ("%r8","%r10",15);              # 45+15=60
496         &_saveround     (14,$out,-128,"%r8","%r10");    # KA<<<60
497         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
498         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<77
499         &_rotl128       ("%rax","%rbx",17);             # 77+17=94
500         &_saveround     (18,$out,-128,"%rax","%rbx");   # KL<<<94
501         &_rotl128       ("%r8","%r10",34);              # 60+34=94
502         &_saveround     (20,$out,-128,"%r8","%r10");    # KA<<<94
503         &_rotl128       ("%rax","%rbx",17);             # 94+17=111
504         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<111
505         &_rotl128       ("%r8","%r10",17);              # 94+17=111
506         &_saveround     (24,$out,-128,"%r8","%r10");    # KA<<<111
507 $code.=<<___;
508         mov     \$3,%eax
509         jmp     .Ldone
510 .align  16
511 .L2nd256:
512 ___
513         &_saveround     (6,$out,@S);    # temp storage for KA!
514 $code.=<<___;
515         xor     `4*8+0`($out),@S[1]     # KA^KR
516         xor     `4*8+4`($out),@S[0]
517         xor     `5*8+0`($out),@S[3]
518         xor     `5*8+4`($out),@S[2]
519 ___
520         &Camellia_Feistel($step++);
521         &Camellia_Feistel($step++);
522
523         &_loadround     (0,$out,"%rax","%rbx"); # KL
524         &_loadround     (4,$out,"%rcx","%rdx"); # KR
525         &_loadround     (6,$out,"%r14","%r15"); # KA
526 $code.=<<___;
527         lea     128($out),$out          # size optimization
528         shl     \$32,%r8                # @S[0]||
529         shl     \$32,%r10               # @S[2]||
530         or      %r9,%r8                 # ||@S[1]
531         or      %r11,%r10               # ||@S[3]
532 ___
533         &_saveround     (2,$out,-128,"%r8","%r10");     # KB<<<0
534         &_rotl128       ("%rcx","%rdx",15);
535         &_saveround     (4,$out,-128,"%rcx","%rdx");    # KR<<<15
536         &_rotl128       ("%r14","%r15",15);
537         &_saveround     (6,$out,-128,"%r14","%r15");    # KA<<<15
538         &_rotl128       ("%rcx","%rdx",15);             # 15+15=30
539         &_saveround     (8,$out,-128,"%rcx","%rdx");    # KR<<<30
540         &_rotl128       ("%r8","%r10",30);
541         &_saveround     (10,$out,-128,"%r8","%r10");    # KB<<<30
542         &_rotl128       ("%rax","%rbx",45);
543         &_saveround     (12,$out,-128,"%rax","%rbx");   # KL<<<45
544         &_rotl128       ("%r14","%r15",30);             # 15+30=45
545         &_saveround     (14,$out,-128,"%r14","%r15");   # KA<<<45
546         &_rotl128       ("%rax","%rbx",15);             # 45+15=60
547         &_saveround     (16,$out,-128,"%rax","%rbx");   # KL<<<60
548         &_rotl128       ("%rcx","%rdx",30);             # 30+30=60
549         &_saveround     (18,$out,-128,"%rcx","%rdx");   # KR<<<60
550         &_rotl128       ("%r8","%r10",30);              # 30+30=60
551         &_saveround     (20,$out,-128,"%r8","%r10");    # KB<<<60
552         &_rotl128       ("%rax","%rbx",17);             # 60+17=77
553         &_saveround     (22,$out,-128,"%rax","%rbx");   # KL<<<77
554         &_rotl128       ("%r14","%r15",32);             # 45+32=77
555         &_saveround     (24,$out,-128,"%r14","%r15");   # KA<<<77
556         &_rotl128       ("%rcx","%rdx",34);             # 60+34=94
557         &_saveround     (26,$out,-128,"%rcx","%rdx");   # KR<<<94
558         &_rotl128       ("%r14","%r15",17);             # 77+17=94
559         &_saveround     (28,$out,-128,"%r14","%r15");   # KA<<<94
560         &_rotl128       ("%rax","%rbx",34);             # 77+34=111
561         &_saveround     (30,$out,-128,"%rax","%rbx");   # KL<<<111
562         &_rotl128       ("%r8","%r10",51);              # 60+51=111
563         &_saveround     (32,$out,-128,"%r8","%r10");    # KB<<<111
564 $code.=<<___;
565         mov     \$4,%eax
566 .Ldone:
567         mov     0(%rsp),%r15
568         mov     8(%rsp),%r14
569         mov     16(%rsp),%r13
570         mov     24(%rsp),%rbp
571         mov     32(%rsp),%rbx
572         lea     40(%rsp),%rsp
573 .Lkey_epilogue:
574         ret
575 .size   Camellia_Ekeygen,.-Camellia_Ekeygen
576 ___
577 }
578
# 256-entry byte substitution table from which the S1110/S4404/S0222/S3033
# helpers below derive the 32-bit lookup table entries.
579 @SBOX=(
580 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
581  35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
582 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
583 166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
584 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
585 223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
586  20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
587 254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
588 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
589  16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
590 135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
591  82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
592 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
593 120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
594 114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
595  64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
596
# Table entry with SBOX[i] replicated into the three upper bytes, low byte zero.
597 sub S1110 { my $s=$SBOX[$_[0]]; sprintf("0x%08x",$s<<24|$s<<16|$s<<8); }
# Table entry built from SBOX[i rotated left by one bit], placed in bytes 3, 2 and 0.
598 sub S4404 { my $n=$_[0]; my $s=$SBOX[($n<<1|$n>>7)&0xff]; sprintf("0x%08x",$s<<24|$s<<16|$s); }
# Table entry built from SBOX[i] rotated left by one bit, placed in the three lower bytes.
599 sub S0222 { my $s=$SBOX[$_[0]]; $s=($s<<1|$s>>7)&0xff; sprintf("0x%08x",$s<<16|$s<<8|$s); }
# Table entry built from SBOX[i] rotated right by one bit, placed in bytes 3, 1 and 0.
600 sub S3033 { my $s=$SBOX[$_[0]]; $s=($s>>1|$s<<7)&0xff; sprintf("0x%08x",$s<<24|$s<<8|$s); }
601
# Read-only data: the constants that Camellia_Ekeygen addresses through
# .LCamellia_SIGMA, padded with four zero words, followed by the label
# under which the S-box tables are emitted below.
602 $code.=<<___;
603 .align  64
604 .LCamellia_SIGMA:
605 .long   0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
606 .long   0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
607 .long   0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
608 .long   0,          0,          0,          0
609 .LCamellia_SBOX:
610 ___
611 # tables are interleaved, remember?
# Append one .long directive holding the given values, comma separated.
612 sub data_word { my $row=join(',',@_); $code.=".long\t$row\n"; }
# First pass emits the interleaved S1110/S4404 rows, second pass the
# interleaved S0222/S3033 rows, one row per table index.
613 for ($i=0;$i<256;++$i) { data_word(S1110($i),S4404($i)); }
614 for ($i=0;$i<256;++$i) { data_word(S0222($i),S3033($i)); }
615
616 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
617 #                       size_t length, const CAMELLIA_KEY *key,
618 #                       unsigned char *ivp,const int enc);
# The variables below name slots of the private stack frame, all relative
# to %rsp as it stands in the function body.  Code that pushes onto the
# stack (the pushfq/popfq pairs in the tail handlers) must add the pushed
# size to these displacements while the push is outstanding.
619 {
620 $_key="0(%rsp)";
621 $_end="8(%rsp)";        # inp+len&~15
622 $_res="16(%rsp)";       # len&15
623 $ivec="24(%rsp)";
624 $_ivp="40(%rsp)";
625 $_rsp="48(%rsp)";
626
627 $code.=<<___;
628 .globl  Camellia_cbc_encrypt
629 .type   Camellia_cbc_encrypt,\@function,6
630 .align  16
631 Camellia_cbc_encrypt:
632         cmp     \$0,%rdx
633         je      .Lcbc_abort
634         push    %rbx
635         push    %rbp
636         push    %r12
637         push    %r13
638         push    %r14
639         push    %r15
640 .Lcbc_prologue:
641
642         mov     %rsp,%rbp
643         sub     \$64,%rsp
644         and     \$-64,%rsp
645
646         # place stack frame just "above mod 1024" the key schedule,
647         # this ensures that cache associativity suffices
648         lea     -64-63(%rcx),%r10
649         sub     %rsp,%r10
650         neg     %r10
651         and     \$0x3C0,%r10
652         sub     %r10,%rsp
653         add     \$8,%rsp                # 8 is reserved for callee's ra
654
655         mov     %rdi,$inp               # inp argument
656         mov     %rsi,$out               # out argument
657         mov     %r8,%rbx                # ivp argument
658         mov     %rcx,$key               # key argument
659         mov     272(%rcx),${keyend}d    # grandRounds
660
661         mov     %r8,$_ivp
662         mov     %rbp,$_rsp
663
664 .Lcbc_body:
665         lea     .LCamellia_SBOX(%rip),$Tbl
666
667         mov     \$32,%ecx
668 .align  4
669 .Lcbc_prefetch_sbox:
670         mov     0($Tbl),%rax
671         mov     32($Tbl),%rsi
672         mov     64($Tbl),%rdi
673         mov     96($Tbl),%r11
674         lea     128($Tbl),$Tbl
675         loop    .Lcbc_prefetch_sbox
676         sub     \$4096,$Tbl
677         shl     \$6,$keyend
678         mov     %rdx,%rcx               # len argument
679         lea     ($key,$keyend),$keyend
680
681         cmp     \$0,%r9d                # enc argument
682         je      .LCBC_DECRYPT
683
684         and     \$-16,%rdx
685         and     \$15,%rcx               # length residue
686         lea     ($inp,%rdx),%rdx
687         mov     $key,$_key
688         mov     %rdx,$_end
689         mov     %rcx,$_res
690
691         cmp     $inp,%rdx
692         mov     0(%rbx),@S[0]           # load IV
693         mov     4(%rbx),@S[1]
694         mov     8(%rbx),@S[2]
695         mov     12(%rbx),@S[3]
696         je      .Lcbc_enc_tail
697         jmp     .Lcbc_eloop
698
699 .align  16
700 .Lcbc_eloop:
701         xor     0($inp),@S[0]
702         xor     4($inp),@S[1]
703         xor     8($inp),@S[2]
704         bswap   @S[0]
705         xor     12($inp),@S[3]
706         bswap   @S[1]
707         bswap   @S[2]
708         bswap   @S[3]
709
710         call    _x86_64_Camellia_encrypt
711
712         mov     $_key,$key              # "rewind" the key
713         bswap   @S[0]
714         mov     $_end,%rdx
715         bswap   @S[1]
716         mov     $_res,%rcx
717         bswap   @S[2]
718         mov     @S[0],0($out)
719         bswap   @S[3]
720         mov     @S[1],4($out)
721         mov     @S[2],8($out)
722         lea     16($inp),$inp
723         mov     @S[3],12($out)
724         cmp     %rdx,$inp
725         lea     16($out),$out
726         jne     .Lcbc_eloop
727
728         cmp     \$0,%rcx
729         jne     .Lcbc_enc_tail
730
731         mov     $_ivp,$out
732         mov     @S[0],0($out)           # write out IV residue
733         mov     @S[1],4($out)
734         mov     @S[2],8($out)
735         mov     @S[3],12($out)
736         jmp     .Lcbc_done
737
738 .align  16
739 .Lcbc_enc_tail:
740         xor     %rax,%rax
741         mov     %rax,0+$ivec
742         mov     %rax,8+$ivec
743         mov     %rax,$_res
744
745         pushfq
746         cld
747         mov     $inp,%rsi
748         lea     8+$ivec,%rdi            # +8: pushfq moved %rsp
749         .long   0x9066A4F3              # rep movsb
750         popfq
751
752         lea     $ivec,$inp
753         lea     16+$ivec,%rax
754         mov     %rax,$_end
755         jmp     .Lcbc_eloop             # one more time
756
757 .align  16
758 .LCBC_DECRYPT:
759         xchg    $key,$keyend
760         add     \$15,%rdx
761         and     \$15,%rcx               # length residue
762         and     \$-16,%rdx
763         mov     $key,$_key
764         lea     ($inp,%rdx),%rdx
765         mov     %rdx,$_end
766         mov     %rcx,$_res
767
768         mov     (%rbx),%rax             # load IV
769         mov     8(%rbx),%rbx
770         jmp     .Lcbc_dloop
771 .align  16
772 .Lcbc_dloop:
773         mov     0($inp),@S[0]
774         mov     4($inp),@S[1]
775         mov     8($inp),@S[2]
776         bswap   @S[0]
777         mov     12($inp),@S[3]
778         bswap   @S[1]
779         mov     %rax,0+$ivec            # save IV to temporary storage
780         bswap   @S[2]
781         mov     %rbx,8+$ivec
782         bswap   @S[3]
783
784         call    _x86_64_Camellia_decrypt
785
786         mov     $_key,$key              # "rewind" the key
787         mov     $_end,%rdx
788         mov     $_res,%rcx
789
790         bswap   @S[0]
791         mov     ($inp),%rax             # load IV for next iteration
792         bswap   @S[1]
793         mov     8($inp),%rbx
794         bswap   @S[2]
795         xor     0+$ivec,@S[0]
796         bswap   @S[3]
797         xor     4+$ivec,@S[1]
798         xor     8+$ivec,@S[2]
799         lea     16($inp),$inp
800         xor     12+$ivec,@S[3]
801         cmp     %rdx,$inp
802         je      .Lcbc_ddone
803
804         mov     @S[0],0($out)
805         mov     @S[1],4($out)
806         mov     @S[2],8($out)
807         mov     @S[3],12($out)
808
809         lea     16($out),$out
810         jmp     .Lcbc_dloop
811
812 .align  16
813 .Lcbc_ddone:
814         mov     $_ivp,%rdx
815         cmp     \$0,%rcx
816         jne     .Lcbc_dec_tail
817
818         mov     @S[0],0($out)
819         mov     @S[1],4($out)
820         mov     @S[2],8($out)
821         mov     @S[3],12($out)
822
823         mov     %rax,(%rdx)             # write out IV residue
824         mov     %rbx,8(%rdx)
825         jmp     .Lcbc_done
826 .align  16
827 .Lcbc_dec_tail:
828         mov     @S[0],0+$ivec
829         mov     @S[1],4+$ivec
830         mov     @S[2],8+$ivec
831         mov     @S[3],12+$ivec
832
833         pushfq
834         cld
835         lea     8+$ivec,%rsi            # +8: pushfq moved %rsp
836         lea     ($out),%rdi
837         .long   0x9066A4F3              # rep movsb
838         popfq
839
840         mov     %rax,(%rdx)             # write out IV residue
841         mov     %rbx,8(%rdx)
842         jmp     .Lcbc_done
843
844 .align  16
845 .Lcbc_done:
846         mov     $_rsp,%rcx
847         mov     0(%rcx),%r15
848         mov     8(%rcx),%r14
849         mov     16(%rcx),%r13
850         mov     24(%rcx),%r12
851         mov     32(%rcx),%rbp
852         mov     40(%rcx),%rbx
853         lea     48(%rcx),%rsp
854 .Lcbc_abort:
855         ret
856 .size   Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
857
858 .asciz  "Camellia for x86_64 by <appro\@openssl.org>"
859 ___
860 }
861
862 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
863 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64-only section: when $win64 is set, append to $code two structured
# exception handlers with the signature above, plus the .pdata/.xdata
# tables that register them for the four Camellia entry points.
#
# Both handlers start the same way: they push rsi, rdi, rbx, rbp, r12-r15
# and the flags, reserve 64 bytes of stack, and pull context->Rax and
# context->Rip.  They then work out an address in %rax, patch the CONTEXT
# record from what is stored around that address, and finish in the shared
# tail .Lcommon_seh_exit.
#
# common_se_handler
#   Registered for Camellia_EncryptBlock_Rounds, Camellia_DecryptBlock_Rounds
#   and Camellia_Ekeygen.  Each .xdata entry supplies two RVAs as
#   HandlerData[]: a prologue label and an epilogue label.  The handler adds
#   disp->ImageBase to each and compares the result with context->Rip:
#     * Rip <  prologue label: keeps the value pulled from context->Rax and
#       goes straight to .Lin_prologue.
#     * Rip >= epilogue label: uses context->Rsp and goes to .Lin_prologue.
#     * otherwise: adds 40 to context->Rsp and copies the five quadwords
#       just below that address into context->Rbx/Rbp/R13/R14/R15
#       (this handler never writes context->R12).
#   .Lin_prologue loads Rdi and Rsi from 8 and 16 bytes above the computed
#   address, stores the address itself as context->Rsp, and jumps to
#   .Lcommon_seh_exit.
#   NOTE(review): the first case presumes context->Rax already holds the
#   entry stack pointer while Rip is before the prologue label -- confirm
#   against the prologue generated for these functions.
#
# cbc_se_handler
#   Registered for Camellia_cbc_encrypt.  Its .xdata entry carries no
#   HandlerData; the handler compares context->Rip with the labels
#   .Lcbc_prologue, .Lcbc_body and .Lcbc_abort directly:
#     * Rip <  .Lcbc_prologue: context->Rax is used; only Rsp/Rsi/Rdi are
#       written back.
#     * Rip <  .Lcbc_body: context->Rax is used and the six quadwords below
#       it are copied into context->Rbx/Rbp/R12/R13/R14/R15.
#     * Rip >= .Lcbc_abort: context->Rsp is used; only Rsp/Rsi/Rdi are
#       written back.
#     * otherwise: the quadword 48 bytes above context->Rsp (the slot the
#       code comments call $_rsp) is loaded, 48 is added to it, and the same
#       six registers are copied from below the result.
#   NOTE(review): the last case depends on context->Rsp being the frame
#   base.  Camellia_cbc_encrypt contains at least one popfq, so there is
#   presumably a window after the matching pushfq in which Rsp is 8 bytes
#   lower and this lookup would read the wrong slot -- TODO confirm, and if
#   so add labels around those instructions and compensate here.
#
# .Lcommon_seh_exit (shared tail; lives inside cbc_se_handler and is the
# jump target at the end of common_se_handler)
#   Copies the patched CONTEXT to disp->ContextRecord as 1232/8 quadwords
#   (the back-ticked expression in the heredoc is evaluated by the
#   substitution applied to $code at the end of this script), calls
#   RtlVirtualUnwind through its import pointer with arg1 = 0
#   (UNW_FLAG_NHANDLER) and arguments 5-8 stored at 32..56(%rsp), then
#   sets %eax to 1 (ExceptionContinueSearch), releases the 64 bytes,
#   restores the flags and saved registers, and returns.
#
# .pdata holds one (begin, end, info) RVA triple per function; .xdata holds
# the matching info records: four bytes 9,0,0,0, the handler RVA, and for
# common_se_handler the two HandlerData RVAs.
# NOTE(review): 9,0,0,0 is presumably an UNWIND_INFO header (version 1 with
# the exception-handler flag, no unwind codes) -- confirm against the
# Windows x64 unwind data layout.
864 if ($win64) {
# Registers that carry the four handler arguments; these variables are
# interpolated into the assembly text below.
865 $rec="%rcx";
866 $frame="%rdx";
867 $context="%r8";
868 $disp="%r9";
869
# Everything up to the closing ___ is appended verbatim (after variable
# interpolation) to the generated assembly.
870 $code.=<<___;
871 .extern __imp_RtlVirtualUnwind
872 .type   common_se_handler,\@abi-omnipotent
873 .align  16
874 common_se_handler:
875         push    %rsi
876         push    %rdi
877         push    %rbx
878         push    %rbp
879         push    %r12
880         push    %r13
881         push    %r14
882         push    %r15
883         pushfq
884         lea     -64(%rsp),%rsp
885
886         mov     120($context),%rax      # pull context->Rax
887         mov     248($context),%rbx      # pull context->Rip
888
889         mov     8($disp),%rsi           # disp->ImageBase
890         mov     56($disp),%r11          # disp->HandlerData
891
892         mov     0(%r11),%r10d           # HandlerData[0]
893         lea     (%rsi,%r10),%r10        # prologue label
894         cmp     %r10,%rbx               # context->Rip<prologue label
895         jb      .Lin_prologue
896
897         mov     152($context),%rax      # pull context->Rsp
898
899         mov     4(%r11),%r10d           # HandlerData[1]
900         lea     (%rsi,%r10),%r10        # epilogue label
901         cmp     %r10,%rbx               # context->Rip>=epilogue label
902         jae     .Lin_prologue
903
904         lea     40(%rax),%rax
905         mov     -8(%rax),%rbx
906         mov     -16(%rax),%rbp
907         mov     -24(%rax),%r13
908         mov     -32(%rax),%r14
909         mov     -40(%rax),%r15
910         mov     %rbx,144($context)      # restore context->Rbx
911         mov     %rbp,160($context)      # restore context->Rbp
912         mov     %r13,224($context)      # restore context->R13
913         mov     %r14,232($context)      # restore context->R14
914         mov     %r15,240($context)      # restore context->R15
915
916 .Lin_prologue:
917         mov     8(%rax),%rdi
918         mov     16(%rax),%rsi
919         mov     %rax,152($context)      # restore context->Rsp
920         mov     %rsi,168($context)      # restore context->Rsi
921         mov     %rdi,176($context)      # restore context->Rdi
922
923         jmp     .Lcommon_seh_exit
924 .size   common_se_handler,.-common_se_handler
925
926 .type   cbc_se_handler,\@abi-omnipotent
927 .align  16
928 cbc_se_handler:
929         push    %rsi
930         push    %rdi
931         push    %rbx
932         push    %rbp
933         push    %r12
934         push    %r13
935         push    %r14
936         push    %r15
937         pushfq
938         lea     -64(%rsp),%rsp
939
940         mov     120($context),%rax      # pull context->Rax
941         mov     248($context),%rbx      # pull context->Rip
942
943         lea     .Lcbc_prologue(%rip),%r10
944         cmp     %r10,%rbx               # context->Rip<.Lcbc_prologue
945         jb      .Lin_cbc_prologue
946
947         lea     .Lcbc_body(%rip),%r10
948         cmp     %r10,%rbx               # context->Rip<.Lcbc_body
949         jb      .Lin_cbc_frame_setup
950
951         mov     152($context),%rax      # pull context->Rsp
952
953         lea     .Lcbc_abort(%rip),%r10
954         cmp     %r10,%rbx               # context->Rip>=.Lcbc_abort
955         jae     .Lin_cbc_prologue
956
957         mov     48(%rax),%rax           # $_rsp
958         lea     48(%rax),%rax
959
960 .Lin_cbc_frame_setup:
961         mov     -8(%rax),%rbx
962         mov     -16(%rax),%rbp
963         mov     -24(%rax),%r12
964         mov     -32(%rax),%r13
965         mov     -40(%rax),%r14
966         mov     -48(%rax),%r15
967         mov     %rbx,144($context)      # restore context->Rbx
968         mov     %rbp,160($context)      # restore context->Rbp
969         mov     %r12,216($context)      # restore context->R12
970         mov     %r13,224($context)      # restore context->R13
971         mov     %r14,232($context)      # restore context->R14
972         mov     %r15,240($context)      # restore context->R15
973
974 .Lin_cbc_prologue:
975         mov     8(%rax),%rdi
976         mov     16(%rax),%rsi
977         mov     %rax,152($context)      # restore context->Rsp
978         mov     %rsi,168($context)      # restore context->Rsi
979         mov     %rdi,176($context)      # restore context->Rdi
980
981 .align  4
982 .Lcommon_seh_exit:
983
984         mov     40($disp),%rdi          # disp->ContextRecord
985         mov     $context,%rsi           # context
986         mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
987         .long   0xa548f3fc              # cld; rep movsq
988
989         mov     $disp,%rsi
990         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
991         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
992         mov     0(%rsi),%r8             # arg3, disp->ControlPc
993         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
994         mov     40(%rsi),%r10           # disp->ContextRecord
995         lea     56(%rsi),%r11           # &disp->HandlerData
996         lea     24(%rsi),%r12           # &disp->EstablisherFrame
997         mov     %r10,32(%rsp)           # arg5
998         mov     %r11,40(%rsp)           # arg6
999         mov     %r12,48(%rsp)           # arg7
1000         mov     %rcx,56(%rsp)           # arg8, (NULL)
1001         call    *__imp_RtlVirtualUnwind(%rip)
1002
1003         mov     \$1,%eax                # ExceptionContinueSearch
1004         lea     64(%rsp),%rsp
1005         popfq
1006         pop     %r15
1007         pop     %r14
1008         pop     %r13
1009         pop     %r12
1010         pop     %rbp
1011         pop     %rbx
1012         pop     %rdi
1013         pop     %rsi
1014         ret
1015 .size   cbc_se_handler,.-cbc_se_handler
1016
1017 .section        .pdata
1018 .align  4
1019         .rva    .LSEH_begin_Camellia_EncryptBlock_Rounds
1020         .rva    .LSEH_end_Camellia_EncryptBlock_Rounds
1021         .rva    .LSEH_info_Camellia_EncryptBlock_Rounds
1022
1023         .rva    .LSEH_begin_Camellia_DecryptBlock_Rounds
1024         .rva    .LSEH_end_Camellia_DecryptBlock_Rounds
1025         .rva    .LSEH_info_Camellia_DecryptBlock_Rounds
1026
1027         .rva    .LSEH_begin_Camellia_Ekeygen
1028         .rva    .LSEH_end_Camellia_Ekeygen
1029         .rva    .LSEH_info_Camellia_Ekeygen
1030
1031         .rva    .LSEH_begin_Camellia_cbc_encrypt
1032         .rva    .LSEH_end_Camellia_cbc_encrypt
1033         .rva    .LSEH_info_Camellia_cbc_encrypt
1034
1035 .section        .xdata
1036 .align  8
1037 .LSEH_info_Camellia_EncryptBlock_Rounds:
1038         .byte   9,0,0,0
1039         .rva    common_se_handler
1040         .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
1041 .LSEH_info_Camellia_DecryptBlock_Rounds:
1042         .byte   9,0,0,0
1043         .rva    common_se_handler
1044         .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
1045 .LSEH_info_Camellia_Ekeygen:
1046         .byte   9,0,0,0
1047         .rva    common_se_handler
1048         .rva    .Lkey_prologue,.Lkey_epilogue   # HandlerData[]
1049 .LSEH_info_Camellia_cbc_encrypt:
1050         .byte   9,0,0,0
1051         .rva    cbc_se_handler
1052 ___
1053 }
1054
# Final pass: replace every back-ticked Perl expression embedded in the
# generated assembly (constant arithmetic and the like) with its evaluated
# value, then hand the finished text to the translator.
1055 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1056 print $code;
# STDOUT was reopened as a pipe to x86_64-xlate.pl, so close() is the only
# place where a buffered write error or a failing translator becomes
# visible.  For a piped handle close() returns false in both cases and
# leaves $! at 0 when the only problem is a non-zero child exit, in which
# case the wait status is in $?.  Die so the build does not silently
# continue with truncated or missing assembler output.
1057 close STDOUT or die($! ? "error closing STDOUT: $!" : "x86_64-xlate.pl failed, wait status $?");