Add sha/asm/keccak1600-mmx.pl, x86 MMX module.
[openssl.git] / crypto / sha / asm / keccak1600-mmx.pl
1 #!/usr/bin/env perl
2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8 #
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
15 #
16 # Keccak-1600 for x86 MMX.
17 #
18 # June 2017.
19 #
20 # The code below is a KECCAK_2X implementation (see sha/keccak1600.c)
21 # with C[5] held in the register bank and D[5] offloaded to memory.
22 # Though instead of actually unrolling the loop pair-wise, I simply flip
23 # pointers to T[][] and A[][] at the end of each round. Since the number
24 # of rounds is even, the last round writes to A[][] and everything works out.
25 #
26 ########################################################################
27 # Numbers are cycles per processed byte out of large message.
28 #
29 #                       r=1088(i)
30 #
31 # PIII                  31
32 # Pentium M             27
33 # P4                    42
34 # Core 2                20
35 # Sandy Bridge(ii)      18
36 # Atom                  37
37 # Silvermont(ii)        80(iv)
38 # VIA Nano(ii)          44
39 # Sledgehammer(ii)(iii) 25
40 #
41 # (i)   Corresponds to SHA3-256.
42 # (ii)  64-bit processor executing 32-bit code.
43 # (iii) Result is considered to be representative even for older AMD
44 #       processors.
45 # (iv)  This seems to be some processor anomaly. Successor doesn't
46 #       have this problem...
47
48 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49 push(@INC,"${dir}","${dir}../../perlasm");
50 require "x86asm.pl";
51
# The last command-line argument names the output file and STDOUT is
# redirected to it.  Three-argument open keeps the file name from being
# parsed for a mode, and a failure is fatal instead of silently leaving
# STDOUT closed and the generated code lost.  When no argument is given
# STDOUT is left untouched.
$output=pop;
if (defined($output)) {
        open(STDOUT,">",$output) or die "can't open $output: $!";
}

# Whatever is left in @ARGV goes to asm_init(): the first argument as is,
# plus a flag that is true when the last remaining argument is "386".
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
56
# Register allocation: mm0-mm4 carry the five working lanes C[0..4] and
# mm5-mm7 are the temporaries T[0..2].
my @C = qw(mm0 mm1 mm2 mm3 mm4);
my @T = qw(mm5 mm6 mm7);

# $A[$y][$x] is the displacement of 64-bit lane (y,x) of a 5x5 state laid
# out row by row, 8 bytes per lane.  Every displacement is biased by -100
# because the base registers are set up to point 100 bytes into the state
# (the "size optimization" in the entry points).
my @A;
foreach my $y (0..4) {
        push(@A, [ map { 8*(5*$y+$_)-100 } (0..4) ]);
}

# Displacements of the five 64-bit D[] slots relative to esp.
my @D = map { 8*$_+4 } (0..4);

# Rotation amounts for the Rho step, indexed as $rhotates[$y][$x].
my @rhotates = (
        [  0,  1, 62, 28, 27 ],
        [ 36, 44,  6, 55, 20 ],
        [  3, 10, 43, 25, 39 ],
        [ 41, 45, 15, 21,  8 ],
        [ 18,  2, 61, 56, 14 ],
);
67
&static_label("iotas");

# _KeccakF1600: the round loop shared by all entry points.  As used below,
# esi is the base of the state being read and esp the base of the D[]
# slots; ecx counts the 24 rounds.
&function_begin_B("_KeccakF1600");
        # Prime C[0..4] with row 4; Theta folds rows 0..3 into it.
        foreach my $x (0..4) {
                &movq   ($C[$x],&QWP($A[4][$x],"esi"));
        }

        &mov    ("ecx",24);                     # round counter
        &jmp    (&label("loop"));

    &set_label("loop",16);
        ######################################### Theta: column parities
        foreach my $y (0..3) {
                foreach my $x (0..4) {
                        &pxor   ($C[$x],&QWP($A[$y][$x],"esi"));
                }
        }

        # D[1] = E[0] = ROL64(C[2], 1) ^ C[0], a copy stays in T[0]
        &movq   ($T[0],$C[2]);
        &movq   ($T[2],$C[2]);
        &psrlq  ($T[0],63);
        &psllq  ($T[2],1);
        &pxor   ($T[0],$C[0]);
        &pxor   ($T[0],$T[2]);
        &movq   (&QWP($D[1],"esp"),$T[0]);

        # D[4] = E[1] = ROL64(C[0], 1) ^ C[3], a copy stays in T[1]
        &movq   ($T[1],$C[0]);
        &psrlq  ($C[0],63);
        &psllq  ($T[1],1);
        &pxor   ($T[1],$C[0]);
        &pxor   ($T[1],$C[3]);
        &movq   (&QWP($D[4],"esp"),$T[1]);

        # D[0] = ROL64(C[1], 1) ^ C[4], the result replaces C[0]
        &movq   ($C[0],$C[1]);
        &movq   ($T[2],$C[1]);
        &psrlq  ($C[0],63);
        &psllq  ($T[2],1);
        &pxor   ($C[0],$C[4]);
        &pxor   ($C[0],$T[2]);
        &movq   (&QWP($D[0],"esp"),$C[0]);

        # D[2] = ROL64(C[3], 1) ^ C[1] and D[3] = ROL64(C[4], 1) ^ C[2];
        # the rotated source register is consumed and the result replaces
        # C[1] and C[2] respectively.
        foreach my $i (1,2) {
                my ($acc,$src) = ($C[$i],$C[$i+2]);

                &movq   ($T[2],$src);
                &psrlq  ($src,63);
                &psllq  ($T[2],1);
                &pxor   ($acc,$src);
                &pxor   ($acc,$T[2]);
                &movq   (&QWP($D[$i+1],"esp"),$acc);
        }

        ######################################### first Rho step is special
        # The D[] values needed for the diagonal lanes A[i][i] are still in
        # registers, so they are taken from there rather than from the
        # stack slots.  Each entry is: row/column i, register holding D[i],
        # scratch register.  The bare [0] entry marks where the rotate-by-0
        # lane is handled in the sequence.
        foreach my $lane ([3,$C[2],$T[2]], [4,$T[1],$T[2]], [0],
                          [2,$C[1],$T[1]], [1,$T[0],$T[2]]) {
                my ($i,$d,$tmp) = @$lane;

                if ($i == 0) {                  # C[0] already holds D[0]
                        &pxor   ($C[0],&QWP($A[0][0],"esi"));
                        next;
                }

                my $rot = $rhotates[$i][$i];

                &movq   ($C[$i],&QWP($A[$i][$i],"esi"));
                &pxor   ($C[$i],$d);
                &movq   ($tmp,$C[$i]);
                &psrlq  ($C[$i],64-$rot);
                &psllq  ($tmp,$rot);
                &por    ($C[$i],$tmp);  # C[i] = ROL64(A[i][i] ^ D[i], rhotates[i][i])
        }
173
# Chi($y): emit the regular Chi step for output row $y.
#
# Reads C[0..4] from the register bank and stores
#       R[y][x] = C[x] ^ (~C[x+1] & C[x+2])     (indices mod 5)
# at displacement $A[$y][$x] from edi, cycling through T[0..2] as scratch.
# For $y == 0 the 64-bit constant at (ebx) is additionally XORed into lane
# 0 (Iota) and ebx is advanced by 8.  C[0..4] themselves are only read.
#
# No prototype on purpose: the sub takes one argument, and an empty "()"
# prototype would reject any call not written in the &-form.
sub Chi {                               ######### regular Chi step
    my $y = shift;

        foreach my $x (0..4) {
                my $tmp = $T[$x%3];

                &movq   ($tmp,$C[($x+1)%5]);
                &pandn  ($tmp,$C[($x+2)%5]);    # ~C[x+1] & C[x+2]
                &pxor   ($tmp,$C[$x]);
                if ($x == 0 && $y == 0) {       # ^ iotas[i], once per round
                        &pxor   ($tmp,&QWP(0,"ebx"));
                        &lea    ("ebx",&DWP(8,"ebx"));
                }
                &movq   (&QWP($A[$y][$x],"edi"),$tmp);
        }
}
        &Chi    (0);
205
# Rho($x): emit the regular Rho step that gathers one output row.
#
# For i = 0..4, with col = (x+i) mod 5, loads A[i][col] relative to esi,
# XORs in D[col] from its esp-relative slot and rotates, leaving
#       C[i] = ROL64(A[i][col] ^ D[col], rhotates[i][col])
# in the register bank.  T[0..2] are cycled through as scratch.
#
# No prototype on purpose: the sub takes one argument, and an empty "()"
# prototype would reject any call not written in the &-form.
sub Rho {                               ######### regular Rho step
    my $x = shift;

        foreach my $i (0..4) {
                my $col = ($x+$i)%5;
                my $rot = $rhotates[$i][$col];
                my $tmp = $T[$i%3];

                &movq   ($C[$i],&QWP($A[$i][$col],"esi"));
                &pxor   ($C[$i],&QWP($D[$col],"esp"));
                &movq   ($tmp,$C[$i]);
                &psrlq  ($C[$i],64-$rot);
                &psllq  ($tmp,$rot);
                &por    ($C[$i],$tmp);  # rotate left by $rot
        }
}
        # Rows 1..3 are regular Rho/Chi pairs; the Chi for row 4 is emitted
        # separately after the final Rho.
        &Rho    (3);    &Chi    (1);
        &Rho    (1);    &Chi    (2);
        &Rho    (4);    &Chi    (3);
        &Rho    (2);    #&Chi   (4);
248
249         &movq   (@T[0],@C[0]);          ######### last Chi(4) is special
250         &movq   (&QWP(@D[1],"esp"),@C[1]);
251
252         &movq   (@T[1],@C[1]);
253         &pandn  (@T[1],@C[2]);
254         &pxor   (@C[0],@T[1]);
255         &movq   (&QWP($A[4][0],"edi"),@C[0]);   # R[4][0] = C[0] ^= (~C[1] & C[2]);
256
257         &movq   (@T[2],@C[2]);
258         &pandn  (@T[2],@C[3]);
259         &pxor   (@C[1],@T[2]);
260         &movq   (&QWP($A[4][1],"edi"),@C[1]);   # R[4][1] = C[1] ^= (~C[2] & C[3]);
261
262         &movq   (@T[1],@C[3]);
263         &pandn  (@T[1],@C[4]);
264         &pxor   (@C[2],@T[1]);
265         &movq   (&QWP($A[4][2],"edi"),@C[2]);   # R[4][2] = C[2] ^= (~C[3] & C[4]);
266
267         &movq   (@T[2],@C[4]);
268         &pandn  (@T[2],@T[0]);
269         &pxor   (@C[3],@T[2]);
270         &movq   (&QWP($A[4][3],"edi"),@C[3]);   # R[4][3] = C[3] ^= (~C[4] & D[0]);
271
272         &pandn  (@T[0],&QWP(@D[1],"esp"));
273         &pxor   (@C[4],@T[0]);
274         &movq   (&QWP($A[4][4],"edi"),@C[4]);   # R[4][4] = C[4] ^= (~D[0] & D[1]);
275
276         &xchg   ("esi","edi");
277         &dec    ("ecx");
278         &jnz    (&label("loop"));
279
280         &lea    ("ebx",&DWP(-192,"ebx"));       # rewind iotas
281         &ret    ();
282 &function_end_B("_KeccakF1600");
283
# KeccakF1600: public entry point.  Takes the state pointer as its only
# stack argument, carves a scratch frame for D[5] and T[5][5] out of the
# stack and calls _KeccakF1600.
{
    my $bias    = 100;          # displacement bias built into @A
    my $d_bytes = 8*5;          # D[0..4] sit at the bottom of the frame
    my $frame   = 240;          # 5 D[] slots + 25 T[][] lanes, 8 bytes each

    &function_begin("KeccakF1600");
        &mov    ("esi",&wparam(0));             # A[][]
        &mov    ("ebp","esp");                  # esp is restored from ebp on exit
        &sub    ("esp",$frame);

        # call/pop to obtain the address of iotas in ebx
        &call   (&label("pic_point"));
    &set_label("pic_point");
        &blindpop("ebx");
        &lea    ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));

        &and    ("esp",-8);
        &lea    ("esi",&DWP($bias,"esi"));              # size optimization
        &lea    ("edi",&DWP($d_bytes+$bias,"esp"));     # size optimization

        &call   ("_KeccakF1600");

        &mov    ("esp","ebp");
        &emms   ();
    &function_end("KeccakF1600");
}
301
# SHA3_absorb: XOR input into the state one block at a time, calling
# _KeccakF1600 after every block.  Stack arguments, in order: A[][], inp,
# len, bsz.  Stops as soon as len < bsz and returns the remaining len in
# eax.
{
    my $bias     = 100;         # displacement bias built into @A
    my $bsz_slot = -4;          # ebp-relative home of bsz
    my $len_slot = -8;          # ebp-relative home of len
    my @arg_regs = ("esi","eax","ecx","edx");   # A[][], inp, len, bsz

    &function_begin("SHA3_absorb");
        foreach my $i (0..$#arg_regs) {
                &mov    ($arg_regs[$i],&wparam($i));
        }
        &mov    ("ebp","esp");
        &sub    ("esp",240+8);

        # call/pop to obtain the address of iotas in ebx
        &call   (&label("pic_point"));
    &set_label("pic_point");
        &blindpop("ebx");
        &lea    ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
        &and    ("esp",-8);

        &mov    ("edi","esi");                  # edi walks the unbiased state
        &lea    ("esi",&DWP($bias,"esi"));      # size optimization
        &mov    (&DWP($bsz_slot,"ebp"),"edx");  # save bsz
        &jmp    (&label("loop"));

    &set_label("loop",16);
        &cmp    ("ecx","edx");                  # len < bsz?
        &jc     (&label("absorbed"));

        &shr    ("edx",3);                      # bsz /= 8, count of 64-bit words
    &set_label("block");
        &movq   ("mm0",&QWP(0,"eax"));          # next input word
        &lea    ("eax",&DWP(8,"eax"));
        &pxor   ("mm0",&QWP(0,"edi"));          # ^ state word
        &lea    ("edi",&DWP(8,"edi"));
        &sub    ("ecx",8);                      # len -= 8
        &movq   (&QWP(-8,"edi"),"mm0");         # back into the state
        &dec    ("edx");                        # bsz--
        &jnz    (&label("block"));

        &lea    ("edi",&DWP(8*5+$bias,"esp"));  # size optimization
        &mov    (&DWP($len_slot,"ebp"),"ecx");  # save len
        &call   ("_KeccakF1600");
        &mov    ("ecx",&DWP($len_slot,"ebp"));  # pull len
        &mov    ("edx",&DWP($bsz_slot,"ebp"));  # pull bsz
        &lea    ("edi",&DWP(-$bias,"esi"));     # back to the start of the state
        &jmp    (&label("loop"));

    &set_label("absorbed",16);
        &mov    ("eax","ecx");                  # return value
        &mov    ("esp","ebp");
        &emms   ();
    &function_end("SHA3_absorb");
}
348
# SHA3_squeeze: copy len bytes of output from the state, 8 bytes at a
# time, calling _KeccakF1600 each time bsz bytes have been copied and more
# output is still wanted.  Stack arguments, in order: A[][], out, len, bsz.
# A final partial word (len < 8) is copied byte-wise.
{
    my $bias     = 100;         # displacement bias built into @A
    my $bsz_slot = -4;          # ebp-relative home of bsz/8
    my $len_slot = -8;          # ebp-relative home of len
    my @arg_regs = ("esi","eax","ecx","edx");   # A[][], out, len, bsz

    &function_begin("SHA3_squeeze");
        foreach my $i (0..$#arg_regs) {
                &mov    ($arg_regs[$i],&wparam($i));
        }
        &mov    ("ebp","esp");
        &sub    ("esp",240+8);

        # call/pop to obtain the address of iotas in ebx
        &call   (&label("pic_point"));
    &set_label("pic_point");
        &blindpop("ebx");
        &lea    ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
        &and    ("esp",-8);

        &shr    ("edx",3);                      # bsz /= 8
        &mov    ("edi","esi");                  # edi walks the unbiased state
        &lea    ("esi",&DWP($bias,"esi"));      # size optimization
        &mov    (&DWP($bsz_slot,"ebp"),"edx");  # save bsz
        &jmp    (&label("loop"));

    &set_label("loop",16);
        &cmp    ("ecx",8);                      # len < 8?
        &jc     (&label("tail"));

        &movq   ("mm0",&QWP(0,"edi"));          # next state word
        &lea    ("edi",&DWP(8,"edi"));
        &movq   (&QWP(0,"eax"),"mm0");          # out to the caller's buffer
        &lea    ("eax",&DWP(8,"eax"));
        &sub    ("ecx",8);                      # len -= 8
        &jz     (&label("done"));

        &dec    ("edx");                        # bsz--
        &jnz    (&label("loop"));

        &lea    ("edi",&DWP(8*5+$bias,"esp"));  # size optimization
        &mov    (&DWP($len_slot,"ebp"),"ecx");  # save len
        &call   ("_KeccakF1600");
        &mov    ("ecx",&DWP($len_slot,"ebp"));  # pull len
        &mov    ("edx",&DWP($bsz_slot,"ebp"));  # pull bsz
        &lea    ("edi",&DWP(-$bias,"esi"));     # back to the start of the state
        &jmp    (&label("loop"));

    &set_label("tail",16);
        &mov    ("esi","edi");                  # source: current state position
        &mov    ("edi","eax");                  # destination: out
        &data_word("0xA4F39066");               # rep movsb

    &set_label("done");
        &mov    ("esp","ebp");
        &emms   ();
    &function_end("SHA3_squeeze");
}
399
# Round constants, one 64-bit value per round, emitted as a pair of 32-bit
# words with the low word first.  _KeccakF1600 walks this table through
# ebx, 8 bytes per round.
&set_label("iotas",32);
foreach my $iota (
        [0x00000001,0x00000000], [0x00008082,0x00000000],
        [0x0000808a,0x80000000], [0x80008000,0x80000000],
        [0x0000808b,0x00000000], [0x80000001,0x00000000],
        [0x80008081,0x80000000], [0x00008009,0x80000000],
        [0x0000008a,0x00000000], [0x00000088,0x00000000],
        [0x80008009,0x00000000], [0x8000000a,0x00000000],
        [0x8000808b,0x00000000], [0x0000008b,0x80000000],
        [0x00008089,0x80000000], [0x00008003,0x80000000],
        [0x00008002,0x80000000], [0x00000080,0x80000000],
        [0x0000800a,0x00000000], [0x8000000a,0x80000000],
        [0x80008081,0x80000000], [0x00008080,0x80000000],
        [0x80000001,0x00000000], [0x80008008,0x80000000],
) {
        &data_word(@$iota);
}
&asciz("Keccak-1600 absorb and squeeze for MMX, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so a failed close must abort rather than leave a truncated file
# behind unnoticed.
close STDOUT or die "error closing STDOUT: $!";