x86[_64]cpuid.pl: handle new extensions.
[openssl.git] / crypto / x86_64cpuid.pl
#!/usr/bin/env perl
#
# Generator for the x86_64 CPU identification and utility routines.
#
# Usage: <script> [flavour] output
# A first argument that contains a dot is taken as the output file name,
# in which case no flavour is passed on to the translator.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 argument order is selected by the flavour or by a .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Everything printed to STDOUT from here on is piped through the perlasm
# translator found in perlasm/ relative to this script's own directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
$xlate="${dir}perlasm/x86_64-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output"
	or die "can't call $xlate: $!";	# do not keep going without a translator

# Registers carrying the first four integer arguments.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
				 ("%rdi","%rsi","%rdx","%rcx"); # Unix order

# Start of the main assembly template.  It first places a call to
# OPENSSL_cpuid_setup into the .init section and then switches to .text
# for the routines that follow.
print<<___;
.extern		OPENSSL_cpuid_setup
.section	.init
	call	OPENSSL_cpuid_setup

.text

# OPENSSL_atomic_add: atomically add the second argument to the 32-bit
# value addressed by the first argument and return the new value,
# sign-extended to 64 bits, in %rax.
# NOTE(review): presumably int OPENSSL_atomic_add(int *val, int amount);
# confirm against the C prototype.
# Clobbers: %rax, %r8, flags.
.globl	OPENSSL_atomic_add
.type	OPENSSL_atomic_add,\@abi-omnipotent
.align	16
OPENSSL_atomic_add:
	movl	($arg1),%eax		# expected old value
.Latomic_retry:
	leaq	($arg2,%rax),%r8	# candidate new value = old + amount
	.byte	0xf0			# lock
	cmpxchgl	%r8d,($arg1)	# store if memory still equals %eax
	jne	.Latomic_retry		# lost the race, %eax was reloaded
	movl	%r8d,%eax		# return the value that was stored
	.byte	0x48,0x98		# cltq/cdqe
	ret
.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add

# OPENSSL_rdtsc: return the time-stamp counter as one 64-bit value in %rax.
# Takes no arguments.  Clobbers: %rdx, flags.
.globl	OPENSSL_rdtsc
.type	OPENSSL_rdtsc,\@abi-omnipotent
.align	16
OPENSSL_rdtsc:
	rdtsc				# counter arrives split as %edx:%eax
	shl	\$32,%rdx		# move the high half into bits 32-63
	or	%rdx,%rax		# and merge it with the low half
	ret
.size	OPENSSL_rdtsc,.-OPENSSL_rdtsc

# OPENSSL_ia32_cpuid: query CPUID and return the capability vector in %rax.
# Bits 0-31 hold cpuid leaf 1 %edx and bits 32-63 hold leaf 1 %ecx, with
# these adjustments made below:
#   - the hyper-threading bit (1<<28 of %edx) is cleared when the core and
#     cache topology suggests that logical processors do not share a core;
#   - on Intel, reserved %edx bits 20 and 30 are set as tuning hints
#     (bit 30 only when the family ID is not 15);
#   - bit 11 of %ecx is replaced by the AMD XOP flag;
#   - AVX, FMA and XOP are cleared unless OSXSAVE is set and XCR0 reports
#     both XMM and YMM state as enabled.
# Takes no arguments.
#
# Register roles:
#   %r8    saved %rbx (cpuid overwrites %ebx)
#   %r9d   0 on GenuineIntel, non-zero otherwise; on AMD it also carries
#          the XOP flag (1<<11)
#   %r10d  core-count information used for the hyper-threading decision
#   %r11d  highest standard cpuid leaf
# Clobbers: %rcx, %rdx, %r8-%r11, flags.
.globl	OPENSSL_ia32_cpuid
.type	OPENSSL_ia32_cpuid,\@abi-omnipotent
.align	16
OPENSSL_ia32_cpuid:
	mov	%rbx,%r8		# save %rbx

	xor	%eax,%eax
	cpuid
	mov	%eax,%r11d		# max value for standard query level

	xor	%eax,%eax
	cmp	\$0x756e6547,%ebx	# "Genu"
	setne	%al
	mov	%eax,%r9d
	cmp	\$0x49656e69,%edx	# "ineI"
	setne	%al
	or	%eax,%r9d
	cmp	\$0x6c65746e,%ecx	# "ntel"
	setne	%al
	or	%eax,%r9d		# 0 indicates Intel CPU
	jz	.Lintel

	cmp	\$0x68747541,%ebx	# "Auth"
	setne	%al
	mov	%eax,%r10d
	cmp	\$0x69746E65,%edx	# "enti"
	setne	%al
	or	%eax,%r10d
	cmp	\$0x444D4163,%ecx	# "cAMD"
	setne	%al
	or	%eax,%r10d		# 0 indicates AMD CPU
	jnz	.Lintel			# neither vendor, take the common path

	# AMD specific
	mov	\$0x80000000,%eax
	cpuid
	cmp	\$0x80000001,%eax
	jb	.Lintel
	mov	%eax,%r10d		# max value for extended query level
	mov	\$0x80000001,%eax
	cpuid
	or	%ecx,%r9d
	and	\$0x00000801,%r9d	# keep AMD XOP bit, 1<<11, and the
					# non-Intel flag in bit 0

	cmp	\$0x80000008,%r10d
	jb	.Lintel

	mov	\$0x80000008,%eax
	cpuid
	movzb	%cl,%r10		# number of cores - 1
	inc	%r10			# number of cores

	mov	\$1,%eax
	cpuid
	bt	\$28,%edx		# test hyper-threading bit
	jnc	.Lgeneric
	shr	\$16,%ebx		# number of logical processors
	cmp	%r10b,%bl
	ja	.Lgeneric		# more logical processors than cores
	and	\$0xefffffff,%edx	# ~(1<<28)
	jmp	.Lgeneric

.Lintel:
	cmp	\$4,%r11d
	mov	\$-1,%r10d		# default when leaf 4 is unavailable
	jb	.Lnocacheinfo		# uses the flags of the cmp above

	mov	\$4,%eax
	mov	\$0,%ecx		# query L1D
	cpuid
	mov	%eax,%r10d
	shr	\$14,%r10d
	and	\$0xfff,%r10d		# number of cores -1 per L1D

.Lnocacheinfo:
	mov	\$1,%eax
	cpuid
	cmp	\$0,%r9d
	jne	.Lnotintel
	or	\$0x00100000,%edx	# use reserved 20th bit to engage RC4_CHAR
	and	\$15,%ah
	cmp	\$15,%ah		# examine Family ID
	je	.Lnotintel
	or	\$0x40000000,%edx	# use reserved bit to skip unrolled loop
.Lnotintel:
	bt	\$28,%edx		# test hyper-threading bit
	jnc	.Lgeneric
	and	\$0xefffffff,%edx	# ~(1<<28)
	cmp	\$0,%r10d
	je	.Lgeneric

	or	\$0x10000000,%edx	# 1<<28
	shr	\$16,%ebx
	cmp	\$1,%bl			# see if cache is shared
	ja	.Lgeneric
	and	\$0xefffffff,%edx	# ~(1<<28)
.Lgeneric:
	and	\$0x00000800,%r9d	# isolate AMD XOP flag
	and	\$0xfffff7ff,%ecx
	or	%r9d,%ecx		# merge AMD XOP flag

	shl	\$32,%rcx
	mov	%edx,%ebx
	or	%rcx,%rbx		# compose capability vector in %rbx
	bt	\$27+32,%rcx		# check OSXSAVE bit
	jnc	.Lclear_avx
	xor	%ecx,%ecx		# XCR0
	.byte	0x0f,0x01,0xd0		# xgetbv
	and	\$6,%eax		# isolate XMM and YMM state support
	cmp	\$6,%eax
	je	.Ldone
.Lclear_avx:
	# Build the mask from its complement so that the low 32 bits, the
	# leaf 1 %edx half of the vector, stay all ones.  Shifting the
	# inverted constant itself would leave them zero and the and below
	# would then wipe every %edx feature bit.
	mov	\$0x10001800,%eax	# 1<<28|1<<12|1<<11
	shl	\$32,%rax
	not	%rax			# ones everywhere but AVX, FMA, XOP
	and	%rax,%rbx		# clear AVX, FMA and AMD XOP bits
.Ldone:
	mov	%rbx,%rax
	mov	%r8,%rbx		# restore %rbx
	ret
.size	OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid

# OPENSSL_cleanse: store zero bytes over the buffer whose address is in the
# first argument and whose byte count is in the second argument.
# NOTE(review): presumably void OPENSSL_cleanse(void *ptr, size_t len);
# confirm against the C prototype.
# Counts below 15 are written one byte at a time.  Larger buffers are first
# advanced byte-wise to an 8-byte boundary, then written 8 bytes at a time,
# and any remainder is finished by the byte loop.
# Clobbers: %rax, both argument registers, flags.
.globl	OPENSSL_cleanse
.type	OPENSSL_cleanse,\@abi-omnipotent
.align	16
OPENSSL_cleanse:
	xor	%rax,%rax		# the zero that gets stored
	cmp	\$15,$arg2
	jae	.Lcleanse_bulk
	cmp	\$0,$arg2
	je	.Lcleanse_done		# nothing to do for an empty buffer
.Lcleanse_bytes:
	mov	%al,($arg1)
	lea	1($arg1),$arg1		# lea leaves the flags alone
	sub	\$1,$arg2
	jnz	.Lcleanse_bytes
.Lcleanse_done:
	ret
.align	16
.Lcleanse_bulk:
	test	\$7,$arg1
	jz	.Lcleanse_qwords	# pointer is 8-byte aligned
	mov	%al,($arg1)
	lea	1($arg1),$arg1
	lea	-1($arg2),$arg2
	jmp	.Lcleanse_bulk
.Lcleanse_qwords:
	mov	%rax,($arg1)
	lea	8($arg1),$arg1
	lea	-8($arg2),$arg2
	test	\$-8,$arg2		# at least 8 bytes left?
	jnz	.Lcleanse_qwords
	cmp	\$0,$arg2
	jne	.Lcleanse_bytes		# finish the tail byte by byte
	ret
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
___

# OPENSSL_wipe_cpu, flavour for everything but Win64: zero %xmm0-%xmm15 and
# the integer registers %rcx, %rdx, %rsi, %rdi and %r8-%r11, then return
# the address 8 bytes above the stack pointer at entry in %rax.
if (!$win64) {
my $wipe=join("\n",
	map("\tpxor\t%xmm$_,%xmm$_",0..15),
	map("\txorq\t%$_,%$_",qw(rcx rdx rsi rdi r8 r9 r10 r11)));
print<<___;
.globl	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,\@abi-omnipotent
.align	16
OPENSSL_wipe_cpu:
$wipe
	leaq	8(%rsp),%rax
	ret
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
___
}
# OPENSSL_wipe_cpu, Win64 flavour: zero only %xmm0-%xmm5 and the integer
# registers %rcx, %rdx and %r8-%r11, then return the address 8 bytes above
# the stack pointer at entry in %rax.  Unlike the other flavour it leaves
# %xmm6-%xmm15, %rsi and %rdi untouched.
if ($win64) {
my $wipe=join("\n",
	map("\tpxor\t%xmm$_,%xmm$_",0..5),
	map("\txorq\t%$_,%$_",qw(rcx rdx r8 r9 r10 r11)));
print<<___;
.globl	OPENSSL_wipe_cpu
.type	OPENSSL_wipe_cpu,\@abi-omnipotent
.align	16
OPENSSL_wipe_cpu:
$wipe
	leaq	8(%rsp),%rax
	ret
.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
___
}
{
# Register assignment shared by the two bus-instrumentation routines.
my $out="%r10";		# address of the current 32-bit output slot
my $cnt="%rcx";		# output slots still to be processed
my $max="%r11";		# return value (bus) or probe budget (bus2)
my $lasttick="%r8d";	# low 32 bits of the previous time-stamp reading
my $lastdiff="%r9d";	# previous tick difference
# Stack slot where OPENSSL_instrument_bus2 parks its original count.
# Win64 uses 8(%rsp), the first slot above the return address; everything
# else uses -8(%rsp), just below the stack pointer.  This has to test the
# $win64 variable: the bareword win64 is an always-true string, which made
# non-Win64 builds write to 8(%rsp), inside the caller's frame.
# NOTE(review): relies on the Win64 caller reserving the slot above the
# return address and on a red zone being available elsewhere.
my $redzone=$win64?8:-8;

print<<___;
# OPENSSL_instrument_bus: for each of the 32-bit slots starting at the
# address in the first argument, with the slot count in the second
# argument, read the time-stamp counter, flush the slot's cache line and
# lock-add the tick difference since the previous reading to the slot.
# Returns the slot count in %rax.
# NOTE(review): a count of zero is not checked for and would wrap the
# loop counter.
.globl	OPENSSL_instrument_bus
.type	OPENSSL_instrument_bus,\@abi-omnipotent
.align	16
OPENSSL_instrument_bus:
	mov	$arg1,$out	# tribute to Win64
	mov	$arg2,$cnt
	mov	$arg2,$max	# kept as the return value

	rdtsc			# collect 1st tick
	mov	%eax,$lasttick	# lasttick = tick
	mov	\$0,$lastdiff	# lastdiff = 0
	clflush	($out)
	.byte	0xf0		# lock
	add	$lastdiff,($out)
	jmp	.Loop
.align	16
.Loop:	rdtsc
	mov	%eax,%edx
	sub	$lasttick,%eax	# diff = tick - lasttick
	mov	%edx,$lasttick	# lasttick = tick
	mov	%eax,$lastdiff	# lastdiff = diff
	clflush	($out)
	.byte	0xf0		# lock
	add	%eax,($out)	# accumulate diff
	lea	4($out),$out	# next slot, flags untouched
	sub	\$1,$cnt
	jnz	.Loop

	mov	$max,%rax
	ret
.size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus

# OPENSSL_instrument_bus2: like OPENSSL_instrument_bus, but the output
# slot only advances, and the slot count only drops, when a tick
# difference differs from the previous one.  The third argument bounds the
# number of accumulations.  Returns in %rax the original slot count minus
# the count still outstanding when the loop ended.
.globl	OPENSSL_instrument_bus2
.type	OPENSSL_instrument_bus2,\@abi-omnipotent
.align	16
OPENSSL_instrument_bus2:
	mov	$arg1,$out	# tribute to Win64
	mov	$arg2,$cnt
	mov	$arg3,$max
	mov	$cnt,$redzone(%rsp)	# remember the original count

	rdtsc			# collect 1st tick
	mov	%eax,$lasttick	# lasttick = tick
	mov	\$0,$lastdiff	# lastdiff = 0

	clflush	($out)
	.byte	0xf0		# lock
	add	$lastdiff,($out)

	rdtsc			# collect 1st diff
	mov	%eax,%edx
	sub	$lasttick,%eax	# diff
	mov	%edx,$lasttick	# lasttick = tick
	mov	%eax,$lastdiff	# lastdiff = diff
.Loop2:
	clflush	($out)
	.byte	0xf0		# lock
	add	%eax,($out)	# accumulate diff

	sub	\$1,$max
	jz	.Ldone2		# probe budget used up

	rdtsc
	mov	%eax,%edx
	sub	$lasttick,%eax	# diff
	mov	%edx,$lasttick	# lasttick = tick
	cmp	$lastdiff,%eax
	mov	%eax,$lastdiff	# lastdiff = diff
	mov	\$0,%edx	# mov keeps the flags of the cmp
	setne	%dl		# 1 if diff changed, else 0
	sub	%rdx,$cnt	# conditional --$cnt
	lea	($out,%rdx,4),$out	# conditional ++$out
	jnz	.Loop2		# tests the result of the sub above

.Ldone2:
	mov	$redzone(%rsp),%rax
	sub	$cnt,%rax	# original count minus what is left
	ret
.size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
___
}

# Flush the generated code.  STDOUT is a pipe to the translator, so close
# also waits for it and returns false if the translator failed.
close STDOUT or die "error closing STDOUT: $! (status $?)";