84b1cbe85f59984d99e5ecda20b0449c057d1729
[openssl.git] / crypto / x86_64cpuid.pl
1 #!/usr/bin/env perl
2
# Command line: [flavour] output.  A single argument containing a dot is
# taken to be the output file name, leaving the flavour undefined.
3 $flavour = shift;
4 $output  = shift;
5 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
6
# Win64 argument registers are selected for nasm/masm/mingw64 flavours or
# when the output file name ends in ".asm".
7 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
8
# Everything printed to STDOUT from here on is piped through the perlasm
# translator located relative to this script's own directory.  Die when
# the pipe cannot be opened instead of carrying on with a dead STDOUT.
9 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
10 open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output" or die "can't call ${dir}perlasm/x86_64-xlate.pl: $!";
11
# Registers carrying the first four integer arguments under each ABI.
12 ($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
13                                  ("%rdi","%rsi","%rdx","%rcx"); # Unix order
14
# Start of the main heredoc: the text up to the closing ___ line is printed
# to STDOUT after Perl variable interpolation.
15 print<<___;
# Declare the external, hidden-visibility setup routine and emit a call to
# it into the .init section (presumably so it runs at load time -- confirm
# against the platform's startup code).
16 .extern         OPENSSL_cpuid_setup
17 .hidden         OPENSSL_cpuid_setup
18 .section        .init
19         call    OPENSSL_cpuid_setup
20
# 8-byte common symbol with hidden visibility (presumably the capability
# word produced by OPENSSL_ia32_cpuid -- confirm against the C side).
21 .hidden OPENSSL_ia32cap_P
22 .comm   OPENSSL_ia32cap_P,8
23
24 .text
25
# OPENSSL_atomic_add: atomically add the 32-bit value held in the second
# argument register to the 32-bit word addressed by the first argument
# register.  Returns the updated value, sign-extended to 64 bits, in %rax.
# Clobbers %r8 and flags.
26 .globl  OPENSSL_atomic_add
27 .type   OPENSSL_atomic_add,\@abi-omnipotent
28 .align  16
29 OPENSSL_atomic_add:
30         movl    ($arg1),%eax
# Retry loop: %r8 = expected old value + addend.  The locked cmpxchg stores
# %r8d only while memory still equals %eax; on a mismatch it reloads %eax
# with the current memory value and the sum is recomputed.
31 .Lspin: leaq    ($arg2,%rax),%r8
32         .byte   0xf0            # lock
33         cmpxchgl        %r8d,($arg1)
34         jne     .Lspin
35         movl    %r8d,%eax
36         .byte   0x48,0x98       # cltq/cdqe
37         ret
38 .size   OPENSSL_atomic_add,.-OPENSSL_atomic_add
39
# OPENSSL_rdtsc: return the 64-bit time-stamp counter in %rax.
# Takes no arguments; clobbers %rdx and flags.
40 .globl  OPENSSL_rdtsc
41 .type   OPENSSL_rdtsc,\@abi-omnipotent
42 .align  16
43 OPENSSL_rdtsc:
44         rdtsc
# rdtsc delivers the counter split across %edx (high) and %eax (low);
# merge the two halves into a single 64-bit value.
45         shl     \$32,%rdx
46         or      %rdx,%rax
47         ret
48 .size   OPENSSL_rdtsc,.-OPENSSL_rdtsc
49
# OPENSSL_ia32_cpuid: run CPUID and return a 64-bit capability word in %rax.
# Low 32 bits are derived from CPUID leaf 1 EDX, high 32 bits from leaf 1
# ECX; both are adjusted below (vendor-specific marker bits, the
# hyper-threading bit, the AMD XOP flag and the AVX-related bits).
# Takes no arguments.  %rbx is preserved via %r8.
# Register roles:
#   %r8   saved %rbx (cpuid overwrites %ebx)
#   %r11d highest standard CPUID leaf
#   %r9d  zero only for "GenuineIntel"; may also carry the XOP flag (bit 11);
#         from .Lgeneric on it holds the ECX half of the result
#   %r10d scratch: AMD vendor check, extended leaf limit, core counts, and
#         finally the EDX half of the result
50 .globl  OPENSSL_ia32_cpuid
51 .type   OPENSSL_ia32_cpuid,\@abi-omnipotent
52 .align  16
53 OPENSSL_ia32_cpuid:
54         mov     %rbx,%r8                # save %rbx
55
56         xor     %eax,%eax
57         cpuid
58         mov     %eax,%r11d              # max value for standard query level
59
# Compare the vendor string in %ebx:%edx:%ecx against "GenuineIntel",
# accumulating any mismatch into %r9d.
60         xor     %eax,%eax
61         cmp     \$0x756e6547,%ebx       # "Genu"
62         setne   %al
63         mov     %eax,%r9d
64         cmp     \$0x49656e69,%edx       # "ineI"
65         setne   %al
66         or      %eax,%r9d
67         cmp     \$0x6c65746e,%ecx       # "ntel"
68         setne   %al
69         or      %eax,%r9d               # 0 indicates Intel CPU
70         jz      .Lintel
71
# Not Intel: compare against "AuthenticAMD", accumulating into %r10d.
# Any other vendor takes the .Lintel path with %r9d left non-zero.
72         cmp     \$0x68747541,%ebx       # "Auth"
73         setne   %al
74         mov     %eax,%r10d
75         cmp     \$0x69746E65,%edx       # "enti"
76         setne   %al
77         or      %eax,%r10d
78         cmp     \$0x444D4163,%ecx       # "cAMD"
79         setne   %al
80         or      %eax,%r10d              # 0 indicates AMD CPU
81         jnz     .Lintel
82
83         # AMD specific
# Leaf 0x80000000 reports the highest extended leaf; fall back to the
# common path when the extended leaves used below are not available.
84         mov     \$0x80000000,%eax
85         cpuid
86         cmp     \$0x80000001,%eax
87         jb      .Lintel
88         mov     %eax,%r10d
89         mov     \$0x80000001,%eax
90         cpuid
# Merge extended ECX into %r9d and keep only bit 11 plus bit 0; bit 0 was
# set by the failed Intel comparison, so %r9d stays non-zero here.
91         or      %ecx,%r9d
92         and     \$0x00000801,%r9d       # isolate AMD XOP bit, 1<<11
93
94         cmp     \$0x80000008,%r10d
95         jb      .Lintel
96
97         mov     \$0x80000008,%eax
98         cpuid
99         movzb   %cl,%r10                # number of cores - 1
100         inc     %r10                    # number of cores
101
# AMD: leave the hyper-threading bit set only when leaf 1 reports more
# logical processors than the core count computed above.
102         mov     \$1,%eax
103         cpuid
104         bt      \$28,%edx               # test hyper-threading bit
105         jnc     .Lgeneric
106         shr     \$16,%ebx               # number of logical processors
107         cmp     %r10b,%bl
108         ja      .Lgeneric
109         and     \$0xefffffff,%edx       # ~(1<<28)
110         jmp     .Lgeneric
111
# Common path for Intel and every non-AMD vendor; AMD parts without the
# extended leaves queried above also land here.
# %r10d becomes the value taken from leaf 4 (sub-leaf 0) EAX bits 25:14,
# or -1 when leaf 4 is not supported.
112 .Lintel:
113         cmp     \$4,%r11d
114         mov     \$-1,%r10d
115         jb      .Lnocacheinfo
116
117         mov     \$4,%eax
118         mov     \$0,%ecx                # query L1D
119         cpuid
120         mov     %eax,%r10d
121         shr     \$14,%r10d
122         and     \$0xfff,%r10d           # number of cores -1 per L1D
123
124 .Lnocacheinfo:
125         mov     \$1,%eax
126         cpuid
# The marker bits below are applied only when %r9d is zero (Intel).
127         cmp     \$0,%r9d
128         jne     .Lnotintel
129         or      \$0x00100000,%edx       # use reserved 20th bit to engage RC4_CHAR
130         and     \$15,%ah
131         cmp     \$15,%ah                # examine Family ID
132         je      .Lnotintel
133         or      \$0x40000000,%edx       # use reserved bit to skip unrolled loop
# Hyper-threading bit: if the CPU reports it, keep it set only when %r10d
# is non-zero and leaf 1 reports more than one logical processor.
134 .Lnotintel:
135         bt      \$28,%edx               # test hyper-threading bit
136         jnc     .Lgeneric
137         and     \$0xefffffff,%edx       # ~(1<<28)
138         cmp     \$0,%r10d
139         je      .Lgeneric
140
141         or      \$0x10000000,%edx       # 1<<28
142         shr     \$16,%ebx
143         cmp     \$1,%bl                 # see if cache is shared
144         ja      .Lgeneric
145         and     \$0xefffffff,%edx       # ~(1<<28)
# Build the ECX half in %r9d: leaf 1 ECX with bit 11 replaced by the XOP
# flag collected earlier (zero unless the AMD branch set it).
146 .Lgeneric:
147         and     \$0x00000800,%r9d       # isolate AMD XOP flag
148         and     \$0xfffff7ff,%ecx
149         or      %ecx,%r9d               # merge AMD XOP flag
150
151         mov     %edx,%r10d              # %r9d:%r10d is copy of %ecx:%edx
# The AVX-related bits survive only if OSXSAVE is set and XCR0 shows both
# XMM and YMM state enabled; otherwise they are cleared below.
152         bt      \$27,%r9d               # check OSXSAVE bit
153         jnc     .Lclear_avx
154         xor     %ecx,%ecx               # XCR0
155         .byte   0x0f,0x01,0xd0          # xgetbv
156         and     \$6,%eax                # isolate XMM and YMM state support
157         cmp     \$6,%eax
158         je      .Ldone
159 .Lclear_avx:
160         mov     \$0xefffe7ff,%eax       # ~(1<<28|1<<12|1<<11)
161         and     %eax,%r9d               # clear AVX, FMA and AMD XOP bits
# Pack the result: %r9d goes to bits 63:32, %r10d to bits 31:0.
162 .Ldone:
163         shl     \$32,%r9
164         mov     %r10d,%eax
165         mov     %r8,%rbx                # restore %rbx
166         or      %r9,%rax
167         ret
168 .size   OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
169
# OPENSSL_cleanse: overwrite a buffer with zero bytes.
# First argument: buffer address.  Second argument: length in bytes.
# Lengths below 15 are cleared one byte at a time.  Longer buffers are
# cleared bytewise until the address is 8-byte aligned, then in 8-byte
# words, then bytewise for whatever is left.
# Both argument registers are advanced/consumed; %rax is zero on return.
170 .globl  OPENSSL_cleanse
171 .type   OPENSSL_cleanse,\@abi-omnipotent
172 .align  16
173 OPENSSL_cleanse:
174         xor     %rax,%rax
175         cmp     \$15,$arg2
176         jae     .Lot
177         cmp     \$0,$arg2
178         je      .Lret
# Byte loop.  The jnz consumes the flags of the sub; the lea in between
# advances the pointer without touching flags.
179 .Little:
180         mov     %al,($arg1)
181         sub     \$1,$arg2
182         lea     1($arg1),$arg1
183         jnz     .Little
184 .Lret:
185         ret
186 .align  16
# Long path: clear single bytes until the pointer is 8-byte aligned.  At
# most 7 bytes are consumed here, so at least 8 of the initial 15 or more
# remain when .Laligned is reached.
187 .Lot:
188         test    \$7,$arg1
189         jz      .Laligned
190         mov     %al,($arg1)
191         lea     -1($arg2),$arg2
192         lea     1($arg1),$arg1
193         jmp     .Lot
# Word loop: store 8 zero bytes per pass and repeat while the remaining
# length still has a bit set above the low three (8 or more bytes left).
194 .Laligned:
195         mov     %rax,($arg1)
196         lea     -8($arg2),$arg2
197         test    \$-8,$arg2
198         lea     8($arg1),$arg1
199         jnz     .Laligned
# Fewer than 8 bytes left: finish in the byte loop unless nothing remains.
200         cmp     \$0,$arg2
201         jne     .Little
202         ret
203 .size   OPENSSL_cleanse,.-OPENSSL_cleanse
204 ___
205
# Non-Win64 flavour of OPENSSL_wipe_cpu, emitted only when $win64 is false.
206 print<<___ if (!$win64);
# OPENSSL_wipe_cpu: zero %xmm0-%xmm15 and the integer registers %rcx, %rdx,
# %rsi, %rdi and %r8-%r11, then return %rsp+8 in %rax (the address just
# above this function's return address).  Takes no arguments.
207 .globl  OPENSSL_wipe_cpu
208 .type   OPENSSL_wipe_cpu,\@abi-omnipotent
209 .align  16
210 OPENSSL_wipe_cpu:
211         pxor    %xmm0,%xmm0
212         pxor    %xmm1,%xmm1
213         pxor    %xmm2,%xmm2
214         pxor    %xmm3,%xmm3
215         pxor    %xmm4,%xmm4
216         pxor    %xmm5,%xmm5
217         pxor    %xmm6,%xmm6
218         pxor    %xmm7,%xmm7
219         pxor    %xmm8,%xmm8
220         pxor    %xmm9,%xmm9
221         pxor    %xmm10,%xmm10
222         pxor    %xmm11,%xmm11
223         pxor    %xmm12,%xmm12
224         pxor    %xmm13,%xmm13
225         pxor    %xmm14,%xmm14
226         pxor    %xmm15,%xmm15
227         xorq    %rcx,%rcx
228         xorq    %rdx,%rdx
229         xorq    %rsi,%rsi
230         xorq    %rdi,%rdi
231         xorq    %r8,%r8
232         xorq    %r9,%r9
233         xorq    %r10,%r10
234         xorq    %r11,%r11
235         leaq    8(%rsp),%rax
236         ret
237 .size   OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
238 ___
# Win64 flavour of OPENSSL_wipe_cpu, emitted only when $win64 is true.
239 print<<___ if ($win64);
# OPENSSL_wipe_cpu: zero %xmm0-%xmm5 and the integer registers %rcx, %rdx
# and %r8-%r11, then return %rsp+8 in %rax (the address just above this
# function's return address).  Takes no arguments.
# %xmm6-%xmm15, %rsi and %rdi are left alone here, unlike the non-Win64
# flavour; they are callee-saved under the Microsoft x64 convention.
240 .globl  OPENSSL_wipe_cpu
241 .type   OPENSSL_wipe_cpu,\@abi-omnipotent
242 .align  16
243 OPENSSL_wipe_cpu:
244         pxor    %xmm0,%xmm0
245         pxor    %xmm1,%xmm1
246         pxor    %xmm2,%xmm2
247         pxor    %xmm3,%xmm3
248         pxor    %xmm4,%xmm4
249         pxor    %xmm5,%xmm5
250         xorq    %rcx,%rcx
251         xorq    %rdx,%rdx
252         xorq    %r8,%r8
253         xorq    %r9,%r9
254         xorq    %r10,%r10
255         xorq    %r11,%r11
256         leaq    8(%rsp),%rax
257         ret
258 .size   OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
259 ___
260 {
# Register assignment shared by OPENSSL_instrument_bus and
# OPENSSL_instrument_bus2.  All of these are volatile under both ABIs.
261 my $out="%r10";
262 my $cnt="%rcx";
263 my $max="%r11";
264 my $lasttick="%r8d";
265 my $lastdiff="%r9d";
# Offset off %rsp of the scratch slot in which OPENSSL_instrument_bus2
# keeps the initial count: 8(%rsp) on Win64 (caller-allocated home space
# above the return address), -8(%rsp) elsewhere (red zone of this leaf
# function).  This has to test $win64: the bareword win64 is an always-true
# string under non-strict Perl, which selected 8(%rsp) on every platform
# and made the Unix build overwrite the caller's stack frame.
266 my $redzone=$win64?8:-8;
267
268 print<<___;
# OPENSSL_instrument_bus: record time-stamp-counter deltas.
# First argument: address of an array of 32-bit slots.
# Second argument: number of slots; it must be non-zero because the loop
# decrements the counter before testing it.
# Every pass reads the TSC, flushes the cache line holding the current
# slot, adds the delta since the previous reading to the slot with a locked
# add, and moves on to the next slot.  Returns the slot count in %rax.
# Clobbers %rcx, %rdx, %r8-%r11 and flags.
269 .globl  OPENSSL_instrument_bus
270 .type   OPENSSL_instrument_bus,\@abi-omnipotent
271 .align  16
272 OPENSSL_instrument_bus:
273         mov     $arg1,$out      # tribute to Win64
274         mov     $arg2,$cnt
275         mov     $arg2,$max
276
277         rdtsc                   # collect 1st tick
278         mov     %eax,$lasttick  # lasttick = tick
279         mov     \$0,$lastdiff   # lastdiff = 0
280         clflush ($out)
281         .byte   0xf0            # lock
282         add     $lastdiff,($out)
283         jmp     .Loop
284 .align  16
285 .Loop:  rdtsc
286         mov     %eax,%edx
287         sub     $lasttick,%eax
288         mov     %edx,$lasttick
289         mov     %eax,$lastdiff
290         clflush ($out)
291         .byte   0xf0            # lock
292         add     %eax,($out)
293         lea     4($out),$out
294         sub     \$1,$cnt
295         jnz     .Loop
296
297         mov     $max,%rax
298         ret
299 .size   OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
300
# OPENSSL_instrument_bus2: like OPENSSL_instrument_bus, but the slot pointer
# only advances when a delta differs from the previous one.
# First argument: address of an array of 32-bit slots.
# Second argument: number of slots.
# Third argument: upper bound on the number of locked adds performed.
# Returns in %rax the initial slot count minus the count still outstanding
# when the loop ended.  The initial count is parked in the stack scratch
# slot chosen above.  Clobbers %rcx, %rdx, %r8-%r11 and flags.
301 .globl  OPENSSL_instrument_bus2
302 .type   OPENSSL_instrument_bus2,\@abi-omnipotent
303 .align  16
304 OPENSSL_instrument_bus2:
305         mov     $arg1,$out      # tribute to Win64
306         mov     $arg2,$cnt
307         mov     $arg3,$max
308         mov     $cnt,$redzone(%rsp)
309
310         rdtsc                   # collect 1st tick
311         mov     %eax,$lasttick  # lasttick = tick
312         mov     \$0,$lastdiff   # lastdiff = 0
313
314         clflush ($out)
315         .byte   0xf0            # lock
316         add     $lastdiff,($out)
317
318         rdtsc                   # collect 1st diff
319         mov     %eax,%edx
320         sub     $lasttick,%eax  # diff
321         mov     %edx,$lasttick  # lasttick = tick
322         mov     %eax,$lastdiff  # lastdiff = diff
323 .Loop2:
324         clflush ($out)
325         .byte   0xf0            # lock
326         add     %eax,($out)     # accumulate diff
327
328         sub     \$1,$max
329         jz      .Ldone2
330
331         rdtsc
332         mov     %eax,%edx
333         sub     $lasttick,%eax  # diff
334         mov     %edx,$lasttick  # lasttick = tick
335         cmp     $lastdiff,%eax
336         mov     %eax,$lastdiff  # lastdiff = diff
# %rdx becomes 1 when the new diff differs from the previous one, else 0.
# mov and setne leave the flags of the cmp untouched, and the lea below
# leaves the flags of the sub untouched for the closing jnz.
337         mov     \$0,%edx
338         setne   %dl
339         sub     %rdx,$cnt       # conditional --$cnt
340         lea     ($out,%rdx,4),$out      # conditional ++$out
341         jnz     .Loop2
342
343 .Ldone2:
344         mov     $redzone(%rsp),%rax
345         sub     $cnt,%rax
346         ret
347 .size   OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
348 ___
349 }
350
# Closing the pipe flushes the generated text and waits for the translator;
# a false result means the write or the translator itself failed, so stop
# with an error rather than exiting successfully.
351 close STDOUT or die "error closing STDOUT: $!";   # flush