Skip to content

Commit

Permalink
x86[_64]cpuid.pl: harmonize OPENSSL_ia32_cpuid [from HEAD].
Browse files Browse the repository at this point in the history
  • Loading branch information
Andy Polyakov committed Jun 28, 2011
1 parent 4a46dc6 commit 10fd0b7
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 34 deletions.
70 changes: 52 additions & 18 deletions crypto/x86_64cpuid.pl
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,24 @@
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order

if ($win64) { $arg1="%rcx"; $arg2="%rdx"; }
else { $arg1="%rdi"; $arg2="%rsi"; }
print<<___;
.extern OPENSSL_cpuid_setup
.hidden OPENSSL_cpuid_setup
.section .init
call OPENSSL_cpuid_setup
.hidden OPENSSL_ia32cap_P
.comm OPENSSL_ia32cap_P,8
.text
.globl OPENSSL_atomic_add
Expand Down Expand Up @@ -46,7 +55,7 @@
.type OPENSSL_ia32_cpuid,\@abi-omnipotent
.align 16
OPENSSL_ia32_cpuid:
mov %rbx,%r8
mov %rbx,%r8 # save %rbx
xor %eax,%eax
cpuid
Expand Down Expand Up @@ -78,7 +87,15 @@
# AMD specific
mov \$0x80000000,%eax
cpuid
cmp \$0x80000008,%eax
cmp \$0x80000001,%eax
jb .Lintel
mov %eax,%r10d
mov \$0x80000001,%eax
cpuid
or %ecx,%r9d
and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11
cmp \$0x80000008,%r10d
jb .Lintel
mov \$0x80000008,%eax
Expand All @@ -89,12 +106,12 @@
mov \$1,%eax
cpuid
bt \$28,%edx # test hyper-threading bit
jnc .Ldone
jnc .Lgeneric
shr \$16,%ebx # number of logical processors
cmp %r10b,%bl
ja .Ldone
ja .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
jmp .Ldone
jmp .Lgeneric
.Lintel:
cmp \$4,%r11d
Expand All @@ -111,30 +128,47 @@
.Lnocacheinfo:
mov \$1,%eax
cpuid
and \$0xbfefffff,%edx # force reserved bits to 0
cmp \$0,%r9d
jne .Lnotintel
or \$0x00100000,%edx # use reserved 20th bit to engage RC4_CHAR
or \$0x40000000,%edx # set reserved bit#30 on Intel CPUs
and \$15,%ah
cmp \$15,%ah # examine Family ID
je .Lnotintel
or \$0x40000000,%edx # use reserved bit to skip unrolled loop
jne .Lnotintel
or \$0x00100000,%edx # set reserved bit#20 to engage RC4_CHAR
.Lnotintel:
bt \$28,%edx # test hyper-threading bit
jnc .Ldone
jnc .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
cmp \$0,%r10d
je .Ldone
je .Lgeneric
or \$0x10000000,%edx # 1<<28
shr \$16,%ebx
cmp \$1,%bl # see if cache is shared
ja .Ldone
ja .Lgeneric
and \$0xefffffff,%edx # ~(1<<28)
.Lgeneric:
and \$0x00000800,%r9d # isolate AMD XOP flag
and \$0xfffff7ff,%ecx
or %ecx,%r9d # merge AMD XOP flag
mov %edx,%r10d # %r9d:%r10d is copy of %ecx:%edx
bt \$27,%r9d # check OSXSAVE bit
jnc .Lclear_avx
xor %ecx,%ecx # XCR0
.byte 0x0f,0x01,0xd0 # xgetbv
and \$6,%eax # isolate XMM and YMM state support
cmp \$6,%eax
je .Ldone
.Lclear_avx:
mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11)
and %eax,%r9d # clear AVX, FMA and AMD XOP bits
.Ldone:
shl \$32,%rcx
mov %edx,%eax
mov %r8,%rbx
or %rcx,%rax
shl \$32,%r9
mov %r10d,%eax
mov %r8,%rbx # restore %rbx
or %r9,%rax
ret
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
Expand Down
65 changes: 49 additions & 16 deletions crypto/x86cpuid.pl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
&pop ("eax");
&xor ("ecx","eax");
&bt ("ecx",21);
&jnc (&label("done"));
&jnc (&label("generic"));
&xor ("eax","eax");
&cpuid ();
&mov ("edi","eax"); # max value for standard query level
Expand Down Expand Up @@ -51,7 +51,14 @@
# AMD specific
&mov ("eax",0x80000000);
&cpuid ();
&cmp ("eax",0x80000008);
&cmp ("eax",0x80000001);
&jb (&label("intel"));
&mov ("esi","eax");
&mov ("eax",0x80000001);
&cpuid ();
&or ("ebp","ecx");
&and ("ebp",1<<11|1); # isolate XOP bit
&cmp ("esi",0x80000008);
&jb (&label("intel"));

&mov ("eax",0x80000008);
Expand All @@ -62,13 +69,13 @@
&mov ("eax",1);
&cpuid ();
&bt ("edx",28);
&jnc (&label("done"));
&jnc (&label("generic"));
&shr ("ebx",16);
&and ("ebx",0xff);
&cmp ("ebx","esi");
&ja (&label("done"));
&ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit
&jmp (&label("done"));
&jmp (&label("generic"));

&set_label("intel");
&cmp ("edi",4);
Expand All @@ -85,27 +92,52 @@
&set_label("nocacheinfo");
&mov ("eax",1);
&cpuid ();
&and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0
&cmp ("ebp",0);
&jne (&label("notP4"));
&jne (&label("notintel"));
&or ("edx",1<<30); # set reserved bit#30 on Intel CPUs
&and (&HB("eax"),15); # family ID
&cmp (&HB("eax"),15); # P4?
&jne (&label("notP4"));
&or ("edx",1<<20); # use reserved bit to engage RC4_CHAR
&set_label("notP4");
&jne (&label("notintel"));
&or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR
&set_label("notintel");
&bt ("edx",28); # test hyper-threading bit
&jnc (&label("done"));
&jnc (&label("generic"));
&and ("edx",0xefffffff);
&cmp ("edi",0);
&je (&label("done"));
&je (&label("generic"));

&or ("edx",0x10000000);
&shr ("ebx",16);
&cmp (&LB("ebx"),1);
&ja (&label("done"));
&ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit if low byte of ebx is not above 1

&set_label("generic");
&and ("ebp",1<<11); # isolate AMD XOP flag
&and ("ecx",0xfffff7ff); # force 11th bit to 0
&mov ("esi","edx");
&or ("ebp","ecx"); # merge AMD XOP flag

&bt ("ecx",26); # check XSAVE bit
&jnc (&label("done"));
&bt ("ecx",27); # check OSXSAVE bit
&jnc (&label("clear_xmm"));
&xor ("ecx","ecx");
&data_byte(0x0f,0x01,0xd0); # xgetbv
&and ("eax",6);
&cmp ("eax",6);
&je (&label("done"));
&cmp ("eax",2);
&je (&label("clear_avx"));
&set_label("clear_xmm");
&and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits
&and ("esi",0xfeffffff); # clear FXSR
&set_label("clear_avx");
&and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits
&set_label("done");
&mov ("eax","edx");
&mov ("edx","ecx");
&mov ("eax","esi");
&mov ("edx","ebp");
&function_end("OPENSSL_ia32_cpuid");

&external_label("OPENSSL_ia32cap_P");
Expand Down Expand Up @@ -199,8 +231,9 @@
&bt (&DWP(0,"ecx"),1);
&jnc (&label("no_x87"));
if ($sse2) {
&bt (&DWP(0,"ecx"),26);
&jnc (&label("no_sse2"));
&and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits
&cmp ("ecx",1<<26|1<<24);
&jne (&label("no_sse2"));
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
Expand Down

0 comments on commit 10fd0b7

Please sign in to comment.