Cpuid modules updates.
author Andy Polyakov <appro@openssl.org>
Tue, 3 May 2005 21:05:06 +0000 (21:05 +0000)
committer Andy Polyakov <appro@openssl.org>
Tue, 3 May 2005 21:05:06 +0000 (21:05 +0000)
crypto/ia64cpuid.S
crypto/sparccpuid.S [new file with mode: 0644]
crypto/x86cpuid.pl

index a800527..04fbb34 100644 (file)
@@ -4,6 +4,118 @@
 .global        OPENSSL_rdtsc#
 .proc  OPENSSL_rdtsc#
 OPENSSL_rdtsc:
-       mov     r8=ar.itc
-       br.ret  b0
+{ .mib;        mov                     r8=ar.itc       // r8 = current value of the ar.itc application register
+       br.ret.sptk.many        b0              };;     // return through b0 with the counter in r8
 .endp   OPENSSL_rdtsc#
+
+.global        OPENSSL_atomic_add#
+.proc  OPENSSL_atomic_add#
+.align 32
+OPENSSL_atomic_add:
+{ .mii;        ld4             r2=[r32]        // r2 = 32-bit value currently at the address in r32
+       nop.i           0
+       nop.i           0               };;
+.Lspin:
+{ .mii;        mov             ar.ccv=r2       // value cmpxchg4 must find in memory for the store to happen
+       add             r8=r2,r33       // r8 = candidate new value: observed value + r33
+       mov             r3=r2           };;     // r3 = copy of the expected old value
+{ .mmi;        mf
+       cmpxchg4.acq    r2=[r32],r8,ar.ccv      // r2 = value actually found in memory
+       nop.i           0               };;
+{ .mib;        cmp.ne          p6,p0=r2,r3     // p6 set when memory no longer held the expected value
+       nop.i           0
+(p6)   br.dpnt         .Lspin          };;     // retry, using the freshly observed value left in r2
+{ .mib;        nop.m           0
+       sxt4            r8=r8           // sign-extend the 32-bit new value that is returned in r8
+       br.ret.sptk.many        b0      };;
+.endp  OPENSSL_atomic_add#
+
+// Returns a structure comprising pointer to the top of stack of
+// the caller and pointer beyond backing storage for the current
+// register frame. The latter is required, because it might be
+// insufficient to wipe backing storage for the current frame
+// (as this procedure does), one might have to go further, toward
+// higher addresses to reach for whole "retroactively" saved
+// context...
+.global        OPENSSL_wipe_cpu#
+.proc  OPENSSL_wipe_cpu#
+.align 32
+OPENSSL_wipe_cpu:
+       .prologue
+       .fframe 0
+       .save   ar.pfs,r2
+       .save   ar.lc,r3
+{ .mib;        alloc           r2=ar.pfs,0,96,0,96     // 96 local registers, all 96 rotating; previous ar.pfs kept in r2
+       mov             r3=ar.lc        // keep the loop counter so it can be restored before returning
+       brp.loop.imp    .L_wipe_top,.L_wipe_end-16
+                                       };;
+{ .mii;        mov             r9=ar.bsp       // r9 = backing store pointer
+       mov             r8=pr           // r8 = predicate registers, restored before returning
+       mov             ar.lc=96        };;
+       .body
+{ .mii;        add             r9=96*8-8,r9    // start the sweep 96*8-8 bytes above ar.bsp
+       mov             ar.ec=1         };;
+
+// One can sweep twice as fast, but then we can't guarantee
+// that backing storage is wiped...
+.L_wipe_top:
+{ .mfi;        st8             [r9]=r0,-8      // zero one 8-byte slot, then step r9 down by 8
+       mov             f127=f0         // zero f127; br.ctop below rotates the register files
+       mov             r127=r0         }       // zero r127, likewise renamed on each pass
+{ .mfb;        nop.m           0
+       nop.f           0
+       br.ctop.sptk    .L_wipe_top     };;
+.L_wipe_end:
+
+{ .mfi;        mov             r11=r0          // from here on: straight-line zeroing of r11, r14-r31 and f6-f31
+       mov             f6=f0
+       mov             r14=r0          }
+{ .mfi;        mov             r15=r0
+       mov             f7=f0
+       mov             r16=r0          }
+{ .mfi;        mov             r17=r0
+       mov             f8=f0
+       mov             r18=r0          }
+{ .mfi;        mov             r19=r0
+       mov             f9=f0
+       mov             r20=r0          }
+{ .mfi;        mov             r21=r0
+       mov             f10=f0
+       mov             r22=r0          }
+{ .mfi;        mov             r23=r0
+       mov             f11=f0
+       mov             r24=r0          }
+{ .mfi;        mov             r25=r0
+       mov             f12=f0
+       mov             r26=r0          }
+{ .mfi;        mov             r27=r0
+       mov             f13=f0
+       mov             r28=r0          }
+{ .mfi;        mov             r29=r0
+       mov             f14=f0
+       mov             r30=r0          }
+{ .mfi;        mov             r31=r0
+       mov             f15=f0
+       nop.i           0               }
+{ .mfi;        mov             f16=f0          }
+{ .mfi;        mov             f17=f0          }
+{ .mfi;        mov             f18=f0          }
+{ .mfi;        mov             f19=f0          }
+{ .mfi;        mov             f20=f0          }
+{ .mfi;        mov             f21=f0          }
+{ .mfi;        mov             f22=f0          }
+{ .mfi;        mov             f23=f0          }
+{ .mfi;        mov             f24=f0          }
+{ .mfi;        mov             f25=f0          }
+{ .mfi;        mov             f26=f0          }
+{ .mfi;        mov             f27=f0          }
+{ .mfi;        mov             f28=f0          }
+{ .mfi;        mov             f29=f0          }
+{ .mfi;        mov             f30=f0          }
+{ .mfi;        add             r9=96*8+8,r9    // r9 = second returned member: pointer beyond the wiped backing store
+       mov             f31=f0
+       mov             pr=r8,0x1ffff   }       // restore the predicates saved in r8 (mask 0x1ffff)
+{ .mib;        mov             r8=sp           // r8 = first returned member: the stack pointer
+       mov             ar.lc=r3        // restore the loop counter saved on entry
+       br.ret.sptk     b0              };;
+.endp  OPENSSL_wipe_cpu#
diff --git a/crypto/sparccpuid.S b/crypto/sparccpuid.S
new file mode 100644 (file)
index 0000000..c17350f
--- /dev/null
@@ -0,0 +1,239 @@
+#if defined(__SUNPRO_C) && defined(__sparcv9)
+# define ABI64  /* They've said -xarch=v9 at command line */
+#elif defined(__GNUC__) && defined(__arch64__)
+# define ABI64  /* They've said -m64 at command line */
+#endif
+
+#ifdef ABI64
+  .register    %g2,#scratch
+  .register    %g3,#scratch
+# define       FRAME   -192    /* %sp adjustment used by the save instructions below */
+# define       BIAS    2047    /* offset added to %sp/%fp to form a stack address */
+#else
+# define       FRAME   -96     /* %sp adjustment used by the save instructions below */
+# define       BIAS    0       /* no offset between %sp/%fp and the stack address */
+#endif
+
+.text
+.align 32
+.global        OPENSSL_wipe_cpu
+.type  OPENSSL_wipe_cpu,#function
+! Keep in mind that this does not excuse us from wiping the stack!
+! This routine wipes registers, but not the backing store [which
+! resides on the stack, toward lower addresses]. To facilitate for
+! stack wiping I return pointer to the top of stack of the *caller*.
+OPENSSL_wipe_cpu:
+       save    %sp,FRAME,%sp   ! open a fresh register window
+       nop
+#ifdef __sun
+#include <sys/trap.h>
+       ta      ST_CLEAN_WINDOWS
+#else
+       call    .walk.reg.wins
+#endif
+       nop
+       call    .PIC.zero.up
+       mov     .zero-(.-4),%o0 ! delay slot: %o0 = offset from the call above to .zero
+       ldd     [%o0],%f0       ! %f0/%f1 = the two zero words at .zero, source of every fmov below
+
+       subcc   %g0,1,%o0       ! 0-1 sets the negative and borrow condition codes
+       ! Following is V9 "rd %ccr,%o0" instruction. However! V8
+       ! specification says that it ("rd %asr2,%o0" in V8 terms) does
+       ! not cause illegal_instruction trap. It therefore can be used
+       ! to determine if the CPU the code is executing on is V8- or
+       ! V9-compliant, as V9 returns a distinct value of 0x99,
+       ! "negative" and "borrow" bits set in both %icc and %xcc.
+       .word   0x91408000      !rd     %ccr,%o0
+       cmp     %o0,0x99
+       bne     .v8             ! not V9: skip the upper floating-point bank
+       nop
+                       ! Even though we do not use %fp register bank,
+                       ! we wipe it as memcpy might have used it...
+                       .word   0xbfa00040      !fmovd  %f0,%f62
+                       .word   0xbba00040      !...
+                       .word   0xb7a00040
+                       .word   0xb3a00040
+                       .word   0xafa00040
+                       .word   0xaba00040
+                       .word   0xa7a00040
+                       .word   0xa3a00040
+                       .word   0x9fa00040
+                       .word   0x9ba00040
+                       .word   0x97a00040
+                       .word   0x93a00040
+                       .word   0x8fa00040
+                       .word   0x8ba00040
+                       .word   0x87a00040
+                       .word   0x83a00040      !fmovd  %f0,%f32
+.v8:                   fmovs   %f1,%f31        ! from here on integer clears and fmovs are interleaved
+       clr     %o0
+                       fmovs   %f0,%f30
+       clr     %o1
+                       fmovs   %f1,%f29
+       clr     %o2
+                       fmovs   %f0,%f28
+       clr     %o3
+                       fmovs   %f1,%f27
+       clr     %o4
+                       fmovs   %f0,%f26
+       clr     %o5
+                       fmovs   %f1,%f25
+       clr     %o7
+                       fmovs   %f0,%f24
+       clr     %l0
+                       fmovs   %f1,%f23
+       clr     %l1
+                       fmovs   %f0,%f22
+       clr     %l2
+                       fmovs   %f1,%f21
+       clr     %l3
+                       fmovs   %f0,%f20
+       clr     %l4
+                       fmovs   %f1,%f19
+       clr     %l5
+                       fmovs   %f0,%f18
+       clr     %l6
+                       fmovs   %f1,%f17
+       clr     %l7
+                       fmovs   %f0,%f16
+       clr     %i0
+                       fmovs   %f1,%f15
+       clr     %i1
+                       fmovs   %f0,%f14
+       clr     %i2
+                       fmovs   %f1,%f13
+       clr     %i3
+                       fmovs   %f0,%f12
+       clr     %i4
+                       fmovs   %f1,%f11
+       clr     %i5
+                       fmovs   %f0,%f10
+       clr     %g1
+                       fmovs   %f1,%f9
+       clr     %g2
+                       fmovs   %f0,%f8
+       clr     %g3
+                       fmovs   %f1,%f7
+       clr     %g4
+                       fmovs   %f0,%f6
+       clr     %g5
+                       fmovs   %f1,%f5
+                       fmovs   %f0,%f4
+                       fmovs   %f1,%f3
+                       fmovs   %f0,%f2
+
+       add     %fp,BIAS,%i0    ! return pointer to top of stack of the caller
+
+       ret
+       restore
+
+.zero: .long   0x0,0x0 ! two zero words, loaded into %f0/%f1 by OPENSSL_wipe_cpu
+.PIC.zero.up:
+       retl
+       add     %o0,%o7,%o0     ! delay slot: %o0 += %o7, turning the offset passed in %o0 into an address
+#ifdef DEBUG
+.global        walk_reg_wins
+.type  walk_reg_wins,#function
+walk_reg_wins:
+#endif
+.walk.reg.wins:
+       save    %sp,FRAME,%sp   ! step into the next register window
+       cmp     %i7,%o7
+       be      2f              ! equal: return at once without clearing anything
+       clr     %o0             ! delay slot: %o0 = 0 before any recursion
+       cmp     %o7,0   ! compiler never cleans %o7...
+       be      1f      ! could have been a leaf function...
+       clr     %o1             ! delay slot
+       call    .walk.reg.wins  ! recurse one window further
+       nop
+1:     clr     %o2
+       clr     %o3
+       clr     %o4
+       clr     %o5
+       clr     %o7
+       clr     %l0
+       clr     %l1
+       clr     %l2
+       clr     %l3
+       clr     %l4
+       clr     %l5
+       clr     %l6
+       clr     %l7
+       add     %o0,1,%i0       ! used for debugging
+2:     ret
+       restore
+.size  OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+! OPENSSL_atomic_add: atomically adds %o1 to the 32-bit word at [%o0]
+! and returns the new value, sign-extended, in %o0.
+! On V9 CPUs (detected at run time, always assumed under ABI64) a cas
+! retry loop is used. On V8 the word itself doubles as a spin lock:
+! swap stores the marker -1 while the update is in flight, so -1 is
+! not usable as a counter value on that path.
+.global        OPENSSL_atomic_add
+.type  OPENSSL_atomic_add,#function
+OPENSSL_atomic_add:
+#ifndef ABI64
+       subcc   %g0,1,%o2
+       .word   0x95408000      !rd     %ccr,%o2, see comment above
+       cmp     %o2,0x99
+       be      .v9
+       nop
+       save    %sp,FRAME,%sp   ! V8 path calls out, so it needs its own window
+       ba      .enter
+       nop
+#ifdef __sun
+! Note that you don't have to link with libthread to call thr_yield,
+! as libc provides a stub, which is overloaded the moment you link
+! with *either* libpthread or libthread...
+#define        YIELD_CPU       thr_yield
+#else
+! applies at least to Linux and FreeBSD... Feedback expected...
+#define        YIELD_CPU       sched_yield
+#endif
+.spin: call    YIELD_CPU
+       nop
+.enter:        ld      [%i0],%i2
+       cmp     %i2,-1          ! cheap pre-check against the same marker that swap stores below
+       be      .spin           ! somebody else holds the word: yield and look again
+       mov     -1,%i2          ! delay slot: marker to be swapped in
+       swap    [%i0],%i2       ! %i2 = previous contents, memory = -1
+       cmp     %i2,-1
+       be      .spin           ! lost the race between ld and swap
+       add     %i2,%i1,%i2     ! delay slot: new value = old value + increment
+       stbar
+       st      %i2,[%i0]       ! publish the new value, which also releases the word
+       sra     %i2,%g0,%i0     ! sign-extend the 32-bit result for the caller
+       ret
+       restore
+.v9:
+#endif
+       ld      [%o0],%o2
+1:     add     %o1,%o2,%o3
+       .word   0xd7e2100a      !cas [%o0],%o2,%o3, compare [%o0] with %o2 and swap %o3
+       cmp     %o2,%o3
+       bne     1b
+       mov     %o3,%o2         ! cas is always fetching to dest. register
+       add     %o1,%o2,%o0     ! OpenSSL expects the new value
+       retl
+       sra     %o0,%g0,%o0     ! we return signed int, remember?
+.size  OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+! OPENSSL_rdtsc: probes for a V9 CPU with the same rd %ccr trick used
+! by OPENSSL_wipe_cpu. On V8 it returns 0 straight away.
+! NOTE(review): the V9 path never writes the return register either,
+! so as written both paths hand 0 back to the caller; the values
+! loaded into %l0-%l3 are discarded by restore.
+.global        OPENSSL_rdtsc
+OPENSSL_rdtsc:
+       subcc   %g0,1,%o0
+       .word   0x91408000      !rd     %ccr,%o0
+       cmp     %o0,0x99
+       bne     .notsc
+       xor     %o0,%o0,%o0     ! delay slot, always executed: %o0 = 0
+       save    %sp,FRAME-16,%sp
+       mov     513,%o0         !SI_PLATFORM
+       add     %sp,BIAS+16,%o1 ! NOTE(review): buffer passed here is at BIAS+16...
+       call    sysinfo
+       mov     256,%o2         ! NOTE(review): 256 bytes requested, frame grew by only 16
+
+       add     %sp,BIAS-16,%o1 ! NOTE(review): ...but it is read back from BIAS-16 - confirm
+       ld      [%o1],%l0
+       ld      [%o1+4],%l1
+       ld      [%o1+8],%l2
+       mov     %lo('SUNW'),%l3
+       ret
+       restore
+.notsc:
+       retl
+       nop
+.type  OPENSSL_rdtsc,#function
+.size  OPENSSL_rdtsc,.-OPENSSL_rdtsc
index 894c49c..9ad9435 100644 (file)
@@ -72,6 +72,84 @@ require "x86asm.pl";
        &ret    ();
 &function_end_B("OPENSSL_instrument_halt");
 
+# Essentially there is only one use for this function. Under DJGPP:
+#
+#      #include <go32.h>
+#      ...
+#      i=OPENSSL_far_spin(_dos_ds,0x46c);
+#      ...
+# to obtain the number of spins till closest timer interrupt.
+
+# OPENSSL_far_spin(selector, offset): counts loop iterations until the
+# dword at selector:offset changes. Returns the count in eax; returns 0
+# in eax and edx when interrupts are disabled, as the watched location
+# could then never change.
+&function_begin_B("OPENSSL_far_spin");
+       &pushf  ();
+       &pop    ("eax");                # eax = EFLAGS
+       &bt     ("eax",9);              # bit 9 of EFLAGS is the interrupt flag
+       &jnc    (&label("nospin"));     # interrupts are disabled
+
+       &mov    ("eax",&DWP(4,"esp"));  # 1st arg: segment selector
+       &mov    ("ecx",&DWP(8,"esp"));  # 2nd arg: offset within that segment
+       &data_word (0x90d88e1e);        # push %ds, mov %eax,%ds
+       &xor    ("eax","eax");          # spin counter = 0
+       &mov    ("edx",&DWP(0,"ecx"));  # edx = reference value to compare against
+       &jmp    (&label("spin"));
+
+       &align  (16);
+&set_label("spin");
+       &inc    ("eax");
+       &cmp    ("edx",&DWP(0,"ecx"));
+       &je     (&label("spin"));       # keep counting while the value is unchanged
+
+       &data_word (0x1f909090);        # pop   %ds
+       &ret    ();
+
+&set_label("nospin");
+       &xor    ("eax","eax");
+       &xor    ("edx","edx");
+       &ret    ();
+&function_end_B("OPENSSL_far_spin");
+
+# OPENSSL_wipe_cpu: zeroes eax/edx and, depending on the capability
+# bits in OPENSSL_ia32cap_P, the x87/MMX bank and xmm0-xmm7. Returns in
+# eax the address just above the return address, i.e. esp+4 on entry.
+&function_begin_B("OPENSSL_wipe_cpu","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
+       &xor    ("eax","eax");
+       &xor    ("edx","edx");
+       &picmeup("ecx","OPENSSL_ia32cap_P");    # ecx = address of the capability word
+       &mov    ("ecx",&DWP(0,"ecx"));          # ecx = the capability word itself
+       # Test the bits in the register: ecx no longer holds an address,
+       # so it must not be dereferenced a second time.
+       # NOTE(review): x87 presence is CPUID.1:EDX bit 0; bit 1 is kept
+       # as in the original - confirm which bit is intended.
+       &bt     ("ecx",1);
+       &jnc    (&label("no_x87"));
+       &bt     ("ecx",26);                     # SSE2
+       &jnc    (&label("no_sse2"));
+       &pxor   ("xmm0","xmm0");
+       &pxor   ("xmm1","xmm1");
+       &pxor   ("xmm2","xmm2");
+       &pxor   ("xmm3","xmm3");
+       &pxor   ("xmm4","xmm4");
+       &pxor   ("xmm5","xmm5");
+       &pxor   ("xmm6","xmm6");
+       &pxor   ("xmm7","xmm7");
+&set_label("no_sse2");
+       # just a bunch of fldz to zap the fp/mm bank...
+       &data_word(0xeed9eed9,0xeed9eed9,0xeed9eed9,0xeed9eed9);
+       &emms   ();
+&set_label("no_x87");
+       &lea    ("eax",&DWP(4,"esp"));
+       &ret    ();
+&function_end_B("OPENSSL_wipe_cpu");
+
+&function_begin_B("OPENSSL_atomic_add");
+       &mov    ("edx",&DWP(4,"esp"));  # fetch the pointer, 1st arg
+       &mov    ("ecx",&DWP(8,"esp"));  # fetch the increment, 2nd arg
+       &push   ("ebx");                # ebx is used as scratch below and popped before returning
+       &nop    ();
+       &mov    ("eax",&DWP(0,"edx"));  # eax = currently observed value
+&set_label("spin");
+       &lea    ("ebx",&DWP(0,"eax","ecx"));    # ebx = observed value + increment
+       &nop    ();
+       &data_word(0x1ab10ff0); # lock; cmpxchg %ebx,(%edx)     # %eax is involved and is always reloaded
+       &jne    (&label("spin"));       # memory changed in between: retry with the reloaded eax
+       &mov    ("eax","ebx");  # OpenSSL expects the new value
+       &pop    ("ebx");
+       &ret    ();
+&function_end_B("OPENSSL_atomic_add");
+
 &initseg("OPENSSL_cpuid_setup");
 
 &asm_finish();