Multiple assembler packs: add experimental memory bus instrumentation.
author Andy Polyakov <appro@openssl.org>
Sun, 17 Apr 2011 12:46:00 +0000 (12:46 +0000)
committer Andy Polyakov <appro@openssl.org>
Sun, 17 Apr 2011 12:46:00 +0000 (12:46 +0000)
crypto/alphacpuid.pl
crypto/ia64cpuid.S
crypto/pariscid.pl
crypto/ppccpuid.pl
crypto/s390xcpuid.S
crypto/sparccpuid.S
crypto/sparcv9cap.c
crypto/x86_64cpuid.pl
crypto/x86cpuid.pl
doc/crypto/OPENSSL_instrument_bus.pod [new file with mode: 0644]

index c9474ff..11f2e30 100644 (file)
@@ -126,3 +126,93 @@ OPENSSL_cleanse:
 .Ldone: ret    ($26)
 .end   OPENSSL_cleanse
 ___
+{
+my ($out,$cnt,$max)=("\$16","\$17","\$18");	# vector pointer, element count, probe limit (bus2 only)
+my ($tick,$lasttick)=("\$19","\$20");	# current and previous rpcc readings
+my ($diff,$lastdiff)=("\$21","\$22");	# current and previous tick deltas
+my ($v0,$ra,$sp,$zero)=("\$0","\$26","\$30","\$31");	# result, return address, stack pointer; $zero is not used below
+
+print <<___;
+.globl OPENSSL_instrument_bus
+.ent   OPENSSL_instrument_bus
+OPENSSL_instrument_bus:
+       .frame  $sp,0,$ra
+       .prologue 0
+       mov     $cnt,$v0
+
+       rpcc    $lasttick
+       mov     0,$diff
+
+       ecb     ($out)
+       ldl_l   $tick,0($out)
+       addl    $diff,$tick,$tick
+       mov     $tick,$diff
+       stl_c   $tick,0($out)
+       stl     $diff,0($out)
+
+.Loop: rpcc    $tick
+       subq    $tick,$lasttick,$diff
+       mov     $tick,$lasttick
+
+       ecb     ($out)
+       ldl_l   $tick,0($out)
+       addl    $diff,$tick,$tick
+       mov     $tick,$diff
+       stl_c   $tick,0($out)
+       stl     $diff,0($out)
+
+       subl    $cnt,1,$cnt
+       lda     $out,4($out)
+       bne     $cnt,.Loop
+
+       ret     ($ra)
+.end   OPENSSL_instrument_bus
+
+.globl OPENSSL_instrument_bus2
+.ent   OPENSSL_instrument_bus2
+OPENSSL_instrument_bus2:
+       .frame  $sp,0,$ra
+       .prologue 0
+       mov     $cnt,$v0
+
+       rpcc    $lasttick
+       mov     0,$diff
+
+       ecb     ($out)
+       ldl_l   $tick,0($out)
+       addl    $diff,$tick,$tick
+       mov     $tick,$diff
+       stl_c   $tick,0($out)
+       stl     $diff,0($out)
+
+       rpcc    $tick
+       subq    $tick,$lasttick,$diff
+       mov     $tick,$lasttick
+       mov     $diff,$lastdiff
+.Loop2:
+       ecb     ($out)
+       ldl_l   $tick,0($out)
+       addl    $diff,$tick,$tick
+       mov     $tick,$diff
+       stl_c   $tick,0($out)
+       stl     $diff,0($out)
+
+       subl    $max,1,$max
+       beq     $max,.Ldone2
+
+       rpcc    $tick
+       subq    $tick,$lasttick,$diff
+       mov     $tick,$lasttick
+       subq    $lastdiff,$diff,$tick
+       mov     $diff,$lastdiff
+       cmovne  $tick,1,$tick
+       subl    $cnt,$tick,$cnt
+       s4addq  $tick,$out,$out
+       bne     $cnt,.Loop2
+
+.Ldone2:
+       subl    $v0,$cnt,$v0
+       ret     ($ra)
+.end   OPENSSL_instrument_bus2
+___
+}
index d705fff..dd27e16 100644 (file)
@@ -26,7 +26,7 @@ OPENSSL_atomic_add:
 { .mii;        mov             ar.ccv=r2
        add             r8=r2,r33
        mov             r3=r2           };;
-{ .mmi;        mf
+{ .mmi;        mf;;
        cmpxchg4.acq    r2=[r32],r8,ar.ccv
        nop.i           0               };;
 { .mib;        cmp.ne          p6,p0=r2,r3
@@ -165,3 +165,89 @@ OPENSSL_cleanse:
 (p7)   br.cond.dpnt    .Little
 (p6)   br.ret.sptk.many        b0      };;
 .endp  OPENSSL_cleanse#
+
+.global        OPENSSL_instrument_bus#
+.proc  OPENSSL_instrument_bus#
+OPENSSL_instrument_bus:                                // entry label must match the .global/.proc/.endp name
+{ .mmi;        mov             r2=r33                  // put aside cnt
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+       addp4           r32=0,r32
+#endif
+                                       }
+{ .mmi;        mov             r8=ar.itc;;             // collect 1st tick
+       mov             r10=r0                  // diff=0
+       mov             r9=r8           };;     // lasttick=tick
+
+{ .mmi;        fc              r32;;                   // flush cache line
+       ld4             r8=[r32]        };;     // load current *out
+{ .mmi;        mf
+       mov             ar.ccv=r8
+       add             r8=r8,r10       };;     // *out+diff
+{ .mmi;        cmpxchg4.acq    r3=[r32],r8,ar.ccv      // store sum if *out is unchanged
+                                       };;
+.Loop:
+{ .mmi;        mov             r8=ar.itc;;
+       sub             r10=r8,r9               // diff=tick-lasttick
+       mov             r9=r8           };;     // lasttick=tick
+{ .mmi;        fc              r32;;                   // flush cache line
+       ld4             r8=[r32]        };;     // load current *out
+{ .mmi;        mf
+       mov             ar.ccv=r8
+       add             r8=r8,r10       };;     // *out+diff
+{ .mmi;        cmpxchg4.acq    r3=[r32],r8,ar.ccv      // store sum if *out is unchanged
+       add             r33=-1,r33              // --cnt
+       add             r32=4,r32       };;     // ++out
+{ .mib;        cmp4.ne         p6,p0=0,r33
+(p6)   br.cond.dptk    .Loop           };;     // loop while cnt!=0
+
+{ .mib;        sub             r8=r2,r33               // return saved cnt minus remaining cnt
+       br.ret.sptk.many        b0      };;
+.endp  OPENSSL_instrument_bus#
+
+.global        OPENSSL_instrument_bus2#
+.proc  OPENSSL_instrument_bus2#
+OPENSSL_instrument_bus2:                               // entry label must match the .global/.proc/.endp name
+{ .mmi;        mov             r2=r33                  // put aside cnt
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+       addp4           r32=0,r32
+#endif
+                                       }
+{ .mmi;        mov             r8=ar.itc;;             // collect 1st tick
+       mov             r10=r0                  // diff=0
+       mov             r9=r8           };;     // lasttick=tick
+
+{ .mmi;        fc              r32;;                   // flush cache line
+       ld4             r8=[r32]        };;     // load current *out
+{ .mmi;        mf
+       mov             ar.ccv=r8
+       add             r8=r8,r10       };;     // *out+diff
+{ .mmi;        cmpxchg4.acq    r3=[r32],r8,ar.ccv      // store sum if *out is unchanged
+                                       };;
+
+{ .mmi;        mov             r8=ar.itc;;             // collect 1st diff
+       sub             r10=r8,r9               // diff=tick-lasttick
+       mov             r9=r8           };;     // lasttick=tick
+.Loop2:
+{ .mmi;        mov             r11=r10                 // lastdiff=diff
+       add             r34=-1,r34      };;     // --max
+{ .mmi;        fc              r32;;                   // flush cache line
+       ld4             r8=[r32]
+       cmp4.eq         p6,p0=0,r34     };;     // max exhausted?
+{ .mmi;        mf
+       mov             ar.ccv=r8
+       add             r8=r8,r10       };;     // *out+diff
+{ .mmb;        cmpxchg4.acq    r3=[r32],r8,ar.ccv      // store sum if *out is unchanged
+(p6)   br.cond.spnt    .Ldone2         };;
+
+{ .mmi;        mov             r8=ar.itc;;
+       sub             r10=r8,r9               // diff=tick-lasttick
+       mov             r9=r8           };;     // lasttick=tick
+{ .mmi;        cmp.ne          p6,p0=r10,r11;;         // diff!=lastdiff
+(p6)   add             r33=-1,r33      };;     // conditional --cnt
+{ .mib;        cmp4.ne         p7,p0=0,r33
+(p6)   add             r32=4,r32               // conditional ++out
+(p7)   br.cond.dptk    .Loop2          };;
+.Ldone2:
+{ .mib;        sub             r8=r2,r33               // return saved cnt minus remaining cnt
+       br.ret.sptk.many        b0      };;
+.endp  OPENSSL_instrument_bus2#
index 1ed5381..477ec9b 100644 (file)
@@ -87,8 +87,8 @@ OPENSSL_wipe_cpu
        .PROCEND
 ___
 {
-$inp="%r26";
-$len="%r25";
+my $inp="%r26";
+my $len="%r25";
 
 $code.=<<___;
        .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR
@@ -112,9 +112,9 @@ Lalign
 
 Laligned
        andcm           $len,%r1,%r28
-Loop
+Lot
        $ST             %r0,0($inp)
-       addib,*<>       -$SIZE_T,%r28,Loop
+       addib,*<>       -$SIZE_T,%r28,Lot
        ldo             $SIZE_T($inp),$inp
 
        and,*<>         $len,%r1,$len
@@ -130,7 +130,93 @@ Ldone
        .PROCEND
 ___
 }
+{
+my ($out,$cnt,$max)=("%r26","%r25","%r24");	# vector pointer, element count, probe limit (bus2 only)
+my ($tick,$lasttick)=("%r23","%r22");	# current and previous %cr16 readings
+my ($diff,$lastdiff)=("%r21","%r20");	# current and previous tick deltas
+
+$code.=<<___;
+       .EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR
+       .ALIGN  8
+OPENSSL_instrument_bus
+       .PROC
+       .CALLINFO       NO_CALLS
+       .ENTRY
+       copy            $cnt,$rv
+       mfctl           %cr16,$tick
+       copy            $tick,$lasttick
+       ldi             0,$diff
+
+       fdc             0($out)
+       ldw             0($out),$tick
+       add             $diff,$tick,$tick
+       stw             $tick,0($out)
+Loop
+       mfctl           %cr16,$tick
+       sub             $tick,$lasttick,$diff
+       copy            $tick,$lasttick
+
+       fdc             0($out)
+       ldw             0($out),$tick
+       add             $diff,$tick,$tick
+       stw             $tick,0($out)
+
+       addib,<>        -1,$cnt,Loop
+       addi            4,$out,$out
+
+       bv              ($rp)
+       .EXIT
+       sub             $rv,$cnt,$rv
+       .PROCEND

+       .EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR
+       .ALIGN  8
+OPENSSL_instrument_bus2
+       .PROC
+       .CALLINFO       NO_CALLS
+       .ENTRY
+       copy            $cnt,$rv
+       sub             %r0,$cnt,$cnt
+
+       mfctl           %cr16,$tick
+       copy            $tick,$lasttick
+       ldi             0,$diff
+
+       fdc             0($out)
+       ldw             0($out),$tick
+       add             $diff,$tick,$tick
+       stw             $tick,0($out)
+
+       mfctl           %cr16,$tick
+       sub             $tick,$lasttick,$diff
+       copy            $tick,$lasttick
+Loop2
+       copy            $diff,$lastdiff
+       fdc             0($out)
+       ldw             0($out),$tick
+       add             $diff,$tick,$tick
+       stw             $tick,0($out)
+
+       addib,=         -1,$max,Ldone2
+       nop
+
+       mfctl           %cr16,$tick
+       sub             $tick,$lasttick,$diff
+       copy            $tick,$lasttick
+       cmpclr,<>       $lastdiff,$diff,$tick
+       ldi             1,$tick
+
+       ldi             1,%r1
+       xor             %r1,$tick,$tick
+       addb,<>         $tick,$cnt,Loop2
+       shladd,l        $tick,2,$out,$out
+Ldone2
+       bv              ($rp)
+       .EXIT
+       add             $rv,$cnt,$rv
+       .PROCEND
+___
+}
 $code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
 $code =~ s/,\*/,/gm if ($SIZE_T==4);
 print $code;
index 2131d30..d6220e7 100755 (executable)
@@ -69,10 +69,10 @@ $code=<<___;
 .globl .OPENSSL_atomic_add
 .align 4
 .OPENSSL_atomic_add:
-Loop:  lwarx   r5,0,r3
+Ladd:  lwarx   r5,0,r3
        add     r0,r4,r5
        stwcx.  r0,0,r3
-       bne-    Loop
+       bne-    Ladd
        $SIGNX  r3,r0
        blr
 
@@ -112,6 +112,89 @@ Laligned:
        bne     Little
        blr
 ___
+{
+my ($out,$cnt,$max)=("r3","r4","r5");	# vector pointer, element count, probe limit (bus2 only)
+my ($tick,$lasttick)=("r6","r7");	# current and previous mftb readings
+my ($diff,$lastdiff)=("r8","r9");	# current and previous tick deltas
+
+$code.=<<___;
+.globl .OPENSSL_instrument_bus
+.align 4
+.OPENSSL_instrument_bus:
+       mtctr   $cnt
+
+       mftb    $lasttick               # collect 1st tick
+       li      $diff,0
+
+       dcbf    0,$out                  # flush cache line
+       lwarx   $tick,0,$out            # load and lock
+       add     $tick,$tick,$diff
+       stwcx.  $tick,0,$out
+       stwx    $tick,0,$out
+
+Loop:  mftb    $tick
+       sub     $diff,$tick,$lasttick
+       mr      $lasttick,$tick
+       dcbf    0,$out                  # flush cache line
+       lwarx   $tick,0,$out            # load and lock
+       add     $tick,$tick,$diff
+       stwcx.  $tick,0,$out
+       stwx    $tick,0,$out
+       addi    $out,$out,4             # ++$out
+       bdnz    Loop
+
+       mr      r3,$cnt
+       blr
+
+.globl .OPENSSL_instrument_bus2
+.align 4
+.OPENSSL_instrument_bus2:
+       mr      r0,$cnt
+       slwi    $cnt,$cnt,2
+
+       mftb    $lasttick               # collect 1st tick
+       li      $diff,0
+
+       dcbf    0,$out                  # flush cache line
+       lwarx   $tick,0,$out            # load and lock
+       add     $tick,$tick,$diff
+       stwcx.  $tick,0,$out
+       stwx    $tick,0,$out
+
+       mftb    $tick                   # collect 1st diff
+       sub     $diff,$tick,$lasttick
+       mr      $lasttick,$tick
+       mr      $lastdiff,$diff
+Loop2:
+       dcbf    0,$out                  # flush cache line
+       lwarx   $tick,0,$out            # load and lock
+       add     $tick,$tick,$diff
+       stwcx.  $tick,0,$out
+       stwx    $tick,0,$out
+
+       addic.  $max,$max,-1
+       beq     Ldone2
+
+       mftb    $tick
+       sub     $diff,$tick,$lasttick
+       mr      $lasttick,$tick
+       cmplw   7,$diff,$lastdiff
+       mr      $lastdiff,$diff
+
+       mfcr    $tick                   # pull cr
+       not     $tick,$tick             # flip bits
+       rlwinm  $tick,$tick,1,29,29     # isolate flipped eq bit and scale
+
+       sub.    $cnt,$cnt,$tick         # conditional --$cnt
+       add     $out,$out,$tick         # conditional ++$out
+       bne     Loop2
+
+Ldone2:
+       srwi    $cnt,$cnt,2
+       sub     r3,r0,$cnt
+       blr
+___
+}
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 print $code;
index 0681534..3402a24 100644 (file)
@@ -93,6 +93,22 @@ OPENSSL_cleanse:
        br      %r14
 .size  OPENSSL_cleanse,.-OPENSSL_cleanse
 
+.globl OPENSSL_instrument_bus
+.type  OPENSSL_instrument_bus,@function
+.align 16
+OPENSSL_instrument_bus:
+       lghi    %r2,0                   # stub: no probes are performed, %r2 is set to 0
+       br      %r14                    # branch back through %r14
+.size  OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.globl OPENSSL_instrument_bus2
+.type  OPENSSL_instrument_bus2,@function
+.align 16
+OPENSSL_instrument_bus2:
+       lghi    %r2,0                   # stub: no probes are performed, %r2 is set to 0
+       br      %r14                    # branch back through %r14
+.size  OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
 .section       .init
        brasl   %r14,OPENSSL_cpuid_setup
 
index ae61f7f..329efcd 100644 (file)
@@ -397,6 +397,102 @@ OPENSSL_cleanse:
 .type  OPENSSL_cleanse,#function
 .size  OPENSSL_cleanse,.-OPENSSL_cleanse
 
+.global        _sparcv9_vis1_instrument_bus
+.align 8
+_sparcv9_vis1_instrument_bus:
+       mov     %o1,%o3                                 ! save cnt
+       .word   0x99410000      !rd     %tick,%o4       ! tick
+       mov     %o4,%o5                                 ! lasttick = tick
+       set     0,%g4                                   ! diff
+
+       andn    %o0,63,%g1                              ! clear low 6 bits of out (64-byte boundary)
+       .word   0xc1985e00      !ldda   [%g1]0xf0,%f0   ! block load
+       .word   0x8143e040      !membar #Sync
+       .word   0xc1b85c00      !stda   %f0,[%g1]0xe0   ! block store and commit
+       .word   0x8143e040      !membar #Sync
+       ld      [%o0],%o4                               ! current *out
+       add     %o4,%g4,%g4                             ! *out + diff
+       .word   0xc9e2100c      !cas    [%o0],%o4,%g4
+
+.Loop: .word   0x99410000      !rd     %tick,%o4
+       sub     %o4,%o5,%g4                             ! diff=tick-lasttick
+       mov     %o4,%o5                                 ! lasttick=tick
+
+       andn    %o0,63,%g1                              ! clear low 6 bits of out (64-byte boundary)
+       .word   0xc1985e00      !ldda   [%g1]0xf0,%f0   ! block load
+       .word   0x8143e040      !membar #Sync
+       .word   0xc1b85c00      !stda   %f0,[%g1]0xe0   ! block store and commit
+       .word   0x8143e040      !membar #Sync
+       ld      [%o0],%o4                               ! current *out
+       add     %o4,%g4,%g4                             ! *out + diff
+       .word   0xc9e2100c      !cas    [%o0],%o4,%g4
+       subcc   %o1,1,%o1                               ! --$cnt
+       bnz     .Loop                                   ! loop while cnt!=0
+       add     %o0,4,%o0                               ! ++$out
+
+       retl
+       mov     %o3,%o0                                 ! return saved cnt
+.type  _sparcv9_vis1_instrument_bus,#function
+.size  _sparcv9_vis1_instrument_bus,.-_sparcv9_vis1_instrument_bus
+
+.global        _sparcv9_vis1_instrument_bus2
+.align 8
+_sparcv9_vis1_instrument_bus2:
+       mov     %o1,%o3                                 ! save cnt
+       sll     %o1,2,%o1                               ! cnt*=4
+
+       .word   0x99410000      !rd     %tick,%o4       ! tick
+       mov     %o4,%o5                                 ! lasttick = tick
+       set     0,%g4                                   ! diff
+
+       andn    %o0,63,%g1                              ! clear low 6 bits of out (64-byte boundary)
+       .word   0xc1985e00      !ldda   [%g1]0xf0,%f0   ! block load
+       .word   0x8143e040      !membar #Sync
+       .word   0xc1b85c00      !stda   %f0,[%g1]0xe0   ! block store and commit
+       .word   0x8143e040      !membar #Sync
+       ld      [%o0],%o4                               ! current *out
+       add     %o4,%g4,%g4                             ! *out + diff
+       .word   0xc9e2100c      !cas    [%o0],%o4,%g4
+
+       .word   0x99410000      !rd     %tick,%o4       ! tick
+       sub     %o4,%o5,%g4                             ! diff=tick-lasttick
+       mov     %o4,%o5                                 ! lasttick=tick
+       mov     %g4,%g5                                 ! lastdiff=diff
+.Loop2:
+       andn    %o0,63,%g1                              ! clear low 6 bits of out (64-byte boundary)
+       .word   0xc1985e00      !ldda   [%g1]0xf0,%f0   ! block load
+       .word   0x8143e040      !membar #Sync
+       .word   0xc1b85c00      !stda   %f0,[%g1]0xe0   ! block store and commit
+       .word   0x8143e040      !membar #Sync
+       ld      [%o0],%o4                               ! current *out
+       add     %o4,%g4,%g4                             ! *out + diff
+       .word   0xc9e2100c      !cas    [%o0],%o4,%g4
+
+       subcc   %o2,1,%o2                               ! --max
+       bz      .Ldone2                                 ! probe limit reached
+       nop
+
+       .word   0x99410000      !rd     %tick,%o4       ! tick
+       sub     %o4,%o5,%g4                             ! diff=tick-lasttick
+       mov     %o4,%o5                                 ! lasttick=tick
+       cmp     %g4,%g5                                 ! diff==lastdiff?
+       mov     %g4,%g5                                 ! lastdiff=diff
+
+       .word   0x83408000      !rd     %ccr,%g1
+       and     %g1,4,%g1                               ! isolate zero flag
+       xor     %g1,4,%g1                               ! flip zero flag
+
+       subcc   %o1,%g1,%o1                             ! conditional --$cnt
+       bnz     .Loop2
+       add     %o0,%g1,%o0                             ! conditional ++$out
+
+.Ldone2:
+       srl     %o1,2,%o1                               ! undo cnt*=4 on the remainder
+       retl
+       sub     %o3,%o1,%o0                             ! return saved cnt minus remaining cnt
+.type  _sparcv9_vis1_instrument_bus2,#function
+.size  _sparcv9_vis1_instrument_bus2,.-_sparcv9_vis1_instrument_bus2
+
 .section       ".init",#alloc,#execinstr
        call    OPENSSL_cpuid_setup
        nop
index ed195ab..ad4b3be 100644 (file)
@@ -11,6 +11,7 @@
 #define SPARCV9_VIS1           (1<<2)
 #define SPARCV9_VIS2           (1<<3)  /* reserved */
 #define SPARCV9_FMADD          (1<<4)  /* reserved for SPARC64 V */
+#define SPARCV9_BLK            (1<<5)  /* VIS1 block copy */
 
 static int OPENSSL_sparcv9cap_P=SPARCV9_TICK_PRIVILEGED;
 
@@ -31,6 +32,8 @@ void          _sparcv9_vis1_probe(void);
 unsigned long  _sparcv9_vis1_instrument(void);
 void           _sparcv9_vis2_probe(void);
 void           _sparcv9_fmadd_probe(void);
+size_t                 _sparcv9_vis1_instrument_bus(unsigned int *,size_t);	/* assembler routine (out,cnt) */
+size_t         _sparcv9_vis1_instrument_bus2(unsigned int *,size_t,size_t);	/* assembler routine (out,cnt,max); name must match the symbol that is called */
 
 unsigned long OPENSSL_rdtsc(void)
        {
@@ -44,6 +47,24 @@ unsigned long OPENSSL_rdtsc(void)
                return _sparcv9_rdtick();
        }
 
+size_t OPENSSL_instrument_bus(unsigned int *out,size_t cnt)
+       {
+       if ((OPENSSL_sparcv9cap_P&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) ==	/* parenthesised: == binds tighter than & */
+                       SPARCV9_BLK)	/* BLK set and TICK_PRIVILEGED clear */
+               return _sparcv9_vis1_instrument_bus(out,cnt);
+       else
+               return 0;	/* capability missing: no probes recorded */
+       }
+
+size_t OPENSSL_instrument_bus2(unsigned int *out,size_t cnt,size_t max)
+       {
+       if ((OPENSSL_sparcv9cap_P&(SPARCV9_TICK_PRIVILEGED|SPARCV9_BLK)) ==	/* parenthesised: == binds tighter than & */
+                       SPARCV9_BLK)	/* BLK set and TICK_PRIVILEGED clear */
+               return _sparcv9_vis1_instrument_bus2(out,cnt,max);
+       else
+               return 0;	/* capability missing: no probes recorded */
+       }
+
 #if 0 && defined(__sun) && defined(__SVR4)
 /* This code path is disabled, because of incompatibility of
  * libdevinfo.so.1 and libmalloc.so.1 (see below for details)
@@ -112,7 +133,7 @@ void OPENSSL_cpuid_setup(void)
        if (sysinfo(SI_ISALIST,si,sizeof(si))>0)
                {
                if (strstr(si,"+vis"))
-                       OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
+                       OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_BLK;
                if (strstr(si,"+vis2"))
                        {
                        OPENSSL_sparcv9cap_P |= SPARCV9_VIS2;
@@ -169,7 +190,6 @@ void OPENSSL_cpuid_setup(void)
        char *e;
        struct sigaction        common_act,ill_oact,bus_oact;
        sigset_t                all_masked,oset;
-       int                     sig;
        static int trigger=0;
 
        if (trigger) return;
@@ -211,7 +231,7 @@ void OPENSSL_cpuid_setup(void)
        if (sigsetjmp(common_jmp,1) == 0)
                {
                _sparcv9_vis1_probe();
-               OPENSSL_sparcv9cap_P |= SPARCV9_VIS1;
+               OPENSSL_sparcv9cap_P |= SPARCV9_VIS1|SPARCV9_BLK;
                /* detect UltraSPARC-Tx, see sparccpud.S for details... */
                if (_sparcv9_vis1_instrument() >= 12)
                        OPENSSL_sparcv9cap_P &= ~(SPARCV9_VIS1|SPARCV9_PREFER_FPU);
index c96821a..ecfcfc7 100644 (file)
@@ -9,8 +9,9 @@ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
 
-if ($win64)    { $arg1="%rcx"; $arg2="%rdx"; }
-else           { $arg1="%rdi"; $arg2="%rsi"; }
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :        # Win64 order
+                                ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
 print<<___;
 .extern                OPENSSL_cpuid_setup
 .section       .init
@@ -228,5 +229,95 @@ OPENSSL_wipe_cpu:
        ret
 .size  OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 ___
+{
+my $out="%r10";	# pointer into vector
+my $cnt="%rcx";	# remaining element count
+my $max="%r11";	# probe limit in bus2; copy of cnt for the return value in bus
+my $lasttick="%r8d";	# previous rdtsc reading (low 32 bits)
+my $lastdiff="%r9d";	# previous tick delta
+my $redzone=$win64?8:-8;	# offset from %rsp where bus2 parks cnt: above the return address on Win64, below %rsp otherwise
+
+print<<___;
+.globl OPENSSL_instrument_bus
+.type  OPENSSL_instrument_bus,\@abi-omnipotent
+.align 16
+OPENSSL_instrument_bus:
+       mov     $arg1,$out      # tribute to Win64
+       mov     $arg2,$cnt
+       mov     $arg2,$max
+
+       rdtsc                   # collect 1st tick
+       mov     %eax,$lasttick  # lasttick = tick
+       mov     \$0,$lastdiff   # lastdiff = 0
+       clflush ($out)
+       lock
+       add     $lastdiff,($out)
+       jmp     .Loop
+.align 16
+.Loop: rdtsc
+       mov     %eax,%edx
+       sub     $lasttick,%eax
+       mov     %edx,$lasttick
+       mov     %eax,$lastdiff
+       clflush ($out)
+       lock
+       add     %eax,($out)
+       lea     4($out),$out
+       sub     \$1,$cnt
+       jnz     .Loop
+
+       mov     $max,%rax
+       ret
+.size  OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.globl OPENSSL_instrument_bus2
+.type  OPENSSL_instrument_bus2,\@abi-omnipotent
+.align 16
+OPENSSL_instrument_bus2:
+       mov     $arg1,$out      # tribute to Win64
+       mov     $arg2,$cnt
+       mov     $arg3,$max
+       mov     $cnt,$redzone(%rsp)
+
+       rdtsc                   # collect 1st tick
+       mov     %eax,$lasttick  # lasttick = tick
+       mov     \$0,$lastdiff   # lastdiff = 0
+
+       clflush ($out)
+       lock
+       add     $lastdiff,($out)
+
+       rdtsc                   # collect 1st diff
+       mov     %eax,%edx
+       sub     $lasttick,%eax  # diff
+       mov     %edx,$lasttick  # lasttick = tick
+       mov     %eax,$lastdiff  # lastdiff = diff
+.Loop2:
+       clflush ($out)
+       lock
+       add     %eax,($out)     # accumulate diff
+
+       sub     \$1,$max
+       jz      .Ldone2
+
+       rdtsc
+       mov     %eax,%edx
+       sub     $lasttick,%eax  # diff
+       mov     %edx,$lasttick  # lasttick = tick
+       cmp     $lastdiff,%eax
+       mov     %eax,$lastdiff  # lastdiff = diff
+       mov     \$0,%edx
+       setne   %dl
+       sub     %rdx,$cnt       # conditional --$cnt
+       lea     ($out,%rdx,4),$out      # conditional ++$out
+       jnz     .Loop2
+
+.Ldone2:
+       mov     $redzone(%rsp),%rax
+       sub     $cnt,%rax
+       ret
+.size  OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+___
+}
 
 close STDOUT;  # flush
index a7464af..0513398 100644 (file)
@@ -307,6 +307,108 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
        &ret    ();
 &function_end_B("OPENSSL_cleanse");
 
+{
+my $lasttick = "esi";	# previous rdtsc reading (low 32 bits)
+my $lastdiff = "ebx";	# previous tick delta
+my $out = "edi";	# pointer into vector
+my $cnt = "ecx";	# remaining element count
+my $max = "ebp";	# remaining probe limit (bus2 only)
+
+&function_begin("OPENSSL_instrument_bus");
+    &mov       ("eax",0);	# result stays 0 on the "nogo" paths and when $sse2 is off
+    if ($sse2) {
+       &picmeup("edx","OPENSSL_ia32cap_P");
+       &bt     (&DWP(0,"edx"),4);
+       &jnc    (&label("nogo"));       # no TSC
+       &bt     (&DWP(0,"edx"),19);
+       &jnc    (&label("nogo"));       # no CLFLUSH
+
+       &mov    ($out,&wparam(0));      # load arguments
+       &mov    ($cnt,&wparam(1));
+
+       # collect 1st tick
+       &rdtsc  ();
+       &mov    ($lasttick,"eax");      # lasttick = tick
+       &mov    ($lastdiff,0);          # lastdiff = 0
+       &clflush(&DWP(0,$out));
+       &lock   ();
+       &add    (&DWP(0,$out),$lastdiff);
+       &jmp    (&label("loop"));
+
+&set_label("loop",16);
+       &rdtsc  ();
+       &mov    ("edx","eax");          # put aside tick (yes, I neglect edx)
+       &sub    ("eax",$lasttick);      # diff
+       &mov    ($lasttick,"edx");      # lasttick = tick
+       &mov    ($lastdiff,"eax");      # lastdiff = diff
+       &clflush(&DWP(0,$out));
+       &lock   ();
+       &add    (&DWP(0,$out),"eax");   # accumulate diff
+       &lea    ($out,&DWP(4,$out));    # ++$out
+       &sub    ($cnt,1);               # --$cnt
+       &jnz    (&label("loop"));
+
+       &mov    ("eax",&wparam(1));	# result = cnt as passed in
+&set_label("nogo");
+    }
+&function_end("OPENSSL_instrument_bus");
+
+&function_begin("OPENSSL_instrument_bus2");
+    &mov       ("eax",0);	# result stays 0 on the "nogo" paths and when $sse2 is off
+    if ($sse2) {
+       &picmeup("edx","OPENSSL_ia32cap_P");
+       &bt     (&DWP(0,"edx"),4);
+       &jnc    (&label("nogo"));       # no TSC
+       &bt     (&DWP(0,"edx"),19);
+       &jnc    (&label("nogo"));       # no CLFLUSH
+
+       &mov    ($out,&wparam(0));      # load arguments
+       &mov    ($cnt,&wparam(1));
+       &mov    ($max,&wparam(2));
+
+       &rdtsc  ();                     # collect 1st tick
+       &mov    ($lasttick,"eax");      # lasttick = tick
+       &mov    ($lastdiff,0);          # lastdiff = 0
+
+       &clflush(&DWP(0,$out));
+       &lock   ();
+       &add    (&DWP(0,$out),$lastdiff);
+
+       &rdtsc  ();                     # collect 1st diff
+       &mov    ("edx","eax");          # put aside tick (yes, I neglect edx)
+       &sub    ("eax",$lasttick);      # diff
+       &mov    ($lasttick,"edx");      # lasttick = tick
+       &mov    ($lastdiff,"eax");      # lastdiff = diff
+       &jmp    (&label("loop2"));
+
+&set_label("loop2",16);
+       &clflush(&DWP(0,$out));
+       &lock   ();
+       &add    (&DWP(0,$out),"eax");   # accumulate diff
+
+       &sub    ($max,1);	# --$max
+       &jz     (&label("done2"));	# probe limit reached
+
+       &rdtsc  ();
+       &mov    ("edx","eax");          # put aside tick (yes, I neglect edx)
+       &sub    ("eax",$lasttick);      # diff
+       &mov    ($lasttick,"edx");      # lasttick = tick
+       &cmp    ("eax",$lastdiff);
+       &mov    ($lastdiff,"eax");      # lastdiff = diff
+       &mov    ("edx",0);
+       &setne  ("dl");
+       &sub    ($cnt,"edx");           # conditional --$cnt
+       &lea    ($out,&DWP(0,$out,"edx",4));    # conditional ++$out
+       &jnz    (&label("loop2"));
+
+&set_label("done2");
+       &mov    ("eax",&wparam(1));	# result = cnt as passed in ...
+       &sub    ("eax",$cnt);	# ... minus the remaining cnt
+&set_label("nogo");
+    }
+&function_end("OPENSSL_instrument_bus2");
+}
+
 &initseg("OPENSSL_cpuid_setup");
 
 &asm_finish();
diff --git a/doc/crypto/OPENSSL_instrument_bus.pod b/doc/crypto/OPENSSL_instrument_bus.pod
new file mode 100644 (file)
index 0000000..539957b
--- /dev/null
@@ -0,0 +1,42 @@
+=pod
+
+=head1 NAME
+
+OPENSSL_instrument_bus[2] - instrument references to memory bus
+
+=head1 SYNOPSIS
+
+ #ifdef OPENSSL_CPUID_OBJ
+ size_t OPENSSL_instrument_bus (int *vector,size_t num);
+ size_t OPENSSL_instrument_bus2(int *vector,size_t num,size_t max);
+ #endif
+
+=head1 DESCRIPTION
+
+It was empirically found that timings of references to primary memory
+are subject to irregular, apparently non-deterministic variations. The
+subroutines in question instrument these references for the purpose of
+gathering entropy for a random number generator. To make each probe
+bus-bound, a 'flush cache line' instruction is used between probes. In
+addition, probes are added to B<vector> elements in an atomic or
+interlocked manner, which should contribute additional noise on
+multi-processor systems. This also means that B<vector[num]> should be
+zeroed before invocation (if you want to retrieve actual probe values).
+
+OPENSSL_instrument_bus performs B<num> probes and records the number of
+oscillator cycles every probe took.
+
+OPENSSL_instrument_bus2, on the other hand, B<accumulates> consecutive
+probes with the same value, i.e. in a way it records the duration of
+periods when probe values appeared deterministic. The subroutine
+performs at most B<max> probes in an attempt to fill B<vector[num]>,
+with a B<max> value of 0 meaning "as many as it takes."
+
+=head1 RETURN VALUE
+
+A return value of 0 indicates that the CPU is not capable of performing
+the benchmark, because either the oscillator counter or 'flush cache
+line' is not available on the current platform. For reference, on x86
+'flush cache line' was introduced with the SSE2 extensions.
+
+Otherwise the number of recorded values is returned.