doc/lhash.doc

   1 The LHASH library.
   2
   3 I wrote this library in 1991 and have since forgotten why I called it lhash.
   4 It implements a hash table from an article I read at the
   5 time from 'Communications of the ACM'.  What makes this hash
   6 table different is that as the table fills, the hash table is
   7 increased (or decreased) in size via realloc().
   8 When a 'resize' is done, instead of all hashes being redistributed over
   9 twice as many 'buckets', one bucket is split.  So when an 'expand' is done,
  10 there is only a minimal cost to redistribute some values.  Subsequent
  11 inserts will cause more single 'bucket' redistributions but there will
  12 never be a sudden large cost due to redistributing all the 'buckets'.
  13
  14 The state for a particular hash table is kept in the LHASH structure.
  15 The LHASH structure also records statistics about most aspects of accessing
  16 the hash table.  This is mostly a legacy of my writing this library for
  17 the reasons of implementing what looked like a nice algorithm rather than
  18 for a particular software product.
  19
  20 Internal stuff you probably don't want to know about.
  21 The decision to increase or decrease the hash table size is made depending
  22 on the 'load' of the hash table.  The load is the number of items in the
  23 hash table divided by the size of the hash table.  The default values are
  24 as follows.  If (hash->up_load < load) => expand.
  25 if (hash->down_load > load) =>  contract.  The 'up_load' has a default value of
  26 2 and 'down_load' has a default value of 1.  These numbers can be modified
  27 by the application by just playing with the 'up_load' and 'down_load'
  28 variables.  The 'load' is kept in a form which is multiplied by 256.  So
  29 hash->up_load=8*256; will cause a load of 8 to be set.
  30
  31 If you are interested in performance the field to watch is
  32 num_comp_calls.  The hash library keeps track of the 'hash' value for
  33 each item so when a lookup is done, the 'hashes' are compared, if
  34 there is a match, then a full compare is done, and
  35 hash->num_comp_calls is incremented.  If num_comp_calls is not equal
  36 to num_delete plus num_retrieve it means that your hash function is
  37 generating hashes that are the same for different values.  It is
  38 probably worth changing your hash function if this is the case because
  39 even if your hash table has 10 items in a 'bucket', it can be searched
  40 with 10 'unsigned long' compares and 10 linked list traverses.  This
  41 will be much less expensive than 10 calls to your compare function.
  42
  43 LHASH *lh_new(
  44 unsigned long (*hash)(),
  45 int (*cmp)());
  46         This function is used to create a new LHASH structure.  It is passed
  47         function pointers that are used to store and retrieve values passed
  48         into the hash table.  The 'hash'
  49         function is a hashing function that will return a hashed value of
  50         its passed structure.  'cmp' is passed 2 parameters, it returns 0
  51         if they are equal, otherwise, non zero.
  52         If there are any problems (usually malloc failures), NULL is
  53         returned, otherwise a new LHASH structure is returned.  The
  54         hash value is normally truncated to a power of 2, so make sure
  55         that your hash function returns well mixed low order bits.
  56
  57 void lh_free(
  58 LHASH *lh);
  59         This function free()s a LHASH structure.  If there is malloced
  60         data in the hash table, it will not be freed.  Consider using the
  61         lh_doall function to deallocate any remaining entries in the hash
  62         table.
  63
  64 char *lh_insert(
  65 LHASH *lh,
  66 char *data);
  67         This function inserts the data pointed to by data into the lh hash
  68         table.  If there is already an entry in the hash table, the
  69         value being replaced is returned.  A NULL is returned if the new
  70         entry does not clash with an entry already in the table (the normal
  71         case) or on a malloc() failure (perhaps I should change this....).
  72         The 'char *data' is exactly what is passed to the hash and
  73         comparison functions specified in lh_new().
  74
  75 char *lh_delete(
  76 LHASH *lh,
  77 char *data);
  78         This routine deletes an entry from the hash table.  The value being
  79         deleted is returned.  NULL is returned if there is no such value in
  80         the hash table.
  81
  82 char *lh_retrieve(
  83 LHASH *lh,
  84 char *data);
  85         If 'data' is in the hash table it is returned, else NULL is
  86         returned.  The way these routines would normally be used is that a
  87         dummy structure would have key fields populated and then
  88         ret=lh_retrieve(hash,&dummy);.  Ret would now be a pointer to a fully
  89         populated structure.
  90
  91 void lh_doall(
  92 LHASH *lh,
  93 void (*func)(char *a));
  94         This function will, for every entry in the hash table, call function
  95         'func' with the data item as its parameter.
  96         This function can be quite useful when used as follows.
  97         void cleanup(STUFF *a)
  98                 { STUFF_free(a); }
  99         lh_doall(hash,cleanup);
 100         lh_free(hash);
 101         This can be used to free all the entries, lh_free() then
 102         cleans up the 'buckets' that point to nothing.  Be careful
 103         when doing this.  If you delete entries from the hash table,
 104         in the call back function the table may decrease in size,
 105         moving the item that you are
 106         currently on down lower in the hash table.  This could cause
 107         some entries to be skipped.  The best solution to this problem
 108         is to set lh->down_load=0 before you start.  This will stop
 109         the hash table ever being decreased in size.
 110
 111 void lh_doall_arg(
 112 LHASH *lh,
 113 void (*func)(char *a,char *arg),
 114 char *arg);
 115         This function is the same as lh_doall except that the function
 116         called will be passed 'arg' as the second argument.
 117
 118 unsigned long lh_strhash(
 119 char *c);
 120         This function is a demo string hashing function.  Since the LHASH
 121         routines would normally be passed structures, this routine would
 122         not normally be passed to lh_new(), rather it would be used in the
 123         function passed to lh_new().
 124
 125 The next three routines print out various statistics about the state of the
 126 passed hash table.  These numbers are all kept in the lhash structure.
 127
 128 void lh_stats(
 129 LHASH *lh,
 130 FILE *out);
 131         This function prints out statistics on the size of the hash table,
 132         how many entries are in it, and the number and result of calls to
 133         the routines in this library.
 134
 135 void lh_node_stats(
 136 LHASH *lh,
 137 FILE *out);
 138         For each 'bucket' in the hash table, the number of entries is
 139         printed.
 140
 141 void lh_node_usage_stats(
 142 LHASH *lh,
 143 FILE *out);
 144         This function prints out a short summary of the state of the hash
 145         table.  It prints what I call the 'load' and the 'actual load'.
 146         The load is the average number of data items per 'bucket' in the
 147         hash table.  The 'actual load' is the average number of items per
 148         'bucket', but only for buckets which contain entries.  So the
 149         'actual load' is the average number of searches that will be needed to
 150         find an item in the hash table, while the 'load' is the average number
 151         that will be done to record a miss.