summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruno Haible <bruno@clisp.org>2003-01-07 12:40:05 +0000
committerBruno Haible <bruno@clisp.org>2003-01-07 12:40:05 +0000
commitcd08b4d519edab24110b356a273920dcf549ec09 (patch)
treef5fcb1c851c47c3d096e43fe8e0fdaec43e31dc3
parentb91e4511c018268b66ea31beda0bb631c5cfd4b7 (diff)
downloadgperf-cd08b4d519edab24110b356a273920dcf549ec09.tar.gz
Restructure the asso_values[] searching code.
-rw-r--r--ChangeLog16
-rw-r--r--src/search.cc277
-rw-r--r--src/search.h40
3 files changed, 225 insertions, 108 deletions
diff --git a/ChangeLog b/ChangeLog
index b3dedf7..b387548 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,21 @@
2002-11-03 Bruno Haible <bruno@clisp.org>
+ * src/search.h (Search::init_asso_values, Search::find_asso_values):
+ New declarations.
+ (Search::try_asso_value): Renamed from Search::affects_prev.
+ (Search::change_some_asso_value): Renamed from Search::change.
+ (Search::set_asso_max, Search::get_asso_max): Remove methods.
+ (Search::_union_set): New field.
+ * src/search.cc (Search::init_asso_values): New method, extracted
+ from Search::optimize.
+ (Search::try_asso_value): Renamed from Search::affects_prev. Take the
+ iteration count as argument.
+ (Search::change_some_asso_value): Renamed from Search::change. Don't
+ make union_set static. Don't increment _fewest_collisions here.
+ (Search::find_asso_values): New method, extracted from
+ Search::optimize.
+ (Search::optimize); Update.
+
* src/search.h (Search::compute_hash): Renamed from Search::hash.
(Search::compute_disjoint_union): Remove declaration.
(Search::sort_by_occurrence): Renamed from Search::sort_set.
diff --git a/src/search.cc b/src/search.cc
index 773e2ed..c0cdd86 100644
--- a/src/search.cc
+++ b/src/search.cc
@@ -31,8 +31,7 @@
#include "options.h"
#include "hash-table.h"
-/* Efficiently returns the least power of two greater than or equal to X! */
-#define POW(X) ((!X)?1:(X-=1,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,(++X)))
+/* -------------------- Initialization and Preparation --------------------- */
Search::Search (KeywordExt_List *list)
: _head (list),
@@ -41,7 +40,6 @@ Search::Search (KeywordExt_List *list)
_asso_values (new int[_alpha_size]),
_determined (new bool[_alpha_size])
{
- memset (_asso_values, 0, _alpha_size * sizeof (_asso_values[0]));
}
void
@@ -154,6 +152,8 @@ Search::prepare ()
}
}
+/* ----------------------- Sorting the Keyword list ------------------------ */
+
/* Merges two sorted lists together to form one sorted list.
The sorting criterion depends on which of _occurrence_sort and _hash_sort
is set to true. This is a kludge, but permits nice sharing of almost
@@ -230,6 +230,8 @@ Search::merge_sort (KeywordExt_List *head)
}
}
+/* ---------------- Reordering the Keyword list (optional) ----------------- */
+
/* Computes the sum of occurrences of the _selchars of a keyword.
This is a kind of correlation measure: Keywords which have many
selected characters in common with other keywords have a high
@@ -356,6 +358,8 @@ Search::reorder ()
}
}
+/* ------------------------------------------------------------------------- */
+
/* Returns the length of keyword list. */
int
@@ -380,6 +384,71 @@ Search::get_max_keysig_size ()
return option[ALLCHARS] ? _max_key_len : option.get_max_keysig_size ();
}
+/* ---------------------- Finding good asso_values[] ----------------------- */
+
+/* Initializes the asso_values[] related parameters and put a first guess
+ into asso_values[]. */
+
+void
+Search::init_asso_values ()
+{
+ int size_multiple = option.get_size_multiple ();
+ int non_linked_length = keyword_list_length ();
+ int asso_value_max;
+
+ if (size_multiple == 0)
+ asso_value_max = non_linked_length;
+ else if (size_multiple > 0)
+ asso_value_max = non_linked_length * size_multiple;
+ else /* if (size_multiple < 0) */
+ asso_value_max = non_linked_length / -size_multiple;
+ /* Round up to the next power of two. This makes it easy to ensure
+ an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value
+ being odd, it guarantees that Search::try_asso_value() will iterate
+ through different values for _asso_value[c]. */
+ if (asso_value_max == 0)
+ asso_value_max = 1;
+ asso_value_max |= asso_value_max >> 1;
+ asso_value_max |= asso_value_max >> 2;
+ asso_value_max |= asso_value_max >> 4;
+ asso_value_max |= asso_value_max >> 8;
+ asso_value_max |= asso_value_max >> 16;
+ asso_value_max++;
+ _asso_value_max = asso_value_max;
+
+ if (option[RANDOM])
+ {
+ srand (reinterpret_cast<long>(time (0)));
+
+ for (int i = 0; i < _alpha_size; i++)
+ _asso_values[i] = rand () & (asso_value_max - 1);
+ }
+ else
+ {
+ int asso_value = option.get_initial_asso_value ();
+
+ asso_value = asso_value & (_asso_value_max - 1);
+ for (int i = 0; i < _alpha_size; i++)
+ _asso_values[i] = asso_value;
+ }
+
+ /* Given the bound for _asso_values[c], we have a bound for the possible
+ hash values, as computed in compute_hash(). */
+ _max_hash_value = (option[NOLENGTH] ? 0 : max_key_length ())
+ + (_asso_value_max - 1) * get_max_keysig_size ();
+ /* Allocate a sparse bit vector for detection of collisions of hash
+ values. */
+ _collision_detector = new Bool_Array (_max_hash_value + 1);
+
+ /* Allocate scratch set. */
+ _union_set = new unsigned char [2 * get_max_keysig_size ()];
+
+ if (option[DEBUG])
+ fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
+ "\nmaximum size of generated hash table is %d\n",
+ non_linked_length, asso_value_max, _max_hash_value);
+}
+
/* Computes a keyword's hash value, relative to the current _asso_values[],
and stores it in keyword->_hash_value.
This is called very frequently, and needs to be fast! */
@@ -467,68 +536,67 @@ Search::sort_by_occurrence (unsigned char *set, int len)
}
}
-/* Find out how character value change affects successfully hashed items.
- Returns FALSE if no other hash values are affected, else returns TRUE.
- Note that because option.get_asso_max() is a power of two we can guarantee
- that all valid asso_values are visited without repetition since
- Option.Get_Jump was forced to be an odd value! */
+/* Tries various other values for _asso_values[c]. A value is successful
+ if, with it, the recomputed hash values for the keywords from
+ _head->first() to curr - inclusive - give fewer than _fewest_collisions
+ collisions. Up to the given number of iterations are performed.
+ If successful, _asso_values[c] is changed, _fewest_collisions is decreased,
+ and false is returned.
+ If all iterations are unsuccessful, _asso_values[c] is restored and
+ true is returned.
+ This is called very frequently, and needs to be fast! */
inline bool
-Search::affects_prev (unsigned char c, KeywordExt *curr)
+Search::try_asso_value (unsigned char c, KeywordExt *curr, int iterations)
{
- int original_char = _asso_values[c];
- int total_iterations = !option[FAST]
- ? get_asso_max () : option.get_iterations () ? option.get_iterations () : keyword_list_length ();
-
- /* Try all valid associated values. */
+ int original_value = _asso_values[c];
- for (int i = total_iterations - 1; i >= 0; i--)
+ /* Try many valid associated values. */
+ for (int i = iterations - 1; i >= 0; i--)
{
int collisions = 0;
+ /* Try next value. Wrap around mod _asso_value_max. */
_asso_values[c] =
(_asso_values[c] + (option.get_jump () ? option.get_jump () : rand ()))
- & (get_asso_max () - 1);
+ & (_asso_value_max - 1);
- /* Iteration Number array is a win, O(1) intialization time! */
+ /* Iteration Number array is a win, O(1) intialization time! */
_collision_detector->clear ();
- /* See how this asso_value change affects previous keywords. If
- it does better than before we'll take it! */
-
for (KeywordExt_List *ptr = _head; ; ptr = ptr->rest())
{
KeywordExt *keyword = ptr->first();
+
+ /* Compute new hash code for the keyword, and see whether it
+ collides with another keyword's hash code. If we have too
+ many collisions, we can safely abort the fruitless loop. */
if (_collision_detector->set_bit (compute_hash (keyword))
&& ++collisions >= _fewest_collisions)
break;
+
if (keyword == curr)
{
_fewest_collisions = collisions;
if (option[DEBUG])
- fprintf (stderr, "- resolved after %d iterations", total_iterations - i);
+ fprintf (stderr, "- resolved after %d iterations",
+ iterations - i);
return false;
}
}
}
- /* Restore original values, no more tries. */
- _asso_values[c] = original_char;
- /* If we're this far it's time to try the next character.... */
+ /* Restore original values, no more tries. */
+ _asso_values[c] = original_value;
return true;
}
-/* Change a character value, try least-used characters first. */
+/* Attempts to change an _asso_value[], in order to resolve a hash value
+ collision between the two given keywords. */
void
-Search::change (KeywordExt *prior, KeywordExt *curr)
+Search::change_some_asso_value (KeywordExt *prior, KeywordExt *curr)
{
- static unsigned char *union_set;
- int union_set_length;
-
- if (!union_set)
- union_set = new unsigned char [2 * get_max_keysig_size ()];
-
if (option[DEBUG])
{
fprintf (stderr, "collision on keyword #%d, prior = \"%.*s\", curr = \"%.*s\" hash = %d\n",
@@ -538,26 +606,50 @@ Search::change (KeywordExt *prior, KeywordExt *curr)
curr->_hash_value);
fflush (stderr);
}
- union_set_length = compute_disjoint_union (prior->_selchars, prior->_selchars_length, curr->_selchars, curr->_selchars_length, union_set);
+
+ /* To achieve that the two hash values become different, we have to
+ change an _asso_values[c] for a character c that contributes to the
+ hash functions of prior and curr with different multiplicity.
+ So we compute the set of such c. */
+ unsigned char *union_set = _union_set;
+ int union_set_length =
+ compute_disjoint_union (prior->_selchars, prior->_selchars_length,
+ curr->_selchars, curr->_selchars_length,
+ union_set);
+
+ /* Sort by decreasing occurrence: Try least-used characters c first.
+ The idea is that this reduces the number of freshly introduced
+ collisions. */
sort_by_occurrence (union_set, union_set_length);
- /* Try changing some values, if change doesn't alter other values continue normal action. */
- _fewest_collisions++;
+ int iterations =
+ !option[FAST]
+ ? _asso_value_max /* Try all possible values of _asso_values[c]. */
+ : option.get_iterations ()
+ ? option.get_iterations ()
+ : keyword_list_length ();
const unsigned char *p = union_set;
int i = union_set_length;
for (; i > 0; p++, i--)
- if (!affects_prev (*p, curr))
+ if (!try_asso_value (*p, curr, iterations))
{
+ /* Good, this _asso_values[] modification reduces the number of
+ collisions so far.
+ All keyword->_hash_value up to curr - inclusive - and
+ _fewest_collisions have been updated. */
if (option[DEBUG])
{
fprintf (stderr, " by changing asso_value['%c'] (char #%d) to %d\n",
*p, p - union_set + 1, _asso_values[*p]);
fflush (stderr);
}
- return; /* Good, doesn't affect previous hash values, we'll take it. */
+ return;
}
+ /* Failed to resolve a collision. */
+
+ /* Recompute all keyword->_hash_value up to curr - inclusive -. */
for (KeywordExt_List *ptr = _head; ; ptr = ptr->rest())
{
KeywordExt* keyword = ptr->first();
@@ -569,91 +661,78 @@ Search::change (KeywordExt *prior, KeywordExt *curr)
if (option[DEBUG])
{
fprintf (stderr, "** collision not resolved after %d iterations, %d duplicates remain, continuing...\n",
- !option[FAST] ? get_asso_max () : option.get_iterations () ? option.get_iterations () : keyword_list_length (),
- _fewest_collisions + _total_duplicates);
+ iterations, _fewest_collisions + _total_duplicates);
fflush (stderr);
}
}
+/* Finds good _asso_values[]. */
+
+void
+Search::find_asso_values ()
+{
+ _fewest_collisions = 0;
+ init_asso_values ();
+
+ /* Add one keyword after the other and see whether its hash value collides
+ with one of the previous hash values. */
+ _num_done = 1;
+ for (KeywordExt_List *curr_ptr = _head;
+ curr_ptr != NULL;
+ curr_ptr = curr_ptr->rest(), _num_done++)
+ {
+ KeywordExt *curr = curr_ptr->first();
+
+ /* Compute this keyword's hash value. */
+ compute_hash (curr);
+
+ /* See if it collides with a prior keyword. */
+ for (KeywordExt_List *prior_ptr = _head;
+ prior_ptr != curr_ptr;
+ prior_ptr = prior_ptr->rest())
+ {
+ KeywordExt *prior = prior_ptr->first();
+
+ if (prior->_hash_value == curr->_hash_value)
+ {
+ _fewest_collisions++;
+ /* Handle collision. */
+ change_some_asso_value (prior, curr);
+ break;
+ }
+ }
+ }
+}
+
+/* ------------------------------------------------------------------------- */
+
/* Sorts the keys by hash value. */
void
Search::sort ()
{
- _hash_sort = true;
+ _hash_sort = true;
_occurrence_sort = false;
-
_head = merge_sort (_head);
}
void
Search::optimize ()
{
+ /* Preparations. */
prepare ();
if (option[ORDER])
reorder ();
- _num_done = 1;
- _fewest_collisions = 0;
- int asso_value_max = option.get_size_multiple ();
- int non_linked_length = keyword_list_length ();
- if (asso_value_max == 0)
- asso_value_max = non_linked_length;
- else if (asso_value_max > 0)
- asso_value_max *= non_linked_length;
- else /* if (asso_value_max < 0) */
- asso_value_max = non_linked_length / -asso_value_max;
- set_asso_max (POW (asso_value_max));
-
- if (option[RANDOM])
- {
- srand (reinterpret_cast<long>(time (0)));
-
- for (int i = 0; i < _alpha_size; i++)
- _asso_values[i] = rand () & (asso_value_max - 1);
- }
- else
- {
- int asso_value = option.get_initial_asso_value ();
-
- if (asso_value) /* Initialize array if user requests non-zero default. */
- for (int i = _alpha_size - 1; i >= 0; i--)
- _asso_values[i] = asso_value & get_asso_max () - 1;
- }
- _max_hash_value = max_key_length () + get_asso_max () * get_max_keysig_size ();
- _collision_detector = new Bool_Array (_max_hash_value + 1);
-
- if (option[DEBUG])
- fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
- "\nmaximum size of generated hash table is %d\n",
- non_linked_length, asso_value_max, _max_hash_value);
- KeywordExt_List *curr;
- for (curr = _head; curr != NULL; curr = curr->rest())
- {
- KeywordExt *currkw = curr->first();
-
- compute_hash (currkw);
-
- for (KeywordExt_List *ptr = _head; ptr != curr; ptr = ptr->rest())
- {
- KeywordExt *ptrkw = ptr->first();
-
- if (ptrkw->_hash_value == currkw->_hash_value)
- {
- change (ptrkw, currkw);
- break;
- }
- }
- _num_done++;
- }
+ /* Search for good _asso_values[]. */
+ find_asso_values ();
/* Make one final check, just to make sure nothing weird happened.... */
-
_collision_detector->clear ();
-
- for (curr = _head; curr; curr = curr->rest())
+ for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
{
- unsigned int hashcode = compute_hash (curr->first());
+ KeywordExt *curr = curr_ptr->first();
+ unsigned int hashcode = compute_hash (curr);
if (_collision_detector->set_bit (hashcode))
{
if (option[DUP]) /* Keep track of this number... */
@@ -673,7 +752,7 @@ Search::optimize ()
sort ();
}
-/* Prints out some diagnostics upon completion. */
+/* Prints out some diagnostics upon completion. */
Search::~Search ()
{
diff --git a/src/search.h b/src/search.h
index bd7942c..51fbd4f 100644
--- a/src/search.h
+++ b/src/search.h
@@ -62,15 +62,27 @@ private:
/* Returns the number of key positions. */
int get_max_keysig_size ();
+ /* Initializes the asso_values[] related parameters and put a first guess
+ into asso_values[]. */
+ void init_asso_values ();
+
/* Computes a keyword's hash value, relative to the current _asso_values[],
and stores it in keyword->_hash_value. */
- int compute_hash (KeywordExt *key_node);
+ int compute_hash (KeywordExt *keyword);
/* Sorts the given set in increasing frequency of _occurrences[]. */
void sort_by_occurrence (unsigned char *set, int len);
- bool affects_prev (unsigned char c, KeywordExt *curr);
- void change (KeywordExt *prior, KeywordExt *curr);
+ /* Tries various other values for _asso_values[c]. */
+ bool try_asso_value (unsigned char c, KeywordExt *curr, int iterations);
+
+ /* Attempts to change an _asso_value[], in order to resolve a hash value
+ collision between the two given keywords. */
+ void change_some_asso_value (KeywordExt *prior, KeywordExt *curr);
+
+ /* Finds good _asso_values[]. */
+ void find_asso_values ();
+
void sort ();
public:
@@ -114,13 +126,23 @@ private:
/* Vector used during Search::reorder(). */
bool * const _determined;
- int _num_done; /* Number of keywords processed without a collision. */
- int _fewest_collisions; /* Records fewest # of collisions for asso value. */
- int _max_hash_value; /* Maximum possible hash value. */
+ /* Exclusive upper bound for every _asso_values[c]. A power of 2. */
+ int _asso_value_max;
+
+ /* Maximal possible hash value. */
+ int _max_hash_value;
+
+ /* Sparse bit vector for collision detection. */
Bool_Array * _collision_detector;
- int _size; /* Range of the hash table. */
- void set_asso_max (int r) { _size = r; }
- int get_asso_max () { return _size; }
+
+ /* Minimal number of collisions found so far. */
+ int _fewest_collisions;
+
+ /* Scratch set, used during Search::change_some_asso_value. */
+ unsigned char * _union_set;
+
+ /* Number of keyword being handled during Search::find_asso_values. */
+ int _num_done;
};
#endif