summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorBruno Haible <bruno@clisp.org>2003-03-18 10:22:37 +0000
committerBruno Haible <bruno@clisp.org>2003-03-18 10:22:37 +0000
commit6d268d095b4cb2bab821e82b785a4b74810b01a6 (patch)
tree5b3f7f60909fc6a7d5f4697c9d4ae58f947f0a7a /src
parent40f37680ac107a61bbe14efe6213f70c91c6b461 (diff)
downloadgperf-6d268d095b4cb2bab821e82b785a4b74810b01a6.tar.gz
Completely new asso_values search algorithm.
Diffstat (limited to 'src')
-rw-r--r--src/keyword-list.cc2
-rw-r--r--src/keyword-list.h2
-rw-r--r--src/keyword.h1
-rw-r--r--src/options.cc40
-rw-r--r--src/options.h53
-rw-r--r--src/options.icc7
-rw-r--r--src/search.cc1073
-rw-r--r--src/search.h39
8 files changed, 536 insertions, 681 deletions
diff --git a/src/keyword-list.cc b/src/keyword-list.cc
index 4f27117..d4b55f9 100644
--- a/src/keyword-list.cc
+++ b/src/keyword-list.cc
@@ -35,7 +35,7 @@ Keyword_List::Keyword_List (Keyword *car)
/* ------------------------- KeywordExt_List class ------------------------- */
-/* Unused constructor. */
+/* Constructor. */
KeywordExt_List::KeywordExt_List (KeywordExt *car)
: Keyword_List (car)
{
diff --git a/src/keyword-list.h b/src/keyword-list.h
index 664f7de..d371022 100644
--- a/src/keyword-list.h
+++ b/src/keyword-list.h
@@ -48,7 +48,7 @@ protected:
class KeywordExt_List : public Keyword_List
{
public:
- /* Unused constructor. */
+ /* Constructor. */
KeywordExt_List (KeywordExt *car);
/* Access to first element of list. */
diff --git a/src/keyword.h b/src/keyword.h
index de89730..02e0364 100644
--- a/src/keyword.h
+++ b/src/keyword.h
@@ -75,7 +75,6 @@ struct KeywordExt : public Keyword
void delete_selchars ();
/* Data members used by the algorithm. */
- int _occurrence; /* Frequency of key set occurrences. */
int _hash_value; /* Hash value for the keyword. */
/* Data members used by the output routines. */
diff --git a/src/options.cc b/src/options.cc
index e5428e5..88dacf2 100644
--- a/src/options.cc
+++ b/src/options.cc
@@ -196,13 +196,6 @@ Options::long_usage (FILE * stream) const
" -D, --duplicates Handle keywords that hash to duplicate values. This\n"
" is useful for certain highly redundant keyword sets.\n");
fprintf (stream,
- " -f, --fast=ITERATIONS Generate the gen-perf.hash function \"fast\". This\n"
- " decreases gperf's running time at the cost of\n"
- " minimizing generated table size. The numeric\n"
- " argument represents the number of times to iterate\n"
- " when resolving a collision. '0' means \"iterate by\n"
- " the number of keywords\".\n");
- fprintf (stream,
" -m, --multiple-iterations=ITERATIONS\n"
" Perform multiple choices of the -i and -j values,\n"
" and choose the best results. This increases the\n"
@@ -221,10 +214,6 @@ Options::long_usage (FILE * stream) const
" -n, --no-strlen Do not include the length of the keyword when\n"
" computing the hash function.\n");
fprintf (stream,
- " -o, --occurrence-sort Reorders input keys by frequency of occurrence of\n"
- " the key sets. This should decrease the search time\n"
- " dramatically.\n");
- fprintf (stream,
" -r, --random Utilizes randomness to initialize the associated\n"
" values table.\n");
fprintf (stream,
@@ -429,7 +418,6 @@ Options::Options ()
_input_file_name (NULL),
_output_file_name (NULL),
_language (NULL),
- _iterations (0),
_jump (DEFAULT_JUMP_VALUE),
_initial_asso_value (0),
_asso_iterations (0),
@@ -454,14 +442,12 @@ Options::~Options ()
{
fprintf (stderr, "\ndumping Options:"
"\nDEBUG is.......: %s"
- "\nORDER is.......: %s"
"\nTYPE is........: %s"
"\nRANDOM is......: %s"
"\nSWITCH is......: %s"
"\nNOLENGTH is....: %s"
"\nLENTABLE is....: %s"
"\nDUP is.........: %s"
- "\nFAST is........: %s"
"\nCOMP is........: %s"
"\nNOTYPE is......: %s"
"\nGLOBAL is......: %s"
@@ -473,8 +459,6 @@ Options::~Options ()
"\nENUM is........: %s"
"\nINCLUDE is.....: %s"
"\nSEVENBIT is....: %s"
- "\nOPT_CHOICE is..: %s"
- "\niterations = %d"
"\nlookup function name = %s"
"\nhash function name = %s"
"\nword list name = %s"
@@ -487,14 +471,12 @@ Options::~Options ()
"\ndelimiters = %s"
"\nnumber of switch statements = %d\n",
_option_word & DEBUG ? "enabled" : "disabled",
- _option_word & ORDER ? "enabled" : "disabled",
_option_word & TYPE ? "enabled" : "disabled",
_option_word & RANDOM ? "enabled" : "disabled",
_option_word & SWITCH ? "enabled" : "disabled",
_option_word & NOLENGTH ? "enabled" : "disabled",
_option_word & LENTABLE ? "enabled" : "disabled",
_option_word & DUP ? "enabled" : "disabled",
- _option_word & FAST ? "enabled" : "disabled",
_option_word & COMP ? "enabled" : "disabled",
_option_word & NOTYPE ? "enabled" : "disabled",
_option_word & GLOBAL ? "enabled" : "disabled",
@@ -506,8 +488,6 @@ Options::~Options ()
_option_word & ENUM ? "enabled" : "disabled",
_option_word & INCLUDE ? "enabled" : "disabled",
_option_word & SEVENBIT ? "enabled" : "disabled",
- _option_word & OPT_CHOICE ? "enabled" : "disabled",
- _iterations,
_function_name, _hash_name, _wordlist_name, _slot_name,
_initializer_suffix, _asso_iterations, _jump, _size_multiple,
_initial_asso_value, _delimiters, _total_switches);
@@ -711,15 +691,7 @@ Options::parse_options (int argc, char *argv[])
break;
}
case 'f': /* Generate the hash table "fast". */
- {
- _option_word |= FAST;
- if ((_iterations = atoi (/*getopt*/optarg)) < 0)
- {
- fprintf (stderr, "iterations value must not be negative, assuming 0\n");
- _iterations = 0;
- }
- break;
- }
+ break; /* Not needed any more. */
case 'F':
{
_initializer_suffix = /*getopt*/optarg;
@@ -861,15 +833,9 @@ Options::parse_options (int argc, char *argv[])
break;
}
case 'o': /* Order input by frequency of key set occurrence. */
- {
- _option_word |= ORDER;
- break;
- }
+ break; /* Not needed any more. */
case 'O': /* Optimized choice during collision resolution. */
- {
- _option_word |= OPT_CHOICE;
- break;
- }
+ break; /* Not needed any more. */
case 'p': /* Generated lookup function a pointer instead of int. */
break; /* This is now the default. */
case 'r': /* Utilize randomness to initialize the associated values table. */
diff --git a/src/options.h b/src/options.h
index bad6966..9d37201 100644
--- a/src/options.h
+++ b/src/options.h
@@ -39,72 +39,63 @@ enum Option_Type
/* Enable debugging (prints diagnostics to stderr). */
DEBUG = 1 << 0,
- /* Apply ordering heuristic to speed-up search time. */
- ORDER = 1 << 1,
-
/* Use the given key positions. */
- POSITIONS = 1 << 2,
+ POSITIONS = 1 << 1,
/* Use all characters in hash function. */
- ALLCHARS = 1 << 3,
+ ALLCHARS = 1 << 2,
/* Handle user-defined type structured keyword input. */
- TYPE = 1 << 4,
+ TYPE = 1 << 3,
/* Randomly initialize the associated values table. */
- RANDOM = 1 << 5,
+ RANDOM = 1 << 4,
/* Generate switch output to save space. */
- SWITCH = 1 << 6,
+ SWITCH = 1 << 5,
/* Don't include keyword length in hash computations. */
- NOLENGTH = 1 << 7,
+ NOLENGTH = 1 << 6,
/* Generate a length table for string comparison. */
- LENTABLE = 1 << 8,
+ LENTABLE = 1 << 7,
/* Handle duplicate hash values for keywords. */
- DUP = 1 << 9,
-
- /* Generate the hash function "fast". */
- FAST = 1 << 10,
+ DUP = 1 << 8,
/* Don't include user-defined type definition in output -- it's already
defined elsewhere. */
- NOTYPE = 1 << 11,
+ NOTYPE = 1 << 9,
/* Generate strncmp rather than strcmp. */
- COMP = 1 << 12,
+ COMP = 1 << 10,
/* Make the keyword table a global variable. */
- GLOBAL = 1 << 13,
+ GLOBAL = 1 << 11,
/* Make the generated tables readonly (const). */
- CONST = 1 << 14,
+ CONST = 1 << 12,
/* Generate K&R C code: no prototypes, no const. */
- KRC = 1 << 15,
+ KRC = 1 << 13,
/* Generate C code: no prototypes, but const (user can #define it away). */
- C = 1 << 16,
+ C = 1 << 14,
/* Generate ISO/ANSI C code: prototypes and const, but no class. */
- ANSIC = 1 << 17,
+ ANSIC = 1 << 15,
/* Generate C++ code: prototypes, const, class, inline, enum. */
- CPLUSPLUS = 1 << 18,
+ CPLUSPLUS = 1 << 16,
/* Use enum for constants. */
- ENUM = 1 << 19,
+ ENUM = 1 << 17,
/* Generate #include statements. */
- INCLUDE = 1 << 20,
+ INCLUDE = 1 << 18,
/* Assume 7-bit, not 8-bit, characters. */
- SEVENBIT = 1 << 21,
-
- /* Apply optimized collision resolution to speed-up search time. */
- OPT_CHOICE = 1 << 22
+ SEVENBIT = 1 << 19
};
/* Class manager for gperf program Options. */
@@ -140,9 +131,6 @@ public:
/* Sets the output language, if not already set. */
void set_language (const char *language);
- /* Returns the iterations value. */
- int get_iterations () const;
-
/* Returns the jump value. */
int get_jump () const;
@@ -222,9 +210,6 @@ private:
/* The output language. */
const char * _language;
- /* Amount to iterate when a collision occurs. */
- int _iterations;
-
/* Jump length when trying alternative values. */
int _jump;
diff --git a/src/options.icc b/src/options.icc
index c54af90..a9295a0 100644
--- a/src/options.icc
+++ b/src/options.icc
@@ -51,13 +51,6 @@ Options::get_output_file_name () const
return _output_file_name;
}
-/* Returns the iterations value. */
-INLINE int
-Options::get_iterations () const
-{
- return _iterations;
-}
-
/* Returns the jump value. */
INLINE int
Options::get_jump () const
diff --git a/src/search.cc b/src/search.cc
index 705827e..8fc1f7c 100644
--- a/src/search.cc
+++ b/src/search.cc
@@ -634,140 +634,6 @@ Search::prepare ()
/* Memory allocation. */
_asso_values = new int[_alpha_size];
- _determined = new bool[_alpha_size];
-}
-
-/* ---------------- Reordering the Keyword list (optional) ----------------- */
-
-/* Computes the sum of occurrences of the _selchars of a keyword.
- This is a kind of correlation measure: Keywords which have many
- selected characters in common with other keywords have a high
- occurrence sum. Keywords whose selected characters don't occur
- in other keywords have a low occurrence sum. */
-
-inline int
-Search::compute_occurrence (KeywordExt *ptr) const
-{
- int value = 0;
-
- const unsigned int *p = ptr->_selchars;
- unsigned int i = ptr->_selchars_length;
- for (; i > 0; p++, i--)
- value += _occurrences[*p];
-
- return value;
-}
-
-/* Comparison function for sorting by decreasing _occurrence valuation. */
-static bool
-greater_by_occurrence (KeywordExt *keyword1, KeywordExt *keyword2)
-{
- return keyword1->_occurrence > keyword2->_occurrence;
-}
-
-/* Auxiliary function for reorder():
- Sets all alphabet characters as undetermined. */
-
-inline void
-Search::clear_determined ()
-{
- memset (_determined, 0, _alpha_size * sizeof (_determined[0]));
-}
-
-/* Auxiliary function for reorder():
- Sets all selected characters of the keyword as determined. */
-
-inline void
-Search::set_determined (KeywordExt *keyword)
-{
- const unsigned int *p = keyword->_selchars;
- unsigned int i = keyword->_selchars_length;
- for (; i > 0; p++, i--)
- _determined[*p] = true;
-}
-
-/* Auxiliary function for reorder():
- Returns true if the keyword's selected characters are all determined. */
-
-inline bool
-Search::already_determined (KeywordExt *keyword) const
-{
- const unsigned int *p = keyword->_selchars;
- unsigned int i = keyword->_selchars_length;
- for (; i > 0; p++, i--)
- if (!_determined[*p])
- return false;
-
- return true;
-}
-
-/* Reorders the keyword list so as to minimize search times.
- First the list is reordered so that frequently occuring keys appear first.
- Then the list is reordered so that keys whose values are already determined
- will be placed towards the front of the list. This helps prune the search
- time by handling inevitable collisions early in the search process. See
- Cichelli's paper from Jan 1980 JACM for details.... */
-
-void
-Search::reorder ()
-{
- KeywordExt_List *ptr;
-
- /* Compute the _occurrence valuation of every keyword on the list. */
- for (ptr = _head; ptr; ptr = ptr->rest())
- {
- KeywordExt *keyword = ptr->first();
-
- keyword->_occurrence = compute_occurrence (keyword);
- }
-
- /* Sort the list by decreasing _occurrence valuation. */
- _head = mergesort_list (_head, greater_by_occurrence);
-
- /* Reorder the list to maximize the efficiency of the search. */
-
- /* At the beginning, consider that no asso_values[c] is fixed. */
- clear_determined ();
- for (ptr = _head; ptr != NULL && ptr->rest() != NULL; ptr = ptr->rest())
- {
- KeywordExt *keyword = ptr->first();
-
- /* Then we'll fix asso_values[c] for all c occurring in this keyword. */
- set_determined (keyword);
-
- /* Then we wish to test for hash value collisions the remaining keywords
- whose hash value is completely determined, as quickly as possible.
- For this purpose, move all the completely determined keywords in the
- remaining list immediately past this keyword. */
- KeywordExt_List *curr_ptr;
- KeywordExt_List *next_ptr; /* = curr_ptr->rest() */
- for (curr_ptr = ptr, next_ptr = curr_ptr->rest();
- next_ptr != NULL;
- next_ptr = curr_ptr->rest())
- {
- KeywordExt *next_keyword = next_ptr->first();
-
- if (already_determined (next_keyword))
- {
- if (curr_ptr == ptr)
- /* Keep next_ptr where it is. */
- curr_ptr = next_ptr;
- else
- {
- /* Remove next_ptr from its current list position... */
- curr_ptr->rest() = next_ptr->rest();
- /* ... and insert it right after ptr. */
- next_ptr->rest() = ptr->rest();
- ptr->rest() = next_ptr;
- }
-
- /* Advance ptr. */
- ptr = ptr->rest();
- }
- else
- curr_ptr = next_ptr;
- }
- }
}
/* ------------------------------------------------------------------------- */
@@ -873,515 +739,585 @@ Search::prepare_asso_values ()
_jump = option.get_jump ();
}
-/* Puts a first guess into asso_values[]. */
+/* Finds some _asso_values[] that fit. */
-void
-Search::init_asso_values ()
+/* The idea is to choose the _asso_values[] one by one, in a way that
+ a choice that has been made never needs to be undone later. This
+ means that we split the work into several steps. Each step chooses
+ one or more _asso_values[c]. The result of choosing one or more
+ _asso_values[c] is that the partitioning of the keyword set gets
+ broader.
+ Look at this partitioning: After every step, the _asso_values[] of a
+ certain set C of characters are undetermined. (At the beginning, C
+ is the set of characters c with _occurrences[c] > 0. At the end, C
+ is empty.) To each keyword K, we associate the multiset of _selchars
+ for which the _asso_values[] are undetermined:
+ K --> K->_selchars intersect C.
+ Consider two keywords equivalent if their value under this mapping is
+ the same. This introduces an equivalence relation on the set of
+ keywords. The equivalence classes partition the keyword set. (At the
+ beginning, the partition is the finest possible: each K is an equivalence
+ class by itself, because all K have a different _selchars. At the end,
+ all K have been merged into a single equivalence class.)
+ The partition before a step is always a refinement of the partition
+ after the step.
+ We choose the steps in such a way that the partition really becomes
+ broader at each step. (A step that only chooses an _asso_values[c]
+ without changing the partition is better merged with the previous step,
+ to avoid useless backtracking.) */
+
+struct EquivalenceClass
{
- if (_initial_asso_value < 0)
- {
- for (unsigned int i = 0; i < _alpha_size; i++)
- _asso_values[i] = rand () & (_asso_value_max - 1);
- }
- else
- {
- int asso_value = _initial_asso_value;
-
- asso_value = asso_value & (_asso_value_max - 1);
- for (unsigned int i = 0; i < _alpha_size; i++)
- _asso_values[i] = asso_value;
- }
-}
-
-/* Computes a keyword's hash value, relative to the current _asso_values[],
- and stores it in keyword->_hash_value.
- This is called very frequently, and needs to be fast! */
+ /* The keywords in this equivalence class. */
+ KeywordExt_List * _keywords;
+ KeywordExt_List * _keywords_last;
+ /* The number of keywords in this equivalence class. */
+ unsigned int _cardinality;
+ /* The undetermined selected characters for the keywords in this
+ equivalence class, as a canonically reordered multiset. */
+ unsigned int * _undetermined_chars;
+ unsigned int _undetermined_chars_length;
+
+ EquivalenceClass * _next;
+};
-inline int
-Search::compute_hash (KeywordExt *keyword) const
+struct Step
{
- int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
-
- const unsigned int *p = keyword->_selchars;
- int i = keyword->_selchars_length;
- for (; i > 0; p++, i--)
- sum += _asso_values[*p];
-
- return keyword->_hash_value = sum;
-}
-
-/* Computes the disjoint union of two multisets of characters, i.e.
- the set of characters that are contained with a different multiplicity
- in set_1 and set_2. This includes those characters which are contained
- in one of the sets but not both.
- Both sets set_1[0..size_1-1] and set_2[0..size_2-1] are given ordered.
- The result, an ordered set (not multiset!) is stored in set_3[0...].
- Returns the size of the resulting set. */
+ /* The characters whose values are being determined in this step. */
+ unsigned int _changing_count;
+ unsigned int * _changing;
+ /* Exclusive upper bound for the _asso_values[c] of this step.
+ A power of 2. */
+ unsigned int _asso_value_max;
+ /* The characters whose values will be determined after this step. */
+ bool * _undetermined;
+ /* The keyword set partition after this step. */
+ EquivalenceClass * _partition;
+ /* The expected number of iterations in this step. */
+ double _expected_lower;
+ double _expected_upper;
+
+ Step * _next;
+};
-inline int
-compute_disjoint_union (const unsigned int *set_1, int size_1,
- const unsigned int *set_2, int size_2,
- unsigned int *set_3)
+static inline bool
+equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
{
- unsigned int *base = set_3;
-
- while (size_1 > 0 && size_2 > 0)
- if (*set_1 == *set_2)
- {
- set_1++, size_1--;
- set_2++, size_2--;
- }
- else
- {
- unsigned int next;
- if (*set_1 < *set_2)
- next = *set_1++, size_1--;
- else
- next = *set_2++, size_2--;
- if (set_3 == base || next != set_3[-1])
- *set_3++ = next;
- }
-
- while (size_1 > 0)
- {
- unsigned int next;
- next = *set_1++, size_1--;
- if (set_3 == base || next != set_3[-1])
- *set_3++ = next;
- }
-
- while (size_2 > 0)
+ while (len > 0)
{
- unsigned int next;
- next = *set_2++, size_2--;
- if (set_3 == base || next != set_3[-1])
- *set_3++ = next;
+ if (*ptr1 != *ptr2)
+ return false;
+ ptr1++;
+ ptr2++;
+ len--;
}
- return set_3 - base;
+ return true;
}
-/* Sorts the given set in increasing frequency of _occurrences[]. */
-
-inline void
-Search::sort_by_occurrence (unsigned int *set, unsigned int len) const
+EquivalenceClass *
+Search::compute_partition (bool *undetermined) const
{
- /* Use bubble sort, since the set is typically short. */
- for (unsigned int i = 1; i < len; i++)
+ EquivalenceClass *partition = NULL;
+ EquivalenceClass *partition_last = NULL;
+ for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
{
- unsigned int j;
- unsigned int tmp;
+ KeywordExt *keyword = temp->first();
- for (j = i, tmp = set[j];
- j > 0 && _occurrences[tmp] < _occurrences[set[j-1]];
- j--)
- set[j] = set[j - 1];
+ /* Compute the undetermined characters for this keyword. */
+ unsigned int *undetermined_chars =
+ new unsigned int[keyword->_selchars_length];
+ unsigned int undetermined_chars_length = 0;
+
+ for (int i = 0; i < keyword->_selchars_length; i++)
+ if (undetermined[keyword->_selchars[i]])
+ undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
+
+ /* Look up the equivalence class to which this keyword belongs. */
+ EquivalenceClass *equclass;
+ for (equclass = partition; equclass; equclass = equclass->_next)
+ if (equclass->_undetermined_chars_length == undetermined_chars_length
+ && equals (equclass->_undetermined_chars, undetermined_chars,
+ undetermined_chars_length))
+ break;
+ if (equclass == NULL)
+ {
+ equclass = new EquivalenceClass();
+ equclass->_keywords = NULL;
+ equclass->_keywords_last = NULL;
+ equclass->_cardinality = 0;
+ equclass->_undetermined_chars = undetermined_chars;
+ equclass->_undetermined_chars_length = undetermined_chars_length;
+ equclass->_next = NULL;
+ if (partition)
+ partition_last->_next = equclass;
+ else
+ partition = equclass;
+ partition_last = equclass;
+ }
+ else
+ delete[] undetermined_chars;
- set[j] = tmp;
+ /* Add the keyword to the equivalence class. */
+ KeywordExt_List *cons = new KeywordExt_List(keyword);
+ if (equclass->_keywords)
+ equclass->_keywords_last->rest() = cons;
+ else
+ equclass->_keywords = cons;
+ equclass->_keywords_last = cons;
+ equclass->_cardinality++;
}
+
+ /* Free some of the allocated memory. The caller doesn't need it. */
+ for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
+ delete[] cls->_undetermined_chars;
+
+ return partition;
}
-/* Computes the frequency of occurrence of a character among the keywords
- up to the given keyword. */
-inline unsigned int
-Search::compute_occurrence (unsigned int c, KeywordExt *curr) const
+static void
+delete_partition (EquivalenceClass *partition)
{
- unsigned int occurrence = 0;
-
- for (KeywordExt_List *temp = _head; ; temp = temp->rest())
+ while (partition != NULL)
{
- KeywordExt *keyword = temp->first();
-
- int m = keyword->_selchars_length;
- for (int i = 0; i < m; i++)
- if (keyword->_selchars[i] == c)
- {
- occurrence++;
- break;
- }
-
- if (keyword == curr)
- break;
+ EquivalenceClass *equclass = partition;
+ partition = equclass->_next;
+ delete_list (equclass->_keywords);
+ //delete[] equclass->_undetermined_chars; // already freed above
+ delete equclass;
}
-
- return occurrence;
}
-/* Sorts the given set in increasing frequency of occurrences among the
- keywords up to the given keyword. */
-
-inline void
-Search::sort_by_occurrence (unsigned int *set, unsigned int len, KeywordExt *curr) const
+/* Compute the possible number of collisions when _asso_values[c] is
+ chosen, leading to the given partition. */
+unsigned int
+Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
{
- unsigned int occurrences[len];
-
- for (unsigned int j = 0; j < len; j++)
- occurrences[j] = 0;
- for (KeywordExt_List *temp = _head; ; temp = temp->rest())
+ /* Every equivalence class p is split according to the frequency of
+ occurrence of c, leading to equivalence classes p1, p2, ...
+ This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions.
+ Return the sum of this expression over all equivalence classes. */
+ unsigned int sum = 0;
+ unsigned int m = get_max_keysig_size();
+ unsigned int split_cardinalities[m+1];
+ for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
{
- KeywordExt *keyword = temp->first();
+ for (unsigned int i = 0; i <= m; i++)
+ split_cardinalities[i] = 0;
- int m = keyword->_selchars_length;
- for (unsigned int j = 0; j < len; j++)
+ for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
{
- unsigned int c = set[j];
+ KeywordExt *keyword = temp->first();
- for (int i = 0; i < m; i++)
+ unsigned int count = 0;
+ for (int i = 0; i < keyword->_selchars_length; i++)
if (keyword->_selchars[i] == c)
- {
- occurrences[j]++;
- break;
- }
+ count++;
+
+ split_cardinalities[count]++;
}
- if (keyword == curr)
- break;
+ sum += cls->_cardinality * cls->_cardinality;
+ for (unsigned int i = 0; i <= m; i++)
+ sum -= split_cardinalities[i] * split_cardinalities[i];
}
+ return sum;
+}
- /* Use bubble sort, since the set is typically short. */
- for (unsigned int i = 1; i < len; i++)
+/* Test whether adding c to the undetermined characters changes the given
+ partition. */
+bool
+Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
+{
+ for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
{
- unsigned int j;
- unsigned int set_tmp, occ_tmp;
+ unsigned int first_count = UINT_MAX;
- for (j = i, set_tmp = set[j], occ_tmp = occurrences[j];
- j > 0 && occ_tmp < occurrences[j-1];
- j--)
+ for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
{
- set[j] = set[j - 1];
- occurrences[j] = occurrences[j - 1];
- }
+ KeywordExt *keyword = temp->first();
+
+ unsigned int count = 0;
+ for (int i = 0; i < keyword->_selchars_length; i++)
+ if (keyword->_selchars[i] == c)
+ count++;
- set[j] = set_tmp;
- occurrences[j] = occ_tmp;
+ if (temp == cls->_keywords)
+ first_count = count;
+ else if (count != first_count)
+ /* c would split this equivalence class. */
+ return false;
+ }
}
+ return true;
}
-/* Returns true if the recomputed hash values for the keywords from
- _head->first() to curr - inclusive - give at least one collision.
- This is called very frequently, and needs to be fast! */
-bool
-Search::has_collisions (KeywordExt *curr)
+void
+Search::find_asso_values ()
{
- /* Iteration Number array is a win, O(1) initialization time! */
- _collision_detector->clear ();
+ Step *steps;
- for (KeywordExt_List *ptr = _head; ; ptr = ptr->rest())
- {
- KeywordExt *keyword = ptr->first();
+ /* Determine the steps, starting with the last one. */
+ {
+ bool *undetermined;
+ bool *determined;
- /* Compute new hash code for the keyword, and see whether it
- collides with another keyword's hash code. If we have too
- many collisions, we can safely abort the fruitless loop. */
- if (_collision_detector->set_bit (compute_hash (keyword)))
- return true;
+ steps = NULL;
- if (keyword == curr)
- return false;
- }
-}
+ undetermined = new bool[_alpha_size];
+ for (unsigned int c = 0; c < _alpha_size; c++)
+ undetermined[c] = false;
-/* Tests whether the given keyword has the same hash value as another one
- earlier in the list. If yes, this earlier keyword is returned (more
- precisely, the first one of them, but it doesn't really matter which one).
- If no collision is present, NULL is returned. */
-KeywordExt *
-Search::collision_prior_to (KeywordExt *curr)
-{
- for (KeywordExt_List *prior_ptr = _head;
- prior_ptr->first() != curr;
- prior_ptr = prior_ptr->rest())
- {
- KeywordExt *prior = prior_ptr->first();
+ determined = new bool[_alpha_size];
+ for (unsigned int c = 0; c < _alpha_size; c++)
+ determined[c] = true;
- if (prior->_hash_value == curr->_hash_value)
- return prior;
- }
- return NULL;
-}
+ for (;;)
+ {
+ /* Compute the partition that needs to be refined. */
+ EquivalenceClass *partition = compute_partition (undetermined);
+
+ /* Determine the main character to be chosen in this step.
+ Choosing such a character c has the effect of splitting every
+ equivalence class (according the the frequency of occurrence of c).
+ We choose the c with the minimum number of possible collisions,
+ so that characters which lead to a large number of collisions get
+ handled early during the search. */
+ unsigned int chosen_c;
+ unsigned int chosen_possible_collisions;
+ {
+ unsigned int best_c = 0;
+ unsigned int best_possible_collisions = UINT_MAX;
+ for (unsigned int c = 0; c < _alpha_size; c++)
+ if (_occurrences[c] > 0 && determined[c])
+ {
+ unsigned int possible_collisions =
+ count_possible_collisions (partition, c);
+ if (possible_collisions < best_possible_collisions)
+ {
+ best_c = c;
+ best_possible_collisions = possible_collisions;
+ }
+ }
+ if (best_possible_collisions == UINT_MAX)
+ {
+ /* All c with _occurrences[c] > 0 are undetermined. We are
+ are the starting situation and don't need any more step. */
+ delete_partition (partition);
+ break;
+ }
+ chosen_c = best_c;
+ chosen_possible_collisions = best_possible_collisions;
+ }
-/* Finding good asso_values is normally straightforwards, but needs
- backtracking in some cases. The recurse/backtrack depth can be at most
- _list_len. Since we cannot assume that the C stack is large enough,
- we perform the processing without recursion, and simulate the stack. */
-struct StackEntry
-{
- /* The current keyword. */
- KeywordExt * _curr;
+ /* We need one more step. */
+ Step *step = new Step();
- /* The prior keyword, with which curr collides. */
- KeywordExt * _prior;
+ step->_undetermined = new bool[_alpha_size];
+ memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
- /* Scratch set. */
- unsigned int * _union_set;
- unsigned int _union_set_length;
+ step->_partition = partition;
- /* Current index into the scratch set. */
- unsigned int _union_index;
+ /* Now determine how the equivalence classes will be before this
+ step. */
+ undetermined[chosen_c] = true;
+ partition = compute_partition (undetermined);
- /* Trying a different value for _asso_values[_c]. */
- unsigned int _c;
+ /* Now determine which other characters should be determined in this
+ step, because they will not change the equivalence classes at
+ this point. It is the set of all c which, for all equivalence
+ classes, have the same frequency of occurrence in every keyword
+ of the equivalence class. */
+ for (unsigned int c = 0; c < _alpha_size; c++)
+ if (_occurrences[c] > 0 && determined[c]
+ && unchanged_partition (partition, c))
+ {
+ undetermined[c] = true;
+ determined[c] = false;
+ }
- /* The original value of _asso_values[_c]. */
- unsigned int _original_asso_value;
+ /* main_c must be one of these. */
+ if (determined[chosen_c])
+ abort ();
- /* Remaining number of iterations. */
- int _iter;
-};
+ /* Now the set of changing characters of this step. */
+ unsigned int changing_count;
-/* Finds some _asso_values[] that fit. */
+ changing_count = 0;
+ for (unsigned int c = 0; c < _alpha_size; c++)
+ if (undetermined[c] && !step->_undetermined[c])
+ changing_count++;
-void
-Search::find_asso_values ()
-{
- /* Add one keyword after the other and see whether its hash value collides
- with one of the previous hash values. If so, change some asso_values[]
- entry until the number of collisions so far is reduced. Then continue
- with the next keyword. */
-
- init_asso_values ();
-
- int iterations =
- !option[FAST]
- ? _asso_value_max /* Try all possible values of _asso_values[c]. */
- : option.get_iterations ()
- ? option.get_iterations ()
- : keyword_list_length ();
-
- /* Allocate stack. */
- StackEntry *stack = new StackEntry[_list_len];
- {
- KeywordExt_List *ptr = _head;
- for (int i = 0; i < _list_len; i++, ptr = ptr->rest())
- {
- stack[i]._curr = ptr->first();
- stack[i]._union_set = new unsigned int [2 * get_max_keysig_size ()];
- }
- }
+ unsigned int *changing = new unsigned int[changing_count];
+ changing_count = 0;
+ for (unsigned int c = 0; c < _alpha_size; c++)
+ if (undetermined[c] && !step->_undetermined[c])
+ changing[changing_count++] = c;
- /* Backtracking according to the standard Prolog call pattern:
+ step->_changing = changing;
+ step->_changing_count = changing_count;
- +------------------+
- -------CALL------>| |-------RETURN------>
- | |
- <------FAIL-------| |<------REDO---------
- +------------------+
+ step->_asso_value_max = _asso_value_max;
- A CALL and RETURN increase the stack pointer, FAIL and REDO decrease it.
- */
- {
- /* Current stack pointer. */
- StackEntry *sp = &stack[0];
-
- /* Local variables corresponding to *sp. */
- /* The current keyword. */
- KeywordExt *curr;
- /* The prior keyword, with which curr collides. */
- KeywordExt *prior;
- /* Scratch set. */
- unsigned int *union_set;
- unsigned int union_set_length;
- /* Current index into the scratch set. */
- unsigned int union_index;
- /* Trying a different value for _asso_values[c]. */
- unsigned int c;
- /* The original value of _asso_values[c]. */
- unsigned int original_asso_value;
- /* Remaining number of iterations. */
- int iter;
-
- /* ==== CALL ==== */
- CALL:
-
- /* Next keyword from the list. */
- curr = sp->_curr;
-
- /* Compute this keyword's hash value. */
- compute_hash (curr);
-
- /* See if it collides with a prior keyword. */
- prior = collision_prior_to (curr);
-
- if (prior != NULL)
- {
- /* Handle collision: Attempt to change an _asso_value[], in order to
- resolve a hash value collision between the two given keywords. */
+ step->_expected_lower =
+ exp (static_cast<double>(chosen_possible_collisions)
+ / static_cast<double>(_max_hash_value));
+ step->_expected_upper =
+ exp (static_cast<double>(chosen_possible_collisions)
+ / static_cast<double>(_asso_value_max));
- if (option[DEBUG])
- {
- fprintf (stderr, "collision on keyword #%d, prior = \"%.*s\", curr = \"%.*s\" hash = %d\n",
- sp - stack + 1,
- prior->_allchars_length, prior->_allchars,
- curr->_allchars_length, curr->_allchars,
- curr->_hash_value);
- fflush (stderr);
- }
+ delete_partition (partition);
- /* To achieve that the two hash values become different, we have to
- change an _asso_values[c] for a character c that contributes to the
- hash functions of prior and curr with different multiplicity.
- So we compute the set of such c. */
- union_set = sp->_union_set;
- union_set_length =
- compute_disjoint_union (prior->_selchars, prior->_selchars_length,
- curr->_selchars, curr->_selchars_length,
- union_set);
-
- /* Sort by decreasing occurrence: Try least-used characters c first.
- The idea is that this reduces the number of freshly introduced
- collisions. */
- if (option[OPT_CHOICE])
- sort_by_occurrence (union_set, union_set_length, curr);
- else
- sort_by_occurrence (union_set, union_set_length);
+ step->_next = steps;
+ steps = step;
+ }
- for (union_index = 0; union_index < union_set_length; union_index++)
- {
- c = union_set[union_index];
+ delete[] determined;
+ delete[] undetermined;
+ }
- if (option[DEBUG])
- {
- unsigned int n = sp - stack + 1;
- unsigned int o = compute_occurrence (c, curr);
- fprintf (stderr, "Expected number of iterations between %g and %g\n",
- exp (static_cast<double>((2*n-o)*o)
- / static_cast<double>(2*_max_hash_value)),
- exp (static_cast<double>((2*n-o)*o)
- / static_cast<double>(2*_asso_value_max)));
- }
+ if (option[DEBUG])
+ {
+ unsigned int stepno = 0;
+ for (Step *step = steps; step; step = step->_next)
+ {
+ stepno++;
+ fprintf (stderr, "Step %u chooses _asso_values[", stepno);
+ for (unsigned int i = 0; i < step->_changing_count; i++)
+ {
+ if (i > 0)
+ fprintf (stderr, ",");
+ fprintf (stderr, "'%c'", step->_changing[i]);
+ }
+ fprintf (stderr, "], expected number of iterations between %g and %g.\n",
+ step->_expected_lower, step->_expected_upper);
+ fprintf (stderr, "Keyword equivalence classes:\n");
+ for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
+ {
+ fprintf (stderr, "\n");
+ for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
+ {
+ KeywordExt *keyword = temp->first();
+ fprintf (stderr, " %.*s\n",
+ keyword->_allchars_length, keyword->_allchars);
+ }
+ }
+ fprintf (stderr, "\n");
+ }
+ }
- /* Try various other values for _asso_values[c]. A value is
- successful if, with it, the recomputed hash values for the
- keywords from _head->first() to curr - inclusive - give no
- collisions. Up to the given number of iterations are performed.
- If successful, _asso_values[c] is changed, and the recursion
- continues. If all iterations are unsuccessful, _asso_values[c]
- is restored and we backtrack, trying the next union_index. */
+ /* Initialize _asso_values[]. (The value given here matters only
+ for those c which occur in all keywords with equal multiplicity.) */
+ for (unsigned int c = 0; c < _alpha_size; c++)
+ _asso_values[c] = 0;
- original_asso_value = _asso_values[c];
+ unsigned int stepno = 0;
+ for (Step *step = steps; step; step = step->_next)
+ {
+ stepno++;
- /* Try many valid associated values. */
- for (iter = iterations; iter > 0; iter--)
- {
- /* Try next value. Wrap around mod _asso_value_max. */
- _asso_values[c] =
- (_asso_values[c] + (_jump != 0 ? _jump : rand ()))
- & (_asso_value_max - 1);
+ /* Initialize the asso_values[]. */
+ unsigned int k = step->_changing_count;
+ for (unsigned int i = 0; i < k; i++)
+ {
+ unsigned int c = step->_changing[i];
+ _asso_values[c] =
+ (_initial_asso_value < 0 ? rand () : _initial_asso_value)
+ & (step->_asso_value_max - 1);
+ }
+
+ unsigned int iterations = 0;
+ unsigned int iter[k];
+ for (unsigned int i = 0; i < k; i++)
+ iter[i] = 0;
+ unsigned int ii = (_jump != 0 ? k - 1 : 0);
+
+ for (;;)
+ {
+ /* Test whether these asso_values[] lead to collisions among
+ the equivalence classes that should be collision-free. */
+ bool has_collision = false;
+ for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
+ {
+ /* Iteration Number array is a win, O(1) initialization time! */
+ _collision_detector->clear ();
- if (!has_collisions (curr))
+ for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
+ {
+ KeywordExt *keyword = ptr->first();
+
+ /* Compute the new hash code for the keyword, leaving apart
+ the yet undetermined asso_values[]. */
+ int hashcode;
{
- /* Good, this _asso_values[] modification reduces the
- number of collisions so far.
- All keyword->_hash_value up to curr - inclusive -
- have been updated. */
- if (option[DEBUG])
- {
- fprintf (stderr, "- resolved after %d iterations by "
- "changing asso_value['%c'] (char #%d) to %d\n",
- iterations - iter + 1, c,
- union_index + 1, _asso_values[c]);
- fflush (stderr);
- }
- goto RECURSE_COLLISION;
- BACKTRACK_COLLISION:
- if (option[DEBUG])
- {
- fprintf (stderr, "back to collision on keyword #%d, prior = \"%.*s\", curr = \"%.*s\" hash = %d\n",
- sp - stack + 1,
- prior->_allchars_length, prior->_allchars,
- curr->_allchars_length, curr->_allchars,
- curr->_hash_value);
- fflush (stderr);
- }
+ int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
+ const unsigned int *p = keyword->_selchars;
+ int i = keyword->_selchars_length;
+ for (; i > 0; p++, i--)
+ if (!step->_undetermined[*p])
+ sum += _asso_values[*p];
+ hashcode = sum;
}
- }
- /* Restore original values, no more tries. */
- _asso_values[c] = original_asso_value;
- }
+ /* See whether it collides with another keyword's hash code,
+ from the same equivalence class. */
+ if (_collision_detector->set_bit (hashcode))
+ {
+ has_collision = true;
+ break;
+ }
+ }
- /* Failed to resolve a collision. */
+ /* Don't need to continue looking at the other equivalence
+ classes if we already have found a collision. */
+ if (has_collision)
+ break;
+ }
- /* Recompute all keyword->_hash_value up to curr - exclusive -. */
- for (KeywordExt_List *ptr = _head; ; ptr = ptr->rest())
- {
- KeywordExt* keyword = ptr->first();
- if (keyword == curr)
- break;
- compute_hash (keyword);
- }
+ iterations++;
+ if (!has_collision)
+ break;
- if (option[DEBUG])
- {
- fprintf (stderr, "** collision not resolved after %d iterations of asso_value[",
- iterations);
- for (union_index = 0; union_index < union_set_length; union_index++)
+ /* Try other asso_values[]. */
+ if (_jump != 0)
+ {
+ /* The way we try various values for
+ asso_values[step->_changing[0],...step->_changing[k-1]]
+ is like this:
+ for (bound = 0,1,...)
+ for (ii = 0,...,k-1)
+ iter[ii] := bound
+ iter[0..ii-1] := values <= bound
+ iter[ii+1..k-1] := values < bound
+ and
+ asso_values[step->_changing[i]] =
+ _initial_asso_value + iter[i] * _jump.
+ This makes it more likely to find small asso_values[].
+ */
+ unsigned int bound = iter[ii];
+ unsigned int i = 0;
+ while (i < ii)
+ {
+ unsigned int c = step->_changing[i];
+ iter[i]++;
+ _asso_values[c] =
+ (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
+ if (iter[i] <= bound)
+ goto found_next;
+ _asso_values[c] =
+ (_asso_values[c] - iter[i] * _jump)
+ & (step->_asso_value_max - 1);
+ iter[i] = 0;
+ i++;
+ }
+ i = ii + 1;
+ while (i < k)
+ {
+ unsigned int c = step->_changing[i];
+ iter[i]++;
+ _asso_values[c] =
+ (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
+ if (iter[i] < bound)
+ goto found_next;
+ _asso_values[c] =
+ (_asso_values[c] - iter[i] * _jump)
+ & (step->_asso_value_max - 1);
+ iter[i] = 0;
+ i++;
+ }
+ /* Switch from one ii to the next. */
{
- if (union_index > 0)
- fprintf (stderr, ",");
- fprintf(stderr, "'%c'", union_set[union_index]);
+ unsigned int c = step->_changing[ii];
+ _asso_values[c] =
+ (_asso_values[c] - bound * _jump)
+ & (step->_asso_value_max - 1);
+ iter[ii] = 0;
}
- fprintf (stderr, "], backtracking...\n");
- fflush (stderr);
- }
- }
- else
- {
- /* Nothing to do, just recurse. */
- goto RECURSE_NO_COLLISION;
- BACKTRACK_NO_COLLISION: ;
- }
+ /* Here all iter[i] == 0. */
+ ii++;
+ if (ii == k)
+ {
+ ii = 0;
+ bound++;
+ if (bound == step->_asso_value_max)
+ {
+ /* Out of search space! We can either backtrack, or
+ increase the available search space of this step.
+ It seems simpler to choose the latter solution. */
+ step->_asso_value_max = 2 * step->_asso_value_max;
+ if (step->_asso_value_max > _asso_value_max)
+ {
+ _asso_value_max = step->_asso_value_max;
+ /* Reinitialize _max_hash_value. */
+ _max_hash_value =
+ (option[NOLENGTH] ? 0 : max_key_length ())
+ + (_asso_value_max - 1) * get_max_keysig_size ();
+ /* Reinitialize _collision_detector. */
+ delete _collision_detector;
+ _collision_detector =
+ new Bool_Array (_max_hash_value + 1);
+ }
+ }
+ }
+ {
+ unsigned int c = step->_changing[ii];
+ iter[ii] = bound;
+ _asso_values[c] =
+ (_asso_values[c] + bound * _jump)
+ & (step->_asso_value_max - 1);
+ }
+ found_next: ;
+ }
+ else
+ {
+ /* Random. */
+ unsigned int c = step->_changing[ii];
+ _asso_values[c] =
+ (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
+ /* Next time, change the next c. */
+ ii++;
+ if (ii == k)
+ ii = 0;
+ }
+ }
+ if (option[DEBUG])
+ {
+ fprintf (stderr, "Step %u chose _asso_values[", stepno);
+ for (unsigned int i = 0; i < step->_changing_count; i++)
+ {
+ if (i > 0)
+ fprintf (stderr, ",");
+ fprintf (stderr, "'%c'", step->_changing[i]);
+ }
+ fprintf (stderr, "] in %u iterations.\n", iterations);
+ }
+ }
- /* ==== FAIL ==== */
- if (sp != stack)
- {
- sp--;
- /* ==== REDO ==== */
- curr = sp->_curr;
- prior = sp->_prior;
- if (prior == NULL)
- goto BACKTRACK_NO_COLLISION;
- union_set = sp->_union_set;
- union_set_length = sp->_union_set_length;
- union_index = sp->_union_index;
- c = sp->_c;
- original_asso_value = sp->_original_asso_value;
- iter = sp->_iter;
- goto BACKTRACK_COLLISION;
- }
+ /* Free allocated memory. */
+ while (steps != NULL)
+ {
+ Step *step = steps;
+ steps = step->_next;
+ delete[] step->_changing;
+ delete[] step->_undetermined;
+ delete_partition (step->_partition);
+ delete step;
+ }
+}
- /* No solution found after an exhaustive search!
- We should ideally turn off option[FAST] and, if that doesn't help,
- multiply _asso_value_max by 2. */
- fprintf (stderr,
- "\nBig failure, always got duplicate hash code values.\n");
- if (option[POSITIONS])
- fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
- else
- fprintf (stderr, "try options -m or -r.\n\n");
- exit (1);
-
- RECURSE_COLLISION:
- /*sp->_union_set = union_set;*/ // redundant
- sp->_union_set_length = union_set_length;
- sp->_union_index = union_index;
- sp->_c = c;
- sp->_original_asso_value = original_asso_value;
- sp->_iter = iter;
- RECURSE_NO_COLLISION:
- /*sp->_curr = curr;*/ // redundant
- sp->_prior = prior;
- /* ==== RETURN ==== */
- sp++;
- if (sp - stack < _list_len)
- goto CALL;
- }
+/* Computes a keyword's hash value, relative to the current _asso_values[],
+ and stores it in keyword->_hash_value. */
- /* Deallocate stack. */
- {
- for (int i = 0; i < _list_len; i++)
- delete[] stack[i]._union_set;
- }
- delete[] stack;
+inline int
+Search::compute_hash (KeywordExt *keyword) const
+{
+ int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
+
+ const unsigned int *p = keyword->_selchars;
+ int i = keyword->_selchars_length;
+ for (; i > 0; p++, i--)
+ sum += _asso_values[*p];
+
+ return keyword->_hash_value = sum;
}
/* Finds good _asso_values[]. */
@@ -1390,8 +1326,6 @@ void
Search::find_good_asso_values ()
{
prepare ();
- if (option[ORDER])
- reorder ();
prepare_asso_values ();
/* Search for good _asso_values[]. */
@@ -1529,7 +1463,6 @@ Search::optimize ()
Search::~Search ()
{
delete _collision_detector;
- delete[] _determined;
if (option[DEBUG])
{
fprintf (stderr, "\ndumping occurrence and associated values tables\n");
diff --git a/src/search.h b/src/search.h
index 2ff3f78..ada14d2 100644
--- a/src/search.h
+++ b/src/search.h
@@ -30,6 +30,8 @@
#include "positions.h"
#include "bool-array.h"
+struct EquivalenceClass;
+
class Search
{
public:
@@ -62,16 +64,6 @@ private:
void prepare ();
- /* Computes the sum of occurrences of the _selchars of a keyword. */
- int compute_occurrence (KeywordExt *ptr) const;
-
- /* Auxiliary functions used by Search::reorder(). */
- void clear_determined ();
- void set_determined (KeywordExt *keyword);
- bool already_determined (KeywordExt *keyword) const;
- /* Reorders the keyword list so as to minimize search times. */
- void reorder ();
-
/* Returns the length of keyword list. */
int keyword_list_length () const;
@@ -83,30 +75,20 @@ private:
/* Initializes the asso_values[] related parameters. */
void prepare_asso_values ();
- /* Puts a first guess into asso_values[]. */
- void init_asso_values ();
-
- /* Computes a keyword's hash value, relative to the current _asso_values[],
- and stores it in keyword->_hash_value. */
- int compute_hash (KeywordExt *keyword) const;
- /* Computes the frequency of occurrence of a character among the keywords
- up to the given keyword. */
- unsigned int compute_occurrence (unsigned int c, KeywordExt *curr) const;
+ EquivalenceClass * compute_partition (bool *undetermined) const;
- /* Sorts the given set in increasing frequency of _occurrences[]. */
- void sort_by_occurrence (unsigned int *set, unsigned int len) const;
- /* Sorts the given set in increasing frequency of occurrences among the
- keywords up to the given keyword. */
- void sort_by_occurrence (unsigned int *set, unsigned int len, KeywordExt *curr) const;
+ unsigned int count_possible_collisions (EquivalenceClass *partition, unsigned int c) const;
- bool has_collisions (KeywordExt *curr);
-
- KeywordExt * collision_prior_to (KeywordExt *curr);
+ bool unchanged_partition (EquivalenceClass *partition, unsigned int c) const;
/* Finds some _asso_values[] that fit. */
void find_asso_values ();
+ /* Computes a keyword's hash value, relative to the current _asso_values[],
+ and stores it in keyword->_hash_value. */
+ int compute_hash (KeywordExt *keyword) const;
+
/* Finds good _asso_values[]. */
void find_good_asso_values ();
@@ -152,9 +134,6 @@ private:
/* Length of _head list. Number of keywords, not counting duplicates. */
int _list_len;
- /* Vector used during Search::reorder(). */
- bool * _determined;
-
/* Exclusive upper bound for every _asso_values[c]. A power of 2. */
unsigned int _asso_value_max;