summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruno Haible <bruno@clisp.org>2003-02-19 12:32:06 +0000
committerBruno Haible <bruno@clisp.org>2003-02-19 12:32:06 +0000
commit1d73fbe0191e1249bf4f61149b4f4352e53438cb (patch)
tree5b9db61d0186d39dc656105ef4b41e71dc60f7d7
parent362493686d5d58634f098d9c74c0427e830a2475 (diff)
downloadgperf-1d73fbe0191e1249bf4f61149b4f4352e53438cb.tar.gz
Comments and code restructuring.
-rw-r--r--ChangeLog11
-rw-r--r--src/search.cc112
-rw-r--r--src/search.h5
3 files changed, 95 insertions, 33 deletions
diff --git a/ChangeLog b/ChangeLog
index ad7cda6..cc32f6a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2002-11-19 Bruno Haible <bruno@clisp.org>
+
+ * src/search.h (Search::find_good_asso_values): New declaration.
+ * src/search.cc: Add comments about the basic structure of the
+ algorithm.
+ (Search::find_positions): Move the option[POSITIONS] test to here.
+ (Search::find_good_asso_values): New method, extracted from
+ Search::optimize.
+ (Search::optimize): Remove option[POSITIONS] test. Call
+ find_good_asso_values.
+
2002-11-17 Bruno Haible <bruno@clisp.org>
* src/options.cc (Options::parse_options): Include copyright notice
diff --git a/src/search.cc b/src/search.cc
index 7fee88d..826c43f 100644
--- a/src/search.cc
+++ b/src/search.cc
@@ -34,10 +34,12 @@
/* The most general form of the hash function is
hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
+ + len (keyword)
where Pos is a set of byte positions,
each alpha_inc[i] is a nonnegative integer,
- each asso_values[c] is a nonnegative integer.
+ each asso_values[c] is a nonnegative integer,
+ len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise.
Theorem 1: If all keywords are different, there is a set Pos such that
all tuples (keyword[i] : i in Pos) are different.
@@ -63,9 +65,23 @@
Step 3 (Finding good asso_values):
Find asso_values[c] such that all hash (keyword) are different.
+
+ In other words, each step finds a projection that is injective on the
+ given finite set:
+ proj1 : String --> Map (Pos --> N)
+ proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
+ proj3 : Map (Pos --> N) / S(Pos) --> N
+ where N denotes the set of nonnegative integers, and S(Pos) is the
+ symmetric group over Pos.
+
+ This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight
+ modifications apply:
+ proj1 : String --> Map (Pos --> N) x N
+ proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
+ proj3 : Map (Pos --> N) / S(Pos) x N --> N
*/
-/* -------------------- Initialization and Preparation --------------------- */
+/* ==================== Initialization and Preparation ===================== */
Search::Search (KeywordExt_List *list)
: _head (list)
@@ -106,7 +122,7 @@ Search::preprepare ()
}
}
-/* ---------------------- Finding good byte positions ---------------------- */
+/* ====================== Finding good byte positions ====================== */
/* Initializes each keyword's _selchars array. */
void
@@ -124,10 +140,16 @@ Search::delete_selchars () const
temp->first()->delete_selchars();
}
-/* Count the duplicate keywords that occur with a given set of positions. */
+/* Count the duplicate keywords that occur with a given set of positions.
+ In other words, it returns the difference
+ # K - # proj1 (K)
+ where K is the multiset of given keywords. */
unsigned int
Search::count_duplicates_tuple (const Positions& positions) const
{
+ /* Run through the keyword list and count the duplicates incrementally.
+ The result does not depend on the order of the keyword list, thanks to
+ the formula above. */
init_selchars_tuple (option[ALLCHARS], positions);
unsigned int count = 0;
@@ -151,6 +173,13 @@ Search::count_duplicates_tuple (const Positions& positions) const
void
Search::find_positions ()
{
+ /* If the user gave the key positions, we use them. */
+ if (option[POSITIONS])
+ {
+ _key_positions = option.get_key_positions();
+ return;
+ }
+
/* 1. Find positions that must occur in order to distinguish duplicates. */
Positions mandatory;
@@ -302,7 +331,7 @@ Search::find_positions ()
_key_positions = current;
}
-/* --------------------- Finding good alpha increments --------------------- */
+/* ===================== Finding good alpha increments ===================== */
/* Initializes each keyword's _selchars array. */
void
@@ -313,10 +342,16 @@ Search::init_selchars_multiset (bool use_all_chars, const Positions& positions,
}
/* Count the duplicate keywords that occur with the given set of positions
- and a given alpha_inc[] array. */
+ and a given alpha_inc[] array.
+ In other words, it returns the difference
+ # K - # proj2 (proj1 (K))
+ where K is the multiset of given keywords. */
unsigned int
Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
{
+ /* Run through the keyword list and count the duplicates incrementally.
+ The result does not depend on the order of the keyword list, thanks to
+ the formula above. */
init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc);
unsigned int count = 0;
@@ -341,7 +376,8 @@ void
Search::find_alpha_inc ()
{
/* The goal is to choose _alpha_inc[] such that it doesn't introduce
- artificial duplicates. */
+ artificial duplicates.
+ In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */
unsigned int duplicates_goal = count_duplicates_tuple (_key_positions);
/* Start with zero increments. This is sufficient in most cases. */
@@ -428,7 +464,7 @@ Search::find_alpha_inc ()
_alpha_inc = current;
}
-/* ------------------------------------------------------------------------- */
+/* ======================= Finding good asso_values ======================== */
void
Search::prepare ()
@@ -988,7 +1024,7 @@ Search::change_some_asso_value (KeywordExt *prior, KeywordExt *curr)
}
}
-/* Finds good _asso_values[]. */
+/* Finds some _asso_values[] that fit. */
void
Search::find_asso_values ()
@@ -1026,32 +1062,11 @@ Search::find_asso_values ()
}
}
-/* ------------------------------------------------------------------------- */
-
-/* Comparison function for sorting by increasing _hash_value. */
-static bool
-less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
-{
- return keyword1->_hash_value < keyword2->_hash_value;
-}
-
-/* Sorts the keyword list by hash value. */
-
-void
-Search::sort ()
-{
- _head = mergesort_list (_head, less_by_hash_value);
-}
+/* Finds good _asso_values[]. */
void
-Search::optimize ()
+Search::find_good_asso_values ()
{
- /* Preparations. */
- preprepare ();
- _key_positions = option.get_key_positions();
- if (!option[POSITIONS])
- find_positions ();
- find_alpha_inc ();
prepare ();
if (option[ORDER])
reorder ();
@@ -1129,6 +1144,39 @@ Search::optimize ()
delete[] best_asso_values;
/* The keywords' _hash_value fields are recomputed below. */
}
+}
+
+/* ========================================================================= */
+
+/* Comparison function for sorting by increasing _hash_value. */
+static bool
+less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
+{
+ return keyword1->_hash_value < keyword2->_hash_value;
+}
+
+/* Sorts the keyword list by hash value. */
+
+void
+Search::sort ()
+{
+ _head = mergesort_list (_head, less_by_hash_value);
+}
+
+void
+Search::optimize ()
+{
+ /* Preparations. */
+ preprepare ();
+
+ /* Step 1: Finding good byte positions. */
+ find_positions ();
+
+ /* Step 2: Finding good alpha increments. */
+ find_alpha_inc ();
+
+ /* Step 3: Finding good asso_values. */
+ find_good_asso_values ();
/* Make one final check, just to make sure nothing weird happened.... */
_collision_detector->clear ();
diff --git a/src/search.h b/src/search.h
index d6d4ce3..f120c3f 100644
--- a/src/search.h
+++ b/src/search.h
@@ -100,9 +100,12 @@ private:
collision between the two given keywords. */
void change_some_asso_value (KeywordExt *prior, KeywordExt *curr);
- /* Finds good _asso_values[]. */
+ /* Finds some _asso_values[] that fit. */
void find_asso_values ();
+ /* Finds good _asso_values[]. */
+ void find_good_asso_values ();
+
/* Sorts the keyword list by hash value. */
void sort ();