diff options
-rw-r--r-- | ChangeLog | 10 | ||||
-rw-r--r-- | src/search.cc | 85 | ||||
-rw-r--r-- | src/search.h | 7 |
3 files changed, 71 insertions, 31 deletions
@@ -1,5 +1,15 @@ 2002-12-12 Bruno Haible <bruno@clisp.org> + * src/search.h (Search::init_selchars_tuple, + Search::count_duplicates_tuple): Add alpha_unify argument. + (Search::count_duplicates_tuple): New method declaration. + * src/search.cc (Search::init_selchars_tuple, + Search::count_duplicates_tuple): Add alpha_unify argument. + (Search::find_positions): Update. + (Search::count_duplicates_tuple): New method. + (Search::count_duplicates_multiset): Free temp alpha_unify vector. + (Search::find_alpha_inc): Call count_duplicates_tuple. + * src/configure.in: Add test for stack-allocated variable-size arrays. * src/config.h.in: Regenerated. * src/search.cc: Include config.h. diff --git a/src/search.cc b/src/search.cc index a1e3d70..6eec1bb 100644 --- a/src/search.cc +++ b/src/search.cc @@ -58,7 +58,7 @@ /* ================================ Theory ================================= */ -/* The most general form of the hash function is +/* The general form of the hash function is hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos) + len (keyword) @@ -75,6 +75,8 @@ are nonnegative integers alpha_inc[i] such that all multisets {keyword[i] + alpha_inc[i] : i in Pos} are different. + Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}. + Theorem 3: If all multisets selchars[keyword] are different, there are nonnegative integers asso_values[c] such that all hash values sum (asso_values[c] : c in selchars[keyword]) are different. @@ -98,14 +100,25 @@ proj1 : String --> Map (Pos --> N) proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos) proj3 : Map (Pos --> N) / S(Pos) --> N - where N denotes the set of nonnegative integers, and S(Pos) is the - symmetric group over Pos. + where + N denotes the set of nonnegative integers, + Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and + S(Pos) is the symmetric group over Pos. This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight modifications apply: proj1 : String --> Map (Pos --> N) x N proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N proj3 : Map (Pos --> N) / S(Pos) x N --> N + + For a case-insensitive hash function, the general form is + + hash (keyword) = + sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos) + + len (keyword) + + where alpha_unify[c] is chosen so that an upper/lower case change in + keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]]. */ /* ==================== Initialization and Preparation ===================== */ @@ -118,17 +131,15 @@ Search::Search (KeywordExt_List *list) void Search::prepare () { - KeywordExt_List *temp; - /* Compute the total number of keywords. */ _total_keys = 0; - for (temp = _head; temp; temp = temp->rest()) + for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) _total_keys++; /* Compute the minimum and maximum keyword length. */ _max_key_len = INT_MIN; _min_key_len = INT_MAX; - for (temp = _head; temp; temp = temp->rest()) + for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) { KeywordExt *keyword = temp->first(); @@ -142,16 +153,16 @@ Search::prepare () expressions don't work correctly for looking up an empty string. */ if (_min_key_len == 0) { - fprintf (stderr, "Empty input key is not allowed.\n" - "To recognize an empty input key, your code should check for\n" - "len == 0 before calling the gperf generated lookup function.\n"); + fprintf (stderr, "Empty input keyword is not allowed.\n" + "To recognize an empty input keyword, your code should check for\n" + "len == 0 before calling the gperf generated lookup function.\n"); exit (1); } /* Exit program if the characters in the keywords are not in the required range. */ if (option[SEVENBIT]) - for (temp = _head; temp; temp = temp->rest()) + for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) { KeywordExt *keyword = temp->first(); @@ -185,6 +196,7 @@ Search::compute_alpha_unify () const { if (option[UPPERLOWER]) { + /* Uppercase to lowercase mapping. */ unsigned int alpha_size = compute_alpha_size(); unsigned int *alpha_unify = new unsigned int[alpha_size]; for (unsigned int c = 0; c < alpha_size; c++) @@ -194,15 +206,16 @@ Search::compute_alpha_unify () const return alpha_unify; } else + /* Identity mapping. */ return NULL; } /* Initializes each keyword's _selchars array. */ void -Search::init_selchars_tuple (const Positions& positions) const +Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const { for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) - temp->first()->init_selchars_tuple(positions, _alpha_unify); + temp->first()->init_selchars_tuple(positions, alpha_unify); } /* Deletes each keyword's _selchars array. */ @@ -218,12 +231,12 @@ Search::delete_selchars () const # K - # proj1 (K) where K is the multiset of given keywords. */ unsigned int -Search::count_duplicates_tuple (const Positions& positions) const +Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const { /* Run through the keyword list and count the duplicates incrementally. The result does not depend on the order of the keyword list, thanks to the formula above. */ - init_selchars_tuple (positions); + init_selchars_tuple (positions, alpha_unify); unsigned int count = 0; { @@ -253,8 +266,8 @@ Search::find_positions () return; } - /* Compute preliminary value for _alpha_unify. */ - _alpha_unify = compute_alpha_unify (); + /* Compute preliminary alpha_unify table. */ + unsigned int *alpha_unify = compute_alpha_unify (); /* 1. Find positions that must occur in order to distinguish duplicates. */ Positions mandatory; @@ -322,7 +335,8 @@ Search::find_positions () int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1 ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1); Positions current = mandatory; - unsigned int current_duplicates_count = count_duplicates_tuple (current); + unsigned int current_duplicates_count = + count_duplicates_tuple (current, alpha_unify); for (;;) { Positions best; @@ -333,7 +347,8 @@ Search::find_positions () { Positions tryal = current; tryal.add (i); - unsigned int try_duplicates_count = count_duplicates_tuple (tryal); + unsigned int try_duplicates_count = + count_duplicates_tuple (tryal, alpha_unify); /* We prefer 'try' to 'best' if it produces less duplicates, or if it produces the same number of duplicates but with @@ -366,7 +381,8 @@ Search::find_positions () { Positions tryal = current; tryal.remove (i); - unsigned int try_duplicates_count = count_duplicates_tuple (tryal); + unsigned int try_duplicates_count = + count_duplicates_tuple (tryal, alpha_unify); /* We prefer 'try' to 'best' if it produces less duplicates, or if it produces the same number of duplicates but with @@ -406,7 +422,7 @@ Search::find_positions () tryal.remove (i2); tryal.add (i3); unsigned int try_duplicates_count = - count_duplicates_tuple (tryal); + count_duplicates_tuple (tryal, alpha_unify); /* We prefer 'try' to 'best' if it produces less duplicates, or if it produces the same number of duplicates but with @@ -459,8 +475,21 @@ Search::find_positions () fprintf (stderr, "\n"); } - /* Free preliminary value for _alpha_unify. */ - delete[] _alpha_unify; + /* Free preliminary alpha_unify table. */ + delete[] alpha_unify; +} + +/* Count the duplicate keywords that occur with the found set of positions. + In other words, it returns the difference + # K - # proj1 (K) + where K is the multiset of given keywords. */ +unsigned int +Search::count_duplicates_tuple () const +{ + unsigned int *alpha_unify = compute_alpha_unify (); + unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify); + delete[] alpha_unify; + return count; } /* ===================== Finding good alpha increments ===================== */ @@ -558,9 +587,8 @@ Search::count_duplicates_multiset (const unsigned int *alpha_inc) const /* Run through the keyword list and count the duplicates incrementally. The result does not depend on the order of the keyword list, thanks to the formula above. */ - init_selchars_multiset (_key_positions, - compute_alpha_unify (_key_positions, alpha_inc), - alpha_inc); + unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc); + init_selchars_multiset (_key_positions, alpha_unify, alpha_inc); unsigned int count = 0; { @@ -574,6 +602,7 @@ Search::count_duplicates_multiset (const unsigned int *alpha_inc) const } delete_selchars (); + delete[] alpha_unify; return count; } @@ -586,9 +615,7 @@ Search::find_alpha_inc () /* The goal is to choose _alpha_inc[] such that it doesn't introduce artificial duplicates. In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */ - _alpha_unify = compute_alpha_unify (); - unsigned int duplicates_goal = count_duplicates_tuple (_key_positions); - delete[] _alpha_unify; + unsigned int duplicates_goal = count_duplicates_tuple (); /* Start with zero increments. This is sufficient in most cases. */ unsigned int *current = new unsigned int [_max_key_len]; diff --git a/src/search.h b/src/search.h index 17c874c..c098f50 100644 --- a/src/search.h +++ b/src/search.h @@ -50,16 +50,19 @@ private: unsigned int * compute_alpha_unify () const; /* Initializes each keyword's _selchars array. */ - void init_selchars_tuple (const Positions& positions) const; + void init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const; /* Deletes each keyword's _selchars array. */ void delete_selchars () const; /* Count the duplicate keywords that occur with a given set of positions. */ - unsigned int count_duplicates_tuple (const Positions& positions) const; + unsigned int count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const; /* Find good key positions. */ void find_positions (); + /* Count the duplicate keywords that occur with the found set of positions. */ + unsigned int count_duplicates_tuple () const; + /* Computes the upper bound on the indices passed to asso_values[]. */ unsigned int compute_alpha_size (const unsigned int *alpha_inc) const; |