diff options
author | Bruno Haible <bruno@clisp.org> | 2003-02-11 11:09:27 +0000 |
---|---|---|
committer | Bruno Haible <bruno@clisp.org> | 2003-02-11 11:09:27 +0000 |
commit | 810fef43aebd9cd1d58d9a6a412c49835d3e8471 (patch) | |
tree | e00cbf3d2754cfbdfa69af299f91b21fe7e8e326 /src | |
parent | 6202aaadb1a2904f456c2ee55623bf4a1a951ad7 (diff) | |
download | gperf-810fef43aebd9cd1d58d9a6a412c49835d3e8471.tar.gz |
When the option -k is not given, the default key positions are now computed
depending on the set of keywords.
Diffstat (limited to 'src')
-rw-r--r-- | src/Makefile.in | 4 | ||||
-rw-r--r-- | src/hash-table.cc | 15 | ||||
-rw-r--r-- | src/keyword.cc | 27 | ||||
-rw-r--r-- | src/keyword.h | 9 | ||||
-rw-r--r-- | src/main.cc | 3 | ||||
-rw-r--r-- | src/options.cc | 142 | ||||
-rw-r--r-- | src/options.h | 65 | ||||
-rw-r--r-- | src/options.icc | 39 | ||||
-rw-r--r-- | src/output.cc | 75 | ||||
-rw-r--r-- | src/output.h | 10 | ||||
-rw-r--r-- | src/search.cc | 230 | ||||
-rw-r--r-- | src/search.h | 24 |
12 files changed, 522 insertions, 121 deletions
diff --git a/src/Makefile.in b/src/Makefile.in index 1aecdcf..d959696 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -91,8 +91,8 @@ KEYWORD_LIST_H = keyword-list.h keyword-list.icc $(KEYWORD_H) INPUT_H = input.h $(KEYWORD_LIST_H) BOOL_ARRAY_H = bool-array.h bool-array.icc $(OPTIONS_H) HASH_TABLE_H = hash-table.h $(KEYWORD_H) -SEARCH_H = search.h $(KEYWORD_LIST_H) $(BOOL_ARRAY_H) -OUTPUT_H = output.h $(KEYWORD_LIST_H) +SEARCH_H = search.h $(KEYWORD_LIST_H) $(OPTIONS_H) $(BOOL_ARRAY_H) +OUTPUT_H = output.h $(KEYWORD_LIST_H) $(OPTIONS_H) version.o : version.cc $(VERSION_H) $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $(srcdir)/version.cc diff --git a/src/hash-table.cc b/src/hash-table.cc index a4d2a1b..6928eb6 100644 --- a/src/hash-table.cc +++ b/src/hash-table.cc @@ -89,16 +89,11 @@ Hash_Table::~Hash_Table () { int field_width; - if (option[ALLCHARS]) - { - field_width = 0; - for (int i = _size - 1; i >= 0; i--) - if (_table[i]) - if (field_width < _table[i]->_selchars_length) - field_width = _table[i]->_selchars_length; - } - else - field_width = option.get_max_keysig_size (); + field_width = 0; + for (int i = _size - 1; i >= 0; i--) + if (_table[i]) + if (field_width < _table[i]->_selchars_length) + field_width = _table[i]->_selchars_length; fprintf (stderr, "\ndumping the hash table\n" diff --git a/src/keyword.cc b/src/keyword.cc index 6250165..73608d0 100644 --- a/src/keyword.cc +++ b/src/keyword.cc @@ -47,7 +47,7 @@ static inline void sort_char_set (unsigned char *base, int len) } } -/* Initialize selchars and selchars_length. +/* Initializes selchars and selchars_length. The hash function will be computed as asso_values[allchars[key_pos[0]]] + asso_values[allchars[key_pos[1]]] + ... We compute selchars as the multiset @@ -57,14 +57,15 @@ static inline void sort_char_set (unsigned char *base, int len) Furthermore we sort the selchars array, to ease detection of duplicates later. */ -void KeywordExt::init_selchars () +void +KeywordExt::init_selchars (bool use_all_chars, const Positions& positions) { const char *k = _allchars; unsigned char *key_set = - new unsigned char[(option[ALLCHARS] ? _allchars_length : option.get_max_keysig_size ())]; + new unsigned char[(use_all_chars ? _allchars_length : positions.get_size ())]; unsigned char *ptr = key_set; - if (option[ALLCHARS]) + if (use_all_chars) /* Use all the character positions in the KEY. */ for (int i = _allchars_length; i > 0; k++, i--) { @@ -76,7 +77,7 @@ void KeywordExt::init_selchars () { /* Iterate through the list of key_positions, initializing selchars (via ptr). */ - PositionIterator iter (option.get_key_positions ()); + PositionIterator iter (positions); for (int i; (i = iter.next ()) != PositionIterator::EOS; ) { @@ -91,15 +92,6 @@ void KeywordExt::init_selchars () continue; ptr++; } - - /* Didn't get any hits and user doesn't want to consider the - keylength, so there are essentially no usable hash positions! */ - if (ptr == key_set && option[NOLENGTH]) - { - fprintf (stderr, "Can't hash keyword %.*s with chosen key positions.\n", - _allchars_length, _allchars); - exit (1); - } } /* Sort the KEY_SET items alphabetically. */ @@ -109,6 +101,13 @@ void KeywordExt::init_selchars () _selchars_length = ptr - key_set; } +/* Deletes selchars. */ +void +KeywordExt::delete_selchars () +{ + delete[] _selchars; +} + /* ------------------------- Keyword_Factory class ------------------------- */ diff --git a/src/keyword.h b/src/keyword.h index 79a71df..b2f0a75 100644 --- a/src/keyword.h +++ b/src/keyword.h @@ -26,6 +26,9 @@ #ifndef keyword_h #define keyword_h 1 +/* Class defined in "options.h". */ +class Positions; + /* An instance of this class is a keyword, as specified in the input file. */ struct Keyword @@ -64,8 +67,10 @@ struct KeywordExt : public Keyword KeywordExt * _duplicate_link; /* Methods depending on the keyposition list. */ - /* Initialize selchars and selchars_length. */ - void init_selchars (); + /* Initializes selchars and selchars_length. */ + void init_selchars (bool use_all_chars, const Positions& positions); + /* Deletes selchars. */ + void delete_selchars (); /* Data members used by the algorithm. */ int _occurrence; /* Frequency of key set occurrences. */ diff --git a/src/main.cc b/src/main.cc index 7552b3c..8509cc0 100644 --- a/src/main.cc +++ b/src/main.cc @@ -103,9 +103,10 @@ main (int argc, char *argv[]) inputter._verbatim_code_end, inputter._verbatim_code_lineno, searcher._total_keys, - searcher._total_duplicates, searcher._max_key_len, searcher._min_key_len, + searcher._key_positions, + searcher._total_duplicates, searcher._alpha_size, searcher._occurrences, searcher._asso_values); diff --git a/src/options.cc b/src/options.cc index 7bbe27e..885504e 100644 --- a/src/options.cc +++ b/src/options.cc @@ -442,7 +442,7 @@ Options::Options () _hash_name (DEFAULT_HASH_NAME), _wordlist_name (DEFAULT_WORDLIST_NAME), _delimiters (DEFAULT_DELIMITERS), - _key_positions (1, Positions::LASTCHAR) + _key_positions () { } @@ -766,6 +766,7 @@ Options::parse_options (int argc, char *argv[]) } case 'k': /* Sets key positions used for hash function. */ { + _option_word |= POSITIONS; const int BAD_VALUE = -2; const int EOS = PositionIterator::EOS; int value; @@ -782,7 +783,7 @@ Options::parse_options (int argc, char *argv[]) { if (value == BAD_VALUE) { - fprintf (stderr, "Invalid key value or range, use 1,2,3-%d,'$' or '*'.\n", + fprintf (stderr, "Invalid position value or range, use 1,2,3-%d,'$' or '*'.\n", Positions::MAX_KEY_POS); short_usage (stderr); exit (1); @@ -793,7 +794,7 @@ Options::parse_options (int argc, char *argv[]) Since all key positions are in the range 1..Positions::MAX_KEY_POS or == Positions::LASTCHAR, there must be duplicates. */ - fprintf (stderr, "Duplicate keys selected\n"); + fprintf (stderr, "Duplicate key positions selected\n"); short_usage (stderr); exit (1); } @@ -803,7 +804,7 @@ Options::parse_options (int argc, char *argv[]) unsigned int total_keysig_size = key_pos - key_positions; if (total_keysig_size == 0) { - fprintf (stderr, "No keys selected.\n"); + fprintf (stderr, "No key positions selected.\n"); short_usage (stderr); exit (1); } @@ -814,7 +815,7 @@ Options::parse_options (int argc, char *argv[]) when generating code. */ if (! _key_positions.sort()) { - fprintf (stderr, "Duplicate keys selected\n"); + fprintf (stderr, "Duplicate key positions selected\n"); short_usage (stderr); exit (1); } @@ -939,6 +940,137 @@ Options::parse_options (int argc, char *argv[]) } } +/* ---------------------------- Class Positions ---------------------------- */ + +/* Set operations. Assumes the array is in reverse order. */ + +bool +Positions::contains (int pos) const +{ + unsigned int count = _size; + const unsigned char *p = _positions + _size - 1; + + for (; count > 0; p--, count--) + { + if (*p == pos) + return true; + if (*p > pos) + break; + } + return false; +} + +void +Positions::add (int pos) +{ + unsigned int count = _size; + + if (count == MAX_KEY_POS + 1) + { + fprintf (stderr, "Positions::add internal error: overflow\n"); + exit (1); + } + + unsigned char *p = _positions + _size - 1; + + for (; count > 0; p--, count--) + { + if (*p == pos) + { + fprintf (stderr, "Positions::add internal error: duplicate\n"); + exit (1); + } + if (*p > pos) + break; + p[1] = p[0]; + } + p[1] = pos; + _size++; +} + +void +Positions::remove (int pos) +{ + unsigned int count = _size; + if (count > 0) + { + unsigned char *p = _positions + _size - 1; + + if (*p == pos) + { + _size--; + return; + } + if (*p < pos) + { + unsigned char prev = *p; + + for (;;) + { + p--; + count--; + if (count == 0) + break; + if (*p == pos) + { + *p = prev; + _size--; + return; + } + if (*p > pos) + break; + unsigned char curr = *p; + *p = prev; + prev = curr; + } + } + } + fprintf (stderr, "Positions::remove internal error: not found\n"); + exit (1); +} + +/* Output in external syntax. */ +void +Positions::print () const +{ + bool first = true; + bool seen_LASTCHAR = false; + unsigned int count = _size; + const unsigned char *p = _positions + _size - 1; + + for (; count > 0; p--, count--) + { + if (*p == LASTCHAR) + seen_LASTCHAR = true; + else + { + if (!first) + printf (","); + printf ("%d", *p); + if (count > 0 && p[-1] == *p + 1) + { + printf ("-"); + do + { + p--; + count--; + } + while (count > 0 && p[-1] == *p + 1); + printf ("%d", *p); + } + first = false; + } + } + if (seen_LASTCHAR) + { + if (!first) + printf (","); + printf ("$"); + } +} + +/* ------------------------------------------------------------------------- */ + #ifndef __OPTIMIZE__ #define INLINE /* not inline */ diff --git a/src/options.h b/src/options.h index edb1fca..ec32c97 100644 --- a/src/options.h +++ b/src/options.h @@ -41,63 +41,66 @@ enum Option_Type /* Apply ordering heuristic to speed-up search time. */ ORDER = 1 << 1, + /* Use the given key positions. */ + POSITIONS = 1 << 2, + /* Use all characters in hash function. */ - ALLCHARS = 1 << 2, + ALLCHARS = 1 << 3, /* Handle user-defined type structured keyword input. */ - TYPE = 1 << 3, + TYPE = 1 << 4, /* Randomly initialize the associated values table. */ - RANDOM = 1 << 4, + RANDOM = 1 << 5, /* Generate switch output to save space. */ - SWITCH = 1 << 5, + SWITCH = 1 << 6, /* Don't include keyword length in hash computations. */ - NOLENGTH = 1 << 6, + NOLENGTH = 1 << 7, /* Generate a length table for string comparison. */ - LENTABLE = 1 << 7, + LENTABLE = 1 << 8, /* Handle duplicate hash values for keywords. */ - DUP = 1 << 8, + DUP = 1 << 9, /* Generate the hash function "fast". */ - FAST = 1 << 9, + FAST = 1 << 10, /* Don't include user-defined type definition in output -- it's already defined elsewhere. */ - NOTYPE = 1 << 10, + NOTYPE = 1 << 11, /* Generate strncmp rather than strcmp. */ - COMP = 1 << 11, + COMP = 1 << 12, /* Make the keyword table a global variable. */ - GLOBAL = 1 << 12, + GLOBAL = 1 << 13, /* Make the generated tables readonly (const). */ - CONST = 1 << 13, + CONST = 1 << 14, /* Generate K&R C code: no prototypes, no const. */ - KRC = 1 << 14, + KRC = 1 << 15, /* Generate C code: no prototypes, but const (user can #define it away). */ - C = 1 << 15, + C = 1 << 16, /* Generate ISO/ANSI C code: prototypes and const, but no class. */ - ANSIC = 1 << 16, + ANSIC = 1 << 17, /* Generate C++ code: prototypes, const, class, inline, enum. */ - CPLUSPLUS = 1 << 17, + CPLUSPLUS = 1 << 18, /* Use enum for constants. */ - ENUM = 1 << 18, + ENUM = 1 << 19, /* Generate #include statements. */ - INCLUDE = 1 << 21, + INCLUDE = 1 << 20, /* Assume 7-bit, not 8-bit, characters. */ - SEVENBIT = 1 << 22 + SEVENBIT = 1 << 21 }; /* This class denotes a set of key positions. */ @@ -115,8 +118,14 @@ public: /* Constructors. */ Positions (); - Positions (int key1); - Positions (int key1, int key2); + Positions (int pos1); + Positions (int pos1, int pos2); + + /* Copy constructor. */ + Positions (const Positions& src); + + /* Assignment operator. */ + Positions& operator= (const Positions& src); /* Accessors. */ int operator[] (unsigned int index) const; @@ -130,6 +139,14 @@ public: Returns true if there are no duplicates, false otherwise. */ bool sort (); + /* Set operations. Assumes the array is in reverse order. */ + bool contains (int pos) const; + void add (int pos); + void remove (int pos); + + /* Output in external syntax. */ + void print () const; + private: /* Number of positions. */ unsigned int _size; @@ -246,13 +263,9 @@ public: void set_delimiters (const char *delimiters); /* Returns key positions. - Only to be called if !options[ALLCHARS]. */ + Only to be used if !options[ALLCHARS]. */ const Positions& get_key_positions () const; - /* Returns total distinct key positions. - Only to be called if !options[ALLCHARS]. */ - int get_max_keysig_size () const; - private: /* Prints program usage to given stream. */ void short_usage (FILE * stream) const; diff --git a/src/options.icc b/src/options.icc index 5b7e9b1..444c849 100644 --- a/src/options.icc +++ b/src/options.icc @@ -32,18 +32,37 @@ Positions::Positions () } INLINE -Positions::Positions (int key1) +Positions::Positions (int pos1) : _size (1) { - _positions[0] = key1; + _positions[0] = pos1; } INLINE -Positions::Positions (int key1, int key2) +Positions::Positions (int pos1, int pos2) : _size (2) { - _positions[0] = key1; - _positions[1] = key2; + _positions[0] = pos1; + _positions[1] = pos2; +} + +/* Copy constructor. */ + +INLINE +Positions::Positions (const Positions& src) + : _size (src._size) +{ + memcpy (_positions, src._positions, _size * sizeof (_positions[0])); +} + +/* Assignment operator. */ + +INLINE Positions& +Positions::operator= (const Positions& src) +{ + _size = src._size; + memcpy (_positions, src._positions, _size * sizeof (_positions[0])); + return *this; } /* Accessors. */ @@ -238,17 +257,9 @@ Options::get_delimiters () const } /* Returns key positions. - Only to be called if !options[ALLCHARS]. */ + Only to be used if !options[ALLCHARS]. */ INLINE const Positions& Options::get_key_positions () const { return _key_positions; } - -/* Returns total distinct key positions. - Only to be called if !options[ALLCHARS]. */ -INLINE int -Options::get_max_keysig_size () const -{ - return _key_positions.get_size(); -} diff --git a/src/output.cc b/src/output.cc index eaf3260..8a34c21 100644 --- a/src/output.cc +++ b/src/output.cc @@ -87,8 +87,9 @@ Output::Output (KeywordExt_List *head, const char *struct_decl, unsigned int verbatim_declarations_lineno, const char *verbatim_code, const char *verbatim_code_end, unsigned int verbatim_code_lineno, - int total_keys, int total_duplicates, int max_key_len, - int min_key_len, int alpha_size, const int *occurrences, + int total_keys, int max_key_len, int min_key_len, + const Positions& positions, int total_duplicates, + int alpha_size, const int *occurrences, const int *asso_values) : _head (head), _struct_decl (struct_decl), _struct_decl_lineno (struct_decl_lineno), _return_type (return_type), @@ -99,9 +100,10 @@ Output::Output (KeywordExt_List *head, const char *struct_decl, _verbatim_code (verbatim_code), _verbatim_code_end (verbatim_code_end), _verbatim_code_lineno (verbatim_code_lineno), - _total_keys (total_keys), _total_duplicates (total_duplicates), + _total_keys (total_keys), _max_key_len (max_key_len), _min_key_len (min_key_len), - _alpha_size (alpha_size), + _key_positions (positions), + _total_duplicates (total_duplicates), _alpha_size (alpha_size), _occurrences (occurrences), _asso_values (asso_values) { } @@ -480,32 +482,33 @@ Output::output_hash_function () const printf ("{\n"); /* First the asso_values array. */ - { - printf (" static %s%s asso_values[] =\n" - " {", - const_readonly_array, - smallest_integral_type (_max_hash_value + 1)); + if (option[ALLCHARS] || _key_positions.get_size() > 0) + { + printf (" static %s%s asso_values[] =\n" + " {", + const_readonly_array, + smallest_integral_type (_max_hash_value + 1)); - const int columns = 10; + const int columns = 10; - /* Calculate maximum number of digits required for MAX_HASH_VALUE. */ - int field_width = 2; - for (int trunc = _max_hash_value; (trunc /= 10) > 0;) - field_width++; + /* Calculate maximum number of digits required for MAX_HASH_VALUE. */ + int field_width = 2; + for (int trunc = _max_hash_value; (trunc /= 10) > 0;) + field_width++; - for (int count = 0; count < _alpha_size; count++) - { - if (count > 0) - printf (","); - if ((count % columns) == 0) - printf ("\n "); - printf ("%*d", field_width, - _occurrences[count] ? _asso_values[count] : _max_hash_value + 1); - } + for (int count = 0; count < _alpha_size; count++) + { + if (count > 0) + printf (","); + if ((count % columns) == 0) + printf ("\n "); + printf ("%*d", field_width, + _occurrences[count] ? _asso_values[count] : _max_hash_value + 1); + } - printf ("\n" - " };\n"); - } + printf ("\n" + " };\n"); + } if (option[ALLCHARS]) { @@ -526,12 +529,18 @@ Output::output_hash_function () const " }\n" " return hval;\n"); } + else if (_key_positions.get_size() == 0) + { + /* Trivial case: No key positions at all. */ + printf (" return %s;\n", + option[NOLENGTH] ? "0" : "len"); + } else { /* Iterate through the key positions. Remember that Positions::sort() has sorted them in decreasing order, with Positions::LASTCHAR coming last. */ - PositionIterator iter (option.get_key_positions()); + PositionIterator iter (_key_positions); int key_pos; /* Get the highest key position. */ @@ -547,9 +556,9 @@ Output::output_hash_function () const printf (" return %s", option[NOLENGTH] ? "" : "len + "); - if (option.get_key_positions().get_size() == 2 - && option.get_key_positions()[0] == 1 - && option.get_key_positions()[1] == Positions::LASTCHAR) + if (_key_positions.get_size() == 2 + && _key_positions[0] == 1 + && _key_positions[1] == Positions::LASTCHAR) /* Optimize special case of "-k 1,$". */ printf ("asso_values[%sstr[len - 1]] + asso_values[%sstr[0]]", char_to_index, char_to_index); @@ -1492,6 +1501,12 @@ Output::output () printf (" code produced by gperf version %s */\n", version_string); option.print_options (); printf ("\n"); + if (!option[POSITIONS]) + { + printf ("/* Computed positions: -k'"); + _key_positions.print(); + printf ("' */\n"); + } if (_verbatim_declarations < _verbatim_declarations_end) { diff --git a/src/output.h b/src/output.h index d9756ab..e141645 100644 --- a/src/output.h +++ b/src/output.h @@ -27,6 +27,7 @@ #define output_h 1 #include "keyword-list.h" +#include "options.h" /* OSF/1 cxx needs these forward declarations. */ struct Output_Constants; @@ -48,8 +49,9 @@ public: const char *verbatim_code_end, unsigned int verbatim_code_lineno, int total_keys, - int total_duplicates, int max_key_len, int min_key_len, + const Positions& positions, + int total_duplicates, int alpha_size, const int *occurrences, const int *asso_values); @@ -113,12 +115,14 @@ private: unsigned int const _verbatim_code_lineno; /* Total number of keys, counting duplicates. */ int const _total_keys; - /* Total number of duplicate hash values. */ - int const _total_duplicates; /* Maximum length of the longest keyword. */ int const _max_key_len; /* Minimum length of the shortest keyword. */ int const _min_key_len; + /* Key positions. Only to be used if !options[ALLCHARS]. */ + Positions const _key_positions; + /* Total number of duplicate hash values. */ + int const _total_duplicates; /* Minimum hash value for all keywords. */ int _min_hash_value; /* Maximum hash value for all keywords. */ diff --git a/src/search.cc b/src/search.cc index a17cc28..98efcc6 100644 --- a/src/search.cc +++ b/src/search.cc @@ -27,7 +27,7 @@ #include <stdlib.h> /* declares exit(), rand(), srand() */ #include <string.h> /* declares memset(), memcmp() */ #include <time.h> /* declares time() */ -#include <limits.h> /* defines INT_MIN, INT_MAX */ +#include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */ #include "options.h" #include "hash-table.h" @@ -35,6 +35,7 @@ Search::Search (KeywordExt_List *list) : _head (list), + _key_positions (option.get_key_positions()), _alpha_size (option[SEVENBIT] ? 128 : 256), _occurrences (new int[_alpha_size]), _asso_values (new int[_alpha_size]), @@ -43,7 +44,7 @@ Search::Search (KeywordExt_List *list) } void -Search::prepare () +Search::preprepare () { KeywordExt_List *temp; @@ -52,10 +53,6 @@ Search::prepare () for (temp = _head; temp; temp = temp->rest()) _total_keys++; - /* Initialize each keyword's _selchars array. */ - for (temp = _head; temp; temp = temp->rest()) - temp->first()->init_selchars(); - /* Compute the minimum and maximum keyword length. */ _max_key_len = INT_MIN; _min_key_len = INT_MAX; @@ -78,6 +75,212 @@ Search::prepare () "len == 0 before calling the gperf generated lookup function.\n"); exit (1); } +} + +/* Initializes each keyword's _selchars array. */ +void +Search::init_selchars (bool use_all_chars, const Positions& positions) const +{ + for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) + temp->first()->init_selchars(use_all_chars, positions); +} + +/* Deletes each keyword's _selchars array. */ +void +Search::delete_selchars () const +{ + for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) + temp->first()->delete_selchars(); +} + +/* Count the duplicate keywords that occur with a given set of positions. */ +unsigned int +Search::count_duplicates (const Positions& positions) const +{ + init_selchars (false, positions); + + unsigned int count = 0; + Hash_Table representatives (_total_keys, option[NOLENGTH]); + for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) + { + KeywordExt *keyword = temp->first(); + if (representatives.insert (keyword)) + count++; + } + + delete_selchars (); + + return count; +} + +void +Search::find_positions () +{ + /* Determine good key positions. */ + + /* 1. Find positions that must occur in order to distinguish duplicates. */ + Positions mandatory; + + if (!option[DUP]) + { + for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest()) + { + KeywordExt *keyword1 = l1->first(); + for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest()) + { + KeywordExt *keyword2 = l2->first(); + + /* If keyword1 and keyword2 have the same length and differ + in just one position, and it is not the last character, + this position is mandatory. */ + if (keyword1->_allchars_length == keyword2->_allchars_length) + { + int n = keyword1->_allchars_length; + int i; + for (i = 1; i < n; i++) + if (keyword1->_allchars[i-1] != keyword2->_allchars[i-1]) + break; + if (i < n + && memcmp (&keyword1->_allchars[i], + &keyword2->_allchars[i], + n - i) + == 0) + { + /* Position i is mandatory. */ + if (!mandatory.contains (i)) + mandatory.add (i); + } + } + } + } + } + + /* 2. Add positions, as long as this decreases the duplicates count. */ + int imax = (_max_key_len < Positions::MAX_KEY_POS + ? _max_key_len : Positions::MAX_KEY_POS); + Positions current = mandatory; + unsigned int current_duplicates_count = count_duplicates (current); + for (;;) + { + Positions best; + unsigned int best_duplicates_count = UINT_MAX; + + for (int i = imax; i >= 0; i--) + if (!current.contains (i)) + { + Positions tryal = current; + tryal.add (i); + unsigned int try_duplicates_count = count_duplicates (tryal); + + /* We prefer 'try' to 'best' if it produces less duplicates, + or if it produces the same number of duplicates but with + a more efficient hash function. */ + if (try_duplicates_count < best_duplicates_count + || (try_duplicates_count == best_duplicates_count && i > 0)) + { + best = tryal; + best_duplicates_count = try_duplicates_count; + } + } + + /* Stop adding positions when it gives no improvement. */ + if (best_duplicates_count >= current_duplicates_count) + break; + + current = best; + current_duplicates_count = best_duplicates_count; + } + + /* 3. Remove positions, as long as this doesn't increase the duplicates + count. */ + for (;;) + { + Positions best; + unsigned int best_duplicates_count = UINT_MAX; + + for (int i = imax; i >= 0; i--) + if (current.contains (i) && !mandatory.contains (i)) + { + Positions tryal = current; + tryal.remove (i); + unsigned int try_duplicates_count = count_duplicates (tryal); + + /* We prefer 'try' to 'best' if it produces less duplicates, + or if it produces the same number of duplicates but with + a more efficient hash function. */ + if (try_duplicates_count < best_duplicates_count + || (try_duplicates_count == best_duplicates_count && i == 0)) + { + best = tryal; + best_duplicates_count = try_duplicates_count; + } + } + + /* Stop removing positions when it gives no improvement. */ + if (best_duplicates_count > current_duplicates_count) + break; + + current = best; + current_duplicates_count = best_duplicates_count; + } + + /* 4. Replace two positions by one, as long as this doesn't increase the + duplicates count. */ + for (;;) + { + Positions best; + unsigned int best_duplicates_count = UINT_MAX; + + for (int i1 = imax; i1 >= 0; i1--) + if (current.contains (i1) && !mandatory.contains (i1)) + for (int i2 = imax; i2 >= 0; i2--) + if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1) + for (int i3 = imax; i3 >= 0; i3--) + if (!current.contains (i3)) + { + Positions tryal = current; + tryal.remove (i1); + tryal.remove (i2); + tryal.add (i3); + unsigned int try_duplicates_count = + count_duplicates (tryal); + + /* We prefer 'try' to 'best' if it produces less duplicates, + or if it produces the same number of duplicates but with + a more efficient hash function. */ + if (try_duplicates_count < best_duplicates_count + || (try_duplicates_count == best_duplicates_count + && (i1 == 0 || i2 == 0 || i3 > 0))) + { + best = tryal; + best_duplicates_count = try_duplicates_count; + } + } + + /* Stop removing positions when it gives no improvement. */ + if (best_duplicates_count > current_duplicates_count) + break; + + current = best; + current_duplicates_count = best_duplicates_count; + } + + /* That's it. Hope it's good enough. */ + _key_positions = current; +} + +void +Search::prepare () +{ + KeywordExt_List *temp; + + preprepare (); + + if (!option[POSITIONS]) + find_positions (); + + /* Initialize each keyword's _selchars array. */ + init_selchars (option[ALLCHARS], _key_positions); /* Check for duplicates, i.e. keywords with the same _selchars array (and - if !option[NOLENGTH] - also the same length). @@ -140,8 +343,12 @@ Search::prepare () _total_duplicates); else { - fprintf (stderr, "%d input keys have identical hash values,\ntry different key positions or use option -D.\n", + fprintf (stderr, "%d input keys have identical hash values,\n", _total_duplicates); + if (option[POSITIONS]) + fprintf (stderr, "try different key positions or use option -D.\n"); + else + fprintf (stderr, "use option -D.\n"); exit (1); } } @@ -313,7 +520,7 @@ Search::max_key_length () const int Search::get_max_keysig_size () const { - return option[ALLCHARS] ? _max_key_len : option.get_max_keysig_size (); + return option[ALLCHARS] ? _max_key_len : _key_positions.get_size(); } /* ---------------------- Finding good asso_values[] ----------------------- */ @@ -758,9 +965,12 @@ Search::optimize () else /* Yow, big problems. we're outta here! */ { fprintf (stderr, - "\nInternal error, duplicate value %d:\n" - "try options -D or -m or -r, or use new key positions.\n\n", + "\nInternal error, duplicate hash code value %d:\n", hashcode); + if (option[POSITIONS]) + fprintf (stderr, "try options -m or -D or -r, or use new key positions.\n\n"); + else + fprintf (stderr, "try options -m or -D or -r.\n\n"); exit (1); } } diff --git a/src/search.h b/src/search.h index 7e1f781..0d67458 100644 --- a/src/search.h +++ b/src/search.h @@ -27,6 +27,7 @@ #define search_h 1 #include "keyword-list.h" +#include "options.h" #include "bool-array.h" class Search @@ -36,6 +37,18 @@ public: ~Search (); void optimize (); private: + void preprepare (); + + /* Initializes each keyword's _selchars array. */ + void init_selchars (bool use_all_chars, const Positions& positions) const; + /* Deletes each keyword's _selchars array. */ + void delete_selchars () const; + + /* Count the duplicate keywords that occur with a given set of positions. */ + unsigned int count_duplicates (const Positions& positions) const; + + void find_positions (); + void prepare (); /* Computes the sum of occurrences of the _selchars of a keyword. */ @@ -90,16 +103,19 @@ public: /* Total number of keywords, counting duplicates. */ int _total_keys; - /* Total number of duplicates that have been moved to _duplicate_link lists - (not counting their representatives which stay on the main list). */ - int _total_duplicates; - /* Maximum length of the longest keyword. */ int _max_key_len; /* Minimum length of the shortest keyword. */ int _min_key_len; + /* User-specified or computed key positions. */ + Positions _key_positions; + + /* Total number of duplicates that have been moved to _duplicate_link lists + (not counting their representatives which stay on the main list). */ + int _total_duplicates; + /* Size of alphabet. */ int const _alpha_size; |