diff options
author | Bruno Haible <bruno@clisp.org> | 2002-12-19 12:35:17 +0000 |
---|---|---|
committer | Bruno Haible <bruno@clisp.org> | 2002-12-19 12:35:17 +0000 |
commit | 1186e616cbdad01f89e59a971f7073e55bcd0111 (patch) | |
tree | fb75cfa992e5543810c3ee9dfd7475ca084637a2 | |
parent | 388a431c7d1b5f856993425e4a947b8aaa709431 (diff) | |
download | gperf-1186e616cbdad01f89e59a971f7073e55bcd0111.tar.gz |
Rework the hash table code.
-rw-r--r-- | ChangeLog | 19 | ||||
-rw-r--r-- | src/hash-table.cc | 98 | ||||
-rw-r--r-- | src/hash-table.h | 33 | ||||
-rw-r--r-- | src/search.cc | 100 |
4 files changed, 168 insertions, 82 deletions
@@ -1,3 +1,22 @@ +2002-11-03 Bruno Haible <bruno@clisp.org> + + Bug fix: The hash table could fail to detect duplicates, between + keywords of different length, when option -n (option[NOLENGTH]) was + given. + * src/hash-table.h (Hash_Table::Hash_Table): Pass table size, not + vector and vector size as arguments. + (Hash_Table::_log_size): New field. + (Hash_Table::equal): New declaration. + * src/hash-table.cc (size_factor): New variable. + (Hash_Table::Hash_Table): Pass table size, not vector and vector size + as arguments. Allocate the vector here. + (Hash_Table::~Hash_Table): Deallocate the vector here. + (Hash_Table::equal): New function. + (Hash_Table::insert): Use it. Don't use item->_allchars_length for the + increment if _ignore_length is true. + * src/search.cc (TABLE_MULTIPLE): Remove variable. + (Search::prepare): Update. + 2002-11-02 Bruno Haible <bruno@clisp.org> Provide documentation also in PDF format. diff --git a/src/hash-table.cc b/src/hash-table.cc index b6622ba..3b8c139 100644 --- a/src/hash-table.cc +++ b/src/hash-table.cc @@ -28,21 +28,61 @@ #include <hash.h> #include "options.h" -/* The size of the hash table is always the smallest power of 2 >= the size - indicated by the user. This allows several optimizations, including - the use of double hashing and elimination of the mod instruction. - Note that the size had better be larger than the number of items - in the hash table, else there's trouble!!! Note that the memory - for the hash table is allocated *outside* the intialization routine. - This compromises information hiding somewhat, but greatly reduces - memory fragmentation, since we can now use alloca! */ - -Hash_Table::Hash_Table (KeywordExt **table_ptr, int s, bool ignore_len): - _table (table_ptr), _size (s), _collisions (0), _ignore_length (ignore_len) +/* We use a hash table with double hashing. This is the simplest kind of + hash table, given that we always only insert and never remove entries + from the hash table. */ + +/* To make double hashing efficient, there need to be enough spare entries. */ +static const int size_factor = 10; + +/* We make the size of the hash table a power of 2. This allows for two + optimizations: It eliminates the modulo instruction, and allows for an + easy secondary hashing function. */ + +/* Constructor. */ +Hash_Table::Hash_Table (unsigned int size, bool ignore_length) + : _ignore_length (ignore_length), + _collisions (0) { + /* There need to be enough spare entries. */ + size = size * size_factor; + + /* Find smallest power of 2 that is >= size. */ + unsigned int shift = 0; + if ((size >> 16) > 0) + { + size = size >> 16; + shift += 16; + } + if ((size >> 8) > 0) + { + size = size >> 8; + shift += 8; + } + if ((size >> 4) > 0) + { + size = size >> 4; + shift += 4; + } + if ((size >> 2) > 0) + { + size = size >> 2; + shift += 2; + } + if ((size >> 1) > 0) + { + size = size >> 1; + shift += 1; + } + _log_size = shift; + _size = 1 << shift; + + /* Allocate table. */ + _table = new KeywordExt*[_size]; memset (_table, 0, _size * sizeof (*_table)); } +/* Destructor. */ Hash_Table::~Hash_Table () { if (option[DEBUG]) @@ -76,24 +116,38 @@ Hash_Table::~Hash_Table () fprintf (stderr, "\nend dumping hash table\n\n"); } + delete[] _table; } -/* If the ITEM is already in the hash table return the item found - in the table. Otherwise inserts the ITEM, and returns FALSE. - Uses double hashing. */ +/* Compares two items. */ +inline bool +Hash_Table::equal (KeywordExt *item1, KeywordExt *item2) +{ + return item1->_selchars_length == item2->_selchars_length + && memcmp (item1->_selchars, item2->_selchars, item2->_selchars_length) + == 0 + && (_ignore_length + || item1->_allchars_length == item2->_allchars_length); +} +/* Attempts to insert ITEM in the table. If there is already an equal + entry in it, returns it. Otherwise inserts ITEM and returns NULL. */ KeywordExt * Hash_Table::insert (KeywordExt *item) { - unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length); - int probe = hash_val & (_size - 1); - int increment = ((hash_val ^ item->_allchars_length) | 1) & (_size - 1); - - while (_table[probe]) + unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length); + unsigned int probe = hash_val & (_size - 1); + unsigned int increment = + (((hash_val >> _log_size) + ^ (_ignore_length ? 0 : item->_allchars_length)) + << 1) + 1; + /* Note that because _size is a power of 2 and increment is odd, + we have gcd(increment,_size) = 1, which guarantees that we'll find + an empty entry during the loop. */ + + while (_table[probe] != NULL) { - if (_table[probe]->_selchars_length == item->_selchars_length - && memcmp (_table[probe]->_selchars, item->_selchars, item->_selchars_length) == 0 - && (_ignore_length || _table[probe]->_allchars_length == item->_allchars_length)) + if (equal (_table[probe], item)) return _table[probe]; _collisions++; diff --git a/src/hash-table.h b/src/hash-table.h index 141c690..4de8466 100644 --- a/src/hash-table.h +++ b/src/hash-table.h @@ -28,18 +28,37 @@ #include "keyword.h" +/* Hash table of KeywordExt* entries. + Two entries are considered equal if their _selchars are the same and + - if !ignore_length - if their _allchars_length are the same. */ + class Hash_Table { -private: - KeywordExt ** _table; /* Vector of pointers to linked lists of keywords. */ - int _size; /* Size of the vector. */ - int _collisions; /* Find out how well our double hashing is working! */ - bool _ignore_length; - public: - Hash_Table (KeywordExt **t, int s, bool ignore_len); + /* Constructor. + size is the maximum number of entries. + ignore_length determines a detail in the comparison function. */ + Hash_Table (unsigned int size, bool ignore_length); + /* Destructor. */ ~Hash_Table (); + /* Attempts to insert ITEM in the table. If there is already an equal + entry in it, returns it. Otherwise inserts ITEM and returns NULL. */ KeywordExt * insert (KeywordExt *item); + +private: + /* Vector of entries. */ + KeywordExt ** _table; + /* Size of the vector. */ + unsigned int _size; + /* log2(_size). */ + unsigned int _log_size; + /* A detail of the comparison function. */ + bool _ignore_length; + /* Statistics: Number of collisions so far. */ + unsigned int _collisions; + + /* Compares two items. */ + bool equal (KeywordExt *item1, KeywordExt *item2); }; #endif diff --git a/src/search.cc b/src/search.cc index dac5932..1f39f54 100644 --- a/src/search.cc +++ b/src/search.cc @@ -31,9 +31,6 @@ #include "options.h" #include "hash-table.h" -/* Make the hash table 8 times larger than the number of keyword entries. */ -static const int TABLE_MULTIPLE = 10; - /* Efficiently returns the least power of two greater than or equal to X! */ #define POW(X) ((!X)?1:(X-=1,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,(++X))) @@ -50,7 +47,6 @@ void Search::prepare () { KeywordExt_List *temp; - KeywordExt_List *trail = NULL; _total_keys = 0; for (temp = _head; temp; temp = temp->rest()) @@ -58,59 +54,57 @@ Search::prepare () temp->first()->init_selchars(_occurrences); _total_keys++; } - - /* Hash table this number of times larger than keyword number. */ - int table_size = (_list_len = _total_keys) * TABLE_MULTIPLE; - /* Table must be a power of 2 for the hash function scheme to work. */ - KeywordExt **table = new KeywordExt*[POW (table_size)]; - - /* Make large hash table for efficiency. */ - Hash_Table found_link (table, table_size, option[NOLENGTH]); - - /* Test whether there are any links and also set the maximum length of - an identifier in the keyword list. */ - _total_duplicates = 0; - _max_key_len = INT_MIN; - _min_key_len = INT_MAX; - for (temp = _head; temp; temp = temp->rest()) - { - KeywordExt *keyword = temp->first(); - KeywordExt *other_keyword = found_link.insert (keyword); - /* Check for links. We deal with these by building an equivalence class - of all duplicate values (i.e., links) so that only 1 keyword is - representative of the entire collection. This *greatly* simplifies - processing during later stages of the program. */ + _list_len = _total_keys; - if (other_keyword) - { - _total_duplicates++; - _list_len--; - trail->rest() = temp->rest(); - temp->first()->_duplicate_link = other_keyword->_duplicate_link; - other_keyword->_duplicate_link = temp->first(); - - /* Complain if user hasn't enabled the duplicate option. */ - if (!option[DUP] || option[DEBUG]) - fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n", - keyword->_allchars_length, keyword->_allchars, - other_keyword->_allchars_length, other_keyword->_allchars, - keyword->_selchars_length, keyword->_selchars); - } - else - { - temp->first()->_duplicate_link = NULL; - trail = temp; - } + { + /* Make hash table for efficiency. */ + Hash_Table found_link (_list_len, option[NOLENGTH]); - /* Update minimum and maximum keyword length, if needed. */ - if (_max_key_len < keyword->_allchars_length) - _max_key_len = keyword->_allchars_length; - if (_min_key_len > keyword->_allchars_length) - _min_key_len = keyword->_allchars_length; - } + /* Test whether there are any links and also set the maximum length of + an identifier in the keyword list. */ + _total_duplicates = 0; + _max_key_len = INT_MIN; + _min_key_len = INT_MAX; + KeywordExt_List *trail = NULL; + for (temp = _head; temp; temp = temp->rest()) + { + KeywordExt *keyword = temp->first(); + KeywordExt *other_keyword = found_link.insert (keyword); - delete[] table; + /* Check for links. We deal with these by building an equivalence class + of all duplicate values (i.e., links) so that only 1 keyword is + representative of the entire collection. This *greatly* simplifies + processing during later stages of the program. */ + + if (other_keyword) + { + _total_duplicates++; + _list_len--; + trail->rest() = temp->rest(); + temp->first()->_duplicate_link = other_keyword->_duplicate_link; + other_keyword->_duplicate_link = temp->first(); + + /* Complain if user hasn't enabled the duplicate option. */ + if (!option[DUP] || option[DEBUG]) + fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n", + keyword->_allchars_length, keyword->_allchars, + other_keyword->_allchars_length, other_keyword->_allchars, + keyword->_selchars_length, keyword->_selchars); + } + else + { + temp->first()->_duplicate_link = NULL; + trail = temp; + } + + /* Update minimum and maximum keyword length, if needed. */ + if (_max_key_len < keyword->_allchars_length) + _max_key_len = keyword->_allchars_length; + if (_min_key_len > keyword->_allchars_length) + _min_key_len = keyword->_allchars_length; + } + } /* Exit program if links exists and option[DUP] not set, since we can't continue */ if (_total_duplicates) |