summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruno Haible <bruno@clisp.org>2002-12-19 12:35:17 +0000
committerBruno Haible <bruno@clisp.org>2002-12-19 12:35:17 +0000
commit1186e616cbdad01f89e59a971f7073e55bcd0111 (patch)
treefb75cfa992e5543810c3ee9dfd7475ca084637a2
parent388a431c7d1b5f856993425e4a947b8aaa709431 (diff)
downloadgperf-1186e616cbdad01f89e59a971f7073e55bcd0111.tar.gz
Rework the hash table code.
-rw-r--r--ChangeLog19
-rw-r--r--src/hash-table.cc98
-rw-r--r--src/hash-table.h33
-rw-r--r--src/search.cc100
4 files changed, 168 insertions, 82 deletions
diff --git a/ChangeLog b/ChangeLog
index 7ade621..2553a1b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2002-11-03 Bruno Haible <bruno@clisp.org>
+
+ Bug fix: The hash table could fail to detect duplicates, between
+ keywords of different length, when option -n (option[NOLENGTH]) was
+ given.
+ * src/hash-table.h (Hash_Table::Hash_Table): Pass table size, not
+ vector and vector size as arguments.
+ (Hash_Table::_log_size): New field.
+ (Hash_Table::equal): New declaration.
+ * src/hash-table.cc (size_factor): New variable.
+ (Hash_Table::Hash_Table): Pass table size, not vector and vector size
+ as arguments. Allocate the vector here.
+ (Hash_Table::~Hash_Table): Deallocate the vector here.
+ (Hash_Table::equal): New function.
+ (Hash_Table::insert): Use it. Don't use item->_allchars_length for the
+ increment if _ignore_length is true.
+ * src/search.cc (TABLE_MULTIPLE): Remove variable.
+ (Search::prepare): Update.
+
2002-11-02 Bruno Haible <bruno@clisp.org>
Provide documentation also in PDF format.
diff --git a/src/hash-table.cc b/src/hash-table.cc
index b6622ba..3b8c139 100644
--- a/src/hash-table.cc
+++ b/src/hash-table.cc
@@ -28,21 +28,61 @@
#include <hash.h>
#include "options.h"
-/* The size of the hash table is always the smallest power of 2 >= the size
- indicated by the user. This allows several optimizations, including
- the use of double hashing and elimination of the mod instruction.
- Note that the size had better be larger than the number of items
- in the hash table, else there's trouble!!! Note that the memory
- for the hash table is allocated *outside* the intialization routine.
- This compromises information hiding somewhat, but greatly reduces
- memory fragmentation, since we can now use alloca! */
-
-Hash_Table::Hash_Table (KeywordExt **table_ptr, int s, bool ignore_len):
- _table (table_ptr), _size (s), _collisions (0), _ignore_length (ignore_len)
+/* We use a hash table with double hashing. This is the simplest kind of
+ hash table, given that we always only insert and never remove entries
+ from the hash table. */
+
+/* To make double hashing efficient, there need to be enough spare entries. */
+static const int size_factor = 10;
+
+/* We make the size of the hash table a power of 2. This allows for two
+ optimizations: It eliminates the modulo instruction, and allows for an
+ easy secondary hashing function. */
+
+/* Constructor. */
+Hash_Table::Hash_Table (unsigned int size, bool ignore_length)
+ : _ignore_length (ignore_length),
+ _collisions (0)
{
+ /* There need to be enough spare entries. */
+ size = size * size_factor;
+
+ /* Find smallest power of 2 that is >= size. */
+ unsigned int shift = 0;
+ if ((size >> 16) > 0)
+ {
+ size = size >> 16;
+ shift += 16;
+ }
+ if ((size >> 8) > 0)
+ {
+ size = size >> 8;
+ shift += 8;
+ }
+ if ((size >> 4) > 0)
+ {
+ size = size >> 4;
+ shift += 4;
+ }
+ if ((size >> 2) > 0)
+ {
+ size = size >> 2;
+ shift += 2;
+ }
+ if ((size >> 1) > 0)
+ {
+ size = size >> 1;
+ shift += 1;
+ }
+ _log_size = shift;
+ _size = 1 << shift;
+
+ /* Allocate table. */
+ _table = new KeywordExt*[_size];
memset (_table, 0, _size * sizeof (*_table));
}
+/* Destructor. */
Hash_Table::~Hash_Table ()
{
if (option[DEBUG])
@@ -76,24 +116,38 @@ Hash_Table::~Hash_Table ()
fprintf (stderr, "\nend dumping hash table\n\n");
}
+ delete[] _table;
}
-/* If the ITEM is already in the hash table return the item found
- in the table. Otherwise inserts the ITEM, and returns FALSE.
- Uses double hashing. */
+/* Compares two items. */
+inline bool
+Hash_Table::equal (KeywordExt *item1, KeywordExt *item2)
+{
+ return item1->_selchars_length == item2->_selchars_length
+ && memcmp (item1->_selchars, item2->_selchars, item2->_selchars_length)
+ == 0
+ && (_ignore_length
+ || item1->_allchars_length == item2->_allchars_length);
+}
+/* Attempts to insert ITEM in the table. If there is already an equal
+ entry in it, returns it. Otherwise inserts ITEM and returns NULL. */
KeywordExt *
Hash_Table::insert (KeywordExt *item)
{
- unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length);
- int probe = hash_val & (_size - 1);
- int increment = ((hash_val ^ item->_allchars_length) | 1) & (_size - 1);
-
- while (_table[probe])
+ unsigned hash_val = hashpjw (item->_selchars, item->_selchars_length);
+ unsigned int probe = hash_val & (_size - 1);
+ unsigned int increment =
+ (((hash_val >> _log_size)
+ ^ (_ignore_length ? 0 : item->_allchars_length))
+ << 1) + 1;
+ /* Note that because _size is a power of 2 and increment is odd,
+ we have gcd(increment,_size) = 1, which guarantees that we'll find
+ an empty entry during the loop. */
+
+ while (_table[probe] != NULL)
{
- if (_table[probe]->_selchars_length == item->_selchars_length
- && memcmp (_table[probe]->_selchars, item->_selchars, item->_selchars_length) == 0
- && (_ignore_length || _table[probe]->_allchars_length == item->_allchars_length))
+ if (equal (_table[probe], item))
return _table[probe];
_collisions++;
diff --git a/src/hash-table.h b/src/hash-table.h
index 141c690..4de8466 100644
--- a/src/hash-table.h
+++ b/src/hash-table.h
@@ -28,18 +28,37 @@
#include "keyword.h"
+/* Hash table of KeywordExt* entries.
+ Two entries are considered equal if their _selchars are the same and
+ - if !ignore_length - if their _allchars_length are the same. */
+
class Hash_Table
{
-private:
- KeywordExt ** _table; /* Vector of pointers to linked lists of keywords. */
- int _size; /* Size of the vector. */
- int _collisions; /* Find out how well our double hashing is working! */
- bool _ignore_length;
-
public:
- Hash_Table (KeywordExt **t, int s, bool ignore_len);
+ /* Constructor.
+ size is the maximum number of entries.
+ ignore_length determines a detail in the comparison function. */
+ Hash_Table (unsigned int size, bool ignore_length);
+ /* Destructor. */
~Hash_Table ();
+ /* Attempts to insert ITEM in the table. If there is already an equal
+ entry in it, returns it. Otherwise inserts ITEM and returns NULL. */
KeywordExt * insert (KeywordExt *item);
+
+private:
+ /* Vector of entries. */
+ KeywordExt ** _table;
+ /* Size of the vector. */
+ unsigned int _size;
+ /* log2(_size). */
+ unsigned int _log_size;
+ /* A detail of the comparison function. */
+ bool _ignore_length;
+ /* Statistics: Number of collisions so far. */
+ unsigned int _collisions;
+
+ /* Compares two items. */
+ bool equal (KeywordExt *item1, KeywordExt *item2);
};
#endif
diff --git a/src/search.cc b/src/search.cc
index dac5932..1f39f54 100644
--- a/src/search.cc
+++ b/src/search.cc
@@ -31,9 +31,6 @@
#include "options.h"
#include "hash-table.h"
-/* Make the hash table 8 times larger than the number of keyword entries. */
-static const int TABLE_MULTIPLE = 10;
-
/* Efficiently returns the least power of two greater than or equal to X! */
#define POW(X) ((!X)?1:(X-=1,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,(++X)))
@@ -50,7 +47,6 @@ void
Search::prepare ()
{
KeywordExt_List *temp;
- KeywordExt_List *trail = NULL;
_total_keys = 0;
for (temp = _head; temp; temp = temp->rest())
@@ -58,59 +54,57 @@ Search::prepare ()
temp->first()->init_selchars(_occurrences);
_total_keys++;
}
-
- /* Hash table this number of times larger than keyword number. */
- int table_size = (_list_len = _total_keys) * TABLE_MULTIPLE;
- /* Table must be a power of 2 for the hash function scheme to work. */
- KeywordExt **table = new KeywordExt*[POW (table_size)];
-
- /* Make large hash table for efficiency. */
- Hash_Table found_link (table, table_size, option[NOLENGTH]);
-
- /* Test whether there are any links and also set the maximum length of
- an identifier in the keyword list. */
- _total_duplicates = 0;
- _max_key_len = INT_MIN;
- _min_key_len = INT_MAX;
- for (temp = _head; temp; temp = temp->rest())
- {
- KeywordExt *keyword = temp->first();
- KeywordExt *other_keyword = found_link.insert (keyword);
- /* Check for links. We deal with these by building an equivalence class
- of all duplicate values (i.e., links) so that only 1 keyword is
- representative of the entire collection. This *greatly* simplifies
- processing during later stages of the program. */
+ _list_len = _total_keys;
- if (other_keyword)
- {
- _total_duplicates++;
- _list_len--;
- trail->rest() = temp->rest();
- temp->first()->_duplicate_link = other_keyword->_duplicate_link;
- other_keyword->_duplicate_link = temp->first();
-
- /* Complain if user hasn't enabled the duplicate option. */
- if (!option[DUP] || option[DEBUG])
- fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
- keyword->_allchars_length, keyword->_allchars,
- other_keyword->_allchars_length, other_keyword->_allchars,
- keyword->_selchars_length, keyword->_selchars);
- }
- else
- {
- temp->first()->_duplicate_link = NULL;
- trail = temp;
- }
+ {
+ /* Make hash table for efficiency. */
+ Hash_Table found_link (_list_len, option[NOLENGTH]);
- /* Update minimum and maximum keyword length, if needed. */
- if (_max_key_len < keyword->_allchars_length)
- _max_key_len = keyword->_allchars_length;
- if (_min_key_len > keyword->_allchars_length)
- _min_key_len = keyword->_allchars_length;
- }
+ /* Test whether there are any links and also set the maximum length of
+ an identifier in the keyword list. */
+ _total_duplicates = 0;
+ _max_key_len = INT_MIN;
+ _min_key_len = INT_MAX;
+ KeywordExt_List *trail = NULL;
+ for (temp = _head; temp; temp = temp->rest())
+ {
+ KeywordExt *keyword = temp->first();
+ KeywordExt *other_keyword = found_link.insert (keyword);
- delete[] table;
+ /* Check for links. We deal with these by building an equivalence class
+ of all duplicate values (i.e., links) so that only 1 keyword is
+ representative of the entire collection. This *greatly* simplifies
+ processing during later stages of the program. */
+
+ if (other_keyword)
+ {
+ _total_duplicates++;
+ _list_len--;
+ trail->rest() = temp->rest();
+ temp->first()->_duplicate_link = other_keyword->_duplicate_link;
+ other_keyword->_duplicate_link = temp->first();
+
+ /* Complain if user hasn't enabled the duplicate option. */
+ if (!option[DUP] || option[DEBUG])
+ fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"%.*s\".\n",
+ keyword->_allchars_length, keyword->_allchars,
+ other_keyword->_allchars_length, other_keyword->_allchars,
+ keyword->_selchars_length, keyword->_selchars);
+ }
+ else
+ {
+ temp->first()->_duplicate_link = NULL;
+ trail = temp;
+ }
+
+ /* Update minimum and maximum keyword length, if needed. */
+ if (_max_key_len < keyword->_allchars_length)
+ _max_key_len = keyword->_allchars_length;
+ if (_min_key_len > keyword->_allchars_length)
+ _min_key_len = keyword->_allchars_length;
+ }
+ }
/* Exit program if links exists and option[DUP] not set, since we can't continue */
if (_total_duplicates)