diff options
-rw-r--r-- | ChangeLog | 44 | ||||
-rw-r--r-- | NEWS | 2 | ||||
-rw-r--r-- | doc/gperf.texi | 17 | ||||
-rw-r--r-- | src/input.cc | 4 | ||||
-rw-r--r-- | src/keyword.cc | 14 | ||||
-rw-r--r-- | src/keyword.h | 6 | ||||
-rw-r--r-- | src/options.cc | 12 | ||||
-rw-r--r-- | src/options.h | 5 | ||||
-rw-r--r-- | src/output.cc | 153 | ||||
-rw-r--r-- | src/search.cc | 204 | ||||
-rw-r--r-- | src/search.h | 26 | ||||
-rw-r--r-- | tests/Makefile.in | 2 | ||||
-rw-r--r-- | tests/permutc2.exp | 116 | ||||
-rw-r--r-- | tests/permutc2.gperf | 14 | ||||
-rw-r--r-- | tests/test-6.exp | 3 |
15 files changed, 583 insertions, 39 deletions
@@ -1,5 +1,49 @@ 2002-12-10 Bruno Haible <bruno@clisp.org> + * src/options.h (UPPERLOWER): New enum value. + * src/options.cc (Options::long_usage): Document option --ignore-case. + (Options::~Options): Update. + (long_options): Add option --ignore-case. + (Options::parse_options): Handle option --ignore-case. + * src/input.cc (Input::read_input): Recognize option %ignore-case. + * src/keyword.h (KeywordExt::init_selchars_tuple, + KeywordExt::init_selchars_multiset, KeywordExt::init_selchars_low): + Add alpha_unify argument. + * src/keyword.cc (KeywordExt::init_selchars_low): Add alpha_unify + argument. + (KeywordExt::init_selchars_tuple): Add alpha_unify argument. + (KeywordExt::init_selchars_multiset): Add alpha_unify argument. + * src/search.h (Search::compute_alpha_size, + Search::compute_alpha_unify): New declarations. + (Search::init_selchars_multiset): Add alpha_unify argument. + (Search::_alpha_unify): New field. + * src/search.cc (Search::compute_alpha_size, + Search::compute_alpha_unify): New functions. + (Search::init_selchars_tuple): Update. + (Search::find_positions): Temporarily set _alpha_unify. Perform a + case insensitive comparison if needed. + (Search::init_selchars_multiset): Add alpha_unify argument. + (Search::count_duplicates_multiset): Call compute_alpha_unify. + (Search::find_alpha_inc): Temporarily set _alpha_unify. At the end, + set _alpha_size and _alpha_unify. + (Search::prepare): Update. Don't compute _alpga_size here. + (Search::optimize): Propagate unified asso_values. + (Search::~Search) Delete _alpha_unify. + * src/output.cc (output_upperlower_strcmp, output_upperlower_strncmp, + output_upperlower_memcmp): New functions. + (Output_Compare_Strcmp::output_comparison, + Output_Compare_Strncmp::output_comparison, + Output_Compare_Memcmp::output_comparison): Use the case-insensitive + comparison function if --ignore-case was given. + (Output::output): Emit the auxiliary case-insensitive comparison + function if needed. + * tests/permutc2.gperf, tests/permutc2.exp: New files. + * tests/Makefile.in (check-test): Also check permutc2.gperf. + * tests/test-6.exp: Update. + * doc/gperf.texi (Gperf Declarations): Document %ignore-case. + (Input Details): Document option --ignore-case. + * NEWS: Update. + * src/search.cc (Search::optimize): Fill unused asso_values[] entries with a large value. * src/output.h (Output::Output): Remove occurrences argument. @@ -10,6 +10,7 @@ New in 2.97: * The following options can now be specified inside the input file: %delimiters=DELIMITER-LIST %struct-type + %ignore-case %language=LANGUAGE-NAME %define slot-name NAME %define hash-function-name NAME @@ -42,6 +43,7 @@ New in 2.97: * The options -f/--fast and -o/--occurrence-sort have no effect any more. * Added option -P/--pic that optimizes the generated code for use in shared libraries. +* Added option --ignore-case that produces a case independent lookup function. * Bug fixes. New in 2.7.2: diff --git a/doc/gperf.texi b/doc/gperf.texi index b9ce792..56676ce 100644 --- a/doc/gperf.texi +++ b/doc/gperf.texi @@ -436,6 +436,12 @@ commas or newlines. Allows you to include a @code{struct} type declaration for generated code; see above for an example. +@item %ignore-case +@cindex @samp{%ignore-case} +Consider upper and lower case ASCII characters as equivalent. The string +comparison will use a case insignificant character comparison. Note that +locale dependent case mappings are ignored. + @item %language=@var{language-name} @cindex @samp{%language} Instructs @code{gperf} to generate code in the language specified by the @@ -790,6 +796,17 @@ part of the type declaration. Keywords and additional fields may follow this, one group of fields per line. A set of examples for generating perfect hash tables and functions for Ada, C, C++, Pascal, Modula 2, Modula 3 and JavaScript reserved words are distributed with this release. + +@item --ignore-case +Consider upper and lower case ASCII characters as equivalent. The string +comparison will use a case insignificant character comparison. Note that +locale dependent case mappings are ignored. This option is therefore not +suitable if a properly internationalized or locale aware case mapping +should be used. (For example, in a Turkish locale, the upper case equivalent +of the lowercase ASCII letter @samp{i} is the non-ASCII character +@samp{capital i with dot above}.) For this case, it is better to apply +an uppercase or lowercase conversion on the string before passing it to +the @code{gperf} generated function. @end table @node Output Language, Output Details, Input Details, Options diff --git a/src/input.cc b/src/input.cc index 3c81ba4..933911e 100644 --- a/src/input.cc +++ b/src/input.cc @@ -472,6 +472,10 @@ Input::read_input () option.set (TYPE); else + if (is_declaration (line, line_end, lineno, "ignore-case")) + option.set (UPPERLOWER); + else + if (is_declaration_with_arg (line, line_end, lineno, "language", &arg)) option.set_language (arg); diff --git a/src/keyword.cc b/src/keyword.cc index a784e07..2d7904e 100644 --- a/src/keyword.cc +++ b/src/keyword.cc @@ -59,7 +59,7 @@ static inline void sort_char_set (unsigned int *base, int len) */ unsigned int * -KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) +KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) { const char *k = _allchars; unsigned int *key_set = @@ -73,6 +73,8 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c unsigned int c = static_cast<unsigned char>(*k); if (alpha_inc) c += alpha_inc[k-_allchars]; + if (alpha_unify) + c = alpha_unify[c]; *ptr = c; ptr++; } @@ -99,6 +101,8 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c else /* Out of range of KEY length, so we'll just skip it. */ continue; + if (alpha_unify) + c = alpha_unify[c]; *ptr = c; ptr++; } @@ -111,16 +115,16 @@ KeywordExt::init_selchars_low (bool use_all_chars, const Positions& positions, c } void -KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions) +KeywordExt::init_selchars_tuple (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify) { - init_selchars_low (use_all_chars, positions, NULL); + init_selchars_low (use_all_chars, positions, alpha_unify, NULL); } void -KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) +KeywordExt::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) { unsigned int *selchars = - init_selchars_low (use_all_chars, positions, alpha_inc); + init_selchars_low (use_all_chars, positions, alpha_unify, alpha_inc); /* Sort the selchars elements alphabetically. */ sort_char_set (selchars, _selchars_length); diff --git a/src/keyword.h b/src/keyword.h index 02e0364..9f09186 100644 --- a/src/keyword.h +++ b/src/keyword.h @@ -68,9 +68,9 @@ struct KeywordExt : public Keyword /* Methods depending on the keyposition list. */ /* Initializes selchars and selchars_length, without reordering. */ - void init_selchars_tuple (bool use_all_chars, const Positions& positions); + void init_selchars_tuple (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify); /* Initializes selchars and selchars_length, with reordering. */ - void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc); + void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc); /* Deletes selchars. */ void delete_selchars (); @@ -81,7 +81,7 @@ struct KeywordExt : public Keyword int _final_index; private: - unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc); + unsigned int * init_selchars_low (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc); }; /* An abstract factory for creating Keyword instances. diff --git a/src/options.cc b/src/options.cc index 29d0b7b..7e533cd 100644 --- a/src/options.cc +++ b/src/options.cc @@ -106,6 +106,10 @@ Options::long_usage (FILE * stream) const " is considered part of the type declaration. Key\n" " words and additional fields may follow this, one\n" " group of fields per line.\n"); + fprintf (stream, + " --ignore-case Consider upper and lower case ASCII characters as\n" + " equivalent. Note that locale dependent case mappings\n" + " are ignored.\n"); fprintf (stream, "\n"); fprintf (stream, "Language for the output code:\n"); @@ -463,6 +467,7 @@ Options::~Options () "\nENUM is........: %s" "\nINCLUDE is.....: %s" "\nSEVENBIT is....: %s" + "\nUPPERLOWER is..: %s" "\nlookup function name = %s" "\nhash function name = %s" "\nword list name = %s" @@ -492,6 +497,7 @@ Options::~Options () _option_word & ENUM ? "enabled" : "disabled", _option_word & INCLUDE ? "enabled" : "disabled", _option_word & SEVENBIT ? "enabled" : "disabled", + _option_word & UPPERLOWER ? "enabled" : "disabled", _function_name, _hash_name, _wordlist_name, _slot_name, _initializer_suffix, _asso_iterations, _jump, _size_multiple, _initial_asso_value, _delimiters, _total_switches); @@ -605,6 +611,7 @@ Options::set_delimiters (const char *delimiters) static const struct option long_options[] = { { "output-file", required_argument, NULL, CHAR_MAX + 1 }, + { "ignore-case", no_argument, NULL, CHAR_MAX + 2 }, { "delimiters", required_argument, NULL, 'e' }, { "struct-type", no_argument, NULL, 't' }, { "language", required_argument, NULL, 'L' }, @@ -949,6 +956,11 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\ _output_file_name = /*getopt*/optarg; break; } + case CHAR_MAX + 2: /* Case insignificant. */ + { + _option_word |= UPPERLOWER; + break; + } default: short_usage (stderr); exit (1); diff --git a/src/options.h b/src/options.h index 2a9567b..4dda513 100644 --- a/src/options.h +++ b/src/options.h @@ -98,7 +98,10 @@ enum Option_Type SEVENBIT = 1 << 19, /* Optimize for position-independent code. */ - SHAREDLIB = 1 << 20 + SHAREDLIB = 1 << 20, + + /* Ignore case of ASCII characters. */ + UPPERLOWER = 1 << 21 }; /* Class manager for gperf program Options. */ diff --git a/src/output.cc b/src/output.cc index a92a9a3..6e0d688 100644 --- a/src/output.cc +++ b/src/output.cc @@ -232,6 +232,131 @@ Output::output_constants (struct Output_Constants& style) const /* ------------------------------------------------------------------------- */ +/* Output gperf's ASCII-case insensitive strcmp replacement. */ + +static void +output_upperlower_strcmp () +{ + printf ("#ifndef GPERF_CASE_STRCMP\n" + "#define GPERF_CASE_STRCMP 1\n" + "static int\n" + "gperf_case_strcmp "); + printf (option[KRC] ? + "(s1, s2)\n" + " register char *s1;\n" + " register char *s2;\n" : + option[C] ? + "(s1, s2)\n" + " register const char *s1;\n" + " register const char *s2;\n" : + option[ANSIC] | option[CPLUSPLUS] ? + "(register const char *s1, register const char *s2)\n" : + ""); + printf ("{\n" + " for (;;)\n" + " {\n" + " unsigned char c1 = *s1++;\n" + " unsigned char c2 = *s2++;\n" + " if (c1 >= 'A' && c1 <= 'Z')\n" + " c1 += 'a' - 'A';\n" + " if (c2 >= 'A' && c2 <= 'Z')\n" + " c2 += 'a' - 'A';\n" + " if (c1 != 0 && c1 == c2)\n" + " continue;\n" + " return (int)c1 - (int)c2;\n" + " }\n" + "}\n" + "#endif\n\n"); +} + +/* Output gperf's ASCII-case insensitive strncmp replacement. */ + +static void +output_upperlower_strncmp () +{ + printf ("#ifndef GPERF_CASE_STRNCMP\n" + "#define GPERF_CASE_STRNCMP 1\n" + "static int\n" + "gperf_case_strncmp "); + printf (option[KRC] ? + "(s1, s2, n)\n" + " register char *s1;\n" + " register char *s2;\n" + " register unsigned int n;\n" : + option[C] ? + "(s1, s2, n)\n" + " register const char *s1;\n" + " register const char *s2;\n" + " register unsigned int n;\n" : + option[ANSIC] | option[CPLUSPLUS] ? + "(register const char *s1, register const char *s2, register unsigned int n)\n" : + ""); + printf ("{\n" + " for (; n > 0;)\n" + " {\n" + " unsigned char c1 = *s1++;\n" + " unsigned char c2 = *s2++;\n" + " if (c1 >= 'A' && c1 <= 'Z')\n" + " c1 += 'a' - 'A';\n" + " if (c2 >= 'A' && c2 <= 'Z')\n" + " c2 += 'a' - 'A';\n" + " if (c1 != 0 && c1 == c2)\n" + " {\n" + " n--;\n" + " continue;\n" + " }\n" + " return (int)c1 - (int)c2;\n" + " }\n" + " return 0;\n" + "}\n" + "#endif\n\n"); +} + +/* Output gperf's ASCII-case insensitive memcmp replacement. */ + +static void +output_upperlower_memcmp () +{ + printf ("#ifndef GPERF_CASE_MEMCMP\n" + "#define GPERF_CASE_MEMCMP 1\n" + "static int\n" + "gperf_case_memcmp "); + printf (option[KRC] ? + "(s1, s2, n)\n" + " register char *s1;\n" + " register char *s2;\n" + " register unsigned int n;\n" : + option[C] ? + "(s1, s2, n)\n" + " register const char *s1;\n" + " register const char *s2;\n" + " register unsigned int n;\n" : + option[ANSIC] | option[CPLUSPLUS] ? + "(register const char *s1, register const char *s2, register unsigned int n)\n" : + ""); + printf ("{\n" + " for (; n > 0;)\n" + " {\n" + " unsigned char c1 = *s1++;\n" + " unsigned char c2 = *s2++;\n" + " if (c1 >= 'A' && c1 <= 'Z')\n" + " c1 += 'a' - 'A';\n" + " if (c2 >= 'A' && c2 <= 'Z')\n" + " c2 += 'a' - 'A';\n" + " if (c1 == c2)\n" + " {\n" + " n--;\n" + " continue;\n" + " }\n" + " return (int)c1 - (int)c2;\n" + " }\n" + " return 0;\n" + "}\n" + "#endif\n\n"); +} + +/* ------------------------------------------------------------------------- */ + /* Outputs a keyword, as a string: enclosed in double quotes, escaping backslashes, double quote and unprintable characters. */ @@ -363,7 +488,10 @@ void Output_Compare_Strcmp::output_comparison (const Output_Expr& expr1, expr1.output_expr (); printf (" == *"); expr2.output_expr (); - printf (" && !strcmp ("); + printf (" && !"); + if (option[UPPERLOWER]) + printf ("gperf_case_"); + printf ("strcmp ("); expr1.output_expr (); printf (" + 1, "); expr2.output_expr (); @@ -389,7 +517,10 @@ void Output_Compare_Strncmp::output_comparison (const Output_Expr& expr1, expr1.output_expr (); printf (" == *"); expr2.output_expr (); - printf (" && !strncmp ("); + printf (" && !"); + if (option[UPPERLOWER]) + printf ("gperf_case_"); + printf ("strncmp ("); expr1.output_expr (); printf (" + 1, "); expr2.output_expr (); @@ -418,7 +549,10 @@ void Output_Compare_Memcmp::output_comparison (const Output_Expr& expr1, expr1.output_expr (); printf (" == *"); expr2.output_expr (); - printf (" && !memcmp ("); + printf (" && !"); + if (option[UPPERLOWER]) + printf ("gperf_case_"); + printf ("memcmp ("); expr1.output_expr (); printf (" + 1, "); expr2.output_expr (); @@ -1522,6 +1656,19 @@ Output::output () printf ("/* maximum key range = %d, duplicates = %d */\n\n", _max_hash_value - _min_hash_value + 1, _total_duplicates); + if (option[UPPERLOWER]) + { + if (option[LENTABLE]) + output_upperlower_memcmp (); + else + { + if (option[COMP]) + output_upperlower_strncmp (); + else + output_upperlower_strcmp (); + } + } + if (option[CPLUSPLUS]) printf ("class %s\n" "{\n" diff --git a/src/search.cc b/src/search.cc index 730a1ac..7ceb26c 100644 --- a/src/search.cc +++ b/src/search.cc @@ -146,12 +146,39 @@ Search::preprepare () /* ====================== Finding good byte positions ====================== */ +/* Computes the upper bound on the indices passed to asso_values[], + assuming no alpha_increments. */ +unsigned int +Search::compute_alpha_size () const +{ + return (option[SEVENBIT] ? 128 : 256); +} + +/* Computes the unification rules between different asso_values[c], + assuming no alpha_increments. */ +unsigned int * +Search::compute_alpha_unify () const +{ + if (option[UPPERLOWER]) + { + unsigned int alpha_size = compute_alpha_size(); + unsigned int *alpha_unify = new unsigned int[alpha_size]; + for (unsigned int c = 0; c < alpha_size; c++) + alpha_unify[c] = c; + for (unsigned int c = 'A'; c <= 'Z'; c++) + alpha_unify[c] = c + ('a'-'A'); + return alpha_unify; + } + else + return NULL; +} + /* Initializes each keyword's _selchars array. */ void Search::init_selchars_tuple (bool use_all_chars, const Positions& positions) const { for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) - temp->first()->init_selchars_tuple(use_all_chars, positions); + temp->first()->init_selchars_tuple(use_all_chars, positions, _alpha_unify); } /* Deletes each keyword's _selchars array. */ @@ -202,6 +229,9 @@ Search::find_positions () return; } + /* Compute preliminary value for _alpha_unify. */ + _alpha_unify = compute_alpha_unify (); + /* 1. Find positions that must occur in order to distinguish duplicates. */ Positions mandatory; @@ -222,17 +252,42 @@ Search::find_positions () int n = keyword1->_allchars_length; int i; for (i = 1; i < n; i++) - if (keyword1->_allchars[i-1] != keyword2->_allchars[i-1]) - break; - if (i < n - && memcmp (&keyword1->_allchars[i], - &keyword2->_allchars[i], - n - i) - == 0) { - /* Position i is mandatory. */ - if (!mandatory.contains (i)) - mandatory.add (i); + unsigned char c1 = keyword1->_allchars[i-1]; + unsigned char c2 = keyword2->_allchars[i-1]; + if (option[UPPERLOWER]) + { + if (c1 >= 'A' && c1 <= 'Z') + c1 += 'a' - 'A'; + if (c2 >= 'A' && c2 <= 'Z') + c2 += 'a' - 'A'; + } + if (c1 != c2) + break; + } + if (i < n) + { + int j; + for (j = i + 1; j <= n; j++) + { + unsigned char c1 = keyword1->_allchars[j-1]; + unsigned char c2 = keyword2->_allchars[j-1]; + if (option[UPPERLOWER]) + { + if (c1 >= 'A' && c1 <= 'Z') + c1 += 'a' - 'A'; + if (c2 >= 'A' && c2 <= 'Z') + c2 += 'a' - 'A'; + } + if (c1 != c2) + break; + } + if (j > n) + { + /* Position i is mandatory. */ + if (!mandatory.contains (i)) + mandatory.add (i); + } } } } @@ -379,16 +434,113 @@ Search::find_positions () } fprintf (stderr, "\n"); } + + /* Free preliminary value for _alpha_unify. */ + delete[] _alpha_unify; } /* ===================== Finding good alpha increments ===================== */ +/* Computes the upper bound on the indices passed to asso_values[]. */ +unsigned int +Search::compute_alpha_size (const unsigned int *alpha_inc) const +{ + unsigned int max_alpha_inc = 0; + for (int i = 0; i < _max_key_len; i++) + if (max_alpha_inc < alpha_inc[i]) + max_alpha_inc = alpha_inc[i]; + return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc; +} + +/* Computes the unification rules between different asso_values[c]. */ +unsigned int * +Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const +{ + if (option[UPPERLOWER]) + { + /* Without alpha increments, we would simply unify + 'A' -> 'a', ..., 'Z' -> 'z'. + But when a keyword contains at position i a character c, + we have the constraint + asso_values[tolower(c) + alpha_inc[i]] == + asso_values[toupper(c) + alpha_inc[i]]. + This introduces a unification + toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i]. + Note that this unification can extend outside the range of + ASCII letters! But still every unified character pair is at + a distance of 'a'-'A' = 32, or (after chained unification) + at a multiple of 32. So in the end the alpha_unify vector has + the form c -> c + 32 * f(c) where f(c) is a nonnegative + integer. */ + unsigned int alpha_size = compute_alpha_size (alpha_inc); + + unsigned int *alpha_unify = new unsigned int[alpha_size]; + for (unsigned int c = 0; c < alpha_size; c++) + alpha_unify[c] = c; + + for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) + { + KeywordExt *keyword = temp->first(); + + if (option[ALLCHARS]) + /* Iterate through all character positions. */ + for (int i = 0; i < keyword->_allchars_length; i++) + { + unsigned int c = static_cast<unsigned char>(keyword->_allchars[i]); + if (c >= 'A' && c <= 'Z') + c += 'a' - 'A'; + if (c >= 'a' && c <= 'z') + { + c += alpha_inc[i]; + /* Unify c with c - ('a'-'A'). */ + unsigned int d = alpha_unify[c]; + unsigned int b = c - ('a'-'A'); + for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A')) + alpha_unify[a] = d; + } + } + else + { + /* Iterate through the selected character positions. */ + PositionIterator iter (positions); + + for (int i; (i = iter.next ()) != PositionIterator::EOS; ) + { + unsigned int c; + if (i == Positions::LASTCHAR) + c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]); + else if (i <= keyword->_allchars_length) + c = static_cast<unsigned char>(keyword->_allchars[i - 1]); + else + continue; + if (c >= 'A' && c <= 'Z') + c += 'a' - 'A'; + if (c >= 'a' && c <= 'z') + { + if (i != Positions::LASTCHAR) + c += alpha_inc[i - 1]; + /* Unify c with c - ('a'-'A'). */ + unsigned int d = alpha_unify[c]; + unsigned int b = c - ('a'-'A'); + for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A')) + alpha_unify[a] = d; + } + } + } + } + return alpha_unify; + } + else + /* Identity mapping. */ + return NULL; +} + /* Initializes each keyword's _selchars array. */ void -Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const +Search::init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const { for (KeywordExt_List *temp = _head; temp; temp = temp->rest()) - temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_inc); + temp->first()->init_selchars_multiset(use_all_chars, positions, alpha_unify, alpha_inc); } /* Count the duplicate keywords that occur with the given set of positions @@ -402,7 +554,9 @@ Search::count_duplicates_multiset (const unsigned int *alpha_inc) const /* Run through the keyword list and count the duplicates incrementally. The result does not depend on the order of the keyword list, thanks to the formula above. */ - init_selchars_multiset (option[ALLCHARS], _key_positions, alpha_inc); + init_selchars_multiset (option[ALLCHARS], _key_positions, + compute_alpha_unify (_key_positions, alpha_inc), + alpha_inc); unsigned int count = 0; { @@ -428,7 +582,9 @@ Search::find_alpha_inc () /* The goal is to choose _alpha_inc[] such that it doesn't introduce artificial duplicates. In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */ + _alpha_unify = compute_alpha_unify (); unsigned int duplicates_goal = count_duplicates_tuple (_key_positions); + delete[] _alpha_unify; /* Start with zero increments. This is sufficient in most cases. */ unsigned int *current = new unsigned int [_max_key_len]; @@ -545,6 +701,8 @@ Search::find_alpha_inc () } _alpha_inc = current; + _alpha_size = compute_alpha_size (_alpha_inc); + _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc); } /* ======================= Finding good asso_values ======================== */ @@ -555,7 +713,8 @@ Search::prepare () KeywordExt_List *temp; /* Initialize each keyword's _selchars array. */ - init_selchars_multiset(option[ALLCHARS], _key_positions, _alpha_inc); + init_selchars_multiset(option[ALLCHARS], _key_positions, + _alpha_unify, _alpha_inc); /* Check for duplicates, i.e. keywords with the same _selchars array (and - if !option[NOLENGTH] - also the same length). @@ -634,14 +793,6 @@ Search::prepare () } } - /* Compute _alpha_size, the upper bound on the indices passed to - asso_values[]. */ - unsigned int max_alpha_inc = 0; - for (int i = 0; i < _max_key_len; i++) - if (max_alpha_inc < _alpha_inc[i]) - max_alpha_inc = _alpha_inc[i]; - _alpha_size = (option[SEVENBIT] ? 128 : 256) + max_alpha_inc; - /* Compute the occurrences of each character in the alphabet. */ _occurrences = new int[_alpha_size]; memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0])); @@ -1492,6 +1643,12 @@ Search::optimize () for (unsigned int c = 0; c < _alpha_size; c++) if (_occurrences[c] == 0) _asso_values[c] = max_hash_value + 1; + + /* Propagate unified asso_values. */ + if (_alpha_unify) + for (unsigned int c = 0; c < _alpha_size; c++) + if (_alpha_unify[c] != c) + _asso_values[c] = _asso_values[_alpha_unify[c]]; } /* Prints out some diagnostics upon completion. */ @@ -1533,5 +1690,6 @@ Search::~Search () } delete[] _asso_values; delete[] _occurrences; + delete[] _alpha_unify; delete[] _alpha_inc; } diff --git a/src/search.h b/src/search.h index ada14d2..0cd2829 100644 --- a/src/search.h +++ b/src/search.h @@ -41,6 +41,14 @@ public: private: void preprepare (); + /* Computes the upper bound on the indices passed to asso_values[], + assuming no alpha_increments. */ + unsigned int compute_alpha_size () const; + + /* Computes the unification rules between different asso_values[c], + assuming no alpha_increments. */ + unsigned int * compute_alpha_unify () const; + /* Initializes each keyword's _selchars array. */ void init_selchars_tuple (bool use_all_chars, const Positions& positions) const; /* Deletes each keyword's _selchars array. */ @@ -52,8 +60,14 @@ private: /* Find good key positions. */ void find_positions (); + /* Computes the upper bound on the indices passed to asso_values[]. */ + unsigned int compute_alpha_size (const unsigned int *alpha_inc) const; + + /* Computes the unification rules between different asso_values[c]. */ + unsigned int * compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const; + /* Initializes each keyword's _selchars array. */ - void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_inc) const; + void init_selchars_multiset (bool use_all_chars, const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const; /* Count the duplicate keywords that occur with the given set of positions and a given alpha_inc[] array. */ @@ -115,13 +129,17 @@ public: /* Adjustments to add to bytes add specific key positions. */ unsigned int * _alpha_inc; + /* Size of alphabet. */ + unsigned int _alpha_size; + + /* Alphabet character unification, either the identity or a mapping from + upper case characters to lower case characters (and maybe more). */ + unsigned int * _alpha_unify; + /* Total number of duplicates that have been moved to _duplicate_link lists (not counting their representatives which stay on the main list). */ int _total_duplicates; - /* Size of alphabet. */ - unsigned int _alpha_size; - /* Counts occurrences of each key set character. _occurrences[c] is the number of times that c occurs among the _selchars of a keyword. */ diff --git a/tests/Makefile.in b/tests/Makefile.in index c857fc4..c7c3c37 100644 --- a/tests/Makefile.in +++ b/tests/Makefile.in @@ -143,6 +143,8 @@ check-test: diff $(srcdir)/permut2.exp permut2.out $(GPERF) -m5 < $(srcdir)/permut3.gperf > permut3.out diff $(srcdir)/permut3.exp permut3.out + $(GPERF) -m5 --ignore-case < $(srcdir)/permutc2.gperf > permutc2.out + diff $(srcdir)/permutc2.exp permutc2.out $(GPERF) -C -E -G -I -t < $(srcdir)/charsets.gperf > charsets.out diff $(srcdir)/charsets.exp charsets.out $(GPERF) -C -E -G -I -t < $(srcdir)/languages.gperf > languages.out diff --git a/tests/permutc2.exp b/tests/permutc2.exp new file mode 100644 index 0000000..02632d5 --- /dev/null +++ b/tests/permutc2.exp @@ -0,0 +1,116 @@ +/* C code produced by gperf version 2.7.2 */ +/* Command-line: ../src/gperf -m5 --ignore-case */ +/* Computed positions: -k'1-2' */ + +/* Test of a hash function which has to deal with permutation and + case-independence. Without case-independence, the alpha_inc is 1. + With case-independence, the alpha_inc is 3. */ + +#define TOTAL_KEYWORDS 8 +#define MIN_WORD_LENGTH 2 +#define MAX_WORD_LENGTH 2 +#define MIN_HASH_VALUE 2 +#define MAX_HASH_VALUE 9 +/* maximum key range = 8, duplicates = 0 */ + +#ifndef GPERF_CASE_STRCMP +#define GPERF_CASE_STRCMP 1 +static int +gperf_case_strcmp (s1, s2) + register const char *s1; + register const char *s2; +{ + for (;;) + { + unsigned char c1 = *s1++; + unsigned char c2 = *s2++; + if (c1 >= 'A' && c1 <= 'Z') + c1 += 'a' - 'A'; + if (c2 >= 'A' && c2 <= 'Z') + c2 += 'a' - 'A'; + if (c1 != 0 && c1 == c2) + continue; + return (int)c1 - (int)c2; + } +} +#endif + +#ifdef __GNUC__ +__inline +#else +#ifdef __cplusplus +inline +#endif +#endif +static unsigned int +hash (str, len) + register const char *str; + register unsigned int len; +{ + static unsigned char asso_values[] = + { + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10, 1,10,10, 3,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10, 3, 1, + 0, 7, 1, 0, 3,10,10, 1,10,10, + 3,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 3, 1, 0, 0, 1, 0, 2,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10,10, + 10,10,10,10,10,10,10,10,10 + }; + return len + asso_values[(unsigned char)str[1]+3] + asso_values[(unsigned char)str[0]]; +} + +#ifdef __GNUC__ +__inline +#endif +const char * +in_word_set (str, len) + register const char *str; + register unsigned int len; +{ + static const char * wordlist[] = + { + "", "", + "{w", + "az", + "ay", + "za", + "ya", + "x{", + "x[", + "[w" + }; + + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) + { + register int key = hash (str, len); + + if (key <= MAX_HASH_VALUE && key >= 0) + { + register const char *s = wordlist[key]; + + if (*str == *s && !gperf_case_strcmp (str + 1, s + 1)) + return s; + } + } + return 0; +} diff --git a/tests/permutc2.gperf b/tests/permutc2.gperf new file mode 100644 index 0000000..223a193 --- /dev/null +++ b/tests/permutc2.gperf @@ -0,0 +1,14 @@ +%{ +/* Test of a hash function which has to deal with permutation and + case-independence. Without case-independence, the alpha_inc is 1. + With case-independence, the alpha_inc is 3. */ +%} +%% +az +za +ay +ya +x{ +x[ +{w +[w diff --git a/tests/test-6.exp b/tests/test-6.exp index 14cd1d7..cabbe9c 100644 --- a/tests/test-6.exp +++ b/tests/test-6.exp @@ -20,6 +20,9 @@ Input file interpretation: is considered part of the type declaration. Key words and additional fields may follow this, one group of fields per line. + --ignore-case Consider upper and lower case ASCII characters as + equivalent. Note that locale dependent case mappings + are ignored. Language for the output code: -L, --language=LANGUAGE-NAME |