6 files changed, 370 insertions, 197 deletions
diff --git a/ChangeLog b/ChangeLog
index 25ad42e..215c83c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -40,6 +40,20 @@ Version 7.4 10-Sep-07
     
 9.  When pcregrep was used with the --colour option, it missed the line ending
     sequence off the lines that it output. 
+    
+10. It was pointed out to me that arrays of string pointers cause lots of 
+    relocations when a shared library is dynamically loaded. A technique of 
+    using a single long string with a table of offsets can drastically reduce 
+    these. I have refactored PCRE in four places to do this. The result is 
+    dramatic:
+    
+      Originally:                          290
+      After changing UCP table:            187
+      After changing error message table:   43 
+      After changing table of "verbs":      36
+      After changing table of POSIX names:  22
+      
+    Thanks to the folks working on Gregex for glib for this insight.
 
 
 Version 7.3 28-Aug-07
diff --git a/pcre_compile.c b/pcre_compile.c
index 15e06bc..fe74e59 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -140,35 +140,47 @@ static const short int escapes[] = {
 #endif
 
 
-/* Table of special "verbs" like (*PRUNE) */
+/* Table of special "verbs" like (*PRUNE). This is a short table, so it is 
+searched linearly. Put all the names into a single string, in order to reduce 
+the number of relocations when a shared library is dynamically linked. */
 
 typedef struct verbitem {
-  const char *name;
   int   len;
   int   op;
 } verbitem;
 
+static const char verbnames[] =
+  "ACCEPT\0" 
+  "COMMIT\0" 
+  "F\0"      
+  "FAIL\0"   
+  "PRUNE\0"  
+  "SKIP\0"   
+  "THEN";   
+
 static verbitem verbs[] = {
-  { "ACCEPT", 6, OP_ACCEPT },
-  { "COMMIT", 6, OP_COMMIT },
-  { "F",      1, OP_FAIL },
-  { "FAIL",   4, OP_FAIL },
-  { "PRUNE",  5, OP_PRUNE },
-  { "SKIP",   4, OP_SKIP  },
-  { "THEN",   4, OP_THEN  }
+  { 6, OP_ACCEPT },   /* ACCEPT - rows are in the same order as verbnames */
+  { 6, OP_COMMIT },   /* COMMIT */
+  { 1, OP_FAIL },     /* F */
+  { 4, OP_FAIL },     /* FAIL */
+  { 5, OP_PRUNE },    /* PRUNE */
+  { 4, OP_SKIP  },    /* SKIP */
+  { 4, OP_THEN  }     /* THEN - each len must equal the name's length */
 };
 
 static int verbcount = sizeof(verbs)/sizeof(verbitem);
 
 
-/* Tables of names of POSIX character classes and their lengths. The list is
-terminated by a zero length entry. The first three must be alpha, lower, upper,
-as this is assumed for handling case independence. */
+/* Tables of names of POSIX character classes and their lengths. The names are 
+now all in a single string, to reduce the number of relocations when a shared 
+library is dynamically loaded. The list of lengths is terminated by a zero
+length entry. The first three must be alpha, lower, upper, as this is assumed
+for handling case independence. */
 
-static const char *const posix_names[] = {
-  "alpha", "lower", "upper",
-  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
-  "print", "punct", "space", "word",  "xdigit" };
+static const char posix_names[] =
+  "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0" 
+  "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0" 
+  "word\0"   "xdigit";
 
 static const uschar posix_name_lengths[] = {
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
@@ -207,84 +219,88 @@ static const int posix_class_maps[] = {
 /* The texts of compile-time error messages. These are "char *" because they
 are passed to the outside world. Do not ever re-use any error number, because
 they are documented. Always add a new error instead. Messages marked DEAD below
-are no longer used. */
-
-static const char *error_texts[] = {
-  "no error",
-  "\\ at end of pattern",
-  "\\c at end of pattern",
-  "unrecognized character follows \\",
-  "numbers out of order in {} quantifier",
+are no longer used. This used to be a table of strings, but in order to reduce 
+the number of relocations needed when a shared library is loaded dynamically, 
+it is now one long string. We cannot use a table of offsets, because the 
+lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we 
+simply count through to the one we want - this isn't a performance issue 
+because these strings are used only when there is a compilation error. */
+
+static const char error_texts[] =
+  "no error\0"
+  "\\ at end of pattern\0"
+  "\\c at end of pattern\0"
+  "unrecognized character follows \\\0"
+  "numbers out of order in {} quantifier\0"
   /* 5 */
-  "number too big in {} quantifier",
-  "missing terminating ] for character class",
-  "invalid escape sequence in character class",
-  "range out of order in character class",
-  "nothing to repeat",
+  "number too big in {} quantifier\0"
+  "missing terminating ] for character class\0"
+  "invalid escape sequence in character class\0"
+  "range out of order in character class\0"
+  "nothing to repeat\0"
   /* 10 */
-  "operand of unlimited repeat could match the empty string",  /** DEAD **/
-  "internal error: unexpected repeat",
-  "unrecognized character after (?",
-  "POSIX named classes are supported only within a class",
-  "missing )",
+  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
+  "internal error: unexpected repeat\0"
+  "unrecognized character after (?\0"
+  "POSIX named classes are supported only within a class\0"
+  "missing )\0"
   /* 15 */
-  "reference to non-existent subpattern",
-  "erroffset passed as NULL",
-  "unknown option bit(s) set",
-  "missing ) after comment",
-  "parentheses nested too deeply",  /** DEAD **/
+  "reference to non-existent subpattern\0"
+  "erroffset passed as NULL\0"
+  "unknown option bit(s) set\0"
+  "missing ) after comment\0"
+  "parentheses nested too deeply\0"  /** DEAD **/
   /* 20 */
-  "regular expression is too large",
-  "failed to get memory",
-  "unmatched parentheses",
-  "internal error: code overflow",
-  "unrecognized character after (?<",
+  "regular expression is too large\0"
+  "failed to get memory\0"
+  "unmatched parentheses\0"
+  "internal error: code overflow\0"
+  "unrecognized character after (?<\0"
   /* 25 */
-  "lookbehind assertion is not fixed length",
-  "malformed number or name after (?(",
-  "conditional group contains more than two branches",
-  "assertion expected after (?(",
-  "(?R or (?[+-]digits must be followed by )",
+  "lookbehind assertion is not fixed length\0"
+  "malformed number or name after (?(\0"
+  "conditional group contains more than two branches\0"
+  "assertion expected after (?(\0"
+  "(?R or (?[+-]digits must be followed by )\0"
   /* 30 */
-  "unknown POSIX class name",
-  "POSIX collating elements are not supported",
-  "this version of PCRE is not compiled with PCRE_UTF8 support",
-  "spare error",  /** DEAD **/
-  "character value in \\x{...} sequence is too large",
+  "unknown POSIX class name\0"
+  "POSIX collating elements are not supported\0"
+  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
+  "spare error\0"  /** DEAD **/
+  "character value in \\x{...} sequence is too large\0"
   /* 35 */
-  "invalid condition (?(0)",
-  "\\C not allowed in lookbehind assertion",
-  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
-  "number after (?C is > 255",
-  "closing ) for (?C expected",
+  "invalid condition (?(0)\0"
+  "\\C not allowed in lookbehind assertion\0"
+  "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
+  "number after (?C is > 255\0"
+  "closing ) for (?C expected\0"
   /* 40 */
-  "recursive call could loop indefinitely",
-  "unrecognized character after (?P",
-  "syntax error in subpattern name (missing terminator)",
-  "two named subpatterns have the same name",
-  "invalid UTF-8 string",
+  "recursive call could loop indefinitely\0"
+  "unrecognized character after (?P\0"
+  "syntax error in subpattern name (missing terminator)\0"
+  "two named subpatterns have the same name\0"
+  "invalid UTF-8 string\0"
   /* 45 */
-  "support for \\P, \\p, and \\X has not been compiled",
-  "malformed \\P or \\p sequence",
-  "unknown property name after \\P or \\p",
-  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
-  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
+  "support for \\P, \\p, and \\X has not been compiled\0"
+  "malformed \\P or \\p sequence\0"
+  "unknown property name after \\P or \\p\0"
+  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
+  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
   /* 50 */
-  "repeated subpattern is too long",    /** DEAD **/
-  "octal value is greater than \\377 (not in UTF-8 mode)",
-  "internal error: overran compiling workspace",
-  "internal error: previously-checked referenced subpattern not found",
-  "DEFINE group contains more than one branch",
+  "repeated subpattern is too long\0"    /** DEAD **/
+  "octal value is greater than \\377 (not in UTF-8 mode)\0"
+  "internal error: overran compiling workspace\0"
+  "internal error: previously-checked referenced subpattern not found\0"
+  "DEFINE group contains more than one branch\0"
   /* 55 */
-  "repeating a DEFINE group is not allowed",
-  "inconsistent NEWLINE options",
-  "\\g is not followed by a braced name or an optionally braced non-zero number",
-  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
-  "(*VERB) with an argument is not supported",
+  "repeating a DEFINE group is not allowed\0"
+  "inconsistent NEWLINE options\0"
+  "\\g is not followed by a braced name or an optionally braced non-zero number\0"
+  "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
+  "(*VERB) with an argument is not supported\0"
   /* 60 */
-  "(*VERB) not recognized",
-  "number is too big"
-};
+  "(*VERB) not recognized\0"
+  "number is too big";
 
 
 /* Table to identify digits and hex digits. This is used when compiling
@@ -420,6 +436,28 @@ static BOOL
 
 
 /*************************************************
+*            Find an error text                  *
+*************************************************/
+
+/* The error texts are now all in one long string, to save on relocations. As
+some of the text is of unknown length, we can't use a table of offsets.
+Instead, just count through the strings, never stepping past the end of the
+table. This is not a performance issue because it happens only when there has
+been a compilation error.
+
+Argument:   the error number
+Returns:    pointer to the error string, or a fallback text if n is too big
+*/
+static const char *
+find_error_text(int n)
+{
+const char *s = error_texts, *end = error_texts + sizeof(error_texts);
+for (; n > 0 && s < end; n--) while (s < end && *s++ != 0);
+return (s < end)? s : "Error text not found (please report)";
+}
+
+
+/*************************************************
 *            Handle escapes                      *
 *************************************************/
 
@@ -776,7 +814,7 @@ top = _pcre_utt_size;
 while (bot < top)
   {
   i = (bot + top) >> 1;
-  c = strcmp(name, _pcre_utt[i].name);
+  c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
   if (c == 0)
     {
     *dptr = _pcre_utt[i].value;
@@ -1733,11 +1771,13 @@ Returns:     a value representing the name, or -1 if unknown
 static int
 check_posix_name(const uschar *ptr, int len)
 {
+const char *pn = posix_names;            /* start of the name for entry yield */
 register int yield = 0;
 while (posix_name_lengths[yield] != 0)
   {
   if (len == posix_name_lengths[yield] &&
-    strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
+    strncmp((const char *)ptr, pn, len) == 0) return yield;
+  pn += posix_name_lengths[yield] + 1;   /* step over this name and its NUL */
   yield++;
   }
 return -1;
@@ -4024,6 +4064,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
     if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
       {
       int i, namelen;
+      const char *vn = verbnames;  /* cursor into verbnames, kept in step with verbs[i] */
       const uschar *name = ++ptr;
       previous = NULL;
       while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
@@ -4041,12 +4082,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
       for (i = 0; i < verbcount; i++)
         {
         if (namelen == verbs[i].len &&
-            strncmp((char *)name, verbs[i].name, namelen) == 0)
+            strncmp((char *)name, vn, namelen) == 0)
           {
           *code = verbs[i].op;
           if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
           break;
           }
+        vn += verbs[i].len + 1;  /* step over this name and its NUL */
         }
       if (i < verbcount) continue;
       *errorcodeptr = ERR60;
@@ -6005,7 +6047,7 @@ if (errorcode != 0)
   PCRE_EARLY_ERROR_RETURN:
   *erroroffset = ptr - (const uschar *)pattern;
   PCRE_EARLY_ERROR_RETURN2:
-  *errorptr = error_texts[errorcode];
+  *errorptr = find_error_text(errorcode);
   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
   return NULL;
   }
@@ -6090,7 +6132,7 @@ was compiled can be seen. */
 if (code - codestart > length)
   {
   (pcre_free)(re);
-  *errorptr = error_texts[ERR23];
+  *errorptr = find_error_text(ERR23);
   *erroroffset = ptr - (uschar *)pattern;
   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
   return NULL;
diff --git a/pcre_internal.h b/pcre_internal.h
index 775e03f..b039900 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -1064,10 +1064,12 @@ total length. */
 #define tables_length (ctypes_offset + 256)
 
 /* Layout of the UCP type table that translates property names into types and
-codes. */
+codes. Each entry used to point directly to a name, but to reduce the number of
+relocations in shared libraries, it now has an offset into a single string 
+instead. */
 
 typedef struct {
-  const char *name;
+  pcre_uint16 name_offset;  /* offset of the name within _pcre_utt_names */
   pcre_uint16 type;
   pcre_uint16 value;
 } ucp_type_table;
@@ -1085,6 +1087,7 @@ extern const uschar _pcre_utf8_table4[];
 
 extern const int    _pcre_utf8_table1_size;
 
+extern const char   _pcre_utt_names[];
 extern const ucp_type_table _pcre_utt[];
 extern const int _pcre_utt_size;
 
diff --git a/pcre_printint.src b/pcre_printint.src
index 90381ed..d51cbe0 100644
--- a/pcre_printint.src
+++ b/pcre_printint.src
@@ -126,7 +126,7 @@ for (i = _pcre_utt_size - 1; i >= 0; i--)
   {
   if (ptype == _pcre_utt[i].type && pvalue == _pcre_utt[i].value) break;
   }
-return (i >= 0)? _pcre_utt[i].name : "??";
+return (i >= 0)? _pcre_utt_names + _pcre_utt[i].name_offset : "??";
 #else
 /* It gets harder and harder to shut off unwanted compiler warnings. */
 ptype = ptype * pvalue;
diff --git a/pcre_tables.c b/pcre_tables.c
index 0d060c2..6c4a60d 100644
--- a/pcre_tables.c
+++ b/pcre_tables.c
@@ -87,115 +87,228 @@ const uschar _pcre_utf8_table4[] = {
   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
 
-/* This table translates Unicode property names into type and code values. It
-is searched by binary chop, so must be in collating sequence of name. */
+/* The pcre_utt[] table below translates Unicode property names into type and
+code values. It is searched by binary chop, so must be in collating sequence of
+name. Originally, the table contained pointers to the name strings in the first 
+field of each entry. However, that leads to a large number of relocations when 
+a shared library is dynamically loaded. A significant reduction is made by 
+putting all the names into a single, large string and then using offsets in the 
+table itself. Maintenance is more error-prone, but frequent changes to this 
+data are unlikely. */
+
+const char _pcre_utt_names[] =
+  "Any\0" 
+  "Arabic\0" 
+  "Armenian\0" 
+  "Balinese\0" 
+  "Bengali\0" 
+  "Bopomofo\0" 
+  "Braille\0" 
+  "Buginese\0" 
+  "Buhid\0" 
+  "C\0" 
+  "Canadian_Aboriginal\0" 
+  "Cc\0" 
+  "Cf\0" 
+  "Cherokee\0" 
+  "Cn\0" 
+  "Co\0" 
+  "Common\0" 
+  "Coptic\0" 
+  "Cs\0" 
+  "Cuneiform\0" 
+  "Cypriot\0" 
+  "Cyrillic\0" 
+  "Deseret\0" 
+  "Devanagari\0" 
+  "Ethiopic\0" 
+  "Georgian\0" 
+  "Glagolitic\0" 
+  "Gothic\0" 
+  "Greek\0" 
+  "Gujarati\0" 
+  "Gurmukhi\0" 
+  "Han\0" 
+  "Hangul\0" 
+  "Hanunoo\0" 
+  "Hebrew\0" 
+  "Hiragana\0" 
+  "Inherited\0" 
+  "Kannada\0" 
+  "Katakana\0" 
+  "Kharoshthi\0" 
+  "Khmer\0" 
+  "L\0" 
+  "L&\0" 
+  "Lao\0" 
+  "Latin\0" 
+  "Limbu\0" 
+  "Linear_B\0" 
+  "Ll\0" 
+  "Lm\0" 
+  "Lo\0" 
+  "Lt\0" 
+  "Lu\0" 
+  "M\0" 
+  "Malayalam\0" 
+  "Mc\0" 
+  "Me\0" 
+  "Mn\0" 
+  "Mongolian\0" 
+  "Myanmar\0" 
+  "N\0" 
+  "Nd\0" 
+  "New_Tai_Lue\0" 
+  "Nko\0" 
+  "Nl\0" 
+  "No\0" 
+  "Ogham\0" 
+  "Old_Italic\0" 
+  "Old_Persian\0" 
+  "Oriya\0" 
+  "Osmanya\0" 
+  "P\0" 
+  "Pc\0" 
+  "Pd\0" 
+  "Pe\0" 
+  "Pf\0" 
+  "Phags_Pa\0" 
+  "Phoenician\0" 
+  "Pi\0" 
+  "Po\0" 
+  "Ps\0" 
+  "Runic\0" 
+  "S\0" 
+  "Sc\0" 
+  "Shavian\0" 
+  "Sinhala\0" 
+  "Sk\0" 
+  "Sm\0" 
+  "So\0" 
+  "Syloti_Nagri\0" 
+  "Syriac\0" 
+  "Tagalog\0" 
+  "Tagbanwa\0" 
+  "Tai_Le\0" 
+  "Tamil\0" 
+  "Telugu\0" 
+  "Thaana\0" 
+  "Thai\0" 
+  "Tibetan\0" 
+  "Tifinagh\0" 
+  "Ugaritic\0" 
+  "Yi\0" 
+  "Z\0" 
+  "Zl\0" 
+  "Zp\0" 
+  "Zs\0";
 
 const ucp_type_table _pcre_utt[] = {
-  { "Any",                 PT_ANY,  0 },
-  { "Arabic",              PT_SC,   ucp_Arabic },
-  { "Armenian",            PT_SC,   ucp_Armenian },
-  { "Balinese",            PT_SC,   ucp_Balinese },
-  { "Bengali",             PT_SC,   ucp_Bengali },
-  { "Bopomofo",            PT_SC,   ucp_Bopomofo },
-  { "Braille",             PT_SC,   ucp_Braille },
-  { "Buginese",            PT_SC,   ucp_Buginese },
-  { "Buhid",               PT_SC,   ucp_Buhid },
-  { "C",                   PT_GC,   ucp_C },
-  { "Canadian_Aboriginal", PT_SC,   ucp_Canadian_Aboriginal },
-  { "Cc",                  PT_PC,   ucp_Cc },
-  { "Cf",                  PT_PC,   ucp_Cf },
-  { "Cherokee",            PT_SC,   ucp_Cherokee },
-  { "Cn",                  PT_PC,   ucp_Cn },
-  { "Co",                  PT_PC,   ucp_Co },
-  { "Common",              PT_SC,   ucp_Common },
-  { "Coptic",              PT_SC,   ucp_Coptic },
-  { "Cs",                  PT_PC,   ucp_Cs },
-  { "Cuneiform",           PT_SC,   ucp_Cuneiform },
-  { "Cypriot",             PT_SC,   ucp_Cypriot },
-  { "Cyrillic",            PT_SC,   ucp_Cyrillic },
-  { "Deseret",             PT_SC,   ucp_Deseret },
-  { "Devanagari",          PT_SC,   ucp_Devanagari },
-  { "Ethiopic",            PT_SC,   ucp_Ethiopic },
-  { "Georgian",            PT_SC,   ucp_Georgian },
-  { "Glagolitic",          PT_SC,   ucp_Glagolitic },
-  { "Gothic",              PT_SC,   ucp_Gothic },
-  { "Greek",               PT_SC,   ucp_Greek },
-  { "Gujarati",            PT_SC,   ucp_Gujarati },
-  { "Gurmukhi",            PT_SC,   ucp_Gurmukhi },
-  { "Han",                 PT_SC,   ucp_Han },
-  { "Hangul",              PT_SC,   ucp_Hangul },
-  { "Hanunoo",             PT_SC,   ucp_Hanunoo },
-  { "Hebrew",              PT_SC,   ucp_Hebrew },
-  { "Hiragana",            PT_SC,   ucp_Hiragana },
-  { "Inherited",           PT_SC,   ucp_Inherited },
-  { "Kannada",             PT_SC,   ucp_Kannada },
-  { "Katakana",            PT_SC,   ucp_Katakana },
-  { "Kharoshthi",          PT_SC,   ucp_Kharoshthi },
-  { "Khmer",               PT_SC,   ucp_Khmer },
-  { "L",                   PT_GC,   ucp_L },
-  { "L&",                  PT_LAMP, 0 },
-  { "Lao",                 PT_SC,   ucp_Lao },
-  { "Latin",               PT_SC,   ucp_Latin },
-  { "Limbu",               PT_SC,   ucp_Limbu },
-  { "Linear_B",            PT_SC,   ucp_Linear_B },
-  { "Ll",                  PT_PC,   ucp_Ll },
-  { "Lm",                  PT_PC,   ucp_Lm },
-  { "Lo",                  PT_PC,   ucp_Lo },
-  { "Lt",                  PT_PC,   ucp_Lt },
-  { "Lu",                  PT_PC,   ucp_Lu },
-  { "M",                   PT_GC,   ucp_M },
-  { "Malayalam",           PT_SC,   ucp_Malayalam },
-  { "Mc",                  PT_PC,   ucp_Mc },
-  { "Me",                  PT_PC,   ucp_Me },
-  { "Mn",                  PT_PC,   ucp_Mn },
-  { "Mongolian",           PT_SC,   ucp_Mongolian },
-  { "Myanmar",             PT_SC,   ucp_Myanmar },
-  { "N",                   PT_GC,   ucp_N },
-  { "Nd",                  PT_PC,   ucp_Nd },
-  { "New_Tai_Lue",         PT_SC,   ucp_New_Tai_Lue },
-  { "Nko",                 PT_SC,   ucp_Nko },
-  { "Nl",                  PT_PC,   ucp_Nl },
-  { "No",                  PT_PC,   ucp_No },
-  { "Ogham",               PT_SC,   ucp_Ogham },
-  { "Old_Italic",          PT_SC,   ucp_Old_Italic },
-  { "Old_Persian",         PT_SC,   ucp_Old_Persian },
-  { "Oriya",               PT_SC,   ucp_Oriya },
-  { "Osmanya",             PT_SC,   ucp_Osmanya },
-  { "P",                   PT_GC,   ucp_P },
-  { "Pc",                  PT_PC,   ucp_Pc },
-  { "Pd",                  PT_PC,   ucp_Pd },
-  { "Pe",                  PT_PC,   ucp_Pe },
-  { "Pf",                  PT_PC,   ucp_Pf },
-  { "Phags_Pa",            PT_SC,   ucp_Phags_Pa },
-  { "Phoenician",          PT_SC,   ucp_Phoenician },
-  { "Pi",                  PT_PC,   ucp_Pi },
-  { "Po",                  PT_PC,   ucp_Po },
-  { "Ps",                  PT_PC,   ucp_Ps },
-  { "Runic",               PT_SC,   ucp_Runic },
-  { "S",                   PT_GC,   ucp_S },
-  { "Sc",                  PT_PC,   ucp_Sc },
-  { "Shavian",             PT_SC,   ucp_Shavian },
-  { "Sinhala",             PT_SC,   ucp_Sinhala },
-  { "Sk",                  PT_PC,   ucp_Sk },
-  { "Sm",                  PT_PC,   ucp_Sm },
-  { "So",                  PT_PC,   ucp_So },
-  { "Syloti_Nagri",        PT_SC,   ucp_Syloti_Nagri },
-  { "Syriac",              PT_SC,   ucp_Syriac },
-  { "Tagalog",             PT_SC,   ucp_Tagalog },
-  { "Tagbanwa",            PT_SC,   ucp_Tagbanwa },
-  { "Tai_Le",              PT_SC,   ucp_Tai_Le },
-  { "Tamil",               PT_SC,   ucp_Tamil },
-  { "Telugu",              PT_SC,   ucp_Telugu },
-  { "Thaana",              PT_SC,   ucp_Thaana },
-  { "Thai",                PT_SC,   ucp_Thai },
-  { "Tibetan",             PT_SC,   ucp_Tibetan },
-  { "Tifinagh",            PT_SC,   ucp_Tifinagh },
-  { "Ugaritic",            PT_SC,   ucp_Ugaritic },
-  { "Yi",                  PT_SC,   ucp_Yi },
-  { "Z",                   PT_GC,   ucp_Z },
-  { "Zl",                  PT_PC,   ucp_Zl },
-  { "Zp",                  PT_PC,   ucp_Zp },
-  { "Zs",                  PT_PC,   ucp_Zs }
+  { 0,   PT_ANY, 0 },
+  { 4,   PT_SC, ucp_Arabic },
+  { 11,  PT_SC, ucp_Armenian },
+  { 20,  PT_SC, ucp_Balinese },
+  { 29,  PT_SC, ucp_Bengali },
+  { 37,  PT_SC, ucp_Bopomofo },
+  { 46,  PT_SC, ucp_Braille },
+  { 54,  PT_SC, ucp_Buginese },
+  { 63,  PT_SC, ucp_Buhid },
+  { 69,  PT_GC, ucp_C },
+  { 71,  PT_SC, ucp_Canadian_Aboriginal },
+  { 91,  PT_PC, ucp_Cc },
+  { 94,  PT_PC, ucp_Cf },
+  { 97,  PT_SC, ucp_Cherokee },
+  { 106, PT_PC, ucp_Cn },
+  { 109, PT_PC, ucp_Co },
+  { 112, PT_SC, ucp_Common },
+  { 119, PT_SC, ucp_Coptic },
+  { 126, PT_PC, ucp_Cs },
+  { 129, PT_SC, ucp_Cuneiform },
+  { 139, PT_SC, ucp_Cypriot },
+  { 147, PT_SC, ucp_Cyrillic },
+  { 156, PT_SC, ucp_Deseret },
+  { 164, PT_SC, ucp_Devanagari },
+  { 175, PT_SC, ucp_Ethiopic },
+  { 184, PT_SC, ucp_Georgian },
+  { 193, PT_SC, ucp_Glagolitic },
+  { 204, PT_SC, ucp_Gothic },
+  { 211, PT_SC, ucp_Greek },
+  { 217, PT_SC, ucp_Gujarati },
+  { 226, PT_SC, ucp_Gurmukhi },
+  { 235, PT_SC, ucp_Han },
+  { 239, PT_SC, ucp_Hangul },
+  { 246, PT_SC, ucp_Hanunoo },
+  { 254, PT_SC, ucp_Hebrew },
+  { 261, PT_SC, ucp_Hiragana },
+  { 270, PT_SC, ucp_Inherited },
+  { 280, PT_SC, ucp_Kannada },
+  { 288, PT_SC, ucp_Katakana },
+  { 297, PT_SC, ucp_Kharoshthi },
+  { 308, PT_SC, ucp_Khmer },
+  { 314, PT_GC, ucp_L },
+  { 316, PT_LAMP, 0 },
+  { 319, PT_SC, ucp_Lao },
+  { 323, PT_SC, ucp_Latin },
+  { 329, PT_SC, ucp_Limbu },
+  { 335, PT_SC, ucp_Linear_B },
+  { 344, PT_PC, ucp_Ll },
+  { 347, PT_PC, ucp_Lm },
+  { 350, PT_PC, ucp_Lo },
+  { 353, PT_PC, ucp_Lt },
+  { 356, PT_PC, ucp_Lu },
+  { 359, PT_GC, ucp_M },
+  { 361, PT_SC, ucp_Malayalam },
+  { 371, PT_PC, ucp_Mc },
+  { 374, PT_PC, ucp_Me },
+  { 377, PT_PC, ucp_Mn },
+  { 380, PT_SC, ucp_Mongolian },
+  { 390, PT_SC, ucp_Myanmar },
+  { 398, PT_GC, ucp_N },
+  { 400, PT_PC, ucp_Nd },
+  { 403, PT_SC, ucp_New_Tai_Lue },
+  { 415, PT_SC, ucp_Nko },
+  { 419, PT_PC, ucp_Nl },
+  { 422, PT_PC, ucp_No },
+  { 425, PT_SC, ucp_Ogham },
+  { 431, PT_SC, ucp_Old_Italic },
+  { 442, PT_SC, ucp_Old_Persian },
+  { 454, PT_SC, ucp_Oriya },
+  { 460, PT_SC, ucp_Osmanya },
+  { 468, PT_GC, ucp_P },
+  { 470, PT_PC, ucp_Pc },
+  { 473, PT_PC, ucp_Pd },
+  { 476, PT_PC, ucp_Pe },
+  { 479, PT_PC, ucp_Pf },
+  { 482, PT_SC, ucp_Phags_Pa },
+  { 491, PT_SC, ucp_Phoenician },
+  { 502, PT_PC, ucp_Pi },
+  { 505, PT_PC, ucp_Po },
+  { 508, PT_PC, ucp_Ps },
+  { 511, PT_SC, ucp_Runic },
+  { 517, PT_GC, ucp_S },
+  { 519, PT_PC, ucp_Sc },
+  { 522, PT_SC, ucp_Shavian },
+  { 530, PT_SC, ucp_Sinhala },
+  { 538, PT_PC, ucp_Sk },
+  { 541, PT_PC, ucp_Sm },
+  { 544, PT_PC, ucp_So },
+  { 547, PT_SC, ucp_Syloti_Nagri },
+  { 560, PT_SC, ucp_Syriac },
+  { 567, PT_SC, ucp_Tagalog },
+  { 575, PT_SC, ucp_Tagbanwa },
+  { 584, PT_SC, ucp_Tai_Le },
+  { 591, PT_SC, ucp_Tamil },
+  { 597, PT_SC, ucp_Telugu },
+  { 604, PT_SC, ucp_Thaana },
+  { 611, PT_SC, ucp_Thai },
+  { 616, PT_SC, ucp_Tibetan },
+  { 624, PT_SC, ucp_Tifinagh },
+  { 633, PT_SC, ucp_Ugaritic },
+  { 642, PT_SC, ucp_Yi },
+  { 645, PT_GC, ucp_Z },
+  { 647, PT_PC, ucp_Zl },
+  { 650, PT_PC, ucp_Zp },
+  { 653, PT_PC, ucp_Zs }
 };
 
 const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
diff --git a/pcretest.c b/pcretest.c
index cb9e077..f7a7e21 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -94,6 +94,7 @@ symbols to prevent clashes. */
 #define _pcre_utf8_table4      utf8_table4
 #define _pcre_utt              utt
 #define _pcre_utt_size         utt_size
+#define _pcre_utt_names        utt_names
 #define _pcre_OP_lengths       OP_lengths
 
 #include "pcre_tables.c"