Upgrade the ucptest program (used only by maintainer) and script run tests.

git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1025 6239d852-aaf2-0410-a92c-79f79f948069
author: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2018-10-14 14:27:16 +0000
committer: ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> 2018-10-14 14:27:16 +0000
commit: e6dbb8822cc6cb922b0e99f33ad8d5ee8e8f4a0c (patch)
tree: e309c9e7c67463149847c939a3d2485e0c96cd39 /maint
parent: 0bbce752a408b94a06e5620075d0ffd6766d5854 (diff)
download: pcre2-e6dbb8822cc6cb922b0e99f33ad8d5ee8e8f4a0c.tar.gz
1 files changed, 613 insertions, 202 deletions
diff --git a/maint/ucptest.c b/maint/ucptest.c
index 0ffb34a..720160c 100644
--- a/maint/ucptest.c
+++ b/maint/ucptest.c
@@ -7,14 +7,42 @@
 /* Compile thus:
    gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
      ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
+   Add -lreadline or -ledit if required.
 */
 
-/* If there are arguments, they are a list of hexadecimal code points whose
+/* This is a hacked-up program for testing the Unicode properties tables of
+PCRE2. It can also be used for finding characters with certain properties.
+I wrote it to help with debugging PCRE, and have added things that I found
+useful, in a rather haphazard way. The code has never been "tidied" or checked
+for robustness.
+
+If there are arguments, they are a list of hexadecimal code points whose
 properties are to be output. Otherwise, the program expects to read commands on
-stdin, and it writes output to stdout. There is only one command, "findprop",
-followed by a list of Unicode code points as hex numbers (without any
-prefixes). The output is one line per character, giving its Unicode properties
-followed by its other case if there is one. */
+stdin, and it writes output to stdout. There are two commands:
+
+"findprop" must be followed by a list of Unicode code points as hex numbers
+(without any prefixes). The output is one line per character, giving its
+Unicode properties followed by its other case if there is one, followed by its
+Script Extension list if it is not just the same as the base script.
+
+"find" must be followed by a list of property names and their values. This
+finds characters that have those properties. If multiple properties are listed,
+they must all be matched. Currently supported:
+
+  script <name>    The character must have this script property. Only one
+                     such script may be given.
+  scriptx <name>   This script must be in the character's Script Extension
+                     property list. If this is used many times, all the given
+                     scripts must be present.
+  type <abbrev>    The character's type (e.g. Lu or Nd) must match.
+  gbreak <name>    The grapheme break property must match.
+
+If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
+Script Extensions, there may be a mixture of positive and negative
+requirements. All must be satisfied.
+
+No more than 100 characters are output. If there are more, the list ends with
+... */
 
 #ifdef HAVE_CONFIG_H
 #include "../src/config.h"
@@ -31,6 +59,22 @@ followed by its other case if there is one. */
 #include "../src/pcre2_internal.h"
 #include "../src/pcre2_ucp.h"
 
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+#if defined(SUPPORT_LIBREADLINE)
+#include <readline/readline.h>
+#include <readline/history.h>
+#else
+#if defined(HAVE_EDITLINE_READLINE_H)
+#include <editline/readline.h>
+#else
+#include <readline/readline.h>
+#endif
+#endif
+#endif
 
 
 /* -------------------------------------------------------------------*/
@@ -45,183 +89,232 @@ followed by its other case if there is one. */
 /* -------------------------------------------------------------------*/
 
 
-
-
-/*************************************************
-*          Find a script name                    *
-*************************************************/
-
-static unsigned char *
-find_script_name(int script)
-{
-switch(script)
-  {
-  default:              return US"??"; 
-  case ucp_Unknown:     return US"Unknown";  
-  case ucp_Arabic:      return US"Arabic"; 
-  case ucp_Armenian:    return US"Armenian"; 
-  case ucp_Balinese:    return US"Balinese"; 
-  case ucp_Bengali:     return US"Bengali"; 
-  case ucp_Bopomofo:    return US"Bopomofo"; 
-  case ucp_Braille:     return US"Braille"; 
-  case ucp_Buginese:    return US"Buginese"; 
-  case ucp_Buhid:       return US"Buhid"; 
-  case ucp_Canadian_Aboriginal: return US"Canadian_Aboriginal"; 
-  case ucp_Cherokee:    return US"Cherokee"; 
-  case ucp_Common:      return US"Common"; 
-  case ucp_Coptic:      return US"Coptic"; 
-  case ucp_Cuneiform:   return US"Cuneiform"; 
-  case ucp_Cypriot:     return US"Cypriot"; 
-  case ucp_Cyrillic:    return US"Cyrillic"; 
-  case ucp_Deseret:     return US"Deseret"; 
-  case ucp_Devanagari:  return US"Devanagari"; 
-  case ucp_Ethiopic:    return US"Ethiopic"; 
-  case ucp_Georgian:    return US"Georgian"; 
-  case ucp_Glagolitic:  return US"Glagolitic"; 
-  case ucp_Gothic:      return US"Gothic"; 
-  case ucp_Greek:       return US"Greek"; 
-  case ucp_Gujarati:    return US"Gujarati"; 
-  case ucp_Gurmukhi:    return US"Gurmukhi"; 
-  case ucp_Han:         return US"Han"; 
-  case ucp_Hangul:      return US"Hangul"; 
-  case ucp_Hanunoo:     return US"Hanunoo"; 
-  case ucp_Hebrew:      return US"Hebrew"; 
-  case ucp_Hiragana:    return US"Hiragana"; 
-  case ucp_Inherited:   return US"Inherited"; 
-  case ucp_Kannada:     return US"Kannada"; 
-  case ucp_Katakana:    return US"Katakana"; 
-  case ucp_Kharoshthi:  return US"Kharoshthi"; 
-  case ucp_Khmer:       return US"Khmer"; 
-  case ucp_Lao:         return US"Lao"; 
-  case ucp_Latin:       return US"Latin"; 
-  case ucp_Limbu:       return US"Limbu"; 
-  case ucp_Linear_B:    return US"Linear_B"; 
-  case ucp_Malayalam:   return US"Malayalam"; 
-  case ucp_Mongolian:   return US"Mongolian"; 
-  case ucp_Myanmar:     return US"Myanmar"; 
-  case ucp_New_Tai_Lue: return US"New_Tai_Lue"; 
-  case ucp_Nko:         return US"Nko"; 
-  case ucp_Ogham:       return US"Ogham"; 
-  case ucp_Old_Italic:  return US"Old_Italic"; 
-  case ucp_Old_Persian: return US"Old_Persian"; 
-  case ucp_Oriya:       return US"Oriya"; 
-  case ucp_Osmanya:     return US"Osmanya"; 
-  case ucp_Phags_Pa:    return US"Phags_Pa"; 
-  case ucp_Phoenician:  return US"Phoenician"; 
-  case ucp_Runic:       return US"Runic"; 
-  case ucp_Shavian:     return US"Shavian"; 
-  case ucp_Sinhala:     return US"Sinhala"; 
-  case ucp_Syloti_Nagri: return US"Syloti_Nagri"; 
-  case ucp_Syriac:      return US"Syriac"; 
-  case ucp_Tagalog:     return US"Tagalog"; 
-  case ucp_Tagbanwa:    return US"Tagbanwa"; 
-  case ucp_Tai_Le:      return US"Tai_Le"; 
-  case ucp_Tamil:       return US"Tamil"; 
-  case ucp_Telugu:      return US"Telugu"; 
-  case ucp_Thaana:      return US"Thaana"; 
-  case ucp_Thai:        return US"Thai"; 
-  case ucp_Tibetan:     return US"Tibetan"; 
-  case ucp_Tifinagh:    return US"Tifinagh"; 
-  case ucp_Ugaritic:    return US"Ugaritic"; 
-  case ucp_Yi:          return US"Yi"; 
+const unsigned char *script_names[] = {
+  US"Unknown",
+  US"Arabic",
+  US"Armenian",
+  US"Bengali",
+  US"Bopomofo",
+  US"Braille",
+  US"Buginese",
+  US"Buhid",
+  US"Canadian_Aboriginal",
+  US"Cherokee",
+  US"Common",
+  US"Coptic",
+  US"Cypriot",
+  US"Cyrillic",
+  US"Deseret",
+  US"Devanagari",
+  US"Ethiopic",
+  US"Georgian",
+  US"Glagolitic",
+  US"Gothic",
+  US"Greek",
+  US"Gujarati",
+  US"Gurmukhi",
+  US"Han",
+  US"Hangul",
+  US"Hanunoo",
+  US"Hebrew",
+  US"Hiragana",
+  US"Inherited",
+  US"Kannada",
+  US"Katakana",
+  US"Kharoshthi",
+  US"Khmer",
+  US"Lao",
+  US"Latin",
+  US"Limbu",
+  US"Linear_B",
+  US"Malayalam",
+  US"Mongolian",
+  US"Myanmar",
+  US"New_Tai_Lue",
+  US"Ogham",
+  US"Old_Italic",
+  US"Old_Persian",
+  US"Oriya",
+  US"Osmanya",
+  US"Runic",
+  US"Shavian",
+  US"Sinhala",
+  US"Syloti_Nagri",
+  US"Syriac",
+  US"Tagalog",
+  US"Tagbanwa",
+  US"Tai_Le",
+  US"Tamil",
+  US"Telugu",
+  US"Thaana",
+  US"Thai",
+  US"Tibetan",
+  US"Tifinagh",
+  US"Ugaritic",
+  US"Yi",
+  /* New for Unicode 5.0: */
+  US"Balinese",
+  US"Cuneiform",
+  US"Nko",
+  US"Phags_Pa",
+  US"Phoenician",
   /* New for Unicode 5.1: */
-  case ucp_Carian:      return US"Carian"; 
-  case ucp_Cham:        return US"Cham"; 
-  case ucp_Kayah_Li:    return US"Kayah_Li"; 
-  case ucp_Lepcha:      return US"Lepcha"; 
-  case ucp_Lycian:      return US"Lycian"; 
-  case ucp_Lydian:      return US"Lydian"; 
-  case ucp_Ol_Chiki:    return US"Ol_Chiki"; 
-  case ucp_Rejang:      return US"Rejang"; 
-  case ucp_Saurashtra:  return US"Saurashtra"; 
-  case ucp_Sundanese:   return US"Sundanese"; 
-  case ucp_Vai:         return US"Vai"; 
+  US"Carian",
+  US"Cham",
+  US"Kayah_Li",
+  US"Lepcha",
+  US"Lycian",
+  US"Lydian",
+  US"Ol_Chiki",
+  US"Rejang",
+  US"Saurashtra",
+  US"Sundanese",
+  US"Vai",
   /* New for Unicode 5.2: */
-  case ucp_Avestan:     return US"Avestan"; 
-  case ucp_Bamum:       return US"Bamum"; 
-  case ucp_Egyptian_Hieroglyphs: return US"Egyptian_Hieroglyphs"; 
-  case ucp_Imperial_Aramaic: return US"Imperial_Aramaic"; 
-  case ucp_Inscriptional_Pahlavi: return US"Inscriptional_Pahlavi"; 
-  case ucp_Inscriptional_Parthian: return US"Inscriptional_Parthian"; 
-  case ucp_Javanese:    return US"Javanese"; 
-  case ucp_Kaithi:      return US"Kaithi"; 
-  case ucp_Lisu:        return US"Lisu"; 
-  case ucp_Meetei_Mayek: return US"Meetei_Mayek"; 
-  case ucp_Old_South_Arabian: return US"Old_South_Arabian"; 
-  case ucp_Old_Turkic:  return US"Old_Turkic"; 
-  case ucp_Samaritan:   return US"Samaritan"; 
-  case ucp_Tai_Tham:    return US"Tai_Tham"; 
-  case ucp_Tai_Viet:    return US"Tai_Viet"; 
+  US"Avestan",
+  US"Bamum",
+  US"Egyptian_Hieroglyphs",
+  US"Imperial_Aramaic",
+  US"Inscriptional_Pahlavi",
+  US"Inscriptional_Parthian",
+  US"Javanese",
+  US"Kaithi",
+  US"Lisu",
+  US"Meetei_Mayek",
+  US"Old_South_Arabian",
+  US"Old_Turkic",
+  US"Samaritan",
+  US"Tai_Tham",
+  US"Tai_Viet",
   /* New for Unicode 6.0.0 */
-  case ucp_Batak:       return US"Batak"; 
-  case ucp_Brahmi:      return US"Brahmi"; 
-  case ucp_Mandaic:     return US"Mandaic"; 
-
+  US"Batak",
+  US"Brahmi",
+  US"Mandaic",
   /* New for Unicode 6.1.0 */
-  case ucp_Chakma:               return US"Chakma"; 
-  case ucp_Meroitic_Cursive:     return US"Meroitic_Cursive"; 
-  case ucp_Meroitic_Hieroglyphs: return US"Meroitic_Hieroglyphs"; 
-  case ucp_Miao:                 return US"Miao"; 
-  case ucp_Sharada:              return US"Sharada"; 
-  case ucp_Sora_Sompeng:         return US"Sora Sompent"; 
-  case ucp_Takri:                return US"Takri"; 
-
+  US"Chakma",
+  US"Meroitic_Cursive",
+  US"Meroitic_Hieroglyphs",
+  US"Miao",
+  US"Sharada",
+  US"Sora_Sompeng",
+  US"Takri",
   /* New for Unicode 7.0.0 */
-  case ucp_Bassa_Vah:          return US"Bassa_Vah"; 
-  case ucp_Caucasian_Albanian: return US"Caucasian_Albanian"; 
-  case ucp_Duployan:           return US"Duployan"; 
-  case ucp_Elbasan:            return US"Elbasan"; 
-  case ucp_Grantha:            return US"Grantha"; 
-  case ucp_Khojki:             return US"Khojki"; 
-  case ucp_Khudawadi:          return US"Khudawadi"; 
-  case ucp_Linear_A:           return US"Linear_A"; 
-  case ucp_Mahajani:           return US"Mahajani"; 
-  case ucp_Manichaean:         return US"Manichaean"; 
-  case ucp_Mende_Kikakui:      return US"Mende_Kikakui"; 
-  case ucp_Modi:               return US"Modi"; 
-  case ucp_Mro:                return US"Mro"; 
-  case ucp_Nabataean:          return US"Nabataean"; 
-  case ucp_Old_North_Arabian:  return US"Old_North_Arabian"; 
-  case ucp_Old_Permic:         return US"Old_Permic"; 
-  case ucp_Pahawh_Hmong:       return US"Pahawh_Hmong"; 
-  case ucp_Palmyrene:          return US"Palmyrene"; 
-  case ucp_Psalter_Pahlavi:    return US"Psalter_Pahlavi"; 
-  case ucp_Pau_Cin_Hau:        return US"Pau_Cin_Hau"; 
-  case ucp_Siddham:            return US"Siddham"; 
-  case ucp_Tirhuta:            return US"Tirhuta"; 
-  case ucp_Warang_Citi:        return US"Warang_Citi"; 
-
+  US"Bassa_Vah",
+  US"Caucasian_Albanian",
+  US"Duployan",
+  US"Elbasan",
+  US"Grantha",
+  US"Khojki",
+  US"Khudawadi",
+  US"Linear_A",
+  US"Mahajani",
+  US"Manichaean",
+  US"Mende_Kikakui",
+  US"Modi",
+  US"Mro",
+  US"Nabataean",
+  US"Old_North_Arabian",
+  US"Old_Permic",
+  US"Pahawh_Hmong",
+  US"Palmyrene",
+  US"Psalter_Pahlavi",
+  US"Pau_Cin_Hau",
+  US"Siddham",
+  US"Tirhuta",
+  US"Warang_Citi",
   /* New for Unicode 8.0.0 */
-  case ucp_Ahom:                  return US"Ahom"; 
-  case ucp_Anatolian_Hieroglyphs: return US"Anatolian_Hieroglyphs"; 
-  case ucp_Hatran:                return US"Hatran"; 
-  case ucp_Multani:               return US"Multani"; 
-  case ucp_Old_Hungarian:         return US"Old_Hungarian"; 
-  case ucp_SignWriting:           return US"SignWriting"; 
-
+  US"Ahom",
+  US"Anatolian_Hieroglyphs",
+  US"Hatran",
+  US"Multani",
+  US"Old_Hungarian",
+  US"SignWriting",
   /* New for Unicode 10.0.0 (no update since 8.0.0) */
-  case ucp_Adlam:               return US"Adlam"; 
-  case ucp_Bhaiksuki:           return US"Bhaiksuki"; 
-  case ucp_Marchen:             return US"Marchen"; 
-  case ucp_Newa:                return US"Newa"; 
-  case ucp_Osage:               return US"Osage"; 
-  case ucp_Tangut:              return US"Tangut"; 
-  case ucp_Masaram_Gondi:       return US"Masaram_Gondi"; 
-  case ucp_Nushu:               return US"Nushu"; 
-  case ucp_Soyombo:             return US"Soyombo"; 
-  case ucp_Zanabazar_Square:    return US"Zanabazar_Square"; 
-
-  /* New for Unicode 11.0.0 */ 
-  case ucp_Dogra:               return US"Dogra";  
-  case ucp_Gunjala_Gondi:       return US"Gunjala_Gondi";  
-  case ucp_Hanifi_Rohingya:     return US"Hanifi_Rohingya";  
-  case ucp_Makasar:             return US"Makasar";  
-  case ucp_Medefaidrin:         return US"Medefaidrin"; 
-  case ucp_Old_Sogdian:         return US"Old_Sogdian";  
-  case ucp_Sogdian:             return US"Sogdian"; 
-  }
-}
+  US"Adlam",
+  US"Bhaiksuki",
+  US"Marchen",
+  US"Newa",
+  US"Osage",
+  US"Tangut",
+  US"Masaram_Gondi",
+  US"Nushu",
+  US"Soyombo",
+  US"Zanabazar_Square",
+  /* New for Unicode 11.0.0 */
+  US"Dogra",
+  US"Gunjala_Gondi",
+  US"Hanifi_Rohingya",
+  US"Makasar",
+  US"Medefaidrin",
+  US"Old_Sogdian",
+  US"Sogdian"
+};
+
+const unsigned char *type_names[] = {
+  US"Cc",
+  US"Cf",
+  US"Cn",
+  US"Co",
+  US"Cs",
+  US"Ll",
+  US"Lm",
+  US"Lo",
+  US"Lt",
+  US"Lu",
+  US"Mc",
+  US"Me",
+  US"Mn",
+  US"Nd",
+  US"Nl",
+  US"No",
+  US"Pc",
+  US"Pd",
+  US"Pe",
+  US"Pf",
+  US"Pi",
+  US"Po",
+  US"Ps",
+  US"Sc",
+  US"Sk",
+  US"Sm",
+  US"So",
+  US"Zl",
+  US"Zp",
+  US"Zs"
+};
+
+const unsigned char *gb_names[] = {
+  US"CR",
+  US"LF",
+  US"Control",
+  US"Extend",
+  US"Prepend",
+  US"SpacingMark",
+  US"L",
+  US"V",
+  US"T",
+  US"LV",
+  US"LVT",
+  US"RegionalIndicator",
+  US"Other",
+  US"ZWJ",
+  US"Extended_Pictographic"
+};
+
+
+/*************************************************
+*             Test for interaction               *
+*************************************************/
 
+static BOOL
+is_stdin_tty(void)
+{
+#if defined WIN32
+return _isatty(_fileno(stdin));
+#else
+return isatty(fileno(stdin));
+#endif
+}
 
 
 /*************************************************
@@ -239,11 +332,13 @@ int gbprop = UCD_GRAPHBREAK(c);
 int othercase = UCD_OTHERCASE(c);
 int caseset = UCD_CASESET(c);
 
-unsigned char *fulltypename = US"??";
-unsigned char *typename = US"??";
-unsigned char *graphbreak = US"??";
+const unsigned char *fulltypename = US"??";
+const unsigned char *typename = US"??";
+const unsigned char *scriptname = US"??";
+const unsigned char *graphbreak = US"??";
 
-unsigned char *scriptname = find_script_name(script); 
+if (script < sizeof(script_names)/sizeof(char *))
+  scriptname = script_names[script];
 
 switch (type)
   {
@@ -289,7 +384,7 @@ switch (fulltype)
   case ucp_Zp: fulltypename = US"Paragraph separator"; break;
   case ucp_Zs: fulltypename = US"Space separator"; break;
   }
-  
+
 switch(gbprop)
   {
   case ucp_gbCR:           graphbreak = US"CR"; break;
@@ -308,12 +403,12 @@ switch(gbprop)
   case ucp_gbOther:        graphbreak = US"Other"; break;
   case ucp_gbZWJ:          graphbreak = US"Zero Width Joiner"; break;
   case ucp_gbExtended_Pictographic:
-                           graphbreak = US"Extended Pictographic"; break;  
-  default:                 graphbreak = US"Unknown"; break;  
+                           graphbreak = US"Extended Pictographic"; break;
+  default:                 graphbreak = US"Unknown"; break;
   }
-  
+
 printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
-if (othercase != c) 
+if (othercase != c)
   {
   printf(", %04x", othercase);
   if (caseset != 0)
@@ -321,37 +416,307 @@ if (othercase != c)
     const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
     while (*(++p) < NOTACHAR)
       if (*p != othercase && *p != c) printf(", %04x", *p);
-    }   
-  } 
-  
+    }
+  }
+
 if (scriptx != script)
   {
-  printf(", ["); 
-  if (scriptx >= 0) printf("%s", find_script_name(scriptx)); else
+  printf(", [");
+  if (scriptx >= 0)
     {
-    char *sep = ""; 
+    scriptname = (scriptx >= sizeof(script_names)/sizeof(char *))?
+      US"??" : script_names[scriptx];
+    printf("%s", scriptname);
+    }
+  else
+    {
+    char *sep = "";
     const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
     while (*p != 0)
       {
-      printf("%s%s", sep, find_script_name(*p++));
-      sep = ", "; 
-      }   
-    }  
+      unsigned int n = *p++;  /* Always advance, even for an unknown script */
+      printf("%s%s", sep, (n < sizeof(script_names)/sizeof(char *))?
+        script_names[n] : US"??");
+      sep = ", ";
+      }
+    }
   printf("]");
-  } 
- 
+  }
+
 printf("\n");
 }
 
 
 
 /*************************************************
+*   Find character(s) with given property/ies    *
+*************************************************/
+
+static void
+find_chars(unsigned char *s)
+{
+unsigned char name[24];
+unsigned char value[24];
+unsigned char *t;
+unsigned int count= 0;
+int scriptx_list[24];
+unsigned int scriptx_count = 0;
+uint32_t i, c;
+int script = -1;
+int type = -1;
+int gbreak = -1;
+BOOL script_not = FALSE;
+BOOL type_not = FALSE;
+BOOL gbreak_not = FALSE;
+BOOL hadrange = FALSE;
+const ucd_record *ucd, *next_ucd;
+const char *pad = "      ";
+
+while (*s != 0)
+  {
+  unsigned int offset = 0;
+  BOOL scriptx_not = FALSE;
+
+  for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
+  *t = 0;
+  while (isspace(*s)) s++;
+
+  for (t = value; *s != 0 && !isspace(*s); s++) *t++ = *s;
+  *t = 0;
+  while (isspace(*s)) s++;
+
+  if (strcmp(CS name, "script") == 0 ||
+      strcmp(CS name, "scriptx") == 0)
+    {
+    if (value[0] == '!')
+      {
+      if (name[6] == 'x') scriptx_not = TRUE;
+        else script_not = TRUE;
+      offset = 1;
+      }
+
+    for (i = 0; i < sizeof(script_names)/sizeof(char *); i++)
+      {
+      if (strcmp(CS value + offset, script_names[i]) == 0)
+        {
+        if (name[6] == 'x')
+          {
+          scriptx_list[scriptx_count++] = scriptx_not? (-i):i;
+          }
+        else
+          {
+          if (script < 0) script = i; else
+            {
+            printf("** Only 1 script value allowed\n");
+            return;
+            }
+          }
+        break;
+        }
+      }
+
+    if (i >= sizeof(script_names)/sizeof(char *))
+      {
+      printf("** Unrecognized script name '%s'\n", value);
+      return;
+      }
+    }
+
+  else if (strcmp(CS name, "type") == 0)
+    {
+    if (type >= 0)
+      {
+      printf("** Only 1 type value allowed\n");
+      return;
+      }
+    else
+      {
+      if (value[0] == '!')
+        {
+        type_not = TRUE;
+        offset = 1;
+        }
+
+      for (i = 0; i < sizeof(type_names)/sizeof(char *); i++)
+        {
+        if (strcmp(CS (value + offset), type_names[i]) == 0)
+          {
+          type = i;
+          break;
+          }
+        }
+      if (i >= sizeof(type_names)/sizeof(char *))
+        {
+        printf("** Unrecognized type name '%s'\n", value);
+        return;
+        }
+      }
+    }
+
+  else if (strcmp(CS name, "gbreak") == 0)
+    {
+    if (gbreak >= 0)
+      {
+      printf("** Only 1 grapheme break value allowed\n");
+      return;
+      }
+    else
+      {
+      if (value[0] == '!')
+        {
+        gbreak_not = TRUE;
+        offset = 1;
+        }
+
+      for (i = 0; i < sizeof(gb_names)/sizeof(char *); i++)
+        {
+        if (strcmp(CS (value + offset), gb_names[i]) == 0)
+          {
+          gbreak = i;
+          break;
+          }
+        }
+      if (i >= sizeof(gb_names)/sizeof(char *))
+        {
+        printf("** Unrecognized gbreak name '%s'\n", value);
+        return;
+        }
+      }
+    }
+
+  else
+    {
+    printf("** Unrecognized property name '%s'\n", name);
+    return;
+    }
+  }
+
+if (script < 0 && scriptx_count == 0 && type < 0 && gbreak < 0)
+  {
+  printf("** No properties specified\n");
+  return;
+  }
+
+for (c = 0; c <= 0x10ffff; c++)
+  {
+  if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
+
+  if (scriptx_count > 0)
+    {
+    const uint8_t *char_scriptx = NULL;
+    int found = 0;
+    int scriptx = UCD_SCRIPTX(c);
+
+    if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;
+
+    for (i = 0; i < scriptx_count; i++)
+      {
+      /* Positive requirement */
+      if (scriptx_list[i] >= 0)
+        {
+        if (scriptx >= 0)
+          {
+          if (scriptx == scriptx_list[i]) found++;
+          }
+
+        else
+          {
+          const uint8_t *p;
+          for (p = char_scriptx; *p != 0; p++)
+            {
+            if (scriptx_list[i] == *p)
+              {
+              found++;
+              break;
+              }
+            }
+          }
+        }
+      /* Negative requirement */
+      else
+        {
+        if (scriptx >= 0)
+          {
+          if (scriptx != -scriptx_list[i]) found++;
+          }
+        else
+          {
+          const uint8_t *p;
+          for (p = char_scriptx; *p != 0; p++)
+            if (-scriptx_list[i] == *p) break;
+          if (*p == 0) found++;
+          }
+        }
+      }
+
+    if (found != scriptx_count) continue;
+    }
+
+  if (type >= 0)
+    {
+    if (type_not)
+      {
+      if (type == UCD_CHARTYPE(c)) continue;
+      }
+    else
+      {
+      if (type != UCD_CHARTYPE(c)) continue;
+      }
+    }
+
+  if (gbreak >= 0)
+    {
+    if (gbreak_not)
+      {
+      if (gbreak == UCD_GRAPHBREAK(c)) continue;
+      }
+    else
+      {
+      if (gbreak != UCD_GRAPHBREAK(c)) continue;
+      }
+    }
+
+  /* All conditions are met. Look for runs. */
+
+  ucd = GET_UCD(c);
+
+  for (i = c + 1; i <= 0x10ffff; i++)   /* Inclusive: a run may end at 10ffff */
+    {
+    next_ucd = GET_UCD(i);
+    if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
+    }
+
+  if (--i > c)
+    {
+    printf("%04x..", c);
+    c = i;
+    hadrange = TRUE;
+    }
+  else if (hadrange) printf("%s", pad);
+
+  print_prop(c);
+  if (c >= 0x100000) pad = "        ";
+    else if (c >= 0x10000) pad = "       ";
+  count++;
+  if (count >= 100)
+    {
+    printf("...\n");
+    break;
+    }
+  }
+
+if (count == 0) printf("No characters found\n");
+}
+
+
+/*************************************************
 *               Main program                     *
 *************************************************/
 
 int
 main(int argc, char **argv)
 {
+BOOL interactive;
 unsigned char buffer[1024];
 
 if (argc > 1)
@@ -359,19 +724,48 @@ if (argc > 1)
   int i;
   for (i = 1; i < argc; i++)
     {
-    unsigned char *endptr; 
+    unsigned char *endptr;
     int c = strtoul(argv[i], CSS(&endptr), 16);
-    print_prop(c); 
+    if (*endptr != 0)
+      printf("** Hex number expected; ignored '%s'\n", argv[i]);
+    else print_prop(c);
     }
   return 0;
-  }    
+  }
+
+interactive = is_stdin_tty();
 
-while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+if (interactive) using_history();
+#endif
+
+for(;;)
   {
   unsigned char name[24];
   unsigned char *s, *t;
 
-  printf("%s", buffer);
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+  if (interactive)
+    {
+    size_t len;
+    s = readline("> ");
+    if (s == NULL) break;
+    len = strlen(s);
+    if (len > 0) add_history(s);
+    if (len > sizeof(buffer) - 2) len = sizeof(buffer) - 2;  /* Truncate to fit */
+    memcpy(buffer, s, len);
+    memcpy(buffer + len, "\n", 2);  /* Appends the newline and terminating zero */
+    free(s);
+    }
+  else
+#endif
+
+    {
+    if (interactive) printf("> ");
+    if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
+    if (!interactive) printf("%s", buffer);
+    }
+
   s = buffer;
   while (isspace(*s)) s++;
   if (*s == 0) continue;
@@ -386,15 +780,32 @@ while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
       {
       unsigned char *endptr;
       int c = strtoul(CS s, CSS(&endptr), 16);
-      print_prop(c);
+
+      if (*endptr != 0 && !isspace(*endptr))
+        {
+        while (*endptr != 0 && !isspace(*endptr)) endptr++;
+        printf("** Hex number expected; ignored '%.*s'\n", (int)(endptr-s), s);
+        }
+      else  print_prop(c);
       s = endptr;
       while (isspace(*s)) s++;
       }
     }
 
-  else printf("Unknown test command %s\n", name);
+  else if (strcmp(CS name, "find") == 0)
+    {
+    find_chars(s);
+    }
+
+  else printf("** Unknown test command %s\n", name);
   }
 
+if (interactive) printf("\n");
+
+#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
+if (interactive) clear_history();
+#endif
+
 return 0;
 }
author	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2018-10-14 14:27:16 +0000
committer	ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>	2018-10-14 14:27:16 +0000
commit	e6dbb8822cc6cb922b0e99f33ad8d5ee8e8f4a0c (patch)
tree	e309c9e7c67463149847c939a3d2485e0c96cd39 /maint
parent	0bbce752a408b94a06e5620075d0ffd6766d5854 (diff)
download	pcre2-e6dbb8822cc6cb922b0e99f33ad8d5ee8e8f4a0c.tar.gz