diff options
Diffstat (limited to 'maint/ucptest.c')
-rw-r--r-- | maint/ucptest.c | 643 |
1 files changed, 323 insertions, 320 deletions
diff --git a/maint/ucptest.c b/maint/ucptest.c index e946226..49616e2 100644 --- a/maint/ucptest.c +++ b/maint/ucptest.c @@ -16,36 +16,58 @@ /* This is a hacked-up program for testing the Unicode properties tables of PCRE2. It can also be used for finding characters with certain properties. I wrote it to help with debugging PCRE, and have added things that I found -useful, in a rather haphazard way. The code has never been "tidied" or checked -for robustness. - -If there are arguments, they are a list of hexadecimal code points whose -properties are to be output. Otherwise, the program expects to read commands on -stdin, and it writes output to stdout. There are two commands: - -"findprop" must be followed by a list of Unicode code points as hex numbers -(without any prefixes). The output is one line per character, giving its -Unicode properties followed by its other case if there is one, followed by its -Script Extension list if it is not just the same as the base script. - -"find" must be followed by a list of property names and their values. This -finds characters that have those properties. If multiple properties are listed, -they must all be matched. Currently supported: +useful, in a rather haphazard way. The code has never been seriously tidied or +checked for robustness, but it shouldn't now give compiler warnings. + +There is only one option: "-s". If given, it applies only to the "findprop" +command. It causes the UTF-8 sequence of bytes that encode the character to be +output between angle brackets at the end of the line. On a UTF-8 terminal, this +will show the appropriate graphic for the code point. + +If the command has arguments, they are concatenated into a buffer, separated by +spaces. If the first argument starts "U+" or consists entirely of hexadecimal +digits, "findprop" is inserted at the start. The buffer is then processed as a +single line file, after which the program exits. 
If there are no arguments, the +program reads commands line by line on stdin and writes output to stdout. The +return code is always zero. + +There are three commands: + +"findprop" must be followed by a space-separated list of Unicode code points as +hex numbers, either without any prefix or starting with "U+". The output is one +line per character, giving its Unicode properties followed by its other case or +cases if one or more exist, followed by its Script Extension list if it is not +just the same as the base script. This list is in square brackets. The +properties are: + +General type e.g. Letter +Specific type e.g. Upper case letter +Script e.g. Medefaidrin +Grapheme break type e.g. Extend (most common is Other) + +"find" must be followed by a list of property names and their values. The +values are case-sensitive. This finds characters that have those properties. If +multiple properties are listed, they must all be matched. Currently supported: script <name> The character must have this script property. Only one such script may be given. scriptx <name> This script must be in the character's Script Extension property list. If this is used many times, all the given scripts must be present. - type <abbrev> The character's type (e.g. Lu or Nd) must match. + type <abbrev> The character's specific type (e.g. Lu or Nd) must match. gbreak <name> The grapheme break property must match. If a <name> or <abbrev> is preceded by !, the value must NOT be present. For Script Extensions, there may be a mixture of positive and negative requirements. All must be satisfied. -No more than 100 characters are output. If there are more, the list ends with -... */ +Sequences of two or more characters are shown as ranges, for example +U+0041..U+004A. No more than 100 lines are output. If there are more +characters, the list ends with ... + +"list" must be followed by a property name (script, type, or gbreak). The +defined values for that property are listed.
*/ + #ifdef HAVE_CONFIG_H #include "../src/config.h" @@ -91,228 +113,99 @@ No more than 100 characters are output. If there are more, the list ends with /* -------------------------------------------------------------------*/ - -const unsigned char *script_names[] = { - US"Unknown", - US"Arabic", - US"Armenian", - US"Bengali", - US"Bopomofo", - US"Braille", - US"Buginese", - US"Buhid", - US"Canadian_Aboriginal", - US"Cherokee", - US"Common", - US"Coptic", - US"Cypriot", - US"Cyrillic", - US"Deseret", - US"Devanagari", - US"Ethiopic", - US"Georgian", - US"Glagolitic", - US"Gothic", - US"Greek", - US"Gujarati", - US"Gurmukhi", - US"Han", - US"Hangul", - US"Hanunoo", - US"Hebrew", - US"Hiragana", - US"Inherited", - US"Kannada", - US"Katakana", - US"Kharoshthi", - US"Khmer", - US"Lao", - US"Latin", - US"Limbu", - US"Linear_B", - US"Malayalam", - US"Mongolian", - US"Myanmar", - US"New_Tai_Lue", - US"Ogham", - US"Old_Italic", - US"Old_Persian", - US"Oriya", - US"Osmanya", - US"Runic", - US"Shavian", - US"Sinhala", - US"Syloti_Nagri", - US"Syriac", - US"Tagalog", - US"Tagbanwa", - US"Tai_Le", - US"Tamil", - US"Telugu", - US"Thaana", - US"Thai", - US"Tibetan", - US"Tifinagh", - US"Ugaritic", - US"Yi", - /* New for Unicode 5.0: */ - US"Balinese", - US"Cuneiform", - US"Nko", - US"Phags_Pa", - US"Phoenician", - /* New for Unicode 5.1: */ - US"Carian", - US"Cham", - US"Kayah_Li", - US"Lepcha", - US"Lycian", - US"Lydian", - US"Ol_Chiki", - US"Rejang", - US"Saurashtra", - US"Sundanese", - US"Vai", - /* New for Unicode 5.2: */ - US"Avestan", - US"Bamum", - US"Egyptian_Hieroglyphs", - US"Imperial_Aramaic", - US"Inscriptional_Pahlavi", - US"Inscriptional_Parthian", - US"Javanese", - US"Kaithi", - US"Lisu", - US"Meetei_Mayek", - US"Old_South_Arabian", - US"Old_Turkic", - US"Samaritan", - US"Tai_Tham", - US"Tai_Viet", - /* New for Unicode 6.0.0 */ - US"Batak", - US"Brahmi", - US"Mandaic", - /* New for Unicode 6.1.0 */ - US"Chakma", - US"Meroitic_Cursive", - US"Meroitic_Hieroglyphs", 
- US"Miao", - US"Sharada", - US"Sora Sompent", - US"Takri", - /* New for Unicode 7.0.0 */ - US"Bassa_Vah", - US"Caucasian_Albanian", - US"Duployan", - US"Elbasan", - US"Grantha", - US"Khojki", - US"Khudawadi", - US"Linear_A", - US"Mahajani", - US"Manichaean", - US"Mende_Kikakui", - US"Modi", - US"Mro", - US"Nabataean", - US"Old_North_Arabian", - US"Old_Permic", - US"Pahawh_Hmong", - US"Palmyrene", - US"Psalter_Pahlavi", - US"Pau_Cin_Hau", - US"Siddham", - US"Tirhuta", - US"Warang_Citi", - /* New for Unicode 8.0.0 */ - US"Ahom", - US"Anatolian_Hieroglyphs", - US"Hatran", - US"Multani", - US"Old_Hungarian", - US"SignWriting", - /* New for Unicode 10.0.0 (no update since 8.0.0) */ - US"Adlam", - US"Bhaiksuki", - US"Marchen", - US"Newa", - US"Osage", - US"Tangut", - US"Masaram_Gondi", - US"Nushu", - US"Soyombo", - US"Zanabazar_Square", - /* New for Unicode 11.0.0 */ - US"Dogra", - US"Gunjala_Gondi", - US"Hanifi_Rohingya", - US"Makasar", - US"Medefaidrin", - US"Old_Sogdian", - US"Sogdian", - /* New for Unicode 12.0.0 */ - US"Elymaic", - US"Nandinagari", - US"Nyiakeng_Puachue_Hmong", - US"Wancho", - /* New for Unicode 13.0.0 */ - US"Chorasmian", - US"Dives_Akuru", - US"Khitan_Small_Script", - US"Yezidi" +static BOOL show_character = FALSE; + +static const unsigned char *type_names[] = { + US"Cc", US"Control", + US"Cf", US"Format", + US"Cn", US"Unassigned", + US"Co", US"Private use", + US"Cs", US"Surrogate", + US"Ll", US"Lower case letter", + US"Lm", US"Modifier letter", + US"Lo", US"Other letter", + US"Lt", US"Title case letter", + US"Lu", US"Upper case letter", + US"Mc", US"Spacing mark", + US"Me", US"Enclosing mark", + US"Mn", US"Non-spacing mark", + US"Nd", US"Decimal number", + US"Nl", US"Letter number", + US"No", US"Other number", + US"Pc", US"Connector punctuation", + US"Pd", US"Dash punctuation", + US"Pe", US"Close punctuation", + US"Pf", US"Final punctuation", + US"Pi", US"Initial punctuation", + US"Po", US"Other punctuation", + US"Ps", US"Open punctuation", + 
US"Sc", US"Currency symbol", + US"Sk", US"Modifier symbol", + US"Sm", US"Mathematical symbol", + US"So", US"Other symbol", + US"Zl", US"Line separator", + US"Zp", US"Paragraph separator", + US"Zs", US"Space separator" }; -const unsigned char *type_names[] = { - US"Cc", - US"Cf", - US"Cn", - US"Co", - US"Cs", - US"Ll", - US"Lm", - US"Lo", - US"Lt", - US"Lu", - US"Mc", - US"Me", - US"Mn", - US"Nd", - US"Nl", - US"No", - US"Pc", - US"Pd", - US"Pe", - US"Pf", - US"Pi", - US"Po", - US"Ps", - US"Sc", - US"Sk", - US"Sm", - US"So", - US"Zl", - US"Zp", - US"Zs" +static const unsigned char *gb_names[] = { + US"CR", US"carriage return", + US"LF", US"linefeed", + US"Control", US"", + US"Extend", US"", + US"Prepend", US"", + US"SpacingMark", US"", + US"L", US"Hangul syllable type L", + US"V", US"Hangul syllable type V", + US"T", US"Hangul syllable type T", + US"LV", US"Hangul syllable type LV", + US"LVT", US"Hangul syllable type LVT", + US"RegionalIndicator", US"", + US"Other", US"", + US"ZWJ", US"zero width joiner", + US"Extended_Pictographic", US"" }; -const unsigned char *gb_names[] = { - US"CR", - US"LF", - US"Control", - US"Extend", - US"Prepend", - US"SpacingMark", - US"L", - US"V", - US"T", - US"LV", - US"LVT", - US"RegionalIndicator", - US"Other", - US"ZWJ", - US"Extended_Pictographic" -}; + +static const unsigned int utf8_table1[] = { + 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff}; + +static const int utf8_table2[] = { + 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; + + +/************************************************* +* Convert character value to UTF-8 * +*************************************************/ + +/* This function takes an unsigned long integer value in the range 0 - +0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes. 
+ +Arguments: + cvalue the character value + buffer pointer to buffer for result - at least 6 bytes long + +Returns: number of bytes placed in the buffer + 0 if input code point is too big +*/ + +static size_t +ord2utf8(unsigned int cvalue, unsigned char *buffer) +{ +size_t i, j; +for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) + if (cvalue <= utf8_table1[i]) break; +if (i >= sizeof(utf8_table1)/sizeof(int)) return 0; +buffer += i; +for (j = i; j > 0; j--) + { + *buffer-- = 0x80 | (cvalue & 0x3f); + cvalue >>= 6; + } +*buffer = utf8_table2[i] | cvalue; +return i + 1; +} + /************************************************* @@ -331,27 +224,46 @@ return isatty(fileno(stdin)); /************************************************* +* Get script name from ucp ident * +*************************************************/ + +static const char * +get_scriptname(int script) +{ +size_t i; +const ucp_type_table *u; + +for (i = 0; i < PRIV(utt_size); i++) + { + u = PRIV(utt) + i; + if (u->type == PT_SC && u->value == script) break; + } +if (i < PRIV(utt_size)) + return PRIV(utt_names) + u->name_offset; + +return "??"; +} + + +/************************************************* * Print Unicode property info for a char * *************************************************/ static void -print_prop(int c) +print_prop(unsigned int c, BOOL is_just_one) { int type = UCD_CATEGORY(c); int fulltype = UCD_CHARTYPE(c); int script = UCD_SCRIPT(c); int scriptx = UCD_SCRIPTX(c); int gbprop = UCD_GRAPHBREAK(c); -int othercase = UCD_OTHERCASE(c); +unsigned int othercase = UCD_OTHERCASE(c); int caseset = UCD_CASESET(c); const unsigned char *fulltypename = US"??"; const unsigned char *typename = US"??"; -const unsigned char *scriptname = US"??"; const unsigned char *graphbreak = US"??"; - -if (script < sizeof(script_names)/sizeof(char *)) - scriptname = script_names[script]; +const unsigned char *scriptname = CUS get_scriptname(script); switch (type) { @@ -420,15 +332,18 @@ switch(gbprop) default: 
graphbreak = US"Unknown"; break; } -printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak); -if (othercase != c) +printf("U+%04X %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak); +if (is_just_one && othercase != c) { - printf(", %04x", othercase); + printf(", U+%04X", othercase); if (caseset != 0) { const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1; while (*(++p) < NOTACHAR) - if (*p != othercase && *p != c) printf(", %04x", *p); + { + unsigned int d = *p; + if (d != othercase && d != c) printf(", U+%04X", d); + } } } @@ -436,25 +351,26 @@ if (scriptx != script) { printf(", ["); if (scriptx >= 0) - { - scriptname = (scriptx >= sizeof(script_names)/sizeof(char *))? - US"??" : script_names[scriptx]; - printf("%s", scriptname); - } + printf("%s", get_scriptname(scriptx)); else { - char *sep = ""; + const char *sep = ""; const uint8_t *p = PRIV(ucd_script_sets) - scriptx; while (*p != 0) { - scriptname = (*p >= sizeof(script_names)/sizeof(char *))? - US"??" : script_names[*p++]; - printf("%s%s", sep, scriptname); + printf("%s%s", sep, get_scriptname(*p++)); sep = ", "; } } printf("]"); } + +if (show_character && is_just_one) + { + unsigned char buffer[8]; + size_t len = ord2utf8(c, buffer); + printf(", >%.*s<", (int)len, buffer); + } printf("\n"); } @@ -483,7 +399,7 @@ BOOL type_not = FALSE; BOOL gbreak_not = FALSE; BOOL hadrange = FALSE; const ucd_record *ucd, *next_ucd; -const char *pad = " "; +const char *pad = " "; while (*s != 0) { @@ -508,17 +424,20 @@ while (*s != 0) offset = 1; } - for (i = 0; i < sizeof(script_names)/sizeof(char *); i++) + for (i = 0; i < PRIV(utt_size); i++) { - if (strcmp(CS value + offset, script_names[i]) == 0) + const ucp_type_table *u = PRIV(utt) + i; + if (u->type == PT_SC && strcmp(CS(value + offset), + PRIV(utt_names) + u->name_offset) == 0) { + c = u->value; if (name[6] == 'x') { - scriptx_list[scriptx_count++] = scriptx_not? (-i):i; + scriptx_list[scriptx_count++] = scriptx_not? 
(-c):c; } else { - if (script < 0) script = i; else + if (script < 0) script = c; else { printf("** Only 1 script value allowed\n"); return; @@ -528,9 +447,9 @@ while (*s != 0) } } - if (i >= sizeof(script_names)/sizeof(char *)) + if (i >= PRIV(utt_size)) { - printf("** Unrecognized script name '%s'\n", value); + printf("** Unrecognized script name \"%s\"\n", value); return; } } @@ -550,17 +469,17 @@ while (*s != 0) offset = 1; } - for (i = 0; i < sizeof(type_names)/sizeof(char *); i++) + for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2) { - if (strcmp(CS (value + offset), type_names[i]) == 0) + if (strcmp(CS (value + offset), CS type_names[i]) == 0) { - type = i; + type = i/2; break; } } if (i >= sizeof(type_names)/sizeof(char *)) { - printf("** Unrecognized type name '%s'\n", value); + printf("** Unrecognized type name \"%s\"\n", value); return; } } @@ -581,17 +500,17 @@ while (*s != 0) offset = 1; } - for (i = 0; i < sizeof(gb_names)/sizeof(char *); i++) + for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2) { - if (strcmp(CS (value + offset), gb_names[i]) == 0) + if (strcmp(CS (value + offset), CS gb_names[i]) == 0) { - gbreak = i; + gbreak = i/2; break; } } if (i >= sizeof(gb_names)/sizeof(char *)) { - printf("** Unrecognized gbreak name '%s'\n", value); + printf("** Unrecognized gbreak name \"%s\"\n", value); return; } } @@ -599,7 +518,7 @@ while (*s != 0) else { - printf("** Unrecognized property name '%s'\n", name); + printf("** Unrecognized property name \"%s\"\n", name); return; } } @@ -617,7 +536,7 @@ for (c = 0; c <= 0x10ffff; c++) if (scriptx_count > 0) { const uint8_t *char_scriptx = NULL; - int found = 0; + unsigned int found = 0; int scriptx = UCD_SCRIPTX(c); if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx; @@ -701,13 +620,13 @@ for (c = 0; c <= 0x10ffff; c++) if (--i > c) { - printf("%04x..", c); + printf("U+%04X..", c); c = i; hadrange = TRUE; } else if (hadrange) printf("%s", pad); - print_prop(c); + print_prop(c, 
FALSE); if (c >= 0x100000) pad = " "; else if (c >= 0x10000) pad = " "; count++; @@ -723,6 +642,101 @@ if (count == 0) printf("No characters found\n"); /************************************************* +* Process command line * +*************************************************/ + +static void +process_command_line(unsigned char *buffer) +{ +unsigned char *s, *t; +unsigned char name[24]; + +s = buffer; +while (isspace(*s)) s++; +if (*s == 0) return; + +for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s; +*t = 0; +while (isspace(*s)) s++; + +if (strcmp(CS name, "findprop") == 0) + { + while (*s != 0) + { + unsigned int c; + unsigned char *endptr; + t = s; + if (strncmp(CS t, "U+", 2) == 0) t += 2; + c = strtoul(CS t, CSS(&endptr), 16); + if (*endptr != 0 && !isspace(*endptr)) + { + while (*endptr != 0 && !isspace(*endptr)) endptr++; + printf("** Invalid hex number: ignored \"%.*s\"\n", (int)(endptr-s), s); + } + else + { + if (c > 0x10ffff) + printf("** U+%x is too big for a Unicode code point\n", c); + else + print_prop(c, TRUE); + } + s = endptr; + while (isspace(*s)) s++; + } + } + +else if (strcmp(CS name, "find") == 0) + { + find_chars(s); + } + +else if (strcmp(CS name, "list") == 0) + { + while (*s != 0) + { + size_t i; + for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s; + *t = 0; + while (isspace(*s)) s++; + + if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0) + { + for (i = 0; i < PRIV(utt_size); i++) + if (PRIV(utt)[i].type == PT_SC) + printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset); + } + + else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0) + { + for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2) + printf("%s %s\n", type_names[i], type_names[i+1]); + } + + else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0) + { + for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2) + { + if (gb_names[i+1][0] != 0) + printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]); + else + 
printf("%s\n", gb_names[i]); + } + } + + else + { + printf("** Unknown property \"%s\"\n", name); + break; + } + } + } + +else printf("** Unknown test command \"%s\"\n", name); +} + + + +/************************************************* * Main program * *************************************************/ @@ -730,19 +744,42 @@ int main(int argc, char **argv) { BOOL interactive; +int first_arg = 1; unsigned char buffer[1024]; -if (argc > 1) +if (argc > 1 && strcmp(argv[1], "-s") == 0) + { + show_character = TRUE; + first_arg++; + } + +if (argc > first_arg) { int i; - for (i = 1; i < argc; i++) + BOOL hexfirst = TRUE; + char *arg = argv[first_arg]; + unsigned char *s = buffer; + + if (strncmp(arg, "U+", 2) != 0 && !isdigit(*arg)) { - unsigned char *endptr; - int c = strtoul(argv[i], CSS(&endptr), 16); - if (*endptr != 0) - printf("** Hex number expected; ignored '%s'\n", argv[i]); - else print_prop(c); + while (*arg != 0) + { + if (!isxdigit(*arg++)) { hexfirst = FALSE; break; } + } + } + + if (hexfirst) + { + strcpy(CS s, "findprop "); + s += 9; + } + + for (i = first_arg; i < argc; i++) + { + s += sprintf(CS s, "%s ", argv[i]); } + + process_command_line(buffer); return 0; } @@ -754,17 +791,14 @@ if (interactive) using_history(); for(;;) { - unsigned char name[24]; - unsigned char *s, *t; - #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) if (interactive) { size_t len; - s = readline("> "); + unsigned char *s = US readline("> "); if (s == NULL) break; - len = strlen(s); - if (len > 0) add_history(s); + len = strlen(CS s); + if (len > 0) add_history(CS s); memcpy(buffer, s, len); buffer[len] = '\n'; buffer[len+1] = 0; @@ -778,39 +812,8 @@ for(;;) if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break; if (!interactive) printf("%s", buffer); } - - s = buffer; - while (isspace(*s)) s++; - if (*s == 0) continue; - - for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s; - *t = 0; - while (isspace(*s)) s++; - - if (strcmp(CS name, "findprop") == 0) - { - 
while (*s != 0) - { - unsigned char *endptr; - int c = strtoul(CS s, CSS(&endptr), 16); - - if (*endptr != 0 && !isspace(*endptr)) - { - while (*endptr != 0 && !isspace(*endptr)) endptr++; - printf("** Hex number expected; ignored '%.*s'\n", endptr-s, s); - } - else print_prop(c); - s = endptr; - while (isspace(*s)) s++; - } - } - - else if (strcmp(CS name, "find") == 0) - { - find_chars(s); - } - - else printf("** Unknown test command %s\n", name); + + process_command_line(buffer); } if (interactive) printf("\n"); |