diff options
author | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2020-04-01 17:00:33 +0000 |
---|---|---|
committer | ph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069> | 2020-04-01 17:00:33 +0000 |
commit | 265489fe6082dc2ba2c91ffdc1d448ed35739e60 (patch) | |
tree | 83756df63a6eac5d7b9b0d6307177f9ad96736ea | |
parent | 2082578875ec81a296070568fa7f09a6abc4f1ce (diff) | |
download | pcre2-265489fe6082dc2ba2c91ffdc1d448ed35739e60.tar.gz |
Tidies and updates to maintenance programs utf8 and ucptest.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1241 6239d852-aaf2-0410-a92c-79f79f948069
-rw-r--r-- | maint/README | 19 | ||||
-rw-r--r-- | maint/ucptest.c | 643 | ||||
-rw-r--r-- | maint/ucptestdata/testinput1 | 2 | ||||
-rw-r--r-- | maint/ucptestdata/testinput2 | 5 | ||||
-rw-r--r-- | maint/ucptestdata/testoutput1 | 702 | ||||
-rw-r--r-- | maint/ucptestdata/testoutput2 | 188 | ||||
-rw-r--r-- | maint/utf8.c | 313 |
7 files changed, 1082 insertions, 790 deletions
diff --git a/maint/README b/maint/README index 0e1ff8f..fac36b2 100644 --- a/maint/README +++ b/maint/README @@ -54,10 +54,12 @@ Unicode.tables The files in this directory were downloaded from the Unicode ucptest.c A short C program for testing the Unicode property macros that do lookups in the pcre2_ucd.c data, mainly useful after rebuilding the Unicode property table. Compile and run this in - the "maint" directory (see comments at its head). + the "maint" directory (see comments at its head). This program + can also be used to find characters with specific properties. -ucptestdata A directory containing two files, testinput1 and testoutput1, - to use in conjunction with the ucptest program. +ucptestdata A directory containing four files, testinput{1,2} and + testoutput{1,2}, for use in conjunction with the ucptest + program. utf8.c A short, freestanding C program for converting a Unicode code point into a sequence of bytes in the UTF-8 encoding, and vice @@ -65,7 +67,7 @@ utf8.c A short, freestanding C program for converting a Unicode code outputs a list of the equivalent UTF-8 bytes. If its argument is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it treats them as a UTF-8 character and outputs the equivalent - code point in hex. + code point in hex. See comments at its head for details. Updating to a new Unicode release @@ -96,9 +98,10 @@ lists of scripts. The ucptest program can be compiled and used to check that the new tables in pcre2_ucd.c work properly, using the data files in ucptestdata to check a -number of test characters. The source file ucptest.c should also be updated -whenever new Unicode script names are added, and adding a few tests for new -scripts is a good idea. +number of test characters. It used to be necessary to update the source +ucptest.c whenever new Unicode scripts were added, but this is no longer +required because that program now uses the lists in the PCRE2 source. 
However, +adding a few tests for new scripts to the files in ucptestdata is a good idea. Preparing for a PCRE2 release @@ -437,4 +440,4 @@ very sensible; some are rather wacky. Some have been on this list for years. Philip Hazel Email local part: ph10 Email domain: cam.ac.uk -Last updated: 03 June 2019 +Last updated: 01 April 2020 diff --git a/maint/ucptest.c b/maint/ucptest.c index e946226..49616e2 100644 --- a/maint/ucptest.c +++ b/maint/ucptest.c @@ -16,36 +16,58 @@ /* This is a hacked-up program for testing the Unicode properties tables of PCRE2. It can also be used for finding characters with certain properties. I wrote it to help with debugging PCRE, and have added things that I found -useful, in a rather haphazard way. The code has never been "tidied" or checked -for robustness. - -If there are arguments, they are a list of hexadecimal code points whose -properties are to be output. Otherwise, the program expects to read commands on -stdin, and it writes output to stdout. There are two commands: - -"findprop" must be followed by a list of Unicode code points as hex numbers -(without any prefixes). The output is one line per character, giving its -Unicode properties followed by its other case if there is one, followed by its -Script Extension list if it is not just the same as the base script. - -"find" must be followed by a list of property names and their values. This -finds characters that have those properties. If multiple properties are listed, -they must all be matched. Currently supported: +useful, in a rather haphazard way. The code has never been seriously tidied or +checked for robustness, but it shouldn't now give compiler warnings. + +There is only one option: "-s". If given, it applies only to the "findprop" +command. It causes the UTF-8 sequence of bytes that encode the character to be +output between angle brackets at the end of the line. On a UTF-8 terminal, this +will show the appropriate graphic for the code point. 
+ +If the command has arguments, they are concatenated into a buffer, separated by +spaces. If the first argument starts "U+" or consists entirely of hexadecimal +digits, "findprop" is inserted at the start. The buffer is then processed as a +single line file, after which the program exits. If there are no arguments, the +program reads commands line by line on stdin and writes output to stdout. The +return code is always zero. + +There are three commands: + +"findprop" must be followed by a space-separated list of Unicode code points as +hex numbers, either without any prefix or starting with "U+". The output is one +line per character, giving its Unicode properties followed by its other case or +cases if one or more exist, followed by its Script Extension list if it is not +just the same as the base script. This list is in square brackets. The +properties are: + +General type e.g. Letter +Specific type e.g. Upper case letter +Script e.g. Medefaidrin +Grapheme break type e.g. Extend (most common is Other) + +"find" must be followed by a list of property names and their values. The +values are case-sensitive. This finds characters that have those properties. If +multiple properties are listed, they must all be matched. Currently supported: script <name> The character must have this script property. Only one such script may be given. scriptx <name> This script must be in the character's Script Extension property list. If this is used many times, all the given scripts must be present. - type <abbrev> The character's type (e.g. Lu or Nd) must match. + type <abbrev> The character's specific type (e.g. Lu or Nd) must match. gbreak <name> The grapheme break property must match. If a <name> or <abbrev> is preceded by !, the value must NOT be present. For Script Extensions, there may be a mixture of positive and negative requirements. All must be satisfied. -No more than 100 characters are output. If there are more, the list ends with -... 
*/ +Sequences of two or more characters are shown as ranges, for example +U+0041..U+004A. No more than 100 lines are output. If there are more +characters, the list ends with ... + +"list" must be followed by a property name (script, type, or gbreak). The +defined values for that property are listed. */ + #ifdef HAVE_CONFIG_H #include "../src/config.h" @@ -91,228 +113,99 @@ No more than 100 characters are output. If there are more, the list ends with /* -------------------------------------------------------------------*/ - -const unsigned char *script_names[] = { - US"Unknown", - US"Arabic", - US"Armenian", - US"Bengali", - US"Bopomofo", - US"Braille", - US"Buginese", - US"Buhid", - US"Canadian_Aboriginal", - US"Cherokee", - US"Common", - US"Coptic", - US"Cypriot", - US"Cyrillic", - US"Deseret", - US"Devanagari", - US"Ethiopic", - US"Georgian", - US"Glagolitic", - US"Gothic", - US"Greek", - US"Gujarati", - US"Gurmukhi", - US"Han", - US"Hangul", - US"Hanunoo", - US"Hebrew", - US"Hiragana", - US"Inherited", - US"Kannada", - US"Katakana", - US"Kharoshthi", - US"Khmer", - US"Lao", - US"Latin", - US"Limbu", - US"Linear_B", - US"Malayalam", - US"Mongolian", - US"Myanmar", - US"New_Tai_Lue", - US"Ogham", - US"Old_Italic", - US"Old_Persian", - US"Oriya", - US"Osmanya", - US"Runic", - US"Shavian", - US"Sinhala", - US"Syloti_Nagri", - US"Syriac", - US"Tagalog", - US"Tagbanwa", - US"Tai_Le", - US"Tamil", - US"Telugu", - US"Thaana", - US"Thai", - US"Tibetan", - US"Tifinagh", - US"Ugaritic", - US"Yi", - /* New for Unicode 5.0: */ - US"Balinese", - US"Cuneiform", - US"Nko", - US"Phags_Pa", - US"Phoenician", - /* New for Unicode 5.1: */ - US"Carian", - US"Cham", - US"Kayah_Li", - US"Lepcha", - US"Lycian", - US"Lydian", - US"Ol_Chiki", - US"Rejang", - US"Saurashtra", - US"Sundanese", - US"Vai", - /* New for Unicode 5.2: */ - US"Avestan", - US"Bamum", - US"Egyptian_Hieroglyphs", - US"Imperial_Aramaic", - US"Inscriptional_Pahlavi", - US"Inscriptional_Parthian", - US"Javanese", 
- US"Kaithi", - US"Lisu", - US"Meetei_Mayek", - US"Old_South_Arabian", - US"Old_Turkic", - US"Samaritan", - US"Tai_Tham", - US"Tai_Viet", - /* New for Unicode 6.0.0 */ - US"Batak", - US"Brahmi", - US"Mandaic", - /* New for Unicode 6.1.0 */ - US"Chakma", - US"Meroitic_Cursive", - US"Meroitic_Hieroglyphs", - US"Miao", - US"Sharada", - US"Sora Sompent", - US"Takri", - /* New for Unicode 7.0.0 */ - US"Bassa_Vah", - US"Caucasian_Albanian", - US"Duployan", - US"Elbasan", - US"Grantha", - US"Khojki", - US"Khudawadi", - US"Linear_A", - US"Mahajani", - US"Manichaean", - US"Mende_Kikakui", - US"Modi", - US"Mro", - US"Nabataean", - US"Old_North_Arabian", - US"Old_Permic", - US"Pahawh_Hmong", - US"Palmyrene", - US"Psalter_Pahlavi", - US"Pau_Cin_Hau", - US"Siddham", - US"Tirhuta", - US"Warang_Citi", - /* New for Unicode 8.0.0 */ - US"Ahom", - US"Anatolian_Hieroglyphs", - US"Hatran", - US"Multani", - US"Old_Hungarian", - US"SignWriting", - /* New for Unicode 10.0.0 (no update since 8.0.0) */ - US"Adlam", - US"Bhaiksuki", - US"Marchen", - US"Newa", - US"Osage", - US"Tangut", - US"Masaram_Gondi", - US"Nushu", - US"Soyombo", - US"Zanabazar_Square", - /* New for Unicode 11.0.0 */ - US"Dogra", - US"Gunjala_Gondi", - US"Hanifi_Rohingya", - US"Makasar", - US"Medefaidrin", - US"Old_Sogdian", - US"Sogdian", - /* New for Unicode 12.0.0 */ - US"Elymaic", - US"Nandinagari", - US"Nyiakeng_Puachue_Hmong", - US"Wancho", - /* New for Unicode 13.0.0 */ - US"Chorasmian", - US"Dives_Akuru", - US"Khitan_Small_Script", - US"Yezidi" +static BOOL show_character = FALSE; + +static const unsigned char *type_names[] = { + US"Cc", US"Control", + US"Cf", US"Format", + US"Cn", US"Unassigned", + US"Co", US"Private use", + US"Cs", US"Surrogate", + US"Ll", US"Lower case letter", + US"Lm", US"Modifier letter", + US"Lo", US"Other letter", + US"Lt", US"Title case letter", + US"Lu", US"Upper case letter", + US"Mc", US"Spacing mark", + US"Me", US"Enclosing mark", + US"Mn", US"Non-spacing mark", + US"Nd", US"Decimal 
number", + US"Nl", US"Letter number", + US"No", US"Other number", + US"Pc", US"Connector punctuation", + US"Pd", US"Dash punctuation", + US"Pe", US"Close punctuation", + US"Pf", US"Final punctuation", + US"Pi", US"Initial punctuation", + US"Po", US"Other punctuation", + US"Ps", US"Open punctuation", + US"Sc", US"Currency symbol", + US"Sk", US"Modifier symbol", + US"Sm", US"Mathematical symbol", + US"So", US"Other symbol", + US"Zl", US"Line separator", + US"Zp", US"Paragraph separator", + US"Zs", US"Space separator" }; -const unsigned char *type_names[] = { - US"Cc", - US"Cf", - US"Cn", - US"Co", - US"Cs", - US"Ll", - US"Lm", - US"Lo", - US"Lt", - US"Lu", - US"Mc", - US"Me", - US"Mn", - US"Nd", - US"Nl", - US"No", - US"Pc", - US"Pd", - US"Pe", - US"Pf", - US"Pi", - US"Po", - US"Ps", - US"Sc", - US"Sk", - US"Sm", - US"So", - US"Zl", - US"Zp", - US"Zs" +static const unsigned char *gb_names[] = { + US"CR", US"carriage return", + US"LF", US"linefeed", + US"Control", US"", + US"Extend", US"", + US"Prepend", US"", + US"SpacingMark", US"", + US"L", US"Hangul syllable type L", + US"V", US"Hangul syllable type V", + US"T", US"Hangul syllable type T", + US"LV", US"Hangul syllable type LV", + US"LVT", US"Hangul syllable type LVT", + US"RegionalIndicator", US"", + US"Other", US"", + US"ZWJ", US"zero width joiner", + US"Extended_Pictographic", US"" }; -const unsigned char *gb_names[] = { - US"CR", - US"LF", - US"Control", - US"Extend", - US"Prepend", - US"SpacingMark", - US"L", - US"V", - US"T", - US"LV", - US"LVT", - US"RegionalIndicator", - US"Other", - US"ZWJ", - US"Extended_Pictographic" -}; + +static const unsigned int utf8_table1[] = { + 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff}; + +static const int utf8_table2[] = { + 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; + + +/************************************************* +* Convert character value to UTF-8 * +*************************************************/ + +/* This function takes an unsigned 
integer value in the range 0 - +0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes. + +Arguments: + cvalue the character value + buffer pointer to buffer for result - at least 6 bytes long + +Returns: number of bytes placed in the buffer + 0 if input code point is too big +*/ + +static size_t +ord2utf8(unsigned int cvalue, unsigned char *buffer) +{ +size_t i, j; +for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) + if (cvalue <= utf8_table1[i]) break; +if (i >= sizeof(utf8_table1)/sizeof(int)) return 0; +buffer += i; +for (j = i; j > 0; j--) + { + *buffer-- = 0x80 | (cvalue & 0x3f); + cvalue >>= 6; + } +*buffer = utf8_table2[i] | cvalue; +return i + 1; +} + /************************************************* @@ -331,27 +224,46 @@ return isatty(fileno(stdin)); /************************************************* +* Get script name from ucp ident * +*************************************************/ + +static const char * +get_scriptname(int script) +{ +size_t i; +const ucp_type_table *u; + +for (i = 0; i < PRIV(utt_size); i++) + { + u = PRIV(utt) + i; + if (u->type == PT_SC && u->value == script) break; + } +if (i < PRIV(utt_size)) + return PRIV(utt_names) + u->name_offset; + +return "??"; +} + + +/************************************************* * Print Unicode property info for a char * *************************************************/ static void -print_prop(int c) +print_prop(unsigned int c, BOOL is_just_one) { int type = UCD_CATEGORY(c); int fulltype = UCD_CHARTYPE(c); int script = UCD_SCRIPT(c); int scriptx = UCD_SCRIPTX(c); int gbprop = UCD_GRAPHBREAK(c); -int othercase = UCD_OTHERCASE(c); +unsigned int othercase = UCD_OTHERCASE(c); int caseset = UCD_CASESET(c); const unsigned char *fulltypename = US"??"; const unsigned char *typename = US"??"; -const unsigned char *scriptname = US"??"; const unsigned char *graphbreak = US"??"; - -if (script < sizeof(script_names)/sizeof(char *)) - scriptname = script_names[script]; +const unsigned char 
*scriptname = CUS get_scriptname(script); switch (type) { @@ -420,15 +332,18 @@ switch(gbprop) default: graphbreak = US"Unknown"; break; } -printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak); -if (othercase != c) +printf("U+%04X %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak); +if (is_just_one && othercase != c) { - printf(", %04x", othercase); + printf(", U+%04X", othercase); if (caseset != 0) { const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1; while (*(++p) < NOTACHAR) - if (*p != othercase && *p != c) printf(", %04x", *p); + { + unsigned int d = *p; + if (d != othercase && d != c) printf(", U+%04X", d); + } } } @@ -436,25 +351,26 @@ if (scriptx != script) { printf(", ["); if (scriptx >= 0) - { - scriptname = (scriptx >= sizeof(script_names)/sizeof(char *))? - US"??" : script_names[scriptx]; - printf("%s", scriptname); - } + printf("%s", get_scriptname(scriptx)); else { - char *sep = ""; + const char *sep = ""; const uint8_t *p = PRIV(ucd_script_sets) - scriptx; while (*p != 0) { - scriptname = (*p >= sizeof(script_names)/sizeof(char *))? - US"??" 
: script_names[*p++]; - printf("%s%s", sep, scriptname); + printf("%s%s", sep, get_scriptname(*p++)); sep = ", "; } } printf("]"); } + +if (show_character && is_just_one) + { + unsigned char buffer[8]; + size_t len = ord2utf8(c, buffer); + printf(", >%.*s<", (int)len, buffer); + } printf("\n"); } @@ -483,7 +399,7 @@ BOOL type_not = FALSE; BOOL gbreak_not = FALSE; BOOL hadrange = FALSE; const ucd_record *ucd, *next_ucd; -const char *pad = " "; +const char *pad = " "; while (*s != 0) { @@ -508,17 +424,20 @@ while (*s != 0) offset = 1; } - for (i = 0; i < sizeof(script_names)/sizeof(char *); i++) + for (i = 0; i < PRIV(utt_size); i++) { - if (strcmp(CS value + offset, script_names[i]) == 0) + const ucp_type_table *u = PRIV(utt) + i; + if (u->type == PT_SC && strcmp(CS(value + offset), + PRIV(utt_names) + u->name_offset) == 0) { + c = u->value; if (name[6] == 'x') { - scriptx_list[scriptx_count++] = scriptx_not? (-i):i; + scriptx_list[scriptx_count++] = scriptx_not? (-c):c; } else { - if (script < 0) script = i; else + if (script < 0) script = c; else { printf("** Only 1 script value allowed\n"); return; @@ -528,9 +447,9 @@ while (*s != 0) } } - if (i >= sizeof(script_names)/sizeof(char *)) + if (i >= PRIV(utt_size)) { - printf("** Unrecognized script name '%s'\n", value); + printf("** Unrecognized script name \"%s\"\n", value); return; } } @@ -550,17 +469,17 @@ while (*s != 0) offset = 1; } - for (i = 0; i < sizeof(type_names)/sizeof(char *); i++) + for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2) { - if (strcmp(CS (value + offset), type_names[i]) == 0) + if (strcmp(CS (value + offset), CS type_names[i]) == 0) { - type = i; + type = i/2; break; } } if (i >= sizeof(type_names)/sizeof(char *)) { - printf("** Unrecognized type name '%s'\n", value); + printf("** Unrecognized type name \"%s\"\n", value); return; } } @@ -581,17 +500,17 @@ while (*s != 0) offset = 1; } - for (i = 0; i < sizeof(gb_names)/sizeof(char *); i++) + for (i = 0; i < 
sizeof(gb_names)/sizeof(char *); i += 2) { - if (strcmp(CS (value + offset), gb_names[i]) == 0) + if (strcmp(CS (value + offset), CS gb_names[i]) == 0) { - gbreak = i; + gbreak = i/2; break; } } if (i >= sizeof(gb_names)/sizeof(char *)) { - printf("** Unrecognized gbreak name '%s'\n", value); + printf("** Unrecognized gbreak name \"%s\"\n", value); return; } } @@ -599,7 +518,7 @@ while (*s != 0) else { - printf("** Unrecognized property name '%s'\n", name); + printf("** Unrecognized property name \"%s\"\n", name); return; } } @@ -617,7 +536,7 @@ for (c = 0; c <= 0x10ffff; c++) if (scriptx_count > 0) { const uint8_t *char_scriptx = NULL; - int found = 0; + unsigned int found = 0; int scriptx = UCD_SCRIPTX(c); if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx; @@ -701,13 +620,13 @@ for (c = 0; c <= 0x10ffff; c++) if (--i > c) { - printf("%04x..", c); + printf("U+%04X..", c); c = i; hadrange = TRUE; } else if (hadrange) printf("%s", pad); - print_prop(c); + print_prop(c, FALSE); if (c >= 0x100000) pad = " "; else if (c >= 0x10000) pad = " "; count++; @@ -723,6 +642,101 @@ if (count == 0) printf("No characters found\n"); /************************************************* +* Process command line * +*************************************************/ + +static void +process_command_line(unsigned char *buffer) +{ +unsigned char *s, *t; +unsigned char name[24]; + +s = buffer; +while (isspace(*s)) s++; +if (*s == 0) return; + +for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s; +*t = 0; +while (isspace(*s)) s++; + +if (strcmp(CS name, "findprop") == 0) + { + while (*s != 0) + { + unsigned int c; + unsigned char *endptr; + t = s; + if (strncmp(CS t, "U+", 2) == 0) t += 2; + c = strtoul(CS t, CSS(&endptr), 16); + if (*endptr != 0 && !isspace(*endptr)) + { + while (*endptr != 0 && !isspace(*endptr)) endptr++; + printf("** Invalid hex number: ignored \"%.*s\"\n", (int)(endptr-s), s); + } + else + { + if (c > 0x10ffff) + printf("** U+%x is too big for a Unicode 
code point\n", c); + else + print_prop(c, TRUE); + } + s = endptr; + while (isspace(*s)) s++; + } + } + +else if (strcmp(CS name, "find") == 0) + { + find_chars(s); + } + +else if (strcmp(CS name, "list") == 0) + { + while (*s != 0) + { + size_t i; + for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s; + *t = 0; + while (isspace(*s)) s++; + + if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0) + { + for (i = 0; i < PRIV(utt_size); i++) + if (PRIV(utt)[i].type == PT_SC) + printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset); + } + + else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0) + { + for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2) + printf("%s %s\n", type_names[i], type_names[i+1]); + } + + else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0) + { + for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2) + { + if (gb_names[i+1][0] != 0) + printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]); + else + printf("%s\n", gb_names[i]); + } + } + + else + { + printf("** Unknown property \"%s\"\n", name); + break; + } + } + } + +else printf("** Unknown test command \"%s\"\n", name); +} + + + +/************************************************* * Main program * *************************************************/ @@ -730,19 +744,42 @@ int main(int argc, char **argv) { BOOL interactive; +int first_arg = 1; unsigned char buffer[1024]; -if (argc > 1) +if (argc > 1 && strcmp(argv[1], "-s") == 0) + { + show_character = TRUE; + first_arg++; + } + +if (argc > first_arg) { int i; - for (i = 1; i < argc; i++) + BOOL hexfirst = TRUE; + char *arg = argv[first_arg]; + unsigned char *s = buffer; + + if (strncmp(arg, "U+", 2) != 0 && !isdigit(*arg)) { - unsigned char *endptr; - int c = strtoul(argv[i], CSS(&endptr), 16); - if (*endptr != 0) - printf("** Hex number expected; ignored '%s'\n", argv[i]); - else print_prop(c); + while (*arg != 0) + { + if (!isxdigit(*arg++)) { hexfirst = FALSE; break; } + } + } + 
+ if (hexfirst) + { + strcpy(CS s, "findprop "); + s += 9; + } + + for (i = first_arg; i < argc; i++) + { + s += sprintf(CS s, "%s ", argv[i]); } + + process_command_line(buffer); return 0; } @@ -754,17 +791,14 @@ if (interactive) using_history(); for(;;) { - unsigned char name[24]; - unsigned char *s, *t; - #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT) if (interactive) { size_t len; - s = readline("> "); + unsigned char *s = US readline("> "); if (s == NULL) break; - len = strlen(s); - if (len > 0) add_history(s); + len = strlen(CS s); + if (len > 0) add_history(CS s); memcpy(buffer, s, len); buffer[len] = '\n'; buffer[len+1] = 0; @@ -778,39 +812,8 @@ for(;;) if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break; if (!interactive) printf("%s", buffer); } - - s = buffer; - while (isspace(*s)) s++; - if (*s == 0) continue; - - for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s; - *t = 0; - while (isspace(*s)) s++; - - if (strcmp(CS name, "findprop") == 0) - { - while (*s != 0) - { - unsigned char *endptr; - int c = strtoul(CS s, CSS(&endptr), 16); - - if (*endptr != 0 && !isspace(*endptr)) - { - while (*endptr != 0 && !isspace(*endptr)) endptr++; - printf("** Hex number expected; ignored '%.*s'\n", endptr-s, s); - } - else print_prop(c); - s = endptr; - while (isspace(*s)) s++; - } - } - - else if (strcmp(CS name, "find") == 0) - { - find_chars(s); - } - - else printf("** Unknown test command %s\n", name); + + process_command_line(buffer); } if (interactive) printf("\n"); diff --git a/maint/ucptestdata/testinput1 b/maint/ucptestdata/testinput1 index 58c7cf1..3552a4f 100644 --- a/maint/ucptestdata/testinput1 +++ b/maint/ucptestdata/testinput1 @@ -45,4 +45,4 @@ findprop 32ff findprop 1f16d -findprop 10e93 10eaa +findprop U+10e93 U+10eaa diff --git a/maint/ucptestdata/testinput2 b/maint/ucptestdata/testinput2 new file mode 100644 index 0000000..bdea520 --- /dev/null +++ b/maint/ucptestdata/testinput2 @@ -0,0 +1,5 @@ +find script Han +find type 
Pe script Common scriptx Hangul +find type Sk +find type Pd +find gbreak LVT diff --git a/maint/ucptestdata/testoutput1 b/maint/ucptestdata/testoutput1 index 0751a58..275b8e4 100644 --- a/maint/ucptestdata/testoutput1 +++ b/maint/ucptestdata/testoutput1 @@ -1,398 +1,398 @@ findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f -0000 Control: Control, Common, Control -0001 Control: Control, Common, Control -0002 Control: Control, Common, Control -0003 Control: Control, Common, Control -0004 Control: Control, Common, Control -0005 Control: Control, Common, Control -0006 Control: Control, Common, Control -0007 Control: Control, Common, Control -0008 Control: Control, Common, Control -0009 Control: Control, Common, Control -000a Control: Control, Common, LF -000b Control: Control, Common, Control -000c Control: Control, Common, Control -000d Control: Control, Common, CR -000e Control: Control, Common, Control -000f Control: Control, Common, Control +U+0000 Control: Control, Common, Control +U+0001 Control: Control, Common, Control +U+0002 Control: Control, Common, Control +U+0003 Control: Control, Common, Control +U+0004 Control: Control, Common, Control +U+0005 Control: Control, Common, Control +U+0006 Control: Control, Common, Control +U+0007 Control: Control, Common, Control +U+0008 Control: Control, Common, Control +U+0009 Control: Control, Common, Control +U+000A Control: Control, Common, LF +U+000B Control: Control, Common, Control +U+000C Control: Control, Common, Control +U+000D Control: Control, Common, CR +U+000E Control: Control, Common, Control +U+000F Control: Control, Common, Control findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f -0010 Control: Control, Common, Control -0011 Control: Control, Common, Control -0012 Control: Control, Common, Control -0013 Control: Control, Common, Control -0014 Control: Control, Common, Control -0015 Control: Control, Common, Control -0016 Control: Control, Common, Control -0017 Control: Control, Common, Control 
-0018 Control: Control, Common, Control -0019 Control: Control, Common, Control -001a Control: Control, Common, Control -001b Control: Control, Common, Control -001c Control: Control, Common, Control -001d Control: Control, Common, Control -001e Control: Control, Common, Control -001f Control: Control, Common, Control +U+0010 Control: Control, Common, Control +U+0011 Control: Control, Common, Control +U+0012 Control: Control, Common, Control +U+0013 Control: Control, Common, Control +U+0014 Control: Control, Common, Control +U+0015 Control: Control, Common, Control +U+0016 Control: Control, Common, Control +U+0017 Control: Control, Common, Control +U+0018 Control: Control, Common, Control +U+0019 Control: Control, Common, Control +U+001A Control: Control, Common, Control +U+001B Control: Control, Common, Control +U+001C Control: Control, Common, Control +U+001D Control: Control, Common, Control +U+001E Control: Control, Common, Control +U+001F Control: Control, Common, Control findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f -0020 Separator: Space separator, Common, Other -0021 Punctuation: Other punctuation, Common, Other -0022 Punctuation: Other punctuation, Common, Other -0023 Punctuation: Other punctuation, Common, Other -0024 Symbol: Currency symbol, Common, Other -0025 Punctuation: Other punctuation, Common, Other -0026 Punctuation: Other punctuation, Common, Other -0027 Punctuation: Other punctuation, Common, Other -0028 Punctuation: Open punctuation, Common, Other -0029 Punctuation: Close punctuation, Common, Other -002a Punctuation: Other punctuation, Common, Other -002b Symbol: Mathematical symbol, Common, Other -002c Punctuation: Other punctuation, Common, Other -002d Punctuation: Dash punctuation, Common, Other -002e Punctuation: Other punctuation, Common, Other -002f Punctuation: Other punctuation, Common, Other +U+0020 Separator: Space separator, Common, Other +U+0021 Punctuation: Other punctuation, Common, Other +U+0022 Punctuation: Other 
punctuation, Common, Other +U+0023 Punctuation: Other punctuation, Common, Other +U+0024 Symbol: Currency symbol, Common, Other +U+0025 Punctuation: Other punctuation, Common, Other +U+0026 Punctuation: Other punctuation, Common, Other +U+0027 Punctuation: Other punctuation, Common, Other +U+0028 Punctuation: Open punctuation, Common, Other +U+0029 Punctuation: Close punctuation, Common, Other +U+002A Punctuation: Other punctuation, Common, Other +U+002B Symbol: Mathematical symbol, Common, Other +U+002C Punctuation: Other punctuation, Common, Other +U+002D Punctuation: Dash punctuation, Common, Other +U+002E Punctuation: Other punctuation, Common, Other +U+002F Punctuation: Other punctuation, Common, Other findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f -0030 Number: Decimal number, Common, Other -0031 Number: Decimal number, Common, Other -0032 Number: Decimal number, Common, Other -0033 Number: Decimal number, Common, Other -0034 Number: Decimal number, Common, Other -0035 Number: Decimal number, Common, Other -0036 Number: Decimal number, Common, Other -0037 Number: Decimal number, Common, Other -0038 Number: Decimal number, Common, Other -0039 Number: Decimal number, Common, Other -003a Punctuation: Other punctuation, Common, Other -003b Punctuation: Other punctuation, Common, Other -003c Symbol: Mathematical symbol, Common, Other -003d Symbol: Mathematical symbol, Common, Other -003e Symbol: Mathematical symbol, Common, Other -003f Punctuation: Other punctuation, Common, Other +U+0030 Number: Decimal number, Common, Other +U+0031 Number: Decimal number, Common, Other +U+0032 Number: Decimal number, Common, Other +U+0033 Number: Decimal number, Common, Other +U+0034 Number: Decimal number, Common, Other +U+0035 Number: Decimal number, Common, Other +U+0036 Number: Decimal number, Common, Other +U+0037 Number: Decimal number, Common, Other +U+0038 Number: Decimal number, Common, Other +U+0039 Number: Decimal number, Common, Other +U+003A Punctuation: 
Other punctuation, Common, Other +U+003B Punctuation: Other punctuation, Common, Other +U+003C Symbol: Mathematical symbol, Common, Other +U+003D Symbol: Mathematical symbol, Common, Other +U+003E Symbol: Mathematical symbol, Common, Other +U+003F Punctuation: Other punctuation, Common, Other findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f -0040 Punctuation: Other punctuation, Common, Other -0041 Letter: Upper case letter, Latin, Other, 0061 -0042 Letter: Upper case letter, Latin, Other, 0062 -0043 Letter: Upper case letter, Latin, Other, 0063 -0044 Letter: Upper case letter, Latin, Other, 0064 -0045 Letter: Upper case letter, Latin, Other, 0065 -0046 Letter: Upper case letter, Latin, Other, 0066 -0047 Letter: Upper case letter, Latin, Other, 0067 -0048 Letter: Upper case letter, Latin, Other, 0068 -0049 Letter: Upper case letter, Latin, Other, 0069 -004a Letter: Upper case letter, Latin, Other, 006a -004b Letter: Upper case letter, Latin, Other, 006b, 212a -004c Letter: Upper case letter, Latin, Other, 006c -004d Letter: Upper case letter, Latin, Other, 006d -004e Letter: Upper case letter, Latin, Other, 006e -004f Letter: Upper case letter, Latin, Other, 006f +U+0040 Punctuation: Other punctuation, Common, Other +U+0041 Letter: Upper case letter, Latin, Other, U+0061 +U+0042 Letter: Upper case letter, Latin, Other, U+0062 +U+0043 Letter: Upper case letter, Latin, Other, U+0063 +U+0044 Letter: Upper case letter, Latin, Other, U+0064 +U+0045 Letter: Upper case letter, Latin, Other, U+0065 +U+0046 Letter: Upper case letter, Latin, Other, U+0066 +U+0047 Letter: Upper case letter, Latin, Other, U+0067 +U+0048 Letter: Upper case letter, Latin, Other, U+0068 +U+0049 Letter: Upper case letter, Latin, Other, U+0069 +U+004A Letter: Upper case letter, Latin, Other, U+006A +U+004B Letter: Upper case letter, Latin, Other, U+006B, U+212A +U+004C Letter: Upper case letter, Latin, Other, U+006C +U+004D Letter: Upper case letter, Latin, Other, U+006D +U+004E Letter: 
Upper case letter, Latin, Other, U+006E +U+004F Letter: Upper case letter, Latin, Other, U+006F findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f -0050 Letter: Upper case letter, Latin, Other, 0070 -0051 Letter: Upper case letter, Latin, Other, 0071 -0052 Letter: Upper case letter, Latin, Other, 0072 -0053 Letter: Upper case letter, Latin, Other, 0073, 017f -0054 Letter: Upper case letter, Latin, Other, 0074 -0055 Letter: Upper case letter, Latin, Other, 0075 -0056 Letter: Upper case letter, Latin, Other, 0076 -0057 Letter: Upper case letter, Latin, Other, 0077 -0058 Letter: Upper case letter, Latin, Other, 0078 -0059 Letter: Upper case letter, Latin, Other, 0079 -005a Letter: Upper case letter, Latin, Other, 007a -005b Punctuation: Open punctuation, Common, Other -005c Punctuation: Other punctuation, Common, Other -005d Punctuation: Close punctuation, Common, Other -005e Symbol: Modifier symbol, Common, Other -005f Punctuation: Connector punctuation, Common, Other +U+0050 Letter: Upper case letter, Latin, Other, U+0070 +U+0051 Letter: Upper case letter, Latin, Other, U+0071 +U+0052 Letter: Upper case letter, Latin, Other, U+0072 +U+0053 Letter: Upper case letter, Latin, Other, U+0073, U+017F +U+0054 Letter: Upper case letter, Latin, Other, U+0074 +U+0055 Letter: Upper case letter, Latin, Other, U+0075 +U+0056 Letter: Upper case letter, Latin, Other, U+0076 +U+0057 Letter: Upper case letter, Latin, Other, U+0077 +U+0058 Letter: Upper case letter, Latin, Other, U+0078 +U+0059 Letter: Upper case letter, Latin, Other, U+0079 +U+005A Letter: Upper case letter, Latin, Other, U+007A +U+005B Punctuation: Open punctuation, Common, Other +U+005C Punctuation: Other punctuation, Common, Other +U+005D Punctuation: Close punctuation, Common, Other +U+005E Symbol: Modifier symbol, Common, Other +U+005F Punctuation: Connector punctuation, Common, Other findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f -0060 Symbol: Modifier symbol, Common, Other -0061 Letter: Lower 
case letter, Latin, Other, 0041 -0062 Letter: Lower case letter, Latin, Other, 0042 -0063 Letter: Lower case letter, Latin, Other, 0043 -0064 Letter: Lower case letter, Latin, Other, 0044 -0065 Letter: Lower case letter, Latin, Other, 0045 -0066 Letter: Lower case letter, Latin, Other, 0046 -0067 Letter: Lower case letter, Latin, Other, 0047 -0068 Letter: Lower case letter, Latin, Other, 0048 -0069 Letter: Lower case letter, Latin, Other, 0049 -006a Letter: Lower case letter, Latin, Other, 004a -006b Letter: Lower case letter, Latin, Other, 004b, 212a -006c Letter: Lower case letter, Latin, Other, 004c -006d Letter: Lower case letter, Latin, Other, 004d -006e Letter: Lower case letter, Latin, Other, 004e -006f Letter: Lower case letter, Latin, Other, 004f +U+0060 Symbol: Modifier symbol, Common, Other +U+0061 Letter: Lower case letter, Latin, Other, U+0041 +U+0062 Letter: Lower case letter, Latin, Other, U+0042 +U+0063 Letter: Lower case letter, Latin, Other, U+0043 +U+0064 Letter: Lower case letter, Latin, Other, U+0044 +U+0065 Letter: Lower case letter, Latin, Other, U+0045 +U+0066 Letter: Lower case letter, Latin, Other, U+0046 +U+0067 Letter: Lower case letter, Latin, Other, U+0047 +U+0068 Letter: Lower case letter, Latin, Other, U+0048 +U+0069 Letter: Lower case letter, Latin, Other, U+0049 +U+006A Letter: Lower case letter, Latin, Other, U+004A +U+006B Letter: Lower case letter, Latin, Other, U+004B, U+212A +U+006C Letter: Lower case letter, Latin, Other, U+004C +U+006D Letter: Lower case letter, Latin, Other, U+004D +U+006E Letter: Lower case letter, Latin, Other, U+004E +U+006F Letter: Lower case letter, Latin, Other, U+004F findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f -0070 Letter: Lower case letter, Latin, Other, 0050 -0071 Letter: Lower case letter, Latin, Other, 0051 -0072 Letter: Lower case letter, Latin, Other, 0052 -0073 Letter: Lower case letter, Latin, Other, 0053, 017f -0074 Letter: Lower case letter, Latin, Other, 0054 -0075 Letter: 
Lower case letter, Latin, Other, 0055 -0076 Letter: Lower case letter, Latin, Other, 0056 -0077 Letter: Lower case letter, Latin, Other, 0057 -0078 Letter: Lower case letter, Latin, Other, 0058 -0079 Letter: Lower case letter, Latin, Other, 0059 -007a Letter: Lower case letter, Latin, Other, 005a -007b Punctuation: Open punctuation, Common, Other -007c Symbol: Mathematical symbol, Common, Other -007d Punctuation: Close punctuation, Common, Other -007e Symbol: Mathematical symbol, Common, Other -007f Control: Control, Common, Control +U+0070 Letter: Lower case letter, Latin, Other, U+0050 +U+0071 Letter: Lower case letter, Latin, Other, U+0051 +U+0072 Letter: Lower case letter, Latin, Other, U+0052 +U+0073 Letter: Lower case letter, Latin, Other, U+0053, U+017F +U+0074 Letter: Lower case letter, Latin, Other, U+0054 +U+0075 Letter: Lower case letter, Latin, Other, U+0055 +U+0076 Letter: Lower case letter, Latin, Other, U+0056 +U+0077 Letter: Lower case letter, Latin, Other, U+0057 +U+0078 Letter: Lower case letter, Latin, Other, U+0058 +U+0079 Letter: Lower case letter, Latin, Other, U+0059 +U+007A Letter: Lower case letter, Latin, Other, U+005A +U+007B Punctuation: Open punctuation, Common, Other +U+007C Symbol: Mathematical symbol, Common, Other +U+007D Punctuation: Close punctuation, Common, Other +U+007E Symbol: Mathematical symbol, Common, Other +U+007F Control: Control, Common, Control findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f -0080 Control: Control, Common, Control -0081 Control: Control, Common, Control -0082 Control: Control, Common, Control -0083 Control: Control, Common, Control -0084 Control: Control, Common, Control -0085 Control: Control, Common, Control -0086 Control: Control, Common, Control -0087 Control: Control, Common, Control -0088 Control: Control, Common, Control -0089 Control: Control, Common, Control -008a Control: Control, Common, Control -008b Control: Control, Common, Control -008c Control: Control, Common, Control -008d 
Control: Control, Common, Control -008e Control: Control, Common, Control -008f Control: Control, Common, Control +U+0080 Control: Control, Common, Control +U+0081 Control: Control, Common, Control +U+0082 Control: Control, Common, Control +U+0083 Control: Control, Common, Control +U+0084 Control: Control, Common, Control +U+0085 Control: Control, Common, Control +U+0086 Control: Control, Common, Control +U+0087 Control: Control, Common, Control +U+0088 Control: Control, Common, Control +U+0089 Control: Control, Common, Control +U+008A Control: Control, Common, Control +U+008B Control: Control, Common, Control +U+008C Control: Control, Common, Control +U+008D Control: Control, Common, Control +U+008E Control: Control, Common, Control +U+008F Control: Control, Common, Control findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f -0090 Control: Control, Common, Control -0091 Control: Control, Common, Control -0092 Control: Control, Common, Control -0093 Control: Control, Common, Control -0094 Control: Control, Common, Control -0095 Control: Control, Common, Control -0096 Control: Control, Common, Control -0097 Control: Control, Common, Control -0098 Control: Control, Common, Control -0099 Control: Control, Common, Control -009a Control: Control, Common, Control -009b Control: Control, Common, Control -009c Control: Control, Common, Control -009d Control: Control, Common, Control -009e Control: Control, Common, Control -009f Control: Control, Common, Control +U+0090 Control: Control, Common, Control +U+0091 Control: Control, Common, Control +U+0092 Control: Control, Common, Control +U+0093 Control: Control, Common, Control +U+0094 Control: Control, Common, Control +U+0095 Control: Control, Common, Control +U+0096 Control: Control, Common, Control +U+0097 Control: Control, Common, Control +U+0098 Control: Control, Common, Control +U+0099 Control: Control, Common, Control +U+009A Control: Control, Common, Control +U+009B Control: Control, Common, Control +U+009C 
Control: Control, Common, Control +U+009D Control: Control, Common, Control +U+009E Control: Control, Common, Control +U+009F Control: Control, Common, Control findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af -00a0 Separator: Space separator, Common, Other -00a1 Punctuation: Other punctuation, Common, Other -00a2 Symbol: Currency symbol, Common, Other -00a3 Symbol: Currency symbol, Common, Other -00a4 Symbol: Currency symbol, Common, Other -00a5 Symbol: Currency symbol, Common, Other -00a6 Symbol: Other symbol, Common, Other -00a7 Punctuation: Other punctuation, Common, Other -00a8 Symbol: Modifier symbol, Common, Other -00a9 Symbol: Other symbol, Common, Extended Pictographic -00aa Letter: Other letter, Latin, Other -00ab Punctuation: Initial punctuation, Common, Other -00ac Symbol: Mathematical symbol, Common, Other -00ad Control: Format, Common, Control -00ae Symbol: Other symbol, Common, Extended Pictographic -00af Symbol: Modifier symbol, Common, Other +U+00A0 Separator: Space separator, Common, Other +U+00A1 Punctuation: Other punctuation, Common, Other +U+00A2 Symbol: Currency symbol, Common, Other +U+00A3 Symbol: Currency symbol, Common, Other +U+00A4 Symbol: Currency symbol, Common, Other +U+00A5 Symbol: Currency symbol, Common, Other +U+00A6 Symbol: Other symbol, Common, Other +U+00A7 Punctuation: Other punctuation, Common, Other +U+00A8 Symbol: Modifier symbol, Common, Other +U+00A9 Symbol: Other symbol, Common, Extended Pictographic +U+00AA Letter: Other letter, Latin, Other +U+00AB Punctuation: Initial punctuation, Common, Other +U+00AC Symbol: Mathematical symbol, Common, Other +U+00AD Control: Format, Common, Control +U+00AE Symbol: Other symbol, Common, Extended Pictographic +U+00AF Symbol: Modifier symbol, Common, Other findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf -00b0 Symbol: Other symbol, Common, Other -00b1 Symbol: Mathematical symbol, Common, Other -00b2 Number: Other number, Common, Other -00b3 Number: Other number, 
Common, Other -00b4 Symbol: Modifier symbol, Common, Other -00b5 Letter: Lower case letter, Common, Other, 03bc, 039c -00b6 Punctuation: Other punctuation, Common, Other -00b7 Punctuation: Other punctuation, Common, Other -00b8 Symbol: Modifier symbol, Common, Other -00b9 Number: Other number, Common, Other -00ba Letter: Other letter, Latin, Other -00bb Punctuation: Final punctuation, Common, Other -00bc Number: Other number, Common, Other -00bd Number: Other number, Common, Other -00be Number: Other number, Common, Other -00bf Punctuation: Other punctuation, Common, Other +U+00B0 Symbol: Other symbol, Common, Other +U+00B1 Symbol: Mathematical symbol, Common, Other +U+00B2 Number: Other number, Common, Other +U+00B3 Number: Other number, Common, Other +U+00B4 Symbol: Modifier symbol, Common, Other +U+00B5 Letter: Lower case letter, Common, Other, U+03BC, U+039C +U+00B6 Punctuation: Other punctuation, Common, Other +U+00B7 Punctuation: Other punctuation, Common, Other +U+00B8 Symbol: Modifier symbol, Common, Other +U+00B9 Number: Other number, Common, Other +U+00BA Letter: Other letter, Latin, Other +U+00BB Punctuation: Final punctuation, Common, Other +U+00BC Number: Other number, Common, Other +U+00BD Number: Other number, Common, Other +U+00BE Number: Other number, Common, Other +U+00BF Punctuation: Other punctuation, Common, Other findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf -00c0 Letter: Upper case letter, Latin, Other, 00e0 -00c1 Letter: Upper case letter, Latin, Other, 00e1 -00c2 Letter: Upper case letter, Latin, Other, 00e2 -00c3 Letter: Upper case letter, Latin, Other, 00e3 -00c4 Letter: Upper case letter, Latin, Other, 00e4 -00c5 Letter: Upper case letter, Latin, Other, 00e5, 212b -00c6 Letter: Upper case letter, Latin, Other, 00e6 -00c7 Letter: Upper case letter, Latin, Other, 00e7 -00c8 Letter: Upper case letter, Latin, Other, 00e8 -00c9 Letter: Upper case letter, Latin, Other, 00e9 -00ca Letter: Upper case letter, Latin, Other, 00ea -00cb 
Letter: Upper case letter, Latin, Other, 00eb -00cc Letter: Upper case letter, Latin, Other, 00ec -00cd Letter: Upper case letter, Latin, Other, 00ed -00ce Letter: Upper case letter, Latin, Other, 00ee -00cf Letter: Upper case letter, Latin, Other, 00ef +U+00C0 Letter: Upper case letter, Latin, Other, U+00E0 +U+00C1 Letter: Upper case letter, Latin, Other, U+00E1 +U+00C2 Letter: Upper case letter, Latin, Other, U+00E2 +U+00C3 Letter: Upper case letter, Latin, Other, U+00E3 +U+00C4 Letter: Upper case letter, Latin, Other, U+00E4 +U+00C5 Letter: Upper case letter, Latin, Other, U+00E5, U+212B +U+00C6 Letter: Upper case letter, Latin, Other, U+00E6 +U+00C7 Letter: Upper case letter, Latin, Other, U+00E7 +U+00C8 Letter: Upper case letter, Latin, Other, U+00E8 +U+00C9 Letter: Upper case letter, Latin, Other, U+00E9 +U+00CA Letter: Upper case letter, Latin, Other, U+00EA +U+00CB Letter: Upper case letter, Latin, Other, U+00EB +U+00CC Letter: Upper case letter, Latin, Other, U+00EC +U+00CD Letter: Upper case letter, Latin, Other, U+00ED +U+00CE Letter: Upper case letter, Latin, Other, U+00EE +U+00CF Letter: Upper case letter, Latin, Other, U+00EF findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df -00d0 Letter: Upper case letter, Latin, Other, 00f0 -00d1 Letter: Upper case letter, Latin, Other, 00f1 -00d2 Letter: Upper case letter, Latin, Other, 00f2 -00d3 Letter: Upper case letter, Latin, Other, 00f3 -00d4 Letter: Upper case letter, Latin, Other, 00f4 -00d5 Letter: Upper case letter, Latin, Other, 00f5 -00d6 Letter: Upper case letter, Latin, Other, 00f6 -00d7 Symbol: Mathematical symbol, Common, Other -00d8 Letter: Upper case letter, Latin, Other, 00f8 -00d9 Letter: Upper case letter, Latin, Other, 00f9 -00da Letter: Upper case letter, Latin, Other, 00fa -00db Letter: Upper case letter, Latin, Other, 00fb -00dc Letter: Upper case letter, Latin, Other, 00fc -00dd Letter: Upper case letter, Latin, Other, 00fd -00de Letter: Upper case letter, Latin, Other, 00fe -00df 
Letter: Lower case letter, Latin, Other, 1e9e +U+00D0 Letter: Upper case letter, Latin, Other, U+00F0 +U+00D1 Letter: Upper case letter, Latin, Other, U+00F1 +U+00D2 Letter: Upper case letter, Latin, Other, U+00F2 +U+00D3 Letter: Upper case letter, Latin, Other, U+00F3 +U+00D4 Letter: Upper case letter, Latin, Other, U+00F4 +U+00D5 Letter: Upper case letter, Latin, Other, U+00F5 +U+00D6 Letter: Upper case letter, Latin, Other, U+00F6 +U+00D7 Symbol: Mathematical symbol, Common, Other +U+00D8 Letter: Upper case letter, Latin, Other, U+00F8 +U+00D9 Letter: Upper case letter, Latin, Other, U+00F9 +U+00DA Letter: Upper case letter, Latin, Other, U+00FA +U+00DB Letter: Upper case letter, Latin, Other, U+00FB +U+00DC Letter: Upper case letter, Latin, Other, U+00FC +U+00DD Letter: Upper case letter, Latin, Other, U+00FD +U+00DE Letter: Upper case letter, Latin, Other, U+00FE +U+00DF Letter: Lower case letter, Latin, Other, U+1E9E findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef -00e0 Letter: Lower case letter, Latin, Other, 00c0 -00e1 Letter: Lower case letter, Latin, Other, 00c1 -00e2 Letter: Lower case letter, Latin, Other, 00c2 -00e3 Letter: Lower case letter, Latin, Other, 00c3 -00e4 Letter: Lower case letter, Latin, Other, 00c4 -00e5 Letter: Lower case letter, Latin, Other, 00c5, 212b -00e6 Letter: Lower case letter, Latin, Other, 00c6 -00e7 Letter: Lower case letter, Latin, Other, 00c7 -00e8 Letter: Lower case letter, Latin, Other, 00c8 -00e9 Letter: Lower case letter, Latin, Other, 00c9 -00ea Letter: Lower case letter, Latin, Other, 00ca -00eb Letter: Lower case letter, Latin, Other, 00cb -00ec Letter: Lower case letter, Latin, Other, 00cc -00ed Letter: Lower case letter, Latin, Other, 00cd -00ee Letter: Lower case letter, Latin, Other, 00ce -00ef Letter: Lower case letter, Latin, Other, 00cf +U+00E0 Letter: Lower case letter, Latin, Other, U+00C0 +U+00E1 Letter: Lower case letter, Latin, Other, U+00C1 +U+00E2 Letter: Lower case letter, Latin, Other, U+00C2 
+U+00E3 Letter: Lower case letter, Latin, Other, U+00C3 +U+00E4 Letter: Lower case letter, Latin, Other, U+00C4 +U+00E5 Letter: Lower case letter, Latin, Other, U+00C5, U+212B +U+00E6 Letter: Lower case letter, Latin, Other, U+00C6 +U+00E7 Letter: Lower case letter, Latin, Other, U+00C7 +U+00E8 Letter: Lower case letter, Latin, Other, U+00C8 +U+00E9 Letter: Lower case letter, Latin, Other, U+00C9 +U+00EA Letter: Lower case letter, Latin, Other, U+00CA +U+00EB Letter: Lower case letter, Latin, Other, U+00CB +U+00EC Letter: Lower case letter, Latin, Other, U+00CC +U+00ED Letter: Lower case letter, Latin, Other, U+00CD +U+00EE Letter: Lower case letter, Latin, Other, U+00CE +U+00EF Letter: Lower case letter, Latin, Other, U+00CF findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff -00f0 Letter: Lower case letter, Latin, Other, 00d0 -00f1 Letter: Lower case letter, Latin, Other, 00d1 -00f2 Letter: Lower case letter, Latin, Other, 00d2 -00f3 Letter: Lower case letter, Latin, Other, 00d3 -00f4 Letter: Lower case letter, Latin, Other, 00d4 -00f5 Letter: Lower case letter, Latin, Other, 00d5 -00f6 Letter: Lower case letter, Latin, Other, 00d6 -00f7 Symbol: Mathematical symbol, Common, Other -00f8 Letter: Lower case letter, Latin, Other, 00d8 -00f9 Letter: Lower case letter, Latin, Other, 00d9 -00fa Letter: Lower case letter, Latin, Other, 00da -00fb Letter: Lower case letter, Latin, Other, 00db -00fc Letter: Lower case letter, Latin, Other, 00dc -00fd Letter: Lower case letter, Latin, Other, 00dd -00fe Letter: Lower case letter, Latin, Other, 00de -00ff Letter: Lower case letter, Latin, Other, 0178 +U+00F0 Letter: Lower case letter, Latin, Other, U+00D0 +U+00F1 Letter: Lower case letter, Latin, Other, U+00D1 +U+00F2 Letter: Lower case letter, Latin, Other, U+00D2 +U+00F3 Letter: Lower case letter, Latin, Other, U+00D3 +U+00F4 Letter: Lower case letter, Latin, Other, U+00D4 +U+00F5 Letter: Lower case letter, Latin, Other, U+00D5 +U+00F6 Letter: Lower case letter, Latin, 
Other, U+00D6 +U+00F7 Symbol: Mathematical symbol, Common, Other +U+00F8 Letter: Lower case letter, Latin, Other, U+00D8 +U+00F9 Letter: Lower case letter, Latin, Other, U+00D9 +U+00FA Letter: Lower case letter, Latin, Other, U+00DA +U+00FB Letter: Lower case letter, Latin, Other, U+00DB +U+00FC Letter: Lower case letter, Latin, Other, U+00DC +U+00FD Letter: Lower case letter, Latin, Other, U+00DD +U+00FE Letter: Lower case letter, Latin, Other, U+00DE +U+00FF Letter: Lower case letter, Latin, Other, U+0178 findprop 0100 0101 0102 0103 0104 0105 0106 -0100 Letter: Upper case letter, Latin, Other, 0101 -0101 Letter: Lower case letter, Latin, Other, 0100 -0102 Letter: Upper case letter, Latin, Other, 0103 -0103 Letter: Lower case letter, Latin, Other, 0102 -0104 Letter: Upper case letter, Latin, Other, 0105 -0105 Letter: Lower case letter, Latin, Other, 0104 -0106 Letter: Upper case letter, Latin, Other, 0107 +U+0100 Letter: Upper case letter, Latin, Other, U+0101 +U+0101 Letter: Lower case letter, Latin, Other, U+0100 +U+0102 Letter: Upper case letter, Latin, Other, U+0103 +U+0103 Letter: Lower case letter, Latin, Other, U+0102 +U+0104 Letter: Upper case letter, Latin, Other, U+0105 +U+0105 Letter: Lower case letter, Latin, Other, U+0104 +U+0106 Letter: Upper case letter, Latin, Other, U+0107 findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7 -ffe0 Symbol: Currency symbol, Common, Other -ffe1 Symbol: Currency symbol, Common, Other -ffe2 Symbol: Mathematical symbol, Common, Other -ffe3 Symbol: Modifier symbol, Common, Other -ffe4 Symbol: Other symbol, Common, Other -ffe5 Symbol: Currency symbol, Common, Other -ffe6 Symbol: Currency symbol, Common, Other -ffe7 Control: Unassigned, Unknown, Other +U+FFE0 Symbol: Currency symbol, Common, Other +U+FFE1 Symbol: Currency symbol, Common, Other +U+FFE2 Symbol: Mathematical symbol, Common, Other +U+FFE3 Symbol: Modifier symbol, Common, Other +U+FFE4 Symbol: Other symbol, Common, Other +U+FFE5 Symbol: Currency symbol, Common, 
Other +U+FFE6 Symbol: Currency symbol, Common, Other +U+FFE7 Control: Unassigned, Unknown, Other findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef -ffe8 Symbol: Other symbol, Common, Other -ffe9 Symbol: Mathematical symbol, Common, Other -ffea Symbol: Mathematical symbol, Common, Other -ffeb Symbol: Mathematical symbol, Common, Other -ffec Symbol: Mathematical symbol, Common, Other -ffed Symbol: Other symbol, Common, Other -ffee Symbol: Other symbol, Common, Other -ffef Control: Unassigned, Unknown, Other +U+FFE8 Symbol: Other symbol, Common, Other +U+FFE9 Symbol: Mathematical symbol, Common, Other +U+FFEA Symbol: Mathematical symbol, Common, Other +U+FFEB Symbol: Mathematical symbol, Common, Other +U+FFEC Symbol: Mathematical symbol, Common, Other +U+FFED Symbol: Other symbol, Common, Other +U+FFEE Symbol: Other symbol, Common, Other +U+FFEF Control: Unassigned, Unknown, Other findprop fff8 fff9 fffa fffb fffc fffd fffe ffff -fff8 Control: Unassigned, Unknown, Control -fff9 Control: Format, Common, Control -fffa Control: Format, Common, Control -fffb Control: Format, Common, Control -fffc Symbol: Other symbol, Common, Other -fffd Symbol: Other symbol, Common, Other -fffe Control: Unassigned, Unknown, Other -ffff Control: Unassigned, Unknown, Other +U+FFF8 Control: Unassigned, Unknown, Control +U+FFF9 Control: Format, Common, Control +U+FFFA Control: Format, Common, Control +U+FFFB Control: Format, Common, Control +U+FFFC Symbol: Other symbol, Common, Other +U+FFFD Symbol: Other symbol, Common, Other +U+FFFE Control: Unassigned, Unknown, Other +U+FFFF Control: Unassigned, Unknown, Other findprop 10000 10001 e01ef f0000 100000 -10000 Letter: Other letter, Linear_B, Other -10001 Letter: Other letter, Linear_B, Other -e01ef Mark: Non-spacing mark, Inherited, Extend -f0000 Control: Private use, Unknown, Other -100000 Control: Private use, Unknown, Other +U+10000 Letter: Other letter, Linear_B, Other +U+10001 Letter: Other letter, Linear_B, Other +U+E01EF Mark: 
Non-spacing mark, Inherited, Extend +U+F0000 Control: Private use, Unknown, Other +U+100000 Control: Private use, Unknown, Other findprop 1b00 12000 7c0 a840 10900 -1b00 Mark: Non-spacing mark, Balinese, Extend -12000 Letter: Other letter, Cuneiform, Other -07c0 Number: Decimal number, Nko, Other -a840 Letter: Other letter, Phags_Pa, Other -10900 Letter: Other letter, Phoenician, Other +U+1B00 Mark: Non-spacing mark, Balinese, Extend +U+12000 Letter: Other letter, Cuneiform, Other +U+07C0 Number: Decimal number, Nko, Other +U+A840 Letter: Other letter, Phags_Pa, Other +U+10900 Letter: Other letter, Phoenician, Other findprop 1d79 a77d -1d79 Letter: Lower case letter, Latin, Other, a77d -a77d Letter: Upper case letter, Latin, Other, 1d79 +U+1D79 Letter: Lower case letter, Latin, Other, U+A77D +U+A77D Letter: Upper case letter, Latin, Other, U+1D79 findprop 0800 083e a4d0 a4f7 aa80 aadf -0800 Letter: Other letter, Samaritan, Other -083e Punctuation: Other punctuation, Samaritan, Other -a4d0 Letter: Other letter, Lisu, Other -a4f7 Letter: Other letter, Lisu, Other -aa80 Letter: Other letter, Tai_Viet, Other -aadf Punctuation: Other punctuation, Tai_Viet, Other +U+0800 Letter: Other letter, Samaritan, Other +U+083E Punctuation: Other punctuation, Samaritan, Other +U+A4D0 Letter: Other letter, Lisu, Other +U+A4F7 Letter: Other letter, Lisu, Other +U+AA80 Letter: Other letter, Tai_Viet, Other +U+AADF Punctuation: Other punctuation, Tai_Viet, Other findprop 10b00 10b35 13000 1342e 10840 10855 -10b00 Letter: Other letter, Avestan, Other -10b35 Letter: Other letter, Avestan, Other -13000 Letter: Other letter, Egyptian_Hieroglyphs, Other -1342e Letter: Other letter, Egyptian_Hieroglyphs, Other -10840 Letter: Other letter, Imperial_Aramaic, Other -10855 Letter: Other letter, Imperial_Aramaic, Other +U+10B00 Letter: Other letter, Avestan, Other +U+10B35 Letter: Other letter, Avestan, Other +U+13000 Letter: Other letter, Egyptian_Hieroglyphs, Other +U+1342E Letter: Other 
letter, Egyptian_Hieroglyphs, Other +U+10840 Letter: Other letter, Imperial_Aramaic, Other +U+10855 Letter: Other letter, Imperial_Aramaic, Other findprop 11100 1113c 11680 116c0 -11100 Mark: Non-spacing mark, Chakma, Extend -1113c Number: Decimal number, Chakma, Other -11680 Letter: Other letter, Takri, Other -116c0 Number: Decimal number, Takri, Other +U+11100 Mark: Non-spacing mark, Chakma, Extend +U+1113C Number: Decimal number, Chakma, Other +U+11680 Letter: Other letter, Takri, Other +U+116C0 Number: Decimal number, Takri, Other findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89 -000d Control: Control, Common, CR -000a Control: Control, Common, LF -000e Control: Control, Common, Control -0711 Mark: Non-spacing mark, Syriac, Extend -1b04 Mark: Spacing mark, Balinese, SpacingMark -1111 Letter: Other letter, Hangul, Hangul syllable type L -1169 Letter: Other letter, Hangul, Hangul syllable type V -11fe Letter: Other letter, Hangul, Hangul syllable type T -ae4c Letter: Other letter, Hangul, Hangul syllable type LV -ad89 Letter: Other letter, Hangul, Hangul syllable type LVT +U+000D Control: Control, Common, CR +U+000A Control: Control, Common, LF +U+000E Control: Control, Common, Control +U+0711 Mark: Non-spacing mark, Syriac, Extend +U+1B04 Mark: Spacing mark, Balinese, SpacingMark +U+1111 Letter: Other letter, Hangul, Hangul syllable type L +U+1169 Letter: Other letter, Hangul, Hangul syllable type V +U+11FE Letter: Other letter, Hangul, Hangul syllable type T +U+AE4C Letter: Other letter, Hangul, Hangul syllable type LV +U+AD89 Letter: Other letter, Hangul, Hangul syllable type LVT findprop 118a0 11ac7 16ad0 -118a0 Letter: Upper case letter, Warang_Citi, Other, 118c0 -11ac7 Letter: Other letter, Pau_Cin_Hau, Other -16ad0 Letter: Other letter, Bassa_Vah, Other +U+118A0 Letter: Upper case letter, Warang_Citi, Other, U+118C0 +U+11AC7 Letter: Other letter, Pau_Cin_Hau, Other +U+16AD0 Letter: Other letter, Bassa_Vah, Other findprop 11700 14400 108e0 11280 1d800 
-11700 Letter: Other letter, Ahom, Other -14400 Letter: Other letter, Anatolian_Hieroglyphs, Other -108e0 Letter: Other letter, Hatran, Other -11280 Letter: Other letter, Multani, Other -1d800 Symbol: Other symbol, SignWriting, Other +U+11700 Letter: Other letter, Ahom, Other +U+14400 Letter: Other letter, Anatolian_Hieroglyphs, Other +U+108E0 Letter: Other letter, Hatran, Other +U+11280 Letter: Other letter, Multani, Other +U+1D800 Symbol: Other symbol, SignWriting, Other findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30 -11800 Letter: Other letter, Dogra, Other -1e903 Letter: Upper case letter, Adlam, Other, 1e925 -11da9 Number: Decimal number, Gunjala_Gondi, Other -10d27 Mark: Non-spacing mark, Hanifi_Rohingya, Extend -11ee0 Letter: Other letter, Makasar, Other -16e48 Letter: Upper case letter, Medefaidrin, Other, 16e68 -10f27 Letter: Other letter, Old_Sogdian, Other -10f30 Letter: Other letter, Sogdian, Other +U+11800 Letter: Other letter, Dogra, Other +U+1E903 Letter: Upper case letter, Adlam, Other, U+1E925 +U+11DA9 Number: Decimal number, Gunjala_Gondi, Other +U+10D27 Mark: Non-spacing mark, Hanifi_Rohingya, Extend +U+11EE0 Letter: Other letter, Makasar, Other +U+16E48 Letter: Upper case letter, Medefaidrin, Other, U+16E68 +U+10F27 Letter: Other letter, Old_Sogdian, Other +U+10F30 Letter: Other letter, Sogdian, Other findprop a836 a833 1cf4 20f0 1cd0 -a836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta] -a833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta] -1cf4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada] -20f0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin] -1cd0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada] +U+A836 Symbol: Other symbol, Common, Other, [Devanagari, 
Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta] +U+A833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta] +U+1CF4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada] +U+20F0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin] +U+1CD0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada] findprop 32ff -32ff Symbol: Other symbol, Common, Other, [Han] +U+32FF Symbol: Other symbol, Common, Other, [Han] findprop 1f16d -1f16d Symbol: Other symbol, Common, Extended Pictographic +U+1F16D Symbol: Other symbol, Common, Extended Pictographic -findprop 10e93 10eaa -10e93 Letter: Other letter, Yezidi, Other -10eaa Control: Unassigned, Unknown, Other +findprop U+10e93 U+10eaa +U+10E93 Letter: Other letter, Yezidi, Other +U+10EAA Control: Unassigned, Unknown, Other diff --git a/maint/ucptestdata/testoutput2 b/maint/ucptestdata/testoutput2 new file mode 100644 index 0000000..b0689f4 --- /dev/null +++ b/maint/ucptestdata/testoutput2 @@ -0,0 +1,188 @@ +find script Han +U+2E80..U+2E99 Symbol: Other symbol, Han, Other +U+2E9B..U+2EF3 Symbol: Other symbol, Han, Other +U+2F00..U+2FD5 Symbol: Other symbol, Han, Other + U+3005 Letter: Modifier letter, Han, Other + U+3007 Number: Letter number, Han, Other +U+3021..U+3029 Number: Letter number, Han, Other +U+3038..U+303A Number: Letter number, Han, Other + U+303B Letter: Modifier letter, Han, Other +U+3400..U+4DBF Letter: Other letter, Han, Other +U+4E00..U+9FFC Letter: Other letter, Han, Other +U+F900..U+FA6D Letter: Other letter, Han, Other +U+FA70..U+FAD9 Letter: Other letter, Han, Other +U+16FF0..U+16FF1 Mark: Spacing mark, Han, SpacingMark +U+20000..U+2A6DD Letter: Other letter, Han, Other +U+2A700..U+2B734 Letter: Other letter, Han, Other +U+2B740..U+2B81D Letter: Other letter, Han, Other +U+2B820..U+2CEA1 Letter: Other 
letter, Han, Other +U+2CEB0..U+2EBE0 Letter: Other letter, Han, Other +U+2F800..U+2FA1D Letter: Other letter, Han, Other +U+30000..U+3134A Letter: Other letter, Han, Other +find type Pe script Common scriptx Hangul +U+3009 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi] +U+300B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi] +U+300D Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi] +U+300F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi] +U+3011 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi] +U+3015 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi] +U+3017 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi] +U+3019 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi] +U+301B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi] +U+301E..U+301F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana] + U+FF63 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi] +find type Sk +U+005E Symbol: Modifier symbol, Common, Other +U+0060 Symbol: Modifier symbol, Common, Other +U+00A8 Symbol: Modifier symbol, Common, Other +U+00AF Symbol: Modifier symbol, Common, Other +U+00B4 Symbol: Modifier symbol, Common, Other +U+00B8 Symbol: Modifier symbol, Common, Other +U+02C2..U+02C5 Symbol: Modifier symbol, Common, Other +U+02D2..U+02DF Symbol: Modifier symbol, Common, Other +U+02E5..U+02E9 Symbol: Modifier symbol, Common, Other +U+02EA..U+02EB Symbol: Modifier symbol, Bopomofo, Other + U+02ED Symbol: Modifier symbol, Common, Other +U+02EF..U+02FF Symbol: Modifier symbol, Common, Other + U+0375 Symbol: Modifier 
symbol, Greek, Other + U+0384 Symbol: Modifier symbol, Greek, Other + U+0385 Symbol: Modifier symbol, Common, Other + U+1FBD Symbol: Modifier symbol, Greek, Other +U+1FBF..U+1FC1 Symbol: Modifier symbol, Greek, Other +U+1FCD..U+1FCF Symbol: Modifier symbol, Greek, Other +U+1FDD..U+1FDF Symbol: Modifier symbol, Greek, Other +U+1FED..U+1FEF Symbol: Modifier symbol, Greek, Other +U+1FFD..U+1FFE Symbol: Modifier symbol, Greek, Other +U+309B..U+309C Symbol: Modifier symbol, Common, Other, [Hiragana, Katakana] +U+A700..U+A707 Symbol: Modifier symbol, Common, Other, [Han, Latin] +U+A708..U+A716 Symbol: Modifier symbol, Common, Other +U+A720..U+A721 Symbol: Modifier symbol, Common, Other +U+A789..U+A78A Symbol: Modifier symbol, Common, Other + U+AB5B Symbol: Modifier symbol, Common, Other +U+AB6A..U+AB6B Symbol: Modifier symbol, Common, Other +U+FBB2..U+FBC1 Symbol: Modifier symbol, Arabic, Other + U+FF3E Symbol: Modifier symbol, Common, Other + U+FF40 Symbol: Modifier symbol, Common, Other + U+FFE3 Symbol: Modifier symbol, Common, Other +U+1F3FB..U+1F3FF Symbol: Modifier symbol, Common, Extend +find type Pd +U+002D Punctuation: Dash punctuation, Common, Other +U+058A Punctuation: Dash punctuation, Armenian, Other +U+05BE Punctuation: Dash punctuation, Hebrew, Other +U+1400 Punctuation: Dash punctuation, Canadian_Aboriginal, Other +U+1806 Punctuation: Dash punctuation, Mongolian, Other +U+2010..U+2015 Punctuation: Dash punctuation, Common, Other + U+2E17 Punctuation: Dash punctuation, Common, Other + U+2E1A Punctuation: Dash punctuation, Common, Other +U+2E3A..U+2E3B Punctuation: Dash punctuation, Common, Other + U+2E40 Punctuation: Dash punctuation, Common, Other + U+301C Punctuation: Dash punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana] + U+3030 Punctuation: Dash punctuation, Common, Extended Pictographic, [Bopomofo, Hangul, Han, Hiragana, Katakana] + U+30A0 Punctuation: Dash punctuation, Common, Other, [Hiragana, Katakana] +U+FE31..U+FE32 
Punctuation: Dash punctuation, Common, Other + U+FE58 Punctuation: Dash punctuation, Common, Other + U+FE63 Punctuation: Dash punctuation, Common, Other + U+FF0D Punctuation: Dash punctuation, Common, Other + U+10EAD Punctuation: Dash punctuation, Yezidi, Other +find gbreak LVT +U+AC01..U+AC1B Letter: Other letter, Hangul, Hangul syllable type LVT +U+AC1D..U+AC37 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AC39..U+AC53 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AC55..U+AC6F Letter: Other letter, Hangul, Hangul syllable type LVT +U+AC71..U+AC8B Letter: Other letter, Hangul, Hangul syllable type LVT +U+AC8D..U+ACA7 Letter: Other letter, Hangul, Hangul syllable type LVT +U+ACA9..U+ACC3 Letter: Other letter, Hangul, Hangul syllable type LVT +U+ACC5..U+ACDF Letter: Other letter, Hangul, Hangul syllable type LVT +U+ACE1..U+ACFB Letter: Other letter, Hangul, Hangul syllable type LVT +U+ACFD..U+AD17 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AD19..U+AD33 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AD35..U+AD4F Letter: Other letter, Hangul, Hangul syllable type LVT +U+AD51..U+AD6B Letter: Other letter, Hangul, Hangul syllable type LVT +U+AD6D..U+AD87 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AD89..U+ADA3 Letter: Other letter, Hangul, Hangul syllable type LVT +U+ADA5..U+ADBF Letter: Other letter, Hangul, Hangul syllable type LVT +U+ADC1..U+ADDB Letter: Other letter, Hangul, Hangul syllable type LVT +U+ADDD..U+ADF7 Letter: Other letter, Hangul, Hangul syllable type LVT +U+ADF9..U+AE13 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AE15..U+AE2F Letter: Other letter, Hangul, Hangul syllable type LVT +U+AE31..U+AE4B Letter: Other letter, Hangul, Hangul syllable type LVT +U+AE4D..U+AE67 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AE69..U+AE83 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AE85..U+AE9F Letter: Other letter, Hangul, Hangul syllable type LVT +U+AEA1..U+AEBB 
Letter: Other letter, Hangul, Hangul syllable type LVT +U+AEBD..U+AED7 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AED9..U+AEF3 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AEF5..U+AF0F Letter: Other letter, Hangul, Hangul syllable type LVT +U+AF11..U+AF2B Letter: Other letter, Hangul, Hangul syllable type LVT +U+AF2D..U+AF47 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AF49..U+AF63 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AF65..U+AF7F Letter: Other letter, Hangul, Hangul syllable type LVT +U+AF81..U+AF9B Letter: Other letter, Hangul, Hangul syllable type LVT +U+AF9D..U+AFB7 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AFB9..U+AFD3 Letter: Other letter, Hangul, Hangul syllable type LVT +U+AFD5..U+AFEF Letter: Other letter, Hangul, Hangul syllable type LVT +U+AFF1..U+B00B Letter: Other letter, Hangul, Hangul syllable type LVT +U+B00D..U+B027 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B029..U+B043 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B045..U+B05F Letter: Other letter, Hangul, Hangul syllable type LVT +U+B061..U+B07B Letter: Other letter, Hangul, Hangul syllable type LVT +U+B07D..U+B097 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B099..U+B0B3 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B0B5..U+B0CF Letter: Other letter, Hangul, Hangul syllable type LVT +U+B0D1..U+B0EB Letter: Other letter, Hangul, Hangul syllable type LVT +U+B0ED..U+B107 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B109..U+B123 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B125..U+B13F Letter: Other letter, Hangul, Hangul syllable type LVT +U+B141..U+B15B Letter: Other letter, Hangul, Hangul syllable type LVT +U+B15D..U+B177 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B179..U+B193 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B195..U+B1AF Letter: Other letter, Hangul, Hangul syllable type LVT +U+B1B1..U+B1CB Letter: 
Other letter, Hangul, Hangul syllable type LVT +U+B1CD..U+B1E7 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B1E9..U+B203 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B205..U+B21F Letter: Other letter, Hangul, Hangul syllable type LVT +U+B221..U+B23B Letter: Other letter, Hangul, Hangul syllable type LVT +U+B23D..U+B257 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B259..U+B273 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B275..U+B28F Letter: Other letter, Hangul, Hangul syllable type LVT +U+B291..U+B2AB Letter: Other letter, Hangul, Hangul syllable type LVT +U+B2AD..U+B2C7 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B2C9..U+B2E3 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B2E5..U+B2FF Letter: Other letter, Hangul, Hangul syllable type LVT +U+B301..U+B31B Letter: Other letter, Hangul, Hangul syllable type LVT +U+B31D..U+B337 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B339..U+B353 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B355..U+B36F Letter: Other letter, Hangul, Hangul syllable type LVT +U+B371..U+B38B Letter: Other letter, Hangul, Hangul syllable type LVT +U+B38D..U+B3A7 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B3A9..U+B3C3 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B3C5..U+B3DF Letter: Other letter, Hangul, Hangul syllable type LVT +U+B3E1..U+B3FB Letter: Other letter, Hangul, Hangul syllable type LVT +U+B3FD..U+B417 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B419..U+B433 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B435..U+B44F Letter: Other letter, Hangul, Hangul syllable type LVT +U+B451..U+B46B Letter: Other letter, Hangul, Hangul syllable type LVT +U+B46D..U+B487 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B489..U+B4A3 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B4A5..U+B4BF Letter: Other letter, Hangul, Hangul syllable type LVT +U+B4C1..U+B4DB Letter: Other 
letter, Hangul, Hangul syllable type LVT +U+B4DD..U+B4F7 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B4F9..U+B513 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B515..U+B52F Letter: Other letter, Hangul, Hangul syllable type LVT +U+B531..U+B54B Letter: Other letter, Hangul, Hangul syllable type LVT +U+B54D..U+B567 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B569..U+B583 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B585..U+B59F Letter: Other letter, Hangul, Hangul syllable type LVT +U+B5A1..U+B5BB Letter: Other letter, Hangul, Hangul syllable type LVT +U+B5BD..U+B5D7 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B5D9..U+B5F3 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B5F5..U+B60F Letter: Other letter, Hangul, Hangul syllable type LVT +U+B611..U+B62B Letter: Other letter, Hangul, Hangul syllable type LVT +U+B62D..U+B647 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B649..U+B663 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B665..U+B67F Letter: Other letter, Hangul, Hangul syllable type LVT +U+B681..U+B69B Letter: Other letter, Hangul, Hangul syllable type LVT +U+B69D..U+B6B7 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B6B9..U+B6D3 Letter: Other letter, Hangul, Hangul syllable type LVT +U+B6D5..U+B6EF Letter: Other letter, Hangul, Hangul syllable type LVT +... diff --git a/maint/utf8.c b/maint/utf8.c index 9ac6dc5..bc11a50 100644 --- a/maint/utf8.c +++ b/maint/utf8.c @@ -1,29 +1,46 @@ -/* A test program for converting characters to UTF-8 and vice versa. Note that -this program conforms to the original definition of UTF-8, which allows -codepoints up to 7fffffff. The more recent definition limits the validity of -UTF-8 codepoints to a maximum of 10ffffff. 
- -The arguments are either single codepoint values, written as 0xhhhh, for -conversion to UTF-8, or sequences of hex values, written without 0x and -optionally including spaces (but such arguments must be quoted), for conversion +/**************************************************** +* PCRE maintainers' helper program: UTF-8 converter * +****************************************************/ + +/* This is a test program for converting character code points to UTF-8 and +vice versa. Note that this program conforms to the original definition of +UTF-8, which allows codepoints up to 7fffffff. The more recent definition +limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffff, and +forbids the "surrogate" code points. This program now gives warnings for these +invalid code points. + +The arguments are either single code point values written as U+hh.. or 0xhh.. +for conversion to UTF-8, or sequences of hex values, written without 0x and +optionally including spaces (but such arguments must be quoted), for conversion from UTF-8 to codepoints. For example: ./utf8 0x1234 -0x00001234 => e1 88 b4 +U+00001234 => e1 88 b4 ./utf8 "e1 88 b4" -0x00001234 <= e1 88 b4 +U+00001234 <= e1 88 b4 -In the second case, a number of characters can be present in one argument: +In the second case, a number of UTF-8 characters can be present in one +argument. In other words, each such argument is interpreted (after ignoring +spaces) as a string of UTF-8 bytes representing a string of characters: ./utf8 "65 e188b4 77" -0x00000065 <= 65 -0x00001234 <= e1 88 b4 -0x00000077 <= 77 +U+00000065 <= 65 +U+00001234 <= e1 88 b4 +U+00000077 <= 77 -If the option -s is given, the sequence of UTF-bytes is written out between +If the option -s is given, the sequence of UTF-bytes is written out between angle brackets at the end of the line. On a UTF-8 terminal, this will show the -appropriate graphic for the codepoint. */ +appropriate graphic for the code point. 
+ +Errors provoke error messages, but the program carries on with the next +argument. The return code is always zero. + +Philip Hazel +Original creation date: unknown +Code extended and tidied to avoid compiler warnings: 26 March 2020 +*/ + #include <stdio.h> #include <stdlib.h> @@ -41,47 +58,38 @@ appropriate graphic for the codepoint. */ */ -static const int utf8_table1[] = { - 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff}; +static const unsigned int utf8_table1[] = { + 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff}; static const int utf8_table2[] = { - 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; - + 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; + static const int utf8_table3[] = { - 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; - -static const unsigned char utf8_table4[] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,6 }; /************************************************* * Convert character value to UTF-8 * *************************************************/ -/* This function takes an integer value in the range 0 - 0x7fffffff -and encodes it as a UTF-8 character in 1 to 6 bytes. +/* This function takes an unsigned long integer value in the range 0 - +0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes. 
-Arguments: - cvalue the character value +Arguments: + cvalue the character value buffer pointer to buffer for result - at least 6 bytes long - -Returns: number of characters placed in the buffer - -1 if input character is negative - 0 if input character is positive but too big (only when - int is longer than 32 bits) + +Returns: number of bytes placed in the buffer + 0 if input code point is too big */ -int -ord2utf8(int cvalue, unsigned char *buffer) +static size_t +ord2utf8(unsigned long int cvalue, unsigned char *buffer) { -register int i, j; +size_t i, j; for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) if (cvalue <= utf8_table1[i]) break; if (i >= sizeof(utf8_table1)/sizeof(int)) return 0; -if (cvalue < 0) return -1; buffer += i; for (j = i; j > 0; j--) { @@ -98,32 +106,59 @@ return i + 1; * Convert UTF-8 string to value * *************************************************/ -/* This function takes one or more bytes that represents a UTF-8 character, -and returns the value of the character. +/* This function takes one or more bytes that represent a UTF-8 character from +the start of a string of bytes. It returns the value of the character, or the +offset of a malformation. For an overlong encoding that works but is not the +correct (shortest) one, the error offset is just after the last byte. 
-Argument: +Argument: buffer a pointer to the byte vector - vptr a pointer to an int to receive the value - -Returns: > 0 => the number of bytes consumed - -6 to 0 => malformed UTF-8 character at offset = (-return) + buffend a pointer to the end of the buffer + vptr a pointer to a variable to receive the value + lenptr a pointer to a variable to receive the offset when error detected + +Returns: > 0 => the number of bytes consumed + 0 => invalid UTF-8: first byte missing 0x40 bit + -1 => invalid UTF-8: first byte has too many high-order 1-bits + -2 => incomplete sequence at end of string + -3 => incomplete sequence within string + -4 => overlong code sequence */ -int -utf82ord(unsigned char *buffer, int *vptr) +static int +utf82ord(unsigned char *buffer, unsigned char *buffend, + long unsigned int *vptr, int *lenptr) { -int c = *buffer++; -int d = c; +unsigned int c = *buffer++; +unsigned int d = c; int i, j, s; -for (i = -1; i < 6; i++) /* i is number of additional bytes */ +/* Check for an ASCII character, or find the number of additional bytes in a +multibyte character. 
*/ + +for (i = -1; i < 6; i++) { if ((d & 0x80) == 0) break; d <<= 1; } -if (i == -1) { *vptr = c; return 1; } /* ascii character */ -if (i == 0 || i == 6) return 0; /* invalid UTF-8 */ +switch (i) + { + case -1: /* ASCII character; first byte does not have 0x80 bit */ + *vptr = c; + return 1; + + case 0: /* First byte has 0x80 but is missing 0x40 bit */ + *lenptr = 0; + return 0; + + case 6: + *lenptr = 0; /* Too many high bits */ + return -1; + + default: + break; + } /* i now has a value in the range 1-5 */ @@ -132,32 +167,46 @@ d = (c & utf8_table3[i]) << s; for (j = 0; j < i; j++) { + if (buffer >= buffend) + { + *lenptr = j + 1; + return -2; + } c = *buffer++; - if ((c & 0xc0) != 0x80) return -(j+1); + if ((c & 0xc0) != 0x80) + { + *lenptr = j + 1; + return -3; + } s -= 6; d |= (c & 0x3f) << s; } -/* Check that encoding was the correct unique one */ +/* Valid UTF-8 syntax */ -for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++) +*vptr = d; + +/* Check that encoding was the correct one, not overlong */ + +for (j = 0; j < (int)(sizeof(utf8_table1)/sizeof(int)); j++) if (d <= utf8_table1[j]) break; -if (j != i) return -(i+1); +if (j != i) + { + *lenptr = i + 1; + return -4; + } /* Valid value */ -*vptr = d; -return i+1; +return i + 1; } - /************************************************* * Main Program * *************************************************/ - int main(int argc, char **argv) { @@ -169,85 +218,129 @@ if (argc > 1 && strcmp(argv[1], "-s") == 0) { show = 1; i = 2; - } + } for (; i < argc; i++) { char *x = argv[i]; - if (strncmp(x, "0x", 2) == 0) + char *endptr; + if (strncmp(x, "0x", 2) == 0 || strncmp(x, "U+", 2) == 0) { - int j; - int d = strtol(x+2, NULL, 16); - int rc = ord2utf8(d, buffer); - printf("0x%08x => ", d); - if (rc <= 0) printf("*** Error %d ***", rc); else + size_t rc, j; + unsigned long int d = strtoul(x+2, &endptr, 16); + if (*endptr != 0) + { + printf("** Invalid hex number %s\n", x); + continue; /* With next argument */ + } + rc = 
ord2utf8(d, buffer); + printf("U+%08lx => ", d); + if (rc == 0) + printf("** Code point greater than 0x7fffffff cannot be encoded"); + else { for (j = 0; j < rc; j++) printf("%02x ", buffer[j]); if (show) { printf(">"); for (j = 0; j < rc; j++) printf("%c", buffer[j]); - printf("<"); - } - } - printf("\n"); + printf("< "); + } + if (d >= 0xd800 && d <= 0xdfff) + printf("** Invalid Unicode (surrogate)"); + else if (d > 0x10ffff) + printf("** Invalid Unicode (greater than U+10ffff)"); + } + printf("\n"); } else { - int d, rc; - int j = 0; - int y = 0; - int z = 0; unsigned char *bptr; - - for (;;) - { - while (*x == ' ') x++; + unsigned char *buffend; + int len = 0; + int y = 0; + int z = 0; + + for (;;) + { + while (*x == ' ') x++; if (*x == 0 && !z) break; - if (!isxdigit(*x)) + if (!isxdigit(*x)) { - printf("Malformed hex string: %s\n", argv[i]); - j = -1; - break; - } + printf("** Malformed hex string: %s\n", argv[i]); + len = -1; + break; + } y = y * 16 + tolower(*x) - ((isdigit(*x))? 
'0' : 'W'); - x++; + x++; if (z) - { - buffer[j++] = y; + { + buffer[len++] = y; y = 0; } - z ^= 1; - } - buffer[j] = 0; + z ^= 1; + } + + if (len < 0) continue; /* With next argument after malformation */ + bptr = buffer; + buffend = buffer + len; + + while (bptr < buffend) + { + unsigned long int d; + int j; + int offset; + int rc = utf82ord(bptr, buffend, &d, &offset); - while (*bptr != 0) - { - rc = utf82ord(bptr, &d); - if (rc > 0) + if (rc > 0) { - printf("0x%08x <= ", d); + printf("U+%08lx <= ", d); for (j = 0; j < rc; j++) printf("%02x ", bptr[j]); if (show) { printf(">"); for (j = 0; j < rc; j++) printf("%c", bptr[j]); - printf("<"); - } + printf("<"); + } printf("\n"); - bptr += rc; - } - else + bptr += rc; + } + else if (rc == -4) + { + printf("U+%08lx <= ", d); + for (j = 0; j < offset; j++) printf("%02x ", bptr[j]); + printf("** Overlong UTF-8 sequence\n"); + bptr += offset; + } + else { - printf("Malformed UTF-8 at offset %d <= ", -rc); - while (*bptr != 0) printf("%02x ", *bptr++); - printf("\n"); - break; - } - } - } - } + switch (rc) + { + case 0: printf("** First byte missing 0x40 bit"); + break; + + case -1: printf("** First byte has too many high-order bits"); + break; + + case -2: printf("** Incomplete UTF-8 sequence at end of string"); + break; + + case -3: printf("** Incomplete UTF-8 sequence"); + break; + + default: printf("** Unexpected return %d from utf82ord()", rc); + break; + } + printf(" at offset %d in string ", offset); + while (bptr < buffend) printf("%02x ", *bptr++); + printf("\n"); + break; + } + } + } + } + return 0; } |