summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2020-04-01 17:00:33 +0000
committerph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2020-04-01 17:00:33 +0000
commit265489fe6082dc2ba2c91ffdc1d448ed35739e60 (patch)
tree83756df63a6eac5d7b9b0d6307177f9ad96736ea
parent2082578875ec81a296070568fa7f09a6abc4f1ce (diff)
downloadpcre2-265489fe6082dc2ba2c91ffdc1d448ed35739e60.tar.gz
Tidies and updates to maintenance programs utf8 and ucptest.
git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1241 6239d852-aaf2-0410-a92c-79f79f948069
-rw-r--r--maint/README19
-rw-r--r--maint/ucptest.c643
-rw-r--r--maint/ucptestdata/testinput12
-rw-r--r--maint/ucptestdata/testinput25
-rw-r--r--maint/ucptestdata/testoutput1702
-rw-r--r--maint/ucptestdata/testoutput2188
-rw-r--r--maint/utf8.c313
7 files changed, 1082 insertions, 790 deletions
diff --git a/maint/README b/maint/README
index 0e1ff8f..fac36b2 100644
--- a/maint/README
+++ b/maint/README
@@ -54,10 +54,12 @@ Unicode.tables The files in this directory were downloaded from the Unicode
ucptest.c A short C program for testing the Unicode property macros
that do lookups in the pcre2_ucd.c data, mainly useful after
rebuilding the Unicode property table. Compile and run this in
- the "maint" directory (see comments at its head).
+ the "maint" directory (see comments at its head). This program
+ can also be used to find characters with specific properties.
-ucptestdata A directory containing two files, testinput1 and testoutput1,
- to use in conjunction with the ucptest program.
+ucptestdata A directory containing four files, testinput{1,2} and
+ testoutput{1,2}, for use in conjunction with the ucptest
+ program.
utf8.c A short, freestanding C program for converting a Unicode code
point into a sequence of bytes in the UTF-8 encoding, and vice
@@ -65,7 +67,7 @@ utf8.c A short, freestanding C program for converting a Unicode code
outputs a list of the equivalent UTF-8 bytes. If its argument
is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it
treats them as a UTF-8 character and outputs the equivalent
- code point in hex.
+ code point in hex. See comments at its head for details.
Updating to a new Unicode release
@@ -96,9 +98,10 @@ lists of scripts.
The ucptest program can be compiled and used to check that the new tables in
pcre2_ucd.c work properly, using the data files in ucptestdata to check a
-number of test characters. The source file ucptest.c should also be updated
-whenever new Unicode script names are added, and adding a few tests for new
-scripts is a good idea.
+number of test characters. It used to be necessary to update the source
+ucptest.c whenever new Unicode scripts were added, but this is no longer
+required because that program now uses the lists in the PCRE2 source. However,
+adding a few tests for new scripts to the files in ucptestdata is a good idea.
Preparing for a PCRE2 release
@@ -437,4 +440,4 @@ very sensible; some are rather wacky. Some have been on this list for years.
Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
-Last updated: 03 June 2019
+Last updated: 01 April 2020
diff --git a/maint/ucptest.c b/maint/ucptest.c
index e946226..49616e2 100644
--- a/maint/ucptest.c
+++ b/maint/ucptest.c
@@ -16,36 +16,58 @@
/* This is a hacked-up program for testing the Unicode properties tables of
PCRE2. It can also be used for finding characters with certain properties.
I wrote it to help with debugging PCRE, and have added things that I found
-useful, in a rather haphazard way. The code has never been "tidied" or checked
-for robustness.
-
-If there are arguments, they are a list of hexadecimal code points whose
-properties are to be output. Otherwise, the program expects to read commands on
-stdin, and it writes output to stdout. There are two commands:
-
-"findprop" must be followed by a list of Unicode code points as hex numbers
-(without any prefixes). The output is one line per character, giving its
-Unicode properties followed by its other case if there is one, followed by its
-Script Extension list if it is not just the same as the base script.
-
-"find" must be followed by a list of property names and their values. This
-finds characters that have those properties. If multiple properties are listed,
-they must all be matched. Currently supported:
+useful, in a rather haphazard way. The code has never been seriously tidied or
+checked for robustness, but it shouldn't now give compiler warnings.
+
+There is only one option: "-s". If given, it applies only to the "findprop"
+command. It causes the UTF-8 sequence of bytes that encode the character to be
+output between angle brackets at the end of the line. On a UTF-8 terminal, this
+will show the appropriate graphic for the code point.
+
+If the command has arguments, they are concatenated into a buffer, separated by
+spaces. If the first argument starts with "U+" or a decimal digit, or consists
+entirely of hexadecimal digits, "findprop" is inserted at the start. The buffer
+is then processed as a single-line file, after which the program exits. If
+there are no arguments, the program reads commands line by line on stdin and
+writes output to stdout. The return code is always zero.
+
+There are three commands:
+
+"findprop" must be followed by a space-separated list of Unicode code points as
+hex numbers, either without any prefix or starting with "U+". The output is one
+line per character, giving its Unicode properties followed by its other case or
+cases if one or more exist, followed by its Script Extension list if it is not
+just the same as the base script. This list is in square brackets. The
+properties are:
+
+General type e.g. Letter
+Specific type e.g. Upper case letter
+Script e.g. Medefaidrin
+Grapheme break type e.g. Extend (most common is Other)
+
+"find" must be followed by a list of property names and their values. The
+values are case-sensitive. This finds characters that have those properties. If
+multiple properties are listed, they must all be matched. Currently supported:
script <name> The character must have this script property. Only one
such script may be given.
scriptx <name> This script must be in the character's Script Extension
property list. If this is used many times, all the given
scripts must be present.
- type <abbrev> The character's type (e.g. Lu or Nd) must match.
+ type <abbrev> The character's specific type (e.g. Lu or Nd) must match.
gbreak <name> The grapheme break property must match.
If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
Script Extensions, there may be a mixture of positive and negative
requirements. All must be satisfied.
-No more than 100 characters are output. If there are more, the list ends with
-... */
+Sequences of two or more characters are shown as ranges, for example
+U+0041..U+004A. No more than 100 lines are output. If there are more
+characters, the list ends with ...
+
+"list" must be followed by a property name (script, type, or gbreak). The
+defined values for that property are listed. */
+
#ifdef HAVE_CONFIG_H
#include "../src/config.h"
@@ -91,228 +113,99 @@ No more than 100 characters are output. If there are more, the list ends with
/* -------------------------------------------------------------------*/
-
-const unsigned char *script_names[] = {
- US"Unknown",
- US"Arabic",
- US"Armenian",
- US"Bengali",
- US"Bopomofo",
- US"Braille",
- US"Buginese",
- US"Buhid",
- US"Canadian_Aboriginal",
- US"Cherokee",
- US"Common",
- US"Coptic",
- US"Cypriot",
- US"Cyrillic",
- US"Deseret",
- US"Devanagari",
- US"Ethiopic",
- US"Georgian",
- US"Glagolitic",
- US"Gothic",
- US"Greek",
- US"Gujarati",
- US"Gurmukhi",
- US"Han",
- US"Hangul",
- US"Hanunoo",
- US"Hebrew",
- US"Hiragana",
- US"Inherited",
- US"Kannada",
- US"Katakana",
- US"Kharoshthi",
- US"Khmer",
- US"Lao",
- US"Latin",
- US"Limbu",
- US"Linear_B",
- US"Malayalam",
- US"Mongolian",
- US"Myanmar",
- US"New_Tai_Lue",
- US"Ogham",
- US"Old_Italic",
- US"Old_Persian",
- US"Oriya",
- US"Osmanya",
- US"Runic",
- US"Shavian",
- US"Sinhala",
- US"Syloti_Nagri",
- US"Syriac",
- US"Tagalog",
- US"Tagbanwa",
- US"Tai_Le",
- US"Tamil",
- US"Telugu",
- US"Thaana",
- US"Thai",
- US"Tibetan",
- US"Tifinagh",
- US"Ugaritic",
- US"Yi",
- /* New for Unicode 5.0: */
- US"Balinese",
- US"Cuneiform",
- US"Nko",
- US"Phags_Pa",
- US"Phoenician",
- /* New for Unicode 5.1: */
- US"Carian",
- US"Cham",
- US"Kayah_Li",
- US"Lepcha",
- US"Lycian",
- US"Lydian",
- US"Ol_Chiki",
- US"Rejang",
- US"Saurashtra",
- US"Sundanese",
- US"Vai",
- /* New for Unicode 5.2: */
- US"Avestan",
- US"Bamum",
- US"Egyptian_Hieroglyphs",
- US"Imperial_Aramaic",
- US"Inscriptional_Pahlavi",
- US"Inscriptional_Parthian",
- US"Javanese",
- US"Kaithi",
- US"Lisu",
- US"Meetei_Mayek",
- US"Old_South_Arabian",
- US"Old_Turkic",
- US"Samaritan",
- US"Tai_Tham",
- US"Tai_Viet",
- /* New for Unicode 6.0.0 */
- US"Batak",
- US"Brahmi",
- US"Mandaic",
- /* New for Unicode 6.1.0 */
- US"Chakma",
- US"Meroitic_Cursive",
- US"Meroitic_Hieroglyphs",
- US"Miao",
- US"Sharada",
- US"Sora Sompent",
- US"Takri",
- /* New for Unicode 7.0.0 */
- US"Bassa_Vah",
- US"Caucasian_Albanian",
- US"Duployan",
- US"Elbasan",
- US"Grantha",
- US"Khojki",
- US"Khudawadi",
- US"Linear_A",
- US"Mahajani",
- US"Manichaean",
- US"Mende_Kikakui",
- US"Modi",
- US"Mro",
- US"Nabataean",
- US"Old_North_Arabian",
- US"Old_Permic",
- US"Pahawh_Hmong",
- US"Palmyrene",
- US"Psalter_Pahlavi",
- US"Pau_Cin_Hau",
- US"Siddham",
- US"Tirhuta",
- US"Warang_Citi",
- /* New for Unicode 8.0.0 */
- US"Ahom",
- US"Anatolian_Hieroglyphs",
- US"Hatran",
- US"Multani",
- US"Old_Hungarian",
- US"SignWriting",
- /* New for Unicode 10.0.0 (no update since 8.0.0) */
- US"Adlam",
- US"Bhaiksuki",
- US"Marchen",
- US"Newa",
- US"Osage",
- US"Tangut",
- US"Masaram_Gondi",
- US"Nushu",
- US"Soyombo",
- US"Zanabazar_Square",
- /* New for Unicode 11.0.0 */
- US"Dogra",
- US"Gunjala_Gondi",
- US"Hanifi_Rohingya",
- US"Makasar",
- US"Medefaidrin",
- US"Old_Sogdian",
- US"Sogdian",
- /* New for Unicode 12.0.0 */
- US"Elymaic",
- US"Nandinagari",
- US"Nyiakeng_Puachue_Hmong",
- US"Wancho",
- /* New for Unicode 13.0.0 */
- US"Chorasmian",
- US"Dives_Akuru",
- US"Khitan_Small_Script",
- US"Yezidi"
+static BOOL show_character = FALSE;
+
+static const unsigned char *type_names[] = {
+ US"Cc", US"Control",
+ US"Cf", US"Format",
+ US"Cn", US"Unassigned",
+ US"Co", US"Private use",
+ US"Cs", US"Surrogate",
+ US"Ll", US"Lower case letter",
+ US"Lm", US"Modifier letter",
+ US"Lo", US"Other letter",
+ US"Lt", US"Title case letter",
+ US"Lu", US"Upper case letter",
+ US"Mc", US"Spacing mark",
+ US"Me", US"Enclosing mark",
+ US"Mn", US"Non-spacing mark",
+ US"Nd", US"Decimal number",
+ US"Nl", US"Letter number",
+ US"No", US"Other number",
+ US"Pc", US"Connector punctuation",
+ US"Pd", US"Dash punctuation",
+ US"Pe", US"Close punctuation",
+ US"Pf", US"Final punctuation",
+ US"Pi", US"Initial punctuation",
+ US"Po", US"Other punctuation",
+ US"Ps", US"Open punctuation",
+ US"Sc", US"Currency symbol",
+ US"Sk", US"Modifier symbol",
+ US"Sm", US"Mathematical symbol",
+ US"So", US"Other symbol",
+ US"Zl", US"Line separator",
+ US"Zp", US"Paragraph separator",
+ US"Zs", US"Space separator"
};
-const unsigned char *type_names[] = {
- US"Cc",
- US"Cf",
- US"Cn",
- US"Co",
- US"Cs",
- US"Ll",
- US"Lm",
- US"Lo",
- US"Lt",
- US"Lu",
- US"Mc",
- US"Me",
- US"Mn",
- US"Nd",
- US"Nl",
- US"No",
- US"Pc",
- US"Pd",
- US"Pe",
- US"Pf",
- US"Pi",
- US"Po",
- US"Ps",
- US"Sc",
- US"Sk",
- US"Sm",
- US"So",
- US"Zl",
- US"Zp",
- US"Zs"
+static const unsigned char *gb_names[] = {
+ US"CR", US"carriage return",
+ US"LF", US"linefeed",
+ US"Control", US"",
+ US"Extend", US"",
+ US"Prepend", US"",
+ US"SpacingMark", US"",
+ US"L", US"Hangul syllable type L",
+ US"V", US"Hangul syllable type V",
+ US"T", US"Hangul syllable type T",
+ US"LV", US"Hangul syllable type LV",
+ US"LVT", US"Hangul syllable type LVT",
+ US"RegionalIndicator", US"",
+ US"Other", US"",
+ US"ZWJ", US"zero width joiner",
+ US"Extended_Pictographic", US""
};
-const unsigned char *gb_names[] = {
- US"CR",
- US"LF",
- US"Control",
- US"Extend",
- US"Prepend",
- US"SpacingMark",
- US"L",
- US"V",
- US"T",
- US"LV",
- US"LVT",
- US"RegionalIndicator",
- US"Other",
- US"ZWJ",
- US"Extended_Pictographic"
-};
+
+static const unsigned int utf8_table1[] = {
+ 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
+
+static const int utf8_table2[] = {
+ 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+
+
+/*************************************************
+* Convert character value to UTF-8 *
+*************************************************/
+
+/* This function takes an unsigned integer value in the range 0 - 0x7fffffff
+and encodes it as a UTF-8 character in 1 to 6 bytes.
+
+Arguments:
+ cvalue the character value
+ buffer pointer to buffer for result - at least 6 bytes long
+
+Returns: number of bytes placed in the buffer
+ 0 if input code point is too big (above 0x7fffffff)
+*/
+
+static size_t
+ord2utf8(unsigned int cvalue, unsigned char *buffer)
+{
+size_t i, j;
+for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
+ if (cvalue <= utf8_table1[i]) break;   /* i = number of trailing bytes needed */
+if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;   /* no table entry is large enough */
+buffer += i;   /* point at the last byte; the loop below fills backwards */
+for (j = i; j > 0; j--)
+ {
+ *buffer-- = 0x80 | (cvalue & 0x3f);   /* trailing byte: 0x80 plus the low 6 bits */
+ cvalue >>= 6;
+ }
+*buffer = utf8_table2[i] | cvalue;   /* first byte: length marker plus remaining bits */
+return i + 1;
+}
+
/*************************************************
@@ -331,27 +224,46 @@ return isatty(fileno(stdin));
/*************************************************
+* Get script name from ucp ident *
+*************************************************/
+
+static const char *
+get_scriptname(int script)
+{   /* Returns the table name for a script value, or "??" if no entry matches */
+size_t i;
+const ucp_type_table *u;
+
+for (i = 0; i < PRIV(utt_size); i++)
+ {
+ u = PRIV(utt) + i;
+ if (u->type == PT_SC && u->value == script) break;   /* first PT_SC entry with this value */
+ }
+if (i < PRIV(utt_size))   /* the loop ended via break, so u is the matching entry */
+ return PRIV(utt_names) + u->name_offset;
+
+return "??";
+}
+
+
+/*************************************************
* Print Unicode property info for a char *
*************************************************/
static void
-print_prop(int c)
+print_prop(unsigned int c, BOOL is_just_one)
{
int type = UCD_CATEGORY(c);
int fulltype = UCD_CHARTYPE(c);
int script = UCD_SCRIPT(c);
int scriptx = UCD_SCRIPTX(c);
int gbprop = UCD_GRAPHBREAK(c);
-int othercase = UCD_OTHERCASE(c);
+unsigned int othercase = UCD_OTHERCASE(c);
int caseset = UCD_CASESET(c);
const unsigned char *fulltypename = US"??";
const unsigned char *typename = US"??";
-const unsigned char *scriptname = US"??";
const unsigned char *graphbreak = US"??";
-
-if (script < sizeof(script_names)/sizeof(char *))
- scriptname = script_names[script];
+const unsigned char *scriptname = CUS get_scriptname(script);
switch (type)
{
@@ -420,15 +332,18 @@ switch(gbprop)
default: graphbreak = US"Unknown"; break;
}
-printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
-if (othercase != c)
+printf("U+%04X %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
+if (is_just_one && othercase != c)
{
- printf(", %04x", othercase);
+ printf(", U+%04X", othercase);
if (caseset != 0)
{
const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
while (*(++p) < NOTACHAR)
- if (*p != othercase && *p != c) printf(", %04x", *p);
+ {
+ unsigned int d = *p;
+ if (d != othercase && d != c) printf(", U+%04X", d);
+ }
}
}
@@ -436,25 +351,26 @@ if (scriptx != script)
{
printf(", [");
if (scriptx >= 0)
- {
- scriptname = (scriptx >= sizeof(script_names)/sizeof(char *))?
- US"??" : script_names[scriptx];
- printf("%s", scriptname);
- }
+ printf("%s", get_scriptname(scriptx));
else
{
- char *sep = "";
+ const char *sep = "";
const uint8_t *p = PRIV(ucd_script_sets) - scriptx;
while (*p != 0)
{
- scriptname = (*p >= sizeof(script_names)/sizeof(char *))?
- US"??" : script_names[*p++];
- printf("%s%s", sep, scriptname);
+ printf("%s%s", sep, get_scriptname(*p++));
sep = ", ";
}
}
printf("]");
}
+
+if (show_character && is_just_one)
+ {
+ unsigned char buffer[8];
+ size_t len = ord2utf8(c, buffer);
+ printf(", >%.*s<", (int)len, buffer);
+ }
printf("\n");
}
@@ -483,7 +399,7 @@ BOOL type_not = FALSE;
BOOL gbreak_not = FALSE;
BOOL hadrange = FALSE;
const ucd_record *ucd, *next_ucd;
-const char *pad = " ";
+const char *pad = " ";
while (*s != 0)
{
@@ -508,17 +424,20 @@ while (*s != 0)
offset = 1;
}
- for (i = 0; i < sizeof(script_names)/sizeof(char *); i++)
+ for (i = 0; i < PRIV(utt_size); i++)
{
- if (strcmp(CS value + offset, script_names[i]) == 0)
+ const ucp_type_table *u = PRIV(utt) + i;
+ if (u->type == PT_SC && strcmp(CS(value + offset),
+ PRIV(utt_names) + u->name_offset) == 0)
{
+ c = u->value;
if (name[6] == 'x')
{
- scriptx_list[scriptx_count++] = scriptx_not? (-i):i;
+ scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
}
else
{
- if (script < 0) script = i; else
+ if (script < 0) script = c; else
{
printf("** Only 1 script value allowed\n");
return;
@@ -528,9 +447,9 @@ while (*s != 0)
}
}
- if (i >= sizeof(script_names)/sizeof(char *))
+ if (i >= PRIV(utt_size))
{
- printf("** Unrecognized script name '%s'\n", value);
+ printf("** Unrecognized script name \"%s\"\n", value);
return;
}
}
@@ -550,17 +469,17 @@ while (*s != 0)
offset = 1;
}
- for (i = 0; i < sizeof(type_names)/sizeof(char *); i++)
+ for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
{
- if (strcmp(CS (value + offset), type_names[i]) == 0)
+ if (strcmp(CS (value + offset), CS type_names[i]) == 0)
{
- type = i;
+ type = i/2;
break;
}
}
if (i >= sizeof(type_names)/sizeof(char *))
{
- printf("** Unrecognized type name '%s'\n", value);
+ printf("** Unrecognized type name \"%s\"\n", value);
return;
}
}
@@ -581,17 +500,17 @@ while (*s != 0)
offset = 1;
}
- for (i = 0; i < sizeof(gb_names)/sizeof(char *); i++)
+ for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
{
- if (strcmp(CS (value + offset), gb_names[i]) == 0)
+ if (strcmp(CS (value + offset), CS gb_names[i]) == 0)
{
- gbreak = i;
+ gbreak = i/2;
break;
}
}
if (i >= sizeof(gb_names)/sizeof(char *))
{
- printf("** Unrecognized gbreak name '%s'\n", value);
+ printf("** Unrecognized gbreak name \"%s\"\n", value);
return;
}
}
@@ -599,7 +518,7 @@ while (*s != 0)
else
{
- printf("** Unrecognized property name '%s'\n", name);
+ printf("** Unrecognized property name \"%s\"\n", name);
return;
}
}
@@ -617,7 +536,7 @@ for (c = 0; c <= 0x10ffff; c++)
if (scriptx_count > 0)
{
const uint8_t *char_scriptx = NULL;
- int found = 0;
+ unsigned int found = 0;
int scriptx = UCD_SCRIPTX(c);
if (scriptx < 0) char_scriptx = PRIV(ucd_script_sets) - scriptx;
@@ -701,13 +620,13 @@ for (c = 0; c <= 0x10ffff; c++)
if (--i > c)
{
- printf("%04x..", c);
+ printf("U+%04X..", c);
c = i;
hadrange = TRUE;
}
else if (hadrange) printf("%s", pad);
- print_prop(c);
+ print_prop(c, FALSE);
if (c >= 0x100000) pad = " ";
else if (c >= 0x10000) pad = " ";
count++;
@@ -723,6 +642,101 @@ if (count == 0) printf("No characters found\n");
/*************************************************
+* Process command line *
+*************************************************/
+
+static void
+process_command_line(unsigned char *buffer)
+{   /* Runs one command line: the first word selects findprop, find or list */
+unsigned char *s, *t;
+unsigned char name[24];   /* command word; reused for property names by "list" */
+
+s = buffer;
+while (isspace(*s)) s++;
+if (*s == 0) return;   /* empty or all-whitespace line: nothing to do */
+
+for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;   /* NOTE(review): this
+   copy is not limited to sizeof(name); a first word of 24 or more characters
+   writes past the end of name[] */
+*t = 0;
+while (isspace(*s)) s++;
+
+if (strcmp(CS name, "findprop") == 0)
+ {
+ while (*s != 0)   /* one code point per whitespace-separated item */
+ {
+ unsigned int c;
+ unsigned char *endptr;
+ t = s;
+ if (strncmp(CS t, "U+", 2) == 0) t += 2;   /* optional "U+" prefix */
+ c = strtoul(CS t, CSS(&endptr), 16);   /* NOTE(review): unsigned long result is narrowed to unsigned int before the range check below - confirm large inputs cannot wrap */
+ if (*endptr != 0 && !isspace(*endptr))   /* something other than space follows the digits */
+ {
+ while (*endptr != 0 && !isspace(*endptr)) endptr++;   /* skip the rest of this item */
+ printf("** Invalid hex number: ignored \"%.*s\"\n", (int)(endptr-s), s);
+ }
+ else
+ {
+ if (c > 0x10ffff)
+ printf("** U+%x is too big for a Unicode code point\n", c);
+ else
+ print_prop(c, TRUE);
+ }
+ s = endptr;
+ while (isspace(*s)) s++;
+ }
+ }
+
+else if (strcmp(CS name, "find") == 0)
+ {
+ find_chars(s);   /* the rest of the line is handed over unparsed */
+ }
+
+else if (strcmp(CS name, "list") == 0)
+ {
+ while (*s != 0)   /* each remaining word names a property to list */
+ {
+ size_t i;
+ for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;   /* NOTE(review): same unbounded copy into name[] as for the command word */
+ *t = 0;
+ while (isspace(*s)) s++;
+
+ if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
+ {
+ for (i = 0; i < PRIV(utt_size); i++)
+ if (PRIV(utt)[i].type == PT_SC)   /* only the script entries of the table */
+ printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
+ }
+
+ else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
+ {
+ for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)   /* entries are abbreviation/description pairs */
+ printf("%s %s\n", type_names[i], type_names[i+1]);
+ }
+
+ else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
+ {
+ for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)   /* entries are name/description pairs */
+ {
+ if (gb_names[i+1][0] != 0)   /* description is non-empty */
+ printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
+ else
+ printf("%s\n", gb_names[i]);
+ }
+ }
+
+ else
+ {
+ printf("** Unknown property \"%s\"\n", name);
+ break;   /* stop at the first unknown property name */
+ }
+ }
+ }
+
+else printf("** Unknown test command \"%s\"\n", name);
+}
+
+
+
+/*************************************************
* Main program *
*************************************************/
@@ -730,19 +744,42 @@ int
main(int argc, char **argv)
{
BOOL interactive;
+int first_arg = 1;
unsigned char buffer[1024];
-if (argc > 1)
+if (argc > 1 && strcmp(argv[1], "-s") == 0)
+ {
+ show_character = TRUE;
+ first_arg++;
+ }
+
+if (argc > first_arg)
{
int i;
- for (i = 1; i < argc; i++)
+ BOOL hexfirst = TRUE;
+ char *arg = argv[first_arg];
+ unsigned char *s = buffer;
+
+ if (strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
{
- unsigned char *endptr;
- int c = strtoul(argv[i], CSS(&endptr), 16);
- if (*endptr != 0)
- printf("** Hex number expected; ignored '%s'\n", argv[i]);
- else print_prop(c);
+ while (*arg != 0)
+ {
+ if (!isxdigit(*arg++)) { hexfirst = FALSE; break; }
+ }
+ }
+
+ if (hexfirst)
+ {
+ strcpy(CS s, "findprop ");
+ s += 9;
+ }
+
+ for (i = first_arg; i < argc; i++)
+ {
+ s += sprintf(CS s, "%s ", argv[i]);
}
+
+ process_command_line(buffer);
return 0;
}
@@ -754,17 +791,14 @@ if (interactive) using_history();
for(;;)
{
- unsigned char name[24];
- unsigned char *s, *t;
-
#if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
if (interactive)
{
size_t len;
- s = readline("> ");
+ unsigned char *s = US readline("> ");
if (s == NULL) break;
- len = strlen(s);
- if (len > 0) add_history(s);
+ len = strlen(CS s);
+ if (len > 0) add_history(CS s);
memcpy(buffer, s, len);
buffer[len] = '\n';
buffer[len+1] = 0;
@@ -778,39 +812,8 @@ for(;;)
if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
if (!interactive) printf("%s", buffer);
}
-
- s = buffer;
- while (isspace(*s)) s++;
- if (*s == 0) continue;
-
- for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
- *t = 0;
- while (isspace(*s)) s++;
-
- if (strcmp(CS name, "findprop") == 0)
- {
- while (*s != 0)
- {
- unsigned char *endptr;
- int c = strtoul(CS s, CSS(&endptr), 16);
-
- if (*endptr != 0 && !isspace(*endptr))
- {
- while (*endptr != 0 && !isspace(*endptr)) endptr++;
- printf("** Hex number expected; ignored '%.*s'\n", endptr-s, s);
- }
- else print_prop(c);
- s = endptr;
- while (isspace(*s)) s++;
- }
- }
-
- else if (strcmp(CS name, "find") == 0)
- {
- find_chars(s);
- }
-
- else printf("** Unknown test command %s\n", name);
+
+ process_command_line(buffer);
}
if (interactive) printf("\n");
diff --git a/maint/ucptestdata/testinput1 b/maint/ucptestdata/testinput1
index 58c7cf1..3552a4f 100644
--- a/maint/ucptestdata/testinput1
+++ b/maint/ucptestdata/testinput1
@@ -45,4 +45,4 @@ findprop 32ff
findprop 1f16d
-findprop 10e93 10eaa
+findprop U+10e93 U+10eaa
diff --git a/maint/ucptestdata/testinput2 b/maint/ucptestdata/testinput2
new file mode 100644
index 0000000..bdea520
--- /dev/null
+++ b/maint/ucptestdata/testinput2
@@ -0,0 +1,5 @@
+find script Han
+find type Pe script Common scriptx Hangul
+find type Sk
+find type Pd
+find gbreak LVT
diff --git a/maint/ucptestdata/testoutput1 b/maint/ucptestdata/testoutput1
index 0751a58..275b8e4 100644
--- a/maint/ucptestdata/testoutput1
+++ b/maint/ucptestdata/testoutput1
@@ -1,398 +1,398 @@
findprop 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
-0000 Control: Control, Common, Control
-0001 Control: Control, Common, Control
-0002 Control: Control, Common, Control
-0003 Control: Control, Common, Control
-0004 Control: Control, Common, Control
-0005 Control: Control, Common, Control
-0006 Control: Control, Common, Control
-0007 Control: Control, Common, Control
-0008 Control: Control, Common, Control
-0009 Control: Control, Common, Control
-000a Control: Control, Common, LF
-000b Control: Control, Common, Control
-000c Control: Control, Common, Control
-000d Control: Control, Common, CR
-000e Control: Control, Common, Control
-000f Control: Control, Common, Control
+U+0000 Control: Control, Common, Control
+U+0001 Control: Control, Common, Control
+U+0002 Control: Control, Common, Control
+U+0003 Control: Control, Common, Control
+U+0004 Control: Control, Common, Control
+U+0005 Control: Control, Common, Control
+U+0006 Control: Control, Common, Control
+U+0007 Control: Control, Common, Control
+U+0008 Control: Control, Common, Control
+U+0009 Control: Control, Common, Control
+U+000A Control: Control, Common, LF
+U+000B Control: Control, Common, Control
+U+000C Control: Control, Common, Control
+U+000D Control: Control, Common, CR
+U+000E Control: Control, Common, Control
+U+000F Control: Control, Common, Control
findprop 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
-0010 Control: Control, Common, Control
-0011 Control: Control, Common, Control
-0012 Control: Control, Common, Control
-0013 Control: Control, Common, Control
-0014 Control: Control, Common, Control
-0015 Control: Control, Common, Control
-0016 Control: Control, Common, Control
-0017 Control: Control, Common, Control
-0018 Control: Control, Common, Control
-0019 Control: Control, Common, Control
-001a Control: Control, Common, Control
-001b Control: Control, Common, Control
-001c Control: Control, Common, Control
-001d Control: Control, Common, Control
-001e Control: Control, Common, Control
-001f Control: Control, Common, Control
+U+0010 Control: Control, Common, Control
+U+0011 Control: Control, Common, Control
+U+0012 Control: Control, Common, Control
+U+0013 Control: Control, Common, Control
+U+0014 Control: Control, Common, Control
+U+0015 Control: Control, Common, Control
+U+0016 Control: Control, Common, Control
+U+0017 Control: Control, Common, Control
+U+0018 Control: Control, Common, Control
+U+0019 Control: Control, Common, Control
+U+001A Control: Control, Common, Control
+U+001B Control: Control, Common, Control
+U+001C Control: Control, Common, Control
+U+001D Control: Control, Common, Control
+U+001E Control: Control, Common, Control
+U+001F Control: Control, Common, Control
findprop 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
-0020 Separator: Space separator, Common, Other
-0021 Punctuation: Other punctuation, Common, Other
-0022 Punctuation: Other punctuation, Common, Other
-0023 Punctuation: Other punctuation, Common, Other
-0024 Symbol: Currency symbol, Common, Other
-0025 Punctuation: Other punctuation, Common, Other
-0026 Punctuation: Other punctuation, Common, Other
-0027 Punctuation: Other punctuation, Common, Other
-0028 Punctuation: Open punctuation, Common, Other
-0029 Punctuation: Close punctuation, Common, Other
-002a Punctuation: Other punctuation, Common, Other
-002b Symbol: Mathematical symbol, Common, Other
-002c Punctuation: Other punctuation, Common, Other
-002d Punctuation: Dash punctuation, Common, Other
-002e Punctuation: Other punctuation, Common, Other
-002f Punctuation: Other punctuation, Common, Other
+U+0020 Separator: Space separator, Common, Other
+U+0021 Punctuation: Other punctuation, Common, Other
+U+0022 Punctuation: Other punctuation, Common, Other
+U+0023 Punctuation: Other punctuation, Common, Other
+U+0024 Symbol: Currency symbol, Common, Other
+U+0025 Punctuation: Other punctuation, Common, Other
+U+0026 Punctuation: Other punctuation, Common, Other
+U+0027 Punctuation: Other punctuation, Common, Other
+U+0028 Punctuation: Open punctuation, Common, Other
+U+0029 Punctuation: Close punctuation, Common, Other
+U+002A Punctuation: Other punctuation, Common, Other
+U+002B Symbol: Mathematical symbol, Common, Other
+U+002C Punctuation: Other punctuation, Common, Other
+U+002D Punctuation: Dash punctuation, Common, Other
+U+002E Punctuation: Other punctuation, Common, Other
+U+002F Punctuation: Other punctuation, Common, Other
findprop 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
-0030 Number: Decimal number, Common, Other
-0031 Number: Decimal number, Common, Other
-0032 Number: Decimal number, Common, Other
-0033 Number: Decimal number, Common, Other
-0034 Number: Decimal number, Common, Other
-0035 Number: Decimal number, Common, Other
-0036 Number: Decimal number, Common, Other
-0037 Number: Decimal number, Common, Other
-0038 Number: Decimal number, Common, Other
-0039 Number: Decimal number, Common, Other
-003a Punctuation: Other punctuation, Common, Other
-003b Punctuation: Other punctuation, Common, Other
-003c Symbol: Mathematical symbol, Common, Other
-003d Symbol: Mathematical symbol, Common, Other
-003e Symbol: Mathematical symbol, Common, Other
-003f Punctuation: Other punctuation, Common, Other
+U+0030 Number: Decimal number, Common, Other
+U+0031 Number: Decimal number, Common, Other
+U+0032 Number: Decimal number, Common, Other
+U+0033 Number: Decimal number, Common, Other
+U+0034 Number: Decimal number, Common, Other
+U+0035 Number: Decimal number, Common, Other
+U+0036 Number: Decimal number, Common, Other
+U+0037 Number: Decimal number, Common, Other
+U+0038 Number: Decimal number, Common, Other
+U+0039 Number: Decimal number, Common, Other
+U+003A Punctuation: Other punctuation, Common, Other
+U+003B Punctuation: Other punctuation, Common, Other
+U+003C Symbol: Mathematical symbol, Common, Other
+U+003D Symbol: Mathematical symbol, Common, Other
+U+003E Symbol: Mathematical symbol, Common, Other
+U+003F Punctuation: Other punctuation, Common, Other
findprop 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
-0040 Punctuation: Other punctuation, Common, Other
-0041 Letter: Upper case letter, Latin, Other, 0061
-0042 Letter: Upper case letter, Latin, Other, 0062
-0043 Letter: Upper case letter, Latin, Other, 0063
-0044 Letter: Upper case letter, Latin, Other, 0064
-0045 Letter: Upper case letter, Latin, Other, 0065
-0046 Letter: Upper case letter, Latin, Other, 0066
-0047 Letter: Upper case letter, Latin, Other, 0067
-0048 Letter: Upper case letter, Latin, Other, 0068
-0049 Letter: Upper case letter, Latin, Other, 0069
-004a Letter: Upper case letter, Latin, Other, 006a
-004b Letter: Upper case letter, Latin, Other, 006b, 212a
-004c Letter: Upper case letter, Latin, Other, 006c
-004d Letter: Upper case letter, Latin, Other, 006d
-004e Letter: Upper case letter, Latin, Other, 006e
-004f Letter: Upper case letter, Latin, Other, 006f
+U+0040 Punctuation: Other punctuation, Common, Other
+U+0041 Letter: Upper case letter, Latin, Other, U+0061
+U+0042 Letter: Upper case letter, Latin, Other, U+0062
+U+0043 Letter: Upper case letter, Latin, Other, U+0063
+U+0044 Letter: Upper case letter, Latin, Other, U+0064
+U+0045 Letter: Upper case letter, Latin, Other, U+0065
+U+0046 Letter: Upper case letter, Latin, Other, U+0066
+U+0047 Letter: Upper case letter, Latin, Other, U+0067
+U+0048 Letter: Upper case letter, Latin, Other, U+0068
+U+0049 Letter: Upper case letter, Latin, Other, U+0069
+U+004A Letter: Upper case letter, Latin, Other, U+006A
+U+004B Letter: Upper case letter, Latin, Other, U+006B, U+212A
+U+004C Letter: Upper case letter, Latin, Other, U+006C
+U+004D Letter: Upper case letter, Latin, Other, U+006D
+U+004E Letter: Upper case letter, Latin, Other, U+006E
+U+004F Letter: Upper case letter, Latin, Other, U+006F
findprop 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
-0050 Letter: Upper case letter, Latin, Other, 0070
-0051 Letter: Upper case letter, Latin, Other, 0071
-0052 Letter: Upper case letter, Latin, Other, 0072
-0053 Letter: Upper case letter, Latin, Other, 0073, 017f
-0054 Letter: Upper case letter, Latin, Other, 0074
-0055 Letter: Upper case letter, Latin, Other, 0075
-0056 Letter: Upper case letter, Latin, Other, 0076
-0057 Letter: Upper case letter, Latin, Other, 0077
-0058 Letter: Upper case letter, Latin, Other, 0078
-0059 Letter: Upper case letter, Latin, Other, 0079
-005a Letter: Upper case letter, Latin, Other, 007a
-005b Punctuation: Open punctuation, Common, Other
-005c Punctuation: Other punctuation, Common, Other
-005d Punctuation: Close punctuation, Common, Other
-005e Symbol: Modifier symbol, Common, Other
-005f Punctuation: Connector punctuation, Common, Other
+U+0050 Letter: Upper case letter, Latin, Other, U+0070
+U+0051 Letter: Upper case letter, Latin, Other, U+0071
+U+0052 Letter: Upper case letter, Latin, Other, U+0072
+U+0053 Letter: Upper case letter, Latin, Other, U+0073, U+017F
+U+0054 Letter: Upper case letter, Latin, Other, U+0074
+U+0055 Letter: Upper case letter, Latin, Other, U+0075
+U+0056 Letter: Upper case letter, Latin, Other, U+0076
+U+0057 Letter: Upper case letter, Latin, Other, U+0077
+U+0058 Letter: Upper case letter, Latin, Other, U+0078
+U+0059 Letter: Upper case letter, Latin, Other, U+0079
+U+005A Letter: Upper case letter, Latin, Other, U+007A
+U+005B Punctuation: Open punctuation, Common, Other
+U+005C Punctuation: Other punctuation, Common, Other
+U+005D Punctuation: Close punctuation, Common, Other
+U+005E Symbol: Modifier symbol, Common, Other
+U+005F Punctuation: Connector punctuation, Common, Other
findprop 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
-0060 Symbol: Modifier symbol, Common, Other
-0061 Letter: Lower case letter, Latin, Other, 0041
-0062 Letter: Lower case letter, Latin, Other, 0042
-0063 Letter: Lower case letter, Latin, Other, 0043
-0064 Letter: Lower case letter, Latin, Other, 0044
-0065 Letter: Lower case letter, Latin, Other, 0045
-0066 Letter: Lower case letter, Latin, Other, 0046
-0067 Letter: Lower case letter, Latin, Other, 0047
-0068 Letter: Lower case letter, Latin, Other, 0048
-0069 Letter: Lower case letter, Latin, Other, 0049
-006a Letter: Lower case letter, Latin, Other, 004a
-006b Letter: Lower case letter, Latin, Other, 004b, 212a
-006c Letter: Lower case letter, Latin, Other, 004c
-006d Letter: Lower case letter, Latin, Other, 004d
-006e Letter: Lower case letter, Latin, Other, 004e
-006f Letter: Lower case letter, Latin, Other, 004f
+U+0060 Symbol: Modifier symbol, Common, Other
+U+0061 Letter: Lower case letter, Latin, Other, U+0041
+U+0062 Letter: Lower case letter, Latin, Other, U+0042
+U+0063 Letter: Lower case letter, Latin, Other, U+0043
+U+0064 Letter: Lower case letter, Latin, Other, U+0044
+U+0065 Letter: Lower case letter, Latin, Other, U+0045
+U+0066 Letter: Lower case letter, Latin, Other, U+0046
+U+0067 Letter: Lower case letter, Latin, Other, U+0047
+U+0068 Letter: Lower case letter, Latin, Other, U+0048
+U+0069 Letter: Lower case letter, Latin, Other, U+0049
+U+006A Letter: Lower case letter, Latin, Other, U+004A
+U+006B Letter: Lower case letter, Latin, Other, U+004B, U+212A
+U+006C Letter: Lower case letter, Latin, Other, U+004C
+U+006D Letter: Lower case letter, Latin, Other, U+004D
+U+006E Letter: Lower case letter, Latin, Other, U+004E
+U+006F Letter: Lower case letter, Latin, Other, U+004F
findprop 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
-0070 Letter: Lower case letter, Latin, Other, 0050
-0071 Letter: Lower case letter, Latin, Other, 0051
-0072 Letter: Lower case letter, Latin, Other, 0052
-0073 Letter: Lower case letter, Latin, Other, 0053, 017f
-0074 Letter: Lower case letter, Latin, Other, 0054
-0075 Letter: Lower case letter, Latin, Other, 0055
-0076 Letter: Lower case letter, Latin, Other, 0056
-0077 Letter: Lower case letter, Latin, Other, 0057
-0078 Letter: Lower case letter, Latin, Other, 0058
-0079 Letter: Lower case letter, Latin, Other, 0059
-007a Letter: Lower case letter, Latin, Other, 005a
-007b Punctuation: Open punctuation, Common, Other
-007c Symbol: Mathematical symbol, Common, Other
-007d Punctuation: Close punctuation, Common, Other
-007e Symbol: Mathematical symbol, Common, Other
-007f Control: Control, Common, Control
+U+0070 Letter: Lower case letter, Latin, Other, U+0050
+U+0071 Letter: Lower case letter, Latin, Other, U+0051
+U+0072 Letter: Lower case letter, Latin, Other, U+0052
+U+0073 Letter: Lower case letter, Latin, Other, U+0053, U+017F
+U+0074 Letter: Lower case letter, Latin, Other, U+0054
+U+0075 Letter: Lower case letter, Latin, Other, U+0055
+U+0076 Letter: Lower case letter, Latin, Other, U+0056
+U+0077 Letter: Lower case letter, Latin, Other, U+0057
+U+0078 Letter: Lower case letter, Latin, Other, U+0058
+U+0079 Letter: Lower case letter, Latin, Other, U+0059
+U+007A Letter: Lower case letter, Latin, Other, U+005A
+U+007B Punctuation: Open punctuation, Common, Other
+U+007C Symbol: Mathematical symbol, Common, Other
+U+007D Punctuation: Close punctuation, Common, Other
+U+007E Symbol: Mathematical symbol, Common, Other
+U+007F Control: Control, Common, Control
findprop 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f
-0080 Control: Control, Common, Control
-0081 Control: Control, Common, Control
-0082 Control: Control, Common, Control
-0083 Control: Control, Common, Control
-0084 Control: Control, Common, Control
-0085 Control: Control, Common, Control
-0086 Control: Control, Common, Control
-0087 Control: Control, Common, Control
-0088 Control: Control, Common, Control
-0089 Control: Control, Common, Control
-008a Control: Control, Common, Control
-008b Control: Control, Common, Control
-008c Control: Control, Common, Control
-008d Control: Control, Common, Control
-008e Control: Control, Common, Control
-008f Control: Control, Common, Control
+U+0080 Control: Control, Common, Control
+U+0081 Control: Control, Common, Control
+U+0082 Control: Control, Common, Control
+U+0083 Control: Control, Common, Control
+U+0084 Control: Control, Common, Control
+U+0085 Control: Control, Common, Control
+U+0086 Control: Control, Common, Control
+U+0087 Control: Control, Common, Control
+U+0088 Control: Control, Common, Control
+U+0089 Control: Control, Common, Control
+U+008A Control: Control, Common, Control
+U+008B Control: Control, Common, Control
+U+008C Control: Control, Common, Control
+U+008D Control: Control, Common, Control
+U+008E Control: Control, Common, Control
+U+008F Control: Control, Common, Control
findprop 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f
-0090 Control: Control, Common, Control
-0091 Control: Control, Common, Control
-0092 Control: Control, Common, Control
-0093 Control: Control, Common, Control
-0094 Control: Control, Common, Control
-0095 Control: Control, Common, Control
-0096 Control: Control, Common, Control
-0097 Control: Control, Common, Control
-0098 Control: Control, Common, Control
-0099 Control: Control, Common, Control
-009a Control: Control, Common, Control
-009b Control: Control, Common, Control
-009c Control: Control, Common, Control
-009d Control: Control, Common, Control
-009e Control: Control, Common, Control
-009f Control: Control, Common, Control
+U+0090 Control: Control, Common, Control
+U+0091 Control: Control, Common, Control
+U+0092 Control: Control, Common, Control
+U+0093 Control: Control, Common, Control
+U+0094 Control: Control, Common, Control
+U+0095 Control: Control, Common, Control
+U+0096 Control: Control, Common, Control
+U+0097 Control: Control, Common, Control
+U+0098 Control: Control, Common, Control
+U+0099 Control: Control, Common, Control
+U+009A Control: Control, Common, Control
+U+009B Control: Control, Common, Control
+U+009C Control: Control, Common, Control
+U+009D Control: Control, Common, Control
+U+009E Control: Control, Common, Control
+U+009F Control: Control, Common, Control
findprop a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af
-00a0 Separator: Space separator, Common, Other
-00a1 Punctuation: Other punctuation, Common, Other
-00a2 Symbol: Currency symbol, Common, Other
-00a3 Symbol: Currency symbol, Common, Other
-00a4 Symbol: Currency symbol, Common, Other
-00a5 Symbol: Currency symbol, Common, Other
-00a6 Symbol: Other symbol, Common, Other
-00a7 Punctuation: Other punctuation, Common, Other
-00a8 Symbol: Modifier symbol, Common, Other
-00a9 Symbol: Other symbol, Common, Extended Pictographic
-00aa Letter: Other letter, Latin, Other
-00ab Punctuation: Initial punctuation, Common, Other
-00ac Symbol: Mathematical symbol, Common, Other
-00ad Control: Format, Common, Control
-00ae Symbol: Other symbol, Common, Extended Pictographic
-00af Symbol: Modifier symbol, Common, Other
+U+00A0 Separator: Space separator, Common, Other
+U+00A1 Punctuation: Other punctuation, Common, Other
+U+00A2 Symbol: Currency symbol, Common, Other
+U+00A3 Symbol: Currency symbol, Common, Other
+U+00A4 Symbol: Currency symbol, Common, Other
+U+00A5 Symbol: Currency symbol, Common, Other
+U+00A6 Symbol: Other symbol, Common, Other
+U+00A7 Punctuation: Other punctuation, Common, Other
+U+00A8 Symbol: Modifier symbol, Common, Other
+U+00A9 Symbol: Other symbol, Common, Extended Pictographic
+U+00AA Letter: Other letter, Latin, Other
+U+00AB Punctuation: Initial punctuation, Common, Other
+U+00AC Symbol: Mathematical symbol, Common, Other
+U+00AD Control: Format, Common, Control
+U+00AE Symbol: Other symbol, Common, Extended Pictographic
+U+00AF Symbol: Modifier symbol, Common, Other
findprop b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf
-00b0 Symbol: Other symbol, Common, Other
-00b1 Symbol: Mathematical symbol, Common, Other
-00b2 Number: Other number, Common, Other
-00b3 Number: Other number, Common, Other
-00b4 Symbol: Modifier symbol, Common, Other
-00b5 Letter: Lower case letter, Common, Other, 03bc, 039c
-00b6 Punctuation: Other punctuation, Common, Other
-00b7 Punctuation: Other punctuation, Common, Other
-00b8 Symbol: Modifier symbol, Common, Other
-00b9 Number: Other number, Common, Other
-00ba Letter: Other letter, Latin, Other
-00bb Punctuation: Final punctuation, Common, Other
-00bc Number: Other number, Common, Other
-00bd Number: Other number, Common, Other
-00be Number: Other number, Common, Other
-00bf Punctuation: Other punctuation, Common, Other
+U+00B0 Symbol: Other symbol, Common, Other
+U+00B1 Symbol: Mathematical symbol, Common, Other
+U+00B2 Number: Other number, Common, Other
+U+00B3 Number: Other number, Common, Other
+U+00B4 Symbol: Modifier symbol, Common, Other
+U+00B5 Letter: Lower case letter, Common, Other, U+03BC, U+039C
+U+00B6 Punctuation: Other punctuation, Common, Other
+U+00B7 Punctuation: Other punctuation, Common, Other
+U+00B8 Symbol: Modifier symbol, Common, Other
+U+00B9 Number: Other number, Common, Other
+U+00BA Letter: Other letter, Latin, Other
+U+00BB Punctuation: Final punctuation, Common, Other
+U+00BC Number: Other number, Common, Other
+U+00BD Number: Other number, Common, Other
+U+00BE Number: Other number, Common, Other
+U+00BF Punctuation: Other punctuation, Common, Other
findprop c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
-00c0 Letter: Upper case letter, Latin, Other, 00e0
-00c1 Letter: Upper case letter, Latin, Other, 00e1
-00c2 Letter: Upper case letter, Latin, Other, 00e2
-00c3 Letter: Upper case letter, Latin, Other, 00e3
-00c4 Letter: Upper case letter, Latin, Other, 00e4
-00c5 Letter: Upper case letter, Latin, Other, 00e5, 212b
-00c6 Letter: Upper case letter, Latin, Other, 00e6
-00c7 Letter: Upper case letter, Latin, Other, 00e7
-00c8 Letter: Upper case letter, Latin, Other, 00e8
-00c9 Letter: Upper case letter, Latin, Other, 00e9
-00ca Letter: Upper case letter, Latin, Other, 00ea
-00cb Letter: Upper case letter, Latin, Other, 00eb
-00cc Letter: Upper case letter, Latin, Other, 00ec
-00cd Letter: Upper case letter, Latin, Other, 00ed
-00ce Letter: Upper case letter, Latin, Other, 00ee
-00cf Letter: Upper case letter, Latin, Other, 00ef
+U+00C0 Letter: Upper case letter, Latin, Other, U+00E0
+U+00C1 Letter: Upper case letter, Latin, Other, U+00E1
+U+00C2 Letter: Upper case letter, Latin, Other, U+00E2
+U+00C3 Letter: Upper case letter, Latin, Other, U+00E3
+U+00C4 Letter: Upper case letter, Latin, Other, U+00E4
+U+00C5 Letter: Upper case letter, Latin, Other, U+00E5, U+212B
+U+00C6 Letter: Upper case letter, Latin, Other, U+00E6
+U+00C7 Letter: Upper case letter, Latin, Other, U+00E7
+U+00C8 Letter: Upper case letter, Latin, Other, U+00E8
+U+00C9 Letter: Upper case letter, Latin, Other, U+00E9
+U+00CA Letter: Upper case letter, Latin, Other, U+00EA
+U+00CB Letter: Upper case letter, Latin, Other, U+00EB
+U+00CC Letter: Upper case letter, Latin, Other, U+00EC
+U+00CD Letter: Upper case letter, Latin, Other, U+00ED
+U+00CE Letter: Upper case letter, Latin, Other, U+00EE
+U+00CF Letter: Upper case letter, Latin, Other, U+00EF
findprop d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df
-00d0 Letter: Upper case letter, Latin, Other, 00f0
-00d1 Letter: Upper case letter, Latin, Other, 00f1
-00d2 Letter: Upper case letter, Latin, Other, 00f2
-00d3 Letter: Upper case letter, Latin, Other, 00f3
-00d4 Letter: Upper case letter, Latin, Other, 00f4
-00d5 Letter: Upper case letter, Latin, Other, 00f5
-00d6 Letter: Upper case letter, Latin, Other, 00f6
-00d7 Symbol: Mathematical symbol, Common, Other
-00d8 Letter: Upper case letter, Latin, Other, 00f8
-00d9 Letter: Upper case letter, Latin, Other, 00f9
-00da Letter: Upper case letter, Latin, Other, 00fa
-00db Letter: Upper case letter, Latin, Other, 00fb
-00dc Letter: Upper case letter, Latin, Other, 00fc
-00dd Letter: Upper case letter, Latin, Other, 00fd
-00de Letter: Upper case letter, Latin, Other, 00fe
-00df Letter: Lower case letter, Latin, Other, 1e9e
+U+00D0 Letter: Upper case letter, Latin, Other, U+00F0
+U+00D1 Letter: Upper case letter, Latin, Other, U+00F1
+U+00D2 Letter: Upper case letter, Latin, Other, U+00F2
+U+00D3 Letter: Upper case letter, Latin, Other, U+00F3
+U+00D4 Letter: Upper case letter, Latin, Other, U+00F4
+U+00D5 Letter: Upper case letter, Latin, Other, U+00F5
+U+00D6 Letter: Upper case letter, Latin, Other, U+00F6
+U+00D7 Symbol: Mathematical symbol, Common, Other
+U+00D8 Letter: Upper case letter, Latin, Other, U+00F8
+U+00D9 Letter: Upper case letter, Latin, Other, U+00F9
+U+00DA Letter: Upper case letter, Latin, Other, U+00FA
+U+00DB Letter: Upper case letter, Latin, Other, U+00FB
+U+00DC Letter: Upper case letter, Latin, Other, U+00FC
+U+00DD Letter: Upper case letter, Latin, Other, U+00FD
+U+00DE Letter: Upper case letter, Latin, Other, U+00FE
+U+00DF Letter: Lower case letter, Latin, Other, U+1E9E
findprop e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef
-00e0 Letter: Lower case letter, Latin, Other, 00c0
-00e1 Letter: Lower case letter, Latin, Other, 00c1
-00e2 Letter: Lower case letter, Latin, Other, 00c2
-00e3 Letter: Lower case letter, Latin, Other, 00c3
-00e4 Letter: Lower case letter, Latin, Other, 00c4
-00e5 Letter: Lower case letter, Latin, Other, 00c5, 212b
-00e6 Letter: Lower case letter, Latin, Other, 00c6
-00e7 Letter: Lower case letter, Latin, Other, 00c7
-00e8 Letter: Lower case letter, Latin, Other, 00c8
-00e9 Letter: Lower case letter, Latin, Other, 00c9
-00ea Letter: Lower case letter, Latin, Other, 00ca
-00eb Letter: Lower case letter, Latin, Other, 00cb
-00ec Letter: Lower case letter, Latin, Other, 00cc
-00ed Letter: Lower case letter, Latin, Other, 00cd
-00ee Letter: Lower case letter, Latin, Other, 00ce
-00ef Letter: Lower case letter, Latin, Other, 00cf
+U+00E0 Letter: Lower case letter, Latin, Other, U+00C0
+U+00E1 Letter: Lower case letter, Latin, Other, U+00C1
+U+00E2 Letter: Lower case letter, Latin, Other, U+00C2
+U+00E3 Letter: Lower case letter, Latin, Other, U+00C3
+U+00E4 Letter: Lower case letter, Latin, Other, U+00C4
+U+00E5 Letter: Lower case letter, Latin, Other, U+00C5, U+212B
+U+00E6 Letter: Lower case letter, Latin, Other, U+00C6
+U+00E7 Letter: Lower case letter, Latin, Other, U+00C7
+U+00E8 Letter: Lower case letter, Latin, Other, U+00C8
+U+00E9 Letter: Lower case letter, Latin, Other, U+00C9
+U+00EA Letter: Lower case letter, Latin, Other, U+00CA
+U+00EB Letter: Lower case letter, Latin, Other, U+00CB
+U+00EC Letter: Lower case letter, Latin, Other, U+00CC
+U+00ED Letter: Lower case letter, Latin, Other, U+00CD
+U+00EE Letter: Lower case letter, Latin, Other, U+00CE
+U+00EF Letter: Lower case letter, Latin, Other, U+00CF
findprop f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
-00f0 Letter: Lower case letter, Latin, Other, 00d0
-00f1 Letter: Lower case letter, Latin, Other, 00d1
-00f2 Letter: Lower case letter, Latin, Other, 00d2
-00f3 Letter: Lower case letter, Latin, Other, 00d3
-00f4 Letter: Lower case letter, Latin, Other, 00d4
-00f5 Letter: Lower case letter, Latin, Other, 00d5
-00f6 Letter: Lower case letter, Latin, Other, 00d6
-00f7 Symbol: Mathematical symbol, Common, Other
-00f8 Letter: Lower case letter, Latin, Other, 00d8
-00f9 Letter: Lower case letter, Latin, Other, 00d9
-00fa Letter: Lower case letter, Latin, Other, 00da
-00fb Letter: Lower case letter, Latin, Other, 00db
-00fc Letter: Lower case letter, Latin, Other, 00dc
-00fd Letter: Lower case letter, Latin, Other, 00dd
-00fe Letter: Lower case letter, Latin, Other, 00de
-00ff Letter: Lower case letter, Latin, Other, 0178
+U+00F0 Letter: Lower case letter, Latin, Other, U+00D0
+U+00F1 Letter: Lower case letter, Latin, Other, U+00D1
+U+00F2 Letter: Lower case letter, Latin, Other, U+00D2
+U+00F3 Letter: Lower case letter, Latin, Other, U+00D3
+U+00F4 Letter: Lower case letter, Latin, Other, U+00D4
+U+00F5 Letter: Lower case letter, Latin, Other, U+00D5
+U+00F6 Letter: Lower case letter, Latin, Other, U+00D6
+U+00F7 Symbol: Mathematical symbol, Common, Other
+U+00F8 Letter: Lower case letter, Latin, Other, U+00D8
+U+00F9 Letter: Lower case letter, Latin, Other, U+00D9
+U+00FA Letter: Lower case letter, Latin, Other, U+00DA
+U+00FB Letter: Lower case letter, Latin, Other, U+00DB
+U+00FC Letter: Lower case letter, Latin, Other, U+00DC
+U+00FD Letter: Lower case letter, Latin, Other, U+00DD
+U+00FE Letter: Lower case letter, Latin, Other, U+00DE
+U+00FF Letter: Lower case letter, Latin, Other, U+0178
findprop 0100 0101 0102 0103 0104 0105 0106
-0100 Letter: Upper case letter, Latin, Other, 0101
-0101 Letter: Lower case letter, Latin, Other, 0100
-0102 Letter: Upper case letter, Latin, Other, 0103
-0103 Letter: Lower case letter, Latin, Other, 0102
-0104 Letter: Upper case letter, Latin, Other, 0105
-0105 Letter: Lower case letter, Latin, Other, 0104
-0106 Letter: Upper case letter, Latin, Other, 0107
+U+0100 Letter: Upper case letter, Latin, Other, U+0101
+U+0101 Letter: Lower case letter, Latin, Other, U+0100
+U+0102 Letter: Upper case letter, Latin, Other, U+0103
+U+0103 Letter: Lower case letter, Latin, Other, U+0102
+U+0104 Letter: Upper case letter, Latin, Other, U+0105
+U+0105 Letter: Lower case letter, Latin, Other, U+0104
+U+0106 Letter: Upper case letter, Latin, Other, U+0107
findprop ffe0 ffe1 ffe2 ffe3 ffe4 ffe5 ffe6 ffe7
-ffe0 Symbol: Currency symbol, Common, Other
-ffe1 Symbol: Currency symbol, Common, Other
-ffe2 Symbol: Mathematical symbol, Common, Other
-ffe3 Symbol: Modifier symbol, Common, Other
-ffe4 Symbol: Other symbol, Common, Other
-ffe5 Symbol: Currency symbol, Common, Other
-ffe6 Symbol: Currency symbol, Common, Other
-ffe7 Control: Unassigned, Unknown, Other
+U+FFE0 Symbol: Currency symbol, Common, Other
+U+FFE1 Symbol: Currency symbol, Common, Other
+U+FFE2 Symbol: Mathematical symbol, Common, Other
+U+FFE3 Symbol: Modifier symbol, Common, Other
+U+FFE4 Symbol: Other symbol, Common, Other
+U+FFE5 Symbol: Currency symbol, Common, Other
+U+FFE6 Symbol: Currency symbol, Common, Other
+U+FFE7 Control: Unassigned, Unknown, Other
findprop ffe8 ffe9 ffea ffeb ffec ffed ffee ffef
-ffe8 Symbol: Other symbol, Common, Other
-ffe9 Symbol: Mathematical symbol, Common, Other
-ffea Symbol: Mathematical symbol, Common, Other
-ffeb Symbol: Mathematical symbol, Common, Other
-ffec Symbol: Mathematical symbol, Common, Other
-ffed Symbol: Other symbol, Common, Other
-ffee Symbol: Other symbol, Common, Other
-ffef Control: Unassigned, Unknown, Other
+U+FFE8 Symbol: Other symbol, Common, Other
+U+FFE9 Symbol: Mathematical symbol, Common, Other
+U+FFEA Symbol: Mathematical symbol, Common, Other
+U+FFEB Symbol: Mathematical symbol, Common, Other
+U+FFEC Symbol: Mathematical symbol, Common, Other
+U+FFED Symbol: Other symbol, Common, Other
+U+FFEE Symbol: Other symbol, Common, Other
+U+FFEF Control: Unassigned, Unknown, Other
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
-fff8 Control: Unassigned, Unknown, Control
-fff9 Control: Format, Common, Control
-fffa Control: Format, Common, Control
-fffb Control: Format, Common, Control
-fffc Symbol: Other symbol, Common, Other
-fffd Symbol: Other symbol, Common, Other
-fffe Control: Unassigned, Unknown, Other
-ffff Control: Unassigned, Unknown, Other
+U+FFF8 Control: Unassigned, Unknown, Control
+U+FFF9 Control: Format, Common, Control
+U+FFFA Control: Format, Common, Control
+U+FFFB Control: Format, Common, Control
+U+FFFC Symbol: Other symbol, Common, Other
+U+FFFD Symbol: Other symbol, Common, Other
+U+FFFE Control: Unassigned, Unknown, Other
+U+FFFF Control: Unassigned, Unknown, Other
findprop 10000 10001 e01ef f0000 100000
-10000 Letter: Other letter, Linear_B, Other
-10001 Letter: Other letter, Linear_B, Other
-e01ef Mark: Non-spacing mark, Inherited, Extend
-f0000 Control: Private use, Unknown, Other
-100000 Control: Private use, Unknown, Other
+U+10000 Letter: Other letter, Linear_B, Other
+U+10001 Letter: Other letter, Linear_B, Other
+U+E01EF Mark: Non-spacing mark, Inherited, Extend
+U+F0000 Control: Private use, Unknown, Other
+U+100000 Control: Private use, Unknown, Other
findprop 1b00 12000 7c0 a840 10900
-1b00 Mark: Non-spacing mark, Balinese, Extend
-12000 Letter: Other letter, Cuneiform, Other
-07c0 Number: Decimal number, Nko, Other
-a840 Letter: Other letter, Phags_Pa, Other
-10900 Letter: Other letter, Phoenician, Other
+U+1B00 Mark: Non-spacing mark, Balinese, Extend
+U+12000 Letter: Other letter, Cuneiform, Other
+U+07C0 Number: Decimal number, Nko, Other
+U+A840 Letter: Other letter, Phags_Pa, Other
+U+10900 Letter: Other letter, Phoenician, Other
findprop 1d79 a77d
-1d79 Letter: Lower case letter, Latin, Other, a77d
-a77d Letter: Upper case letter, Latin, Other, 1d79
+U+1D79 Letter: Lower case letter, Latin, Other, U+A77D
+U+A77D Letter: Upper case letter, Latin, Other, U+1D79
findprop 0800 083e a4d0 a4f7 aa80 aadf
-0800 Letter: Other letter, Samaritan, Other
-083e Punctuation: Other punctuation, Samaritan, Other
-a4d0 Letter: Other letter, Lisu, Other
-a4f7 Letter: Other letter, Lisu, Other
-aa80 Letter: Other letter, Tai_Viet, Other
-aadf Punctuation: Other punctuation, Tai_Viet, Other
+U+0800 Letter: Other letter, Samaritan, Other
+U+083E Punctuation: Other punctuation, Samaritan, Other
+U+A4D0 Letter: Other letter, Lisu, Other
+U+A4F7 Letter: Other letter, Lisu, Other
+U+AA80 Letter: Other letter, Tai_Viet, Other
+U+AADF Punctuation: Other punctuation, Tai_Viet, Other
findprop 10b00 10b35 13000 1342e 10840 10855
-10b00 Letter: Other letter, Avestan, Other
-10b35 Letter: Other letter, Avestan, Other
-13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
-1342e Letter: Other letter, Egyptian_Hieroglyphs, Other
-10840 Letter: Other letter, Imperial_Aramaic, Other
-10855 Letter: Other letter, Imperial_Aramaic, Other
+U+10B00 Letter: Other letter, Avestan, Other
+U+10B35 Letter: Other letter, Avestan, Other
+U+13000 Letter: Other letter, Egyptian_Hieroglyphs, Other
+U+1342E Letter: Other letter, Egyptian_Hieroglyphs, Other
+U+10840 Letter: Other letter, Imperial_Aramaic, Other
+U+10855 Letter: Other letter, Imperial_Aramaic, Other
findprop 11100 1113c 11680 116c0
-11100 Mark: Non-spacing mark, Chakma, Extend
-1113c Number: Decimal number, Chakma, Other
-11680 Letter: Other letter, Takri, Other
-116c0 Number: Decimal number, Takri, Other
+U+11100 Mark: Non-spacing mark, Chakma, Extend
+U+1113C Number: Decimal number, Chakma, Other
+U+11680 Letter: Other letter, Takri, Other
+U+116C0 Number: Decimal number, Takri, Other
findprop 0d 0a 0e 0711 1b04 1111 1169 11fe ae4c ad89
-000d Control: Control, Common, CR
-000a Control: Control, Common, LF
-000e Control: Control, Common, Control
-0711 Mark: Non-spacing mark, Syriac, Extend
-1b04 Mark: Spacing mark, Balinese, SpacingMark
-1111 Letter: Other letter, Hangul, Hangul syllable type L
-1169 Letter: Other letter, Hangul, Hangul syllable type V
-11fe Letter: Other letter, Hangul, Hangul syllable type T
-ae4c Letter: Other letter, Hangul, Hangul syllable type LV
-ad89 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+000D Control: Control, Common, CR
+U+000A Control: Control, Common, LF
+U+000E Control: Control, Common, Control
+U+0711 Mark: Non-spacing mark, Syriac, Extend
+U+1B04 Mark: Spacing mark, Balinese, SpacingMark
+U+1111 Letter: Other letter, Hangul, Hangul syllable type L
+U+1169 Letter: Other letter, Hangul, Hangul syllable type V
+U+11FE Letter: Other letter, Hangul, Hangul syllable type T
+U+AE4C Letter: Other letter, Hangul, Hangul syllable type LV
+U+AD89 Letter: Other letter, Hangul, Hangul syllable type LVT
findprop 118a0 11ac7 16ad0
-118a0 Letter: Upper case letter, Warang_Citi, Other, 118c0
-11ac7 Letter: Other letter, Pau_Cin_Hau, Other
-16ad0 Letter: Other letter, Bassa_Vah, Other
+U+118A0 Letter: Upper case letter, Warang_Citi, Other, U+118C0
+U+11AC7 Letter: Other letter, Pau_Cin_Hau, Other
+U+16AD0 Letter: Other letter, Bassa_Vah, Other
findprop 11700 14400 108e0 11280 1d800
-11700 Letter: Other letter, Ahom, Other
-14400 Letter: Other letter, Anatolian_Hieroglyphs, Other
-108e0 Letter: Other letter, Hatran, Other
-11280 Letter: Other letter, Multani, Other
-1d800 Symbol: Other symbol, SignWriting, Other
+U+11700 Letter: Other letter, Ahom, Other
+U+14400 Letter: Other letter, Anatolian_Hieroglyphs, Other
+U+108E0 Letter: Other letter, Hatran, Other
+U+11280 Letter: Other letter, Multani, Other
+U+1D800 Symbol: Other symbol, SignWriting, Other
findprop 11800 1e903 11da9 10d27 11ee0 16e48 10f27 10f30
-11800 Letter: Other letter, Dogra, Other
-1e903 Letter: Upper case letter, Adlam, Other, 1e925
-11da9 Number: Decimal number, Gunjala_Gondi, Other
-10d27 Mark: Non-spacing mark, Hanifi_Rohingya, Extend
-11ee0 Letter: Other letter, Makasar, Other
-16e48 Letter: Upper case letter, Medefaidrin, Other, 16e68
-10f27 Letter: Other letter, Old_Sogdian, Other
-10f30 Letter: Other letter, Sogdian, Other
+U+11800 Letter: Other letter, Dogra, Other
+U+1E903 Letter: Upper case letter, Adlam, Other, U+1E925
+U+11DA9 Number: Decimal number, Gunjala_Gondi, Other
+U+10D27 Mark: Non-spacing mark, Hanifi_Rohingya, Extend
+U+11EE0 Letter: Other letter, Makasar, Other
+U+16E48 Letter: Upper case letter, Medefaidrin, Other, U+16E68
+U+10F27 Letter: Other letter, Old_Sogdian, Other
+U+10F30 Letter: Other letter, Sogdian, Other
findprop a836 a833 1cf4 20f0 1cd0
-a836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
-a833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta]
-1cf4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
-20f0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
-1cd0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
+U+A836 Symbol: Other symbol, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kaithi, Mahajani, Modi, Khudawadi, Takri, Tirhuta]
+U+A833 Number: Other number, Common, Other, [Devanagari, Dogra, Gujarati, Gurmukhi, Khojki, Kannada, Kaithi, Mahajani, Modi, Nandinagari, Khudawadi, Takri, Tirhuta]
+U+1CF4 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Kannada]
+U+20F0 Mark: Non-spacing mark, Inherited, Extend, [Devanagari, Grantha, Latin]
+U+1CD0 Mark: Non-spacing mark, Inherited, Extend, [Bengali, Devanagari, Grantha, Kannada]
findprop 32ff
-32ff Symbol: Other symbol, Common, Other, [Han]
+U+32FF Symbol: Other symbol, Common, Other, [Han]
findprop 1f16d
-1f16d Symbol: Other symbol, Common, Extended Pictographic
+U+1F16D Symbol: Other symbol, Common, Extended Pictographic
-findprop 10e93 10eaa
-10e93 Letter: Other letter, Yezidi, Other
-10eaa Control: Unassigned, Unknown, Other
+findprop U+10e93 U+10eaa
+U+10E93 Letter: Other letter, Yezidi, Other
+U+10EAA Control: Unassigned, Unknown, Other
diff --git a/maint/ucptestdata/testoutput2 b/maint/ucptestdata/testoutput2
new file mode 100644
index 0000000..b0689f4
--- /dev/null
+++ b/maint/ucptestdata/testoutput2
@@ -0,0 +1,188 @@
+find script Han
+U+2E80..U+2E99 Symbol: Other symbol, Han, Other
+U+2E9B..U+2EF3 Symbol: Other symbol, Han, Other
+U+2F00..U+2FD5 Symbol: Other symbol, Han, Other
+ U+3005 Letter: Modifier letter, Han, Other
+ U+3007 Number: Letter number, Han, Other
+U+3021..U+3029 Number: Letter number, Han, Other
+U+3038..U+303A Number: Letter number, Han, Other
+ U+303B Letter: Modifier letter, Han, Other
+U+3400..U+4DBF Letter: Other letter, Han, Other
+U+4E00..U+9FFC Letter: Other letter, Han, Other
+U+F900..U+FA6D Letter: Other letter, Han, Other
+U+FA70..U+FAD9 Letter: Other letter, Han, Other
+U+16FF0..U+16FF1 Mark: Spacing mark, Han, SpacingMark
+U+20000..U+2A6DD Letter: Other letter, Han, Other
+U+2A700..U+2B734 Letter: Other letter, Han, Other
+U+2B740..U+2B81D Letter: Other letter, Han, Other
+U+2B820..U+2CEA1 Letter: Other letter, Han, Other
+U+2CEB0..U+2EBE0 Letter: Other letter, Han, Other
+U+2F800..U+2FA1D Letter: Other letter, Han, Other
+U+30000..U+3134A Letter: Other letter, Han, Other
+find type Pe script Common scriptx Hangul
+U+3009 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+U+300B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+U+300D Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+U+300F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+U+3011 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+U+3015 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+U+3017 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+U+3019 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+U+301B Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+U+301E..U+301F Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
+ U+FF63 Punctuation: Close punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana, Yi]
+find type Sk
+U+005E Symbol: Modifier symbol, Common, Other
+U+0060 Symbol: Modifier symbol, Common, Other
+U+00A8 Symbol: Modifier symbol, Common, Other
+U+00AF Symbol: Modifier symbol, Common, Other
+U+00B4 Symbol: Modifier symbol, Common, Other
+U+00B8 Symbol: Modifier symbol, Common, Other
+U+02C2..U+02C5 Symbol: Modifier symbol, Common, Other
+U+02D2..U+02DF Symbol: Modifier symbol, Common, Other
+U+02E5..U+02E9 Symbol: Modifier symbol, Common, Other
+U+02EA..U+02EB Symbol: Modifier symbol, Bopomofo, Other
+ U+02ED Symbol: Modifier symbol, Common, Other
+U+02EF..U+02FF Symbol: Modifier symbol, Common, Other
+ U+0375 Symbol: Modifier symbol, Greek, Other
+ U+0384 Symbol: Modifier symbol, Greek, Other
+ U+0385 Symbol: Modifier symbol, Common, Other
+ U+1FBD Symbol: Modifier symbol, Greek, Other
+U+1FBF..U+1FC1 Symbol: Modifier symbol, Greek, Other
+U+1FCD..U+1FCF Symbol: Modifier symbol, Greek, Other
+U+1FDD..U+1FDF Symbol: Modifier symbol, Greek, Other
+U+1FED..U+1FEF Symbol: Modifier symbol, Greek, Other
+U+1FFD..U+1FFE Symbol: Modifier symbol, Greek, Other
+U+309B..U+309C Symbol: Modifier symbol, Common, Other, [Hiragana, Katakana]
+U+A700..U+A707 Symbol: Modifier symbol, Common, Other, [Han, Latin]
+U+A708..U+A716 Symbol: Modifier symbol, Common, Other
+U+A720..U+A721 Symbol: Modifier symbol, Common, Other
+U+A789..U+A78A Symbol: Modifier symbol, Common, Other
+ U+AB5B Symbol: Modifier symbol, Common, Other
+U+AB6A..U+AB6B Symbol: Modifier symbol, Common, Other
+U+FBB2..U+FBC1 Symbol: Modifier symbol, Arabic, Other
+ U+FF3E Symbol: Modifier symbol, Common, Other
+ U+FF40 Symbol: Modifier symbol, Common, Other
+ U+FFE3 Symbol: Modifier symbol, Common, Other
+U+1F3FB..U+1F3FF Symbol: Modifier symbol, Common, Extend
+find type Pd
+U+002D Punctuation: Dash punctuation, Common, Other
+U+058A Punctuation: Dash punctuation, Armenian, Other
+U+05BE Punctuation: Dash punctuation, Hebrew, Other
+U+1400 Punctuation: Dash punctuation, Canadian_Aboriginal, Other
+U+1806 Punctuation: Dash punctuation, Mongolian, Other
+U+2010..U+2015 Punctuation: Dash punctuation, Common, Other
+ U+2E17 Punctuation: Dash punctuation, Common, Other
+ U+2E1A Punctuation: Dash punctuation, Common, Other
+U+2E3A..U+2E3B Punctuation: Dash punctuation, Common, Other
+ U+2E40 Punctuation: Dash punctuation, Common, Other
+ U+301C Punctuation: Dash punctuation, Common, Other, [Bopomofo, Hangul, Han, Hiragana, Katakana]
+ U+3030 Punctuation: Dash punctuation, Common, Extended Pictographic, [Bopomofo, Hangul, Han, Hiragana, Katakana]
+ U+30A0 Punctuation: Dash punctuation, Common, Other, [Hiragana, Katakana]
+U+FE31..U+FE32 Punctuation: Dash punctuation, Common, Other
+ U+FE58 Punctuation: Dash punctuation, Common, Other
+ U+FE63 Punctuation: Dash punctuation, Common, Other
+ U+FF0D Punctuation: Dash punctuation, Common, Other
+ U+10EAD Punctuation: Dash punctuation, Yezidi, Other
+find gbreak LVT
+U+AC01..U+AC1B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AC1D..U+AC37 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AC39..U+AC53 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AC55..U+AC6F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AC71..U+AC8B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AC8D..U+ACA7 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+ACA9..U+ACC3 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+ACC5..U+ACDF Letter: Other letter, Hangul, Hangul syllable type LVT
+U+ACE1..U+ACFB Letter: Other letter, Hangul, Hangul syllable type LVT
+U+ACFD..U+AD17 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AD19..U+AD33 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AD35..U+AD4F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AD51..U+AD6B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AD6D..U+AD87 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AD89..U+ADA3 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+ADA5..U+ADBF Letter: Other letter, Hangul, Hangul syllable type LVT
+U+ADC1..U+ADDB Letter: Other letter, Hangul, Hangul syllable type LVT
+U+ADDD..U+ADF7 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+ADF9..U+AE13 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AE15..U+AE2F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AE31..U+AE4B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AE4D..U+AE67 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AE69..U+AE83 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AE85..U+AE9F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AEA1..U+AEBB Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AEBD..U+AED7 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AED9..U+AEF3 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AEF5..U+AF0F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AF11..U+AF2B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AF2D..U+AF47 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AF49..U+AF63 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AF65..U+AF7F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AF81..U+AF9B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AF9D..U+AFB7 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AFB9..U+AFD3 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AFD5..U+AFEF Letter: Other letter, Hangul, Hangul syllable type LVT
+U+AFF1..U+B00B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B00D..U+B027 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B029..U+B043 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B045..U+B05F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B061..U+B07B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B07D..U+B097 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B099..U+B0B3 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B0B5..U+B0CF Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B0D1..U+B0EB Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B0ED..U+B107 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B109..U+B123 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B125..U+B13F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B141..U+B15B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B15D..U+B177 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B179..U+B193 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B195..U+B1AF Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B1B1..U+B1CB Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B1CD..U+B1E7 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B1E9..U+B203 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B205..U+B21F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B221..U+B23B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B23D..U+B257 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B259..U+B273 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B275..U+B28F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B291..U+B2AB Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B2AD..U+B2C7 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B2C9..U+B2E3 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B2E5..U+B2FF Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B301..U+B31B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B31D..U+B337 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B339..U+B353 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B355..U+B36F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B371..U+B38B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B38D..U+B3A7 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B3A9..U+B3C3 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B3C5..U+B3DF Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B3E1..U+B3FB Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B3FD..U+B417 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B419..U+B433 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B435..U+B44F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B451..U+B46B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B46D..U+B487 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B489..U+B4A3 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B4A5..U+B4BF Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B4C1..U+B4DB Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B4DD..U+B4F7 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B4F9..U+B513 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B515..U+B52F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B531..U+B54B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B54D..U+B567 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B569..U+B583 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B585..U+B59F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B5A1..U+B5BB Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B5BD..U+B5D7 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B5D9..U+B5F3 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B5F5..U+B60F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B611..U+B62B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B62D..U+B647 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B649..U+B663 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B665..U+B67F Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B681..U+B69B Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B69D..U+B6B7 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B6B9..U+B6D3 Letter: Other letter, Hangul, Hangul syllable type LVT
+U+B6D5..U+B6EF Letter: Other letter, Hangul, Hangul syllable type LVT
+...
diff --git a/maint/utf8.c b/maint/utf8.c
index 9ac6dc5..bc11a50 100644
--- a/maint/utf8.c
+++ b/maint/utf8.c
@@ -1,29 +1,46 @@
-/* A test program for converting characters to UTF-8 and vice versa. Note that
-this program conforms to the original definition of UTF-8, which allows
-codepoints up to 7fffffff. The more recent definition limits the validity of
-UTF-8 codepoints to a maximum of 10ffffff.
-
-The arguments are either single codepoint values, written as 0xhhhh, for
-conversion to UTF-8, or sequences of hex values, written without 0x and
-optionally including spaces (but such arguments must be quoted), for conversion
+/****************************************************
+* PCRE maintainers' helper program: UTF-8 converter *
+****************************************************/
+
+/* This is a test program for converting character code points to UTF-8 and
+vice versa. Note that this program conforms to the original definition of
+UTF-8, which allows codepoints up to 7fffffff. The more recent definition
+limits the validity of Unicode UTF-8 codepoints to a maximum of 10ffff, and
+forbids the "surrogate" code points. This program now gives warnings for these
+invalid code points.
+
+The arguments are either single code point values written as U+hh.. or 0xhh..
+for conversion to UTF-8, or sequences of hex values, written without 0x and
+optionally including spaces (but such arguments must be quoted), for conversion
from UTF-8 to codepoints. For example:
./utf8 0x1234
-0x00001234 => e1 88 b4
+U+00001234 => e1 88 b4
./utf8 "e1 88 b4"
-0x00001234 <= e1 88 b4
+U+00001234 <= e1 88 b4
-In the second case, a number of characters can be present in one argument:
+In the second case, a number of UTF-8 characters can be present in one
+argument. In other words, each such argument is interpreted (after ignoring
+spaces) as a string of UTF-8 bytes representing a string of characters:
./utf8 "65 e188b4 77"
-0x00000065 <= 65
-0x00001234 <= e1 88 b4
-0x00000077 <= 77
+U+00000065 <= 65
+U+00001234 <= e1 88 b4
+U+00000077 <= 77
-If the option -s is given, the sequence of UTF-bytes is written out between
+If the option -s is given, the sequence of UTF-bytes is written out between
angle brackets at the end of the line. On a UTF-8 terminal, this will show the
-appropriate graphic for the codepoint. */
+appropriate graphic for the code point.
+
+Errors provoke error messages, but the program carries on with the next
+argument. The return code is always zero.
+
+Philip Hazel
+Original creation date: unknown
+Code extended and tidied to avoid compiler warnings: 26 March 2020
+*/
+
#include <stdio.h>
#include <stdlib.h>
@@ -41,47 +58,38 @@ appropriate graphic for the codepoint. */
*/
-static const int utf8_table1[] = {
- 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
+static const unsigned int utf8_table1[] = {
+ 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
static const int utf8_table2[] = {
- 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
-
+ 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+
static const int utf8_table3[] = {
- 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
-
-static const unsigned char utf8_table4[] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,6 };
+ 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
/*************************************************
* Convert character value to UTF-8 *
*************************************************/
-/* This function takes an integer value in the range 0 - 0x7fffffff
-and encodes it as a UTF-8 character in 1 to 6 bytes.
+/* This function takes an unsigned long integer value in the range 0 -
+0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes.
-Arguments:
- cvalue the character value
+Arguments:
+ cvalue the character value
buffer pointer to buffer for result - at least 6 bytes long
-
-Returns: number of characters placed in the buffer
- -1 if input character is negative
- 0 if input character is positive but too big (only when
- int is longer than 32 bits)
+
+Returns: number of bytes placed in the buffer
+ 0 if input code point is too big
*/
-int
-ord2utf8(int cvalue, unsigned char *buffer)
+static size_t
+ord2utf8(unsigned long int cvalue, unsigned char *buffer)
{
-register int i, j;
+size_t i, j;
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
if (cvalue <= utf8_table1[i]) break;
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
-if (cvalue < 0) return -1;
buffer += i;
for (j = i; j > 0; j--)
{
@@ -98,32 +106,59 @@ return i + 1;
* Convert UTF-8 string to value *
*************************************************/
-/* This function takes one or more bytes that represents a UTF-8 character,
-and returns the value of the character.
+/* This function takes one or more bytes that represent a UTF-8 character from
+the start of a string of bytes. It sets *vptr to the character's value, or sets
+*lenptr to the offset of a malformation. For an overlong encoding that works but
+is not the correct (shortest) one, the error offset is just after the last byte.
-Argument:
+Arguments:
buffer a pointer to the byte vector
- vptr a pointer to an int to receive the value
-
-Returns: > 0 => the number of bytes consumed
- -6 to 0 => malformed UTF-8 character at offset = (-return)
+ buffend a pointer to the end of the buffer
+ vptr a pointer to a variable to receive the value
+ lenptr a pointer to a variable to receive the offset when error detected
+
+Returns: > 0 => the number of bytes consumed
+ 0 => invalid UTF-8: first byte missing 0x40 bit
+ -1 => invalid UTF-8: first byte has too many high-order 1-bits
+ -2 => incomplete sequence at end of string
+ -3 => incomplete sequence within string
+ -4 => overlong code sequence
*/
-int
-utf82ord(unsigned char *buffer, int *vptr)
+static int
+utf82ord(unsigned char *buffer, unsigned char *buffend,
+ long unsigned int *vptr, int *lenptr)
{
-int c = *buffer++;
-int d = c;
+unsigned int c = *buffer++;
+unsigned int d = c;
int i, j, s;
-for (i = -1; i < 6; i++) /* i is number of additional bytes */
+/* Check for an ASCII character, or find the number of additional bytes in a
+multibyte character. */
+
+for (i = -1; i < 6; i++)
{
if ((d & 0x80) == 0) break;
d <<= 1;
}
-if (i == -1) { *vptr = c; return 1; } /* ascii character */
-if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
+switch (i)
+ {
+ case -1: /* ASCII character; first byte does not have 0x80 bit */
+ *vptr = c;
+ return 1;
+
+ case 0: /* First byte has 0x80 but is missing 0x40 bit */
+ *lenptr = 0;
+ return 0;
+
+ case 6:
+ *lenptr = 0; /* Too many high bits */
+ return -1;
+
+ default:
+ break;
+ }
/* i now has a value in the range 1-5 */
@@ -132,32 +167,46 @@ d = (c & utf8_table3[i]) << s;
for (j = 0; j < i; j++)
{
+ if (buffer >= buffend)
+ {
+ *lenptr = j + 1;
+ return -2;
+ }
c = *buffer++;
- if ((c & 0xc0) != 0x80) return -(j+1);
+ if ((c & 0xc0) != 0x80)
+ {
+ *lenptr = j + 1;
+ return -3;
+ }
s -= 6;
d |= (c & 0x3f) << s;
}
-/* Check that encoding was the correct unique one */
+/* Valid UTF-8 syntax */
-for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
+*vptr = d;
+
+/* Check that encoding was the correct one, not overlong */
+
+for (j = 0; j < (int)(sizeof(utf8_table1)/sizeof(int)); j++)
if (d <= utf8_table1[j]) break;
-if (j != i) return -(i+1);
+if (j != i)
+ {
+ *lenptr = i + 1;
+ return -4;
+ }
/* Valid value */
-*vptr = d;
-return i+1;
+return i + 1;
}
-
/*************************************************
* Main Program *
*************************************************/
-
int
main(int argc, char **argv)
{
@@ -169,85 +218,129 @@ if (argc > 1 && strcmp(argv[1], "-s") == 0)
{
show = 1;
i = 2;
- }
+ }
for (; i < argc; i++)
{
char *x = argv[i];
- if (strncmp(x, "0x", 2) == 0)
+ char *endptr;
+ if (strncmp(x, "0x", 2) == 0 || strncmp(x, "U+", 2) == 0)
{
- int j;
- int d = strtol(x+2, NULL, 16);
- int rc = ord2utf8(d, buffer);
- printf("0x%08x => ", d);
- if (rc <= 0) printf("*** Error %d ***", rc); else
+ size_t rc, j;
+ unsigned long int d = strtoul(x+2, &endptr, 16);
+ if (*endptr != 0)
+ {
+ printf("** Invalid hex number %s\n", x);
+ continue; /* With next argument */
+ }
+ rc = ord2utf8(d, buffer);
+ printf("U+%08lx => ", d);
+ if (rc == 0)
+ printf("** Code point greater than 0x7fffffff cannot be encoded");
+ else
{
for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
if (show)
{
printf(">");
for (j = 0; j < rc; j++) printf("%c", buffer[j]);
- printf("<");
- }
- }
- printf("\n");
+ printf("< ");
+ }
+ if (d >= 0xd800 && d <= 0xdfff)
+ printf("** Invalid Unicode (surrogate)");
+ else if (d > 0x10ffff)
+ printf("** Invalid Unicode (greater than U+10ffff)");
+ }
+ printf("\n");
}
else
{
- int d, rc;
- int j = 0;
- int y = 0;
- int z = 0;
unsigned char *bptr;
-
- for (;;)
- {
- while (*x == ' ') x++;
+ unsigned char *buffend;
+ int len = 0;
+ int y = 0;
+ int z = 0;
+
+ for (;;)
+ {
+ while (*x == ' ') x++;
if (*x == 0 && !z) break;
- if (!isxdigit(*x))
+ if (!isxdigit(*x))
{
- printf("Malformed hex string: %s\n", argv[i]);
- j = -1;
- break;
- }
+ printf("** Malformed hex string: %s\n", argv[i]);
+ len = -1;
+ break;
+ }
y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : 'W');
- x++;
+ x++;
if (z)
- {
- buffer[j++] = y;
+ {
+ buffer[len++] = y;
y = 0;
}
- z ^= 1;
- }
- buffer[j] = 0;
+ z ^= 1;
+ }
+
+ if (len < 0) continue; /* With next argument after malformation */
+
bptr = buffer;
+ buffend = buffer + len;
+
+ while (bptr < buffend)
+ {
+ unsigned long int d;
+ int j;
+ int offset;
+ int rc = utf82ord(bptr, buffend, &d, &offset);
- while (*bptr != 0)
- {
- rc = utf82ord(bptr, &d);
- if (rc > 0)
+ if (rc > 0)
{
- printf("0x%08x <= ", d);
+ printf("U+%08lx <= ", d);
for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
if (show)
{
printf(">");
for (j = 0; j < rc; j++) printf("%c", bptr[j]);
- printf("<");
- }
+ printf("<");
+ }
printf("\n");
- bptr += rc;
- }
- else
+ bptr += rc;
+ }
+ else if (rc == -4)
+ {
+ printf("U+%08lx <= ", d);
+ for (j = 0; j < offset; j++) printf("%02x ", bptr[j]);
+ printf("** Overlong UTF-8 sequence\n");
+ bptr += offset;
+ }
+ else
{
- printf("Malformed UTF-8 at offset %d <= ", -rc);
- while (*bptr != 0) printf("%02x ", *bptr++);
- printf("\n");
- break;
- }
- }
- }
- }
+ switch (rc)
+ {
+ case 0: printf("** First byte missing 0x40 bit");
+ break;
+
+ case -1: printf("** First byte has too many high-order bits");
+ break;
+
+ case -2: printf("** Incomplete UTF-8 sequence at end of string");
+ break;
+
+ case -3: printf("** Incomplete UTF-8 sequence");
+ break;
+
+ default: printf("** Unexpected return %d from utf82ord()", rc);
+ break;
+ }
+ printf(" at offset %d in string ", offset);
+ while (bptr < buffend) printf("%02x ", *bptr++);
+ printf("\n");
+ break;
+ }
+ }
+ }
+ }
+
return 0;
}