/* Generate Unicode conforming character classification tables and
   line break properties tables and word break property tables and
   decomposition/composition and case mapping tables from a UnicodeData file.
   Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.
   Written by Bruno Haible, 2000-2002.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

/* Usage example:
     $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \
                      /usr/local/share/Unidata/PropList.txt \
                      /usr/local/share/Unidata/DerivedCoreProperties.txt \
                      /usr/local/share/Unidata/Scripts.txt \
                      /usr/local/share/Unidata/Blocks.txt \
                      /usr/local/share/Unidata/PropList-3.0.1.txt \
                      /usr/local/share/Unidata/EastAsianWidth.txt \
                      /usr/local/share/Unidata/LineBreak.txt \
                      /usr/local/share/Unidata/WordBreakProperty.txt \
                      /usr/local/share/Unidata/CompositionExclusions.txt \
                      /usr/local/share/Unidata/SpecialCasing.txt \
                      /usr/local/share/Unidata/CaseFolding.txt \
                      5.1.0
 */

/* NOTE(review): the header names of the six #include directives were
   missing.  The first five are required by the code below (bool, uint32_t,
   FILE/fprintf, exit/strtoul/calloc, strcmp/strdup); <time.h> is assumed
   for the sixth -- confirm against the upstream file.  */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* ========================================================================= */

/* Reading UnicodeData.txt.  */
/* See UCD.html.  */

/* This structure represents one line in the UnicodeData.txt file.
*/ struct unicode_attribute { const char *name; /* Character name */ const char *category; /* General category */ const char *combining; /* Canonical combining class */ const char *bidi; /* Bidirectional category */ const char *decomposition; /* Character decomposition mapping */ const char *decdigit; /* Decimal digit value */ const char *digit; /* Digit value */ const char *numeric; /* Numeric value */ bool mirrored; /* mirrored */ const char *oldname; /* Old Unicode 1.0 name */ const char *comment; /* Comment */ unsigned int upper; /* Uppercase mapping */ unsigned int lower; /* Lowercase mapping */ unsigned int title; /* Titlecase mapping */ }; /* Missing fields are represented with "" for strings, and NONE for characters. */ #define NONE (~(unsigned int)0) /* The entire contents of the UnicodeData.txt file. */ struct unicode_attribute unicode_attributes [0x110000]; /* Stores in unicode_attributes[i] the values from the given fields. */ static void fill_attribute (unsigned int i, const char *field1, const char *field2, const char *field3, const char *field4, const char *field5, const char *field6, const char *field7, const char *field8, const char *field9, const char *field10, const char *field11, const char *field12, const char *field13, const char *field14) { struct unicode_attribute * uni; if (i >= 0x110000) { fprintf (stderr, "index too large\n"); exit (1); } if (strcmp (field2, "Cs") == 0) /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */ return; uni = &unicode_attributes[i]; /* Copy the strings. */ uni->name = strdup (field1); uni->category = (field2[0] == '\0' ? "" : strdup (field2)); uni->combining = (field3[0] == '\0' ? "" : strdup (field3)); uni->bidi = (field4[0] == '\0' ? "" : strdup (field4)); uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5)); uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6)); uni->digit = (field7[0] == '\0' ? "" : strdup (field7)); uni->numeric = (field8[0] == '\0' ? 
"" : strdup (field8)); uni->mirrored = (field9[0] == 'Y'); uni->oldname = (field10[0] == '\0' ? "" : strdup (field10)); uni->comment = (field11[0] == '\0' ? "" : strdup (field11)); uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16)); uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16)); uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16)); } /* Maximum length of a field in the UnicodeData.txt file. */ #define FIELDLEN 120 /* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN. Reads up to (but excluding) DELIM. Returns 1 when a field was successfully read, otherwise 0. */ static int getfield (FILE *stream, char *buffer, int delim) { int count = 0; int c; for (; (c = getc (stream)), (c != EOF && c != delim); ) { /* The original unicode.org UnicodeData.txt file happens to have CR/LF line terminators. Silently convert to LF. */ if (c == '\r') continue; /* Put c into the buffer. */ if (++count >= FIELDLEN - 1) { fprintf (stderr, "field longer than expected, increase FIELDLEN\n"); exit (1); } *buffer++ = c; } if (c == EOF) return 0; *buffer = '\0'; return 1; } /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt file. 
*/
static void
fill_attributes (const char *unicodedata_filename)
{
  /* One buffer per semicolon-separated column of a line: column 0 is the
     code point, columns 1..14 are the arguments of fill_attribute.  */
  enum { NFIELDS = 15 };
  char field[NFIELDS][FIELDLEN];
  FILE *stream;
  unsigned int ch;
  int lineno;

  /* Mark every code point as "no entry".  */
  ch = 0;
  while (ch < 0x110000)
    unicode_attributes[ch++].name = NULL;

  stream = fopen (unicodedata_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
      exit (1);
    }

  for (lineno = 1; ; lineno++)
    {
      const char *name;
      unsigned int last;
      size_t namelen;
      int nread;
      int k;

      /* Always attempt all columns; the sum tells a clean end of file (0)
         apart from a truncated line.  */
      nread = 0;
      for (k = 0; k < NFIELDS; k++)
        nread += getfield (stream, field[k], k + 1 < NFIELDS ? ';' : '\n');
      if (nread == 0)
        break;
      if (nread != NFIELDS)
        {
          fprintf (stderr, "short line in '%s':%d\n",
                   unicodedata_filename, lineno);
          exit (1);
        }

      /* Must be parsed now: a range overwrites field[0] below.  */
      ch = strtoul (field[0], NULL, 16);

      namelen = strlen (field[1]);
      if (field[1][0] == '<'
          && namelen >= 9
          && strcmp (field[1] + namelen - 8, ", First>") == 0)
        {
          /* A "<Name, First>" line opens a range; the next line must be the
             matching "<Name, Last>" line.  */
          lineno++;
          nread = 0;
          for (k = 0; k < NFIELDS; k++)
            nread += getfield (stream, field[k],
                               k + 1 < NFIELDS ? ';' : '\n');
          if (nread != NFIELDS)
            {
              fprintf (stderr, "missing end range in '%s':%d\n",
                       unicodedata_filename, lineno);
              exit (1);
            }
          namelen = strlen (field[1]);
          if (field[1][0] != '<'
              || namelen < 8
              || strcmp (field[1] + namelen - 7, ", Last>") != 0)
            {
              fprintf (stderr, "missing end range in '%s':%d\n",
                       unicodedata_filename, lineno);
              exit (1);
            }
          /* Strip the leading '<' and the trailing ", Last>".  */
          field[1][namelen - 7] = '\0';
          name = field[1] + 1;
          last = strtoul (field[0], NULL, 16);
        }
      else
        {
          /* Single character line: a range of exactly one code point.  */
          name = field[1];
          last = ch;
        }

      for (; ch <= last; ch++)
        fill_attribute (ch, name, field[2], field[3], field[4],
                        field[5], field[6], field[7], field[8],
                        field[9], field[10], field[11], field[12],
                        field[13], field[14]);
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
      exit (1);
    }
}

/* ========================================================================= */

/* General category.  */
/* See Unicode 3.0 book, section 4.5, UCD.html.
*/ static bool is_category_L (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'L'); } static bool is_category_Lu (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'L' && unicode_attributes[ch].category[1] == 'u'); } static bool is_category_Ll (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'L' && unicode_attributes[ch].category[1] == 'l'); } static bool is_category_Lt (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'L' && unicode_attributes[ch].category[1] == 't'); } static bool is_category_Lm (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'L' && unicode_attributes[ch].category[1] == 'm'); } static bool is_category_Lo (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'L' && unicode_attributes[ch].category[1] == 'o'); } static bool is_category_M (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'M'); } static bool is_category_Mn (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'M' && unicode_attributes[ch].category[1] == 'n'); } static bool is_category_Mc (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'M' && unicode_attributes[ch].category[1] == 'c'); } static bool is_category_Me (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'M' && unicode_attributes[ch].category[1] == 'e'); } static bool is_category_N (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'N'); } static bool is_category_Nd (unsigned int ch) { return (unicode_attributes[ch].name != NULL && 
unicode_attributes[ch].category[0] == 'N' && unicode_attributes[ch].category[1] == 'd'); } static bool is_category_Nl (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'N' && unicode_attributes[ch].category[1] == 'l'); } static bool is_category_No (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'N' && unicode_attributes[ch].category[1] == 'o'); } static bool is_category_P (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'P'); } static bool is_category_Pc (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 'c'); } static bool is_category_Pd (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 'd'); } static bool is_category_Ps (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 's'); } static bool is_category_Pe (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 'e'); } static bool is_category_Pi (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 'i'); } static bool is_category_Pf (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 'f'); } static bool is_category_Po (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 'o'); } static bool is_category_S (unsigned int ch) { return (unicode_attributes[ch].name != NULL && 
unicode_attributes[ch].category[0] == 'S'); } static bool is_category_Sm (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'S' && unicode_attributes[ch].category[1] == 'm'); } static bool is_category_Sc (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'S' && unicode_attributes[ch].category[1] == 'c'); } static bool is_category_Sk (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'S' && unicode_attributes[ch].category[1] == 'k'); } static bool is_category_So (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'S' && unicode_attributes[ch].category[1] == 'o'); } static bool is_category_Z (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'Z'); } static bool is_category_Zs (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'Z' && unicode_attributes[ch].category[1] == 's'); } static bool is_category_Zl (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'Z' && unicode_attributes[ch].category[1] == 'l'); } static bool is_category_Zp (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'Z' && unicode_attributes[ch].category[1] == 'p'); } static bool is_category_C (unsigned int ch) { return (unicode_attributes[ch].name == NULL || unicode_attributes[ch].category[0] == 'C'); } static bool is_category_Cc (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'C' && unicode_attributes[ch].category[1] == 'c'); } static bool is_category_Cf (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'C' && unicode_attributes[ch].category[1] == 'f'); } static 
bool is_category_Cs (unsigned int ch) { return (ch >= 0xd800 && ch < 0xe000); } static bool is_category_Co (unsigned int ch) { return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'C' && unicode_attributes[ch].category[1] == 'o'); } static bool is_category_Cn (unsigned int ch) { return (unicode_attributes[ch].name == NULL && !(ch >= 0xd800 && ch < 0xe000)); } /* Output a boolean property in a human readable format. */ static void debug_output_predicate (const char *filename, bool (*predicate) (unsigned int)) { FILE *stream; unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } #if 0 /* This yields huge text output. */ for (ch = 0; ch < 0x110000; ch++) if (predicate (ch)) { fprintf (stream, "0x%04X\n", ch); } #else for (ch = 0; ch < 0x110000; ch++) if (predicate (ch)) { unsigned int first = ch; unsigned int last; while (ch + 1 < 0x110000 && predicate (ch + 1)) ch++; last = ch; if (first < last) fprintf (stream, "0x%04X..0x%04X\n", first, last); else fprintf (stream, "0x%04X\n", ch); } #endif if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* Output the unit test for a boolean property. */ static void output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression) { FILE *stream; bool need_comma; unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! 
*/\n"); fprintf (stream, "/* Test the Unicode character type functions.\n"); fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); fprintf (stream, " (at your option) any later version.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); fprintf (stream, " GNU General Public License for more details.\n"); fprintf (stream, "\n"); fprintf (stream, " You should have received a copy of the GNU General Public License\n"); fprintf (stream, " along with this program. If not, see . */\n"); fprintf (stream, "\n"); fprintf (stream, "#include \"test-predicate-part1.h\"\n"); fprintf (stream, "\n"); need_comma = false; for (ch = 0; ch < 0x110000; ch++) if (predicate (ch)) { unsigned int first = ch; unsigned int last; while (ch + 1 < 0x110000 && predicate (ch + 1)) ch++; last = ch; if (need_comma) fprintf (stream, ",\n"); fprintf (stream, " { 0x%04X, 0x%04X }", first, last); need_comma = true; } if (need_comma) fprintf (stream, "\n"); fprintf (stream, "\n"); fprintf (stream, "#define PREDICATE(c) %s\n", expression); fprintf (stream, "#include \"test-predicate-part2.h\"\n"); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* Construction of sparse 3-level tables. */ #define TABLE predicate_table #define xmalloc malloc #define xrealloc realloc #include "3levelbit.h" /* Output a boolean property in a three-level bitmap. 
*/ static void output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version) { FILE *stream; unsigned int ch, i; struct predicate_table t; unsigned int level1_offset, level2_offset, level3_offset; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* %s of Unicode characters. */\n", comment); fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version); t.p = 4; /* or: 5 */ t.q = 7; /* or: 6 */ predicate_table_init (&t); for (ch = 0; ch < 0x110000; ch++) if (predicate (ch)) predicate_table_add (&t, ch); predicate_table_finalize (&t); /* Offsets in t.result, in memory of this process. */ level1_offset = 5 * sizeof (uint32_t); level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t); level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) if (i != 1) fprintf (stream, "#define header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int header[1];\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p); fprintf (stream, " }\n"); fprintf (stream, "%s =\n", name); fprintf (stream, "{\n"); fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]); fprintf (stream, " {"); if (t.level1_size > 1) fprintf (stream, "\n "); for (i = 0; i < t.level1_size; i++) { uint32_t offset; if (i > 0 && (i % 1) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu * 
sizeof (int) / sizeof (short) + %5zu", 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); } if (t.level1_size > 1) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level2_size << t.q > 1) fprintf (stream, "\n "); for (i = 0; i < t.level2_size << t.q; i++) { uint32_t offset; if (i > 0 && (i % 1) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu", 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); } if (t.level2_size << t.q > 1) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level3_size << t.p > 4) fprintf (stream, "\n "); for (i = 0; i < t.level3_size << t.p; i++) { if (i > 0 && (i % 4) == 0) fprintf (stream, "\n "); fprintf (stream, " 0x%08X", ((uint32_t *) (t.result + level3_offset))[i]); if (i+1 < t.level3_size << t.p) fprintf (stream, ","); } if (t.level3_size << t.p > 4) fprintf (stream, "\n "); fprintf (stream, " }\n"); fprintf (stream, "};\n"); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* Output all categories. 
*/ static void output_categories (const char *version) { #define CATEGORY(C) \ debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \ output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \ output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version); CATEGORY (L) CATEGORY (Lu) CATEGORY (Ll) CATEGORY (Lt) CATEGORY (Lm) CATEGORY (Lo) CATEGORY (M) CATEGORY (Mn) CATEGORY (Mc) CATEGORY (Me) CATEGORY (N) CATEGORY (Nd) CATEGORY (Nl) CATEGORY (No) CATEGORY (P) CATEGORY (Pc) CATEGORY (Pd) CATEGORY (Ps) CATEGORY (Pe) CATEGORY (Pi) CATEGORY (Pf) CATEGORY (Po) CATEGORY (S) CATEGORY (Sm) CATEGORY (Sc) CATEGORY (Sk) CATEGORY (So) CATEGORY (Z) CATEGORY (Zs) CATEGORY (Zl) CATEGORY (Zp) CATEGORY (C) CATEGORY (Cc) CATEGORY (Cf) CATEGORY (Cs) CATEGORY (Co) CATEGORY (Cn) #undef CATEGORY } enum { UC_CATEGORY_MASK_L = 0x0000001f, UC_CATEGORY_MASK_Lu = 0x00000001, UC_CATEGORY_MASK_Ll = 0x00000002, UC_CATEGORY_MASK_Lt = 0x00000004, UC_CATEGORY_MASK_Lm = 0x00000008, UC_CATEGORY_MASK_Lo = 0x00000010, UC_CATEGORY_MASK_M = 0x000000e0, UC_CATEGORY_MASK_Mn = 0x00000020, UC_CATEGORY_MASK_Mc = 0x00000040, UC_CATEGORY_MASK_Me = 0x00000080, UC_CATEGORY_MASK_N = 0x00000700, UC_CATEGORY_MASK_Nd = 0x00000100, UC_CATEGORY_MASK_Nl = 0x00000200, UC_CATEGORY_MASK_No = 0x00000400, UC_CATEGORY_MASK_P = 0x0003f800, UC_CATEGORY_MASK_Pc = 0x00000800, UC_CATEGORY_MASK_Pd = 0x00001000, UC_CATEGORY_MASK_Ps = 0x00002000, UC_CATEGORY_MASK_Pe = 0x00004000, UC_CATEGORY_MASK_Pi = 0x00008000, UC_CATEGORY_MASK_Pf = 0x00010000, UC_CATEGORY_MASK_Po = 0x00020000, UC_CATEGORY_MASK_S = 0x003c0000, UC_CATEGORY_MASK_Sm = 0x00040000, UC_CATEGORY_MASK_Sc = 0x00080000, UC_CATEGORY_MASK_Sk = 0x00100000, UC_CATEGORY_MASK_So = 0x00200000, UC_CATEGORY_MASK_Z = 0x01c00000, UC_CATEGORY_MASK_Zs = 0x00400000, UC_CATEGORY_MASK_Zl = 0x00800000, UC_CATEGORY_MASK_Zp = 0x01000000, UC_CATEGORY_MASK_C = 
0x3e000000, UC_CATEGORY_MASK_Cc = 0x02000000, UC_CATEGORY_MASK_Cf = 0x04000000, UC_CATEGORY_MASK_Cs = 0x08000000, UC_CATEGORY_MASK_Co = 0x10000000, UC_CATEGORY_MASK_Cn = 0x20000000 }; static int general_category_byname (const char *category_name) { if (category_name[0] != '\0' && (category_name[1] == '\0' || category_name[2] == '\0')) switch (category_name[0]) { case 'L': switch (category_name[1]) { case '\0': return UC_CATEGORY_MASK_L; case 'u': return UC_CATEGORY_MASK_Lu; case 'l': return UC_CATEGORY_MASK_Ll; case 't': return UC_CATEGORY_MASK_Lt; case 'm': return UC_CATEGORY_MASK_Lm; case 'o': return UC_CATEGORY_MASK_Lo; } break; case 'M': switch (category_name[1]) { case '\0': return UC_CATEGORY_MASK_M; case 'n': return UC_CATEGORY_MASK_Mn; case 'c': return UC_CATEGORY_MASK_Mc; case 'e': return UC_CATEGORY_MASK_Me; } break; case 'N': switch (category_name[1]) { case '\0': return UC_CATEGORY_MASK_N; case 'd': return UC_CATEGORY_MASK_Nd; case 'l': return UC_CATEGORY_MASK_Nl; case 'o': return UC_CATEGORY_MASK_No; } break; case 'P': switch (category_name[1]) { case '\0': return UC_CATEGORY_MASK_P; case 'c': return UC_CATEGORY_MASK_Pc; case 'd': return UC_CATEGORY_MASK_Pd; case 's': return UC_CATEGORY_MASK_Ps; case 'e': return UC_CATEGORY_MASK_Pe; case 'i': return UC_CATEGORY_MASK_Pi; case 'f': return UC_CATEGORY_MASK_Pf; case 'o': return UC_CATEGORY_MASK_Po; } break; case 'S': switch (category_name[1]) { case '\0': return UC_CATEGORY_MASK_S; case 'm': return UC_CATEGORY_MASK_Sm; case 'c': return UC_CATEGORY_MASK_Sc; case 'k': return UC_CATEGORY_MASK_Sk; case 'o': return UC_CATEGORY_MASK_So; } break; case 'Z': switch (category_name[1]) { case '\0': return UC_CATEGORY_MASK_Z; case 's': return UC_CATEGORY_MASK_Zs; case 'l': return UC_CATEGORY_MASK_Zl; case 'p': return UC_CATEGORY_MASK_Zp; } break; case 'C': switch (category_name[1]) { case '\0': return UC_CATEGORY_MASK_C; case 'c': return UC_CATEGORY_MASK_Cc; case 'f': return UC_CATEGORY_MASK_Cf; case 's': return 
UC_CATEGORY_MASK_Cs; case 'o': return UC_CATEGORY_MASK_Co; case 'n': return UC_CATEGORY_MASK_Cn; } break; } /* Invalid category name. */ abort (); } /* Construction of sparse 3-level tables. */ #define TABLE category_table #define ELEMENT uint8_t #define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */ #define xmalloc malloc #define xrealloc realloc #include "3level.h" /* Output the per-character category table. */ static void output_category (const char *filename, const char *version) { FILE *stream; unsigned int ch, i; struct category_table t; unsigned int level1_offset, level2_offset, level3_offset; uint16_t *level3_packed; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Categories of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version); t.p = 7; t.q = 9; category_table_init (&t); for (ch = 0; ch < 0x110000; ch++) { int value; unsigned int log2_value; if (is_category_Cs (ch)) value = UC_CATEGORY_MASK_Cs; else if (unicode_attributes[ch].name != NULL) value = general_category_byname (unicode_attributes[ch].category); else continue; /* Now value should contain exactly one bit. */ if (value == 0 || ((value & (value - 1)) != 0)) abort (); for (log2_value = 0; value > 1; value >>= 1, log2_value++); category_table_add (&t, ch, log2_value); } category_table_finalize (&t); /* Offsets in t.result, in memory of this process. 
*/ level1_offset = 5 * sizeof (uint32_t); level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t); level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) fprintf (stream, "#define category_header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size, (1 << t.p) * 5 / 16); fprintf (stream, " }\n"); fprintf (stream, "u_category =\n"); fprintf (stream, "{\n"); fprintf (stream, " {"); if (t.level1_size > 8) fprintf (stream, "\n "); for (i = 0; i < t.level1_size; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level2_size << t.q > 8) fprintf (stream, "\n "); for (i = 0; i < t.level2_size << t.q; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units, not 32-bit units, in order to make the lookup function easier. 
*/ level3_packed = (uint16_t *) calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t)); for (i = 0; i < t.level3_size << t.p; i++) { unsigned int j = (i * 5) / 16; unsigned int k = (i * 5) % 16; uint32_t value = ((unsigned char *) (t.result + level3_offset))[i]; value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k); level3_packed[j] = value & 0xffff; level3_packed[j+1] = value >> 16; } fprintf (stream, " {"); if ((t.level3_size << t.p) * 5 / 16 + 1 > 8) fprintf (stream, "\n "); for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++) { if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); fprintf (stream, " 0x%04x", level3_packed[i]); if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1) fprintf (stream, ","); } if ((t.level3_size << t.p) * 5 / 16 + 1 > 8) fprintf (stream, "\n "); fprintf (stream, " }\n"); free (level3_packed); fprintf (stream, "};\n"); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* ========================================================================= */ /* Canonical combining class. */ /* See Unicode 3.0 book, section 4.2, UCD.html. */ /* Construction of sparse 3-level tables. */ #define TABLE combclass_table #define ELEMENT uint8_t #define DEFAULT 0 #define xmalloc malloc #define xrealloc realloc #include "3level.h" /* Output the per-character combining class table. */ static void output_combclass (const char *filename, const char *version) { FILE *stream; unsigned int ch, i; struct combclass_table t; unsigned int level1_offset, level2_offset, level3_offset; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Combining class of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. 
*/\n", version); t.p = 7; t.q = 9; combclass_table_init (&t); for (ch = 0; ch < 0x110000; ch++) if (unicode_attributes[ch].name != NULL) { int value = atoi (unicode_attributes[ch].combining); if (!(value >= 0 && value <= 255)) abort (); combclass_table_add (&t, ch, value); } combclass_table_finalize (&t); /* Offsets in t.result, in memory of this process. */ level1_offset = 5 * sizeof (uint32_t); level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t); level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) fprintf (stream, "#define combclass_header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); fprintf (stream, " }\n"); fprintf (stream, "u_combclass =\n"); fprintf (stream, "{\n"); fprintf (stream, " {"); if (t.level1_size > 8) fprintf (stream, "\n "); for (i = 0; i < t.level1_size; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level2_size << t.q > 8) fprintf (stream, "\n "); for (i = 0; i < t.level2_size << t.q; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); } 
if (t.level2_size << t.q > 8)
    fprintf (stream, "\n ");
  fprintf (stream, " },\n");
  fprintf (stream, " {");
  if (t.level3_size << t.p > 8)
    fprintf (stream, "\n ");
  for (i = 0; i < t.level3_size << t.p; i++)
    {
      if (i > 0 && (i % 8) == 0)
        fprintf (stream, "\n ");
      fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]);
      if (i+1 < t.level3_size << t.p)
        fprintf (stream, ",");
    }
  if (t.level3_size << t.p > 8)
    fprintf (stream, "\n ");
  fprintf (stream, " }\n");
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}

/* ========================================================================= */

/* Bidirectional category. */
/* See Unicode 3.0 book, section 4.3, UCD.html. */

/* Numeric codes of the bidi categories, as stored in the generated table.
   There are 19 enumerators (values 0..18), so every value fits in the
   5 bits per entry that output_bidi_category uses when packing. */
enum
{
  UC_BIDI_L,   /* Left-to-Right */
  UC_BIDI_LRE, /* Left-to-Right Embedding */
  UC_BIDI_LRO, /* Left-to-Right Override */
  UC_BIDI_R,   /* Right-to-Left */
  UC_BIDI_AL,  /* Right-to-Left Arabic */
  UC_BIDI_RLE, /* Right-to-Left Embedding */
  UC_BIDI_RLO, /* Right-to-Left Override */
  UC_BIDI_PDF, /* Pop Directional Format */
  UC_BIDI_EN,  /* European Number */
  UC_BIDI_ES,  /* European Number Separator */
  UC_BIDI_ET,  /* European Number Terminator */
  UC_BIDI_AN,  /* Arabic Number */
  UC_BIDI_CS,  /* Common Number Separator */
  UC_BIDI_NSM, /* Non-Spacing Mark */
  UC_BIDI_BN,  /* Boundary Neutral */
  UC_BIDI_B,   /* Paragraph Separator */
  UC_BIDI_S,   /* Segment Separator */
  UC_BIDI_WS,  /* Whitespace */
  UC_BIDI_ON   /* Other Neutral */
};

/* Maps a bidi category abbreviation ("L", "AL", "NSM", ...) to the
   corresponding UC_BIDI_* value.  The name is matched character by
   character and must end right after the abbreviation.
   Calls abort () when CATEGORY_NAME is not one of the known names.  */
static int
bidi_category_byname (const char *category_name)
{
  switch (category_name[0])
    {
    case 'A':
      switch (category_name[1])
        {
        case 'L':
          if (category_name[2] == '\0')
            return UC_BIDI_AL;
          break;
        case 'N':
          if (category_name[2] == '\0')
            return UC_BIDI_AN;
          break;
        }
      break;
    case 'B':
      switch (category_name[1])
        {
        case '\0':
          return UC_BIDI_B;
        case 'N':
          if (category_name[2] == '\0')
            return UC_BIDI_BN;
          break;
        }
      break;
    case 'C':
      switch (category_name[1])
        {
        case 'S':
          if (category_name[2] == '\0')
            return UC_BIDI_CS;
          break;
        }
      break;
    case 'E':
      switch (category_name[1])
        {
        case 'N':
          if (category_name[2] == '\0')
            return UC_BIDI_EN;
          break;
        case 'S':
          if (category_name[2] == '\0')
            return UC_BIDI_ES;
          break;
        case 'T':
          if (category_name[2] == '\0')
            return UC_BIDI_ET;
          break;
        }
      break;
    case 'L':
      switch (category_name[1])
        {
        case '\0':
          return UC_BIDI_L;
        case 'R':
          switch (category_name[2])
            {
            case 'E':
              if (category_name[3] == '\0')
                return UC_BIDI_LRE;
              break;
            case 'O':
              if (category_name[3] == '\0')
                return UC_BIDI_LRO;
              break;
            }
          break;
        }
      break;
    case 'N':
      switch (category_name[1])
        {
        case 'S':
          switch (category_name[2])
            {
            case 'M':
              if (category_name[3] == '\0')
                return UC_BIDI_NSM;
              break;
            }
          break;
        }
      break;
    case 'O':
      switch (category_name[1])
        {
        case 'N':
          if (category_name[2] == '\0')
            return UC_BIDI_ON;
          break;
        }
      break;
    case 'P':
      switch (category_name[1])
        {
        case 'D':
          switch (category_name[2])
            {
            case 'F':
              if (category_name[3] == '\0')
                return UC_BIDI_PDF;
              break;
            }
          break;
        }
      break;
    case 'R':
      switch (category_name[1])
        {
        case '\0':
          return UC_BIDI_R;
        case 'L':
          switch (category_name[2])
            {
            case 'E':
              if (category_name[3] == '\0')
                return UC_BIDI_RLE;
              break;
            case 'O':
              if (category_name[3] == '\0')
                return UC_BIDI_RLO;
              break;
            }
          break;
        }
      break;
    case 'S':
      if (category_name[1] == '\0')
        return UC_BIDI_S;
      break;
    case 'W':
      switch (category_name[1])
        {
        case 'S':
          if (category_name[2] == '\0')
            return UC_BIDI_WS;
          break;
        }
      break;
    }
  /* Invalid bidi category name. */
  abort ();
}

/* Returns the UC_BIDI_* value of the character CH.
   For characters that have an entry in unicode_attributes (name != NULL)
   the value comes from the entry's bidi field.  For all other code points
   a default is chosen from the hard-coded ranges below, falling back to
   UC_BIDI_L.  */
static int
get_bidi_category (unsigned int ch)
{
  if (unicode_attributes[ch].name != NULL)
    return bidi_category_byname (unicode_attributes[ch].bidi);
  else
    {
      /* The bidi category of unassigned characters depends on the range.
         See UTR #9 and DerivedBidiClass.txt. */
      if ((ch >= 0x0590 && ch <= 0x05FF)
          || (ch >= 0x07FB && ch <= 0x08FF)
          || (ch >= 0xFB37 && ch <= 0xFB45)
          || (ch >= 0x10800 && ch <= 0x10FFF))
        return UC_BIDI_R;
      else if ((ch >= 0x0600 && ch <= 0x07BF)
               || (ch >= 0x2064 && ch <= 0x2069)
               || (ch >= 0xFBB2 && ch <= 0xFDCF)
               || (ch >= 0xFDFE && ch <= 0xFEFE))
        return UC_BIDI_AL;
      else if ((ch >= 0xFDD0 && ch <= 0xFDEF)
               || (ch >= 0xFFF0 && ch <= 0xFFFF)
               || (ch & 0xFFFF) == 0xFFFE
               || (ch & 0xFFFF) == 0xFFFF
               || (ch >= 0xE0000 && ch <= 0xE0FFF))
        return UC_BIDI_BN;
      else
        return UC_BIDI_L;
    }
}

/* Construction of sparse 3-level tables. */
/* Parameters for the "3level.h" template.  The code below uses
   'struct bidi_category_table' and bidi_category_table_init/_add/_finalize,
   which are presumably generated by this inclusion from TABLE - confirm
   against 3level.h.  */
#define TABLE bidi_category_table
#define ELEMENT uint8_t
#define DEFAULT UC_BIDI_L
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"

/* Output the per-character bidi category table. */
/* Writes to FILENAME a C fragment consisting of five
   "#define bidi_category_header_N" lines followed by the initializer of a
   struct 'u_bidi_category' with members level1, level2 and level3.
   VERSION is only inserted into the header comment.
   Terminates the program with exit (1) if the file cannot be opened or
   written.  */
static void
output_bidi_category (const char *filename, const char *version)
{
  FILE *stream;
  unsigned int ch, i;
  struct bidi_category_table t;
  unsigned int level1_offset, level2_offset, level3_offset;
  uint16_t *level3_packed;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Bidi categories of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version);

  t.p = 7;
  t.q = 9;
  bidi_category_table_init (&t);

  /* Every code point gets an entry, assigned or not. */
  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_bidi_category (ch);

      bidi_category_table_add (&t, ch, value);
    }

  bidi_category_table_finalize (&t);

  /* Offsets in t.result, in memory of this process. */
  /* t.result is read as: 5 header words, then level1, level2, level3. */
  level1_offset = 5 * sizeof (uint32_t);
  level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t);
  level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t);

  for (i = 0; i < 5; i++)
    fprintf (stream, "#define bidi_category_header_%d %d\n", i, ((uint32_t *) t.result)[i]);
  fprintf (stream, "static const\n");
  fprintf (stream, "struct\n");
  fprintf (stream, " {\n");
  fprintf (stream, " int level1[%zu];\n", t.level1_size);
  fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
  fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size, (1 << t.p) * 5 / 16);
  fprintf (stream, " }\n");
  fprintf (stream, "u_bidi_category =\n");
  fprintf (stream, "{\n");
  fprintf (stream, " {");
  if (t.level1_size > 8)
    fprintf (stream, "\n ");
  /* level1: a zero offset is printed as -1; any other offset is converted
     from a byte offset in t.result to an index into level2. */
  for (i = 0; i < t.level1_size; i++)
    {
      uint32_t offset;
      if (i > 0 && (i % 8) == 0)
        fprintf (stream, "\n ");
      offset = ((uint32_t *) (t.result + level1_offset))[i];
      if (offset == 0)
        fprintf (stream, " %5d", -1);
      else
        fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t));
      if (i+1 < t.level1_size)
        fprintf (stream, ",");
    }
  if (t.level1_size > 8)
    fprintf (stream, "\n ");
  fprintf (stream, " },\n");
  fprintf (stream, " {");
  if (t.level2_size << t.q > 8)
    fprintf (stream, "\n ");
  /* level2: same conversion, yielding an index into the unpacked level3. */
  for (i = 0; i < t.level2_size << t.q; i++)
    {
      uint32_t offset;
      if (i > 0 && (i % 8) == 0)
        fprintf (stream, "\n ");
      offset = ((uint32_t *) (t.result + level2_offset))[i];
      if (offset == 0)
        fprintf (stream, " %5d", -1);
      else
        fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t));
      if (i+1 < t.level2_size << t.q)
        fprintf (stream, ",");
    }
  if (t.level2_size << t.q > 8)
    fprintf (stream, "\n ");
  fprintf (stream, " },\n");
  /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units,
     not 32-bit units, in order to make the lookup function easier. */
  /* NOTE(review): the calloc result is used without a NULL check. */
  level3_packed = (uint16_t *) calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t));
  for (i = 0; i < t.level3_size << t.p; i++)
    {
      unsigned int j = (i * 5) / 16; /* 16-bit unit holding the entry's low bit */
      unsigned int k = (i * 5) % 16; /* bit position inside that unit */
      uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
      /* OR the entry into the 32-bit window formed by units j and j+1,
         since an entry may straddle two units. */
      value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
      level3_packed[j] = value & 0xffff;
      level3_packed[j+1] = value >> 16;
    }
  fprintf (stream, " {");
  if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
    fprintf (stream, "\n ");
  for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++)
    {
      if (i > 0 && (i % 8) == 0)
        fprintf (stream, "\n ");
      fprintf (stream, " 0x%04x", level3_packed[i]);
      if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1)
        fprintf (stream, ",");
    }
  if ((t.level3_size << t.p) * 5 / 16 + 1 > 8)
    fprintf (stream, "\n ");
  fprintf (stream, " }\n");
  free (level3_packed);
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}

/* ========================================================================= */

/* Decimal digit value. */
/* See Unicode 3.0 book, section 4.6. */

/* Returns the decdigit field of CH converted with atoi, or -1 if CH has no
   entry in unicode_attributes or its decdigit field is empty.  */
static int
get_decdigit_value (unsigned int ch)
{
  if (unicode_attributes[ch].name != NULL
      && unicode_attributes[ch].decdigit[0] != '\0')
    return atoi (unicode_attributes[ch].decdigit);
  return -1;
}

/* Construction of sparse 3-level tables. */
/* Parameters for the "3level.h" template; 'struct decdigit_table' and the
   decdigit_table_* functions used below presumably come from this
   inclusion - confirm against 3level.h.  */
#define TABLE decdigit_table
#define ELEMENT uint8_t
#define DEFAULT 0
#define xmalloc malloc
#define xrealloc realloc
#include "3level.h"

/* Output the unit test for the per-character decimal digit value table. */
/* Writes to FILENAME one " { 0xXXXX, D }" entry for every code point whose
   decimal digit value is >= 0, separated by ",\n".  Calls abort () if a
   value lies outside -1..9, and exit (1) on I/O failure.  */
static void
output_decimal_digit_test (const char *filename, const char *version)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version);

  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_decdigit_value (ch);

      if (!(value >= -1 && value < 10))
        abort ();

      if (value >= 0)
        {
          /* The separator is written before each entry except the first,
             so that the last entry is not followed by a comma. */
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, %d }", ch, value);
          need_comma = true;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}

/* Output the per-character decimal digit value table. */
/* Writes to FILENAME five "#define decdigit_header_N" lines followed by the
   initializer of a struct 'u_decdigit'.  Each table entry holds
   1 + get_decdigit_value (ch), so 0 stands for "no decimal digit value".
   Calls abort () if an entry lies outside 0..10, and exit (1) on I/O
   failure.  */
static void
output_decimal_digit (const char *filename, const char *version)
{
  FILE *stream;
  unsigned int ch, i;
  struct decdigit_table t;
  unsigned int level1_offset, level2_offset, level3_offset;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Decimal digit values of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version);

  t.p = 7;
  t.q = 9;
  decdigit_table_init (&t);

  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = 1 + get_decdigit_value (ch);

      if (!(value >= 0 && value <= 10))
        abort ();

      decdigit_table_add (&t, ch, value);
    }

  decdigit_table_finalize (&t);

  /* Offsets in t.result, in memory of this process. */
  level1_offset = 5 * sizeof (uint32_t);
  level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t);
  level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t);

  for (i = 0; i < 5; i++)
    fprintf (stream, "#define decdigit_header_%d %d\n", i, ((uint32_t *) t.result)[i]);
  fprintf (stream, "static const\n");
  fprintf (stream, "struct\n");
  fprintf (stream, " {\n");
  fprintf (stream, " int level1[%zu];\n", t.level1_size);
  fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
  /* Two entries share one byte, hence the shift by p - 1. */
  fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p - 1);
  fprintf (stream, " }\n");
  fprintf (stream, "u_decdigit =\n");
  fprintf (stream, "{\n");
  fprintf (stream, " {");
  if (t.level1_size > 8)
    fprintf (stream, "\n ");
  /* level1: zero offset is printed as -1, otherwise an index into level2. */
  for (i = 0; i < t.level1_size; i++)
    {
      uint32_t offset;
      if (i > 0 && (i % 8) == 0)
        fprintf (stream, "\n ");
      offset = ((uint32_t *) (t.result + level1_offset))[i];
      if (offset == 0)
        fprintf (stream, " %5d", -1);
      else
        fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t));
      if (i+1 < t.level1_size)
        fprintf (stream, ",");
    }
  if (t.level1_size > 8)
    fprintf (stream, "\n ");
  fprintf (stream, " },\n");
  fprintf (stream, " {");
  if (t.level2_size << t.q > 8)
    fprintf (stream, "\n ");
  /* level2: zero offset is printed as -1, otherwise an index into the
     unpacked level3. */
  for (i = 0; i < t.level2_size << t.q; i++)
    {
      uint32_t offset;
      if (i > 0 && (i % 8) == 0)
        fprintf (stream, "\n ");
      offset = ((uint32_t *) (t.result + level2_offset))[i];
      if (offset == 0)
        fprintf (stream, " %5d", -1);
      else
        fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t));
      if (i+1 < t.level2_size << t.q)
        fprintf (stream, ",");
    }
  if (t.level2_size << t.q > 8)
    fprintf (stream, "\n ");
  fprintf (stream, " },\n");
  /* Pack the level3 array. Each entry needs 4 bits only. */
  fprintf (stream, " {");
  if (t.level3_size << (t.p - 1) > 8)
    fprintf (stream, "\n ");
  for (i = 0; i < t.level3_size << (t.p - 1); i++)
    {
      if (i > 0 && (i % 8) == 0)
        fprintf (stream, "\n ");
      /* Entry 2*i goes into the low nibble, entry 2*i+1 into the high one. */
      fprintf (stream, " 0x%02x", ((uint8_t *) (t.result + level3_offset))[2*i] + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4));
      if (i+1 < t.level3_size << (t.p - 1))
        fprintf (stream, ",");
    }
  if (t.level3_size << (t.p - 1) > 8)
    fprintf (stream, "\n ");
  fprintf (stream, " }\n");
  fprintf (stream, "};\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}

/* ========================================================================= */

/* Digit value. */
/* See Unicode 3.0 book, section 4.6. */

/* Returns the digit field of CH converted with atoi, or -1 if CH has no
   entry in unicode_attributes or its digit field is empty.  */
static int
get_digit_value (unsigned int ch)
{
  if (unicode_attributes[ch].name != NULL
      && unicode_attributes[ch].digit[0] != '\0')
    return atoi (unicode_attributes[ch].digit);
  return -1;
}

/* Output the unit test for the per-character digit value table. */
/* Writes to FILENAME one " { 0xXXXX, D }" entry for every code point whose
   digit value is >= 0, separated by ",\n".  Calls abort () if a value lies
   outside -1..9, and exit (1) on I/O failure.  */
static void
output_digit_test (const char *filename, const char *version)
{
  FILE *stream;
  bool need_comma;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (stream, "/* Digit values of Unicode characters. */\n");
  fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version);

  need_comma = false;
  for (ch = 0; ch < 0x110000; ch++)
    {
      int value = get_digit_value (ch);

      if (!(value >= -1 && value < 10))
        abort ();

      if (value >= 0)
        {
          if (need_comma)
            fprintf (stream, ",\n");
          fprintf (stream, " { 0x%04X, %d }", ch, value);
          need_comma = true;
        }
    }
  if (need_comma)
    fprintf (stream, "\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}

/* Output the per-character digit value table. 
*/
/* Emits into FILENAME five "#define digit_header_N" lines and then the
   initializer of a struct 'u_digit'.  Every table entry is
   1 + get_digit_value (ch), so 0 means "no digit value"; two 4-bit entries
   are combined into each level3 byte.  VERSION only appears in the header
   comment.  Calls abort () for an entry outside 0..10 and exit (1) when the
   file cannot be opened or written.  */
static void
output_digit (const char *filename, const char *version)
{
  FILE *out;
  struct decdigit_table t;
  unsigned int uc, idx;
  unsigned int base1, base2, base3;
  size_t count1, count2, count3;
  const uint32_t *header;
  const uint32_t *first_level;
  const uint32_t *second_level;
  const uint8_t *third_level;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (out, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (out, "/* Digit values of Unicode characters. */\n");
  fprintf (out, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version);

  /* Collect the (shifted by one) digit value of every code point. */
  t.p = 7;
  t.q = 9;
  decdigit_table_init (&t);
  for (uc = 0; uc < 0x110000; uc++)
    {
      int shifted = 1 + get_digit_value (uc);

      if (shifted < 0 || shifted > 10)
        abort ();

      decdigit_table_add (&t, uc, shifted);
    }
  decdigit_table_finalize (&t);

  /* Byte offsets of the three levels inside t.result: five header words
     come first, then level1, level2 and level3. */
  base1 = 5 * sizeof (uint32_t);
  base2 = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t);
  base3 = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t);

  header = (uint32_t *) t.result;
  first_level = (uint32_t *) (t.result + base1);
  second_level = (uint32_t *) (t.result + base2);
  third_level = (uint8_t *) (t.result + base3);

  /* Number of elements printed for each level. */
  count1 = t.level1_size;
  count2 = t.level2_size << t.q;
  count3 = t.level3_size << (t.p - 1);

  for (idx = 0; idx < 5; idx++)
    fprintf (out, "#define digit_header_%d %d\n", idx, header[idx]);

  fprintf (out, "static const\n");
  fprintf (out, "struct\n");
  fprintf (out, " {\n");
  fprintf (out, " int level1[%zu];\n", t.level1_size);
  fprintf (out, " short level2[%zu << %d];\n", t.level2_size, t.q);
  fprintf (out, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p - 1);
  fprintf (out, " }\n");
  fprintf (out, "u_digit =\n");
  fprintf (out, "{\n");

  /* level1: -1 for an absent block, else the index into level2. */
  fprintf (out, " {");
  if (count1 > 8)
    fprintf (out, "\n ");
  for (idx = 0; idx < count1; idx++)
    {
      uint32_t where = first_level[idx];

      if (idx > 0 && (idx % 8) == 0)
        fprintf (out, "\n ");
      if (where != 0)
        fprintf (out, " %5zu", (where - base2) / sizeof (uint32_t));
      else
        fprintf (out, " %5d", -1);
      if (idx+1 < count1)
        fprintf (out, ",");
    }
  if (count1 > 8)
    fprintf (out, "\n ");
  fprintf (out, " },\n");

  /* level2: -1 for an absent block, else the index into unpacked level3. */
  fprintf (out, " {");
  if (count2 > 8)
    fprintf (out, "\n ");
  for (idx = 0; idx < count2; idx++)
    {
      uint32_t where = second_level[idx];

      if (idx > 0 && (idx % 8) == 0)
        fprintf (out, "\n ");
      if (where != 0)
        fprintf (out, " %5zu", (where - base3) / sizeof (uint8_t));
      else
        fprintf (out, " %5d", -1);
      if (idx+1 < count2)
        fprintf (out, ",");
    }
  if (count2 > 8)
    fprintf (out, "\n ");
  fprintf (out, " },\n");

  /* level3: every entry needs 4 bits only, so entry 2*idx lands in the low
     nibble and entry 2*idx+1 in the high nibble of one byte. */
  fprintf (out, " {");
  if (count3 > 8)
    fprintf (out, "\n ");
  for (idx = 0; idx < count3; idx++)
    {
      int nibbles = third_level[2*idx] + (third_level[2*idx+1] << 4);

      if (idx > 0 && (idx % 8) == 0)
        fprintf (out, "\n ");
      fprintf (out, " 0x%02x", nibbles);
      if (idx+1 < count3)
        fprintf (out, ",");
    }
  if (count3 > 8)
    fprintf (out, "\n ");
  fprintf (out, " }\n");
  fprintf (out, "};\n");

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}

/* ========================================================================= */

/* Numeric value. */
/* See Unicode 3.0 book, section 4.6. */

/* A numeric value as a fraction.  { 0, 0 } is what get_numeric_value
   returns for a character that has no numeric value. */
typedef struct { int numerator; int denominator; } uc_fraction_t;

/* Returns the numeric value of CH, parsed from the numeric field of its
   unicode_attributes entry.  The field has the form "integer" (denominator
   becomes 1) or "integer/posinteger".  When CH has no entry or the field is
   empty, the result is { 0, 0 }.  */
static uc_fraction_t
get_numeric_value (unsigned int ch)
{
  uc_fraction_t fraction = { 0, 0 };

  if (unicode_attributes[ch].name != NULL
      && unicode_attributes[ch].numeric[0] != '\0')
    {
      const char *text = unicode_attributes[ch].numeric;
      const char *slash = strchr (text, '/');

      fraction.numerator = atoi (text);
      fraction.denominator = (slash != NULL ? atoi (slash + 1) : 1);
    }
  return fraction;
}

/* Output the unit test for the per-character numeric value table. 
*/ static void output_numeric_test (const char *filename, const char *version) { FILE *stream; bool need_comma; unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Numeric values of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version); need_comma = false; for (ch = 0; ch < 0x110000; ch++) { uc_fraction_t value = get_numeric_value (ch); if (value.numerator != 0 || value.denominator != 0) { if (need_comma) fprintf (stream, ",\n"); fprintf (stream, " { 0x%04X, %d, %d }", ch, value.numerator, value.denominator); need_comma = true; } } if (need_comma) fprintf (stream, "\n"); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* Construction of sparse 3-level tables. */ #define TABLE numeric_table #define ELEMENT uint8_t #define DEFAULT 0 #define xmalloc malloc #define xrealloc realloc #include "3level.h" /* Output the per-character numeric value table. */ static void output_numeric (const char *filename, const char *version) { FILE *stream; uc_fraction_t fractions[128]; unsigned int nfractions; unsigned int ch, i, j; struct numeric_table t; unsigned int level1_offset, level2_offset, level3_offset; uint16_t *level3_packed; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Numeric values of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version); /* Create table of occurring fractions. 
*/ nfractions = 0; for (ch = 0; ch < 0x110000; ch++) { uc_fraction_t value = get_numeric_value (ch); for (i = 0; i < nfractions; i++) if (value.numerator == fractions[i].numerator && value.denominator == fractions[i].denominator) break; if (i == nfractions) { if (nfractions == 128) abort (); for (i = 0; i < nfractions; i++) if (value.denominator < fractions[i].denominator || (value.denominator == fractions[i].denominator && value.numerator < fractions[i].numerator)) break; for (j = nfractions; j > i; j--) fractions[j] = fractions[j - 1]; fractions[i] = value; nfractions++; } } fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n", nfractions); fprintf (stream, "{\n"); for (i = 0; i < nfractions; i++) { fprintf (stream, " { %d, %d }", fractions[i].numerator, fractions[i].denominator); if (i+1 < nfractions) fprintf (stream, ","); fprintf (stream, "\n"); } fprintf (stream, "};\n"); t.p = 7; t.q = 9; numeric_table_init (&t); for (ch = 0; ch < 0x110000; ch++) { uc_fraction_t value = get_numeric_value (ch); for (i = 0; i < nfractions; i++) if (value.numerator == fractions[i].numerator && value.denominator == fractions[i].denominator) break; if (i == nfractions) abort (); numeric_table_add (&t, ch, i); } numeric_table_finalize (&t); /* Offsets in t.result, in memory of this process. 
*/ level1_offset = 5 * sizeof (uint32_t); level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t); level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) fprintf (stream, "#define numeric_header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size, (1 << t.p) * 7 / 16); fprintf (stream, " }\n"); fprintf (stream, "u_numeric =\n"); fprintf (stream, "{\n"); fprintf (stream, " {"); if (t.level1_size > 8) fprintf (stream, "\n "); for (i = 0; i < t.level1_size; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level2_size << t.q > 8) fprintf (stream, "\n "); for (i = 0; i < t.level2_size << t.q; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units, not 32-bit units, in order to make the lookup function easier. 
*/ level3_packed = (uint16_t *) calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t)); for (i = 0; i < t.level3_size << t.p; i++) { unsigned int j = (i * 7) / 16; unsigned int k = (i * 7) % 16; uint32_t value = ((unsigned char *) (t.result + level3_offset))[i]; value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k); level3_packed[j] = value & 0xffff; level3_packed[j+1] = value >> 16; } fprintf (stream, " {"); if ((t.level3_size << t.p) * 7 / 16 + 1 > 8) fprintf (stream, "\n "); for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++) { if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); fprintf (stream, " 0x%04x", level3_packed[i]); if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1) fprintf (stream, ","); } if ((t.level3_size << t.p) * 7 / 16 + 1 > 8) fprintf (stream, "\n "); fprintf (stream, " }\n"); free (level3_packed); fprintf (stream, "};\n"); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* ========================================================================= */ /* Mirrored. */ /* See Unicode 3.0 book, section 4.7, UAX #9. */ /* List of mirrored character pairs. This is a subset of the characters having the BidiMirrored property. 
*/ static unsigned int mirror_pairs[][2] = { { 0x0028, 0x0029 }, { 0x003C, 0x003E }, { 0x005B, 0x005D }, { 0x007B, 0x007D }, { 0x00AB, 0x00BB }, { 0x2039, 0x203A }, { 0x2045, 0x2046 }, { 0x207D, 0x207E }, { 0x208D, 0x208E }, { 0x2208, 0x220B }, { 0x220A, 0x220D }, { 0x223C, 0x223D }, { 0x2243, 0x22CD }, { 0x2252, 0x2253 }, { 0x2254, 0x2255 }, { 0x2264, 0x2265 }, { 0x2266, 0x2267 }, { 0x226A, 0x226B }, { 0x2276, 0x2277 }, { 0x2278, 0x2279 }, { 0x227A, 0x227B }, { 0x227C, 0x227D }, { 0x2282, 0x2283 }, { 0x2286, 0x2287 }, { 0x228F, 0x2290 }, { 0x2291, 0x2292 }, { 0x22A2, 0x22A3 }, { 0x22B0, 0x22B1 }, { 0x22B2, 0x22B3 }, { 0x22B4, 0x22B5 }, { 0x22B6, 0x22B7 }, { 0x22C9, 0x22CA }, { 0x22CB, 0x22CC }, { 0x22D0, 0x22D1 }, { 0x22D6, 0x22D7 }, { 0x22D8, 0x22D9 }, { 0x22DA, 0x22DB }, { 0x22DC, 0x22DD }, { 0x22DE, 0x22DF }, { 0x22F0, 0x22F1 }, { 0x2308, 0x2309 }, { 0x230A, 0x230B }, { 0x2329, 0x232A }, { 0x3008, 0x3009 }, { 0x300A, 0x300B }, { 0x300C, 0x300D }, { 0x300E, 0x300F }, { 0x3010, 0x3011 }, { 0x3014, 0x3015 }, { 0x3016, 0x3017 }, { 0x3018, 0x3019 }, { 0x301A, 0x301B } }; static int get_mirror_value (unsigned int ch) { bool mirrored; unsigned int mirror_char; unsigned int i; mirrored = (unicode_attributes[ch].name != NULL && unicode_attributes[ch].mirrored); mirror_char = 0xfffd; for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++) if (ch == mirror_pairs[i][0]) { mirror_char = mirror_pairs[i][1]; break; } else if (ch == mirror_pairs[i][1]) { mirror_char = mirror_pairs[i][0]; break; } if (mirrored) return (int) mirror_char - (int) ch; else { if (mirror_char != 0xfffd) abort (); return 0; } } /* Construction of sparse 3-level tables. */ #define TABLE mirror_table #define ELEMENT int32_t #define DEFAULT 0 #define xmalloc malloc #define xrealloc realloc #include "3level.h" /* Output the per-character mirror table. 
*/
/* Emits into FILENAME five "#define mirror_header_N" lines and then the
   initializer of a struct 'u_mirror' whose level3 entries are the values
   returned by get_mirror_value.  VERSION only appears in the header
   comment.  Terminates the program with exit (1) when the file cannot be
   opened or written.  */
static void
output_mirror (const char *filename, const char *version)
{
  FILE *out;
  struct mirror_table t;
  unsigned int uc, idx;
  unsigned int base1, base2, base3;
  size_t count1, count2, count3;
  const uint32_t *header;
  const uint32_t *first_level;
  const uint32_t *second_level;
  const int32_t *third_level;

  out = fopen (filename, "w");
  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (out, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
  fprintf (out, "/* Mirrored Unicode characters. */\n");
  fprintf (out, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version);

  /* Collect the mirror value of every code point. */
  t.p = 7;
  t.q = 9;
  mirror_table_init (&t);
  for (uc = 0; uc < 0x110000; uc++)
    {
      int delta = get_mirror_value (uc);

      mirror_table_add (&t, uc, delta);
    }
  mirror_table_finalize (&t);

  /* Byte offsets of the three levels inside t.result: five header words
     come first, then level1, level2 and level3. */
  base1 = 5 * sizeof (uint32_t);
  base2 = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t);
  base3 = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t);

  header = (uint32_t *) t.result;
  first_level = (uint32_t *) (t.result + base1);
  second_level = (uint32_t *) (t.result + base2);
  third_level = (int32_t *) (t.result + base3);

  /* Number of elements printed for each level. */
  count1 = t.level1_size;
  count2 = t.level2_size << t.q;
  count3 = t.level3_size << t.p;

  for (idx = 0; idx < 5; idx++)
    fprintf (out, "#define mirror_header_%d %d\n", idx, header[idx]);

  fprintf (out, "static const\n");
  fprintf (out, "struct\n");
  fprintf (out, " {\n");
  fprintf (out, " int level1[%zu];\n", t.level1_size);
  fprintf (out, " short level2[%zu << %d];\n", t.level2_size, t.q);
  fprintf (out, " int level3[%zu << %d];\n", t.level3_size, t.p);
  fprintf (out, " }\n");
  fprintf (out, "u_mirror =\n");
  fprintf (out, "{\n");

  /* level1: -1 for an absent block, else the index into level2. */
  fprintf (out, " {");
  if (count1 > 8)
    fprintf (out, "\n ");
  for (idx = 0; idx < count1; idx++)
    {
      uint32_t where = first_level[idx];

      if (idx > 0 && (idx % 8) == 0)
        fprintf (out, "\n ");
      if (where != 0)
        fprintf (out, " %5zu", (where - base2) / sizeof (uint32_t));
      else
        fprintf (out, " %5d", -1);
      if (idx+1 < count1)
        fprintf (out, ",");
    }
  if (count1 > 8)
    fprintf (out, "\n ");
  fprintf (out, " },\n");

  /* level2: -1 for an absent block, else the index into level3. */
  fprintf (out, " {");
  if (count2 > 8)
    fprintf (out, "\n ");
  for (idx = 0; idx < count2; idx++)
    {
      uint32_t where = second_level[idx];

      if (idx > 0 && (idx % 8) == 0)
        fprintf (out, "\n ");
      if (where != 0)
        fprintf (out, " %5zu", (where - base3) / sizeof (int32_t));
      else
        fprintf (out, " %5d", -1);
      if (idx+1 < count2)
        fprintf (out, ",");
    }
  if (count2 > 8)
    fprintf (out, "\n ");
  fprintf (out, " },\n");

  /* level3: the mirror values themselves, unpacked. */
  fprintf (out, " {");
  if (count3 > 8)
    fprintf (out, "\n ");
  for (idx = 0; idx < count3; idx++)
    {
      if (idx > 0 && (idx % 8) == 0)
        fprintf (out, "\n ");
      fprintf (out, " %5d", third_level[idx]);
      if (idx+1 < count3)
        fprintf (out, ",");
    }
  if (count3 > 8)
    fprintf (out, "\n ");
  fprintf (out, " }\n");
  fprintf (out, "};\n");

  if (ferror (out) || fclose (out))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}

/* ========================================================================= */

/* Properties. */

/* Reading PropList.txt and DerivedCoreProperties.txt. 
*/
/* Bit numbers of the properties inside an element of unicode_properties[]:
   property P of a code point is recorded as bit (1ULL << P). */
enum
{
  /* PropList.txt */
  PROP_WHITE_SPACE,
  PROP_BIDI_CONTROL,
  PROP_JOIN_CONTROL,
  PROP_DASH,
  PROP_HYPHEN,
  PROP_QUOTATION_MARK,
  PROP_TERMINAL_PUNCTUATION,
  PROP_OTHER_MATH,
  PROP_HEX_DIGIT,
  PROP_ASCII_HEX_DIGIT,
  PROP_OTHER_ALPHABETIC,
  PROP_IDEOGRAPHIC,
  PROP_DIACRITIC,
  PROP_EXTENDER,
  PROP_OTHER_LOWERCASE,
  PROP_OTHER_UPPERCASE,
  PROP_NONCHARACTER_CODE_POINT,
  PROP_OTHER_GRAPHEME_EXTEND,
  PROP_IDS_BINARY_OPERATOR,
  PROP_IDS_TRINARY_OPERATOR,
  PROP_RADICAL,
  PROP_UNIFIED_IDEOGRAPH,
  PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_DEPRECATED,
  PROP_SOFT_DOTTED,
  PROP_LOGICAL_ORDER_EXCEPTION,
  PROP_OTHER_ID_START,
  PROP_OTHER_ID_CONTINUE,
  PROP_STERM,
  PROP_VARIATION_SELECTOR,
  PROP_PATTERN_WHITE_SPACE,
  PROP_PATTERN_SYNTAX,
  /* DerivedCoreProperties.txt */
  PROP_MATH,
  PROP_ALPHABETIC,
  PROP_LOWERCASE,
  PROP_UPPERCASE,
  PROP_ID_START,
  PROP_ID_CONTINUE,
  PROP_XID_START,
  PROP_XID_CONTINUE,
  PROP_DEFAULT_IGNORABLE_CODE_POINT,
  PROP_GRAPHEME_EXTEND,
  PROP_GRAPHEME_BASE,
  PROP_GRAPHEME_LINK
};

/* One bit set per code point, indexed by code point. */
unsigned long long unicode_properties[0x110000];

/* Resets unicode_properties[] so that no code point has any property. */
static void
clear_properties (void)
{
  unsigned long long *slot = unicode_properties;
  unsigned long long *const limit = unicode_properties + 0x110000;

  while (slot < limit)
    *slot++ = 0;
}

/* Stores in unicode_properties[] the properties from the PropList.txt or DerivedCoreProperties.txt file. 
*/
/* Reads PROPLIST_FILENAME line by line.  Empty lines and lines starting
   with '#' are skipped.  Every other line must have the form
   "XXXX..YYYY ; Name" or "XXXX ; Name" (hexadecimal code points); the bit
   for property Name is then set in unicode_properties[] for each code point
   of the range.
   Terminates the program with exit (1) if the file cannot be opened or
   read, a line cannot be parsed, or the property name is unknown; calls
   abort () if the range is reversed or reaches beyond 0x10FFFF.  */
static void
fill_properties (const char *proplist_filename)
{
  unsigned int i;
  FILE *stream;

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  for (;;)
    {
      char buf[200+1];
      unsigned int i1, i2;
      char padding[200+1];
      char propname[200+1];
      unsigned int propvalue;

      /* Read at most 200 characters of one line; the trailing "\n" in the
         format then skips the line terminator and any following white
         space. */
      if (fscanf (stream, "%200[^\n]\n", buf) < 1)
        break;

      if (buf[0] == '\0' || buf[0] == '#')
        continue;

      /* The %[ conversions carry no field width.  They only copy pieces of
         buf, which holds at most 200 characters, into buffers of the same
         size. */
      if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4)
        {
          /* Not a range: try the single code point form. */
          if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3)
            {
              fprintf (stderr, "parse error in '%s'\n", proplist_filename);
              exit (1);
            }
          i2 = i1;
        }
      /* Each PROP expands to "if (...) propvalue = ...; else", so the
         sequence below forms one if/else-if chain whose final else is the
         "unknown property" block after #undef. */
#define PROP(name,value) \
      if (strcmp (propname, name) == 0) propvalue = value; else
      /* PropList.txt */
      PROP ("White_Space", PROP_WHITE_SPACE)
      PROP ("Bidi_Control", PROP_BIDI_CONTROL)
      PROP ("Join_Control", PROP_JOIN_CONTROL)
      PROP ("Dash", PROP_DASH)
      PROP ("Hyphen", PROP_HYPHEN)
      PROP ("Quotation_Mark", PROP_QUOTATION_MARK)
      PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION)
      PROP ("Other_Math", PROP_OTHER_MATH)
      PROP ("Hex_Digit", PROP_HEX_DIGIT)
      PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT)
      PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC)
      PROP ("Ideographic", PROP_IDEOGRAPHIC)
      PROP ("Diacritic", PROP_DIACRITIC)
      PROP ("Extender", PROP_EXTENDER)
      PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE)
      PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE)
      PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT)
      PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND)
      PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR)
      PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR)
      PROP ("Radical", PROP_RADICAL)
      PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH)
      PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)
      PROP ("Deprecated", PROP_DEPRECATED)
      PROP ("Soft_Dotted", PROP_SOFT_DOTTED)
      PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION)
      PROP ("Other_ID_Start", PROP_OTHER_ID_START)
      PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE)
      PROP ("STerm", PROP_STERM)
      PROP ("Variation_Selector", PROP_VARIATION_SELECTOR)
      PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE)
      PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX)
      /* DerivedCoreProperties.txt */
      PROP ("Math", PROP_MATH)
      PROP ("Alphabetic", PROP_ALPHABETIC)
      PROP ("Lowercase", PROP_LOWERCASE)
      PROP ("Uppercase", PROP_UPPERCASE)
      PROP ("ID_Start", PROP_ID_START)
      PROP ("ID_Continue", PROP_ID_CONTINUE)
      PROP ("XID_Start", PROP_XID_START)
      PROP ("XID_Continue", PROP_XID_CONTINUE)
      PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT)
      PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND)
      PROP ("Grapheme_Base", PROP_GRAPHEME_BASE)
      PROP ("Grapheme_Link", PROP_GRAPHEME_LINK)
#undef PROP
        {
          fprintf (stderr, "unknown property named '%s' in '%s'\n", propname, proplist_filename);
          exit (1);
        }
      if (!(i1 <= i2 && i2 < 0x110000))
        abort ();

      for (i = i1; i <= i2; i++)
        unicode_properties[i] |= 1ULL << propvalue;
    }

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}

/* Stores in array the given property from the Unicode 3.0 PropList.txt
   file. */
/* ARRAY is first cleared.  The file is then scanned for the first line that
   contains PROPERTY_NAME; the lines after it, up to end of file or a line
   starting with '*', are parsed as "XXXX..YYYY" ranges or single "XXXX"
   code points (at most 4 hexadecimal digits each), and ARRAY[i] is set to 1
   for every code point covered.
   Terminates the program with exit (1) if the file cannot be opened or
   read, PROPERTY_NAME is not found, or a line cannot be parsed; calls
   abort () on a reversed range.  */
static void
fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name)
{
  unsigned int i;
  FILE *stream;
  char buf[100+1];

  for (i = 0; i < 0x110000; i++)
    array[i] = 0;

  stream = fopen (proplist_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: ..." line. */
  do
    {
      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        {
          fprintf (stderr, "no property found in '%s'\n", proplist_filename);
          exit (1);
        }
    }
  while (strstr (buf, property_name) == NULL);

  for (;;)
    {
      unsigned int i1, i2;

      if (fscanf (stream, "%100[^\n]\n", buf) < 1)
        break;
      /* A line starting with '*' ends the list. */
      if (buf[0] == '*')
        break;
      /* ".." right after the first four characters marks a range. */
      if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
        {
          if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
            {
              fprintf (stderr, "parse error in property in '%s'\n", proplist_filename);
              exit (1);
            }
        }
      else if (strlen (buf) >= 4)
        {
          if (sscanf (buf, "%4X", &i1) < 1)
            {
              fprintf (stderr, "parse error in property in '%s'\n", proplist_filename);
              exit (1);
            }
          i2 = i1;
        }
      else
        {
          fprintf (stderr, "parse error in property in '%s'\n", proplist_filename);
          exit (1);
        }
      if (!(i1 <= i2 && i2 < 0x110000))
        abort ();
      for (i = i1; i <= i2; i++)
        array[i] = 1;
    }
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}

/* Properties from Unicode 3.0 PropList.txt file. */

/* The paired punctuation property from the PropList.txt file. */
char unicode_pairedpunctuation[0x110000];

/* The left of pair property from the PropList.txt file. */
char unicode_leftofpair[0x110000];

/* Fills unicode_pairedpunctuation[] and unicode_leftofpair[] from the
   sections of PROPLIST30_FILENAME whose heading lines contain
   "(Paired Punctuation)" and "(Left of Pair)" respectively.  The file is
   read once per property.  */
static void
fill_properties30 (const char *proplist30_filename)
{
  fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)");
  fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)");
}

/* ------------------------------------------------------------------------- */

/* See PropList.txt, UCD.html. */
/* Returns true if the PROP_WHITE_SPACE bit is set for CH in
   unicode_properties[]. */
static bool
is_property_white_space (unsigned int ch)
{
  return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0);
}

/* See Unicode 3.0 book, section 4.10, PropList.txt, UCD.html,
   DerivedCoreProperties.txt, UCD.html. */
static bool
is_property_alphabetic (unsigned int ch)
{
  bool result1 =
    is_category_L (ch)
    || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0)
    /* For some reason, the following are listed as having property Alphabetic but not as having property Other_Alphabetic. 
*/ || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */ || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */ || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */ || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */ || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */ || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */ || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */ || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */ || (ch == 0x10341) /* GOTHIC LETTER NINETY */ || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */ || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */ || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */ bool result2 = ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0); if (result1 != result2) abort (); return result1; } /* See PropList.txt, UCD.html. */ static bool is_property_other_alphabetic (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_not_a_character (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0); } /* See PropList.txt, UCD.html, DerivedCoreProperties.txt, UCD.html. */ static bool is_property_default_ignorable_code_point (unsigned int ch) { bool result1 = (is_category_Cf (ch) && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */ && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)) || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0) || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0); bool result2 = ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0); if (result1 != result2) abort (); return result1; } /* See PropList.txt, UCD.html. 
*/ static bool is_property_other_default_ignorable_code_point (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_deprecated (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_logical_order_exception (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_variation_selector (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0); } /* See PropList-3.0.1.txt. */ static bool is_property_private_use (unsigned int ch) { /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt". */ return (ch >= 0xE000 && ch <= 0xF8FF) || (ch >= 0xF0000 && ch <= 0xFFFFD) || (ch >= 0x100000 && ch <= 0x10FFFD); } /* See PropList-3.0.1.txt. */ static bool is_property_unassigned_code_value (unsigned int ch) { return (is_category_Cn (ch) && !is_property_not_a_character (ch)); } /* See PropList.txt, UCD.html, DerivedCoreProperties.txt, UCD.html. */ static bool is_property_uppercase (unsigned int ch) { bool result1 = is_category_Lu (ch) || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0); bool result2 = ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0); if (result1 != result2) abort (); return result1; } /* See PropList.txt, UCD.html. */ static bool is_property_other_uppercase (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0); } /* See PropList.txt, UCD.html, DerivedCoreProperties.txt, UCD.html. 
*/ static bool is_property_lowercase (unsigned int ch) { bool result1 = is_category_Ll (ch) || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0); bool result2 = ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0); if (result1 != result2) abort (); return result1; } /* See PropList.txt, UCD.html. */ static bool is_property_other_lowercase (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0); } /* See PropList-3.0.1.txt. */ static bool is_property_titlecase (unsigned int ch) { return is_category_Lt (ch); } /* See PropList.txt, UCD.html. */ static bool is_property_soft_dotted (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0); } /* See DerivedCoreProperties.txt, UCD.html. */ static bool is_property_id_start (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_other_id_start (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0); } /* See DerivedCoreProperties.txt, UCD.html. */ static bool is_property_id_continue (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_other_id_continue (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0); } /* See DerivedCoreProperties.txt, UCD.html. */ static bool is_property_xid_start (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0); } /* See DerivedCoreProperties.txt, UCD.html. */ static bool is_property_xid_continue (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_pattern_white_space (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0); } /* See PropList.txt, UCD.html. 
*/ static bool is_property_pattern_syntax (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_join_control (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0); } /* See DerivedCoreProperties.txt, UCD.html. */ static bool is_property_grapheme_base (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0); } /* See DerivedCoreProperties.txt, UCD.html. */ static bool is_property_grapheme_extend (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_other_grapheme_extend (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0); } /* See DerivedCoreProperties.txt, UCD.html. */ static bool is_property_grapheme_link (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_bidi_control (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_left_to_right (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_L); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_hebrew_right_to_left (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_R); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_arabic_right_to_left (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_AL); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_european_digit (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_EN); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_eur_num_separator (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_ES); } /* See PropList-3.0.1.txt. 
*/ static bool is_property_bidi_eur_num_terminator (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_ET); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_arabic_digit (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_AN); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_common_separator (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_CS); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_block_separator (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_B); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_segment_separator (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_S); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_whitespace (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_WS); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_non_spacing_mark (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_NSM); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_boundary_neutral (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_BN); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_pdf (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_PDF); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_embedding_or_override (unsigned int ch) { int category = get_bidi_category (ch); return (category == UC_BIDI_LRE || category == UC_BIDI_LRO || category == UC_BIDI_RLE || category == UC_BIDI_RLO); } /* See PropList-3.0.1.txt. */ static bool is_property_bidi_other_neutral (unsigned int ch) { return (get_bidi_category (ch) == UC_BIDI_ON); } /* See PropList.txt, UCD.html. */ static bool is_property_hex_digit (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0); } /* See PropList.txt, UCD.html. 
*/ static bool is_property_ascii_hex_digit (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0); } /* See Unicode 3.0 book, section 4.10, PropList.txt, UCD.html. */ static bool is_property_ideographic (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_unified_ideograph (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_radical (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_ids_binary_operator (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_ids_trinary_operator (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0); } /* See PropList-3.0.1.txt. */ static bool is_property_zero_width (unsigned int ch) { return is_category_Cf (ch) || (unicode_attributes[ch].name != NULL && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL); } /* See PropList-3.0.1.txt. */ static bool is_property_space (unsigned int ch) { return is_category_Zs (ch); } /* See PropList-3.0.1.txt. */ static bool is_property_non_break (unsigned int ch) { /* This is exactly the set of characters having line breaking property GL. 
*/ return (ch == 0x00A0 /* NO-BREAK SPACE */ || ch == 0x034F /* COMBINING GRAPHEME JOINER */ || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */ || ch == 0x035D /* COMBINING DOUBLE BREVE */ || ch == 0x035E /* COMBINING DOUBLE MACRON */ || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */ || ch == 0x0360 /* COMBINING DOUBLE TILDE */ || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */ || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */ || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */ || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */ || ch == 0x2007 /* FIGURE SPACE */ || ch == 0x2011 /* NON-BREAKING HYPHEN */ || ch == 0x202F /* NARROW NO-BREAK SPACE */); } /* See PropList-3.0.1.txt. */ static bool is_property_iso_control (unsigned int ch) { bool result1 = (unicode_attributes[ch].name != NULL && strcmp (unicode_attributes[ch].name, "") == 0); bool result2 = is_category_Cc (ch); if (result1 != result2) abort (); return result1; } /* See PropList-3.0.1.txt. */ static bool is_property_format_control (unsigned int ch) { return (is_category_Cf (ch) && get_bidi_category (ch) == UC_BIDI_BN && !is_property_join_control (ch) && ch != 0xFEFF); } /* See PropList.txt, UCD.html. */ static bool is_property_dash (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_hyphen (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0); } /* See PropList-3.0.1.txt. */ static bool is_property_punctuation (unsigned int ch) { return is_category_P (ch); } /* See PropList-3.0.1.txt. */ static bool is_property_line_separator (unsigned int ch) { return is_category_Zl (ch); } /* See PropList-3.0.1.txt. */ static bool is_property_paragraph_separator (unsigned int ch) { return is_category_Zp (ch); } /* See PropList.txt, UCD.html. 
*/ static bool is_property_quotation_mark (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_sentence_terminal (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_terminal_punctuation (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0); } /* See PropList-3.0.1.txt. */ static bool is_property_currency_symbol (unsigned int ch) { return is_category_Sc (ch); } /* See Unicode 3.0 book, section 4.9, PropList.txt, UCD.html, DerivedCoreProperties.txt, UCD.html. */ static bool is_property_math (unsigned int ch) { bool result1 = is_category_Sm (ch) || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0); bool result2 = ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0); if (result1 != result2) abort (); return result1; } /* See PropList.txt, UCD.html. */ static bool is_property_other_math (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0); } /* See PropList-3.0.1.txt. */ static bool is_property_paired_punctuation (unsigned int ch) { return unicode_pairedpunctuation[ch]; } /* See PropList-3.0.1.txt. */ static bool is_property_left_of_pair (unsigned int ch) { return unicode_leftofpair[ch]; } /* See PropList-3.0.1.txt. */ static bool is_property_combining (unsigned int ch) { return (unicode_attributes[ch].name != NULL && (strcmp (unicode_attributes[ch].combining, "0") != 0 || is_category_Mc (ch) || is_category_Me (ch) || is_category_Mn (ch))); } #if 0 /* same as is_property_bidi_non_spacing_mark */ /* See PropList-3.0.1.txt. */ static bool is_property_non_spacing (unsigned int ch) { return (unicode_attributes[ch].name != NULL && get_bidi_category (ch) == UC_BIDI_NSM); } #endif /* See PropList-3.0.1.txt. 
*/ static bool is_property_composite (unsigned int ch) { /* This definition differs from the one in PropList-3.0.1.txt, but is more logical in some sense. */ if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */ return true; if (unicode_attributes[ch].name != NULL && unicode_attributes[ch].decomposition != NULL) { /* Test whether the decomposition contains more than one character, and the first is not a space. */ const char *decomp = unicode_attributes[ch].decomposition; if (decomp[0] == '<') { decomp = strchr (decomp, '>') + 1; if (decomp[0] == ' ') decomp++; } return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0; } return false; } /* See PropList-3.0.1.txt. */ static bool is_property_decimal_digit (unsigned int ch) { return is_category_Nd (ch); } /* See PropList-3.0.1.txt. */ static bool is_property_numeric (unsigned int ch) { return ((get_numeric_value (ch)).denominator > 0) || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */ || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */ } /* See PropList.txt, UCD.html. */ static bool is_property_diacritic (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0); } /* See PropList.txt, UCD.html. */ static bool is_property_extender (unsigned int ch) { return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0); } /* See PropList-3.0.1.txt. */ static bool is_property_ignorable_control (unsigned int ch) { return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN) || is_category_Cf (ch)) && ch != 0x0000; } /* ------------------------------------------------------------------------- */ /* Output all properties. 
*/ static void output_properties (const char *version) { #define PROPERTY(P) \ debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \ output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \ output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version); PROPERTY(white_space) PROPERTY(alphabetic) PROPERTY(other_alphabetic) PROPERTY(not_a_character) PROPERTY(default_ignorable_code_point) PROPERTY(other_default_ignorable_code_point) PROPERTY(deprecated) PROPERTY(logical_order_exception) PROPERTY(variation_selector) PROPERTY(private_use) PROPERTY(unassigned_code_value) PROPERTY(uppercase) PROPERTY(other_uppercase) PROPERTY(lowercase) PROPERTY(other_lowercase) PROPERTY(titlecase) PROPERTY(soft_dotted) PROPERTY(id_start) PROPERTY(other_id_start) PROPERTY(id_continue) PROPERTY(other_id_continue) PROPERTY(xid_start) PROPERTY(xid_continue) PROPERTY(pattern_white_space) PROPERTY(pattern_syntax) PROPERTY(join_control) PROPERTY(grapheme_base) PROPERTY(grapheme_extend) PROPERTY(other_grapheme_extend) PROPERTY(grapheme_link) PROPERTY(bidi_control) PROPERTY(bidi_left_to_right) PROPERTY(bidi_hebrew_right_to_left) PROPERTY(bidi_arabic_right_to_left) PROPERTY(bidi_european_digit) PROPERTY(bidi_eur_num_separator) PROPERTY(bidi_eur_num_terminator) PROPERTY(bidi_arabic_digit) PROPERTY(bidi_common_separator) PROPERTY(bidi_block_separator) PROPERTY(bidi_segment_separator) PROPERTY(bidi_whitespace) PROPERTY(bidi_non_spacing_mark) PROPERTY(bidi_boundary_neutral) PROPERTY(bidi_pdf) PROPERTY(bidi_embedding_or_override) PROPERTY(bidi_other_neutral) PROPERTY(hex_digit) PROPERTY(ascii_hex_digit) PROPERTY(ideographic) PROPERTY(unified_ideograph) PROPERTY(radical) PROPERTY(ids_binary_operator) PROPERTY(ids_trinary_operator) PROPERTY(zero_width) PROPERTY(space) PROPERTY(non_break) PROPERTY(iso_control) PROPERTY(format_control) PROPERTY(dash) PROPERTY(hyphen) PROPERTY(punctuation) 
PROPERTY(line_separator) PROPERTY(paragraph_separator) PROPERTY(quotation_mark) PROPERTY(sentence_terminal) PROPERTY(terminal_punctuation) PROPERTY(currency_symbol) PROPERTY(math) PROPERTY(other_math) PROPERTY(paired_punctuation) PROPERTY(left_of_pair) PROPERTY(combining) PROPERTY(composite) PROPERTY(decimal_digit) PROPERTY(numeric) PROPERTY(diacritic) PROPERTY(extender) PROPERTY(ignorable_control) #undef PROPERTY } /* ========================================================================= */ /* Scripts. */ static const char *scripts[256]; static unsigned int numscripts; static uint8_t unicode_scripts[0x110000]; static void fill_scripts (const char *scripts_filename) { FILE *stream; unsigned int i; stream = fopen (scripts_filename, "r"); if (stream == NULL) { fprintf (stderr, "error during fopen of '%s'\n", scripts_filename); exit (1); } numscripts = 0; for (i = 0; i < 0x110000; i++) unicode_scripts[i] = (uint8_t)~(uint8_t)0; for (;;) { char buf[200+1]; unsigned int i1, i2; char padding[200+1]; char scriptname[200+1]; int script; if (fscanf (stream, "%200[^\n]\n", buf) < 1) break; if (buf[0] == '\0' || buf[0] == '#') continue; if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4) { if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3) { fprintf (stderr, "parse error in '%s'\n", scripts_filename); exit (1); } i2 = i1; } if (i2 < i1) abort (); if (i2 >= 0x110000) abort (); for (script = numscripts - 1; script >= 0; script--) if (strcmp (scripts[script], scriptname) == 0) break; if (script < 0) { scripts[numscripts] = strdup (scriptname); script = numscripts; numscripts++; if (numscripts == 256) abort (); } for (i = i1; i <= i2; i++) { if (unicode_scripts[i] != (uint8_t)~(uint8_t)0) fprintf (stderr, "0x%04X belongs to multiple scripts\n", i); unicode_scripts[i] = script; } } if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", scripts_filename); exit (1); } } /* Construction of sparse 3-level tables. 
*/ #define TABLE script_table #define ELEMENT uint8_t #define DEFAULT (uint8_t)~(uint8_t)0 #define xmalloc malloc #define xrealloc realloc #include "3level.h" static void output_scripts (const char *version) { const char *filename = "unictype/scripts.h"; FILE *stream; unsigned int ch, s, i; struct script_table t; unsigned int level1_offset, level2_offset, level3_offset; typedef struct { const char *lowercase_name; } scriptinfo_t; scriptinfo_t scriptinfo[256]; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Unicode scripts. */\n"); fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version); for (s = 0; s < numscripts; s++) { char *lcp = strdup (scripts[s]); char *cp; for (cp = lcp; *cp != '\0'; cp++) if (*cp >= 'A' && *cp <= 'Z') *cp += 'a' - 'A'; scriptinfo[s].lowercase_name = lcp; } for (s = 0; s < numscripts; s++) { fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n", scriptinfo[s].lowercase_name); fprintf (stream, "{\n"); i = 0; for (ch = 0; ch < 0x110000; ch++) if (unicode_scripts[ch] == s) { unsigned int start; unsigned int end; start = ch; while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s) ch++; end = ch; if (i > 0) fprintf (stream, ",\n"); if (start == end) fprintf (stream, " { 0x%04X, 1, 1 }", start); else fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }", start, end); i++; } fprintf (stream, "\n"); fprintf (stream, "};\n"); } fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts); fprintf (stream, "{\n"); for (s = 0; s < numscripts; s++) { fprintf (stream, " {\n"); fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n", scriptinfo[s].lowercase_name); fprintf (stream, " script_%s_intervals,\n", scriptinfo[s].lowercase_name); fprintf (stream, " \"%s\"\n", scripts[s]); fprintf (stream, " 
}"); if (s+1 < numscripts) fprintf (stream, ","); fprintf (stream, "\n"); } fprintf (stream, "};\n"); t.p = 7; t.q = 9; script_table_init (&t); for (ch = 0; ch < 0x110000; ch++) { unsigned int s = unicode_scripts[ch]; if (s != (uint8_t)~(uint8_t)0) script_table_add (&t, ch, s); } script_table_finalize (&t); /* Offsets in t.result, in memory of this process. */ level1_offset = 5 * sizeof (uint32_t); level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t); level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) fprintf (stream, "#define script_header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); fprintf (stream, " }\n"); fprintf (stream, "u_script =\n"); fprintf (stream, "{\n"); fprintf (stream, " {"); if (t.level1_size > 8) fprintf (stream, "\n "); for (i = 0; i < t.level1_size; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level2_size << t.q > 8) fprintf (stream, "\n "); for (i = 0; i < t.level2_size << t.q; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); } if 
(t.level2_size << t.q > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level3_size << t.p > 8) fprintf (stream, "\n "); for (i = 0; i < t.level3_size << t.p; i++) { if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]); if (i+1 < t.level3_size << t.p) fprintf (stream, ","); } if (t.level3_size << t.p > 8) fprintf (stream, "\n "); fprintf (stream, " }\n"); fprintf (stream, "};\n"); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } static void output_scripts_byname (const char *version) { const char *filename = "unictype/scripts_byname.gperf"; FILE *stream; unsigned int s; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Unicode scripts. */\n"); fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version); fprintf (stream, "struct named_script { const char *name; unsigned int index; };\n"); fprintf (stream, "%%struct-type\n"); fprintf (stream, "%%language=ANSI-C\n"); fprintf (stream, "%%define hash-function-name scripts_hash\n"); fprintf (stream, "%%define lookup-function-name uc_script_lookup\n"); fprintf (stream, "%%readonly-tables\n"); fprintf (stream, "%%global-table\n"); fprintf (stream, "%%define word-array-name script_names\n"); fprintf (stream, "%%%%\n"); for (s = 0; s < numscripts; s++) fprintf (stream, "%s, %u\n", scripts[s], s); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* ========================================================================= */ /* Blocks. 
*/ typedef struct { unsigned int start; unsigned int end; const char *name; } block_t; static block_t blocks[256]; static unsigned int numblocks; static void fill_blocks (const char *blocks_filename) { FILE *stream; stream = fopen (blocks_filename, "r"); if (stream == NULL) { fprintf (stderr, "error during fopen of '%s'\n", blocks_filename); exit (1); } for (;;) { char buf[200+1]; unsigned int i1, i2; char padding[200+1]; char blockname[200+1]; if (fscanf (stream, "%200[^\n]\n", buf) < 1) break; if (buf[0] == '\0' || buf[0] == '#') continue; if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4) { fprintf (stderr, "parse error in '%s'\n", blocks_filename); exit (1); } blocks[numblocks].start = i1; blocks[numblocks].end = i2; blocks[numblocks].name = strdup (blockname); /* It must be sorted. */ if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start)) abort (); numblocks++; if (numblocks == 256) abort (); } if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", blocks_filename); exit (1); } } /* Return the smallest block index among the blocks for characters >= ch. */ static unsigned int block_first_index (unsigned int ch) { /* Binary search. */ unsigned int lo = 0; unsigned int hi = numblocks; /* Invariants: All blocks[i], i < lo, have blocks[i].end < ch, all blocks[i], i >= hi, have blocks[i].end >= ch. */ while (lo < hi) { unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */ if (blocks[mid].end < ch) lo = mid + 1; else hi = mid; } return hi; } /* Return the largest block index among the blocks for characters <= ch, plus 1. */ static unsigned int block_last_index (unsigned int ch) { /* Binary search. */ unsigned int lo = 0; unsigned int hi = numblocks; /* Invariants: All blocks[i], i < lo, have blocks[i].start <= ch, all blocks[i], i >= hi, have blocks[i].start > ch. 
*/ while (lo < hi) { unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */ if (blocks[mid].start <= ch) lo = mid + 1; else hi = mid; } return hi; } static void output_blocks (const char *version) { const char *filename = "unictype/blocks.h"; const unsigned int shift = 8; /* bits to shift away for array access */ const unsigned int threshold = 0x30000; /* cut-off table here to save space */ FILE *stream; unsigned int i; unsigned int i1; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Unicode blocks. */\n"); fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version); fprintf (stream, "static const uc_block_t blocks[] =\n"); fprintf (stream, "{\n"); for (i = 0; i < numblocks; i++) { fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start, blocks[i].end, blocks[i].name); if (i+1 < numblocks) fprintf (stream, ","); fprintf (stream, "\n"); } fprintf (stream, "};\n"); fprintf (stream, "#define blocks_level1_shift %d\n", shift); fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold); fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n", threshold >> shift); fprintf (stream, "{\n"); for (i1 = 0; i1 < (threshold >> shift); i1++) { unsigned int first_index = block_first_index (i1 << shift); unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1); fprintf (stream, " %3d, %3d", first_index, last_index); if (i1+1 < (threshold >> shift)) fprintf (stream, ","); fprintf (stream, "\n"); } fprintf (stream, "};\n"); fprintf (stream, "#define blocks_upper_first_index %d\n", block_first_index (threshold)); fprintf (stream, "#define blocks_upper_last_index %d\n", block_last_index (0x10FFFF)); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* 
========================================================================= */ /* C and Java syntax. */ enum { UC_IDENTIFIER_START, /* valid as first or subsequent character */ UC_IDENTIFIER_VALID, /* valid as subsequent character only */ UC_IDENTIFIER_INVALID, /* not valid */ UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */ }; /* ISO C 99 section 6.4.(3). */ static bool is_c_whitespace (unsigned int ch) { return (ch == ' ' /* space */ || ch == '\t' /* horizontal tab */ || ch == '\n' || ch == '\r' /* new-line */ || ch == '\v' /* vertical tab */ || ch == '\f'); /* form-feed */ } /* ISO C 99 section 6.4.2.1 and appendix D. */ static int c_ident_category (unsigned int ch) { /* Section 6.4.2.1. */ if (ch >= '0' && ch <= '9') return UC_IDENTIFIER_VALID; if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_') return UC_IDENTIFIER_START; /* Appendix D. */ if (0 /* Latin */ || (ch == 0x00AA) || (ch == 0x00BA) || (ch >= 0x00C0 && ch <= 0x00D6) || (ch >= 0x00D8 && ch <= 0x00F6) || (ch >= 0x00F8 && ch <= 0x01F5) || (ch >= 0x01FA && ch <= 0x0217) || (ch >= 0x0250 && ch <= 0x02A8) || (ch >= 0x1E00 && ch <= 0x1E9B) || (ch >= 0x1EA0 && ch <= 0x1EF9) || (ch == 0x207F) /* Greek */ || (ch == 0x0386) || (ch >= 0x0388 && ch <= 0x038A) || (ch == 0x038C) || (ch >= 0x038E && ch <= 0x03A1) || (ch >= 0x03A3 && ch <= 0x03CE) || (ch >= 0x03D0 && ch <= 0x03D6) || (ch == 0x03DA) || (ch == 0x03DC) || (ch == 0x03DE) || (ch == 0x03E0) || (ch >= 0x03E2 && ch <= 0x03F3) || (ch >= 0x1F00 && ch <= 0x1F15) || (ch >= 0x1F18 && ch <= 0x1F1D) || (ch >= 0x1F20 && ch <= 0x1F45) || (ch >= 0x1F48 && ch <= 0x1F4D) || (ch >= 0x1F50 && ch <= 0x1F57) || (ch == 0x1F59) || (ch == 0x1F5B) || (ch == 0x1F5D) || (ch >= 0x1F5F && ch <= 0x1F7D) || (ch >= 0x1F80 && ch <= 0x1FB4) || (ch >= 0x1FB6 && ch <= 0x1FBC) || (ch >= 0x1FC2 && ch <= 0x1FC4) || (ch >= 0x1FC6 && ch <= 0x1FCC) || (ch >= 0x1FD0 && ch <= 0x1FD3) || (ch >= 0x1FD6 && ch <= 0x1FDB) || (ch >= 0x1FE0 && ch <= 0x1FEC) || (ch >= 0x1FF2 && ch <= 
0x1FF4) || (ch >= 0x1FF6 && ch <= 0x1FFC) /* Cyrillic */ || (ch >= 0x0401 && ch <= 0x040C) || (ch >= 0x040E && ch <= 0x044F) || (ch >= 0x0451 && ch <= 0x045C) || (ch >= 0x045E && ch <= 0x0481) || (ch >= 0x0490 && ch <= 0x04C4) || (ch >= 0x04C7 && ch <= 0x04C8) || (ch >= 0x04CB && ch <= 0x04CC) || (ch >= 0x04D0 && ch <= 0x04EB) || (ch >= 0x04EE && ch <= 0x04F5) || (ch >= 0x04F8 && ch <= 0x04F9) /* Armenian */ || (ch >= 0x0531 && ch <= 0x0556) || (ch >= 0x0561 && ch <= 0x0587) /* Hebrew */ || (ch >= 0x05B0 && ch <= 0x05B9) || (ch >= 0x05BB && ch <= 0x05BD) || (ch == 0x05BF) || (ch >= 0x05C1 && ch <= 0x05C2) || (ch >= 0x05D0 && ch <= 0x05EA) || (ch >= 0x05F0 && ch <= 0x05F2) /* Arabic */ || (ch >= 0x0621 && ch <= 0x063A) || (ch >= 0x0640 && ch <= 0x0652) || (ch >= 0x0670 && ch <= 0x06B7) || (ch >= 0x06BA && ch <= 0x06BE) || (ch >= 0x06C0 && ch <= 0x06CE) || (ch >= 0x06D0 && ch <= 0x06DC) || (ch >= 0x06E5 && ch <= 0x06E8) || (ch >= 0x06EA && ch <= 0x06ED) /* Devanagari */ || (ch >= 0x0901 && ch <= 0x0903) || (ch >= 0x0905 && ch <= 0x0939) || (ch >= 0x093E && ch <= 0x094D) || (ch >= 0x0950 && ch <= 0x0952) || (ch >= 0x0958 && ch <= 0x0963) /* Bengali */ || (ch >= 0x0981 && ch <= 0x0983) || (ch >= 0x0985 && ch <= 0x098C) || (ch >= 0x098F && ch <= 0x0990) || (ch >= 0x0993 && ch <= 0x09A8) || (ch >= 0x09AA && ch <= 0x09B0) || (ch == 0x09B2) || (ch >= 0x09B6 && ch <= 0x09B9) || (ch >= 0x09BE && ch <= 0x09C4) || (ch >= 0x09C7 && ch <= 0x09C8) || (ch >= 0x09CB && ch <= 0x09CD) || (ch >= 0x09DC && ch <= 0x09DD) || (ch >= 0x09DF && ch <= 0x09E3) || (ch >= 0x09F0 && ch <= 0x09F1) /* Gurmukhi */ || (ch == 0x0A02) || (ch >= 0x0A05 && ch <= 0x0A0A) || (ch >= 0x0A0F && ch <= 0x0A10) || (ch >= 0x0A13 && ch <= 0x0A28) || (ch >= 0x0A2A && ch <= 0x0A30) || (ch >= 0x0A32 && ch <= 0x0A33) || (ch >= 0x0A35 && ch <= 0x0A36) || (ch >= 0x0A38 && ch <= 0x0A39) || (ch >= 0x0A3E && ch <= 0x0A42) || (ch >= 0x0A47 && ch <= 0x0A48) || (ch >= 0x0A4B && ch <= 0x0A4D) || (ch >= 0x0A59 && ch <= 0x0A5C) 
|| (ch == 0x0A5E) || (ch == 0x0A74) /* Gujarati */ || (ch >= 0x0A81 && ch <= 0x0A83) || (ch >= 0x0A85 && ch <= 0x0A8B) || (ch == 0x0A8D) || (ch >= 0x0A8F && ch <= 0x0A91) || (ch >= 0x0A93 && ch <= 0x0AA8) || (ch >= 0x0AAA && ch <= 0x0AB0) || (ch >= 0x0AB2 && ch <= 0x0AB3) || (ch >= 0x0AB5 && ch <= 0x0AB9) || (ch >= 0x0ABD && ch <= 0x0AC5) || (ch >= 0x0AC7 && ch <= 0x0AC9) || (ch >= 0x0ACB && ch <= 0x0ACD) || (ch == 0x0AD0) || (ch == 0x0AE0) /* Oriya */ || (ch >= 0x0B01 && ch <= 0x0B03) || (ch >= 0x0B05 && ch <= 0x0B0C) || (ch >= 0x0B0F && ch <= 0x0B10) || (ch >= 0x0B13 && ch <= 0x0B28) || (ch >= 0x0B2A && ch <= 0x0B30) || (ch >= 0x0B32 && ch <= 0x0B33) || (ch >= 0x0B36 && ch <= 0x0B39) || (ch >= 0x0B3E && ch <= 0x0B43) || (ch >= 0x0B47 && ch <= 0x0B48) || (ch >= 0x0B4B && ch <= 0x0B4D) || (ch >= 0x0B5C && ch <= 0x0B5D) || (ch >= 0x0B5F && ch <= 0x0B61) /* Tamil */ || (ch >= 0x0B82 && ch <= 0x0B83) || (ch >= 0x0B85 && ch <= 0x0B8A) || (ch >= 0x0B8E && ch <= 0x0B90) || (ch >= 0x0B92 && ch <= 0x0B95) || (ch >= 0x0B99 && ch <= 0x0B9A) || (ch == 0x0B9C) || (ch >= 0x0B9E && ch <= 0x0B9F) || (ch >= 0x0BA3 && ch <= 0x0BA4) || (ch >= 0x0BA8 && ch <= 0x0BAA) || (ch >= 0x0BAE && ch <= 0x0BB5) || (ch >= 0x0BB7 && ch <= 0x0BB9) || (ch >= 0x0BBE && ch <= 0x0BC2) || (ch >= 0x0BC6 && ch <= 0x0BC8) || (ch >= 0x0BCA && ch <= 0x0BCD) /* Telugu */ || (ch >= 0x0C01 && ch <= 0x0C03) || (ch >= 0x0C05 && ch <= 0x0C0C) || (ch >= 0x0C0E && ch <= 0x0C10) || (ch >= 0x0C12 && ch <= 0x0C28) || (ch >= 0x0C2A && ch <= 0x0C33) || (ch >= 0x0C35 && ch <= 0x0C39) || (ch >= 0x0C3E && ch <= 0x0C44) || (ch >= 0x0C46 && ch <= 0x0C48) || (ch >= 0x0C4A && ch <= 0x0C4D) || (ch >= 0x0C60 && ch <= 0x0C61) /* Kannada */ || (ch >= 0x0C82 && ch <= 0x0C83) || (ch >= 0x0C85 && ch <= 0x0C8C) || (ch >= 0x0C8E && ch <= 0x0C90) || (ch >= 0x0C92 && ch <= 0x0CA8) || (ch >= 0x0CAA && ch <= 0x0CB3) || (ch >= 0x0CB5 && ch <= 0x0CB9) || (ch >= 0x0CBE && ch <= 0x0CC4) || (ch >= 0x0CC6 && ch <= 0x0CC8) || (ch >= 0x0CCA && ch 
<= 0x0CCD) || (ch == 0x0CDE) || (ch >= 0x0CE0 && ch <= 0x0CE1) /* Malayalam */ || (ch >= 0x0D02 && ch <= 0x0D03) || (ch >= 0x0D05 && ch <= 0x0D0C) || (ch >= 0x0D0E && ch <= 0x0D10) || (ch >= 0x0D12 && ch <= 0x0D28) || (ch >= 0x0D2A && ch <= 0x0D39) || (ch >= 0x0D3E && ch <= 0x0D43) || (ch >= 0x0D46 && ch <= 0x0D48) || (ch >= 0x0D4A && ch <= 0x0D4D) || (ch >= 0x0D60 && ch <= 0x0D61) /* Thai */ || (ch >= 0x0E01 && ch <= 0x0E3A) || (ch >= 0x0E40 && ch <= 0x0E5B) /* Lao */ || (ch >= 0x0E81 && ch <= 0x0E82) || (ch == 0x0E84) || (ch >= 0x0E87 && ch <= 0x0E88) || (ch == 0x0E8A) || (ch == 0x0E8D) || (ch >= 0x0E94 && ch <= 0x0E97) || (ch >= 0x0E99 && ch <= 0x0E9F) || (ch >= 0x0EA1 && ch <= 0x0EA3) || (ch == 0x0EA5) || (ch == 0x0EA7) || (ch >= 0x0EAA && ch <= 0x0EAB) || (ch >= 0x0EAD && ch <= 0x0EAE) || (ch >= 0x0EB0 && ch <= 0x0EB9) || (ch >= 0x0EBB && ch <= 0x0EBD) || (ch >= 0x0EC0 && ch <= 0x0EC4) || (ch == 0x0EC6) || (ch >= 0x0EC8 && ch <= 0x0ECD) || (ch >= 0x0EDC && ch <= 0x0EDD) /* Tibetan */ || (ch == 0x0F00) || (ch >= 0x0F18 && ch <= 0x0F19) || (ch == 0x0F35) || (ch == 0x0F37) || (ch == 0x0F39) || (ch >= 0x0F3E && ch <= 0x0F47) || (ch >= 0x0F49 && ch <= 0x0F69) || (ch >= 0x0F71 && ch <= 0x0F84) || (ch >= 0x0F86 && ch <= 0x0F8B) || (ch >= 0x0F90 && ch <= 0x0F95) || (ch == 0x0F97) || (ch >= 0x0F99 && ch <= 0x0FAD) || (ch >= 0x0FB1 && ch <= 0x0FB7) || (ch == 0x0FB9) /* Georgian */ || (ch >= 0x10A0 && ch <= 0x10C5) || (ch >= 0x10D0 && ch <= 0x10F6) /* Hiragana */ || (ch >= 0x3041 && ch <= 0x3093) || (ch >= 0x309B && ch <= 0x309C) /* Katakana */ || (ch >= 0x30A1 && ch <= 0x30F6) || (ch >= 0x30FB && ch <= 0x30FC) /* Bopomofo */ || (ch >= 0x3105 && ch <= 0x312C) /* CJK Unified Ideographs */ || (ch >= 0x4E00 && ch <= 0x9FA5) /* Hangul */ || (ch >= 0xAC00 && ch <= 0xD7A3) /* Digits */ || (ch >= 0x0660 && ch <= 0x0669) || (ch >= 0x06F0 && ch <= 0x06F9) || (ch >= 0x0966 && ch <= 0x096F) || (ch >= 0x09E6 && ch <= 0x09EF) || (ch >= 0x0A66 && ch <= 0x0A6F) || (ch >= 0x0AE6 && ch 
<= 0x0AEF) || (ch >= 0x0B66 && ch <= 0x0B6F) || (ch >= 0x0BE7 && ch <= 0x0BEF) || (ch >= 0x0C66 && ch <= 0x0C6F) || (ch >= 0x0CE6 && ch <= 0x0CEF) || (ch >= 0x0D66 && ch <= 0x0D6F) || (ch >= 0x0E50 && ch <= 0x0E59) || (ch >= 0x0ED0 && ch <= 0x0ED9) || (ch >= 0x0F20 && ch <= 0x0F33) /* Special characters */ || (ch == 0x00B5) || (ch == 0x00B7) || (ch >= 0x02B0 && ch <= 0x02B8) || (ch == 0x02BB) || (ch >= 0x02BD && ch <= 0x02C1) || (ch >= 0x02D0 && ch <= 0x02D1) || (ch >= 0x02E0 && ch <= 0x02E4) || (ch == 0x037A) || (ch == 0x0559) || (ch == 0x093D) || (ch == 0x0B3D) || (ch == 0x1FBE) || (ch >= 0x203F && ch <= 0x2040) || (ch == 0x2102) || (ch == 0x2107) || (ch >= 0x210A && ch <= 0x2113) || (ch == 0x2115) || (ch >= 0x2118 && ch <= 0x211D) || (ch == 0x2124) || (ch == 0x2126) || (ch == 0x2128) || (ch >= 0x212A && ch <= 0x2131) || (ch >= 0x2133 && ch <= 0x2138) || (ch >= 0x2160 && ch <= 0x2182) || (ch >= 0x3005 && ch <= 0x3007) || (ch >= 0x3021 && ch <= 0x3029) ) return UC_IDENTIFIER_START; return UC_IDENTIFIER_INVALID; } /* The Java Language Specification, 3rd edition, §3.6. http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710 */ static bool is_java_whitespace (unsigned int ch) { return (ch == ' ' || ch == '\t' || ch == '\f' || ch == '\n' || ch == '\r'); } /* The Java Language Specification, 3rd edition, §3.8. http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625 and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */ static int java_ident_category (unsigned int ch) { /* FIXME: Check this against Sun's JDK implementation. 
*/ if (is_category_L (ch) /* = Character.isLetter(ch) */ || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */ || is_category_Sc (ch) /* currency symbol */ || is_category_Pc (ch) /* connector punctuation */ ) return UC_IDENTIFIER_START; if (is_category_Nd (ch) /* digit */ || is_category_Mc (ch) /* combining mark */ || is_category_Mn (ch) /* non-spacing mark */ ) return UC_IDENTIFIER_VALID; if ((ch >= 0x0000 && ch <= 0x0008) || (ch >= 0x000E && ch <= 0x001B) || (ch >= 0x007F && ch <= 0x009F) || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */ ) return UC_IDENTIFIER_IGNORABLE; return UC_IDENTIFIER_INVALID; } /* Construction of sparse 3-level tables. */ #define TABLE identsyntax_table #define ELEMENT uint8_t #define DEFAULT UC_IDENTIFIER_INVALID #define xmalloc malloc #define xrealloc realloc #include "3level.h" /* Output an identifier syntax categorization in a three-level bitmap. */ static void output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version) { FILE *stream; unsigned int ch, i; struct identsyntax_table t; unsigned int level1_offset, level2_offset, level3_offset; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Language syntax properties of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-ctype.c for Unicode %s. */\n", version); t.p = 7; /* or 8 */ t.q = 5; /* or 4 */ identsyntax_table_init (&t); for (ch = 0; ch < 0x110000; ch++) { int syntaxcode = predicate (ch); if (syntaxcode != UC_IDENTIFIER_INVALID) identsyntax_table_add (&t, ch, syntaxcode); } identsyntax_table_finalize (&t); /* Offsets in t.result, in memory of this process. 
*/ level1_offset = 5 * sizeof (uint32_t); level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t); level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) fprintf (stream, "#define identsyntax_header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size, (1 << t.p) * 2 / 16); fprintf (stream, " }\n"); fprintf (stream, "%s =\n", name); fprintf (stream, "{\n"); fprintf (stream, " {"); if (t.level1_size > 8) fprintf (stream, "\n "); for (i = 0; i < t.level1_size; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level2_size << t.q > 8) fprintf (stream, "\n "); for (i = 0; i < t.level2_size << t.q; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (uint8_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); /* Pack the level3 array. Each entry needs 2 bits only. 
*/ fprintf (stream, " {"); if ((t.level3_size << t.p) * 2 / 16 > 8) fprintf (stream, "\n "); for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++) { if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); fprintf (stream, " 0x%04x", (((uint8_t *) (t.result + level3_offset))[8 * i] << 0) | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2) | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4) | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6) | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8) | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10) | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12) | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14)); if (i+1 < (t.level3_size << t.p) * 2 / 16) fprintf (stream, ","); } if ((t.level3_size << t.p) * 2 / 16 > 8) fprintf (stream, "\n "); fprintf (stream, " }\n"); fprintf (stream, "};\n"); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } static void output_ident_properties (const char *version) { #define PROPERTY(P) \ debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \ output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \ output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version); PROPERTY(c_whitespace) PROPERTY(java_whitespace) #undef PROPERTY output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version); output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version); } /* ========================================================================= */ /* Like ISO C and . Compatible to glibc's glibc/localedata/locales/i18n file, generated by glibc/localedata/gen-unicode-ctype.c. */ /* Character mappings. 
*/ static unsigned int to_upper (unsigned int ch) { if (unicode_attributes[ch].name != NULL && unicode_attributes[ch].upper != NONE) return unicode_attributes[ch].upper; else return ch; } static unsigned int to_lower (unsigned int ch) { if (unicode_attributes[ch].name != NULL && unicode_attributes[ch].lower != NONE) return unicode_attributes[ch].lower; else return ch; } static unsigned int to_title (unsigned int ch) { if (unicode_attributes[ch].name != NULL && unicode_attributes[ch].title != NONE) return unicode_attributes[ch].title; else return ch; } /* Character class properties. */ static bool is_upper (unsigned int ch) { return (to_lower (ch) != ch); } static bool is_lower (unsigned int ch) { return (to_upper (ch) != ch) /* is lowercase, but without simple to_upper mapping. */ || (ch == 0x00DF); } static bool is_alpha (unsigned int ch) { return (unicode_attributes[ch].name != NULL && ((unicode_attributes[ch].category[0] == 'L' /* Theppitak Karoonboonyanan says , should belong to is_punct. */ && (ch != 0x0E2F) && (ch != 0x0E46)) /* Theppitak Karoonboonyanan says , .., .. are is_alpha. */ || (ch == 0x0E31) || (ch >= 0x0E34 && ch <= 0x0E3A) || (ch >= 0x0E47 && ch <= 0x0E4E) /* Avoid warning for . */ || (ch == 0x0345) /* Avoid warnings for ... */ || (unicode_attributes[ch].category[0] == 'N' && unicode_attributes[ch].category[1] == 'l') /* Avoid warnings for ... */ || (unicode_attributes[ch].category[0] == 'S' && unicode_attributes[ch].category[1] == 'o' && strstr (unicode_attributes[ch].name, " LETTER ") != NULL) /* Consider all the non-ASCII digits as alphabetic. ISO C 99 forbids us to have them in category "digit", but we want iswalnum to return true on them. 
*/ || (unicode_attributes[ch].category[0] == 'N' && unicode_attributes[ch].category[1] == 'd' && !(ch >= 0x0030 && ch <= 0x0039)))); } static bool is_digit (unsigned int ch) { #if 0 return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'N' && unicode_attributes[ch].category[1] == 'd'); /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without a zero. Must add <0> in front of them by hand. */ #else /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99 takes it away: 7.25.2.1.5: The iswdigit function tests for any wide character that corresponds to a decimal-digit character (as defined in 5.2.1). 5.2.1: the 10 decimal digits 0 1 2 3 4 5 6 7 8 9 */ return (ch >= 0x0030 && ch <= 0x0039); #endif } static bool is_outdigit (unsigned int ch) { return (ch >= 0x0030 && ch <= 0x0039); } static bool is_alnum (unsigned int ch) { return is_alpha (ch) || is_digit (ch); } static bool is_blank (unsigned int ch) { return (ch == 0x0009 /* '\t' */ /* Category Zs without mention of "" */ || (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'Z' && unicode_attributes[ch].category[1] == 's' && !strstr (unicode_attributes[ch].decomposition, ""))); } static bool is_space (unsigned int ch) { /* Don't make U+00A0 a space. Non-breaking space means that all programs should treat it like a punctuation character, not like a space. 
*/ return (ch == 0x0020 /* ' ' */ || ch == 0x000C /* '\f' */ || ch == 0x000A /* '\n' */ || ch == 0x000D /* '\r' */ || ch == 0x0009 /* '\t' */ || ch == 0x000B /* '\v' */ /* Categories Zl, Zp, and Zs without mention of "" */ || (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'Z' && (unicode_attributes[ch].category[1] == 'l' || unicode_attributes[ch].category[1] == 'p' || (unicode_attributes[ch].category[1] == 's' && !strstr (unicode_attributes[ch].decomposition, ""))))); } static bool is_cntrl (unsigned int ch) { return (unicode_attributes[ch].name != NULL && (strcmp (unicode_attributes[ch].name, "") == 0 /* Categories Zl and Zp */ || (unicode_attributes[ch].category[0] == 'Z' && (unicode_attributes[ch].category[1] == 'l' || unicode_attributes[ch].category[1] == 'p')))); } static bool is_xdigit (unsigned int ch) { #if 0 return is_digit (ch) || (ch >= 0x0041 && ch <= 0x0046) || (ch >= 0x0061 && ch <= 0x0066); #else /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 takes it away: 7.25.2.1.12: The iswxdigit function tests for any wide character that corresponds to a hexadecimal-digit character (as defined in 6.4.4.1). 
6.4.4.1: hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F */ return (ch >= 0x0030 && ch <= 0x0039) || (ch >= 0x0041 && ch <= 0x0046) || (ch >= 0x0061 && ch <= 0x0066); #endif } static bool is_graph (unsigned int ch) { return (unicode_attributes[ch].name != NULL && strcmp (unicode_attributes[ch].name, "") && !is_space (ch)); } static bool is_print (unsigned int ch) { return (unicode_attributes[ch].name != NULL && strcmp (unicode_attributes[ch].name, "") /* Categories Zl and Zp */ && !(unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'Z' && (unicode_attributes[ch].category[1] == 'l' || unicode_attributes[ch].category[1] == 'p'))); } static bool is_punct (unsigned int ch) { #if 0 return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'P'); #else /* The traditional POSIX definition of punctuation is every graphic, non-alphanumeric character. */ return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch)); #endif } /* Output all properties. */ static void output_old_ctype (const char *version) { #define PROPERTY(P) \ debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \ output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \ output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C like properties", version); PROPERTY(alnum) PROPERTY(alpha) PROPERTY(cntrl) PROPERTY(digit) PROPERTY(graph) PROPERTY(lower) PROPERTY(print) PROPERTY(punct) PROPERTY(space) PROPERTY(upper) PROPERTY(xdigit) PROPERTY(blank) #undef PROPERTY } #if 0 static bool is_combining (unsigned int ch) { /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt file. In 3.0.1 it was identical to the union of the general categories "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the PropList.txt file, so we take the latter definition. 
*/ return (unicode_attributes[ch].name != NULL && unicode_attributes[ch].category[0] == 'M' && (unicode_attributes[ch].category[1] == 'n' || unicode_attributes[ch].category[1] == 'c' || unicode_attributes[ch].category[1] == 'e')); } static bool is_combining_level3 (unsigned int ch) { return is_combining (ch) && !(unicode_attributes[ch].combining[0] != '\0' && unicode_attributes[ch].combining[0] != '0' && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200); } /* Return the UCS symbol string for a Unicode character. */ static const char * ucs_symbol (unsigned int i) { static char buf[11+1]; sprintf (buf, (i < 0x10000 ? "" : ""), i); return buf; } /* Return the UCS symbol range string for a Unicode characters interval. */ static const char * ucs_symbol_range (unsigned int low, unsigned int high) { static char buf[24+1]; strcpy (buf, ucs_symbol (low)); strcat (buf, ".."); strcat (buf, ucs_symbol (high)); return buf; } /* Output a character class (= property) table. */ static void output_charclass (FILE *stream, const char *classname, bool (*func) (unsigned int)) { char table[0x110000]; unsigned int i; bool need_semicolon; const int max_column = 75; int column; for (i = 0; i < 0x110000; i++) table[i] = (int) func (i); fprintf (stream, "%s ", classname); need_semicolon = false; column = 1000; for (i = 0; i < 0x110000; ) { if (!table[i]) i++; else { unsigned int low, high; char buf[25]; low = i; do i++; while (i < 0x110000 && table[i]); high = i - 1; if (low == high) strcpy (buf, ucs_symbol (low)); else strcpy (buf, ucs_symbol_range (low, high)); if (need_semicolon) { fprintf (stream, ";"); column++; } if (column + strlen (buf) > max_column) { fprintf (stream, "/\n "); column = 3; } fprintf (stream, "%s", buf); column += strlen (buf); need_semicolon = true; } } fprintf (stream, "\n"); } /* Output a character mapping table. 
*/
/* Writes mapname followed by one "(from,to)" pair for every code point
   that func maps to a different code point, separated by ';', with a line
   continuation whenever max_column would be exceeded.  */
static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  char table[0x110000];
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  for (i = 0; i < 0x110000; i++)
    table[i] = (func (i) != i);

  fprintf (stream, "%s ", mapname);
  need_semicolon = false;
  /* Start beyond max_column so that the first entry forces a line break.  */
  column = 1000;
  for (i = 0; i < 0x110000; i++)
    if (table[i])
      {
        /* "(" + symbol + "," + symbol + ")", symbols being at most
           11 characters each.  */
        char buf[25+1];

        strcpy (buf, "(");
        strcat (buf, ucs_symbol (i));
        strcat (buf, ",");
        strcat (buf, ucs_symbol (func (i)));
        strcat (buf, ")");

        if (need_semicolon)
          {
            fprintf (stream, ";");
            column++;
          }

        if (column + strlen (buf) > max_column)
          {
            fprintf (stream, "/\n ");
            column = 3;
          }

        fprintf (stream, "%s", buf);
        column += strlen (buf);
        need_semicolon = true;
      }
  fprintf (stream, "\n");
}

/* Output the width table.  Currently writes nothing.  */
static void
output_widthmap (FILE *stream)
{
}

/* Output the tables to the given file.
   Writes an LC_IDENTIFICATION section and an LC_CTYPE section to filename.
   Before writing LC_CTYPE, checks the class predicates against each other
   for every code point and reports violations on stderr without
   aborting.  */
static void
output_tables (const char *filename, const char *version)
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  fprintf (stream, "escape_char /\n");
  fprintf (stream, "comment_char %%\n");
  fprintf (stream, "\n");
  fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
           version);
  fprintf (stream, "\n");

  fprintf (stream, "LC_IDENTIFICATION\n");
  fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
  fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
  fprintf (stream, "address \"\"\n");
  fprintf (stream, "contact \"\"\n");
  fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
  fprintf (stream, "tel \"\"\n");
  fprintf (stream, "fax \"\"\n");
  fprintf (stream, "language \"\"\n");
  fprintf (stream, "territory \"Earth\"\n");
  fprintf (stream, "revision \"%s\"\n", version);
  {
    time_t now;
    char date[11];
    now = time (NULL);
    strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
    fprintf (stream, "date \"%s\"\n", date);
  }
  fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
  fprintf (stream, "END LC_IDENTIFICATION\n");
  fprintf (stream, "\n");

  /* Verifications.  */
  for (ch = 0; ch < 0x110000; ch++)
    {
      /* toupper restriction: "Only characters specified for the keywords
         lower and upper shall be specified.  */
      if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
        fprintf (stderr,
                 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
                 ucs_symbol (ch), ch, to_upper (ch));

      /* tolower restriction: "Only characters specified for the keywords
         lower and upper shall be specified.  */
      if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
        fprintf (stderr,
                 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
                 ucs_symbol (ch), ch, to_lower (ch));

      /* alpha restriction: "Characters classified as either upper or lower
         shall automatically belong to this class.  */
      if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
        fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));

      /* alpha restriction: "No character specified for the keywords cntrl,
         digit, punct or space shall be specified."  */
      if (is_alpha (ch) && is_cntrl (ch))
        fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_digit (ch))
        fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_punct (ch))
        fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_space (ch))
        fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));

      /* space restriction: "No character specified for the keywords upper,
         lower, alpha, digit, graph or xdigit shall be specified."
         upper, lower, alpha already checked above.  */
      if (is_space (ch) && is_digit (ch))
        fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
      if (is_space (ch) && is_graph (ch))
        fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
      if (is_space (ch) && is_xdigit (ch))
        fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));

      /* cntrl restriction: "No character specified for the keywords upper,
         lower, alpha, digit, punct, graph, print or xdigit shall be
         specified."  upper, lower, alpha already checked above.  */
      if (is_cntrl (ch) && is_digit (ch))
        fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_punct (ch))
        fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_graph (ch))
        fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_print (ch))
        fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_xdigit (ch))
        fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));

      /* punct restriction: "No character specified for the keywords upper,
         lower, alpha, digit, cntrl, xdigit or as the <space> character shall
         be specified."  upper, lower, alpha, cntrl already checked above.  */
      if (is_punct (ch) && is_digit (ch))
        fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
      if (is_punct (ch) && is_xdigit (ch))
        fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
      if (is_punct (ch) && (ch == 0x0020))
        fprintf (stderr, "%s is punct\n", ucs_symbol (ch));

      /* graph restriction: "No character specified for the keyword cntrl
         shall be specified."  Already checked above.  */

      /* print restriction: "No character specified for the keyword cntrl
         shall be specified."  Already checked above.  */

      /* graph - print relation: differ only in the <space> character.
         How is this possible if there are more than one space character?!
         I think susv2/xbd/locale.html should speak of "space characters",
         not "space character".  */
      if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
        fprintf (stderr,
                 "%s is print but not graph|<space>\n", ucs_symbol (ch));
      if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
        fprintf (stderr,
                 "%s is graph|<space> but not print\n", ucs_symbol (ch));
    }

  fprintf (stream, "LC_CTYPE\n");
  output_charclass (stream, "upper", is_upper);
  output_charclass (stream, "lower", is_lower);
  output_charclass (stream, "alpha", is_alpha);
  output_charclass (stream, "digit", is_digit);
  output_charclass (stream, "outdigit", is_outdigit);
  output_charclass (stream, "blank", is_blank);
  output_charclass (stream, "space", is_space);
  output_charclass (stream, "cntrl", is_cntrl);
  output_charclass (stream, "punct", is_punct);
  output_charclass (stream, "xdigit", is_xdigit);
  output_charclass (stream, "graph", is_graph);
  output_charclass (stream, "print", is_print);
  output_charclass (stream, "class \"combining\";", is_combining);
  output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
  output_charmap (stream, "toupper", to_upper);
  output_charmap (stream, "tolower", to_lower);
  output_charmap (stream, "map \"totitle\";", to_title);
  output_widthmap (stream);
  fprintf (stream, "END LC_CTYPE\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}

#endif

/* ========================================================================= */

/* The width property from the EastAsianWidth.txt file.
   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".  */
const char * unicode_width[0x110000];

/* Stores in unicode_width[] the width property from the EastAsianWidth.txt
   file.
   Every assigned character starts out as "N" and every unassigned one as
   NULL; each data line then overrides a single code point or an inclusive
   "XXXX..YYYY" range.  Lines starting with '#' are skipped.  Exits with a
   message on open, read or allocation failure, on a short line, and on a
   code point outside 0..0x10FFFF.  */
static void
fill_width (const char *width_filename)
{
  unsigned int i, j;
  FILE *stream;
  char field0[FIELDLEN];
  char field1[FIELDLEN];
  char field2[FIELDLEN];
  int lineno = 0;

  for (i = 0; i < 0x110000; i++)
    unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);

  stream = fopen (width_filename, "r");
  if (stream == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", width_filename);
      exit (1);
    }

  for (;;)
    {
      int n;
      int c;
      unsigned long first, last;
      const char *range_sep;
      char *width;

      lineno++;
      c = getc (stream);
      if (c == EOF)
        break;
      if (c == '#')
        {
          /* Skip the rest of the comment line.  */
          do c = getc (stream); while (c != EOF && c != '\n');
          continue;
        }
      ungetc (c, stream);
      /* Code point(s), width class, and the remainder of the line.  */
      n = getfield (stream, field0, ';');
      n += getfield (stream, field1, ' ');
      n += getfield (stream, field2, '\n');
      if (n == 0)
        break;
      if (n != 3)
        {
          fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
          exit (1);
        }

      first = strtoul (field0, NULL, 16);
      range_sep = strstr (field0, "..");
      last = (range_sep != NULL ? strtoul (range_sep + 2, NULL, 16) : first);
      /* The values index unicode_width[], so reject anything that would
         write outside the array.  */
      if (first >= 0x110000 || last >= 0x110000)
        {
          fprintf (stderr, "code point out of range in '%s':%d\n",
                   width_filename, lineno);
          exit (1);
        }

      /* One copy per line is enough: the strings are never modified, so
         all code points of a range can share it.  */
      width = strdup (field1);
      if (width == NULL)
        {
          fprintf (stderr, "out of memory while reading '%s'\n",
                   width_filename);
          exit (1);
        }

      i = first;
      j = last;
      /* A single character line is a range of length 1.  */
      for (; i <= j; i++)
        unicode_width[i] = width;
    }
  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error reading from '%s'\n", width_filename);
      exit (1);
    }
}

/* Line breaking classification.  */

enum
{
  /* Values >= 24 are resolved at run time.  */
  LBP_BK = 24, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 25, /* attached characters and combining marks */
/*LBP_NL,         next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_WJ =  0, /* word joiner */
  LBP_ZW = 26, /* zero width space */
  LBP_GL =  1, /* non-breaking (glue) */
  LBP_SP = 27, /* space */
  LBP_B2 =  2, /* break opportunity before and after */
  LBP_BA =  3, /* break opportunity after */
  LBP_BB =  4, /* break opportunity before */
  LBP_HY =  5, /* hyphen */
  LBP_CB = 28, /* contingent break opportunity */
  LBP_CL =  6, /* closing punctuation */
  LBP_EX =  7, /* exclamation/interrogation */
  LBP_IN =  8, /* inseparable */
  LBP_NS =  9, /* non starter */
  LBP_OP = 10, /* opening punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_IS = 12, /* infix separator (numeric) */
  LBP_NU = 13, /* numeric */
  LBP_PO = 14, /* postfix (numeric) */
  LBP_PR = 15, /* prefix (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AI = 29, /* ambiguous (alphabetic or ideograph) */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_H2 = 18, /* Hangul LV syllable */
  LBP_H3 = 19, /* Hangul LVT syllable */
  LBP_ID = 20, /* ideographic */
  LBP_JL = 21, /* Hangul L Jamo */
  LBP_JV = 22, /* Hangul V Jamo */
  LBP_JT = 23, /* Hangul T Jamo */
  LBP_SA = 30, /* complex context (South East Asian) */
  LBP_XX = 31  /* unknown */
};

/* Returns the line breaking classification for ch, as a bit mask.
*/ static int get_lbp (unsigned int ch) { int attr = 0; if (unicode_attributes[ch].name != NULL) { /* mandatory break */ if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */ || ch == 0x000C /* form feed */ || ch == 0x000B /* line tabulation */ || ch == 0x2028 /* LINE SEPARATOR */ || ch == 0x2029 /* PARAGRAPH SEPARATOR */) attr |= 1 << LBP_BK; if (ch == 0x2060 /* WORD JOINER */ || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */) attr |= 1 << LBP_WJ; /* zero width space */ if (ch == 0x200B /* ZERO WIDTH SPACE */) attr |= 1 << LBP_ZW; /* non-breaking (glue) */ if (ch == 0x00A0 /* NO-BREAK SPACE */ || ch == 0x202F /* NARROW NO-BREAK SPACE */ || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */ || ch == 0x034F /* COMBINING GRAPHEME JOINER */ || ch == 0x2007 /* FIGURE SPACE */ || ch == 0x2011 /* NON-BREAKING HYPHEN */ || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */ || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... 
*/) attr |= 1 << LBP_GL; /* space */ if (ch == 0x0020 /* SPACE */) attr |= 1 << LBP_SP; /* break opportunity before and after */ if (ch == 0x2014 /* EM DASH */) attr |= 1 << LBP_B2; /* break opportunity after */ if (ch == 0x1680 /* OGHAM SPACE MARK */ || ch == 0x2000 /* EN QUAD */ || ch == 0x2001 /* EM QUAD */ || ch == 0x2002 /* EN SPACE */ || ch == 0x2003 /* EM SPACE */ || ch == 0x2004 /* THREE-PER-EM SPACE */ || ch == 0x2005 /* FOUR-PER-EM SPACE */ || ch == 0x2006 /* SIX-PER-EM SPACE */ || ch == 0x2008 /* PUNCTUATION SPACE */ || ch == 0x2009 /* THIN SPACE */ || ch == 0x200A /* HAIR SPACE */ || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */ || ch == 0x0009 /* tab */ || ch == 0x00AD /* SOFT HYPHEN */ || ch == 0x058A /* ARMENIAN HYPHEN */ || ch == 0x2010 /* HYPHEN */ || ch == 0x2012 /* FIGURE DASH */ || ch == 0x2013 /* EN DASH */ || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */ || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */ || ch == 0x1361 /* ETHIOPIC WORDSPACE */ || ch == 0x17D8 /* KHMER SIGN BEYYAL */ || ch == 0x17DA /* KHMER SIGN KOOMUUT */ || ch == 0x2027 /* HYPHENATION POINT */ || ch == 0x007C /* VERTICAL LINE */ || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */ || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */ || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */ || ch == 0x2056 /* THREE DOT PUNCTUATION */ || ch == 0x2058 /* FOUR DOT PUNCTUATION */ || ch == 0x2059 /* FIVE DOT PUNCTUATION */ || ch == 0x205A /* TWO DOT PUNCTUATION */ || ch == 0x205B /* FOUR DOT MARK */ || ch == 0x205D /* TRICOLON */ || ch == 0x205E /* VERTICAL FOUR DOTS */ || ch == 0x2E19 /* PALM BRANCH */ || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */ || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */ || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */ || ch == 0x2E2D /* FIVE DOT PUNCTUATION */ || ch == 0x2E30 /* RING POINT */ || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */ || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */ || ch == 0x10102 /* AEGEAN CHECK MARK */ || ch == 0x1039F /* 
UGARITIC WORD DIVIDER */ || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */ || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */ || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */ || ch == 0x0964 /* DEVANAGARI DANDA */ || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */ || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */ || ch == 0x0E5B /* THAI CHARACTER KHOMUT */ || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */ || ch == 0x104B /* MYANMAR SIGN SECTION */ || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */ || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */ || ch == 0x17D4 /* KHMER SIGN KHAN */ || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */ || ch == 0x1B5E /* BALINESE CARIK SIKI */ || ch == 0x1B5F /* BALINESE CARIK PAREREN */ || ch == 0xA8CE /* SAURASHTRA DANDA */ || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */ || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */ || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */ || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */ || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */ || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */ || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */ || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */ || ch == 0x0F85 /* TIBETAN MARK PALUTA */ || ch == 0x0FBE /* TIBETAN KU RU KHA */ || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */ || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */ || ch == 0x1804 /* MONGOLIAN COLON */ || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ || ch == 0x1B5A /* BALINESE PANTI */ || ch == 0x1B5B /* BALINESE PAMADA */ || ch == 0x1B5C /* BALINESE WINDU */ || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */ || ch == 0x1B60 /* BALINESE PAMENENG */ || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */ || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */ || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */ || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */ || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */ || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */ || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE 
MUCAAD */ || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */ || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */ || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */ || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */ || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */ || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */ || ch == 0xA60D /* VAI COMMA */ || ch == 0xA60F /* VAI QUESTION MARK */ || ch == 0xA92E /* KAYAH LI SIGN CWI */ || ch == 0xA92F /* KAYAH LI SIGN SHYA */ || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */ || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */ || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */ || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */ || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */ || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */ /* Extra characters for compatibility with Unicode LineBreak.txt. */ || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */ || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */ || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */) attr |= 1 << LBP_BA; /* break opportunity before */ if (ch == 0x00B4 /* ACUTE ACCENT */ || ch == 0x1FFD /* GREEK OXIA */ || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */ || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */ || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */ || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */ || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */ || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */ || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */ || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */ || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */ || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */ || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */ || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */ || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */ || ch == 0x0FD3 /* 
TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */ || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */ || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */ || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */) attr |= 1 << LBP_BB; /* hyphen */ if (ch == 0x002D /* HYPHEN-MINUS */) attr |= 1 << LBP_HY; /* contingent break opportunity */ if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */) attr |= 1 << LBP_CB; /* closing punctuation */ if ((unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 'e') || ch == 0x3001 /* IDEOGRAPHIC COMMA */ || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */ || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */ || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */ || ch == 0xFE50 /* SMALL COMMA */ || ch == 0xFE52 /* SMALL FULL STOP */ || ch == 0xFF0C /* FULLWIDTH COMMA */ || ch == 0xFF0E /* FULLWIDTH FULL STOP */ || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */ || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */) attr |= 1 << LBP_CL; /* exclamation/interrogation */ if (ch == 0x0021 /* EXCLAMATION MARK */ || ch == 0x003F /* QUESTION MARK */ || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */ || ch == 0x061B /* ARABIC SEMICOLON */ || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */ || ch == 0x061F /* ARABIC QUESTION MARK */ || ch == 0x06D4 /* ARABIC FULL STOP */ || ch == 0x07F9 /* NKO EXCLAMATION MARK */ || ch == 0x0F0D /* TIBETAN MARK SHAD */ || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */ || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */ || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */ || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */ || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */ || ch == 0x1802 /* MONGOLIAN COMMA */ || ch == 0x1803 /* MONGOLIAN FULL STOP */ || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ || ch == 0x1944 /* LIMBU EXCLAMATION MARK */ || ch == 0x1945 /* LIMBU QUESTION MARK */ || ch == 0x2762 /* HEAVY EXCLAMATION MARK 
ORNAMENT */ || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */ || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */ || ch == 0x2CFE /* COPTIC FULL STOP */ || ch == 0x2E2E /* REVERSED QUESTION MARK */ #if REVISION_22 || ch == 0xA60C /* VAI SYLLABLE LENGTHENER */ #endif || ch == 0xA60E /* VAI FULL STOP */ || ch == 0xA876 /* PHAGS-PA MARK SHAD */ || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */ || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */ || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */ || ch == 0xFE56 /* SMALL QUESTION MARK */ || ch == 0xFE57 /* SMALL EXCLAMATION MARK */ || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */ || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */) attr |= 1 << LBP_EX; /* inseparable */ if (ch == 0x2024 /* ONE DOT LEADER */ || ch == 0x2025 /* TWO DOT LEADER */ || ch == 0x2026 /* HORIZONTAL ELLIPSIS */ || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */) attr |= 1 << LBP_IN; /* non starter */ if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */ || ch == 0x203C /* DOUBLE EXCLAMATION MARK */ || ch == 0x203D /* INTERROBANG */ || ch == 0x2047 /* DOUBLE QUESTION MARK */ || ch == 0x2048 /* QUESTION EXCLAMATION MARK */ || ch == 0x2049 /* EXCLAMATION QUESTION MARK */ || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */ || ch == 0x301C /* WAVE DASH */ || ch == 0x303C /* MASU MARK */ || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */ || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */ || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */ || ch == 0x309D /* HIRAGANA ITERATION MARK */ || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */ || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */ || ch == 0x30FB /* KATAKANA MIDDLE DOT */ || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */ || ch == 0x30FD /* KATAKANA ITERATION MARK */ || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */ || ch == 0xA015 /* YI SYLLABLE WU */ || ch == 0xFE54 /* SMALL SEMICOLON */ || ch == 0xFE55 /* 
SMALL COLON */ || ch == 0xFF1A /* FULLWIDTH COLON */ || ch == 0xFF1B /* FULLWIDTH SEMICOLON */ || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */ || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */ || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL) attr |= 1 << LBP_NS; /* opening punctuation */ if ((unicode_attributes[ch].category[0] == 'P' && unicode_attributes[ch].category[1] == 's') || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ || ch == 0x00BF /* INVERTED QUESTION MARK */ || ch == 0x2E18 /* INVERTED INTERROBANG */) attr |= 1 << LBP_OP; /* ambiguous quotation */ if ((unicode_attributes[ch].category[0] == 'P' && (unicode_attributes[ch].category[1] == 'f' || unicode_attributes[ch].category[1] == 'i')) || ch == 0x0022 /* QUOTATION MARK */ || ch == 0x0027 /* APOSTROPHE */ || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */ || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */ || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */ || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */ || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */ || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */ || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */ || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */ || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */ || ch == 0x2E0B /* RAISED SQUARE */) attr |= 1 << LBP_QU; /* infix separator (numeric) */ if (ch == 0x002C /* COMMA */ || ch == 0x002E /* FULL STOP */ || ch == 0x003A /* COLON */ || ch == 0x003B /* SEMICOLON */ || ch == 0x037E /* GREEK QUESTION MARK */ || ch == 0x0589 /* ARMENIAN FULL STOP */ || ch == 0x060C /* ARABIC COMMA */ || ch == 0x060D /* ARABIC DATE SEPARATOR */ || ch == 0x07F8 /* NKO COMMA */ || ch == 0x2044 /* 
FRACTION SLASH */ || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */ || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */ || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */) attr |= 1 << LBP_IS; /* numeric */ if ((unicode_attributes[ch].category[0] == 'N' && unicode_attributes[ch].category[1] == 'd' && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL) || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */ || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */) attr |= 1 << LBP_NU; /* postfix (numeric) */ if (ch == 0x0025 /* PERCENT SIGN */ || ch == 0x00A2 /* CENT SIGN */ || ch == 0x00B0 /* DEGREE SIGN */ || ch == 0x060B /* AFGHANI SIGN */ || ch == 0x066A /* ARABIC PERCENT SIGN */ || ch == 0x2030 /* PER MILLE SIGN */ || ch == 0x2031 /* PER TEN THOUSAND SIGN */ || ch == 0x2032 /* PRIME */ || ch == 0x2033 /* DOUBLE PRIME */ || ch == 0x2034 /* TRIPLE PRIME */ || ch == 0x2035 /* REVERSED PRIME */ || ch == 0x2036 /* REVERSED DOUBLE PRIME */ || ch == 0x2037 /* REVERSED TRIPLE PRIME */ || ch == 0x20A7 /* PESETA SIGN */ || ch == 0x2103 /* DEGREE CELSIUS */ || ch == 0x2109 /* DEGREE FAHRENHEIT */ || ch == 0xFDFC /* RIAL SIGN */ || ch == 0xFE6A /* SMALL PERCENT SIGN */ || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */ || ch == 0xFFE0 /* FULLWIDTH CENT SIGN */ /* Extra characters for compatibility with Unicode LineBreak.txt.
*/ || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */ || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */ || ch == 0x0D79 /* MALAYALAM DATE MARK */) attr |= 1 << LBP_PO; /* prefix (numeric) */ if ((unicode_attributes[ch].category[0] == 'S' && unicode_attributes[ch].category[1] == 'c') || ch == 0x002B /* PLUS SIGN */ || ch == 0x005C /* REVERSE SOLIDUS */ || ch == 0x00B1 /* PLUS-MINUS SIGN */ || ch == 0x2116 /* NUMERO SIGN */ || ch == 0x2212 /* MINUS SIGN */ || ch == 0x2213 /* MINUS-OR-PLUS SIGN */) if (!(attr & (1 << LBP_PO))) attr |= 1 << LBP_PR; /* symbols allowing breaks */ if (ch == 0x002F /* SOLIDUS */) attr |= 1 << LBP_SY; if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0) attr |= 1 << LBP_H2; if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0) attr |= 1 << LBP_H3; if ((ch >= 0x1100 && ch <= 0x1159) || ch == 0x115F) attr |= 1 << LBP_JL; if (ch >= 0x1160 && ch <= 0x11A2) attr |= 1 << LBP_JV; if (ch >= 0x11A8 && ch <= 0x11F9) attr |= 1 << LBP_JT; /* complex context (South East Asian) */ if (((unicode_attributes[ch].category[0] == 'C' && unicode_attributes[ch].category[1] == 'f') || (unicode_attributes[ch].category[0] == 'L' && (unicode_attributes[ch].category[1] == 'm' || unicode_attributes[ch].category[1] == 'o')) || (unicode_attributes[ch].category[0] == 'M' && (unicode_attributes[ch].category[1] == 'c' || unicode_attributes[ch].category[1] == 'n')) /* Extra characters for compatibility with Unicode LineBreak.txt. 
*/ || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */ || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */ || ch == 0x19DE /* NEW TAI LUE SIGN LAE */ || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */) && ((ch >= 0x0E00 && ch <= 0x0EFF) || (ch >= 0x1000 && ch <= 0x109F) || (ch >= 0x1780 && ch <= 0x17FF) || (ch >= 0x1950 && ch <= 0x19DF))) attr |= 1 << LBP_SA; /* attached characters and combining marks */ if ((unicode_attributes[ch].category[0] == 'M' && (unicode_attributes[ch].category[1] == 'c' || unicode_attributes[ch].category[1] == 'e' || unicode_attributes[ch].category[1] == 'n')) || (unicode_attributes[ch].category[0] == 'C' && (unicode_attributes[ch].category[1] == 'c' || unicode_attributes[ch].category[1] == 'f'))) if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL) | (1 << LBP_SA) | (1 << LBP_WJ) | (1 << LBP_ZW)))) attr |= 1 << LBP_CM; /* ideographic */ if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */ || ch == 0x3000 /* IDEOGRAPHIC SPACE */ || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */ || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */ || (ch >= 0x3400 && ch <= 0x4DB5) /* CJK Ideograph Extension A */ || (ch >= 0x4E00 && ch <= 0x9FC3) /* CJK Ideograph */ || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */ || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */ || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */ || ch == 0xFE62 /* SMALL PLUS SIGN */ || ch == 0xFE63 /* SMALL HYPHEN-MINUS */ || ch == 0xFE64 /* SMALL LESS-THAN SIGN */ || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */ || ch == 0xFE66 /* SMALL EQUALS SIGN */ || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */ || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */ || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */ || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL || (ch >= 0x3000 && ch <= 0x33FF && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL)))) /* Extra characters for 
compatibility with Unicode LineBreak.txt. */ || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */ || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */ || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */ || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */ || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */ || ch == 0xFE45 /* SESAME DOT */ || ch == 0xFE46 /* WHITE SESAME DOT */ || ch == 0xFE49 /* DASHED OVERLINE */ || ch == 0xFE4A /* CENTRELINE OVERLINE */ || ch == 0xFE4B /* WAVY OVERLINE */ || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */ || ch == 0xFE4D /* DASHED LOW LINE */ || ch == 0xFE4E /* CENTRELINE LOW LINE */ || ch == 0xFE4F /* WAVY LOW LINE */ || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */ || ch == 0xFE58 /* SMALL EM DASH */ || ch == 0xFE5F /* SMALL NUMBER SIGN */ || ch == 0xFE60 /* SMALL AMPERSAND */ || ch == 0xFE61 /* SMALL ASTERISK */ || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */ || ch == 0xFE6B /* SMALL COMMERCIAL AT */ || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */ || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */ || ch == 0xFF06 /* FULLWIDTH AMPERSAND */ || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */ || ch == 0xFF0A /* FULLWIDTH ASTERISK */ || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */ || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */ || ch == 0xFF0F /* FULLWIDTH SOLIDUS */ || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */ || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */ || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */ || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */ || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */ || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */ || ch == 0xFF3F /* FULLWIDTH LOW LINE */ || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */ || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */ || ch == 0xFF5E /* FULLWIDTH TILDE */ || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */ || ch == 0xFFE3 /* FULLWIDTH MACRON */ || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */) if (!(attr & ((1 << LBP_NS) | (1 << LBP_CM)))) { /* ambiguous (ideograph) ? 
*/ if ((unicode_width[ch] != NULL && unicode_width[ch][0] == 'A' && ch >= 0x2000) || ch == 0x24EA /* CIRCLED DIGIT ZERO */ || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */) attr |= 1 << LBP_AI; else attr |= 1 << LBP_ID; } /* ordinary alphabetic and symbol characters */ if ((unicode_attributes[ch].category[0] == 'L' && (unicode_attributes[ch].category[1] == 'u' || unicode_attributes[ch].category[1] == 'l' || unicode_attributes[ch].category[1] == 't' || unicode_attributes[ch].category[1] == 'm' || unicode_attributes[ch].category[1] == 'o')) || (unicode_attributes[ch].category[0] == 'S' && (unicode_attributes[ch].category[1] == 'm' || unicode_attributes[ch].category[1] == 'k' || unicode_attributes[ch].category[1] == 'o')) || (unicode_attributes[ch].category[0] == 'N' && (unicode_attributes[ch].category[1] == 'l' || unicode_attributes[ch].category[1] == 'o')) || (unicode_attributes[ch].category[0] == 'P' && (unicode_attributes[ch].category[1] == 'c' || unicode_attributes[ch].category[1] == 'd' || unicode_attributes[ch].category[1] == 'o')) || ch == 0x0600 /* ARABIC NUMBER SIGN */ || ch == 0x0601 /* ARABIC SIGN SANAH */ || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */ || ch == 0x0603 /* ARABIC SIGN SAFHA */ || ch == 0x06DD /* ARABIC END OF AYAH */ || ch == 0x070F /* SYRIAC ABBREVIATION MARK */ || ch == 0x2061 /* FUNCTION APPLICATION */ || ch == 0x2062 /* INVISIBLE TIMES */ || ch == 0x2063 /* INVISIBLE SEPARATOR */ || ch == 0x2064 /* INVISIBLE PLUS */) if (!(attr & ((1 << LBP_GL) | (1 << LBP_B2) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_HY) | (1 << LBP_CB) | (1 << LBP_CL) | (1 << LBP_EX) | (1 << LBP_IN) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_QU) | (1 << LBP_IS) | (1 << LBP_NU) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SY) | (1 << LBP_H2) | (1 << LBP_H3) | (1 << LBP_JL) | (1 << LBP_JV) | (1 << LBP_JT) | (1 << LBP_SA) | (1 << LBP_ID)))) { /* ambiguous (alphabetic) ? 
*/ if ((unicode_width[ch] != NULL && unicode_width[ch][0] == 'A' && ch >= 0x2000 /* Extra exceptions for compatibility with Unicode LineBreak.txt. */ && ch != 0x2022 /* BULLET */ && ch != 0x203E /* OVERLINE */ && ch != 0x2126 /* OHM SIGN */ && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */ && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */ && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */ && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */ && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */ && ch != 0x21E7 /* UPWARDS WHITE ARROW */ && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */ && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */) #if !REVISION_22 || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ || ch == 0x00A7 /* SECTION SIGN */ || ch == 0x00A8 /* DIAERESIS */ || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */ || ch == 0x00B2 /* SUPERSCRIPT TWO */ || ch == 0x00B3 /* SUPERSCRIPT THREE */ || ch == 0x00B6 /* PILCROW SIGN */ || ch == 0x00B7 /* MIDDLE DOT */ || ch == 0x00B8 /* CEDILLA */ || ch == 0x00B9 /* SUPERSCRIPT ONE */ || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */ || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */ || ch == 0x00BD /* VULGAR FRACTION ONE HALF */ || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */ || ch == 0x00BF /* INVERTED QUESTION MARK */ || ch == 0x00D7 /* MULTIPLICATION SIGN */ || ch == 0x00F7 /* DIVISION SIGN */ || ch == 0x02C7 /* CARON */ || ch == 0x02C9 /* MODIFIER LETTER MACRON */ || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */ || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */ || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */ || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */ || ch == 0x02D8 /* BREVE */ || ch == 0x02D9 /* DOT ABOVE */ || ch == 0x02DA /* RING ABOVE */ || ch == 0x02DB /* OGONEK */ || ch == 0x02DD /* DOUBLE ACUTE ACCENT */ #endif || ch == 0x24EA /* CIRCLED DIGIT ZERO */ || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... 
*/ /* Extra characters for compatibility with Unicode LineBreak.txt. */ || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */ || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */ || ch == 0x2616 /* WHITE SHOGI PIECE */ || ch == 0x2617 /* BLACK SHOGI PIECE */) attr |= 1 << LBP_AI; else attr |= 1 << LBP_AL; attr &= ~(1 << LBP_CM); } } if (attr == 0) /* unknown */ attr |= 1 << LBP_XX; return attr; } /* Output the line breaking properties in a human readable format. */ static void debug_output_lbp (FILE *stream) { unsigned int i; for (i = 0; i < 0x110000; i++) { int attr = get_lbp (i); if (attr != 1 << LBP_XX) { fprintf (stream, "0x%04X", i); #define PRINT_BIT(attr,bit) \ if (attr & (1 << bit)) fprintf (stream, " " #bit); PRINT_BIT(attr,LBP_BK); PRINT_BIT(attr,LBP_CM); PRINT_BIT(attr,LBP_WJ); PRINT_BIT(attr,LBP_ZW); PRINT_BIT(attr,LBP_GL); PRINT_BIT(attr,LBP_SP); PRINT_BIT(attr,LBP_B2); PRINT_BIT(attr,LBP_BA); PRINT_BIT(attr,LBP_BB); PRINT_BIT(attr,LBP_HY); PRINT_BIT(attr,LBP_CB); PRINT_BIT(attr,LBP_CL); PRINT_BIT(attr,LBP_EX); PRINT_BIT(attr,LBP_IN); PRINT_BIT(attr,LBP_NS); PRINT_BIT(attr,LBP_OP); PRINT_BIT(attr,LBP_QU); PRINT_BIT(attr,LBP_IS); PRINT_BIT(attr,LBP_NU); PRINT_BIT(attr,LBP_PO); PRINT_BIT(attr,LBP_PR); PRINT_BIT(attr,LBP_SY); PRINT_BIT(attr,LBP_AI); PRINT_BIT(attr,LBP_AL); PRINT_BIT(attr,LBP_H2); PRINT_BIT(attr,LBP_H3); PRINT_BIT(attr,LBP_ID); PRINT_BIT(attr,LBP_JL); PRINT_BIT(attr,LBP_JV); PRINT_BIT(attr,LBP_JT); PRINT_BIT(attr,LBP_SA); PRINT_BIT(attr,LBP_XX); #undef PRINT_BIT fprintf (stream, "\n"); } } } static void debug_output_lbrk_tables (const char *filename) { FILE *stream; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } debug_output_lbp (stream); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* The line breaking property from the LineBreak.txt file. 
*/ int unicode_org_lbp[0x110000]; /* Stores in unicode_org_lbp[] the line breaking property from the LineBreak.txt file. */ static void fill_org_lbp (const char *linebreak_filename) { unsigned int i, j; FILE *stream; char field0[FIELDLEN]; char field1[FIELDLEN]; char field2[FIELDLEN]; int lineno = 0; for (i = 0; i < 0x110000; i++) unicode_org_lbp[i] = LBP_XX; stream = fopen (linebreak_filename, "r"); if (stream == NULL) { fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename); exit (1); } for (;;) { int n; int c; int value; lineno++; c = getc (stream); if (c == EOF) break; if (c == '#') { do c = getc (stream); while (c != EOF && c != '\n'); continue; } ungetc (c, stream); n = getfield (stream, field0, ';'); n += getfield (stream, field1, ' '); n += getfield (stream, field2, '\n'); if (n == 0) break; if (n != 3) { fprintf (stderr, "short line in '%s':%d\n", linebreak_filename, lineno); exit (1); } #define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit; if (false) {} TRY(LBP_BK) TRY(LBP_CM) TRY(LBP_WJ) TRY(LBP_ZW) TRY(LBP_GL) TRY(LBP_SP) TRY(LBP_B2) TRY(LBP_BA) TRY(LBP_BB) TRY(LBP_HY) TRY(LBP_CB) TRY(LBP_CL) TRY(LBP_EX) TRY(LBP_IN) TRY(LBP_NS) TRY(LBP_OP) TRY(LBP_QU) TRY(LBP_IS) TRY(LBP_NU) TRY(LBP_PO) TRY(LBP_PR) TRY(LBP_SY) TRY(LBP_AI) TRY(LBP_AL) TRY(LBP_H2) TRY(LBP_H3) TRY(LBP_ID) TRY(LBP_JL) TRY(LBP_JV) TRY(LBP_JT) TRY(LBP_SA) TRY(LBP_XX) #undef TRY else if (strcmp (field1, "LF") == 0) value = LBP_BK; else if (strcmp (field1, "CR") == 0) value = LBP_BK; else if (strcmp (field1, "NL") == 0) value = LBP_BK; else if (strcmp (field1, "SG") == 0) value = LBP_XX; else { fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n", field1, linebreak_filename, lineno); exit (1); } i = strtoul (field0, NULL, 16); if (strstr (field0, "..") != NULL) { /* Deal with a range. */ j = strtoul (strstr (field0, "..") + 2, NULL, 16); for (; i <= j; i++) unicode_org_lbp[i] = value; } else { /* Single character line. 
*/ unicode_org_lbp[i] = value; } } if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", linebreak_filename); exit (1); } } /* Output the line breaking properties in a human readable format. */ static void debug_output_org_lbp (FILE *stream) { unsigned int i; for (i = 0; i < 0x110000; i++) { int attr = unicode_org_lbp[i]; if (attr != LBP_XX) { fprintf (stream, "0x%04X", i); #define PRINT_BIT(attr,bit) \ if (attr == bit) fprintf (stream, " " #bit); PRINT_BIT(attr,LBP_BK); PRINT_BIT(attr,LBP_CM); PRINT_BIT(attr,LBP_WJ); PRINT_BIT(attr,LBP_ZW); PRINT_BIT(attr,LBP_GL); PRINT_BIT(attr,LBP_SP); PRINT_BIT(attr,LBP_B2); PRINT_BIT(attr,LBP_BA); PRINT_BIT(attr,LBP_BB); PRINT_BIT(attr,LBP_HY); PRINT_BIT(attr,LBP_CB); PRINT_BIT(attr,LBP_CL); PRINT_BIT(attr,LBP_EX); PRINT_BIT(attr,LBP_IN); PRINT_BIT(attr,LBP_NS); PRINT_BIT(attr,LBP_OP); PRINT_BIT(attr,LBP_QU); PRINT_BIT(attr,LBP_IS); PRINT_BIT(attr,LBP_NU); PRINT_BIT(attr,LBP_PO); PRINT_BIT(attr,LBP_PR); PRINT_BIT(attr,LBP_SY); PRINT_BIT(attr,LBP_AI); PRINT_BIT(attr,LBP_AL); PRINT_BIT(attr,LBP_H2); PRINT_BIT(attr,LBP_H3); PRINT_BIT(attr,LBP_ID); PRINT_BIT(attr,LBP_JL); PRINT_BIT(attr,LBP_JV); PRINT_BIT(attr,LBP_JT); PRINT_BIT(attr,LBP_SA); PRINT_BIT(attr,LBP_XX); #undef PRINT_BIT fprintf (stream, "\n"); } } } static void debug_output_org_lbrk_tables (const char *filename) { FILE *stream; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } debug_output_org_lbp (stream); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* Construction of sparse 3-level tables. 
*/ #define TABLE lbp_table #define ELEMENT unsigned char #define DEFAULT LBP_XX #define xmalloc malloc #define xrealloc realloc #include "3level.h" static void output_lbp (FILE *stream1, FILE *stream2) { unsigned int i; struct lbp_table t; unsigned int level1_offset, level2_offset, level3_offset; t.p = 7; t.q = 9; lbp_table_init (&t); for (i = 0; i < 0x110000; i++) { int attr = get_lbp (i); /* Now attr should contain exactly one bit. */ if (attr == 0 || ((attr & (attr - 1)) != 0)) abort (); if (attr != 1 << LBP_XX) { unsigned int log2_attr; for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); lbp_table_add (&t, i, log2_attr); } } lbp_table_finalize (&t); level1_offset = 5 * sizeof (uint32_t); level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t); level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) fprintf (stream1, "#define lbrkprop_header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream1, "\n"); fprintf (stream1, "typedef struct\n"); fprintf (stream1, " {\n"); fprintf (stream1, " int level1[%zu];\n", t.level1_size); fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); fprintf (stream1, " }\n"); fprintf (stream1, "lbrkprop_t;\n"); fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n"); fprintf (stream2, "const lbrkprop_t unilbrkprop =\n"); fprintf (stream2, "{\n"); fprintf (stream2, " {"); if (t.level1_size > 8) fprintf (stream2, "\n "); for (i = 0; i < t.level1_size; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) fprintf (stream2, " %5d", -1); else fprintf (stream2, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream2, ","); } if (t.level1_size > 8) fprintf (stream2, "\n "); fprintf (stream2, " },\n"); 
fprintf (stream2, " {"); if (t.level2_size << t.q > 8) fprintf (stream2, "\n "); for (i = 0; i < t.level2_size << t.q; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) fprintf (stream2, " %5d", -1); else fprintf (stream2, " %5zu", (offset - level3_offset) / sizeof (unsigned char)); if (i+1 < t.level2_size << t.q) fprintf (stream2, ","); } if (t.level2_size << t.q > 8) fprintf (stream2, "\n "); fprintf (stream2, " },\n"); fprintf (stream2, " {"); if (t.level3_size << t.p > 8) fprintf (stream2, "\n "); for (i = 0; i < t.level3_size << t.p; i++) { unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; const char *value_string; switch (value) { #define CASE(x) case x: value_string = #x; break; CASE(LBP_BK); CASE(LBP_CM); CASE(LBP_WJ); CASE(LBP_ZW); CASE(LBP_GL); CASE(LBP_SP); CASE(LBP_B2); CASE(LBP_BA); CASE(LBP_BB); CASE(LBP_HY); CASE(LBP_CB); CASE(LBP_CL); CASE(LBP_EX); CASE(LBP_IN); CASE(LBP_NS); CASE(LBP_OP); CASE(LBP_QU); CASE(LBP_IS); CASE(LBP_NU); CASE(LBP_PO); CASE(LBP_PR); CASE(LBP_SY); CASE(LBP_AI); CASE(LBP_AL); CASE(LBP_H2); CASE(LBP_H3); CASE(LBP_ID); CASE(LBP_JL); CASE(LBP_JV); CASE(LBP_JT); CASE(LBP_SA); CASE(LBP_XX); #undef CASE default: abort (); } if (i > 0 && (i % 8) == 0) fprintf (stream2, "\n "); fprintf (stream2, " %s%s", value_string, (i+1 < t.level3_size << t.p ? 
"," : "")); } if (t.level3_size << t.p > 8) fprintf (stream2, "\n "); fprintf (stream2, " }\n"); fprintf (stream2, "};\n"); } static void output_lbrk_tables (const char *filename1, const char *filename2, const char *version) { const char *filenames[2]; FILE *streams[2]; size_t i; filenames[0] = filename1; filenames[1] = filename2; for (i = 0; i < 2; i++) { streams[i] = fopen (filenames[i], "w"); if (streams[i] == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); exit (1); } } for (i = 0; i < 2; i++) { FILE *stream = streams[i]; fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-lbrk for Unicode %s. */\n", version); fprintf (stream, "\n"); /* Put a GPL header on it. The gnulib module is under LGPL (although it still carries the GPL header), and it's gnulib-tool which replaces the GPL header with an LGPL header. */ fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); fprintf (stream, " (at your option) any later version.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); fprintf (stream, " GNU General Public License for more details.\n"); fprintf (stream, "\n"); fprintf (stream, " You should have received a copy of the GNU General Public License\n"); fprintf (stream, " along with this program. If not, see . 
*/\n"); fprintf (stream, "\n"); } output_lbp (streams[0], streams[1]); for (i = 0; i < 2; i++) { if (ferror (streams[i]) || fclose (streams[i])) { fprintf (stderr, "error writing to '%s'\n", filenames[i]); exit (1); } } } /* ========================================================================= */ /* Word break property. */ /* Possible values of the Word_Break property. */ enum { WBP_OTHER = 0, WBP_CR = 11, WBP_LF = 12, WBP_NEWLINE = 10, WBP_EXTEND = 8, WBP_FORMAT = 9, WBP_KATAKANA = 1, WBP_ALETTER = 2, WBP_MIDNUMLET = 3, WBP_MIDLETTER = 4, WBP_MIDNUM = 5, WBP_NUMERIC = 6, WBP_EXTENDNUMLET = 7 }; /* Returns the word breaking property for ch, as a bit mask. */ static int get_wbp (unsigned int ch) { int attr = 0; if (unicode_attributes[ch].name != NULL) { if (ch == 0x000D) attr |= 1 << WBP_CR; if (ch == 0x000A) attr |= 1 << WBP_LF; if (ch == 0x000B || ch == 0x000C || ch == 0x0085 || ch == 0x2028 || ch == 0x2029) attr |= 1 << WBP_NEWLINE; if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0 || (unicode_attributes[ch].category != NULL && strcmp (unicode_attributes[ch].category, "Mc") == 0)) attr |= 1 << WBP_EXTEND; if (unicode_attributes[ch].category != NULL && strcmp (unicode_attributes[ch].category, "Cf") == 0 && ch != 0x200C && ch != 0x200D) attr |= 1 << WBP_FORMAT; if ((unicode_scripts[ch] < numscripts && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0) || (ch >= 0x3031 && ch <= 0x3035) || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC || ch == 0xFF70) attr |= 1 << WBP_KATAKANA; if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0 || ch == 0x05F3) && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0 && (attr & (1 << WBP_KATAKANA)) == 0 && ((get_lbp (ch) >> LBP_SA) & 1) == 0 && !(unicode_scripts[ch] < numscripts && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0) && (attr & (1 << WBP_EXTEND)) == 0) attr |= 1 << WBP_ALETTER; if (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 || ch == 0x2024 || ch == 
0xFE52 || ch == 0xFF07 || ch == 0xFF0E) attr |= 1 << WBP_MIDNUMLET; if (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A) attr |= 1 << WBP_MIDLETTER; if ((((get_lbp (ch) >> LBP_IS) & 1) != 0 || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C || ch == 0xFF1B) && ch != 0x003A && ch != 0xFE13 && ch != 0x002E) attr |= 1 << WBP_MIDNUM; if (((get_lbp (ch) >> LBP_NU) & 1) != 0 && ch != 0x066C) attr |= 1 << WBP_NUMERIC; if (unicode_attributes[ch].category != NULL && strcmp (unicode_attributes[ch].category, "Pc") == 0) attr |= 1 << WBP_EXTENDNUMLET; } if (attr == 0) /* other */ attr |= 1 << WBP_OTHER; return attr; } /* Output the word break property in a human readable format. */ static void debug_output_wbp (FILE *stream) { unsigned int i; for (i = 0; i < 0x110000; i++) { int attr = get_wbp (i); if (attr != 1 << WBP_OTHER) { fprintf (stream, "0x%04X", i); if (attr & (1 << WBP_CR)) fprintf (stream, " CR"); if (attr & (1 << WBP_LF)) fprintf (stream, " LF"); if (attr & (1 << WBP_NEWLINE)) fprintf (stream, " Newline"); if (attr & (1 << WBP_EXTEND)) fprintf (stream, " Extend"); if (attr & (1 << WBP_FORMAT)) fprintf (stream, " Format"); if (attr & (1 << WBP_KATAKANA)) fprintf (stream, " Katakana"); if (attr & (1 << WBP_ALETTER)) fprintf (stream, " ALetter"); if (attr & (1 << WBP_MIDNUMLET)) fprintf (stream, " MidNumLet"); if (attr & (1 << WBP_MIDLETTER)) fprintf (stream, " MidLetter"); if (attr & (1 << WBP_MIDNUM)) fprintf (stream, " MidNum"); if (attr & (1 << WBP_NUMERIC)) fprintf (stream, " Numeric"); if (attr & (1 << WBP_EXTENDNUMLET)) fprintf (stream, " ExtendNumLet"); fprintf (stream, "\n"); } } } static void debug_output_wbrk_tables (const char *filename) { FILE *stream; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } debug_output_wbp (stream); if (ferror (stream) || fclose (stream)) { fprintf (stderr, 
"error writing to '%s'\n", filename); exit (1); } } /* The word break property from the WordBreakProperty.txt file. */ int unicode_org_wbp[0x110000]; /* Stores in unicode_org_wbp[] the word break property from the WordBreakProperty.txt file. */ static void fill_org_wbp (const char *wordbreakproperty_filename) { unsigned int i; FILE *stream; for (i = 0; i < 0x110000; i++) unicode_org_wbp[i] = WBP_OTHER; stream = fopen (wordbreakproperty_filename, "r"); if (stream == NULL) { fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename); exit (1); } for (;;) { char buf[200+1]; unsigned int i1, i2; char padding[200+1]; char propname[200+1]; int propvalue; if (fscanf (stream, "%200[^\n]\n", buf) < 1) break; if (buf[0] == '\0' || buf[0] == '#') continue; if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) { if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) { fprintf (stderr, "parse error in '%s'\n", wordbreakproperty_filename); exit (1); } i2 = i1; } #define PROP(name,value) \ if (strcmp (propname, name) == 0) propvalue = value; else PROP ("CR", WBP_CR) PROP ("LF", WBP_LF) PROP ("Newline", WBP_NEWLINE) PROP ("Extend", WBP_EXTEND) PROP ("Format", WBP_FORMAT) PROP ("Katakana", WBP_KATAKANA) PROP ("ALetter", WBP_ALETTER) PROP ("MidNumLet", WBP_MIDNUMLET) PROP ("MidLetter", WBP_MIDLETTER) PROP ("MidNum", WBP_MIDNUM) PROP ("Numeric", WBP_NUMERIC) PROP ("ExtendNumLet", WBP_EXTENDNUMLET) #undef PROP { fprintf (stderr, "unknown property value '%s' in '%s'\n", propname, wordbreakproperty_filename); exit (1); } if (!(i1 <= i2 && i2 < 0x110000)) abort (); for (i = i1; i <= i2; i++) unicode_org_wbp[i] = propvalue; } if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename); exit (1); } } /* Output the word break property in a human readable format. 
*/ static void debug_output_org_wbp (FILE *stream) { unsigned int i; for (i = 0; i < 0x110000; i++) { int propvalue = unicode_org_wbp[i]; if (propvalue != WBP_OTHER) { fprintf (stream, "0x%04X", i); #define PROP(name,value) \ if (propvalue == value) fprintf (stream, " " name); else PROP ("CR", WBP_CR) PROP ("LF", WBP_LF) PROP ("Newline", WBP_NEWLINE) PROP ("Extend", WBP_EXTEND) PROP ("Format", WBP_FORMAT) PROP ("Katakana", WBP_KATAKANA) PROP ("ALetter", WBP_ALETTER) PROP ("MidNumLet", WBP_MIDNUMLET) PROP ("MidLetter", WBP_MIDLETTER) PROP ("MidNum", WBP_MIDNUM) PROP ("Numeric", WBP_NUMERIC) PROP ("ExtendNumLet", WBP_EXTENDNUMLET) #undef PROP fprintf (stream, " ??"); fprintf (stream, "\n"); } } } static void debug_output_org_wbrk_tables (const char *filename) { FILE *stream; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } debug_output_org_wbp (stream); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* Construction of sparse 3-level tables. */ #define TABLE wbp_table #define ELEMENT unsigned char #define DEFAULT WBP_OTHER #define xmalloc malloc #define xrealloc realloc #include "3level.h" static void output_wbp (FILE *stream) { unsigned int i; struct wbp_table t; unsigned int level1_offset, level2_offset, level3_offset; t.p = 7; t.q = 9; wbp_table_init (&t); for (i = 0; i < 0x110000; i++) { int attr = get_wbp (i); /* Now attr should contain exactly one bit. 
*/ if (attr == 0 || ((attr & (attr - 1)) != 0)) abort (); if (attr != 1 << WBP_OTHER) { unsigned int log2_attr; for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); wbp_table_add (&t, i, log2_attr); } } wbp_table_finalize (&t); level1_offset = 5 * sizeof (uint32_t); level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t); level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) fprintf (stream, "#define wbrkprop_header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream, "\n"); fprintf (stream, "typedef struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); fprintf (stream, " }\n"); fprintf (stream, "wbrkprop_t;\n"); fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n"); fprintf (stream, "{\n"); fprintf (stream, " {"); if (t.level1_size > 8) fprintf (stream, "\n "); for (i = 0; i < t.level1_size; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level2_size << t.q > 8) fprintf (stream, "\n "); for (i = 0; i < t.level2_size << t.q; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (unsigned char)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); fprintf (stream, 
" },\n"); fprintf (stream, " {"); if (t.level3_size << t.p > 4) fprintf (stream, "\n "); for (i = 0; i < t.level3_size << t.p; i++) { unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; const char *value_string; switch (value) { #define CASE(x) case x: value_string = #x; break; CASE(WBP_OTHER); CASE(WBP_CR); CASE(WBP_LF); CASE(WBP_NEWLINE); CASE(WBP_EXTEND); CASE(WBP_FORMAT); CASE(WBP_KATAKANA); CASE(WBP_ALETTER); CASE(WBP_MIDNUMLET); CASE(WBP_MIDLETTER); CASE(WBP_MIDNUM); CASE(WBP_NUMERIC); CASE(WBP_EXTENDNUMLET); #undef CASE default: abort (); } if (i > 0 && (i % 4) == 0) fprintf (stream, "\n "); fprintf (stream, " %s%s", value_string, (i+1 < t.level3_size << t.p ? "," : "")); } if (t.level3_size << t.p > 4) fprintf (stream, "\n "); fprintf (stream, " }\n"); fprintf (stream, "};\n"); } static void output_wbrk_tables (const char *filename, const char *version) { FILE *stream; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", version); fprintf (stream, "\n"); /* Put a GPL header on it. The gnulib module is under LGPL (although it still carries the GPL header), and it's gnulib-tool which replaces the GPL header with an LGPL header. 
*/ fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); fprintf (stream, " (at your option) any later version.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); fprintf (stream, " GNU General Public License for more details.\n"); fprintf (stream, "\n"); fprintf (stream, " You should have received a copy of the GNU General Public License\n"); fprintf (stream, " along with this program. If not, see . */\n"); fprintf (stream, "\n"); output_wbp (stream); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* ========================================================================= */ /* Maximum number of characters into which a single Unicode character can be decomposed. */ #define MAX_DECOMP_LENGTH 18 enum { UC_DECOMP_CANONICAL,/* Canonical decomposition. */ UC_DECOMP_FONT, /* A font variant (e.g. a blackletter form). */ UC_DECOMP_NOBREAK, /* A no-break version of a space or hyphen. */ UC_DECOMP_INITIAL, /* An initial presentation form (Arabic). */ UC_DECOMP_MEDIAL, /* A medial presentation form (Arabic). */ UC_DECOMP_FINAL, /* A final presentation form (Arabic). */ UC_DECOMP_ISOLATED,/* An isolated presentation form (Arabic). */ UC_DECOMP_CIRCLE, /* An encircled form. */ UC_DECOMP_SUPER, /* A superscript form. */ UC_DECOMP_SUB, /* A subscript form. */ UC_DECOMP_VERTICAL,/* A vertical layout presentation form. 
*/ UC_DECOMP_WIDE, /* A wide (or zenkaku) compatibility character. */ UC_DECOMP_NARROW, /* A narrow (or hankaku) compatibility character. */ UC_DECOMP_SMALL, /* A small variant form (CNS compatibility). */ UC_DECOMP_SQUARE, /* A CJK squared font variant. */ UC_DECOMP_FRACTION,/* A vulgar fraction form. */ UC_DECOMP_COMPAT /* Otherwise unspecified compatibility character. */ }; /* Return the decomposition for a Unicode character (ignoring Hangul Jamo decompositions). Return the type, or -1 for none. */ static int get_decomposition (unsigned int ch, unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH]) { const char *decomposition = unicode_attributes[ch].decomposition; if (decomposition != NULL && decomposition[0] != '\0') { int type = UC_DECOMP_CANONICAL; unsigned int length; char *endptr; if (decomposition[0] == '<') { const char *rangle; size_t typelen; rangle = strchr (decomposition + 1, '>'); if (rangle == NULL) abort (); typelen = rangle + 1 - decomposition; #define TYPE(t1,t2) \ if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \ type = t2; \ else TYPE ("", UC_DECOMP_FONT) TYPE ("", UC_DECOMP_NOBREAK) TYPE ("", UC_DECOMP_INITIAL) TYPE ("", UC_DECOMP_MEDIAL) TYPE ("", UC_DECOMP_FINAL) TYPE ("", UC_DECOMP_ISOLATED) TYPE ("", UC_DECOMP_CIRCLE) TYPE ("", UC_DECOMP_SUPER) TYPE ("", UC_DECOMP_SUB) TYPE ("", UC_DECOMP_VERTICAL) TYPE ("", UC_DECOMP_WIDE) TYPE ("", UC_DECOMP_NARROW) TYPE ("", UC_DECOMP_SMALL) TYPE ("", UC_DECOMP_SQUARE) TYPE ("", UC_DECOMP_FRACTION) TYPE ("", UC_DECOMP_COMPAT) { fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition); exit (1); } #undef TYPE decomposition = rangle + 1; if (decomposition[0] == ' ') decomposition++; } for (length = 0; length < MAX_DECOMP_LENGTH; length++) { decomposed[length] = strtoul (decomposition, &endptr, 16); if (endptr == decomposition) break; decomposition = endptr; if (decomposition[0] == ' ') decomposition++; } if (*decomposition != '\0') /* 
MAX_DECOMP_LENGTH is too small. */ abort (); *lengthp = length; return type; } else return -1; } /* Construction of sparse 3-level tables. */ #define TABLE decomp_table #define ELEMENT uint16_t #define DEFAULT (uint16_t)(-1) #define xmalloc malloc #define xrealloc realloc #include "3level.h" static void output_decomposition (FILE *stream1, FILE *stream2) { struct decomp_table t; unsigned int level1_offset, level2_offset, level3_offset; unsigned int offset; unsigned int ch; unsigned int i; t.p = 5; t.q = 5; decomp_table_init (&t); fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n"); fprintf (stream1, "\n"); fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{"); offset = 0; for (ch = 0; ch < 0x110000; ch++) { unsigned int length; unsigned int decomposed[MAX_DECOMP_LENGTH]; int type = get_decomposition (ch, &length, decomposed); if (type >= 0) { if (!(offset < (1 << 15))) abort (); decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset); /* Produce length 3-bytes entries. */ if (length == 0) /* We would need a special representation of zero-length entries. */ abort (); for (i = 0; i < length; i++) { if (offset > 0) fprintf (stream2, ","); if ((offset % 4) == 0) fprintf (stream2, "\n "); if (!(decomposed[i] < (1 << 18))) abort (); fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X", (((i+1 < length ? (1 << 23) : 0) | (i == 0 ? 
(type << 18) : 0) | decomposed[i]) >> 16) & 0xff, (decomposed[i] >> 8) & 0xff, decomposed[i] & 0xff); offset++; } } } fprintf (stream2, "\n};\n"); fprintf (stream2, "\n"); decomp_table_finalize (&t); level1_offset = 5 * sizeof (uint32_t); level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t); level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) fprintf (stream1, "#define decomp_header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream1, "\n"); fprintf (stream1, "typedef struct\n"); fprintf (stream1, " {\n"); fprintf (stream1, " int level1[%zu];\n", t.level1_size); fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p); fprintf (stream1, " }\n"); fprintf (stream1, "decomp_index_table_t;\n"); fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n"); fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n"); fprintf (stream2, "{\n"); fprintf (stream2, " {"); if (t.level1_size > 8) fprintf (stream2, "\n "); for (i = 0; i < t.level1_size; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) fprintf (stream2, " %5d", -1); else fprintf (stream2, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream2, ","); } if (t.level1_size > 8) fprintf (stream2, "\n "); fprintf (stream2, " },\n"); fprintf (stream2, " {"); if (t.level2_size << t.q > 8) fprintf (stream2, "\n "); for (i = 0; i < t.level2_size << t.q; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream2, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) fprintf (stream2, " %5d", -1); else fprintf (stream2, " %5zu", (offset - level3_offset) / sizeof (uint16_t)); if (i+1 < 
t.level2_size << t.q) fprintf (stream2, ","); } if (t.level2_size << t.q > 8) fprintf (stream2, "\n "); fprintf (stream2, " },\n"); fprintf (stream2, " {"); if (t.level3_size << t.p > 8) fprintf (stream2, "\n "); for (i = 0; i < t.level3_size << t.p; i++) { uint16_t value = ((uint16_t *) (t.result + level3_offset))[i]; if (i > 0 && (i % 8) == 0) fprintf (stream2, "\n "); fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value); if (i+1 < t.level3_size << t.p) fprintf (stream2, ","); } if (t.level3_size << t.p > 8) fprintf (stream2, "\n "); fprintf (stream2, " }\n"); fprintf (stream2, "};\n"); } static void output_decomposition_tables (const char *filename1, const char *filename2, const char *version) { const char *filenames[2]; FILE *streams[2]; size_t i; filenames[0] = filename1; filenames[1] = filename2; for (i = 0; i < 2; i++) { streams[i] = fopen (filenames[i], "w"); if (streams[i] == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); exit (1); } } for (i = 0; i < 2; i++) { FILE *stream = streams[i]; fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Decomposition of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "\n"); } output_decomposition (streams[0], streams[1]); for (i = 0; i < 2; i++) { if (ferror (streams[i]) || fclose (streams[i])) { fprintf (stderr, "error writing to '%s'\n", filenames[i]); exit (1); } } } /* The "excluded from composition" property from the CompositionExclusions.txt file. 
*/ char unicode_composition_exclusions[0x110000]; static void fill_composition_exclusions (const char *compositionexclusions_filename) { FILE *stream; unsigned int i; stream = fopen (compositionexclusions_filename, "r"); if (stream == NULL) { fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename); exit (1); } for (i = 0; i < 0x110000; i++) unicode_composition_exclusions[i] = 0; for (;;) { char buf[200+1]; unsigned int i; if (fscanf (stream, "%200[^\n]\n", buf) < 1) break; if (buf[0] == '\0' || buf[0] == '#') continue; if (sscanf (buf, "%X", &i) != 1) { fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename); exit (1); } if (!(i < 0x110000)) abort (); unicode_composition_exclusions[i] = 1; } if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename); exit (1); } } static void debug_output_composition_tables (const char *filename) { FILE *stream; unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } for (ch = 0; ch < 0x110000; ch++) { unsigned int length; unsigned int decomposed[MAX_DECOMP_LENGTH]; int type = get_decomposition (ch, &length, decomposed); if (type == UC_DECOMP_CANONICAL /* Consider only binary decompositions. Exclude singleton decompositions. */ && length == 2) { unsigned int code1 = decomposed[0]; unsigned int code2 = decomposed[1]; unsigned int combined = ch; /* Exclude decompositions where the first part is not a starter, i.e. is not of canonical combining class 0. */ if (strcmp (unicode_attributes[code1].combining, "0") == 0 /* Exclude characters listed in CompositionExclusions.txt. */ && !unicode_composition_exclusions[combined]) { /* The combined character must now also be a starter. Verify this. 
*/ if (strcmp (unicode_attributes[combined].combining, "0") != 0) abort (); fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n", code1, code2, combined, unicode_attributes[code2].combining); } } } if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } static void output_composition_tables (const char *filename, const char *version) { FILE *stream; unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Canonical composition of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-uni-tables for Unicode %s. */\n", version); fprintf (stream, "\n"); /* Put a GPL header on it. The gnulib module is under LGPL (although it still carries the GPL header), and it's gnulib-tool which replaces the GPL header with an LGPL header. */ fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); fprintf (stream, " (at your option) any later version.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); fprintf (stream, " GNU General Public License for more details.\n"); fprintf (stream, "\n"); fprintf (stream, " You should have received a copy of the GNU General Public License\n"); fprintf (stream, " along with this program. If not, see . 
*/\n"); fprintf (stream, "\n"); /* The composition table is a set of mappings (code1, code2) -> combined, with 928 entries, 367 values for code1 (from 0x003C to 0x30FD), 54 values for code2 (from 0x0300 to 0x309A). For a fixed code1, there are from 1 to 19 possible values for code2. For a fixed code2, there are from 1 to 117 possible values for code1. This is a very sparse matrix. We want an O(1) hash lookup. We could implement the hash lookup by mapping (code1, code2) to a linear combination mul1*code1 + mul2*code2, which is then used as an index into a 3-level table. But this leads to a table of size 37 KB. We use gperf to implement the hash lookup, giving it the 928 sets of 4 bytes (code1, code2) as input. gperf generates a hash table of size 1527, which is quite good (60% filled). It requires an auxiliary table lookup in a table of size 0.5 KB. The total tables size is 11 KB. */ fprintf (stream, "struct composition_rule { char codes[4]; };\n"); fprintf (stream, "%%struct-type\n"); fprintf (stream, "%%language=ANSI-C\n"); fprintf (stream, "%%define slot-name codes\n"); fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n"); fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n"); fprintf (stream, "%%compare-lengths\n"); fprintf (stream, "%%compare-strncmp\n"); fprintf (stream, "%%readonly-tables\n"); fprintf (stream, "%%omit-struct-type\n"); fprintf (stream, "%%%%\n"); for (ch = 0; ch < 0x110000; ch++) { unsigned int length; unsigned int decomposed[MAX_DECOMP_LENGTH]; int type = get_decomposition (ch, &length, decomposed); if (type == UC_DECOMP_CANONICAL /* Consider only binary decompositions. Exclude singleton decompositions. */ && length == 2) { unsigned int code1 = decomposed[0]; unsigned int code2 = decomposed[1]; unsigned int combined = ch; /* Exclude decompositions where the first part is not a starter, i.e. is not of canonical combining class 0. 
*/ if (strcmp (unicode_attributes[code1].combining, "0") == 0 /* Exclude characters listed in CompositionExclusions.txt. */ && !unicode_composition_exclusions[combined]) { /* The combined character must now also be a starter. Verify this. */ if (strcmp (unicode_attributes[combined].combining, "0") != 0) abort (); if (!(code1 < 0x10000)) abort (); if (!(code2 < 0x10000)) abort (); if (!(combined < 0x10000)) abort (); fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", (code1 >> 8) & 0xff, code1 & 0xff, (code2 >> 8) & 0xff, code2 & 0xff, combined); } } } if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* ========================================================================= */ /* Output the test for a simple character mapping table to the given file. */ static void output_simple_mapping_test (const char *filename, const char *function_name, unsigned int (*func) (unsigned int), const char *version) { FILE *stream; bool need_comma; unsigned int ch; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! 
*/\n"); fprintf (stream, "/* Test the Unicode character mapping functions.\n"); fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); fprintf (stream, " (at your option) any later version.\n"); fprintf (stream, "\n"); fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); fprintf (stream, " GNU General Public License for more details.\n"); fprintf (stream, "\n"); fprintf (stream, " You should have received a copy of the GNU General Public License\n"); fprintf (stream, " along with this program. If not, see . */\n"); fprintf (stream, "\n"); fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", version); fprintf (stream, "\n"); fprintf (stream, "#include \"test-mapping-part1.h\"\n"); fprintf (stream, "\n"); need_comma = false; for (ch = 0; ch < 0x110000; ch++) { unsigned int value = func (ch); if (value != ch) { if (need_comma) fprintf (stream, ",\n"); fprintf (stream, " { 0x%04X, 0x%04X }", ch, value); need_comma = true; } } if (need_comma) fprintf (stream, "\n"); fprintf (stream, "\n"); fprintf (stream, "#define MAP(c) %s (c)\n", function_name); fprintf (stream, "#include \"test-mapping-part2.h\"\n"); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* Construction of sparse 3-level tables. 
*/ #define TABLE mapping_table #define ELEMENT int32_t #define DEFAULT 0 #define xmalloc malloc #define xrealloc realloc #include "3level.h" /* Output a simple character mapping table to the given file. */ static void output_simple_mapping (const char *filename, unsigned int (*func) (unsigned int), const char *version) { FILE *stream; unsigned int ch, i; struct mapping_table t; unsigned int level1_offset, level2_offset, level3_offset; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Simple character mapping of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-case.c for Unicode %s. */\n", version); t.p = 7; t.q = 9; mapping_table_init (&t); for (ch = 0; ch < 0x110000; ch++) { int value = (int) func (ch) - (int) ch; mapping_table_add (&t, ch, value); } mapping_table_finalize (&t); /* Offsets in t.result, in memory of this process. 
*/ level1_offset = 5 * sizeof (uint32_t); level2_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t); level3_offset = 5 * sizeof (uint32_t) + t.level1_size * sizeof (uint32_t) + (t.level2_size << t.q) * sizeof (uint32_t); for (i = 0; i < 5; i++) fprintf (stream, "#define mapping_header_%d %d\n", i, ((uint32_t *) t.result)[i]); fprintf (stream, "static const\n"); fprintf (stream, "struct\n"); fprintf (stream, " {\n"); fprintf (stream, " int level1[%zu];\n", t.level1_size); fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p); fprintf (stream, " }\n"); fprintf (stream, "u_mapping =\n"); fprintf (stream, "{\n"); fprintf (stream, " {"); if (t.level1_size > 8) fprintf (stream, "\n "); for (i = 0; i < t.level1_size; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level1_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level2_offset) / sizeof (uint32_t)); if (i+1 < t.level1_size) fprintf (stream, ","); } if (t.level1_size > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level2_size << t.q > 8) fprintf (stream, "\n "); for (i = 0; i < t.level2_size << t.q; i++) { uint32_t offset; if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); offset = ((uint32_t *) (t.result + level2_offset))[i]; if (offset == 0) fprintf (stream, " %5d", -1); else fprintf (stream, " %5zu", (offset - level3_offset) / sizeof (int32_t)); if (i+1 < t.level2_size << t.q) fprintf (stream, ","); } if (t.level2_size << t.q > 8) fprintf (stream, "\n "); fprintf (stream, " },\n"); fprintf (stream, " {"); if (t.level3_size << t.p > 8) fprintf (stream, "\n "); for (i = 0; i < t.level3_size << t.p; i++) { if (i > 0 && (i % 8) == 0) fprintf (stream, "\n "); fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]); if (i+1 < t.level3_size << t.p) fprintf 
(stream, ","); } if (t.level3_size << t.p > 8) fprintf (stream, "\n "); fprintf (stream, " }\n"); fprintf (stream, "};\n"); if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* ========================================================================= */ /* A special casing context. A context is negated through x -> -x. */ enum { SCC_ALWAYS = 0, SCC_FINAL_SIGMA, SCC_AFTER_SOFT_DOTTED, SCC_MORE_ABOVE, SCC_BEFORE_DOT, SCC_AFTER_I }; /* A special casing rule. */ struct special_casing_rule { unsigned int code; unsigned int lower_mapping[3]; unsigned int title_mapping[3]; unsigned int upper_mapping[3]; unsigned int casefold_mapping[3]; const char *language; int context; }; /* The special casing rules. */ struct special_casing_rule **casing_rules; unsigned int num_casing_rules; unsigned int allocated_casing_rules; static void add_casing_rule (struct special_casing_rule *new_rule) { if (num_casing_rules == allocated_casing_rules) { allocated_casing_rules = 2 * allocated_casing_rules; if (allocated_casing_rules < 16) allocated_casing_rules = 16; casing_rules = (struct special_casing_rule **) realloc (casing_rules, allocated_casing_rules * sizeof (struct special_casing_rule *)); } casing_rules[num_casing_rules++] = new_rule; } /* Stores in casing_rules the special casing rules found in specialcasing_filename. 
*/ static void fill_casing_rules (const char *specialcasing_filename) { FILE *stream; stream = fopen (specialcasing_filename, "r"); if (stream == NULL) { fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename); exit (1); } casing_rules = NULL; num_casing_rules = 0; allocated_casing_rules = 0; for (;;) { char buf[200+1]; char *scanptr; char *endptr; int i; unsigned int code; unsigned int lower_mapping[3]; unsigned int title_mapping[3]; unsigned int upper_mapping[3]; char *language; int context; if (fscanf (stream, "%200[^\n]\n", buf) < 1) break; if (buf[0] == '\0' || buf[0] == '#') continue; /* Scan code. */ scanptr = buf; code = strtoul (scanptr, &endptr, 16); if (endptr == scanptr) { fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); exit (1); } scanptr = endptr; if (*scanptr != ';') { fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); exit (1); } scanptr++; /* Scan lower mapping. */ for (i = 0; i < 3; i++) lower_mapping[i] = 0; for (i = 0; i < 3; i++) { while (*scanptr == ' ') scanptr++; if (*scanptr == ';') break; lower_mapping[i] = strtoul (scanptr, &endptr, 16); if (endptr == scanptr) { fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); exit (1); } scanptr = endptr; } if (*scanptr != ';') { fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); exit (1); } scanptr++; /* Scan title mapping. */ for (i = 0; i < 3; i++) title_mapping[i] = 0; for (i = 0; i < 3; i++) { while (*scanptr == ' ') scanptr++; if (*scanptr == ';') break; title_mapping[i] = strtoul (scanptr, &endptr, 16); if (endptr == scanptr) { fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); exit (1); } scanptr = endptr; } if (*scanptr != ';') { fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); exit (1); } scanptr++; /* Scan upper mapping. 
*/ for (i = 0; i < 3; i++) upper_mapping[i] = 0; for (i = 0; i < 3; i++) { while (*scanptr == ' ') scanptr++; if (*scanptr == ';') break; upper_mapping[i] = strtoul (scanptr, &endptr, 16); if (endptr == scanptr) { fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); exit (1); } scanptr = endptr; } if (*scanptr != ';') { fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); exit (1); } scanptr++; /* Scan language and context. */ language = NULL; context = SCC_ALWAYS; while (*scanptr == ' ') scanptr++; if (*scanptr != '\0' && *scanptr != '#') { const char *word_begin = scanptr; const char *word_end; while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ') scanptr++; word_end = scanptr; while (*scanptr == ' ') scanptr++; if (word_end - word_begin == 2) { language = (char *) malloc ((word_end - word_begin) + 1); memcpy (language, word_begin, 2); language[word_end - word_begin] = '\0'; word_begin = word_end = NULL; if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';') { word_begin = scanptr; while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ') scanptr++; word_end = scanptr; } } if (word_end > word_begin) { bool negate = false; if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0) { word_begin += 4; negate = true; } if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0) context = SCC_FINAL_SIGMA; else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0) context = SCC_AFTER_SOFT_DOTTED; else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0) context = SCC_MORE_ABOVE; else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0) context = SCC_BEFORE_DOT; else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0) context = SCC_AFTER_I; else { fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename); exit (1); } if (negate) 
context = - context; } if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';') { fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); exit (1); } } /* Store the rule. */ { struct special_casing_rule *new_rule = (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule)); new_rule->code = code; new_rule->language = language; new_rule->context = context; memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping)); memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping)); memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping)); add_casing_rule (new_rule); } } if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", specialcasing_filename); exit (1); } } /* A casefolding rule. */ struct casefold_rule { unsigned int code; unsigned int mapping[3]; const char *language; }; /* The casefolding rules. */ struct casefold_rule **casefolding_rules; unsigned int num_casefolding_rules; unsigned int allocated_casefolding_rules; /* Stores in casefolding_rules the case folding rules found in casefolding_filename. */ static void fill_casefolding_rules (const char *casefolding_filename) { FILE *stream; stream = fopen (casefolding_filename, "r"); if (stream == NULL) { fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename); exit (1); } casefolding_rules = NULL; num_casefolding_rules = 0; allocated_casefolding_rules = 0; for (;;) { char buf[200+1]; char *scanptr; char *endptr; int i; unsigned int code; char type; unsigned int mapping[3]; if (fscanf (stream, "%200[^\n]\n", buf) < 1) break; if (buf[0] == '\0' || buf[0] == '#') continue; /* Scan code. 
*/ scanptr = buf; code = strtoul (scanptr, &endptr, 16); if (endptr == scanptr) { fprintf (stderr, "parse error in '%s'\n", casefolding_filename); exit (1); } scanptr = endptr; if (*scanptr != ';') { fprintf (stderr, "parse error in '%s'\n", casefolding_filename); exit (1); } scanptr++; /* Scan type. */ while (*scanptr == ' ') scanptr++; switch (*scanptr) { case 'C': case 'F': case 'S': case 'T': type = *scanptr; break; default: fprintf (stderr, "parse error in '%s'\n", casefolding_filename); exit (1); } scanptr++; if (*scanptr != ';') { fprintf (stderr, "parse error in '%s'\n", casefolding_filename); exit (1); } scanptr++; /* Scan casefold mapping. */ for (i = 0; i < 3; i++) mapping[i] = 0; for (i = 0; i < 3; i++) { while (*scanptr == ' ') scanptr++; if (*scanptr == ';') break; mapping[i] = strtoul (scanptr, &endptr, 16); if (endptr == scanptr) { fprintf (stderr, "parse error in '%s'\n", casefolding_filename); exit (1); } scanptr = endptr; } if (*scanptr != ';') { fprintf (stderr, "parse error in '%s'\n", casefolding_filename); exit (1); } scanptr++; /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */ if (type != 'S') { const char * const *languages; unsigned int languages_count; /* Type 'T' indicates that the rule is applicable to Turkish languages only. */ if (type == 'T') { static const char * const turkish_languages[] = { "tr", "az" }; languages = turkish_languages; languages_count = 2; } else { static const char * const all_languages[] = { NULL }; languages = all_languages; languages_count = 1; } for (i = 0; i < languages_count; i++) { /* Store a new rule. 
*/ struct casefold_rule *new_rule = (struct casefold_rule *) malloc (sizeof (struct casefold_rule)); new_rule->code = code; memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping)); new_rule->language = languages[i]; if (num_casefolding_rules == allocated_casefolding_rules) { allocated_casefolding_rules = 2 * allocated_casefolding_rules; if (allocated_casefolding_rules < 16) allocated_casefolding_rules = 16; casefolding_rules = (struct casefold_rule **) realloc (casefolding_rules, allocated_casefolding_rules * sizeof (struct casefold_rule *)); } casefolding_rules[num_casefolding_rules++] = new_rule; } } } if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error reading from '%s'\n", casefolding_filename); exit (1); } } /* Casefold mapping, when it maps to a single character. */ unsigned int unicode_casefold[0x110000]; static unsigned int to_casefold (unsigned int ch) { return unicode_casefold[ch]; } /* Redistribute the casefolding_rules: - Rules that map to a single character, language independently, are stored in unicode_casefold. - Other rules are merged into casing_rules. */ static void redistribute_casefolding_rules (void) { unsigned int ch, i, j; /* Fill unicode_casefold[]. */ for (ch = 0; ch < 0x110000; ch++) unicode_casefold[ch] = ch; for (i = 0; i < num_casefolding_rules; i++) { struct casefold_rule *cfrule = casefolding_rules[i]; if (cfrule->language == NULL && cfrule->mapping[1] == 0) { ch = cfrule->code; if (!(ch < 0x110000)) abort (); unicode_casefold[ch] = cfrule->mapping[0]; } } /* Extend the special casing rules by filling in their casefold_mapping[] field. */ for (j = 0; j < num_casing_rules; j++) { struct special_casing_rule *rule = casing_rules[j]; unsigned int k; rule->casefold_mapping[0] = to_casefold (rule->code); for (k = 1; k < 3; k++) rule->casefold_mapping[k] = 0; } /* Now merge the other casefolding rules into casing_rules. 
*/ for (i = 0; i < num_casefolding_rules; i++) { struct casefold_rule *cfrule = casefolding_rules[i]; if (!(cfrule->language == NULL && cfrule->mapping[1] == 0)) { /* Find a rule that applies to the same code, same language, and it has context SCC_ALWAYS. At the same time, update all rules that have the same code and same or more specific language. */ struct special_casing_rule *found_rule = NULL; for (j = 0; j < num_casing_rules; j++) { struct special_casing_rule *rule = casing_rules[j]; if (rule->code == cfrule->code && (cfrule->language == NULL || (rule->language != NULL && strcmp (rule->language, cfrule->language) == 0))) { memcpy (rule->casefold_mapping, cfrule->mapping, sizeof (rule->casefold_mapping)); if ((cfrule->language == NULL ? rule->language == NULL : rule->language != NULL && strcmp (rule->language, cfrule->language) == 0) && rule->context == SCC_ALWAYS) { /* Found it. */ found_rule = rule; } } } if (found_rule == NULL) { /* Create a new rule. */ struct special_casing_rule *new_rule = (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule)); /* Try to find a rule that applies to the same code, no language restriction, and with context SCC_ALWAYS. */ for (j = 0; j < num_casing_rules; j++) { struct special_casing_rule *rule = casing_rules[j]; if (rule->code == cfrule->code && rule->context == SCC_ALWAYS && rule->language == NULL) { /* Found it. 
*/ found_rule = rule; break; } } new_rule->code = cfrule->code; new_rule->language = cfrule->language; new_rule->context = SCC_ALWAYS; if (found_rule != NULL) { memcpy (new_rule->lower_mapping, found_rule->lower_mapping, sizeof (new_rule->lower_mapping)); memcpy (new_rule->title_mapping, found_rule->title_mapping, sizeof (new_rule->title_mapping)); memcpy (new_rule->upper_mapping, found_rule->upper_mapping, sizeof (new_rule->upper_mapping)); } else { unsigned int k; new_rule->lower_mapping[0] = to_lower (cfrule->code); for (k = 1; k < 3; k++) new_rule->lower_mapping[k] = 0; new_rule->title_mapping[0] = to_title (cfrule->code); for (k = 1; k < 3; k++) new_rule->title_mapping[k] = 0; new_rule->upper_mapping[0] = to_upper (cfrule->code); for (k = 1; k < 3; k++) new_rule->upper_mapping[k] = 0; } memcpy (new_rule->casefold_mapping, cfrule->mapping, sizeof (new_rule->casefold_mapping)); add_casing_rule (new_rule); } } } } static int compare_casing_rules (const void *a, const void *b) { struct special_casing_rule *a_rule = *(struct special_casing_rule **) a; struct special_casing_rule *b_rule = *(struct special_casing_rule **) b; unsigned int a_code = a_rule->code; unsigned int b_code = b_rule->code; if (a_code < b_code) return -1; if (a_code > b_code) return 1; /* Sort the more specific rules before the more general ones. */ return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0)) + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0))); } static void sort_casing_rules (void) { /* Sort the rules 1. by code, 2. by specificity. */ if (num_casing_rules > 1) qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *), compare_casing_rules); } /* Output the special casing rules. 
*/ static void output_casing_rules (const char *filename, const char *version) { FILE *stream; unsigned int i, j; unsigned int minor; stream = fopen (filename, "w"); if (stream == NULL) { fprintf (stderr, "cannot open '%s' for writing\n", filename); exit (1); } fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); fprintf (stream, "/* Special casing rules of Unicode characters. */\n"); fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", version); fprintf (stream, "struct special_casing_rule { char code[3]; };\n"); fprintf (stream, "%%struct-type\n"); fprintf (stream, "%%language=ANSI-C\n"); fprintf (stream, "%%define slot-name code\n"); fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n"); fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n"); fprintf (stream, "%%compare-lengths\n"); fprintf (stream, "%%compare-strncmp\n"); fprintf (stream, "%%readonly-tables\n"); fprintf (stream, "%%omit-struct-type\n"); fprintf (stream, "%%%%\n"); minor = 0; for (i = 0; i < num_casing_rules; i++) { struct special_casing_rule *rule = casing_rules[i]; int context; if (i > 0 && rule->code == casing_rules[i - 1]->code) minor += 1; else minor = 0; if (!(rule->code < 0x10000)) { fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code); exit (1); } fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ", (rule->code >> 8) & 0xff, rule->code & 0xff, minor); fprintf (stream, "%d, ", i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 
1 : 0); context = rule->context; if (context < 0) { fprintf (stream, "-"); context = - context; } else fprintf (stream, " "); switch (context) { case SCC_ALWAYS: fprintf (stream, "SCC_ALWAYS "); break; case SCC_FINAL_SIGMA: fprintf (stream, "SCC_FINAL_SIGMA "); break; case SCC_AFTER_SOFT_DOTTED: fprintf (stream, "SCC_AFTER_SOFT_DOTTED"); break; case SCC_MORE_ABOVE: fprintf (stream, "SCC_MORE_ABOVE "); break; case SCC_BEFORE_DOT: fprintf (stream, "SCC_BEFORE_DOT "); break; case SCC_AFTER_I: fprintf (stream, "SCC_AFTER_I "); break; default: abort (); } fprintf (stream, ", "); if (rule->language != NULL) { if (strlen (rule->language) != 2) abort (); fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]); } else fprintf (stream, "{ '\\0', '\\0' }, "); fprintf (stream, "{ "); for (j = 0; j < 3; j++) { if (j > 0) fprintf (stream, ", "); if (!(rule->upper_mapping[j] < 0x10000)) { fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code); exit (1); } if (rule->upper_mapping[j] != 0) fprintf (stream, "0x%04X", rule->upper_mapping[j]); else fprintf (stream, " 0"); } fprintf (stream, " }, { "); for (j = 0; j < 3; j++) { if (j > 0) fprintf (stream, ", "); if (!(rule->lower_mapping[j] < 0x10000)) { fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code); exit (1); } if (rule->lower_mapping[j] != 0) fprintf (stream, "0x%04X", rule->lower_mapping[j]); else fprintf (stream, " 0"); } fprintf (stream, " }, { "); for (j = 0; j < 3; j++) { if (j > 0) fprintf (stream, ", "); if (!(rule->title_mapping[j] < 0x10000)) { fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code); exit (1); } if (rule->title_mapping[j] != 0) fprintf (stream, "0x%04X", rule->title_mapping[j]); else fprintf (stream, " 0"); } fprintf (stream, " }, { "); for (j = 0; j < 3; j++) { if (j > 0) fprintf (stream, ", "); if (!(rule->casefold_mapping[j] < 0x10000)) { fprintf (stderr, "special 
rule #%u: casefold mapping of code %u out of range\n", i, rule->code); exit (1); } if (rule->casefold_mapping[j] != 0) fprintf (stream, "0x%04X", rule->casefold_mapping[j]); else fprintf (stream, " 0"); } fprintf (stream, " }\n"); } if (ferror (stream) || fclose (stream)) { fprintf (stderr, "error writing to '%s'\n", filename); exit (1); } } /* ========================================================================= */ /* Quoting the Unicode standard: Definition: A character is defined to be "cased" if it has the Lowercase or Uppercase property or has a General_Category value of Titlecase_Letter. */ static bool is_cased (unsigned int ch) { return (is_property_lowercase (ch) || is_property_uppercase (ch) || is_category_Lt (ch)); } /* Quoting the Unicode standard: Definition: A character is defined to be "case-ignorable" if it has the value MidLetter {or the value MidNumLet} for the Word_Break property or its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk). The text marked in braces was added in Unicode 5.1.0, see section "Update of Definition of case-ignorable". */ /* Since this predicate is only used for the "Before C" and "After C" conditions of FINAL_SIGMA, we exclude the "cased" characters here. This simplifies the evaluation of the regular expressions \p{cased} (\p{case-ignorable})* C and C (\p{case-ignorable})* \p{cased} */ static bool is_case_ignorable (unsigned int ch) { return (unicode_org_wbp[ch] == WBP_MIDLETTER || unicode_org_wbp[ch] == WBP_MIDNUMLET || is_category_Mn (ch) || is_category_Me (ch) || is_category_Cf (ch) || is_category_Lm (ch) || is_category_Sk (ch)) && !is_cased (ch); } /* ------------------------------------------------------------------------- */ /* Output all case related properties. 
*/
/* For each of the predicates is_cased and is_case_ignorable, writes a debug
   listing, a test file and a table header.  VERSION is passed on to
   output_predicate.  */
static void
output_casing_properties (const char *version)
{
  /* The "cased" property.  */
  debug_output_predicate ("unicase/cased.txt", is_cased);
  output_predicate_test ("../tests/unicase/test-cased.c", is_cased,
                         "uc_is_cased (c)");
  output_predicate ("unicase/cased.h", is_cased,
                    "u_casing_property_cased", "Casing Properties", version);

  /* The "case-ignorable" property.  */
  debug_output_predicate ("unicase/ignorable.txt", is_case_ignorable);
  output_predicate_test ("../tests/unicase/test-ignorable.c",
                         is_case_ignorable, "uc_is_case_ignorable (c)");
  output_predicate ("unicase/ignorable.h", is_case_ignorable,
                    "u_casing_property_case_ignorable", "Casing Properties",
                    version);
}

/* ========================================================================= */

/* Program entry point.  Expects exactly 13 arguments: the twelve Unicode
   data files, in the order given in the usage message, followed by the
   Unicode version string.  Reads all the data files, then writes all the
   generated files.  */
int
main (int argc, char * argv[])
{
  /* Positions of the command line arguments.  */
  enum
  {
    ARG_UNICODEDATA = 1,
    ARG_PROPLIST,
    ARG_DERIVEDPROPLIST,
    ARG_SCRIPTS,
    ARG_BLOCKS,
    ARG_PROPLIST30,
    ARG_EASTASIANWIDTH,
    ARG_LINEBREAK,
    ARG_WORDBREAKPROPERTY,
    ARG_COMPOSITIONEXCLUSIONS,
    ARG_SPECIALCASING,
    ARG_CASEFOLDING,
    ARG_VERSION,
    EXPECTED_ARGC
  };
  const char *unicodedata_filename;
  const char *proplist_filename;
  const char *derivedproplist_filename;
  const char *scripts_filename;
  const char *blocks_filename;
  const char *proplist30_filename;
  const char *eastasianwidth_filename;
  const char *linebreak_filename;
  const char *wordbreakproperty_filename;
  const char *compositionexclusions_filename;
  const char *specialcasing_filename;
  const char *casefolding_filename;
  const char *version;

  if (argc != EXPECTED_ARGC)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
               argv[0]);
      exit (1);
    }

  unicodedata_filename = argv[ARG_UNICODEDATA];
  proplist_filename = argv[ARG_PROPLIST];
  derivedproplist_filename = argv[ARG_DERIVEDPROPLIST];
  scripts_filename = argv[ARG_SCRIPTS];
  blocks_filename = argv[ARG_BLOCKS];
  proplist30_filename = argv[ARG_PROPLIST30];
  eastasianwidth_filename = argv[ARG_EASTASIANWIDTH];
  linebreak_filename = argv[ARG_LINEBREAK];
  wordbreakproperty_filename = argv[ARG_WORDBREAKPROPERTY];
  compositionexclusions_filename = argv[ARG_COMPOSITIONEXCLUSIONS];
  specialcasing_filename = argv[ARG_SPECIALCASING];
  casefolding_filename = argv[ARG_CASEFOLDING];
  version = argv[ARG_VERSION];

  /* Read the input files.  */
  fill_attributes (unicodedata_filename);
  clear_properties ();
  fill_properties (proplist_filename);
  fill_properties (derivedproplist_filename);
  fill_properties30 (proplist30_filename);
  fill_scripts (scripts_filename);
  fill_blocks (blocks_filename);
  fill_width (eastasianwidth_filename);
  fill_org_lbp (linebreak_filename);
  fill_org_wbp (wordbreakproperty_filename);
  fill_composition_exclusions (compositionexclusions_filename);
  fill_casing_rules (specialcasing_filename);
  fill_casefolding_rules (casefolding_filename);
  redistribute_casefolding_rules ();
  sort_casing_rules ();

  /* Character classification (unictype).  */
  output_categories (version);
  output_category ("unictype/categ_of.h", version);
  output_combclass ("unictype/combining.h", version);
  output_bidi_category ("unictype/bidi_of.h", version);
  output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version);
  output_decimal_digit ("unictype/decdigit.h", version);
  output_digit_test ("../tests/unictype/test-digit.h", version);
  output_digit ("unictype/digit.h", version);
  output_numeric_test ("../tests/unictype/test-numeric.h", version);
  output_numeric ("unictype/numeric.h", version);
  output_mirror ("unictype/mirror.h", version);
  output_properties (version);
  output_scripts (version);
  output_scripts_byname (version);
  output_blocks (version);
  output_ident_properties (version);
  output_old_ctype (version);

  /* Line breaking (unilbrk).  */
  debug_output_lbrk_tables ("unilbrk/lbrkprop.txt");
  debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt");
  output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version);

  /* Word breaking (uniwbrk).  */
  debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt");
  debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt");
  output_wbrk_tables ("uniwbrk/wbrkprop.h", version);

  /* Decomposition and composition (uninorm).  */
  output_decomposition_tables ("uninorm/decomposition-table1.h",
                               "uninorm/decomposition-table2.h", version);
  debug_output_composition_tables ("uninorm/composition.txt");
  output_composition_tables ("uninorm/composition-table.gperf", version);

  /* Case mappings (unicase).  */
  output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c",
                              "uc_toupper", to_upper, version);
  output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c",
                              "uc_tolower", to_lower, version);
  output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c",
                              "uc_totitle", to_title, version);
  output_simple_mapping ("unicase/toupper.h", to_upper, version);
  output_simple_mapping ("unicase/tolower.h", to_lower, version);
  output_simple_mapping ("unicase/totitle.h", to_title, version);
  output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
  output_casing_rules ("unicase/special-casing-table.gperf", version);
  output_casing_properties (version);

  return 0;
}

/*
 * For Emacs M-x compile
 * Local Variables:
 * compile-command: "
   gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \
   ./gen-uni-tables \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/UnicodeData.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/PropList.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/DerivedCoreProperties.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Scripts.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/Blocks.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/EastAsianWidth.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \
        5.1.0
   "
 * End:
 */