summaryrefslogtreecommitdiff
path: root/lib/gen-uni-tables.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/gen-uni-tables.c')
-rw-r--r--lib/gen-uni-tables.c208
1 files changed, 156 insertions, 52 deletions
diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index 2d7c5d98ee..9665d7764c 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -32,7 +32,7 @@
/usr/local/share/Unidata/CompositionExclusions.txt \
/usr/local/share/Unidata/SpecialCasing.txt \
/usr/local/share/Unidata/CaseFolding.txt \
- 6.0.0
+ 6.1.0
*/
#include <assert.h>
@@ -2855,7 +2855,7 @@ is_property_default_ignorable_code_point (unsigned int ch)
bool result1 =
(is_category_Cf (ch)
&& !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */
- && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F)
+ && !((ch >= 0x0600 && ch <= 0x0604) || ch == 0x06DD || ch == 0x070F)
/* For some reason, the following are not listed as having property
Default_Ignorable_Code_Point. */
&& !(ch == 0x110BD))
@@ -3725,7 +3725,8 @@ enum
UC_JOINING_GROUP_YUDH, /* Yudh */
UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */
UC_JOINING_GROUP_ZAIN, /* Zain */
- UC_JOINING_GROUP_ZHAIN /* Zhain */
+ UC_JOINING_GROUP_ZHAIN, /* Zhain */
+ UC_JOINING_GROUP_ROHINGYA_YEH /* Rohingya_Yeh */
};
static uint8_t unicode_joining_group[0x110000];
@@ -3864,6 +3865,7 @@ fill_arabicshaping (const char *arabicshaping_filename)
TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE")
TRY(UC_JOINING_GROUP_ZAIN, "ZAIN")
TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN")
+ TRY(UC_JOINING_GROUP_ROHINGYA_YEH, "ROHINGYA YEH")
#undef TRY
else
{
@@ -4144,6 +4146,7 @@ joining_group_as_c_identifier (int joining_group)
TRY(UC_JOINING_GROUP_YUDH_HE)
TRY(UC_JOINING_GROUP_ZAIN)
TRY(UC_JOINING_GROUP_ZHAIN)
+ TRY(UC_JOINING_GROUP_ROHINGYA_YEH)
#undef TRY
abort ();
}
@@ -4190,11 +4193,22 @@ output_joining_group_test (const char *filename, const char *version)
}
}
+/* Construction of sparse 3-level tables. */
+#define TABLE joining_group_table
+#define ELEMENT uint8_t
+#define DEFAULT UC_JOINING_GROUP_NONE
+#define xmalloc malloc
+#define xrealloc realloc
+#include "3level.h"
+
static void
output_joining_group (const char *filename, const char *version)
{
FILE *stream;
- unsigned int ch_min, ch_max, ch, i;
+ unsigned int ch, i;
+ struct joining_group_table t;
+ unsigned int level1_offset, level2_offset, level3_offset;
+ uint16_t *level3_packed;
stream = fopen (filename, "w");
if (stream == NULL)
@@ -4204,53 +4218,118 @@ output_joining_group (const char *filename, const char *version)
}
fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
- fprintf (stream, "/* Arabic joining type of Unicode characters. */\n");
+ fprintf (stream, "/* Arabic joining group of Unicode characters. */\n");
fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n",
version);
- ch_min = 0x10FFFF;
+ t.p = 7;
+ t.q = 9;
+ joining_group_table_init (&t);
+
for (ch = 0; ch < 0x110000; ch++)
- if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
- {
- ch_min = ch;
- break;
- }
+ {
+ uint8_t value = unicode_joining_group[ch];
- ch_max = 0;
- for (ch = 0x10FFFF; ch > 0; ch--)
- if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE)
- {
- ch_max = ch;
- break;
- }
+ assert (value <= 0x7f);
- assert (ch_min <= ch_max);
+ joining_group_table_add (&t, ch, value);
+ }
- /* If the interval [ch_min, ch_max] is too large, we should better use a
- 3-level table. */
- assert (ch_max - ch_min < 0x200);
+ joining_group_table_finalize (&t);
- fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min);
- fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n",
- ch_max + 1, ch_min);
- fprintf (stream, "{");
- for (i = 0; i <= ch_max - ch_min; i++)
- {
- const char *s;
+ /* Offsets in t.result, in memory of this process. */
+ level1_offset =
+ 5 * sizeof (uint32_t);
+ level2_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t);
+ level3_offset =
+ 5 * sizeof (uint32_t)
+ + t.level1_size * sizeof (uint32_t)
+ + (t.level2_size << t.q) * sizeof (uint32_t);
- ch = ch_min + i;
- if ((i % 2) == 0)
- fprintf (stream, "\n ");
- s = joining_group_as_c_identifier (unicode_joining_group[ch]);
- fprintf (stream, " %s", s);
- if (i+1 <= ch_max - ch_min)
- {
- fprintf (stream, ",");
- if (((i+1) % 2) != 0)
- fprintf (stream, "%*s", 38 - (int) strlen (s), "");
- }
+ for (i = 0; i < 5; i++)
+ fprintf (stream, "#define joining_group_header_%d %d\n", i,
+ ((uint32_t *) t.result)[i]);
+ fprintf (stream, "static const\n");
+ fprintf (stream, "struct\n");
+ fprintf (stream, " {\n");
+ fprintf (stream, " int level1[%zu];\n", t.level1_size);
+ fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q);
+ fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size,
+ (1 << t.p) * 7 / 16);
+ fprintf (stream, " }\n");
+ fprintf (stream, "u_joining_group =\n");
+ fprintf (stream, "{\n");
+ fprintf (stream, " {");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level1_size; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level1_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level2_offset) / sizeof (uint32_t));
+ if (i+1 < t.level1_size)
+ fprintf (stream, ",");
}
- fprintf (stream, "\n");
+ if (t.level1_size > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ fprintf (stream, " {");
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < t.level2_size << t.q; i++)
+ {
+ uint32_t offset;
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ offset = ((uint32_t *) (t.result + level2_offset))[i];
+ if (offset == 0)
+ fprintf (stream, " %5d", -1);
+ else
+ fprintf (stream, " %5zu",
+ (offset - level3_offset) / sizeof (uint8_t));
+ if (i+1 < t.level2_size << t.q)
+ fprintf (stream, ",");
+ }
+ if (t.level2_size << t.q > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " },\n");
+ /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units,
+ not 32-bit units, in order to make the lookup function easier. */
+ level3_packed =
+ (uint16_t *)
+ calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t));
+ for (i = 0; i < t.level3_size << t.p; i++)
+ {
+ unsigned int j = (i * 7) / 16;
+ unsigned int k = (i * 7) % 16;
+ uint32_t value = ((unsigned char *) (t.result + level3_offset))[i];
+ value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k);
+ level3_packed[j] = value & 0xffff;
+ level3_packed[j+1] = value >> 16;
+ }
+ fprintf (stream, " {");
+ if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
+ fprintf (stream, "\n ");
+ for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++)
+ {
+ if (i > 0 && (i % 8) == 0)
+ fprintf (stream, "\n ");
+ fprintf (stream, " 0x%04x", level3_packed[i]);
+ if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1)
+ fprintf (stream, ",");
+ }
+ if ((t.level3_size << t.p) * 7 / 16 + 1 > 8)
+ fprintf (stream, "\n ");
+ fprintf (stream, " }\n");
+ free (level3_packed);
fprintf (stream, "};\n");
if (ferror (stream) || fclose (stream))
@@ -6096,22 +6175,22 @@ output_width_property_test (const char *filename)
enum
{
- /* Values >= 25 are resolved at run time. */
- LBP_BK = 25, /* mandatory break */
+ /* Values >= 26 are resolved at run time. */
+ LBP_BK = 26, /* mandatory break */
/*LBP_CR, carriage return - not used here because it's a DOSism */
/*LBP_LF, line feed - not used here because it's a DOSism */
- LBP_CM = 26, /* attached characters and combining marks */
+ LBP_CM = 27, /* attached characters and combining marks */
/*LBP_NL, next line - not used here because it's equivalent to LBP_BK */
/*LBP_SG, surrogates - not used here because they are not characters */
LBP_WJ = 0, /* word joiner */
- LBP_ZW = 27, /* zero width space */
+ LBP_ZW = 28, /* zero width space */
LBP_GL = 1, /* non-breaking (glue) */
- LBP_SP = 28, /* space */
+ LBP_SP = 29, /* space */
LBP_B2 = 2, /* break opportunity before and after */
LBP_BA = 3, /* break opportunity after */
LBP_BB = 4, /* break opportunity before */
LBP_HY = 5, /* hyphen */
- LBP_CB = 29, /* contingent break opportunity */
+ LBP_CB = 30, /* contingent break opportunity */
LBP_CL = 6, /* closing punctuation */
LBP_CP = 7, /* closing parenthesis */
LBP_EX = 8, /* exclamation/interrogation */
@@ -6124,16 +6203,18 @@ enum
LBP_PO = 15, /* postfix (numeric) */
LBP_PR = 16, /* prefix (numeric) */
LBP_SY = 17, /* symbols allowing breaks */
- LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */
+ LBP_AI = 31, /* ambiguous (alphabetic or ideograph) */
LBP_AL = 18, /* ordinary alphabetic and symbol characters */
+/*LBP_CJ, conditional Japanese starter, resolved to NS */
LBP_H2 = 19, /* Hangul LV syllable */
LBP_H3 = 20, /* Hangul LVT syllable */
+ LBP_HL = 25, /* Hebrew letter */
LBP_ID = 21, /* ideographic */
LBP_JL = 22, /* Hangul L Jamo */
LBP_JV = 23, /* Hangul V Jamo */
LBP_JT = 24, /* Hangul T Jamo */
- LBP_SA = 31, /* complex context (South East Asian) */
- LBP_XX = 32 /* unknown */
+ LBP_SA = 32, /* complex context (South East Asian) */
+ LBP_XX = 33 /* unknown */
};
/* Returns the line breaking classification for ch, as a bit mask. */
@@ -6181,7 +6262,9 @@ get_lbp (unsigned int ch)
attr |= (int64_t) 1 << LBP_SP;
/* break opportunity before and after */
- if (ch == 0x2014 /* EM DASH */)
+ if (ch == 0x2014 /* EM DASH */
+ || ch == 0x2E3A /* TWO-EM DASH */
+ || ch == 0x2E3B /* THREE-EM DASH */)
attr |= (int64_t) 1 << LBP_B2;
/* break opportunity after */
@@ -6234,6 +6317,8 @@ get_lbp (unsigned int ch)
|| ch == 0x2E2D /* FIVE DOT PUNCTUATION */
|| ch == 0x2E30 /* RING POINT */
|| ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */
+ || ch == 0x2E33 /* RAISED DOT */
+ || ch == 0x2E34 /* RAISED COMMA */
|| ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */
|| ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */
|| ch == 0x10102 /* AEGEAN CHECK MARK */
@@ -6310,6 +6395,8 @@ get_lbp (unsigned int ch)
|| ch == 0xA9C7 /* JAVANESE PADA PANGKAT */
|| ch == 0xA9C8 /* JAVANESE PADA LINGSA */
|| ch == 0xA9C9 /* JAVANESE PADA LUNGSI */
+ || ch == 0xAAF0 /* MEETEI MAYEK CHEIKHAN */
+ || ch == 0xAAF1 /* MEETEI MAYEK AHANG KHUDAM */
|| ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */
|| ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */
|| ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */
@@ -6325,6 +6412,13 @@ get_lbp (unsigned int ch)
|| ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */
|| ch == 0x110C0 /* KAITHI DANDA */
|| ch == 0x110C1 /* KAITHI DOUBLE DANDA */
+ || ch == 0x11140 /* CHAKMA SECTION MARK */
+ || ch == 0x11141 /* CHAKMA DANDA */
+ || ch == 0x11142 /* CHAKMA DOUBLE DANDA */
+ || ch == 0x11143 /* CHAKMA QUESTION MARK */
+ || ch == 0x111C5 /* SHARADA DANDA */
+ || ch == 0x111C6 /* SHARADA DOUBLE DANDA */
+ || ch == 0x111C8 /* SHARADA SEPARATOR */
|| ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */
|| ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */
|| ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */)
@@ -6578,6 +6672,10 @@ get_lbp (unsigned int ch)
if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0)
attr |= (int64_t) 1 << LBP_H3;
+ if ((ch >= 0x05D0 && ch <= 0x05F2) || ch == 0xFB1D
+ || (ch >= 0xFB1F && ch <= 0xFB28) || (ch >= 0xFB2A && ch <= 0xFB4F))
+ attr |= (int64_t) 1 << LBP_HL;
+
if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C))
attr |= (int64_t) 1 << LBP_JL;
@@ -6731,6 +6829,7 @@ get_lbp (unsigned int ch)
|| ch == 0x0601 /* ARABIC SIGN SANAH */
|| ch == 0x0602 /* ARABIC FOOTNOTE MARKER */
|| ch == 0x0603 /* ARABIC SIGN SAFHA */
+ || ch == 0x0604 /* ARABIC SIGN SAMVAT */
|| ch == 0x06DD /* ARABIC END OF AYAH */
|| ch == 0x070F /* SYRIAC ABBREVIATION MARK */
|| ch == 0x2061 /* FUNCTION APPLICATION */
@@ -6739,7 +6838,7 @@ get_lbp (unsigned int ch)
|| ch == 0x2064 /* INVISIBLE PLUS */
/* Extra characters for compatibility with Unicode LineBreak.txt. */
|| ch == 0x110BD /* KAITHI NUMBER SIGN */)
- if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
+ if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
{
/* ambiguous (alphabetic) ? */
if ((unicode_width[ch] != NULL
@@ -6859,6 +6958,7 @@ debug_output_lbp (FILE *stream)
PRINT_BIT(attr,LBP_AL);
PRINT_BIT(attr,LBP_H2);
PRINT_BIT(attr,LBP_H3);
+ PRINT_BIT(attr,LBP_HL);
PRINT_BIT(attr,LBP_ID);
PRINT_BIT(attr,LBP_JL);
PRINT_BIT(attr,LBP_JV);
@@ -6973,6 +7073,7 @@ fill_org_lbp (const char *linebreak_filename)
TRY(LBP_AL)
TRY(LBP_H2)
TRY(LBP_H3)
+ TRY(LBP_HL)
TRY(LBP_ID)
TRY(LBP_JL)
TRY(LBP_JV)
@@ -6984,6 +7085,7 @@ fill_org_lbp (const char *linebreak_filename)
else if (strcmp (field1, "CR") == 0) value = LBP_BK;
else if (strcmp (field1, "NL") == 0) value = LBP_BK;
else if (strcmp (field1, "SG") == 0) value = LBP_XX;
+ else if (strcmp (field1, "CJ") == 0) value = LBP_NS;
else
{
fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n",
@@ -7053,6 +7155,7 @@ debug_output_org_lbp (FILE *stream)
PRINT_BIT(attr,LBP_AL);
PRINT_BIT(attr,LBP_H2);
PRINT_BIT(attr,LBP_H3);
+ PRINT_BIT(attr,LBP_HL);
PRINT_BIT(attr,LBP_ID);
PRINT_BIT(attr,LBP_JL);
PRINT_BIT(attr,LBP_JV);
@@ -7225,6 +7328,7 @@ output_lbp (FILE *stream1, FILE *stream2)
CASE(LBP_AL);
CASE(LBP_H2);
CASE(LBP_H3);
+ CASE(LBP_HL);
CASE(LBP_ID);
CASE(LBP_JL);
CASE(LBP_JV);