diff options
author | Bruno Haible <bruno@clisp.org> | 2021-12-29 00:06:11 +0100 |
---|---|---|
committer | Bruno Haible <bruno@clisp.org> | 2021-12-29 00:06:11 +0100 |
commit | 2995fb5e993a5d7434d96465758087b35a1488ac (patch) | |
tree | 2e37cf280b02efd5f6464e5e50ad985a5e48e80c /lib | |
parent | 9f6dca273305dbafb013740c0de468a93b8e7be5 (diff) | |
download | gnulib-2995fb5e993a5d7434d96465758087b35a1488ac.tar.gz |
unilbrk: Update handling of zero-width joiner for Unicode 10.0.0.

* lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks): Update
code for zero-width joiner handling to match UAX #14 for Unicode 10.0.0.
* lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks):
Likewise.
* lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks):
Likewise.
* tests/unilbrk/test-u8-possible-linebreaks.c (main): Add a test
regarding zero-width joiner.
* tests/unilbrk/test-u16-possible-linebreaks.c (main): Likewise.
* tests/unilbrk/test-u32-possible-linebreaks.c (main): Likewise.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/unilbrk/u16-possible-linebreaks.c | 22 | ++++++++++++++++------ |
-rw-r--r-- | lib/unilbrk/u32-possible-linebreaks.c | 22 | ++++++++++++++++------ |
-rw-r--r-- | lib/unilbrk/u8-possible-linebreaks.c | 22 | ++++++++++++++++------ |
3 files changed, 48 insertions, 18 deletions
diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c index 4c01720096..e0f0ff06b7 100644 --- a/lib/unilbrk/u16-possible-linebreaks.c +++ b/lib/unilbrk/u16-possible-linebreaks.c @@ -42,6 +42,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char { int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); const uint16_t *s_end = s + n; + int prev_prop = LBP_BK; /* line break property of last character */ int last_prop = LBP_BK; /* line break property of last non-space character */ char *seen_space = NULL; /* Was a space seen after the last non-space character? */ @@ -58,6 +59,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char { /* (LB4,LB5,LB6) Mandatory break. */ *p = UC_BREAK_MANDATORY; + prev_prop = LBP_BK; last_prop = LBP_BK; seen_space = NULL; } @@ -97,16 +99,16 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char last_prop = LBP_ZW; seen_space = NULL; } - else if (prop == LBP_CM) + else if (prop == LBP_CM || prop == LBP_ZWJ) { - /* (LB9) Don't break just before a combining character, except - immediately after a mandatory break character, space, or - zero-width space. */ + /* (LB9) Don't break just before a combining character or + zero-width joiner, except immediately after a mandatory + break character, space, or zero-width space. */ if (last_prop == LBP_BK) { /* (LB4,LB5,LB6) Don't break at the beginning of a line. */ *p = UC_BREAK_PROHIBITED; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -118,7 +120,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char character as base for combining marks" because now the NBSP CM sequence is recommended instead of SP CM. */ *p = UC_BREAK_POSSIBLE; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. 
*/ last_prop = LBP_AL; seen_space = NULL; } @@ -144,6 +146,12 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char /* (LB8) Break after zero-width space. */ *p = UC_BREAK_POSSIBLE; } + else if (prev_prop == LBP_ZWJ + && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM)) + { + /* (LB8a) Don't break right after a zero-width joiner. */ + *p = UC_BREAK_PROHIBITED; + } else { switch (unilbrk_table [last_prop] [prop]) @@ -164,6 +172,8 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char last_prop = prop; seen_space = NULL; } + + prev_prop = prop; } s += count; diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c index 562419ad96..ab973e911c 100644 --- a/lib/unilbrk/u32-possible-linebreaks.c +++ b/lib/unilbrk/u32-possible-linebreaks.c @@ -40,6 +40,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char { int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); const uint32_t *s_end = s + n; + int prev_prop = LBP_BK; /* line break property of last character */ int last_prop = LBP_BK; /* line break property of last non-space character */ char *seen_space = NULL; /* Was a space seen after the last non-space character? */ @@ -52,6 +53,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char { /* (LB4,LB5,LB6) Mandatory break. */ *p = UC_BREAK_MANDATORY; + prev_prop = LBP_BK; last_prop = LBP_BK; seen_space = NULL; } @@ -91,16 +93,16 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char last_prop = LBP_ZW; seen_space = NULL; } - else if (prop == LBP_CM) + else if (prop == LBP_CM || prop == LBP_ZWJ) { - /* (LB9) Don't break just before a combining character, except - immediately after a mandatory break character, space, or - zero-width space. 
*/ + /* (LB9) Don't break just before a combining character or + zero-width joiner, except immediately after a mandatory + break character, space, or zero-width space. */ if (last_prop == LBP_BK) { /* (LB4,LB5,LB6) Don't break at the beginning of a line. */ *p = UC_BREAK_PROHIBITED; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -112,7 +114,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char character as base for combining marks" because now the NBSP CM sequence is recommended instead of SP CM. */ *p = UC_BREAK_POSSIBLE; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -138,6 +140,12 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char /* (LB8) Break after zero-width space. */ *p = UC_BREAK_POSSIBLE; } + else if (prev_prop == LBP_ZWJ + && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM)) + { + /* (LB8a) Don't break right after a zero-width joiner. */ + *p = UC_BREAK_PROHIBITED; + } else { switch (unilbrk_table [last_prop] [prop]) @@ -158,6 +166,8 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char last_prop = prop; seen_space = NULL; } + + prev_prop = prop; } s++; diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c index d7ab680203..86f1ce8f13 100644 --- a/lib/unilbrk/u8-possible-linebreaks.c +++ b/lib/unilbrk/u8-possible-linebreaks.c @@ -42,6 +42,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * { int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); const uint8_t *s_end = s + n; + int prev_prop = LBP_BK; /* line break property of last character */ int last_prop = LBP_BK; /* line break property of last non-space character */ char *seen_space = NULL; /* Was a space seen after the last non-space character? 
*/ @@ -58,6 +59,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * { /* (LB4,LB5,LB6) Mandatory break. */ *p = UC_BREAK_MANDATORY; + prev_prop = LBP_BK; last_prop = LBP_BK; seen_space = NULL; } @@ -97,16 +99,16 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * last_prop = LBP_ZW; seen_space = NULL; } - else if (prop == LBP_CM) + else if (prop == LBP_CM || prop == LBP_ZWJ) { - /* (LB9) Don't break just before a combining character, except - immediately after a mandatory break character, space, or - zero-width space. */ + /* (LB9) Don't break just before a combining character or + zero-width joiner, except immediately after a mandatory + break character, space, or zero-width space. */ if (last_prop == LBP_BK) { /* (LB4,LB5,LB6) Don't break at the beginning of a line. */ *p = UC_BREAK_PROHIBITED; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -118,7 +120,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * character as base for combining marks" because now the NBSP CM sequence is recommended instead of SP CM. */ *p = UC_BREAK_POSSIBLE; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -144,6 +146,12 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * /* (LB8) Break after zero-width space. */ *p = UC_BREAK_POSSIBLE; } + else if (prev_prop == LBP_ZWJ + && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM)) + { + /* (LB8a) Don't break right after a zero-width joiner. */ + *p = UC_BREAK_PROHIBITED; + } else { switch (unilbrk_table [last_prop] [prop]) @@ -164,6 +172,8 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * last_prop = prop; seen_space = NULL; } + + prev_prop = prop; } s += count; |