From 2995fb5e993a5d7434d96465758087b35a1488ac Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Wed, 29 Dec 2021 00:06:11 +0100 Subject: unilbrk: Update handling of zero-width joiner for Unicode 10.0.0. * lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks): Update code for zero-width joiner handling to match UAX #14 for Unicode 10.0.0. * lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks): Likewise. * lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks): Likewise. * tests/unilbrk/test-u8-possible-linebreaks.c (main): Add a test regarding zero-width joiner. * tests/unilbrk/test-u16-possible-linebreaks.c (main): Likewise. * tests/unilbrk/test-u32-possible-linebreaks.c (main): Likewise. --- lib/unilbrk/u16-possible-linebreaks.c | 22 ++++++++++++++++------ lib/unilbrk/u32-possible-linebreaks.c | 22 ++++++++++++++++------ lib/unilbrk/u8-possible-linebreaks.c | 22 ++++++++++++++++------ 3 files changed, 48 insertions(+), 18 deletions(-) (limited to 'lib') diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c index 4c01720096..e0f0ff06b7 100644 --- a/lib/unilbrk/u16-possible-linebreaks.c +++ b/lib/unilbrk/u16-possible-linebreaks.c @@ -42,6 +42,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char { int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); const uint16_t *s_end = s + n; + int prev_prop = LBP_BK; /* line break property of last character */ int last_prop = LBP_BK; /* line break property of last non-space character */ char *seen_space = NULL; /* Was a space seen after the last non-space character? */ @@ -58,6 +59,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char { /* (LB4,LB5,LB6) Mandatory break. */ *p = UC_BREAK_MANDATORY; + prev_prop = LBP_BK; last_prop = LBP_BK; seen_space = NULL; } @@ -97,16 +99,16 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char last_prop = LBP_ZW; seen_space = NULL; } - else if (prop == LBP_CM) + else if (prop == LBP_CM || prop == LBP_ZWJ) { - /* (LB9) Don't break just before a combining character, except - immediately after a mandatory break character, space, or - zero-width space. */ + /* (LB9) Don't break just before a combining character or + zero-width joiner, except immediately after a mandatory + break character, space, or zero-width space. */ if (last_prop == LBP_BK) { /* (LB4,LB5,LB6) Don't break at the beginning of a line. */ *p = UC_BREAK_PROHIBITED; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -118,7 +120,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char character as base for combining marks" because now the NBSP CM sequence is recommended instead of SP CM. */ *p = UC_BREAK_POSSIBLE; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -144,6 +146,12 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char /* (LB8) Break after zero-width space. */ *p = UC_BREAK_POSSIBLE; } + else if (prev_prop == LBP_ZWJ + && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM)) + { + /* (LB8a) Don't break right after a zero-width joiner. */ + *p = UC_BREAK_PROHIBITED; + } else { switch (unilbrk_table [last_prop] [prop]) @@ -164,6 +172,8 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char last_prop = prop; seen_space = NULL; } + + prev_prop = prop; } s += count; diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c index 562419ad96..ab973e911c 100644 --- a/lib/unilbrk/u32-possible-linebreaks.c +++ b/lib/unilbrk/u32-possible-linebreaks.c @@ -40,6 +40,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char { int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); const uint32_t *s_end = s + n; + int prev_prop = LBP_BK; /* line break property of last character */ int last_prop = LBP_BK; /* line break property of last non-space character */ char *seen_space = NULL; /* Was a space seen after the last non-space character? */ @@ -52,6 +53,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char { /* (LB4,LB5,LB6) Mandatory break. */ *p = UC_BREAK_MANDATORY; + prev_prop = LBP_BK; last_prop = LBP_BK; seen_space = NULL; } @@ -91,16 +93,16 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char last_prop = LBP_ZW; seen_space = NULL; } - else if (prop == LBP_CM) + else if (prop == LBP_CM || prop == LBP_ZWJ) { - /* (LB9) Don't break just before a combining character, except - immediately after a mandatory break character, space, or - zero-width space. */ + /* (LB9) Don't break just before a combining character or + zero-width joiner, except immediately after a mandatory + break character, space, or zero-width space. */ if (last_prop == LBP_BK) { /* (LB4,LB5,LB6) Don't break at the beginning of a line. */ *p = UC_BREAK_PROHIBITED; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -112,7 +114,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char character as base for combining marks" because now the NBSP CM sequence is recommended instead of SP CM. */ *p = UC_BREAK_POSSIBLE; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -138,6 +140,12 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char /* (LB8) Break after zero-width space. */ *p = UC_BREAK_POSSIBLE; } + else if (prev_prop == LBP_ZWJ + && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM)) + { + /* (LB8a) Don't break right after a zero-width joiner. */ + *p = UC_BREAK_PROHIBITED; + } else { switch (unilbrk_table [last_prop] [prop]) @@ -158,6 +166,8 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char last_prop = prop; seen_space = NULL; } + + prev_prop = prop; } s++; diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c index d7ab680203..86f1ce8f13 100644 --- a/lib/unilbrk/u8-possible-linebreaks.c +++ b/lib/unilbrk/u8-possible-linebreaks.c @@ -42,6 +42,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * { int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); const uint8_t *s_end = s + n; + int prev_prop = LBP_BK; /* line break property of last character */ int last_prop = LBP_BK; /* line break property of last non-space character */ char *seen_space = NULL; /* Was a space seen after the last non-space character? */ @@ -58,6 +59,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * { /* (LB4,LB5,LB6) Mandatory break. */ *p = UC_BREAK_MANDATORY; + prev_prop = LBP_BK; last_prop = LBP_BK; seen_space = NULL; } @@ -97,16 +99,16 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * last_prop = LBP_ZW; seen_space = NULL; } - else if (prop == LBP_CM) + else if (prop == LBP_CM || prop == LBP_ZWJ) { - /* (LB9) Don't break just before a combining character, except - immediately after a mandatory break character, space, or - zero-width space. */ + /* (LB9) Don't break just before a combining character or + zero-width joiner, except immediately after a mandatory + break character, space, or zero-width space. */ if (last_prop == LBP_BK) { /* (LB4,LB5,LB6) Don't break at the beginning of a line. */ *p = UC_BREAK_PROHIBITED; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -118,7 +120,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * character as base for combining marks" because now the NBSP CM sequence is recommended instead of SP CM. */ *p = UC_BREAK_POSSIBLE; - /* Treat CM as AL. */ + /* (LB10) Treat CM or ZWJ as AL. */ last_prop = LBP_AL; seen_space = NULL; } @@ -144,6 +146,12 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * /* (LB8) Break after zero-width space. */ *p = UC_BREAK_POSSIBLE; } + else if (prev_prop == LBP_ZWJ + && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM)) + { + /* (LB8a) Don't break right after a zero-width joiner. */ + *p = UC_BREAK_PROHIBITED; + } else { switch (unilbrk_table [last_prop] [prop]) @@ -164,6 +172,8 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char * last_prop = prop; seen_space = NULL; } + + prev_prop = prop; } s += count; -- cgit v1.2.1