summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: Bruno Haible <bruno@clisp.org> 2021-12-29 00:06:11 +0100
committer: Bruno Haible <bruno@clisp.org> 2021-12-29 00:06:11 +0100
commit: 2995fb5e993a5d7434d96465758087b35a1488ac (patch)
tree: 2e37cf280b02efd5f6464e5e50ad985a5e48e80c /lib
parent: 9f6dca273305dbafb013740c0de468a93b8e7be5 (diff)
download: gnulib-2995fb5e993a5d7434d96465758087b35a1488ac.tar.gz
unilbrk: Update handling of zero-width joiner for Unicode 10.0.0.
* lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks): Update
code for zero-width joiner handling to match UAX #14 for Unicode 10.0.0.
* lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks): Likewise.
* lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks): Likewise.
* tests/unilbrk/test-u8-possible-linebreaks.c (main): Add a test regarding
zero-width joiner.
* tests/unilbrk/test-u16-possible-linebreaks.c (main): Likewise.
* tests/unilbrk/test-u32-possible-linebreaks.c (main): Likewise.
Diffstat (limited to 'lib')
-rw-r--r--  lib/unilbrk/u16-possible-linebreaks.c  22
-rw-r--r--  lib/unilbrk/u32-possible-linebreaks.c  22
-rw-r--r--  lib/unilbrk/u8-possible-linebreaks.c   22
3 files changed, 48 insertions, 18 deletions
diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c
index 4c01720096..e0f0ff06b7 100644
--- a/lib/unilbrk/u16-possible-linebreaks.c
+++ b/lib/unilbrk/u16-possible-linebreaks.c
@@ -42,6 +42,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
{
int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
const uint16_t *s_end = s + n;
+ int prev_prop = LBP_BK; /* line break property of last character */
int last_prop = LBP_BK; /* line break property of last non-space character */
char *seen_space = NULL; /* Was a space seen after the last non-space character? */
@@ -58,6 +59,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
{
/* (LB4,LB5,LB6) Mandatory break. */
*p = UC_BREAK_MANDATORY;
+ prev_prop = LBP_BK;
last_prop = LBP_BK;
seen_space = NULL;
}
@@ -97,16 +99,16 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
last_prop = LBP_ZW;
seen_space = NULL;
}
- else if (prop == LBP_CM)
+ else if (prop == LBP_CM || prop == LBP_ZWJ)
{
- /* (LB9) Don't break just before a combining character, except
- immediately after a mandatory break character, space, or
- zero-width space. */
+ /* (LB9) Don't break just before a combining character or
+ zero-width joiner, except immediately after a mandatory
+ break character, space, or zero-width space. */
if (last_prop == LBP_BK)
{
/* (LB4,LB5,LB6) Don't break at the beginning of a line. */
*p = UC_BREAK_PROHIBITED;
- /* Treat CM as AL. */
+ /* (LB10) Treat CM or ZWJ as AL. */
last_prop = LBP_AL;
seen_space = NULL;
}
@@ -118,7 +120,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
character as base for combining marks" because now the
NBSP CM sequence is recommended instead of SP CM. */
*p = UC_BREAK_POSSIBLE;
- /* Treat CM as AL. */
+ /* (LB10) Treat CM or ZWJ as AL. */
last_prop = LBP_AL;
seen_space = NULL;
}
@@ -144,6 +146,12 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
/* (LB8) Break after zero-width space. */
*p = UC_BREAK_POSSIBLE;
}
+ else if (prev_prop == LBP_ZWJ
+ && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM))
+ {
+ /* (LB8a) Don't break right after a zero-width joiner. */
+ *p = UC_BREAK_PROHIBITED;
+ }
else
{
switch (unilbrk_table [last_prop] [prop])
@@ -164,6 +172,8 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
last_prop = prop;
seen_space = NULL;
}
+
+ prev_prop = prop;
}
s += count;
diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c
index 562419ad96..ab973e911c 100644
--- a/lib/unilbrk/u32-possible-linebreaks.c
+++ b/lib/unilbrk/u32-possible-linebreaks.c
@@ -40,6 +40,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
{
int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
const uint32_t *s_end = s + n;
+ int prev_prop = LBP_BK; /* line break property of last character */
int last_prop = LBP_BK; /* line break property of last non-space character */
char *seen_space = NULL; /* Was a space seen after the last non-space character? */
@@ -52,6 +53,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
{
/* (LB4,LB5,LB6) Mandatory break. */
*p = UC_BREAK_MANDATORY;
+ prev_prop = LBP_BK;
last_prop = LBP_BK;
seen_space = NULL;
}
@@ -91,16 +93,16 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
last_prop = LBP_ZW;
seen_space = NULL;
}
- else if (prop == LBP_CM)
+ else if (prop == LBP_CM || prop == LBP_ZWJ)
{
- /* (LB9) Don't break just before a combining character, except
- immediately after a mandatory break character, space, or
- zero-width space. */
+ /* (LB9) Don't break just before a combining character or
+ zero-width joiner, except immediately after a mandatory
+ break character, space, or zero-width space. */
if (last_prop == LBP_BK)
{
/* (LB4,LB5,LB6) Don't break at the beginning of a line. */
*p = UC_BREAK_PROHIBITED;
- /* Treat CM as AL. */
+ /* (LB10) Treat CM or ZWJ as AL. */
last_prop = LBP_AL;
seen_space = NULL;
}
@@ -112,7 +114,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
character as base for combining marks" because now the
NBSP CM sequence is recommended instead of SP CM. */
*p = UC_BREAK_POSSIBLE;
- /* Treat CM as AL. */
+ /* (LB10) Treat CM or ZWJ as AL. */
last_prop = LBP_AL;
seen_space = NULL;
}
@@ -138,6 +140,12 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
/* (LB8) Break after zero-width space. */
*p = UC_BREAK_POSSIBLE;
}
+ else if (prev_prop == LBP_ZWJ
+ && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM))
+ {
+ /* (LB8a) Don't break right after a zero-width joiner. */
+ *p = UC_BREAK_PROHIBITED;
+ }
else
{
switch (unilbrk_table [last_prop] [prop])
@@ -158,6 +166,8 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
last_prop = prop;
seen_space = NULL;
}
+
+ prev_prop = prop;
}
s++;
diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c
index d7ab680203..86f1ce8f13 100644
--- a/lib/unilbrk/u8-possible-linebreaks.c
+++ b/lib/unilbrk/u8-possible-linebreaks.c
@@ -42,6 +42,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
{
int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
const uint8_t *s_end = s + n;
+ int prev_prop = LBP_BK; /* line break property of last character */
int last_prop = LBP_BK; /* line break property of last non-space character */
char *seen_space = NULL; /* Was a space seen after the last non-space character? */
@@ -58,6 +59,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
{
/* (LB4,LB5,LB6) Mandatory break. */
*p = UC_BREAK_MANDATORY;
+ prev_prop = LBP_BK;
last_prop = LBP_BK;
seen_space = NULL;
}
@@ -97,16 +99,16 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
last_prop = LBP_ZW;
seen_space = NULL;
}
- else if (prop == LBP_CM)
+ else if (prop == LBP_CM || prop == LBP_ZWJ)
{
- /* (LB9) Don't break just before a combining character, except
- immediately after a mandatory break character, space, or
- zero-width space. */
+ /* (LB9) Don't break just before a combining character or
+ zero-width joiner, except immediately after a mandatory
+ break character, space, or zero-width space. */
if (last_prop == LBP_BK)
{
/* (LB4,LB5,LB6) Don't break at the beginning of a line. */
*p = UC_BREAK_PROHIBITED;
- /* Treat CM as AL. */
+ /* (LB10) Treat CM or ZWJ as AL. */
last_prop = LBP_AL;
seen_space = NULL;
}
@@ -118,7 +120,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
character as base for combining marks" because now the
NBSP CM sequence is recommended instead of SP CM. */
*p = UC_BREAK_POSSIBLE;
- /* Treat CM as AL. */
+ /* (LB10) Treat CM or ZWJ as AL. */
last_prop = LBP_AL;
seen_space = NULL;
}
@@ -144,6 +146,12 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
/* (LB8) Break after zero-width space. */
*p = UC_BREAK_POSSIBLE;
}
+ else if (prev_prop == LBP_ZWJ
+ && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM))
+ {
+ /* (LB8a) Don't break right after a zero-width joiner. */
+ *p = UC_BREAK_PROHIBITED;
+ }
else
{
switch (unilbrk_table [last_prop] [prop])
@@ -164,6 +172,8 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
last_prop = prop;
seen_space = NULL;
}
+
+ prev_prop = prop;
}
s += count;