unilbrk: Update handling of zero-width joiner for Unicode 10.0.0.

* lib/unilbrk/u8-possible-linebreaks.c (u8_possible_linebreaks): Update code for zero-width joiner handling to match UAX #14 for Unicode 10.0.0.
* lib/unilbrk/u16-possible-linebreaks.c (u16_possible_linebreaks): Likewise.
* lib/unilbrk/u32-possible-linebreaks.c (u32_possible_linebreaks): Likewise.
* tests/unilbrk/test-u8-possible-linebreaks.c (main): Add a test regarding zero-width joiner.
* tests/unilbrk/test-u16-possible-linebreaks.c (main): Likewise.
* tests/unilbrk/test-u32-possible-linebreaks.c (main): Likewise.
author: Bruno Haible <bruno@clisp.org> 2021-12-29 00:06:11 +0100
committer: Bruno Haible <bruno@clisp.org> 2021-12-29 00:06:11 +0100
commit: 2995fb5e993a5d7434d96465758087b35a1488ac (patch)
tree: 2e37cf280b02efd5f6464e5e50ad985a5e48e80c /lib
parent: 9f6dca273305dbafb013740c0de468a93b8e7be5 (diff)
download: gnulib-2995fb5e993a5d7434d96465758087b35a1488ac.tar.gz
3 files changed, 48 insertions, 18 deletions
diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c
index 4c01720096..e0f0ff06b7 100644
--- a/lib/unilbrk/u16-possible-linebreaks.c
+++ b/lib/unilbrk/u16-possible-linebreaks.c
@@ -42,6 +42,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
     {
       int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
       const uint16_t *s_end = s + n;
+      int prev_prop = LBP_BK; /* line break property of last character */
       int last_prop = LBP_BK; /* line break property of last non-space character */
       char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 
@@ -58,6 +59,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
             {
               /* (LB4,LB5,LB6) Mandatory break.  */
               *p = UC_BREAK_MANDATORY;
+              prev_prop = LBP_BK;
               last_prop = LBP_BK;
               seen_space = NULL;
             }
@@ -97,16 +99,16 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
                   last_prop = LBP_ZW;
                   seen_space = NULL;
                 }
-              else if (prop == LBP_CM)
+              else if (prop == LBP_CM || prop == LBP_ZWJ)
                 {
-                  /* (LB9) Don't break just before a combining character, except
-                     immediately after a mandatory break character, space, or
-                     zero-width space.  */
+                  /* (LB9) Don't break just before a combining character or
+                     zero-width joiner, except immediately after a mandatory
+                     break character, space, or zero-width space.  */
                   if (last_prop == LBP_BK)
                     {
                       /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
                       *p = UC_BREAK_PROHIBITED;
-                      /* Treat CM as AL.  */
+                      /* (LB10) Treat CM or ZWJ as AL.  */
                       last_prop = LBP_AL;
                       seen_space = NULL;
                     }
@@ -118,7 +120,7 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
                          character as base for combining marks" because now the
                          NBSP CM sequence is recommended instead of SP CM.  */
                       *p = UC_BREAK_POSSIBLE;
-                      /* Treat CM as AL.  */
+                      /* (LB10) Treat CM or ZWJ as AL.  */
                       last_prop = LBP_AL;
                       seen_space = NULL;
                     }
@@ -144,6 +146,12 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
                       /* (LB8) Break after zero-width space.  */
                       *p = UC_BREAK_POSSIBLE;
                     }
+                  else if (prev_prop == LBP_ZWJ
+                           && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM))
+                    {
+                      /* (LB8a) Don't break right after a zero-width joiner.  */
+                      *p = UC_BREAK_PROHIBITED;
+                    }
                   else
                     {
                       switch (unilbrk_table [last_prop] [prop])
@@ -164,6 +172,8 @@ u16_possible_linebreaks (const uint16_t *s, size_t n, const char *encoding, char
                   last_prop = prop;
                   seen_space = NULL;
                 }
+
+              prev_prop = prop;
             }
 
           s += count;
diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c
index 562419ad96..ab973e911c 100644
--- a/lib/unilbrk/u32-possible-linebreaks.c
+++ b/lib/unilbrk/u32-possible-linebreaks.c
@@ -40,6 +40,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
     {
       int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
       const uint32_t *s_end = s + n;
+      int prev_prop = LBP_BK; /* line break property of last character */
       int last_prop = LBP_BK; /* line break property of last non-space character */
       char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 
@@ -52,6 +53,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
             {
               /* (LB4,LB5,LB6) Mandatory break.  */
               *p = UC_BREAK_MANDATORY;
+              prev_prop = LBP_BK;
               last_prop = LBP_BK;
               seen_space = NULL;
             }
@@ -91,16 +93,16 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
                   last_prop = LBP_ZW;
                   seen_space = NULL;
                 }
-              else if (prop == LBP_CM)
+              else if (prop == LBP_CM || prop == LBP_ZWJ)
                 {
-                  /* (LB9) Don't break just before a combining character, except
-                     immediately after a mandatory break character, space, or
-                     zero-width space.  */
+                  /* (LB9) Don't break just before a combining character or
+                     zero-width joiner, except immediately after a mandatory
+                     break character, space, or zero-width space.  */
                   if (last_prop == LBP_BK)
                     {
                       /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
                       *p = UC_BREAK_PROHIBITED;
-                      /* Treat CM as AL.  */
+                      /* (LB10) Treat CM or ZWJ as AL.  */
                       last_prop = LBP_AL;
                       seen_space = NULL;
                     }
@@ -112,7 +114,7 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
                          character as base for combining marks" because now the
                          NBSP CM sequence is recommended instead of SP CM.  */
                       *p = UC_BREAK_POSSIBLE;
-                      /* Treat CM as AL.  */
+                      /* (LB10) Treat CM or ZWJ as AL.  */
                       last_prop = LBP_AL;
                       seen_space = NULL;
                     }
@@ -138,6 +140,12 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
                       /* (LB8) Break after zero-width space.  */
                       *p = UC_BREAK_POSSIBLE;
                     }
+                  else if (prev_prop == LBP_ZWJ
+                           && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM))
+                    {
+                      /* (LB8a) Don't break right after a zero-width joiner.  */
+                      *p = UC_BREAK_PROHIBITED;
+                    }
                   else
                     {
                       switch (unilbrk_table [last_prop] [prop])
@@ -158,6 +166,8 @@ u32_possible_linebreaks (const uint32_t *s, size_t n, const char *encoding, char
                   last_prop = prop;
                   seen_space = NULL;
                 }
+
+              prev_prop = prop;
             }
 
           s++;
diff --git a/lib/unilbrk/u8-possible-linebreaks.c b/lib/unilbrk/u8-possible-linebreaks.c
index d7ab680203..86f1ce8f13 100644
--- a/lib/unilbrk/u8-possible-linebreaks.c
+++ b/lib/unilbrk/u8-possible-linebreaks.c
@@ -42,6 +42,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
     {
       int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
       const uint8_t *s_end = s + n;
+      int prev_prop = LBP_BK; /* line break property of last character */
       int last_prop = LBP_BK; /* line break property of last non-space character */
       char *seen_space = NULL; /* Was a space seen after the last non-space character? */
 
@@ -58,6 +59,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
             {
               /* (LB4,LB5,LB6) Mandatory break.  */
               *p = UC_BREAK_MANDATORY;
+              prev_prop = LBP_BK;
               last_prop = LBP_BK;
               seen_space = NULL;
             }
@@ -97,16 +99,16 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
                   last_prop = LBP_ZW;
                   seen_space = NULL;
                 }
-              else if (prop == LBP_CM)
+              else if (prop == LBP_CM || prop == LBP_ZWJ)
                 {
-                  /* (LB9) Don't break just before a combining character, except
-                     immediately after a mandatory break character, space, or
-                     zero-width space.  */
+                  /* (LB9) Don't break just before a combining character or
+                     zero-width joiner, except immediately after a mandatory
+                     break character, space, or zero-width space.  */
                   if (last_prop == LBP_BK)
                     {
                       /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
                       *p = UC_BREAK_PROHIBITED;
-                      /* Treat CM as AL.  */
+                      /* (LB10) Treat CM or ZWJ as AL.  */
                       last_prop = LBP_AL;
                       seen_space = NULL;
                     }
@@ -118,7 +120,7 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
                          character as base for combining marks" because now the
                          NBSP CM sequence is recommended instead of SP CM.  */
                       *p = UC_BREAK_POSSIBLE;
-                      /* Treat CM as AL.  */
+                      /* (LB10) Treat CM or ZWJ as AL.  */
                       last_prop = LBP_AL;
                       seen_space = NULL;
                     }
@@ -144,6 +146,12 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
                       /* (LB8) Break after zero-width space.  */
                       *p = UC_BREAK_POSSIBLE;
                     }
+                  else if (prev_prop == LBP_ZWJ
+                           && (prop == LBP_ID || prop == LBP_EB || prop == LBP_EM))
+                    {
+                      /* (LB8a) Don't break right after a zero-width joiner.  */
+                      *p = UC_BREAK_PROHIBITED;
+                    }
                   else
                     {
                       switch (unilbrk_table [last_prop] [prop])
@@ -164,6 +172,8 @@ u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *
                   last_prop = prop;
                   seen_space = NULL;
                 }
+
+              prev_prop = prop;
             }
 
           s += count;
author	Bruno Haible <bruno@clisp.org>	2021-12-29 00:06:11 +0100
committer	Bruno Haible <bruno@clisp.org>	2021-12-29 00:06:11 +0100
commit	2995fb5e993a5d7434d96465758087b35a1488ac (patch)
tree	2e37cf280b02efd5f6464e5e50ad985a5e48e80c /lib
parent	9f6dca273305dbafb013740c0de468a93b8e7be5 (diff)
download	gnulib-2995fb5e993a5d7434d96465758087b35a1488ac.tar.gz