unigbrk, uniwbrk: No-op tweaks.

* lib/unigbrk/u-grapheme-breaks.h: Comment tweaks.
* lib/uniwbrk/u-wordbreaks.h: Coding style and comment tweaks.
* lib/uniwbrk/wbrktable.c: Comment tweaks.
author: Bruno Haible <bruno@clisp.org> 2021-12-27 13:01:50 +0100
committer: Bruno Haible <bruno@clisp.org> 2021-12-28 12:44:01 +0100
commit: 43c47768d8694926a167928ff490fda96240cfbd (patch)
tree: f77563a700a5e9a18043e4fc94c9339e67085212 /lib/uniwbrk
parent: 8220e0f0b5f46ff61e1d19f8a1614508fa162abd (diff)
download: gnulib-43c47768d8694926a167928ff490fda96240cfbd.tar.gz
2 files changed, 24 insertions, 15 deletions
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 1be2880671..8ac5cd06cc 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -23,6 +23,9 @@
    License and of the GNU General Public License along with this
    program.  If not, see <https://www.gnu.org/licenses/>.  */
 
+/* This file implements section 4 "Word Boundaries"
+   of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>.  */
+
 void
 FUNC (const UNIT *s, size_t n, char *p)
 {
@@ -47,6 +50,8 @@ FUNC (const UNIT *s, size_t n, char *p)
          -1 at the very beginning of the string.  */
       int secondlast_compchar_prop = -1;
 
+      /* Number of consecutive regional indicator (RI) characters seen
+         immediately before the current point.  */
       size_t ri_count = 0;
 
       /* Don't break inside multibyte characters.  */
@@ -73,11 +78,15 @@ FUNC (const UNIT *s, size_t n, char *p)
                            || prop == WBP_NEWLINE))
                 *p = 1;
               /* No break within emoji zwj sequence (WB3c).  */
-              else if (last_char_prop == WBP_ZWJ &&
-                       (prop == WBP_GAZ || prop == WBP_EBG))
+              else if (last_char_prop == WBP_ZWJ
+                       && (prop == WBP_GAZ || prop == WBP_EBG))
+                /* *p = 0 */;
+              /* Ignore Format and Extend characters (WB4).  */
+              else if (prop == WBP_EXTEND
+                       || prop == WBP_FORMAT
+                       || prop == WBP_ZWJ)
                 /* *p = 0 */;
-              /* Ignore Format and Extend characters.  */
-              else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
+              else
                 {
                   /* No break in these situations (see UAX #29):
 
@@ -146,7 +155,7 @@ FUNC (const UNIT *s, size_t n, char *p)
           last_char_prop = prop;
 
           /* Ignore Format and Extend characters, except at the
-             start of the line.  */
+             start of the line (WB4).  */
           if (last_compchar_prop < 0
               || last_compchar_prop == WBP_CR
               || last_compchar_prop == WBP_LF
diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c
index 9af65ffb66..59d94c688e 100644
--- a/lib/uniwbrk/wbrktable.c
+++ b/lib/uniwbrk/wbrktable.c
@@ -58,16 +58,16 @@ const int uniwbrk_prop_index[22] =
 
                            last         current
 
-                     (ALetter | HL) × (ALetter | HL)                  (WB5)
-                     (ALetter | HL) × Numeric                         (WB9)
-                                 HL × SQ                              (WB7a)
-                            Numeric × (ALetter | HL)                  (WB10)
-                            Numeric × Numeric                         (WB8)
-                           Katakana × Katakana                        (WB13)
-(ALetter | HL | Numeric | Katakana) × ExtendNumLet                    (WB13a)
-                       ExtendNumLet × ExtendNumLet                    (WB13a)
-                   ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b)
-                     (E_Base | EBG) × E_Modifier                      (WB14)
+                     (ALetter | HL) × (ALetter | HL)                       (WB5)
+                     (ALetter | HL) × Numeric                              (WB9)
+                                 HL × SQ                                   (WB7a)
+                            Numeric × (ALetter | HL)                       (WB10)
+                            Numeric × Numeric                              (WB8)
+                           Katakana × Katakana                             (WB13)
+(ALetter | HL | Numeric | Katakana) × ExtendNumLet                         (WB13a)
+                       ExtendNumLet × ExtendNumLet                         (WB13a)
+                       ExtendNumLet × (ALetter | HL | Numeric | Katakana)  (WB13b)
+                     (E_Base | EBG) × E_Modifier                           (WB14)
 
    Note that the following rules are not handled here but in the loop in u-wordbreaks.h:
    - The rules need to look back or look ahead the second character (WB6, WB7, WB7b, WB7c, WB11, WB12)
author	Bruno Haible <bruno@clisp.org>	2021-12-27 13:01:50 +0100
committer	Bruno Haible <bruno@clisp.org>	2021-12-28 12:44:01 +0100
commit	43c47768d8694926a167928ff490fda96240cfbd (patch)
tree	f77563a700a5e9a18043e4fc94c9339e67085212 /lib/uniwbrk
parent	8220e0f0b5f46ff61e1d19f8a1614508fa162abd (diff)
download	gnulib-43c47768d8694926a167928ff490fda96240cfbd.tar.gz