summary | refs | log | tree | commit | diff
path: root/lib/uniwbrk
diff options
context:
space:
mode:
author: Bruno Haible <bruno@clisp.org> 2021-12-27 13:01:50 +0100
committer: Bruno Haible <bruno@clisp.org> 2021-12-28 12:44:01 +0100
commit: 43c47768d8694926a167928ff490fda96240cfbd (patch)
tree: f77563a700a5e9a18043e4fc94c9339e67085212 /lib/uniwbrk
parent: 8220e0f0b5f46ff61e1d19f8a1614508fa162abd (diff)
download: gnulib-43c47768d8694926a167928ff490fda96240cfbd.tar.gz
unigbrk, uniwbrk: No-op tweaks.
* lib/unigbrk/u-grapheme-breaks.h: Comment tweaks.
* lib/uniwbrk/u-wordbreaks.h: Coding style and comment tweaks.
* lib/uniwbrk/wbrktable.c: Comment tweaks.
Diffstat (limited to 'lib/uniwbrk')
-rw-r--r-- lib/uniwbrk/u-wordbreaks.h | 19
-rw-r--r-- lib/uniwbrk/wbrktable.c | 20
2 files changed, 24 insertions, 15 deletions
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 1be2880671..8ac5cd06cc 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -23,6 +23,9 @@
License and of the GNU General Public License along with this
program. If not, see <https://www.gnu.org/licenses/>. */
+/* This file implements section 4 "Word Boundaries"
+ of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>. */
+
void
FUNC (const UNIT *s, size_t n, char *p)
{
@@ -47,6 +50,8 @@ FUNC (const UNIT *s, size_t n, char *p)
-1 at the very beginning of the string. */
int secondlast_compchar_prop = -1;
+ /* Number of consecutive regional indicator (RI) characters seen
+ immediately before the current point. */
size_t ri_count = 0;
/* Don't break inside multibyte characters. */
@@ -73,11 +78,15 @@ FUNC (const UNIT *s, size_t n, char *p)
|| prop == WBP_NEWLINE))
*p = 1;
/* No break within emoji zwj sequence (WB3c). */
- else if (last_char_prop == WBP_ZWJ &&
- (prop == WBP_GAZ || prop == WBP_EBG))
+ else if (last_char_prop == WBP_ZWJ
+ && (prop == WBP_GAZ || prop == WBP_EBG))
+ /* *p = 0 */;
+ /* Ignore Format and Extend characters (WB4). */
+ else if (prop == WBP_EXTEND
+ || prop == WBP_FORMAT
+ || prop == WBP_ZWJ)
/* *p = 0 */;
- /* Ignore Format and Extend characters. */
- else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
+ else
{
/* No break in these situations (see UAX #29):
@@ -146,7 +155,7 @@ FUNC (const UNIT *s, size_t n, char *p)
last_char_prop = prop;
/* Ignore Format and Extend characters, except at the
- start of the line. */
+ start of the line (WB4). */
if (last_compchar_prop < 0
|| last_compchar_prop == WBP_CR
|| last_compchar_prop == WBP_LF
diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c
index 9af65ffb66..59d94c688e 100644
--- a/lib/uniwbrk/wbrktable.c
+++ b/lib/uniwbrk/wbrktable.c
@@ -58,16 +58,16 @@ const int uniwbrk_prop_index[22] =
last current
- (ALetter | HL) × (ALetter | HL) (WB5)
- (ALetter | HL) × Numeric (WB9)
- HL × SQ (WB7a)
- Numeric × (ALetter | HL) (WB10)
- Numeric × Numeric (WB8)
- Katakana × Katakana (WB13)
-(ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
- ExtendNumLet × ExtendNumLet (WB13a)
- ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b)
- (E_Base | EBG) × E_Modifier (WB14)
+ (ALetter | HL) × (ALetter | HL) (WB5)
+ (ALetter | HL) × Numeric (WB9)
+ HL × SQ (WB7a)
+ Numeric × (ALetter | HL) (WB10)
+ Numeric × Numeric (WB8)
+ Katakana × Katakana (WB13)
+(ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
+ ExtendNumLet × ExtendNumLet (WB13a)
+ ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b)
+ (E_Base | EBG) × E_Modifier (WB14)
Note that the following rules are not handled here but in the loop in u-wordbreaks.h:
- The rules need to look back or look ahead the second character (WB6, WB7, WB7b, WB7c, WB11, WB12)