From 4db05d04e1311d803e0a066420d0815a2e7c05df Mon Sep 17 00:00:00 2001
From: Reuben Thomas <rrt@sc3d.org>
Date: Fri, 14 Jun 2019 14:36:14 +0100
Subject: tokenize_line: fix skipping back over non-word characters (fix #212)

We were testing and truncating the string a byte at a time rather than a
character at a time, which mishandled multi-byte UTF-8 characters at the end
of a word.

Thanks to Juha Jeronen for the bug report.
---
 src/enchant.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'src')

diff --git a/src/enchant.c b/src/enchant.c
index c7e958e..b1178f4 100644
--- a/src/enchant.c
+++ b/src/enchant.c
@@ -197,17 +197,13 @@ static GSList *
 tokenize_line (EnchantDict * dict, GString * line)
 {
 	GSList * tokens = NULL;
-	char *utf = (char *) line->str;
-
-	GString * word;
-	
-	gunichar uc;
+	gchar *utf = (char *) line->str;
 	size_t cur_pos = 0;
 	size_t start_pos = 0;
-	word = g_string_new (NULL);
+	GString * word = g_string_new (NULL);
 
 	while (cur_pos < line->len && *utf) {
-		int i;
+		gunichar uc;
 
 	        /* Skip non-word characters. */
 		cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
@@ -228,10 +224,12 @@ tokenize_line (EnchantDict * dict, GString * line)
 		}
 
 	        /* Skip backwards over any characters that can't appear at the end of a word. */
-		i = word->len-1;
-	        while ((i >= 0) && !enchant_dict_is_word_character(dict, word->str[i], 2)) {
-	                g_string_truncate (word, i);
-			i--;
+	        for (gchar *i_utf = word->str + word->len;
+		     (i_utf = g_utf8_find_prev_char (word->str, i_utf)) != NULL;
+		     g_string_truncate (word, i_utf - word->str)) {
+			uc = g_utf8_get_char (i_utf);
+			if (enchant_dict_is_word_character(dict, uc, 2))
+				break;
 		}
 
 		/* Save (word, position) tuple. */
-- 
cgit v1.2.1