tokenize_line: fix skipping back over non-word characters (fix #212)

We were examining and truncating the string a byte at a time rather than a character at a time, which mishandles multi-byte UTF-8 characters. Thanks to Juha Jeronen for the bug report.
author: Reuben Thomas <rrt@sc3d.org> 2019-06-14 14:36:14 +0100
committer: Reuben Thomas <rrt@sc3d.org> 2019-06-17 17:36:21 +0100
commit: 4db05d04e1311d803e0a066420d0815a2e7c05df (patch)
tree: 28541a410eeee5bb44d11edffb78e3f86fdd4794 /src
parent: 7bf08f34a2095e37ea0e8ae5ab92334028d92f75 (diff)
download: enchant-4db05d04e1311d803e0a066420d0815a2e7c05df.tar.gz
1 file changed, 9 insertions, 11 deletions
diff --git a/src/enchant.c b/src/enchant.c
index c7e958e..b1178f4 100644
--- a/src/enchant.c
+++ b/src/enchant.c
@@ -197,17 +197,13 @@ static GSList *
 tokenize_line (EnchantDict * dict, GString * line)
 {
 	GSList * tokens = NULL;
-	char *utf = (char *) line->str;
-
-	GString * word;
-	
-	gunichar uc;
+	gchar *utf = (char *) line->str;
 	size_t cur_pos = 0;
 	size_t start_pos = 0;
-	word = g_string_new (NULL);
+	GString * word = g_string_new (NULL);
 
 	while (cur_pos < line->len && *utf) {
-		int i;
+		gunichar uc;
 
 	        /* Skip non-word characters. */
 		cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
@@ -228,10 +224,12 @@ tokenize_line (EnchantDict * dict, GString * line)
 		}
 
 	        /* Skip backwards over any characters that can't appear at the end of a word. */
-		i = word->len-1;
-	        while ((i >= 0) && !enchant_dict_is_word_character(dict, word->str[i], 2)) {
-	                g_string_truncate (word, i);
-			i--;
+	        for (gchar *i_utf = word->str + word->len;
+		     (i_utf = g_utf8_find_prev_char (word->str, i_utf)) != NULL;
+		     g_string_truncate (word, i_utf - word->str)) {
+			uc = g_utf8_get_char (i_utf);
+			if (enchant_dict_is_word_character(dict, uc, 2))
+				break;
 		}
 
 		/* Save (word, position) tuple. */
author	Reuben Thomas <rrt@sc3d.org>	2019-06-14 14:36:14 +0100
committer	Reuben Thomas <rrt@sc3d.org>	2019-06-17 17:36:21 +0100
commit	4db05d04e1311d803e0a066420d0815a2e7c05df (patch)
tree	28541a410eeee5bb44d11edffb78e3f86fdd4794 /src
parent	7bf08f34a2095e37ea0e8ae5ab92334028d92f75 (diff)
download	enchant-4db05d04e1311d803e0a066420d0815a2e7c05df.tar.gz