From 4db05d04e1311d803e0a066420d0815a2e7c05df Mon Sep 17 00:00:00 2001 From: Reuben Thomas Date: Fri, 14 Jun 2019 14:36:14 +0100 Subject: tokenize_line: fix skipping back over non-word characters (fix #212) We were truncating the string a byte at a time rather than a character at a time. Thanks to Juha Jeronen for the bug report. --- src/enchant.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/enchant.c b/src/enchant.c index c7e958e..b1178f4 100644 --- a/src/enchant.c +++ b/src/enchant.c @@ -197,17 +197,13 @@ static GSList * tokenize_line (EnchantDict * dict, GString * line) { GSList * tokens = NULL; - char *utf = (char *) line->str; - - GString * word; - - gunichar uc; + gchar *utf = (char *) line->str; size_t cur_pos = 0; size_t start_pos = 0; - word = g_string_new (NULL); + GString * word = g_string_new (NULL); while (cur_pos < line->len && *utf) { - int i; + gunichar uc; /* Skip non-word characters. */ cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf); @@ -228,10 +224,12 @@ tokenize_line (EnchantDict * dict, GString * line) } /* Skip backwards over any characters that can't appear at the end of a word. */ - i = word->len-1; - while ((i >= 0) && !enchant_dict_is_word_character(dict, word->str[i], 2)) { - g_string_truncate (word, i); - i--; + for (gchar *i_utf = word->str + word->len; + (i_utf = g_utf8_find_prev_char (word->str, i_utf)) != NULL; + g_string_truncate (word, i_utf - word->str)) { + uc = g_utf8_get_char (i_utf); + if (enchant_dict_is_word_character(dict, uc, 2)) + break; } /* Save (word, position) tuple. */ -- cgit v1.2.1