diff options
author | Reuben Thomas <rrt@sc3d.org> | 2019-06-14 14:36:14 +0100 |
---|---|---|
committer | Reuben Thomas <rrt@sc3d.org> | 2019-06-17 17:36:21 +0100 |
commit | 4db05d04e1311d803e0a066420d0815a2e7c05df (patch) | |
tree | 28541a410eeee5bb44d11edffb78e3f86fdd4794 /src | |
parent | 7bf08f34a2095e37ea0e8ae5ab92334028d92f75 (diff) | |
download | enchant-4db05d04e1311d803e0a066420d0815a2e7c05df.tar.gz |
tokenize_line: fix skipping back over non-word characters (fix #212)
We were truncating the string a byte at a time rather than a character at a
time.
Thanks to Juha Jeronen for the bug report.
Diffstat (limited to 'src')
-rw-r--r-- | src/enchant.c | 20 |
1 file changed, 9 insertions, 11 deletions
```diff
diff --git a/src/enchant.c b/src/enchant.c
index c7e958e..b1178f4 100644
--- a/src/enchant.c
+++ b/src/enchant.c
@@ -197,17 +197,13 @@ static GSList *
 tokenize_line (EnchantDict * dict, GString * line)
 {
 	GSList * tokens = NULL;
-	char *utf = (char *) line->str;
-
-	GString * word;
-
-	gunichar uc;
+	gchar *utf = (char *) line->str;
 	size_t cur_pos = 0;
 	size_t start_pos = 0;
-	word = g_string_new (NULL);
+	GString * word = g_string_new (NULL);
 
 	while (cur_pos < line->len && *utf) {
-		int i;
+		gunichar uc;
 
 		/* Skip non-word characters. */
 		cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
@@ -228,10 +224,12 @@ tokenize_line (EnchantDict * dict, GString * line)
 		}
 
 		/* Skip backwards over any characters that can't appear at the end of a word. */
-		i = word->len-1;
-		while ((i >= 0) && !enchant_dict_is_word_character(dict, word->str[i], 2)) {
-			g_string_truncate (word, i);
-			i--;
+		for (gchar *i_utf = word->str + word->len;
+		     (i_utf = g_utf8_find_prev_char (word->str, i_utf)) != NULL;
+		     g_string_truncate (word, i_utf - word->str)) {
+			uc = g_utf8_get_char (i_utf);
+			if (enchant_dict_is_word_character(dict, uc, 2))
+				break;
 		}
 
 		/* Save (word, position) tuple. */
```