summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Reuben Thomas <rrt@sc3d.org> 2019-06-14 14:36:14 +0100
committer: Reuben Thomas <rrt@sc3d.org> 2019-06-17 17:36:21 +0100
commit: 4db05d04e1311d803e0a066420d0815a2e7c05df (patch)
tree: 28541a410eeee5bb44d11edffb78e3f86fdd4794 /src
parent: 7bf08f34a2095e37ea0e8ae5ab92334028d92f75 (diff)
download: enchant-4db05d04e1311d803e0a066420d0815a2e7c05df.tar.gz
tokenize_line: fix skipping back over non-word characters (fix #212)
We were truncating the string a byte at a time rather than a character at a time. Thanks to Juha Jeronen for the bug report.
Diffstat (limited to 'src')
-rw-r--r-- src/enchant.c | 20
1 files changed, 9 insertions, 11 deletions
diff --git a/src/enchant.c b/src/enchant.c
index c7e958e..b1178f4 100644
--- a/src/enchant.c
+++ b/src/enchant.c
@@ -197,17 +197,13 @@ static GSList *
tokenize_line (EnchantDict * dict, GString * line)
{
GSList * tokens = NULL;
- char *utf = (char *) line->str;
-
- GString * word;
-
- gunichar uc;
+ gchar *utf = (char *) line->str;
size_t cur_pos = 0;
size_t start_pos = 0;
- word = g_string_new (NULL);
+ GString * word = g_string_new (NULL);
while (cur_pos < line->len && *utf) {
- int i;
+ gunichar uc;
/* Skip non-word characters. */
cur_pos = g_utf8_pointer_to_offset ((const char*)line->str, utf);
@@ -228,10 +224,12 @@ tokenize_line (EnchantDict * dict, GString * line)
}
/* Skip backwards over any characters that can't appear at the end of a word. */
- i = word->len-1;
- while ((i >= 0) && !enchant_dict_is_word_character(dict, word->str[i], 2)) {
- g_string_truncate (word, i);
- i--;
+ for (gchar *i_utf = word->str + word->len;
+ (i_utf = g_utf8_find_prev_char (word->str, i_utf)) != NULL;
+ g_string_truncate (word, i_utf - word->str)) {
+ uc = g_utf8_get_char (i_utf);
+ if (enchant_dict_is_word_character(dict, uc, 2))
+ break;
}
/* Save (word, position) tuple. */