Merge pull request #279 from rrthomas/master

Hunspell: ensure extra word characters are returned as UTF-8 (fix #278)
author: Reuben Thomas <rrt@sc3d.org> 2021-08-09 20:19:47 +0100
committer: GitHub <noreply@github.com> 2021-08-09 20:19:47 +0100
commit: deedafc924caf77183bfc5282205997a34116c79 (patch)
tree: ec745dbceb51c2bd4d2c66b5465234e01f467a98
parent: 5b34d2d98d1929bb4f50894232b22f4e95210873 (diff)
parent: 08575fba2e091c16c5fbcf43fc3803e30d3e48b4 (diff)
download: enchant-deedafc924caf77183bfc5282205997a34116c79.tar.gz
1 file changed, 54 insertions, 46 deletions
diff --git a/providers/enchant_hunspell.cpp b/providers/enchant_hunspell.cpp
index a1c6815..535aa57 100644
--- a/providers/enchant_hunspell.cpp
+++ b/providers/enchant_hunspell.cpp
@@ -1,5 +1,6 @@
 /* enchant
  * Copyright (C) 2003-2004 Joan Moratinos <jmo@softcatala.org>, Dom Lachowicz
+ * Copyright (C) 2016-2021 Reuben Thomas <rrt@sc3d.org>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -16,8 +17,8 @@
  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  * Boston, MA 02110-1301, USA.
  *
- * In addition, as a special exception, Dom Lachowicz
- * gives permission to link the code of this program with
+ * In addition, as a special exception, the copyright holders
+ * give permission to link the code of this program with
  * non-LGPL Spelling Provider libraries (eg: a MSFT Office
  * spell checker backend) and distribute linked combinations including
  * the two.  You must obey the GNU General Public License in all
@@ -45,11 +46,29 @@
 #include "unused-parameter.h"
 
 #include <hunspell/hunspell.hxx>
+// hunspell itself uses this definition (which only supports the BMP)
+#define MAXWORDUTF8LEN (MAXWORDLEN * 3)
 
 #include <glib.h>
 
 /***************************************************************************/
 
+static const char *empty_string = "";
+
+static char *do_iconv(GIConv conv, const char *word) {
+	// g_iconv() does not declare its 'in' parameter const, but iconv() does.
+	char *in = const_cast<char *>(word);
+	size_t len_in = strlen(in);
+	size_t len_out = len_in * 3;
+	char *out_buf = g_new0(char, len_out + 1);
+	char *out = out_buf;
+	size_t result = g_iconv(conv, &in, &len_in, &out, &len_out);
+	if (static_cast<size_t>(-1) == result)
+		return nullptr;
+	*out = '\0';
+	return out_buf;
+}
+
 class HunspellChecker
 {
 public:
@@ -67,6 +86,7 @@ private:
 	GIConv  m_translate_in; /* Selected translation from/to Unicode */
 	GIConv  m_translate_out;
 	Hunspell *hunspell;
+	char *wordchars; /* Value returned by getWordChars() */
 };
 
 /***************************************************************************/
@@ -78,7 +98,7 @@ g_iconv_is_valid(GIConv i)
 }
 
 HunspellChecker::HunspellChecker()
-: apostropheIsWordChar(false), m_translate_in(nullptr), m_translate_out(nullptr), hunspell(nullptr)
+: apostropheIsWordChar(false), m_translate_in(nullptr), m_translate_out(nullptr), hunspell(nullptr), wordchars(nullptr)
 {
 }
 
@@ -89,80 +109,61 @@ HunspellChecker::~HunspellChecker()
 		g_iconv_close(m_translate_in);
 	if (g_iconv_is_valid(m_translate_out))
 		g_iconv_close(m_translate_out);
+	free(wordchars);
 }
 
 bool
 HunspellChecker::checkWord(const char *utf8Word, size_t len)
 {
-	if (len > MAXWORDLEN || !g_iconv_is_valid(m_translate_in))
+	if (len > MAXWORDUTF8LEN || !g_iconv_is_valid(m_translate_in))
 		return false;
 
 	// the 8bit encodings use precomposed forms
 	char *normalizedWord = g_utf8_normalize (utf8Word, len, G_NORMALIZE_NFC);
-	char *in = normalizedWord;
-	char word8[MAXWORDLEN + 1];
-	char *out = word8;
-	size_t len_in = strlen(in);
-	size_t len_out = sizeof( word8 ) - 1;
-	size_t result = g_iconv(m_translate_in, &in, &len_in, &out, &len_out);
+	char *out = do_iconv(m_translate_in, normalizedWord);
 	g_free(normalizedWord);
-	if (static_cast<size_t>(-1) == result)
-		return false;
-	*out = '\0';
-	if (hunspell->spell(std::string(word8)))
-		return true;
-	else
+	if (out == NULL)
 		return false;
+	bool result = hunspell->spell(std::string(out)) != 0;
+	free(out);
+	return result;
 }
 
 char**
 HunspellChecker::suggestWord(const char* const utf8Word, size_t len, size_t *nsug)
 {
-	if (len > MAXWORDLEN 
+	if (len > MAXWORDUTF8LEN
 		|| !g_iconv_is_valid(m_translate_in)
 		|| !g_iconv_is_valid(m_translate_out))
 		return nullptr;
 
 	// the 8bit encodings use precomposed forms
 	char *normalizedWord = g_utf8_normalize (utf8Word, len, G_NORMALIZE_NFC);
-	char *in = normalizedWord;
-	char word8[MAXWORDLEN + 1];
-	char *out = word8;
-	size_t len_in = strlen(in);
-	size_t len_out = sizeof(word8) - 1;
-	size_t result = g_iconv(m_translate_in, &in, &len_in, &out, &len_out);
+	char *out = do_iconv(m_translate_in, normalizedWord);
 	g_free(normalizedWord);
-	if (static_cast<size_t>(-1) == result)
+	if (out == NULL)
 		return nullptr;
 
-	*out = '\0';
-	std::vector<std::string> sugMS = hunspell->suggest(word8);
+	std::vector<std::string> sugMS = hunspell->suggest(out);
+	g_free(out);
 	*nsug = sugMS.size();
 	if (*nsug > 0) {
 		char **sug = g_new0 (char *, *nsug + 1);
-		for (size_t i=0; i<*nsug; i++) {
-			in = const_cast<char *>(sugMS[i].c_str());
-			len_in = strlen(in);
-			len_out = MAXWORDLEN;
-			char *word = g_new0(char, len_out + 1);
-			out = word;
-			if (static_cast<size_t>(-1) == g_iconv(m_translate_out, &in, &len_in, &out, &len_out)) {
-				*nsug = i;
-				break;
-			}
-			*out = '\0';
-			sug[i] = word;
+		for (size_t i=0, j=0; i<*nsug; i++) {
+			const char *in = sugMS[i].c_str();
+			out = do_iconv(m_translate_out, in);
+			if (out != NULL)
+				sug[j++] = out;
 		}
 		return sug;
 	}
-	else
-		return nullptr;
+	return nullptr;
 }
 
-const char*
+_GL_ATTRIBUTE_PURE const char*
 HunspellChecker::getWordchars()
 {
-	return hunspell->get_wordchars();
+	return static_cast<const char *>(wordchars);
 }
 
 static void
@@ -304,8 +305,11 @@ HunspellChecker::requestDictionary(const char *szLang)
 	std::string aff(s_correspondingAffFile(dic));
 	if (s_fileExists(aff))
 	{
-		if (hunspell)
+		if (hunspell) {
 			delete hunspell;
+			free(wordchars);
+			wordchars = NULL;
+		}
 		hunspell = new Hunspell(aff.c_str(), dic);
 	}
 	free(dic);
@@ -317,9 +321,13 @@ HunspellChecker::requestDictionary(const char *szLang)
 	m_translate_in = g_iconv_open(enc, "UTF-8");
 	m_translate_out = g_iconv_open("UTF-8", enc);
 
-	const char *word_chars = hunspell->get_wordchars();
-	apostropheIsWordChar = g_utf8_strchr(word_chars, -1, g_utf8_get_char("'")) ||
-		g_utf8_strchr(word_chars, -1, g_utf8_get_char("’"));
+	wordchars = do_iconv(m_translate_out, hunspell->get_wordchars());
+	if (wordchars == NULL)
+		wordchars = strdup(empty_string);
+	if (wordchars == NULL)
+		return false;
+	apostropheIsWordChar = g_utf8_strchr(wordchars, -1, g_utf8_get_char("'")) ||
+		g_utf8_strchr(wordchars, -1, g_utf8_get_char("’"));
 
 	return true;
 }
author	Reuben Thomas <rrt@sc3d.org>	2021-08-09 20:19:47 +0100
committer	GitHub <noreply@github.com>	2021-08-09 20:19:47 +0100
commit	deedafc924caf77183bfc5282205997a34116c79 (patch)
tree	ec745dbceb51c2bd4d2c66b5465234e01f467a98
parent	5b34d2d98d1929bb4f50894232b22f4e95210873 (diff)
parent	08575fba2e091c16c5fbcf43fc3803e30d3e48b4 (diff)
download	enchant-deedafc924caf77183bfc5282205997a34116c79.tar.gz