/* enchant
 * Copyright (C) 2003-2004 Joan Moratinos , Dom Lachowicz
 * Copyright (C) 2016-2021 Reuben Thomas
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 * In addition, as a special exception, the copyright holders
 * give permission to link the code of this program with
 * non-LGPL Spelling Provider libraries (eg: a MSFT Office
 * spell checker backend) and distribute linked combinations including
 * the two. You must obey the GNU General Public License in all
 * respects for all of the code used other than said providers. If you modify
 * this file, you may extend this exception to your version of the
 * file, but you are not obligated to do so. If you do not wish to
 * do so, delete this exception statement from your version.
 */

/*
 * This is the Hunspell Enchant Backend.
 * Hunspell is by László Németh. See: https://hunspell.github.io/
 */

#include "config.h"

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <string>
#include <vector>

#include "enchant-provider.h"
#include "unused-parameter.h"

#include <hunspell/hunspell.hxx>

// hunspell itself uses this definition (which only supports the BMP)
#define MAXWORDUTF8LEN (MAXWORDLEN * 3)

#include <glib.h>

/***************************************************************************/

static const char *empty_string = "";

/*
 * Tell whether a converter handle can be used.
 *
 * GLib's g_iconv_open() reports failure with (GIConv)-1, while this file
 * initialises unopened handles to nullptr, so both values are rejected.
 */
static bool
g_iconv_is_valid(GIConv i)
{
    const GIConv open_failed = reinterpret_cast<GIConv>(static_cast<gintptr>(-1));
    return i != nullptr && i != open_failed;
}

/*
 * Convert the NUL-terminated string 'word' with the converter 'conv'.
 *
 * Returns a newly allocated NUL-terminated string that the caller releases
 * with g_free(), or nullptr if 'word' is null, 'conv' is unusable, or the
 * conversion fails.
 */
static char *
do_iconv(GIConv conv, const char *word)
{
    if (word == nullptr || !g_iconv_is_valid(conv))
        return nullptr;

    // g_iconv() does not declare its 'in' parameter const, but iconv() does.
    char *in = const_cast<char *>(word);
    size_t len_in = strlen(in);
    // Output budget of three bytes per input byte, plus the terminator.
    size_t len_out = len_in * 3;
    char *out_buf = g_new0(char, len_out + 1);
    char *out = out_buf;
    size_t result = g_iconv(conv, &in, &len_in, &out, &len_out);
    if (static_cast<size_t>(-1) == result) {
        // Do not leak the output buffer when the conversion fails.
        g_free(out_buf);
        return nullptr;
    }
    *out = '\0';
    return out_buf;
}

/*
 * Wraps one Hunspell dictionary together with the converters between UTF-8
 * (used by the Enchant callbacks below) and the dictionary's own encoding.
 */
class HunspellChecker
{
public:
    HunspellChecker();
    ~HunspellChecker();

    // The object owns raw handles; copying it would release them twice.
    HunspellChecker(const HunspellChecker &) = delete;
    HunspellChecker &operator=(const HunspellChecker &) = delete;

    bool checkWord (const char *word, size_t len);
    char **suggestWord (const char* const word, size_t len, size_t *out_n_suggs);
    const char *getWordchars ();

    bool apostropheIsWordChar;
    bool requestDictionary (const char * szLang);

private:
    GIConv m_translate_in; /* Selected translation from/to Unicode */
    GIConv m_translate_out;
    Hunspell *hunspell;
    char *wordchars; /* Value returned by getWordChars() */
};

/***************************************************************************/

HunspellChecker::HunspellChecker()
    : apostropheIsWordChar(false),
      m_translate_in(nullptr),
      m_translate_out(nullptr),
      hunspell(nullptr),
      wordchars(nullptr)
{
}

HunspellChecker::~HunspellChecker()
{
    delete hunspell;
    if (g_iconv_is_valid(m_translate_in))
        g_iconv_close(m_translate_in);
    if (g_iconv_is_valid(m_translate_out))
        g_iconv_close(m_translate_out);
    g_free(wordchars);
}

/*
 * Check the spelling of the first 'len' bytes of the UTF-8 string 'utf8Word'.
 *
 * Returns true only if the word could be normalised, converted to the
 * dictionary encoding and was accepted by Hunspell; every failure on the way
 * (no dictionary, over-long word, invalid UTF-8, unconvertible characters)
 * yields false.
 */
bool
HunspellChecker::checkWord(const char *utf8Word, size_t len)
{
    if (utf8Word == nullptr || hunspell == nullptr ||
        len > MAXWORDUTF8LEN || !g_iconv_is_valid(m_translate_in))
        return false;

    // the 8bit encodings use precomposed forms
    char *normalizedWord = g_utf8_normalize (utf8Word, len, G_NORMALIZE_NFC);
    if (normalizedWord == nullptr)
        return false; // g_utf8_normalize() rejects invalid UTF-8 with NULL
    char *out = do_iconv(m_translate_in, normalizedWord);
    g_free(normalizedWord);
    if (out == nullptr)
        return false;

    bool result = hunspell->spell(std::string(out)) != 0;
    g_free(out);
    return result;
}

/*
 * Ask Hunspell for suggestions for the first 'len' bytes of 'utf8Word'.
 *
 * On success returns a nullptr-terminated, g_new0()-allocated array of
 * g_free()-able UTF-8 strings and stores the number of entries in '*nsug'.
 * Returns nullptr with '*nsug' set to 0 when there is nothing to suggest or
 * on any failure. Suggestions that cannot be converted back to UTF-8 are
 * skipped and not counted.
 */
char**
HunspellChecker::suggestWord(const char* const utf8Word, size_t len, size_t *nsug)
{
    *nsug = 0;

    if (utf8Word == nullptr || hunspell == nullptr ||
        len > MAXWORDUTF8LEN ||
        !g_iconv_is_valid(m_translate_in) ||
        !g_iconv_is_valid(m_translate_out))
        return nullptr;

    // the 8bit encodings use precomposed forms
    char *normalizedWord = g_utf8_normalize (utf8Word, len, G_NORMALIZE_NFC);
    if (normalizedWord == nullptr)
        return nullptr; // g_utf8_normalize() rejects invalid UTF-8 with NULL
    char *out = do_iconv(m_translate_in, normalizedWord);
    g_free(normalizedWord);
    if (out == nullptr)
        return nullptr;

    std::vector<std::string> sugMS = hunspell->suggest(out);
    g_free(out);
    if (sugMS.empty())
        return nullptr;

    char **sug = g_new0 (char *, sugMS.size() + 1);
    size_t converted = 0;
    for (const std::string &suggestion : sugMS) {
        char *utf8 = do_iconv(m_translate_out, suggestion.c_str());
        if (utf8 != nullptr)
            sug[converted++] = utf8;
    }
    if (converted == 0) {
        g_free(sug);
        return nullptr;
    }

    // Report only the entries actually stored, so the count matches the array.
    *nsug = converted;
    return sug;
}

_GL_ATTRIBUTE_PURE const char*
HunspellChecker::getWordchars()
{
    return static_cast<const char *>(wordchars);
}

/*
 * Fill 'dirs' with the directories searched for dictionaries, in order:
 * "hunspell" under the user configuration directory, "hunspell" under each
 * system data directory, "share/enchant/hunspell" under the Enchant prefix
 * (when one is reported) and, if defined at build time, the relocated
 * ENCHANT_HUNSPELL_DICT_DIR.
 */
static void
s_buildDictionaryDirs (std::vector<std::string> & dirs)
{
    dirs.clear ();

    char * config_dir = enchant_get_user_config_dir ();
    gchar * tmp = g_build_filename (config_dir, "hunspell", nullptr);
    dirs.push_back (tmp);
    free (config_dir);
    g_free(tmp);

    for (const gchar* const * iter = g_get_system_data_dirs (); *iter; iter++) {
        tmp = g_build_filename (*iter, "hunspell", nullptr);
        dirs.push_back (tmp);
        g_free(tmp);
    }

    /* Dynamically locate library and search for modules relative to it. */
    char * enchant_prefix = enchant_get_prefix_dir();
    if (enchant_prefix) {
        tmp = g_build_filename(enchant_prefix, "share", "enchant", "hunspell", nullptr);
        dirs.push_back (tmp);
        free(enchant_prefix);
        g_free(tmp);
    }

#ifdef ENCHANT_HUNSPELL_DICT_DIR
    config_dir = enchant_relocate (ENCHANT_HUNSPELL_DICT_DIR);
    dirs.push_back (config_dir);
    free(config_dir);
#endif
}

/*
 * Fill 'names' with "<dir>/<dict>.dic" for every dictionary directory.
 */
static void
s_buildHashNames (std::vector<std::string> & names, const char * dict)
{
    names.clear ();

    std::vector<std::string> dirs;
    s_buildDictionaryDirs (dirs);

    char *dict_dic = g_strconcat(dict, ".dic", nullptr);
    for (const std::string &dir : dirs) {
        char *tmp = g_build_filename (dir.c_str(), dict_dic, nullptr);
        names.push_back (tmp);
        g_free (tmp);
    }
    g_free(dict_dic);
}

/*
 * Derive the affix file name from a dictionary file name by replacing its
 * last three characters with "aff". A name shorter than three characters
 * cannot carry an extension, so an empty string (never an existing file) is
 * returned for it.
 */
static std::string
s_correspondingAffFile(const std::string & dicFile)
{
    const std::string::size_type ext_len = 3;
    if (dicFile.size() < ext_len)
        return std::string();

    std::string aff = dicFile;
    aff.replace(aff.size() - ext_len, ext_len, "aff");
    return aff;
}

static bool
s_fileExists(const std::string & file)
{
    return g_file_test(file.c_str(), G_FILE_TEST_EXISTS) != 0;
}

/*
 * Tell whether the directory entry 'dir_entry' could be a dictionary for the
 * language tag 'tag': it must start with the tag, end in ".dic", and the
 * character following the tag must be punctuation.
 */
static bool
is_plausible_dict_for_tag(const char *dir_entry, const char *tag)
{
    const char *dic_suffix = ".dic";
    size_t dic_suffix_len = strlen(dic_suffix);
    size_t dir_entry_len = strlen(dir_entry);
    size_t tag_len = strlen(tag);

    // Compare with an addition: subtracting the suffix length from a short
    // entry would wrap around and let the suffix check read out of bounds.
    if (dir_entry_len < tag_len + dic_suffix_len)
        return false;
    if (strcmp(dir_entry + dir_entry_len - dic_suffix_len, dic_suffix) != 0)
        return false;
    if (strncmp (dir_entry, tag, tag_len) != 0)
        return false;
    //e.g. requested dict for "fi",
    //reject "fil_PH.dic"
    //allow "fi-FOO.dic", "fi_FOO.dic", "fi.dic", etc.
    // ispunct() requires a value representable as unsigned char.
    if (!ispunct(static_cast<unsigned char>(dir_entry[tag_len])))
        return false;
    return true;
}

/*
 * Locate a dictionary file for 'tag'.
 *
 * First looks for an exact "<tag>.dic" (with its affix file) in each
 * dictionary directory, then scans the directories for any plausible match.
 * Returns a newly allocated path to release with g_free(), or nullptr.
 */
static char *
hunspell_request_dictionary (const char * tag)
{
    std::vector<std::string> names;

    s_buildHashNames (names, tag);

    for (const std::string &name : names) {
        if (g_file_test(name.c_str(), G_FILE_TEST_EXISTS) &&
            s_fileExists(s_correspondingAffFile(name))) {
            return g_strdup (name.c_str());
        }
    }

    std::vector<std::string> dirs;
    s_buildDictionaryDirs (dirs);

    for (const std::string &dir_name : dirs) {
        GDir *dir = g_dir_open (dir_name.c_str(), 0, nullptr);
        if (dir == nullptr)
            continue;

        const char *dir_entry;
        while ((dir_entry = g_dir_read_name (dir)) != nullptr) {
            if (!is_plausible_dict_for_tag(dir_entry, tag))
                continue;

            char *dict = g_build_filename (dir_name.c_str(), dir_entry, nullptr);
            if (s_fileExists(s_correspondingAffFile(dict))) {
                g_dir_close (dir);
                return dict;
            }
            g_free(dict);
        }
        g_dir_close (dir);
    }

    return nullptr;
}

/*
 * Load the dictionary for 'szLang' and open the matching converters.
 *
 * Returns false, leaving any previously loaded dictionary untouched, when no
 * dictionary/affix pair is found. Returns false as well when a converter for
 * the dictionary's encoding cannot be opened. On success any state from an
 * earlier request has been released and replaced.
 */
bool
HunspellChecker::requestDictionary(const char *szLang)
{
    char *dic = hunspell_request_dictionary (szLang);
    if (dic == nullptr)
        return false;

    const std::string aff(s_correspondingAffFile(dic));
    if (!s_fileExists(aff)) {
        g_free(dic);
        return false;
    }

    // Release whatever an earlier request left behind before replacing it.
    delete hunspell;
    hunspell = nullptr;
    if (g_iconv_is_valid(m_translate_in))
        g_iconv_close(m_translate_in);
    m_translate_in = nullptr;
    if (g_iconv_is_valid(m_translate_out))
        g_iconv_close(m_translate_out);
    m_translate_out = nullptr;
    g_free(wordchars);
    wordchars = nullptr;
    apostropheIsWordChar = false;

    hunspell = new Hunspell(aff.c_str(), dic);
    g_free(dic);

    const char *enc = hunspell->get_dic_encoding();
    m_translate_in = g_iconv_open(enc, "UTF-8");
    m_translate_out = g_iconv_open("UTF-8", enc);
    // Without both converters no word can be checked, so refuse the
    // dictionary instead of handing out a checker that rejects every word.
    if (!g_iconv_is_valid(m_translate_in) || !g_iconv_is_valid(m_translate_out))
        return false;

    wordchars = do_iconv(m_translate_out, hunspell->get_wordchars());
    if (wordchars == nullptr)
        wordchars = g_strdup(empty_string);

    apostropheIsWordChar = g_utf8_strchr(wordchars, -1, g_utf8_get_char("'")) ||
        g_utf8_strchr(wordchars, -1, g_utf8_get_char("’"));

    return true;
}

/*
 * Enchant
 */

static char **
hunspell_dict_suggest (EnchantDict * me, const char *const word,
                       size_t len, size_t * out_n_suggs)
{
    HunspellChecker * checker = static_cast<HunspellChecker *>(me->user_data);
    return checker->suggestWord (word, len, out_n_suggs);
}

/* Returns 0 when the word is accepted and 1 otherwise. */
static int
hunspell_dict_check (EnchantDict * me, const char *const word, size_t len)
{
    HunspellChecker * checker = static_cast<HunspellChecker *>(me->user_data);

    if (checker->checkWord(word, len))
        return 0;

    return 1;
}

static const char*
hunspell_dict_get_extra_word_characters (EnchantDict *me)
{
    HunspellChecker * checker = static_cast<HunspellChecker *>(me->user_data);
    return checker->getWordchars();
}

static int
hunspell_dict_is_word_character (EnchantDict *me, uint32_t uc, size_t n)
{
    HunspellChecker * checker = static_cast<HunspellChecker *>(me->user_data);
    /* Accept quote marks anywhere except at the end of a word, as per
       hunspell's textparser.cxx/TextParser::next_token */
    if ((uc == g_utf8_get_char("'") || uc == g_utf8_get_char("’")) &&
        checker->apostropheIsWordChar) {
        return n < 2;
    }
    return g_unichar_isalpha(uc) || g_utf8_strchr(checker->getWordchars(), -1, uc);
}

/*
 * Append to 'out_dicts' the name (the part before the last ".dic") of every
 * entry in 'directory' that contains ".dic", does not start with "hyph_" and
 * has a corresponding affix file.
 */
static void
hunspell_provider_enum_dicts (const char * const directory,
                              std::vector<std::string> & out_dicts)
{
    GDir * dir = g_dir_open (directory, 0, nullptr);
    if (dir == nullptr)
        return;

    const char * entry;
    while ((entry = g_dir_read_name (dir)) != nullptr) {
        char * utf8_entry = g_filename_to_utf8 (entry, -1, nullptr, nullptr, nullptr);
        if (utf8_entry == nullptr)
            continue;

        std::string dir_entry (utf8_entry);
        g_free (utf8_entry);

        // Keep the position in the string's own size type; npos does not
        // fit in an int.
        const std::string::size_type hit = dir_entry.rfind (".dic");
        if (hit == std::string::npos)
            continue;

        /* don't include hyphenation dictionaries
           and require .aff file to be present*/
        if (dir_entry.compare (0, 5, "hyph_") == 0)
            continue;

        char * dic = g_build_filename(directory, dir_entry.c_str(), nullptr);
        if (s_fileExists(s_correspondingAffFile(dic))) {
            out_dicts.push_back (dir_entry.substr (0, hit));
        }
        g_free(dic);
    }

    g_dir_close (dir);
}

extern "C" {

/*
 * Returns a nullptr-terminated array of dictionary names found in all
 * dictionary directories (nullptr when there are none) and stores the number
 * of names in '*out_n_dicts'.
 */
static char **
hunspell_provider_list_dicts (EnchantProvider * me _GL_UNUSED_PARAMETER,
                              size_t * out_n_dicts)
{
    std::vector<std::string> dict_dirs, dicts;
    char ** dictionary_list = nullptr;

    s_buildDictionaryDirs (dict_dirs);

    for (const std::string &dict_dir : dict_dirs) {
        hunspell_provider_enum_dicts (dict_dir.c_str(), dicts);
    }

    if (!dicts.empty ()) {
        dictionary_list = g_new0 (char *, dicts.size() + 1);

        for (size_t i = 0; i < dicts.size(); i++)
            dictionary_list[i] = g_strdup (dicts[i].c_str());
    }

    *out_n_dicts = dicts.size ();
    return dictionary_list;
}

static EnchantDict *
hunspell_provider_request_dict(EnchantProvider * me _GL_UNUSED_PARAMETER, const char *const tag)
{
    // A plain new-expression throws on failure, so no null check is needed.
    HunspellChecker * checker = new HunspellChecker();

    if (!checker->requestDictionary(tag)) {
        delete checker;
        return nullptr;
    }

    EnchantDict *dict = g_new0(EnchantDict, 1);
    dict->user_data = static_cast<void *>(checker);
    dict->check = hunspell_dict_check;
    dict->suggest = hunspell_dict_suggest;
    // don't implement personal, session
    dict->get_extra_word_characters = hunspell_dict_get_extra_word_characters;
    dict->is_word_character = hunspell_dict_is_word_character;

    return dict;
}

static void
hunspell_provider_dispose_dict (EnchantProvider * me _GL_UNUSED_PARAMETER, EnchantDict * dict)
{
    HunspellChecker *checker = static_cast<HunspellChecker *>(dict->user_data);
    delete checker;

    g_free (dict);
}

/* Returns 1 if "<tag>.dic" and its affix file exist in some directory. */
static int
hunspell_provider_dictionary_exists (struct str_enchant_provider * me _GL_UNUSED_PARAMETER,
                                     const char *const tag)
{
    std::vector<std::string> names;
    s_buildHashNames (names, tag);
    for (const std::string &name : names) {
        if (g_file_test (name.c_str(), G_FILE_TEST_EXISTS) &&
            s_fileExists(s_correspondingAffFile(name))) {
            return 1;
        }
    }

    return 0;
}

static void
hunspell_provider_dispose (EnchantProvider * me)
{
    g_free (me);
}

static const char *
hunspell_provider_identify (EnchantProvider * me _GL_UNUSED_PARAMETER)
{
    return "hunspell";
}

static const char *
hunspell_provider_describe (EnchantProvider * me _GL_UNUSED_PARAMETER)
{
    return "Hunspell Provider";
}

EnchantProvider *init_enchant_provider (void);

EnchantProvider *
init_enchant_provider (void)
{
    EnchantProvider *provider = g_new0(EnchantProvider, 1);
    provider->dispose = hunspell_provider_dispose;
    provider->request_dict = hunspell_provider_request_dict;
    provider->dispose_dict = hunspell_provider_dispose_dict;
    provider->dictionary_exists = hunspell_provider_dictionary_exists;
    provider->identify = hunspell_provider_identify;
    provider->describe = hunspell_provider_describe;
    provider->list_dicts = hunspell_provider_list_dicts;

    return provider;
}

} // extern C linkage