diff options
author | PanderMusubi <pander@users.sourceforge.net> | 2020-01-05 19:50:55 +0100 |
---|---|---|
committer | PanderMusubi <pander@users.sourceforge.net> | 2020-02-27 10:29:35 +0100 |
commit | baff1888b76fa99ab62774f29d62aea9a437900f (patch) | |
tree | e28cfc2748c5118ca9021771408f72a5201ce291 /providers | |
parent | 36ce0b94e8f7d648dd79fc888c862bc6adf08316 (diff) | |
download | enchant-baff1888b76fa99ab62774f29d62aea9a437900f.tar.gz |
Nuspell provider implementation
Diffstat (limited to 'providers')
-rw-r--r-- | providers/Makefile.am | 7 | ||||
-rw-r--r-- | providers/enchant_nuspell.cpp | 438 |
2 files changed, 445 insertions, 0 deletions
diff --git a/providers/Makefile.am b/providers/Makefile.am index c5eeb75..8571dcc 100644 --- a/providers/Makefile.am +++ b/providers/Makefile.am @@ -24,6 +24,13 @@ enchant_hunspell_la_CXXFLAGS = $(AM_CXXFLAGS) $(HUNSPELL_CFLAGS) enchant_hunspell_la_LIBADD = $(HUNSPELL_LIBS) enchant_hunspell_la_SOURCES = enchant_hunspell.cpp +if WITH_NUSPELL +provider_LTLIBRARIES += enchant_nuspell.la +endif +enchant_nuspell_la_CXXFLAGS = $(AM_CXXFLAGS) $(NUSPELL_CFLAGS) -std=c++17 +enchant_nuspell_la_LIBADD = $(NUSPELL_LIBS) +enchant_nuspell_la_SOURCES = enchant_nuspell.cpp + if WITH_VOIKKO provider_LTLIBRARIES += enchant_voikko.la endif diff --git a/providers/enchant_nuspell.cpp b/providers/enchant_nuspell.cpp new file mode 100644 index 0000000..e97a3d3 --- /dev/null +++ b/providers/enchant_nuspell.cpp @@ -0,0 +1,438 @@ +/* enchant + * Copyright (C) 2020 Sander van Geloven + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + * In addition, as a special exception, Dom Lachowicz + * gives permission to link the code of this program with + * non-LGPL Spelling Provider libraries (eg: a MSFT Office + * spell checker backend) and distribute linked combinations including + * the two. You must obey the GNU General Public License in all + * respects for all of the code used other than said providers. If you modify + * this file, you may extend this exception to your version of the + * file, but you are not obligated to do so. If you do not wish to + * do so, delete this exception statement from your version. + */ + +/* + * This is the Nuspell Enchant Backend. + * Nuspell is by Dimitrij Mijoski and Sander van Geloven. + * See: http://nuspell.github.io/ + */ + +#include "config.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <string> +#include <vector> + +#include "enchant-provider.h" +#include "unused-parameter.h" + +#include <nuspell/dictionary.hxx> +#include <nuspell/finder.hxx> + +#include <glib.h> + +using namespace std; +using namespace nuspell; + +/***************************************************************************/ + +class NuspellChecker +{ +public: + bool checkWord (const char *word, size_t len); + char **suggestWord (const char* const word, size_t len, size_t *out_n_suggs); + + bool requestDictionary (const char * szLang); + +private: + Dictionary nuspell; +}; + +/***************************************************************************/ + +bool +NuspellChecker::checkWord(const char *utf8Word, size_t len) +{ + // the 8-bit encodings use precomposed forms + char *normalizedWord = g_utf8_normalize (utf8Word, len, G_NORMALIZE_NFC); + + return nuspell.spell(string(normalizedWord)); +} + +char** +NuspellChecker::suggestWord(const char* const utf8Word, size_t len, size_t *nsug) +{ + // the 8-bit encodings use precomposed forms + char *normalizedWord = g_utf8_normalize (utf8Word, len, G_NORMALIZE_NFC); + auto suggestions = vector<string>(); + nuspell.suggest(string(normalizedWord), suggestions); + if (suggestions.empty()) + return nullptr; + *nsug = suggestions.size(); + char **sug = g_new0 (char *, *nsug + 1); + size_t i = 0; + for (auto& suggest : suggestions) { + char *word = g_new0(char, suggest.size() + 1); + strcpy(word, suggest.c_str()); + sug[i] = word; + i++; + } + return sug; +} + +static void +s_buildDictionaryDirs (vector<string> & dirs) +{ + dirs.clear (); + + /* 1. personal overrides for Enchant + * ~/.config/enchant/nuspell + */ + gchar * tmp; + char * config_dir = enchant_get_user_config_dir (); + tmp = g_build_filename (config_dir, "nuspell", nullptr); + dirs.push_back (tmp); + free (config_dir); + g_free(tmp); + + /* Dynamically retrieved from Nuspell dictionary finder: + * 2. personal overrides for Hunspell + * ~/.local/share/hunspell + * 3. system installed for Hunspell + * /usr/local/share/hunspell + * /usr/share/hunspell + * 4. personal or system installations for + * Mozilla (such as Firefox and Thunderbird), + * LibreOffice and Apache OpenOffice + */ + auto nuspell_finder = nuspell::Finder::search_all_dirs_for_dicts(); + for (auto& path : nuspell_finder.get_dir_paths()) { + if (path.compare(".") != 0) + dirs.push_back (path.c_str()); + } + + /* 5. system installations by Enchant + * /usr/local/share/enchant/nuspell + * /usr/share/enchant/nuspell + */ + char * enchant_prefix = enchant_get_prefix_dir(); + if (enchant_prefix) { + tmp = g_build_filename(enchant_prefix, "share", "enchant", "nuspell", nullptr); + dirs.push_back (tmp); + g_free(enchant_prefix); + g_free(tmp); + } + + /* Hunspell paths are used, therefore ENCHANT_NUSPELL_DICT_DIR is + * irrelevant. Hence, the following paths are not to be considered: + * /usr/local/share/nuspell and /usr/share/nuspell + */ +} + +static void +s_buildHashNames (vector<string> & names, const char * dict) +{ + names.clear (); + + vector<string> dirs; + s_buildDictionaryDirs (dirs); + + char *dict_dic = g_strconcat(dict, ".dic", nullptr); + for (size_t i = 0; i < dirs.size(); i++) { + char *tmp = g_build_filename (dirs[i].c_str(), dict_dic, nullptr); + names.push_back (tmp); + g_free (tmp); + } + + g_free(dict_dic); +} + +static const string +s_correspondingAffFile(const string & dicFile) +{ + string aff = dicFile; + if (aff.size() >= 4 && aff.compare(aff.size() - 4, 4, ".dic") == 0) { + aff.erase(aff.size() - 3); + aff += "aff"; + } + return aff; +} + +static bool +s_fileExists(const string & file) +{ + return g_file_test(file.c_str(), G_FILE_TEST_EXISTS) != 0; +} + +static bool is_plausible_dict_for_tag(const char *dir_entry, const char *tag) +{ + const char *dic_suffix = ".dic"; + size_t dic_suffix_len = strlen(dic_suffix); + size_t dir_entry_len = strlen(dir_entry); + size_t tag_len = strlen(tag); + + if (dir_entry_len - dic_suffix_len < tag_len) + return false; + if (strcmp(dir_entry + dir_entry_len - dic_suffix_len, dic_suffix) != 0) + return false; + if (strncmp (dir_entry, tag, tag_len) != 0) + return false; + //e.g. requested dict for "fi", + //reject "fil_PH.dic" + //allow "fi-FOO.dic", "fi_FOO.dic", "fi.dic", etc. + if (!ispunct(dir_entry[tag_len])) + return false; + return true; +} + +static char * +nuspell_request_dictionary (const char * tag) +{ + vector<string> names; + + s_buildHashNames (names, tag); + + for (size_t i = 0; i < names.size (); i++) { + if (g_file_test(names[i].c_str(), G_FILE_TEST_EXISTS) && + s_fileExists(s_correspondingAffFile(names[i]))) { + return strdup (names[i].c_str()); + } + } + + vector<string> dirs; + s_buildDictionaryDirs (dirs); + + for (size_t i = 0; i < dirs.size(); i++) { + GDir *dir = g_dir_open (dirs[i].c_str(), 0, nullptr); + if (dir) { + const char *dir_entry; + while ((dir_entry = g_dir_read_name (dir)) != NULL) { + if (is_plausible_dict_for_tag(dir_entry, tag)) { + char *dict = g_build_filename (dirs[i].c_str(), + dir_entry, nullptr); + if(s_fileExists(s_correspondingAffFile(dict))) { + g_dir_close (dir); + return dict; + } + g_free(dict); + } + } + + g_dir_close (dir); + } + } + + return NULL; +} + +bool +NuspellChecker::requestDictionary(const char *szLang) +{ + char *dic = nuspell_request_dictionary (szLang); + if (!dic) + return false; + string aff(s_correspondingAffFile(dic)); + if (!s_fileExists(aff)) + return false; + auto path = string(dic); + if (path.size() >= 4 && path.compare(path.size() - 4, 4, ".dic") == 0) + path.erase(path.size() - 4); + else + return false; + try { + nuspell = nuspell::Dictionary::load_from_path(path); + } catch (const std::runtime_error& e) { + return false; + } + + return true; +} + +/* + * Enchant + */ + +static char ** +nuspell_dict_suggest (EnchantDict * me, const char *const word, + size_t len, size_t * out_n_suggs) +{ + NuspellChecker * checker = static_cast<NuspellChecker *>(me->user_data); + return checker->suggestWord (word, len, out_n_suggs); +} + +static int +nuspell_dict_check (EnchantDict * me, const char *const word, size_t len) +{ + NuspellChecker * checker = static_cast<NuspellChecker *>(me->user_data); + + return !(checker->checkWord(word, len)); +} + +static int +nuspell_dict_is_word_character (EnchantDict * me _GL_UNUSED_PARAMETER, + uint32_t uc, size_t n _GL_UNUSED_PARAMETER) +{ + return g_unichar_isalpha(uc); +} + +static void +nuspell_provider_enum_dicts (const char * const directory, + vector<string> & out_dicts) +{ + GDir * dir = g_dir_open (directory, 0, nullptr); + if (dir) { + const char * entry; + while ((entry = g_dir_read_name (dir)) != NULL) { + char * utf8_entry = g_filename_to_utf8 (entry, -1, nullptr, nullptr, nullptr); + if (utf8_entry) { + string dir_entry (utf8_entry); + g_free (utf8_entry); + + int hit = dir_entry.rfind (".dic"); + // don't include hyphenation dictionaries + if (hit != -1) { + // require .aff file to be present + if(dir_entry.compare (0, 5, "hyph_") != 0) { + char * dic = g_build_filename(directory, dir_entry.c_str(), nullptr); + if (s_fileExists(s_correspondingAffFile(dic))) { + out_dicts.push_back (dir_entry.substr (0, hit)); + } + g_free(dic); + } + } + } + } + + g_dir_close (dir); + } +} + +extern "C" { + +static char ** +nuspell_provider_list_dicts (EnchantProvider * me _GL_UNUSED_PARAMETER, + size_t * out_n_dicts) +{ + vector<string> dict_dirs, dicts; + char ** dictionary_list = NULL; + + s_buildDictionaryDirs (dict_dirs); + + for (size_t i = 0; i < dict_dirs.size(); i++) { + nuspell_provider_enum_dicts (dict_dirs[i].c_str(), dicts); + } + + if (dicts.size () > 0) { + dictionary_list = g_new0 (char *, dicts.size() + 1); + + for (size_t i = 0; i < dicts.size(); i++) + dictionary_list[i] = g_strdup (dicts[i].c_str()); + } + + *out_n_dicts = dicts.size (); + return dictionary_list; +} + +static EnchantDict * +nuspell_provider_request_dict(EnchantProvider * me _GL_UNUSED_PARAMETER, const char *const tag) +{ + NuspellChecker * checker = new NuspellChecker(); + + if (!checker) + return NULL; + + if (!checker->requestDictionary(tag)) { + delete checker; + return NULL; + } + + EnchantDict *dict = g_new0(EnchantDict, 1); + dict->user_data = (void *) checker; + dict->check = nuspell_dict_check; + dict->suggest = nuspell_dict_suggest; + // don't implement personal, session + dict->is_word_character = nuspell_dict_is_word_character; + + return dict; +} + +static void +nuspell_provider_dispose_dict (EnchantProvider * me _GL_UNUSED_PARAMETER, EnchantDict * dict) +{ + NuspellChecker *checker = (NuspellChecker *) dict->user_data; + delete checker; + + g_free (dict); +} + +static int +nuspell_provider_dictionary_exists (struct str_enchant_provider * me _GL_UNUSED_PARAMETER, + const char *const tag) +{ + vector <string> names; + s_buildHashNames (names, tag); + for (size_t i = 0; i < names.size(); i++) { + if (g_file_test (names[i].c_str(), G_FILE_TEST_EXISTS) && + s_fileExists(s_correspondingAffFile(names[i]))) { + return 1; + } + } + + return 0; +} + +static void +nuspell_provider_dispose (EnchantProvider * me) +{ + g_free (me); +} + +static const char * +nuspell_provider_identify (EnchantProvider * me _GL_UNUSED_PARAMETER) +{ + return "nuspell"; +} + +static const char * +nuspell_provider_describe (EnchantProvider * me _GL_UNUSED_PARAMETER) +{ + return "Nuspell Provider"; +} + +EnchantProvider *init_enchant_provider (void); + +EnchantProvider * +init_enchant_provider (void) +{ + EnchantProvider *provider = g_new0(EnchantProvider, 1); + provider->dispose = nuspell_provider_dispose; + provider->request_dict = nuspell_provider_request_dict; + provider->dispose_dict = nuspell_provider_dispose_dict; + provider->dictionary_exists = nuspell_provider_dictionary_exists; + provider->identify = nuspell_provider_identify; + provider->describe = nuspell_provider_describe; + provider->list_dicts = nuspell_provider_list_dicts; + + return provider; +} + +} // extern C linkage |