summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDom Lachowicz <domlachowicz@gmail.com>2006-01-14 02:18:48 +0000
committerDom Lachowicz <domlachowicz@gmail.com>2006-01-14 02:18:48 +0000
commit7f5d852c3116af74620e630a776b6a8e03f8e5c9 (patch)
tree08856630ce7f546ecafe18d7d68cbc41f13113ef
parent48a8a34b95d427464cc9ca8af9fbf2900f1dcf30 (diff)
downloadenchant-7f5d852c3116af74620e630a776b6a8e03f8e5c9.tar.gz
build against hunspell (http://hunspell.sf.net/) instead of myspell.
Hunspell will be replacing myspell in a future version of OpenOffice.org. It is compatible with myspell's dictionaries and offers a lot of improvements for non-Western languages. We can no longer build against a system version of myspell. We will always build against our own copy of hunspell unless told otherwise. This is bug 9820.
git-svn-id: svn+ssh://svn.abisource.com/svnroot/enchant/trunk@21089 bcba8976-2d24-0410-9c9c-aab3bd5fdfd6
-rw-r--r--configure.in9
-rw-r--r--src/myspell/Makefile.am44
-rw-r--r--src/myspell/affentry.cxx638
-rw-r--r--src/myspell/affentry.hxx60
-rw-r--r--src/myspell/affixmgr.cxx3084
-rw-r--r--src/myspell/affixmgr.hxx178
-rw-r--r--src/myspell/atypes.hxx62
-rw-r--r--src/myspell/baseaffix.hxx26
-rw-r--r--src/myspell/csutil.cxx1583
-rw-r--r--src/myspell/csutil.hxx79
-rw-r--r--src/myspell/enchant_myspell.hxx42
-rw-r--r--src/myspell/hashmgr.cxx544
-rw-r--r--src/myspell/hashmgr.hxx32
-rw-r--r--src/myspell/htypes.hxx11
-rw-r--r--src/myspell/hunspell.cxx1616
-rw-r--r--src/myspell/hunspell.dsp164
-rw-r--r--src/myspell/hunspell.hxx142
-rw-r--r--src/myspell/myspell.cxx302
-rw-r--r--src/myspell/myspell_checker.cpp11
-rw-r--r--src/myspell/suggestmgr.cxx1370
-rw-r--r--src/myspell/suggestmgr.hxx69
21 files changed, 8926 insertions, 1140 deletions
diff --git a/configure.in b/configure.in
index 1cca3b8..f106836 100644
--- a/configure.in
+++ b/configure.in
@@ -123,12 +123,6 @@ if test "x$with_myspell_dir" != "x" ; then
myspell_dir=$with_myspell_dir
fi
-with_system_myspell=no
-if test "x$build_myspell" != "xno"; then
- PKG_CHECK_MODULES(MYSPELL, myspell, with_system_myspell=yes, with_system_myspell=no)
-fi
-AM_CONDITIONAL(WITH_SYSTEM_MYSPELL, test "x$with_system_myspell" = "xyes")
-
MYSPELL_CFLAGS="$MYSPELL_CFLAGS -DENCHANT_MYSPELL_DICT_DIR='\"$myspell_dir\"'"
if test "x$with_system_myspell" != "xno"; then
MYSPELL_CFLAGS="$MYSPELL_CFLAGS -DWITH_SYSTEM_MYSPELL=1"
@@ -273,7 +267,6 @@ $PACKAGE-$VERSION
Build Ispell backend: ${build_ispell}
Build Uspell backend: ${build_uspell}
Build Hspell backend: ${build_hspell}
- Build Myspell backend: ${build_myspell}
- Build against system Myspell: ${with_system_myspell}
+ Build Myspell/Hunspell backend: ${build_myspell}
Build with Binreloc $br_cv_binreloc
"
diff --git a/src/myspell/Makefile.am b/src/myspell/Makefile.am
index 1f84195..7a57c3d 100644
--- a/src/myspell/Makefile.am
+++ b/src/myspell/Makefile.am
@@ -13,43 +13,33 @@ libenchant_myspell_lalibdir=$(libdir)/enchant
libenchant_myspell_la_LIBADD= $(MYSPELL_LIBS) $(ENCHANT_LIBS) $(top_builddir)/src/libenchant.la
libenchant_myspell_la_LDFLAGS = -version-info $(VERSION_INFO) -no-undefined
-if WITH_SYSTEM_MYSPELL
libenchant_myspell_la_SOURCES = \
- myspell_checker.cpp
-else
-libenchant_myspell_la_SOURCES = \
- affentry.cxx \
- affentry.hxx \
- affixmgr.cxx \
- affixmgr.hxx \
- atypes.hxx \
- baseaffix.hxx \
- csutil.cxx \
- csutil.hxx \
- hashmgr.cxx \
- hashmgr.hxx \
- htypes.hxx \
- myspell.cxx \
- enchant_myspell.hxx \
- suggestmgr.cxx \
- suggestmgr.hxx \
- myspell_checker.cpp
-endif
-
-EXTRA_DIST= \
- license.readme \
affentry.hxx \
affixmgr.hxx \
atypes.hxx \
baseaffix.hxx \
csutil.hxx \
+ dictmgr.hxx \
hashmgr.hxx \
htypes.hxx \
- enchant_myspell.hxx \
+ hunspell.hxx \
+ langnum.hxx \
suggestmgr.hxx \
affentry.cxx \
affixmgr.cxx \
csutil.cxx \
+ dictmgr.cxx \
hashmgr.cxx \
- myspell.cxx \
- suggestmgr.cxx
+ hunspell.cxx \
+ suggestmgr.cxx \
+ myspell_checker.cpp
+
+EXTRA_DIST= \
+ license.readme \
+ utf_info.cxx \
+ README \
+ license.hunspell \
+ license.myspell \
+ license.readme \
+ hunspell.dsp
+
diff --git a/src/myspell/affentry.cxx b/src/myspell/affentry.cxx
index 603616d..014e925 100644
--- a/src/myspell/affentry.cxx
+++ b/src/myspell/affentry.cxx
@@ -1,5 +1,5 @@
-#include "license.readme"
-
+#include "license.hunspell"
+#include "license.myspell"
#include <cctype>
#include <cstring>
@@ -7,13 +7,12 @@
#include <cstdio>
#include "affentry.hxx"
+#include "csutil.hxx"
-#ifndef WINDOWS
+#ifndef W32
using namespace std;
#endif
-extern char * mystrdup(const char * s);
-extern char * myrevstrdup(const char * s);
PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
{
@@ -21,73 +20,216 @@ PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
pmyMgr = pmgr;
// set up its intial values
- achar = dp->achar; // char flag
+
+ aflag = dp->aflag; // flag
strip = dp->strip; // string to strip
appnd = dp->appnd; // string to append
stripl = dp->stripl; // length of strip string
appndl = dp->appndl; // length of append string
numconds = dp->numconds; // number of conditions to match
- xpflg = dp->xpflg; // cross product flag
+ opts = dp->opts; // cross product flag
// then copy over all of the conditions
- memcpy(&conds[0],&dp->conds[0],SETSIZE*sizeof(conds[0]));
+ memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0]));
next = NULL;
nextne = NULL;
nexteq = NULL;
+ morphcode = dp->morphcode;
+ contclass = dp->contclass;
+ contclasslen = dp->contclasslen;
}
PfxEntry::~PfxEntry()
{
- achar = '\0';
+ aflag = 0;
if (appnd) free(appnd);
- if (strip)free(strip);
+ if (strip) free(strip);
pmyMgr = NULL;
appnd = NULL;
- strip = NULL;
+ strip = NULL;
+ if (opts & aeUTF8) {
+ for (int i = 0; i < 8; i++) {
+ if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]);
+ }
+ }
+ if (morphcode && !(opts & aeALIASM)) free(morphcode);
+ if (contclass && !(opts & aeALIASF)) free(contclass);
}
-
-
// add prefix to this word assuming conditions hold
char * PfxEntry::add(const char * word, int len)
{
- int cond;
- char tword[MAXWORDLEN+1];
+ char tword[MAXWORDUTF8LEN + 4];
- /* make sure all conditions match */
- if ((len > stripl) && (len >= numconds)) {
- unsigned char * cp = (unsigned char *) word;
- for (cond = 0; cond < numconds; cond++) {
- if ((conds[*cp++] & (1 << cond)) == 0)
- break;
- }
- if (cond >= numconds) {
- /* we have a match so add prefix */
- int tlen = 0;
+ if ((len > stripl) && (len >= numconds) && test_condition(word) &&
+ (!stripl || (strncmp(word, strip, stripl) == 0)) &&
+ ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
+ /* we have a match so add prefix */
+ char * pp = tword;
if (appndl) {
- strcpy(tword,appnd);
- tlen += appndl;
- }
- char * pp = tword + tlen;
+ strcpy(tword,appnd);
+ pp += appndl;
+ }
strcpy(pp, (word + stripl));
return mystrdup(tword);
- }
}
return NULL;
}
+inline int PfxEntry::test_condition(const char * st)
+{
+ int cond;
+ unsigned char * cp = (unsigned char *)st;
+ if (!(opts & aeUTF8)) { // 256-character codepage
+ for (cond = 0; cond < numconds; cond++) {
+ if ((conds.base[*cp++] & (1 << cond)) == 0) return 0;
+ }
+ } else { // UTF-8 encoding
+ unsigned short wc;
+ for (cond = 0; cond < numconds; cond++) {
+ // a simple 7-bit ASCII character in UTF-8
+ if ((*cp >> 7) == 0) {
+ // also check limit (end of word)
+ if ((!*cp) || ((conds.utf8.ascii[*cp++] & (1 << cond)) == 0)) return 0;
+ // UTF-8 multibyte character
+ } else {
+ // not dot wildcard in rule
+ if (!conds.utf8.all[cond]) {
+ if (conds.utf8.neg[cond]) {
+ u8_u16((w_char *) &wc, 1, (char *) cp);
+ if (conds.utf8.wchars[cond] &&
+ flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
+ wc, (short) conds.utf8.wlen[cond])) return 0;
+ } else {
+ if (!conds.utf8.wchars[cond]) return 0;
+ u8_u16((w_char *) &wc, 1, (char *) cp);
+ if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
+ wc, (short)conds.utf8.wlen[cond])) return 0;
+ }
+ }
+ // jump to next UTF-8 character
+ for(cp++; (*cp & 0xc0) == 0x80; cp++);
+ }
+ }
+ }
+ return 1;
+}
// check if this prefix entry matches
-struct hentry * PfxEntry::check(const char * word, int len)
+struct hentry * PfxEntry::check(const char * word, int len, char in_compound, const FLAG needflag)
{
- int cond; // condition number being examined
int tmpl; // length of tmpword
struct hentry * he; // hash entry of root word or NULL
- unsigned char * cp;
- char tmpword[MAXWORDLEN+1];
+ char tmpword[MAXWORDUTF8LEN + 4];
+
+ // on entry prefix is 0 length or already matches the beginning of the word.
+ // So if the remaining root word has positive length
+ // and if there are enough chars in root word and added back strip chars
+ // to meet the number of characters conditions, then test it
+
+ tmpl = len - appndl;
+
+ if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
+
+ // generate new root word by removing prefix and adding
+ // back any characters that would have been stripped
+
+ if (stripl) strcpy (tmpword, strip);
+ strcpy ((tmpword + stripl), (word + appndl));
+
+ // now make sure all of the conditions on characters
+ // are met. Please see the appendix at the end of
+ // this file for more info on exactly what is being
+ // tested
+
+ // if all conditions are met then check if resulting
+ // root word in the dictionary
+
+ if (test_condition(tmpword)) {
+ tmpl += stripl;
+ if ((he = pmyMgr->lookup(tmpword)) != NULL) {
+ do {
+ if (TESTAFF(he->astr, aflag, he->alen) &&
+ // forbid single prefixes with pseudoroot flag
+ ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) &&
+ // needflag
+ ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
+ (contclass && TESTAFF(contclass, needflag, contclasslen))))
+ return he;
+ } while ((he = he->next_homonym)); // check homonyms
+ }
+
+ // prefix matched but no root word was found
+ // if aeXPRODUCT is allowed, try again but now
+ // cross checked combined with a suffix
+
+ //if ((opts & aeXPRODUCT) && in_compound) {
+ if ((opts & aeXPRODUCT)) {
+ he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL,
+ 0, NULL, FLAG_NULL, needflag, in_compound);
+ if (he) return he;
+ }
+ }
+ }
+ return NULL;
+}
+
+// check if this prefix entry matches
+struct hentry * PfxEntry::check_twosfx(const char * word, int len,
+ char in_compound, const FLAG needflag)
+{
+ int tmpl; // length of tmpword
+ struct hentry * he; // hash entry of root word or NULL
+ char tmpword[MAXWORDUTF8LEN + 4];
+
+ // on entry prefix is 0 length or already matches the beginning of the word.
+ // So if the remaining root word has positive length
+ // and if there are enough chars in root word and added back strip chars
+ // to meet the number of characters conditions, then test it
+
+ tmpl = len - appndl;
+ if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
+
+ // generate new root word by removing prefix and adding
+ // back any characters that would have been stripped
+
+ if (stripl) strcpy (tmpword, strip);
+ strcpy ((tmpword + stripl), (word + appndl));
+
+ // now make sure all of the conditions on characters
+ // are met. Please see the appendix at the end of
+ // this file for more info on exactly what is being
+ // tested
+
+ // if all conditions are met then check if resulting
+ // root word in the dictionary
+
+ if (test_condition(tmpword)) {
+ tmpl += stripl;
+
+ // prefix matched but no root word was found
+ // if aeXPRODUCT is allowed, try again but now
+ // cross checked combined with a suffix
+
+ if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
+ he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, needflag);
+ if (he) return he;
+ }
+ }
+ }
+ return NULL;
+}
+
+
+// check if this prefix entry matches
+char * PfxEntry::check_twosfx_morph(const char * word, int len,
+ char in_compound, const FLAG needflag)
+{
+ int tmpl; // length of tmpword
+ char tmpword[MAXWORDUTF8LEN + 4];
// on entry prefix is 0 length or already matches the beginning of the word.
// So if the remaining root word has positive length
@@ -109,117 +251,317 @@ struct hentry * PfxEntry::check(const char * word, int len)
// this file for more info on exactly what is being
// tested
- cp = (unsigned char *)tmpword;
- for (cond = 0; cond < numconds; cond++) {
- if ((conds[*cp++] & (1 << cond)) == 0) break;
+ // if all conditions are met then check if resulting
+ // root word in the dictionary
+
+ if (test_condition(tmpword)) {
+ tmpl += stripl;
+
+ // prefix matched but no root word was found
+ // if aeXPRODUCT is allowed, try again but now
+ // cross checked combined with a suffix
+
+ if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
+ return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,
+ aeXPRODUCT, (AffEntry *)this, needflag);
+ }
}
+ }
+ return NULL;
+}
+
+// check if this prefix entry matches
+char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)
+{
+ int tmpl; // length of tmpword
+ struct hentry * he; // hash entry of root word or NULL
+ char tmpword[MAXWORDUTF8LEN + 4];
+ char result[MAXLNLEN];
+ char * st;
+
+ *result = '\0';
+
+ // on entry prefix is 0 length or already matches the beginning of the word.
+ // So if the remaining root word has positive length
+ // and if there are enough chars in root word and added back strip chars
+ // to meet the number of characters conditions, then test it
+
+ tmpl = len - appndl;
+
+ if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
+
+ // generate new root word by removing prefix and adding
+ // back any characters that would have been stripped
+
+ if (stripl) strcpy (tmpword, strip);
+ strcpy ((tmpword + stripl), (word + appndl));
+
+ // now make sure all of the conditions on characters
+ // are met. Please see the appendix at the end of
+ // this file for more info on exactly what is being
+ // tested
// if all conditions are met then check if resulting
// root word in the dictionary
- if (cond >= numconds) {
+ if (test_condition(tmpword)) {
tmpl += stripl;
if ((he = pmyMgr->lookup(tmpword)) != NULL) {
- if (TESTAFF(he->astr, achar, he->alen)) return he;
+ do {
+ if (TESTAFF(he->astr, aflag, he->alen) &&
+ // forbid single prefixes with pseudoroot flag
+ ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) &&
+ // needflag
+ ((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||
+ (contclass && TESTAFF(contclass, needflag, contclasslen)))) {
+ if (morphcode) strcat(result, morphcode); else strcat(result,getKey());
+ if (he->description) {
+ if ((*(he->description)=='[')||(*(he->description)=='<')) strcat(result,he->word);
+ strcat(result,he->description);
+ }
+ strcat(result, "\n");
+ }
+ } while ((he = he->next_homonym));
}
// prefix matched but no root word was found
- // if XPRODUCT is allowed, try again but now
+ // if aeXPRODUCT is allowed, try again but now
// ross checked combined with a suffix
- if (xpflg & XPRODUCT) {
- he = pmyMgr->suffix_check(tmpword, tmpl, XPRODUCT, (AffEntry *)this);
- if (he) return he;
+ if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {
+ st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this,
+ FLAG_NULL, needflag);
+ if (st) {
+ strcat(result, st);
+ free(st);
+ }
}
}
}
+
+ if (*result) return mystrdup(result);
return NULL;
}
-
SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
{
// register affix manager
pmyMgr = pmgr;
// set up its intial values
- achar = dp->achar; // char flag
+ aflag = dp->aflag; // char flag
strip = dp->strip; // string to strip
appnd = dp->appnd; // string to append
stripl = dp->stripl; // length of strip string
appndl = dp->appndl; // length of append string
numconds = dp->numconds; // number of conditions to match
- xpflg = dp->xpflg; // cross product flag
+ opts = dp->opts; // cross product flag
// then copy over all of the conditions
- memcpy(&conds[0],&dp->conds[0],SETSIZE*sizeof(conds[0]));
+ memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0]));
rappnd = myrevstrdup(appnd);
+
+ morphcode = dp->morphcode;
+ contclass = dp->contclass;
+ contclasslen = dp->contclasslen;
}
SfxEntry::~SfxEntry()
{
- achar = '\0';
+ aflag = 0;
if (appnd) free(appnd);
if (rappnd) free(rappnd);
if (strip) free(strip);
pmyMgr = NULL;
appnd = NULL;
strip = NULL;
+ if (opts & aeUTF8) {
+ for (int i = 0; i < 8; i++) {
+ if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]);
+ }
+ }
+ if (morphcode && !(opts & aeALIASM)) free(morphcode);
+ if (contclass && !(opts & aeALIASF)) free(contclass);
}
-
-
// add suffix to this word assuming conditions hold
char * SfxEntry::add(const char * word, int len)
{
- int cond;
- char tword[MAXWORDLEN+1];
+ char tword[MAXWORDUTF8LEN + 4];
/* make sure all conditions match */
- if ((len > stripl) && (len >= numconds)) {
- unsigned char * cp = (unsigned char *) (word + len);
- for (cond = numconds; --cond >=0; ) {
- if ((conds[*--cp] & (1 << cond)) == 0)
- break;
- }
- if (cond < 0) {
+ if ((len > stripl) && (len >= numconds) && test_condition(word + len, word) &&
+ (!stripl || (strcmp(word + len - stripl, strip) == 0)) &&
+ ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {
/* we have a match so add suffix */
strcpy(tword,word);
- int tlen = len;
- if (stripl) {
- tlen -= stripl;
- }
- char * pp = (tword + tlen);
if (appndl) {
- strcpy(pp,appnd);
- tlen += appndl;
- } else *pp = '\0';
- return mystrdup(tword);
- }
+ strcpy(tword + len - stripl, appnd);
+ } else {
+ *(tword + len - stripl) = '\0';
+ }
+ return mystrdup(tword);
}
return NULL;
}
+inline int SfxEntry::test_condition(const char * st, const char * beg)
+{
+ int cond;
+ unsigned char * cp = (unsigned char *) st;
+ if (!(opts & aeUTF8)) { // 256-character codepage
+ // Dömölki affix algorithm
+ for (cond = numconds; --cond >= 0; ) {
+ if ((conds.base[*--cp] & (1 << cond)) == 0) return 0;
+ }
+ } else { // UTF-8 encoding
+ unsigned short wc;
+ for (cond = numconds; --cond >= 0; ) {
+ // go to next character position and check limit
+ if ((char *) --cp < beg) return 0;
+ // a simple 7-bit ASCII character in UTF-8
+ if ((*cp >> 7) == 0) {
+ if ((conds.utf8.ascii[*cp] & (1 << cond)) == 0) return 0;
+ // UTF-8 multibyte character
+ } else {
+ // go to first character of UTF-8 multibyte character
+ for (; (*cp & 0xc0) == 0x80; cp--);
+ // not dot wildcard in rule
+ if (!conds.utf8.all[cond]) {
+ if (conds.utf8.neg[cond]) {
+ u8_u16((w_char *) &wc, 1, (char *) cp);
+ if (conds.utf8.wchars[cond] &&
+ flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
+ wc, (short) conds.utf8.wlen[cond])) return 0;
+ } else {
+ if (!conds.utf8.wchars[cond]) return 0;
+ u8_u16((w_char *) &wc, 1, (char *) cp);
+ if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond],
+ wc, (short)conds.utf8.wlen[cond])) return 0;
+ }
+ }
+ }
+ }
+ }
+ return 1;
+}
+
+
// see if this suffix is present in the word
-struct hentry * SfxEntry::check(const char * word, int len, int optflags, AffEntry* ppfx)
+struct hentry * SfxEntry::check(const char * word, int len, int optflags,
+ AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag)
+{
+ int tmpl; // length of tmpword
+ struct hentry * he; // hash entry pointer
+ unsigned char * cp;
+ char tmpword[MAXWORDUTF8LEN + 4];
+ PfxEntry* ep = (PfxEntry *) ppfx;
+
+ // if this suffix is being cross checked with a prefix
+ // but it does not support cross products skip it
+
+ if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))
+ return NULL;
+
+ // upon entry suffix is 0 length or already matches the end of the word.
+ // So if the remaining root word has positive length
+ // and if there are enough chars in root word and added back strip chars
+ // to meet the number of characters conditions, then test it
+
+ tmpl = len - appndl;
+ // the second condition is not enough for UTF-8 strings
+ // it checked in test_condition()
+
+ if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
+
+ // generate new root word by removing suffix and adding
+ // back any characters that would have been stripped or
+ // or null terminating the shorter string
+
+ strcpy (tmpword, word);
+ cp = (unsigned char *)(tmpword + tmpl);
+ if (stripl) {
+ strcpy ((char *)cp, strip);
+ tmpl += stripl;
+ cp = (unsigned char *)(tmpword + tmpl);
+ } else *cp = '\0';
+
+ // now make sure all of the conditions on characters
+ // are met. Please see the appendix at the end of
+ // this file for more info on exactly what is being tested
+
+ // if all conditions are met then check if resulting
+ // root word in the dictionary
+
+ if (test_condition((char *) cp, (char *) tmpword)) {
+
+#ifdef SZOSZABLYA_POSSIBLE_ROOTS
+ fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);
+#endif
+ if ((he = pmyMgr->lookup(tmpword)) != NULL) {
+ do {
+ // check conditional suffix (enabled by prefix)
+ if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&
+ TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
+ (((optflags & aeXPRODUCT) == 0) ||
+ TESTAFF(he->astr, ep->getFlag(), he->alen) ||
+ // enabled by prefix
+ ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
+ ) &&
+ // handle cont. class
+ ((!cclass) ||
+ ((contclass) && TESTAFF(contclass, cclass, contclasslen))
+ ) &&
+ // handle required flag
+ ((!needflag) ||
+ (TESTAFF(he->astr, needflag, he->alen) ||
+ ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
+ )
+ ) return he;
+ } while ((he = he->next_homonym)); // check homonyms
+
+ // obsolete stemming code (used only by the
+ // experimental SuffixMgr:suggest_pos_stems)
+ // store resulting root in wlst
+ } else if (wlst && (*ns < maxSug)) {
+ int cwrd = 1;
+ for (int k=0; k < *ns; k++)
+ if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;
+ if (cwrd) {
+ wlst[*ns] = mystrdup(tmpword);
+ if (wlst[*ns] == NULL) {
+ for (int j=0; j<*ns; j++) free(wlst[j]);
+ *ns = -1;
+ return NULL;
+ }
+ (*ns)++;
+ }
+ }
+ }
+ }
+ return NULL;
+}
+
+// see if two-level suffix is present in the word
+struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,
+ AffEntry* ppfx, const FLAG needflag)
{
int tmpl; // length of tmpword
- int cond; // condition beng examined
struct hentry * he; // hash entry pointer
unsigned char * cp;
- char tmpword[MAXWORDLEN+1];
+ char tmpword[MAXWORDUTF8LEN + 4];
PfxEntry* ep = (PfxEntry *) ppfx;
// if this suffix is being cross checked with a prefix
// but it does not support cross products skip it
- if ((optflags & XPRODUCT) != 0 && (xpflg & XPRODUCT) == 0)
+ if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
return NULL;
// upon entry suffix is 0 length or already matches the end of the word.
@@ -248,25 +590,135 @@ struct hentry * SfxEntry::check(const char * word, int len, int optflags, AffEnt
// this file for more info on exactly what is being
// tested
- for (cond = numconds; --cond >= 0; ) {
- if ((conds[*--cp] & (1 << cond)) == 0) break;
- }
+ // if all conditions are met then recall suffix_check
+
+ if (test_condition((char *) cp, (char *) tmpword)) {
+ if (ppfx) {
+ // handle conditional suffix
+ if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
+ he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
+ else
+ he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);
+ } else {
+ he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);
+ }
+ if (he) return he;
+ }
+ }
+ return NULL;
+}
- // if all conditions are met then check if resulting
- // root word in the dictionary
- if (cond < 0) {
- if ((he = pmyMgr->lookup(tmpword)) != NULL) {
- if (TESTAFF(he->astr, achar , he->alen) &&
- ((optflags & XPRODUCT) == 0 ||
- TESTAFF(he->astr, ep->getFlag(), he->alen))) return he;
- }
- }
+// see if two-level suffix is present in the word
+char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,
+ AffEntry* ppfx, const FLAG needflag)
+{
+ int tmpl; // length of tmpword
+ unsigned char * cp;
+ char tmpword[MAXWORDUTF8LEN + 4];
+ PfxEntry* ep = (PfxEntry *) ppfx;
+ char * st;
+
+ char result[MAXLNLEN];
+
+ *result = '\0';
+
+ // if this suffix is being cross checked with a prefix
+ // but it does not support cross products skip it
+
+ if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)
+ return NULL;
+
+ // upon entry suffix is 0 length or already matches the end of the word.
+ // So if the remaining root word has positive length
+ // and if there are enough chars in root word and added back strip chars
+ // to meet the number of characters conditions, then test it
+
+ tmpl = len - appndl;
+
+ if ((tmpl > 0) && (tmpl + stripl >= numconds)) {
+
+ // generate new root word by removing suffix and adding
+ // back any characters that would have been stripped or
+ // or null terminating the shorter string
+
+ strcpy (tmpword, word);
+ cp = (unsigned char *)(tmpword + tmpl);
+ if (stripl) {
+ strcpy ((char *)cp, strip);
+ tmpl += stripl;
+ cp = (unsigned char *)(tmpword + tmpl);
+ } else *cp = '\0';
+
+ // now make sure all of the conditions on characters
+ // are met. Please see the appendix at the end of
+ // this file for more info on exactly what is being
+ // tested
+
+ // if all conditions are met then recall suffix_check
+
+ if (test_condition((char *) cp, (char *) tmpword)) {
+ if (ppfx) {
+ // handle conditional suffix
+ if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {
+ st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
+ if (st) {
+ if (((PfxEntry *) ppfx)->getMorph()) {
+ strcat(result, ((PfxEntry *) ppfx)->getMorph());
+ }
+ strcat(result,st);
+ free(st);
+ mychomp(result);
+ }
+ } else {
+ st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);
+ if (st) {
+ strcat(result, st);
+ free(st);
+ mychomp(result);
+ }
+ }
+ } else {
+ st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);
+ if (st) {
+ strcat(result, st);
+ free(st);
+ mychomp(result);
+ }
+ }
+ if (*result) return mystrdup(result);
+ }
}
return NULL;
}
+// get next homonym with same affix
+struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx,
+ const FLAG cclass, const FLAG needflag)
+{
+ PfxEntry* ep = (PfxEntry *) ppfx;
+ while (he->next_homonym) {
+ he = he->next_homonym;
+ if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&
+ ((optflags & aeXPRODUCT) == 0 ||
+ TESTAFF(he->astr, ep->getFlag(), he->alen) ||
+ // handle conditional suffix
+ ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))
+ ) &&
+ // handle cont. class
+ ((!cclass) ||
+ ((contclass) && TESTAFF(contclass, cclass, contclasslen))
+ ) &&
+ // handle required flag
+ ((!needflag) ||
+ (TESTAFF(he->astr, needflag, he->alen) ||
+ ((contclass) && TESTAFF(contclass, needflag, contclasslen)))
+ )
+ ) return he;
+ }
+ return NULL;
+}
#if 0
@@ -286,14 +738,14 @@ The structure affentry is defined as follows:
struct affentry
{
- unsigned char achar; // char used to represent the affix
- char * strip; // string to strip before adding affix
- char * appnd; // the affix string to add
- short stripl; // length of the strip string
- short appndl; // length of the affix string
- short numconds; // the number of conditions that must be met
- short xpflg; // flag: XPRODUCT- combine both prefix and suffix
- char conds[SETSIZE]; // array which encodes the conditions to be met
+ unsigned short aflag; // ID used to represent the affix
+ char * strip; // string to strip before adding affix
+ char * appnd; // the affix string to add
+ unsigned char stripl; // length of the strip string
+ unsigned char appndl; // length of the affix string
+ char numconds; // the number of conditions that must be met
+ char opts; // flag: aeXPRODUCT- combine both prefix and suffix
+ char conds[SETSIZE]; // array which encodes the conditions to be met
};
diff --git a/src/myspell/affentry.hxx b/src/myspell/affentry.hxx
index 9c4713c..1dd784a 100644
--- a/src/myspell/affentry.hxx
+++ b/src/myspell/affentry.hxx
@@ -5,7 +5,6 @@
#include "baseaffix.hxx"
#include "affixmgr.hxx"
-
/* A Prefix Entry */
class PfxEntry : public AffEntry
@@ -22,13 +21,29 @@ public:
PfxEntry(AffixMgr* pmgr, affentry* dp );
~PfxEntry();
- struct hentry * check(const char * word, int len);
+ inline bool allowCross() { return ((opts & aeXPRODUCT) != 0); }
+ struct hentry * check(const char * word, int len, char in_compound,
+ const FLAG needflag = FLAG_NULL);
+
+ struct hentry * check_twosfx(const char * word, int len, char in_compound, const FLAG needflag = NULL);
+
+ char * check_morph(const char * word, int len, char in_compound,
+ const FLAG needflag = FLAG_NULL);
+
+ char * check_twosfx_morph(const char * word, int len,
+ char in_compound, const FLAG needflag = FLAG_NULL);
- inline bool allowCross() { return ((xpflg & XPRODUCT) != 0); }
- inline unsigned char getFlag() { return achar; }
+ inline FLAG getFlag() { return aflag; }
inline const char * getKey() { return appnd; }
char * add(const char * word, int len);
+ inline short getKeyLen() { return appndl; }
+
+ inline const char * getMorph() { return morphcode; }
+
+ inline const unsigned short * getCont() { return contclass; }
+ inline short getContLen() { return contclasslen; }
+
inline PfxEntry * getNext() { return next; }
inline PfxEntry * getNextNE() { return nextne; }
inline PfxEntry * getNextEQ() { return nexteq; }
@@ -38,6 +53,8 @@ public:
inline void setNextNE(PfxEntry * ptr) { nextne = ptr; }
inline void setNextEQ(PfxEntry * ptr) { nexteq = ptr; }
inline void setFlgNxt(PfxEntry * ptr) { flgnxt = ptr; }
+
+ inline int test_condition(const char * st);
};
@@ -54,23 +71,50 @@ class SfxEntry : public AffEntry
SfxEntry * nexteq;
SfxEntry * nextne;
SfxEntry * flgnxt;
+
+ SfxEntry * l_morph;
+ SfxEntry * r_morph;
+ SfxEntry * eq_morph;
public:
SfxEntry(AffixMgr* pmgr, affentry* dp );
~SfxEntry();
+ inline bool allowCross() { return ((opts & aeXPRODUCT) != 0); }
struct hentry * check(const char * word, int len, int optflags,
- AffEntry* ppfx);
+ AffEntry* ppfx, char ** wlst, int maxSug, int * ns,
+ const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL);
+
+ struct hentry * check_twosfx(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag = NULL);
- inline bool allowCross() { return ((xpflg & XPRODUCT) != 0); }
- inline unsigned char getFlag() { return achar; }
+ char * check_twosfx_morph(const char * word, int len, int optflags,
+ AffEntry* ppfx, const FLAG needflag = FLAG_NULL);
+ struct hentry * get_next_homonym(struct hentry * he);
+ struct hentry * get_next_homonym(struct hentry * word, int optflags, AffEntry* ppfx,
+ const FLAG cclass, const FLAG needflag);
+
+
+ inline FLAG getFlag() { return aflag; }
inline const char * getKey() { return rappnd; }
char * add(const char * word, int len);
+
+ inline const char * getMorph() { return morphcode; }
+
+ inline const unsigned short * getCont() { return contclass; }
+ inline short getContLen() { return contclasslen; }
+ inline const char * getAffix() { return appnd; }
+
+ inline short getKeyLen() { return appndl; }
+
inline SfxEntry * getNext() { return next; }
inline SfxEntry * getNextNE() { return nextne; }
inline SfxEntry * getNextEQ() { return nexteq; }
+
+ inline SfxEntry * getLM() { return l_morph; }
+ inline SfxEntry * getRM() { return r_morph; }
+ inline SfxEntry * getEQM() { return eq_morph; }
inline SfxEntry * getFlgNxt() { return flgnxt; }
inline void setNext(SfxEntry * ptr) { next = ptr; }
@@ -78,9 +122,9 @@ public:
inline void setNextEQ(SfxEntry * ptr) { nexteq = ptr; }
inline void setFlgNxt(SfxEntry * ptr) { flgnxt = ptr; }
+ inline int test_condition(const char * st, const char * begin);
};
-
#endif
diff --git a/src/myspell/affixmgr.cxx b/src/myspell/affixmgr.cxx
index 3a5714b..69220e5 100644
--- a/src/myspell/affixmgr.cxx
+++ b/src/myspell/affixmgr.cxx
@@ -1,50 +1,104 @@
-#include "license.readme"
+#include "license.hunspell"
+#include "license.myspell"
#include <cstdlib>
#include <cstring>
+#include <cctype>
#include <cstdio>
#include "affixmgr.hxx"
#include "affentry.hxx"
+#include "langnum.hxx"
-#ifndef WINDOWS
+#include "csutil.hxx"
+
+#ifndef W32
using namespace std;
#endif
-
-// First some base level utility routines
-extern void mychomp(char * s);
-extern char * mystrdup(const char * s);
-extern char * myrevstrdup(const char * s);
-extern char * mystrsep(char ** sptr, const char delim);
-extern int isSubset(const char * s1, const char * s2);
-extern int isRevSubset(const char * s1, const char * end_of_s2, int len_s2);
-
-
AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr)
{
// register hash manager, reset every option to its default and load affix data from aff file
pHMgr = ptr;
trystring = NULL;
encoding=NULL;
- reptable = NULL;
- numrep = 0;
+ utf8 = 0;
+ utf_tbl = NULL;
+ complexprefixes = 0;
maptable = NULL;
nummap = 0;
- compound=NULL;
- nosplitsugs= (0==1);
-
+ breaktable = NULL;
+ numbreak = 0;
+ reptable = NULL;
+ numrep = 0;
+ checkcpdtable = NULL;
+ numcheckcpd = 0;
+ defcpdtable = NULL;
+ numdefcpd = 0;
+ compoundflag = FLAG_NULL; // permits word in compound forms
+ compoundbegin = FLAG_NULL; // may be first word in compound forms
+ compoundmiddle = FLAG_NULL; // may be middle word in compound forms
+ compoundend = FLAG_NULL; // may be last word in compound forms
+ compoundroot = FLAG_NULL; // compound word signing flag
+ compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word
+ compoundforbidflag = FLAG_NULL; // compound forbidden flag for suffixed word
+ checkcompounddup = 0; // forbid double words in compounds
+ checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution)
+ checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds
+ checkcompoundtriple = 0; // forbid compounds with triple letters
+ forbiddenword = FLAG_NULL; // forbidden word signing flag
+ nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag
+ lang = NULL; // language
+ langnum = 0; // language code (see http://l10n.openoffice.org/languages.html)
+ pseudoroot = FLAG_NULL; // forbidden root, allowed only with suffixes
+ cpdwordmax=0; // default: unlimited wordcount in compound words
cpdmin = 3; // default value
+ cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words
+ cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX)
+ cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search)
+ cpdvowels_utf16_len=0; // number of entries in cpdvowels_utf16
+ pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG
+ sfxappnd=NULL; // previous suffix for counting a special syllables BUG
+ cpdsyllablenum=NULL; // syllable count incrementing flag
+ checknum=0; // checking numbers, and word with numbers
+ wordchars=NULL; // letters + spec. word characters
+ wordchars_utf16=NULL; // letters + spec. word characters
+ wordchars_utf16_len=0; // letters + spec. word characters
+ version=NULL; // affix and dictionary file version string
+ havecontclass=0; // flags of possible continuing classes (double affix)
+ // LEMMA_PRESENT: do not put the root into the morphological output; the lemma is already
+ // present in the morphological description in the dictionary file. It's often combined with PSEUDOROOT.
+ lemma_present = FLAG_NULL;
+ circumfix = FLAG_NULL;
+ onlyincompound = FLAG_NULL;
+ flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file
+ maxngramsugs = -1; // undefined
+ nosplitsugs = 0;
+ sugswithdots = 0;
+ keepcase = 0;
+ checksharps = 0;
+
+ derived = NULL; // XXX not threadsafe variable for experimental stemming
+ sfx = NULL;
+ pfx = NULL;
+
for (int i=0; i < SETSIZE; i++) {
pStart[i] = NULL;
sStart[i] = NULL;
pFlag[i] = NULL;
sFlag[i] = NULL;
}
+
+ for (int j=0; j < CONTSIZE; j++) {
+ contclasses[j] = 0;
+ }
+
if (parse_file(affpath)) {
fprintf(stderr,"Failure loading aff file %s\n",affpath);
fflush(stderr);
+ wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"); // fall back to the plain ASCII letters
}
+
}
@@ -74,7 +128,8 @@ AffixMgr::~AffixMgr()
delete(ptr);
ptr = nptr;
nptr = NULL;
- }
+ }
+ sStart[j] = NULL;
}
if (trystring) free(trystring);
@@ -83,7 +138,8 @@ AffixMgr::~AffixMgr()
encoding=NULL;
if (maptable) {
for (int j=0; j < nummap; j++) {
- free(maptable[j].set);
+ if (maptable[j].set) free(maptable[j].set);
+ if (maptable[j].set_utf16) free(maptable[j].set_utf16);
maptable[j].set = NULL;
maptable[j].len = 0;
}
@@ -91,21 +147,73 @@ AffixMgr::~AffixMgr()
maptable = NULL;
}
nummap = 0;
+ if (breaktable) {
+ for (int j=0; j < numbreak; j++) {
+ if (breaktable[j]) free(breaktable[j]);
+ breaktable[j] = NULL;
+ }
+ free(breaktable);
+ breaktable = NULL;
+ }
+ numbreak = 0;
if (reptable) {
for (int j=0; j < numrep; j++) {
free(reptable[j].pattern);
- free(reptable[j].replacement);
+ free(reptable[j].pattern2);
reptable[j].pattern = NULL;
- reptable[j].replacement = NULL;
+ reptable[j].pattern2 = NULL;
}
free(reptable);
reptable = NULL;
}
+ if (defcpdtable) {
+ for (int j=0; j < numdefcpd; j++) {
+ free(defcpdtable[j].def);
+ defcpdtable[j].def = NULL;
+ }
+ free(defcpdtable);
+ defcpdtable = NULL;
+ }
numrep = 0;
- if (compound) free(compound);
- compound=NULL;
+ if (checkcpdtable) {
+ for (int j=0; j < numcheckcpd; j++) {
+ free(checkcpdtable[j].pattern);
+ free(checkcpdtable[j].pattern2);
+ checkcpdtable[j].pattern = NULL;
+ checkcpdtable[j].pattern2 = NULL;
+ }
+ free(checkcpdtable);
+ checkcpdtable = NULL;
+ }
+ numcheckcpd = 0;
+ FREE_FLAG(compoundflag);
+ FREE_FLAG(compoundbegin);
+ FREE_FLAG(compoundmiddle);
+ FREE_FLAG(compoundend);
+ FREE_FLAG(compoundpermitflag);
+ FREE_FLAG(compoundforbidflag);
+ FREE_FLAG(compoundroot);
+ FREE_FLAG(forbiddenword);
+ FREE_FLAG(nosuggest);
+ FREE_FLAG(pseudoroot);
+ FREE_FLAG(lemma_present);
+ FREE_FLAG(circumfix);
+ FREE_FLAG(onlyincompound);
+
+ cpdwordmax = 0;
pHMgr = NULL;
cpdmin = 0;
+ cpdmaxsyllable = 0;
+ if (cpdvowels) free(cpdvowels);
+ if (cpdvowels_utf16) free(cpdvowels_utf16);
+ if (cpdsyllablenum) free(cpdsyllablenum);
+ if (utf_tbl) free(utf_tbl);
+ if (lang) free(lang);
+ if (wordchars) free(wordchars);
+ if (wordchars_utf16) free(wordchars_utf16);
+ if (version) free(version);
+ if (derived) free(derived);
+ checknum=0;
}
@@ -118,6 +226,10 @@ int AffixMgr::parse_file(const char * affpath)
// affix type
char ft;
+
+ // checking flag duplication
+ char dupflags[CONTSIZE];
+ char dupflags_ini = 1;
// open the affix file
FILE * afflst;
@@ -151,16 +263,167 @@ int AffixMgr::parse_file(const char * affpath)
}
}
+ /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */
+ if (strncmp(line,"COMPLEXPREFIXES",15) == 0)
+ complexprefixes = 1;
+
/* parse in the flag used by the controlled compound words */
if (strncmp(line,"COMPOUNDFLAG",12) == 0) {
- if (parse_cpdflag(line)) {
+ if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) {
return 1;
}
}
- /* parse in the flag used by the controlled compound words */
+ /* parse in the flag used by compound words */
+ if (strncmp(line,"COMPOUNDBEGIN",13) == 0) {
+ if (complexprefixes) {
+ if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) {
+ return 1;
+ }
+ } else {
+ if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) {
+ return 1;
+ }
+ }
+ }
+
+ /* parse in the flag used by compound words */
+ if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) {
+ if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) {
+ return 1;
+ }
+ }
+ /* parse in the flag used by compound words */
+ if (strncmp(line,"COMPOUNDEND",11) == 0) {
+ if (complexprefixes) {
+ if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) {
+ return 1;
+ }
+ } else {
+ if (parse_flag(line, &compoundend, "COMPOUNDEND")) {
+ return 1;
+ }
+ }
+ }
+
+ /* parse in the flag used by compound_check() method */
+ if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) {
+ if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) {
+ return 1;
+ }
+ }
+
+ /* parse in the flag sign compounds in dictionary */
+ if (strncmp(line,"COMPOUNDROOT",12) == 0) {
+ if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) {
+ return 1;
+ }
+ }
+
+ /* parse in the flag used by compound_check() method */
+ if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) {
+ if (parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) {
+ return 1;
+ }
+ }
+
+ /* parse in the flag used by compound_check() method */
+ if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) {
+ if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) {
+ return 1;
+ }
+ }
+
+ if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0)
+ checkcompounddup = 1;
+
+ if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0)
+ checkcompoundrep = 1;
+
+ if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0)
+ checkcompoundtriple = 1;
+
+ if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0)
+ checkcompoundcase = 1;
+
+ if (strncmp(line,"NOSUGGEST",9) == 0) {
+ if (parse_flag(line, &nosuggest, "NOSUGGEST")) {
+ return 1;
+ }
+ }
+
+ /* parse in the flag used by forbidden words */
+ if (strncmp(line,"FORBIDDENWORD",13) == 0) {
+ if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) {
+ return 1;
+ }
+ }
+
+ /* parse in the flag marking entries whose lemma is present in the morphological description */
+ if (strncmp(line,"LEMMA_PRESENT",13) == 0) {
+ if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) {
+ return 1;
+ }
+ }
+
+ /* parse in the flag used by circumfixes */
+ if (strncmp(line,"CIRCUMFIX",9) == 0) {
+ if (parse_flag(line, &circumfix, "CIRCUMFIX")) {
+ return 1;
+ }
+ }
+
+ /* parse in the flag used by fogemorphemes */
+ if (strncmp(line,"ONLYINCOMPOUND",14) == 0) {
+ if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) {
+ return 1;
+ }
+ }
+
+ /* parse in the flag used by `pseudoroots' */
+ if (strncmp(line,"PSEUDOROOT",10) == 0) {
+ if (parse_flag(line, &pseudoroot, "PSEUDOROOT")) {
+ return 1;
+ }
+ }
+
+ /* NEEDAFFIX is stored in the same flag as PSEUDOROOT */
+ if (strncmp(line,"NEEDAFFIX",9) == 0) {
+ if (parse_flag(line, &pseudoroot, "NEEDAFFIX")) {
+ return 1;
+ }
+ }
+
+ /* parse in the minimal length for words in compounds */
if (strncmp(line,"COMPOUNDMIN",11) == 0) {
- if (parse_cpdmin(line)) {
+ if (parse_num(line, &cpdmin, "COMPOUNDMIN")) {
+ return 1;
+ }
+ if (cpdmin < 1) cpdmin = 1;
+ }
+
+ /* parse in the max. words and syllables in compounds */
+ if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) {
+ if (parse_cpdsyllable(line)) {
+ return 1;
+ }
+ }
+
+ /* parse in the flag used by compound_check() method */
+ if (strncmp(line,"SYLLABLENUM",11) == 0) {
+ if (parse_syllablenum(line)) {
+ return 1;
+ }
+ }
+
+ /* enable checking of numbers and of words containing numbers */
+ if (strncmp(line,"CHECKNUM",8) == 0) {
+ checknum=1;
+ }
+
+ /* parse in the extra word characters (letters + special word characters) */
+ if (strncmp(line,"WORDCHARS",9) == 0) {
+ if (parse_wordchars(line)) {
return 1;
}
}
@@ -172,6 +435,20 @@ int AffixMgr::parse_file(const char * affpath)
}
}
+ /* parse in the checkcompoundpattern table */
+ if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) {
+ if (parse_checkcpdtable(line, afflst)) {
+ return 1;
+ }
+ }
+
+ /* parse in the defcompound table */
+ if (strncmp(line,"COMPOUNDRULE",12) == 0) {
+ if (parse_defcpdtable(line, afflst)) {
+ return 1;
+ }
+ }
+
/* parse in the related character map table */
if (strncmp(line,"MAP",3) == 0) {
if (parse_maptable(line, afflst)) {
@@ -179,19 +456,64 @@ int AffixMgr::parse_file(const char * affpath)
}
}
- // parse this affix: P - prefix, S - suffix
- ft = ' ';
- if (strncmp(line,"PFX",3) == 0) ft = 'P';
- if (strncmp(line,"SFX",3) == 0) ft = 'S';
- if (ft != ' ') {
- if (parse_affix(line, ft, afflst)) {
+ /* parse in the word breakpoints table */
+ if (strncmp(line,"BREAK",5) == 0) {
+ if (parse_breaktable(line, afflst)) {
+ return 1;
+ }
+ }
+
+ /* parse in the language for language specific codes */
+ if (strncmp(line,"LANG",4) == 0) {
+ if (parse_lang(line)) {
+ return 1;
+ }
+ }
+
+ if (strncmp(line,"VERSION",7) == 0) {
+ if (parse_version(line)) {
+ return 1;
+ }
+ }
+
+ if (strncmp(line,"MAXNGRAMSUGS",12) == 0) {
+ if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) {
return 1;
}
}
- // handle NOSPLITSUGS
if (strncmp(line,"NOSPLITSUGS",11) == 0)
- nosplitsugs=(0==0);
+ nosplitsugs=1;
+
+ if (strncmp(line,"SUGSWITHDOTS",12) == 0)
+ sugswithdots=1;
+
+ /* parse in the KEEPCASE flag */
+ if (strncmp(line,"KEEPCASE",8) == 0) {
+ if (parse_flag(line, &keepcase, "KEEPCASE")) {
+ return 1;
+ }
+ }
+
+ if (strncmp(line,"CHECKSHARPS",11) == 0)
+ checksharps=1;
+
+ /* parse this affix: P - prefix, S - suffix */
+ ft = ' ';
+ if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P';
+ if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S';
+ if (ft != ' ') {
+ if (dupflags_ini) {
+ for (int i = 0; i < CONTSIZE; i++) dupflags[i] = 0;
+ dupflags_ini = 0;
+ }
+ if (parse_affix(line, ft, afflst, dupflags)) {
+ fclose(afflst);
+ process_pfx_tree_to_list();
+ process_sfx_tree_to_list();
+ return 1;
+ }
+ }
}
fclose(afflst);
@@ -226,6 +548,29 @@ int AffixMgr::parse_file(const char * affpath)
process_pfx_order();
process_sfx_order();
+ // expand wordchars string, based on csutil (for external tokenization)
+
+ char * enc = get_encoding();
+ csconv = get_current_cs(enc);
+ free(enc);
+ enc = NULL;
+
+ char expw[MAXLNLEN];
+ if (wordchars) {
+ strcpy(expw, wordchars);
+ free(wordchars);
+ } else *expw = '\0';
+
+ for (int i = 0; i <= 255; i++) {
+ if ( (csconv[i].cupper != csconv[i].clower) &&
+ (! strchr(expw, (char) i))) {
+ *(expw + strlen(expw) + 1) = '\0';
+ *(expw + strlen(expw)) = (char) i;
+ }
+ }
+
+ wordchars = mystrdup(expw);
+
return 0;
}
@@ -295,8 +640,6 @@ int AffixMgr::build_pfxtree(AffEntry* pfxptr)
return 0;
}
-
-
// we want to be able to quickly access suffix information
// both by suffix flag, and sorted by the reverse of the
// suffix string itself; so we need to set up two indexes
@@ -315,7 +658,6 @@ int AffixMgr::build_sfxtree(AffEntry* sfxptr)
ep->setFlgNxt(ptr);
sFlag[flg] = (AffEntry *) ep;
-
// next index by affix string
// handle the special case of null affix string
@@ -340,7 +682,6 @@ int AffixMgr::build_sfxtree(AffEntry* sfxptr)
return 0;
}
-
// otherwise use binary tree insertion so that a sorted
// list can easily be generated later
pptr = NULL;
@@ -363,7 +704,6 @@ int AffixMgr::build_sfxtree(AffEntry* sfxptr)
return 0;
}
-
// convert from binary tree to sorted list
int AffixMgr::process_pfx_tree_to_list()
{
@@ -405,7 +745,6 @@ AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
}
-
// reinitialize the PfxEntry links NextEQ and NextNE to speed searching
// using the idea of leading subsets this time
int AffixMgr::process_pfx_order()
@@ -455,9 +794,7 @@ int AffixMgr::process_pfx_order()
return 0;
}
-
-
-// reinitialize the SfxEntry links NextEQ and NextNE to speed searching
+// initialize the SfxEntry links NextEQ and NextNE to speed searching
// using the idea of leading subsets this time
int AffixMgr::process_sfx_order()
{
@@ -513,14 +850,16 @@ int AffixMgr::process_sfx_order()
// file affentry.cxx which describes what is going on here
// in much more detail
-void AffixMgr::encodeit(struct affentry * ptr, char * cs)
+int AffixMgr::encodeit(struct affentry * ptr, char * cs)
{
unsigned char c;
int i, j, k;
unsigned char mbr[MAXLNLEN];
+ w_char wmbr[MAXLNLEN];
+ w_char * wpos = wmbr;
// now clear the conditions array */
- for (i=0;i<SETSIZE;i++) ptr->conds[i] = (unsigned char) 0;
+ for (i=0;i<SETSIZE;i++) ptr->conds.base[i] = (unsigned char) 0;
// now parse the string to create the conds array */
int nc = strlen(cs);
@@ -533,7 +872,7 @@ void AffixMgr::encodeit(struct affentry * ptr, char * cs)
// if no condition just return
if (strcmp(cs,".")==0) {
ptr->numconds = 0;
- return;
+ return 0;
}
i = 0;
@@ -570,21 +909,21 @@ void AffixMgr::encodeit(struct affentry * ptr, char * cs)
ec = 1;
}
-
- if (ec) {
+ if (ec) {
+ if (!utf8) {
if (grp == 1) {
if (neg == 0) {
// set the proper bits in the condition array vals for those chars
for (j=0;j<nm;j++) {
k = (unsigned int) mbr[j];
- ptr->conds[k] = ptr->conds[k] | (1 << n);
+ ptr->conds.base[k] = ptr->conds.base[k] | (1 << n);
}
} else {
// complement so set all of them and then unset indicated ones
- for (j=0;j<SETSIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
+ for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | (1 << n);
for (j=0;j<nm;j++) {
k = (unsigned int) mbr[j];
- ptr->conds[k] = ptr->conds[k] & ~(1 << n);
+ ptr->conds.base[k] = ptr->conds.base[k] & ~(1 << n);
}
}
neg = 0;
@@ -595,33 +934,115 @@ void AffixMgr::encodeit(struct affentry * ptr, char * cs)
// but first handle special case of . inside condition
if (c == '.') {
// wild card character so set them all
- for (j=0;j<SETSIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n);
+ for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | (1 << n);
} else {
- ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int)c] | (1 << n);
+ ptr->conds.base[(unsigned int) c] = ptr->conds.base[(unsigned int)c] | (1 << n);
}
}
n++;
ec = 0;
- }
-
+ } else { // UTF-8 character set
+ if (grp == 1) {
+ ptr->conds.utf8.neg[n] = neg;
+ if (neg == 0) {
+ // set the proper bits in the condition array vals for those chars
+ for (j=0;j<nm;j++) {
+ k = (unsigned int) mbr[j];
+ if (k >> 7) {
+ u8_u16(wpos, 1, (char *) mbr + j);
+ wpos++;
+ if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character
+ } else {
+ ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] | (1 << n);
+ }
+ }
+ } else { // neg == 1
+ // complement so set all of them and then unset indicated ones
+ for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | (1 << n);
+ for (j=0;j<nm;j++) {
+ k = (unsigned int) mbr[j];
+ if (k >> 7) {
+ u8_u16(wpos, 1, (char *) mbr + j);
+ wpos++;
+ if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character
+ } else {
+ ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] & ~(1 << n);
+ }
+ }
+ }
+ neg = 0;
+ grp = 0;
+ nm = 0;
+ ptr->conds.utf8.wlen[n] = wpos - wmbr;
+ if ((wpos - wmbr) != 0) {
+ ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char) * (wpos - wmbr));
+ if (!ptr->conds.utf8.wchars[n]) return 1;
+ memcpy(ptr->conds.utf8.wchars[n], wmbr, sizeof(w_char) * (wpos - wmbr));
+ flag_qsort((unsigned short *) ptr->conds.utf8.wchars[n], 0, ptr->conds.utf8.wlen[n]);
+ wpos = wmbr;
+ }
+ } else { // grp == 0
+ // is UTF-8 character?
+ if (c >> 7) {
+ ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char));
+ if (!ptr->conds.utf8.wchars[n]) return 1;
+ ptr->conds.utf8.wlen[n] = 1;
+ u8_u16(ptr->conds.utf8.wchars[n], 1, cs + i);
+ if ((c & 0xe0) == 0xe0) i+=2; else i++; // 3-byte UFT-8 character
+ } else {
+ ptr->conds.utf8.wchars[n] = NULL;
+ // not a group so just set the proper bit for this char
+ // but first handle special case of . inside condition
+ if (c == '.') {
+ ptr->conds.utf8.all[n] = 1;
+ // wild card character so set them all
+ for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | (1 << n);
+ } else {
+ ptr->conds.utf8.all[n] = 0;
+ ptr->conds.utf8.ascii[(unsigned int) c] = ptr->conds.utf8.ascii[(unsigned int)c] | (1 << n);
+ }
+ }
+ neg = 0;
+ }
+ n++;
+ ec = 0;
+ neg = 0;
+ }
+ }
i++;
}
ptr->numconds = n;
- return;
+ return 0;
}
// check word for prefixes
-struct hentry * AffixMgr::prefix_check (const char * word, int len)
+struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound,
+ const FLAG needflag)
{
struct hentry * rv= NULL;
-
+
+ pfx = NULL;
+ pfxappnd = NULL;
+ sfxappnd = NULL;
+
// first handle the special case of 0 length prefixes
PfxEntry * pe = (PfxEntry *) pStart[0];
while (pe) {
- rv = pe->check(word,len);
- if (rv) return rv;
+ if (
+ // fogemorpheme
+ ((in_compound != IN_CPD_NOT) || !(pe->getCont() &&
+ (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) &&
+ // permit prefixes in compounds
+ ((in_compound != IN_CPD_END) || (pe->getCont() &&
+ (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen())))) &&
+ // check prefix
+ (rv = pe->check(word, len, in_compound, needflag))
+ ) {
+ pfx=(AffEntry *)pe; // BUG: pfx not stateless
+ return rv;
+ }
pe = pe->getNext();
}
@@ -631,8 +1052,19 @@ struct hentry * AffixMgr::prefix_check (const char * word, int len)
while (pptr) {
if (isSubset(pptr->getKey(),word)) {
- rv = pptr->check(word,len);
- if (rv) return rv;
+ if (
+ // fogemorpheme
+ ((in_compound != IN_CPD_NOT) || !(pptr->getCont() &&
+ (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) &&
+ // permit prefixes in compounds
+ ((in_compound != IN_CPD_END) || (pptr->getCont() &&
+ (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen())))) &&
+ // check prefix
+ (rv = pptr->check(word, len, in_compound, needflag))
+ ) {
+ pfx=(AffEntry *)pptr; // BUG: pfx not stateless
+ return rv;
+ }
pptr = pptr->getNextEQ();
} else {
pptr = pptr->getNextNE();
@@ -642,113 +1074,1574 @@ struct hentry * AffixMgr::prefix_check (const char * word, int len)
return NULL;
}
-// check if compound word is correctly spelled
-struct hentry * AffixMgr::compound_check (const char * word, int len, char compound_flag)
+// check word for prefixes, delegating to each candidate prefix's check_twosfx(); returns the first hit or NULL
+struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len,
+ char in_compound, const FLAG needflag)
{
- int i;
struct hentry * rv= NULL;
+
+ pfx = NULL; // forget any prefix recorded by an earlier call
+ sfxappnd = NULL;
+
+ // first handle the special case of 0 length prefixes
+ PfxEntry * pe = (PfxEntry *) pStart[0];
+
+ while (pe) {
+ rv = pe->check_twosfx(word, len, in_compound, needflag);
+ if (rv) return rv; // NOTE(review): unlike the general case below, pfx is not recorded here - confirm this is intended
+ pe = pe->getNext();
+ }
+
+ // now handle the general case: prefixes are indexed by the first byte of the word
+ unsigned char sp = *((const unsigned char *)word);
+ PfxEntry * pptr = (PfxEntry *)pStart[sp];
+
+ while (pptr) {
+ if (isSubset(pptr->getKey(),word)) {
+ rv = pptr->check_twosfx(word, len, in_compound, needflag);
+ if (rv) {
+ pfx = (AffEntry *)pptr; // remember which prefix matched
+ return rv;
+ }
+ pptr = pptr->getNextEQ();
+ } else {
+ pptr = pptr->getNextNE();
+ }
+ }
+
+ return NULL;
+}
+
+
+// check word for prefixes and collect the strings returned by their check_morph()
+// returns a copy (made with mystrdup) of the concatenated strings, or NULL when nothing was collected
+// the concatenation is truncated rather than allowed to overflow the MAXLNLEN scratch buffer
+char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound,
+ const FLAG needflag)
+{
char * st;
- char ch;
+
+ char result[MAXLNLEN];
+ result[0] = '\0';
+
+ pfx = NULL;
+ sfxappnd = NULL;
- // handle case of string too short to be a piece of a compound word
- if (len < cpdmin) return NULL;
+ // first handle the special case of 0 length prefixes
+ PfxEntry * pe = (PfxEntry *) pStart[0];
+ while (pe) {
+ st = pe->check_morph(word,len,in_compound, needflag);
+ if (st) {
+ strncat(result, st, MAXLNLEN - strlen(result) - 1); // bounded append
+ free(st);
+ }
+ // keep scanning: every matching prefix contributes to the result
+ pe = pe->getNext();
+ }
+
+ // now handle the general case
+ unsigned char sp = *((const unsigned char *)word);
+ PfxEntry * pptr = (PfxEntry *)pStart[sp];
- st = mystrdup(word);
+ while (pptr) {
+ if (isSubset(pptr->getKey(),word)) {
+ st = pptr->check_morph(word,len,in_compound, needflag);
+ if (st) {
+ // fogemorpheme
+ if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() &&
+ (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) {
+ strncat(result, st, MAXLNLEN - strlen(result) - 1); // bounded append
+ pfx = (AffEntry *)pptr;
+ }
+ free(st);
+ }
+ pptr = pptr->getNextEQ();
+ } else {
+ pptr = pptr->getNextNE();
+ }
+ }
- for (i=cpdmin; i < (len - (cpdmin-1)); i++) {
+ if (*result) return mystrdup(result);
+ return NULL;
+}
+
+
+// check word for prefixes and collect the strings returned by their check_twosfx_morph()
+// returns a copy (made with mystrdup) of the concatenated strings, or NULL when nothing was collected
+// the concatenation is truncated rather than allowed to overflow the MAXLNLEN scratch buffer
+char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len,
+ char in_compound, const FLAG needflag)
+{
+ char * st;
+
+ char result[MAXLNLEN];
+ result[0] = '\0';
+
+ pfx = NULL;
+ sfxappnd = NULL;
+
+ // first handle the special case of 0 length prefixes
+ PfxEntry * pe = (PfxEntry *) pStart[0];
+ while (pe) {
+ st = pe->check_twosfx_morph(word,len,in_compound, needflag);
+ if (st) {
+ strncat(result, st, MAXLNLEN - strlen(result) - 1); // bounded append
+ free(st);
+ }
+ pe = pe->getNext();
+ }
+
+ // now handle the general case
+ unsigned char sp = *((const unsigned char *)word);
+ PfxEntry * pptr = (PfxEntry *)pStart[sp];
+
+ while (pptr) {
+ if (isSubset(pptr->getKey(),word)) {
+ st = pptr->check_twosfx_morph(word, len, in_compound, needflag);
+ if (st) {
+ strncat(result, st, MAXLNLEN - strlen(result) - 1); // bounded append
+ free(st);
+ pfx = (AffEntry *)pptr;
+ }
+ pptr = pptr->getNextEQ();
+ } else {
+ pptr = pptr->getNextNE();
+ }
+ }
+
+ if (*result) return mystrdup(result);
+ return NULL;
+}
+
+// Is word a non compound with a REP substitution (see checkcompoundrep)?
+// returns 1 when replacing some occurrence of a REP pattern in word by its
+// replacement gives a candidate accepted by candidate_check(), otherwise 0
+int AffixMgr::cpdrep_check(const char * word, int wl)
+{
+ char candidate[MAXLNLEN];
+ const char * r;
+ int lenr, lenp;
+
+ if ((wl < 2) || !numrep) return 0;
+
+ for (int i=0; i < numrep; i++ ) {
+ r = word;
+ lenr = strlen(reptable[i].pattern2);
+ lenp = strlen(reptable[i].pattern);
+ // search every occurrence of the pattern in the word
+ while ((r=strstr(r, reptable[i].pattern)) != NULL) {
+ strcpy(candidate, word);
+ // stop trying this pattern when the substituted word would not fit the buffer
+ if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break;
+ strcpy(candidate+(r-word),reptable[i].pattern2);
+ strcpy(candidate+(r-word)+lenr, r+lenp);
+ if (candidate_check(candidate,strlen(candidate))) return 1;
+ r++; // search for the next letter
+ }
+ }
+ return 0;
+}
+
+// forbid compoundings when there are special patterns at word bound; returns 1 if a checkcpdtable entry matches at pos, else 0
+int AffixMgr::cpdpat_check(const char * word, int pos)
+{
+ int len;
+ for (int i = 0; i < numcheckcpd; i++) {
+ if (isSubset(checkcpdtable[i].pattern2, word + pos) && // presumably: pattern2 begins the part starting at pos - verify isSubset
+ (len = strlen(checkcpdtable[i].pattern)) && (pos > len) && // non-empty pattern, strictly shorter than the part before pos
+ (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1; // pattern ends exactly at pos
+ }
+ return 0;
+}
+
+// forbid compounding with neighbouring upper and lower case characters at word bounds
+// returns 1 when the boundary at byte offset pos has to be rejected, otherwise 0
+// NOTE(review): both branches read the character before pos, so pos > 0 is assumed - confirm against callers
+int AffixMgr::cpdcase_check(const char * word, int pos)
+{
+ if (utf8) {
+ w_char u, w;
+ const char * p;
+ u8_u16(&u, 1, word + pos); // character starting at the boundary
+ // step back over UTF-8 continuation bytes to the start of the preceding character
+ for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--);
+ u8_u16(&w, 1, p);
+ unsigned short a = (u.h << 8) + u.l;
+ unsigned short b = (w.h << 8) + w.l;
+ // both neighbours must be letters, and at least one must equal its own upper-case mapping
+ if (utf_tbl[a].cletter && utf_tbl[b].cletter &&
+ ((utf_tbl[a].cupper == a) || (utf_tbl[b].cupper == b))) return 1;
+ } else {
+ unsigned char a = *(word + pos - 1);
+ unsigned char b = *(word + pos);
+ if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1;
+ }
+ return 0;
+}
+
+// check compound patterns: stores rv at (*words)[wnum] and returns 1 if the word list matches a defcpdtable rule; otherwise clears that slot and returns 0
+int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all)
+{
+ short btpp[MAXWORDLEN]; // metacharacter (*, ?) positions for backtracking
+ short btwp[MAXWORDLEN]; // word positions for metacharacters
+ int btnum[MAXWORDLEN]; // number of matched words in metacharacter positions
+ short bt = 0; // number of entries on the backtracking stack
+ int i;
+ int ok;
+ int w = 0; // set when *words was NULL and def is borrowed as the word buffer
+ if (!*words) {
+ w = 1;
+ *words = def;
+ }
+ (*words)[wnum] = rv;
+
+ for (i = 0; i < numdefcpd; i++) {
+ int pp = 0; // pattern position
+ int wp = 0; // "words" position
+ int ok2;
+ ok = 1;
+ ok2 = 1;
+ do {
+ while ((pp < defcpdtable[i].len) && (wp <= wnum)) {
+ if (((pp+1) < defcpdtable[i].len) &&
+ ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) {
+ int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum; // '?' may take only the current word, '*' may run to the last one
+ ok2 = 1;
+ pp+=2; // skip the flag and its metacharacter
+ btpp[bt] = pp;
+ btwp[bt] = wp;
+ while (wp <= wend) {
+ if (!(*words)[wp]->alen ||
+ !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) {
+ ok2 = 0;
+ break;
+ }
+ wp++;
+ }
+ if (wp <= wnum) ok2 = 0;
+ btnum[bt] = wp - btwp[bt];
+ if (btnum[bt] > 0) bt++; // keep a backtracking entry only if something was consumed
+ if (ok2) break;
+ } else {
+ ok2 = 1;
+ if (!(*words)[wp] || !(*words)[wp]->alen ||
+ !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) {
+ ok = 0;
+ break;
+ }
+ pp++;
+ wp++;
+ if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0; // pattern used up while words remain
+ }
+ }
+ if (ok && ok2) {
+ int r = pp;
+ while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) &&
+ ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2;
+ if (defcpdtable[i].len <= r) return 1;
+ }
+ // backtrack
+ if (bt) do {
+ ok = 1;
+ btnum[bt - 1]--;
+ pp = btpp[bt - 1];
+ wp = btwp[bt - 1] + btnum[bt - 1];
+ } while ((btnum[bt - 1] < 0) && --bt);
+ } while (bt);
+
+ if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; // with all set, the whole pattern must have been consumed
+ // check zero ending
+ while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) &&
+ ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2;
+ if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1;
+ }
+ (*words)[wnum] = NULL; // no rule matched: undo the tentative entry
+ if (w) *words = NULL; // and hand back the borrowed buffer
+ return 0;
+}
+// returns 1 if word is found by lookup() or accepted by affix_check(), otherwise 0
+inline int AffixMgr::candidate_check(const char * word, int len)
+{
+ struct hentry * rv=NULL;
+
+ rv = lookup(word); // plain dictionary hit first
+ if (rv) return 1;
+
+// rv = prefix_check(word,len,1);
+// if (rv) return 1;
+
+ rv = affix_check(word,len); // otherwise try the word with affixes
+ if (rv) return 1;
+ return 0;
+}
+
+// calculate number of syllable for compound-checking
+// counts the characters of word that belong to the configured vowel set;
+// returns 0 when no syllable limit is set or no vowel set is available
+int AffixMgr::get_syllable(const char * word, int wlen)
+{
+ if (cpdmaxsyllable==0) return 0;
+
+ int num=0;
+
+ if (!utf8) {
+ // without a vowel set nothing can be counted (same guard as the UTF-8 branch)
+ if (!cpdvowels) return 0;
+ for (int i=0; i<wlen; i++) {
+ if (strchr(cpdvowels, word[i])) num++;
+ }
+ } else if (cpdvowels_utf16) {
+ w_char w[MAXWORDUTF8LEN];
+ int i = u8_u16(w, MAXWORDUTF8LEN, word);
+ // i > 0 also stops on a negative conversion result instead of indexing before w
+ for (; i > 0; i--) {
+ if (flag_bsearch((unsigned short *) cpdvowels_utf16,
+ ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++;
+ }
+ }
+ return num;
+}
+
+// check if compound word is correctly spelled
+// hu_mov_rule = spec. Hungarian rule (XXX)
+struct hentry * AffixMgr::compound_check(const char * word, int len,
+ short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL,
+ char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0)
+{
+ int i, oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
+ int oldcmpdstemnum = 0;
+ struct hentry * rv = NULL;
+ struct hentry * rv_first;
+ struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
+ char st [MAXWORDUTF8LEN + 4];
+ char ch;
+ int cmin;
+ int cmax;
+
+ int checked_prefix;
+
+#ifdef HUNSTEM
+ if (cmpdstemnum) {
+ if (wordnum == 0) {
+ *cmpdstemnum = 1;
+ } else {
+ (*cmpdstemnum)++;
+ }
+ }
+#endif
+ if (utf8) {
+ for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {
+ cmin++;
+ for (; (word[cmin] & 0xc0) == 0x80; cmin++);
+ }
+ for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {
+ cmax--;
+ for (; (word[cmax] & 0xc0) == 0x80; cmax--);
+ }
+ } else {
+ cmin = cpdmin;
+ cmax = len - cpdmin + 1;
+ }
+
+ strcpy(st, word);
+
+ for (i = cmin; i < cmax; i++) {
+
+ oldnumsyllable = numsyllable;
+ oldwordnum = wordnum;
+ checked_prefix = 0;
+
+ // go to end of the UTF-8 character
+ if (utf8) {
+ for (; (st[i] & 0xc0) == 0x80; i++);
+ if (i >= cmax) return NULL;
+ }
+
+
ch = st[i];
- st[i] = '\0';
+ st[i] = '\0';
+
+ sfx = NULL;
+ pfx = NULL;
+
+ // FIRST WORD
+
+ rv = lookup(st); // perhaps without prefix
+
+ // search homonym with compound flag
+ while ((rv) && !hu_mov_rule &&
+ ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
+ !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
+ (compoundbegin && !wordnum &&
+ TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
+ (compoundmiddle && wordnum && !words &&
+ TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
+ (numdefcpd &&
+ ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
+ (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
+ ))) {
+ rv = rv->next_homonym;
+ }
- rv = lookup(st);
- if (!rv) rv = affix_check(st,i);
+ if (!rv) {
+ if (compoundflag &&
+ !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
+ if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
+ FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
+ ((SfxEntry*)sfx)->getCont() &&
+ ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
+ ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
+ TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
+ ((SfxEntry*)sfx)->getContLen())))) {
+ rv = NULL;
+ }
+ }
+ if (rv ||
+ (((wordnum == 0) && compoundbegin &&
+ ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
+ (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
+ ((wordnum > 0) && compoundmiddle &&
+ ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
+ (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
+ ) checked_prefix = 1;
+ // else check forbiddenwords and pseudoroot
+ } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
+ TESTAFF(rv->astr, pseudoroot, rv->alen) ||
+ (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen))
+ )) {
+ st[i] = ch;
+ continue;
+ }
- if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
- rv = lookup((word+i));
- if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
- free(st);
- return rv;
+ // check non_compound flag in suffix and prefix
+ if ((rv) && !hu_mov_rule &&
+ ((pfx && ((PfxEntry*)pfx)->getCont() &&
+ TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
+ ((PfxEntry*)pfx)->getContLen())) ||
+ (sfx && ((SfxEntry*)sfx)->getCont() &&
+ TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
+ ((SfxEntry*)sfx)->getContLen())))) {
+ rv = NULL;
+ }
+
+ // check compoundend flag in suffix and prefix
+ if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
+ ((pfx && ((PfxEntry*)pfx)->getCont() &&
+ TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
+ ((PfxEntry*)pfx)->getContLen())) ||
+ (sfx && ((SfxEntry*)sfx)->getCont() &&
+ TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
+ ((SfxEntry*)sfx)->getContLen())))) {
+ rv = NULL;
+ }
+
+ // check compoundmiddle flag in suffix and prefix
+ if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
+ ((pfx && ((PfxEntry*)pfx)->getCont() &&
+ TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
+ ((PfxEntry*)pfx)->getContLen())) ||
+ (sfx && ((SfxEntry*)sfx)->getCont() &&
+ TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
+ ((SfxEntry*)sfx)->getContLen())))) {
+ rv = NULL;
+ }
+
+ // check forbiddenwords
+ if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
+ (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) {
+ return NULL;
+ }
+
+ // increment word number, if the second root has a compoundroot flag
+ if ((rv) && compoundroot &&
+ (TESTAFF(rv->astr, compoundroot, rv->alen))) {
+ wordnum++;
+ }
+
+ // first word is acceptable in compound words?
+ if (((rv) &&
+ ( checked_prefix || (words && words[wnum]) ||
+ (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
+ ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
+ ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// ||
+// (numdefcpd && )
+
+// LANG_hu section: spec. Hungarian rule
+ || ((langnum == LANG_hu) && hu_mov_rule && (
+ TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes
+ TESTAFF(rv->astr, 'G', rv->alen) ||
+ TESTAFF(rv->astr, 'H', rv->alen)
+ )
+ )
+// END of LANG_hu section
+ )
+ && ! (( checkcompoundtriple && // test triple letters
+ (word[i-1]==word[i]) && (
+ ((i>1) && (word[i-1]==word[i-2])) ||
+ ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
+ )
+ ) ||
+ (
+ // test CHECKCOMPOUNDPATTERN
+ numcheckcpd && cpdpat_check(word, i)
+ ) ||
+ (
+ checkcompoundcase && cpdcase_check(word, i)
+ ))
+ )
+// LANG_hu section: spec. Hungarian rule
+ || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
+ (sfx && ((SfxEntry*)sfx)->getCont() && ( // XXX hardwired Hungarian dic. codes
+ TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
+ TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
+ )
+ )
+ )
+// END of LANG_hu section
+ ) {
+
+// LANG_hu section: spec. Hungarian rule
+ if (langnum == LANG_hu) {
+ // calculate syllable number of the word
+ numsyllable += get_syllable(st, i);
+
+ // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
+ if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
+ }
+// END of LANG_hu section
+
+#ifdef HUNSTEM
+ if (cmpdstem) cmpdstem[*cmpdstemnum - 1] = i;
+#endif
+
+ // NEXT WORD(S)
+ rv_first = rv;
+ rv = lookup((word+i)); // perhaps without prefix
+
+ // search homonym with compound flag
+ while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
+ !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
+ (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
+ (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
+ rv = rv->next_homonym;
+ }
+
+ if (rv && words && words[wnum + 1]) return rv;
+
+ oldnumsyllable2 = numsyllable;
+ oldwordnum2 = wordnum;
+
+// LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code
+ if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
+ numsyllable--;
}
- rv = affix_check((word+i),strlen(word+i));
- if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) {
- free(st);
- return rv;
+// END of LANG_hu section
+
+ // increment word number, if the second root has a compoundroot flag
+ if ((rv) && (compoundroot) &&
+ (TESTAFF(rv->astr, compoundroot, rv->alen))) {
+ wordnum++;
+ }
+
+ // check forbiddenwords
+ if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
+ (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
+
+ // second word is acceptable, as a root?
+ // hungarian conventions: compounding is acceptable,
+ // when compound forms consist of 2 words, or if more,
+ // then the syllable number of root words must be 6, or lesser.
+
+ if ((rv) && (
+ (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
+ (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
+ )
+ && (
+ ((cpdwordmax==0) || (wordnum+1<cpdwordmax)) ||
+ ((cpdmaxsyllable==0) ||
+ (numsyllable + get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable))
+ )
+ && (
+ (!checkcompounddup || (rv != rv_first))
+ )
+ )
+ {
+ // forbid compound word, if it is a non compound word with typical fault
+ if (checkcompoundrep && cpdrep_check(word,len)) return NULL;
+ return rv;
+ }
+
+ numsyllable = oldnumsyllable2 ;
+ wordnum = oldwordnum2;
+
+ // perhaps second word has prefix or/and suffix
+ sfx = NULL;
+ sfxflag = FLAG_NULL;
+ rv = (compoundflag) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL;
+ if (!rv && compoundend) {
+ sfx = NULL;
+ pfx = NULL;
+ rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END);
+ }
+
+ if (!rv && numdefcpd && words) {
+ rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
+ if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv;
+ }
+
+ // check non_compound flag in suffix and prefix
+ if ((rv) &&
+ ((pfx && ((PfxEntry*)pfx)->getCont() &&
+ TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
+ ((PfxEntry*)pfx)->getContLen())) ||
+ (sfx && ((SfxEntry*)sfx)->getCont() &&
+ TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
+ ((SfxEntry*)sfx)->getContLen())))) {
+ rv = NULL;
+ }
+
+ // check forbiddenwords
+ if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
+ (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL;
+
+ // pfxappnd = prefix of word+i, or NULL
+ // calculate syllable number of prefix.
+ // hungarian convention: when syllable number of prefix is more,
+ // than 1, the prefix+word counts as two words.
+
+ if (langnum == LANG_hu) {
+ // calculate syllable number of the word
+ numsyllable += get_syllable(word + i, strlen(word + i));
+
+ // - affix syllable num.
+ // XXX only second suffix (inflections, not derivations)
+ if (sfxappnd) {
+ char * tmp = myrevstrdup(sfxappnd);
+ numsyllable -= get_syllable(tmp, strlen(tmp));
+ free(tmp);
+ }
+
+ // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
+ if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
+
+ // increment syllable num, if last word has a SYLLABLENUM flag
+ // and the suffix is beginning `s'
+
+ if (cpdsyllablenum) {
+ switch (sfxflag) {
+ case 'c': { numsyllable+=2; break; }
+ case 'J': { numsyllable += 1; break; }
+ case 'I': { if (TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
+ }
+ }
+ }
+
+ // increment word number, if the second word has a compoundroot flag
+ if ((rv) && (compoundroot) &&
+ (TESTAFF(rv->astr, compoundroot, rv->alen))) {
+ wordnum++;
+ }
+
+ // second word is acceptable, as a word with prefix or/and suffix?
+ // hungarian conventions: compounding is acceptable,
+ // when compound forms consist 2 word, otherwise
+ // the syllable number of root words is 6, or lesser.
+ if ((rv) &&
+ (
+ ((cpdwordmax ==0 ) || (wordnum + 1 < cpdwordmax)) ||
+ ((cpdmaxsyllable == 0) ||
+ (numsyllable <= cpdmaxsyllable))
+ )
+ && (
+ (!checkcompounddup || (rv != rv_first))
+ )) {
+ // forbid compound word, if it is a non compound word with typical fault
+ if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
+ return rv;
+ }
+
+ numsyllable = oldnumsyllable2;
+ wordnum = oldwordnum2;
+#ifdef HUNSTEM
+ if (cmpdstemnum) oldcmpdstemnum = *cmpdstemnum;
+#endif
+ // perhaps second word is a compound word (recursive call)
+ if (wordnum < maxwordnum) {
+ rv = compound_check((word+i),strlen(word+i), wordnum+1,
+ numsyllable, maxwordnum, wnum + 1, words,
+ 0, cmpdstemnum, cmpdstem, is_sug);
+ } else {
+ rv=NULL;
}
- rv = compound_check((word+i),strlen(word+i),compound_flag);
if (rv) {
- free(st);
+ // forbid compound word, if it is a non compound word with typical fault
+ if (checkcompoundrep && cpdrep_check(word, len)) return NULL;
return rv;
+ } else {
+#ifdef HUNSTEM
+ if (cmpdstemnum) *cmpdstemnum = oldcmpdstemnum;
+#endif
}
-
}
st[i] = ch;
+ wordnum = oldwordnum;
+ numsyllable = oldnumsyllable;
}
- free(st);
+
return NULL;
}
+// Morphological analysis of a compound word.
+//
+// Tries every admissible split position of `word`. For each split whose first
+// part is accepted as a compound member and whose remainder is accepted as the
+// last member (as a root, as an affixed form, or recursively as another
+// compound), one line of morphological description is appended to *result.
+//
+//   word, len    - the (remaining part of the) word to analyse
+//   wordnum      - word counter so far; the recursive call passes wordnum + 1
+//   numsyllable  - running syllable count (only updated for LANG_hu)
+//   maxwordnum   - recursion is attempted only while wordnum < maxwordnum
+//   wnum, words  - position and buffer handed to defcpd_check()
+//   hu_mov_rule  - spec. Hungarian rule (XXX)
+//   result       - output buffer; every analysis is terminated by "\n".
+//                  Nothing is analysed when it is NULL.
+//   partresult   - description of the preceding members, copied in front of
+//                  the description of the current first member
+//
+// The return value is always 0; results are reported only through *result.
+int AffixMgr::compound_check_morph(const char * word, int len,
+    short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
+    char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL)
+{
+    int i, oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
+    int ok = 0;
+
+    struct hentry * rv = NULL;
+    struct hentry * rv_first;
+    struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking
+    char st [MAXWORDUTF8LEN + 4];
+    char ch;
+
+    int checked_prefix;
+    char presult[MAXLNLEN];
+
+    int cmin;
+    int cmax;
+
+    // all output goes through *result, so without a buffer there is nothing to do
+    if (!result || !*result) return 0;
+
+    if (utf8) {
+        for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) {
+            cmin++;
+            for (; (word[cmin] & 0xc0) == 0x80; cmin++);
+        }
+        for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) {
+            cmax--;
+            for (; (word[cmax] & 0xc0) == 0x80; cmax--);
+        }
+    } else {
+        cmin = cpdmin;
+        cmax = len - cpdmin + 1;
+    }
+
+    strcpy(st, word);
+
+    for (i = cmin; i < cmax; i++) {
+        oldnumsyllable = numsyllable;
+        oldwordnum = wordnum;
+        checked_prefix = 0;
+
+        // go to end of the UTF-8 character
+        if (utf8) {
+            for (; (st[i] & 0xc0) == 0x80; i++);
+            if (i >= cmax) return 0;
+        }
+
+        // cut st at the split position; st[i] is put back before the next split
+        ch = st[i];
+        st[i] = '\0';
+        sfx = NULL;
+        // NOTE(review): compound_check() also resets pfx at this point; confirm
+        // whether a stale pfx from an earlier call can reach the checks below.
+
+        // FIRST WORD
+        *presult = '\0';
+        if (partresult) strcat(presult, partresult);
+
+        rv = lookup(st); // perhaps without prefix
+
+        // search homonym with compound flag
+        while ((rv) && !hu_mov_rule &&
+            ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
+                !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
+                (compoundbegin && !wordnum &&
+                    TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
+                (compoundmiddle && wordnum && !words &&
+                    TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
+                (numdefcpd &&
+                    ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) ||
+                    (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0))))
+                ))) {
+            rv = rv->next_homonym;
+        }
+
+        if (rv) {
+            if (rv->description) {
+                if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen))
+                    strcat(presult, st);
+                strcat(presult, rv->description);
+            }
+        }
+
+        if (!rv) {
+            if (compoundflag &&
+                !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) {
+                if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL,
+                        FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule &&
+                    ((SfxEntry*)sfx)->getCont() &&
+                        ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
+                            ((SfxEntry*)sfx)->getContLen())) || (compoundend &&
+                        TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
+                            ((SfxEntry*)sfx)->getContLen())))) {
+                    rv = NULL;
+                }
+            }
+
+            if (rv ||
+                (((wordnum == 0) && compoundbegin &&
+                    ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
+                    (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) ||
+                ((wordnum > 0) && compoundmiddle &&
+                    ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
+                    (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle)))))
+                ) {
+                // affix_check_morph() hands back a heap string (or NULL) that
+                // we own, exactly like the `m` strings released further below
+                char * p = NULL;
+                if (compoundflag) p = affix_check_morph(st, i, compoundflag);
+                if (!p || (*p == '\0')) {
+                    // drop an empty analysis before retrying with another flag
+                    if (p) free(p);
+                    p = NULL;
+                    if ((wordnum == 0) && compoundbegin) {
+                        p = affix_check_morph(st, i, compoundbegin);
+                    } else if ((wordnum > 0) && compoundmiddle) {
+                        p = affix_check_morph(st, i, compoundmiddle);
+                    }
+                }
+                // p stays NULL when no flag applied or no analysis was found
+                if (p && (*p != '\0')) {
+                    line_uniq(p);
+                    if (strchr(p, '\n')) {
+                        strcat(presult, "(");
+                        strcat(presult, line_join(p, '|'));
+                        strcat(presult, ")");
+                    } else {
+                        strcat(presult, p);
+                    }
+                }
+                if (p) free(p);
+                // strip one trailing newline; presult may still be empty here
+                size_t plen = strlen(presult);
+                if (plen && (presult[plen - 1] == '\n')) {
+                    presult[plen - 1] = '\0';
+                }
+                checked_prefix = 1;
+            }
+        // else check forbiddenwords
+        } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
+            TESTAFF(rv->astr, pseudoroot, rv->alen))) {
+            st[i] = ch;
+            continue;
+        }
+
+        // check non_compound flag in suffix and prefix
+        if ((rv) && !hu_mov_rule &&
+            ((pfx && ((PfxEntry*)pfx)->getCont() &&
+                TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
+                    ((PfxEntry*)pfx)->getContLen())) ||
+            (sfx && ((SfxEntry*)sfx)->getCont() &&
+                TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
+                    ((SfxEntry*)sfx)->getContLen())))) {
+            st[i] = ch; // undo the cut before trying the next split
+            continue;
+        }
+
+        // check compoundend flag in suffix and prefix
+        if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
+            ((pfx && ((PfxEntry*)pfx)->getCont() &&
+                TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend,
+                    ((PfxEntry*)pfx)->getContLen())) ||
+            (sfx && ((SfxEntry*)sfx)->getCont() &&
+                TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend,
+                    ((SfxEntry*)sfx)->getContLen())))) {
+            st[i] = ch; // undo the cut before trying the next split
+            continue;
+        }
+
+        // check compoundmiddle flag in suffix and prefix
+        if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule &&
+            ((pfx && ((PfxEntry*)pfx)->getCont() &&
+                TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle,
+                    ((PfxEntry*)pfx)->getContLen())) ||
+            (sfx && ((SfxEntry*)sfx)->getCont() &&
+                TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle,
+                    ((SfxEntry*)sfx)->getContLen())))) {
+            rv = NULL;
+        }
+
+        // check forbiddenwords
+        if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) {
+            st[i] = ch; // undo the cut before trying the next split
+            continue;
+        }
+
+        // increment word number, if the second root has a compoundroot flag
+        if ((rv) && (compoundroot) &&
+            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
+            wordnum++;
+        }
+
+        // first word is acceptable in compound words?
+        if (((rv) &&
+            ( checked_prefix || (words && words[wnum]) ||
+                (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
+                ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
+                ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))
+// LANG_hu section: spec. Hungarian rule
+                || ((langnum == LANG_hu) && // hu_mov_rule
+                    hu_mov_rule && (
+                        TESTAFF(rv->astr, 'F', rv->alen) ||
+                        TESTAFF(rv->astr, 'G', rv->alen) ||
+                        TESTAFF(rv->astr, 'H', rv->alen)
+                    )
+                )
+// END of LANG_hu section
+            )
+            && ! (( checkcompoundtriple && // test triple letters
+                    (word[i-1]==word[i]) && (
+                        ((i>1) && (word[i-1]==word[i-2])) ||
+                        ((word[i-1]==word[i+1])) // may be word[i+1] == '\0'
+                    )
+                ) ||
+                (
+                    // test CHECKCOMPOUNDPATTERN
+                    numcheckcpd && cpdpat_check(word, i)
+                ) ||
+                (
+                    checkcompoundcase && cpdcase_check(word, i)
+                ))
+            )
+// LANG_hu section: spec. Hungarian rule
+            || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) &&
+                (sfx && ((SfxEntry*)sfx)->getCont() && (
+                    TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) ||
+                    TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen())
+                    )
+                )
+            )
+// END of LANG_hu section
+            ) {
+
+// LANG_hu section: spec. Hungarian rule
+            if (langnum == LANG_hu) {
+                // calculate syllable number of the word
+                numsyllable += get_syllable(st, i);
+
+                // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
+                if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
+            }
+// END of LANG_hu section
+
+            // NEXT WORD(S)
+            rv_first = rv;
+            rv = lookup((word+i)); // perhaps without prefix
+
+            // search homonym with compound flag
+            // (defcpd_check() is given no fallback buffer here, so it is only
+            // consulted when `words` exists, as compound_check() does)
+            while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) ||
+                        !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
+                        (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) ||
+                        (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) {
+                rv = rv->next_homonym;
+            }
+
+            if (rv && words && words[wnum + 1]) {
+                strcat(*result, presult);
+                if (complexprefixes && rv->description) strcat(*result, rv->description);
+                if (rv->description && ((!rv->astr) ||
+                    !TESTAFF(rv->astr, lemma_present, rv->alen)))
+                        strcat(*result, rv->word);
+                if (!complexprefixes && rv->description) strcat(*result, rv->description);
+                strcat(*result, "\n");
+                ok = 1;
+                return 0;
+            }
+
+            oldnumsyllable2 = numsyllable;
+            oldwordnum2 = wordnum;
+
+// LANG_hu section: spec. Hungarian rule
+            if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) {
+                numsyllable--;
+            }
+// END of LANG_hu section
+            // increment word number, if the second root has a compoundroot flag
+            if ((rv) && (compoundroot) &&
+                (TESTAFF(rv->astr, compoundroot, rv->alen))) {
+                wordnum++;
+            }
+
+            // check forbiddenwords
+            if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) {
+                // leave this split exactly as the end of the loop body would
+                st[i] = ch;
+                wordnum = oldwordnum;
+                numsyllable = oldnumsyllable;
+                continue;
+            }
+
+            // second word is acceptable, as a root?
+            // hungarian conventions: compounding is acceptable,
+            // when compound forms consist of 2 words, or if more,
+            // then the syllable number of root words must be 6, or lesser.
+            if ((rv) && (
+                    (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
+                    (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))
+                )
+                && (
+                    ((cpdwordmax==0) || (wordnum+1<cpdwordmax)) ||
+                    ((cpdmaxsyllable==0) ||
+                        (numsyllable+get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable))
+                )
+                && (
+                    (!checkcompounddup || (rv != rv_first))
+                )
+                )
+            {
+                // accepted: first member + root of the last member
+                strcat(*result, presult);
+
+                if (rv->description) {
+                    if (complexprefixes) strcat(*result, rv->description);
+                    if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen))
+                        strcat(*result, rv->word);
+                    if (!complexprefixes) strcat(*result, rv->description);
+                }
+                strcat(*result, "\n");
+                ok = 1;
+            }
+
+            numsyllable = oldnumsyllable2 ;
+            wordnum = oldwordnum2;
+
+            // perhaps second word has prefix or/and suffix
+            sfx = NULL;
+            sfxflag = FLAG_NULL;
+
+            if (compoundflag) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL;
+
+            if (!rv && compoundend) {
+                sfx = NULL;
+                pfx = NULL;
+                rv = affix_check((word+i),strlen(word+i), compoundend);
+            }
+
+            if (!rv && numdefcpd && words) {
+                rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END);
+                if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
+                    char * m = NULL;
+                    if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
+                    if ((!m || *m == '\0') && compoundend) {
+                        if (m) free(m); // empty analysis, retry with compoundend
+                        m = affix_check_morph((word+i),strlen(word+i), compoundend);
+                    }
+                    strcat(*result, presult);
+                    if (m) { // may be NULL when neither flag produced an analysis
+                        line_uniq(m);
+                        if (strchr(m, '\n')) {
+                            strcat(*result, "(");
+                            strcat(*result, line_join(m, '|'));
+                            strcat(*result, ")");
+                        } else {
+                            strcat(*result, m);
+                        }
+                        free(m);
+                    }
+                    strcat(*result, "\n");
+                    ok = 1;
+                }
+            }
+
+            // check non_compound flag in suffix and prefix
+            if ((rv) &&
+                ((pfx && ((PfxEntry*)pfx)->getCont() &&
+                    TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag,
+                        ((PfxEntry*)pfx)->getContLen())) ||
+                (sfx && ((SfxEntry*)sfx)->getCont() &&
+                    TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag,
+                        ((SfxEntry*)sfx)->getContLen())))) {
+                rv = NULL;
+            }
+
+            // check forbiddenwords
+            if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen))
+                    && (! TESTAFF(rv->astr, pseudoroot, rv->alen))) {
+                // leave this split exactly as the end of the loop body would
+                st[i] = ch;
+                wordnum = oldwordnum;
+                numsyllable = oldnumsyllable;
+                continue;
+            }
+
+            if (langnum == LANG_hu) {
+                // calculate syllable number of the word
+                numsyllable += get_syllable(word + i, strlen(word + i));
+
+                // - affix syllable num.
+                // XXX only second suffix (inflections, not derivations)
+                if (sfxappnd) {
+                    char * tmp = myrevstrdup(sfxappnd);
+                    numsyllable -= get_syllable(tmp, strlen(tmp));
+                    free(tmp);
+                }
+
+                // + 1 word, if syllable number of the prefix > 1 (hungarian convention)
+                if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++;
+
+                // increment syllable num, if last word has a SYLLABLENUM flag
+                // and the suffix is beginning `s'
+
+                if (cpdsyllablenum) {
+                    switch (sfxflag) {
+                        case 'c': { numsyllable+=2; break; }
+                        case 'J': { numsyllable += 1; break; }
+                        case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; }
+                    }
+                }
+            }
+
+            // increment word number, if the second word has a compoundroot flag
+            if ((rv) && (compoundroot) &&
+                (TESTAFF(rv->astr, compoundroot, rv->alen))) {
+                wordnum++;
+            }
+            // second word is acceptable, as a word with prefix or/and suffix?
+            // hungarian conventions: compounding is acceptable,
+            // when compound forms consist 2 word, otherwise
+            // the syllable number of root words is 6, or lesser.
+            if ((rv) &&
+                    (
+                        ((cpdwordmax==0) || (wordnum+1<cpdwordmax)) ||
+                        ((cpdmaxsyllable==0) ||
+                            (numsyllable <= cpdmaxsyllable))
+                    )
+                && (
+                    (!checkcompounddup || (rv != rv_first))
+                )) {
+                char * m = NULL;
+                if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag);
+                if ((!m || *m == '\0') && compoundend) {
+                    if (m) free(m); // empty analysis, retry with compoundend
+                    m = affix_check_morph((word+i),strlen(word+i), compoundend);
+                }
+                strcat(*result, presult);
+                if (m) { // may be NULL when neither flag produced an analysis
+                    line_uniq(m);
+                    if (strchr(m, '\n')) {
+                        strcat(*result, "(");
+                        strcat(*result, line_join(m, '|'));
+                        strcat(*result, ")");
+                    } else {
+                        strcat(*result, m);
+                    }
+                    free(m);
+                }
+                strcat(*result, "\n");
+                ok = 1;
+            }
+
+            numsyllable = oldnumsyllable2;
+            wordnum = oldwordnum2;
+
+            // perhaps second word is a compound word (recursive call)
+            // (skipped once any analysis has been emitted by this call)
+            if ((wordnum < maxwordnum) && (ok == 0)) {
+                compound_check_morph((word+i),strlen(word+i), wordnum+1,
+                    numsyllable, maxwordnum, wnum + 1, words, 0, result, presult);
+            } else {
+                rv=NULL;
+            }
+        }
+        st[i] = ch;
+        wordnum = oldwordnum;
+        numsyllable = oldnumsyllable;
+    }
+    return 0;
+}
+
+
+
// check word for suffixes
+//
+// Walks the zero-length suffix list (sStart[0]) and then the list selected by
+// the last byte of `word`; the first entry that passes the compound /
+// circumfix / onlyincompound / pseudoroot filters and whose check() reports a
+// hit decides the result.
+//
+//   ppfx        - prefix entry already stripped by the caller, or NULL
+//   wlst, maxSug, ns, needflag - forwarded unchanged to SfxEntry::check()
+//   cclass      - when non-zero, zero-length entries without continuation
+//                 flags are skipped and the pseudoroot filter is bypassed
+//   in_compound - IN_CPD_BEGIN rejects suffixes unless they carry
+//                 compoundpermitflag; any non-zero value allows
+//                 onlyincompound suffixes
+//
+// Returns the hentry produced by check(), or NULL.
+// Side effects on success: sets the member `sfx`; in the general case also
+// `sfxflag`, `sfxappnd` (entries without continuation flags) and appends
+// `word` to the newline-separated member string `derived`.
+
struct hentry * AffixMgr::suffix_check (const char * word, int len,
- int sfxopts, AffEntry * ppfx)
+    int sfxopts, AffEntry * ppfx, char ** wlst, int maxSug, int * ns,
+    const FLAG cclass, const FLAG needflag, char in_compound)
{
struct hentry * rv = NULL;
+
+    PfxEntry* ep = (PfxEntry *) ppfx;
// first handle the special case of 0 length suffixes
SfxEntry * se = (SfxEntry *) sStart[0];
+
while (se) {
- rv = se->check(word,len, sfxopts, ppfx);
- if (rv) return rv;
+        if (!cclass || se->getCont()) {
+            // suffixes are not allowed in beginning of compounds
+            if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
+                // except when signed with compoundpermitflag flag
+                (se->getCont() && compoundpermitflag &&
+                    TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
+                // no circumfix flag in prefix and suffix
+                ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
+                    circumfix, ep->getContLen())) &&
+                    (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
+                // circumfix flag in prefix AND suffix
+                ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
+                    circumfix, ep->getContLen())) &&
+                    (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
+                // fogemorpheme
+                (in_compound ||
+                    !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
+                // pseudoroot on prefix or first suffix
+                (cclass ||
+                    !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) ||
+                    (ppfx && !((ep->getCont()) &&
+                        TESTAFF(ep->getCont(), pseudoroot,
+                            ep->getContLen())))
+                )
+                ) &&
+                (rv = se->check(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass, needflag))) {
+                sfx=(AffEntry *)se; // BUG: sfx not stateless
+                return rv;
+            }
+        }
se = se->getNext();
}
// now handle the general case
+    // an empty word has no last character to index the suffix table with
+    if (len <= 0) return NULL;
unsigned char sp = *((const unsigned char *)(word + len - 1));
+    SfxEntry * sptr = (SfxEntry *) sStart[sp];
+
+    while (sptr) {
+        if (isRevSubset(sptr->getKey(), word + len - 1, len)
+            ) {
+            // suffixes are not allowed in beginning of compounds
+            if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
+                // except when signed with compoundpermitflag flag
+                (sptr->getCont() && compoundpermitflag &&
+                    TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
+                // no circumfix flag in prefix and suffix
+                ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
+                    circumfix, ep->getContLen())) &&
+                    (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
+                // circumfix flag in prefix AND suffix
+                ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
+                    circumfix, ep->getContLen())) &&
+                    (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
+                // fogemorpheme
+                (in_compound ||
+                    !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
+                // pseudoroot on prefix or first suffix
+                (cclass ||
+                    !(sptr->getCont() && TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) ||
+                    (ppfx && !((ep->getCont()) &&
+                        TESTAFF(ep->getCont(), pseudoroot,
+                            ep->getContLen())))
+                )
+                ) &&
+                (rv = sptr->check(word,len, sfxopts, ppfx, wlst, maxSug, ns, cclass, needflag))) {
+                sfx=(AffEntry *)sptr; // BUG: sfx not stateless
+                sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
+                if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
+                if (cclass || sptr->getCont()) {
+                    if (!derived) {
+                        derived = mystrdup(word);
+                    } else {
+                        // Append "\n" + word. `derived` keeps growing until
+                        // affix_check() resets it, so the joined string is
+                        // built in a buffer of the exact size instead of a
+                        // fixed MAXLNLEN stack array that could overflow.
+                        size_t dlen = strlen(derived);
+                        size_t wlen = strlen(word);
+                        char * joined = (char *) malloc(dlen + wlen + 2);
+                        if (joined) {
+                            memcpy(joined, derived, dlen);
+                            joined[dlen] = '\n';
+                            memcpy(joined + dlen + 1, word, wlen + 1); // copies the '\0' too
+                            free(derived);
+                            derived = joined;
+                        }
+                    }
+                }
+                return rv;
+            }
+            sptr = sptr->getNextEQ();
+        } else {
+            sptr = sptr->getNextNE();
+        }
+    }
+    return NULL;
+}
+// check word for two-level suffixes
+//
+// Same two-pass walk as suffix_check() (zero-length list sStart[0], then the
+// list selected by the last byte of the word), but each candidate is tested
+// with SfxEntry::check_twosfx() and only entries whose flag has a non-zero
+// slot in contclasses[] are tried (presumably flags that occur as
+// continuation classes -- confirm where contclasses is filled).
+// sfxopts, ppfx and needflag are forwarded unchanged to check_twosfx().
+// Returns the first hentry reported, or NULL. A hit in the general case also
+// updates the members sfxflag and (for entries without continuation flags)
+// sfxappnd.
+
+struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len,
+ int sfxopts, AffEntry * ppfx, const FLAG needflag)
+{
+ struct hentry * rv = NULL;
+
+ // first handle the special case of 0 length suffixes
+ SfxEntry * se = (SfxEntry *) sStart[0];
+ while (se) {
+ if (contclasses[se->getFlag()])
+ {
+ rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag);
+ if (rv) return rv;
+ }
+ se = se->getNext();
+ }
+
+ // now handle the general case
+ // NOTE(review): word + len - 1 points before `word` when len == 0
+ unsigned char sp = *((const unsigned char *)(word + len - 1));
SfxEntry * sptr = (SfxEntry *) sStart[sp];
while (sptr) {
- if (isRevSubset(sptr->getKey(),(word+len-1), len)) {
- rv = sptr->check(word,len, sfxopts, ppfx);
- if (rv) {
- return rv;
- }
- sptr = sptr->getNextEQ();
+ if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
+ if (contclasses[sptr->getFlag()])
+ {
+ rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag);
+ if (rv) {
+ sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
+ if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
+ return rv;
+ }
+ }
+ // key matched: continue along the "equal" link
+ sptr = sptr->getNextEQ();
} else {
sptr = sptr->getNextNE();
}
}
+
return NULL;
}
+// Morphological counterpart of suffix_check_twosfx(): collects the
+// descriptions returned by SfxEntry::check_twosfx_morph() for every suffix
+// entry whose flag has a non-zero slot in contclasses[].
+//
+//   word, len - word to analyse
+//   sfxopts, ppfx, needflag - forwarded unchanged to check_twosfx_morph();
+//               for zero-length suffixes the morph string of ppfx (when
+//               present) is placed in front of the entry's result
+//
+// Returns a mystrdup()ed string with one "\n"-terminated line per analysis
+// which the caller must free(), or NULL when nothing matched.
+// A hit in the general case also updates the members sfxflag and (for
+// entries without continuation flags) sfxappnd.
+char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len,
+    int sfxopts, AffEntry * ppfx, const FLAG needflag)
+{
+    char result[MAXLNLEN];
+    char result2[MAXLNLEN];
+    char result3[MAXLNLEN];
+
+    char * st;
+
+    result[0] = '\0';
+    result2[0] = '\0';
+    result3[0] = '\0';
+
+    // first handle the special case of 0 length suffixes
+    SfxEntry * se = (SfxEntry *) sStart[0];
+    while (se) {
+        if (contclasses[se->getFlag()])
+        {
+            st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
+            if (st) {
+                if (ppfx) {
+                    if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph());
+                }
+                strcat(result, st);
+                free(st);
+                if (se->getMorph()) strcat(result, se->getMorph());
+                strcat(result, "\n");
+            }
+        }
+        se = se->getNext();
+    }
+
+    // now handle the general case
+    // NOTE(review): word + len - 1 points before `word` when len == 0
+    unsigned char sp = *((const unsigned char *)(word + len - 1));
+    SfxEntry * sptr = (SfxEntry *) sStart[sp];
+
+    while (sptr) {
+        if (isRevSubset(sptr->getKey(), word + len - 1, len)) {
+            if (contclasses[sptr->getFlag()])
+            {
+                st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag);
+                if (st) {
+                    sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless
+                    if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless
+                    strcpy(result2, st);
+                    free(st);
+
+                    result3[0] = '\0';
+#ifdef DEBUG
+                    // NOTE(review): this debug-only code does not compile as
+                    // written (`char flagch[2] = &flag;`) and prints getKey()
+                    // with "%d"; left untouched, fix before enabling DEBUG.
+                    unsigned short flag = sptr->getFlag();
+                    char flagch[2] = &flag;
+                    if (flag_mode == FLAG_NUM) {
+                        sprintf(result3, "%d", sptr->getKey());
+                    } else if (flag_mode == FLAG_LONG) {
+                        sprintf(result3, "%c%c", flagch[0], flagch[1]);
+                    } else sprintf(result3, "%c", flagch[1]);
+                    strcat(result3, ":");
+#endif
+                    if (sptr->getMorph()) strcat(result3, sptr->getMorph());
+                    strlinecat(result2, result3);
+                    strcat(result2, "\n");
+                    strcat(result, result2);
+                }
+            }
+            sptr = sptr->getNextEQ();
+        } else {
+            sptr = sptr->getNextNE();
+        }
+    }
+    // `result` is an array, so test its contents rather than its address:
+    // an empty analysis is reported as NULL, as suffix_check_morph() does
+    if (*result) return mystrdup(result);
+    return NULL;
+}
+
+// Morphological counterpart of suffix_check(): instead of stopping at the
+// first hit it walks all matching suffix entries and, for every homonym
+// returned by check() / get_next_homonym(), appends one "\n"-terminated line
+// to a local buffer (prefix morph, stem description and/or stem word, suffix
+// morph; the order of description and word depends on complexprefixes).
+//
+//   ppfx        - prefix entry already stripped by the caller, or NULL
+//   cclass, needflag - forwarded to SfxEntry::check() / get_next_homonym()
+//   in_compound - same filtering role as in suffix_check()
+//
+// Returns a mystrdup()ed copy of the collected lines, or NULL when nothing
+// was collected. Callers in this file free() the returned string.
+char * AffixMgr::suffix_check_morph(const char * word, int len,
+ int sfxopts, AffEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound)
+{
+ char result[MAXLNLEN];
+
+ struct hentry * rv = NULL;
+
+ result[0] = '\0';
+
+ PfxEntry* ep = (PfxEntry *) ppfx;
+
+ // first handle the special case of 0 length suffixes
+ SfxEntry * se = (SfxEntry *) sStart[0];
+ while (se) {
+ if (!cclass || se->getCont()) {
+ // suffixes are not allowed in beginning of compounds
+ if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
+ // except when signed with compoundpermitflag flag
+ (se->getCont() && compoundpermitflag &&
+ TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix ||
+ // no circumfix flag in prefix and suffix
+ ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
+ circumfix, ep->getContLen())) &&
+ (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) ||
+ // circumfix flag in prefix AND suffix
+ ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
+ circumfix, ep->getContLen())) &&
+ (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) &&
+ // fogemorpheme
+ (in_compound ||
+ !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) &&
+ // pseudoroot on prefix or first suffix
+ (cclass ||
+ !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) ||
+ (ppfx && !((ep->getCont()) &&
+ TESTAFF(ep->getCont(), pseudoroot,
+ ep->getContLen())))
+ )
+ ))
+ rv = se->check(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
+ // rv is NULL here unless the check() above just ran and matched: it
+ // starts as NULL and every pass through this loop ends with rv == NULL
+ while (rv) {
+ if (ppfx) {
+ if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph());
+ }
+ if (complexprefixes && rv->description) strcat(result, rv->description);
+ if (rv->description && ((!rv->astr) ||
+ !TESTAFF(rv->astr, lemma_present, rv->alen)))
+ strcat(result, rv->word);
+ if (!complexprefixes && rv->description) strcat(result, rv->description);
+ if (se->getMorph()) strcat(result, se->getMorph());
+ strcat(result, "\n");
+ rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
+ }
+ }
+ se = se->getNext();
+ }
+
+ // now handle the general case
+ // NOTE(review): word + len - 1 points before `word` when len == 0
+ unsigned char sp = *((const unsigned char *)(word + len - 1));
+ SfxEntry * sptr = (SfxEntry *) sStart[sp];
+
+ while (sptr) {
+ if (isRevSubset(sptr->getKey(), word + len - 1, len)
+ ) {
+ // suffixes are not allowed in beginning of compounds
+ if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass
+ // except when signed with compoundpermitflag flag
+ (sptr->getCont() && compoundpermitflag &&
+ TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix ||
+ // no circumfix flag in prefix and suffix
+ ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(),
+ circumfix, ep->getContLen())) &&
+ (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) ||
+ // circumfix flag in prefix AND suffix
+ ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(),
+ circumfix, ep->getContLen())) &&
+ (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) &&
+ // fogemorpheme
+ (in_compound ||
+ !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) &&
+ // pseudoroot on first suffix
+ // NOTE(review): unlike the zero-length loop above and suffix_check(),
+ // this clause has no ppfx alternative -- confirm that is intended
+ (cclass || !(sptr->getCont() &&
+ TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())))
+ )) rv = sptr->check(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag);
+ while (rv) {
+ if (ppfx) {
+ if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph());
+ }
+ if (complexprefixes && rv->description) strcat(result, rv->description);
+ if (rv->description && ((!rv->astr) ||
+ !TESTAFF(rv->astr, lemma_present, rv->alen))) strcat(result, rv->word);
+ if (!complexprefixes && rv->description) strcat(result, rv->description);
+#ifdef DEBUG
+ // NOTE(review): debug-only code; `result2` is not declared in this
+ // function and `char flagch[2] = &flag;` is not valid, so this section
+ // needs fixing before DEBUG can be enabled
+ unsigned short flag = sptr->getKey();
+ char flagch[2] = &flag;
+ if (flag_mode == FLAG_NUM) {
+ sprintf(result2, "%d", sptr->getKey());
+ } else if (flag_mode == FLAG_LONG) {
+ sprintf(result2, "%c%c", flagch[0], flagch[1]);
+ } else sprintf(result2, "%c", flagch[1]);
+ strcat(result2, ":");
+ strcat(result, result2);
+#endif
+
+ if (sptr->getMorph()) strcat(result, sptr->getMorph());
+ strcat(result, "\n");
+ rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag);
+ }
+ sptr = sptr->getNextEQ();
+ } else {
+ sptr = sptr->getNextNE();
+ }
+ }
+
+ // empty buffer means no analysis was found
+ if (*result) return mystrdup(result);
+ return NULL;
+}
+
// check if word with affixes is correctly spelled
-struct hentry * AffixMgr::affix_check (const char * word, int len)
+struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound)
{
struct hentry * rv= NULL;
+ if (derived) free(derived); // free any stored derived form before a new check
+ derived = NULL;
// check all prefixes (also crossed with suffixes if allowed)
- rv = prefix_check(word, len);
+ rv = prefix_check(word, len, in_compound, needflag);
if (rv) return rv;
// if still not found check all suffixes
- rv = suffix_check(word, len, 0, NULL);
+ rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound);
+
+ if (havecontclass) { // two-level checks only run when havecontclass is set
+ sfx = NULL;
+ pfx = NULL;
+ if (rv) return rv;
+ // if still not found check all two-level suffixes
+ rv = suffix_check_twosfx(word, len, 0, NULL, needflag);
+ if (rv) return rv;
+ // if still not found check prefixed two-level suffix forms (prefix_check_twosfx, always with IN_CPD_NOT)
+ rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag);
+ }
return rv;
}
-int AffixMgr::expand_rootword(struct guessword * wlst, int maxn,
- const char * ts, int wl, const char * ap, int al)
+// gather the strings returned by every *_check_morph analysis of word; returns a mystrdup() copy (possibly empty)
+char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound)
+{
+ char result[MAXLNLEN];
+ char * st = NULL;
+
+ *result = '\0';
+
+ // check all prefixes (also crossed with suffixes if allowed)
+ st = prefix_check_morph(word, len, in_compound);
+ if (st) {
+ strncat(result, st, MAXLNLEN - strlen(result) - 1); // bounded: the partial results together may not fit in result
+ free(st);
+ }
+
+ // then append the results of all suffixes
+ st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound);
+ if (st) {
+ strncat(result, st, MAXLNLEN - strlen(result) - 1);
+ free(st);
+ }
+
+ if (havecontclass) {
+ sfx = NULL;
+ pfx = NULL;
+ // then append the results of all two-level suffixes
+ st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag);
+ if (st) {
+ strncat(result, st, MAXLNLEN - strlen(result) - 1);
+ free(st);
+ }
+
+ // then append the results of prefixed two-level suffix forms (always checked with IN_CPD_NOT)
+ st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag);
+ if (st) {
+ strncat(result, st, MAXLNLEN - strlen(result) - 1);
+ free(st);
+ }
+ }
+
+ return mystrdup(result);
+}
+
+
+// fill wlst (at most maxn entries) with the root word ts and its affixed forms for the flags in ap[0..al);
+// an affix with a non-empty key is only expanded when it matches the corresponding end of bad (length badl)
+int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts,
+ int wl, const unsigned short * ap, unsigned short al, char * bad, int badl)
{
int nh=0;
// first add root word to list
-
- if (nh < maxn) {
+ if ((nh < maxn) && !(al && ((pseudoroot && TESTAFF(ap, pseudoroot, al)) ||
+ (onlyincompound && TESTAFF(ap, onlyincompound, al))))) {
wlst[nh].word = mystrdup(ts);
wlst[nh].allow = (1 == 0);
nh++;
@@ -756,19 +2649,28 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn,
// handle suffixes
for (int i = 0; i < al; i++) {
- unsigned char c = (unsigned char) ap[i];
+ unsigned short c = (unsigned short) ap[i];
SfxEntry * sptr = (SfxEntry *)sFlag[c];
while (sptr) {
- char * newword = sptr->add(ts, wl);
- if (newword) {
- if (nh < maxn) {
- wlst[nh].word = newword;
- wlst[nh].allow = sptr->allowCross();
- nh++;
- } else {
- free(newword);
- }
- }
+ if ((!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) &&
+ (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) &&
+ // check pseudoroot flag (the extra parentheses make this apply to zero-length affixes too)
+ !(sptr->getCont() && ((pseudoroot &&
+ TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) ||
+ (onlyincompound &&
+ TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))
+ ) {
+ char * newword = sptr->add(ts, wl);
+ if (newword) {
+ if (nh < maxn) {
+ wlst[nh].word = newword;
+ wlst[nh].allow = sptr->allowCross();
+ nh++;
+ } else {
+ free(newword);
+ }
+ }
+ }
sptr = (SfxEntry *)sptr ->getFlgNxt();
}
}
@@ -779,10 +2681,11 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn,
for (int j=1;j<n ;j++)
if (wlst[j].allow) {
for (int k = 0; k < al; k++) {
- unsigned char c = (unsigned char) ap[k];
+ unsigned short c = (unsigned short) ap[k];
PfxEntry * cptr = (PfxEntry *) pFlag[c];
while (cptr) {
- if (cptr->allowCross()) {
+ if (cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) &&
+ (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) {
int l1 = strlen(wlst[j].word);
char * newword = cptr->add(wlst[j].word, l1);
if (newword) {
@@ -803,19 +2706,28 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn,
// now handle pure prefixes
for (int m = 0; m < al; m ++) {
- unsigned char c = (unsigned char) ap[m];
+ unsigned short c = (unsigned short) ap[m];
PfxEntry * ptr = (PfxEntry *) pFlag[c];
while (ptr) {
- char * newword = ptr->add(ts, wl);
- if (newword) {
- if (nh < maxn) {
- wlst[nh].word = newword;
- wlst[nh].allow = ptr->allowCross();
- nh++;
- } else {
- free(newword);
- }
- }
+ if ((!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) &&
+ (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) &&
+ // check pseudoroot flag (the extra parentheses make this apply to zero-length affixes too)
+ !(ptr->getCont() && ((pseudoroot &&
+ TESTAFF(ptr->getCont(), pseudoroot, ptr->getContLen())) ||
+ (onlyincompound &&
+ TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen()))))
+ ) {
+ char * newword = ptr->add(ts, wl);
+ if (newword) {
+ if (nh < maxn) {
+ wlst[nh].word = newword;
+ wlst[nh].allow = ptr->allowCross();
+ nh++;
+ } else {
+ free(newword);
+ }
+ }
+ }
ptr = (PfxEntry *)ptr ->getFlgNxt();
}
}
@@ -824,6 +2736,7 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn,
}
+
// return length of replacing table
int AffixMgr::get_numrep()
{
@@ -837,7 +2750,6 @@ struct replentry * AffixMgr::get_reptable()
return reptable;
}
-
// return length of character map table
int AffixMgr::get_nummap()
{
@@ -851,6 +2763,19 @@ struct mapentry * AffixMgr::get_maptable()
return maptable;
}
+// return the number of entries in the word break table
+int AffixMgr::get_numbreak()
+{
+ return numbreak;
+}
+
+// return the word break table (NULL when breaktable is unset)
+char ** AffixMgr::get_breaktable()
+{
+ if (! breaktable ) return NULL;
+ return breaktable;
+}
+
// return text encoding of dictionary
char * AffixMgr::get_encoding()
{
@@ -860,6 +2785,33 @@ char * AffixMgr::get_encoding()
return mystrdup(encoding);
}
+// return the language number (langnum, assigned from get_lang_num() in parse_lang)
+int AffixMgr::get_langnum()
+{
+ return langnum;
+}
+
+// return UTF info table (utf_tbl; parse_set allocates it when the encoding is UTF-8)
+struct unicode_info2 * AffixMgr::get_utf_conv()
+{
+ return utf_tbl;
+}
+
+// return the complexprefixes (double prefix) option
+int AffixMgr::get_complexprefixes()
+{
+ return complexprefixes;
+}
+
+FLAG AffixMgr::get_keepcase() // return the keepcase flag
+{
+ return keepcase;
+}
+
+int AffixMgr::get_checksharps() // return the checksharps option
+{
+ return checksharps;
+}
// return the preferred try string for suggestions
char * AffixMgr::get_try_string()
@@ -868,11 +2820,101 @@ char * AffixMgr::get_try_string()
return mystrdup(trystring);
}
+// return the WORDCHARS string (parse_wordchars only sets it for non-UTF-8 dictionaries)
+const char * AffixMgr::get_wordchars()
+{
+ return wordchars;
+}
+
+unsigned short * AffixMgr::get_wordchars_utf16(int * len) // return the UTF-16 WORDCHARS array; its length is written to *len
+{
+ *len = wordchars_utf16_len;
+ return wordchars_utf16;
+}
+
+// is there compounding? (nonzero if compoundflag, compoundbegin or numdefcpd is set)
+int AffixMgr::get_compound()
+{
+ return compoundflag || compoundbegin || numdefcpd;
+}
+
// return the compound words control flag
-char * AffixMgr::get_compound()
+FLAG AffixMgr::get_compoundflag() // now returns the flag value (compoundflag) instead of a string
+{
+ return compoundflag;
+}
+
+// return the forbidden words control flag (forbiddenword)
+FLAG AffixMgr::get_forbiddenword()
+{
+ return forbiddenword;
+}
+
+// return the nosuggest flag
+FLAG AffixMgr::get_nosuggest()
+{
+ return nosuggest;
+}
+
+// return the pseudoroot flag
+FLAG AffixMgr::get_pseudoroot()
+{
+ return pseudoroot;
+}
+
+// return the onlyincompound flag (affixes and words carrying it are skipped by expand_rootword)
+FLAG AffixMgr::get_onlyincompound()
+{
+ return onlyincompound;
+}
+
+// return the compoundroot flag (compound word signal)
+FLAG AffixMgr::get_compoundroot()
{
- if (! compound ) return NULL;
- return compound;
+ return compoundroot;
+}
+
+// return the compoundbegin flag (compound begin signal)
+FLAG AffixMgr::get_compoundbegin()
+{
+ return compoundbegin;
+}
+
+// return the value of the checknum option
+int AffixMgr::get_checknum()
+{
+ return checknum;
+}
+
+// return the key of the stored prefix entry (pfx), or NULL when pfx is unset
+const char * AffixMgr::get_prefix()
+{
+ if (pfx) return ((PfxEntry *)pfx)->getKey();
+ return NULL;
+}
+
+// return the value of suffix (the stored sfxappnd string)
+const char * AffixMgr::get_suffix()
+{
+ return sfxappnd;
+}
+
+// return the value of derived form (base word with first suffix); affix_check() frees and resets it
+const char * AffixMgr::get_derived()
+{
+ return derived;
+}
+
+// return the version string (set by parse_version)
+const char * AffixMgr::get_version()
+{
+ return version;
+}
+
+// return the lemma_present flag
+FLAG AffixMgr::get_lemma_present()
+{
+ return lemma_present;
}
// utility method to look up root words in hash table
@@ -882,12 +2924,36 @@ struct hentry * AffixMgr::lookup(const char * word)
return pHMgr->lookup(word);
}
+// return havecontclass (parse_affix sets it to 1 once an affix rule carries continuation flags)
+const int AffixMgr::have_contclass()
+{
+ return havecontclass;
+}
+
+// return utf8 (parse_set sets it to 1 when the encoding is "UTF-8")
+int AffixMgr::get_utf8()
+{
+ return utf8;
+}
+
// return nosplitsugs
-bool AffixMgr::get_nosplitsugs(void)
+int AffixMgr::get_maxngramsugs(void) // returns maxngramsugs (the comment above is left over from get_nosplitsugs)
+{
+ return maxngramsugs;
+}
+
+// return the nosplitsugs option (as int)
+int AffixMgr::get_nosplitsugs(void)
{
return nosplitsugs;
}
+// return the sugswithdots option
+int AffixMgr::get_sugswithdots(void)
+{
+ return sugswithdots;
+}
+
/* parse in the try string */
int AffixMgr::parse_try(char * line)
{
@@ -899,7 +2965,7 @@ int AffixMgr::parse_try(char * line)
char * piece;
int i = 0;
int np = 0;
- while ((piece=mystrsep(&tp,' '))) {
+ while ((piece=mystrsep(&tp, 0))) {
if (*piece != '\0') {
switch(i) {
case 0: { np++; break; }
@@ -929,11 +2995,32 @@ int AffixMgr::parse_set(char * line)
char * piece;
int i = 0;
int np = 0;
- while ((piece=mystrsep(&tp,' '))) {
+ while ((piece=mystrsep(&tp, 0))) {
if (*piece != '\0') {
switch(i) {
case 0: { np++; break; }
- case 1: { encoding = mystrdup(piece); np++; break; }
+ case 1: { encoding = mystrdup(piece);
+ if (strcmp(encoding, "UTF-8") == 0) {
+ unicode_info * uni = get_utf_cs();
+ utf8 = 1;
+ utf_tbl = (unicode_info2 *) malloc(CONTSIZE * sizeof(unicode_info2));
+ if (utf_tbl) {
+ int j;
+ for (j = 0; j < CONTSIZE; j++) {
+ utf_tbl[j].cletter = 0;
+ utf_tbl[j].clower = j;
+ utf_tbl[j].cupper = j;
+ }
+ for (j = 0; j < get_utf_cs_len(); j++) {
+ utf_tbl[uni[j].c].cletter = 1;
+ utf_tbl[uni[j].c].clower = uni[j].clower;
+ utf_tbl[uni[j].c].cupper = uni[j].cupper;
+ }
+ // set Azeri, Turkish spec. lowercasing
+ set_spec_utf8_encoding();
+ } else return 1;
+ }
+ np++; break; }
default: break;
}
i++;
@@ -947,49 +3034,169 @@ int AffixMgr::parse_set(char * line)
return 0;
}
+/* parse a one-flag option line ("NAME flag"): decode the second token into *out.
+   name is only used in the messages; returns 0 on success, 1 if *out was already
+   set or the flag token is missing */
+int AffixMgr::parse_flag(char * line, unsigned short * out, char * name)
+{
+    if (*out != 0) {
+        fprintf(stderr,"error: duplicate %s strings\n", name);
+        return 1;
+    }
+    char * rest = line;
+    char * token;
+    int field = 0;   // index of the current non-empty token
+    int seen = 0;    // how many of the two expected tokens were found
+    while ((token = mystrsep(&rest, 0)) != NULL) {
+        if (*token != '\0') {
+            if (field == 0) {
+                seen++;          // the option keyword itself
+            } else if (field == 1) {
+                *out = pHMgr->decode_flag(token);
+                seen++;
+            }
+            field++;
+        }
+        free(token);
+    }
+    if (seen == 2) return 0;
+    fprintf(stderr,"error: missing %s information\n", name);
+    return 1;
+}
+
+/* parse a numeric option line ("NAME number"): store atoi() of the second token
+   in *out. name is only used in the message; returns 0 on success, 1 if the
+   number token is missing */
+int AffixMgr::parse_num(char * line, int * out, char * name)
+{
+    char * rest = line;
+    char * token;
+    int field = 0;   // index of the current non-empty token
+    int seen = 0;    // how many of the two expected tokens were found
+    while ((token = mystrsep(&rest, 0)) != NULL) {
+        if (*token != '\0') {
+            if (field == 0) {
+                seen++;          // the option keyword itself
+            } else if (field == 1) {
+                *out = atoi(token);
+                seen++;
+            }
+            field++;
+        }
+        free(token);
+    }
+    if (seen == 2) return 0;
+    fprintf(stderr,"error: missing %s information\n", name);
+    return 1;
+}
-/* parse in the flag used by the controlled compound words */
-int AffixMgr::parse_cpdflag(char * line)
+/* parse in the wordchars string: kept as a plain string, or as a sorted UTF-16 array when utf8 is set; returns 0 on success */
+int AffixMgr::parse_wordchars(char * line)
{
- if (compound) {
- fprintf(stderr,"error: duplicate compound flags used\n");
+ if (wordchars) {
+ fprintf(stderr,"error: duplicate WORDCHARS strings\n");
return 1;
}
char * tp = line;
char * piece;
int i = 0;
int np = 0;
- while ((piece=mystrsep(&tp,' '))) {
+ w_char w[MAXWORDLEN];
+ while ((piece=mystrsep(&tp, 0))) {
+ if (*piece != '\0') {
+ switch(i) {
+ case 0: { np++; break; }
+ case 1: {
+ if (!utf8) {
+ wordchars = mystrdup(piece);
+ } else {
+ int n = u8_u16(w, MAXWORDLEN, piece);
+ if (n > 0) {
+ flag_qsort((unsigned short *) w, 0, n);
+ wordchars_utf16 = (unsigned short *) malloc(n * sizeof(unsigned short));
+ if (!wordchars_utf16) { free(piece); return 1; } // do not leak the current token on allocation failure
+ memcpy(wordchars_utf16, w, n * sizeof(unsigned short));
+ }
+ wordchars_utf16_len = n;
+ }
+ np++;
+ break;
+ }
+ default: break;
+ }
+ i++;
+ }
+ free(piece);
+ }
+ if (np != 2) {
+ fprintf(stderr,"error: missing WORDCHARS information\n");
+ return 1;
+ }
+ return 0;
+}
+
+
+/* parse in the max syllablecount of compound words and the optional vowel set (default "aeiouAEIOU") */
+int AffixMgr::parse_cpdsyllable(char * line)
+{
+ char * tp = line;
+ char * piece;
+ int i = 0;
+ int np = 0;
+ w_char w[MAXWORDLEN];
+ while ((piece=mystrsep(&tp, 0))) {
if (*piece != '\0') {
switch(i) {
case 0: { np++; break; }
- case 1: { compound = mystrdup(piece); np++; break; }
+ case 1: { cpdmaxsyllable = atoi(piece); np++; break; }
+ case 2: {
+ if (!utf8) {
+ cpdvowels = mystrdup(piece);
+ } else {
+ int n = u8_u16(w, MAXWORDLEN, piece);
+ if (n > 0) {
+ flag_qsort((unsigned short *) w, 0, n);
+ cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char));
+ if (!cpdvowels_utf16) { free(piece); return 1; } // do not leak the current token on allocation failure
+ memcpy(cpdvowels_utf16, w, n * sizeof(w_char));
+ }
+ cpdvowels_utf16_len = n;
+ }
+ np++;
+ break;
+ }
default: break;
}
i++;
}
free(piece);
}
- if (np != 2) {
- fprintf(stderr,"error: missing compound flag information\n");
+ if (np < 2) {
+ fprintf(stderr,"error: missing compoundsyllable information\n");
return 1;
}
+ if (np == 2) cpdvowels = mystrdup("aeiouAEIOU"); // no vowel field given: fall back to the default set
return 0;
}
-
-/* parse in the min compound word length */
-int AffixMgr::parse_cpdmin(char * line)
+/* parse in the flags that increment the syllable number (the token is kept as a string in cpdsyllablenum) */
+int AffixMgr::parse_syllablenum(char * line)
{
char * tp = line;
char * piece;
int i = 0;
int np = 0;
- while ((piece=mystrsep(&tp,' '))) {
+ while ((piece=mystrsep(&tp, 0))) {
if (*piece != '\0') {
switch(i) {
case 0: { np++; break; }
- case 1: { cpdmin = atoi(piece); np++; break; }
+ case 1: { cpdsyllablenum = mystrdup(piece); np++; break; }
default: break;
}
i++;
@@ -997,14 +3204,12 @@ int AffixMgr::parse_cpdmin(char * line)
free(piece);
}
if (np != 2) {
- fprintf(stderr,"error: missing compound min information\n");
+ fprintf(stderr,"error: missing cpdsyllablenum information\n");
return 1;
- }
- if ((cpdmin < 1) || (cpdmin > 50)) cpdmin = 3;
+ }
return 0;
}
-
/* parse in the typical fault correcting table */
int AffixMgr::parse_reptable(char * line, FILE * af)
{
@@ -1016,7 +3221,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af)
char * piece;
int i = 0;
int np = 0;
- while ((piece=mystrsep(&tp,' '))) {
+ while ((piece=mystrsep(&tp, 0))) {
if (*piece != '\0') {
switch(i) {
case 0: { np++; break; }
@@ -1028,6 +3233,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af)
return 1;
}
reptable = (replentry *) malloc(numrep * sizeof(struct replentry));
+ if (!reptable) { free(piece); return 1; } // do not leak the current token on allocation failure
np++;
break;
}
@@ -1045,13 +3251,13 @@ int AffixMgr::parse_reptable(char * line, FILE * af)
/* now parse the numrep lines to read in the remainder of the table */
char * nl = line;
for (int j=0; j < numrep; j++) {
- fgets(nl,MAXLNLEN,af);
+ if (!fgets(nl,MAXLNLEN,af)) return 1; // file ended before numrep entries were read
mychomp(nl);
tp = nl;
i = 0;
reptable[j].pattern = NULL;
- reptable[j].replacement = NULL;
- while ((piece=mystrsep(&tp,' '))) {
+ reptable[j].pattern2 = NULL;
+ while ((piece=mystrsep(&tp, 0))) {
if (*piece != '\0') {
switch(i) {
case 0: {
@@ -1063,14 +3269,14 @@ int AffixMgr::parse_reptable(char * line, FILE * af)
break;
}
case 1: { reptable[j].pattern = mystrdup(piece); break; }
- case 2: { reptable[j].replacement = mystrdup(piece); break; }
+ case 2: { reptable[j].pattern2 = mystrdup(piece); break; }
default: break;
}
i++;
}
free(piece);
}
- if ((!(reptable[j].pattern)) || (!(reptable[j].replacement))) {
+ if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) {
fprintf(stderr,"error: replacement table is corrupt\n");
return 1;
}
@@ -1078,6 +3284,155 @@ int AffixMgr::parse_reptable(char * line, FILE * af)
return 0;
}
+/* parse in the checkcompoundpattern table: a header line with the entry count, then that many "CHECKCOMPOUNDPATTERN pattern pattern2" lines read from af */
+int AffixMgr::parse_checkcpdtable(char * line, FILE * af)
+{
+ if (numcheckcpd != 0) {
+ fprintf(stderr,"error: duplicate compound pattern tables used\n");
+ return 1;
+ }
+ char * tp = line;
+ char * piece;
+ int i = 0;
+ int np = 0;
+ while ((piece=mystrsep(&tp, 0))) {
+ if (*piece != '\0') {
+ switch(i) {
+ case 0: { np++; break; }
+ case 1: {
+ numcheckcpd = atoi(piece);
+ if (numcheckcpd < 1) {
+ fprintf(stderr,"incorrect number of entries in compound pattern table\n");
+ free(piece);
+ return 1;
+ }
+ checkcpdtable = (replentry *) malloc(numcheckcpd * sizeof(struct replentry));
+ if (!checkcpdtable) { free(piece); return 1; } // do not leak the current token on allocation failure
+ np++;
+ break;
+ }
+ default: break;
+ }
+ i++;
+ }
+ free(piece);
+ }
+ if (np != 2) {
+ fprintf(stderr,"error: missing compound pattern table information\n");
+ return 1;
+ }
+
+ /* now parse the numcheckcpd lines to read in the remainder of the table */
+ char * nl = line;
+ for (int j=0; j < numcheckcpd; j++) {
+ if (!fgets(nl,MAXLNLEN,af)) return 1;
+ mychomp(nl);
+ tp = nl;
+ i = 0;
+ checkcpdtable[j].pattern = NULL;
+ checkcpdtable[j].pattern2 = NULL;
+ while ((piece=mystrsep(&tp, 0))) {
+ if (*piece != '\0') {
+ switch(i) {
+ case 0: {
+ if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) {
+ fprintf(stderr,"error: compound pattern table is corrupt\n");
+ free(piece);
+ return 1;
+ }
+ break;
+ }
+ case 1: { checkcpdtable[j].pattern = mystrdup(piece); break; }
+ case 2: { checkcpdtable[j].pattern2 = mystrdup(piece); break; }
+ default: break;
+ }
+ i++;
+ }
+ free(piece);
+ }
+ if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) {
+ fprintf(stderr,"error: compound pattern table is corrupt\n");
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* parse in the compound rule table: a header line with the entry count, then that many "COMPOUNDRULE flags" lines read from af */
+int AffixMgr::parse_defcpdtable(char * line, FILE * af)
+{
+ if (numdefcpd != 0) {
+ fprintf(stderr,"error: duplicate compound rule tables used\n");
+ return 1;
+ }
+ char * tp = line;
+ char * piece;
+ int i = 0;
+ int np = 0;
+ while ((piece=mystrsep(&tp, 0))) {
+ if (*piece != '\0') {
+ switch(i) {
+ case 0: { np++; break; }
+ case 1: {
+ numdefcpd = atoi(piece);
+ if (numdefcpd < 1) {
+ fprintf(stderr,"incorrect number of entries in compound rule table\n");
+ free(piece);
+ return 1;
+ }
+ defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry));
+ if (!defcpdtable) { free(piece); return 1; } // do not leak the current token on allocation failure
+ np++;
+ break;
+ }
+ default: break;
+ }
+ i++;
+ }
+ free(piece);
+ }
+ if (np != 2) {
+ fprintf(stderr,"error: missing compound rule table information\n");
+ return 1;
+ }
+
+ /* now parse the numdefcpd lines to read in the remainder of the table */
+ char * nl = line;
+ for (int j=0; j < numdefcpd; j++) {
+ if (!fgets(nl,MAXLNLEN,af)) return 1;
+ mychomp(nl);
+ tp = nl;
+ i = 0;
+ defcpdtable[j].def = NULL;
+ defcpdtable[j].len = 0; // malloc'ed storage: the check below must not read an unset length
+ while ((piece=mystrsep(&tp, 0))) {
+ if (*piece != '\0') {
+ switch(i) {
+ case 0: {
+ if (strncmp(piece, "COMPOUNDRULE", 12) != 0) {
+ fprintf(stderr,"error: compound rule table is corrupt\n");
+ free(piece);
+ return 1;
+ }
+ break;
+ }
+ case 1: {
+ defcpdtable[j].len =
+ pHMgr->decode_flags(&(defcpdtable[j].def), piece);
+ break;
+ }
+ default: break;
+ }
+ i++;
+ }
+ free(piece);
+ }
+ if (!defcpdtable[j].len) {
+ fprintf(stderr,"error: compound rule table is corrupt\n");
+ return 1;
+ }
+ }
+ return 0;
+}
/* parse in the character map table */
@@ -1091,7 +3446,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af)
char * piece;
int i = 0;
int np = 0;
- while ((piece=mystrsep(&tp,' '))) {
+ while ((piece=mystrsep(&tp, 0))) {
if (*piece != '\0') {
switch(i) {
case 0: { np++; break; }
@@ -1103,6 +3458,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af)
return 1;
}
maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
+ if (!maptable) return 1;
np++;
break;
}
@@ -1120,13 +3476,13 @@ int AffixMgr::parse_maptable(char * line, FILE * af)
/* now parse the nummap lines to read in the remainder of the table */
char * nl = line;
for (int j=0; j < nummap; j++) {
- fgets(nl,MAXLNLEN,af);
+ if (!fgets(nl,MAXLNLEN,af)) return 1;
mychomp(nl);
tp = nl;
i = 0;
maptable[j].set = NULL;
maptable[j].len = 0;
- while ((piece=mystrsep(&tp,' '))) {
+ while ((piece=mystrsep(&tp, 0))) {
if (*piece != '\0') {
switch(i) {
case 0: {
@@ -1137,8 +3493,24 @@ int AffixMgr::parse_maptable(char * line, FILE * af)
}
break;
}
- case 1: { maptable[j].set = mystrdup(piece);
- maptable[j].len = strlen(maptable[j].set);
+ case 1: {
+ maptable[j].len = 0;
+ maptable[j].set = NULL;
+ maptable[j].set_utf16 = NULL;
+ if (!utf8) {
+ maptable[j].set = mystrdup(piece);
+ maptable[j].len = strlen(maptable[j].set);
+ } else {
+ w_char w[MAXWORDLEN];
+ int n = u8_u16(w, MAXWORDLEN, piece);
+ if (n > 0) {
+ flag_qsort((unsigned short *) w, 0, n);
+ maptable[j].set_utf16 = (w_char *) malloc(n * sizeof(w_char));
+ if (!maptable[j].set_utf16) return 1;
+ memcpy(maptable[j].set_utf16, w, n * sizeof(w_char));
+ }
+ maptable[j].len = n;
+ }
break; }
default: break;
}
@@ -1146,7 +3518,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af)
}
free(piece);
}
- if ((!(maptable[j].set)) || (!(maptable[j].len))) {
+ if ((!(maptable[j].set || maptable[j].set_utf16)) || (!(maptable[j].len))) {
fprintf(stderr,"error: map table is corrupt\n");
return 1;
}
@@ -1154,13 +3526,134 @@ int AffixMgr::parse_maptable(char * line, FILE * af)
return 0;
}
+/* parse in the word breakpoint table: a header line with the entry count, then that many "BREAK string" lines read from af */
+int AffixMgr::parse_breaktable(char * line, FILE * af)
+{
+ if (numbreak != 0) {
+ fprintf(stderr,"error: duplicate word breakpoint tables used\n");
+ return 1;
+ }
+ char * tp = line;
+ char * piece;
+ int i = 0;
+ int np = 0;
+ while ((piece=mystrsep(&tp, 0))) {
+ if (*piece != '\0') {
+ switch(i) {
+ case 0: { np++; break; }
+ case 1: {
+ numbreak = atoi(piece);
+ if (numbreak < 1) {
+ fprintf(stderr,"incorrect number of entries in BREAK table\n");
+ free(piece);
+ return 1;
+ }
+ breaktable = (char **) malloc(numbreak * sizeof(char *));
+ if (!breaktable) { free(piece); return 1; } // do not leak the current token on allocation failure
+ np++;
+ break;
+ }
+ default: break;
+ }
+ i++;
+ }
+ free(piece);
+ }
+ if (np != 2) {
+ fprintf(stderr,"error: missing word breakpoint table information\n");
+ return 1;
+ }
+
+ /* now parse the numbreak lines to read in the remainder of the table */
+ char * nl = line;
+ for (int j=0; j < numbreak; j++) {
+ if (!fgets(nl,MAXLNLEN,af)) return 1;
+ mychomp(nl);
+ tp = nl;
+ i = 0;
+ breaktable[j] = NULL; // malloc'ed storage: mark the entry empty until its string is read
+ while ((piece=mystrsep(&tp, 0))) {
+ if (*piece != '\0') {
+ switch(i) {
+ case 0: {
+ if (strncmp(piece,"BREAK",5) != 0) {
+ fprintf(stderr,"error: BREAK table is corrupt\n");
+ free(piece);
+ return 1;
+ }
+ break;
+ }
+ case 1: {
+ breaktable[j] = mystrdup(piece);
+ break;
+ }
+ default: break;
+ }
+ i++;
+ }
+ free(piece);
+ }
+ if (!breaktable[j]) { // the entry itself must be present, not just the table
+ fprintf(stderr,"error: BREAK table is corrupt\n");
+ return 1;
+ }
+ }
+ return 0;
+}
+/* parse in the LANG option: keeps a copy of the language name in lang and its
+   number (get_lang_num) in langnum; returns 0 on success, 1 on a duplicate or
+   missing value */
+int AffixMgr::parse_lang(char * line)
+{
+    if (lang) {
+        fprintf(stderr,"error: duplicate LANG used\n");
+        return 1;
+    }
+    char * rest = line;
+    char * token;
+    int field = 0;   // index of the current non-empty token
+    int seen = 0;    // how many of the two expected tokens were found
+    while ((token = mystrsep(&rest, 0)) != NULL) {
+        if (*token != '\0') {
+            if (field == 0) {
+                seen++;          // the option keyword itself
+            } else if (field == 1) {
+                lang = mystrdup(token);
+                langnum = get_lang_num(token);
+                // langnum has just changed, so apply the language specific case table entries
+                set_spec_utf8_encoding();
+                seen++;
+            }
+            field++;
+        }
+        free(token);
+    }
+    if (seen >= 2) return 0;
+    fprintf(stderr,"error: missing LANG information\n");
+    return 1;
+}
+/* parse in the version string: everything left in the line after the first
+   token is copied into version; returns 1 if a version was already stored */
+int AffixMgr::parse_version(char * line)
+{
+    if (version != NULL) {
+        fprintf(stderr,"error: duplicate VERSION strings\n");
+        return 1;
+    }
+    // the first token is only skipped; presumably mystrsep() leaves `rest` at the remainder of the line
+    char * rest = line;
+    char * keyword = mystrsep(&rest, 0);
+    free(keyword);
+    version = mystrdup(rest);
+    return 0;
+}
-int AffixMgr::parse_affix(char * line, const char at, FILE * af)
+// parse one affix class: the header in line, then its entry lines read from af; at is 'S' or 'P',
+// dupflags records per flag which affix kinds were already declared; returns 0 on success, 1 on error
+int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflags)
{
int numents = 0; // number of affentry structures to parse
- char achar='\0'; // affix char identifier
+
+ unsigned short aflag = 0; // affix char identifier
+
short ff=0;
struct affentry * ptr= NULL;
struct affentry * nptr= NULL;
@@ -1170,29 +3663,51 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af)
char * piece;
int i = 0;
+ // checking lines with bad syntax
+ int basefieldnum = 0;
+
// split affix header line into pieces
int np = 0;
- while ((piece=mystrsep(&tp,' '))) {
+ while ((piece=mystrsep(&tp, 0))) {
if (*piece != '\0') {
switch(i) {
// piece 1 - is type of affix
case 0: { np++; break; }
// piece 2 - is affix char
- case 1: { np++; achar = *piece; break; }
-
+ case 1: {
+ np++;
+ aflag = pHMgr->decode_flag(piece);
+ if (((at == 'S') && (dupflags[aflag] & dupSFX)) ||
+ ((at == 'P') && (dupflags[aflag] & dupPFX))) {
+ fprintf(stderr, "error: duplicate affix flag %s in line %s\n", piece, nl);
+ // return 1; XXX permissive mode for bad dictionaries
+ }
+ // OR the bit in: adding it again for a tolerated duplicate would corrupt the mask
+ dupflags[aflag] |= ((at == 'S') ? dupSFX : dupPFX);
+ break;
+ }
// piece 3 - is cross product indicator
- case 2: { np++; if (*piece == 'Y') ff = XPRODUCT; break; }
+ case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; }
// piece 4 - is number of affentries
case 3: {
np++;
numents = atoi(piece);
+ if (numents <= 0) { // also reject negative counts before sizing the allocation
+ char * err = pHMgr->encode_flag(aflag);
+ fprintf(stderr, "error: affix %s header has incorrect entry count in line %s\n",
+ err, nl);
+ free(err);
+ free(piece);
+ return 1;
+ }
ptr = (struct affentry *) malloc(numents * sizeof(struct affentry));
- ptr->xpflg = ff;
- ptr->achar = achar;
- break;
+ if (!ptr) { free(piece); return 1; }
+ ptr->opts = ff;
+ if (utf8) ptr->opts += aeUTF8;
+ if (pHMgr->is_aliasf()) ptr->opts += aeALIASF;
+ if (pHMgr->is_aliasm()) ptr->opts += aeALIASM;
+ ptr->aflag = aflag;
+ break;
}
default: break;
@@ -1203,7 +3718,9 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af)
}
// check to make sure we parsed enough pieces
if (np != 4) {
- fprintf(stderr, "error: affix %c header has insufficient data in line %s\n",achar,nl);
+ char * err = pHMgr->encode_flag(aflag);
+ fprintf(stderr, "error: affix %s header has insufficient data in line %s\n", err, nl);
+ free(err);
free(ptr);
return 1;
}
@@ -1213,40 +3730,45 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af)
// now parse numents affentries for this affix
for (int j=0; j < numents; j++) {
- fgets(nl,MAXLNLEN,af);
+ if (!fgets(nl,MAXLNLEN,af)) { free(ptr); return 1; } // file ended early: release the entry array like the other error paths
mychomp(nl);
tp = nl;
i = 0;
np = 0;
// split line into pieces
- while ((piece=mystrsep(&tp,' '))) {
+ while ((piece=mystrsep(&tp, 0))) {
if (*piece != '\0') {
switch(i) {
-
// piece 1 - is type
case 0: {
np++;
- if (nptr != ptr) nptr->xpflg = ptr->xpflg;
+ if (nptr != ptr) nptr->opts = ptr->opts;
break;
}
// piece 2 - is affix char
case 1: {
np++;
- if (*piece != achar) {
- fprintf(stderr, "error: affix %c is corrupt near line %s\n",achar,nl);
+ if (pHMgr->decode_flag(piece) != aflag) {
+ char * err = pHMgr->encode_flag(aflag);
+ fprintf(stderr, "error: affix %s is corrupt near line %s\n", err, nl);
fprintf(stderr, "error: possible incorrect count\n");
+ free(err);
+ free(ptr);
free(piece);
return 1;
}
- if (nptr != ptr) nptr->achar = ptr->achar;
+
+ if (nptr != ptr) nptr->aflag = ptr->aflag;
break;
}
// piece 3 - is string to strip or 0 for null
case 2: {
np++;
+ if (complexprefixes) {
+ if (utf8) reverseword_utf(piece); else reverseword(piece);
+ }
nptr->strip = mystrdup(piece);
nptr->stripl = strlen(nptr->strip);
if (strcmp(nptr->strip,"0") == 0) {
@@ -1259,8 +3781,39 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af)
// piece 4 - is affix string or 0 for null
case 3: {
+ char * dash;
+ nptr->morphcode = NULL;
+ nptr->contclass = NULL;
+ nptr->contclasslen = 0;
np++;
- nptr->appnd = mystrdup(piece);
+ dash = strchr(piece, '/');
+ if (dash) {
+ *dash = '\0';
+ if (complexprefixes) {
+ if (utf8) reverseword_utf(piece); else reverseword(piece);
+ }
+ nptr->appnd = mystrdup(piece);
+
+ if (pHMgr->is_aliasf()) {
+ int index = atoi(dash + 1);
+ nptr->contclasslen = pHMgr->get_aliasf(index, &(nptr->contclass));
+ } else {
+ nptr->contclasslen = pHMgr->decode_flags(&(nptr->contclass), dash + 1);
+ flag_qsort(nptr->contclass, 0, nptr->contclasslen);
+ }
+ *dash = '/';
+
+ havecontclass = 1;
+ for (unsigned short i = 0; i < nptr->contclasslen; i++) {
+ contclasses[(nptr->contclass)[i]] = 1;
+ }
+ } else {
+ if (complexprefixes) {
+ if (utf8) reverseword_utf(piece); else reverseword(piece);
+ }
+ nptr->appnd = mystrdup(piece);
+ }
+
nptr->appndl = strlen(nptr->appnd);
if (strcmp(nptr->appnd,"0") == 0) {
free(nptr->appnd);
@@ -1271,7 +3824,77 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af)
}
// piece 5 - is the conditions descriptions
- case 4: { np++; encodeit(nptr,piece); }
+ case 4: {
+ np++;
+ if (complexprefixes) {
+ int neg = 0;
+ if (utf8) reverseword_utf(piece); else reverseword(piece);
+ // reverse condition
+ for (char * k = piece + strlen(piece) - 1; k >= piece; k--) {
+ switch(*k) {
+ case '[': {
+ if (neg) *(k+1) = '['; else *k = ']';
+ break;
+ }
+ case ']': {
+ *k = '[';
+ if (neg) *(k+1) = '^';
+ neg = 0;
+ break;
+ }
+ case '^': {
+ if (*(k+1) == ']') neg = 1; else *(k+1) = *k;
+ break;
+ }
+ default: {
+ if (neg) *(k+1) = *k;
+ }
+ }
+ }
+ }
+ if (nptr->stripl && (strcmp(piece, ".") != 0) &&
+ redundant_condition(at, nptr->strip, nptr->stripl, piece, nl))
+ strcpy(piece, ".");
+ if (encodeit(nptr,piece)) { free(piece); free(ptr); return 1; }
+ break;
+ }
+
+ case 5: {
+ np++;
+ if (pHMgr->is_aliasm()) {
+ int index = atoi(piece);
+ nptr->morphcode = pHMgr->get_aliasm(index);
+ } else {
+ if (complexprefixes) {
+ if (utf8) reverseword_utf(piece); else reverseword(piece);
+ }
+ nptr->morphcode = mystrdup(piece);
+ }
+ break;
+ }
+
+ case 6: {
+ // XXX deprecated syntax
+ np++;
+ if (nptr->contclass) {
+ // contclass is an unsigned short array, so it cannot be printed with %s
+ fprintf(stderr, "error: affix rule contains two contclass "
+ "(%s by deprecated syntax).\n", piece);
+ } else {
+ if (pHMgr->is_aliasf()) {
+ int index = atoi(piece);
+ nptr->contclasslen = pHMgr->get_aliasf(index, &(nptr->contclass));
+ } else {
+ nptr->contclasslen = pHMgr->decode_flags(&(nptr->contclass), piece);
+ flag_qsort(nptr->contclass, 0, nptr->contclasslen);
+ }
+ havecontclass = 1;
+ for (unsigned short i = 0; i < nptr->contclasslen; i++) {
+ contclasses[(nptr->contclass)[i]] = 1;
+ }
+ }
+ break;
+
+ }
default: break;
}
@@ -1280,14 +3903,27 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af)
free(piece);
}
// check to make sure we parsed enough pieces
- if (np != 5) {
- fprintf(stderr, "error: affix %c is corrupt near line %s\n",achar,nl);
+ if (np < 5) {
+ char * err = pHMgr->encode_flag(aflag);
+ fprintf(stderr, "error: affix %s is corrupt near line %s\n", err, nl);
+ free(err);
free(ptr);
return 1;
}
+
+#if DEBUG
+ // detect unnecessary fields, excepting comments
+ if (basefieldnum) {
+ int fieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6);
+ if (fieldnum != basefieldnum)
+ fprintf(stderr, "warning - bad field number:\n%s\n", nl);
+ } else {
+ basefieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6);
+ }
+#endif
nptr++;
}
-
+
// now create SfxEntry or PfxEntry objects and use links to
// build an ordered (sorted by affix string) list
nptr = ptr;
@@ -1304,3 +3940,81 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af)
free(ptr);
return 0;
}
+
+// Adjusts the UTF-8 case-conversion table for languages whose casing
+// rules differ from the defaults.  Does nothing unless utf8 is set.
+void AffixMgr::set_spec_utf8_encoding() {
+    if (!utf8) return;
+
+    // In Azeri and Turkish, I and i are distinct letters:
+    // upper-case `I' pairs with a dotless lower-case i,
+    // and lower-case `i' pairs with an upper-case I with dot.
+    const bool dotless_i_language = (langnum == LANG_az) || (langnum == LANG_tr);
+    if (!dotless_i_language) return;
+
+    utf_tbl[0x0049].clower = 0x0131;
+    utf_tbl[0x0069].cupper = 0x0130;
+}
+
+// Tells whether the condition of an affix rule is already implied by its
+// stripping characters, in which case the caller can ignore the condition.
+//
+//   ft     - affix type: 'P' is handled as a prefix, any other value as a suffix
+//   strip  - stripping characters of the rule
+//   stripl - length of strip
+//   cond   - condition pattern; may contain [...] and [^...] character classes
+//   line   - text echoed in the diagnostics
+//
+// Returns 1 if the condition is redundant and 0 otherwise.  Stripping
+// characters that contradict the condition, and unterminated character
+// classes, are reported on stderr and give 0.  When utf8 is set only the
+// plain string comparison is performed.
+int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, char * line) {
+    int condl = strlen(cond);
+    int i;
+    int j;
+    int neg;
+    int in;
+    if (ft == 'P') { // prefix
+        // the condition is literally the leading part of strip
+        if (strncmp(strip, cond, condl) == 0) return 1;
+        if (!utf8) {
+            for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) {
+                if (cond[j] != '[') {
+                    if (cond[j] != strip[i]) {
+                        fprintf(stderr, "warning - incompatible stripping characters and condition:\n%s\n", line);
+                        // a contradicted condition must never be reported as redundant
+                        return 0;
+                    }
+                } else {
+                    neg = (cond[j+1] == '^') ? 1 : 0;
+                    in = 0;
+                    // walk forward to the closing ']' looking for strip[i]
+                    do {
+                        j++;
+                        if (strip[i] == cond[j]) in = 1;
+                    } while ((j < (condl - 1)) && (cond[j] != ']'));
+                    if (j == (condl - 1) && (cond[j] != ']')) {
+                        fprintf(stderr, "error - missing ] in condition:\n%s\n", line);
+                        return 0;
+                    }
+                    if ((!neg && !in) || (neg && in)) {
+                        fprintf(stderr, "warning - incompatible stripping characters and condition:\n%s\n", line);
+                        return 0;
+                    }
+                }
+            }
+            // the whole condition was matched against strip
+            if (j >= condl) return 1;
+        }
+    } else { // suffix
+        // the condition is literally the trailing part of strip
+        if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1;
+        if (!utf8) {
+            for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) {
+                if (cond[j] != ']') {
+                    if (cond[j] != strip[i]) {
+                        fprintf(stderr, "warning - incompatible stripping characters and condition:\n%s\n", line);
+                        // a contradicted condition must never be reported as redundant
+                        return 0;
+                    }
+                } else {
+                    if (j == 0) {
+                        // ']' is the first character: there is no room for the
+                        // opening '[', and scanning would read before cond
+                        fprintf(stderr, "error - missing [ in condition:\n%s\n", line);
+                        return 0;
+                    }
+                    in = 0;
+                    // walk backwards to the opening '[' looking for strip[i]
+                    do {
+                        j--;
+                        if (strip[i] == cond[j]) in = 1;
+                    } while ((j > 0) && (cond[j] != '['));
+                    if ((j == 0) && (cond[j] != '[')) {
+                        fprintf(stderr, "error - missing [ in condition:\n%s\n", line);
+                        return 0;
+                    }
+                    neg = (cond[j+1] == '^') ? 1 : 0;
+                    if ((!neg && !in) || (neg && in)) {
+                        fprintf(stderr, "warning - incompatible stripping characters and condition:\n%s\n", line);
+                        return 0;
+                    }
+                }
+            }
+            // the whole condition was matched against strip
+            if (j < 0) return 1;
+        }
+    }
+    return 0;
+}
diff --git a/src/myspell/affixmgr.hxx b/src/myspell/affixmgr.hxx
index 6cbd112..e93ba8e 100644
--- a/src/myspell/affixmgr.hxx
+++ b/src/myspell/affixmgr.hxx
@@ -1,69 +1,203 @@
#ifndef _AFFIXMGR_HXX_
#define _AFFIXMGR_HXX_
+#include <cstdlib>
+#include <cstring>
+#include <cstdio>
#include "atypes.hxx"
#include "baseaffix.hxx"
#include "hashmgr.hxx"
-#include <cstdio>
+
+// check flag duplication
+#define dupSFX (1 << 0)
+#define dupPFX (1 << 1)
class AffixMgr
{
AffEntry * pStart[SETSIZE];
AffEntry * sStart[SETSIZE];
- AffEntry * pFlag[SETSIZE];
- AffEntry * sFlag[SETSIZE];
+ AffEntry * pFlag[CONTSIZE];
+ AffEntry * sFlag[CONTSIZE];
HashMgr * pHMgr;
char * trystring;
char * encoding;
- char * compound;
+ struct cs_info * csconv;
+ int utf8;
+ struct unicode_info2 * utf_tbl;
+ int complexprefixes;
+ FLAG compoundflag;
+ FLAG compoundbegin;
+ FLAG compoundmiddle;
+ FLAG compoundend;
+ FLAG compoundroot;
+ FLAG compoundforbidflag;
+ FLAG compoundpermitflag;
+ int checkcompounddup;
+ int checkcompoundrep;
+ int checkcompoundcase;
+ int checkcompoundtriple;
+ FLAG forbiddenword;
+ FLAG nosuggest;
+ FLAG pseudoroot;
int cpdmin;
int numrep;
replentry * reptable;
int nummap;
mapentry * maptable;
- bool nosplitsugs;
-
+ int numbreak;
+ char ** breaktable;
+ int numcheckcpd;
+ replentry * checkcpdtable;
+ int numdefcpd;
+ flagentry * defcpdtable;
+ int maxngramsugs;
+ int nosplitsugs;
+ int sugswithdots;
+ int cpdwordmax;
+ int cpdmaxsyllable;
+ char * cpdvowels;
+ w_char * cpdvowels_utf16;
+ int cpdvowels_utf16_len;
+ char * cpdsyllablenum;
+ const char * pfxappnd; // BUG: not stateless
+ const char * sfxappnd; // BUG: not stateless
+ FLAG sfxflag; // BUG: not stateless
+ char * derived; // BUG: not stateless
+ AffEntry * sfx; // BUG: not stateless
+ AffEntry * pfx; // BUG: not stateless
+ int checknum;
+ char * wordchars;
+ unsigned short * wordchars_utf16;
+ int wordchars_utf16_len;
+ char * version;
+ char * lang;
+ int langnum;
+ FLAG lemma_present;
+ FLAG circumfix;
+ FLAG onlyincompound;
+ FLAG keepcase;
+ int checksharps;
+ int havecontclass; // boolean variable
+ char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold affix)
+ flag flag_mode;
+
public:
AffixMgr(const char * affpath, HashMgr * ptr);
~AffixMgr();
- struct hentry * affix_check(const char * word, int len);
- struct hentry * prefix_check(const char * word, int len);
- struct hentry * suffix_check(const char * word, int len, int sfxopts, AffEntry* ppfx);
- int expand_rootword(struct guessword * wlst, int maxn,
- const char * ts, int wl, const char * ap, int al);
- struct hentry * compound_check(const char * word, int len, char compound_flag);
+ struct hentry * affix_check(const char * word, int len,
+ const unsigned short needflag = (unsigned short) 0, char in_compound = IN_CPD_NOT);
+ struct hentry * prefix_check(const char * word, int len,
+ char in_compound, const FLAG needflag = FLAG_NULL);
+ struct hentry * prefix_check_twosfx(const char * word, int len,
+ char in_compound, const FLAG needflag = FLAG_NULL);
+ struct hentry * suffix_check(const char * word, int len, int sfxopts, AffEntry* ppfx,
+ char ** wlst, int maxSug, int * ns, const FLAG cclass = FLAG_NULL,
+ const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT);
+ struct hentry * suffix_check_twosfx(const char * word, int len,
+ int sfxopts, AffEntry* ppfx, const FLAG needflag = FLAG_NULL);
+
+ char * affix_check_morph(const char * word, int len,
+ const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT);
+ char * prefix_check_morph(const char * word, int len,
+ char in_compound, const FLAG needflag = FLAG_NULL);
+ char * suffix_check_morph (const char * word, int len, int sfxopts, AffEntry * ppfx,
+ const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT);
+
+ char * prefix_check_twosfx_morph(const char * word, int len,
+ char in_compound, const FLAG needflag = FLAG_NULL);
+ char * suffix_check_twosfx_morph(const char * word, int len,
+ int sfxopts, AffEntry * ppfx, const FLAG needflag = FLAG_NULL);
+
+ int expand_rootword(struct guessword * wlst, int maxn, const char * ts,
+ int wl, const unsigned short * ap, unsigned short al, char * bad, int);
+
+ int get_syllable (const char * word, int wlen);
+ int cpdrep_check(const char * word, int len);
+ int cpdpat_check(const char * word, int len);
+ int defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** rwords, char all);
+ int cpdcase_check(const char * word, int len);
+ int candidate_check(const char * word, int len);
+ struct hentry * compound_check(const char * word, int len,
+ short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
+ char hu_mov_rule, int * cmpdstemnum, int * cmpdstem, char is_sug);
+
+ int compound_check_morph(const char * word, int len,
+ short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words,
+ char hu_mov_rule, char ** result, char * partresult);
+
struct hentry * lookup(const char * word);
int get_numrep();
struct replentry * get_reptable();
int get_nummap();
struct mapentry * get_maptable();
+ int get_numbreak();
+ char ** get_breaktable();
char * get_encoding();
+ int get_langnum();
+ struct unicode_info2 * get_utf_conv();
char * get_try_string();
- char * get_compound();
- bool get_nosplitsugs();
-
+ const char * get_wordchars();
+ unsigned short * get_wordchars_utf16(int * len);
+ int get_compound();
+ FLAG get_compoundflag();
+ FLAG get_compoundbegin();
+ FLAG get_forbiddenword();
+ FLAG get_nosuggest();
+ FLAG get_pseudoroot();
+ FLAG get_onlyincompound();
+ FLAG get_compoundroot();
+ FLAG get_lemma_present();
+ int get_checknum();
+ char * get_possible_root();
+ const char * get_prefix();
+ const char * get_suffix();
+ const char * get_derived();
+ const char * get_version();
+ const int have_contclass();
+ int get_utf8();
+ int get_complexprefixes();
+ char * get_suffixed(char );
+ int get_maxngramsugs();
+ int get_nosplitsugs();
+ int get_sugswithdots(void);
+ FLAG get_keepcase(void);
+ int get_checksharps(void);
+
private:
int parse_file(const char * affpath);
int parse_try(char * line);
int parse_set(char * line);
+ int parse_flag(char * line, unsigned short * out, char * name);
+ int parse_num(char * line, int * out, char * name);
int parse_cpdflag(char * line);
- int parse_cpdmin(char * line);
+ int parse_cpdforbid(char * line);
+ int parse_forbid(char * line);
+ int parse_cpdsyllable(char * line);
+ int parse_syllablenum(char * line);
int parse_reptable(char * line, FILE * af);
int parse_maptable(char * line, FILE * af);
- int parse_affix(char * line, const char at, FILE * af);
+ int parse_breaktable(char * line, FILE * af);
+ int parse_checkcpdtable(char * line, FILE * af);
+ int parse_defcpdtable(char * line, FILE * af);
+ int parse_affix(char * line, const char at, FILE * af, char * dupflags);
+ int parse_wordchars(char * line);
+ int parse_lang(char * line);
+ int parse_version(char * line);
- void encodeit(struct affentry * ptr, char * cs);
+ int encodeit(struct affentry * ptr, char * cs);
int build_pfxtree(AffEntry* pfxptr);
int build_sfxtree(AffEntry* sfxptr);
- AffEntry* process_sfx_in_order(AffEntry* ptr, AffEntry* nptr);
- AffEntry* process_pfx_in_order(AffEntry* ptr, AffEntry* nptr);
- int process_pfx_tree_to_list();
- int process_sfx_tree_to_list();
int process_pfx_order();
int process_sfx_order();
+ AffEntry * process_pfx_in_order(AffEntry * ptr, AffEntry * nptr);
+ AffEntry * process_sfx_in_order(AffEntry * ptr, AffEntry * nptr);
+ int process_pfx_tree_to_list();
+ int process_sfx_tree_to_list();
+ void set_spec_utf8_encoding();
+ int redundant_condition(char, char * strip, int stripl, const char * cond, char *);
};
#endif
diff --git a/src/myspell/atypes.hxx b/src/myspell/atypes.hxx
index a10c69d..c8c9257 100644
--- a/src/myspell/atypes.hxx
+++ b/src/myspell/atypes.hxx
@@ -1,34 +1,74 @@
#ifndef _ATYPES_HXX_
#define _ATYPES_HXX_
+// HUNSTEM def.
+#define HUNSTEM
+
+#include "csutil.hxx"
+#include "hashmgr.hxx"
+
#define SETSIZE 256
-#define MAXAFFIXES 256
+#define CONTSIZE 65536
#define MAXWORDLEN 100
-#define XPRODUCT (1 << 0)
+#define MAXWORDUTF8LEN (MAXWORDLEN * 4)
+
+// affentry options
+#define aeXPRODUCT (1 << 0)
+#define aeUTF8 (1 << 1)
+#define aeALIASF (1 << 2)
+#define aeALIASM (1 << 3)
+
+enum {IN_CPD_NOT, IN_CPD_BEGIN, IN_CPD_END, IN_CPD_OTHER};
+
+#define MAXLNLEN 8192 * 4
-#define MAXLNLEN 1024
+#define MAXCOMPOUND 10
-#define TESTAFF( a , b , c ) memchr((void *)(a), (int)(b), (size_t)(c) )
+#define MAXACC 1000
+
+#define FLAG unsigned short
+#define FLAG_NULL 0x00
+#define FREE_FLAG(a) a = 0
+
+#define TESTAFF( a, b , c ) flag_bsearch((unsigned short *) a, (unsigned short) b, c)
struct affentry
{
char * strip;
char * appnd;
- short stripl;
- short appndl;
- short numconds;
- short xpflg;
- char achar;
- char conds[SETSIZE];
+ unsigned char stripl;
+ unsigned char appndl;
+ char numconds;
+ char opts;
+ unsigned short aflag;
+ union {
+ char base[SETSIZE];
+ struct {
+ char ascii[SETSIZE/2];
+ char neg[8];
+ char all[8];
+ w_char * wchars[8];
+ int wlen[8];
+ } utf8;
+ } conds;
+ char * morphcode;
+ unsigned short * contclass;
+ short contclasslen;
};
struct replentry {
char * pattern;
- char * replacement;
+ char * pattern2;
};
struct mapentry {
char * set;
+ w_char * set_utf16;
+ int len;
+};
+
+struct flagentry {
+ FLAG * def;
int len;
};
diff --git a/src/myspell/baseaffix.hxx b/src/myspell/baseaffix.hxx
index 6aa4351..da7c010 100644
--- a/src/myspell/baseaffix.hxx
+++ b/src/myspell/baseaffix.hxx
@@ -3,15 +3,29 @@
class AffEntry
{
+public:
+
protected:
char * appnd;
char * strip;
- short appndl;
- short stripl;
- short numconds;
- short xpflg;
- char achar;
- char conds[SETSIZE];
+ unsigned char appndl;
+ unsigned char stripl;
+ char numconds;
+ char opts;
+ unsigned short aflag;
+ union {
+ char base[SETSIZE];
+ struct {
+ char ascii[SETSIZE/2];
+ char neg[8];
+ char all[8];
+ w_char * wchars[8];
+ int wlen[8];
+ } utf8;
+ } conds;
+ char * morphcode;
+ unsigned short * contclass;
+ short contclasslen;
};
#endif
diff --git a/src/myspell/csutil.cxx b/src/myspell/csutil.cxx
index 73065f1..4fe2fbf 100644
--- a/src/myspell/csutil.cxx
+++ b/src/myspell/csutil.cxx
@@ -1,178 +1,497 @@
#include <cstdlib>
#include <cstring>
+#include <cctype>
#include <cstdio>
#include "csutil.hxx"
-#ifndef WINDOWS
-using namespace std;
-#endif
+#include "atypes.hxx"
+#include "langnum.hxx"
-// strip strings into token based on single char delimiter
-// acts like strsep() but only uses a delim char and not
-// a delim string
-
-char * mystrsep(char ** stringp, const char delim)
-{
- char * rv = NULL;
- char * mp = *stringp;
- int n = strlen(mp);
- if (n > 0) {
- char * dp = (char *)memchr(mp,(int)((unsigned char)delim),n);
- if (dp) {
- *stringp = dp+1;
- int nc = (int)((unsigned long)dp - (unsigned long)mp);
- rv = (char *) malloc(nc+1);
- memcpy(rv,mp,nc);
- *(rv+nc) = '\0';
- return rv;
- } else {
- rv = (char *) malloc(n+1);
- memcpy(rv, mp, n);
- *(rv+n) = '\0';
- *stringp = mp + n;
- return rv;
- }
- }
- return NULL;
-}
+#include "utf_info.cxx"
+#define UTF_LST_LEN (sizeof(utf_lst) / (sizeof(unicode_info)))
+#ifndef W32
+using namespace std;
+#endif
-// replaces strdup with ansi version
-char * mystrdup(const char * s)
-{
- char * d = NULL;
- if (s) {
- int sl = strlen(s);
- d = (char *) malloc(((sl+1) * sizeof(char)));
- if (d) memcpy(d,s,((sl+1)*sizeof(char)));
- }
- return d;
+/* UTF-16 to UTF-8 conversion; only the BMP is implemented.
+ * Encodes up to srclen characters of src into dest and stops as soon as
+ * size bytes have been written, which may cut a multi-byte sequence short.
+ * A terminating NUL is always stored after the encoded bytes, i.e. at
+ * dest[size] when the output was filled completely.  Returns dest. */
+char * u16_u8(char * dest, int size, const w_char * src, int srclen) {
+    char * out = dest;
+    char * const out_end = dest + size;
+    const w_char * const in_end = src + srclen;
+
+    for (const w_char * in = src; (in < in_end) && (out < out_end); in++) {
+        if (!in->h && !(in->l & 0x80)) {
+            // < 0x80 (1-byte UTF-8 character)
+            *out++ = in->l;
+        } else if (!in->h || !(in->h >= 0x08)) {
+            // < 0x800 (2-byte UTF-8 character)
+            *out++ = 0xc0 + (in->h << 2) + (in->l >> 6);
+            if (out < out_end) *out++ = 0x80 + (in->l & 0x3f);
+        } else {
+            // >= 0x800 (3-byte UTF-8 character)
+            // XXX 4-byte haven't implemented yet.
+            *out++ = 0xe0 + (in->h >> 4);
+            if (out < out_end) {
+                *out++ = 0x80 + ((in->h & 0xf) << 2) + (in->l >> 6);
+                if (out < out_end) *out++ = 0x80 + (in->l & 0x3f);
+            }
+        }
+    }
+    *out = '\0';
+    return dest;
+}
-// remove cross-platform text line end characters
-void mychomp(char * s)
-{
- int k = strlen(s);
- if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
- if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
+/* UTF-8 to UTF-16 conversion; only the BMP is implemented.
+ * Decodes the NUL-terminated string src into at most size characters of
+ * dest.  Stray continuation bytes, truncated sequences and 4-byte (or
+ * longer) lead bytes are reported on stderr and stored as h=0xff, l=0xfd.
+ * Returns the number of characters written to dest. */
+int u8_u16(w_char * dest, int size, const char * src) {
+    const char * u8 = src;
+    w_char * u2 = dest;
+    w_char * u2_max = u2 + size;
+
+    while (*u8 && (u2 < u2_max)) {
+        switch ((*u8) & 0xf0) {
+            case 0x00:
+            case 0x10:
+            case 0x20:
+            case 0x30:
+            case 0x40:
+            case 0x50:
+            case 0x60:
+            case 0x70: {
+                u2->h = 0;
+                u2->l = *u8;
+                break;
+            }
+            case 0x80:
+            case 0x90:
+            case 0xa0:
+            case 0xb0: {
+                // the offset is a ptrdiff_t; narrow it explicitly to match %d
+                fprintf(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %d. character position\n%s\n", (int) (u8 - src), src);
+                u2->h = 0xff;
+                u2->l = 0xfd;
+                break;
+            }
+            case 0xc0:
+            case 0xd0: { // 2-byte UTF-8 codes
+                if ((*(u8+1) & 0xc0) == 0x80) {
+                    u2->h = (*u8 & 0x1f) >> 2;
+                    u2->l = (*u8 << 6) + (*(u8+1) & 0x3f);
+                    u8++;
+                } else {
+                    fprintf(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", (int) (u8 - src), src);
+                    u2->h = 0xff;
+                    u2->l = 0xfd;
+                }
+                break;
+            }
+            case 0xe0: { // 3-byte UTF-8 codes
+                if ((*(u8+1) & 0xc0) == 0x80) {
+                    u2->h = ((*u8 & 0x0f) << 4) + ((*(u8+1) & 0x3f) >> 2);
+                    u8++;
+                    if ((*(u8+1) & 0xc0) == 0x80) {
+                        u2->l = (*u8 << 6) + (*(u8+1) & 0x3f);
+                        u8++;
+                    } else {
+                        fprintf(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", (int) (u8 - src), src);
+                        u2->h = 0xff;
+                        u2->l = 0xfd;
+                    }
+                } else {
+                    fprintf(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", (int) (u8 - src), src);
+                    u2->h = 0xff;
+                    u2->l = 0xfd;
+                }
+                break;
+            }
+            case 0xf0: { // 4 or more byte UTF-8 codes
+                fprintf(stderr, "This UTF-8 encoding can't convert to UTF-16:\n%s\n", src);
+                u2->h = 0xff;
+                u2->l = 0xfd;
+                break;
+            }
+        }
+        u8++;
+        u2++;
+    }
+    return (int) (u2 - dest);
+}
-
-// does an ansi strdup of the reverse of a string
-char * myrevstrdup(const char * s)
-{
- char * d = NULL;
- if (s) {
- int sl = strlen(s);
- d = (char *) malloc((sl+1) * sizeof(char));
- if (d) {
- const char * p = s + sl - 1;
- char * q = d;
- while (p >= s) *q++ = *p--;
- *q = '\0';
+// Sorts flags[begin] .. flags[end - 1] into ascending order in place.
+// Recursive quicksort that uses the first element of the range as pivot.
+void flag_qsort(unsigned short flags[], int begin, int end) {
+    if (!(end > begin)) return;
+
+    const unsigned short pivot = flags[begin];
+    int lo = begin + 1;   // everything in [begin + 1, lo) is <= pivot
+    int hi = end;         // everything in [hi, end) is > pivot
+    while (lo < hi) {
+        if (flags[lo] <= pivot) {
+            lo++;
+            continue;
+        }
+        hi--;
+        const unsigned short tmp = flags[lo];
+        flags[lo] = flags[hi];
+        flags[hi] = tmp;
+    }
+
+    // move the pivot between the two partitions
+    lo--;
+    flags[begin] = flags[lo];
+    flags[lo] = pivot;
+
+    flag_qsort(flags, begin, lo);
+    flag_qsort(flags, hi, end);
- return d;
-}
+}
-#if 0
-// return 1 if s1 is a leading subset of s2
-int isSubset(const char * s1, const char * s2)
-{
- int l1 = strlen(s1);
- int l2 = strlen(s2);
- if (l1 > l2) return 0;
- if (strncmp(s2,s1,l1) == 0) return 1;
- return 0;
+// Binary search for flag among the first length entries of flags, which
+// are expected to be in ascending order.
+// Returns 1 when the flag is found and 0 otherwise.
+int flag_bsearch(unsigned short flags[], unsigned short flag, int length) {
+    int lo = 0;
+    int hi = length - 1;
+    while (lo <= hi) {
+        const int probe = (lo + hi) / 2;
+        const unsigned short candidate = flags[probe];
+        if (candidate == flag) return 1;
+        if (flag < candidate) {
+            hi = probe - 1;
+        } else {
+            lo = probe + 1;
+        }
+    }
+    return 0;
+}
-#endif
+ // Splits the next token off *stringp.  Acts like strsep() but takes a
+ // single delimiter character instead of a delimiter string; a delim of
+ // '\0' selects the default delimiter, any white space character.
+ //
+ // The token is returned as a malloc()ed copy that the caller must
+ // free().  *stringp is advanced past the token and its delimiter, or to
+ // the terminating NUL when no delimiter was left.  Returns NULL when
+ // *stringp is empty or when memory cannot be allocated (in the latter
+ // case *stringp is left unchanged).
+
+ char * mystrsep(char ** stringp, const char delim)
+ {
+   char * mp = *stringp;
+   int n = strlen(mp);
+   if (n <= 0) return NULL;
+
+   char * dp;
+   if (delim) {
+     dp = (char *) memchr(mp, (int)((unsigned char) delim), n);
+   } else {
+     // isspace() is only defined for unsigned char values (and EOF), so
+     // bytes above 0x7f must not reach it as negative numbers
+     for (dp = mp; (*dp && !isspace((unsigned char) *dp)); dp++);
+     if (!*dp) dp = NULL;
+   }
+
+   // token length: up to the delimiter, or the whole rest of the string
+   int nc = dp ? (int) (dp - mp) : n;
+   char * rv = (char *) malloc(nc + 1);
+   if (!rv) return NULL;
+   memcpy(rv, mp, nc);
+   *(rv + nc) = '\0';
+   *stringp = dp ? (dp + 1) : (mp + n);
+   return rv;
+ }
-// return 1 if s1 is a leading subset of s2
-int isSubset(const char * s1, const char * s2)
-{
- while( *s1 && *s2 && (*s1 == *s2) ) {
- s1++;
- s2++;
- }
- return (*s1 == '\0');
-}
+
+ // ANSI replacement for strdup(): returns a malloc()ed copy of s, or
+ // NULL when s is NULL or the allocation fails.
+ char * mystrdup(const char * s)
+ {
+   if (!s) return NULL;
+
+   const int bytes = strlen(s) + 1;   // text plus the terminating NUL
+   char * copy = (char *) malloc(bytes * sizeof(char));
+   if (!copy) return NULL;
+
+   memcpy(copy, s, bytes * sizeof(char));
+   return copy;
+ }
+
+
+ // Removes a trailing line ending from s in place.  Handles "\n", "\r"
+ // and the two-character forms "\r\n" and "\r\r".  Strings that do not
+ // end in a line-end character are left untouched.
+ void mychomp(char * s)
+ {
+   int k = strlen(s);
+   if ((k > 0) && ((*(s+k-1) == '\r') || (*(s+k-1) == '\n'))) {
+     *(s+k-1) = '\0';
+     // a '\r' just before it belongs to the line ending as well; this is
+     // only checked once a line end was actually found, so that text such
+     // as "a\rb" is not cut short
+     if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
+   }
+ }
+
+
+ // Returns a malloc()ed copy of s with its bytes in reverse order, or
+ // NULL when s is NULL or the allocation fails.  The caller frees it.
+ char * myrevstrdup(const char * s)
+ {
+   if (!s) return NULL;
+
+   int sl = strlen(s);
+   char * d = (char *) malloc((sl + 1) * sizeof(char));
+   if (!d) return NULL;
+
+   // indexed copy: no pointer is ever moved before the start of s,
+   // which also keeps the empty string well defined
+   for (int i = 0; i < sl; i++) d[i] = s[sl - 1 - i];
+   d[sl] = '\0';
+   return d;
+ }
-// return 1 if s1 (reversed) is a leading subset of end of s2
-int isRevSubset(const char * s1, const char * end_of_s2, int len)
-{
- while( (len > 0) && *s1 && (*s1 == *end_of_s2) ) {
- s1++;
- end_of_s2--;
- len --;
- }
- return (*s1 == '\0');
-}
+ // Returns 1 when every character of s1 matches the character at the
+ // same position of s2 (s1 is a leading part of s2), otherwise 0.
+ int isSubset(const char * s1, const char * s2)
+ {
+   for (; *s1 != '\0'; s1++, s2++) {
+     if (*s1 != *s2) return 0;
+   }
+   return 1;
+ }
-// convert null terminated string to all caps using encoding
-void enmkallcap(char * d, const char * p, const char * encoding)
-{
- struct cs_info * csconv = get_current_cs(encoding);
- while (*p != '\0') {
- *d++ = csconv[((unsigned char) *p)].cupper;
- p++;
- }
- *d = '\0';
-}
+ // Compares s1, read forwards, with s2 read backwards from end_of_s2,
+ // looking at no more than len characters of s2.
+ // Returns 1 when all of s1 was matched, otherwise 0.
+ int isRevSubset(const char * s1, const char * end_of_s2, int len)
+ {
+   for (; len > 0; len--, s1++, end_of_s2--) {
+     if (*s1 == '\0') return 1;
+     if (*s1 != *end_of_s2) return 0;
+   }
+   // s2 ran out: success only if s1 ran out at the same time
+   return (*s1 == '\0');
+ }
+ // Writes to d a copy of the NUL-terminated string p in which every byte
+ // is replaced by the cupper entry of the table that get_current_cs()
+ // returns for encoding.
+ void enmkallcap(char * d, const char * p, const char * encoding)
+ {
+   struct cs_info * table = get_current_cs(encoding);
+   for (; *p != '\0'; p++, d++) {
+     *d = table[((unsigned char) *p)].cupper;
+   }
+   *d = '\0';
+ }
-// convert null terminated string to all little using encoding
-void enmkallsmall(char * d, const char * p, const char * encoding)
-{
- struct cs_info * csconv = get_current_cs(encoding);
- while (*p != '\0') {
- *d++ = csconv[((unsigned char) *p)].clower;
- p++;
- }
- *d = '\0';
-}
+ // Appends s to the end of every line of the text in dest, in place:
+ // s is inserted in front of each '\n' and once more at the very end.
+ // No bounds are checked, so dest must be large enough for the result.
+ // If the working copy of dest cannot be allocated, dest is left as is.
+ void strlinecat(char * dest, const char * s)
+ {
+   char * dup = mystrdup(dest);
+   if (!dup) return;
+
+   int len = strlen(s);
+   for (char * source = dup; *source; source++) {
+     if (*source == '\n') {
+       // s goes in without its NUL; the text continues right behind it
+       memcpy(dest, s, len);
+       dest += len;
+     }
+     *dest++ = *source;
+   }
+   strcpy(dest, s);
+   free(dup);
+ }
-// convert null terminated string to have intial capital using encoding
-void enmkinitcap(char * d, const char * p, const char * encoding)
-{
- struct cs_info * csconv = get_current_cs(encoding);
- memcpy(d,p,(strlen(p)+1));
- if (*p != '\0') *d= csconv[((unsigned char)*p)].cupper;
+// Breaks text into lines at every '\n'.
+// On success *lines points to a calloc()ed array of malloc()ed strings
+// which the caller must free, and the return value is the index of the
+// last entry (the number of line breaks consumed), so the array holds
+// one entry more than the returned number.  A newline at the very end
+// of text produces a final empty string.
+// Returns -1, with *lines set to NULL, if memory cannot be allocated.
+int line_tok(const char * text, char *** lines) {
+    int linenum = 0;
+    *lines = NULL;
+    char * dup = mystrdup(text);
+    if (!dup) return -1;
+
+    // cut the working copy at each newline
+    char * p = dup;
+    while ((p = strchr(p, '\n'))) {
+        linenum++;
+        *p = '\0';
+        p++;
+        if (*p == '\0') break;
+    }
+
+    char ** out = (char **) calloc(linenum + 1, sizeof(char *));
+    if (!out) {
+        free(dup);
+        return -1;
+    }
+
+    p = dup;
+    for (int i = 0; i < linenum + 1; i++) {
+        out[i] = mystrdup(p);
+        if (!out[i]) {
+            // hand back nothing rather than an array with holes in it
+            while (i > 0) free(out[--i]);
+            free(out);
+            free(dup);
+            return -1;
+        }
+        p += strlen(p) + 1;
+    }
+    free(dup);
+    *lines = out;
+    return linenum;
+}
+// Removes repeated lines from text in place, keeping the first occurrence
+// of each line and the original order.  Returns text.
+// If the text cannot be split into lines (line_tok() reports a failure),
+// it is returned unchanged.
+char * line_uniq(char * text) {
+    char ** lines;
+    // line_tok() returns an int and may return -1; it must not be
+    // squeezed into a char
+    int linenum = line_tok(text, &lines);
+    int i;
+    if (linenum < 0) return text;
+
+    strcpy(text, lines[0]);
+    for ( i = 1; i <= linenum; i++ ) {
+        int dup = 0;
+        for (int j = 0; j < i; j++) {
+            if (strcmp(lines[i], lines[j]) == 0) {
+                dup = 1;
+                break;
+            }
+        }
+        if (!dup) {
+            if ((i > 1) || (*(lines[0]) != '\0')) strcat(text, "\n");
+            strcat(text, lines[i]);
+        }
+    }
+    for ( i = 0; i <= linenum; i++ ) {
+        if (lines[i]) free(lines[i]);
+    }
+    free(lines);
+    return text;
+}
-// convert null terminated string to all caps
-void mkallcap(char * p, const struct cs_info * csconv)
-{
- while (*p != '\0') {
- *p = csconv[((unsigned char) *p)].cupper;
- p++;
- }
+// Replaces every '\n' in text with c, in place.  Returns text.
+char * line_join(char * text, char c) {
+    char * cursor = text;
+    while (*cursor) {
+        if (*cursor == '\n') *cursor = c;
+        cursor++;
+    }
+    return text;
+}
+// leave only last {[^}]*} substring for handling zero morphemes
+//
+// Compacts morphout in place and returns it: p is the read position and
+// q the write position.  The braces themselves are never copied.  Text
+// that was copied since the most recent '{' is thrown away again when an
+// ordinary character follows the matching '}'.
+// NOTE(review): the loop stops once p reaches the last character, so the
+// final character of the input is never copied - presumably the input
+// always ends in a character that may be dropped; confirm with callers.
+char * delete_zeros(char * morphout) {
+ char * p = morphout;
+ char * q = p;
+ char * q2 = NULL;
+ int suffix = 0;
+ 
+ for (;*p && *(p+1);) {
+ switch (*p) {
+ case '{':
+ // remember where this group starts in the output; the q-- cancels
+ // the q++ at the bottom of the loop, so nothing is written
+ q2 = q;
+ q--;
+ break;
+ case '}':
+ // only meaningful after a '{'; otherwise q still advances below
+ // without anything having been written at the old position
+ if (q2) {
+ suffix = 1;
+ q--;
+ }
+ break;
+ default:
+ // first ordinary character after a closed group: rewind the
+ // output to the start of that group, discarding its content
+ if (suffix) {
+ q = q2;
+ }
+ suffix = 0;
+ *q = *p;
+ }
+ p++;
+ q++;
+ }
+ *q = '\0';
+ return morphout;
+}
-// convert null terminated string to all little
-void mkallsmall(char * p, const struct cs_info * csconv)
-{
- while (*p != '\0') {
- *p = csconv[((unsigned char) *p)].clower;
- p++;
- }
+// Replaces the first occurrence of pat in word with rep, in place, and
+// returns word.  word is returned untouched when pat does not occur.
+// No bounds are checked: when rep is longer than pat the buffer behind
+// word must have room for the longer result.
+char * mystrrep(char * word, const char * pat, const char * rep) {
+    char * hit = strstr(word, pat);
+    if (!hit) return word;
+
+    const int replen = strlen(rep);
+    const int patlen = strlen(pat);
+    if (replen != patlen) {
+        // slide the rest of the word, terminating NUL included, so that
+        // exactly replen bytes are left for the replacement
+        char * tail = hit + patlen;
+        memmove(hit + replen, tail, strlen(tail) + 1);
+    }
+    memcpy(hit, rep, replen);
+    return word;
+}
+ // Writes to d a copy of the NUL-terminated string p in which every byte
+ // is replaced by the clower entry of the table that get_current_cs()
+ // returns for encoding.
+ void enmkallsmall(char * d, const char * p, const char * encoding)
+ {
+   struct cs_info * table = get_current_cs(encoding);
+   for (; *p != '\0'; p++, d++) {
+     *d = table[((unsigned char) *p)].clower;
+   }
+   *d = '\0';
+ }
+ // Copies the NUL-terminated string p to d and replaces the first byte
+ // of the copy by its cupper entry in the table that get_current_cs()
+ // returns for encoding.  An empty string is copied as is.
+ void enmkinitcap(char * d, const char * p, const char * encoding)
+ {
+   struct cs_info * table = get_current_cs(encoding);
+   const size_t bytes = strlen(p) + 1;   // include the terminating NUL
+   memcpy(d, p, bytes);
+   if (*p == '\0') return;
+   *d = table[((unsigned char) *p)].cupper;
+ }
+
+
+ // Replaces, in place, every byte of the NUL-terminated string p by its
+ // cupper entry in csconv.
+ void mkallcap(char * p, const struct cs_info * csconv)
+ {
+   for (; *p != '\0'; p++) {
+     *p = csconv[((unsigned char) *p)].cupper;
+   }
+ }
+
+
+ // Replaces, in place, every byte of the NUL-terminated string p by its
+ // clower entry in csconv.
+ void mkallsmall(char * p, const struct cs_info * csconv)
+ {
+   for (; *p != '\0'; p++) {
+     *p = csconv[((unsigned char) *p)].clower;
+   }
+ }
-// convert null terminated string to have intial capital
-void mkinitcap(char * p, const struct cs_info * csconv)
-{
- if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
+// Replaces, in place, each of the nc characters of u by the clower value
+// that utfconv holds for it; characters that already equal their clower
+// value are not written.
+void mkallsmall_utf(w_char * u, int nc, struct unicode_info2 * utfconv) {
+    for (int i = 0; i < nc; i++) {
+        w_char & ch = u[i];
+        const unsigned short code = (ch.h << 8) + ch.l;
+        const struct unicode_info2 & entry = utfconv[code];
+        if (code == entry.clower) continue;
+        ch.h = (unsigned char) (entry.clower >> 8);
+        ch.l = (unsigned char) (entry.clower & 0x00FF);
+    }
+}
+
+ // Replaces, in place, the first byte of the NUL-terminated string p by
+ // its cupper entry in csconv.  An empty string is left alone.
+ void mkinitcap(char * p, const struct cs_info * csconv)
+ {
+   if (*p == '\0') return;
+   *p = csconv[((unsigned char) *p)].cupper;
+ }
+ // Reverses the bytes of the NUL-terminated string word in place.
+ void reverseword(char * word) {
+   int len = strlen(word);
+   // nothing to swap; this also avoids forming word - 1 for ""
+   if (len < 2) return;
+
+   char r;
+   for (char * dest = word + len - 1; word < dest; word++, dest--) {
+     r = *word;
+     *word = *dest;
+     *dest = r;
+   }
+ }
-
+ // Reverses the characters of the UTF-8 string word in place.
+ // At most MAXWORDLEN characters are converted; anything beyond that is
+ // dropped.  The result never takes more bytes than word did before the
+ // call: byte sequences that u8_u16() rejects come back as a longer
+ // replacement character, so in that case the output is cut off at the
+ // original length instead of running past the caller's buffer.
+ void reverseword_utf(char * word) {
+   w_char w[MAXWORDLEN];
+   w_char r;
+   // bytes the caller's buffer is known to hold, not counting the NUL
+   int bytes = strlen(word);
+   int l = u8_u16(w, MAXWORDLEN, word);
+   for (int i = 0, j = l - 1; i < j; i++, j--) {
+     r = w[i];
+     w[i] = w[j];
+     w[j] = r;
+   }
+   // u16_u8() writes at most `bytes` bytes and then the terminating NUL
+   u16_u8(word, bytes, w, l);
+ }
// these are simple character mappings for the
// encodings supported
@@ -3029,7 +3348,7 @@ struct cs_info koi8r_tbl[] = {
{ 0x01, 0xdf, 0xff },
};
-struct cs_info cp1251_tbl[] = {
+struct cs_info koi8u_tbl[] = {
{ 0x00, 0x00, 0x00 },
{ 0x00, 0x01, 0x01 },
{ 0x00, 0x02, 0x02 },
@@ -3193,27 +3512,27 @@ struct cs_info cp1251_tbl[] = {
{ 0x00, 0xa0, 0xa0 },
{ 0x00, 0xa1, 0xa1 },
{ 0x00, 0xa2, 0xa2 },
-{ 0x00, 0xa3, 0xa3 },
-{ 0x00, 0xa4, 0xa4 },
+{ 0x00, 0xa3, 0xb3 },
+{ 0x00, 0xa4, 0xb4 }, /* ie */
{ 0x00, 0xa5, 0xa5 },
-{ 0x00, 0xa6, 0xa6 },
-{ 0x00, 0xa7, 0xa7 },
+{ 0x00, 0xa6, 0xb6 }, /* i */
+{ 0x00, 0xa7, 0xb7 }, /* ii */
{ 0x00, 0xa8, 0xa8 },
{ 0x00, 0xa9, 0xa9 },
{ 0x00, 0xaa, 0xaa },
{ 0x00, 0xab, 0xab },
{ 0x00, 0xac, 0xac },
-{ 0x00, 0xad, 0xad },
+{ 0x00, 0xad, 0xbd }, /* g'' */
{ 0x00, 0xae, 0xae },
{ 0x00, 0xaf, 0xaf },
{ 0x00, 0xb0, 0xb0 },
{ 0x00, 0xb1, 0xb1 },
{ 0x00, 0xb2, 0xb2 },
-{ 0x00, 0xb3, 0xb3 },
-{ 0x00, 0xb4, 0xb4 },
+{ 0x01, 0xa3, 0xb3 },
+{ 0x00, 0xb4, 0xb4 }, /* IE */
{ 0x00, 0xb5, 0xb5 },
-{ 0x00, 0xb6, 0xb6 },
-{ 0x00, 0xb7, 0xb7 },
+{ 0x00, 0xb6, 0xb6 }, /* I */
+{ 0x00, 0xb7, 0xb7 }, /* II */
{ 0x00, 0xb8, 0xb8 },
{ 0x00, 0xb9, 0xb9 },
{ 0x00, 0xba, 0xba },
@@ -3222,72 +3541,591 @@ struct cs_info cp1251_tbl[] = {
{ 0x00, 0xbd, 0xbd },
{ 0x00, 0xbe, 0xbe },
{ 0x00, 0xbf, 0xbf },
-{ 0x00, 0xc0, 0xc0 },
-{ 0x00, 0xc1, 0xc1 },
-{ 0x00, 0xc2, 0xc2 },
-{ 0x00, 0xc3, 0xc3 },
-{ 0x00, 0xc4, 0xc4 },
-{ 0x00, 0xc5, 0xc5 },
-{ 0x00, 0xc6, 0xc6 },
-{ 0x00, 0xc7, 0xc7 },
-{ 0x00, 0xc8, 0xc8 },
-{ 0x00, 0xc9, 0xc9 },
-{ 0x00, 0xca, 0xca },
-{ 0x00, 0xcb, 0xcb },
-{ 0x00, 0xcc, 0xcc },
-{ 0x00, 0xcd, 0xcd },
-{ 0x00, 0xce, 0xce },
-{ 0x00, 0xcf, 0xcf },
-{ 0x00, 0xd0, 0xd0 },
-{ 0x00, 0xd1, 0xd1 },
-{ 0x00, 0xd2, 0xd2 },
-{ 0x00, 0xd3, 0xd3 },
-{ 0x00, 0xd4, 0xd4 },
-{ 0x00, 0xd5, 0xd5 },
-{ 0x00, 0xd6, 0xd6 },
-{ 0x00, 0xd7, 0xd7 },
-{ 0x00, 0xd8, 0xd8 },
-{ 0x00, 0xd9, 0xd9 },
-{ 0x00, 0xda, 0xda },
-{ 0x00, 0xdb, 0xdb },
-{ 0x00, 0xdc, 0xdc },
-{ 0x00, 0xdd, 0xdd },
-{ 0x00, 0xde, 0xde },
-{ 0x00, 0xdf, 0xdf },
-{ 0x00, 0xe0, 0xe0 },
-{ 0x00, 0xe1, 0xe1 },
-{ 0x00, 0xe2, 0xe2 },
-{ 0x00, 0xe3, 0xe3 },
-{ 0x00, 0xe4, 0xe4 },
-{ 0x00, 0xe5, 0xe5 },
-{ 0x00, 0xe6, 0xe6 },
-{ 0x00, 0xe7, 0xe7 },
-{ 0x00, 0xe8, 0xe8 },
-{ 0x00, 0xe9, 0xe9 },
-{ 0x00, 0xea, 0xea },
-{ 0x00, 0xeb, 0xeb },
-{ 0x00, 0xec, 0xec },
-{ 0x00, 0xed, 0xed },
-{ 0x00, 0xee, 0xee },
-{ 0x00, 0xef, 0xef },
-{ 0x00, 0xf0, 0xf0 },
-{ 0x00, 0xf1, 0xf1 },
-{ 0x00, 0xf2, 0xf2 },
-{ 0x00, 0xf3, 0xf3 },
-{ 0x00, 0xf4, 0xf4 },
-{ 0x00, 0xf5, 0xf5 },
-{ 0x00, 0xf6, 0xf6 },
-{ 0x00, 0xf7, 0xf7 },
-{ 0x00, 0xf8, 0xf8 },
-{ 0x00, 0xf9, 0xf9 },
-{ 0x00, 0xfa, 0xfa },
-{ 0x00, 0xfb, 0xfb },
-{ 0x00, 0xfc, 0xfc },
-{ 0x00, 0xfd, 0xfd },
-{ 0x00, 0xfe, 0xfe },
-{ 0x00, 0xff, 0xff },
+{ 0x00, 0xc0, 0xe0 },
+{ 0x00, 0xc1, 0xe1 },
+{ 0x00, 0xc2, 0xe2 },
+{ 0x00, 0xc3, 0xe3 },
+{ 0x00, 0xc4, 0xe4 },
+{ 0x00, 0xc5, 0xe5 },
+{ 0x00, 0xc6, 0xe6 },
+{ 0x00, 0xc7, 0xe7 },
+{ 0x00, 0xc8, 0xe8 },
+{ 0x00, 0xc9, 0xe9 },
+{ 0x00, 0xca, 0xea },
+{ 0x00, 0xcb, 0xeb },
+{ 0x00, 0xcc, 0xec },
+{ 0x00, 0xcd, 0xed },
+{ 0x00, 0xce, 0xee },
+{ 0x00, 0xcf, 0xef },
+{ 0x00, 0xd0, 0xf0 },
+{ 0x00, 0xd1, 0xf1 },
+{ 0x00, 0xd2, 0xf2 },
+{ 0x00, 0xd3, 0xf3 },
+{ 0x00, 0xd4, 0xf4 },
+{ 0x00, 0xd5, 0xf5 },
+{ 0x00, 0xd6, 0xf6 },
+{ 0x00, 0xd7, 0xf7 },
+{ 0x00, 0xd8, 0xf8 },
+{ 0x00, 0xd9, 0xf9 },
+{ 0x00, 0xda, 0xfa },
+{ 0x00, 0xdb, 0xfb },
+{ 0x00, 0xdc, 0xfc },
+{ 0x00, 0xdd, 0xfd },
+{ 0x00, 0xde, 0xfe },
+{ 0x00, 0xdf, 0xff },
+{ 0x01, 0xc0, 0xe0 },
+{ 0x01, 0xc1, 0xe1 },
+{ 0x01, 0xc2, 0xe2 },
+{ 0x01, 0xc3, 0xe3 },
+{ 0x01, 0xc4, 0xe4 },
+{ 0x01, 0xc5, 0xe5 },
+{ 0x01, 0xc6, 0xe6 },
+{ 0x01, 0xc7, 0xe7 },
+{ 0x01, 0xc8, 0xe8 },
+{ 0x01, 0xc9, 0xe9 },
+{ 0x01, 0xca, 0xea },
+{ 0x01, 0xcb, 0xeb },
+{ 0x01, 0xcc, 0xec },
+{ 0x01, 0xcd, 0xed },
+{ 0x01, 0xce, 0xee },
+{ 0x01, 0xcf, 0xef },
+{ 0x01, 0xd0, 0xf0 },
+{ 0x01, 0xd1, 0xf1 },
+{ 0x01, 0xd2, 0xf2 },
+{ 0x01, 0xd3, 0xf3 },
+{ 0x01, 0xd4, 0xf4 },
+{ 0x01, 0xd5, 0xf5 },
+{ 0x01, 0xd6, 0xf6 },
+{ 0x01, 0xd7, 0xf7 },
+{ 0x01, 0xd8, 0xf8 },
+{ 0x01, 0xd9, 0xf9 },
+{ 0x01, 0xda, 0xfa },
+{ 0x01, 0xdb, 0xfb },
+{ 0x01, 0xdc, 0xfc },
+{ 0x01, 0xdd, 0xfd },
+{ 0x01, 0xde, 0xfe },
+{ 0x01, 0xdf, 0xff },
+};
+
+struct cs_info cp1251_tbl[] = {
+{ 0x00, 0x00, 0x00 },
+{ 0x00, 0x01, 0x01 },
+{ 0x00, 0x02, 0x02 },
+{ 0x00, 0x03, 0x03 },
+{ 0x00, 0x04, 0x04 },
+{ 0x00, 0x05, 0x05 },
+{ 0x00, 0x06, 0x06 },
+{ 0x00, 0x07, 0x07 },
+{ 0x00, 0x08, 0x08 },
+{ 0x00, 0x09, 0x09 },
+{ 0x00, 0x0a, 0x0a },
+{ 0x00, 0x0b, 0x0b },
+{ 0x00, 0x0c, 0x0c },
+{ 0x00, 0x0d, 0x0d },
+{ 0x00, 0x0e, 0x0e },
+{ 0x00, 0x0f, 0x0f },
+{ 0x00, 0x10, 0x10 },
+{ 0x00, 0x11, 0x11 },
+{ 0x00, 0x12, 0x12 },
+{ 0x00, 0x13, 0x13 },
+{ 0x00, 0x14, 0x14 },
+{ 0x00, 0x15, 0x15 },
+{ 0x00, 0x16, 0x16 },
+{ 0x00, 0x17, 0x17 },
+{ 0x00, 0x18, 0x18 },
+{ 0x00, 0x19, 0x19 },
+{ 0x00, 0x1a, 0x1a },
+{ 0x00, 0x1b, 0x1b },
+{ 0x00, 0x1c, 0x1c },
+{ 0x00, 0x1d, 0x1d },
+{ 0x00, 0x1e, 0x1e },
+{ 0x00, 0x1f, 0x1f },
+{ 0x00, 0x20, 0x20 },
+{ 0x00, 0x21, 0x21 },
+{ 0x00, 0x22, 0x22 },
+{ 0x00, 0x23, 0x23 },
+{ 0x00, 0x24, 0x24 },
+{ 0x00, 0x25, 0x25 },
+{ 0x00, 0x26, 0x26 },
+{ 0x00, 0x27, 0x27 },
+{ 0x00, 0x28, 0x28 },
+{ 0x00, 0x29, 0x29 },
+{ 0x00, 0x2a, 0x2a },
+{ 0x00, 0x2b, 0x2b },
+{ 0x00, 0x2c, 0x2c },
+{ 0x00, 0x2d, 0x2d },
+{ 0x00, 0x2e, 0x2e },
+{ 0x00, 0x2f, 0x2f },
+{ 0x00, 0x30, 0x30 },
+{ 0x00, 0x31, 0x31 },
+{ 0x00, 0x32, 0x32 },
+{ 0x00, 0x33, 0x33 },
+{ 0x00, 0x34, 0x34 },
+{ 0x00, 0x35, 0x35 },
+{ 0x00, 0x36, 0x36 },
+{ 0x00, 0x37, 0x37 },
+{ 0x00, 0x38, 0x38 },
+{ 0x00, 0x39, 0x39 },
+{ 0x00, 0x3a, 0x3a },
+{ 0x00, 0x3b, 0x3b },
+{ 0x00, 0x3c, 0x3c },
+{ 0x00, 0x3d, 0x3d },
+{ 0x00, 0x3e, 0x3e },
+{ 0x00, 0x3f, 0x3f },
+{ 0x00, 0x40, 0x40 },
+{ 0x01, 0x61, 0x41 },
+{ 0x01, 0x62, 0x42 },
+{ 0x01, 0x63, 0x43 },
+{ 0x01, 0x64, 0x44 },
+{ 0x01, 0x65, 0x45 },
+{ 0x01, 0x66, 0x46 },
+{ 0x01, 0x67, 0x47 },
+{ 0x01, 0x68, 0x48 },
+{ 0x01, 0x69, 0x49 },
+{ 0x01, 0x6a, 0x4a },
+{ 0x01, 0x6b, 0x4b },
+{ 0x01, 0x6c, 0x4c },
+{ 0x01, 0x6d, 0x4d },
+{ 0x01, 0x6e, 0x4e },
+{ 0x01, 0x6f, 0x4f },
+{ 0x01, 0x70, 0x50 },
+{ 0x01, 0x71, 0x51 },
+{ 0x01, 0x72, 0x52 },
+{ 0x01, 0x73, 0x53 },
+{ 0x01, 0x74, 0x54 },
+{ 0x01, 0x75, 0x55 },
+{ 0x01, 0x76, 0x56 },
+{ 0x01, 0x77, 0x57 },
+{ 0x01, 0x78, 0x58 },
+{ 0x01, 0x79, 0x59 },
+{ 0x01, 0x7a, 0x5a },
+{ 0x00, 0x5b, 0x5b },
+{ 0x00, 0x5c, 0x5c },
+{ 0x00, 0x5d, 0x5d },
+{ 0x00, 0x5e, 0x5e },
+{ 0x00, 0x5f, 0x5f },
+{ 0x00, 0x60, 0x60 },
+{ 0x00, 0x61, 0x41 },
+{ 0x00, 0x62, 0x42 },
+{ 0x00, 0x63, 0x43 },
+{ 0x00, 0x64, 0x44 },
+{ 0x00, 0x65, 0x45 },
+{ 0x00, 0x66, 0x46 },
+{ 0x00, 0x67, 0x47 },
+{ 0x00, 0x68, 0x48 },
+{ 0x00, 0x69, 0x49 },
+{ 0x00, 0x6a, 0x4a },
+{ 0x00, 0x6b, 0x4b },
+{ 0x00, 0x6c, 0x4c },
+{ 0x00, 0x6d, 0x4d },
+{ 0x00, 0x6e, 0x4e },
+{ 0x00, 0x6f, 0x4f },
+{ 0x00, 0x70, 0x50 },
+{ 0x00, 0x71, 0x51 },
+{ 0x00, 0x72, 0x52 },
+{ 0x00, 0x73, 0x53 },
+{ 0x00, 0x74, 0x54 },
+{ 0x00, 0x75, 0x55 },
+{ 0x00, 0x76, 0x56 },
+{ 0x00, 0x77, 0x57 },
+{ 0x00, 0x78, 0x58 },
+{ 0x00, 0x79, 0x59 },
+{ 0x00, 0x7a, 0x5a },
+{ 0x00, 0x7b, 0x7b },
+{ 0x00, 0x7c, 0x7c },
+{ 0x00, 0x7d, 0x7d },
+{ 0x00, 0x7e, 0x7e },
+{ 0x00, 0x7f, 0x7f },
+{ 0x01, 0x90, 0x80 },
+{ 0x01, 0x83, 0x81 },
+{ 0x00, 0x82, 0x82 },
+{ 0x00, 0x83, 0x81 },
+{ 0x00, 0x84, 0x84 },
+{ 0x00, 0x85, 0x85 },
+{ 0x00, 0x86, 0x86 },
+{ 0x00, 0x87, 0x87 },
+{ 0x00, 0x88, 0x88 },
+{ 0x00, 0x89, 0x89 },
+{ 0x01, 0x9a, 0x8a },
+{ 0x00, 0x8b, 0x8b },
+{ 0x01, 0x9c, 0x8c },
+{ 0x01, 0x9d, 0x8d },
+{ 0x01, 0x9e, 0x8e },
+{ 0x01, 0x9f, 0x8f },
+{ 0x00, 0x90, 0x80 },
+{ 0x00, 0x91, 0x91 },
+{ 0x00, 0x92, 0x92 },
+{ 0x00, 0x93, 0x93 },
+{ 0x00, 0x94, 0x94 },
+{ 0x00, 0x95, 0x95 },
+{ 0x00, 0x96, 0x96 },
+{ 0x00, 0x97, 0x97 },
+{ 0x00, 0x98, 0x98 },
+{ 0x00, 0x99, 0x99 },
+{ 0x00, 0x9a, 0x8a },
+{ 0x00, 0x9b, 0x9b },
+{ 0x00, 0x9c, 0x8c },
+{ 0x00, 0x9d, 0x8d },
+{ 0x00, 0x9e, 0x8e },
+{ 0x00, 0x9f, 0x8f },
+{ 0x00, 0xa0, 0xa0 },
+{ 0x01, 0xa2, 0xa1 },
+{ 0x00, 0xa2, 0xa1 },
+{ 0x01, 0xbc, 0xa3 },
+{ 0x00, 0xa4, 0xa4 },
+{ 0x01, 0xb4, 0xa5 },
+{ 0x00, 0xa6, 0xa6 },
+{ 0x00, 0xa7, 0xa7 },
+{ 0x01, 0xb8, 0xa8 },
+{ 0x00, 0xa9, 0xa9 },
+{ 0x01, 0xba, 0xaa },
+{ 0x00, 0xab, 0xab },
+{ 0x00, 0xac, 0xac },
+{ 0x00, 0xad, 0xad },
+{ 0x00, 0xae, 0xae },
+{ 0x01, 0xbf, 0xaf },
+{ 0x00, 0xb0, 0xb0 },
+{ 0x00, 0xb1, 0xb1 },
+{ 0x01, 0xb3, 0xb2 },
+{ 0x00, 0xb3, 0xb2 },
+{ 0x00, 0xb4, 0xa5 },
+{ 0x00, 0xb5, 0xb5 },
+{ 0x00, 0xb6, 0xb6 },
+{ 0x00, 0xb7, 0xb7 },
+{ 0x00, 0xb8, 0xa8 },
+{ 0x00, 0xb9, 0xb9 },
+{ 0x00, 0xba, 0xaa },
+{ 0x00, 0xbb, 0xbb },
+{ 0x00, 0xbc, 0xa3 },
+{ 0x01, 0xbe, 0xbd },
+{ 0x00, 0xbe, 0xbd },
+{ 0x00, 0xbf, 0xaf },
+{ 0x01, 0xe0, 0xc0 },
+{ 0x01, 0xe1, 0xc1 },
+{ 0x01, 0xe2, 0xc2 },
+{ 0x01, 0xe3, 0xc3 },
+{ 0x01, 0xe4, 0xc4 },
+{ 0x01, 0xe5, 0xc5 },
+{ 0x01, 0xe6, 0xc6 },
+{ 0x01, 0xe7, 0xc7 },
+{ 0x01, 0xe8, 0xc8 },
+{ 0x01, 0xe9, 0xc9 },
+{ 0x01, 0xea, 0xca },
+{ 0x01, 0xeb, 0xcb },
+{ 0x01, 0xec, 0xcc },
+{ 0x01, 0xed, 0xcd },
+{ 0x01, 0xee, 0xce },
+{ 0x01, 0xef, 0xcf },
+{ 0x01, 0xf0, 0xd0 },
+{ 0x01, 0xf1, 0xd1 },
+{ 0x01, 0xf2, 0xd2 },
+{ 0x01, 0xf3, 0xd3 },
+{ 0x01, 0xf4, 0xd4 },
+{ 0x01, 0xf5, 0xd5 },
+{ 0x01, 0xf6, 0xd6 },
+{ 0x01, 0xf7, 0xd7 },
+{ 0x01, 0xf8, 0xd8 },
+{ 0x01, 0xf9, 0xd9 },
+{ 0x01, 0xfa, 0xda },
+{ 0x01, 0xfb, 0xdb },
+{ 0x01, 0xfc, 0xdc },
+{ 0x01, 0xfd, 0xdd },
+{ 0x01, 0xfe, 0xde },
+{ 0x01, 0xff, 0xdf },
+{ 0x00, 0xe0, 0xc0 },
+{ 0x00, 0xe1, 0xc1 },
+{ 0x00, 0xe2, 0xc2 },
+{ 0x00, 0xe3, 0xc3 },
+{ 0x00, 0xe4, 0xc4 },
+{ 0x00, 0xe5, 0xc5 },
+{ 0x00, 0xe6, 0xc6 },
+{ 0x00, 0xe7, 0xc7 },
+{ 0x00, 0xe8, 0xc8 },
+{ 0x00, 0xe9, 0xc9 },
+{ 0x00, 0xea, 0xca },
+{ 0x00, 0xeb, 0xcb },
+{ 0x00, 0xec, 0xcc },
+{ 0x00, 0xed, 0xcd },
+{ 0x00, 0xee, 0xce },
+{ 0x00, 0xef, 0xcf },
+{ 0x00, 0xf0, 0xd0 },
+{ 0x00, 0xf1, 0xd1 },
+{ 0x00, 0xf2, 0xd2 },
+{ 0x00, 0xf3, 0xd3 },
+{ 0x00, 0xf4, 0xd4 },
+{ 0x00, 0xf5, 0xd5 },
+{ 0x00, 0xf6, 0xd6 },
+{ 0x00, 0xf7, 0xd7 },
+{ 0x00, 0xf8, 0xd8 },
+{ 0x00, 0xf9, 0xd9 },
+{ 0x00, 0xfa, 0xda },
+{ 0x00, 0xfb, 0xdb },
+{ 0x00, 0xfc, 0xdc },
+{ 0x00, 0xfd, 0xdd },
+{ 0x00, 0xfe, 0xde },
+{ 0x00, 0xff, 0xdf },
};
+struct cs_info iso13_tbl[] = {
+{ 0x00, 0x00, 0x00 },
+{ 0x00, 0x01, 0x01 },
+{ 0x00, 0x02, 0x02 },
+{ 0x00, 0x03, 0x03 },
+{ 0x00, 0x04, 0x04 },
+{ 0x00, 0x05, 0x05 },
+{ 0x00, 0x06, 0x06 },
+{ 0x00, 0x07, 0x07 },
+{ 0x00, 0x08, 0x08 },
+{ 0x00, 0x09, 0x09 },
+{ 0x00, 0x0A, 0x0A },
+{ 0x00, 0x0B, 0x0B },
+{ 0x00, 0x0C, 0x0C },
+{ 0x00, 0x0D, 0x0D },
+{ 0x00, 0x0E, 0x0E },
+{ 0x00, 0x0F, 0x0F },
+{ 0x00, 0x10, 0x10 },
+{ 0x00, 0x11, 0x11 },
+{ 0x00, 0x12, 0x12 },
+{ 0x00, 0x13, 0x13 },
+{ 0x00, 0x14, 0x14 },
+{ 0x00, 0x15, 0x15 },
+{ 0x00, 0x16, 0x16 },
+{ 0x00, 0x17, 0x17 },
+{ 0x00, 0x18, 0x18 },
+{ 0x00, 0x19, 0x19 },
+{ 0x00, 0x1A, 0x1A },
+{ 0x00, 0x1B, 0x1B },
+{ 0x00, 0x1C, 0x1C },
+{ 0x00, 0x1D, 0x1D },
+{ 0x00, 0x1E, 0x1E },
+{ 0x00, 0x1F, 0x1F },
+{ 0x00, 0x20, 0x20 },
+{ 0x00, 0x21, 0x21 },
+{ 0x00, 0x22, 0x22 },
+{ 0x00, 0x23, 0x23 },
+{ 0x00, 0x24, 0x24 },
+{ 0x00, 0x25, 0x25 },
+{ 0x00, 0x26, 0x26 },
+{ 0x00, 0x27, 0x27 },
+{ 0x00, 0x28, 0x28 },
+{ 0x00, 0x29, 0x29 },
+{ 0x00, 0x2A, 0x2A },
+{ 0x00, 0x2B, 0x2B },
+{ 0x00, 0x2C, 0x2C },
+{ 0x00, 0x2D, 0x2D },
+{ 0x00, 0x2E, 0x2E },
+{ 0x00, 0x2F, 0x2F },
+{ 0x00, 0x30, 0x30 },
+{ 0x00, 0x31, 0x31 },
+{ 0x00, 0x32, 0x32 },
+{ 0x00, 0x33, 0x33 },
+{ 0x00, 0x34, 0x34 },
+{ 0x00, 0x35, 0x35 },
+{ 0x00, 0x36, 0x36 },
+{ 0x00, 0x37, 0x37 },
+{ 0x00, 0x38, 0x38 },
+{ 0x00, 0x39, 0x39 },
+{ 0x00, 0x3A, 0x3A },
+{ 0x00, 0x3B, 0x3B },
+{ 0x00, 0x3C, 0x3C },
+{ 0x00, 0x3D, 0x3D },
+{ 0x00, 0x3E, 0x3E },
+{ 0x00, 0x3F, 0x3F },
+{ 0x00, 0x40, 0x40 },
+{ 0x01, 0x61, 0x41 },
+{ 0x01, 0x62, 0x42 },
+{ 0x01, 0x63, 0x43 },
+{ 0x01, 0x64, 0x44 },
+{ 0x01, 0x65, 0x45 },
+{ 0x01, 0x66, 0x46 },
+{ 0x01, 0x67, 0x47 },
+{ 0x01, 0x68, 0x48 },
+{ 0x01, 0x69, 0x49 },
+{ 0x01, 0x6A, 0x4A },
+{ 0x01, 0x6B, 0x4B },
+{ 0x01, 0x6C, 0x4C },
+{ 0x01, 0x6D, 0x4D },
+{ 0x01, 0x6E, 0x4E },
+{ 0x01, 0x6F, 0x4F },
+{ 0x01, 0x70, 0x50 },
+{ 0x01, 0x71, 0x51 },
+{ 0x01, 0x72, 0x52 },
+{ 0x01, 0x73, 0x53 },
+{ 0x01, 0x74, 0x54 },
+{ 0x01, 0x75, 0x55 },
+{ 0x01, 0x76, 0x56 },
+{ 0x01, 0x77, 0x57 },
+{ 0x01, 0x78, 0x58 },
+{ 0x01, 0x79, 0x59 },
+{ 0x01, 0x7A, 0x5A },
+{ 0x00, 0x5B, 0x5B },
+{ 0x00, 0x5C, 0x5C },
+{ 0x00, 0x5D, 0x5D },
+{ 0x00, 0x5E, 0x5E },
+{ 0x00, 0x5F, 0x5F },
+{ 0x00, 0x60, 0x60 },
+{ 0x00, 0x61, 0x41 },
+{ 0x00, 0x62, 0x42 },
+{ 0x00, 0x63, 0x43 },
+{ 0x00, 0x64, 0x44 },
+{ 0x00, 0x65, 0x45 },
+{ 0x00, 0x66, 0x46 },
+{ 0x00, 0x67, 0x47 },
+{ 0x00, 0x68, 0x48 },
+{ 0x00, 0x69, 0x49 },
+{ 0x00, 0x6A, 0x4A },
+{ 0x00, 0x6B, 0x4B },
+{ 0x00, 0x6C, 0x4C },
+{ 0x00, 0x6D, 0x4D },
+{ 0x00, 0x6E, 0x4E },
+{ 0x00, 0x6F, 0x4F },
+{ 0x00, 0x70, 0x50 },
+{ 0x00, 0x71, 0x51 },
+{ 0x00, 0x72, 0x52 },
+{ 0x00, 0x73, 0x53 },
+{ 0x00, 0x74, 0x54 },
+{ 0x00, 0x75, 0x55 },
+{ 0x00, 0x76, 0x56 },
+{ 0x00, 0x77, 0x57 },
+{ 0x00, 0x78, 0x58 },
+{ 0x00, 0x79, 0x59 },
+{ 0x00, 0x7A, 0x5A },
+{ 0x00, 0x7B, 0x7B },
+{ 0x00, 0x7C, 0x7C },
+{ 0x00, 0x7D, 0x7D },
+{ 0x00, 0x7E, 0x7E },
+{ 0x00, 0x7F, 0x7F },
+{ 0x00, 0x80, 0x80 },
+{ 0x00, 0x81, 0x81 },
+{ 0x00, 0x82, 0x82 },
+{ 0x00, 0x83, 0x83 },
+{ 0x00, 0x84, 0x84 },
+{ 0x00, 0x85, 0x85 },
+{ 0x00, 0x86, 0x86 },
+{ 0x00, 0x87, 0x87 },
+{ 0x00, 0x88, 0x88 },
+{ 0x00, 0x89, 0x89 },
+{ 0x00, 0x8A, 0x8A },
+{ 0x00, 0x8B, 0x8B },
+{ 0x00, 0x8C, 0x8C },
+{ 0x00, 0x8D, 0x8D },
+{ 0x00, 0x8E, 0x8E },
+{ 0x00, 0x8F, 0x8F },
+{ 0x00, 0x90, 0x90 },
+{ 0x00, 0x91, 0x91 },
+{ 0x00, 0x92, 0x92 },
+{ 0x00, 0x93, 0x93 },
+{ 0x00, 0x94, 0x94 },
+{ 0x00, 0x95, 0x95 },
+{ 0x00, 0x96, 0x96 },
+{ 0x00, 0x97, 0x97 },
+{ 0x00, 0x98, 0x98 },
+{ 0x00, 0x99, 0x99 },
+{ 0x00, 0x9A, 0x9A },
+{ 0x00, 0x9B, 0x9B },
+{ 0x00, 0x9C, 0x9C },
+{ 0x00, 0x9D, 0x9D },
+{ 0x00, 0x9E, 0x9E },
+{ 0x00, 0x9F, 0x9F },
+{ 0x00, 0xA0, 0xA0 },
+{ 0x00, 0xA1, 0xA1 },
+{ 0x00, 0xA2, 0xA2 },
+{ 0x00, 0xA3, 0xA3 },
+{ 0x00, 0xA4, 0xA4 },
+{ 0x00, 0xA5, 0xA5 },
+{ 0x00, 0xA6, 0xA6 },
+{ 0x00, 0xA7, 0xA7 },
+{ 0x01, 0xB8, 0xA8 },
+{ 0x00, 0xA9, 0xA9 },
+{ 0x01, 0xBA, 0xAA },
+{ 0x00, 0xAB, 0xAB },
+{ 0x00, 0xAC, 0xAC },
+{ 0x00, 0xAD, 0xAD },
+{ 0x00, 0xAE, 0xAE },
+{ 0x01, 0xBF, 0xAF },
+{ 0x00, 0xB0, 0xB0 },
+{ 0x00, 0xB1, 0xB1 },
+{ 0x00, 0xB2, 0xB2 },
+{ 0x00, 0xB3, 0xB3 },
+{ 0x00, 0xB4, 0xB4 },
+{ 0x00, 0xB5, 0xB5 },
+{ 0x00, 0xB6, 0xB6 },
+{ 0x00, 0xB7, 0xB7 },
+{ 0x00, 0xB8, 0xA8 },
+{ 0x00, 0xB9, 0xB9 },
+{ 0x00, 0xBA, 0xAA },
+{ 0x00, 0xBB, 0xBB },
+{ 0x00, 0xBC, 0xBC },
+{ 0x00, 0xBD, 0xBD },
+{ 0x00, 0xBE, 0xBE },
+{ 0x00, 0xBF, 0xAF },
+{ 0x01, 0xE0, 0xC0 },
+{ 0x01, 0xE1, 0xC1 },
+{ 0x01, 0xE2, 0xC2 },
+{ 0x01, 0xE3, 0xC3 },
+{ 0x01, 0xE4, 0xC4 },
+{ 0x01, 0xE5, 0xC5 },
+{ 0x01, 0xE6, 0xC6 },
+{ 0x01, 0xE7, 0xC7 },
+{ 0x01, 0xE8, 0xC8 },
+{ 0x01, 0xE9, 0xC9 },
+{ 0x01, 0xEA, 0xCA },
+{ 0x01, 0xEB, 0xCB },
+{ 0x01, 0xEC, 0xCC },
+{ 0x01, 0xED, 0xCD },
+{ 0x01, 0xEE, 0xCE },
+{ 0x01, 0xEF, 0xCF },
+{ 0x01, 0xF0, 0xD0 },
+{ 0x01, 0xF1, 0xD1 },
+{ 0x01, 0xF2, 0xD2 },
+{ 0x01, 0xF3, 0xD3 },
+{ 0x01, 0xF4, 0xD4 },
+{ 0x01, 0xF5, 0xD5 },
+{ 0x01, 0xF6, 0xD6 },
+{ 0x00, 0xD7, 0xD7 },
+{ 0x01, 0xF8, 0xD8 },
+{ 0x01, 0xF9, 0xD9 },
+{ 0x01, 0xFA, 0xDA },
+{ 0x01, 0xFB, 0xDB },
+{ 0x01, 0xFC, 0xDC },
+{ 0x01, 0xFD, 0xDD },
+{ 0x01, 0xFE, 0xDE },
+{ 0x00, 0xDF, 0xDF },
+{ 0x00, 0xE0, 0xC0 },
+{ 0x00, 0xE1, 0xC1 },
+{ 0x00, 0xE2, 0xC2 },
+{ 0x00, 0xE3, 0xC3 },
+{ 0x00, 0xE4, 0xC4 },
+{ 0x00, 0xE5, 0xC5 },
+{ 0x00, 0xE6, 0xC6 },
+{ 0x00, 0xE7, 0xC7 },
+{ 0x00, 0xE8, 0xC8 },
+{ 0x00, 0xE9, 0xC9 },
+{ 0x00, 0xEA, 0xCA },
+{ 0x00, 0xEB, 0xCB },
+{ 0x00, 0xEC, 0xCC },
+{ 0x00, 0xED, 0xCD },
+{ 0x00, 0xEE, 0xCE },
+{ 0x00, 0xEF, 0xCF },
+{ 0x00, 0xF0, 0xD0 },
+{ 0x00, 0xF1, 0xD1 },
+{ 0x00, 0xF2, 0xD2 },
+{ 0x00, 0xF3, 0xD3 },
+{ 0x00, 0xF4, 0xD4 },
+{ 0x00, 0xF5, 0xD5 },
+{ 0x00, 0xF6, 0xD6 },
+{ 0x00, 0xF7, 0xF7 },
+{ 0x00, 0xF8, 0xD8 },
+{ 0x00, 0xF9, 0xD9 },
+{ 0x00, 0xFA, 0xDA },
+{ 0x00, 0xFB, 0xDB },
+{ 0x00, 0xFC, 0xDC },
+{ 0x00, 0xFD, 0xDD },
+{ 0x00, 0xFE, 0xDE },
+{ 0x00, 0xFF, 0xFF },
+};
+
+
struct cs_info iso14_tbl[] = {
{ 0x00, 0x00, 0x00 },
{ 0x00, 0x01, 0x01 },
@@ -3547,6 +4385,264 @@ struct cs_info iso14_tbl[] = {
{ 0x00, 0xff, 0xff },
};
+struct cs_info iso15_tbl[] = {
+{ 0x00, 0x00, 0x00 },
+{ 0x00, 0x01, 0x01 },
+{ 0x00, 0x02, 0x02 },
+{ 0x00, 0x03, 0x03 },
+{ 0x00, 0x04, 0x04 },
+{ 0x00, 0x05, 0x05 },
+{ 0x00, 0x06, 0x06 },
+{ 0x00, 0x07, 0x07 },
+{ 0x00, 0x08, 0x08 },
+{ 0x00, 0x09, 0x09 },
+{ 0x00, 0x0a, 0x0a },
+{ 0x00, 0x0b, 0x0b },
+{ 0x00, 0x0c, 0x0c },
+{ 0x00, 0x0d, 0x0d },
+{ 0x00, 0x0e, 0x0e },
+{ 0x00, 0x0f, 0x0f },
+{ 0x00, 0x10, 0x10 },
+{ 0x00, 0x11, 0x11 },
+{ 0x00, 0x12, 0x12 },
+{ 0x00, 0x13, 0x13 },
+{ 0x00, 0x14, 0x14 },
+{ 0x00, 0x15, 0x15 },
+{ 0x00, 0x16, 0x16 },
+{ 0x00, 0x17, 0x17 },
+{ 0x00, 0x18, 0x18 },
+{ 0x00, 0x19, 0x19 },
+{ 0x00, 0x1a, 0x1a },
+{ 0x00, 0x1b, 0x1b },
+{ 0x00, 0x1c, 0x1c },
+{ 0x00, 0x1d, 0x1d },
+{ 0x00, 0x1e, 0x1e },
+{ 0x00, 0x1f, 0x1f },
+{ 0x00, 0x20, 0x20 },
+{ 0x00, 0x21, 0x21 },
+{ 0x00, 0x22, 0x22 },
+{ 0x00, 0x23, 0x23 },
+{ 0x00, 0x24, 0x24 },
+{ 0x00, 0x25, 0x25 },
+{ 0x00, 0x26, 0x26 },
+{ 0x00, 0x27, 0x27 },
+{ 0x00, 0x28, 0x28 },
+{ 0x00, 0x29, 0x29 },
+{ 0x00, 0x2a, 0x2a },
+{ 0x00, 0x2b, 0x2b },
+{ 0x00, 0x2c, 0x2c },
+{ 0x00, 0x2d, 0x2d },
+{ 0x00, 0x2e, 0x2e },
+{ 0x00, 0x2f, 0x2f },
+{ 0x00, 0x30, 0x30 },
+{ 0x00, 0x31, 0x31 },
+{ 0x00, 0x32, 0x32 },
+{ 0x00, 0x33, 0x33 },
+{ 0x00, 0x34, 0x34 },
+{ 0x00, 0x35, 0x35 },
+{ 0x00, 0x36, 0x36 },
+{ 0x00, 0x37, 0x37 },
+{ 0x00, 0x38, 0x38 },
+{ 0x00, 0x39, 0x39 },
+{ 0x00, 0x3a, 0x3a },
+{ 0x00, 0x3b, 0x3b },
+{ 0x00, 0x3c, 0x3c },
+{ 0x00, 0x3d, 0x3d },
+{ 0x00, 0x3e, 0x3e },
+{ 0x00, 0x3f, 0x3f },
+{ 0x00, 0x40, 0x40 },
+{ 0x01, 0x61, 0x41 },
+{ 0x01, 0x62, 0x42 },
+{ 0x01, 0x63, 0x43 },
+{ 0x01, 0x64, 0x44 },
+{ 0x01, 0x65, 0x45 },
+{ 0x01, 0x66, 0x46 },
+{ 0x01, 0x67, 0x47 },
+{ 0x01, 0x68, 0x48 },
+{ 0x01, 0x69, 0x49 },
+{ 0x01, 0x6a, 0x4a },
+{ 0x01, 0x6b, 0x4b },
+{ 0x01, 0x6c, 0x4c },
+{ 0x01, 0x6d, 0x4d },
+{ 0x01, 0x6e, 0x4e },
+{ 0x01, 0x6f, 0x4f },
+{ 0x01, 0x70, 0x50 },
+{ 0x01, 0x71, 0x51 },
+{ 0x01, 0x72, 0x52 },
+{ 0x01, 0x73, 0x53 },
+{ 0x01, 0x74, 0x54 },
+{ 0x01, 0x75, 0x55 },
+{ 0x01, 0x76, 0x56 },
+{ 0x01, 0x77, 0x57 },
+{ 0x01, 0x78, 0x58 },
+{ 0x01, 0x79, 0x59 },
+{ 0x01, 0x7a, 0x5a },
+{ 0x00, 0x5b, 0x5b },
+{ 0x00, 0x5c, 0x5c },
+{ 0x00, 0x5d, 0x5d },
+{ 0x00, 0x5e, 0x5e },
+{ 0x00, 0x5f, 0x5f },
+{ 0x00, 0x60, 0x60 },
+{ 0x00, 0x61, 0x41 },
+{ 0x00, 0x62, 0x42 },
+{ 0x00, 0x63, 0x43 },
+{ 0x00, 0x64, 0x44 },
+{ 0x00, 0x65, 0x45 },
+{ 0x00, 0x66, 0x46 },
+{ 0x00, 0x67, 0x47 },
+{ 0x00, 0x68, 0x48 },
+{ 0x00, 0x69, 0x49 },
+{ 0x00, 0x6a, 0x4a },
+{ 0x00, 0x6b, 0x4b },
+{ 0x00, 0x6c, 0x4c },
+{ 0x00, 0x6d, 0x4d },
+{ 0x00, 0x6e, 0x4e },
+{ 0x00, 0x6f, 0x4f },
+{ 0x00, 0x70, 0x50 },
+{ 0x00, 0x71, 0x51 },
+{ 0x00, 0x72, 0x52 },
+{ 0x00, 0x73, 0x53 },
+{ 0x00, 0x74, 0x54 },
+{ 0x00, 0x75, 0x55 },
+{ 0x00, 0x76, 0x56 },
+{ 0x00, 0x77, 0x57 },
+{ 0x00, 0x78, 0x58 },
+{ 0x00, 0x79, 0x59 },
+{ 0x00, 0x7a, 0x5a },
+{ 0x00, 0x7b, 0x7b },
+{ 0x00, 0x7c, 0x7c },
+{ 0x00, 0x7d, 0x7d },
+{ 0x00, 0x7e, 0x7e },
+{ 0x00, 0x7f, 0x7f },
+{ 0x00, 0x80, 0x80 },
+{ 0x00, 0x81, 0x81 },
+{ 0x00, 0x82, 0x82 },
+{ 0x00, 0x83, 0x83 },
+{ 0x00, 0x84, 0x84 },
+{ 0x00, 0x85, 0x85 },
+{ 0x00, 0x86, 0x86 },
+{ 0x00, 0x87, 0x87 },
+{ 0x00, 0x88, 0x88 },
+{ 0x00, 0x89, 0x89 },
+{ 0x00, 0x8a, 0x8a },
+{ 0x00, 0x8b, 0x8b },
+{ 0x00, 0x8c, 0x8c },
+{ 0x00, 0x8d, 0x8d },
+{ 0x00, 0x8e, 0x8e },
+{ 0x00, 0x8f, 0x8f },
+{ 0x00, 0x90, 0x90 },
+{ 0x00, 0x91, 0x91 },
+{ 0x00, 0x92, 0x92 },
+{ 0x00, 0x93, 0x93 },
+{ 0x00, 0x94, 0x94 },
+{ 0x00, 0x95, 0x95 },
+{ 0x00, 0x96, 0x96 },
+{ 0x00, 0x97, 0x97 },
+{ 0x00, 0x98, 0x98 },
+{ 0x00, 0x99, 0x99 },
+{ 0x00, 0x9a, 0x9a },
+{ 0x00, 0x9b, 0x9b },
+{ 0x00, 0x9c, 0x9c },
+{ 0x00, 0x9d, 0x9d },
+{ 0x00, 0x9e, 0x9e },
+{ 0x00, 0x9f, 0x9f },
+{ 0x00, 0xa0, 0xa0 },
+{ 0x00, 0xa1, 0xa1 },
+{ 0x00, 0xa2, 0xa2 },
+{ 0x00, 0xa3, 0xa3 },
+{ 0x00, 0xa4, 0xa4 },
+{ 0x00, 0xa5, 0xa5 },
+{ 0x01, 0xa8, 0xa6 },
+{ 0x00, 0xa7, 0xa7 },
+{ 0x00, 0xa8, 0xa6 },
+{ 0x00, 0xa9, 0xa9 },
+{ 0x00, 0xaa, 0xaa },
+{ 0x00, 0xab, 0xab },
+{ 0x00, 0xac, 0xac },
+{ 0x00, 0xad, 0xad },
+{ 0x00, 0xae, 0xae },
+{ 0x00, 0xaf, 0xaf },
+{ 0x00, 0xb0, 0xb0 },
+{ 0x00, 0xb1, 0xb1 },
+{ 0x00, 0xb2, 0xb2 },
+{ 0x00, 0xb3, 0xb3 },
+{ 0x01, 0xb8, 0xb4 },
+{ 0x00, 0xb5, 0xb5 },
+{ 0x00, 0xb6, 0xb6 },
+{ 0x00, 0xb7, 0xb7 },
+{ 0x00, 0xb8, 0xb4 },
+{ 0x00, 0xb9, 0xb9 },
+{ 0x00, 0xba, 0xba },
+{ 0x00, 0xbb, 0xbb },
+{ 0x01, 0xbd, 0xbc },
+{ 0x00, 0xbd, 0xbc },
+{ 0x01, 0xff, 0xbe },
+{ 0x00, 0xbf, 0xbf },
+{ 0x01, 0xe0, 0xc0 },
+{ 0x01, 0xe1, 0xc1 },
+{ 0x01, 0xe2, 0xc2 },
+{ 0x01, 0xe3, 0xc3 },
+{ 0x01, 0xe4, 0xc4 },
+{ 0x01, 0xe5, 0xc5 },
+{ 0x01, 0xe6, 0xc6 },
+{ 0x01, 0xe7, 0xc7 },
+{ 0x01, 0xe8, 0xc8 },
+{ 0x01, 0xe9, 0xc9 },
+{ 0x01, 0xea, 0xca },
+{ 0x01, 0xeb, 0xcb },
+{ 0x01, 0xec, 0xcc },
+{ 0x01, 0xed, 0xcd },
+{ 0x01, 0xee, 0xce },
+{ 0x01, 0xef, 0xcf },
+{ 0x01, 0xf0, 0xd0 },
+{ 0x01, 0xf1, 0xd1 },
+{ 0x01, 0xf2, 0xd2 },
+{ 0x01, 0xf3, 0xd3 },
+{ 0x01, 0xf4, 0xd4 },
+{ 0x01, 0xf5, 0xd5 },
+{ 0x01, 0xf6, 0xd6 },
+{ 0x00, 0xd7, 0xd7 },
+{ 0x01, 0xf8, 0xd8 },
+{ 0x01, 0xf9, 0xd9 },
+{ 0x01, 0xfa, 0xda },
+{ 0x01, 0xfb, 0xdb },
+{ 0x01, 0xfc, 0xdc },
+{ 0x01, 0xfd, 0xdd },
+{ 0x01, 0xfe, 0xde },
+{ 0x00, 0xdf, 0xdf },
+{ 0x00, 0xe0, 0xc0 },
+{ 0x00, 0xe1, 0xc1 },
+{ 0x00, 0xe2, 0xc2 },
+{ 0x00, 0xe3, 0xc3 },
+{ 0x00, 0xe4, 0xc4 },
+{ 0x00, 0xe5, 0xc5 },
+{ 0x00, 0xe6, 0xc6 },
+{ 0x00, 0xe7, 0xc7 },
+{ 0x00, 0xe8, 0xc8 },
+{ 0x00, 0xe9, 0xc9 },
+{ 0x00, 0xea, 0xca },
+{ 0x00, 0xeb, 0xcb },
+{ 0x00, 0xec, 0xcc },
+{ 0x00, 0xed, 0xcd },
+{ 0x00, 0xee, 0xce },
+{ 0x00, 0xef, 0xcf },
+{ 0x00, 0xf0, 0xd0 },
+{ 0x00, 0xf1, 0xd1 },
+{ 0x00, 0xf2, 0xd2 },
+{ 0x00, 0xf3, 0xd3 },
+{ 0x00, 0xf4, 0xd4 },
+{ 0x00, 0xf5, 0xd5 },
+{ 0x00, 0xf6, 0xd6 },
+{ 0x00, 0xf7, 0xf7 },
+{ 0x00, 0xf8, 0xd8 },
+{ 0x00, 0xf9, 0xd9 },
+{ 0x00, 0xfa, 0xda },
+{ 0x00, 0xfb, 0xdb },
+{ 0x00, 0xfc, 0xdc },
+{ 0x00, 0xfd, 0xdd },
+{ 0x00, 0xfe, 0xde },
+{ 0x00, 0xff, 0xbe },
+};
struct cs_info iscii_devanagari_tbl[] = {
{ 0x00, 0x00, 0x00 },
@@ -3807,8 +4903,6 @@ struct cs_info iscii_devanagari_tbl[] = {
{ 0x00, 0xff, 0xff },
};
-
-
struct enc_entry encds[] = {
{"ISO8859-1",iso1_tbl},
{"ISO8859-2",iso2_tbl},
@@ -3821,8 +4915,11 @@ struct enc_entry encds[] = {
{"ISO8859-9",iso9_tbl},
{"ISO8859-10",iso10_tbl},
{"KOI8-R",koi8r_tbl},
-{"CP-1251",cp1251_tbl},
+{"KOI8-U",koi8u_tbl},
+{"microsoft-cp1251",cp1251_tbl},
+{"ISO8859-13", iso13_tbl},
{"ISO8859-14", iso14_tbl},
+{"ISO8859-15", iso15_tbl},
{"ISCII-DEVANAGARI", iscii_devanagari_tbl},
};
@@ -3836,28 +4933,41 @@ struct cs_info * get_current_cs(const char * es) {
}
}
return ccs;
-}
+};
+struct unicode_info * get_utf_cs() { // accessor for the utf_lst table (declared outside this view)
+ return utf_lst;
+};
+int get_utf_cs_len() { // returns UTF_LST_LEN -- presumably the entry count of utf_lst; confirm at its definition
+ return UTF_LST_LEN;
+};
struct lang_map lang2enc[] = {
- {"ca","ISO8859-1"},
- {"cs","ISO8859-2"},
- {"da","ISO8859-1"},
- {"de","ISO8859-1"},
- {"el","ISO8859-7"},
- {"en","ISO8859-1"},
- {"es","ISO8859-1"},
- {"fr","ISO8859-1"},
- {"hr","ISO8859-2"},
- {"hu","ISO8859-2"},
- {"it","ISO8859-1"},
- {"la","ISO8859-1"},
- {"nl","ISO8859-1"},
- {"pl","ISO8859-2"},
- {"pt","ISO8859-1"},
- {"sv","ISO8859-1"},
- {"ru","KOI8-R"},
+{"az", "UTF-8", LANG_az},
+{"bg", "microsoft-cp1251", LANG_bg},
+{"ca", "ISO8859-1", LANG_ca},
+{"cs", "ISO8859-2", LANG_cs},
+{"da", "ISO8859-1", LANG_da},
+{"de", "ISO8859-1", LANG_de},
+{"el", "ISO8859-7", LANG_el},
+{"en", "ISO8859-1", LANG_en},
+{"es", "ISO8859-1", LANG_es},
+{"eu", "ISO8859-1", LANG_eu},
+{"gl", "ISO8859-1", LANG_gl},
+{"fr", "ISO8859-15", LANG_fr},
+{"hr", "ISO8859-2", LANG_hr},
+{"hu", "ISO8859-2", LANG_hu},
+{"it", "ISO8859-1", LANG_it},
+{"la", "ISO8859-1", LANG_la},
+{"lv", "ISO8859-13", LANG_lv},
+{"nl", "ISO8859-1", LANG_nl},
+{"pl", "ISO8859-2", LANG_pl},
+{"pt", "ISO8859-1", LANG_pt},
+{"sv", "ISO8859-1", LANG_sv},
+{"tr", "UTF-8", LANG_tr},
+{"ru", "KOI8-R", LANG_ru},
+{"uk", "KOI8-U", LANG_uk}
};
@@ -3869,5 +4979,14 @@ const char * get_default_enc(const char * lang) {
}
}
return NULL;
-}
+};
+int get_lang_num(const char * lang) { // map a language code to its LANG_* number via the lang2enc table
+ int n = sizeof(lang2enc) / sizeof(lang2enc[0]); // number of table entries
+ for (int i = 0; i < n; i++) {
+ if (strncmp(lang,lang2enc[i].lang,2) == 0) { // only the first two characters are compared
+ return lang2enc[i].num;
+ }
+ }
+ return LANG_xx; // no table entry matched
+};
diff --git a/src/myspell/csutil.hxx b/src/myspell/csutil.hxx
index 037eab9..aa50a58 100644
--- a/src/myspell/csutil.hxx
+++ b/src/myspell/csutil.hxx
@@ -1,36 +1,88 @@
#ifndef __CSUTILHXX__
#define __CSUTILHXX__
-
// First some base level utility routines
+typedef struct { // one 16-bit character kept as two separate bytes
+ unsigned char l; // low-order byte (value & 0x00FF)
+ unsigned char h; // high-order byte (value >> 8)
+} w_char;
+
+// convert UTF-16 characters to UTF-8
+char * u16_u8(char * dest, int size, const w_char * src, int srclen);
+
+// convert UTF-8 characters to UTF-16
+int u8_u16(w_char * dest, int size, const char * src);
+
+// sort 2-byte vector
+void flag_qsort(unsigned short flags[], int begin, int end);
+
+// binary search in 2-byte vector
+int flag_bsearch(unsigned short flags[], unsigned short flag, int right);
+
// remove end of line char(s)
void mychomp(char * s);
-// duplicate string
+// duplicate string
char * mystrdup(const char * s);
-// duplicate reverse of string
+// duplicate reverse of string
char * myrevstrdup(const char * s);
-// parse into tokens with char delimiter
+// parse into tokens with char delimiter
char * mystrsep(char ** sptr, const char delim);
+// parse into tokens with char delimiter
+char * mystrsep2(char ** sptr, const char delim);
+
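+// replace text within a string (NOTE(review): description inferred from the name and the three string parameters; confirm against the definition)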
+char * mystrrep(char *, const char *, const char *);
-// is one string a leading subset of another
+// is one string a leading subset of another
int isSubset(const char * s1, const char * s2);
// is one reverse string a leading subset of the end of another
-int isRevSubset(const char * s1, const char * end_of_s2, int s2_len);
+int isRevSubset(const char * s1, const char * s2, int len);
+// append s to the end of every line in text
+void strlinecat(char * lines, const char * s);
-// character encoding information
+// tokenize into lines with new line
+ int line_tok(const char * text, char *** lines);
+
+// tokenize into lines with new line and uniq in place
+ char * line_uniq(char * text);
+
+// change \n to c in place
+ char * line_join(char * text, char c);
+// leave only last {[^}]*} pattern in string
+ char * delete_zeros(char * morphout);
+
+// reverse word
+ void reverseword(char *);
+
+// reverse word
+ void reverseword_utf(char *);
+
+// character encoding information
struct cs_info { // one row of a single-byte case table; the tables in csutil.cxx are indexed by byte value
unsigned char ccase; // 0x01 in rows for upper-case letters, 0x00 otherwise (as in the csutil.cxx tables)
unsigned char clower; // lower-case form of the byte
unsigned char cupper; // upper-case form of the byte
};
+// Unicode character encoding information (element type of the table returned by get_utf_cs)
+struct unicode_info {
+ unsigned short c; // presumably the 16-bit character value this row describes -- TODO confirm
+ unsigned short cupper; // upper-case mapping (16-bit)
+ unsigned short clower; // lower-case mapping (16-bit)
+};
+
+struct unicode_info2 { // row type of the utfconv table passed to mkallsmall_utf
+ char cletter; // NOTE(review): meaning not visible here -- presumably marks whether the character is a letter; confirm
+ unsigned short cupper; // upper-case mapping (16-bit)
+ unsigned short clower; // lower-case mapping; mkallsmall_utf splits it into the h and l bytes of a w_char
+};
struct enc_entry {
const char * enc_name;
@@ -42,13 +94,20 @@ struct enc_entry {
struct lang_map { // one row of the lang2enc table
const char * lang; // language code (two letters in lang2enc)
const char * def_enc; // name of the default encoding for that language
+ int num; // LANG_* constant returned by get_lang_num
};
struct cs_info * get_current_cs(const char * es);
+struct unicode_info * get_utf_cs();
+
+int get_utf_cs_len();
+
const char * get_default_enc(const char * lang);
-// convert null terminated string to all caps using encoding
+int get_lang_num(const char * lang);
+
+// convert null terminated string to all caps using encoding
void enmkallcap(char * d, const char * p, const char * encoding);
// convert null terminated string to all little using encoding
@@ -57,7 +116,7 @@ void enmkallsmall(char * d, const char * p, const char * encoding);
// convert null terminated string to have initial capital using encoding
void enmkinitcap(char * d, const char * p, const char * encoding);
-// convert null terminated string to all caps
+// convert null terminated string to all caps
void mkallcap(char * p, const struct cs_info * csconv);
// convert null terminated string to all little
@@ -66,5 +125,7 @@ void mkallsmall(char * p, const struct cs_info * csconv);
// convert null terminated string to have initial capital
void mkinitcap(char * p, const struct cs_info * csconv);
+// convert first nc characters of a w_char (UTF-16) string to lowercase
+void mkallsmall_utf(w_char * u, int nc, struct unicode_info2 * utfconv);
#endif
diff --git a/src/myspell/enchant_myspell.hxx b/src/myspell/enchant_myspell.hxx
deleted file mode 100644
index 0c18549..0000000
--- a/src/myspell/enchant_myspell.hxx
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef _MYSPELLMGR_HXX_
-#define _MYSPELLMGR_HXX_
-
-#include "hashmgr.hxx"
-#include "affixmgr.hxx"
-#include "suggestmgr.hxx"
-#include "csutil.hxx"
-
-#define NOCAP 0
-#define INITCAP 1
-#define ALLCAP 2
-#define HUHCAP 3
-
-#ifdef WINDOWS
-#define DLLSUPPORT __declspec(dllexport)
-#else
-#define DLLSUPPORT
-#endif
-
-class DLLSUPPORT MySpell
-{
- AffixMgr* pAMgr;
- HashMgr* pHMgr;
- SuggestMgr* pSMgr;
- char * encoding;
- struct cs_info * csconv;
- int maxSug;
-
-public:
- MySpell(const char * affpath, const char * dpath);
- ~MySpell();
-
- int suggest(char*** slst, const char * word);
- int spell(const char *);
- char * get_dic_encoding();
-
-private:
- int cleanword(char *, const char *, int *, int *);
- char * check(const char *);
-};
-
-#endif
diff --git a/src/myspell/hashmgr.cxx b/src/myspell/hashmgr.cxx
index d7b4ec8..29a05c3 100644
--- a/src/myspell/hashmgr.cxx
+++ b/src/myspell/hashmgr.cxx
@@ -1,25 +1,36 @@
-#include "license.readme"
+#include "license.hunspell"
+#include "license.myspell"
#include <cstdlib>
#include <cstring>
+#include <cctype>
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
#include <cstdio>
#include "hashmgr.hxx"
+#include "csutil.hxx"
-extern void mychomp(char * s);
-extern char * mystrdup(const char *);
-
-#ifndef WINDOWS
+#ifndef W32
+#include <unistd.h>
using namespace std;
#endif
-
// build a hash table from a munched word list
-HashMgr::HashMgr(const char * tpath)
+HashMgr::HashMgr(const char * tpath, const char * apath)
{
tablesize = 0;
tableptr = NULL;
+ flag_mode = FLAG_CHAR;
+ complexprefixes = 0;
+ utf8 = 0;
+ numaliasf = 0;
+ aliasf = NULL;
+ numaliasm = 0;
+ aliasm = NULL;
+ load_config(apath);
int ec = load_tables(tpath);
if (ec) {
/* error condition - what should we do here */
@@ -42,14 +53,17 @@ HashMgr::~HashMgr()
struct hentry * pt = &tableptr[i];
struct hentry * nt = NULL;
if (pt) {
+ if (pt->astr && !aliasf) free(pt->astr);
if (pt->word) free(pt->word);
- if (pt->astr) free(pt->astr);
+ if (pt->description && !aliasm) free(pt->description);
+
pt = pt->next;
}
while(pt) {
nt = pt->next;
+ if (pt->astr && !aliasf) free(pt->astr);
if (pt->word) free(pt->word);
- if (pt->astr) free(pt->astr);
+ if (pt->description && !aliasm) free(pt->description);
free(pt);
pt = nt;
}
@@ -57,9 +71,22 @@ HashMgr::~HashMgr()
free(tableptr);
}
tablesize = 0;
-}
-
+ if (aliasf) {
+ for (int j = 0; j < (numaliasf); j++) free(aliasf[j]);
+ free(aliasf);
+ aliasf = NULL;
+ if (aliasflen) {
+ free(aliasflen);
+ aliasflen = NULL;
+ }
+ }
+ if (aliasm) {
+ for (int j = 0; j < (numaliasm); j++) free(aliasm[j]);
+ free(aliasm);
+ aliasm = NULL;
+ }
+}
// lookup a root word in the hashtable
@@ -76,40 +103,87 @@ struct hentry * HashMgr::lookup(const char *word) const
return NULL;
}
-
-
// add a word to the hash table (private); returns 0 on success and 1 on allocation failure; the entry stores the aff pointer itself (no copy is made)
-int HashMgr::add_word(const char * word, int wl, const char * aff, int al)
+int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, const char * desc)
{
- int i = hash(word);
+ char * st = mystrdup(word); // private copy of the word; it becomes the word field of the entry
+ if (wl && !st) return 1;
+ if (complexprefixes) { // with complexprefixes set, words are stored reversed
+ if (utf8) reverseword_utf(st); else reverseword(st);
+ }
+ int i = hash(st); // the hash is taken over the (possibly reversed) copy
struct hentry * dp = &tableptr[i];
- struct hentry* hp;
if (dp->word == NULL) {
dp->wlen = wl;
dp->alen = al;
- dp->word = mystrdup(word);
- dp->astr = mystrdup(aff);
+ dp->word = st;
+ dp->astr = aff;
dp->next = NULL;
- if ((wl) && (dp->word == NULL)) return 1;
- if ((al) && (dp->astr == NULL)) return 1;
+ dp->next_homonym = NULL;
+ if (aliasm) { // alias table present: desc holds a number that is resolved through get_aliasm
+ dp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc);
+ } else {
+ dp->description = mystrdup(desc);
+ if (desc && !dp->description) return 1;
+ if (dp->description && complexprefixes) {
+ if (utf8) reverseword_utf(dp->description); else reverseword(dp->description);
+ }
+ }
} else {
- hp = (struct hentry *) malloc (sizeof(struct hentry));
- if (hp == NULL) return 1;
+ struct hentry* hp = (struct hentry *) malloc (sizeof(struct hentry)); // slot already occupied: build a new entry to append to the chain
+ if (!hp) return 1; // NOTE(review): st is not released on this path
hp->wlen = wl;
hp->alen = al;
- hp->word = mystrdup(word);
- hp->astr = mystrdup(aff);
+ hp->word = st;
+ hp->astr = aff;
hp->next = NULL;
- while (dp->next != NULL) dp=dp->next;
+ hp->next_homonym = NULL;
+ if (aliasm) {
+ hp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc);
+ } else {
+ hp->description = mystrdup(desc);
+ if (desc && !hp->description) return 1; // NOTE(review): hp and st are not released on this path
+ if (dp->description && complexprefixes) { // NOTE(review): tests dp->description but reverses hp->description; hp->description looks intended (it may be NULL here)
+ if (utf8) reverseword_utf(hp->description); else reverseword(hp->description);
+ }
+ }
+ while (dp->next != NULL) { // walk to the end of the chain, linking the new entry as homonym of an equal word that has no homonym link yet
+ if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp;
+ dp=dp->next;
+ }
+ if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp; // same check for the last entry of the chain
dp->next = hp;
- if ((wl) && (hp->word == NULL)) return 1;
- if ((al) && (hp->astr == NULL)) return 1;
}
return 0;
}
+// add a custom dic. word to the hash table (public); aff is a flag string decoded with decode_flags, or NULL for a word without flags
+int HashMgr::put_word(const char * word, int wl, char * aff)
+{
+ unsigned short * flags;
+ int al = 0;
+ if (aff) {
+ al = decode_flags(&flags, aff); // fills flags; the result is used as the flag count below
+ flag_qsort(flags, 0, al); // sort the decoded 2-byte flag vector
+ } else {
+ flags = NULL;
+ }
+ add_word(word, wl, flags, al, NULL); // NOTE(review): the result of add_word is discarded, so a failure there is not reported
+ return 0;
+}
+
+int HashMgr::put_word_pattern(const char * word, int wl, const char * pattern) // add word with a copy of the flags of the existing entry pattern
+{
+ unsigned short * flags;
+ struct hentry * dp = lookup(pattern);
+ if (!dp || !dp->astr) return 1; // pattern is not in the table or carries no flags
+ flags = (unsigned short *) malloc (dp->alen * sizeof(short)); // NOTE(review): the malloc result is not checked before the memcpy below
+ memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short));
+ add_word(word, wl, flags, dp->alen, NULL); // NOTE(review): return value of add_word is ignored
+ return 0;
+}
// walk the hash table entry by entry - null at end
struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
@@ -137,14 +211,13 @@ struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const
return hp;
}
-
-
// load a munched word list and build a hash table on the fly
-
int HashMgr::load_tables(const char * tpath)
{
int wl, al;
char * ap;
+ char * dp;
+ unsigned short * flags;
// raw dictionary - munched file
FILE * rawdict = fopen(tpath, "r");
@@ -154,39 +227,72 @@ int HashMgr::load_tables(const char * tpath)
char ts[MAXDELEN];
if (! fgets(ts, MAXDELEN-1,rawdict)) return 2;
mychomp(ts);
+ if ((*ts < '1') || (*ts > '9')) fprintf(stderr, "error - missing word count in dictionary file\n");
tablesize = atoi(ts);
if (!tablesize) return 4;
- tablesize = tablesize + 5;
+ tablesize = tablesize + 5 + USERWORD;
if ((tablesize %2) == 0) tablesize++;
// allocate the hash table
tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry));
if (! tableptr) return 3;
+ for (int i=0; i<tablesize; i++) tableptr[i].word = NULL;
// loop through all words on much list and add to hash
// table and create word and affix strings
while (fgets(ts,MAXDELEN-1,rawdict)) {
mychomp(ts);
+ // split each line into word and morphological description
+ dp = strchr(ts,'\t');
+
+ if (dp) {
+ *dp = '\0';
+ dp++;
+ } else {
+ dp = NULL;
+ }
+
// split each line into word and affix char strings
- ap = strchr(ts,'/');
+ // "\/" signs slash in words (not affix separator)
+ // "/" at beginning of the line is word character (not affix separator)
+ ap = ts;
+ while (ap = strchr(ap,'/')) {
+ if (ap == ts) {
+ ap++;
+ continue;
+ } else if (*(ap - 1) != '\\') break;
+ // replace "\/" with "/"
+ for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++);
+
+ }
+
if (ap) {
*ap = '\0';
- ap++;
- al = strlen(ap);
+ if (aliasf) {
+ int index = atoi(ap + 1);
+ al = get_aliasf(index, &flags);
+ if (!al) {
+ fprintf(stderr, "error - bad flag vector alias: %s\n", ts);
+ *ap = '\0';
+ }
+ } else {
+ al = decode_flags(&flags, ap + 1);
+ flag_qsort(flags, 0, al);
+ }
} else {
al = 0;
ap = NULL;
+ flags = NULL;
}
wl = strlen(ts);
// add the word and its index
- if (add_word(ts,wl,ap,al))
- return 5;;
+ if (add_word(ts,wl,flags,al,dp)) return 5;
}
-
+
fclose(rawdict);
return 0;
}
@@ -207,3 +313,367 @@ int HashMgr::hash(const char * word) const
return (unsigned long) hv % tablesize;
}
+// decode a textual flag vector into a newly malloc'ed array of 16-bit flags
+//
+// result: receives the array (NULL if the allocation failed)
+// flags:  NUL-terminated flag string, interpreted according to flag_mode
+// returns the number of flags stored in *result (0 if allocation failed)
+int HashMgr::decode_flags(unsigned short ** result, char * flags) {
+    int len;
+    switch (flag_mode) {
+      case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz)
+        len = strlen(flags);
+        if (len%2 == 1) fprintf(stderr,"error: length of FLAG_LONG flagvector is odd: %s\n", flags);
+        len = len/2;
+        *result = (unsigned short *) malloc(len * sizeof(short));
+        if (!*result) return 0;
+        for (int i = 0; i < len; i++) {
+            // convert through unsigned char: where plain char is signed, a
+            // byte >= 0x80 would be sign-extended and the addition of the
+            // low byte would then corrupt the high byte
+            (*result)[i] = (((unsigned short) (unsigned char) flags[i * 2]) << 8) +
+                           (unsigned short) (unsigned char) flags[i * 2 + 1];
+        }
+        break;
+      }
+      case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233)
+        len = 1;
+        char * src = flags;
+        unsigned short * dest;
+        char * p;
+        // one flag more than there are commas
+        for (p = flags; *p; p++) {
+            if (*p == ',') len++;
+        }
+        *result = (unsigned short *) malloc(len * sizeof(short));
+        if (!*result) return 0;
+        dest = *result;
+        for (p = flags; *p; p++) {
+            if (*p == ',') {
+                *dest = (unsigned short) atoi(src);
+                if (*dest == 0) fprintf(stderr, "error: 0 is wrong flag id\n");
+                src = p + 1;
+                dest++;
+            }
+        }
+        // the number after the last comma (or the only one)
+        *dest = (unsigned short) atoi(src);
+        if (*dest == 0) fprintf(stderr, "error: 0 is wrong flag id\n");
+        break;
+      }
+      case FLAG_UNI: { // UTF-8 characters
+        w_char w[MAXDELEN/2];
+        len = u8_u16(w, MAXDELEN/2, flags);
+        *result = (unsigned short *) malloc(len * sizeof(short));
+        if (!*result) return 0;
+        memcpy(*result, w, len * sizeof(short));
+        break;
+      }
+      default: { // Ispell's one-character flags (erfg -> e r f g)
+        unsigned short * dest;
+        len = strlen(flags);
+        *result = (unsigned short *) malloc(len * sizeof(short));
+        if (!*result) return 0;
+        dest = *result;
+        for (unsigned char * p = (unsigned char *) flags; *p; p++) {
+            *dest = (unsigned short) *p;
+            dest++;
+        }
+      }
+    }
+    return len;
+}
+
+// decode a single textual flag according to flag_mode
+// returns the 16-bit flag id; a result of 0 is reported on stderr
+unsigned short HashMgr::decode_flag(const char * f) {
+    unsigned short s = 0;
+    switch (flag_mode) {
+      case FLAG_LONG:
+        // go through unsigned char so that bytes >= 0x80 are not
+        // sign-extended where plain char is signed
+        s = ((unsigned short) (unsigned char) f[0] << 8) + (unsigned short) (unsigned char) f[1];
+        break;
+      case FLAG_NUM:
+        s = (unsigned short) atoi(f);
+        break;
+      case FLAG_UNI:
+        u8_u16((w_char *) &s, 1, f);
+        break;
+      default:
+        s = (unsigned short) *((unsigned char *)f);
+    }
+    if (!s) fprintf(stderr, "error: 0 is wrong flag id\n");
+    return s;
+}
+
+// turn a 16-bit flag id back into its textual form for the current flag_mode
+// returns a string created with mystrdup(); flag id 0 yields "(NULL)"
+char * HashMgr::encode_flag(unsigned short f) {
+    if (f == 0) return mystrdup("(NULL)");
+
+    unsigned char buf[10];
+    switch (flag_mode) {
+      case FLAG_LONG:
+        // high byte first, then low byte
+        buf[0] = (unsigned char) (f >> 8);
+        buf[1] = (unsigned char) (f & 0x00FF);
+        buf[2] = '\0';
+        break;
+      case FLAG_NUM:
+        sprintf((char *) buf, "%d", f);
+        break;
+      case FLAG_UNI:
+        u16_u8((char *) buf, 10, (w_char *) &f, 1);
+        break;
+      default:
+        buf[0] = (unsigned char) (f);
+        buf[1] = '\0';
+        break;
+    }
+    return mystrdup((char *) buf);
+}
+
+// read in aff file and set flag mode
+//
+// Scans the affix file for the FLAG, SET, AF, AM and COMPLEXPREFIXES
+// settings and stops at the first SFX/PFX line.
+// returns 0 on success, 1 if the file cannot be opened or an AF/AM table
+// fails to parse
+int HashMgr::load_config(const char * affpath)
+{
+
+    // io buffers
+    char line[MAXDELEN+1];
+
+    // open the affix file
+    FILE * afflst;
+    afflst = fopen(affpath,"r");
+    if (!afflst) {
+        fprintf(stderr,"Error - could not open affix description file %s\n",affpath);
+        return 1;
+    }
+
+    // read in each line ignoring any that do not
+    // start with a known line type indicator
+
+    while (fgets(line,MAXDELEN,afflst)) {
+        mychomp(line);
+
+        /* parse in the flag encoding */
+        if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) {
+            if (flag_mode != FLAG_CHAR) {
+                fprintf(stderr,"error: duplicate FLAG parameter\n");
+            }
+            if (strstr(line, "long")) flag_mode = FLAG_LONG;
+            if (strstr(line, "num")) flag_mode = FLAG_NUM;
+            if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI;
+            if (flag_mode == FLAG_CHAR) {
+                fprintf(stderr,"error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line);
+            }
+        }
+        if ((strncmp(line,"SET",3) == 0) && isspace(line[3]) && strstr(line, "UTF-8")) utf8 = 1;
+
+        if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) {
+            if (parse_aliasf(line, afflst)) {
+                fclose(afflst);   // do not leak the handle on a parse error
+                return 1;
+            }
+        }
+
+        if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) {
+            if (parse_aliasm(line, afflst)) {
+                fclose(afflst);   // do not leak the handle on a parse error
+                return 1;
+            }
+        }
+
+        if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1;
+        // the settings of interest precede the affix rules: stop at the first rule
+        if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break;
+    }
+    fclose(afflst);
+    return 0;
+}
+
+/* release an AF table under construction: the first `filled' flag vectors
+   and the two arrays themselves (file-local helper) */
+static void release_aliasf(unsigned short ** vecs, unsigned short * lens, int filled)
+{
+    if (vecs) {
+        for (int k = 0; k < filled; k++) {
+            if (vecs[k]) free(vecs[k]);
+        }
+        free(vecs);
+    }
+    if (lens) free(lens);
+}
+
+/* parse in the ALIAS table */
+/* line: the "AF <count>" header line (the buffer is reused for the
+         following table lines), af: the open affix file
+   returns 0 on success, 1 on any error; on error no table is left behind
+   (numaliasf == 0, aliasf == NULL, aliasflen == NULL) */
+int HashMgr::parse_aliasf(char * line, FILE * af)
+{
+    if (numaliasf != 0) {
+        fprintf(stderr,"error: duplicate AF (alias for flag vector) tables used\n");
+        return 1;
+    }
+    char * tp = line;
+    char * piece;
+    int i = 0;
+    int np = 0;
+    while ((piece=mystrsep(&tp, 0))) {
+        if (*piece != '\0') {
+            switch(i) {
+              case 0: { np++; break; }
+              case 1: {
+                numaliasf = atoi(piece);
+                if (numaliasf < 1) {
+                    numaliasf = 0;
+                    aliasf = NULL;
+                    aliasflen = NULL;
+                    fprintf(stderr,"incorrect number of entries in AF table\n");
+                    free(piece);
+                    return 1;
+                }
+                aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *));
+                aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short));
+                if (!aliasf || !aliasflen) {
+                    release_aliasf(aliasf, aliasflen, 0);
+                    numaliasf = 0;
+                    aliasf = NULL;
+                    aliasflen = NULL;
+                    free(piece);   // was leaked on this path
+                    return 1;
+                }
+                np++;
+                break;
+              }
+              default: break;
+            }
+            i++;
+        }
+        free(piece);
+    }
+    if (np != 2) {
+        numaliasf = 0;
+        free(aliasf);
+        free(aliasflen);
+        aliasf = NULL;
+        aliasflen = NULL;
+        fprintf(stderr,"error: missing AF table information\n");
+        return 1;
+    }
+
+    /* now parse the numaliasf lines to read in the remainder of the table */
+    char * nl = line;
+    for (int j=0; j < numaliasf; j++) {
+        if (!fgets(nl,MAXDELEN,af)) {
+            /* table is shorter than announced: entries j.. were never
+               initialised, so drop the whole table */
+            release_aliasf(aliasf, aliasflen, j);
+            aliasf = NULL;
+            aliasflen = NULL;
+            numaliasf = 0;
+            fprintf(stderr,"error: AF table is corrupt\n");
+            return 1;
+        }
+        mychomp(nl);
+        tp = nl;
+        i = 0;
+        aliasf[j] = NULL;
+        aliasflen[j] = 0;
+        while ((piece=mystrsep(&tp, 0))) {
+            if (*piece != '\0') {
+                switch(i) {
+                  case 0: {
+                    if (strncmp(piece,"AF",2) != 0) {
+                        /* also frees the vectors decoded so far */
+                        release_aliasf(aliasf, aliasflen, j);
+                        numaliasf = 0;
+                        aliasf = NULL;
+                        aliasflen = NULL;
+                        fprintf(stderr,"error: AF table is corrupt\n");
+                        free(piece);
+                        return 1;
+                    }
+                    break;
+                  }
+                  case 1: {
+                    aliasflen[j] = decode_flags(&(aliasf[j]), piece);
+                    flag_qsort(aliasf[j], 0, aliasflen[j]);
+                    break;
+                  }
+                  default: break;
+                }
+                i++;
+            }
+            free(piece);
+        }
+        if (!aliasf[j]) {
+            release_aliasf(aliasf, aliasflen, j);
+            aliasf = NULL;
+            aliasflen = NULL;
+            numaliasf = 0;
+            fprintf(stderr,"error: AF table is corrupt\n");
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/* release an AM table under construction: the first `filled' strings and
+   the array itself (file-local helper) */
+static void release_aliasm(char ** tbl, int filled)
+{
+    if (!tbl) return;
+    for (int k = 0; k < filled; k++) {
+        if (tbl[k]) free(tbl[k]);
+    }
+    free(tbl);
+}
+
+/* parse morph alias definitions */
+/* line: the "AM <count>" header line (the buffer is reused for the
+         following table lines), af: the open affix file
+   returns 0 on success, 1 on any error; on error no table is left behind
+   (numaliasm == 0, and aliasm == NULL once a table had been allocated) */
+int HashMgr::parse_aliasm(char * line, FILE * af)
+{
+    if (numaliasm != 0) {
+        fprintf(stderr,"error: duplicate AM (aliases for morphological descriptions) tables used\n");
+        return 1;
+    }
+    char * tp = line;
+    char * piece;
+    int i = 0;
+    int np = 0;
+    while ((piece=mystrsep(&tp, 0))) {
+        if (*piece != '\0') {
+            switch(i) {
+              case 0: { np++; break; }
+              case 1: {
+                numaliasm = atoi(piece);
+                if (numaliasm < 1) {
+                    numaliasm = 0;   // do not keep a bogus count around
+                    fprintf(stderr,"incorrect number of entries in AM table\n");
+                    free(piece);
+                    return 1;
+                }
+                aliasm = (char **) malloc(numaliasm * sizeof(char *));
+                if (!aliasm) {
+                    numaliasm = 0;
+                    free(piece);     // was leaked on this path
+                    return 1;
+                }
+                np++;
+                break;
+              }
+              default: break;
+            }
+            i++;
+        }
+        free(piece);
+    }
+    if (np != 2) {
+        numaliasm = 0;
+        free(aliasm);
+        aliasm = NULL;
+        fprintf(stderr,"error: missing AM alias information\n");
+        return 1;
+    }
+
+    /* now parse the numaliasm lines to read in the remainder of the table */
+    char * nl = line;
+    for (int j=0; j < numaliasm; j++) {
+        if (!fgets(nl,MAXDELEN,af)) {
+            /* table is shorter than announced: entries j.. were never
+               initialised, so drop the whole table */
+            release_aliasm(aliasm, j);
+            aliasm = NULL;
+            numaliasm = 0;
+            fprintf(stderr,"error: AM table is corrupt\n");
+            return 1;
+        }
+        mychomp(nl);
+        tp = nl;
+        i = 0;
+        aliasm[j] = NULL;
+        while ((piece=mystrsep(&tp, 0))) {
+            if (*piece != '\0') {
+                switch(i) {
+                  case 0: {
+                    if (strncmp(piece,"AM",2) != 0) {
+                        fprintf(stderr,"error: AM table is corrupt\n");
+                        free(piece);
+                        /* also frees the descriptions stored so far */
+                        release_aliasm(aliasm, j);
+                        numaliasm = 0;
+                        aliasm = NULL;
+                        return 1;
+                    }
+                    break;
+                  }
+                  case 1: {
+                    if (complexprefixes) {
+                        if (utf8) reverseword_utf(piece);
+                        else reverseword(piece);
+                    }
+                    aliasm[j] = mystrdup(piece);
+                    break; }
+                  default: break;
+                }
+                i++;
+            }
+            free(piece);
+        }
+        if (!aliasm[j]) {
+            release_aliasm(aliasm, j);
+            numaliasm = 0;
+            aliasm = NULL;
+            fprintf(stderr,"error: AM table is corrupt\n");
+            return 1;
+        }
+    }
+    return 0;
+}
+
+// 1 if the flag vector alias table pointer (aliasf) is set, 0 otherwise
+int HashMgr::is_aliasf() {
+    if (aliasf == NULL) return 0;
+    return 1;
+}
+
+// 1 if the morphological alias table pointer (aliasm) is set, 0 otherwise
+int HashMgr::is_aliasm() {
+    if (aliasm == NULL) return 0;
+    return 1;
+}
+
+// look up a flag vector alias by its 1-based index
+//
+// fvec receives the table-owned flag vector (NULL for a bad index)
+// returns the length of the vector, 0 for a bad index
+int HashMgr::get_aliasf(int index, unsigned short ** fvec) {
+    if ((index > 0) && (index <= numaliasf)) {
+        *fvec = aliasf[index - 1];
+        return aliasflen[index - 1];
+    }
+    // report the bad index once (a leftover duplicate debug message was removed)
+    fprintf(stderr,"error: bad flag alias index: %d\n", index);
+    *fvec = NULL;
+    return 0;
+}
+
+// look up a morphological description alias by its 1-based index
+// returns the table entry, or NULL (after an error message) for a bad index
+char * HashMgr::get_aliasm(int index) {
+    const bool in_range = (index > 0) && (index <= numaliasm);
+    if (!in_range) {
+        fprintf(stderr,"error: bad morph. alias index: %d\n", index);
+        return NULL;
+    }
+    return aliasm[index - 1];
+}
diff --git a/src/myspell/hashmgr.hxx b/src/myspell/hashmgr.hxx
index e8b08c3..3a27b1e 100644
--- a/src/myspell/hashmgr.hxx
+++ b/src/myspell/hashmgr.hxx
@@ -1,26 +1,50 @@
#ifndef _HASHMGR_HXX_
#define _HASHMGR_HXX_
+#include <cstdio>
#include "htypes.hxx"
+enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI };
+
class HashMgr
{
int tablesize;
struct hentry * tableptr;
+ int userword;
+ flag flag_mode;
+ int complexprefixes;
+ int utf8;
+ int numaliasf; // flag vector `compression' with aliases
+ unsigned short ** aliasf;
+ unsigned short * aliasflen;
+ int numaliasm; // morphological description `compression' with aliases
+ char ** aliasm;
+
public:
- HashMgr(const char * tpath);
+ HashMgr(const char * tpath, const char * apath);
~HashMgr();
struct hentry * lookup(const char *) const;
int hash(const char *) const;
struct hentry * walk_hashtable(int & col, struct hentry * hp) const;
+ int put_word(const char * word, int wl, char * ap);
+ int put_word_pattern(const char * word, int wl, const char * pattern);
+ int decode_flags(unsigned short ** result, char * flags);
+ unsigned short decode_flag(const char * flag);
+ char * encode_flag(unsigned short flag);
+ int is_aliasf();
+ int is_aliasm();
+ int get_aliasf(int index, unsigned short ** fvec);
+ char * get_aliasm(int index);
+
private:
- HashMgr( const HashMgr & ); // not implemented
- HashMgr &operator=( const HashMgr & ); // not implemented
int load_tables(const char * tpath);
- int add_word(const char * word, int wl, const char * ap, int al);
+ int add_word(const char * word, int wl, unsigned short * ap, int al, const char * desc);
+ int load_config(const char * affpath);
+ int parse_aliasf(char * line, FILE * af);
+ int parse_aliasm(char * line, FILE * af);
};
diff --git a/src/myspell/htypes.hxx b/src/myspell/htypes.hxx
index 029e9f2..14a4783 100644
--- a/src/myspell/htypes.hxx
+++ b/src/myspell/htypes.hxx
@@ -1,20 +1,25 @@
#ifndef _HTYPES_HXX_
#define _HTYPES_HXX_
-#define MAXDELEN 256
+#define MAXDELEN 8192
#define ROTATE_LEN 5
#define ROTATE(v,q) \
(v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q))-1));
+// approx. number of user defined words
+#define USERWORD 1000
+
struct hentry
{
short wlen;
short alen;
char * word;
- char * astr;
+ unsigned short * astr;
struct hentry * next;
-};
+ struct hentry * next_homonym;
+ char * description;
+};
#endif
diff --git a/src/myspell/hunspell.cxx b/src/myspell/hunspell.cxx
new file mode 100644
index 0000000..14ea1ad
--- /dev/null
+++ b/src/myspell/hunspell.cxx
@@ -0,0 +1,1616 @@
+#include "license.hunspell"
+#include "license.myspell"
+
+#include <cstring>
+#include <cstdlib>
+#include <cstdio>
+
+#include "hunspell.hxx"
+
+#ifndef W32
+using namespace std;
+#endif
+
+// build a spell checker from an affix file (affpath) and a dictionary (dpath)
+Hunspell::Hunspell(const char * affpath, const char * dpath)
+{
+    // start from a defined state
+    encoding = NULL;
+    csconv = NULL;
+    utfconv = NULL;
+    utf8 = 0;
+    complexprefixes = 0;
+
+    // the hash manager comes first: the affix manager is handed a pointer
+    // to it so that it can use its lookup methods
+    pHMgr = new HashMgr(dpath, affpath);
+    pAMgr = new AffixMgr(affpath,pHMgr);
+
+    // settings of this dictionary as read by the affix manager
+    char * trystr = pAMgr->get_try_string();
+    encoding = pAMgr->get_encoding();
+    csconv = get_current_cs(encoding);
+    langnum = pAMgr->get_langnum();
+    utf8 = pAMgr->get_utf8();
+    utfconv = pAMgr->get_utf_conv();
+    complexprefixes = pAMgr->get_complexprefixes();
+    wordbreak = pAMgr->get_breaktable();
+
+    // the suggestion manager is created last; the try string is released
+    // right after its construction
+    pSMgr = new SuggestMgr(trystr, MAXSUGGESTION, pAMgr);
+    free(trystr);
+
+    // per-call state reported by check()
+    prevroot = NULL;
+    prevcompound = 0;
+    forbidden_compound = 0;
+}
+
+// destroy the three managers and release the encoding string
+Hunspell::~Hunspell()
+{
+    // deleting a null pointer is a no-op, so no guards are needed
+    delete pSMgr;
+    pSMgr = NULL;
+    delete pAMgr;
+    pAMgr = NULL;
+    delete pHMgr;
+    pHMgr = NULL;
+
+    csconv = NULL;
+
+    free(encoding);
+    encoding = NULL;
+}
+
+
+// make a copy of src at destination while removing all leading
+// blanks and removing any trailing periods after recording
+// their presence with the abbreviation flag
+// also since already going through character by character,
+// set the capitalization type
+// return the length of the "cleaned" (and UTF-8 encoded) word
+//
+// dest:     receives the cleaned word
+// dest_utf: in UTF-8 mode receives the w_char form of the word
+// nc:       receives the number of characters of the cleaned word
+// pcaptype: receives NOCAP, INITCAP, ALLCAP, HUHINITCAP or HUHCAP
+// pabbrev:  receives the number of trailing periods that were stripped
+
+int Hunspell::cleanword2(char * dest, const char * src,
+ w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev)
+{
+ unsigned char * p = (unsigned char *) dest;
+ const unsigned char * q = (const unsigned char * ) src;
+ int firstcap = 0;
+
+ // first skip over any leading blanks
+ while ((*q != '\0') && (*q == ' ')) q++;
+
+ // now strip off any trailing periods (recording their presence)
+ *pabbrev = 0;
+ int nl = strlen((const char *)q);
+ while ((nl > 0) && (*(q+nl-1)=='.')) {
+ nl--;
+ (*pabbrev)++;
+ }
+
+ // if no characters are left it can't be capitalized
+ // (note: *nc is not written on this path)
+ if (nl <= 0) {
+ *pcaptype = NOCAP;
+ *p = '\0';
+ return 0;
+ }
+
+ // now determine the capitalization type of the first nl letters
+ int ncap = 0;
+ int nneutral = 0;
+ *nc = 0;
+
+ if (!utf8) {
+ // 8-bit encoding: copy byte by byte, counting capital letters and
+ // letters whose upper and lower case forms are the same
+ while (nl > 0) {
+ (*nc)++;
+ if (csconv[(*q)].ccase) ncap++;
+ if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
+ *p++ = *q++;
+ nl--;
+ }
+ // remember to terminate the destination string
+ *p = '\0';
+ if (ncap) {
+ firstcap = csconv[(unsigned char)(*dest)].ccase;
+ }
+ } else {
+ unsigned short idx;
+ // the conversion starts at q and therefore still includes the trailing
+ // periods; they are subtracted from the character count below
+ *nc = u8_u16(dest_utf, MAXWORDLEN, (const char *) q);
+ // don't check too long words
+ // (note: dest and *pcaptype are not written on this path)
+ if (*nc >= MAXWORDLEN) return 0;
+ *nc -= *pabbrev;
+ for (int i = 0; i < *nc; i++) {
+ idx = (dest_utf[i].h << 8) + dest_utf[i].l;
+ if (idx != utfconv[idx].clower) ncap++;
+ if (utfconv[idx].cupper == utfconv[idx].clower) nneutral++;
+ }
+ // write the first *nc characters back as UTF-8
+ u16_u8(dest, MAXWORDUTF8LEN, dest_utf, *nc);
+ if (ncap) {
+ idx = (dest_utf[0].h << 8) + dest_utf[0].l;
+ firstcap = (idx != utfconv[idx].clower);
+ }
+ }
+
+ // now finally set the captype
+ if (ncap == 0) {
+ *pcaptype = NOCAP;
+ } else if ((ncap == 1) && firstcap) {
+ *pcaptype = INITCAP;
+ } else if ((ncap == *nc) || ((ncap + nneutral) == *nc)) {
+ *pcaptype = ALLCAP;
+ } else if ((ncap > 1) && firstcap) {
+ *pcaptype = HUHINITCAP;
+ } else {
+ *pcaptype = HUHCAP;
+ }
+ return strlen(dest);
+}
+
+// variant of cleanword2() without the w_char output and character count:
+// copies src to dest, records stripped trailing periods in *pabbrev and the
+// capitalization type in *pcaptype; returns the length of dest
+int Hunspell::cleanword(char * dest, const char * src,
+ int * pcaptype, int * pabbrev)
+{
+ unsigned char * p = (unsigned char *) dest;
+ const unsigned char * q = (const unsigned char * ) src;
+ int firstcap = 0;
+
+ // first skip over any leading blanks
+ while ((*q != '\0') && (*q == ' ')) q++;
+
+ // now strip off any trailing periods (recording their presence)
+ *pabbrev = 0;
+ int nl = strlen((const char *)q);
+ while ((nl > 0) && (*(q+nl-1)=='.')) {
+ nl--;
+ (*pabbrev)++;
+ }
+
+ // if no characters are left it can't be capitalized
+ if (nl <= 0) {
+ *pcaptype = NOCAP;
+ *p = '\0';
+ return 0;
+ }
+
+ // now determine the capitalization type of the first nl letters
+ int ncap = 0;
+ int nneutral = 0;
+ int nc = 0;
+
+ if (!utf8) {
+ // 8-bit encoding: copy byte by byte, counting capital letters and
+ // letters whose upper and lower case forms are the same
+ while (nl > 0) {
+ nc++;
+ if (csconv[(*q)].ccase) ncap++;
+ if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
+ *p++ = *q++;
+ nl--;
+ }
+ // remember to terminate the destination string
+ *p = '\0';
+ firstcap = csconv[(unsigned char)(*dest)].ccase;
+ } else {
+ unsigned short idx;
+ w_char t[MAXWORDLEN];
+ // NOTE(review): this converts src, not q, and nc is not reduced by
+ // *pabbrev, so leading blanks and trailing periods stay in the result;
+ // cleanword2() converts q and subtracts *pabbrev - confirm whether the
+ // difference is intended
+ nc = u8_u16(t, MAXWORDLEN, src);
+ for (int i = 0; i < nc; i++) {
+ idx = (t[i].h << 8) + t[i].l;
+ if (idx != utfconv[idx].clower) ncap++;
+ if (utfconv[idx].cupper == utfconv[idx].clower) nneutral++;
+ }
+ u16_u8(dest, MAXWORDUTF8LEN, t, nc);
+ if (ncap) {
+ idx = (t[0].h << 8) + t[0].l;
+ firstcap = (idx != utfconv[idx].clower);
+ }
+ }
+
+ // now finally set the captype
+ if (ncap == 0) {
+ *pcaptype = NOCAP;
+ } else if ((ncap == 1) && firstcap) {
+ *pcaptype = INITCAP;
+ } else if ((ncap == nc) || ((ncap + nneutral) == nc)){
+ *pcaptype = ALLCAP;
+ } else if ((ncap > 1) && firstcap) {
+ *pcaptype = HUHINITCAP;
+ } else {
+ *pcaptype = HUHCAP;
+ }
+ return strlen(dest);
+}
+
+
+// convert the word in p to upper case in place
+void Hunspell::mkallcap(char * p)
+{
+    if (!utf8) {
+        // 8-bit encoding: map every byte through the case table
+        for (; *p != '\0'; p++) {
+            *p = csconv[((unsigned char) *p)].cupper;
+        }
+        return;
+    }
+
+    // UTF-8: convert to w_char, map each character, convert back
+    w_char wide[MAXWORDLEN];
+    int count = u8_u16(wide, MAXWORDLEN, p);
+    for (int k = 0; k < count; k++) {
+        unsigned short code = (wide[k].h << 8) + wide[k].l;
+        if (code != utfconv[code].cupper) {
+            wide[k].h = (unsigned char) (utfconv[code].cupper >> 8);
+            wide[k].l = (unsigned char) (utfconv[code].cupper & 0x00FF);
+        }
+    }
+    u16_u8(p, MAXWORDUTF8LEN, wide, count);
+}
+
+// convert a word to upper case; in UTF-8 mode the w_char form u (nc
+// characters) is converted and written back to p, otherwise p is converted
+// in place
+// returns the new byte length of p in UTF-8 mode, nc otherwise
+int Hunspell::mkallcap2(char * p, w_char * u, int nc)
+{
+    if (!utf8) {
+        for (; *p != '\0'; p++) {
+            *p = csconv[((unsigned char) *p)].cupper;
+        }
+        return nc;
+    }
+
+    for (int k = 0; k < nc; k++) {
+        unsigned short code = (u[k].h << 8) + u[k].l;
+        if (code != utfconv[code].cupper) {
+            u[k].h = (unsigned char) (utfconv[code].cupper >> 8);
+            u[k].l = (unsigned char) (utfconv[code].cupper & 0x00FF);
+        }
+    }
+    u16_u8(p, MAXWORDUTF8LEN, u, nc);
+    return strlen(p);
+}
+
+
+// convert the word in p to lower case in place, byte by byte through the
+// csconv case table
+void Hunspell::mkallsmall(char * p)
+{
+    for (; *p != '\0'; p++) {
+        *p = csconv[((unsigned char) *p)].clower;
+    }
+}
+
+// convert a word to lower case; in UTF-8 mode the w_char form u (nc
+// characters) is converted and written back to p, otherwise p is converted
+// in place
+// returns the new byte length of p in UTF-8 mode, nc otherwise
+int Hunspell::mkallsmall2(char * p, w_char * u, int nc)
+{
+    if (!utf8) {
+        for (; *p != '\0'; p++) {
+            *p = csconv[((unsigned char) *p)].clower;
+        }
+        return nc;
+    }
+
+    for (int k = 0; k < nc; k++) {
+        unsigned short code = (u[k].h << 8) + u[k].l;
+        if (code != utfconv[code].clower) {
+            u[k].h = (unsigned char) (utfconv[code].clower >> 8);
+            u[k].l = (unsigned char) (utfconv[code].clower & 0x00FF);
+        }
+    }
+    u16_u8(p, MAXWORDUTF8LEN, u, nc);
+    return strlen(p);
+}
+
+// convert UTF-8 sharp S codes to latin 1
+//
+// Copies source (including the terminating NUL) to dest. Whenever the byte
+// 0x9F is copied - the second byte of the UTF-8 sharp s, 0xC3 0x9F - the
+// copy steps back over the byte written before it and stores the Latin-1
+// sharp s (0xDF) there instead. The bytes are written as hex escapes so that
+// the result does not depend on the encoding of this source file.
+// NOTE(review): the byte preceding 0x9F is not checked to be 0xC3.
+// returns dest
+char * Hunspell::sharps_u8_l1(char * dest, char * source) {
+    char * p = dest;
+    *p = *source;
+    for (p++, source++; *(source - 1); p++, source++) {
+        *p = *source;
+        if (*source == '\x9F') *--p = '\xDF';
+    }
+    return dest;
+}
+
+// recursive search for right ss-sharp s permutations
+//
+// base:   start of the word buffer (modified in place)
+// pos:    where to look for the next "ss"
+// n:      number of "ss" occurrences handled so far (bounded by MAXSHARPS)
+// repnum: number of "ss" currently replaced by the UTF-8 sharp s
+// tmp:    scratch buffer for the Latin-1 form in non-UTF-8 mode
+// returns the entry of the first accepted variant or NULL; variants without
+// any replacement (repnum == 0) are not checked here
+hentry * Hunspell::spellsharps(char * base, char * pos, int n, int repnum, char * tmp) {
+    if ((pos = strstr(pos, "ss")) && (n < MAXSHARPS)) {
+        hentry * h;
+        // first try this occurrence as UTF-8 sharp s (0xC3 0x9F); hex
+        // escapes keep the bytes independent of the source file encoding
+        *pos = '\xC3';
+        *(pos + 1) = '\x9F';
+        if ((h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp))) return h;
+        // then restore it and try the variants that keep "ss" here
+        *pos = 's';
+        *(pos + 1) = 's';
+        if ((h = spellsharps(base, pos + 2, n + 1, repnum, tmp))) return h;
+    } else if (repnum > 0) {
+        if (utf8) return check(base);
+        return check(sharps_u8_l1(tmp, base));
+    }
+    return NULL;
+}
+
+// 1 if the entry's flag vector contains the affix manager's KEEPCASE flag,
+// 0 otherwise (also when there is no affix manager, no flag vector or no
+// KEEPCASE flag defined)
+int Hunspell::is_keepcase(const hentry * rv) {
+    if (!pAMgr) return 0;
+    if (!rv->astr) return 0;
+    if (!pAMgr->get_keepcase()) return 0;
+    return TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen) ? 1 : 0;
+}
+
+/* check and insert a word to beginning of the suggestion array */
+/* Words rejected by spell() are ignored. When the list already holds
+   MAXSUGGESTION entries the last one is freed to make room.
+   Always returns 0. */
+int Hunspell::insert_sug(char ***slst, char * word, int *ns) {
+    if (!spell(word)) return 0;
+
+    if (*ns == MAXSUGGESTION) {
+        (*ns)--;
+        free((*slst)[*ns]);
+    }
+    // shift the existing entries one slot towards the end
+    int k = *ns;
+    while (k > 0) {
+        (*slst)[k] = (*slst)[k - 1];
+        k--;
+    }
+    (*slst)[0] = mystrdup(word);
+    (*ns)++;
+    return 0;
+}
+
+// spell check a word
+//
+// returns 1 if the word (or, depending on its capitalization type, one of
+// its lower case / initial capital variants) is accepted, 0 otherwise.
+// Words that are too long are rejected; words that are empty after
+// cleaning, and plain numbers, are accepted.
+// All non-ASCII bytes are written as hex escapes so that the function does
+// not depend on the encoding of this source file.
+int Hunspell::spell(const char * word)
+{
+    struct hentry * rv=NULL;
+    // need larger vector. For example, Turkish capital letter I converted a
+    // 2-byte UTF-8 character (dotless i) by mkallsmall.
+    char cw[MAXWORDUTF8LEN + 4];
+    char wspace[MAXWORDUTF8LEN + 4];
+    w_char unicw[MAXWORDLEN + 1];
+    int nc = strlen(word);
+    int wl2 = 0;
+    if (utf8) {
+        if (nc >= MAXWORDUTF8LEN) return 0;
+    } else {
+        if (nc >= MAXWORDLEN) return 0;
+    }
+    int captype = 0;
+    int abbv = 0;
+    int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
+
+    if (wl == 0) return 1;
+
+    // allow numbers with dots and commas (but forbid double separators: "..", ",," etc.)
+    enum { NBEGIN, NNUM, NSEP };
+    int nstate = NBEGIN;
+    int i;
+
+    // the assignments inside the condition drive the small state machine
+    for (i = 0; (i < wl) &&
+        (((cw[i] <= '9') && (cw[i] >= '0') && (nstate = NNUM)) ||
+        ((nstate == NNUM) && ((cw[i] == ',') ||
+        (cw[i] == '.') || (cw[i] == '-')) && (nstate = NSEP))); i++);
+    if ((i == wl) && (nstate == NNUM)) return 1;
+
+    // LANG_hu section: number(s) + (percent or degree) with suffixes
+    // ('\xB0' is the degree sign in the 8-bit Latin encodings)
+    if (langnum == LANG_hu) {
+        if ((nstate == NNUM) && ((cw[i] == '%') || (cw[i] == '\xB0')) && check(cw + i)) return 1;
+    }
+    // END of LANG_hu section
+
+    switch(captype) {
+      case HUHCAP:
+      case HUHINITCAP:
+      case NOCAP: {
+        rv = check(cw);
+        if ((abbv) && !(rv)) {
+            // retry with one trailing period restored
+            memcpy(wspace,cw,wl);
+            *(wspace+wl) = '.';
+            *(wspace+wl+1) = '\0';
+            rv = check(wspace);
+        }
+        break;
+      }
+      case ALLCAP: {
+        rv = check(cw);
+        if (rv) break;
+        if (abbv) {
+            memcpy(wspace,cw,wl);
+            *(wspace+wl) = '.';
+            *(wspace+wl+1) = '\0';
+            rv = check(wspace);
+            if (rv) break;
+        }
+        if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) {
+            char tmpword[MAXWORDUTF8LEN];
+            wl = mkallsmall2(cw, unicw, nc);
+            memcpy(wspace,cw,(wl+1));
+            rv = spellsharps(wspace, wspace, 0, 0, tmpword);
+            if (!rv) {
+                wl2 = mkinitcap2(cw, unicw, nc);
+                rv = spellsharps(cw, cw, 0, 0, tmpword);
+            }
+            if ((abbv) && !(rv)) {
+                *(wspace+wl) = '.';
+                *(wspace+wl+1) = '\0';
+                rv = spellsharps(wspace, wspace, 0, 0, tmpword);
+                if (!rv) {
+                    memcpy(wspace, cw, wl2);
+                    *(wspace+wl2) = '.';
+                    *(wspace+wl2+1) = '\0';
+                    rv = spellsharps(wspace, wspace, 0, 0, tmpword);
+                }
+            }
+            if (rv) break;
+        }
+      }
+      // no break above: an unresolved ALLCAP word deliberately falls through
+      // and is also tried in lower case and initial capital form
+      case INITCAP: {
+        wl = mkallsmall2(cw, unicw, nc);
+        memcpy(wspace,cw,(wl+1));
+        rv = check(wspace);
+        if (!rv || (is_keepcase(rv) && !((captype == INITCAP) &&
+            // if CHECKSHARPS: KEEPCASE words with sharp s are allowed
+            // in INITCAP form, too.
+            pAMgr->get_checksharps() && ((utf8 && strstr(wspace, "\xC3\x9F")) ||
+            (!utf8 && strchr(wspace, '\xDF')))))) {
+            wl2 = mkinitcap2(cw, unicw, nc);
+            rv = check(cw);
+            if (rv && (captype == ALLCAP) && is_keepcase(rv)) rv = NULL;
+        }
+        if (abbv && !rv) {
+            *(wspace+wl) = '.';
+            *(wspace+wl+1) = '\0';
+            rv = check(wspace);
+            if (!rv || is_keepcase(rv)) {
+                memcpy(wspace, cw, wl2);
+                *(wspace+wl2) = '.';
+                *(wspace+wl2+1) = '\0';
+                rv = check(wspace);
+                if (rv && ((captype == ALLCAP) && is_keepcase(rv))) rv = NULL;
+            }
+        }
+        break;
+      }
+    }
+
+    if (rv) return 1;
+
+    // recursive breaking at break points (not good for morphological analysis)
+    if (wordbreak) {
+        char * s;
+        char r;
+        for (int i = 0; i < pAMgr->get_numbreak(); i++) {
+            if ((s=(char *) strstr(cw, wordbreak[i]))) {
+                r = *s;
+                *s = '\0';
+                // examine 2 sides of the break point
+                if (spell(cw) && spell(s + strlen(wordbreak[i]))) {
+                    *s = r;
+                    return 1;
+                }
+                *s = r;
+            }
+        }
+    }
+
+    // LANG_hu: compoundings with dashes and n-dashes XXX deprecated!
+    if (langnum == LANG_hu) {
+        int n;
+        // compound word with dash (HU) I18n
+        char * dash;
+        int result = 0;
+        // n-dash (UTF-8 bytes E2 80 93)
+        if (!wordbreak && (dash=(char *) strstr(cw,"\xE2\x80\x93"))) {
+            *dash = '\0';
+            // examine 2 sides of the dash
+            if (spell(cw) && spell(dash + 3)) {
+                *dash = '\xE2';   // restore the first byte of the n-dash
+                return 1;
+            }
+            *dash = '\xE2';
+        }
+        if ((dash=(char *) strchr(cw,'-'))) {
+            *dash='\0';
+            // examine 2 sides of the dash
+            if (dash[1] == '\0') { // base word ending with dash
+                if (spell(cw)) return 1;
+            } else {
+                // first word ending with dash: word-
+                char r2 = *(dash + 1);
+                dash[0]='-';
+                dash[1]='\0';
+                result = spell(cw);
+                dash[1] = r2;
+                dash[0]='\0';
+                if (result && spell(dash+1) && ((strlen(dash+1) > 1) || (dash[1] == 'e') ||
+                    ((dash[1] > '0') && (dash[1] < '9')))) return 1;
+            }
+            // affixed number in correct word
+            // NOTE(review): `>= '.'` holds for nearly every character and
+            // makes the digit test redundant; `== '.'` may have been meant
+            if (result && (dash > cw) && (((*(dash-1)<='9') && (*(dash-1)>='0')) || (*(dash-1)>='.'))) {
+                *dash='-';
+                n = 1;
+                if (*(dash - n) == '.') n++;
+                // search first not a number character to left from dash
+                while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) {
+                    n++;
+                }
+                if ((dash - n) < cw) n--;
+                // numbers: deprecated
+                for(; n >= 1; n--) {
+                    if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && check(dash - n)) return 1;
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+// look up a single word form: first in the hash table, then by affix
+// stripping and finally by compound analysis
+// returns the matching entry, or NULL if the word is unknown or forbidden
+// side effects: resets and sets forbidden_compound, prevcompound, prevroot
+struct hentry * Hunspell::check(const char * w)
+{
+ struct hentry * he = NULL;
+ int len;
+ char w2[MAXWORDUTF8LEN];
+ const char * word = w;
+
+ // word reversing wrapper for complex prefixes
+ // NOTE(review): strcpy is unbounded; assumes w fits into w2 - confirm
+ // against the callers
+ if (complexprefixes) {
+ strcpy(w2, w);
+ if (utf8) reverseword_utf(w2); else reverseword(w2);
+ word = w2;
+ }
+
+ forbidden_compound = 0; // XXX LANG_hu class variable for suggestions (not threadsafe)
+ prevcompound = 0; // compounding information for Hunspell's pipe interface (not threadsafe)
+ prevroot = NULL; // root information for Hunspell's pipe interface (not threadsafe)
+
+ // look word in hash table
+ if (pHMgr) he = pHMgr->lookup(word);
+
+ // check forbidden and onlyincompound words
+ if ((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
+ // LANG_hu section: set dash information for suggestions
+ if (langnum == LANG_hu) {
+ forbidden_compound = 1;
+ if (pAMgr->get_compoundflag() &&
+ TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
+ forbidden_compound = 2;
+ }
+ }
+ return NULL;
+ }
+
+ // he = next not pseudoroot and not onlyincompound homonym or NULL
+ while (he && (he->astr) &&
+ ((pAMgr->get_pseudoroot() && TESTAFF(he->astr, pAMgr->get_pseudoroot(), he->alen)) ||
+ (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen))
+ )) he = he->next_homonym;
+
+ // check with affixes
+ if (!he && pAMgr) {
+ // try stripping off affixes */
+ len = strlen(word);
+ he = pAMgr->affix_check(word, len, 0);
+
+ // check compound restriction
+ if (he && he->astr && pAMgr->get_onlyincompound() &&
+ TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) he = NULL;
+
+ // try check compound word
+ if (he) {
+ // a root found by affix stripping may still be a forbidden word
+ if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
+ forbidden_compound = 1; // LANG_hu
+ return NULL;
+ }
+ prevroot = he->word;
+ } else if (pAMgr->get_compound()) {
+ he = pAMgr->compound_check(word, len,
+ 0,0,100,0,NULL,0,NULL,NULL,0);
+ // LANG_hu section: `moving rule' with last dash
+ if ((!he) && (langnum == LANG_hu) && (word[len-1]=='-')) {
+ // retry on a copy of the word without its trailing dash
+ char * dup = mystrdup(word);
+ dup[len-1] = '\0';
+ he = pAMgr->compound_check(dup, len-1,
+ -5,0,100,0,NULL,1,NULL,NULL,0);
+ free(dup);
+ }
+ // end of LANG specific region
+ if (he) {
+ prevroot = he->word;
+ prevcompound = 1;
+ }
+ }
+
+ }
+
+ return he;
+}
+
+// build a list of suggestions for a (misspelled) word
+//
+// slst: receives the suggestion array (set to NULL first)
+// word: the word to correct
+// returns the number of suggestions; 0 when there is no suggestion manager,
+// the word is too long, or it is empty after cleaning. A -1 returned by the
+// suggestion manager is passed through.
+// All non-ASCII bytes are written as hex escapes so that the function does
+// not depend on the encoding of this source file.
+int Hunspell::suggest(char*** slst, const char * word)
+{
+    char cw[MAXWORDUTF8LEN + 4];
+    char wspace[MAXWORDUTF8LEN + 4];
+    if (! pSMgr) return 0;
+    w_char unicw[MAXWORDLEN + 1];
+    int nc = strlen(word);
+    if (utf8) {
+        if (nc >= MAXWORDUTF8LEN) return 0;
+    } else {
+        if (nc >= MAXWORDLEN) return 0;
+    }
+    int captype = 0;
+    int abbv = 0;
+    int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv);
+    if (wl == 0) return 0;
+    int ns = 0;
+    *slst = NULL;
+    int capwords = 0;
+
+    switch(captype) {
+      case NOCAP: {
+        ns = pSMgr->suggest(slst, cw, ns);
+        break;
+      }
+
+      case INITCAP: {
+        capwords = 1;
+        ns = pSMgr->suggest(slst, cw, ns);
+        if (ns == -1) break;
+        memcpy(wspace,cw,(wl+1));
+        mkallsmall2(wspace, unicw, nc);
+        ns = pSMgr->suggest(slst, wspace, ns);
+        break;
+      }
+      case HUHINITCAP:
+      case HUHCAP: {
+        ns = pSMgr->suggest(slst, cw, ns);
+        if (ns != -1) {
+            memcpy(wspace,cw,(wl+1));
+            mkallsmall2(wspace, unicw, nc);
+            insert_sug(slst, wspace, &ns);
+            ns = pSMgr->suggest(slst, wspace, ns);
+            if (captype == HUHINITCAP) {
+                mkinitcap2(wspace, unicw, nc);
+                insert_sug(slst, wspace, &ns);
+                ns = pSMgr->suggest(slst, wspace, ns);
+            }
+        }
+        break;
+      }
+
+      case ALLCAP: {
+        memcpy(wspace, cw, (wl+1));
+        mkallsmall2(wspace, unicw, nc);
+        ns = pSMgr->suggest(slst, wspace, ns);
+        if (ns == -1) break;
+        if (pAMgr && pAMgr->get_keepcase()) insert_sug(slst, wspace, &ns);
+        mkinitcap2(wspace, unicw, nc);
+        ns = pSMgr->suggest(slst, wspace, ns);
+        for (int j=0; j < ns; j++) {
+            mkallcap((*slst)[j]);
+            if (pAMgr && pAMgr->get_checksharps()) {
+                char * pos;
+                if (utf8) {
+                    // UTF-8 sharp s (C3 9F) is two bytes: overwrite with "SS"
+                    while ((pos = strstr((*slst)[j], "\xC3\x9F"))) {
+                        *pos = 'S';
+                        *(pos+1) = 'S';
+                    }
+                } else {
+                    // Latin-1 sharp s (DF) is one byte: grow the string first
+                    while ((pos = strchr((*slst)[j], '\xDF'))) {
+                        (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2);
+                        mystrrep((*slst)[j], "\xDF", "SS");
+                    }
+                }
+            }
+        }
+        break;
+      }
+    }
+
+    // LANG_hu section: replace '-' with ' ' in Hungarian
+    if ((langnum == LANG_hu) && (forbidden_compound == 2)) {
+        for (int j=0; j < ns; j++) {
+            char * pos = strchr((*slst)[j],'-');
+            if (pos) *pos = ' ';
+        }
+    }
+    // END OF LANG_hu section
+
+    // try ngram approach since found nothing
+    if ((ns == 0) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) {
+        switch(captype) {
+          case NOCAP: {
+            ns = pSMgr->ngsuggest(*slst, cw, pHMgr);
+            break;
+          }
+          case HUHCAP: {
+            memcpy(wspace,cw,(wl+1));
+            mkallsmall2(wspace, unicw, nc);
+            ns = pSMgr->ngsuggest(*slst, wspace, pHMgr);
+            break;
+          }
+          case INITCAP: {
+            capwords = 1;
+            memcpy(wspace,cw,(wl+1));
+            mkallsmall2(wspace, unicw, nc);
+            ns = pSMgr->ngsuggest(*slst, wspace, pHMgr);
+            break;
+          }
+          case ALLCAP: {
+            memcpy(wspace,cw,(wl+1));
+            mkallsmall2(wspace, unicw, nc);
+            ns = pSMgr->ngsuggest(*slst, wspace, pHMgr);
+            for (int j=0; j < ns; j++)
+                mkallcap((*slst)[j]);
+            break;
+          }
+        }
+    }
+
+    // word reversing wrapper for complex prefixes
+    if (complexprefixes) {
+        for (int j = 0; j < ns; j++) {
+            if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
+        }
+    }
+
+    // capitalize and erase capitalized duplications
+    if (capwords) {
+        int l = 0;
+        for (int j=0; j < ns; j++) {
+            mkinitcap((*slst)[j]);
+            (*slst)[l] = (*slst)[j];
+            for (int k=0; k < l; k++) {
+                if (strcmp((*slst)[k], (*slst)[j]) == 0) {
+                    free((*slst)[j]);
+                    l--;
+                    // stop here: the string was just freed and must not be
+                    // compared (or freed) again
+                    break;
+                }
+            }
+            l++;
+        }
+        ns = l;
+    }
+
+    // expand suggestions with dot(s)
+    if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
+        for (int j = 0; j < ns; j++) {
+            (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
+            strcat((*slst)[j], word + strlen(word) - abbv);
+        }
+    }
+
+    // suggest keepcase
+    if (pAMgr && pAMgr->get_keepcase()) {
+        switch (captype) {
+          case INITCAP:
+          case ALLCAP: {
+            int l = 0;
+            for (int j=0; j < ns; j++) {
+                if (!spell((*slst)[j])) {
+                    // the suggestion is not accepted as it stands: keep its
+                    // lower case or initial capital form if that one is
+                    char s[MAXSWUTF8L];
+                    w_char w[MAXSWL];
+                    int len;
+                    if (utf8) {
+                        len = u8_u16(w, MAXSWL, (*slst)[j]);
+                    } else {
+                        strcpy(s, (*slst)[j]);
+                        len = strlen(s);
+                    }
+                    mkallsmall2(s, w, len);
+                    free((*slst)[j]);
+                    if (spell(s)) {
+                        (*slst)[l] = mystrdup(s);
+                        l++;
+                    } else {
+                        mkinitcap2(s, w, len);
+                        if (spell(s)) {
+                            (*slst)[l] = mystrdup(s);
+                            l++;
+                        }
+                    }
+                } else {
+                    (*slst)[l] = (*slst)[j];
+                    l++;
+                }
+            }
+            ns = l;
+            l = 0;
+            // remove duplications
+            for (int j=0; j < ns; j++) {
+                (*slst)[l] = (*slst)[j];
+                for (int k=0; k < l; k++) {
+                    if (strcmp((*slst)[k], (*slst)[j]) == 0) {
+                        free((*slst)[j]);
+                        l--;
+                        // see above: never touch the freed string again
+                        break;
+                    }
+                }
+                l++;
+            }
+            ns = l;
+          }
+        }
+    }
+
+    return ns;
+}
+
+// XXX need UTF-8 support
+/* suggest_auto(suggestions, word) - collect suggestions through
+ * SuggestMgr::suggest_auto(), trying the case variants of the word
+ * input: slst receives the list (*slst is reset to NULL first), word is
+ *   the raw input word
+ * output: the count handed back by the suggestion manager; 0 when there
+ *   is no suggestion manager, the word is too long for the work buffers
+ *   or cleanword() leaves nothing
+ */
+int Hunspell::suggest_auto(char*** slst, const char * word)
+{
+    char cw[MAXWORDUTF8LEN + 4];
+    char wspace[MAXWORDUTF8LEN + 4];
+    if (! pSMgr) return 0;
+    int wl = strlen(word);
+    if (utf8) {
+        if (wl >= MAXWORDUTF8LEN) return 0;
+    } else {
+        if (wl >= MAXWORDLEN) return 0;
+    }
+    int captype = 0;
+    int abbv = 0;
+    wl = cleanword(cw, word, &captype, &abbv);
+    if (wl == 0) return 0;
+    int ns = 0;
+    *slst = NULL; // HU, nsug in pSMgr->suggest
+
+    switch(captype) {
+        case NOCAP: {
+            ns = pSMgr->suggest_auto(slst, cw, ns);
+            break;
+        }
+
+        case INITCAP: {
+            // suggestions for the lower-cased word are re-capitalized,
+            // then suggestions for the word as typed are added
+            memcpy(wspace,cw,(wl+1));
+            mkallsmall(wspace);
+            ns = pSMgr->suggest_auto(slst, wspace, ns);
+            for (int j=0; j < ns; j++)
+                mkinitcap((*slst)[j]);
+            ns = pSMgr->suggest_auto(slst, cw, ns);
+            break;
+        }
+
+        case HUHCAP: {
+            ns = pSMgr->suggest_auto(slst, cw, ns);
+            if (ns == 0) {
+                memcpy(wspace,cw,(wl+1));
+                mkallsmall(wspace);
+                ns = pSMgr->suggest_auto(slst, wspace, ns);
+            }
+            break;
+        }
+
+        case ALLCAP: {
+            // try the all-lower-case and the capitalized form, then
+            // give every suggestion the capitalization of the input
+            memcpy(wspace,cw,(wl+1));
+            mkallsmall(wspace);
+            ns = pSMgr->suggest_auto(slst, wspace, ns);
+
+            mkinitcap(wspace);
+            ns = pSMgr->suggest_auto(slst, wspace, ns);
+
+            for (int j=0; j < ns; j++)
+                mkallcap((*slst)[j]);
+            break;
+        }
+    }
+
+    // word reversing wrapper for complex prefixes
+    if (complexprefixes) {
+        for (int j = 0; j < ns; j++) {
+            if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]);
+        }
+    }
+
+    // expand suggestions with dot(s)
+    if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
+        // the last abbv characters of the original word
+        const char * dots = word + strlen(word) - abbv;
+        for (int j = 0; j < ns; j++) {
+            char * grown = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv);
+            // on allocation failure realloc leaves the old block valid:
+            // keep that suggestion unchanged instead of writing through NULL
+            if (!grown) continue;
+            strcat(grown, dots);
+            (*slst)[j] = grown;
+        }
+    }
+
+    // replace '-' with ' '
+    if (forbidden_compound == 2) {
+        for (int j=0; j < ns; j++) {
+            char * pos = strchr((*slst)[j],'-');
+            if (pos) *pos = ' ';
+        }
+    }
+    return ns;
+}
+
+// XXX need UTF-8 support
+/* stem(stems, word) - collect stems through SuggestMgr::suggest_stems()
+ * input: slst receives the list (*slst is reset to NULL first), word is
+ *   the raw input word
+ * output: the count handed back by the suggestion manager; 0 when there
+ *   is no suggestion manager, the word is too long for the work buffers
+ *   or cleanword() leaves nothing
+ */
+int Hunspell::stem(char*** slst, const char * word)
+{
+    if (! pSMgr) return 0;
+
+    // refuse words that would not fit the fixed-size work buffers
+    int wl = strlen(word);
+    if (wl >= (utf8 ? MAXWORDUTF8LEN : MAXWORDLEN)) return 0;
+
+    char cw[MAXWORDUTF8LEN + 4];
+    char wspace[MAXWORDUTF8LEN + 4];
+    int captype = 0;
+    int abbv = 0;
+    wl = cleanword(cw, word, &captype, &abbv);
+    if (wl == 0) return 0;
+
+    int ns = 0;
+    *slst = NULL; // HU, nsug in pSMgr->suggest
+
+    if ((captype == NOCAP) || (captype == HUHCAP)) {
+        ns = pSMgr->suggest_stems(slst, cw, ns);
+
+        // abbv set and nothing found: retry with a '.' appended
+        if (abbv && (ns == 0)) {
+            memcpy(wspace, cw, wl);
+            wspace[wl] = '.';
+            wspace[wl + 1] = '\0';
+            ns = pSMgr->suggest_stems(slst, wspace, ns);
+        }
+    } else if (captype == INITCAP) {
+        ns = pSMgr->suggest_stems(slst, cw, ns);
+
+        // nothing for the word as typed: try it lower-cased
+        if (ns == 0) {
+            memcpy(wspace, cw, (wl + 1));
+            mkallsmall(wspace);
+            ns = pSMgr->suggest_stems(slst, wspace, ns);
+        }
+
+        // still nothing: lower-cased word with a '.' appended
+        if (abbv && (ns == 0)) {
+            memcpy(wspace, cw, wl);
+            mkallsmall(wspace);
+            wspace[wl] = '.';
+            wspace[wl + 1] = '\0';
+            ns = pSMgr->suggest_stems(slst, wspace, ns);
+        }
+    } else if (captype == ALLCAP) {
+        ns = pSMgr->suggest_stems(slst, cw, ns);
+
+        // the fallbacks run only when the word as typed gave nothing
+        if (ns == 0) {
+            memcpy(wspace, cw, (wl + 1));
+            mkallsmall(wspace);
+            ns = pSMgr->suggest_stems(slst, wspace, ns);
+
+            if (ns == 0) {
+                mkinitcap(wspace);
+                ns = pSMgr->suggest_stems(slst, wspace, ns);
+            }
+
+            if (abbv && (ns == 0)) {
+                memcpy(wspace, cw, wl);
+                mkallsmall(wspace);
+                wspace[wl] = '.';
+                wspace[wl + 1] = '\0';
+                ns = pSMgr->suggest_stems(slst, wspace, ns);
+            }
+        }
+    }
+
+    return ns;
+}
+
+/* suggest_pos_stems(stems, word) - collect stems through
+ * SuggestMgr::suggest_pos_stems(), trying the case variants of the word
+ * input: slst receives the list (*slst is reset to NULL first), word is
+ *   the raw input word
+ * output: the count handed back by the suggestion manager; 0 when there
+ *   is no suggestion manager, the word is too long for the work buffers
+ *   or cleanword() leaves nothing
+ */
+int Hunspell::suggest_pos_stems(char*** slst, const char * word)
+{
+    if (! pSMgr) return 0;
+
+    // refuse words that would not fit the fixed-size work buffers
+    int wl = strlen(word);
+    if (wl >= (utf8 ? MAXWORDUTF8LEN : MAXWORDLEN)) return 0;
+
+    char cw[MAXWORDUTF8LEN + 4];
+    char wspace[MAXWORDUTF8LEN + 4];
+    int captype = 0;
+    int abbv = 0;
+    wl = cleanword(cw, word, &captype, &abbv);
+    if (wl == 0) return 0;
+
+    int ns = 0; // ns=0 = normalized input
+    *slst = NULL; // HU, nsug in pSMgr->suggest
+
+    if ((captype == NOCAP) || (captype == HUHCAP)) {
+        ns = pSMgr->suggest_pos_stems(slst, cw, ns);
+
+        // abbv set and nothing found: retry with a '.' appended
+        if (abbv && (ns == 0)) {
+            memcpy(wspace, cw, wl);
+            wspace[wl] = '.';
+            wspace[wl + 1] = '\0';
+            ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
+        }
+    } else if (captype == INITCAP) {
+        ns = pSMgr->suggest_pos_stems(slst, cw, ns);
+
+        // lower-case retry when nothing came back or the first entry
+        // starts with '#'
+        if ((ns == 0) || ((*slst)[0][0] == '#')) {
+            memcpy(wspace, cw, (wl + 1));
+            mkallsmall(wspace);
+            ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
+        }
+    } else if (captype == ALLCAP) {
+        ns = pSMgr->suggest_pos_stems(slst, cw, ns);
+
+        // the fallbacks run only when the word as typed gave nothing
+        if (ns == 0) {
+            memcpy(wspace, cw, (wl + 1));
+            mkallsmall(wspace);
+            ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
+
+            if (ns == 0) {
+                mkinitcap(wspace);
+                ns = pSMgr->suggest_pos_stems(slst, wspace, ns);
+            }
+        }
+    }
+
+    return ns;
+}
+
+// Return the stored `encoding` member itself (the pointer, not a copy).
+char * Hunspell::get_dic_encoding()
+{
+ return encoding;
+}
+
+// Forward to AffixMgr::get_wordchars(). pAMgr is dereferenced without a
+// NULL check here.
+const char * Hunspell::get_wordchars()
+{
+ return pAMgr->get_wordchars();
+}
+
+// Forward to AffixMgr::get_wordchars_utf16(), passing len through
+// (presumably an out-parameter for the element count - confirm in
+// affixmgr). pAMgr is dereferenced without a NULL check here.
+unsigned short * Hunspell::get_wordchars_utf16(int * len)
+{
+ return pAMgr->get_wordchars_utf16(len);
+}
+
+// Return the `prevroot` member: per-object state, so the value depends
+// on earlier calls made on this object.
+char * Hunspell::get_prevroot()
+{
+ return prevroot; // XXX not stateless, not for OOo
+}
+
+// Return the `prevcompound` member: per-object state, so the value
+// depends on earlier calls made on this object.
+int Hunspell::get_prevcompound()
+{
+ return prevcompound; // XXX not stateless, not for OOo
+}
+
+// Return the `forbidden_compound` member. The suggestion code replaces
+// '-' with ' ' in suggestions when this value is 2.
+int Hunspell::get_forbidden_compound()
+{
+ return forbidden_compound; // XXX not stateless, not for OOo
+}
+
+/* mkinitcap(p) - upper-case the first character of p in place
+ * 8-bit mode: the first byte is mapped through csconv[].cupper.
+ * UTF-8 mode: p is decoded with u8_u16(), the first character is mapped
+ * through utfconv[].cupper and the string is encoded back into p.
+ */
+void Hunspell::mkinitcap(char * p)
+{
+    if (!utf8) {
+        if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
+    } else {
+        w_char u[MAXWORDLEN];
+        int len = u8_u16(u, MAXWORDLEN, p);
+        // nothing was decoded: u[0] holds no valid character, so leave p
+        // untouched (mkinitcap2 applies the same nc > 0 guard)
+        if (len <= 0) return;
+        unsigned short i = utfconv[(u[0].h << 8) + u[0].l].cupper;
+        u[0].h = (unsigned char) (i >> 8);
+        u[0].l = (unsigned char) (i & 0x00FF);
+        u16_u8(p, MAXWORDUTF8LEN, u, len);
+    }
+}
+
+/* mkinitcap2(p, u, nc) - upper-case the first character of a word held
+ * both as a byte string (p) and as nc decoded characters (u)
+ * 8-bit mode: only p is changed and nc is returned.
+ * UTF-8 mode: u[0] is upper-cased, p is re-encoded from u and the new
+ * byte length of p is returned; with nc <= 0 nothing is changed and nc
+ * is returned.
+ */
+int Hunspell::mkinitcap2(char * p, w_char * u, int nc)
+{
+    if (utf8) {
+        if (nc <= 0) return nc;
+        const unsigned short upper = utfconv[(u[0].h << 8) + u[0].l].cupper;
+        u[0].h = (unsigned char) (upper >> 8);
+        u[0].l = (unsigned char) (upper & 0x00FF);
+        u16_u8(p, MAXWORDUTF8LEN, u, nc);
+        return strlen(p);
+    }
+
+    if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper;
+    return nc;
+}
+
+// Return the `csconv` table, the byte-indexed table whose cupper field
+// mkinitcap() uses in 8-bit mode.
+struct cs_info * Hunspell::get_csconv()
+{
+ return csconv;
+}
+
+// Return the `utfconv` table, indexed by 16-bit character value; its
+// cupper field is what mkinitcap() uses in UTF-8 mode.
+struct unicode_info2 * Hunspell::get_utf_conv()
+{
+ return utfconv;
+}
+
+// Hand word to HashMgr::put_word() with a NULL third argument and return
+// its result; returns 0 when there is no hash manager.
+int Hunspell::put_word(const char * word)
+{
+    if (!pHMgr) return 0;
+    return pHMgr->put_word(word, strlen(word), NULL);
+}
+
+// Hand word together with suffix to HashMgr::put_word() and return its
+// result; returns 0 when there is no hash manager. The const qualifier
+// of suffix is cast away for the call.
+int Hunspell::put_word_suffix(const char * word, const char * suffix)
+{
+    if (!pHMgr) return 0;
+    return pHMgr->put_word(word, strlen(word), (char *) suffix);
+}
+
+// Hand word and pattern to HashMgr::put_word_pattern() and return its
+// result; returns 0 when there is no hash manager.
+int Hunspell::put_word_pattern(const char * word, const char * pattern)
+{
+    if (!pHMgr) return 0;
+    return pHMgr->put_word_pattern(word, strlen(word), pattern);
+}
+
+// Forward to AffixMgr::get_version(). pAMgr is dereferenced without a
+// NULL check here.
+const char * Hunspell::get_version()
+{
+ return pAMgr->get_version();
+}
+
+// Bounded append for the MAXLNLEN-sized analysis buffers of morph() and
+// morph_with_correction(): adds s to result, preceded by sep when sep is
+// non-NULL and result is not empty. A piece that would not fit is dropped
+// as a whole, so result always stays NUL-terminated. s may be NULL.
+static void morph_buf_cat(char * result, const char * sep, const char * s)
+{
+    if (!s) return;
+    size_t used = strlen(result);
+    size_t seplen = (sep && used) ? strlen(sep) : 0;
+    size_t len = strlen(s);
+    if (used + seplen + len >= MAXLNLEN) return;
+    if (seplen) memcpy(result + used, sep, seplen);
+    memcpy(result + used + seplen, s, len + 1);
+}
+
+// Same as morph_buf_cat(), but takes ownership of the heap string st
+// (may be NULL) and releases it with free().
+static void morph_buf_take(char * result, const char * sep, char * st)
+{
+    morph_buf_cat(result, sep, st);
+    free(st);
+}
+
+// XXX need UTF-8 support
+/* morph(word) - morphological analysis of word
+ * The analyses delivered by SuggestMgr::suggest_morph() for the case
+ * variants of the word are joined with '\n'.
+ * output: a copy of the result made with mystrdup() (released with
+ *   free() inside this function when morph() calls itself), or NULL when
+ *   there is no suggestion manager, the word is too long for the work
+ *   buffers or no analysis was found
+ */
+char * Hunspell::morph(const char * word)
+{
+    char cw[MAXWORDUTF8LEN + 4];
+    char wspace[MAXWORDUTF8LEN + 4];
+    if (! pSMgr) return 0;
+    int wl = strlen(word);
+    if (utf8) {
+        if (wl >= MAXWORDUTF8LEN) return 0;
+    } else {
+        if (wl >= MAXWORDLEN) return 0;
+    }
+    int captype = 0;
+    int abbv = 0;
+    wl = cleanword(cw, word, &captype, &abbv);
+    if (wl == 0) {
+        if (abbv) {
+            // nothing but dots left: analyse the dots themselves
+            for (wl = 0; wl < abbv; wl++) cw[wl] = '.';
+            cw[wl] = '\0';
+            abbv = 0;
+        } else return 0;
+    }
+
+    char result[MAXLNLEN];
+    char * st = NULL;
+
+    *result = '\0';
+
+    int n = 0;
+    int n2 = 0;
+    int n3 = 0;
+
+    // test numbers
+    // LANG_hu section: set dash information for suggestions
+    if (langnum == LANG_hu) {
+        while ((n < wl) &&
+            (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) {
+            n++;
+            if ((cw[n] == '.') || (cw[n] == ',')) {
+                if (((n2 == 0) && (n > 3)) ||
+                    ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break;
+                n2++;
+                n3 = n;
+            }
+        }
+
+        if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return NULL;
+        if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='°')) && check(cw+n))) {
+            // keep the leading digits, analyse from the last digit on
+            morph_buf_cat(result, NULL, cw);
+            result[n - 1] = '\0';
+            if (n == wl) {
+                morph_buf_take(result, NULL, pSMgr->suggest_morph(cw + n - 1));
+            } else {
+                char sign = cw[n];
+                cw[n] = '\0';
+                morph_buf_take(result, NULL, pSMgr->suggest_morph(cw + n - 1));
+                morph_buf_cat(result, NULL, "+"); // XXX SPEC. MORPHCODE
+                cw[n] = sign;
+                morph_buf_take(result, NULL, pSMgr->suggest_morph(cw + n));
+            }
+            return mystrdup(result);
+        }
+    }
+    // END OF LANG_hu section
+
+    switch(captype) {
+        case NOCAP: {
+            morph_buf_take(result, "\n", pSMgr->suggest_morph(cw));
+            if (abbv) {
+                memcpy(wspace,cw,wl);
+                *(wspace+wl) = '.';
+                *(wspace+wl+1) = '\0';
+                morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+            }
+            break;
+        }
+        case INITCAP: {
+            memcpy(wspace,cw,(wl+1));
+            mkallsmall(wspace);
+            morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+            morph_buf_take(result, "\n", pSMgr->suggest_morph(cw));
+            if (abbv) {
+                memcpy(wspace,cw,wl);
+                *(wspace+wl) = '.';
+                *(wspace+wl+1) = '\0';
+                mkallsmall(wspace);
+                morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+                mkinitcap(wspace);
+                morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+            }
+            break;
+        }
+        case HUHCAP: {
+            morph_buf_take(result, "\n", pSMgr->suggest_morph(cw));
+#if 0
+            memcpy(wspace,cw,(wl+1));
+            mkallsmall(wspace);
+            morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+#endif
+            break;
+        }
+        case ALLCAP: {
+            memcpy(wspace,cw,(wl+1));
+            morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+            mkallsmall(wspace);
+            morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+            mkinitcap(wspace);
+            morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+            if (abbv) {
+                memcpy(wspace,cw,(wl+1));
+                *(wspace+wl) = '.';
+                *(wspace+wl+1) = '\0';
+                // the separator is written only when an analysis follows,
+                // so no empty line is left behind
+                morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+                mkallsmall(wspace);
+                morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+                mkinitcap(wspace);
+                morph_buf_take(result, "\n", pSMgr->suggest_morph(wspace));
+            }
+            break;
+        }
+    }
+
+    if (*result) {
+        // word reversing wrapper for complex prefixes
+        if (complexprefixes) {
+            if (utf8) reverseword_utf(result); else reverseword(result);
+        }
+        return mystrdup(result);
+    }
+
+    // compound word with dash (HU) I18n
+    char * dash;
+    int nresult = 0;
+    // LANG_hu section: set dash information for suggestions
+    if ((langnum == LANG_hu) && (dash=(char *) strchr(cw,'-'))) {
+        *dash='\0';
+        // examine 2 sides of the dash
+        if (dash[1] == '\0') { // base word ending with dash
+            if (spell(cw)) return pSMgr->suggest_morph(cw);
+        } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat.
+            if (spell(cw) && (spell("-e"))) {
+                morph_buf_take(result, NULL, pSMgr->suggest_morph(cw));
+                morph_buf_cat(result, NULL, "+"); // XXX spec. separator in MORPHCODE
+                morph_buf_take(result, NULL, pSMgr->suggest_morph("-e"));
+                return mystrdup(result);
+            }
+        } else {
+            // first word ending with dash: word- XXX ???
+            char r2 = *(dash + 1);
+            dash[0]='-';
+            dash[1]='\0';
+            nresult = spell(cw);
+            dash[1] = r2;
+            dash[0]='\0';
+            if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) ||
+                ((dash[1] > '0') && (dash[1] < '9')))) {
+                st = morph(cw);
+                if (st) {
+                    morph_buf_cat(result, NULL, st);
+                    free(st);
+                    morph_buf_cat(result, NULL, "+"); // XXX spec. separator in MORPHCODE
+                }
+                morph_buf_take(result, NULL, morph(dash+1));
+                return mystrdup(result);
+            }
+        }
+        // affixed number in correct word
+        if (nresult && (dash > cw) && (((*(dash-1)<='9') &&
+            (*(dash-1)>='0')) || (*(dash-1)=='.'))) {
+            *dash='-';
+            n = 1;
+            if (*(dash - n) == '.') n++;
+            // search first not a number character to left from dash
+            while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) {
+                n++;
+            }
+            if ((dash - n) < cw) n--;
+            // numbers: valami1000000-hoz
+            // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
+            // 56-hoz, 6-hoz
+            for(; n >= 1; n--) {
+                if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && check(dash - n)) {
+                    morph_buf_cat(result, NULL, cw);
+                    result[dash - cw - n] = '\0';
+                    morph_buf_take(result, NULL, pSMgr->suggest_morph(dash - n));
+                    return mystrdup(result);
+                }
+            }
+        }
+    }
+    return NULL;
+}
+
+// XXX need UTF-8 support
+/* morph_with_correction(word) - like morph(), but the analyses come from
+ * SuggestMgr::suggest_morph_for_spelling_error()
+ * output: a copy of the '\n'-joined result made with mystrdup(); when no
+ *   analysis was found this is a copy of the empty string, not NULL.
+ *   NULL (0) is returned only when there is no suggestion manager, the
+ *   word is too long or cleanword() leaves nothing
+ */
+char * Hunspell::morph_with_correction(const char * word)
+{
+    char cw[MAXWORDUTF8LEN + 4];
+    char wspace[MAXWORDUTF8LEN + 4];
+    if (! pSMgr) return 0;
+    int wl = strlen(word);
+    if (utf8) {
+        if (wl >= MAXWORDUTF8LEN) return 0;
+    } else {
+        if (wl >= MAXWORDLEN) return 0;
+    }
+    int captype = 0;
+    int abbv = 0;
+    wl = cleanword(cw, word, &captype, &abbv);
+    if (wl == 0) return 0;
+
+    char result[MAXLNLEN];
+
+    *result = '\0';
+
+    switch(captype) {
+        case NOCAP: {
+            morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(cw));
+            if (abbv) {
+                memcpy(wspace,cw,wl);
+                *(wspace+wl) = '.';
+                *(wspace+wl+1) = '\0';
+                morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+            }
+            break;
+        }
+        case INITCAP: {
+            memcpy(wspace,cw,(wl+1));
+            mkallsmall(wspace);
+            morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+            morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(cw));
+            if (abbv) {
+                memcpy(wspace,cw,wl);
+                *(wspace+wl) = '.';
+                *(wspace+wl+1) = '\0';
+                mkallsmall(wspace);
+                morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+                mkinitcap(wspace);
+                morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+            }
+            break;
+        }
+        case HUHCAP: {
+            morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(cw));
+            memcpy(wspace,cw,(wl+1));
+            mkallsmall(wspace);
+            morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+            break;
+        }
+        case ALLCAP: {
+            memcpy(wspace,cw,(wl+1));
+            morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+            mkallsmall(wspace);
+            morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+            mkinitcap(wspace);
+            morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+            if (abbv) {
+                memcpy(wspace,cw,(wl+1));
+                *(wspace+wl) = '.';
+                *(wspace+wl+1) = '\0';
+                // the separator is written only when an analysis follows,
+                // so no empty line is left behind
+                morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+                mkallsmall(wspace);
+                morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+                mkinitcap(wspace);
+                morph_buf_take(result, "\n", pSMgr->suggest_morph_for_spelling_error(wspace));
+            }
+            break;
+        }
+    }
+
+    // result is an array, so the former `if (result)` test was always true
+    return mystrdup(result);
+}
+
+/* analyze word
+ * return line count
+ * XXX need a better data structure for morphological analysis
+ *
+ * The analysis comes from morph(). With out == NULL the text is handed to
+ * line_tok(); otherwise each '\n'-separated line is copied, without its
+ * newline and NUL-terminated, into the caller-provided buffers
+ * (*out)[0], (*out)[1], ...
+ * Returns 0 when word is NULL or morph() finds nothing. */
+int Hunspell::analyze(char ***out, const char *word) {
+    int n = 0;
+    if (!word) return 0;
+    char * m = morph(word);
+    if(!m) return 0;
+    if (!out) {
+        // NOTE(review): out is NULL here and is still passed on to
+        // line_tok() - confirm that line_tok() accepts a NULL target.
+        n = line_tok(m, out);
+        free(m); // m is our own copy from morph(); it was leaked before
+        return n;
+    }
+
+    // without memory allocation
+    /* BUG missing buffer size checking */
+    int p = 0; // start of the current line in m
+    for (int i = 0; m[i]; i++) {
+        if (m[i] == '\n' || !m[i+1]) {
+            // a line ends at a newline (excluded) or at the last
+            // character of the text (included)
+            int len = (m[i] == '\n') ? i - p : i - p + 1;
+            memcpy((*out)[n], m + p, len);
+            (*out)[n][len] = '\0';
+            n++;
+            if (!m[i+1]) break;
+            p = i + 1;
+        }
+    }
+    free(m);
+    return n;
+}
+
diff --git a/src/myspell/hunspell.dsp b/src/myspell/hunspell.dsp
new file mode 100644
index 0000000..05e072f
--- /dev/null
+++ b/src/myspell/hunspell.dsp
@@ -0,0 +1,164 @@
+# Microsoft Developer Studio Project File - Name="hunspell" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Static Library" 0x0104
+
+CFG=hunspell - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE
+!MESSAGE NMAKE /f "hunspell.mak".
+!MESSAGE
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE
+!MESSAGE NMAKE /f "hunspell.mak" CFG="hunspell - Win32 Debug"
+!MESSAGE
+!MESSAGE Possible choices for configuration are:
+!MESSAGE
+!MESSAGE "hunspell - Win32 Release" (based on "Win32 (x86) Static Library")
+!MESSAGE "hunspell - Win32 Debug" (based on "Win32 (x86) Static Library")
+!MESSAGE
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF "$(CFG)" == "hunspell - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "Release"
+# PROP Intermediate_Dir "Release"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "W32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /YX /FD /c
+# ADD CPP /nologo /W3 /GX /O2 /D "W32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /YX /FD /c
+# ADD BASE RSC /l 0x40e /d "NDEBUG"
+# ADD RSC /l 0x40e /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo
+
+!ELSEIF "$(CFG)" == "hunspell - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "Debug"
+# PROP Intermediate_Dir "Debug"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "W32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /YX /FD /GZ /c
+# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "W32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x40e /d "_DEBUG"
+# ADD RSC /l 0x40e /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo
+
+!ENDIF
+
+# Begin Target
+
+# Name "hunspell - Win32 Release"
+# Name "hunspell - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=.\affentry.cxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\affixmgr.cxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\csutil.cxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\dictmgr.cxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\hashmgr.cxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\hunspell.cxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\suggestmgr.cxx
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\affentry.hxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\affixmgr.hxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\atypes.hxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\baseaffix.hxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\csutil.hxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\dictmgr.hxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\hashmgr.hxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\htypes.hxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\langnum.hxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\hunspell.hxx
+# End Source File
+# Begin Source File
+
+SOURCE=.\suggestmgr.hxx
+# End Source File
+# End Group
+# End Target
+# End Project
diff --git a/src/myspell/hunspell.hxx b/src/myspell/hunspell.hxx
new file mode 100644
index 0000000..5860fa8
--- /dev/null
+++ b/src/myspell/hunspell.hxx
@@ -0,0 +1,142 @@
+#include "hashmgr.hxx"
+#include "affixmgr.hxx"
+#include "suggestmgr.hxx"
+#include "csutil.hxx"
+#include "langnum.hxx"
+
+#define NOCAP 0
+#define INITCAP 1
+#define ALLCAP 2
+#define HUHCAP 3
+#define HUHINITCAP 4
+
+#define MAXSUGGESTION 15
+#define MAXSHARPS 5
+
+#ifdef W32
+#define DLLTEST2_API __declspec(dllexport)
+#endif
+
+#ifndef _MYSPELLMGR_HXX_
+#define _MYSPELLMGR_HXX_
+
+/* Hunspell - front end that combines the affix, hash and suggestion
+ * managers behind one spell checking interface.
+ */
+#ifdef W32
+class DLLTEST2_API Hunspell
+#else
+class Hunspell
+#endif
+{
+ // affix manager: queried for options (keepcase, sugswithdots, ...),
+ // word characters and version
+ AffixMgr* pAMgr;
+ // hash manager: the put_word*() functions store words through it
+ HashMgr* pHMgr;
+ // suggestion manager: suggest/stem/morph requests are forwarded to it
+ SuggestMgr* pSMgr;
+ // value handed out by get_dic_encoding()
+ char * encoding;
+ // byte-indexed case table, used when utf8 is 0
+ struct cs_info * csconv;
+ // case table indexed by 16-bit character value, used when utf8 is set
+ struct unicode_info2 * utfconv;
+ // compared with LANG_hu to enable the Hungarian-specific code
+ int langnum;
+ // non-zero: words are handled as UTF-8 strings
+ int utf8;
+ // non-zero: suggestions and analyses are reversed before being returned
+ int complexprefixes;
+ char** wordbreak;
+
+/* XXX not stateless variables for compound handling */
+ char * prevroot;
+ int prevcompound;
+
+/* forbidden_compound:
+ * 0 = not forbidden
+ * 1 = forbidden
+ * 2 = forbidden compound (written without dash in Hungarian)
+ */
+ int forbidden_compound;
+
+
+public:
+
+ /* Hunspell(aff, dic) - constructor of Hunspell class
+ * input: path of affix file and dictionary file
+ */
+
+ Hunspell(const char * affpath, const char * dpath);
+
+ ~Hunspell();
+
+ /* spell(word) - spellcheck word
+ * output: 0 = bad word, not 0 = good word
+ */
+
+ int spell(const char *);
+
+ /* suggest(suggestions, word) - search suggestions
+ * input: pointer to an array of string pointers and the (bad) word
+ * the array of string pointers (here *slst) need not be initialized
+ * output: number of suggestions in string array, and suggestions in
+ * a newly allocated array of strings (*slst will be NULL when number
+ * of suggestions equals 0.)
+ */
+
+ int suggest(char*** slst, const char * word);
+
+ /* handling custom dictionary */
+
+ int put_word(const char * word);
+
+ /* suffix is an affix flag string, similarly in dictionary files */
+
+ int put_word_suffix(const char * word, const char * suffix);
+
+ /* pattern is a sample dictionary word
+ * put word into custom dictionary with affix flags of pattern word
+ */
+
+ int put_word_pattern(const char * word, const char * pattern);
+
+ /* other */
+
+ char * get_dic_encoding();
+ const char * get_wordchars();
+ unsigned short * get_wordchars_utf16(int * len);
+ struct cs_info * get_csconv();
+ struct unicode_info2 * get_utf_conv();
+ const char * get_version();
+
+ /* experimental functions */
+
+ /* morphological analysis */
+
+ /* morph() returns a heap copy of the analysis (lines joined with '\n')
+ * or NULL; analyze() returns the number of lines
+ */
+ char * morph(const char * word);
+ int analyze(char*** out, const char *word);
+
+ char * morph_with_correction(const char * word);
+
+ /* stemmer function */
+
+ int stem(char*** slst, const char * word);
+
+ /* spec. suggestions */
+ int suggest_auto(char*** slst, const char * word);
+ int suggest_pos_stems(char*** slst, const char * word);
+ /* NOTE(review): no definition of get_possible_root() is visible in
+ * this part of the patch - confirm that one exists */
+ char * get_possible_root();
+
+ /* not threadsafe functions for Hunspell command line API */
+
+ char * get_prevroot();
+ int get_prevcompound();
+ int get_forbidden_compound();
+
+private:
+ /* helpers that clean a word and change its capitalization; the *2
+ * variants also take the word as decoded w_char characters
+ */
+ int cleanword(char *, const char *, int * pcaptype, int * pabbrev);
+ int cleanword2(char *, const char *, w_char *, int * w_len, int * pcaptype, int * pabbrev);
+ void mkinitcap(char *);
+ int mkinitcap2(char * p, w_char * u, int nc);
+ void mkallcap(char *);
+ int mkallcap2(char * p, w_char * u, int nc);
+ void mkallsmall(char *);
+ int mkallsmall2(char * p, w_char * u, int nc);
+ struct hentry * check(const char *);
+ char * sharps_u8_l1(char * dest, char * source);
+ hentry * spellsharps(char * base, char *, int, int, char * tmp);
+ int is_keepcase(const hentry * rv);
+ int insert_sug(char ***slst, char * word, int *ns);
+
+};
+
+#endif
diff --git a/src/myspell/myspell.cxx b/src/myspell/myspell.cxx
deleted file mode 100644
index fcdbaa1..0000000
--- a/src/myspell/myspell.cxx
+++ /dev/null
@@ -1,302 +0,0 @@
-#include "license.readme"
-
-#include <cstring>
-#include <cstdlib>
-#include <cstdio>
-
-#include "enchant_myspell.hxx"
-
-#ifndef WINDOWS
-using namespace std;
-#endif
-
-
-MySpell::MySpell(const char * affpath, const char * dpath)
-{
- encoding = NULL;
- csconv = NULL;
-
- /* first set up the hash manager */
- pHMgr = new HashMgr(dpath);
-
- /* next set up the affix manager */
- /* it needs access to the hash manager lookup methods */
- pAMgr = new AffixMgr(affpath,pHMgr);
-
- /* get the preferred try string and the dictionary */
- /* encoding from the Affix Manager for that dictionary */
- char * try_string = pAMgr->get_try_string();
- encoding = pAMgr->get_encoding();
- csconv = get_current_cs(encoding);
-
- /* and finally set up the suggestion manager */
- maxSug = 15;
- pSMgr = new SuggestMgr(try_string, maxSug, pAMgr);
- if (try_string) free(try_string);
-}
-
-
-MySpell::~MySpell()
-{
- if (pSMgr) delete pSMgr;
- if (pAMgr) delete pAMgr;
- if (pHMgr) delete pHMgr;
- pSMgr = NULL;
- pAMgr = NULL;
- pHMgr = NULL;
- csconv= NULL;
- if (encoding) free(encoding);
- encoding = NULL;
-}
-
-
-// make a copy of src at destination while removing all leading
-// blanks and removing any trailing periods after recording
-// their presence with the abbreviation flag
-// also since already going through character by character,
-// set the capitalization type
-// return the length of the "cleaned" word
-
-int MySpell::cleanword(char * dest, const char * src, int * pcaptype, int * pabbrev)
-{
-
- // with the new breakiterator code this should not be needed anymore
- const char * special_chars = "._#$%&()* +,-/:;<=>[]\\^`{|}~\t \x0a\x0d\x01\'\"";
-
- unsigned char * p = (unsigned char *) dest;
- const unsigned char * q = (const unsigned char * ) src;
-
- // first skip over any leading special characters
- while ((*q != '\0') && (strchr(special_chars,(int)(*q)))) q++;
-
- // now strip off any trailing special characters
- // if a period comes after a normal char record its presence
- *pabbrev = 0;
- int nl = strlen((const char *)q);
- while ((nl > 0) && (strchr(special_chars,(int)(*(q+nl-1))))) {
- nl--;
- }
- if ( *(q+nl) == '.' ) *pabbrev = 1;
-
- // if no characters are left it can't be an abbreviation and can't be capitalized
- if (nl <= 0) {
- *pcaptype = NOCAP;
- *pabbrev = 0;
- *p = '\0';
- return 0;
- }
-
- // now determine the capitalization type of the first nl letters
- int ncap = 0;
- int nneutral = 0;
- int nc = 0;
- while (nl > 0) {
- nc++;
- if (csconv[(*q)].ccase) ncap++;
- if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++;
- *p++ = *q++;
- nl--;
- }
- // remember to terminate the destination string
- *p = '\0';
-
- // now finally set the captype
- if (ncap == 0) {
- *pcaptype = NOCAP;
- } else if ((ncap == 1) && csconv[(unsigned char)(*dest)].ccase) {
- *pcaptype = INITCAP;
- } else if ((ncap == nc) || ((ncap + nneutral) == nc)){
- *pcaptype = ALLCAP;
- } else {
- *pcaptype = HUHCAP;
- }
- return nc;
-}
-
-
-int MySpell::spell(const char * word)
-{
- char * rv=NULL;
- char cw[MAXWORDLEN+1];
- char wspace[MAXWORDLEN+1];
-
- int wl = strlen(word);
- if (wl > (MAXWORDLEN - 1)) return 0;
- int captype = 0;
- int abbv = 0;
- wl = cleanword(cw, word, &captype, &abbv);
- if (wl == 0) return 1;
-
- switch(captype) {
- case HUHCAP:
- case NOCAP: {
- rv = check(cw);
- if ((abbv) && !(rv)) {
- memcpy(wspace,cw,wl);
- *(wspace+wl) = '.';
- *(wspace+wl+1) = '\0';
- rv = check(wspace);
- }
- break;
- }
-
- case ALLCAP: {
- memcpy(wspace,cw,(wl+1));
- mkallsmall(wspace, csconv);
- rv = check(wspace);
- if (!rv) {
- mkinitcap(wspace, csconv);
- rv = check(wspace);
- }
- if (!rv) rv = check(cw);
- if ((abbv) && !(rv)) {
- memcpy(wspace,cw,wl);
- *(wspace+wl) = '.';
- *(wspace+wl+1) = '\0';
- rv = check(wspace);
- }
- break;
- }
- case INITCAP: {
- memcpy(wspace,cw,(wl+1));
- mkallsmall(wspace, csconv);
- rv = check(wspace);
- if (!rv) rv = check(cw);
- if ((abbv) && !(rv)) {
- memcpy(wspace,cw,wl);
- *(wspace+wl) = '.';
- *(wspace+wl+1) = '\0';
- rv = check(wspace);
- }
- break;
- }
- }
- if (rv) return 1;
- return 0;
-}
-
-
-char * MySpell::check(const char * word)
-{
- struct hentry * he = NULL;
- if (pHMgr)
- he = pHMgr->lookup (word);
-
- if ((he == NULL) && (pAMgr)) {
- // try stripping off affixes */
- he = pAMgr->affix_check(word, strlen(word));
-
- // try check compound word
- if ((he == NULL) && (pAMgr->get_compound())) {
- he = pAMgr->compound_check(word, strlen(word), (pAMgr->get_compound())[0]);
- }
-
- }
-
- if (he) return he->word;
- return NULL;
-}
-
-
-
-int MySpell::suggest(char*** slst, const char * word)
-{
- char cw[MAXWORDLEN+1];
- char wspace[MAXWORDLEN+1];
- if (! pSMgr) return 0;
- int wl = strlen(word);
- if (wl > (MAXWORDLEN-1)) return 0;
- int captype = 0;
- int abbv = 0;
- wl = cleanword(cw, word, &captype, &abbv);
- if (wl == 0) return 0;
-
- int ns = 0;
- char ** wlst = (char **) calloc(maxSug, sizeof(char *));
- if (wlst == NULL) return 0;
-
- switch(captype) {
- case NOCAP: {
- ns = pSMgr->suggest(wlst, ns, cw);
- break;
- }
-
- case INITCAP: {
-
- memcpy(wspace,cw,(wl+1));
- mkallsmall(wspace, csconv);
- ns = pSMgr->suggest(wlst, ns, wspace);
- if (ns > 0) {
- for (int j=0; j < ns; j++)
- mkinitcap(wlst[j], csconv);
- }
- ns = pSMgr->suggest(wlst,ns,cw);
- break;
- }
-
- case HUHCAP: {
- ns = pSMgr->suggest(wlst, ns, cw);
- if (ns != -1) {
- memcpy(wspace,cw,(wl+1));
- mkallsmall(wspace, csconv);
- ns = pSMgr->suggest(wlst, ns, wspace);
- }
- break;
- }
-
- case ALLCAP: {
- memcpy(wspace,cw,(wl+1));
- mkallsmall(wspace, csconv);
- ns = pSMgr->suggest(wlst, ns, wspace);
- if (ns > 0) {
- for (int j=0; j < ns; j++)
- mkallcap(wlst[j], csconv);
- }
- if (ns != -1)
- ns = pSMgr->suggest(wlst, ns , cw);
- break;
- }
- }
- if (ns > 0) {
- *slst = wlst;
- return ns;
- }
- // try ngram approach since found nothing
- if (ns == 0) {
- ns = pSMgr->ngsuggest(wlst, cw, pHMgr);
- if (ns) {
- switch(captype) {
- case NOCAP: break;
- case HUHCAP: break;
- case INITCAP: {
- for (int j=0; j < ns; j++)
- mkinitcap(wlst[j], csconv);
- }
- break;
-
- case ALLCAP: {
- for (int j=0; j < ns; j++)
- mkallcap(wlst[j], csconv);
- }
- break;
- }
- *slst = wlst;
- return ns;
- }
- }
- if (ns < 0) {
- // we ran out of memory - we should free up as much as possible
- for (int i=0;i<maxSug; i++)
- if (wlst[i] != NULL) free(wlst[i]);
- }
- if (wlst) free(wlst);
- *slst = NULL;
- return 0;
-}
-
-
-char * MySpell::get_dic_encoding()
-{
- return encoding;
-}
-
diff --git a/src/myspell/myspell_checker.cpp b/src/myspell/myspell_checker.cpp
index f84358a..01e8845 100644
--- a/src/myspell/myspell_checker.cpp
+++ b/src/myspell/myspell_checker.cpp
@@ -38,11 +38,8 @@
#include "enchant.h"
#include "enchant-provider.h"
-#ifdef WITH_SYSTEM_MYSPELL
-#include <myspell.hxx>
-#else
-#include "enchant_myspell.hxx"
-#endif
+/* built against hunspell 1.1.3 on January 13, 2006 */
+#include "hunspell.hxx"
ENCHANT_PLUGIN_DECLARE("Myspell")
@@ -66,7 +63,7 @@ public:
private:
GIConv m_translate_in; /* Selected translation from/to Unicode */
GIConv m_translate_out;
- MySpell *myspell;
+ Hunspell *myspell;
};
/***************************************************************************/
@@ -257,7 +254,7 @@ MySpellChecker::requestDictionary(const char *szLang)
aff = g_strdup(dic);
int len_dic = strlen(dic);
strcpy(aff+len_dic-3, "aff");
- myspell = new MySpell(aff, dic);
+ myspell = new Hunspell(aff, dic);
g_free(dic);
g_free(aff);
char *enc = myspell->get_dic_encoding();
diff --git a/src/myspell/suggestmgr.cxx b/src/myspell/suggestmgr.cxx
index 4e9c051..fe451cc 100644
--- a/src/myspell/suggestmgr.cxx
+++ b/src/myspell/suggestmgr.cxx
@@ -1,4 +1,5 @@
-#include "license.readme"
+#include "license.hunspell"
+#include "license.myspell"
#include <cstdlib>
#include <cctype>
@@ -7,12 +8,10 @@
#include "suggestmgr.hxx"
-#ifndef WINDOWS
+#ifndef W32
using namespace std;
#endif
-extern char * mystrdup(const char *);
-
SuggestMgr::SuggestMgr(const char * tryme, int maxn,
AffixMgr * aptr)
@@ -21,13 +20,41 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn,
// register affix manager and check in string of chars to
// try when building candidate suggestions
pAMgr = aptr;
- ctry = mystrdup(tryme);
+
ctryl = 0;
- if (ctry)
- ctryl = strlen(ctry);
+ ctry = NULL;
+ ctry_utf = NULL;
+
maxSug = maxn;
- nosplitsugs=(0==1);
- if (pAMgr) pAMgr->get_nosplitsugs();
+ nosplitsugs = 0;
+ maxngramsugs = MAXNGRAMSUGS;
+
+ utf8 = 0;
+ utfconv = NULL;
+ complexprefixes = 0;
+
+ if (pAMgr) {
+ char * enc = pAMgr->get_encoding();
+ csconv = get_current_cs(enc);
+ free(enc);
+ nosplitsugs = pAMgr->get_nosplitsugs();
+ if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs();
+ utf8 = pAMgr->get_utf8();
+ utfconv = pAMgr->get_utf_conv();
+ complexprefixes = pAMgr->get_complexprefixes();
+ }
+
+ if (tryme) {
+ if (utf8) {
+ w_char t[MAXSWL];
+ ctryl = u8_u16(t, MAXSWL, tryme);
+ ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char));
+ memcpy(ctry_utf, t, ctryl * sizeof(w_char));
+ } else {
+ ctry = mystrdup(tryme);
+ ctryl = strlen(ctry);
+ }
+ }
}
@@ -36,6 +63,8 @@ SuggestMgr::~SuggestMgr()
pAMgr = NULL;
if (ctry) free(ctry);
ctry = NULL;
+ if (ctry_utf) free(ctry_utf);
+ ctry_utf = NULL;
ctryl = 0;
maxSug = 0;
}
@@ -45,67 +74,182 @@ SuggestMgr::~SuggestMgr()
// generate suggestions for a mispelled word
// pass in address of array of char * pointers
-int SuggestMgr::suggest(char** wlst, int ns, const char * word)
+int SuggestMgr::suggest(char*** slst, const char * w, int nsug)
{
+ int nocompoundtwowords = 0;
+ char ** wlst;
+ w_char word_utf[MAXSWL];
+ int wl;
+
+ char w2[MAXWORDUTF8LEN];
+ const char * word = w;
+
+ // word reversing wrapper for complex prefixes
+ if (complexprefixes) {
+ strcpy(w2, w);
+ if (utf8) reverseword_utf(w2); else reverseword(w2);
+ word = w2;
+ }
- int nsug = ns;
+ if (*slst) {
+ wlst = *slst;
+ } else {
+ wlst = (char **) malloc(maxSug * sizeof(char *));
+ if (wlst == NULL) return -1;
+ for (int i = 0; i < maxSug; i++) wlst[i] = NULL;
+ }
+
+ if (utf8) {
+ wl = u8_u16(word_utf, MAXSWL, word);
+ }
- // perhaps we made chose the wrong char from a related set
- if ((nsug < maxSug) && (nsug > -1))
- nsug = mapchars(wlst, word, nsug);
+ for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
// perhaps we made a typical fault of spelling
if ((nsug < maxSug) && (nsug > -1))
- nsug = replchars(wlst, word, nsug);
+ nsug = replchars(wlst, word, nsug, cpdsuggest);
- // did we forget to add a char
+ // perhaps we made chose the wrong char from a related set
if ((nsug < maxSug) && (nsug > -1))
- nsug = forgotchar(wlst, word, nsug);
+ nsug = mapchars(wlst, word, nsug, cpdsuggest);
// did we swap the order of chars by mistake
- if ((nsug < maxSug) && (nsug > -1))
- nsug = swapchar(wlst, word, nsug);
+ if ((nsug < maxSug) && (nsug > -1)) {
+ nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
+ swapchar(wlst, word, nsug, cpdsuggest);
+ }
+
+ // did we forget to add a char
+ if ((nsug < maxSug) && (nsug > -1)) {
+ nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
+ forgotchar(wlst, word, nsug, cpdsuggest);
+ }
// did we add a char that should not be there
- if ((nsug < maxSug) && (nsug > -1))
- nsug = extrachar(wlst, word, nsug);
-
+ if ((nsug < maxSug) && (nsug > -1)) {
+ nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
+ extrachar(wlst, word, nsug, cpdsuggest);
+ }
+
// did we just hit the wrong key in place of a good char
- if ((nsug < maxSug) && (nsug > -1))
- nsug = badchar(wlst, word, nsug);
+ if ((nsug < maxSug) && (nsug > -1)) {
+ nsug = (utf8) ? badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) :
+ badchar(wlst, word, nsug, cpdsuggest);
+ }
+
+ // only suggest compound words when no other suggestion
+ if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
// perhaps we forgot to hit space and two words ran together
- if (!nosplitsugs) {
- if ((nsug < maxSug) && (nsug > -1))
- nsug = twowords(wlst, word, nsug);
+ if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) {
+ nsug = twowords(wlst, word, nsug, cpdsuggest);
+ }
+
+ } // repeating ``for'' statement compounding support
+
+ if (nsug < 0) {
+ // we ran out of memory - we should free up as much as possible
+ for (int i = 0; i < maxSug; i++)
+ if (wlst[i] != NULL) free(wlst[i]);
+ free(wlst);
+ wlst = NULL;
}
+
+ *slst = wlst;
return nsug;
}
+// Generate suggestions for a word containing a typical mistake.
+//
+// slst  in/out: address of the suggestion list. When *slst is NULL a list
+//       of maxSug entries is allocated here; otherwise the caller's list
+//       is extended in place.
+// w     the misspelled word.
+// nsug  number of suggestions already stored in the list.
+//
+// Returns the new number of suggestions, or -1 on failure; after a failure
+// the list has been released and *slst is NULL.
+
+int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug)
+{
+    int nocompoundtwowords = 0;
+    char ** wlst;
+
+    char w2[MAXWORDUTF8LEN];
+    const char * word = w;
+
+    // word reversing wrapper for complex prefixes
+    if (complexprefixes) {
+        strcpy(w2, w);
+        if (utf8) reverseword_utf(w2); else reverseword(w2);
+        word = w2;
+    }
+
+    if (*slst) {
+        wlst = *slst;
+    } else {
+        wlst = (char **) malloc(maxSug * sizeof(char *));
+        if (wlst == NULL) return -1;
+        // clear every slot so that the failure path below only ever frees
+        // entries that were really stored (same as SuggestMgr::suggest)
+        for (int i = 0; i < maxSug; i++) wlst[i] = NULL;
+    }
+
+    // first pass (cpdsuggest == 0) is the normal one; the second pass runs
+    // with cpdsuggest == 1 only when the first pass found nothing
+    for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) {
+
+        // perhaps we made a typical fault of spelling
+        if ((nsug < maxSug) && (nsug > -1))
+            nsug = replchars(wlst, word, nsug, cpdsuggest);
+
+        // perhaps we chose the wrong char from a related set
+        if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0))
+            nsug = mapchars(wlst, word, nsug, cpdsuggest);
+
+        if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1;
+
+        // perhaps we forgot to hit space and two words ran together;
+        // only tried when check_forbidden() returns non-zero for the word
+        if ((nsug < maxSug) && (nsug > -1) && check_forbidden(word, strlen(word))) {
+            nsug = twowords(wlst, word, nsug, cpdsuggest);
+        }
+
+    } // repeating ``for'' statement compounding support
+
+    if (nsug < 0) {
+        // a helper failed - release everything
+        // NOTE(review): replchars() frees the entries it holds before
+        // returning -1 without clearing them, so this loop can free them a
+        // second time; that has to be fixed in replchars().
+        for (int i = 0; i < maxSug; i++)
+            if (wlst[i] != NULL) free(wlst[i]);
+        free(wlst);
+        // do not leave the caller with a pointer to the freed list
+        *slst = NULL;
+        return -1;
+    }
+
+    *slst = wlst;
+    return nsug;
+}
// suggestions for when chose the wrong char out of a related set
-int SuggestMgr::mapchars(char** wlst, const char * word, int ns)
+int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest)
{
+ time_t timelimit;
+ int timer;
+
int wl = strlen(word);
if (wl < 2 || ! pAMgr) return ns;
int nummap = pAMgr->get_nummap();
struct mapentry* maptable = pAMgr->get_maptable();
if (maptable==NULL) return ns;
- ns = map_related(word, 0, wlst, ns, maptable, nummap);
+
+ timelimit = time(NULL);
+ timer = MINTIMER;
+ if (utf8) {
+ w_char w[MAXSWL];
+ int len = u8_u16(w, MAXSWL, word);
+ ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
+ } else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit);
return ns;
}
-
-int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const mapentry* maptable, int nummap)
+int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns,
+ const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
{
- char c = *(word + i);
+ char c = *(word + i);
if (c == 0) {
int cwrd = 1;
+ int wl;
for (int m=0; m < ns; m++)
if (strcmp(word,wlst[m]) == 0) cwrd = 0;
- if ((cwrd) && check(word,strlen(word))) {
+ if ((cwrd) && (wl = strlen(word)) && (check(word, wl, 0, timer, timelimit) ||
+ check(word, wl, 1, timer, timelimit))) {
if (ns < maxSug) {
wlst[ns] = mystrdup(word);
if (wlst[ns] == NULL) return -1;
@@ -121,14 +265,55 @@ int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const
char * newword = mystrdup(word);
for (int k = 0; k < maptable[j].len; k++) {
*(newword + i) = *(maptable[j].set + k);
- ns = map_related(newword, (i+1), wlst, ns, maptable, nummap);
+ ns = map_related(newword, (i+1), wlst, ns, maptable, nummap, timer, timelimit);
+ if (!(*timelimit)) return ns;
}
free(newword);
}
}
if (!in_map) {
i++;
- ns = map_related(word, i, wlst, ns, maptable, nummap);
+ ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit);
+ }
+ return ns;
+}
+
+// UTF-16 variant of map_related(): walk the word from position i and, for
+// every character that belongs to a set in maptable, try each member of
+// that set in its place. Complete candidates (i == len) are converted to
+// UTF-8 and appended to wlst when they are not already listed and check()
+// accepts them with cpdsuggest 0 or 1.
+//
+// word is modified in place while candidates are built. Returns the new
+// suggestion count, or -1 when mystrdup() fails. The search stops early
+// once *timelimit has been zeroed (check() does that on timeout).
+int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns,
+    const mapentry* maptable, int nummap, int * timer, time_t * timelimit)
+{
+    if (i == len) {
+        int cwrd = 1;
+        int wl;
+        char s[MAXSWUTF8L];
+        u16_u8(s, MAXSWUTF8L, word, len);
+        for (int m=0; m < ns; m++)
+            if (strcmp(s,wlst[m]) == 0) cwrd = 0;
+        if ((cwrd) && (wl = strlen(s)) && (check(s, wl, 0, timer, timelimit) ||
+            check(s, wl, 1, timer, timelimit))) {
+            if (ns < maxSug) {
+                wlst[ns] = mystrdup(s);
+                if (wlst[ns] == NULL) return -1;
+                ns++;
+            }
+        }
+        return ns;
+    }
+    int in_map = 0;
+    unsigned short c = *((unsigned short *) word + i);
+    for (int j = 0; j < nummap; j++) {
+        if (flag_bsearch((unsigned short *) maptable[j].set_utf16, c, maptable[j].len)) {
+            in_map = 1;
+            for (int k = 0; k < maptable[j].len; k++) {
+                *(word + i) = *(maptable[j].set_utf16 + k);
+                ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit);
+                // propagate an allocation failure at once: continuing with
+                // ns == -1 would let the next leaf write to wlst[-1]
+                if (ns < 0) return ns;
+                if (!(*timelimit)) return ns;
+            }
+            // put the original character back before trying the next set
+            *((unsigned short *) word + i) = c;
+        }
+    }
+    if (!in_map) {
+        i++;
+        ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit);
}
return ns;
}
@@ -137,9 +322,9 @@ int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const
// suggestions for a typical fault of spelling, that
// differs with more, than 1 letter from the right form.
-int SuggestMgr::replchars(char** wlst, const char * word, int ns)
+int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest)
{
- char candidate[MAXSWL];
+ char candidate[MAXSWUTF8L];
const char * r;
int lenr, lenp;
int cwrd;
@@ -153,21 +338,24 @@ int SuggestMgr::replchars(char** wlst, const char * word, int ns)
for (int i=0; i < numrep; i++ ) {
r = word;
- lenr = strlen(reptable[i].replacement);
+ lenr = strlen(reptable[i].pattern2);
lenp = strlen(reptable[i].pattern);
// search every occurence of the pattern in the word
while ((r=strstr(r, reptable[i].pattern)) != NULL) {
strcpy(candidate, word);
- if (r-word + lenr + strlen(r+lenp) >= MAXSWL) break;
- strcpy(candidate+(r-word),reptable[i].replacement);
+ if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break;
+ strcpy(candidate+(r-word),reptable[i].pattern2);
strcpy(candidate+(r-word)+lenr, r+lenp);
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
- if ((cwrd) && check(candidate,strlen(candidate))) {
+ if ((cwrd) && check(candidate,strlen(candidate), cpdsuggest, NULL, NULL)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
- if (wlst[ns] == NULL) return -1;
+ if (wlst[ns] == NULL) {
+ for (int j=0; j<ns; j++) free(wlst[j]);
+ return -1;
+ }
ns++;
} else return ns;
}
@@ -177,16 +365,56 @@ int SuggestMgr::replchars(char** wlst, const char * word, int ns)
return ns;
}
+// perhaps we made a special pattern mistake
+// for example: vacation -> vacacation (doubled `ac')
+//
+// Scans the word for three consecutive positions i with
+// word[i] == word[i-2]; at the third one the two characters word[i-1] and
+// word[i] are removed and the result is offered as a suggestion if it is
+// not already in wlst and check() accepts it.
+//
+// Returns the new suggestion count, or -1 when mystrdup() fails.
+int SuggestMgr::doubledsyllable(char** wlst, const char * word, int ns, int cpdsuggest)
+{
+    char candidate[MAXSWUTF8L];
+    int state=0;
+    int cwrd;
+
+    int wl = strlen(word);
+    if (wl < 5 || ! pAMgr) return ns;
+
+    for (int i=2; i < wl; i++ ) {
+        if (word[i]==word[i-2]) {
+            state++;
+            if (state==3) {
+                // drop word[i-1] and word[i]
+                strcpy(candidate,word);
+                strcpy(candidate+i-1,word+i+1);
+                cwrd = 1;
+                for (int k=0; k < ns; k++)
+                    if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
+                if ((cwrd) && check(candidate,strlen(candidate), cpdsuggest, NULL, NULL)) {
+                    if (ns < maxSug) {
+                        wlst[ns] = mystrdup(candidate);
+                        if (wlst[ns] == NULL) {
+                            // clear each slot after freeing it, so a caller
+                            // that frees the non-NULL entries of the list
+                            // does not free them a second time
+                            for (int j=0; j<ns; j++) {
+                                free(wlst[j]);
+                                wlst[j] = NULL;
+                            }
+                            return -1;
+                        }
+                        ns++;
+                    } else return ns;
+                }
+                state=0;
+            }
+        } else {
+            state=0;
+        }
+    }
+    return ns;
+}
// error is wrong char in place of correct one
-int SuggestMgr::badchar(char ** wlst, const char * word, int ns)
+int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest)
{
char tmpc;
- char candidate[MAXSWL];
+ char candidate[MAXSWUTF8L];
+ time_t timelimit = time(NULL);
+ int timer = MINTIMER;
int wl = strlen(word);
int cwrd;
- strcpy (candidate, word);
+ strcpy(candidate, word);
// swap out each char one by one and try all the tryme
// chars in its place to see if that makes a good word
@@ -198,24 +426,92 @@ int SuggestMgr::badchar(char ** wlst, const char * word, int ns)
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
- if ((cwrd) && check(candidate,wl)) {
+ if ((cwrd) && check(candidate,wl, cpdsuggest, &timer, &timelimit)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
} else return ns;
}
+ if (!timelimit) return ns;
candidate[i] = tmpc;
}
}
return ns;
}
+// error is wrong char in place of correct one
+int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
+{
+ w_char tmpc;
+ w_char candidate_utf[MAXSWL];
+ char candidate[MAXSWUTF8L];
+ int cwrd;
+ time_t timelimit = time(NULL);
+ int timer = MINTIMER;
+
+ memcpy(candidate_utf, word, wl * sizeof(w_char));
+
+ // swap out each char one by one and try all the tryme
+ // chars in its place to see if that makes a good word
+ for (int i=0; i < wl; i++) {
+ tmpc = candidate_utf[i];
+ for (int j=0; j < ctryl; j++) {
+ if ((ctry_utf[j].l == tmpc.l) && (ctry_utf[j].h == tmpc.h)) continue;
+ candidate_utf[i] = ctry_utf[j];
+ cwrd = 1;
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
+ for (int k=0; k < ns; k++)
+ if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
+ if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, &timer, &timelimit)) {
+ if (ns < maxSug) {
+ wlst[ns] = mystrdup(candidate);
+ if (wlst[ns] == NULL) return -1;
+ ns++;
+ } else return ns;
+ }
+ if (!timelimit) return ns;
+ candidate_utf[i] = tmpc;
+ }
+ }
+ return ns;
+}
+
+// error is word has an extra letter it does not need
+int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
+{
+ char candidate[MAXSWUTF8L];
+ w_char candidate_utf[MAXSWL];
+
+ const w_char * p;
+ w_char * r;
+ int cwrd;
+
+ if (wl < 2) return ns;
+
+ // try omitting one char of word at a time
+ memcpy(candidate_utf, word + 1, (wl - 1) * sizeof(w_char));
+ for (p = word, r = candidate_utf; p < word + wl; ) {
+ cwrd = 1;
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1);
+ for (int k=0; k < ns; k++)
+ if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
+ if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
+ if (ns < maxSug) {
+ wlst[ns] = mystrdup(candidate);
+ if (wlst[ns] == NULL) return -1;
+ ns++;
+ } else return ns;
+ }
+ *r++ = *p++;
+ }
+ return ns;
+}
// error is word has an extra letter it does not need
-int SuggestMgr::extrachar(char** wlst, const char * word, int ns)
+int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest)
{
- char candidate[MAXSWL];
+ char candidate[MAXSWUTF8L];
const char * p;
char * r;
int cwrd;
@@ -229,7 +525,7 @@ int SuggestMgr::extrachar(char** wlst, const char * word, int ns)
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
- if ((cwrd) && check(candidate,wl-1)) {
+ if ((cwrd) && check(candidate,wl-1, cpdsuggest, NULL, NULL)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
@@ -242,13 +538,15 @@ int SuggestMgr::extrachar(char** wlst, const char * word, int ns)
}
-// error is mising a letter it needs
-int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns)
+// error is missing a letter it needs
+int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest)
{
- char candidate[MAXSWL];
+ char candidate[MAXSWUTF8L];
const char * p;
char * q;
int cwrd;
+ time_t timelimit = time(NULL);
+ int timer = MINTIMER;
int wl = strlen(word);
@@ -260,13 +558,14 @@ int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns)
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
- if ((cwrd) && check(candidate,wl+1)) {
+ if ((cwrd) && check(candidate, wl+1, cpdsuggest, &timer, &timelimit)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
ns++;
} else return ns;
}
+ if (!timelimit) return ns;
}
*q++ = *p++;
}
@@ -277,7 +576,57 @@ int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns)
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
- if ((cwrd) && check(candidate,wl+1)) {
+ if ((cwrd) && check(candidate,wl+1, cpdsuggest, NULL, NULL)) {
+ if (ns < maxSug) {
+ wlst[ns] = mystrdup(candidate);
+ if (wlst[ns] == NULL) return -1;
+ ns++;
+ } else return ns;
+ }
+ }
+ return ns;
+}
+
+// error is missing a letter it needs
+int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
+{
+ w_char candidate_utf[MAXSWL];
+ char candidate[MAXSWUTF8L];
+ const w_char * p;
+ w_char * q;
+ int cwrd;
+ time_t timelimit = time(NULL);
+ int timer = MINTIMER;
+
+ // try inserting a tryme character before every letter
+ memcpy (candidate_utf + 1, word, wl * sizeof(w_char));
+ for (p = word, q = candidate_utf; p < (word + wl); ) {
+ for (int i = 0; i < ctryl; i++) {
+ *q = ctry_utf[i];
+ cwrd = 1;
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
+ for (int k=0; k < ns; k++)
+ if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
+ if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, &timer, &timelimit)) {
+ if (ns < maxSug) {
+ wlst[ns] = mystrdup(candidate);
+ if (wlst[ns] == NULL) return -1;
+ ns++;
+ } else return ns;
+ }
+ if (!timelimit) return ns;
+ }
+ *q++ = *p++;
+ }
+
+ // now try adding one to end */
+ for (int i = 0; i < ctryl; i++) {
+ *q = ctry_utf[i];
+ cwrd = 1;
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1);
+ for (int k=0; k < ns; k++)
+ if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
+ if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
@@ -290,27 +639,51 @@ int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns)
/* error is should have been two words */
-int SuggestMgr::twowords(char ** wlst, const char * word, int ns)
+int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest)
{
- char candidate[MAXSWL];
+ char candidate[MAXSWUTF8L];
char * p;
+ int c1, c2, cwrd;
+ int forbidden = 0;
int wl=strlen(word);
- if (wl < 3) return ns;
+ if (wl < 4) return ns;
+
+ if (pAMgr->get_langnum() == LANG_hu) forbidden = check_forbidden(word, wl);
+
strcpy(candidate + 1, word);
+ candidate[0] = word[0];
// split the string into two pieces after every char
// if both pieces are good words make them a suggestion
- for (p = candidate + 1; p[1] != '\0'; p++) {
+ for (p = candidate + 2; p[2] != '\0'; p++) {
p[-1] = *p;
+ // go to end of the UTF-8 character
+ while (utf8 && ((p[1] & 0xc0) == 0x80)) {
+ p++;
+ p[-1] = *p;
+ }
*p = '\0';
- if (check(candidate,strlen(candidate))) {
- if (check((p+1),strlen(p+1))) {
- *p = ' ';
+ if ((c1=check(candidate,strlen(candidate), cpdsuggest, NULL, NULL))) {
+ if ((c2=check((p+1),strlen(p+1), cpdsuggest, NULL, NULL))) {
+ *p = ' ';
+
+ // spec. Hungarian code (need a better compound word support)
+ if ((pAMgr->get_langnum() == LANG_hu) && !forbidden &&
+ // if 3 repeating letter, use - instead of space
+ (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) ||
+ // or multiple compounding, with more, than 6 syllables
+ ((c1 == 3) && (c2 >= 2)))) *p = '-';
+
+ cwrd = 1;
+ for (int k=0; k < ns; k++)
+ if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
if (ns < maxSug) {
- wlst[ns] = mystrdup(candidate);
- if (wlst[ns] == NULL) return -1;
- ns++;
+ if (cwrd) {
+ wlst[ns] = mystrdup(candidate);
+ if (wlst[ns] == NULL) return -1;
+ ns++;
+ }
} else return ns;
}
}
@@ -320,14 +693,14 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns)
// error is adjacent letter were swapped
-int SuggestMgr::swapchar(char ** wlst, const char * word, int ns)
+int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest)
{
- char candidate[MAXSWL];
+ char candidate[MAXSWUTF8L];
char * p;
char tmpc;
int cwrd;
- int wl = strlen(word);
+ int wl=strlen(word);
// try swapping adjacent chars one by one
strcpy(candidate, word);
@@ -338,7 +711,7 @@ int SuggestMgr::swapchar(char ** wlst, const char * word, int ns)
cwrd = 1;
for (int k=0; k < ns; k++)
if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
- if ((cwrd) && check(candidate,wl)) {
+ if ((cwrd) && check(candidate,wl, cpdsuggest, NULL, NULL)) {
if (ns < maxSug) {
wlst[ns] = mystrdup(candidate);
if (wlst[ns] == NULL) return -1;
@@ -352,9 +725,41 @@ int SuggestMgr::swapchar(char ** wlst, const char * word, int ns)
return ns;
}
+// error is adjacent letter were swapped
+int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest)
+{
+ w_char candidate_utf[MAXSWL];
+ char candidate[MAXSWUTF8L];
+ w_char * p;
+ w_char tmpc;
+ int cwrd;
+
+ // try swapping adjacent chars one by one
+ memcpy (candidate_utf, word, wl * sizeof(w_char));
+ for (p = candidate_utf; p < (candidate_utf + wl - 1); p++) {
+ tmpc = *p;
+ *p = p[1];
+ p[1] = tmpc;
+ cwrd = 1;
+ u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl);
+ for (int k=0; k < ns; k++)
+ if (strcmp(candidate,wlst[k]) == 0) cwrd = 0;
+ if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) {
+ if (ns < maxSug) {
+ wlst[ns] = mystrdup(candidate);
+ if (wlst[ns] == NULL) return -1;
+ ns++;
+ } else return ns;
+ }
+ tmpc = *p;
+ *p = p[1];
+ p[1] = tmpc;
+ }
+ return ns;
+}
// generate a set of suggestions for very poorly spelled words
-int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
+int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr)
{
int i, j;
@@ -374,14 +779,32 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
}
lp = MAX_ROOTS - 1;
- int n = strlen(word);
+ char w2[MAXWORDUTF8LEN];
+ char * word = w;
+
+ // word reversing wrapper for complex prefixes
+ if (complexprefixes) {
+ strcpy(w2, w);
+ if (utf8) reverseword_utf(w2); else reverseword(w2);
+ word = w2;
+ }
+
+ char mw[MAXSWUTF8L];
+ w_char u8[MAXSWL];
+ int nc = strlen(word);
+ int n = (utf8) ? u8_u16(u8, MAXSWL, word) : nc;
struct hentry* hp = NULL;
int col = -1;
while ((hp = pHMgr->walk_hashtable(col, hp))) {
+ // check forbidden words
+ if ((hp->astr) && (pAMgr) &&
+ (TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) ||
+ TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) ||
+ TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue;
sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);
if (sc > scores[lp]) {
- scores[lp] = sc;
+ scores[lp] = sc;
roots[lp] = hp;
int lval = sc;
for (j=0; j < MAX_ROOTS; j++)
@@ -396,14 +819,17 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
// mangle original word three differnt ways
// and score them to generate a minimum acceptable score
int thresh = 0;
- char * mw = NULL;
for (int sp = 1; sp < 4; sp++) {
- mw = mystrdup(word);
- for (int k=sp; k < n; k+=4) *(mw + k) = '*';
- thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
- free(mw);
+ if (utf8) {
+ for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*';
+ u16_u8(mw, MAXSWUTF8L, u8, n);
+ thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
+ } else {
+ strcpy(mw, word);
+ for (int k=sp; k < n; k+=4) *(mw + k) = '*';
+ thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
+ }
}
- mw = NULL;
thresh = thresh / 3;
thresh--;
@@ -428,99 +854,722 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
if (roots[i]) {
struct hentry * rp = roots[i];
int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,
- rp->astr, rp->alen);
- for (int k = 0; k < nw; k++) {
+ rp->astr, rp->alen, word, nc);
+
+ for (int k = 0; k < nw ; k++) {
sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);
- if (sc > thresh)
- {
- if (sc > gscore[lp])
- {
- if (guess[lp]) free(guess[lp]);
- gscore[lp] = sc;
- guess[lp] = glst[k].word;
- glst[k].word = NULL;
- lval = sc;
- for (j=0; j < MAX_GUESS; j++)
- {
- if (gscore[j] < lval)
- {
- lp = j;
- lval = gscore[j];
- }
- }
- }
- }
- free (glst[k].word);
- glst[k].word = NULL;
- glst[k].allow = 0;
+ if ((sc > thresh)) {
+ if (sc > gscore[lp]) {
+ if (guess[lp]) free (guess[lp]);
+ gscore[lp] = sc;
+ guess[lp] = glst[k].word;
+ lval = sc;
+ for (j=0; j < MAX_GUESS; j++)
+ if (gscore[j] < lval) {
+ lp = j;
+ lval = gscore[j];
+ }
+ } else free (glst[k].word);
+ } else free(glst[k].word);
}
}
}
- if (glst) free(glst);
+ free(glst);
// now we are done generating guesses
- // sort in order of decreasing score and copy over
+ // sort in order of decreasing score
bubblesort(&guess[0], &gscore[0], MAX_GUESS);
+
+ // weight suggestions with a similarity index, based on
+ // the longest common subsequent algorithm and resort
+
+ int is_swap;
+ for (i=0; i < MAX_GUESS; i++) {
+ if (guess[i]) {
+ // lowering guess[i]
+ char gl[MAXSWUTF8L];
+ int len;
+ if (utf8) {
+ w_char w[MAXSWL];
+ len = u8_u16(w, MAXSWL, guess[i]);
+ mkallsmall_utf(w, len, utfconv);
+ u16_u8(gl, MAXSWUTF8L, w, len);
+ } else {
+ strcpy(gl, guess[i]);
+ mkallsmall(gl, csconv);
+ len = strlen(guess[i]);
+ }
+
+ int lcs = lcslen(word, gl);
+
+ // same characters with different casing
+ if ((n == len) && (n == lcs)) {
+ gscore[i] += 2000;
+ break;
+ }
+
+ // heuristic weigthing of ngram scores
+ gscore[i] +=
+ // length of longest common subsequent minus lenght difference
+ 2 * lcs - abs((int) (n - len)) +
+ // weight equal first letter
+ equalfirstletter(word, gl) +
+ // weight equal character positions
+ ((lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) +
+ // swap character (not neighboring)
+ ((is_swap) ? 1000 : 0);
+ }
+ }
+
+ bubblesort(&guess[0], &gscore[0], MAX_GUESS);
+
+ // copy over
+
int ns = 0;
+ int same = 0;
for (i=0; i < MAX_GUESS; i++) {
if (guess[i]) {
- int unique = 1;
- for (j=i+1; j < MAX_GUESS; j++)
- if (guess[j])
- if (!strcmp(guess[i], guess[j])) unique = 0;
- if (unique) {
- wlst[ns++] = guess[i];
- } else {
- free(guess[i]);
- }
+ if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) {
+ int unique = 1;
+ // we have excellent suggestion(s)
+ if (gscore[i] > 1000) same = 1;
+ for (j=0; j < ns; j++)
+ // don't suggest previous suggestions or a previous suggestion with prefixes or affixes
+ if (strstr(guess[i], wlst[j]) ||
+ // check forbidden words
+ !check(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0;
+ if (unique) wlst[ns++] = guess[i]; else free(guess[i]);
+ } else free(guess[i]);
}
}
+
return ns;
}
-
-
// see if a candidate suggestion is spelled correctly
// needs to check both root words and words with affixes
-int SuggestMgr::check(const char * word, int len)
+
+// obsolete MySpell-HU modifications:
+// return values 2 and 3 mark compounding with a hyphen (-)
+// `3' marks roots without suffix
+int SuggestMgr::check(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit)
{
struct hentry * rv=NULL;
+ int nosuffix = 0;
+
+ // check time limit
+ if (timer) {
+ (*timer)--;
+ if (!(*timer) && timelimit) {
+ if (time(NULL) > *timelimit) {
+ *timelimit = 0;
+ return 0;
+ }
+ *timer = MAXPLUSTIMER;
+ }
+ }
+
if (pAMgr) {
+ if (cpdsuggest==1) {
+ if (pAMgr->get_compound()) {
+ rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1);
+ if (rv) return 3; // XXX obsolote categorisation
+ }
+ return 0;
+ }
+
rv = pAMgr->lookup(word);
- if (rv == NULL) rv = pAMgr->affix_check(word,len);
+
+ if (rv) {
+ if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
+ || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0;
+ if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) ||
+ TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL;
+ } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX
+
+ if (rv) {
+ nosuffix=1;
+ } else {
+ rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suffix
+ }
+
+ if (!rv && pAMgr->have_contclass()) {
+ rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL);
+ if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL);
+ }
+
+ // check forbidden words
+ if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)
+ || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) ||
+ TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0;
+
+ if (rv) { // XXX obsolote
+ if ((pAMgr->get_compoundflag()) &&
+ TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nosuffix;
+ return 1;
+ }
}
- if (rv) return 1;
return 0;
}
+// Report whether `word` resolves to an entry carrying the forbidden-word flag.
+// Returns 1 when it does, 0 otherwise (including when no affix manager is set).
+int SuggestMgr::check_forbidden(const char * word, int len)
+{
+    if (!pAMgr) return 0;
+
+    struct hentry * entry = pAMgr->lookup(word);
+
+    // pseudoroot and onlyincompound entries do not count as a dictionary hit
+    if (entry && entry->astr) {
+        if (TESTAFF(entry->astr, pAMgr->get_pseudoroot(), entry->alen) ||
+            TESTAFF(entry->astr, pAMgr->get_onlyincompound(), entry->alen)) {
+            entry = NULL;
+        }
+    }
+
+    // when no prefix analysis exists, the suffix analysis replaces the entry
+    if (!(pAMgr->prefix_check(word, len, 1))) {
+        entry = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix
+    }
+
+    // check forbidden words
+    if (!entry || !entry->astr) return 0;
+    if (TESTAFF(entry->astr, pAMgr->get_forbiddenword(), entry->alen)) return 1;
+    return 0;
+}
+
+// suggest stems, XXX experimental code
+//
+// Collects stem suggestions for `w` into *slst starting at index nsug.
+//   slst - in/out suggestion list; allocated here (maxSug slots) when *slst
+//          is NULL
+//   w    - word to stem (reversed internally when complexprefixes is set)
+//   nsug - number of suggestions already present in *slst
+// Returns the new suggestion count, or -1 on allocation failure; on failure
+// the list and its entries are freed and *slst is reset to NULL so the caller
+// is not left with a dangling pointer.
+int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug)
+{
+    char buf[MAXSWUTF8L];
+    char ** wlst;
+    int prevnsug = nsug;
+
+    char w2[MAXWORDUTF8LEN];
+    const char * word = w;
+
+    // word reversing wrapper for complex prefixes
+    if (complexprefixes) {
+        strcpy(w2, w);
+        if (utf8) reverseword_utf(w2); else reverseword(w2);
+        word = w2;
+    }
+
+    if (*slst) {
+        wlst = *slst;
+    } else {
+        wlst = (char **) calloc(maxSug, sizeof(char *));
+        if (wlst == NULL) return -1;
+    }
+
+    // perhaps there is a fixed stem in the dictionary
+    if ((nsug < maxSug) && (nsug > -1)) {
+
+        nsug = fixstems(wlst, word, nsug);
+        if (nsug == prevnsug) {
+            // nothing found: retry with the parts around the last hyphen
+            char * s = mystrdup(word);
+            if (s == NULL) {
+                nsug = -1;
+            } else {
+                char * p = s + strlen(s);
+                while ((*p != '-') && (p != s)) p--;
+                if (*p == '-') {
+                    *p = '\0';
+                    nsug = fixstems(wlst, s, nsug);
+                    if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) {
+                        const char * t;
+                        buf[0] = '\0';
+                        // is a number? (both bounds must hold; the former `||`
+                        // made this test always true)
+                        for (t = s; (t[0] != '\0') && ((t[0] >= '0') && (t[0] <= '9')); t++);
+                        if (*t != '\0') strcpy(buf, "# ");
+                        strncat(buf, s, sizeof(buf) - strlen(buf) - 1);
+                        wlst[nsug] = mystrdup(buf);
+                        if (wlst[nsug] == NULL) nsug = -1; else nsug++;
+                    }
+                    // do not call fixstems() again once an error was reported
+                    if (nsug >= 0) nsug = fixstems(wlst, p + 1, nsug);
+                }
+
+                free(s);
+            }
+        }
+    }
+
+    if (nsug < 0) {
+        for (int i = 0; i < maxSug; i++)
+            if (wlst[i] != NULL) free(wlst[i]);
+        free(wlst);
+        *slst = NULL;
+        return -1;
+    }
+
+    *slst = wlst;
+    return nsug;
+}
+
+
+// Append the stem of a dictionary entry to buf (capacity bufsize bytes).
+// When the entry's description starts with a word character, the leading run
+// of word characters of the description is the stem ("special stem in affix
+// description"); otherwise the entry word itself is appended.
+// The scan stops at the terminating NUL: strchr() also matches '\0', so the
+// terminator has to be tested explicitly.
+static void fixstems_append_stem(char * buf, size_t bufsize, const char * description,
+                                 const char * entryword, const char * wordchars)
+{
+    size_t used = strlen(buf);
+    if (used + 1 >= bufsize) return;
+    size_t room = bufsize - used - 1;
+
+    if (description && *description && wordchars && strchr(wordchars, *description)) {
+        size_t n = 1;
+        while (description[n] && strchr(wordchars, description[n])) n++;
+        strncat(buf, description, (n < room) ? n : room);
+    } else if (entryword) {
+        strncat(buf, entryword, room);
+    }
+}
+
+// there are fix stems in dictionary
+//
+// Looks `word` up (dictionary, then affix analysis, then compound analysis)
+// and appends the resulting stem(s) to wlst starting at index ns.
+//   wlst - suggestion list with room for maxSug entries
+//   word - word to analyse
+//   ns   - number of entries already in wlst
+// Returns the new entry count, or -1 when duplicating a stem fails.
+// The former trailing `while (rv)` loop was unreachable (`if (0) ... else
+// return ns;`) and has been removed together with its unused locals.
+int SuggestMgr::fixstems(char ** wlst, const char * word, int ns)
+{
+    char buf[MAXSWUTF8L];
+    char prefix[MAXSWUTF8L] = "";
+
+    int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound
+    int cpdindex = 0;
+    struct hentry * rv = NULL;
+
+    int wl = strlen(word);
+    int cmpdstemnum = 0;
+    int cmpdstem[MAXCOMPOUND];
+
+    // a negative count is an error marker from the caller; never index with it
+    if (ns < 0) return ns;
+
+    if (pAMgr) {
+        rv = pAMgr->lookup(word);
+        if (rv) {
+            dicstem = 0;
+        } else {
+            // try stripping off affixes
+            rv = pAMgr->affix_check(word, wl);
+
+            // else try check compound word
+            if (!rv && pAMgr->get_compound()) {
+                rv = pAMgr->compound_check(word, wl,
+                        0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem, 1);
+
+                if (rv) {
+                    dicstem = 2;
+                    for (int j = 0; j < cmpdstemnum; j++) {
+                        cpdindex += cmpdstem[j];
+                    }
+                    // result intentionally ignored: called for its side
+                    // effect on the affix manager state read below
+                    if (!(pAMgr->lookup(word + cpdindex)))
+                        pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix
+                }
+            }
+
+            if (pAMgr->get_prefix()) {
+                strcpy(prefix, pAMgr->get_prefix());
+            }
+
+            // XXX obsolete, will be a general solution for stemming
+            if (strncmp(prefix, "leg", 3) == 0) prefix[0] = '\0'; // (HU)
+        }
+    }
+
+    if (!rv || (ns >= maxSug)) return ns;
+
+    // check fixstem flag and not_valid_stem flag
+    // first word
+    if (dicstem < 2) {
+        strcpy(buf, prefix);
+        if ((dicstem > 0) && pAMgr->get_derived()) {
+            // XXX obsolete
+            if (strlen(prefix) == 1) {
+                strcat(buf, (pAMgr->get_derived()) + 1);
+            } else {
+                strcat(buf, pAMgr->get_derived());
+            }
+        } else {
+            // special stem in affix description
+            fixstems_append_stem(buf, sizeof(buf), rv->description, rv->word,
+                                 pAMgr->get_wordchars());
+        }
+        wlst[ns] = mystrdup(buf);
+        if (wlst[ns] == NULL) return -1;
+        ns++;
+    }
+
+    // compound stem
+    if ((dicstem == 2) && rv->astr) {
+        strcpy(buf, word);
+        buf[cpdindex] = '\0';
+        strcat(buf, prefix);
+        if (pAMgr->get_derived()) {
+            strcat(buf, pAMgr->get_derived());
+        } else {
+            // special stem in affix description
+            fixstems_append_stem(buf, sizeof(buf), rv->description, rv->word,
+                                 pAMgr->get_wordchars());
+        }
+        if (ns < maxSug) {
+            wlst[ns] = mystrdup(buf);
+            if (wlst[ns] == NULL) return -1;
+            ns++;
+        }
+    }
+
+    return ns;
+}
+
+// suggest possible stems
+//
+// Lets the affix manager's suffix check fill the suggestion list with
+// candidate stems for `w`, then strips a trailing dash from each entry.
+//   slst - in/out suggestion list; allocated here (maxSug slots) when *slst
+//          is NULL
+//   w    - word to analyse (reversed internally when complexprefixes is set)
+//   nsug - number of suggestions already present
+// Returns the new suggestion count, -1 when the list cannot be allocated, or
+// nsug unchanged when no affix manager is available.
+int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug)
+{
+    char ** wlst;
+
+    char w2[MAXSWUTF8L];
+    const char * word = w;
+
+    // nothing can be analysed without an affix manager
+    if (!pAMgr) return nsug;
+
+    // word reversing wrapper for complex prefixes
+    if (complexprefixes) {
+        strcpy(w2, w);
+        if (utf8) reverseword_utf(w2); else reverseword(w2);
+        word = w2;
+    }
+
+    int wl = strlen(word);
+
+    if (*slst) {
+        wlst = *slst;
+    } else {
+        wlst = (char **) calloc(maxSug, sizeof(char *));
+        if (wlst == NULL) return -1;
+    }
+
+    // only the side effect on wlst / nsug is needed, not the returned entry
+    pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug);
+
+    // delete dash from end of word
+    for (int j = 0; j < nsug; j++) {
+        if (wlst[j] == NULL) continue;
+        size_t l = strlen(wlst[j]);
+        // an empty entry has no last character: strlen() - 1 would wrap around
+        if ((l > 0) && (wlst[j][l - 1] == '-')) wlst[j][l - 1] = '\0';
+    }
+
+    *slst = wlst;
+    return nsug;
+}
+
+
+// Build the morphological description of `w`.
+// Gathers one line per usable homonym found in the dictionary, appends the
+// affix analysis, and falls back to the compound analysis when nothing was
+// collected. Returns a mystrdup()'d copy of the collected text after
+// delete_zeros() and line_uniq(), or NULL when there is no affix manager or
+// nothing was collected.
+char * SuggestMgr::suggest_morph(const char * w)
+{
+    char result[MAXLNLEN];
+    char * r = (char *) result;
+
+    *result = '\0';
+
+    if (! pAMgr) return NULL;
+
+    char w2[MAXSWUTF8L];
+    const char * word = w;
+
+    // word reversing wrapper for complex prefixes
+    if (complexprefixes) {
+        strcpy(w2, w);
+        if (utf8) reverseword_utf(w2); else reverseword(w2);
+        word = w2;
+    }
+
+    // walk the homonym chain of the dictionary entry
+    for (struct hentry * he = pAMgr->lookup(word); he; he = he->next_homonym) {
+        // skip forbidden, pseudoroot and onlyincompound entries
+        if (he->astr &&
+            (TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen) ||
+             TESTAFF(he->astr, pAMgr->get_pseudoroot(), he->alen) ||
+             TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen))) continue;
+
+        if (he->description) {
+            // the word is written first unless the entry is flagged lemma_present
+            if ((!he->astr) ||
+                !TESTAFF(he->astr, pAMgr->get_lemma_present(), he->alen)) {
+                strcat(result, word);
+            }
+            strcat(result, he->description);
+        }
+        strcat(result, "\n");
+    }
+
+    char * affixmorph = pAMgr->affix_check_morph(word, strlen(word));
+    if (affixmorph) {
+        strcat(result, affixmorph);
+        free(affixmorph);
+    }
+
+    // compound analysis writes through r, i.e. into result
+    if (pAMgr->get_compound() && (*result == '\0'))
+        pAMgr->compound_check_morph(word, strlen(word),
+                0, 0, 100, 0, NULL, 0, &r, NULL);
+
+    if (*result == '\0') return NULL;
+    return mystrdup(line_uniq(delete_zeros(result)));
+}
+
+// Morphological description of the first spelling suggestion for `word`.
+// The first maxSug - 1 slots of a scratch list are pre-filled with
+// placeholders, so suggest() has room for a single real suggestion in the
+// last slot. Returns the result of suggest_morph() for that suggestion, or
+// NULL when none was produced or the scratch list could not be allocated.
+char * SuggestMgr::suggest_morph_for_spelling_error(const char * word)
+{
+    char * p = NULL;
+    char ** wlst = (char **) calloc(maxSug, sizeof(char *));
+    // the list is written to right away, so it must be checked first
+    if (wlst == NULL) return NULL;
+    // we will use only the first suggestion
+    // (placeholders are string literals: they are never freed here)
+    for (int i = 0; i < maxSug - 1; i++) wlst[i] = (char *) "";
+    int ns = suggest(&wlst, word, maxSug - 1);
+    if (ns == maxSug) {
+        p = suggest_morph(wlst[maxSug - 1]);
+        free(wlst[maxSug - 1]);
+    }
+    if (wlst) free(wlst);
+    return p;
+}
// generate an n-gram score comparing s1 and s2
+//
+// For each length j = 1..n, counts the start positions in s1 whose j-long
+// substring also occurs in s2, and adds the counts up. Counting stops after
+// the first length that yields fewer than two matches.
+//   n      - longest n-gram length to try
+//   s1     - first word; the 8-bit branch temporarily writes a NUL into it and
+//            restores the character, which is why it is not const
+//   s2     - second word (the comments below call it the dictionary word);
+//            compared through a lowercased copy
+//   uselen - NGRAM_LONGER_WORSE / NGRAM_ANY_MISMATCH select a length penalty,
+//            any other value applies none
+// Returns the match count minus the (positive part of the) length penalty,
+// or 0 when s2 is empty.
int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)
{
int nscore = 0;
- int l1 = strlen(s1);
- int l2 = strlen(s2);
int ns;
- for (int j=1;j<=n;j++) {
- ns = 0;
- for (int i=0;i<=(l1-j);i++) {
- char c = *(s1 + i + j);
- *(s1 + i + j) = '\0';
- if (strstr(s2,(s1+i))) ns++;
- *(s1 + i + j ) = c;
- }
- nscore = nscore + ns;
- if (ns < 2) break;
+ int l1;
+ int l2;
+
+ // UTF-8 branch: both words are converted to w_char arrays and compared
+ // character by character
+ if (utf8) {
+ w_char su1[MAXSWL];
+ w_char su2[MAXSWL];
+ l1 = u8_u16(su1, MAXSWL, s1);
+ l2 = u8_u16(su2, MAXSWL, s2);
+ if (!l2) return 0;
+ // decapitalize dictionary word
+ // (only one character: the last one with complexprefixes, else the first)
+ if (complexprefixes) {
+ mkallsmall_utf(su2+l2-1, 1, utfconv);
+ } else {
+ mkallsmall_utf(su2, 1, utfconv);
+ }
+ for (int j = 1; j <= n; j++) {
+ ns = 0;
+ for (int i = 0; i <= (l1-j); i++) {
+ for (int l = 0; l <= (l2-j); l++) {
+ int k;
+ for (k = 0; (k < j); k++) {
+ w_char * c1 = su1 + i + k;
+ w_char * c2 = su2 + l + k;
+ if ((c1->l != c2->l) || (c1->h != c2->h)) break;
+ }
+ // all j characters matched: count this start position once
+ if (k == j) {
+ ns++;
+ break;
+ }
+ }
+ }
+ nscore = nscore + ns;
+ if (ns < 2) break;
+ }
+ } else {
+ // 8-bit branch: s2 is copied into t so it can be lowercased
+ char t[MAXSWUTF8L];
+ l1 = strlen(s1);
+ l2 = strlen(s2);
+ if (!l2) return 0;
+ strcpy(t, s2);
+ // with complexprefixes only the last character is lowercased; otherwise
+ // the whole copy is (the first-character variant is commented out below)
+ if (complexprefixes) {
+ *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower;
+ } else {
+ mkallsmall(t, csconv);
+/// *t = csconv[((unsigned char)*t)].clower;
+ }
+ for (int j = 1; j <= n; j++) {
+ ns = 0;
+ for (int i = 0; i <= (l1-j); i++) {
+ // cut s1 after j characters, search the piece in t, then restore s1
+ char c = *(s1 + i + j);
+ *(s1 + i + j) = '\0';
+ if (strstr(t,(s1+i))) ns++;
+ *(s1 + i + j ) = c;
+ }
+ nscore = nscore + ns;
+ if (ns < 2) break;
+ }
}
+
+ // length penalty: differences of up to two characters are free
ns = 0;
if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
return (nscore - ((ns > 0) ? ns : 0));
}
+// Return 1 when s1 and s2 start with the same character, 0 otherwise.
+// With complexprefixes the words are handled reversed, so their last
+// characters are compared instead. In UTF-8 mode the comparison is done on
+// converted 16-bit characters.
+int SuggestMgr::equalfirstletter(char * s1, const char * s2) {
+    if (utf8) {
+        w_char su1[MAXSWL];
+        w_char su2[MAXSWL];
+        if (complexprefixes) {
+            int l1 = u8_u16(su1, MAXSWL, s1);
+            int l2 = u8_u16(su2, MAXSWL, s2);
+            // an empty word has no last character to look at
+            if ((l1 > 0) && (l2 > 0) &&
+                (*((short *)su1+l1-1) == *((short *)su2+l2-1))) return 1;
+        } else {
+            // nothing gets converted for an empty word, so settle that case
+            // on the raw bytes instead of comparing unset buffers
+            if ((*s1 == '\0') || (*s2 == '\0')) return (*s1 == *s2) ? 1 : 0;
+            u8_u16(su1, 1, s1);
+            u8_u16(su2, 1, s2);
+            if (*((short *)su1) == *((short *)su2)) return 1;
+        }
+    } else {
+        if (complexprefixes) {
+            int l1 = strlen(s1);
+            int l2 = strlen(s2);
+            // compare the last byte of s1 (not of s2) with the last byte of s2
+            if ((l1 > 0) && (l2 > 0) && (*(s1+l1-1) == *(s2+l2-1))) return 1;
+        } else {
+            if (*s1 == *s2) return 1;
+        }
+    }
+    return 0;
+}
+
+// Count the positions at which s1 and s2 hold the same character, looking
+// only at the part both words share. *is_swap is set to 1 when the words have
+// equal length and differ in exactly two positions whose characters are
+// exchanged, and to 0 otherwise.
+int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_swap) {
+    int matches = 0;
+    int mismatches = 0;
+    int where[2]; // positions of the first two mismatches
+    *is_swap = 0;
+
+    if (!utf8) {
+        int pos = 0;
+        while ((s1[pos] != 0) && (s2[pos] != 0)) {
+            if (s1[pos] != s2[pos]) {
+                if (mismatches < 2) where[mismatches] = pos;
+                mismatches++;
+            } else {
+                matches++;
+            }
+            pos++;
+        }
+        // both words must end here, i.e. have the same length
+        if ((mismatches == 2) && (s1[pos] == 0) && (s2[pos] == 0) &&
+            (s1[where[0]] == s2[where[1]]) &&
+            (s1[where[1]] == s2[where[0]])) *is_swap = 1;
+        return matches;
+    }
+
+    w_char wide1[MAXSWL];
+    w_char wide2[MAXSWL];
+    int n1 = u8_u16(wide1, MAXSWL, s1);
+    int n2 = u8_u16(wide2, MAXSWL, s2);
+    short * a = (short *) wide1;
+    short * b = (short *) wide2;
+    int shared = (n1 < n2) ? n1 : n2;
+    for (int pos = 0; pos < shared; pos++) {
+        if (a[pos] != b[pos]) {
+            if (mismatches < 2) where[mismatches] = pos;
+            mismatches++;
+        } else {
+            matches++;
+        }
+    }
+    if ((mismatches == 2) && (n1 == n2) &&
+        (a[where[0]] == b[where[1]]) &&
+        (a[where[1]] == b[where[0]])) *is_swap = 1;
+    return matches;
+}
+
+// Length of `word`: the value returned by u8_u16() in UTF-8 mode, the byte
+// count from strlen() otherwise.
+int SuggestMgr::mystrlen(const char * word) {
+    if (!utf8) return strlen(word);
+
+    w_char wide[MAXSWL];
+    return u8_u16(wide, MAXSWL, word);
+}
// sort in decreasing order of score
void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
@@ -544,3 +1593,66 @@ void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
return;
}
+// longest common subsequence
+//
+// Builds the direction table of the LCS dynamic programme for s and s2.
+//   s, s2  - words to compare (converted to 16-bit characters in UTF-8 mode)
+//   l1, l2 - receive the lengths m and n used for the table
+//   result - receives a malloc()'d (m+1) x (n+1) table of LCS_UP / LCS_LEFT /
+//            LCS_UPLEFT, row-major with stride n+1; the caller frees it.
+//            Row 0 and column 0 are not filled in.
+// On allocation failure *result is NULL and both lengths are reported as 0,
+// so a backtracking loop over the table does not run.
+// The length table is kept in two int rows: a char cell would overflow for
+// subsequences longer than 127 characters.
+void SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char ** result) {
+    int n, m;
+    w_char su[MAXSWL];
+    w_char su2[MAXSWL];
+    int i;
+    int j;
+
+    if (utf8) {
+        m = u8_u16(su, MAXSWL, s);
+        n = u8_u16(su2, MAXSWL, s2);
+    } else {
+        m = strlen(s);
+        n = strlen(s2);
+    }
+    // guard the table size against a negative length
+    if (m < 0) m = 0;
+    if (n < 0) n = 0;
+
+    char * b = (char *) malloc((size_t) (m + 1) * (size_t) (n + 1));
+    int * prev = (int *) calloc((size_t) n + 1, sizeof(int)); // row i-1 of the length table
+    int * cur = (int *) calloc((size_t) n + 1, sizeof(int));  // row i of the length table
+    if ((b == NULL) || (prev == NULL) || (cur == NULL)) {
+        free(b);
+        free(prev);
+        free(cur);
+        *result = NULL;
+        *l1 = 0;
+        *l2 = 0;
+        return;
+    }
+
+    for (i = 1; i <= m; i++) {
+        cur[0] = 0;
+        for (j = 1; j <= n; j++) {
+            int same = utf8 ? (*((short *) su+i-1) == *((short *) su2+j-1))
+                            : ((*(s+i-1)) == (*(s2+j-1)));
+            if (same) {
+                cur[j] = prev[j-1] + 1;
+                b[i*(n+1) + j] = LCS_UPLEFT;
+            } else if (prev[j] >= cur[j-1]) {
+                cur[j] = prev[j];
+                b[i*(n+1) + j] = LCS_UP;
+            } else {
+                cur[j] = cur[j-1];
+                b[i*(n+1) + j] = LCS_LEFT;
+            }
+        }
+        // the finished row becomes the previous row of the next iteration
+        int * tmp = prev;
+        prev = cur;
+        cur = tmp;
+    }
+
+    free(prev);
+    free(cur);
+    *result = b;
+    *l1 = m;
+    *l2 = n;
+}
+
+// Length of the longest common subsequence of s and s2.
+// Walks the direction table produced by lcs() back from its last cell and
+// counts the diagonal (LCS_UPLEFT) steps; the table is freed afterwards.
+int SuggestMgr::lcslen(const char * s, const char* s2) {
+    int rows;
+    int cols;
+    char * dir;
+    int common = 0;
+
+    lcs(s, s2, &rows, &cols, &dir);
+
+    int r = rows;
+    int c = cols;
+    while ((r != 0) && (c != 0)) {
+        switch (dir[r*(cols+1) + c]) {
+            case LCS_UPLEFT:
+                common++;
+                r--;
+                c--;
+                break;
+            case LCS_UP:
+                r--;
+                break;
+            default:
+                c--;
+                break;
+        }
+    }
+
+    if (dir) free(dir);
+    return common;
+}
diff --git a/src/myspell/suggestmgr.hxx b/src/myspell/suggestmgr.hxx
index 7c5a6e2..5bc64bb 100644
--- a/src/myspell/suggestmgr.hxx
+++ b/src/myspell/suggestmgr.hxx
@@ -2,46 +2,85 @@
#define _SUGGESTMGR_HXX_
#define MAXSWL 100
-#define MAX_ROOTS 10
-#define MAX_WORDS 500
-#define MAX_GUESS 10
+#define MAXSWUTF8L (MAXSWL * 4)
+#define MAX_ROOTS 50
+#define MAX_WORDS 200
+#define MAX_GUESS 200
+#define MAXNGRAMSUGS 5
+
+#define MINTIMER 500
+#define MAXPLUSTIMER 500
#define NGRAM_IGNORE_LENGTH 0
#define NGRAM_LONGER_WORSE 1
#define NGRAM_ANY_MISMATCH 2
-
#include "atypes.hxx"
#include "affixmgr.hxx"
#include "hashmgr.hxx"
+#include "langnum.hxx"
+#include <time.h>
+
+enum { LCS_UP, LCS_LEFT, LCS_UPLEFT };
+// Suggestion generator: proposes replacement words, stems and morphological
+// descriptions for a word, using an affix manager for dictionary lookups and
+// affix / compound checks.
class SuggestMgr
{
+ // NOTE(review): presumably the character set given as `tryme` to the
+ // constructor, its length and its 16-bit form - confirm in the constructor
char * ctry;
int ctryl;
+ w_char * ctry_utf;
+
+ // affix manager used for lookup(), affix and compound checks; may be NULL
AffixMgr* pAMgr;
+ // capacity of the suggestion lists handled by this class
int maxSug;
- bool nosplitsugs;
+ // 8-bit case conversion table (clower / cupper per character)
+ struct cs_info * csconv;
+ // conversion data handed to mkallsmall_utf()
+ struct unicode_info2 * utfconv;
+ // non-zero: words are UTF-8 and handled as 16-bit characters
+ int utf8;
+ int nosplitsugs;
+ // upper bound on the number of n-gram based suggestions
+ int maxngramsugs;
+ // non-zero: words are reversed before they are analysed
+ int complexprefixes;
+
public:
SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr);
~SuggestMgr();
- int suggest(char** wlst, int ns, const char * word);
- int check(const char *, int);
+ // the suggest* methods take the list by address (*slst may be NULL, in
+ // which case the stem variants allocate it) and the current entry count
+ int suggest(char*** slst, const char * word, int nsug);
int ngsuggest(char ** wlst, char * word, HashMgr* pHMgr);
+ int suggest_auto(char*** slst, const char * word, int nsug);
+ int suggest_stems(char*** slst, const char * word, int nsug);
+ int suggest_pos_stems(char*** slst, const char * word, int nsug);
+
+ // both return a mystrdup()'d string or NULL
+ char * suggest_morph(const char * word);
+ char * suggest_morph_for_spelling_error(const char * word);
private:
- int replchars(char**, const char *, int);
- int mapchars(char**, const char *, int);
- int map_related(const char *, int, char ** wlst, int, const mapentry*, int);
- int forgotchar(char **, const char *, int);
- int swapchar(char **, const char *, int);
- int extrachar(char **, const char *, int);
- int badchar(char **, const char *, int);
- int twowords(char **, const char *, int);
+ // check(word, len, cpdsuggest, timer, timelimit): 0 = rejected,
+ // 1..3 = accepted (2 and 3 are the obsolete compound categories)
+ int check(const char *, int, int, int *, time_t *);
+ // 1 when the word resolves to an entry with the forbidden-word flag
+ int check_forbidden(const char *, int);
+
+ int replchars(char**, const char *, int, int);
+ int doubledsyllable(char**, const char *, int, int);
+ int forgotchar(char **, const char *, int, int);
+ int swapchar(char **, const char *, int, int);
+ int extrachar(char **, const char *, int, int);
+ int badchar(char **, const char *, int, int);
+ int twowords(char **, const char *, int, int);
+ int fixstems(char **, const char *, int);
+
+ // variants of the generators above that take 16-bit character words
+ int forgotchar_utf(char**, const w_char *, int wl, int, int);
+ int extrachar_utf(char**, const w_char *, int wl, int, int);
+ int badchar_utf(char **, const w_char *, int wl, int, int);
+ int swapchar_utf(char **, const w_char *, int wl, int, int);
+
+ int mapchars(char**, const char *, int, int);
+ int map_related(const char *, int, char ** wlst, int, const mapentry*, int, int *, time_t *);
+ int map_related_utf(w_char *, int, int, char ** wlst, int, const mapentry*, int, int *, time_t *);
+ // scoring helpers for the n-gram based suggestions
int ngram(int n, char * s1, const char * s2, int uselen);
+ int mystrlen(const char * word);
+ int equalfirstletter(char * s1, const char * s2);
+ int commoncharacterpositions(char * s1, const char * s2, int * is_swap);
void bubblesort( char ** rwd, int * rsc, int n);
+ // lcs() builds a direction table (see the LCS_* enum); lcslen() returns
+ // the length of the longest common subsequence
+ void lcs(const char * s, const char * s2, int * l1, int * l2, char ** result);
+ int lcslen(const char * s, const char* s2);
+
};
#endif