diff options
author | Dom Lachowicz <domlachowicz@gmail.com> | 2006-01-14 02:18:48 +0000 |
---|---|---|
committer | Dom Lachowicz <domlachowicz@gmail.com> | 2006-01-14 02:18:48 +0000 |
commit | 7f5d852c3116af74620e630a776b6a8e03f8e5c9 (patch) | |
tree | 08856630ce7f546ecafe18d7d68cbc41f13113ef | |
parent | 48a8a34b95d427464cc9ca8af9fbf2900f1dcf30 (diff) | |
download | enchant-7f5d852c3116af74620e630a776b6a8e03f8e5c9.tar.gz |
Build against hunspell (http://hunspell.sf.net/) instead of myspell.
Hunspell will be replacing myspell in a future version of OpenOffice.org.
It is compatible with myspell's dictionaries and offers a lot of improvements
for non-Western languages.
We can no longer build against a system version of myspell. We will always
build against our own copy of hunspell unless told otherwise.
This is bug 9820.
git-svn-id: svn+ssh://svn.abisource.com/svnroot/enchant/trunk@21089 bcba8976-2d24-0410-9c9c-aab3bd5fdfd6
-rw-r--r-- | configure.in | 9 | ||||
-rw-r--r-- | src/myspell/Makefile.am | 44 | ||||
-rw-r--r-- | src/myspell/affentry.cxx | 638 | ||||
-rw-r--r-- | src/myspell/affentry.hxx | 60 | ||||
-rw-r--r-- | src/myspell/affixmgr.cxx | 3084 | ||||
-rw-r--r-- | src/myspell/affixmgr.hxx | 178 | ||||
-rw-r--r-- | src/myspell/atypes.hxx | 62 | ||||
-rw-r--r-- | src/myspell/baseaffix.hxx | 26 | ||||
-rw-r--r-- | src/myspell/csutil.cxx | 1583 | ||||
-rw-r--r-- | src/myspell/csutil.hxx | 79 | ||||
-rw-r--r-- | src/myspell/enchant_myspell.hxx | 42 | ||||
-rw-r--r-- | src/myspell/hashmgr.cxx | 544 | ||||
-rw-r--r-- | src/myspell/hashmgr.hxx | 32 | ||||
-rw-r--r-- | src/myspell/htypes.hxx | 11 | ||||
-rw-r--r-- | src/myspell/hunspell.cxx | 1616 | ||||
-rw-r--r-- | src/myspell/hunspell.dsp | 164 | ||||
-rw-r--r-- | src/myspell/hunspell.hxx | 142 | ||||
-rw-r--r-- | src/myspell/myspell.cxx | 302 | ||||
-rw-r--r-- | src/myspell/myspell_checker.cpp | 11 | ||||
-rw-r--r-- | src/myspell/suggestmgr.cxx | 1370 | ||||
-rw-r--r-- | src/myspell/suggestmgr.hxx | 69 |
21 files changed, 8926 insertions, 1140 deletions
diff --git a/configure.in b/configure.in index 1cca3b8..f106836 100644 --- a/configure.in +++ b/configure.in @@ -123,12 +123,6 @@ if test "x$with_myspell_dir" != "x" ; then myspell_dir=$with_myspell_dir fi -with_system_myspell=no -if test "x$build_myspell" != "xno"; then - PKG_CHECK_MODULES(MYSPELL, myspell, with_system_myspell=yes, with_system_myspell=no) -fi -AM_CONDITIONAL(WITH_SYSTEM_MYSPELL, test "x$with_system_myspell" = "xyes") - MYSPELL_CFLAGS="$MYSPELL_CFLAGS -DENCHANT_MYSPELL_DICT_DIR='\"$myspell_dir\"'" if test "x$with_system_myspell" != "xno"; then MYSPELL_CFLAGS="$MYSPELL_CFLAGS -DWITH_SYSTEM_MYSPELL=1" @@ -273,7 +267,6 @@ $PACKAGE-$VERSION Build Ispell backend: ${build_ispell} Build Uspell backend: ${build_uspell} Build Hspell backend: ${build_hspell} - Build Myspell backend: ${build_myspell} - Build against system Myspell: ${with_system_myspell} + Build Myspell/Hunspell backend: ${build_myspell} Build with Binreloc $br_cv_binreloc " diff --git a/src/myspell/Makefile.am b/src/myspell/Makefile.am index 1f84195..7a57c3d 100644 --- a/src/myspell/Makefile.am +++ b/src/myspell/Makefile.am @@ -13,43 +13,33 @@ libenchant_myspell_lalibdir=$(libdir)/enchant libenchant_myspell_la_LIBADD= $(MYSPELL_LIBS) $(ENCHANT_LIBS) $(top_builddir)/src/libenchant.la libenchant_myspell_la_LDFLAGS = -version-info $(VERSION_INFO) -no-undefined -if WITH_SYSTEM_MYSPELL libenchant_myspell_la_SOURCES = \ - myspell_checker.cpp -else -libenchant_myspell_la_SOURCES = \ - affentry.cxx \ - affentry.hxx \ - affixmgr.cxx \ - affixmgr.hxx \ - atypes.hxx \ - baseaffix.hxx \ - csutil.cxx \ - csutil.hxx \ - hashmgr.cxx \ - hashmgr.hxx \ - htypes.hxx \ - myspell.cxx \ - enchant_myspell.hxx \ - suggestmgr.cxx \ - suggestmgr.hxx \ - myspell_checker.cpp -endif - -EXTRA_DIST= \ - license.readme \ affentry.hxx \ affixmgr.hxx \ atypes.hxx \ baseaffix.hxx \ csutil.hxx \ + dictmgr.hxx \ hashmgr.hxx \ htypes.hxx \ - enchant_myspell.hxx \ + hunspell.hxx \ + langnum.hxx \ suggestmgr.hxx \ affentry.cxx \ 
affixmgr.cxx \ csutil.cxx \ + dictmgr.cxx \ hashmgr.cxx \ - myspell.cxx \ - suggestmgr.cxx + hunspell.cxx \ + suggestmgr.cxx \ + myspell_checker.cpp + +EXTRA_DIST= \ + license.readme \ + utf_info.cxx \ + README \ + license.hunspell \ + license.myspell \ + license.readme \ + hunspell.dsp + diff --git a/src/myspell/affentry.cxx b/src/myspell/affentry.cxx index 603616d..014e925 100644 --- a/src/myspell/affentry.cxx +++ b/src/myspell/affentry.cxx @@ -1,5 +1,5 @@ -#include "license.readme" - +#include "license.hunspell" +#include "license.myspell" #include <cctype> #include <cstring> @@ -7,13 +7,12 @@ #include <cstdio> #include "affentry.hxx" +#include "csutil.hxx" -#ifndef WINDOWS +#ifndef W32 using namespace std; #endif -extern char * mystrdup(const char * s); -extern char * myrevstrdup(const char * s); PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) { @@ -21,73 +20,216 @@ PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) pmyMgr = pmgr; // set up its intial values - achar = dp->achar; // char flag + + aflag = dp->aflag; // flag strip = dp->strip; // string to strip appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string appndl = dp->appndl; // length of append string numconds = dp->numconds; // number of conditions to match - xpflg = dp->xpflg; // cross product flag + opts = dp->opts; // cross product flag // then copy over all of the conditions - memcpy(&conds[0],&dp->conds[0],SETSIZE*sizeof(conds[0])); + memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0])); next = NULL; nextne = NULL; nexteq = NULL; + morphcode = dp->morphcode; + contclass = dp->contclass; + contclasslen = dp->contclasslen; } PfxEntry::~PfxEntry() { - achar = '\0'; + aflag = 0; if (appnd) free(appnd); - if (strip)free(strip); + if (strip) free(strip); pmyMgr = NULL; appnd = NULL; - strip = NULL; + strip = NULL; + if (opts & aeUTF8) { + for (int i = 0; i < 8; i++) { + if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]); + } + } + if (morphcode && 
!(opts & aeALIASM)) free(morphcode); + if (contclass && !(opts & aeALIASF)) free(contclass); } - - // add prefix to this word assuming conditions hold char * PfxEntry::add(const char * word, int len) { - int cond; - char tword[MAXWORDLEN+1]; + char tword[MAXWORDUTF8LEN + 4]; - /* make sure all conditions match */ - if ((len > stripl) && (len >= numconds)) { - unsigned char * cp = (unsigned char *) word; - for (cond = 0; cond < numconds; cond++) { - if ((conds[*cp++] & (1 << cond)) == 0) - break; - } - if (cond >= numconds) { - /* we have a match so add prefix */ - int tlen = 0; + if ((len > stripl) && (len >= numconds) && test_condition(word) && + (!stripl || (strncmp(word, strip, stripl) == 0)) && + ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { + /* we have a match so add prefix */ + char * pp = tword; if (appndl) { - strcpy(tword,appnd); - tlen += appndl; - } - char * pp = tword + tlen; + strcpy(tword,appnd); + pp += appndl; + } strcpy(pp, (word + stripl)); return mystrdup(tword); - } } return NULL; } +inline int PfxEntry::test_condition(const char * st) +{ + int cond; + unsigned char * cp = (unsigned char *)st; + if (!(opts & aeUTF8)) { // 256-character codepage + for (cond = 0; cond < numconds; cond++) { + if ((conds.base[*cp++] & (1 << cond)) == 0) return 0; + } + } else { // UTF-8 encoding + unsigned short wc; + for (cond = 0; cond < numconds; cond++) { + // a simple 7-bit ASCII character in UTF-8 + if ((*cp >> 7) == 0) { + // also check limit (end of word) + if ((!*cp) || ((conds.utf8.ascii[*cp++] & (1 << cond)) == 0)) return 0; + // UTF-8 multibyte character + } else { + // not dot wildcard in rule + if (!conds.utf8.all[cond]) { + if (conds.utf8.neg[cond]) { + u8_u16((w_char *) &wc, 1, (char *) cp); + if (conds.utf8.wchars[cond] && + flag_bsearch((unsigned short *)conds.utf8.wchars[cond], + wc, (short) conds.utf8.wlen[cond])) return 0; + } else { + if (!conds.utf8.wchars[cond]) return 0; + u8_u16((w_char *) &wc, 1, (char *) cp); + if 
(!flag_bsearch((unsigned short *)conds.utf8.wchars[cond], + wc, (short)conds.utf8.wlen[cond])) return 0; + } + } + // jump to next UTF-8 character + for(cp++; (*cp & 0xc0) == 0x80; cp++); + } + } + } + return 1; +} // check if this prefix entry matches -struct hentry * PfxEntry::check(const char * word, int len) +struct hentry * PfxEntry::check(const char * word, int len, char in_compound, const FLAG needflag) { - int cond; // condition number being examined int tmpl; // length of tmpword struct hentry * he; // hash entry of root word or NULL - unsigned char * cp; - char tmpword[MAXWORDLEN+1]; + char tmpword[MAXWORDUTF8LEN + 4]; + + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + if (stripl) strcpy (tmpword, strip); + strcpy ((tmpword + stripl), (word + appndl)); + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword)) { + tmpl += stripl; + if ((he = pmyMgr->lookup(tmpword)) != NULL) { + do { + if (TESTAFF(he->astr, aflag, he->alen) && + // forbid single prefixes with pseudoroot flag + ! 
TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) && + // needflag + ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || + (contclass && TESTAFF(contclass, needflag, contclasslen)))) + return he; + } while ((he = he->next_homonym)); // check homonyms + } + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // ross checked combined with a suffix + + //if ((opts & aeXPRODUCT) && in_compound) { + if ((opts & aeXPRODUCT)) { + he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, NULL, + 0, NULL, FLAG_NULL, needflag, in_compound); + if (he) return he; + } + } + } + return NULL; +} + +// check if this prefix entry matches +struct hentry * PfxEntry::check_twosfx(const char * word, int len, + char in_compound, const FLAG needflag) +{ + int tmpl; // length of tmpword + struct hentry * he; // hash entry of root word or NULL + char tmpword[MAXWORDUTF8LEN + 4]; + + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + if (stripl) strcpy (tmpword, strip); + strcpy ((tmpword + stripl), (word + appndl)); + + // now make sure all of the conditions on characters + // are met. 
Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword)) { + tmpl += stripl; + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // cross checked combined with a suffix + + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { + he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, needflag); + if (he) return he; + } + } + } + return NULL; +} + + +// check if this prefix entry matches +char * PfxEntry::check_twosfx_morph(const char * word, int len, + char in_compound, const FLAG needflag) +{ + int tmpl; // length of tmpword + char tmpword[MAXWORDUTF8LEN + 4]; // on entry prefix is 0 length or already matches the beginning of the word. // So if the remaining root word has positive length @@ -109,117 +251,317 @@ struct hentry * PfxEntry::check(const char * word, int len) // this file for more info on exactly what is being // tested - cp = (unsigned char *)tmpword; - for (cond = 0; cond < numconds; cond++) { - if ((conds[*cp++] & (1 << cond)) == 0) break; + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition(tmpword)) { + tmpl += stripl; + + // prefix matched but no root word was found + // if aeXPRODUCT is allowed, try again but now + // ross checked combined with a suffix + + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { + return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl, + aeXPRODUCT, (AffEntry *)this, needflag); + } } + } + return NULL; +} + +// check if this prefix entry matches +char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag) +{ + int tmpl; // length of tmpword + struct hentry * he; // hash entry of root word or NULL + char tmpword[MAXWORDUTF8LEN + 4]; + char result[MAXLNLEN]; + char * st; + + 
*result = '\0'; + + // on entry prefix is 0 length or already matches the beginning of the word. + // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + + // generate new root word by removing prefix and adding + // back any characters that would have been stripped + + if (stripl) strcpy (tmpword, strip); + strcpy ((tmpword + stripl), (word + appndl)); + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested // if all conditions are met then check if resulting // root word in the dictionary - if (cond >= numconds) { + if (test_condition(tmpword)) { tmpl += stripl; if ((he = pmyMgr->lookup(tmpword)) != NULL) { - if (TESTAFF(he->astr, achar, he->alen)) return he; + do { + if (TESTAFF(he->astr, aflag, he->alen) && + // forbid single prefixes with pseudoroot flag + ! 
TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) && + // needflag + ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || + (contclass && TESTAFF(contclass, needflag, contclasslen)))) { + if (morphcode) strcat(result, morphcode); else strcat(result,getKey()); + if (he->description) { + if ((*(he->description)=='[')||(*(he->description)=='<')) strcat(result,he->word); + strcat(result,he->description); + } + strcat(result, "\n"); + } + } while ((he = he->next_homonym)); } // prefix matched but no root word was found - // if XPRODUCT is allowed, try again but now + // if aeXPRODUCT is allowed, try again but now // ross checked combined with a suffix - if (xpflg & XPRODUCT) { - he = pmyMgr->suffix_check(tmpword, tmpl, XPRODUCT, (AffEntry *)this); - if (he) return he; + if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) { + st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, (AffEntry *)this, + FLAG_NULL, needflag); + if (st) { + strcat(result, st); + free(st); + } } } } + + if (*result) return mystrdup(result); return NULL; } - SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) { // register affix manager pmyMgr = pmgr; // set up its intial values - achar = dp->achar; // char flag + aflag = dp->aflag; // char flag strip = dp->strip; // string to strip appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string appndl = dp->appndl; // length of append string numconds = dp->numconds; // number of conditions to match - xpflg = dp->xpflg; // cross product flag + opts = dp->opts; // cross product flag // then copy over all of the conditions - memcpy(&conds[0],&dp->conds[0],SETSIZE*sizeof(conds[0])); + memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0])); rappnd = myrevstrdup(appnd); + + morphcode = dp->morphcode; + contclass = dp->contclass; + contclasslen = dp->contclasslen; } SfxEntry::~SfxEntry() { - achar = '\0'; + aflag = 0; if (appnd) free(appnd); if (rappnd) free(rappnd); if (strip) free(strip); 
pmyMgr = NULL; appnd = NULL; strip = NULL; + if (opts & aeUTF8) { + for (int i = 0; i < 8; i++) { + if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]); + } + } + if (morphcode && !(opts & aeALIASM)) free(morphcode); + if (contclass && !(opts & aeALIASF)) free(contclass); } - - // add suffix to this word assuming conditions hold char * SfxEntry::add(const char * word, int len) { - int cond; - char tword[MAXWORDLEN+1]; + char tword[MAXWORDUTF8LEN + 4]; /* make sure all conditions match */ - if ((len > stripl) && (len >= numconds)) { - unsigned char * cp = (unsigned char *) (word + len); - for (cond = numconds; --cond >=0; ) { - if ((conds[*--cp] & (1 << cond)) == 0) - break; - } - if (cond < 0) { + if ((len > stripl) && (len >= numconds) && test_condition(word + len, word) && + (!stripl || (strcmp(word + len - stripl, strip) == 0)) && + ((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) { /* we have a match so add suffix */ strcpy(tword,word); - int tlen = len; - if (stripl) { - tlen -= stripl; - } - char * pp = (tword + tlen); if (appndl) { - strcpy(pp,appnd); - tlen += appndl; - } else *pp = '\0'; - return mystrdup(tword); - } + strcpy(tword + len - stripl, appnd); + } else { + *(tword + len - stripl) = '\0'; + } + return mystrdup(tword); } return NULL; } +inline int SfxEntry::test_condition(const char * st, const char * beg) +{ + int cond; + unsigned char * cp = (unsigned char *) st; + if (!(opts & aeUTF8)) { // 256-character codepage + // Dömölki affix algorithm + for (cond = numconds; --cond >= 0; ) { + if ((conds.base[*--cp] & (1 << cond)) == 0) return 0; + } + } else { // UTF-8 encoding + unsigned short wc; + for (cond = numconds; --cond >= 0; ) { + // go to next character position and check limit + if ((char *) --cp < beg) return 0; + // a simple 7-bit ASCII character in UTF-8 + if ((*cp >> 7) == 0) { + if ((conds.utf8.ascii[*cp] & (1 << cond)) == 0) return 0; + // UTF-8 multibyte character + } else { + // go to first character of UTF-8 multibyte character + 
for (; (*cp & 0xc0) == 0x80; cp--); + // not dot wildcard in rule + if (!conds.utf8.all[cond]) { + if (conds.utf8.neg[cond]) { + u8_u16((w_char *) &wc, 1, (char *) cp); + if (conds.utf8.wchars[cond] && + flag_bsearch((unsigned short *)conds.utf8.wchars[cond], + wc, (short) conds.utf8.wlen[cond])) return 0; + } else { + if (!conds.utf8.wchars[cond]) return 0; + u8_u16((w_char *) &wc, 1, (char *) cp); + if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond], + wc, (short)conds.utf8.wlen[cond])) return 0; + } + } + } + } + } + return 1; +} + + // see if this suffix is present in the word -struct hentry * SfxEntry::check(const char * word, int len, int optflags, AffEntry* ppfx) +struct hentry * SfxEntry::check(const char * word, int len, int optflags, + AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag) +{ + int tmpl; // length of tmpword + struct hentry * he; // hash entry pointer + unsigned char * cp; + char tmpword[MAXWORDUTF8LEN + 4]; + PfxEntry* ep = (PfxEntry *) ppfx; + + // if this suffix is being cross checked with a prefix + // but it does not support cross products skip it + + if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0)) + return NULL; + + // upon entry suffix is 0 length or already matches the end of the word. 
+ // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + // the second condition is not enough for UTF-8 strings + // it checked in test_condition() + + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + + // generate new root word by removing suffix and adding + // back any characters that would have been stripped or + // or null terminating the shorter string + + strcpy (tmpword, word); + cp = (unsigned char *)(tmpword + tmpl); + if (stripl) { + strcpy ((char *)cp, strip); + tmpl += stripl; + cp = (unsigned char *)(tmpword + tmpl); + } else *cp = '\0'; + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being // tested + + // if all conditions are met then check if resulting + // root word in the dictionary + + if (test_condition((char *) cp, (char *) tmpword)) { + +#ifdef SZOSZABLYA_POSSIBLE_ROOTS + fprintf(stdout,"%s %s %c\n", word, tmpword, aflag); +#endif + if ((he = pmyMgr->lookup(tmpword)) != NULL) { + do { + // check conditional suffix (enabled by prefix) + if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && + TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + (((optflags & aeXPRODUCT) == 0) || + TESTAFF(he->astr, ep->getFlag(), he->alen) || + // enabled by prefix + ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) + ) && + // handle cont. 
class + ((!cclass) || + ((contclass) && TESTAFF(contclass, cclass, contclasslen)) + ) && + // handle required flag + ((!needflag) || + (TESTAFF(he->astr, needflag, he->alen) || + ((contclass) && TESTAFF(contclass, needflag, contclasslen))) + ) + ) return he; + } while ((he = he->next_homonym)); // check homonyms + + // obsolote stemming code (used only by the + // experimental SuffixMgr:suggest_pos_stems) + // store resulting root in wlst + } else if (wlst && (*ns < maxSug)) { + int cwrd = 1; + for (int k=0; k < *ns; k++) + if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0; + if (cwrd) { + wlst[*ns] = mystrdup(tmpword); + if (wlst[*ns] == NULL) { + for (int j=0; j<*ns; j++) free(wlst[j]); + *ns = -1; + return NULL; + } + (*ns)++; + } + } + } + } + return NULL; +} + +// see if two-level suffix is present in the word +struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, + AffEntry* ppfx, const FLAG needflag) { int tmpl; // length of tmpword - int cond; // condition beng examined struct hentry * he; // hash entry pointer unsigned char * cp; - char tmpword[MAXWORDLEN+1]; + char tmpword[MAXWORDUTF8LEN + 4]; PfxEntry* ep = (PfxEntry *) ppfx; // if this suffix is being cross checked with a prefix // but it does not support cross products skip it - if ((optflags & XPRODUCT) != 0 && (xpflg & XPRODUCT) == 0) + if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) return NULL; // upon entry suffix is 0 length or already matches the end of the word. 
@@ -248,25 +590,135 @@ struct hentry * SfxEntry::check(const char * word, int len, int optflags, AffEnt // this file for more info on exactly what is being // tested - for (cond = numconds; --cond >= 0; ) { - if ((conds[*--cp] & (1 << cond)) == 0) break; - } + // if all conditions are met then recall suffix_check + + if (test_condition((char *) cp, (char *) tmpword)) { + if (ppfx) { + // handle conditional suffix + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) + he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); + else + he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag); + } else { + he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag); + } + if (he) return he; + } + } + return NULL; +} - // if all conditions are met then check if resulting - // root word in the dictionary - if (cond < 0) { - if ((he = pmyMgr->lookup(tmpword)) != NULL) { - if (TESTAFF(he->astr, achar , he->alen) && - ((optflags & XPRODUCT) == 0 || - TESTAFF(he->astr, ep->getFlag(), he->alen))) return he; - } - } +// see if two-level suffix is present in the word +char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, + AffEntry* ppfx, const FLAG needflag) +{ + int tmpl; // length of tmpword + unsigned char * cp; + char tmpword[MAXWORDUTF8LEN + 4]; + PfxEntry* ep = (PfxEntry *) ppfx; + char * st; + + char result[MAXLNLEN]; + + *result = '\0'; + + // if this suffix is being cross checked with a prefix + // but it does not support cross products skip it + + if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0) + return NULL; + + // upon entry suffix is 0 length or already matches the end of the word. 
+ // So if the remaining root word has positive length + // and if there are enough chars in root word and added back strip chars + // to meet the number of characters conditions, then test it + + tmpl = len - appndl; + + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + + // generate new root word by removing suffix and adding + // back any characters that would have been stripped or + // or null terminating the shorter string + + strcpy (tmpword, word); + cp = (unsigned char *)(tmpword + tmpl); + if (stripl) { + strcpy ((char *)cp, strip); + tmpl += stripl; + cp = (unsigned char *)(tmpword + tmpl); + } else *cp = '\0'; + + // now make sure all of the conditions on characters + // are met. Please see the appendix at the end of + // this file for more info on exactly what is being + // tested + + // if all conditions are met then recall suffix_check + + if (test_condition((char *) cp, (char *) tmpword)) { + if (ppfx) { + // handle conditional suffix + if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) { + st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); + if (st) { + if (((PfxEntry *) ppfx)->getMorph()) { + strcat(result, ((PfxEntry *) ppfx)->getMorph()); + } + strcat(result,st); + free(st); + mychomp(result); + } + } else { + st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag); + if (st) { + strcat(result, st); + free(st); + mychomp(result); + } + } + } else { + st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag); + if (st) { + strcat(result, st); + free(st); + mychomp(result); + } + } + if (*result) return mystrdup(result); + } } return NULL; } +// get next homonym with same affix +struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx, + const FLAG cclass, const FLAG needflag) +{ + PfxEntry* ep = (PfxEntry *) ppfx; + while (he->next_homonym) { + he = he->next_homonym; + if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && 
TESTAFF(ep->getCont(), aflag, ep->getContLen()))) && + ((optflags & aeXPRODUCT) == 0 || + TESTAFF(he->astr, ep->getFlag(), he->alen) || + // handle conditional suffix + ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) + ) && + // handle cont. class + ((!cclass) || + ((contclass) && TESTAFF(contclass, cclass, contclasslen)) + ) && + // handle required flag + ((!needflag) || + (TESTAFF(he->astr, needflag, he->alen) || + ((contclass) && TESTAFF(contclass, needflag, contclasslen))) + ) + ) return he; + } + return NULL; +} #if 0 @@ -286,14 +738,14 @@ The structure affentry is defined as follows: struct affentry { - unsigned char achar; // char used to represent the affix - char * strip; // string to strip before adding affix - char * appnd; // the affix string to add - short stripl; // length of the strip string - short appndl; // length of the affix string - short numconds; // the number of conditions that must be met - short xpflg; // flag: XPRODUCT- combine both prefix and suffix - char conds[SETSIZE]; // array which encodes the conditions to be met + unsigned short aflag; // ID used to represent the affix + char * strip; // string to strip before adding affix + char * appnd; // the affix string to add + unsigned char stripl; // length of the strip string + unsigned char appndl; // length of the affix string + char numconds; // the number of conditions that must be met + char opts; // flag: aeXPRODUCT- combine both prefix and suffix + char conds[SETSIZE]; // array which encodes the conditions to be met }; diff --git a/src/myspell/affentry.hxx b/src/myspell/affentry.hxx index 9c4713c..1dd784a 100644 --- a/src/myspell/affentry.hxx +++ b/src/myspell/affentry.hxx @@ -5,7 +5,6 @@ #include "baseaffix.hxx" #include "affixmgr.hxx" - /* A Prefix Entry */ class PfxEntry : public AffEntry @@ -22,13 +21,29 @@ public: PfxEntry(AffixMgr* pmgr, affentry* dp ); ~PfxEntry(); - struct hentry * check(const char * word, int len); + inline bool allowCross() { return 
((opts & aeXPRODUCT) != 0); } + struct hentry * check(const char * word, int len, char in_compound, + const FLAG needflag = FLAG_NULL); + + struct hentry * check_twosfx(const char * word, int len, char in_compound, const FLAG needflag = NULL); + + char * check_morph(const char * word, int len, char in_compound, + const FLAG needflag = FLAG_NULL); + + char * check_twosfx_morph(const char * word, int len, + char in_compound, const FLAG needflag = FLAG_NULL); - inline bool allowCross() { return ((xpflg & XPRODUCT) != 0); } - inline unsigned char getFlag() { return achar; } + inline FLAG getFlag() { return aflag; } inline const char * getKey() { return appnd; } char * add(const char * word, int len); + inline short getKeyLen() { return appndl; } + + inline const char * getMorph() { return morphcode; } + + inline const unsigned short * getCont() { return contclass; } + inline short getContLen() { return contclasslen; } + inline PfxEntry * getNext() { return next; } inline PfxEntry * getNextNE() { return nextne; } inline PfxEntry * getNextEQ() { return nexteq; } @@ -38,6 +53,8 @@ public: inline void setNextNE(PfxEntry * ptr) { nextne = ptr; } inline void setNextEQ(PfxEntry * ptr) { nexteq = ptr; } inline void setFlgNxt(PfxEntry * ptr) { flgnxt = ptr; } + + inline int test_condition(const char * st); }; @@ -54,23 +71,50 @@ class SfxEntry : public AffEntry SfxEntry * nexteq; SfxEntry * nextne; SfxEntry * flgnxt; + + SfxEntry * l_morph; + SfxEntry * r_morph; + SfxEntry * eq_morph; public: SfxEntry(AffixMgr* pmgr, affentry* dp ); ~SfxEntry(); + inline bool allowCross() { return ((opts & aeXPRODUCT) != 0); } struct hentry * check(const char * word, int len, int optflags, - AffEntry* ppfx); + AffEntry* ppfx, char ** wlst, int maxSug, int * ns, + const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL); + + struct hentry * check_twosfx(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag = NULL); - inline bool allowCross() { return ((xpflg & 
XPRODUCT) != 0); } - inline unsigned char getFlag() { return achar; } + char * check_twosfx_morph(const char * word, int len, int optflags, + AffEntry* ppfx, const FLAG needflag = FLAG_NULL); + struct hentry * get_next_homonym(struct hentry * he); + struct hentry * get_next_homonym(struct hentry * word, int optflags, AffEntry* ppfx, + const FLAG cclass, const FLAG needflag); + + + inline FLAG getFlag() { return aflag; } inline const char * getKey() { return rappnd; } char * add(const char * word, int len); + + inline const char * getMorph() { return morphcode; } + + inline const unsigned short * getCont() { return contclass; } + inline short getContLen() { return contclasslen; } + inline const char * getAffix() { return appnd; } + + inline short getKeyLen() { return appndl; } + inline SfxEntry * getNext() { return next; } inline SfxEntry * getNextNE() { return nextne; } inline SfxEntry * getNextEQ() { return nexteq; } + + inline SfxEntry * getLM() { return l_morph; } + inline SfxEntry * getRM() { return r_morph; } + inline SfxEntry * getEQM() { return eq_morph; } inline SfxEntry * getFlgNxt() { return flgnxt; } inline void setNext(SfxEntry * ptr) { next = ptr; } @@ -78,9 +122,9 @@ public: inline void setNextEQ(SfxEntry * ptr) { nexteq = ptr; } inline void setFlgNxt(SfxEntry * ptr) { flgnxt = ptr; } + inline int test_condition(const char * st, const char * begin); }; - #endif diff --git a/src/myspell/affixmgr.cxx b/src/myspell/affixmgr.cxx index 3a5714b..69220e5 100644 --- a/src/myspell/affixmgr.cxx +++ b/src/myspell/affixmgr.cxx @@ -1,50 +1,104 @@ -#include "license.readme" +#include "license.hunspell" +#include "license.myspell" #include <cstdlib> #include <cstring> +#include <cctype> #include <cstdio> #include "affixmgr.hxx" #include "affentry.hxx" +#include "langnum.hxx" -#ifndef WINDOWS +#include "csutil.hxx" + +#ifndef W32 using namespace std; #endif - -// First some base level utility routines -extern void mychomp(char * s); -extern char * mystrdup(const char 
* s); -extern char * myrevstrdup(const char * s); -extern char * mystrsep(char ** sptr, const char delim); -extern int isSubset(const char * s1, const char * s2); -extern int isRevSubset(const char * s1, const char * end_of_s2, int len_s2); - - AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) { // register hash manager and load affix data from aff file pHMgr = ptr; trystring = NULL; encoding=NULL; - reptable = NULL; - numrep = 0; + utf8 = 0; + utf_tbl = NULL; + complexprefixes = 0; maptable = NULL; nummap = 0; - compound=NULL; - nosplitsugs= (0==1); - + breaktable = NULL; + numbreak = 0; + reptable = NULL; + numrep = 0; + checkcpdtable = NULL; + numcheckcpd = 0; + defcpdtable = NULL; + numdefcpd = 0; + compoundflag = FLAG_NULL; // permits word in compound forms + compoundbegin = FLAG_NULL; // may be first word in compound forms + compoundmiddle = FLAG_NULL; // may be middle word in compound forms + compoundend = FLAG_NULL; // may be last word in compound forms + compoundroot = FLAG_NULL; // compound word signing flag + compoundpermitflag = FLAG_NULL; // compound permitting flag for suffixed word + compoundforbidflag = FLAG_NULL; // compound fordidden flag for suffixed word + checkcompounddup = 0; // forbid double words in compounds + checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution) + checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds + checkcompoundtriple = 0; // forbid compounds with triple letters + forbiddenword = FLAG_NULL; // forbidden word signing flag + nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag + lang = NULL; // language + langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) + pseudoroot = FLAG_NULL; // forbidden root, allowed only with suffixes + cpdwordmax=0; // default: unlimited wordcount in compound words cpdmin = 3; // default value + cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words + 
cpdvowels=NULL; // vowels (for calculating of Hungarian compounding limit, O(n) search! XXX) + cpdvowels_utf16=NULL; // vowels for UTF-8 encoding (bsearch instead of O(n) search) + cpdvowels_utf16_len=0; // vowels + pfxappnd=NULL; // previous prefix for counting the syllables of prefix BUG + sfxappnd=NULL; // previous suffix for counting a special syllables BUG + cpdsyllablenum=NULL; // syllable count incrementing flag + checknum=0; // checking numbers, and word with numbers + wordchars=NULL; // letters + spec. word characters + wordchars_utf16=NULL; // letters + spec. word characters + wordchars_utf16_len=0; // letters + spec. word characters + version=NULL; // affix and dictionary file version string + havecontclass=0; // flags of possible continuing classes (double affix) + // LEMMA_PRESENT: not put root into the morphological output. Lemma presents + // in morhological description in dictionary file. It's often combined with PSEUDOROOT. + lemma_present = FLAG_NULL; + circumfix = FLAG_NULL; + onlyincompound = FLAG_NULL; + flag_mode = FLAG_CHAR; // default one-character flags in affix and dic file + maxngramsugs = -1; // undefined + nosplitsugs = 0; + sugswithdots = 0; + keepcase = 0; + checksharps = 0; + + derived = NULL; // XXX not threadsafe variable for experimental stemming + sfx = NULL; + pfx = NULL; + for (int i=0; i < SETSIZE; i++) { pStart[i] = NULL; sStart[i] = NULL; pFlag[i] = NULL; sFlag[i] = NULL; } + + for (int j=0; j < CONTSIZE; j++) { + contclasses[j] = 0; + } + if (parse_file(affpath)) { fprintf(stderr,"Failure loading aff file %s\n",affpath); fflush(stderr); + wordchars = mystrdup("qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM"); } + } @@ -74,7 +128,8 @@ AffixMgr::~AffixMgr() delete(ptr); ptr = nptr; nptr = NULL; - } + } + sStart[j] = NULL; } if (trystring) free(trystring); @@ -83,7 +138,8 @@ AffixMgr::~AffixMgr() encoding=NULL; if (maptable) { for (int j=0; j < nummap; j++) { - free(maptable[j].set); + if (maptable[j].set) 
free(maptable[j].set); + if (maptable[j].set_utf16) free(maptable[j].set_utf16); maptable[j].set = NULL; maptable[j].len = 0; } @@ -91,21 +147,73 @@ AffixMgr::~AffixMgr() maptable = NULL; } nummap = 0; + if (breaktable) { + for (int j=0; j < numbreak; j++) { + if (breaktable[j]) free(breaktable[j]); + breaktable[j] = NULL; + } + free(breaktable); + breaktable = NULL; + } + numbreak = 0; if (reptable) { for (int j=0; j < numrep; j++) { free(reptable[j].pattern); - free(reptable[j].replacement); + free(reptable[j].pattern2); reptable[j].pattern = NULL; - reptable[j].replacement = NULL; + reptable[j].pattern2 = NULL; } free(reptable); reptable = NULL; } + if (defcpdtable) { + for (int j=0; j < numdefcpd; j++) { + free(defcpdtable[j].def); + defcpdtable[j].def = NULL; + } + free(defcpdtable); + defcpdtable = NULL; + } numrep = 0; - if (compound) free(compound); - compound=NULL; + if (checkcpdtable) { + for (int j=0; j < numcheckcpd; j++) { + free(checkcpdtable[j].pattern); + free(checkcpdtable[j].pattern2); + checkcpdtable[j].pattern = NULL; + checkcpdtable[j].pattern2 = NULL; + } + free(checkcpdtable); + checkcpdtable = NULL; + } + numcheckcpd = 0; + FREE_FLAG(compoundflag); + FREE_FLAG(compoundbegin); + FREE_FLAG(compoundmiddle); + FREE_FLAG(compoundend); + FREE_FLAG(compoundpermitflag); + FREE_FLAG(compoundforbidflag); + FREE_FLAG(compoundroot); + FREE_FLAG(forbiddenword); + FREE_FLAG(nosuggest); + FREE_FLAG(pseudoroot); + FREE_FLAG(lemma_present); + FREE_FLAG(circumfix); + FREE_FLAG(onlyincompound); + + cpdwordmax = 0; pHMgr = NULL; cpdmin = 0; + cpdmaxsyllable = 0; + if (cpdvowels) free(cpdvowels); + if (cpdvowels_utf16) free(cpdvowels_utf16); + if (cpdsyllablenum) free(cpdsyllablenum); + if (utf_tbl) free(utf_tbl); + if (lang) free(lang); + if (wordchars) free(wordchars); + if (wordchars_utf16) free(wordchars_utf16); + if (version) free(version); + if (derived) free(derived); + checknum=0; } @@ -118,6 +226,10 @@ int AffixMgr::parse_file(const char * affpath) // 
affix type char ft; + + // checking flag duplication + char dupflags[CONTSIZE]; + char dupflags_ini = 1; // open the affix file FILE * afflst; @@ -151,16 +263,167 @@ int AffixMgr::parse_file(const char * affpath) } } + /* parse COMPLEXPREFIXES for agglutinative languages with right-to-left writing system */ + if (strncmp(line,"COMPLEXPREFIXES",15) == 0) + complexprefixes = 1; + /* parse in the flag used by the controlled compound words */ if (strncmp(line,"COMPOUNDFLAG",12) == 0) { - if (parse_cpdflag(line)) { + if (parse_flag(line, &compoundflag, "COMPOUNDFLAG")) { return 1; } } - /* parse in the flag used by the controlled compound words */ + /* parse in the flag used by compound words */ + if (strncmp(line,"COMPOUNDBEGIN",13) == 0) { + if (complexprefixes) { + if (parse_flag(line, &compoundend, "COMPOUNDBEGIN")) { + return 1; + } + } else { + if (parse_flag(line, &compoundbegin, "COMPOUNDBEGIN")) { + return 1; + } + } + } + + /* parse in the flag used by compound words */ + if (strncmp(line,"COMPOUNDMIDDLE",14) == 0) { + if (parse_flag(line, &compoundmiddle, "COMPOUNDMIDDLE")) { + return 1; + } + } + /* parse in the flag used by compound words */ + if (strncmp(line,"COMPOUNDEND",11) == 0) { + if (complexprefixes) { + if (parse_flag(line, &compoundbegin, "COMPOUNDEND")) { + return 1; + } + } else { + if (parse_flag(line, &compoundend, "COMPOUNDEND")) { + return 1; + } + } + } + + /* parse in the flag used by compound_check() method */ + if (strncmp(line,"COMPOUNDWORDMAX",15) == 0) { + if (parse_num(line, &cpdwordmax, "COMPOUNDWORDMAX")) { + return 1; + } + } + + /* parse in the flag sign compounds in dictionary */ + if (strncmp(line,"COMPOUNDROOT",12) == 0) { + if (parse_flag(line, &compoundroot, "COMPOUNDROOT")) { + return 1; + } + } + + /* parse in the flag used by compound_check() method */ + if (strncmp(line,"COMPOUNDPERMITFLAG",18) == 0) { + if (parse_flag(line, &compoundpermitflag, "COMPOUNDPERMITFLAG")) { + return 1; + } + } + + /* parse in the flag used 
by compound_check() method */ + if (strncmp(line,"COMPOUNDFORBIDFLAG",18) == 0) { + if (parse_flag(line, &compoundforbidflag, "COMPOUNDFORBIDFLAG")) { + return 1; + } + } + + if (strncmp(line,"CHECKCOMPOUNDDUP",16) == 0) + checkcompounddup = 1; + + if (strncmp(line,"CHECKCOMPOUNDREP",16) == 0) + checkcompoundrep = 1; + + if (strncmp(line,"CHECKCOMPOUNDTRIPLE",19) == 0) + checkcompoundtriple = 1; + + if (strncmp(line,"CHECKCOMPOUNDCASE",17) == 0) + checkcompoundcase = 1; + + if (strncmp(line,"NOSUGGEST",9) == 0) { + if (parse_flag(line, &nosuggest, "NOSUGGEST")) { + return 1; + } + } + + /* parse in the flag used by forbidden words */ + if (strncmp(line,"FORBIDDENWORD",13) == 0) { + if (parse_flag(line, &forbiddenword, "FORBIDDENWORD")) { + return 1; + } + } + + /* parse in the flag used by forbidden words */ + if (strncmp(line,"LEMMA_PRESENT",13) == 0) { + if (parse_flag(line, &lemma_present, "LEMMA_PRESENT")) { + return 1; + } + } + + /* parse in the flag used by circumfixes */ + if (strncmp(line,"CIRCUMFIX",9) == 0) { + if (parse_flag(line, &circumfix, "CIRCUMFIX")) { + return 1; + } + } + + /* parse in the flag used by fogemorphemes */ + if (strncmp(line,"ONLYINCOMPOUND",14) == 0) { + if (parse_flag(line, &onlyincompound, "ONLYINCOMPOUND")) { + return 1; + } + } + + /* parse in the flag used by `pseudoroots' */ + if (strncmp(line,"PSEUDOROOT",10) == 0) { + if (parse_flag(line, &pseudoroot, "PSEUDOROOT")) { + return 1; + } + } + + /* parse in the flag used by `pseudoroots' */ + if (strncmp(line,"NEEDAFFIX",9) == 0) { + if (parse_flag(line, &pseudoroot, "NEEDAFFIX")) { + return 1; + } + } + + /* parse in the minimal length for words in compounds */ if (strncmp(line,"COMPOUNDMIN",11) == 0) { - if (parse_cpdmin(line)) { + if (parse_num(line, &cpdmin, "COMPOUNDMIN")) { + return 1; + } + if (cpdmin < 1) cpdmin = 1; + } + + /* parse in the max. 
words and syllables in compounds */ + if (strncmp(line,"COMPOUNDSYLLABLE",16) == 0) { + if (parse_cpdsyllable(line)) { + return 1; + } + } + + /* parse in the flag used by compound_check() method */ + if (strncmp(line,"SYLLABLENUM",11) == 0) { + if (parse_syllablenum(line)) { + return 1; + } + } + + /* parse in the flag used by the controlled compound words */ + if (strncmp(line,"CHECKNUM",8) == 0) { + checknum=1; + } + + /* parse in the try string */ + if (strncmp(line,"WORDCHARS",9) == 0) { + if (parse_wordchars(line)) { return 1; } } @@ -172,6 +435,20 @@ int AffixMgr::parse_file(const char * affpath) } } + /* parse in the checkcompoundpattern table */ + if (strncmp(line,"CHECKCOMPOUNDPATTERN",20) == 0) { + if (parse_checkcpdtable(line, afflst)) { + return 1; + } + } + + /* parse in the defcompound table */ + if (strncmp(line,"COMPOUNDRULE",12) == 0) { + if (parse_defcpdtable(line, afflst)) { + return 1; + } + } + /* parse in the related character map table */ if (strncmp(line,"MAP",3) == 0) { if (parse_maptable(line, afflst)) { @@ -179,19 +456,64 @@ int AffixMgr::parse_file(const char * affpath) } } - // parse this affix: P - prefix, S - suffix - ft = ' '; - if (strncmp(line,"PFX",3) == 0) ft = 'P'; - if (strncmp(line,"SFX",3) == 0) ft = 'S'; - if (ft != ' ') { - if (parse_affix(line, ft, afflst)) { + /* parse in the word breakpoints table */ + if (strncmp(line,"BREAK",5) == 0) { + if (parse_breaktable(line, afflst)) { + return 1; + } + } + + /* parse in the language for language specific codes */ + if (strncmp(line,"LANG",4) == 0) { + if (parse_lang(line)) { + return 1; + } + } + + if (strncmp(line,"VERSION",7) == 0) { + if (parse_version(line)) { + return 1; + } + } + + if (strncmp(line,"MAXNGRAMSUGS",12) == 0) { + if (parse_num(line, &maxngramsugs, "MAXNGRAMSUGS")) { return 1; } } - // handle NOSPLITSUGS if (strncmp(line,"NOSPLITSUGS",11) == 0) - nosplitsugs=(0==0); + nosplitsugs=1; + + if (strncmp(line,"SUGSWITHDOTS",12) == 0) + sugswithdots=1; + + /* parse 
in the flag used by forbidden words */ + if (strncmp(line,"KEEPCASE",8) == 0) { + if (parse_flag(line, &keepcase, "KEEPCASE")) { + return 1; + } + } + + if (strncmp(line,"CHECKSHARPS",11) == 0) + checksharps=1; + + /* parse this affix: P - prefix, S - suffix */ + ft = ' '; + if (strncmp(line,"PFX",3) == 0) ft = complexprefixes ? 'S' : 'P'; + if (strncmp(line,"SFX",3) == 0) ft = complexprefixes ? 'P' : 'S'; + if (ft != ' ') { + if (dupflags_ini) { + for (int i = 0; i < CONTSIZE; i++) dupflags[i] = 0; + dupflags_ini = 0; + } + if (parse_affix(line, ft, afflst, dupflags)) { + fclose(afflst); + process_pfx_tree_to_list(); + process_sfx_tree_to_list(); + return 1; + } + } } fclose(afflst); @@ -226,6 +548,29 @@ int AffixMgr::parse_file(const char * affpath) process_pfx_order(); process_sfx_order(); + // expand wordchars string, based on csutil (for external tokenization) + + char * enc = get_encoding(); + csconv = get_current_cs(enc); + free(enc); + enc = NULL; + + char expw[MAXLNLEN]; + if (wordchars) { + strcpy(expw, wordchars); + free(wordchars); + } else *expw = '\0'; + + for (int i = 0; i <= 255; i++) { + if ( (csconv[i].cupper != csconv[i].clower) && + (! 
strchr(expw, (char) i))) { + *(expw + strlen(expw) + 1) = '\0'; + *(expw + strlen(expw)) = (char) i; + } + } + + wordchars = mystrdup(expw); + return 0; } @@ -295,8 +640,6 @@ int AffixMgr::build_pfxtree(AffEntry* pfxptr) return 0; } - - // we want to be able to quickly access suffix information // both by suffix flag, and sorted by the reverse of the // suffix string itself; so we need to set up two indexes @@ -315,7 +658,6 @@ int AffixMgr::build_sfxtree(AffEntry* sfxptr) ep->setFlgNxt(ptr); sFlag[flg] = (AffEntry *) ep; - // next index by affix string // handle the special case of null affix string @@ -340,7 +682,6 @@ int AffixMgr::build_sfxtree(AffEntry* sfxptr) return 0; } - // otherwise use binary tree insertion so that a sorted // list can easily be generated later pptr = NULL; @@ -363,7 +704,6 @@ int AffixMgr::build_sfxtree(AffEntry* sfxptr) return 0; } - // convert from binary tree to sorted list int AffixMgr::process_pfx_tree_to_list() { @@ -405,7 +745,6 @@ AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr) } - // reinitialize the PfxEntry links NextEQ and NextNE to speed searching // using the idea of leading subsets this time int AffixMgr::process_pfx_order() @@ -455,9 +794,7 @@ int AffixMgr::process_pfx_order() return 0; } - - -// reinitialize the SfxEntry links NextEQ and NextNE to speed searching +// initialize the SfxEntry links NextEQ and NextNE to speed searching // using the idea of leading subsets this time int AffixMgr::process_sfx_order() { @@ -513,14 +850,16 @@ int AffixMgr::process_sfx_order() // file affentry.cxx which describes what is going on here // in much more detail -void AffixMgr::encodeit(struct affentry * ptr, char * cs) +int AffixMgr::encodeit(struct affentry * ptr, char * cs) { unsigned char c; int i, j, k; unsigned char mbr[MAXLNLEN]; + w_char wmbr[MAXLNLEN]; + w_char * wpos = wmbr; // now clear the conditions array */ - for (i=0;i<SETSIZE;i++) ptr->conds[i] = (unsigned char) 0; + for (i=0;i<SETSIZE;i++) 
ptr->conds.base[i] = (unsigned char) 0; // now parse the string to create the conds array */ int nc = strlen(cs); @@ -533,7 +872,7 @@ void AffixMgr::encodeit(struct affentry * ptr, char * cs) // if no condition just return if (strcmp(cs,".")==0) { ptr->numconds = 0; - return; + return 0; } i = 0; @@ -570,21 +909,21 @@ void AffixMgr::encodeit(struct affentry * ptr, char * cs) ec = 1; } - - if (ec) { + if (ec) { + if (!utf8) { if (grp == 1) { if (neg == 0) { // set the proper bits in the condition array vals for those chars for (j=0;j<nm;j++) { k = (unsigned int) mbr[j]; - ptr->conds[k] = ptr->conds[k] | (1 << n); + ptr->conds.base[k] = ptr->conds.base[k] | (1 << n); } } else { // complement so set all of them and then unset indicated ones - for (j=0;j<SETSIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n); + for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | (1 << n); for (j=0;j<nm;j++) { k = (unsigned int) mbr[j]; - ptr->conds[k] = ptr->conds[k] & ~(1 << n); + ptr->conds.base[k] = ptr->conds.base[k] & ~(1 << n); } } neg = 0; @@ -595,33 +934,115 @@ void AffixMgr::encodeit(struct affentry * ptr, char * cs) // but first handle special case of . 
inside condition if (c == '.') { // wild card character so set them all - for (j=0;j<SETSIZE;j++) ptr->conds[j] = ptr->conds[j] | (1 << n); + for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | (1 << n); } else { - ptr->conds[(unsigned int) c] = ptr->conds[(unsigned int)c] | (1 << n); + ptr->conds.base[(unsigned int) c] = ptr->conds.base[(unsigned int)c] | (1 << n); } } n++; ec = 0; - } - + } else { // UTF-8 character set + if (grp == 1) { + ptr->conds.utf8.neg[n] = neg; + if (neg == 0) { + // set the proper bits in the condition array vals for those chars + for (j=0;j<nm;j++) { + k = (unsigned int) mbr[j]; + if (k >> 7) { + u8_u16(wpos, 1, (char *) mbr + j); + wpos++; + if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character + } else { + ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] | (1 << n); + } + } + } else { // neg == 1 + // complement so set all of them and then unset indicated ones + for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | (1 << n); + for (j=0;j<nm;j++) { + k = (unsigned int) mbr[j]; + if (k >> 7) { + u8_u16(wpos, 1, (char *) mbr + j); + wpos++; + if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character + } else { + ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] & ~(1 << n); + } + } + } + neg = 0; + grp = 0; + nm = 0; + ptr->conds.utf8.wlen[n] = wpos - wmbr; + if ((wpos - wmbr) != 0) { + ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char) * (wpos - wmbr)); + if (!ptr->conds.utf8.wchars[n]) return 1; + memcpy(ptr->conds.utf8.wchars[n], wmbr, sizeof(w_char) * (wpos - wmbr)); + flag_qsort((unsigned short *) ptr->conds.utf8.wchars[n], 0, ptr->conds.utf8.wlen[n]); + wpos = wmbr; + } + } else { // grp == 0 + // is UTF-8 character? 
+ if (c >> 7) { + ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char)); + if (!ptr->conds.utf8.wchars[n]) return 1; + ptr->conds.utf8.wlen[n] = 1; + u8_u16(ptr->conds.utf8.wchars[n], 1, cs + i); + if ((c & 0xe0) == 0xe0) i+=2; else i++; // 3-byte UFT-8 character + } else { + ptr->conds.utf8.wchars[n] = NULL; + // not a group so just set the proper bit for this char + // but first handle special case of . inside condition + if (c == '.') { + ptr->conds.utf8.all[n] = 1; + // wild card character so set them all + for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | (1 << n); + } else { + ptr->conds.utf8.all[n] = 0; + ptr->conds.utf8.ascii[(unsigned int) c] = ptr->conds.utf8.ascii[(unsigned int)c] | (1 << n); + } + } + neg = 0; + } + n++; + ec = 0; + neg = 0; + } + } i++; } ptr->numconds = n; - return; + return 0; } // check word for prefixes -struct hentry * AffixMgr::prefix_check (const char * word, int len) +struct hentry * AffixMgr::prefix_check(const char * word, int len, char in_compound, + const FLAG needflag) { struct hentry * rv= NULL; - + + pfx = NULL; + pfxappnd = NULL; + sfxappnd = NULL; + // first handle the special case of 0 length prefixes PfxEntry * pe = (PfxEntry *) pStart[0]; while (pe) { - rv = pe->check(word,len); - if (rv) return rv; + if ( + // fogemorpheme + ((in_compound != IN_CPD_NOT) || !(pe->getCont() && + (TESTAFF(pe->getCont(), onlyincompound, pe->getContLen())))) && + // permit prefixes in compounds + ((in_compound != IN_CPD_END) || (pe->getCont() && + (TESTAFF(pe->getCont(), compoundpermitflag, pe->getContLen())))) && + // check prefix + (rv = pe->check(word, len, in_compound, needflag)) + ) { + pfx=(AffEntry *)pe; // BUG: pfx not stateless + return rv; + } pe = pe->getNext(); } @@ -631,8 +1052,19 @@ struct hentry * AffixMgr::prefix_check (const char * word, int len) while (pptr) { if (isSubset(pptr->getKey(),word)) { - rv = pptr->check(word,len); - if (rv) return rv; + if ( + // fogemorpheme + 
((in_compound != IN_CPD_NOT) || !(pptr->getCont() && + (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen())))) && + // permit prefixes in compounds + ((in_compound != IN_CPD_END) || (pptr->getCont() && + (TESTAFF(pptr->getCont(), compoundpermitflag, pptr->getContLen())))) && + // check prefix + (rv = pptr->check(word, len, in_compound, needflag)) + ) { + pfx=(AffEntry *)pptr; // BUG: pfx not stateless + return rv; + } pptr = pptr->getNextEQ(); } else { pptr = pptr->getNextNE(); @@ -642,113 +1074,1574 @@ struct hentry * AffixMgr::prefix_check (const char * word, int len) return NULL; } -// check if compound word is correctly spelled -struct hentry * AffixMgr::compound_check (const char * word, int len, char compound_flag) +// check word for prefixes +struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len, + char in_compound, const FLAG needflag) { - int i; struct hentry * rv= NULL; + + pfx = NULL; + sfxappnd = NULL; + + // first handle the special case of 0 length prefixes + PfxEntry * pe = (PfxEntry *) pStart[0]; + + while (pe) { + rv = pe->check_twosfx(word, len, in_compound, needflag); + if (rv) return rv; + pe = pe->getNext(); + } + + // now handle the general case + unsigned char sp = *((const unsigned char *)word); + PfxEntry * pptr = (PfxEntry *)pStart[sp]; + + while (pptr) { + if (isSubset(pptr->getKey(),word)) { + rv = pptr->check_twosfx(word, len, in_compound, needflag); + if (rv) { + pfx = (AffEntry *)pptr; + return rv; + } + pptr = pptr->getNextEQ(); + } else { + pptr = pptr->getNextNE(); + } + } + + return NULL; +} + + +// check word for prefixes +char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound, + const FLAG needflag) +{ char * st; - char ch; + + char result[MAXLNLEN]; + result[0] = '\0'; + + pfx = NULL; + sfxappnd = NULL; - // handle case of string too short to be a piece of a compound word - if (len < cpdmin) return NULL; + // first handle the special case of 0 length prefixes + PfxEntry * pe 
= (PfxEntry *) pStart[0]; + while (pe) { + st = pe->check_morph(word,len,in_compound, needflag); + if (st) { + strcat(result, st); + free(st); + } + // if (rv) return rv; + pe = pe->getNext(); + } + + // now handle the general case + unsigned char sp = *((const unsigned char *)word); + PfxEntry * pptr = (PfxEntry *)pStart[sp]; - st = mystrdup(word); + while (pptr) { + if (isSubset(pptr->getKey(),word)) { + st = pptr->check_morph(word,len,in_compound, needflag); + if (st) { + // fogemorpheme + if ((in_compound != IN_CPD_NOT) || !((pptr->getCont() && + (TESTAFF(pptr->getCont(), onlyincompound, pptr->getContLen()))))) { + strcat(result, st); + pfx = (AffEntry *)pptr; + } + free(st); + } + pptr = pptr->getNextEQ(); + } else { + pptr = pptr->getNextNE(); + } + } - for (i=cpdmin; i < (len - (cpdmin-1)); i++) { + if (*result) return mystrdup(result); + return NULL; +} + + +// check word for prefixes +char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, + char in_compound, const FLAG needflag) +{ + char * st; + + char result[MAXLNLEN]; + result[0] = '\0'; + + pfx = NULL; + sfxappnd = NULL; + + // first handle the special case of 0 length prefixes + PfxEntry * pe = (PfxEntry *) pStart[0]; + while (pe) { + st = pe->check_twosfx_morph(word,len,in_compound, needflag); + if (st) { + strcat(result, st); + free(st); + } + pe = pe->getNext(); + } + + // now handle the general case + unsigned char sp = *((const unsigned char *)word); + PfxEntry * pptr = (PfxEntry *)pStart[sp]; + + while (pptr) { + if (isSubset(pptr->getKey(),word)) { + st = pptr->check_twosfx_morph(word, len, in_compound, needflag); + if (st) { + strcat(result, st); + free(st); + pfx = (AffEntry *)pptr; + } + pptr = pptr->getNextEQ(); + } else { + pptr = pptr->getNextNE(); + } + } + + if (*result) return mystrdup(result); + return NULL; +} + +// Is word a non compound with a REP substitution (see checkcompoundrep)? 
+int AffixMgr::cpdrep_check(const char * word, int wl) +{ + char candidate[MAXLNLEN]; + const char * r; + int lenr, lenp; + + if ((wl < 2) || !numrep) return 0; + + for (int i=0; i < numrep; i++ ) { + r = word; + lenr = strlen(reptable[i].pattern2); + lenp = strlen(reptable[i].pattern); + // search every occurence of the pattern in the word + while ((r=strstr(r, reptable[i].pattern)) != NULL) { + strcpy(candidate, word); + if (r-word + lenr + strlen(r+lenp) >= MAXLNLEN) break; + strcpy(candidate+(r-word),reptable[i].pattern2); + strcpy(candidate+(r-word)+lenr, r+lenp); + if (candidate_check(candidate,strlen(candidate))) return 1; + if (candidate_check(candidate,strlen(candidate))) return 1; + r++; // search for the next letter + } + } + return 0; +} + +// forbid compoundings when there are special patterns at word bound +int AffixMgr::cpdpat_check(const char * word, int pos) +{ + int len; + for (int i = 0; i < numcheckcpd; i++) { + if (isSubset(checkcpdtable[i].pattern2, word + pos) && + (len = strlen(checkcpdtable[i].pattern)) && (pos > len) && + (strncmp(word + pos - len, checkcpdtable[i].pattern, len) == 0)) return 1; + } + return 0; +} + +// forbid compounding with neighbouring upper and lower case characters at word bounds +int AffixMgr::cpdcase_check(const char * word, int pos) +{ + if (utf8) { + w_char u, w; + const char * p; + u8_u16(&u, 1, word + pos); + for (p = word + pos - 1; (*p & 0xc0) == 0x80; p--); + u8_u16(&w, 1, p); + unsigned short a = (u.h << 8) + u.l; + unsigned short b = (w.h << 8) + w.l; + if (utf_tbl[a].cletter && utf_tbl[a].cletter && + ((utf_tbl[a].cupper == a) || (utf_tbl[b].cupper == b))) return 1; + } else { + unsigned char a = *(word + pos - 1); + unsigned char b = *(word + pos); + if ((csconv[a].ccase || csconv[b].ccase) && (a != '-') && (b != '-')) return 1; + } + return 0; +} + +// check compound patterns +int AffixMgr::defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** def, char all) +{ + short btpp[MAXWORDLEN]; // 
metacharacter (*, ?) positions for backtracking + short btwp[MAXWORDLEN]; // word positions for metacharacters + int btnum[MAXWORDLEN]; // number of matched characters in metacharacter positions + short bt = 0; + int i; + int ok; + int w = 0; + if (!*words) { + w = 1; + *words = def; + } + (*words)[wnum] = rv; + + for (i = 0; i < numdefcpd; i++) { + int pp = 0; // pattern position + int wp = 0; // "words" position + int ok2; + ok = 1; + ok2 = 1; + do { + while ((pp < defcpdtable[i].len) && (wp <= wnum)) { + if (((pp+1) < defcpdtable[i].len) && + ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) { + int wend = (defcpdtable[i].def[pp+1] == '?') ? wp : wnum; + ok2 = 1; + pp+=2; + btpp[bt] = pp; + btwp[bt] = wp; + while (wp <= wend) { + if (!(*words)[wp]->alen || + !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp-2], (*words)[wp]->alen)) { + ok2 = 0; + break; + } + wp++; + } + if (wp <= wnum) ok2 = 0; + btnum[bt] = wp - btwp[bt]; + if (btnum[bt] > 0) bt++; + if (ok2) break; + } else { + ok2 = 1; + if (!(*words)[wp] || !(*words)[wp]->alen || + !TESTAFF((*words)[wp]->astr, defcpdtable[i].def[pp], (*words)[wp]->alen)) { + ok = 0; + break; + } + pp++; + wp++; + if ((defcpdtable[i].len == pp) && !(wp > wnum)) ok = 0; + } + } + if (ok && ok2) { + int r = pp; + while ((defcpdtable[i].len > r) && ((r+1) < defcpdtable[i].len) && + ((defcpdtable[i].def[r+1] == '*') || (defcpdtable[i].def[r+1] == '?'))) r+=2; + if (defcpdtable[i].len <= r) return 1; + } + // backtrack + if (bt) do { + ok = 1; + btnum[bt - 1]--; + pp = btpp[bt - 1]; + wp = btwp[bt - 1] + btnum[bt - 1]; + } while ((btnum[bt - 1] < 0) && --bt); + } while (bt); + + if (ok && ok2 && (!all || (defcpdtable[i].len <= pp))) return 1; + // check zero ending + while (ok && ok2 && (defcpdtable[i].len > pp) && ((pp+1) < defcpdtable[i].len) && + ((defcpdtable[i].def[pp+1] == '*') || (defcpdtable[i].def[pp+1] == '?'))) pp+=2; + if (ok && ok2 && (defcpdtable[i].len <= pp)) return 1; + } + (*words)[wnum] 
= NULL; + if (w) *words = NULL; + return 0; +} + +inline int AffixMgr::candidate_check(const char * word, int len) +{ + struct hentry * rv=NULL; + + rv = lookup(word); + if (rv) return 1; + +// rv = prefix_check(word,len,1); +// if (rv) return 1; + + rv = affix_check(word,len); + if (rv) return 1; + return 0; +} + +// calculate number of syllable for compound-checking +int AffixMgr::get_syllable(const char * word, int wlen) +{ + if (cpdmaxsyllable==0) return 0; + + int num=0; + + if (!utf8) { + for (int i=0; i<wlen; i++) { + if (strchr(cpdvowels, word[i])) num++; + } + } else if (cpdvowels_utf16) { + w_char w[MAXWORDUTF8LEN]; + int i = u8_u16(w, MAXWORDUTF8LEN, word); + for (; i; i--) { + if (flag_bsearch((unsigned short *) cpdvowels_utf16, + ((unsigned short *) w)[i - 1], cpdvowels_utf16_len)) num++; + } + } + return num; +} + +// check if compound word is correctly spelled +// hu_mov_rule = spec. Hungarian rule (XXX) +struct hentry * AffixMgr::compound_check(const char * word, int len, + short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words = NULL, + char hu_mov_rule = 0, int * cmpdstemnum = NULL, int * cmpdstem = NULL, char is_sug = 0) +{ + int i, oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; + int oldcmpdstemnum = 0; + struct hentry * rv = NULL; + struct hentry * rv_first; + struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking + char st [MAXWORDUTF8LEN + 4]; + char ch; + int cmin; + int cmax; + + int checked_prefix; + +#ifdef HUNSTEM + if (cmpdstemnum) { + if (wordnum == 0) { + *cmpdstemnum = 1; + } else { + (*cmpdstemnum)++; + } + } +#endif + if (utf8) { + for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) { + cmin++; + for (; (word[cmin] & 0xc0) == 0x80; cmin++); + } + for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) { + cmax--; + for (; (word[cmax] & 0xc0) == 0x80; cmax--); + } + } else { + cmin = cpdmin; + cmax = len - cpdmin + 1; + } + + strcpy(st, word); + + for (i = cmin; i < cmax; 
i++) { + + oldnumsyllable = numsyllable; + oldwordnum = wordnum; + checked_prefix = 0; + + // go to end of the UTF-8 character + if (utf8) { + for (; (st[i] & 0xc0) == 0x80; i++); + if (i >= cmax) return NULL; + } + + ch = st[i]; - st[i] = '\0'; + st[i] = '\0'; + + sfx = NULL; + pfx = NULL; + + // FIRST WORD + + rv = lookup(st); // perhaps without prefix + + // search homonym with compound flag + while ((rv) && !hu_mov_rule && + ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundbegin && !wordnum && + TESTAFF(rv->astr, compoundbegin, rv->alen)) || + (compoundmiddle && wordnum && !words && + TESTAFF(rv->astr, compoundmiddle, rv->alen)) || + (numdefcpd && + ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) || + (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)))) + ))) { + rv = rv->next_homonym; + } - rv = lookup(st); - if (!rv) rv = affix_check(st,i); + if (!rv) { + if (compoundflag && + !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { + if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, + FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule && + ((SfxEntry*)sfx)->getCont() && + ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag, + ((SfxEntry*)sfx)->getContLen())) || (compoundend && + TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend, + ((SfxEntry*)sfx)->getContLen())))) { + rv = NULL; + } + } + if (rv || + (((wordnum == 0) && compoundbegin && + ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || + (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) || + ((wordnum > 0) && compoundmiddle && + ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? 
IN_CPD_OTHER : IN_CPD_BEGIN)) || + (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) + ) checked_prefix = 1; + // else check forbiddenwords and pseudoroot + } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || + TESTAFF(rv->astr, pseudoroot, rv->alen) || + (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)) + )) { + st[i] = ch; + continue; + } - if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) { - rv = lookup((word+i)); - if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) { - free(st); - return rv; + // check non_compound flag in suffix and prefix + if ((rv) && !hu_mov_rule && + ((pfx && ((PfxEntry*)pfx)->getCont() && + TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag, + ((PfxEntry*)pfx)->getContLen())) || + (sfx && ((SfxEntry*)sfx)->getCont() && + TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag, + ((SfxEntry*)sfx)->getContLen())))) { + rv = NULL; + } + + // check compoundend flag in suffix and prefix + if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && + ((pfx && ((PfxEntry*)pfx)->getCont() && + TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend, + ((PfxEntry*)pfx)->getContLen())) || + (sfx && ((SfxEntry*)sfx)->getCont() && + TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend, + ((SfxEntry*)sfx)->getContLen())))) { + rv = NULL; + } + + // check compoundmiddle flag in suffix and prefix + if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule && + ((pfx && ((PfxEntry*)pfx)->getCont() && + TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle, + ((PfxEntry*)pfx)->getContLen())) || + (sfx && ((SfxEntry*)sfx)->getCont() && + TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle, + ((SfxEntry*)sfx)->getContLen())))) { + rv = NULL; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || + (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) { + return NULL; + } + + // increment word 
number, if the second root has a compoundroot flag + if ((rv) && compoundroot && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + + // first word is acceptable in compound words? + if (((rv) && + ( checked_prefix || (words && words[wnum]) || + (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || + ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) || + ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen))// || +// (numdefcpd && ) + +// LANG_hu section: spec. Hungarian rule + || ((langnum == LANG_hu) && hu_mov_rule && ( + TESTAFF(rv->astr, 'F', rv->alen) || // XXX hardwired Hungarian dictionary codes + TESTAFF(rv->astr, 'G', rv->alen) || + TESTAFF(rv->astr, 'H', rv->alen) + ) + ) +// END of LANG_hu section + ) + && ! (( checkcompoundtriple && // test triple letters + (word[i-1]==word[i]) && ( + ((i>1) && (word[i-1]==word[i-2])) || + ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' + ) + ) || + ( + // test CHECKCOMPOUNDPATTERN + numcheckcpd && cpdpat_check(word, i) + ) || + ( + checkcompoundcase && cpdcase_check(word, i) + )) + ) +// LANG_hu section: spec. Hungarian rule + || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) && + (sfx && ((SfxEntry*)sfx)->getCont() && ( // XXX hardwired Hungarian dic. codes + TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) || + TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen()) + ) + ) + ) +// END of LANG_hu section + ) { + +// LANG_hu section: spec. 
Hungarian rule + if (langnum == LANG_hu) { + // calculate syllable number of the word + numsyllable += get_syllable(st, i); + + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) + if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++; + } +// END of LANG_hu section + +#ifdef HUNSTEM + if (cmpdstem) cmpdstem[*cmpdstemnum - 1] = i; +#endif + + // NEXT WORD(S) + rv_first = rv; + rv = lookup((word+i)); // perhaps without prefix + + // search homonym with compound flag + while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || + (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { + rv = rv->next_homonym; + } + + if (rv && words && words[wnum + 1]) return rv; + + oldnumsyllable2 = numsyllable; + oldwordnum2 = wordnum; + +// LANG_hu section: spec. Hungarian rule, XXX hardwired dictionary code + if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) { + numsyllable--; } - rv = affix_check((word+i),strlen(word+i)); - if ((rv) && (TESTAFF(rv->astr, compound_flag, rv->alen))) { - free(st); - return rv; +// END of LANG_hu section + + // increment word number, if the second root has a compoundroot flag + if ((rv) && (compoundroot) && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || + (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL; + + // second word is acceptable, as a root? + // hungarian conventions: compounding is acceptable, + // when compound forms consist of 2 words, or if more, + // then the syllable number of root words must be 6, or lesser. 
+ + if ((rv) && ( + (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundend && TESTAFF(rv->astr, compoundend, rv->alen)) + ) + && ( + ((cpdwordmax==0) || (wordnum+1<cpdwordmax)) || + ((cpdmaxsyllable==0) || + (numsyllable + get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable)) + ) + && ( + (!checkcompounddup || (rv != rv_first)) + ) + ) + { + // forbid compound word, if it is a non compound word with typical fault + if (checkcompoundrep && cpdrep_check(word,len)) return NULL; + return rv; + } + + numsyllable = oldnumsyllable2 ; + wordnum = oldwordnum2; + + // perhaps second word has prefix or/and suffix + sfx = NULL; + sfxflag = FLAG_NULL; + rv = (compoundflag) ? affix_check((word+i),strlen(word+i), compoundflag, IN_CPD_END) : NULL; + if (!rv && compoundend) { + sfx = NULL; + pfx = NULL; + rv = affix_check((word+i),strlen(word+i), compoundend, IN_CPD_END); + } + + if (!rv && numdefcpd && words) { + rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); + if (rv && defcpd_check(&words, wnum + 1, rv, NULL, 1)) return rv; + } + + // check non_compound flag in suffix and prefix + if ((rv) && + ((pfx && ((PfxEntry*)pfx)->getCont() && + TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag, + ((PfxEntry*)pfx)->getContLen())) || + (sfx && ((SfxEntry*)sfx)->getCont() && + TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag, + ((SfxEntry*)sfx)->getContLen())))) { + rv = NULL; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) || + (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)))) return NULL; + + // pfxappnd = prefix of word+i, or NULL + // calculate syllable number of prefix. + // hungarian convention: when syllable number of prefix is more, + // than 1, the prefix+word counts as two words. + + if (langnum == LANG_hu) { + // calculate syllable number of the word + numsyllable += get_syllable(word + i, strlen(word + i)); + + // - affix syllable num. 
+ // XXX only second suffix (inflections, not derivations) + if (sfxappnd) { + char * tmp = myrevstrdup(sfxappnd); + numsyllable -= get_syllable(tmp, strlen(tmp)); + free(tmp); + } + + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) + if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++; + + // increment syllable num, if last word has a SYLLABLENUM flag + // and the suffix is beginning `s' + + if (cpdsyllablenum) { + switch (sfxflag) { + case 'c': { numsyllable+=2; break; } + case 'J': { numsyllable += 1; break; } + case 'I': { if (TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; } + } + } + } + + // increment word number, if the second word has a compoundroot flag + if ((rv) && (compoundroot) && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + + // second word is acceptable, as a word with prefix or/and suffix? + // hungarian conventions: compounding is acceptable, + // when compound forms consist 2 word, otherwise + // the syllable number of root words is 6, or lesser. 
+ if ((rv) && + ( + ((cpdwordmax ==0 ) || (wordnum + 1 < cpdwordmax)) || + ((cpdmaxsyllable == 0) || + (numsyllable <= cpdmaxsyllable)) + ) + && ( + (!checkcompounddup || (rv != rv_first)) + )) { + // forbid compound word, if it is a non compound word with typical fault + if (checkcompoundrep && cpdrep_check(word, len)) return NULL; + return rv; + } + + numsyllable = oldnumsyllable2; + wordnum = oldwordnum2; +#ifdef HUNSTEM + if (cmpdstemnum) oldcmpdstemnum = *cmpdstemnum; +#endif + // perhaps second word is a compound word (recursive call) + if (wordnum < maxwordnum) { + rv = compound_check((word+i),strlen(word+i), wordnum+1, + numsyllable, maxwordnum, wnum + 1, words, + 0, cmpdstemnum, cmpdstem, is_sug); + } else { + rv=NULL; } - rv = compound_check((word+i),strlen(word+i),compound_flag); if (rv) { - free(st); + // forbid compound word, if it is a non compound word with typical fault + if (checkcompoundrep && cpdrep_check(word, len)) return NULL; return rv; + } else { +#ifdef HUNSTEM + if (cmpdstemnum) *cmpdstemnum = oldcmpdstemnum; +#endif } - } st[i] = ch; + wordnum = oldwordnum; + numsyllable = oldnumsyllable; } - free(st); + return NULL; } +// check if compound word is correctly spelled +// hu_mov_rule = spec. 
Hungarian rule (XXX) +int AffixMgr::compound_check_morph(const char * word, int len, + short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule = 0, char ** result = NULL, char * partresult = NULL) +{ + int i, oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2; + int ok = 0; + + struct hentry * rv = NULL; + struct hentry * rv_first; + struct hentry * rwords[MAXWORDLEN]; // buffer for COMPOUND pattern checking + char st [MAXWORDUTF8LEN + 4]; + char ch; + + int checked_prefix; + char presult[MAXLNLEN]; + + int cmin; + int cmax; + + if (utf8) { + for (cmin = 0, i = 0; (i < cpdmin) && word[cmin]; i++) { + cmin++; + for (; (word[cmin] & 0xc0) == 0x80; cmin++); + } + for (cmax = len, i = 0; (i < (cpdmin - 1)) && cmax; i++) { + cmax--; + for (; (word[cmax] & 0xc0) == 0x80; cmax--); + } + } else { + cmin = cpdmin; + cmax = len - cpdmin + 1; + } + + strcpy(st, word); + + for (i = cmin; i < cmax; i++) { + oldnumsyllable = numsyllable; + oldwordnum = wordnum; + checked_prefix = 0; + + // go to end of the UTF-8 character + if (utf8) { + for (; (st[i] & 0xc0) == 0x80; i++); + if (i >= cmax) return 0; + } + + ch = st[i]; + st[i] = '\0'; + sfx = NULL; + + // FIRST WORD + *presult = '\0'; + if (partresult) strcat(presult, partresult); + + rv = lookup(st); // perhaps without prefix + + // search homonym with compound flag + while ((rv) && !hu_mov_rule && + ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundbegin && !wordnum && + TESTAFF(rv->astr, compoundbegin, rv->alen)) || + (compoundmiddle && wordnum && !words && + TESTAFF(rv->astr, compoundmiddle, rv->alen)) || + (numdefcpd && + ((!words && !wordnum && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)) || + (words && defcpd_check(&words, wnum, rv, (hentry **) &rwords, 0)))) + ))) { + rv = rv->next_homonym; + } + + if (rv) { + if (rv->description) { + if ((!rv->astr) || 
!TESTAFF(rv->astr, lemma_present, rv->alen)) + strcat(presult, st); + strcat(presult, rv->description); + } + } + + if (!rv) { + if (compoundflag && + !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { + if ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, + FLAG_NULL, compoundflag, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) && !hu_mov_rule && + ((SfxEntry*)sfx)->getCont() && + ((compoundforbidflag && TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag, + ((SfxEntry*)sfx)->getContLen())) || (compoundend && + TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend, + ((SfxEntry*)sfx)->getContLen())))) { + rv = NULL; + } + } + + if (rv || + (((wordnum == 0) && compoundbegin && + ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundbegin, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || + (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundbegin)))) || + ((wordnum > 0) && compoundmiddle && + ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || + (rv = prefix_check(st, i, hu_mov_rule ? 
IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) + ) { + //char * p = prefix_check_morph(st, i, 0, compound); + char * p = NULL; + if (compoundflag) p = affix_check_morph(st, i, compoundflag); + if (!p || (*p == '\0')) { + if ((wordnum == 0) && compoundbegin) { + p = affix_check_morph(st, i, compoundbegin); + } else if ((wordnum > 0) && compoundmiddle) { + p = affix_check_morph(st, i, compoundmiddle); + } + } + if (*p != '\0') { + line_uniq(p); + if (strchr(p, '\n')) { + strcat(presult, "("); + strcat(presult, line_join(p, '|')); + strcat(presult, ")"); + } else { + strcat(presult, p); + } + } + if (presult[strlen(presult) - 1] == '\n') { + presult[strlen(presult) - 1] = '\0'; + } + checked_prefix = 1; + //strcat(presult, "+"); + } + // else check forbiddenwords + } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || + TESTAFF(rv->astr, pseudoroot, rv->alen))) { + st[i] = ch; + continue; + } + + // check non_compound flag in suffix and prefix + if ((rv) && !hu_mov_rule && + ((pfx && ((PfxEntry*)pfx)->getCont() && + TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag, + ((PfxEntry*)pfx)->getContLen())) || + (sfx && ((SfxEntry*)sfx)->getCont() && + TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag, + ((SfxEntry*)sfx)->getContLen())))) { + continue; + } + + // check compoundend flag in suffix and prefix + if ((rv) && !checked_prefix && compoundend && !hu_mov_rule && + ((pfx && ((PfxEntry*)pfx)->getCont() && + TESTAFF(((PfxEntry*)pfx)->getCont(), compoundend, + ((PfxEntry*)pfx)->getContLen())) || + (sfx && ((SfxEntry*)sfx)->getCont() && + TESTAFF(((SfxEntry*)sfx)->getCont(), compoundend, + ((SfxEntry*)sfx)->getContLen())))) { + continue; + } + + // check compoundmiddle flag in suffix and prefix + if ((rv) && !checked_prefix && (wordnum==0) && compoundmiddle && !hu_mov_rule && + ((pfx && ((PfxEntry*)pfx)->getCont() && + TESTAFF(((PfxEntry*)pfx)->getCont(), compoundmiddle, + ((PfxEntry*)pfx)->getContLen())) || + (sfx && 
((SfxEntry*)sfx)->getCont() && + TESTAFF(((SfxEntry*)sfx)->getCont(), compoundmiddle, + ((SfxEntry*)sfx)->getContLen())))) { + rv = NULL; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) continue; + + // increment word number, if the second root has a compoundroot flag + if ((rv) && (compoundroot) && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + + // first word is acceptable in compound words? + if (((rv) && + ( checked_prefix || (words && words[wnum]) || + (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || + ((oldwordnum == 0) && compoundbegin && TESTAFF(rv->astr, compoundbegin, rv->alen)) || + ((oldwordnum > 0) && compoundmiddle && TESTAFF(rv->astr, compoundmiddle, rv->alen)) +// LANG_hu section: spec. Hungarian rule + || ((langnum == LANG_hu) && // hu_mov_rule + hu_mov_rule && ( + TESTAFF(rv->astr, 'F', rv->alen) || + TESTAFF(rv->astr, 'G', rv->alen) || + TESTAFF(rv->astr, 'H', rv->alen) + ) + ) +// END of LANG_hu section + ) + && ! (( checkcompoundtriple && // test triple letters + (word[i-1]==word[i]) && ( + ((i>1) && (word[i-1]==word[i-2])) || + ((word[i-1]==word[i+1])) // may be word[i+1] == '\0' + ) + ) || + ( + // test CHECKCOMPOUNDPATTERN + numcheckcpd && cpdpat_check(word, i) + ) || + ( + checkcompoundcase && cpdcase_check(word, i) + )) + ) +// LANG_hu section: spec. Hungarian rule + || ((!rv) && (langnum == LANG_hu) && hu_mov_rule && (rv = affix_check(st,i)) && + (sfx && ((SfxEntry*)sfx)->getCont() && ( + TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) 'x', ((SfxEntry*)sfx)->getContLen()) || + TESTAFF(((SfxEntry*)sfx)->getCont(), (unsigned short) '%', ((SfxEntry*)sfx)->getContLen()) + ) + ) + ) +// END of LANG_hu section + ) { + +// LANG_hu section: spec. 
Hungarian rule + if (langnum == LANG_hu) { + // calculate syllable number of the word + numsyllable += get_syllable(st, i); + + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) + if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++; + } +// END of LANG_hu section + + // NEXT WORD(S) + rv_first = rv; + rv = lookup((word+i)); // perhaps without prefix + + // search homonym with compound flag + while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || + (numdefcpd && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { + rv = rv->next_homonym; + } + + if (rv && words && words[wnum + 1]) { + strcat(*result, presult); + if (complexprefixes && rv->description) strcat(*result, rv->description); + if (rv->description && ((!rv->astr) || + !TESTAFF(rv->astr, lemma_present, rv->alen))) + strcat(*result, rv->word); + if (!complexprefixes && rv->description) strcat(*result, rv->description); + strcat(*result, "\n"); + ok = 1; + return 0; + } + + oldnumsyllable2 = numsyllable; + oldwordnum2 = wordnum; + +// LANG_hu section: spec. Hungarian rule + if ((rv) && (langnum == LANG_hu) && (TESTAFF(rv->astr, 'I', rv->alen)) && !(TESTAFF(rv->astr, 'J', rv->alen))) { + numsyllable--; + } +// END of LANG_hu section + // increment word number, if the second root has a compoundroot flag + if ((rv) && (compoundroot) && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && TESTAFF(rv->astr, forbiddenword, rv->alen)) { + st[i] = ch; + continue; + } + + // second word is acceptable, as a root? + // hungarian conventions: compounding is acceptable, + // when compound forms consist of 2 words, or if more, + // then the syllable number of root words must be 6, or lesser. 
+ if ((rv) && ( + (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) || + (compoundend && TESTAFF(rv->astr, compoundend, rv->alen)) + ) + && ( + ((cpdwordmax==0) || (wordnum+1<cpdwordmax)) || + ((cpdmaxsyllable==0) || + (numsyllable+get_syllable(rv->word,rv->wlen)<=cpdmaxsyllable)) + ) + && ( + (!checkcompounddup || (rv != rv_first)) + ) + ) + { + // bad compound word + strcat(*result, presult); + + if (rv->description) { + if (complexprefixes) strcat(*result, rv->description); + if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen)) + strcat(*result, rv->word); + if (!complexprefixes) strcat(*result, rv->description); + } + strcat(*result, "\n"); + ok = 1; + } + + numsyllable = oldnumsyllable2 ; + wordnum = oldwordnum2; + + // perhaps second word has prefix or/and suffix + sfx = NULL; + sfxflag = FLAG_NULL; + + if (compoundflag) rv = affix_check((word+i),strlen(word+i), compoundflag); else rv = NULL; + + if (!rv && compoundend) { + sfx = NULL; + pfx = NULL; + rv = affix_check((word+i),strlen(word+i), compoundend); + } + + if (!rv && numdefcpd && words) { + rv = affix_check((word+i),strlen(word+i), 0, IN_CPD_END); + if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { + char * m = NULL; + if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); + if ((!m || *m == '\0') && compoundend) + m = affix_check_morph((word+i),strlen(word+i), compoundend); + strcat(*result, presult); + line_uniq(m); + if (strchr(m, '\n')) { + strcat(*result, "("); + strcat(*result, line_join(m, '|')); + strcat(*result, ")"); + } else { + strcat(*result, m); + } + free(m); + strcat(*result, "\n"); + ok = 1; + } + } + + // check non_compound flag in suffix and prefix + if ((rv) && + ((pfx && ((PfxEntry*)pfx)->getCont() && + TESTAFF(((PfxEntry*)pfx)->getCont(), compoundforbidflag, + ((PfxEntry*)pfx)->getContLen())) || + (sfx && ((SfxEntry*)sfx)->getCont() && + TESTAFF(((SfxEntry*)sfx)->getCont(), compoundforbidflag, + 
((SfxEntry*)sfx)->getContLen())))) { + rv = NULL; + } + + // check forbiddenwords + if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen)) + && (! TESTAFF(rv->astr, pseudoroot, rv->alen))) { + st[i] = ch; + continue; + } + + if (langnum == LANG_hu) { + // calculate syllable number of the word + numsyllable += get_syllable(word + i, strlen(word + i)); + + // - affix syllable num. + // XXX only second suffix (inflections, not derivations) + if (sfxappnd) { + char * tmp = myrevstrdup(sfxappnd); + numsyllable -= get_syllable(tmp, strlen(tmp)); + free(tmp); + } + + // + 1 word, if syllable number of the prefix > 1 (hungarian convention) + if (pfx && (get_syllable(((PfxEntry *)pfx)->getKey(),strlen(((PfxEntry *)pfx)->getKey())) > 1)) wordnum++; + + // increment syllable num, if last word has a SYLLABLENUM flag + // and the suffix is beginning `s' + + if (cpdsyllablenum) { + switch (sfxflag) { + case 'c': { numsyllable+=2; break; } + case 'J': { numsyllable += 1; break; } + case 'I': { if (rv && TESTAFF(rv->astr, 'J', rv->alen)) numsyllable += 1; break; } + } + } + } + + // increment word number, if the second word has a compoundroot flag + if ((rv) && (compoundroot) && + (TESTAFF(rv->astr, compoundroot, rv->alen))) { + wordnum++; + } + // second word is acceptable, as a word with prefix or/and suffix? + // hungarian conventions: compounding is acceptable, + // when compound forms consist 2 word, otherwise + // the syllable number of root words is 6, or lesser. 
+ if ((rv) && + ( + ((cpdwordmax==0) || (wordnum+1<cpdwordmax)) || + ((cpdmaxsyllable==0) || + (numsyllable <= cpdmaxsyllable)) + ) + && ( + (!checkcompounddup || (rv != rv_first)) + )) { + char * m = NULL; + if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); + if ((!m || *m == '\0') && compoundend) + m = affix_check_morph((word+i),strlen(word+i), compoundend); + strcat(*result, presult); + line_uniq(m); + if (strchr(m, '\n')) { + strcat(*result, "("); + strcat(*result, line_join(m, '|')); + strcat(*result, ")"); + } else { + strcat(*result, m); + } + free(m); + strcat(*result, "\n"); + ok = 1; + } + + numsyllable = oldnumsyllable2; + wordnum = oldwordnum2; + + // perhaps second word is a compound word (recursive call) + if ((wordnum < maxwordnum) && (ok == 0)) { + compound_check_morph((word+i),strlen(word+i), wordnum+1, + numsyllable, maxwordnum, wnum + 1, words, 0, result, presult); + } else { + rv=NULL; + } + } + st[i] = ch; + wordnum = oldwordnum; + numsyllable = oldnumsyllable; + } + return 0; +} + + + // check word for suffixes + struct hentry * AffixMgr::suffix_check (const char * word, int len, - int sfxopts, AffEntry * ppfx) + int sfxopts, AffEntry * ppfx, char ** wlst, int maxSug, int * ns, + const FLAG cclass, const FLAG needflag, char in_compound) { struct hentry * rv = NULL; + char result[MAXLNLEN]; + + PfxEntry* ep = (PfxEntry *) ppfx; // first handle the special case of 0 length suffixes SfxEntry * se = (SfxEntry *) sStart[0]; + while (se) { - rv = se->check(word,len, sfxopts, ppfx); - if (rv) return rv; + if (!cclass || se->getCont()) { + // suffixes are not allowed in beginning of compounds + if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass + // except when signed with compoundpermitflag flag + (se->getCont() && compoundpermitflag && + TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix || + // no circumfix flag in prefix and suffix + ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), + 
circumfix, ep->getContLen())) && + (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) || + // circumfix flag in prefix AND suffix + ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), + circumfix, ep->getContLen())) && + (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) && + // fogemorpheme + (in_compound || + !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && + // pseudoroot on prefix or first suffix + (cclass || + !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) || + (ppfx && !((ep->getCont()) && + TESTAFF(ep->getCont(), pseudoroot, + ep->getContLen()))) + ) + ) && + (rv = se->check(word,len, sfxopts, ppfx, wlst, maxSug, ns, (FLAG) cclass, needflag))) { + sfx=(AffEntry *)se; // BUG: sfx not stateless + return rv; + } + } se = se->getNext(); } // now handle the general case unsigned char sp = *((const unsigned char *)(word + len - 1)); + SfxEntry * sptr = (SfxEntry *) sStart[sp]; + + while (sptr) { + if (isRevSubset(sptr->getKey(), word + len - 1, len) + ) { + // suffixes are not allowed in beginning of compounds + if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass + // except when signed with compoundpermitflag flag + (sptr->getCont() && compoundpermitflag && + TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix || + // no circumfix flag in prefix and suffix + ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), + circumfix, ep->getContLen())) && + (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) || + // circumfix flag in prefix AND suffix + ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), + circumfix, ep->getContLen())) && + (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) && + // fogemorpheme + (in_compound || + !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && + // pseudoroot on prefix or first suffix + (cclass || + 
!(sptr->getCont() && TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) || + (ppfx && !((ep->getCont()) && + TESTAFF(ep->getCont(), pseudoroot, + ep->getContLen()))) + ) + ) && + (rv = sptr->check(word,len, sfxopts, ppfx, wlst, maxSug, ns, cclass, needflag))) { + sfx=(AffEntry *)sptr; // BUG: sfx not stateless + sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless + if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless + if (cclass || sptr->getCont()) { + if (!derived) { + derived = mystrdup(word); + } else { + strcpy(result, derived); // XXX check size + strcat(result, "\n"); + strcat(result, word); + free(derived); + derived = mystrdup(result); + } + } + return rv; + } + sptr = sptr->getNextEQ(); + } else { + sptr = sptr->getNextNE(); + } + } + return NULL; +} +// check word for two-level suffixes + +struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len, + int sfxopts, AffEntry * ppfx, const FLAG needflag) +{ + struct hentry * rv = NULL; + + // first handle the special case of 0 length suffixes + SfxEntry * se = (SfxEntry *) sStart[0]; + while (se) { + if (contclasses[se->getFlag()]) + { + rv = se->check_twosfx(word,len, sfxopts, ppfx, needflag); + if (rv) return rv; + } + se = se->getNext(); + } + + // now handle the general case + unsigned char sp = *((const unsigned char *)(word + len - 1)); SfxEntry * sptr = (SfxEntry *) sStart[sp]; while (sptr) { - if (isRevSubset(sptr->getKey(),(word+len-1), len)) { - rv = sptr->check(word,len, sfxopts, ppfx); - if (rv) { - return rv; - } - sptr = sptr->getNextEQ(); + if (isRevSubset(sptr->getKey(), word + len - 1, len)) { + if (contclasses[sptr->getFlag()]) + { + rv = sptr->check_twosfx(word,len, sfxopts, ppfx, needflag); + if (rv) { + sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless + if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless + return rv; + } + } + sptr = sptr->getNextEQ(); } else { sptr = sptr->getNextNE(); } } + return 
NULL; } +char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, + int sfxopts, AffEntry * ppfx, const FLAG needflag) +{ + char result[MAXLNLEN]; + char result2[MAXLNLEN]; + char result3[MAXLNLEN]; + + char * st; + + result[0] = '\0'; + result2[0] = '\0'; + result3[0] = '\0'; + + // first handle the special case of 0 length suffixes + SfxEntry * se = (SfxEntry *) sStart[0]; + while (se) { + if (contclasses[se->getFlag()]) + { + st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); + if (st) { + if (ppfx) { + if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + } + strcat(result, st); + free(st); + if (se->getMorph()) strcat(result, se->getMorph()); + strcat(result, "\n"); + } + } + se = se->getNext(); + } + + // now handle the general case + unsigned char sp = *((const unsigned char *)(word + len - 1)); + SfxEntry * sptr = (SfxEntry *) sStart[sp]; + + while (sptr) { + if (isRevSubset(sptr->getKey(), word + len - 1, len)) { + if (contclasses[sptr->getFlag()]) + { + st = sptr->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); + if (st) { + sfxflag = sptr->getFlag(); // BUG: sfxflag not stateless + if (!sptr->getCont()) sfxappnd=sptr->getKey(); // BUG: sfxappnd not stateless + strcpy(result2, st); + free(st); + + result3[0] = '\0'; +#ifdef DEBUG + unsigned short flag = sptr->getFlag(); + char flagch[2] = &flag; + if (flag_mode == FLAG_NUM) { + sprintf(result3, "%d", sptr->getKey()); + } else if (flag_mode == FLAG_LONG) { + sprintf(result3, "%c%c", flagch[0], flagch[1]); + } else sprintf(result3, "%c", flagch[1]); + strcat(result3, ":"); +#endif + if (sptr->getMorph()) strcat(result3, sptr->getMorph()); + strlinecat(result2, result3); + strcat(result2, "\n"); + strcat(result, result2); + } + } + sptr = sptr->getNextEQ(); + } else { + sptr = sptr->getNextNE(); + } + } + if (result) return mystrdup(result); + return NULL; +} + +char * AffixMgr::suffix_check_morph(const char * word, int len, + int sfxopts, 
AffEntry * ppfx, const FLAG cclass, const FLAG needflag, char in_compound) +{ + char result[MAXLNLEN]; + + struct hentry * rv = NULL; + + result[0] = '\0'; + + PfxEntry* ep = (PfxEntry *) ppfx; + + // first handle the special case of 0 length suffixes + SfxEntry * se = (SfxEntry *) sStart[0]; + while (se) { + if (!cclass || se->getCont()) { + // suffixes are not allowed in beginning of compounds + if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass + // except when signed with compoundpermitflag flag + (se->getCont() && compoundpermitflag && + TESTAFF(se->getCont(),compoundpermitflag,se->getContLen()))) && (!circumfix || + // no circumfix flag in prefix and suffix + ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), + circumfix, ep->getContLen())) && + (!se->getCont() || !(TESTAFF(se->getCont(),circumfix,se->getContLen())))) || + // circumfix flag in prefix AND suffix + ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), + circumfix, ep->getContLen())) && + (se->getCont() && (TESTAFF(se->getCont(),circumfix,se->getContLen()))))) && + // fogemorpheme + (in_compound || + !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && + // pseudoroot on prefix or first suffix + (cclass || + !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) || + (ppfx && !((ep->getCont()) && + TESTAFF(ep->getCont(), pseudoroot, + ep->getContLen()))) + ) + )) + rv = se->check(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); + while (rv) { + if (ppfx) { + if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + } + if (complexprefixes && rv->description) strcat(result, rv->description); + if (rv->description && ((!rv->astr) || + !TESTAFF(rv->astr, lemma_present, rv->alen))) + strcat(result, rv->word); + if (!complexprefixes && rv->description) strcat(result, rv->description); + if (se->getMorph()) strcat(result, se->getMorph()); + strcat(result, "\n"); + rv = se->get_next_homonym(rv, sfxopts, ppfx, 
cclass, needflag); + } + } + se = se->getNext(); + } + + // now handle the general case + unsigned char sp = *((const unsigned char *)(word + len - 1)); + SfxEntry * sptr = (SfxEntry *) sStart[sp]; + + while (sptr) { + if (isRevSubset(sptr->getKey(), word + len - 1, len) + ) { + // suffixes are not allowed in beginning of compounds + if (((((in_compound != IN_CPD_BEGIN)) || // && !cclass + // except when signed with compoundpermitflag flag + (sptr->getCont() && compoundpermitflag && + TESTAFF(sptr->getCont(),compoundpermitflag,sptr->getContLen()))) && (!circumfix || + // no circumfix flag in prefix and suffix + ((!ppfx || !(ep->getCont()) || !TESTAFF(ep->getCont(), + circumfix, ep->getContLen())) && + (!sptr->getCont() || !(TESTAFF(sptr->getCont(),circumfix,sptr->getContLen())))) || + // circumfix flag in prefix AND suffix + ((ppfx && (ep->getCont()) && TESTAFF(ep->getCont(), + circumfix, ep->getContLen())) && + (sptr->getCont() && (TESTAFF(sptr->getCont(),circumfix,sptr->getContLen()))))) && + // fogemorpheme + (in_compound || + !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && + // pseudoroot on first suffix + (cclass || !(sptr->getCont() && + TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen()))) + )) rv = sptr->check(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); + while (rv) { + if (ppfx) { + if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + } + if (complexprefixes && rv->description) strcat(result, rv->description); + if (rv->description && ((!rv->astr) || + !TESTAFF(rv->astr, lemma_present, rv->alen))) strcat(result, rv->word); + if (!complexprefixes && rv->description) strcat(result, rv->description); +#ifdef DEBUG + unsigned short flag = sptr->getKey(); + char flagch[2] = &flag; + if (flag_mode == FLAG_NUM) { + sprintf(result2, "%d", sptr->getKey()); + } else if (flag_mode == FLAG_LONG) { + sprintf(result2, "%c%c", flagch[0], flagch[1]); + } else sprintf(result2, 
"%c", flagch[1]); + strcat(result2, ":"); + strcat(result, result2); +#endif + + if (sptr->getMorph()) strcat(result, sptr->getMorph()); + strcat(result, "\n"); + rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); + } + sptr = sptr->getNextEQ(); + } else { + sptr = sptr->getNextNE(); + } + } + + if (*result) return mystrdup(result); + return NULL; +} + // check if word with affixes is correctly spelled -struct hentry * AffixMgr::affix_check (const char * word, int len) +struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound) { struct hentry * rv= NULL; + if (derived) free(derived); + derived = NULL; // check all prefixes (also crossed with suffixes if allowed) - rv = prefix_check(word, len); + rv = prefix_check(word, len, in_compound, needflag); if (rv) return rv; // if still not found check all suffixes - rv = suffix_check(word, len, 0, NULL); + rv = suffix_check(word, len, 0, NULL, NULL, 0, NULL, FLAG_NULL, needflag, in_compound); + + if (havecontclass) { + sfx = NULL; + pfx = NULL; + if (rv) return rv; + // if still not found check all two-level suffixes + rv = suffix_check_twosfx(word, len, 0, NULL, needflag); + if (rv) return rv; + // if still not found check all two-level suffixes + rv = prefix_check_twosfx(word, len, IN_CPD_NOT, needflag); + } return rv; } -int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, - const char * ts, int wl, const char * ap, int al) +// check if word with affixes is correctly spelled +char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound) +{ + char result[MAXLNLEN]; + char * st = NULL; + + *result = '\0'; + + // check all prefixes (also crossed with suffixes if allowed) + st = prefix_check_morph(word, len, in_compound); + if (st) { + strcat(result, st); + free(st); + } + + // if still not found check all suffixes + st = suffix_check_morph(word, len, 0, NULL, '\0', needflag, in_compound); + if (st) { + 
strcat(result, st); + free(st); + } + + if (havecontclass) { + sfx = NULL; + pfx = NULL; + // if still not found check all two-level suffixes + st = suffix_check_twosfx_morph(word, len, 0, NULL, needflag); + if (st) { + strcat(result, st); + free(st); + } + + // if still not found check all two-level suffixes + st = prefix_check_twosfx_morph(word, len, IN_CPD_NOT, needflag); + if (st) { + strcat(result, st); + free(st); + } + } + + return mystrdup(result); +} + + +int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts, + int wl, const unsigned short * ap, unsigned short al, char * bad, int badl) { int nh=0; // first add root word to list - - if (nh < maxn) { + if ((nh < maxn) && !(al && ((pseudoroot && TESTAFF(ap, pseudoroot, al)) || + (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { wlst[nh].word = mystrdup(ts); wlst[nh].allow = (1 == 0); nh++; @@ -756,19 +2649,28 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, // handle suffixes for (int i = 0; i < al; i++) { - unsigned char c = (unsigned char) ap[i]; + unsigned short c = (unsigned short) ap[i]; SfxEntry * sptr = (SfxEntry *)sFlag[c]; while (sptr) { - char * newword = sptr->add(ts, wl); - if (newword) { - if (nh < maxn) { - wlst[nh].word = newword; - wlst[nh].allow = sptr->allowCross(); - nh++; - } else { - free(newword); - } - } + if (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) && + (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0)) && + // check pseudoroot flag + !(sptr->getCont() && ((pseudoroot && + TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) || + (onlyincompound && + TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen())))) + ) { + char * newword = sptr->add(ts, wl); + if (newword) { + if (nh < maxn) { + wlst[nh].word = newword; + wlst[nh].allow = sptr->allowCross(); + nh++; + } else { + free(newword); + } + } + } sptr = (SfxEntry *)sptr ->getFlgNxt(); } } @@ -779,10 +2681,11 @@ int AffixMgr::expand_rootword(struct 
guessword * wlst, int maxn, for (int j=1;j<n ;j++) if (wlst[j].allow) { for (int k = 0; k < al; k++) { - unsigned char c = (unsigned char) ap[k]; + unsigned short c = (unsigned short) ap[k]; PfxEntry * cptr = (PfxEntry *) pFlag[c]; while (cptr) { - if (cptr->allowCross()) { + if (cptr->allowCross() && (!cptr->getKeyLen() || ((badl > cptr->getKeyLen()) && + (strncmp(cptr->getKey(), bad, cptr->getKeyLen()) == 0)))) { int l1 = strlen(wlst[j].word); char * newword = cptr->add(wlst[j].word, l1); if (newword) { @@ -803,19 +2706,28 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, // now handle pure prefixes for (int m = 0; m < al; m ++) { - unsigned char c = (unsigned char) ap[m]; + unsigned short c = (unsigned short) ap[m]; PfxEntry * ptr = (PfxEntry *) pFlag[c]; while (ptr) { - char * newword = ptr->add(ts, wl); - if (newword) { - if (nh < maxn) { - wlst[nh].word = newword; - wlst[nh].allow = ptr->allowCross(); - nh++; - } else { - free(newword); - } - } + if (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) && + (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0)) && + // check pseudoroot flag + !(ptr->getCont() && ((pseudoroot && + TESTAFF(ptr->getCont(), pseudoroot, ptr->getContLen())) || + (onlyincompound && + TESTAFF(ptr->getCont(), onlyincompound, ptr->getContLen())))) + ) { + char * newword = ptr->add(ts, wl); + if (newword) { + if (nh < maxn) { + wlst[nh].word = newword; + wlst[nh].allow = ptr->allowCross(); + nh++; + } else { + free(newword); + } + } + } ptr = (PfxEntry *)ptr ->getFlgNxt(); } } @@ -824,6 +2736,7 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, } + // return length of replacing table int AffixMgr::get_numrep() { @@ -837,7 +2750,6 @@ struct replentry * AffixMgr::get_reptable() return reptable; } - // return length of character map table int AffixMgr::get_nummap() { @@ -851,6 +2763,19 @@ struct mapentry * AffixMgr::get_maptable() return maptable; } +// return length of word break table +int 
AffixMgr::get_numbreak() +{ + return numbreak; +} + +// return character map table +char ** AffixMgr::get_breaktable() +{ + if (! breaktable ) return NULL; + return breaktable; +} + // return text encoding of dictionary char * AffixMgr::get_encoding() { @@ -860,6 +2785,33 @@ char * AffixMgr::get_encoding() return mystrdup(encoding); } +// return text encoding of dictionary +int AffixMgr::get_langnum() +{ + return langnum; +} + +// return UTF info table +struct unicode_info2 * AffixMgr::get_utf_conv() +{ + return utf_tbl; +} + +// return double prefix option +int AffixMgr::get_complexprefixes() +{ + return complexprefixes; +} + +FLAG AffixMgr::get_keepcase() +{ + return keepcase; +} + +int AffixMgr::get_checksharps() +{ + return checksharps; +} // return the preferred try string for suggestions char * AffixMgr::get_try_string() @@ -868,11 +2820,101 @@ char * AffixMgr::get_try_string() return mystrdup(trystring); } +// return the preferred try string for suggestions +const char * AffixMgr::get_wordchars() +{ + return wordchars; +} + +unsigned short * AffixMgr::get_wordchars_utf16(int * len) +{ + *len = wordchars_utf16_len; + return wordchars_utf16; +} + +// is there compounding? +int AffixMgr::get_compound() +{ + return compoundflag || compoundbegin || numdefcpd; +} + // return the compound words control flag -char * AffixMgr::get_compound() +FLAG AffixMgr::get_compoundflag() +{ + return compoundflag; +} + +// return the forbidden words control flag +FLAG AffixMgr::get_forbiddenword() +{ + return forbiddenword; +} + +// return the forbidden words control flag +FLAG AffixMgr::get_nosuggest() +{ + return nosuggest; +} + +// return the forbidden words flag modify flag +FLAG AffixMgr::get_pseudoroot() +{ + return pseudoroot; +} + +// return the onlyincompound flag +FLAG AffixMgr::get_onlyincompound() +{ + return onlyincompound; +} + +// return the compound word signal flag +FLAG AffixMgr::get_compoundroot() { - if (! 
compound ) return NULL; - return compound; + return compoundroot; +} + +// return the compound begin signal flag +FLAG AffixMgr::get_compoundbegin() +{ + return compoundbegin; +} + +// return the value of checknum +int AffixMgr::get_checknum() +{ + return checknum; +} + +// return the value of prefix +const char * AffixMgr::get_prefix() +{ + if (pfx) return ((PfxEntry *)pfx)->getKey(); + return NULL; +} + +// return the value of suffix +const char * AffixMgr::get_suffix() +{ + return sfxappnd; +} + +// return the value of derived form (base word with first suffix). +const char * AffixMgr::get_derived() +{ + return derived; +} + +// return the value of suffix +const char * AffixMgr::get_version() +{ + return version; +} + +// return lemma_present flag +FLAG AffixMgr::get_lemma_present() +{ + return lemma_present; } // utility method to look up root words in hash table @@ -882,12 +2924,36 @@ struct hentry * AffixMgr::lookup(const char * word) return pHMgr->lookup(word); } +// return the value of suffix +const int AffixMgr::have_contclass() +{ + return havecontclass; +} + +// return utf8 +int AffixMgr::get_utf8() +{ + return utf8; +} + // return nosplitsugs -bool AffixMgr::get_nosplitsugs(void) +int AffixMgr::get_maxngramsugs(void) +{ + return maxngramsugs; +} + +// return nosplitsugs +int AffixMgr::get_nosplitsugs(void) { return nosplitsugs; } +// return sugswithdots +int AffixMgr::get_sugswithdots(void) +{ + return sugswithdots; +} + /* parse in the try string */ int AffixMgr::parse_try(char * line) { @@ -899,7 +2965,7 @@ int AffixMgr::parse_try(char * line) char * piece; int i = 0; int np = 0; - while ((piece=mystrsep(&tp,' '))) { + while ((piece=mystrsep(&tp, 0))) { if (*piece != '\0') { switch(i) { case 0: { np++; break; } @@ -929,11 +2995,32 @@ int AffixMgr::parse_set(char * line) char * piece; int i = 0; int np = 0; - while ((piece=mystrsep(&tp,' '))) { + while ((piece=mystrsep(&tp, 0))) { if (*piece != '\0') { switch(i) { case 0: { np++; break; } - case 1: { 
encoding = mystrdup(piece); np++; break; } + case 1: { encoding = mystrdup(piece); + if (strcmp(encoding, "UTF-8") == 0) { + unicode_info * uni = get_utf_cs(); + utf8 = 1; + utf_tbl = (unicode_info2 *) malloc(CONTSIZE * sizeof(unicode_info2)); + if (utf_tbl) { + int j; + for (j = 0; j < CONTSIZE; j++) { + utf_tbl[j].cletter = 0; + utf_tbl[j].clower = j; + utf_tbl[j].cupper = j; + } + for (j = 0; j < get_utf_cs_len(); j++) { + utf_tbl[uni[j].c].cletter = 1; + utf_tbl[uni[j].c].clower = uni[j].clower; + utf_tbl[uni[j].c].cupper = uni[j].cupper; + } + // set Azeri, Turkish spec. lowercasing + set_spec_utf8_encoding(); + } else return 1; + } + np++; break; } default: break; } i++; @@ -947,49 +3034,169 @@ int AffixMgr::parse_set(char * line) return 0; } +/* parse flag */ +int AffixMgr::parse_flag(char * line, unsigned short * out, char * name) +{ + if (*out) { + fprintf(stderr,"error: duplicate %s strings\n", name); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + *out = pHMgr->decode_flag(piece); + np++; + break; + } + default: break; + } + i++; + } + free(piece); + } + if (np != 2) { + fprintf(stderr,"error: missing %s information\n", name); + return 1; + } + return 0; +} + +/* parse flag */ +int AffixMgr::parse_num(char * line, int * out, char * name) +{ + char * tp = line; + char * piece; + int i = 0; + int np = 0; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + *out = atoi(piece); + np++; + break; + } + default: break; + } + i++; + } + free(piece); + } + if (np != 2) { + fprintf(stderr,"error: missing %s information\n", name); + return 1; + } + return 0; +} -/* parse in the flag used by the controlled compound words */ -int AffixMgr::parse_cpdflag(char * line) +/* parse in the wordchars string */ +int AffixMgr::parse_wordchars(char * line) { - if 
(compound) { - fprintf(stderr,"error: duplicate compound flags used\n"); + if (wordchars) { + fprintf(stderr,"error: duplicate WORDCHARS strings\n"); return 1; } char * tp = line; char * piece; int i = 0; int np = 0; - while ((piece=mystrsep(&tp,' '))) { + w_char w[MAXWORDLEN]; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + if (!utf8) { + wordchars = mystrdup(piece); + } else { + int n = u8_u16(w, MAXWORDLEN, piece); + if (n > 0) { + flag_qsort((unsigned short *) w, 0, n); + wordchars_utf16 = (unsigned short *) malloc(n * sizeof(unsigned short)); + if (!wordchars_utf16) return 1; + memcpy(wordchars_utf16, w, n * sizeof(unsigned short)); + } + wordchars_utf16_len = n; + } + np++; + break; + } + default: break; + } + i++; + } + free(piece); + } + if (np != 2) { + fprintf(stderr,"error: missing WORDCHARS information\n"); + return 1; + } + return 0; +} + + +/* parse in the max syllablecount of compound words and */ +int AffixMgr::parse_cpdsyllable(char * line) +{ + char * tp = line; + char * piece; + int i = 0; + int np = 0; + w_char w[MAXWORDLEN]; + while ((piece=mystrsep(&tp, 0))) { if (*piece != '\0') { switch(i) { case 0: { np++; break; } - case 1: { compound = mystrdup(piece); np++; break; } + case 1: { cpdmaxsyllable = atoi(piece); np++; break; } + case 2: { + if (!utf8) { + cpdvowels = mystrdup(piece); + } else { + int n = u8_u16(w, MAXWORDLEN, piece); + if (n > 0) { + flag_qsort((unsigned short *) w, 0, n); + cpdvowels_utf16 = (w_char *) malloc(n * sizeof(w_char)); + if (!cpdvowels_utf16) return 1; + memcpy(cpdvowels_utf16, w, n * sizeof(w_char)); + } + cpdvowels_utf16_len = n; + } + np++; + break; + } default: break; } i++; } free(piece); } - if (np != 2) { - fprintf(stderr,"error: missing compound flag information\n"); + if (np < 2) { + fprintf(stderr,"error: missing compoundsyllable information\n"); return 1; } + if (np == 2) cpdvowels = mystrdup("aeiouAEIOU"); return 0; } - -/* parse in 
the min compound word length */ -int AffixMgr::parse_cpdmin(char * line) +/* parse in the flags, that increments syllable number */ +int AffixMgr::parse_syllablenum(char * line) { char * tp = line; char * piece; int i = 0; int np = 0; - while ((piece=mystrsep(&tp,' '))) { + while ((piece=mystrsep(&tp, 0))) { if (*piece != '\0') { switch(i) { case 0: { np++; break; } - case 1: { cpdmin = atoi(piece); np++; break; } + case 1: { cpdsyllablenum = mystrdup(piece); np++; break; } default: break; } i++; @@ -997,14 +3204,12 @@ int AffixMgr::parse_cpdmin(char * line) free(piece); } if (np != 2) { - fprintf(stderr,"error: missing compound min information\n"); + fprintf(stderr,"error: missing cpdsyllablenum information\n"); return 1; - } - if ((cpdmin < 1) || (cpdmin > 50)) cpdmin = 3; + } return 0; } - /* parse in the typical fault correcting table */ int AffixMgr::parse_reptable(char * line, FILE * af) { @@ -1016,7 +3221,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af) char * piece; int i = 0; int np = 0; - while ((piece=mystrsep(&tp,' '))) { + while ((piece=mystrsep(&tp, 0))) { if (*piece != '\0') { switch(i) { case 0: { np++; break; } @@ -1028,6 +3233,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af) return 1; } reptable = (replentry *) malloc(numrep * sizeof(struct replentry)); + if (!reptable) return 1; np++; break; } @@ -1045,13 +3251,13 @@ int AffixMgr::parse_reptable(char * line, FILE * af) /* now parse the numrep lines to read in the remainder of the table */ char * nl = line; for (int j=0; j < numrep; j++) { - fgets(nl,MAXLNLEN,af); + if (!fgets(nl,MAXLNLEN,af)) return 1; mychomp(nl); tp = nl; i = 0; reptable[j].pattern = NULL; - reptable[j].replacement = NULL; - while ((piece=mystrsep(&tp,' '))) { + reptable[j].pattern2 = NULL; + while ((piece=mystrsep(&tp, 0))) { if (*piece != '\0') { switch(i) { case 0: { @@ -1063,14 +3269,14 @@ int AffixMgr::parse_reptable(char * line, FILE * af) break; } case 1: { reptable[j].pattern = mystrdup(piece); break; 
} - case 2: { reptable[j].replacement = mystrdup(piece); break; } + case 2: { reptable[j].pattern2 = mystrdup(piece); break; } default: break; } i++; } free(piece); } - if ((!(reptable[j].pattern)) || (!(reptable[j].replacement))) { + if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { fprintf(stderr,"error: replacement table is corrupt\n"); return 1; } @@ -1078,6 +3284,155 @@ int AffixMgr::parse_reptable(char * line, FILE * af) return 0; } +/* parse in the checkcompoundpattern table */ +int AffixMgr::parse_checkcpdtable(char * line, FILE * af) +{ + if (numcheckcpd != 0) { + fprintf(stderr,"error: duplicate compound pattern tables used\n"); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + numcheckcpd = atoi(piece); + if (numcheckcpd < 1) { + fprintf(stderr,"incorrect number of entries in compound pattern table\n"); + free(piece); + return 1; + } + checkcpdtable = (replentry *) malloc(numcheckcpd * sizeof(struct replentry)); + if (!checkcpdtable) return 1; + np++; + break; + } + default: break; + } + i++; + } + free(piece); + } + if (np != 2) { + fprintf(stderr,"error: missing compound pattern table information\n"); + return 1; + } + + /* now parse the numcheckcpd lines to read in the remainder of the table */ + char * nl = line; + for (int j=0; j < numcheckcpd; j++) { + if (!fgets(nl,MAXLNLEN,af)) return 1; + mychomp(nl); + tp = nl; + i = 0; + checkcpdtable[j].pattern = NULL; + checkcpdtable[j].pattern2 = NULL; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { + if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) { + fprintf(stderr,"error: compound pattern table is corrupt\n"); + free(piece); + return 1; + } + break; + } + case 1: { checkcpdtable[j].pattern = mystrdup(piece); break; } + case 2: { checkcpdtable[j].pattern2 = mystrdup(piece); break; } + default: break; + } + i++; 
+ } + free(piece); + } + if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) { + fprintf(stderr,"error: compound pattern table is corrupt\n"); + return 1; + } + } + return 0; +} + +/* parse in the compound rule table */ +int AffixMgr::parse_defcpdtable(char * line, FILE * af) +{ + if (numdefcpd != 0) { + fprintf(stderr,"error: duplicate compound rule tables used\n"); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + numdefcpd = atoi(piece); + if (numdefcpd < 1) { + fprintf(stderr,"incorrect number of entries in compound rule table\n"); + free(piece); + return 1; + } + defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry)); + if (!defcpdtable) return 1; + np++; + break; + } + default: break; + } + i++; + } + free(piece); + } + if (np != 2) { + fprintf(stderr,"error: missing compound rule table information\n"); + return 1; + } + + /* now parse the numdefcpd lines to read in the remainder of the table */ + char * nl = line; + for (int j=0; j < numdefcpd; j++) { + if (!fgets(nl,MAXLNLEN,af)) return 1; + mychomp(nl); + tp = nl; + i = 0; + defcpdtable[j].def = NULL; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { + if (strncmp(piece, "COMPOUNDRULE", 12) != 0) { + fprintf(stderr,"error: compound rule table is corrupt\n"); + free(piece); + return 1; + } + break; + } + case 1: { + defcpdtable[j].len = + pHMgr->decode_flags(&(defcpdtable[j].def), piece); + break; + } + default: break; + } + i++; + } + free(piece); + } + if (!defcpdtable[j].len) { + fprintf(stderr,"error: compound rule table is corrupt\n"); + return 1; + } + } + return 0; +} /* parse in the character map table */ @@ -1091,7 +3446,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) char * piece; int i = 0; int np = 0; - while ((piece=mystrsep(&tp,' '))) { + while ((piece=mystrsep(&tp, 0))) { if 
(*piece != '\0') { switch(i) { case 0: { np++; break; } @@ -1103,6 +3458,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) return 1; } maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry)); + if (!maptable) return 1; np++; break; } @@ -1120,13 +3476,13 @@ int AffixMgr::parse_maptable(char * line, FILE * af) /* now parse the nummap lines to read in the remainder of the table */ char * nl = line; for (int j=0; j < nummap; j++) { - fgets(nl,MAXLNLEN,af); + if (!fgets(nl,MAXLNLEN,af)) return 1; mychomp(nl); tp = nl; i = 0; maptable[j].set = NULL; maptable[j].len = 0; - while ((piece=mystrsep(&tp,' '))) { + while ((piece=mystrsep(&tp, 0))) { if (*piece != '\0') { switch(i) { case 0: { @@ -1137,8 +3493,24 @@ int AffixMgr::parse_maptable(char * line, FILE * af) } break; } - case 1: { maptable[j].set = mystrdup(piece); - maptable[j].len = strlen(maptable[j].set); + case 1: { + maptable[j].len = 0; + maptable[j].set = NULL; + maptable[j].set_utf16 = NULL; + if (!utf8) { + maptable[j].set = mystrdup(piece); + maptable[j].len = strlen(maptable[j].set); + } else { + w_char w[MAXWORDLEN]; + int n = u8_u16(w, MAXWORDLEN, piece); + if (n > 0) { + flag_qsort((unsigned short *) w, 0, n); + maptable[j].set_utf16 = (w_char *) malloc(n * sizeof(w_char)); + if (!maptable[j].set_utf16) return 1; + memcpy(maptable[j].set_utf16, w, n * sizeof(w_char)); + } + maptable[j].len = n; + } break; } default: break; } @@ -1146,7 +3518,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) } free(piece); } - if ((!(maptable[j].set)) || (!(maptable[j].len))) { + if ((!(maptable[j].set || maptable[j].set_utf16)) || (!(maptable[j].len))) { fprintf(stderr,"error: map table is corrupt\n"); return 1; } @@ -1154,13 +3526,134 @@ int AffixMgr::parse_maptable(char * line, FILE * af) return 0; } +/* parse in the word breakpoint table */ +int AffixMgr::parse_breaktable(char * line, FILE * af) +{ + if (numbreak != 0) { + fprintf(stderr,"error: duplicate word breakpoint tables used\n"); + 
return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + numbreak = atoi(piece); + if (numbreak < 1) { + fprintf(stderr,"incorrect number of entries in BREAK table\n"); + free(piece); + return 1; + } + breaktable = (char **) malloc(numbreak * sizeof(char *)); + if (!breaktable) return 1; + np++; + break; + } + default: break; + } + i++; + } + free(piece); + } + if (np != 2) { + fprintf(stderr,"error: missing word breakpoint table information\n"); + return 1; + } + + /* now parse the numbreak lines to read in the remainder of the table */ + char * nl = line; + for (int j=0; j < numbreak; j++) { + if (!fgets(nl,MAXLNLEN,af)) return 1; + mychomp(nl); + tp = nl; + i = 0; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { + if (strncmp(piece,"BREAK",5) != 0) { + fprintf(stderr,"error: BREAK table is corrupt\n"); + free(piece); + return 1; + } + break; + } + case 1: { + breaktable[j] = mystrdup(piece); + break; + } + default: break; + } + i++; + } + free(piece); + } + if (!breaktable) { + fprintf(stderr,"error: BREAK table is corrupt\n"); + return 1; + } + } + return 0; +} +/* parse in the flag used by affix_check() */ +int AffixMgr::parse_lang(char * line) +{ + if (lang != NULL) { + fprintf(stderr,"error: duplicate LANG used\n"); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + lang = mystrdup(piece); + langnum = get_lang_num(piece); + set_spec_utf8_encoding(); + np++; break; + } + default: break; + } + i++; + } + free(piece); + } + if (np < 2) { + fprintf(stderr,"error: missing LANG information\n"); + return 1; + } + return 0; +} +/* parse in the version string */ +int AffixMgr::parse_version(char * line) +{ + if (version) { + fprintf(stderr,"error: 
duplicate VERSION strings\n"); + return 1; + } + char * tp = line; + char * piece = mystrsep(&tp, 0); + version = mystrdup(tp); + free(piece); + return 0; +} -int AffixMgr::parse_affix(char * line, const char at, FILE * af) +int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflags) { int numents = 0; // number of affentry structures to parse - char achar='\0'; // affix char identifier + + unsigned short aflag = 0; // affix char identifier + short ff=0; struct affentry * ptr= NULL; struct affentry * nptr= NULL; @@ -1170,29 +3663,51 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af) char * piece; int i = 0; + // checking lines with bad syntax + int basefieldnum = 0; + // split affix header line into pieces int np = 0; - while ((piece=mystrsep(&tp,' '))) { + while ((piece=mystrsep(&tp, 0))) { if (*piece != '\0') { switch(i) { // piece 1 - is type of affix case 0: { np++; break; } // piece 2 - is affix char - case 1: { np++; achar = *piece; break; } - + case 1: { + np++; + aflag = pHMgr->decode_flag(piece); + if (((at == 'S') && (dupflags[aflag] & dupSFX)) || + ((at == 'P') && (dupflags[aflag] & dupPFX))) { + fprintf(stderr, "error: duplicate affix flag %s in line %s\n", piece, nl); + // return 1; XXX permissive mode for bad dictionaries + } + dupflags[aflag] += ((at == 'S') ? 
dupSFX : dupPFX); + break; + } // piece 3 - is cross product indicator - case 2: { np++; if (*piece == 'Y') ff = XPRODUCT; break; } + case 2: { np++; if (*piece == 'Y') ff = aeXPRODUCT; break; } // piece 4 - is number of affentries case 3: { np++; numents = atoi(piece); + if (numents == 0) { + char * err = pHMgr->encode_flag(aflag); + fprintf(stderr, "error: affix %s header has incorrect entry count in line %s\n", + err, nl); + free(err); + return 1; + } ptr = (struct affentry *) malloc(numents * sizeof(struct affentry)); - ptr->xpflg = ff; - ptr->achar = achar; - break; + if (!ptr) return 1; + ptr->opts = ff; + if (utf8) ptr->opts += aeUTF8; + if (pHMgr->is_aliasf()) ptr->opts += aeALIASF; + if (pHMgr->is_aliasm()) ptr->opts += aeALIASM; + ptr->aflag = aflag; } default: break; @@ -1203,7 +3718,9 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af) } // check to make sure we parsed enough pieces if (np != 4) { - fprintf(stderr, "error: affix %c header has insufficient data in line %s\n",achar,nl); + char * err = pHMgr->encode_flag(aflag); + fprintf(stderr, "error: affix %s header has insufficient data in line %s\n", err, nl); + free(err); free(ptr); return 1; } @@ -1213,40 +3730,45 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af) // now parse numents affentries for this affix for (int j=0; j < numents; j++) { - fgets(nl,MAXLNLEN,af); + if (!fgets(nl,MAXLNLEN,af)) return 1; mychomp(nl); tp = nl; i = 0; np = 0; // split line into pieces - while ((piece=mystrsep(&tp,' '))) { + while ((piece=mystrsep(&tp, 0))) { if (*piece != '\0') { switch(i) { - // piece 1 - is type case 0: { np++; - if (nptr != ptr) nptr->xpflg = ptr->xpflg; + if (nptr != ptr) nptr->opts = ptr->opts; break; } // piece 2 - is affix char case 1: { np++; - if (*piece != achar) { - fprintf(stderr, "error: affix %c is corrupt near line %s\n",achar,nl); + if (pHMgr->decode_flag(piece) != aflag) { + char * err = pHMgr->encode_flag(aflag); + fprintf(stderr, "error: affix %s 
is corrupt near line %s\n", err, nl); fprintf(stderr, "error: possible incorrect count\n"); + free(err); free(piece); return 1; } - if (nptr != ptr) nptr->achar = ptr->achar; + + if (nptr != ptr) nptr->aflag = ptr->aflag; break; } // piece 3 - is string to strip or 0 for null case 2: { np++; + if (complexprefixes) { + if (utf8) reverseword_utf(piece); else reverseword(piece); + } nptr->strip = mystrdup(piece); nptr->stripl = strlen(nptr->strip); if (strcmp(nptr->strip,"0") == 0) { @@ -1259,8 +3781,39 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af) // piece 4 - is affix string or 0 for null case 3: { + char * dash; + nptr->morphcode = NULL; + nptr->contclass = NULL; + nptr->contclasslen = 0; np++; - nptr->appnd = mystrdup(piece); + dash = strchr(piece, '/'); + if (dash) { + *dash = '\0'; + if (complexprefixes) { + if (utf8) reverseword_utf(piece); else reverseword(piece); + } + nptr->appnd = mystrdup(piece); + + if (pHMgr->is_aliasf()) { + int index = atoi(dash + 1); + nptr->contclasslen = pHMgr->get_aliasf(index, &(nptr->contclass)); + } else { + nptr->contclasslen = pHMgr->decode_flags(&(nptr->contclass), dash + 1); + flag_qsort(nptr->contclass, 0, nptr->contclasslen); + } + *dash = '/'; + + havecontclass = 1; + for (unsigned short i = 0; i < nptr->contclasslen; i++) { + contclasses[(nptr->contclass)[i]] = 1; + } + } else { + if (complexprefixes) { + if (utf8) reverseword_utf(piece); else reverseword(piece); + } + nptr->appnd = mystrdup(piece); + } + nptr->appndl = strlen(nptr->appnd); if (strcmp(nptr->appnd,"0") == 0) { free(nptr->appnd); @@ -1271,7 +3824,77 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af) } // piece 5 - is the conditions descriptions - case 4: { np++; encodeit(nptr,piece); } + case 4: { + np++; + if (complexprefixes) { + int neg = 0; + if (utf8) reverseword_utf(piece); else reverseword(piece); + // reverse condition + for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { + switch(*k) { + case '[': { 
+ if (neg) *(k+1) = '['; else *k = ']'; + break; + } + case ']': { + *k = '['; + if (neg) *(k+1) = '^'; + neg = 0; + break; + } + case '^': { + if (*(k+1) == ']') neg = 1; else *(k+1) = *k; + break; + } + default: { + if (neg) *(k+1) = *k; + } + } + } + } + if (nptr->stripl && (strcmp(piece, ".") != 0) && + redundant_condition(at, nptr->strip, nptr->stripl, piece, nl)) + strcpy(piece, "."); + if (encodeit(nptr,piece)) return 1; + break; + } + + case 5: { + np++; + if (pHMgr->is_aliasm()) { + int index = atoi(piece); + nptr->morphcode = pHMgr->get_aliasm(index); + } else { + if (complexprefixes) { + if (utf8) reverseword_utf(piece); else reverseword(piece); + } + nptr->morphcode = mystrdup(piece); + } + break; + } + + case 6: { + // XXX deprecated syntax + np++; + if (nptr->contclass) { + fprintf(stderr, "error: affix rule contains two contclass " + "(%s and %s by deprecated syntax).\n", nptr->contclass, piece); + } else { + if (pHMgr->is_aliasf()) { + int index = atoi(piece); + nptr->contclasslen = pHMgr->get_aliasf(index, &(nptr->contclass)); + } else { + nptr->contclasslen = pHMgr->decode_flags(&(nptr->contclass), piece); + flag_qsort(nptr->contclass, 0, nptr->contclasslen); + } + havecontclass = 1; + for (unsigned short i = 0; i < nptr->contclasslen; i++) { + contclasses[(nptr->contclass)[i]] = 1; + } + } + break; + + } default: break; } @@ -1280,14 +3903,27 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af) free(piece); } // check to make sure we parsed enough pieces - if (np != 5) { - fprintf(stderr, "error: affix %c is corrupt near line %s\n",achar,nl); + if (np < 5) { + char * err = pHMgr->encode_flag(aflag); + fprintf(stderr, "error: affix %s is corrupt near line %s\n", err, nl); + free(err); free(ptr); return 1; } + +#if DEBUG + // detect unnecessary fields, excepting comments + if (basefieldnum) { + int fieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 
5 : 6); + if (fieldnum != basefieldnum) + fprintf(stderr, "warning - bad field number:\n%s\n", nl); + } else { + basefieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6); + } +#endif nptr++; } - + // now create SfxEntry or PfxEntry objects and use links to // build an ordered (sorted by affix string) list nptr = ptr; @@ -1304,3 +3940,81 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af) free(ptr); return 0; } + +void AffixMgr::set_spec_utf8_encoding() { + if (utf8) { + // In Azeri and Turkish, I and i dictinct letters: + // There are a dotless lower case i pair of upper `I', + // and an upper I with dot pair of lower `i'. + if ((langnum == LANG_az) || (langnum == LANG_tr)) { + utf_tbl[0x0049].clower = 0x0131; + utf_tbl[0x0069].cupper = 0x0130; + } + } +} + +int AffixMgr::redundant_condition(char ft, char * strip, int stripl, const char * cond, char * line) { + int condl = strlen(cond); + int i; + int j; + int neg; + int in; + if (ft == 'P') { // prefix + if (strncmp(strip, cond, condl) == 0) return 1; + if (utf8) { + } else { + for (i = 0, j = 0; (i < stripl) && (j < condl); i++, j++) { + if (cond[j] != '[') { + if (cond[j] != strip[i]) { + fprintf(stderr, "warning - incompatible stripping characters and condition:\n%s\n", line); + } + } else { + neg = (cond[j+1] == '^') ? 
1 : 0; + in = 0; + do { + j++; + if (strip[i] == cond[j]) in = 1; + } while ((j < (condl - 1)) && (cond[j] != ']')); + if (j == (condl - 1) && (cond[j] != ']')) { + fprintf(stderr, "error - missing ] in condition:\n%s\n", line); + return 0; + } + if ((!neg && !in) || (neg && in)) { + fprintf(stderr, "warning - incompatible stripping characters and condition:\n%s\n", line); + return 0; + } + } + } + if (j >= condl) return 1; + } + } else { // suffix + if ((stripl >= condl) && strcmp(strip + stripl - condl, cond) == 0) return 1; + if (utf8) { + } else { + for (i = stripl - 1, j = condl - 1; (i >= 0) && (j >= 0); i--, j--) { + if (cond[j] != ']') { + if (cond[j] != strip[i]) { + fprintf(stderr, "warning - incompatible stripping characters and condition:\n%s\n", line); + } + } else { + in = 0; + do { + j--; + if (strip[i] == cond[j]) in = 1; + } while ((j > 0) && (cond[j] != '[')); + if ((j == 0) && (cond[j] != '[')) { + fprintf(stderr, "error - missing ] in condition:\n%s\n", line); + return 0; + } + neg = (cond[j+1] == '^') ? 
1 : 0; + if ((!neg && !in) || (neg && in)) { + fprintf(stderr, "warning - incompatible stripping characters and condition:\n%s\n", line); + return 0; + } + } + } + if (j < 0) return 1; + } + } + return 0; +} diff --git a/src/myspell/affixmgr.hxx b/src/myspell/affixmgr.hxx index 6cbd112..e93ba8e 100644 --- a/src/myspell/affixmgr.hxx +++ b/src/myspell/affixmgr.hxx @@ -1,69 +1,203 @@ #ifndef _AFFIXMGR_HXX_ #define _AFFIXMGR_HXX_ +#include <cstdlib> +#include <cstring> +#include <cstdio> #include "atypes.hxx" #include "baseaffix.hxx" #include "hashmgr.hxx" -#include <cstdio> + +// check flag duplication +#define dupSFX (1 << 0) +#define dupPFX (1 << 1) class AffixMgr { AffEntry * pStart[SETSIZE]; AffEntry * sStart[SETSIZE]; - AffEntry * pFlag[SETSIZE]; - AffEntry * sFlag[SETSIZE]; + AffEntry * pFlag[CONTSIZE]; + AffEntry * sFlag[CONTSIZE]; HashMgr * pHMgr; char * trystring; char * encoding; - char * compound; + struct cs_info * csconv; + int utf8; + struct unicode_info2 * utf_tbl; + int complexprefixes; + FLAG compoundflag; + FLAG compoundbegin; + FLAG compoundmiddle; + FLAG compoundend; + FLAG compoundroot; + FLAG compoundforbidflag; + FLAG compoundpermitflag; + int checkcompounddup; + int checkcompoundrep; + int checkcompoundcase; + int checkcompoundtriple; + FLAG forbiddenword; + FLAG nosuggest; + FLAG pseudoroot; int cpdmin; int numrep; replentry * reptable; int nummap; mapentry * maptable; - bool nosplitsugs; - + int numbreak; + char ** breaktable; + int numcheckcpd; + replentry * checkcpdtable; + int numdefcpd; + flagentry * defcpdtable; + int maxngramsugs; + int nosplitsugs; + int sugswithdots; + int cpdwordmax; + int cpdmaxsyllable; + char * cpdvowels; + w_char * cpdvowels_utf16; + int cpdvowels_utf16_len; + char * cpdsyllablenum; + const char * pfxappnd; // BUG: not stateless + const char * sfxappnd; // BUG: not stateless + FLAG sfxflag; // BUG: not stateless + char * derived; // BUG: not stateless + AffEntry * sfx; // BUG: not stateless + AffEntry * pfx; // 
BUG: not stateless + int checknum; + char * wordchars; + unsigned short * wordchars_utf16; + int wordchars_utf16_len; + char * version; + char * lang; + int langnum; + FLAG lemma_present; + FLAG circumfix; + FLAG onlyincompound; + FLAG keepcase; + int checksharps; + int havecontclass; // boolean variable + char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold affix) + flag flag_mode; + public: AffixMgr(const char * affpath, HashMgr * ptr); ~AffixMgr(); - struct hentry * affix_check(const char * word, int len); - struct hentry * prefix_check(const char * word, int len); - struct hentry * suffix_check(const char * word, int len, int sfxopts, AffEntry* ppfx); - int expand_rootword(struct guessword * wlst, int maxn, - const char * ts, int wl, const char * ap, int al); - struct hentry * compound_check(const char * word, int len, char compound_flag); + struct hentry * affix_check(const char * word, int len, + const unsigned short needflag = (unsigned short) 0, char in_compound = IN_CPD_NOT); + struct hentry * prefix_check(const char * word, int len, + char in_compound, const FLAG needflag = FLAG_NULL); + struct hentry * prefix_check_twosfx(const char * word, int len, + char in_compound, const FLAG needflag = FLAG_NULL); + struct hentry * suffix_check(const char * word, int len, int sfxopts, AffEntry* ppfx, + char ** wlst, int maxSug, int * ns, const FLAG cclass = FLAG_NULL, + const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + struct hentry * suffix_check_twosfx(const char * word, int len, + int sfxopts, AffEntry* ppfx, const FLAG needflag = FLAG_NULL); + + char * affix_check_morph(const char * word, int len, + const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + char * prefix_check_morph(const char * word, int len, + char in_compound, const FLAG needflag = FLAG_NULL); + char * suffix_check_morph (const char * word, int len, int sfxopts, AffEntry * ppfx, + const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, 
char in_compound = IN_CPD_NOT); + + char * prefix_check_twosfx_morph(const char * word, int len, + char in_compound, const FLAG needflag = FLAG_NULL); + char * suffix_check_twosfx_morph(const char * word, int len, + int sfxopts, AffEntry * ppfx, const FLAG needflag = FLAG_NULL); + + int expand_rootword(struct guessword * wlst, int maxn, const char * ts, + int wl, const unsigned short * ap, unsigned short al, char * bad, int); + + int get_syllable (const char * word, int wlen); + int cpdrep_check(const char * word, int len); + int cpdpat_check(const char * word, int len); + int defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** rwords, char all); + int cpdcase_check(const char * word, int len); + int candidate_check(const char * word, int len); + struct hentry * compound_check(const char * word, int len, + short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule, int * cmpdstemnum, int * cmpdstem, char is_sug); + + int compound_check_morph(const char * word, int len, + short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule, char ** result, char * partresult); + struct hentry * lookup(const char * word); int get_numrep(); struct replentry * get_reptable(); int get_nummap(); struct mapentry * get_maptable(); + int get_numbreak(); + char ** get_breaktable(); char * get_encoding(); + int get_langnum(); + struct unicode_info2 * get_utf_conv(); char * get_try_string(); - char * get_compound(); - bool get_nosplitsugs(); - + const char * get_wordchars(); + unsigned short * get_wordchars_utf16(int * len); + int get_compound(); + FLAG get_compoundflag(); + FLAG get_compoundbegin(); + FLAG get_forbiddenword(); + FLAG get_nosuggest(); + FLAG get_pseudoroot(); + FLAG get_onlyincompound(); + FLAG get_compoundroot(); + FLAG get_lemma_present(); + int get_checknum(); + char * get_possible_root(); + const char * get_prefix(); + const char * get_suffix(); + const char * 
get_derived(); + const char * get_version(); + const int have_contclass(); + int get_utf8(); + int get_complexprefixes(); + char * get_suffixed(char ); + int get_maxngramsugs(); + int get_nosplitsugs(); + int get_sugswithdots(void); + FLAG get_keepcase(void); + int get_checksharps(void); + private: int parse_file(const char * affpath); int parse_try(char * line); int parse_set(char * line); + int parse_flag(char * line, unsigned short * out, char * name); + int parse_num(char * line, int * out, char * name); int parse_cpdflag(char * line); - int parse_cpdmin(char * line); + int parse_cpdforbid(char * line); + int parse_forbid(char * line); + int parse_cpdsyllable(char * line); + int parse_syllablenum(char * line); int parse_reptable(char * line, FILE * af); int parse_maptable(char * line, FILE * af); - int parse_affix(char * line, const char at, FILE * af); + int parse_breaktable(char * line, FILE * af); + int parse_checkcpdtable(char * line, FILE * af); + int parse_defcpdtable(char * line, FILE * af); + int parse_affix(char * line, const char at, FILE * af, char * dupflags); + int parse_wordchars(char * line); + int parse_lang(char * line); + int parse_version(char * line); - void encodeit(struct affentry * ptr, char * cs); + int encodeit(struct affentry * ptr, char * cs); int build_pfxtree(AffEntry* pfxptr); int build_sfxtree(AffEntry* sfxptr); - AffEntry* process_sfx_in_order(AffEntry* ptr, AffEntry* nptr); - AffEntry* process_pfx_in_order(AffEntry* ptr, AffEntry* nptr); - int process_pfx_tree_to_list(); - int process_sfx_tree_to_list(); int process_pfx_order(); int process_sfx_order(); + AffEntry * process_pfx_in_order(AffEntry * ptr, AffEntry * nptr); + AffEntry * process_sfx_in_order(AffEntry * ptr, AffEntry * nptr); + int process_pfx_tree_to_list(); + int process_sfx_tree_to_list(); + void set_spec_utf8_encoding(); + int redundant_condition(char, char * strip, int stripl, const char * cond, char *); }; #endif diff --git a/src/myspell/atypes.hxx 
b/src/myspell/atypes.hxx index a10c69d..c8c9257 100644 --- a/src/myspell/atypes.hxx +++ b/src/myspell/atypes.hxx @@ -1,34 +1,74 @@ #ifndef _ATYPES_HXX_ #define _ATYPES_HXX_ +// HUNSTEM def. +#define HUNSTEM + +#include "csutil.hxx" +#include "hashmgr.hxx" + #define SETSIZE 256 -#define MAXAFFIXES 256 +#define CONTSIZE 65536 #define MAXWORDLEN 100 -#define XPRODUCT (1 << 0) +#define MAXWORDUTF8LEN (MAXWORDLEN * 4) + +// affentry options +#define aeXPRODUCT (1 << 0) +#define aeUTF8 (1 << 1) +#define aeALIASF (1 << 2) +#define aeALIASM (1 << 3) + +enum {IN_CPD_NOT, IN_CPD_BEGIN, IN_CPD_END, IN_CPD_OTHER}; + +#define MAXLNLEN 8192 * 4 -#define MAXLNLEN 1024 +#define MAXCOMPOUND 10 -#define TESTAFF( a , b , c ) memchr((void *)(a), (int)(b), (size_t)(c) ) +#define MAXACC 1000 + +#define FLAG unsigned short +#define FLAG_NULL 0x00 +#define FREE_FLAG(a) a = 0 + +#define TESTAFF( a, b , c ) flag_bsearch((unsigned short *) a, (unsigned short) b, c) struct affentry { char * strip; char * appnd; - short stripl; - short appndl; - short numconds; - short xpflg; - char achar; - char conds[SETSIZE]; + unsigned char stripl; + unsigned char appndl; + char numconds; + char opts; + unsigned short aflag; + union { + char base[SETSIZE]; + struct { + char ascii[SETSIZE/2]; + char neg[8]; + char all[8]; + w_char * wchars[8]; + int wlen[8]; + } utf8; + } conds; + char * morphcode; + unsigned short * contclass; + short contclasslen; }; struct replentry { char * pattern; - char * replacement; + char * pattern2; }; struct mapentry { char * set; + w_char * set_utf16; + int len; +}; + +struct flagentry { + FLAG * def; int len; }; diff --git a/src/myspell/baseaffix.hxx b/src/myspell/baseaffix.hxx index 6aa4351..da7c010 100644 --- a/src/myspell/baseaffix.hxx +++ b/src/myspell/baseaffix.hxx @@ -3,15 +3,29 @@ class AffEntry { +public: + protected: char * appnd; char * strip; - short appndl; - short stripl; - short numconds; - short xpflg; - char achar; - char conds[SETSIZE]; + unsigned char appndl; 
+ unsigned char stripl; + char numconds; + char opts; + unsigned short aflag; + union { + char base[SETSIZE]; + struct { + char ascii[SETSIZE/2]; + char neg[8]; + char all[8]; + w_char * wchars[8]; + int wlen[8]; + } utf8; + } conds; + char * morphcode; + unsigned short * contclass; + short contclasslen; }; #endif diff --git a/src/myspell/csutil.cxx b/src/myspell/csutil.cxx index 73065f1..4fe2fbf 100644 --- a/src/myspell/csutil.cxx +++ b/src/myspell/csutil.cxx @@ -1,178 +1,497 @@ #include <cstdlib> #include <cstring> +#include <cctype> #include <cstdio> #include "csutil.hxx" -#ifndef WINDOWS -using namespace std; -#endif +#include "atypes.hxx" +#include "langnum.hxx" -// strip strings into token based on single char delimiter -// acts like strsep() but only uses a delim char and not -// a delim string - -char * mystrsep(char ** stringp, const char delim) -{ - char * rv = NULL; - char * mp = *stringp; - int n = strlen(mp); - if (n > 0) { - char * dp = (char *)memchr(mp,(int)((unsigned char)delim),n); - if (dp) { - *stringp = dp+1; - int nc = (int)((unsigned long)dp - (unsigned long)mp); - rv = (char *) malloc(nc+1); - memcpy(rv,mp,nc); - *(rv+nc) = '\0'; - return rv; - } else { - rv = (char *) malloc(n+1); - memcpy(rv, mp, n); - *(rv+n) = '\0'; - *stringp = mp + n; - return rv; - } - } - return NULL; -} +#include "utf_info.cxx" +#define UTF_LST_LEN (sizeof(utf_lst) / (sizeof(unicode_info))) +#ifndef W32 +using namespace std; +#endif -// replaces strdup with ansi version -char * mystrdup(const char * s) -{ - char * d = NULL; - if (s) { - int sl = strlen(s); - d = (char *) malloc(((sl+1) * sizeof(char))); - if (d) memcpy(d,s,((sl+1)*sizeof(char))); - } - return d; +/* only UTF-16 (BMP) implementation */ +char * u16_u8(char * dest, int size, const w_char * src, int srclen) { + char * u8 = dest; + char * u8_max = u8 + size; + const w_char * u2 = src; + const w_char * u2_max = src + srclen; + while ((u2 < u2_max) && (u8 < u8_max)) { + if (u2->h) { // > 0xFF + // XXX 
4-byte haven't implemented yet. + if (u2->h >= 0x08) { // >= 0x800 (3-byte UTF-8 character) + *u8 = 0xe0 + (u2->h >> 4); + u8++; + if (u8 < u8_max) { + *u8 = 0x80 + ((u2->h & 0xf) << 2) + (u2->l >> 6); + u8++; + if (u8 < u8_max) { + *u8 = 0x80 + (u2->l & 0x3f); + u8++; + } + } + } else { // < 0x800 (2-byte UTF-8 character) + *u8 = 0xc0 + (u2->h << 2) + (u2->l >> 6); + u8++; + if (u8 < u8_max) { + *u8 = 0x80 + (u2->l & 0x3f); + u8++; + } + } + } else { // <= 0xFF + if (u2->l & 0x80) { // >0x80 (2-byte UTF-8 character) + *u8 = 0xc0 + (u2->l >> 6); + u8++; + if (u8 < u8_max) { + *u8 = 0x80 + (u2->l & 0x3f); + u8++; + } + } else { // < 0x80 (1-byte UTF-8 character) + *u8 = u2->l; + u8++; + } + } + u2++; + } + *u8 = '\0'; + return dest; } -// remove cross-platform text line end characters -void mychomp(char * s) -{ - int k = strlen(s); - if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; - if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; +/* only UTF-16 (BMP) implementation */ +int u8_u16(w_char * dest, int size, const char * src) { + const char * u8 = src; + w_char * u2 = dest; + w_char * u2_max = u2 + size; + + while (*u8 && (u2 < u2_max)) { + switch ((*u8) & 0xf0) { + case 0x00: + case 0x10: + case 0x20: + case 0x30: + case 0x40: + case 0x50: + case 0x60: + case 0x70: { + u2->h = 0; + u2->l = *u8; + break; + } + case 0x80: + case 0x90: + case 0xa0: + case 0xb0: { + fprintf(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %d. character position\n%s\n", u8 - src, src); + u2->h = 0xff; + u2->l = 0xfd; + break; + } + case 0xc0: + case 0xd0: { // 2-byte UTF-8 codes + if ((*(u8+1) & 0xc0) == 0x80) { + u2->h = (*u8 & 0x1f) >> 2; + u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); + u8++; + } else { + fprintf(stderr, "UTF-8 encoding error. Missing continuation byte in %d. 
character position:\n%s\n", u8 - src, src); + u2->h = 0xff; + u2->l = 0xfd; + } + break; + } + case 0xe0: { // 3-byte UTF-8 codes + if ((*(u8+1) & 0xc0) == 0x80) { + u2->h = ((*u8 & 0x0f) << 4) + ((*(u8+1) & 0x3f) >> 2); + u8++; + if ((*(u8+1) & 0xc0) == 0x80) { + u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); + u8++; + } else { + fprintf(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + u2->h = 0xff; + u2->l = 0xfd; + } + } else { + fprintf(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + u2->h = 0xff; + u2->l = 0xfd; + } + break; + } + case 0xf0: { // 4 or more byte UTF-8 codes + fprintf(stderr, "This UTF-8 encoding can't convert to UTF-16:\n%s\n", src); + u2->h = 0xff; + u2->l = 0xfd; + break; + } + } + u8++; + u2++; + } + return u2 - dest; } - -// does an ansi strdup of the reverse of a string -char * myrevstrdup(const char * s) -{ - char * d = NULL; - if (s) { - int sl = strlen(s); - d = (char *) malloc((sl+1) * sizeof(char)); - if (d) { - const char * p = s + sl - 1; - char * q = d; - while (p >= s) *q++ = *p--; - *q = '\0'; +void flag_qsort(unsigned short flags[], int begin, int end) { + unsigned short reg; + if (end > begin) { + unsigned short pivot = flags[begin]; + int l = begin + 1; + int r = end; + while(l < r) { + if (flags[l] <= pivot) { + l++; + } else { + r--; + reg = flags[l]; + flags[l] = flags[r]; + flags[r] = reg; + } } + l--; + reg = flags[begin]; + flags[begin] = flags[l]; + flags[l] = reg; + + flag_qsort(flags, begin, l); + flag_qsort(flags, r, end); } - return d; -} + } -#if 0 -// return 1 if s1 is a leading subset of s2 -int isSubset(const char * s1, const char * s2) -{ - int l1 = strlen(s1); - int l2 = strlen(s2); - if (l1 > l2) return 0; - if (strncmp(s2,s1,l1) == 0) return 1; - return 0; +int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { + int mid; + int left = 0; + int right = length - 1; + while 
(left <= right) { + mid = (left + right) / 2; + if (flags[mid] == flag) return 1; + if (flag < flags[mid]) right = mid - 1; + else left = mid + 1; + } + return 0; } -#endif + // strip strings into token based on single char delimiter + // acts like strsep() but only uses a delim char and not + // a delim string + // default delimiter: white space characters + + char * mystrsep(char ** stringp, const char delim) + { + char * rv = NULL; + char * mp = *stringp; + int n = strlen(mp); + if (n > 0) { + char * dp; + if (delim) { + dp = (char *)memchr(mp,(int)((unsigned char)delim),n); + } else { + for (dp = mp; (*dp && !isspace(*dp)); dp++); + if (!*dp) dp = NULL; + } + if (dp) { + *stringp = dp+1; + int nc = (int)((unsigned long)dp - (unsigned long)mp); + rv = (char *) malloc(nc+1); + memcpy(rv,mp,nc); + *(rv+nc) = '\0'; + return rv; + } else { + rv = (char *) malloc(n+1); + memcpy(rv, mp, n); + *(rv+n) = '\0'; + *stringp = mp + n; + return rv; + } + } + return NULL; + } -// return 1 if s1 is a leading subset of s2 -int isSubset(const char * s1, const char * s2) -{ - while( *s1 && *s2 && (*s1 == *s2) ) { - s1++; - s2++; - } - return (*s1 == '\0'); -} + + // replaces strdup with ansi version + char * mystrdup(const char * s) + { + char * d = NULL; + if (s) { + int sl = strlen(s); + d = (char *) malloc(((sl+1) * sizeof(char))); + if (d) memcpy(d,s,((sl+1)*sizeof(char))); + } + return d; + } + + + // remove cross-platform text line end characters + void mychomp(char * s) + { + int k = strlen(s); + if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; + if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; + } + + + // does an ansi strdup of the reverse of a string + char * myrevstrdup(const char * s) + { + char * d = NULL; + if (s) { + int sl = strlen(s); + d = (char *) malloc((sl+1) * sizeof(char)); + if (d) { + const char * p = s + sl - 1; + char * q = d; + while (p >= s) *q++ = *p--; + *q = '\0'; + } + } + return d; + } -// return 1 if s1 (reversed) is a 
leading subset of end of s2 -int isRevSubset(const char * s1, const char * end_of_s2, int len) -{ - while( (len > 0) && *s1 && (*s1 == *end_of_s2) ) { - s1++; - end_of_s2--; - len --; - } - return (*s1 == '\0'); -} + // return 1 if s1 is a leading subset of s2 + int isSubset(const char * s1, const char * s2) + { + while ((*s1 == *s2) && *s1) { + s1++; + s2++; + } + return (*s1 == '\0'); + } -// convert null terminated string to all caps using encoding -void enmkallcap(char * d, const char * p, const char * encoding) -{ - struct cs_info * csconv = get_current_cs(encoding); - while (*p != '\0') { - *d++ = csconv[((unsigned char) *p)].cupper; - p++; - } - *d = '\0'; -} + // return 1 if s1 (reversed) is a leading subset of end of s2 + int isRevSubset(const char * s1, const char * end_of_s2, int len) + { + while ((len > 0) && *s1 && (*s1 == *end_of_s2)) { + s1++; + end_of_s2--; + len--; + } + return (*s1 == '\0'); + } + // convert null terminated string to all caps using encoding + void enmkallcap(char * d, const char * p, const char * encoding) + + { + struct cs_info * csconv = get_current_cs(encoding); + while (*p != '\0') { + *d++ = csconv[((unsigned char) *p)].cupper; + p++; + } + *d = '\0'; + } -// convert null terminated string to all little using encoding -void enmkallsmall(char * d, const char * p, const char * encoding) -{ - struct cs_info * csconv = get_current_cs(encoding); - while (*p != '\0') { - *d++ = csconv[((unsigned char) *p)].clower; - p++; - } - *d = '\0'; -} + // append s to ends of every lines in text + void strlinecat(char * dest, const char * s) + { + char * dup = mystrdup(dest); + char * source = dup; + int len = strlen(s); + while (*source) { + if (*source == '\n') { + strncpy(dest, s, len); + dest += len; + } + *dest = *source; + source++; dest++; + } + strcpy(dest, s); + free(dup); + } -// convert null terminated string to have intial capital using encoding -void enmkinitcap(char * d, const char * p, const char * encoding) -{ - struct cs_info 
* csconv = get_current_cs(encoding); - memcpy(d,p,(strlen(p)+1)); - if (*p != '\0') *d= csconv[((unsigned char)*p)].cupper; +// break text to lines +// return number of lines +int line_tok(const char * text, char *** lines) { + int linenum = 0; + char * dup = mystrdup(text); + char * p = dup; + while ((p = strchr(p, '\n'))) { + linenum++; + *p = '\0'; + p++; + if (*p == '\0') break; + } + *lines = (char **) calloc(linenum + 1, sizeof(char *)); + if (!(*lines)) return -1; + + p = dup; + for (int i = 0; i < linenum + 1; i++) { + (*lines)[i] = mystrdup(p); + p += strlen(p) + 1; + } + free(dup); + return linenum; } +// uniq line in place +char * line_uniq(char * text) { + char ** lines; + char linenum = line_tok(text, &lines); + int i; + strcpy(text, lines[0]); + for ( i = 1; i<=linenum; i++ ) { + int dup = 0; + for (int j = 0; j < i; j++) { + if (strcmp(lines[i], lines[j]) == 0) dup = 1; + } + if (!dup) { + if ((i > 1) || (*(lines[0]) != '\0')) strcat(text, "\n"); + strcat(text, lines[i]); + } + } + for ( i = 0; i<=linenum; i++ ) { + if (lines[i]) free(lines[i]); + } + if (lines) free(lines); + return text; +} -// convert null terminated string to all caps -void mkallcap(char * p, const struct cs_info * csconv) -{ - while (*p != '\0') { - *p = csconv[((unsigned char) *p)].cupper; - p++; - } +// change \n to char c +char * line_join(char * text, char c) { + char * p; + for (p = text; *p; p++) if (*p == '\n') *p = c; + return text; } +// leave only last {[^}]*} substring for handling zero morphemes +char * delete_zeros(char * morphout) { + char * p = morphout; + char * q = p; + char * q2 = NULL; + int suffix = 0; + + for (;*p && *(p+1);) { + switch (*p) { + case '{': + q2 = q; + q--; + break; + case '}': + if (q2) { + suffix = 1; + q--; + } + break; + default: + if (suffix) { + q = q2; + } + suffix = 0; + *q = *p; + } + p++; + q++; + } + *q = '\0'; + return morphout; +} -// convert null terminated string to all little -void mkallsmall(char * p, const struct cs_info * 
csconv) -{ - while (*p != '\0') { - *p = csconv[((unsigned char) *p)].clower; - p++; - } +char * mystrrep(char * word, const char * pat, const char * rep) { + char * pos = strstr(word, pat); + if (pos) { + int replen = strlen(rep); + int patlen = strlen(pat); + if (replen < patlen) { + char * end = word + strlen(word); + char * next = pos + replen; + char * prev = pos + strlen(pat); + for (; prev < end; *next = *prev, prev++, next++); + *next = '\0'; + } else if (replen > patlen) { + char * end = pos + patlen; + char * next = word + strlen(word) + replen - patlen; + char * prev = next - replen + patlen; + for (; prev >= end; *next = *prev, prev--, next--); + } + strncpy(pos, rep, replen); + } + return word; } + // convert null terminated string to all little using encoding + void enmkallsmall(char * d, const char * p, const char * encoding) + { + struct cs_info * csconv = get_current_cs(encoding); + while (*p != '\0') { + *d++ = csconv[((unsigned char) *p)].clower; + p++; + } + *d = '\0'; + } + // convert null terminated string to have intial capital using encoding + void enmkinitcap(char * d, const char * p, const char * encoding) + { + struct cs_info * csconv = get_current_cs(encoding); + memcpy(d,p,(strlen(p)+1)); + if (*p != '\0') *d= csconv[((unsigned char)*p)].cupper; + } + + + // convert null terminated string to all caps + void mkallcap(char * p, const struct cs_info * csconv) + { + while (*p != '\0') { + *p = csconv[((unsigned char) *p)].cupper; + p++; + } + } + + + // convert null terminated string to all little + void mkallsmall(char * p, const struct cs_info * csconv) + { + while (*p != '\0') { + *p = csconv[((unsigned char) *p)].clower; + p++; + } + } -// convert null terminated string to have intial capital -void mkinitcap(char * p, const struct cs_info * csconv) -{ - if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; +void mkallsmall_utf(w_char * u, int nc, struct unicode_info2 * utfconv) { + for (int i = 0; i < nc; i++) { + unsigned short idx 
= (u[i].h << 8) + u[i].l; + if (idx != utfconv[idx].clower) { + u[i].h = (unsigned char) (utfconv[idx].clower >> 8); + u[i].l = (unsigned char) (utfconv[idx].clower & 0x00FF); + } + } } + + // convert null terminated string to have intial capital + void mkinitcap(char * p, const struct cs_info * csconv) + { + if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; + } + // reverse word + void reverseword(char * word) { + char r; + for (char * dest = word + strlen(word) - 1; word < dest; word++, dest--) { + r=*word; + *word = *dest; + *dest = r; + } + } - + // reverse word + void reverseword_utf(char * word) { + w_char w[MAXWORDLEN]; + w_char * p; + w_char r; + int l = u8_u16(w, MAXWORDLEN, word); + p = w; + for (w_char * dest = w + l - 1; p < dest; p++, dest--) { + r=*p; + *p = *dest; + *dest = r; + } + u16_u8(word, MAXWORDUTF8LEN, w, l); + } // these are simple character mappings for the // encodings supported @@ -3029,7 +3348,7 @@ struct cs_info koi8r_tbl[] = { { 0x01, 0xdf, 0xff }, }; -struct cs_info cp1251_tbl[] = { +struct cs_info koi8u_tbl[] = { { 0x00, 0x00, 0x00 }, { 0x00, 0x01, 0x01 }, { 0x00, 0x02, 0x02 }, @@ -3193,27 +3512,27 @@ struct cs_info cp1251_tbl[] = { { 0x00, 0xa0, 0xa0 }, { 0x00, 0xa1, 0xa1 }, { 0x00, 0xa2, 0xa2 }, -{ 0x00, 0xa3, 0xa3 }, -{ 0x00, 0xa4, 0xa4 }, +{ 0x00, 0xa3, 0xb3 }, +{ 0x00, 0xa4, 0xb4 }, /* ie */ { 0x00, 0xa5, 0xa5 }, -{ 0x00, 0xa6, 0xa6 }, -{ 0x00, 0xa7, 0xa7 }, +{ 0x00, 0xa6, 0xb6 }, /* i */ +{ 0x00, 0xa7, 0xb7 }, /* ii */ { 0x00, 0xa8, 0xa8 }, { 0x00, 0xa9, 0xa9 }, { 0x00, 0xaa, 0xaa }, { 0x00, 0xab, 0xab }, { 0x00, 0xac, 0xac }, -{ 0x00, 0xad, 0xad }, +{ 0x00, 0xad, 0xbd }, /* g'' */ { 0x00, 0xae, 0xae }, { 0x00, 0xaf, 0xaf }, { 0x00, 0xb0, 0xb0 }, { 0x00, 0xb1, 0xb1 }, { 0x00, 0xb2, 0xb2 }, -{ 0x00, 0xb3, 0xb3 }, -{ 0x00, 0xb4, 0xb4 }, +{ 0x01, 0xa3, 0xb3 }, +{ 0x00, 0xb4, 0xb4 }, /* IE */ { 0x00, 0xb5, 0xb5 }, -{ 0x00, 0xb6, 0xb6 }, -{ 0x00, 0xb7, 0xb7 }, +{ 0x00, 0xb6, 0xb6 }, /* I */ +{ 0x00, 0xb7, 0xb7 }, /* II */ { 
0x00, 0xb8, 0xb8 }, { 0x00, 0xb9, 0xb9 }, { 0x00, 0xba, 0xba }, @@ -3222,72 +3541,591 @@ struct cs_info cp1251_tbl[] = { { 0x00, 0xbd, 0xbd }, { 0x00, 0xbe, 0xbe }, { 0x00, 0xbf, 0xbf }, -{ 0x00, 0xc0, 0xc0 }, -{ 0x00, 0xc1, 0xc1 }, -{ 0x00, 0xc2, 0xc2 }, -{ 0x00, 0xc3, 0xc3 }, -{ 0x00, 0xc4, 0xc4 }, -{ 0x00, 0xc5, 0xc5 }, -{ 0x00, 0xc6, 0xc6 }, -{ 0x00, 0xc7, 0xc7 }, -{ 0x00, 0xc8, 0xc8 }, -{ 0x00, 0xc9, 0xc9 }, -{ 0x00, 0xca, 0xca }, -{ 0x00, 0xcb, 0xcb }, -{ 0x00, 0xcc, 0xcc }, -{ 0x00, 0xcd, 0xcd }, -{ 0x00, 0xce, 0xce }, -{ 0x00, 0xcf, 0xcf }, -{ 0x00, 0xd0, 0xd0 }, -{ 0x00, 0xd1, 0xd1 }, -{ 0x00, 0xd2, 0xd2 }, -{ 0x00, 0xd3, 0xd3 }, -{ 0x00, 0xd4, 0xd4 }, -{ 0x00, 0xd5, 0xd5 }, -{ 0x00, 0xd6, 0xd6 }, -{ 0x00, 0xd7, 0xd7 }, -{ 0x00, 0xd8, 0xd8 }, -{ 0x00, 0xd9, 0xd9 }, -{ 0x00, 0xda, 0xda }, -{ 0x00, 0xdb, 0xdb }, -{ 0x00, 0xdc, 0xdc }, -{ 0x00, 0xdd, 0xdd }, -{ 0x00, 0xde, 0xde }, -{ 0x00, 0xdf, 0xdf }, -{ 0x00, 0xe0, 0xe0 }, -{ 0x00, 0xe1, 0xe1 }, -{ 0x00, 0xe2, 0xe2 }, -{ 0x00, 0xe3, 0xe3 }, -{ 0x00, 0xe4, 0xe4 }, -{ 0x00, 0xe5, 0xe5 }, -{ 0x00, 0xe6, 0xe6 }, -{ 0x00, 0xe7, 0xe7 }, -{ 0x00, 0xe8, 0xe8 }, -{ 0x00, 0xe9, 0xe9 }, -{ 0x00, 0xea, 0xea }, -{ 0x00, 0xeb, 0xeb }, -{ 0x00, 0xec, 0xec }, -{ 0x00, 0xed, 0xed }, -{ 0x00, 0xee, 0xee }, -{ 0x00, 0xef, 0xef }, -{ 0x00, 0xf0, 0xf0 }, -{ 0x00, 0xf1, 0xf1 }, -{ 0x00, 0xf2, 0xf2 }, -{ 0x00, 0xf3, 0xf3 }, -{ 0x00, 0xf4, 0xf4 }, -{ 0x00, 0xf5, 0xf5 }, -{ 0x00, 0xf6, 0xf6 }, -{ 0x00, 0xf7, 0xf7 }, -{ 0x00, 0xf8, 0xf8 }, -{ 0x00, 0xf9, 0xf9 }, -{ 0x00, 0xfa, 0xfa }, -{ 0x00, 0xfb, 0xfb }, -{ 0x00, 0xfc, 0xfc }, -{ 0x00, 0xfd, 0xfd }, -{ 0x00, 0xfe, 0xfe }, -{ 0x00, 0xff, 0xff }, +{ 0x00, 0xc0, 0xe0 }, +{ 0x00, 0xc1, 0xe1 }, +{ 0x00, 0xc2, 0xe2 }, +{ 0x00, 0xc3, 0xe3 }, +{ 0x00, 0xc4, 0xe4 }, +{ 0x00, 0xc5, 0xe5 }, +{ 0x00, 0xc6, 0xe6 }, +{ 0x00, 0xc7, 0xe7 }, +{ 0x00, 0xc8, 0xe8 }, +{ 0x00, 0xc9, 0xe9 }, +{ 0x00, 0xca, 0xea }, +{ 0x00, 0xcb, 0xeb }, +{ 0x00, 0xcc, 0xec }, +{ 0x00, 0xcd, 0xed }, +{ 0x00, 0xce, 
0xee }, +{ 0x00, 0xcf, 0xef }, +{ 0x00, 0xd0, 0xf0 }, +{ 0x00, 0xd1, 0xf1 }, +{ 0x00, 0xd2, 0xf2 }, +{ 0x00, 0xd3, 0xf3 }, +{ 0x00, 0xd4, 0xf4 }, +{ 0x00, 0xd5, 0xf5 }, +{ 0x00, 0xd6, 0xf6 }, +{ 0x00, 0xd7, 0xf7 }, +{ 0x00, 0xd8, 0xf8 }, +{ 0x00, 0xd9, 0xf9 }, +{ 0x00, 0xda, 0xfa }, +{ 0x00, 0xdb, 0xfb }, +{ 0x00, 0xdc, 0xfc }, +{ 0x00, 0xdd, 0xfd }, +{ 0x00, 0xde, 0xfe }, +{ 0x00, 0xdf, 0xff }, +{ 0x01, 0xc0, 0xe0 }, +{ 0x01, 0xc1, 0xe1 }, +{ 0x01, 0xc2, 0xe2 }, +{ 0x01, 0xc3, 0xe3 }, +{ 0x01, 0xc4, 0xe4 }, +{ 0x01, 0xc5, 0xe5 }, +{ 0x01, 0xc6, 0xe6 }, +{ 0x01, 0xc7, 0xe7 }, +{ 0x01, 0xc8, 0xe8 }, +{ 0x01, 0xc9, 0xe9 }, +{ 0x01, 0xca, 0xea }, +{ 0x01, 0xcb, 0xeb }, +{ 0x01, 0xcc, 0xec }, +{ 0x01, 0xcd, 0xed }, +{ 0x01, 0xce, 0xee }, +{ 0x01, 0xcf, 0xef }, +{ 0x01, 0xd0, 0xf0 }, +{ 0x01, 0xd1, 0xf1 }, +{ 0x01, 0xd2, 0xf2 }, +{ 0x01, 0xd3, 0xf3 }, +{ 0x01, 0xd4, 0xf4 }, +{ 0x01, 0xd5, 0xf5 }, +{ 0x01, 0xd6, 0xf6 }, +{ 0x01, 0xd7, 0xf7 }, +{ 0x01, 0xd8, 0xf8 }, +{ 0x01, 0xd9, 0xf9 }, +{ 0x01, 0xda, 0xfa }, +{ 0x01, 0xdb, 0xfb }, +{ 0x01, 0xdc, 0xfc }, +{ 0x01, 0xdd, 0xfd }, +{ 0x01, 0xde, 0xfe }, +{ 0x01, 0xdf, 0xff }, +}; + +struct cs_info cp1251_tbl[] = { +{ 0x00, 0x00, 0x00 }, +{ 0x00, 0x01, 0x01 }, +{ 0x00, 0x02, 0x02 }, +{ 0x00, 0x03, 0x03 }, +{ 0x00, 0x04, 0x04 }, +{ 0x00, 0x05, 0x05 }, +{ 0x00, 0x06, 0x06 }, +{ 0x00, 0x07, 0x07 }, +{ 0x00, 0x08, 0x08 }, +{ 0x00, 0x09, 0x09 }, +{ 0x00, 0x0a, 0x0a }, +{ 0x00, 0x0b, 0x0b }, +{ 0x00, 0x0c, 0x0c }, +{ 0x00, 0x0d, 0x0d }, +{ 0x00, 0x0e, 0x0e }, +{ 0x00, 0x0f, 0x0f }, +{ 0x00, 0x10, 0x10 }, +{ 0x00, 0x11, 0x11 }, +{ 0x00, 0x12, 0x12 }, +{ 0x00, 0x13, 0x13 }, +{ 0x00, 0x14, 0x14 }, +{ 0x00, 0x15, 0x15 }, +{ 0x00, 0x16, 0x16 }, +{ 0x00, 0x17, 0x17 }, +{ 0x00, 0x18, 0x18 }, +{ 0x00, 0x19, 0x19 }, +{ 0x00, 0x1a, 0x1a }, +{ 0x00, 0x1b, 0x1b }, +{ 0x00, 0x1c, 0x1c }, +{ 0x00, 0x1d, 0x1d }, +{ 0x00, 0x1e, 0x1e }, +{ 0x00, 0x1f, 0x1f }, +{ 0x00, 0x20, 0x20 }, +{ 0x00, 0x21, 0x21 }, +{ 0x00, 0x22, 0x22 }, +{ 0x00, 0x23, 0x23 
}, +{ 0x00, 0x24, 0x24 }, +{ 0x00, 0x25, 0x25 }, +{ 0x00, 0x26, 0x26 }, +{ 0x00, 0x27, 0x27 }, +{ 0x00, 0x28, 0x28 }, +{ 0x00, 0x29, 0x29 }, +{ 0x00, 0x2a, 0x2a }, +{ 0x00, 0x2b, 0x2b }, +{ 0x00, 0x2c, 0x2c }, +{ 0x00, 0x2d, 0x2d }, +{ 0x00, 0x2e, 0x2e }, +{ 0x00, 0x2f, 0x2f }, +{ 0x00, 0x30, 0x30 }, +{ 0x00, 0x31, 0x31 }, +{ 0x00, 0x32, 0x32 }, +{ 0x00, 0x33, 0x33 }, +{ 0x00, 0x34, 0x34 }, +{ 0x00, 0x35, 0x35 }, +{ 0x00, 0x36, 0x36 }, +{ 0x00, 0x37, 0x37 }, +{ 0x00, 0x38, 0x38 }, +{ 0x00, 0x39, 0x39 }, +{ 0x00, 0x3a, 0x3a }, +{ 0x00, 0x3b, 0x3b }, +{ 0x00, 0x3c, 0x3c }, +{ 0x00, 0x3d, 0x3d }, +{ 0x00, 0x3e, 0x3e }, +{ 0x00, 0x3f, 0x3f }, +{ 0x00, 0x40, 0x40 }, +{ 0x01, 0x61, 0x41 }, +{ 0x01, 0x62, 0x42 }, +{ 0x01, 0x63, 0x43 }, +{ 0x01, 0x64, 0x44 }, +{ 0x01, 0x65, 0x45 }, +{ 0x01, 0x66, 0x46 }, +{ 0x01, 0x67, 0x47 }, +{ 0x01, 0x68, 0x48 }, +{ 0x01, 0x69, 0x49 }, +{ 0x01, 0x6a, 0x4a }, +{ 0x01, 0x6b, 0x4b }, +{ 0x01, 0x6c, 0x4c }, +{ 0x01, 0x6d, 0x4d }, +{ 0x01, 0x6e, 0x4e }, +{ 0x01, 0x6f, 0x4f }, +{ 0x01, 0x70, 0x50 }, +{ 0x01, 0x71, 0x51 }, +{ 0x01, 0x72, 0x52 }, +{ 0x01, 0x73, 0x53 }, +{ 0x01, 0x74, 0x54 }, +{ 0x01, 0x75, 0x55 }, +{ 0x01, 0x76, 0x56 }, +{ 0x01, 0x77, 0x57 }, +{ 0x01, 0x78, 0x58 }, +{ 0x01, 0x79, 0x59 }, +{ 0x01, 0x7a, 0x5a }, +{ 0x00, 0x5b, 0x5b }, +{ 0x00, 0x5c, 0x5c }, +{ 0x00, 0x5d, 0x5d }, +{ 0x00, 0x5e, 0x5e }, +{ 0x00, 0x5f, 0x5f }, +{ 0x00, 0x60, 0x60 }, +{ 0x00, 0x61, 0x41 }, +{ 0x00, 0x62, 0x42 }, +{ 0x00, 0x63, 0x43 }, +{ 0x00, 0x64, 0x44 }, +{ 0x00, 0x65, 0x45 }, +{ 0x00, 0x66, 0x46 }, +{ 0x00, 0x67, 0x47 }, +{ 0x00, 0x68, 0x48 }, +{ 0x00, 0x69, 0x49 }, +{ 0x00, 0x6a, 0x4a }, +{ 0x00, 0x6b, 0x4b }, +{ 0x00, 0x6c, 0x4c }, +{ 0x00, 0x6d, 0x4d }, +{ 0x00, 0x6e, 0x4e }, +{ 0x00, 0x6f, 0x4f }, +{ 0x00, 0x70, 0x50 }, +{ 0x00, 0x71, 0x51 }, +{ 0x00, 0x72, 0x52 }, +{ 0x00, 0x73, 0x53 }, +{ 0x00, 0x74, 0x54 }, +{ 0x00, 0x75, 0x55 }, +{ 0x00, 0x76, 0x56 }, +{ 0x00, 0x77, 0x57 }, +{ 0x00, 0x78, 0x58 }, +{ 0x00, 0x79, 0x59 }, +{ 0x00, 0x7a, 
0x5a }, +{ 0x00, 0x7b, 0x7b }, +{ 0x00, 0x7c, 0x7c }, +{ 0x00, 0x7d, 0x7d }, +{ 0x00, 0x7e, 0x7e }, +{ 0x00, 0x7f, 0x7f }, +{ 0x01, 0x90, 0x80 }, +{ 0x01, 0x83, 0x81 }, +{ 0x00, 0x82, 0x82 }, +{ 0x00, 0x83, 0x81 }, +{ 0x00, 0x84, 0x84 }, +{ 0x00, 0x85, 0x85 }, +{ 0x00, 0x86, 0x86 }, +{ 0x00, 0x87, 0x87 }, +{ 0x00, 0x88, 0x88 }, +{ 0x00, 0x89, 0x89 }, +{ 0x01, 0x9a, 0x8a }, +{ 0x00, 0x8b, 0x8b }, +{ 0x01, 0x9c, 0x8c }, +{ 0x01, 0x9d, 0x8d }, +{ 0x01, 0x9e, 0x8e }, +{ 0x01, 0x9f, 0x8f }, +{ 0x00, 0x90, 0x80 }, +{ 0x00, 0x91, 0x91 }, +{ 0x00, 0x92, 0x92 }, +{ 0x00, 0x93, 0x93 }, +{ 0x00, 0x94, 0x94 }, +{ 0x00, 0x95, 0x95 }, +{ 0x00, 0x96, 0x96 }, +{ 0x00, 0x97, 0x97 }, +{ 0x00, 0x98, 0x98 }, +{ 0x00, 0x99, 0x99 }, +{ 0x00, 0x9a, 0x8a }, +{ 0x00, 0x9b, 0x9b }, +{ 0x00, 0x9c, 0x8c }, +{ 0x00, 0x9d, 0x8d }, +{ 0x00, 0x9e, 0x8e }, +{ 0x00, 0x9f, 0x8f }, +{ 0x00, 0xa0, 0xa0 }, +{ 0x01, 0xa2, 0xa1 }, +{ 0x00, 0xa2, 0xa1 }, +{ 0x01, 0xbc, 0xa3 }, +{ 0x00, 0xa4, 0xa4 }, +{ 0x01, 0xb4, 0xa5 }, +{ 0x00, 0xa6, 0xa6 }, +{ 0x00, 0xa7, 0xa7 }, +{ 0x01, 0xb8, 0xa8 }, +{ 0x00, 0xa9, 0xa9 }, +{ 0x01, 0xba, 0xaa }, +{ 0x00, 0xab, 0xab }, +{ 0x00, 0xac, 0xac }, +{ 0x00, 0xad, 0xad }, +{ 0x00, 0xae, 0xae }, +{ 0x01, 0xbf, 0xaf }, +{ 0x00, 0xb0, 0xb0 }, +{ 0x00, 0xb1, 0xb1 }, +{ 0x01, 0xb3, 0xb2 }, +{ 0x00, 0xb3, 0xb2 }, +{ 0x00, 0xb4, 0xa5 }, +{ 0x00, 0xb5, 0xb5 }, +{ 0x00, 0xb6, 0xb6 }, +{ 0x00, 0xb7, 0xb7 }, +{ 0x00, 0xb8, 0xa8 }, +{ 0x00, 0xb9, 0xb9 }, +{ 0x00, 0xba, 0xaa }, +{ 0x00, 0xbb, 0xbb }, +{ 0x00, 0xbc, 0xa3 }, +{ 0x01, 0xbe, 0xbd }, +{ 0x00, 0xbe, 0xbd }, +{ 0x00, 0xbf, 0xaf }, +{ 0x01, 0xe0, 0xc0 }, +{ 0x01, 0xe1, 0xc1 }, +{ 0x01, 0xe2, 0xc2 }, +{ 0x01, 0xe3, 0xc3 }, +{ 0x01, 0xe4, 0xc4 }, +{ 0x01, 0xe5, 0xc5 }, +{ 0x01, 0xe6, 0xc6 }, +{ 0x01, 0xe7, 0xc7 }, +{ 0x01, 0xe8, 0xc8 }, +{ 0x01, 0xe9, 0xc9 }, +{ 0x01, 0xea, 0xca }, +{ 0x01, 0xeb, 0xcb }, +{ 0x01, 0xec, 0xcc }, +{ 0x01, 0xed, 0xcd }, +{ 0x01, 0xee, 0xce }, +{ 0x01, 0xef, 0xcf }, +{ 0x01, 0xf0, 0xd0 }, +{ 0x01, 
0xf1, 0xd1 }, +{ 0x01, 0xf2, 0xd2 }, +{ 0x01, 0xf3, 0xd3 }, +{ 0x01, 0xf4, 0xd4 }, +{ 0x01, 0xf5, 0xd5 }, +{ 0x01, 0xf6, 0xd6 }, +{ 0x01, 0xf7, 0xd7 }, +{ 0x01, 0xf8, 0xd8 }, +{ 0x01, 0xf9, 0xd9 }, +{ 0x01, 0xfa, 0xda }, +{ 0x01, 0xfb, 0xdb }, +{ 0x01, 0xfc, 0xdc }, +{ 0x01, 0xfd, 0xdd }, +{ 0x01, 0xfe, 0xde }, +{ 0x01, 0xff, 0xdf }, +{ 0x00, 0xe0, 0xc0 }, +{ 0x00, 0xe1, 0xc1 }, +{ 0x00, 0xe2, 0xc2 }, +{ 0x00, 0xe3, 0xc3 }, +{ 0x00, 0xe4, 0xc4 }, +{ 0x00, 0xe5, 0xc5 }, +{ 0x00, 0xe6, 0xc6 }, +{ 0x00, 0xe7, 0xc7 }, +{ 0x00, 0xe8, 0xc8 }, +{ 0x00, 0xe9, 0xc9 }, +{ 0x00, 0xea, 0xca }, +{ 0x00, 0xeb, 0xcb }, +{ 0x00, 0xec, 0xcc }, +{ 0x00, 0xed, 0xcd }, +{ 0x00, 0xee, 0xce }, +{ 0x00, 0xef, 0xcf }, +{ 0x00, 0xf0, 0xd0 }, +{ 0x00, 0xf1, 0xd1 }, +{ 0x00, 0xf2, 0xd2 }, +{ 0x00, 0xf3, 0xd3 }, +{ 0x00, 0xf4, 0xd4 }, +{ 0x00, 0xf5, 0xd5 }, +{ 0x00, 0xf6, 0xd6 }, +{ 0x00, 0xf7, 0xd7 }, +{ 0x00, 0xf8, 0xd8 }, +{ 0x00, 0xf9, 0xd9 }, +{ 0x00, 0xfa, 0xda }, +{ 0x00, 0xfb, 0xdb }, +{ 0x00, 0xfc, 0xdc }, +{ 0x00, 0xfd, 0xdd }, +{ 0x00, 0xfe, 0xde }, +{ 0x00, 0xff, 0xdf }, }; +struct cs_info iso13_tbl[] = { +{ 0x00, 0x00, 0x00 }, +{ 0x00, 0x01, 0x01 }, +{ 0x00, 0x02, 0x02 }, +{ 0x00, 0x03, 0x03 }, +{ 0x00, 0x04, 0x04 }, +{ 0x00, 0x05, 0x05 }, +{ 0x00, 0x06, 0x06 }, +{ 0x00, 0x07, 0x07 }, +{ 0x00, 0x08, 0x08 }, +{ 0x00, 0x09, 0x09 }, +{ 0x00, 0x0A, 0x0A }, +{ 0x00, 0x0B, 0x0B }, +{ 0x00, 0x0C, 0x0C }, +{ 0x00, 0x0D, 0x0D }, +{ 0x00, 0x0E, 0x0E }, +{ 0x00, 0x0F, 0x0F }, +{ 0x00, 0x10, 0x10 }, +{ 0x00, 0x11, 0x11 }, +{ 0x00, 0x12, 0x12 }, +{ 0x00, 0x13, 0x13 }, +{ 0x00, 0x14, 0x14 }, +{ 0x00, 0x15, 0x15 }, +{ 0x00, 0x16, 0x16 }, +{ 0x00, 0x17, 0x17 }, +{ 0x00, 0x18, 0x18 }, +{ 0x00, 0x19, 0x19 }, +{ 0x00, 0x1A, 0x1A }, +{ 0x00, 0x1B, 0x1B }, +{ 0x00, 0x1C, 0x1C }, +{ 0x00, 0x1D, 0x1D }, +{ 0x00, 0x1E, 0x1E }, +{ 0x00, 0x1F, 0x1F }, +{ 0x00, 0x20, 0x20 }, +{ 0x00, 0x21, 0x21 }, +{ 0x00, 0x22, 0x22 }, +{ 0x00, 0x23, 0x23 }, +{ 0x00, 0x24, 0x24 }, +{ 0x00, 0x25, 0x25 }, +{ 0x00, 0x26, 
0x26 }, +{ 0x00, 0x27, 0x27 }, +{ 0x00, 0x28, 0x28 }, +{ 0x00, 0x29, 0x29 }, +{ 0x00, 0x2A, 0x2A }, +{ 0x00, 0x2B, 0x2B }, +{ 0x00, 0x2C, 0x2C }, +{ 0x00, 0x2D, 0x2D }, +{ 0x00, 0x2E, 0x2E }, +{ 0x00, 0x2F, 0x2F }, +{ 0x00, 0x30, 0x30 }, +{ 0x00, 0x31, 0x31 }, +{ 0x00, 0x32, 0x32 }, +{ 0x00, 0x33, 0x33 }, +{ 0x00, 0x34, 0x34 }, +{ 0x00, 0x35, 0x35 }, +{ 0x00, 0x36, 0x36 }, +{ 0x00, 0x37, 0x37 }, +{ 0x00, 0x38, 0x38 }, +{ 0x00, 0x39, 0x39 }, +{ 0x00, 0x3A, 0x3A }, +{ 0x00, 0x3B, 0x3B }, +{ 0x00, 0x3C, 0x3C }, +{ 0x00, 0x3D, 0x3D }, +{ 0x00, 0x3E, 0x3E }, +{ 0x00, 0x3F, 0x3F }, +{ 0x00, 0x40, 0x40 }, +{ 0x01, 0x61, 0x41 }, +{ 0x01, 0x62, 0x42 }, +{ 0x01, 0x63, 0x43 }, +{ 0x01, 0x64, 0x44 }, +{ 0x01, 0x65, 0x45 }, +{ 0x01, 0x66, 0x46 }, +{ 0x01, 0x67, 0x47 }, +{ 0x01, 0x68, 0x48 }, +{ 0x01, 0x69, 0x49 }, +{ 0x01, 0x6A, 0x4A }, +{ 0x01, 0x6B, 0x4B }, +{ 0x01, 0x6C, 0x4C }, +{ 0x01, 0x6D, 0x4D }, +{ 0x01, 0x6E, 0x4E }, +{ 0x01, 0x6F, 0x4F }, +{ 0x01, 0x70, 0x50 }, +{ 0x01, 0x71, 0x51 }, +{ 0x01, 0x72, 0x52 }, +{ 0x01, 0x73, 0x53 }, +{ 0x01, 0x74, 0x54 }, +{ 0x01, 0x75, 0x55 }, +{ 0x01, 0x76, 0x56 }, +{ 0x01, 0x77, 0x57 }, +{ 0x01, 0x78, 0x58 }, +{ 0x01, 0x79, 0x59 }, +{ 0x01, 0x7A, 0x5A }, +{ 0x00, 0x5B, 0x5B }, +{ 0x00, 0x5C, 0x5C }, +{ 0x00, 0x5D, 0x5D }, +{ 0x00, 0x5E, 0x5E }, +{ 0x00, 0x5F, 0x5F }, +{ 0x00, 0x60, 0x60 }, +{ 0x00, 0x61, 0x41 }, +{ 0x00, 0x62, 0x42 }, +{ 0x00, 0x63, 0x43 }, +{ 0x00, 0x64, 0x44 }, +{ 0x00, 0x65, 0x45 }, +{ 0x00, 0x66, 0x46 }, +{ 0x00, 0x67, 0x47 }, +{ 0x00, 0x68, 0x48 }, +{ 0x00, 0x69, 0x49 }, +{ 0x00, 0x6A, 0x4A }, +{ 0x00, 0x6B, 0x4B }, +{ 0x00, 0x6C, 0x4C }, +{ 0x00, 0x6D, 0x4D }, +{ 0x00, 0x6E, 0x4E }, +{ 0x00, 0x6F, 0x4F }, +{ 0x00, 0x70, 0x50 }, +{ 0x00, 0x71, 0x51 }, +{ 0x00, 0x72, 0x52 }, +{ 0x00, 0x73, 0x53 }, +{ 0x00, 0x74, 0x54 }, +{ 0x00, 0x75, 0x55 }, +{ 0x00, 0x76, 0x56 }, +{ 0x00, 0x77, 0x57 }, +{ 0x00, 0x78, 0x58 }, +{ 0x00, 0x79, 0x59 }, +{ 0x00, 0x7A, 0x5A }, +{ 0x00, 0x7B, 0x7B }, +{ 0x00, 0x7C, 0x7C }, +{ 0x00, 
0x7D, 0x7D }, +{ 0x00, 0x7E, 0x7E }, +{ 0x00, 0x7F, 0x7F }, +{ 0x00, 0x80, 0x80 }, +{ 0x00, 0x81, 0x81 }, +{ 0x00, 0x82, 0x82 }, +{ 0x00, 0x83, 0x83 }, +{ 0x00, 0x84, 0x84 }, +{ 0x00, 0x85, 0x85 }, +{ 0x00, 0x86, 0x86 }, +{ 0x00, 0x87, 0x87 }, +{ 0x00, 0x88, 0x88 }, +{ 0x00, 0x89, 0x89 }, +{ 0x00, 0x8A, 0x8A }, +{ 0x00, 0x8B, 0x8B }, +{ 0x00, 0x8C, 0x8C }, +{ 0x00, 0x8D, 0x8D }, +{ 0x00, 0x8E, 0x8E }, +{ 0x00, 0x8F, 0x8F }, +{ 0x00, 0x90, 0x90 }, +{ 0x00, 0x91, 0x91 }, +{ 0x00, 0x92, 0x92 }, +{ 0x00, 0x93, 0x93 }, +{ 0x00, 0x94, 0x94 }, +{ 0x00, 0x95, 0x95 }, +{ 0x00, 0x96, 0x96 }, +{ 0x00, 0x97, 0x97 }, +{ 0x00, 0x98, 0x98 }, +{ 0x00, 0x99, 0x99 }, +{ 0x00, 0x9A, 0x9A }, +{ 0x00, 0x9B, 0x9B }, +{ 0x00, 0x9C, 0x9C }, +{ 0x00, 0x9D, 0x9D }, +{ 0x00, 0x9E, 0x9E }, +{ 0x00, 0x9F, 0x9F }, +{ 0x00, 0xA0, 0xA0 }, +{ 0x00, 0xA1, 0xA1 }, +{ 0x00, 0xA2, 0xA2 }, +{ 0x00, 0xA3, 0xA3 }, +{ 0x00, 0xA4, 0xA4 }, +{ 0x00, 0xA5, 0xA5 }, +{ 0x00, 0xA6, 0xA6 }, +{ 0x00, 0xA7, 0xA7 }, +{ 0x01, 0xB8, 0xA8 }, +{ 0x00, 0xA9, 0xA9 }, +{ 0x01, 0xBA, 0xAA }, +{ 0x00, 0xAB, 0xAB }, +{ 0x00, 0xAC, 0xAC }, +{ 0x00, 0xAD, 0xAD }, +{ 0x00, 0xAE, 0xAE }, +{ 0x01, 0xBF, 0xAF }, +{ 0x00, 0xB0, 0xB0 }, +{ 0x00, 0xB1, 0xB1 }, +{ 0x00, 0xB2, 0xB2 }, +{ 0x00, 0xB3, 0xB3 }, +{ 0x00, 0xB4, 0xB4 }, +{ 0x00, 0xB5, 0xB5 }, +{ 0x00, 0xB6, 0xB6 }, +{ 0x00, 0xB7, 0xB7 }, +{ 0x00, 0xB8, 0xA8 }, +{ 0x00, 0xB9, 0xB9 }, +{ 0x00, 0xBA, 0xAA }, +{ 0x00, 0xBB, 0xBB }, +{ 0x00, 0xBC, 0xBC }, +{ 0x00, 0xBD, 0xBD }, +{ 0x00, 0xBE, 0xBE }, +{ 0x00, 0xBF, 0xAF }, +{ 0x01, 0xE0, 0xC0 }, +{ 0x01, 0xE1, 0xC1 }, +{ 0x01, 0xE2, 0xC2 }, +{ 0x01, 0xE3, 0xC3 }, +{ 0x01, 0xE4, 0xC4 }, +{ 0x01, 0xE5, 0xC5 }, +{ 0x01, 0xE6, 0xC6 }, +{ 0x01, 0xE7, 0xC7 }, +{ 0x01, 0xE8, 0xC8 }, +{ 0x01, 0xE9, 0xC9 }, +{ 0x01, 0xEA, 0xCA }, +{ 0x01, 0xEB, 0xCB }, +{ 0x01, 0xEC, 0xCC }, +{ 0x01, 0xED, 0xCD }, +{ 0x01, 0xEE, 0xCE }, +{ 0x01, 0xEF, 0xCF }, +{ 0x01, 0xF0, 0xD0 }, +{ 0x01, 0xF1, 0xD1 }, +{ 0x01, 0xF2, 0xD2 }, +{ 0x01, 0xF3, 0xD3 }, +{ 
0x01, 0xF4, 0xD4 }, +{ 0x01, 0xF5, 0xD5 }, +{ 0x01, 0xF6, 0xD6 }, +{ 0x00, 0xD7, 0xD7 }, +{ 0x01, 0xF8, 0xD8 }, +{ 0x01, 0xF9, 0xD9 }, +{ 0x01, 0xFA, 0xDA }, +{ 0x01, 0xFB, 0xDB }, +{ 0x01, 0xFC, 0xDC }, +{ 0x01, 0xFD, 0xDD }, +{ 0x01, 0xFE, 0xDE }, +{ 0x00, 0xDF, 0xDF }, +{ 0x00, 0xE0, 0xC0 }, +{ 0x00, 0xE1, 0xC1 }, +{ 0x00, 0xE2, 0xC2 }, +{ 0x00, 0xE3, 0xC3 }, +{ 0x00, 0xE4, 0xC4 }, +{ 0x00, 0xE5, 0xC5 }, +{ 0x00, 0xE6, 0xC6 }, +{ 0x00, 0xE7, 0xC7 }, +{ 0x00, 0xE8, 0xC8 }, +{ 0x00, 0xE9, 0xC9 }, +{ 0x00, 0xEA, 0xCA }, +{ 0x00, 0xEB, 0xCB }, +{ 0x00, 0xEC, 0xCC }, +{ 0x00, 0xED, 0xCD }, +{ 0x00, 0xEE, 0xCE }, +{ 0x00, 0xEF, 0xCF }, +{ 0x00, 0xF0, 0xD0 }, +{ 0x00, 0xF1, 0xD1 }, +{ 0x00, 0xF2, 0xD2 }, +{ 0x00, 0xF3, 0xD3 }, +{ 0x00, 0xF4, 0xD4 }, +{ 0x00, 0xF5, 0xD5 }, +{ 0x00, 0xF6, 0xD6 }, +{ 0x00, 0xF7, 0xF7 }, +{ 0x00, 0xF8, 0xD8 }, +{ 0x00, 0xF9, 0xD9 }, +{ 0x00, 0xFA, 0xDA }, +{ 0x00, 0xFB, 0xDB }, +{ 0x00, 0xFC, 0xDC }, +{ 0x00, 0xFD, 0xDD }, +{ 0x00, 0xFE, 0xDE }, +{ 0x00, 0xFF, 0xFF }, +}; + + struct cs_info iso14_tbl[] = { { 0x00, 0x00, 0x00 }, { 0x00, 0x01, 0x01 }, @@ -3547,6 +4385,264 @@ struct cs_info iso14_tbl[] = { { 0x00, 0xff, 0xff }, }; +struct cs_info iso15_tbl[] = { +{ 0x00, 0x00, 0x00 }, +{ 0x00, 0x01, 0x01 }, +{ 0x00, 0x02, 0x02 }, +{ 0x00, 0x03, 0x03 }, +{ 0x00, 0x04, 0x04 }, +{ 0x00, 0x05, 0x05 }, +{ 0x00, 0x06, 0x06 }, +{ 0x00, 0x07, 0x07 }, +{ 0x00, 0x08, 0x08 }, +{ 0x00, 0x09, 0x09 }, +{ 0x00, 0x0a, 0x0a }, +{ 0x00, 0x0b, 0x0b }, +{ 0x00, 0x0c, 0x0c }, +{ 0x00, 0x0d, 0x0d }, +{ 0x00, 0x0e, 0x0e }, +{ 0x00, 0x0f, 0x0f }, +{ 0x00, 0x10, 0x10 }, +{ 0x00, 0x11, 0x11 }, +{ 0x00, 0x12, 0x12 }, +{ 0x00, 0x13, 0x13 }, +{ 0x00, 0x14, 0x14 }, +{ 0x00, 0x15, 0x15 }, +{ 0x00, 0x16, 0x16 }, +{ 0x00, 0x17, 0x17 }, +{ 0x00, 0x18, 0x18 }, +{ 0x00, 0x19, 0x19 }, +{ 0x00, 0x1a, 0x1a }, +{ 0x00, 0x1b, 0x1b }, +{ 0x00, 0x1c, 0x1c }, +{ 0x00, 0x1d, 0x1d }, +{ 0x00, 0x1e, 0x1e }, +{ 0x00, 0x1f, 0x1f }, +{ 0x00, 0x20, 0x20 }, +{ 0x00, 0x21, 0x21 }, +{ 0x00, 
0x22, 0x22 }, +{ 0x00, 0x23, 0x23 }, +{ 0x00, 0x24, 0x24 }, +{ 0x00, 0x25, 0x25 }, +{ 0x00, 0x26, 0x26 }, +{ 0x00, 0x27, 0x27 }, +{ 0x00, 0x28, 0x28 }, +{ 0x00, 0x29, 0x29 }, +{ 0x00, 0x2a, 0x2a }, +{ 0x00, 0x2b, 0x2b }, +{ 0x00, 0x2c, 0x2c }, +{ 0x00, 0x2d, 0x2d }, +{ 0x00, 0x2e, 0x2e }, +{ 0x00, 0x2f, 0x2f }, +{ 0x00, 0x30, 0x30 }, +{ 0x00, 0x31, 0x31 }, +{ 0x00, 0x32, 0x32 }, +{ 0x00, 0x33, 0x33 }, +{ 0x00, 0x34, 0x34 }, +{ 0x00, 0x35, 0x35 }, +{ 0x00, 0x36, 0x36 }, +{ 0x00, 0x37, 0x37 }, +{ 0x00, 0x38, 0x38 }, +{ 0x00, 0x39, 0x39 }, +{ 0x00, 0x3a, 0x3a }, +{ 0x00, 0x3b, 0x3b }, +{ 0x00, 0x3c, 0x3c }, +{ 0x00, 0x3d, 0x3d }, +{ 0x00, 0x3e, 0x3e }, +{ 0x00, 0x3f, 0x3f }, +{ 0x00, 0x40, 0x40 }, +{ 0x01, 0x61, 0x41 }, +{ 0x01, 0x62, 0x42 }, +{ 0x01, 0x63, 0x43 }, +{ 0x01, 0x64, 0x44 }, +{ 0x01, 0x65, 0x45 }, +{ 0x01, 0x66, 0x46 }, +{ 0x01, 0x67, 0x47 }, +{ 0x01, 0x68, 0x48 }, +{ 0x01, 0x69, 0x49 }, +{ 0x01, 0x6a, 0x4a }, +{ 0x01, 0x6b, 0x4b }, +{ 0x01, 0x6c, 0x4c }, +{ 0x01, 0x6d, 0x4d }, +{ 0x01, 0x6e, 0x4e }, +{ 0x01, 0x6f, 0x4f }, +{ 0x01, 0x70, 0x50 }, +{ 0x01, 0x71, 0x51 }, +{ 0x01, 0x72, 0x52 }, +{ 0x01, 0x73, 0x53 }, +{ 0x01, 0x74, 0x54 }, +{ 0x01, 0x75, 0x55 }, +{ 0x01, 0x76, 0x56 }, +{ 0x01, 0x77, 0x57 }, +{ 0x01, 0x78, 0x58 }, +{ 0x01, 0x79, 0x59 }, +{ 0x01, 0x7a, 0x5a }, +{ 0x00, 0x5b, 0x5b }, +{ 0x00, 0x5c, 0x5c }, +{ 0x00, 0x5d, 0x5d }, +{ 0x00, 0x5e, 0x5e }, +{ 0x00, 0x5f, 0x5f }, +{ 0x00, 0x60, 0x60 }, +{ 0x00, 0x61, 0x41 }, +{ 0x00, 0x62, 0x42 }, +{ 0x00, 0x63, 0x43 }, +{ 0x00, 0x64, 0x44 }, +{ 0x00, 0x65, 0x45 }, +{ 0x00, 0x66, 0x46 }, +{ 0x00, 0x67, 0x47 }, +{ 0x00, 0x68, 0x48 }, +{ 0x00, 0x69, 0x49 }, +{ 0x00, 0x6a, 0x4a }, +{ 0x00, 0x6b, 0x4b }, +{ 0x00, 0x6c, 0x4c }, +{ 0x00, 0x6d, 0x4d }, +{ 0x00, 0x6e, 0x4e }, +{ 0x00, 0x6f, 0x4f }, +{ 0x00, 0x70, 0x50 }, +{ 0x00, 0x71, 0x51 }, +{ 0x00, 0x72, 0x52 }, +{ 0x00, 0x73, 0x53 }, +{ 0x00, 0x74, 0x54 }, +{ 0x00, 0x75, 0x55 }, +{ 0x00, 0x76, 0x56 }, +{ 0x00, 0x77, 0x57 }, +{ 0x00, 0x78, 0x58 }, +{ 
0x00, 0x79, 0x59 }, +{ 0x00, 0x7a, 0x5a }, +{ 0x00, 0x7b, 0x7b }, +{ 0x00, 0x7c, 0x7c }, +{ 0x00, 0x7d, 0x7d }, +{ 0x00, 0x7e, 0x7e }, +{ 0x00, 0x7f, 0x7f }, +{ 0x00, 0x80, 0x80 }, +{ 0x00, 0x81, 0x81 }, +{ 0x00, 0x82, 0x82 }, +{ 0x00, 0x83, 0x83 }, +{ 0x00, 0x84, 0x84 }, +{ 0x00, 0x85, 0x85 }, +{ 0x00, 0x86, 0x86 }, +{ 0x00, 0x87, 0x87 }, +{ 0x00, 0x88, 0x88 }, +{ 0x00, 0x89, 0x89 }, +{ 0x00, 0x8a, 0x8a }, +{ 0x00, 0x8b, 0x8b }, +{ 0x00, 0x8c, 0x8c }, +{ 0x00, 0x8d, 0x8d }, +{ 0x00, 0x8e, 0x8e }, +{ 0x00, 0x8f, 0x8f }, +{ 0x00, 0x90, 0x90 }, +{ 0x00, 0x91, 0x91 }, +{ 0x00, 0x92, 0x92 }, +{ 0x00, 0x93, 0x93 }, +{ 0x00, 0x94, 0x94 }, +{ 0x00, 0x95, 0x95 }, +{ 0x00, 0x96, 0x96 }, +{ 0x00, 0x97, 0x97 }, +{ 0x00, 0x98, 0x98 }, +{ 0x00, 0x99, 0x99 }, +{ 0x00, 0x9a, 0x9a }, +{ 0x00, 0x9b, 0x9b }, +{ 0x00, 0x9c, 0x9c }, +{ 0x00, 0x9d, 0x9d }, +{ 0x00, 0x9e, 0x9e }, +{ 0x00, 0x9f, 0x9f }, +{ 0x00, 0xa0, 0xa0 }, +{ 0x00, 0xa1, 0xa1 }, +{ 0x00, 0xa2, 0xa2 }, +{ 0x00, 0xa3, 0xa3 }, +{ 0x00, 0xa4, 0xa4 }, +{ 0x00, 0xa5, 0xa5 }, +{ 0x01, 0xa8, 0xa6 }, +{ 0x00, 0xa7, 0xa7 }, +{ 0x00, 0xa8, 0xa6 }, +{ 0x00, 0xa9, 0xa9 }, +{ 0x00, 0xaa, 0xaa }, +{ 0x00, 0xab, 0xab }, +{ 0x00, 0xac, 0xac }, +{ 0x00, 0xad, 0xad }, +{ 0x00, 0xae, 0xae }, +{ 0x00, 0xaf, 0xaf }, +{ 0x00, 0xb0, 0xb0 }, +{ 0x00, 0xb1, 0xb1 }, +{ 0x00, 0xb2, 0xb2 }, +{ 0x00, 0xb3, 0xb3 }, +{ 0x01, 0xb8, 0xb4 }, +{ 0x00, 0xb5, 0xb5 }, +{ 0x00, 0xb6, 0xb6 }, +{ 0x00, 0xb7, 0xb7 }, +{ 0x00, 0xb8, 0xb4 }, +{ 0x00, 0xb9, 0xb9 }, +{ 0x00, 0xba, 0xba }, +{ 0x00, 0xbb, 0xbb }, +{ 0x01, 0xbd, 0xbc }, +{ 0x00, 0xbd, 0xbc }, +{ 0x01, 0xff, 0xbe }, +{ 0x00, 0xbf, 0xbf }, +{ 0x01, 0xe0, 0xc0 }, +{ 0x01, 0xe1, 0xc1 }, +{ 0x01, 0xe2, 0xc2 }, +{ 0x01, 0xe3, 0xc3 }, +{ 0x01, 0xe4, 0xc4 }, +{ 0x01, 0xe5, 0xc5 }, +{ 0x01, 0xe6, 0xc6 }, +{ 0x01, 0xe7, 0xc7 }, +{ 0x01, 0xe8, 0xc8 }, +{ 0x01, 0xe9, 0xc9 }, +{ 0x01, 0xea, 0xca }, +{ 0x01, 0xeb, 0xcb }, +{ 0x01, 0xec, 0xcc }, +{ 0x01, 0xed, 0xcd }, +{ 0x01, 0xee, 0xce }, +{ 0x01, 0xef, 0xcf }, 
+{ 0x01, 0xf0, 0xd0 }, +{ 0x01, 0xf1, 0xd1 }, +{ 0x01, 0xf2, 0xd2 }, +{ 0x01, 0xf3, 0xd3 }, +{ 0x01, 0xf4, 0xd4 }, +{ 0x01, 0xf5, 0xd5 }, +{ 0x01, 0xf6, 0xd6 }, +{ 0x00, 0xd7, 0xd7 }, +{ 0x01, 0xf8, 0xd8 }, +{ 0x01, 0xf9, 0xd9 }, +{ 0x01, 0xfa, 0xda }, +{ 0x01, 0xfb, 0xdb }, +{ 0x01, 0xfc, 0xdc }, +{ 0x01, 0xfd, 0xdd }, +{ 0x01, 0xfe, 0xde }, +{ 0x00, 0xdf, 0xdf }, +{ 0x00, 0xe0, 0xc0 }, +{ 0x00, 0xe1, 0xc1 }, +{ 0x00, 0xe2, 0xc2 }, +{ 0x00, 0xe3, 0xc3 }, +{ 0x00, 0xe4, 0xc4 }, +{ 0x00, 0xe5, 0xc5 }, +{ 0x00, 0xe6, 0xc6 }, +{ 0x00, 0xe7, 0xc7 }, +{ 0x00, 0xe8, 0xc8 }, +{ 0x00, 0xe9, 0xc9 }, +{ 0x00, 0xea, 0xca }, +{ 0x00, 0xeb, 0xcb }, +{ 0x00, 0xec, 0xcc }, +{ 0x00, 0xed, 0xcd }, +{ 0x00, 0xee, 0xce }, +{ 0x00, 0xef, 0xcf }, +{ 0x00, 0xf0, 0xd0 }, +{ 0x00, 0xf1, 0xd1 }, +{ 0x00, 0xf2, 0xd2 }, +{ 0x00, 0xf3, 0xd3 }, +{ 0x00, 0xf4, 0xd4 }, +{ 0x00, 0xf5, 0xd5 }, +{ 0x00, 0xf6, 0xd6 }, +{ 0x00, 0xf7, 0xf7 }, +{ 0x00, 0xf8, 0xd8 }, +{ 0x00, 0xf9, 0xd9 }, +{ 0x00, 0xfa, 0xda }, +{ 0x00, 0xfb, 0xdb }, +{ 0x00, 0xfc, 0xdc }, +{ 0x00, 0xfd, 0xdd }, +{ 0x00, 0xfe, 0xde }, +{ 0x00, 0xff, 0xbe }, +}; struct cs_info iscii_devanagari_tbl[] = { { 0x00, 0x00, 0x00 }, @@ -3807,8 +4903,6 @@ struct cs_info iscii_devanagari_tbl[] = { { 0x00, 0xff, 0xff }, }; - - struct enc_entry encds[] = { {"ISO8859-1",iso1_tbl}, {"ISO8859-2",iso2_tbl}, @@ -3821,8 +4915,11 @@ struct enc_entry encds[] = { {"ISO8859-9",iso9_tbl}, {"ISO8859-10",iso10_tbl}, {"KOI8-R",koi8r_tbl}, -{"CP-1251",cp1251_tbl}, +{"KOI8-U",koi8u_tbl}, +{"microsoft-cp1251",cp1251_tbl}, +{"ISO8859-13", iso13_tbl}, {"ISO8859-14", iso14_tbl}, +{"ISO8859-15", iso15_tbl}, {"ISCII-DEVANAGARI", iscii_devanagari_tbl}, }; @@ -3836,28 +4933,41 @@ struct cs_info * get_current_cs(const char * es) { } } return ccs; -} +}; +struct unicode_info * get_utf_cs() { + return utf_lst; +}; +int get_utf_cs_len() { + return UTF_LST_LEN; +}; struct lang_map lang2enc[] = { - {"ca","ISO8859-1"}, - {"cs","ISO8859-2"}, - {"da","ISO8859-1"}, - 
{"de","ISO8859-1"}, - {"el","ISO8859-7"}, - {"en","ISO8859-1"}, - {"es","ISO8859-1"}, - {"fr","ISO8859-1"}, - {"hr","ISO8859-2"}, - {"hu","ISO8859-2"}, - {"it","ISO8859-1"}, - {"la","ISO8859-1"}, - {"nl","ISO8859-1"}, - {"pl","ISO8859-2"}, - {"pt","ISO8859-1"}, - {"sv","ISO8859-1"}, - {"ru","KOI8-R"}, +{"az", "UTF-8", LANG_az}, +{"bg", "microsoft-cp1251", LANG_bg}, +{"ca", "ISO8859-1", LANG_ca}, +{"cs", "ISO8859-2", LANG_cs}, +{"da", "ISO8859-1", LANG_da}, +{"de", "ISO8859-1", LANG_de}, +{"el", "ISO8859-7", LANG_el}, +{"en", "ISO8859-1", LANG_en}, +{"es", "ISO8859-1", LANG_es}, +{"eu", "ISO8859-1", LANG_eu}, +{"gl", "ISO8859-1", LANG_gl}, +{"fr", "ISO8859-15", LANG_fr}, +{"hr", "ISO8859-2", LANG_hr}, +{"hu", "ISO8859-2", LANG_hu}, +{"it", "ISO8859-1", LANG_it}, +{"la", "ISO8859-1", LANG_la}, +{"lv", "ISO8859-13", LANG_lv}, +{"nl", "ISO8859-1", LANG_nl}, +{"pl", "ISO8859-2", LANG_pl}, +{"pt", "ISO8859-1", LANG_pt}, +{"sv", "ISO8859-1", LANG_sv}, +{"tr", "UTF-8", LANG_tr}, +{"ru", "KOI8-R", LANG_ru}, +{"uk", "KOI8-U", LANG_uk} }; @@ -3869,5 +4979,14 @@ const char * get_default_enc(const char * lang) { } } return NULL; -} +}; +int get_lang_num(const char * lang) { + int n = sizeof(lang2enc) / sizeof(lang2enc[0]); + for (int i = 0; i < n; i++) { + if (strncmp(lang,lang2enc[i].lang,2) == 0) { + return lang2enc[i].num; + } + } + return LANG_xx; +}; diff --git a/src/myspell/csutil.hxx b/src/myspell/csutil.hxx index 037eab9..aa50a58 100644 --- a/src/myspell/csutil.hxx +++ b/src/myspell/csutil.hxx @@ -1,36 +1,88 @@ #ifndef __CSUTILHXX__ #define __CSUTILHXX__ - // First some base level utility routines +typedef struct { + unsigned char l; + unsigned char h; +} w_char; + +// convert UTF-16 characters to UTF-8 +char * u16_u8(char * dest, int size, const w_char * src, int srclen); + +// convert UTF-8 characters to UTF-16 +int u8_u16(w_char * dest, int size, const char * src); + +// sort 2-byte vector +void flag_qsort(unsigned short flags[], int begin, int end); + +// binary 
search in 2-byte vector +int flag_bsearch(unsigned short flags[], unsigned short flag, int right); + // remove end of line char(s) void mychomp(char * s); -// duplicate string +// duplicate string char * mystrdup(const char * s); -// duplicate reverse of string +// duplicate reverse of string char * myrevstrdup(const char * s); -// parse into tokens with char delimiter +// parse into tokens with char delimiter char * mystrsep(char ** sptr, const char delim); +// parse into tokens with char delimiter +char * mystrsep2(char ** sptr, const char delim); + +// parse into tokens with char delimiter +char * mystrrep(char *, const char *, const char *); -// is one string a leading subset of another +// is one string a leading subset of another int isSubset(const char * s1, const char * s2); // is one reverse string a leading subset of the end of another -int isRevSubset(const char * s1, const char * end_of_s2, int s2_len); +int isRevSubset(const char * s1, const char * s2, int len); +// append s to ends of every lines in text +void strlinecat(char * lines, const char * s); -// character encoding information +// tokenize into lines with new line + int line_tok(const char * text, char *** lines); + +// tokenize into lines with new line and uniq in place + char * line_uniq(char * text); + +// change \n to c in place + char * line_join(char * text, char c); +// leave only last {[^}]*} pattern in string + char * delete_zeros(char * morphout); + +// reverse word + void reverseword(char *); + +// reverse word + void reverseword_utf(char *); + +// character encoding information struct cs_info { unsigned char ccase; unsigned char clower; unsigned char cupper; }; +// Unicode character encoding information +struct unicode_info { + unsigned short c; + unsigned short cupper; + unsigned short clower; +}; + +struct unicode_info2 { + char cletter; + unsigned short cupper; + unsigned short clower; +}; struct enc_entry { const char * enc_name; @@ -42,13 +94,20 @@ struct enc_entry { struct 
lang_map { const char * lang; const char * def_enc; + int num; }; struct cs_info * get_current_cs(const char * es); +struct unicode_info * get_utf_cs(); + +int get_utf_cs_len(); + const char * get_default_enc(const char * lang); -// convert null terminated string to all caps using encoding +int get_lang_num(const char * lang); + +// convert null terminated string to all caps using encoding void enmkallcap(char * d, const char * p, const char * encoding); // convert null terminated string to all little using encoding @@ -57,7 +116,7 @@ void enmkallsmall(char * d, const char * p, const char * encoding); // convert null terminated string to have intial capital using encoding void enmkinitcap(char * d, const char * p, const char * encoding); -// convert null terminated string to all caps +// convert null terminated string to all caps void mkallcap(char * p, const struct cs_info * csconv); // convert null terminated string to all little @@ -66,5 +125,7 @@ void mkallsmall(char * p, const struct cs_info * csconv); // convert null terminated string to have intial capital void mkinitcap(char * p, const struct cs_info * csconv); +// convert first nc characters of UTF-8 string to little +void mkallsmall_utf(w_char * u, int nc, struct unicode_info2 * utfconv); #endif diff --git a/src/myspell/enchant_myspell.hxx b/src/myspell/enchant_myspell.hxx deleted file mode 100644 index 0c18549..0000000 --- a/src/myspell/enchant_myspell.hxx +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _MYSPELLMGR_HXX_ -#define _MYSPELLMGR_HXX_ - -#include "hashmgr.hxx" -#include "affixmgr.hxx" -#include "suggestmgr.hxx" -#include "csutil.hxx" - -#define NOCAP 0 -#define INITCAP 1 -#define ALLCAP 2 -#define HUHCAP 3 - -#ifdef WINDOWS -#define DLLSUPPORT __declspec(dllexport) -#else -#define DLLSUPPORT -#endif - -class DLLSUPPORT MySpell -{ - AffixMgr* pAMgr; - HashMgr* pHMgr; - SuggestMgr* pSMgr; - char * encoding; - struct cs_info * csconv; - int maxSug; - -public: - MySpell(const char * affpath, const char * 
dpath); - ~MySpell(); - - int suggest(char*** slst, const char * word); - int spell(const char *); - char * get_dic_encoding(); - -private: - int cleanword(char *, const char *, int *, int *); - char * check(const char *); -}; - -#endif diff --git a/src/myspell/hashmgr.cxx b/src/myspell/hashmgr.cxx index d7b4ec8..29a05c3 100644 --- a/src/myspell/hashmgr.cxx +++ b/src/myspell/hashmgr.cxx @@ -1,25 +1,36 @@ -#include "license.readme" +#include "license.hunspell" +#include "license.myspell" #include <cstdlib> #include <cstring> +#include <cctype> +#ifdef HAVE_FCNTL_H +#include <fcntl.h> +#endif #include <cstdio> #include "hashmgr.hxx" +#include "csutil.hxx" -extern void mychomp(char * s); -extern char * mystrdup(const char *); - -#ifndef WINDOWS +#ifndef W32 +#include <unistd.h> using namespace std; #endif - // build a hash table from a munched word list -HashMgr::HashMgr(const char * tpath) +HashMgr::HashMgr(const char * tpath, const char * apath) { tablesize = 0; tableptr = NULL; + flag_mode = FLAG_CHAR; + complexprefixes = 0; + utf8 = 0; + numaliasf = 0; + aliasf = NULL; + numaliasm = 0; + aliasm = NULL; + load_config(apath); int ec = load_tables(tpath); if (ec) { /* error condition - what should we do here */ @@ -42,14 +53,17 @@ HashMgr::~HashMgr() struct hentry * pt = &tableptr[i]; struct hentry * nt = NULL; if (pt) { + if (pt->astr && !aliasf) free(pt->astr); if (pt->word) free(pt->word); - if (pt->astr) free(pt->astr); + if (pt->description && !aliasm) free(pt->description); + pt = pt->next; } while(pt) { nt = pt->next; + if (pt->astr && !aliasf) free(pt->astr); if (pt->word) free(pt->word); - if (pt->astr) free(pt->astr); + if (pt->description && !aliasm) free(pt->description); free(pt); pt = nt; } @@ -57,9 +71,22 @@ HashMgr::~HashMgr() free(tableptr); } tablesize = 0; -} - + if (aliasf) { + for (int j = 0; j < (numaliasf); j++) free(aliasf[j]); + free(aliasf); + aliasf = NULL; + if (aliasflen) { + free(aliasflen); + aliasflen = NULL; + } + } + if (aliasm) { + 
for (int j = 0; j < (numaliasm); j++) free(aliasm[j]); + free(aliasm); + aliasm = NULL; + } +} // lookup a root word in the hashtable @@ -76,40 +103,87 @@ struct hentry * HashMgr::lookup(const char *word) const return NULL; } - - // add a word to the hash table (private) -int HashMgr::add_word(const char * word, int wl, const char * aff, int al) +int HashMgr::add_word(const char * word, int wl, unsigned short * aff, int al, const char * desc) { - int i = hash(word); + char * st = mystrdup(word); + if (wl && !st) return 1; + if (complexprefixes) { + if (utf8) reverseword_utf(st); else reverseword(st); + } + int i = hash(st); struct hentry * dp = &tableptr[i]; - struct hentry* hp; if (dp->word == NULL) { dp->wlen = wl; dp->alen = al; - dp->word = mystrdup(word); - dp->astr = mystrdup(aff); + dp->word = st; + dp->astr = aff; dp->next = NULL; - if ((wl) && (dp->word == NULL)) return 1; - if ((al) && (dp->astr == NULL)) return 1; + dp->next_homonym = NULL; + if (aliasm) { + dp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc); + } else { + dp->description = mystrdup(desc); + if (desc && !dp->description) return 1; + if (dp->description && complexprefixes) { + if (utf8) reverseword_utf(dp->description); else reverseword(dp->description); + } + } } else { - hp = (struct hentry *) malloc (sizeof(struct hentry)); - if (hp == NULL) return 1; + struct hentry* hp = (struct hentry *) malloc (sizeof(struct hentry)); + if (!hp) return 1; hp->wlen = wl; hp->alen = al; - hp->word = mystrdup(word); - hp->astr = mystrdup(aff); + hp->word = st; + hp->astr = aff; hp->next = NULL; - while (dp->next != NULL) dp=dp->next; + hp->next_homonym = NULL; + if (aliasm) { + hp->description = (desc) ? 
get_aliasm(atoi(desc)) : mystrdup(desc); + } else { + hp->description = mystrdup(desc); + if (desc && !hp->description) return 1; + if (dp->description && complexprefixes) { + if (utf8) reverseword_utf(hp->description); else reverseword(hp->description); + } + } + while (dp->next != NULL) { + if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp; + dp=dp->next; + } + if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) dp->next_homonym = hp; dp->next = hp; - if ((wl) && (hp->word == NULL)) return 1; - if ((al) && (hp->astr == NULL)) return 1; } return 0; } +// add a custom dic. word to the hash table (public) +int HashMgr::put_word(const char * word, int wl, char * aff) +{ + unsigned short * flags; + int al = 0; + if (aff) { + al = decode_flags(&flags, aff); + flag_qsort(flags, 0, al); + } else { + flags = NULL; + } + add_word(word, wl, flags, al, NULL); + return 0; +} + +int HashMgr::put_word_pattern(const char * word, int wl, const char * pattern) +{ + unsigned short * flags; + struct hentry * dp = lookup(pattern); + if (!dp || !dp->astr) return 1; + flags = (unsigned short *) malloc (dp->alen * sizeof(short)); + memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short)); + add_word(word, wl, flags, dp->alen, NULL); + return 0; +} // walk the hash table entry by entry - null at end struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const @@ -137,14 +211,13 @@ struct hentry * HashMgr::walk_hashtable(int &col, struct hentry * hp) const return hp; } - - // load a munched word list and build a hash table on the fly - int HashMgr::load_tables(const char * tpath) { int wl, al; char * ap; + char * dp; + unsigned short * flags; // raw dictionary - munched file FILE * rawdict = fopen(tpath, "r"); @@ -154,39 +227,72 @@ int HashMgr::load_tables(const char * tpath) char ts[MAXDELEN]; if (! 
fgets(ts, MAXDELEN-1,rawdict)) return 2; mychomp(ts); + if ((*ts < '1') || (*ts > '9')) fprintf(stderr, "error - missing word count in dictionary file\n"); tablesize = atoi(ts); if (!tablesize) return 4; - tablesize = tablesize + 5; + tablesize = tablesize + 5 + USERWORD; if ((tablesize %2) == 0) tablesize++; // allocate the hash table tableptr = (struct hentry *) calloc(tablesize, sizeof(struct hentry)); if (! tableptr) return 3; + for (int i=0; i<tablesize; i++) tableptr[i].word = NULL; // loop through all words on much list and add to hash // table and create word and affix strings while (fgets(ts,MAXDELEN-1,rawdict)) { mychomp(ts); + // split each line into word and morphological description + dp = strchr(ts,'\t'); + + if (dp) { + *dp = '\0'; + dp++; + } else { + dp = NULL; + } + // split each line into word and affix char strings - ap = strchr(ts,'/'); + // "\/" signs slash in words (not affix separator) + // "/" at beginning of the line is word character (not affix separator) + ap = ts; + while (ap = strchr(ap,'/')) { + if (ap == ts) { + ap++; + continue; + } else if (*(ap - 1) != '\\') break; + // replace "\/" with "/" + for (char * sp = ap - 1; *sp; *sp = *(sp + 1), sp++); + + } + if (ap) { *ap = '\0'; - ap++; - al = strlen(ap); + if (aliasf) { + int index = atoi(ap + 1); + al = get_aliasf(index, &flags); + if (!al) { + fprintf(stderr, "error - bad flag vector alias: %s\n", ts); + *ap = '\0'; + } + } else { + al = decode_flags(&flags, ap + 1); + flag_qsort(flags, 0, al); + } } else { al = 0; ap = NULL; + flags = NULL; } wl = strlen(ts); // add the word and its index - if (add_word(ts,wl,ap,al)) - return 5;; + if (add_word(ts,wl,flags,al,dp)) return 5; } - + fclose(rawdict); return 0; } @@ -207,3 +313,367 @@ int HashMgr::hash(const char * word) const return (unsigned long) hv % tablesize; } +int HashMgr::decode_flags(unsigned short ** result, char * flags) { + int len; + switch (flag_mode) { + case FLAG_LONG: { // two-character flags (1x2yZz -> 1x 2y Zz) + 
len = strlen(flags); + if (len%2 == 1) fprintf(stderr,"error: length of FLAG_LONG flagvector is odd: %s\n", flags); + len = len/2; + *result = (unsigned short *) malloc(len * sizeof(short)); + for (int i = 0; i < len; i++) { + (*result)[i] = (((unsigned short) flags[i * 2]) << 8) + (unsigned short) flags[i * 2 + 1]; + } + break; + } + case FLAG_NUM: { // decimal numbers separated by comma (4521,23,233 -> 4521 23 233) + len = 1; + char * src = flags; + unsigned short * dest; + char * p; + for (p = flags; *p; p++) { + if (*p == ',') len++; + } + *result = (unsigned short *) malloc(len * sizeof(short)); + dest = *result; + for (p = flags; *p; p++) { + if (*p == ',') { + *dest = (unsigned short) atoi(src); + if (*dest == 0) fprintf(stderr, "error: 0 is wrong flag id\n"); + src = p + 1; + dest++; + } + } + *dest = (unsigned short) atoi(src); + if (*dest == 0) fprintf(stderr, "error: 0 is wrong flag id\n"); + break; + } + case FLAG_UNI: { // UTF-8 characters + w_char w[MAXDELEN/2]; + len = u8_u16(w, MAXDELEN/2, flags); + *result = (unsigned short *) malloc(len * sizeof(short)); + memcpy(*result, w, len * sizeof(short)); + break; + } + default: { // Ispell's one-character flags (erfg -> e r f g) + unsigned short * dest; + len = strlen(flags); + *result = (unsigned short *) malloc(len * sizeof(short)); + dest = *result; + for (unsigned char * p = (unsigned char *) flags; *p; p++) { + *dest = (unsigned short) *p; + dest++; + } + } + } + return len; +} + +unsigned short HashMgr::decode_flag(const char * f) { + unsigned short s = 0; + switch (flag_mode) { + case FLAG_LONG: + s = ((unsigned short) f[0] << 8) + (unsigned short) f[1]; + break; + case FLAG_NUM: + s = (unsigned short) atoi(f); + break; + case FLAG_UNI: + u8_u16((w_char *) &s, 1, f); + break; + default: + s = (unsigned short) *((unsigned char *)f); + } + if (!s) fprintf(stderr, "error: 0 is wrong flag id\n"); + return s; +} + +char * HashMgr::encode_flag(unsigned short f) { + unsigned char ch[10]; + if (f==0) 
return mystrdup("(NULL)"); + if (flag_mode == FLAG_LONG) { + ch[0] = (unsigned char) (f >> 8); + ch[1] = (unsigned char) (f - ((f >> 8) << 8)); + ch[2] = '\0'; + } else if (flag_mode == FLAG_NUM) { + sprintf((char *) ch, "%d", f); + } else if (flag_mode == FLAG_UNI) { + u16_u8((char *) &ch, 10, (w_char *) &f, 1); + } else { + ch[0] = (unsigned char) (f); + ch[1] = '\0'; + } + return mystrdup((char *) ch); +} + +// read in aff file and set flag mode +int HashMgr::load_config(const char * affpath) +{ + + // io buffers + char line[MAXDELEN+1]; + + // open the affix file + FILE * afflst; + afflst = fopen(affpath,"r"); + if (!afflst) { + fprintf(stderr,"Error - could not open affix description file %s\n",affpath); + return 1; + } + + // read in each line ignoring any that do not + // start with a known line type indicator + + while (fgets(line,MAXDELEN,afflst)) { + mychomp(line); + + /* parse in the try string */ + if ((strncmp(line,"FLAG",4) == 0) && isspace(line[4])) { + if (flag_mode != FLAG_CHAR) { + fprintf(stderr,"error: duplicate FLAG parameter\n"); + } + if (strstr(line, "long")) flag_mode = FLAG_LONG; + if (strstr(line, "num")) flag_mode = FLAG_NUM; + if (strstr(line, "UTF-8")) flag_mode = FLAG_UNI; + if (flag_mode == FLAG_CHAR) { + fprintf(stderr,"error: FLAG need `num', `long' or `UTF-8' parameter: %s\n", line); + } + } + if ((strncmp(line,"SET",3) == 0) && isspace(line[3]) && strstr(line, "UTF-8")) utf8 = 1; + + if ((strncmp(line,"AF",2) == 0) && isspace(line[2])) { + if (parse_aliasf(line, afflst)) { + return 1; + } + } + + if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) { + if (parse_aliasm(line, afflst)) { + return 1; + } + } + + if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; + if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; + } + fclose(afflst); + return 0; +} + +/* parse in the ALIAS table */ +int HashMgr::parse_aliasf(char * line, FILE * af) +{ + if (numaliasf != 0) { + 
fprintf(stderr,"error: duplicate AF (alias for flag vector) tables used\n"); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + numaliasf = atoi(piece); + if (numaliasf < 1) { + numaliasf = 0; + aliasf = NULL; + aliasflen = NULL; + fprintf(stderr,"incorrect number of entries in AF table\n"); + free(piece); + return 1; + } + aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *)); + aliasflen = (unsigned short *) malloc(numaliasf * sizeof(short)); + if (!aliasf || !aliasflen) { + numaliasf = 0; + if (aliasf) free(aliasf); + if (aliasflen) free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + return 1; + } + np++; + break; + } + default: break; + } + i++; + } + free(piece); + } + if (np != 2) { + numaliasf = 0; + free(aliasf); + free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + fprintf(stderr,"error: missing AF table information\n"); + return 1; + } + + /* now parse the numaliasf lines to read in the remainder of the table */ + char * nl = line; + for (int j=0; j < numaliasf; j++) { + if (!fgets(nl,MAXDELEN,af)) return 1; + mychomp(nl); + tp = nl; + i = 0; + aliasf[j] = NULL; + aliasflen[j] = 0; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { + if (strncmp(piece,"AF",2) != 0) { + numaliasf = 0; + free(aliasf); + free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + fprintf(stderr,"error: AF table is corrupt\n"); + free(piece); + return 1; + } + break; + } + case 1: { + aliasflen[j] = decode_flags(&(aliasf[j]), piece); + flag_qsort(aliasf[j], 0, aliasflen[j]); + break; + } + default: break; + } + i++; + } + free(piece); + } + if (!aliasf[j]) { + free(aliasf); + free(aliasflen); + aliasf = NULL; + aliasflen = NULL; + numaliasf = 0; + fprintf(stderr,"error: AF table is corrupt\n"); + return 1; + } + } + return 0; +} + +/* parse morph alias definitions */ +int 
HashMgr::parse_aliasm(char * line, FILE * af) +{ + if (numaliasm != 0) { + fprintf(stderr,"error: duplicate AM (aliases for morphological descriptions) tables used\n"); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + numaliasm = atoi(piece); + if (numaliasm < 1) { + fprintf(stderr,"incorrect number of entries in AM table\n"); + free(piece); + return 1; + } + aliasm = (char **) malloc(numaliasm * sizeof(char *)); + if (!aliasm) { + numaliasm = 0; + return 1; + } + np++; + break; + } + default: break; + } + i++; + } + free(piece); + } + if (np != 2) { + numaliasm = 0; + free(aliasm); + aliasm = NULL; + fprintf(stderr,"error: missing AM alias information\n"); + return 1; + } + + /* now parse the numaliasm lines to read in the remainder of the table */ + char * nl = line; + for (int j=0; j < numaliasm; j++) { + if (!fgets(nl,MAXDELEN,af)) return 1; + mychomp(nl); + tp = nl; + i = 0; + aliasm[j] = NULL; + while ((piece=mystrsep(&tp, 0))) { + if (*piece != '\0') { + switch(i) { + case 0: { + if (strncmp(piece,"AM",2) != 0) { + fprintf(stderr,"error: AM table is corrupt\n"); + free(piece); + numaliasm = 0; + free(aliasm); + aliasm = NULL; + return 1; + } + break; + } + case 1: { + if (complexprefixes) { + if (utf8) reverseword_utf(piece); + else reverseword(piece); + } + aliasm[j] = mystrdup(piece); + break; } + default: break; + } + i++; + } + free(piece); + } + if (!aliasm[j]) { + numaliasm = 0; + free(aliasm); + aliasm = NULL; + fprintf(stderr,"error: map table is corrupt\n"); + return 1; + } + } + return 0; +} + +int HashMgr::is_aliasf() { + return (aliasf != NULL); +} + +int HashMgr::is_aliasm() { + return (aliasm != NULL); +} + +int HashMgr::get_aliasf(int index, unsigned short ** fvec) { + if ((index > 0) && (index <= numaliasf)) { + *fvec = aliasf[index - 1]; + return aliasflen[index - 1]; + } + fprintf(stderr,"error: 
bad flag alias index: %d\n", index); + fprintf(stderr,"hiba: %d\n", index); + *fvec = NULL; + return 0; +} + +char * HashMgr::get_aliasm(int index) { + if ((index > 0) && (index <= numaliasm)) return aliasm[index - 1]; + fprintf(stderr,"error: bad morph. alias index: %d\n", index); + return NULL; +} diff --git a/src/myspell/hashmgr.hxx b/src/myspell/hashmgr.hxx index e8b08c3..3a27b1e 100644 --- a/src/myspell/hashmgr.hxx +++ b/src/myspell/hashmgr.hxx @@ -1,26 +1,50 @@ #ifndef _HASHMGR_HXX_ #define _HASHMGR_HXX_ +#include <cstdio> #include "htypes.hxx" +enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI }; + class HashMgr { int tablesize; struct hentry * tableptr; + int userword; + flag flag_mode; + int complexprefixes; + int utf8; + int numaliasf; // flag vector `compression' with aliases + unsigned short ** aliasf; + unsigned short * aliasflen; + int numaliasm; // morphological desciption `compression' with aliases + char ** aliasm; + public: - HashMgr(const char * tpath); + HashMgr(const char * tpath, const char * apath); ~HashMgr(); struct hentry * lookup(const char *) const; int hash(const char *) const; struct hentry * walk_hashtable(int & col, struct hentry * hp) const; + int put_word(const char * word, int wl, char * ap); + int put_word_pattern(const char * word, int wl, const char * pattern); + int decode_flags(unsigned short ** result, char * flags); + unsigned short decode_flag(const char * flag); + char * encode_flag(unsigned short flag); + int is_aliasf(); + int is_aliasm(); + int get_aliasf(int index, unsigned short ** fvec); + char * get_aliasm(int index); + private: - HashMgr( const HashMgr & ); // not implemented - HashMgr &operator=( const HashMgr & ); // not implemented int load_tables(const char * tpath); - int add_word(const char * word, int wl, const char * ap, int al); + int add_word(const char * word, int wl, unsigned short * ap, int al, const char * desc); + int load_config(const char * affpath); + int parse_aliasf(char * line, FILE * af); 
+ int parse_aliasm(char * line, FILE * af); }; diff --git a/src/myspell/htypes.hxx b/src/myspell/htypes.hxx index 029e9f2..14a4783 100644 --- a/src/myspell/htypes.hxx +++ b/src/myspell/htypes.hxx @@ -1,20 +1,25 @@ #ifndef _HTYPES_HXX_ #define _HTYPES_HXX_ -#define MAXDELEN 256 +#define MAXDELEN 8192 #define ROTATE_LEN 5 #define ROTATE(v,q) \ (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q))-1)); +// approx. number of user defined words +#define USERWORD 1000 + struct hentry { short wlen; short alen; char * word; - char * astr; + unsigned short * astr; struct hentry * next; -}; + struct hentry * next_homonym; + char * description; +}; #endif diff --git a/src/myspell/hunspell.cxx b/src/myspell/hunspell.cxx new file mode 100644 index 0000000..14ea1ad --- /dev/null +++ b/src/myspell/hunspell.cxx @@ -0,0 +1,1616 @@ +#include "license.hunspell" +#include "license.myspell" + +#include <cstring> +#include <cstdlib> +#include <cstdio> + +#include "hunspell.hxx" + +#ifndef W32 +using namespace std; +#endif + +Hunspell::Hunspell(const char * affpath, const char * dpath) +{ + encoding = NULL; + csconv = NULL; + utfconv = NULL; + utf8 = 0; + complexprefixes = 0; + + /* first set up the hash manager */ + pHMgr = new HashMgr(dpath, affpath); + + /* next set up the affix manager */ + /* it needs access to the hash manager lookup methods */ + pAMgr = new AffixMgr(affpath,pHMgr); + + /* get the preferred try string and the dictionary */ + /* encoding from the Affix Manager for that dictionary */ + char * try_string = pAMgr->get_try_string(); + encoding = pAMgr->get_encoding(); + csconv = get_current_cs(encoding); + langnum = pAMgr->get_langnum(); + utf8 = pAMgr->get_utf8(); + utfconv = pAMgr->get_utf_conv(); + complexprefixes = pAMgr->get_complexprefixes(); + wordbreak = pAMgr->get_breaktable(); + + /* and finally set up the suggestion manager */ + pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); + if (try_string) free(try_string); + + prevroot = NULL; + prevcompound = 
0; + forbidden_compound = 0; +} + +Hunspell::~Hunspell() +{ + if (pSMgr) delete pSMgr; + if (pAMgr) delete pAMgr; + if (pHMgr) delete pHMgr; + pSMgr = NULL; + pAMgr = NULL; + pHMgr = NULL; + csconv= NULL; + if (encoding) free(encoding); + encoding = NULL; +} + + +// make a copy of src at destination while removing all leading +// blanks and removing any trailing periods after recording +// their presence with the abbreviation flag +// also since already going through character by character, +// set the capitalization type +// return the length of the "cleaned" (and UTF-8 encoded) word + +int Hunspell::cleanword2(char * dest, const char * src, + w_char * dest_utf, int * nc, int * pcaptype, int * pabbrev) +{ + unsigned char * p = (unsigned char *) dest; + const unsigned char * q = (const unsigned char * ) src; + int firstcap = 0; + + // first skip over any leading blanks + while ((*q != '\0') && (*q == ' ')) q++; + + // now strip off any trailing periods (recording their presence) + *pabbrev = 0; + int nl = strlen((const char *)q); + while ((nl > 0) && (*(q+nl-1)=='.')) { + nl--; + (*pabbrev)++; + } + + // if no characters are left it can't be capitalized + if (nl <= 0) { + *pcaptype = NOCAP; + *p = '\0'; + return 0; + } + + // now determine the capitalization type of the first nl letters + int ncap = 0; + int nneutral = 0; + *nc = 0; + + if (!utf8) { + while (nl > 0) { + (*nc)++; + if (csconv[(*q)].ccase) ncap++; + if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; + *p++ = *q++; + nl--; + } + // remember to terminate the destination string + *p = '\0'; + if (ncap) { + firstcap = csconv[(unsigned char)(*dest)].ccase; + } + } else { + unsigned short idx; + *nc = u8_u16(dest_utf, MAXWORDLEN, (const char *) q); + // don't check too long words + if (*nc >= MAXWORDLEN) return 0; + *nc -= *pabbrev; + for (int i = 0; i < *nc; i++) { + idx = (dest_utf[i].h << 8) + dest_utf[i].l; + if (idx != utfconv[idx].clower) ncap++; + if (utfconv[idx].cupper == 
utfconv[idx].clower) nneutral++; + } + u16_u8(dest, MAXWORDUTF8LEN, dest_utf, *nc); + if (ncap) { + idx = (dest_utf[0].h << 8) + dest_utf[0].l; + firstcap = (idx != utfconv[idx].clower); + } + } + + // now finally set the captype + if (ncap == 0) { + *pcaptype = NOCAP; + } else if ((ncap == 1) && firstcap) { + *pcaptype = INITCAP; + } else if ((ncap == *nc) || ((ncap + nneutral) == *nc)) { + *pcaptype = ALLCAP; + } else if ((ncap > 1) && firstcap) { + *pcaptype = HUHINITCAP; + } else { + *pcaptype = HUHCAP; + } + return strlen(dest); +} + +int Hunspell::cleanword(char * dest, const char * src, + int * pcaptype, int * pabbrev) +{ + unsigned char * p = (unsigned char *) dest; + const unsigned char * q = (const unsigned char * ) src; + int firstcap = 0; + + // first skip over any leading blanks + while ((*q != '\0') && (*q == ' ')) q++; + + // now strip off any trailing periods (recording their presence) + *pabbrev = 0; + int nl = strlen((const char *)q); + while ((nl > 0) && (*(q+nl-1)=='.')) { + nl--; + (*pabbrev)++; + } + + // if no characters are left it can't be capitalized + if (nl <= 0) { + *pcaptype = NOCAP; + *p = '\0'; + return 0; + } + + // now determine the capitalization type of the first nl letters + int ncap = 0; + int nneutral = 0; + int nc = 0; + + if (!utf8) { + while (nl > 0) { + nc++; + if (csconv[(*q)].ccase) ncap++; + if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; + *p++ = *q++; + nl--; + } + // remember to terminate the destination string + *p = '\0'; + firstcap = csconv[(unsigned char)(*dest)].ccase; + } else { + unsigned short idx; + w_char t[MAXWORDLEN]; + nc = u8_u16(t, MAXWORDLEN, src); + for (int i = 0; i < nc; i++) { + idx = (t[i].h << 8) + t[i].l; + if (idx != utfconv[idx].clower) ncap++; + if (utfconv[idx].cupper == utfconv[idx].clower) nneutral++; + } + u16_u8(dest, MAXWORDUTF8LEN, t, nc); + if (ncap) { + idx = (t[0].h << 8) + t[0].l; + firstcap = (idx != utfconv[idx].clower); + } + } + + // now finally set the captype + 
if (ncap == 0) { + *pcaptype = NOCAP; + } else if ((ncap == 1) && firstcap) { + *pcaptype = INITCAP; + } else if ((ncap == nc) || ((ncap + nneutral) == nc)){ + *pcaptype = ALLCAP; + } else if ((ncap > 1) && firstcap) { + *pcaptype = HUHINITCAP; + } else { + *pcaptype = HUHCAP; + } + return strlen(dest); +} + + +void Hunspell::mkallcap(char * p) +{ + if (utf8) { + w_char u[MAXWORDLEN]; + int nc = u8_u16(u, MAXWORDLEN, p); + unsigned short idx; + for (int i = 0; i < nc; i++) { + idx = (u[i].h << 8) + u[i].l; + if (idx != utfconv[idx].cupper) { + u[i].h = (unsigned char) (utfconv[idx].cupper >> 8); + u[i].l = (unsigned char) (utfconv[idx].cupper & 0x00FF); + } + } + u16_u8(p, MAXWORDUTF8LEN, u, nc); + } else { + while (*p != '\0') { + *p = csconv[((unsigned char) *p)].cupper; + p++; + } + } +} + +int Hunspell::mkallcap2(char * p, w_char * u, int nc) +{ + if (utf8) { + unsigned short idx; + for (int i = 0; i < nc; i++) { + idx = (u[i].h << 8) + u[i].l; + if (idx != utfconv[idx].cupper) { + u[i].h = (unsigned char) (utfconv[idx].cupper >> 8); + u[i].l = (unsigned char) (utfconv[idx].cupper & 0x00FF); + } + } + u16_u8(p, MAXWORDUTF8LEN, u, nc); + return strlen(p); + } else { + while (*p != '\0') { + *p = csconv[((unsigned char) *p)].cupper; + p++; + } + } + return nc; +} + + +void Hunspell::mkallsmall(char * p) +{ + while (*p != '\0') { + *p = csconv[((unsigned char) *p)].clower; + p++; + } +} + +int Hunspell::mkallsmall2(char * p, w_char * u, int nc) +{ + if (utf8) { + unsigned short idx; + for (int i = 0; i < nc; i++) { + idx = (u[i].h << 8) + u[i].l; + if (idx != utfconv[idx].clower) { + u[i].h = (unsigned char) (utfconv[idx].clower >> 8); + u[i].l = (unsigned char) (utfconv[idx].clower & 0x00FF); + } + } + u16_u8(p, MAXWORDUTF8LEN, u, nc); + return strlen(p); + } else { + while (*p != '\0') { + *p = csconv[((unsigned char) *p)].clower; + p++; + } + } + return nc; +} + +// convert UTF-8 sharp S codes to latin 1 +char * Hunspell::sharps_u8_l1(char * dest, char * 
source) { + char * p = dest; + *p = *source; + for (p++, source++; *(source - 1); p++, source++) { + *p = *source; + if (*source == 'Ÿ') *--p = 'ß'; + } + return dest; +} + +// recursive search for right ss-ß permutations +hentry * Hunspell::spellsharps(char * base, char * pos, int n, int repnum, char * tmp) { + if ((pos = strstr(pos, "ss")) && (n < MAXSHARPS)) { + hentry * h; + *pos = 'Ã'; + *(pos + 1) = 'Ÿ'; + if (h = spellsharps(base, pos + 2, n + 1, repnum + 1, tmp)) return h; + *pos = 's'; + *(pos + 1) = 's'; + if (h = spellsharps(base, pos + 2, n + 1, repnum, tmp)) return h; + } else if (repnum > 0) { + if (utf8) return check(base); + return check(sharps_u8_l1(tmp, base)); + } + return NULL; +} + +int Hunspell::is_keepcase(const hentry * rv) { + return pAMgr && rv->astr && pAMgr->get_keepcase() && + TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen); +} + +/* check and insert a word to beginning of the suggestion array */ +int Hunspell::insert_sug(char ***slst, char * word, int *ns) { + if (spell(word)) { + if (*ns == MAXSUGGESTION) { + (*ns)--; + free((*slst)[*ns]); + } + for (int k = *ns; k > 0; k--) (*slst)[k] = (*slst)[k - 1]; + (*slst)[0] = mystrdup(word); + (*ns)++; + } + return 0; +} + +int Hunspell::spell(const char * word) +{ + struct hentry * rv=NULL; + // need larger vector. For example, Turkish capital letter I converted a + // 2-byte UTF-8 character (dotless i) by mkallsmall. + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + w_char unicw[MAXWORDLEN + 1]; + int nc = strlen(word); + int wl2; + if (utf8) { + if (nc >= MAXWORDUTF8LEN) return 0; + } else { + if (nc >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + + if (wl == 0) return 1; + + // allow numbers with dots and commas (but forbid double separators: "..", ",," etc.) 
+ enum { NBEGIN, NNUM, NSEP }; + int nstate = NBEGIN; + int i; + + for (i = 0; (i < wl) && + (((cw[i] <= '9') && (cw[i] >= '0') && (nstate = NNUM)) || + ((nstate == NNUM) && ((cw[i] == ',') || + (cw[i] == '.') || (cw[i] == '-')) && (nstate = NSEP))); i++); + if ((i == wl) && (nstate == NNUM)) return 1; + + // LANG_hu section: number(s) + (percent or degree) with suffixes + if (langnum == LANG_hu) { + if ((nstate == NNUM) && ((cw[i] == '%') || (cw[i] == '°')) && check(cw + i)) return 1; + } + // END of LANG_hu section + + switch(captype) { + case HUHCAP: + case HUHINITCAP: + case NOCAP: { + rv = check(cw); + if ((abbv) && !(rv)) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = check(wspace); + } + break; + } + case ALLCAP: { + rv = check(cw); + if (rv) break; + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = check(wspace); + if (rv) break; + } + if (pAMgr && pAMgr->get_checksharps() && strstr(cw, "SS")) { + char tmpword[MAXWORDUTF8LEN]; + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + rv = spellsharps(wspace, wspace, 0, 0, tmpword); + if (!rv) { + wl2 = mkinitcap2(cw, unicw, nc); + rv = spellsharps(cw, cw, 0, 0, tmpword); + } + if ((abbv) && !(rv)) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = spellsharps(wspace, wspace, 0, 0, tmpword); + if (!rv) { + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + rv = spellsharps(wspace, wspace, 0, 0, tmpword); + } + } + if (rv) break; + } + } + case INITCAP: { + wl = mkallsmall2(cw, unicw, nc); + memcpy(wspace,cw,(wl+1)); + rv = check(wspace); + if (!rv || (is_keepcase(rv) && !((captype == INITCAP) && + // if CHECKSHARPS: KEEPCASE words with ß are allowed + // in INITCAP form, too. 
+ pAMgr->get_checksharps() && ((utf8 && strstr(wspace, "ß")) || + (!utf8 && strchr(wspace, 'ß')))))) { + wl2 = mkinitcap2(cw, unicw, nc); + rv = check(cw); + if (rv && (captype == ALLCAP) && is_keepcase(rv)) rv = NULL; + } + if (abbv && !rv) { + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = check(wspace); + if (!rv || is_keepcase(rv)) { + memcpy(wspace, cw, wl2); + *(wspace+wl2) = '.'; + *(wspace+wl2+1) = '\0'; + rv = check(wspace); + if (rv && ((captype == ALLCAP) && is_keepcase(rv))) rv = NULL; + } + } + break; + } + } + + if (rv) return 1; + + // recursive breaking at break points (not good for morphological analysis) + if (wordbreak) { + char * s; + char r; + for (int i = 0; i < pAMgr->get_numbreak(); i++) { + if (s=(char *) strstr(cw, wordbreak[i])) { + r = *s; + *s = '\0'; + // examine 2 sides of the break point + if (spell(cw) && spell(s + strlen(wordbreak[i]))) { + *s = r; + return 1; + } + *s = r; + } + } + } + + // LANG_hu: compoundings with dashes and n-dashes XXX deprecated! 
+ if (langnum == LANG_hu) { + int n; + // compound word with dash (HU) I18n + char * dash; + int result = 0; + // n-dash + if (!wordbreak && (dash=(char *) strstr(cw,"–"))) { + *dash = '\0'; + // examine 2 sides of the dash + if (spell(cw) && spell(dash + 3)) { + *dash = 'â'; + return 1; + } + *dash = 'â'; + } + if ((dash=(char *) strchr(cw,'-'))) { + *dash='\0'; + // examine 2 sides of the dash + if (dash[1] == '\0') { // base word ending with dash + if (spell(cw)) return 1; + } else { + // first word ending with dash: word- + char r2 = *(dash + 1); + dash[0]='-'; + dash[1]='\0'; + result = spell(cw); + dash[1] = r2; + dash[0]='\0'; + if (result && spell(dash+1) && ((strlen(dash+1) > 1) || (dash[1] == 'e') || + ((dash[1] > '0') && (dash[1] < '9')))) return 1; + } + // affixed number in correct word + if (result && (dash > cw) && (((*(dash-1)<='9') && (*(dash-1)>='0')) || (*(dash-1)>='.'))) { + *dash='-'; + n = 1; + if (*(dash - n) == '.') n++; + // search first not a number character to left from dash + while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) { + n++; + } + if ((dash - n) < cw) n--; + // numbers: deprecated + for(; n >= 1; n--) { + if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && check(dash - n)) return 1; + } + } + } + } + return 0; +} + +struct hentry * Hunspell::check(const char * w) +{ + struct hentry * he = NULL; + int len; + char w2[MAXWORDUTF8LEN]; + const char * word = w; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + strcpy(w2, w); + if (utf8) reverseword_utf(w2); else reverseword(w2); + word = w2; + } + + forbidden_compound = 0; // XXX LANG_hu class variable for suggestions (not threadsafe) + prevcompound = 0; // compounding information for Hunspell's pipe interface (not threadsafe) + prevroot = NULL; // root information for Hunspell's pipe interface (not threadsafe) + + // look word in hash table + if (pHMgr) he = pHMgr->lookup(word); + + // check forbidden and onlyincompound words + if 
((he) && (he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { + // LANG_hu section: set dash information for suggestions + if (langnum == LANG_hu) { + forbidden_compound = 1; + if (pAMgr->get_compoundflag() && + TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) { + forbidden_compound = 2; + } + } + return NULL; + } + + // he = next not pseudoroot and not onlyincompound homonym or NULL + while (he && (he->astr) && + ((pAMgr->get_pseudoroot() && TESTAFF(he->astr, pAMgr->get_pseudoroot(), he->alen)) || + (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) + )) he = he->next_homonym; + + // check with affixes + if (!he && pAMgr) { + // try stripping off affixes */ + len = strlen(word); + he = pAMgr->affix_check(word, len, 0); + + // check compound restriction + if (he && he->astr && pAMgr->get_onlyincompound() && + TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) he = NULL; + + // try check compound word + if (he) { + if ((he->astr) && (pAMgr) && TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) { + forbidden_compound = 1; // LANG_hu + return NULL; + } + prevroot = he->word; + } else if (pAMgr->get_compound()) { + he = pAMgr->compound_check(word, len, + 0,0,100,0,NULL,0,NULL,NULL,0); + // LANG_hu section: `moving rule' with last dash + if ((!he) && (langnum == LANG_hu) && (word[len-1]=='-')) { + char * dup = mystrdup(word); + dup[len-1] = '\0'; + he = pAMgr->compound_check(dup, len-1, + -5,0,100,0,NULL,1,NULL,NULL,0); + free(dup); + } + // end of LANG speficic region + if (he) { + prevroot = he->word; + prevcompound = 1; + } + } + + } + + return he; +} + +int Hunspell::suggest(char*** slst, const char * word) +{ + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! 
pSMgr) return 0; + w_char unicw[MAXWORDLEN + 1]; + int nc = strlen(word); + if (utf8) { + if (nc >= MAXWORDUTF8LEN) return 0; + } else { + if (nc >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + int wl = cleanword2(cw, word, unicw, &nc, &captype, &abbv); + if (wl == 0) return 0; + int ns = 0; + *slst = NULL; + int capwords = 0; + int ngramsugs = 0; + + switch(captype) { + case NOCAP: { + ns = pSMgr->suggest(slst, cw, ns); + break; + } + + case INITCAP: { + capwords = 1; + ns = pSMgr->suggest(slst, cw, ns); + if (ns == -1) break; + memcpy(wspace,cw,(wl+1)); + mkallsmall2(wspace, unicw, nc); + ns = pSMgr->suggest(slst, wspace, ns); + break; + } + case HUHINITCAP: + case HUHCAP: { + ns = pSMgr->suggest(slst, cw, ns); + if (ns != -1) { + memcpy(wspace,cw,(wl+1)); + mkallsmall2(wspace, unicw, nc); + insert_sug(slst, wspace, &ns); + ns = pSMgr->suggest(slst, wspace, ns); + if (captype == HUHINITCAP) { + mkinitcap2(wspace, unicw, nc); + insert_sug(slst, wspace, &ns); + ns = pSMgr->suggest(slst, wspace, ns); + } + } + break; + } + + case ALLCAP: { + memcpy(wspace, cw, (wl+1)); + mkallsmall2(wspace, unicw, nc); + ns = pSMgr->suggest(slst, wspace, ns); + if (ns == -1) break; + if (pAMgr && pAMgr->get_keepcase()) insert_sug(slst, wspace, &ns); + mkinitcap2(wspace, unicw, nc); + ns = pSMgr->suggest(slst, wspace, ns); + for (int j=0; j < ns; j++) { + mkallcap((*slst)[j]); + if (pAMgr && pAMgr->get_checksharps()) { + char * pos; + if (utf8) { + while (pos = strstr((*slst)[j], "ß")) { + *pos = 'S'; + *(pos+1) = 'S'; + } + } else { + while (pos = strchr((*slst)[j], 'ß')) { + (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 2); + mystrrep((*slst)[j], "ß", "SS"); + } + } + } + } + break; + } + } + + // LANG_hu section: replace '-' with ' ' in Hungarian + if ((langnum == LANG_hu) && (forbidden_compound == 2)) { + for (int j=0; j < ns; j++) { + char * pos = strchr((*slst)[j],'-'); + if (pos) *pos = ' '; + } + } + // END OF LANG_hu section + + // try 
ngram approach since found nothing + if ((ns == 0) && pAMgr && (pAMgr->get_maxngramsugs() != 0)) { + ngramsugs = 1; + switch(captype) { + case NOCAP: { + ns = pSMgr->ngsuggest(*slst, cw, pHMgr); + break; + } + case HUHCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall2(wspace, unicw, nc); + ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); + break; + } + case INITCAP: { + capwords = 1; + memcpy(wspace,cw,(wl+1)); + mkallsmall2(wspace, unicw, nc); + ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); + break; + } + case ALLCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall2(wspace, unicw, nc); + ns = pSMgr->ngsuggest(*slst, wspace, pHMgr); + for (int j=0; j < ns; j++) + mkallcap((*slst)[j]); + break; + } + } + } + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + for (int j = 0; j < ns; j++) { + if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]); + } + } + + // capitalize and erase capitalized duplications + if (capwords) { + int l = 0; + for (int j=0; j < ns; j++) { + mkinitcap((*slst)[j]); + (*slst)[l] = (*slst)[j]; + for (int k=0; k < l; k++) { + if (strcmp((*slst)[k], (*slst)[j]) == 0) { + free((*slst)[j]); + l--; + } + } + l++; + } + ns = l; + } + + // expand suggestions with dot(s) + if (abbv && pAMgr && pAMgr->get_sugswithdots()) { + for (int j = 0; j < ns; j++) { + (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); + strcat((*slst)[j], word + strlen(word) - abbv); + } + } + + // suggest keepcase + if (pAMgr->get_keepcase()) { + switch (captype) { + case INITCAP: + case ALLCAP: { + int l = 0; + for (int j=0; j < ns; j++) { + if (!spell((*slst)[j])) { + char s[MAXSWUTF8L]; + w_char w[MAXSWL]; + int len; + if (utf8) { + len = u8_u16(w, MAXSWL, (*slst)[j]); + } else { + strcpy(s, (*slst)[j]); + len = strlen(s); + } + int wl = mkallsmall2(s, w, len); + free((*slst)[j]); + if (spell(s)) { + (*slst)[l] = mystrdup(s); + l++; + } else { + int wl = mkinitcap2(s, w, len); + if (spell(s)) { + (*slst)[l] = mystrdup(s); + 
l++; + } + } + } else { + (*slst)[l] = (*slst)[j]; + l++; + } + } + ns = l; + l = 0; + // remove duplications + for (int j=0; j < ns; j++) { + (*slst)[l] = (*slst)[j]; + for (int k=0; k < l; k++) { + if (strcmp((*slst)[k], (*slst)[j]) == 0) { + free((*slst)[j]); + l--; + } + } + l++; + } + ns = l; + } + } + } + + return ns; +} + +// XXX need UTF-8 support +int Hunspell::suggest_auto(char*** slst, const char * word) +{ + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! pSMgr) return 0; + int wl = strlen(word); + if (utf8) { + if (wl >= MAXWORDUTF8LEN) return 0; + } else { + if (wl >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + wl = cleanword(cw, word, &captype, &abbv); + if (wl == 0) return 0; + int ns = 0; + *slst = NULL; // HU, nsug in pSMgr->suggest + + switch(captype) { + case NOCAP: { + ns = pSMgr->suggest_auto(slst, cw, ns); + if (ns>0) break; + break; + } + + case INITCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_auto(slst, wspace, ns); + for (int j=0; j < ns; j++) + mkinitcap((*slst)[j]); + ns = pSMgr->suggest_auto(slst, cw, ns); + break; + + } + + case HUHCAP: { + ns = pSMgr->suggest_auto(slst, cw, ns); + if (ns == 0) { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_auto(slst, wspace, ns); + } + break; + } + + case ALLCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_auto(slst, wspace, ns); + + mkinitcap(wspace); + ns = pSMgr->suggest_auto(slst, wspace, ns); + + for (int j=0; j < ns; j++) + mkallcap((*slst)[j]); + break; + } + } + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + for (int j = 0; j < ns; j++) { + if (utf8) reverseword_utf((*slst)[j]); else reverseword((*slst)[j]); + } + } + + // expand suggestions with dot(s) + if (abbv && pAMgr && pAMgr->get_sugswithdots()) { + for (int j = 0; j < ns; j++) { + (*slst)[j] = (char *) realloc((*slst)[j], strlen((*slst)[j]) + 1 + abbv); + strcat((*slst)[j], 
word + strlen(word) - abbv); + } + } + + // replace '-' with ' ' + if (forbidden_compound == 2) { + for (int j=0; j < ns; j++) { + char * pos = strchr((*slst)[j],'-'); + if (pos) *pos = ' '; + } + } + return ns; +} + +// XXX need UTF-8 support +int Hunspell::stem(char*** slst, const char * word) +{ + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! pSMgr) return 0; + int wl = strlen(word); + if (utf8) { + if (wl >= MAXWORDUTF8LEN) return 0; + } else { + if (wl >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + wl = cleanword(cw, word, &captype, &abbv); + if (wl == 0) return 0; + + int ns = 0; + + *slst = NULL; // HU, nsug in pSMgr->suggest + + switch(captype) { + case HUHCAP: + case NOCAP: { + ns = pSMgr->suggest_stems(slst, cw, ns); + + if ((abbv) && (ns == 0)) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + ns = pSMgr->suggest_stems(slst, wspace, ns); + } + + break; + } + + case INITCAP: { + + ns = pSMgr->suggest_stems(slst, cw, ns); + + if (ns == 0) { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_stems(slst, wspace, ns); + + } + + if ((abbv) && (ns == 0)) { + memcpy(wspace,cw,wl); + mkallsmall(wspace); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + ns = pSMgr->suggest_stems(slst, wspace, ns); + } + + break; + + } + + case ALLCAP: { + ns = pSMgr->suggest_stems(slst, cw, ns); + if (ns != 0) break; + + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_stems(slst, wspace, ns); + + if (ns == 0) { + mkinitcap(wspace); + ns = pSMgr->suggest_stems(slst, wspace, ns); + } + + if ((abbv) && (ns == 0)) { + memcpy(wspace,cw,wl); + mkallsmall(wspace); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + ns = pSMgr->suggest_stems(slst, wspace, ns); + } + + + break; + } + } + + return ns; +} + +int Hunspell::suggest_pos_stems(char*** slst, const char * word) +{ + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! 
pSMgr) return 0; + int wl = strlen(word); + if (utf8) { + if (wl >= MAXWORDUTF8LEN) return 0; + } else { + if (wl >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + wl = cleanword(cw, word, &captype, &abbv); + if (wl == 0) return 0; + + int ns = 0; // ns=0 = normalized input + + *slst = NULL; // HU, nsug in pSMgr->suggest + + switch(captype) { + case HUHCAP: + case NOCAP: { + ns = pSMgr->suggest_pos_stems(slst, cw, ns); + + if ((abbv) && (ns == 0)) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); + } + + break; + } + + case INITCAP: { + + ns = pSMgr->suggest_pos_stems(slst, cw, ns); + + if (ns == 0 || ((*slst)[0][0] == '#')) { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); + } + + break; + + } + + case ALLCAP: { + ns = pSMgr->suggest_pos_stems(slst, cw, ns); + if (ns != 0) break; + + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); + + if (ns == 0) { + mkinitcap(wspace); + ns = pSMgr->suggest_pos_stems(slst, wspace, ns); + } + break; + } + } + + return ns; +} + +char * Hunspell::get_dic_encoding() +{ + return encoding; +} + +const char * Hunspell::get_wordchars() +{ + return pAMgr->get_wordchars(); +} + +unsigned short * Hunspell::get_wordchars_utf16(int * len) +{ + return pAMgr->get_wordchars_utf16(len); +} + +char * Hunspell::get_prevroot() +{ + return prevroot; // XXX not stateless, not for OOo +} + +int Hunspell::get_prevcompound() +{ + return prevcompound; // XXX not stateless, not for OOo +} + +int Hunspell::get_forbidden_compound() +{ + return forbidden_compound; // XXX not stateless, not for OOo +} + +void Hunspell::mkinitcap(char * p) +{ + if (!utf8) { + if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; + } else { + int len; + w_char u[MAXWORDLEN]; + len = u8_u16(u, MAXWORDLEN, p); + unsigned short i = utfconv[(u[0].h << 8) + u[0].l].cupper; + u[0].h = 
(unsigned char) (i >> 8); + u[0].l = (unsigned char) (i & 0x00FF); + u16_u8(p, MAXWORDUTF8LEN, u, len); + } +} + +int Hunspell::mkinitcap2(char * p, w_char * u, int nc) +{ + if (!utf8) { + if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; + } else if (nc > 0) { + unsigned short i = utfconv[(u[0].h << 8) + u[0].l].cupper; + u[0].h = (unsigned char) (i >> 8); + u[0].l = (unsigned char) (i & 0x00FF); + u16_u8(p, MAXWORDUTF8LEN, u, nc); + return strlen(p); + } + return nc; +} + +struct cs_info * Hunspell::get_csconv() +{ + return csconv; +} + +struct unicode_info2 * Hunspell::get_utf_conv() +{ + return utfconv; +} + +int Hunspell::put_word(const char * word) +{ + if (pHMgr) { + return pHMgr->put_word(word, strlen(word), NULL); + } + return 0; +} + +int Hunspell::put_word_suffix(const char * word, const char * suffix) +{ + if (pHMgr) { + return pHMgr->put_word(word, strlen(word), (char *) suffix); + } + return 0; +} + +int Hunspell::put_word_pattern(const char * word, const char * pattern) +{ + if (pHMgr) { + return pHMgr->put_word_pattern(word, strlen(word), pattern); + } + return 0; +} + +const char * Hunspell::get_version() +{ + return pAMgr->get_version(); +} + +// XXX need UTF-8 support +char * Hunspell::morph(const char * word) +{ + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! 
pSMgr) return 0; + int wl = strlen(word); + if (utf8) { + if (wl >= MAXWORDUTF8LEN) return 0; + } else { + if (wl >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + wl = cleanword(cw, word, &captype, &abbv); + if (wl == 0) { + if (abbv) { + for (wl = 0; wl < abbv; wl++) cw[wl] = '.'; + cw[wl] = '\0'; + abbv = 0; + } else return 0; + } + + char result[MAXLNLEN]; + char * st = NULL; + + *result = '\0'; + + int n = 0; + int n2 = 0; + int n3 = 0; + + // test numbers + // LANG_hu section: set dash information for suggestions + if (langnum == LANG_hu) { + while ((n < wl) && + (((cw[n] <= '9') && (cw[n] >= '0')) || (((cw[n] == '.') || (cw[n] == ',')) && (n > 0)))) { + n++; + if ((cw[n] == '.') || (cw[n] == ',')) { + if (((n2 == 0) && (n > 3)) || + ((n2 > 0) && ((cw[n-1] == '.') || (cw[n-1] == ',')))) break; + n2++; + n3 = n; + } + } + + if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return NULL; + if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='°')) && check(cw+n))) { + strcat(result, cw); + result[n - 1] = '\0'; + if (n == wl) { + st = pSMgr->suggest_morph(cw + n - 1); + if (st) { + strcat(result, st); + free(st); + } + } else { + char sign = cw[n]; + cw[n] = '\0'; + st = pSMgr->suggest_morph(cw + n - 1); + if (st) { + strcat(result, st); + free(st); + } + strcat(result, "+"); // XXX SPEC. 
MORPHCODE + cw[n] = sign; + st = pSMgr->suggest_morph(cw + n); + if (st) { + strcat(result, st); + free(st); + } + } + return mystrdup(result); + } + } + // END OF LANG_hu section + + switch(captype) { + case NOCAP: { + st = pSMgr->suggest_morph(cw); + if (st) { + strcat(result, st); + free(st); + } + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + } + break; + } + case INITCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + strcat(result, st); + free(st); + } + st = pSMgr->suggest_morph(cw); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + mkallsmall(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + } + break; + } + case HUHCAP: { + st = pSMgr->suggest_morph(cw); + if (st) { + strcat(result, st); + free(st); + } +#if 0 + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } +#endif + break; + } + case ALLCAP: { + memcpy(wspace,cw,(wl+1)); + st = pSMgr->suggest_morph(wspace); + if (st) { + strcat(result, st); + free(st); + } + mkallsmall(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + if (abbv) { + memcpy(wspace,cw,(wl+1)); + *(wspace+wl) = '.'; + 
*(wspace+wl+1) = '\0'; + if (*result) strcat(result, "\n"); + st = pSMgr->suggest_morph(wspace); + if (st) { + strcat(result, st); + free(st); + } + mkallsmall(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + } + break; + } + } + + if (result && (*result)) { + // word reversing wrapper for complex prefixes + if (complexprefixes) { + if (utf8) reverseword_utf(result); else reverseword(result); + } + return mystrdup(result); + } + + // compound word with dash (HU) I18n + char * dash; + int nresult = 0; + // LANG_hu section: set dash information for suggestions + if ((langnum == LANG_hu) && (dash=(char *) strchr(cw,'-'))) { + *dash='\0'; + // examine 2 sides of the dash + if (dash[1] == '\0') { // base word ending with dash + if (spell(cw)) return pSMgr->suggest_morph(cw); + } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. + if (spell(cw) && (spell("-e"))) { + st = pSMgr->suggest_morph(cw); + if (st) { + strcat(result, st); + free(st); + } + strcat(result,"+"); // XXX spec. separator in MORPHCODE + st = pSMgr->suggest_morph("-e"); + if (st) { + strcat(result, st); + free(st); + } + return mystrdup(result); + } + } else { + // first word ending with dash: word- XXX ??? + char r2 = *(dash + 1); + dash[0]='-'; + dash[1]='\0'; + nresult = spell(cw); + dash[1] = r2; + dash[0]='\0'; + if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) || + ((dash[1] > '0') && (dash[1] < '9')))) { + st = morph(cw); + if (st) { + strcat(result, st); + free(st); + strcat(result,"+"); // XXX spec. 
separator in MORPHCODE + } + st = morph(dash+1); + if (st) { + strcat(result, st); + free(st); + } + return mystrdup(result); + } + } + // affixed number in correct word + if (nresult && (dash > cw) && (((*(dash-1)<='9') && + (*(dash-1)>='0')) || (*(dash-1)=='.'))) { + *dash='-'; + n = 1; + if (*(dash - n) == '.') n++; + // search first not a number character to left from dash + while (((dash - n)>=cw) && ((*(dash - n)=='0') || (n < 3)) && (n < 6)) { + n++; + } + if ((dash - n) < cw) n--; + // numbers: valami1000000-hoz + // examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz, + // 56-hoz, 6-hoz + for(; n >= 1; n--) { + if ((*(dash - n) >= '0') && (*(dash - n) <= '9') && check(dash - n)) { + strcat(result, cw); + result[dash - cw - n] = '\0'; + st = pSMgr->suggest_morph(dash - n); + if (st) { + strcat(result, st); + free(st); + } + return mystrdup(result); + } + } + } + } + return NULL; +} + +// XXX need UTF-8 support +char * Hunspell::morph_with_correction(const char * word) +{ + char cw[MAXWORDUTF8LEN + 4]; + char wspace[MAXWORDUTF8LEN + 4]; + if (! 
pSMgr) return 0; + int wl = strlen(word); + if (utf8) { + if (wl >= MAXWORDUTF8LEN) return 0; + } else { + if (wl >= MAXWORDLEN) return 0; + } + int captype = 0; + int abbv = 0; + wl = cleanword(cw, word, &captype, &abbv); + if (wl == 0) return 0; + + char result[MAXLNLEN]; + char * st = NULL; + + *result = '\0'; + + + switch(captype) { + case NOCAP: { + st = pSMgr->suggest_morph_for_spelling_error(cw); + if (st) { + strcat(result, st); + free(st); + } + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + } + break; + } + case INITCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + strcat(result, st); + free(st); + } + st = pSMgr->suggest_morph_for_spelling_error(cw); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + mkallsmall(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + } + break; + } + case HUHCAP: { + st = pSMgr->suggest_morph_for_spelling_error(cw); + if (st) { + strcat(result, st); + free(st); + } + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + break; + } + case ALLCAP: { + memcpy(wspace,cw,(wl+1)); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + strcat(result, st); + free(st); + } + mkallsmall(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); 
+ if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + if (abbv) { + memcpy(wspace,cw,(wl+1)); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + if (*result) strcat(result, "\n"); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + strcat(result, st); + free(st); + } + mkallsmall(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } + } + break; + } + } + + if (result) return mystrdup(result); + return NULL; +} + +/* analyze word + * return line count + * XXX need a better data structure for morphological analysis */ +int Hunspell::analyze(char ***out, const char *word) { + int n = 0; + if (!word) return 0; + char * m = morph(word); + if(!m) return 0; + if (!out) return line_tok(m, out); + + // without memory allocation + /* BUG missing buffer size checking */ + int i, p; + for(p = 0, i = 0; m[i]; i++) { + if(m[i] == '\n' || !m[i+1]) { + n++; + strncpy((*out)[n++], m + p, i - p + 1); + if (m[i] == '\n') (*out)[n++][i - p] = '\0'; + if(!m[i+1]) break; + p = i + 1; + } + } + free(m); + return n; +} + diff --git a/src/myspell/hunspell.dsp b/src/myspell/hunspell.dsp new file mode 100644 index 0000000..05e072f --- /dev/null +++ b/src/myspell/hunspell.dsp @@ -0,0 +1,164 @@ +# Microsoft Developer Studio Project File - Name="hunspell" - Package Owner=<4> +# Microsoft Developer Studio Generated Build File, Format Version 6.00 +# ** DO NOT EDIT ** + +# TARGTYPE "Win32 (x86) Static Library" 0x0104 + +CFG=hunspell - Win32 Debug +!MESSAGE This is not a valid makefile. 
To build this project using NMAKE, +!MESSAGE use the Export Makefile command and run +!MESSAGE +!MESSAGE NMAKE /f "hunspell.mak". +!MESSAGE +!MESSAGE You can specify a configuration when running NMAKE +!MESSAGE by defining the macro CFG on the command line. For example: +!MESSAGE +!MESSAGE NMAKE /f "hunspell.mak" CFG="hunspell - Win32 Debug" +!MESSAGE +!MESSAGE Possible choices for configuration are: +!MESSAGE +!MESSAGE "hunspell - Win32 Release" (based on "Win32 (x86) Static Library") +!MESSAGE "hunspell - Win32 Debug" (based on "Win32 (x86) Static Library") +!MESSAGE + +# Begin Project +# PROP AllowPerConfigDependencies 0 +# PROP Scc_ProjName "" +# PROP Scc_LocalPath "" +CPP=cl.exe +RSC=rc.exe + +!IF "$(CFG)" == "hunspell - Win32 Release" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 0 +# PROP BASE Output_Dir "Release" +# PROP BASE Intermediate_Dir "Release" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 0 +# PROP Output_Dir "Release" +# PROP Intermediate_Dir "Release" +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /GX /O2 /D "W32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /YX /FD /c +# ADD CPP /nologo /W3 /GX /O2 /D "W32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /YX /FD /c +# ADD BASE RSC /l 0x40e /d "NDEBUG" +# ADD RSC /l 0x40e /d "NDEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LIB32=link.exe -lib +# ADD BASE LIB32 /nologo +# ADD LIB32 /nologo + +!ELSEIF "$(CFG)" == "hunspell - Win32 Debug" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 1 +# PROP BASE Output_Dir "Debug" +# PROP BASE Intermediate_Dir "Debug" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 1 +# PROP Output_Dir "Debug" +# PROP Intermediate_Dir "Debug" +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "W32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /YX /FD /GZ /c +# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /D "W32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /YX /FD /GZ /c +# ADD BASE RSC /l 0x40e /d "_DEBUG" +# ADD 
RSC /l 0x40e /d "_DEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LIB32=link.exe -lib +# ADD BASE LIB32 /nologo +# ADD LIB32 /nologo + +!ENDIF + +# Begin Target + +# Name "hunspell - Win32 Release" +# Name "hunspell - Win32 Debug" +# Begin Group "Source Files" + +# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat" +# Begin Source File + +SOURCE=.\affentry.cxx +# End Source File +# Begin Source File + +SOURCE=.\affixmgr.cxx +# End Source File +# Begin Source File + +SOURCE=.\csutil.cxx +# End Source File +# Begin Source File + +SOURCE=.\dictmgr.cxx +# End Source File +# Begin Source File + +SOURCE=.\hashmgr.cxx +# End Source File +# Begin Source File + +SOURCE=.\hunspell.cxx +# End Source File +# Begin Source File + +SOURCE=.\suggestmgr.cxx +# End Source File +# End Group +# Begin Group "Header Files" + +# PROP Default_Filter "h;hpp;hxx;hm;inl" +# Begin Source File + +SOURCE=.\affentry.hxx +# End Source File +# Begin Source File + +SOURCE=.\affixmgr.hxx +# End Source File +# Begin Source File + +SOURCE=.\atypes.hxx +# End Source File +# Begin Source File + +SOURCE=.\baseaffix.hxx +# End Source File +# Begin Source File + +SOURCE=.\csutil.hxx +# End Source File +# Begin Source File + +SOURCE=.\dictmgr.hxx +# End Source File +# Begin Source File + +SOURCE=.\hashmgr.hxx +# End Source File +# Begin Source File + +SOURCE=.\htypes.hxx +# End Source File +# Begin Source File + +SOURCE=.\langnum.hxx +# End Source File +# Begin Source File + +SOURCE=.\hunspell.hxx +# End Source File +# Begin Source File + +SOURCE=.\suggestmgr.hxx +# End Source File +# End Group +# End Target +# End Project diff --git a/src/myspell/hunspell.hxx b/src/myspell/hunspell.hxx new file mode 100644 index 0000000..5860fa8 --- /dev/null +++ b/src/myspell/hunspell.hxx @@ -0,0 +1,142 @@ +#include "hashmgr.hxx" +#include "affixmgr.hxx" +#include "suggestmgr.hxx" +#include "csutil.hxx" +#include "langnum.hxx" + +#define NOCAP 0 +#define INITCAP 1 +#define ALLCAP 2 +#define 
HUHCAP 3 +#define HUHINITCAP 4 + +#define MAXSUGGESTION 15 +#define MAXSHARPS 5 + +#ifdef W32 +#define DLLTEST2_API __declspec(dllexport) +#endif + +#ifndef _MYSPELLMGR_HXX_ +#define _MYSPELLMGR_HXX_ + +#ifdef W32 +class DLLTEST2_API Hunspell +#else +class Hunspell +#endif +{ + AffixMgr* pAMgr; + HashMgr* pHMgr; + SuggestMgr* pSMgr; + char * encoding; + struct cs_info * csconv; + struct unicode_info2 * utfconv; + int langnum; + int utf8; + int complexprefixes; + char** wordbreak; + +/* XXX not stateless variables for compound handling */ + char * prevroot; + int prevcompound; + +/* forbidden_compound: + * 0 = not forbidden + * 1 = forbidden + * 2 = forbidden compound (written without dash in Hungarian) + */ + int forbidden_compound; + + +public: + + /* Hunspell(aff, dic) - constructor of Hunspell class + * input: path of affix file and dictionary file + */ + + Hunspell(const char * affpath, const char * dpath); + + ~Hunspell(); + + /* spell(word) - spellcheck word + * output: 0 = bad word, not 0 = good word + */ + + int spell(const char *); + + /* suggest(suggestions, word) - search suggestions + * input: pointer to an array of strings pointer and the (bad) word + * array of strings pointer (here *slst) may not be initialized + * output: number of suggestions in string array, and suggestions in + * a newly allocated array of strings (*slts will be NULL when number + * of suggestion equals 0.) 
+ */ + + int suggest(char*** slst, const char * word); + + /* handling custom dictionary */ + + int put_word(const char * word); + + /* suffix is an affix flag string, similarly in dictionary files */ + + int put_word_suffix(const char * word, const char * suffix); + + /* pattern is a sample dictionary word + * put word into custom dictionary with affix flags of pattern word + */ + + int put_word_pattern(const char * word, const char * pattern); + + /* other */ + + char * get_dic_encoding(); + const char * get_wordchars(); + unsigned short * get_wordchars_utf16(int * len); + struct cs_info * get_csconv(); + struct unicode_info2 * get_utf_conv(); + const char * get_version(); + + /* experimental functions */ + + /* morphological analysis */ + + char * morph(const char * word); + int analyze(char*** out, const char *word); + + char * morph_with_correction(const char * word); + + /* stemmer function */ + + int stem(char*** slst, const char * word); + + /* spec. suggestions */ + int suggest_auto(char*** slst, const char * word); + int suggest_pos_stems(char*** slst, const char * word); + char * get_possible_root(); + + /* not threadsafe functions for Hunspell command line API */ + + char * get_prevroot(); + int get_prevcompound(); + int get_forbidden_compound(); + +private: + int cleanword(char *, const char *, int * pcaptype, int * pabbrev); + int cleanword2(char *, const char *, w_char *, int * w_len, int * pcaptype, int * pabbrev); + void mkinitcap(char *); + int mkinitcap2(char * p, w_char * u, int nc); + void mkallcap(char *); + int mkallcap2(char * p, w_char * u, int nc); + void mkallsmall(char *); + int mkallsmall2(char * p, w_char * u, int nc); + struct hentry * check(const char *); + char * sharps_u8_l1(char * dest, char * source); + hentry * spellsharps(char * base, char *, int, int, char * tmp); + int is_keepcase(const hentry * rv); + int insert_sug(char ***slst, char * word, int *ns); + +}; + +#endif diff --git a/src/myspell/myspell.cxx 
b/src/myspell/myspell.cxx deleted file mode 100644 index fcdbaa1..0000000 --- a/src/myspell/myspell.cxx +++ /dev/null @@ -1,302 +0,0 @@ -#include "license.readme" - -#include <cstring> -#include <cstdlib> -#include <cstdio> - -#include "enchant_myspell.hxx" - -#ifndef WINDOWS -using namespace std; -#endif - - -MySpell::MySpell(const char * affpath, const char * dpath) -{ - encoding = NULL; - csconv = NULL; - - /* first set up the hash manager */ - pHMgr = new HashMgr(dpath); - - /* next set up the affix manager */ - /* it needs access to the hash manager lookup methods */ - pAMgr = new AffixMgr(affpath,pHMgr); - - /* get the preferred try string and the dictionary */ - /* encoding from the Affix Manager for that dictionary */ - char * try_string = pAMgr->get_try_string(); - encoding = pAMgr->get_encoding(); - csconv = get_current_cs(encoding); - - /* and finally set up the suggestion manager */ - maxSug = 15; - pSMgr = new SuggestMgr(try_string, maxSug, pAMgr); - if (try_string) free(try_string); -} - - -MySpell::~MySpell() -{ - if (pSMgr) delete pSMgr; - if (pAMgr) delete pAMgr; - if (pHMgr) delete pHMgr; - pSMgr = NULL; - pAMgr = NULL; - pHMgr = NULL; - csconv= NULL; - if (encoding) free(encoding); - encoding = NULL; -} - - -// make a copy of src at destination while removing all leading -// blanks and removing any trailing periods after recording -// their presence with the abbreviation flag -// also since already going through character by character, -// set the capitalization type -// return the length of the "cleaned" word - -int MySpell::cleanword(char * dest, const char * src, int * pcaptype, int * pabbrev) -{ - - // with the new breakiterator code this should not be needed anymore - const char * special_chars = "._#$%&()* +,-/:;<=>[]\\^`{|}~\t \x0a\x0d\x01\'\""; - - unsigned char * p = (unsigned char *) dest; - const unsigned char * q = (const unsigned char * ) src; - - // first skip over any leading special characters - while ((*q != '\0') && 
(strchr(special_chars,(int)(*q)))) q++; - - // now strip off any trailing special characters - // if a period comes after a normal char record its presence - *pabbrev = 0; - int nl = strlen((const char *)q); - while ((nl > 0) && (strchr(special_chars,(int)(*(q+nl-1))))) { - nl--; - } - if ( *(q+nl) == '.' ) *pabbrev = 1; - - // if no characters are left it can't be an abbreviation and can't be capitalized - if (nl <= 0) { - *pcaptype = NOCAP; - *pabbrev = 0; - *p = '\0'; - return 0; - } - - // now determine the capitalization type of the first nl letters - int ncap = 0; - int nneutral = 0; - int nc = 0; - while (nl > 0) { - nc++; - if (csconv[(*q)].ccase) ncap++; - if (csconv[(*q)].cupper == csconv[(*q)].clower) nneutral++; - *p++ = *q++; - nl--; - } - // remember to terminate the destination string - *p = '\0'; - - // now finally set the captype - if (ncap == 0) { - *pcaptype = NOCAP; - } else if ((ncap == 1) && csconv[(unsigned char)(*dest)].ccase) { - *pcaptype = INITCAP; - } else if ((ncap == nc) || ((ncap + nneutral) == nc)){ - *pcaptype = ALLCAP; - } else { - *pcaptype = HUHCAP; - } - return nc; -} - - -int MySpell::spell(const char * word) -{ - char * rv=NULL; - char cw[MAXWORDLEN+1]; - char wspace[MAXWORDLEN+1]; - - int wl = strlen(word); - if (wl > (MAXWORDLEN - 1)) return 0; - int captype = 0; - int abbv = 0; - wl = cleanword(cw, word, &captype, &abbv); - if (wl == 0) return 1; - - switch(captype) { - case HUHCAP: - case NOCAP: { - rv = check(cw); - if ((abbv) && !(rv)) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = check(wspace); - } - break; - } - - case ALLCAP: { - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace, csconv); - rv = check(wspace); - if (!rv) { - mkinitcap(wspace, csconv); - rv = check(wspace); - } - if (!rv) rv = check(cw); - if ((abbv) && !(rv)) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = check(wspace); - } - break; - } - case INITCAP: { - memcpy(wspace,cw,(wl+1)); - 
mkallsmall(wspace, csconv); - rv = check(wspace); - if (!rv) rv = check(cw); - if ((abbv) && !(rv)) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - rv = check(wspace); - } - break; - } - } - if (rv) return 1; - return 0; -} - - -char * MySpell::check(const char * word) -{ - struct hentry * he = NULL; - if (pHMgr) - he = pHMgr->lookup (word); - - if ((he == NULL) && (pAMgr)) { - // try stripping off affixes */ - he = pAMgr->affix_check(word, strlen(word)); - - // try check compound word - if ((he == NULL) && (pAMgr->get_compound())) { - he = pAMgr->compound_check(word, strlen(word), (pAMgr->get_compound())[0]); - } - - } - - if (he) return he->word; - return NULL; -} - - - -int MySpell::suggest(char*** slst, const char * word) -{ - char cw[MAXWORDLEN+1]; - char wspace[MAXWORDLEN+1]; - if (! pSMgr) return 0; - int wl = strlen(word); - if (wl > (MAXWORDLEN-1)) return 0; - int captype = 0; - int abbv = 0; - wl = cleanword(cw, word, &captype, &abbv); - if (wl == 0) return 0; - - int ns = 0; - char ** wlst = (char **) calloc(maxSug, sizeof(char *)); - if (wlst == NULL) return 0; - - switch(captype) { - case NOCAP: { - ns = pSMgr->suggest(wlst, ns, cw); - break; - } - - case INITCAP: { - - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace, csconv); - ns = pSMgr->suggest(wlst, ns, wspace); - if (ns > 0) { - for (int j=0; j < ns; j++) - mkinitcap(wlst[j], csconv); - } - ns = pSMgr->suggest(wlst,ns,cw); - break; - } - - case HUHCAP: { - ns = pSMgr->suggest(wlst, ns, cw); - if (ns != -1) { - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace, csconv); - ns = pSMgr->suggest(wlst, ns, wspace); - } - break; - } - - case ALLCAP: { - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace, csconv); - ns = pSMgr->suggest(wlst, ns, wspace); - if (ns > 0) { - for (int j=0; j < ns; j++) - mkallcap(wlst[j], csconv); - } - if (ns != -1) - ns = pSMgr->suggest(wlst, ns , cw); - break; - } - } - if (ns > 0) { - *slst = wlst; - return ns; - } - // try ngram approach since found 
nothing - if (ns == 0) { - ns = pSMgr->ngsuggest(wlst, cw, pHMgr); - if (ns) { - switch(captype) { - case NOCAP: break; - case HUHCAP: break; - case INITCAP: { - for (int j=0; j < ns; j++) - mkinitcap(wlst[j], csconv); - } - break; - - case ALLCAP: { - for (int j=0; j < ns; j++) - mkallcap(wlst[j], csconv); - } - break; - } - *slst = wlst; - return ns; - } - } - if (ns < 0) { - // we ran out of memory - we should free up as much as possible - for (int i=0;i<maxSug; i++) - if (wlst[i] != NULL) free(wlst[i]); - } - if (wlst) free(wlst); - *slst = NULL; - return 0; -} - - -char * MySpell::get_dic_encoding() -{ - return encoding; -} - diff --git a/src/myspell/myspell_checker.cpp b/src/myspell/myspell_checker.cpp index f84358a..01e8845 100644 --- a/src/myspell/myspell_checker.cpp +++ b/src/myspell/myspell_checker.cpp @@ -38,11 +38,8 @@ #include "enchant.h" #include "enchant-provider.h" -#ifdef WITH_SYSTEM_MYSPELL -#include <myspell.hxx> -#else -#include "enchant_myspell.hxx" -#endif +/* built against hunspell 1.1.3 on January 13, 2006 */ +#include "hunspell.hxx" ENCHANT_PLUGIN_DECLARE("Myspell") @@ -66,7 +63,7 @@ public: private: GIConv m_translate_in; /* Selected translation from/to Unicode */ GIConv m_translate_out; - MySpell *myspell; + Hunspell *myspell; }; /***************************************************************************/ @@ -257,7 +254,7 @@ MySpellChecker::requestDictionary(const char *szLang) aff = g_strdup(dic); int len_dic = strlen(dic); strcpy(aff+len_dic-3, "aff"); - myspell = new MySpell(aff, dic); + myspell = new Hunspell(aff, dic); g_free(dic); g_free(aff); char *enc = myspell->get_dic_encoding(); diff --git a/src/myspell/suggestmgr.cxx b/src/myspell/suggestmgr.cxx index 4e9c051..fe451cc 100644 --- a/src/myspell/suggestmgr.cxx +++ b/src/myspell/suggestmgr.cxx @@ -1,4 +1,5 @@ -#include "license.readme" +#include "license.hunspell" +#include "license.myspell" #include <cstdlib> #include <cctype> @@ -7,12 +8,10 @@ #include "suggestmgr.hxx" -#ifndef 
WINDOWS +#ifndef W32 using namespace std; #endif -extern char * mystrdup(const char *); - SuggestMgr::SuggestMgr(const char * tryme, int maxn, AffixMgr * aptr) @@ -21,13 +20,41 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn, // register affix manager and check in string of chars to // try when building candidate suggestions pAMgr = aptr; - ctry = mystrdup(tryme); + ctryl = 0; - if (ctry) - ctryl = strlen(ctry); + ctry = NULL; + ctry_utf = NULL; + maxSug = maxn; - nosplitsugs=(0==1); - if (pAMgr) pAMgr->get_nosplitsugs(); + nosplitsugs = 0; + maxngramsugs = MAXNGRAMSUGS; + + utf8 = 0; + utfconv = NULL; + complexprefixes = 0; + + if (pAMgr) { + char * enc = pAMgr->get_encoding(); + csconv = get_current_cs(enc); + free(enc); + nosplitsugs = pAMgr->get_nosplitsugs(); + if (pAMgr->get_maxngramsugs() >= 0) maxngramsugs = pAMgr->get_maxngramsugs(); + utf8 = pAMgr->get_utf8(); + utfconv = pAMgr->get_utf_conv(); + complexprefixes = pAMgr->get_complexprefixes(); + } + + if (tryme) { + if (utf8) { + w_char t[MAXSWL]; + ctryl = u8_u16(t, MAXSWL, tryme); + ctry_utf = (w_char *) malloc(ctryl * sizeof(w_char)); + memcpy(ctry_utf, t, ctryl * sizeof(w_char)); + } else { + ctry = mystrdup(tryme); + ctryl = strlen(ctry); + } + } } @@ -36,6 +63,8 @@ SuggestMgr::~SuggestMgr() pAMgr = NULL; if (ctry) free(ctry); ctry = NULL; + if (ctry_utf) free(ctry_utf); + ctry_utf = NULL; ctryl = 0; maxSug = 0; } @@ -45,67 +74,182 @@ SuggestMgr::~SuggestMgr() // generate suggestions for a mispelled word // pass in address of array of char * pointers -int SuggestMgr::suggest(char** wlst, int ns, const char * word) +int SuggestMgr::suggest(char*** slst, const char * w, int nsug) { + int nocompoundtwowords = 0; + char ** wlst; + w_char word_utf[MAXSWL]; + int wl; + + char w2[MAXWORDUTF8LEN]; + const char * word = w; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + strcpy(w2, w); + if (utf8) reverseword_utf(w2); else reverseword(w2); + word = w2; + } - int nsug = ns; 
+ if (*slst) { + wlst = *slst; + } else { + wlst = (char **) malloc(maxSug * sizeof(char *)); + if (wlst == NULL) return -1; + for (int i = 0; i < maxSug; i++) wlst[i] = NULL; + } + + if (utf8) { + wl = u8_u16(word_utf, MAXSWL, word); + } - // perhaps we made chose the wrong char from a related set - if ((nsug < maxSug) && (nsug > -1)) - nsug = mapchars(wlst, word, nsug); + for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) { // perhaps we made a typical fault of spelling if ((nsug < maxSug) && (nsug > -1)) - nsug = replchars(wlst, word, nsug); + nsug = replchars(wlst, word, nsug, cpdsuggest); - // did we forget to add a char + // perhaps we made chose the wrong char from a related set if ((nsug < maxSug) && (nsug > -1)) - nsug = forgotchar(wlst, word, nsug); + nsug = mapchars(wlst, word, nsug, cpdsuggest); // did we swap the order of chars by mistake - if ((nsug < maxSug) && (nsug > -1)) - nsug = swapchar(wlst, word, nsug); + if ((nsug < maxSug) && (nsug > -1)) { + nsug = (utf8) ? swapchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : + swapchar(wlst, word, nsug, cpdsuggest); + } + + // did we forget to add a char + if ((nsug < maxSug) && (nsug > -1)) { + nsug = (utf8) ? forgotchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : + forgotchar(wlst, word, nsug, cpdsuggest); + } // did we add a char that should not be there - if ((nsug < maxSug) && (nsug > -1)) - nsug = extrachar(wlst, word, nsug); - + if ((nsug < maxSug) && (nsug > -1)) { + nsug = (utf8) ? extrachar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : + extrachar(wlst, word, nsug, cpdsuggest); + } + // did we just hit the wrong key in place of a good char - if ((nsug < maxSug) && (nsug > -1)) - nsug = badchar(wlst, word, nsug); + if ((nsug < maxSug) && (nsug > -1)) { + nsug = (utf8) ? 
badchar_utf(wlst, word_utf, wl, nsug, cpdsuggest) : + badchar(wlst, word, nsug, cpdsuggest); + } + + // only suggest compound words when no other suggestion + if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; // perhaps we forgot to hit space and two words ran together - if (!nosplitsugs) { - if ((nsug < maxSug) && (nsug > -1)) - nsug = twowords(wlst, word, nsug); + if ((!nosplitsugs) && (nsug < maxSug) && (nsug > -1)) { + nsug = twowords(wlst, word, nsug, cpdsuggest); + } + + } // repeating ``for'' statement compounding support + + if (nsug < 0) { + // we ran out of memory - we should free up as much as possible + for (int i = 0; i < maxSug; i++) + if (wlst[i] != NULL) free(wlst[i]); + free(wlst); + wlst = NULL; } + + *slst = wlst; return nsug; } +// generate suggestions for a word with typical mistake +// pass in address of array of char * pointers + +int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug) +{ + int nocompoundtwowords = 0; + char ** wlst; + + char w2[MAXWORDUTF8LEN]; + const char * word = w; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + strcpy(w2, w); + if (utf8) reverseword_utf(w2); else reverseword(w2); + word = w2; + } + + if (*slst) { + wlst = *slst; + } else { + wlst = (char **) malloc(maxSug * sizeof(char *)); + if (wlst == NULL) return -1; + } + + for (int cpdsuggest=0; (cpdsuggest<2) && (nocompoundtwowords==0); cpdsuggest++) { + + // perhaps we made a typical fault of spelling + if ((nsug < maxSug) && (nsug > -1)) + nsug = replchars(wlst, word, nsug, cpdsuggest); + + // perhaps we made chose the wrong char from a related set + if ((nsug < maxSug) && (nsug > -1) && (cpdsuggest == 0)) + nsug = mapchars(wlst, word, nsug, cpdsuggest); + + if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; + + // perhaps we forgot to hit space and two words ran together + + if ((nsug < maxSug) && (nsug > -1) && check_forbidden(word, strlen(word))) { + nsug = twowords(wlst, word, nsug, cpdsuggest); + } + + } 
// repeating ``for'' statement compounding support + + if (nsug < 0) { + for (int i=0;i<maxSug; i++) + if (wlst[i] != NULL) free(wlst[i]); + free(wlst); + return -1; + } + + *slst = wlst; + return nsug; +} // suggestions for when chose the wrong char out of a related set -int SuggestMgr::mapchars(char** wlst, const char * word, int ns) +int SuggestMgr::mapchars(char** wlst, const char * word, int ns, int cpdsuggest) { + time_t timelimit; + int timer; + int wl = strlen(word); if (wl < 2 || ! pAMgr) return ns; int nummap = pAMgr->get_nummap(); struct mapentry* maptable = pAMgr->get_maptable(); if (maptable==NULL) return ns; - ns = map_related(word, 0, wlst, ns, maptable, nummap); + + timelimit = time(NULL); + timer = MINTIMER; + if (utf8) { + w_char w[MAXSWL]; + int len = u8_u16(w, MAXSWL, word); + ns = map_related_utf(w, len, 0, wlst, ns, maptable, nummap, &timer, &timelimit); + } else ns = map_related(word, 0, wlst, ns, maptable, nummap, &timer, &timelimit); return ns; } - -int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const mapentry* maptable, int nummap) +int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, + const mapentry* maptable, int nummap, int * timer, time_t * timelimit) { - char c = *(word + i); + char c = *(word + i); if (c == 0) { int cwrd = 1; + int wl; for (int m=0; m < ns; m++) if (strcmp(word,wlst[m]) == 0) cwrd = 0; - if ((cwrd) && check(word,strlen(word))) { + if ((cwrd) && (wl = strlen(word)) && (check(word, wl, 0, timer, timelimit) || + check(word, wl, 1, timer, timelimit))) { if (ns < maxSug) { wlst[ns] = mystrdup(word); if (wlst[ns] == NULL) return -1; @@ -121,14 +265,55 @@ int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const char * newword = mystrdup(word); for (int k = 0; k < maptable[j].len; k++) { *(newword + i) = *(maptable[j].set + k); - ns = map_related(newword, (i+1), wlst, ns, maptable, nummap); + ns = map_related(newword, (i+1), wlst, ns, maptable, 
nummap, timer, timelimit); + if (!(*timelimit)) return ns; } free(newword); } } if (!in_map) { i++; - ns = map_related(word, i, wlst, ns, maptable, nummap); + ns = map_related(word, i, wlst, ns, maptable, nummap, timer, timelimit); + } + return ns; +} + +int SuggestMgr::map_related_utf(w_char * word, int len, int i, char** wlst, int ns, + const mapentry* maptable, int nummap, int * timer, time_t * timelimit) +{ + if (i == len) { + int cwrd = 1; + int wl; + char s[MAXSWUTF8L]; + u16_u8(s, MAXSWUTF8L, word, len); + for (int m=0; m < ns; m++) + if (strcmp(s,wlst[m]) == 0) cwrd = 0; + if ((cwrd) && (wl = strlen(s)) && (check(s, wl, 0, timer, timelimit) || + check(s, wl, 1, timer, timelimit))) { + if (ns < maxSug) { + wlst[ns] = mystrdup(s); + if (wlst[ns] == NULL) return -1; + ns++; + } + } + return ns; + } + int in_map = 0; + unsigned short c = *((unsigned short *) word + i); + for (int j = 0; j < nummap; j++) { + if (flag_bsearch((unsigned short *) maptable[j].set_utf16, c, maptable[j].len)) { + in_map = 1; + for (int k = 0; k < maptable[j].len; k++) { + *(word + i) = *(maptable[j].set_utf16 + k); + ns = map_related_utf(word, len, i + 1, wlst, ns, maptable, nummap, timer, timelimit); + if (!(*timelimit)) return ns; + } + *((unsigned short *) word + i) = c; + } + } + if (!in_map) { + i++; + ns = map_related_utf(word, len, i, wlst, ns, maptable, nummap, timer, timelimit); } return ns; } @@ -137,9 +322,9 @@ int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const // suggestions for a typical fault of spelling, that // differs with more, than 1 letter from the right form. 
-int SuggestMgr::replchars(char** wlst, const char * word, int ns) +int SuggestMgr::replchars(char** wlst, const char * word, int ns, int cpdsuggest) { - char candidate[MAXSWL]; + char candidate[MAXSWUTF8L]; const char * r; int lenr, lenp; int cwrd; @@ -153,21 +338,24 @@ int SuggestMgr::replchars(char** wlst, const char * word, int ns) for (int i=0; i < numrep; i++ ) { r = word; - lenr = strlen(reptable[i].replacement); + lenr = strlen(reptable[i].pattern2); lenp = strlen(reptable[i].pattern); // search every occurence of the pattern in the word while ((r=strstr(r, reptable[i].pattern)) != NULL) { strcpy(candidate, word); - if (r-word + lenr + strlen(r+lenp) >= MAXSWL) break; - strcpy(candidate+(r-word),reptable[i].replacement); + if (r-word + lenr + strlen(r+lenp) >= MAXSWUTF8L) break; + strcpy(candidate+(r-word),reptable[i].pattern2); strcpy(candidate+(r-word)+lenr, r+lenp); cwrd = 1; for (int k=0; k < ns; k++) if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; - if ((cwrd) && check(candidate,strlen(candidate))) { + if ((cwrd) && check(candidate,strlen(candidate), cpdsuggest, NULL, NULL)) { if (ns < maxSug) { wlst[ns] = mystrdup(candidate); - if (wlst[ns] == NULL) return -1; + if (wlst[ns] == NULL) { + for (int j=0; j<ns; j++) free(wlst[j]); + return -1; + } ns++; } else return ns; } @@ -177,16 +365,56 @@ int SuggestMgr::replchars(char** wlst, const char * word, int ns) return ns; } +// perhaps we made a special pattern mistake +// for example: vacation -> vacacation (doubled `ac') +int SuggestMgr::doubledsyllable(char** wlst, const char * word, int ns, int cpdsuggest) +{ + char candidate[MAXSWUTF8L]; + int state=0; + int cwrd; + + int wl = strlen(word); + if (wl < 5 || ! 
pAMgr) return ns; + + for (int i=2; i < wl; i++ ) { + if (word[i]==word[i-2]) { + state++; + if (state==3) { + strcpy(candidate,word); + strcpy(candidate+i-1,word+i+1); + cwrd = 1; + for (int k=0; k < ns; k++) + if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; + if ((cwrd) && check(candidate,strlen(candidate), cpdsuggest, NULL, NULL)) { + if (ns < maxSug) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) { + for (int j=0; j<ns; j++) free(wlst[j]); + return -1; + } + ns++; + } else return ns; + } + state=0; + } + } else { + state=0; + } + } + return ns; +} // error is wrong char in place of correct one -int SuggestMgr::badchar(char ** wlst, const char * word, int ns) +int SuggestMgr::badchar(char ** wlst, const char * word, int ns, int cpdsuggest) { char tmpc; - char candidate[MAXSWL]; + char candidate[MAXSWUTF8L]; + time_t timelimit = time(NULL); + int timer = MINTIMER; int wl = strlen(word); int cwrd; - strcpy (candidate, word); + strcpy(candidate, word); // swap out each char one by one and try all the tryme // chars in its place to see if that makes a good word @@ -198,24 +426,92 @@ int SuggestMgr::badchar(char ** wlst, const char * word, int ns) cwrd = 1; for (int k=0; k < ns; k++) if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; - if ((cwrd) && check(candidate,wl)) { + if ((cwrd) && check(candidate,wl, cpdsuggest, &timer, &timelimit)) { if (ns < maxSug) { wlst[ns] = mystrdup(candidate); if (wlst[ns] == NULL) return -1; ns++; } else return ns; } + if (!timelimit) return ns; candidate[i] = tmpc; } } return ns; } +// error is wrong char in place of correct one +int SuggestMgr::badchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) +{ + w_char tmpc; + w_char candidate_utf[MAXSWL]; + char candidate[MAXSWUTF8L]; + int cwrd; + time_t timelimit = time(NULL); + int timer = MINTIMER; + + memcpy(candidate_utf, word, wl * sizeof(w_char)); + + // swap out each char one by one and try all the tryme + // chars in its place to see if that makes a 
good word + for (int i=0; i < wl; i++) { + tmpc = candidate_utf[i]; + for (int j=0; j < ctryl; j++) { + if ((ctry_utf[j].l == tmpc.l) && (ctry_utf[j].h == tmpc.h)) continue; + candidate_utf[i] = ctry_utf[j]; + cwrd = 1; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + for (int k=0; k < ns; k++) + if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; + if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, &timer, &timelimit)) { + if (ns < maxSug) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) return -1; + ns++; + } else return ns; + } + if (!timelimit) return ns; + candidate_utf[i] = tmpc; + } + } + return ns; +} + +// error is word has an extra letter it does not need +int SuggestMgr::extrachar_utf(char** wlst, const w_char * word, int wl, int ns, int cpdsuggest) +{ + char candidate[MAXSWUTF8L]; + w_char candidate_utf[MAXSWL]; + + const w_char * p; + w_char * r; + int cwrd; + + if (wl < 2) return ns; + + // try omitting one char of word at a time + memcpy(candidate_utf, word + 1, (wl - 1) * sizeof(w_char)); + for (p = word, r = candidate_utf; p < word + wl; ) { + cwrd = 1; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl - 1); + for (int k=0; k < ns; k++) + if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; + if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) { + if (ns < maxSug) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) return -1; + ns++; + } else return ns; + } + *r++ = *p++; + } + return ns; +} // error is word has an extra letter it does not need -int SuggestMgr::extrachar(char** wlst, const char * word, int ns) +int SuggestMgr::extrachar(char** wlst, const char * word, int ns, int cpdsuggest) { - char candidate[MAXSWL]; + char candidate[MAXSWUTF8L]; const char * p; char * r; int cwrd; @@ -229,7 +525,7 @@ int SuggestMgr::extrachar(char** wlst, const char * word, int ns) cwrd = 1; for (int k=0; k < ns; k++) if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; - if ((cwrd) && check(candidate,wl-1)) { + if ((cwrd) 
&& check(candidate,wl-1, cpdsuggest, NULL, NULL)) { if (ns < maxSug) { wlst[ns] = mystrdup(candidate); if (wlst[ns] == NULL) return -1; @@ -242,13 +538,15 @@ int SuggestMgr::extrachar(char** wlst, const char * word, int ns) } -// error is mising a letter it needs -int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns) +// error is missing a letter it needs +int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns, int cpdsuggest) { - char candidate[MAXSWL]; + char candidate[MAXSWUTF8L]; const char * p; char * q; int cwrd; + time_t timelimit = time(NULL); + int timer = MINTIMER; int wl = strlen(word); @@ -260,13 +558,14 @@ int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns) cwrd = 1; for (int k=0; k < ns; k++) if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; - if ((cwrd) && check(candidate,wl+1)) { + if ((cwrd) && check(candidate, wl+1, cpdsuggest, &timer, &timelimit)) { if (ns < maxSug) { wlst[ns] = mystrdup(candidate); if (wlst[ns] == NULL) return -1; ns++; } else return ns; } + if (!timelimit) return ns; } *q++ = *p++; } @@ -277,7 +576,57 @@ int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns) cwrd = 1; for (int k=0; k < ns; k++) if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; - if ((cwrd) && check(candidate,wl+1)) { + if ((cwrd) && check(candidate,wl+1, cpdsuggest, NULL, NULL)) { + if (ns < maxSug) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) return -1; + ns++; + } else return ns; + } + } + return ns; +} + +// error is missing a letter it needs +int SuggestMgr::forgotchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) +{ + w_char candidate_utf[MAXSWL]; + char candidate[MAXSWUTF8L]; + const w_char * p; + w_char * q; + int cwrd; + time_t timelimit = time(NULL); + int timer = MINTIMER; + + // try inserting a tryme character before every letter + memcpy (candidate_utf + 1, word, wl * sizeof(w_char)); + for (p = word, q = candidate_utf; p < (word + wl); ) { + for (int i = 0; i < 
ctryl; i++) { + *q = ctry_utf[i]; + cwrd = 1; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1); + for (int k=0; k < ns; k++) + if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; + if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, &timer, &timelimit)) { + if (ns < maxSug) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) return -1; + ns++; + } else return ns; + } + if (!timelimit) return ns; + } + *q++ = *p++; + } + + // now try adding one to end */ + for (int i = 0; i < ctryl; i++) { + *q = ctry_utf[i]; + cwrd = 1; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl + 1); + for (int k=0; k < ns; k++) + if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; + if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) { if (ns < maxSug) { wlst[ns] = mystrdup(candidate); if (wlst[ns] == NULL) return -1; @@ -290,27 +639,51 @@ int SuggestMgr::forgotchar(char ** wlst, const char * word, int ns) /* error is should have been two words */ -int SuggestMgr::twowords(char ** wlst, const char * word, int ns) +int SuggestMgr::twowords(char ** wlst, const char * word, int ns, int cpdsuggest) { - char candidate[MAXSWL]; + char candidate[MAXSWUTF8L]; char * p; + int c1, c2, cwrd; + int forbidden = 0; int wl=strlen(word); - if (wl < 3) return ns; + if (wl < 4) return ns; + + if (pAMgr->get_langnum() == LANG_hu) forbidden = check_forbidden(word, wl); + strcpy(candidate + 1, word); + candidate[0] = word[0]; // split the string into two pieces after every char // if both pieces are good words make them a suggestion - for (p = candidate + 1; p[1] != '\0'; p++) { + for (p = candidate + 2; p[2] != '\0'; p++) { p[-1] = *p; + // go to end of the UTF-8 character + while (utf8 && ((p[1] & 0xc0) == 0x80)) { + p++; + p[-1] = *p; + } *p = '\0'; - if (check(candidate,strlen(candidate))) { - if (check((p+1),strlen(p+1))) { - *p = ' '; + if ((c1=check(candidate,strlen(candidate), cpdsuggest, NULL, NULL))) { + if ((c2=check((p+1),strlen(p+1), cpdsuggest, NULL, NULL))) { + 
*p = ' '; + + // spec. Hungarian code (need a better compound word support) + if ((pAMgr->get_langnum() == LANG_hu) && !forbidden && + // if 3 repeating letter, use - instead of space + (((p[-1] == p[1]) && (((p>candidate+1) && (p[-1] == p[-2])) || (p[-1] == p[2]))) || + // or multiple compounding, with more, than 6 syllables + ((c1 == 3) && (c2 >= 2)))) *p = '-'; + + cwrd = 1; + for (int k=0; k < ns; k++) + if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; if (ns < maxSug) { - wlst[ns] = mystrdup(candidate); - if (wlst[ns] == NULL) return -1; - ns++; + if (cwrd) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) return -1; + ns++; + } } else return ns; } } @@ -320,14 +693,14 @@ int SuggestMgr::twowords(char ** wlst, const char * word, int ns) // error is adjacent letter were swapped -int SuggestMgr::swapchar(char ** wlst, const char * word, int ns) +int SuggestMgr::swapchar(char ** wlst, const char * word, int ns, int cpdsuggest) { - char candidate[MAXSWL]; + char candidate[MAXSWUTF8L]; char * p; char tmpc; int cwrd; - int wl = strlen(word); + int wl=strlen(word); // try swapping adjacent chars one by one strcpy(candidate, word); @@ -338,7 +711,7 @@ int SuggestMgr::swapchar(char ** wlst, const char * word, int ns) cwrd = 1; for (int k=0; k < ns; k++) if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; - if ((cwrd) && check(candidate,wl)) { + if ((cwrd) && check(candidate,wl, cpdsuggest, NULL, NULL)) { if (ns < maxSug) { wlst[ns] = mystrdup(candidate); if (wlst[ns] == NULL) return -1; @@ -352,9 +725,41 @@ int SuggestMgr::swapchar(char ** wlst, const char * word, int ns) return ns; } +// error is adjacent letter were swapped +int SuggestMgr::swapchar_utf(char ** wlst, const w_char * word, int wl, int ns, int cpdsuggest) +{ + w_char candidate_utf[MAXSWL]; + char candidate[MAXSWUTF8L]; + w_char * p; + w_char tmpc; + int cwrd; + + // try swapping adjacent chars one by one + memcpy (candidate_utf, word, wl * sizeof(w_char)); + for (p = candidate_utf; p < (candidate_utf + 
wl - 1); p++) { + tmpc = *p; + *p = p[1]; + p[1] = tmpc; + cwrd = 1; + u16_u8(candidate, MAXSWUTF8L, candidate_utf, wl); + for (int k=0; k < ns; k++) + if (strcmp(candidate,wlst[k]) == 0) cwrd = 0; + if ((cwrd) && check(candidate, strlen(candidate), cpdsuggest, NULL, NULL)) { + if (ns < maxSug) { + wlst[ns] = mystrdup(candidate); + if (wlst[ns] == NULL) return -1; + ns++; + } else return ns; + } + tmpc = *p; + *p = p[1]; + p[1] = tmpc; + } + return ns; +} // generate a set of suggestions for very poorly spelled words -int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr) +int SuggestMgr::ngsuggest(char** wlst, char * w, HashMgr* pHMgr) { int i, j; @@ -374,14 +779,32 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr) } lp = MAX_ROOTS - 1; - int n = strlen(word); + char w2[MAXWORDUTF8LEN]; + char * word = w; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + strcpy(w2, w); + if (utf8) reverseword_utf(w2); else reverseword(w2); + word = w2; + } + + char mw[MAXSWUTF8L]; + w_char u8[MAXSWL]; + int nc = strlen(word); + int n = (utf8) ? 
u8_u16(u8, MAXSWL, word) : nc; struct hentry* hp = NULL; int col = -1; while ((hp = pHMgr->walk_hashtable(col, hp))) { + // check forbidden words + if ((hp->astr) && (pAMgr) && + (TESTAFF(hp->astr, pAMgr->get_forbiddenword(), hp->alen) || + TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) || + TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue; sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE); if (sc > scores[lp]) { - scores[lp] = sc; + scores[lp] = sc; roots[lp] = hp; int lval = sc; for (j=0; j < MAX_ROOTS; j++) @@ -396,14 +819,17 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr) // mangle original word three differnt ways // and score them to generate a minimum acceptable score int thresh = 0; - char * mw = NULL; for (int sp = 1; sp < 4; sp++) { - mw = mystrdup(word); - for (int k=sp; k < n; k+=4) *(mw + k) = '*'; - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); - free(mw); + if (utf8) { + for (int k=sp; k < n; k+=4) *((unsigned short *) u8 + k) = '*'; + u16_u8(mw, MAXSWUTF8L, u8, n); + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); + } else { + strcpy(mw, word); + for (int k=sp; k < n; k+=4) *(mw + k) = '*'; + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); + } } - mw = NULL; thresh = thresh / 3; thresh--; @@ -428,99 +854,722 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr) if (roots[i]) { struct hentry * rp = roots[i]; int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen, - rp->astr, rp->alen); - for (int k = 0; k < nw; k++) { + rp->astr, rp->alen, word, nc); + + for (int k = 0; k < nw ; k++) { sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH); - if (sc > thresh) - { - if (sc > gscore[lp]) - { - if (guess[lp]) free(guess[lp]); - gscore[lp] = sc; - guess[lp] = glst[k].word; - glst[k].word = NULL; - lval = sc; - for (j=0; j < MAX_GUESS; j++) - { - if (gscore[j] < lval) - { - lp = j; - lval = gscore[j]; - } - } - } - } - free (glst[k].word); - 
glst[k].word = NULL; - glst[k].allow = 0; + if ((sc > thresh)) { + if (sc > gscore[lp]) { + if (guess[lp]) free (guess[lp]); + gscore[lp] = sc; + guess[lp] = glst[k].word; + lval = sc; + for (j=0; j < MAX_GUESS; j++) + if (gscore[j] < lval) { + lp = j; + lval = gscore[j]; + } + } else free (glst[k].word); + } else free(glst[k].word); } } } - if (glst) free(glst); + free(glst); // now we are done generating guesses - // sort in order of decreasing score and copy over + // sort in order of decreasing score bubblesort(&guess[0], &gscore[0], MAX_GUESS); + + // weight suggestions with a similarity index, based on + // the longest common subsequent algorithm and resort + + int is_swap; + for (i=0; i < MAX_GUESS; i++) { + if (guess[i]) { + // lowering guess[i] + char gl[MAXSWUTF8L]; + int len; + if (utf8) { + w_char w[MAXSWL]; + len = u8_u16(w, MAXSWL, guess[i]); + mkallsmall_utf(w, len, utfconv); + u16_u8(gl, MAXSWUTF8L, w, len); + } else { + strcpy(gl, guess[i]); + mkallsmall(gl, csconv); + len = strlen(guess[i]); + } + + int lcs = lcslen(word, gl); + + // same characters with different casing + if ((n == len) && (n == lcs)) { + gscore[i] += 2000; + break; + } + + // heuristic weigthing of ngram scores + gscore[i] += + // length of longest common subsequent minus lenght difference + 2 * lcs - abs((int) (n - len)) + + // weight equal first letter + equalfirstletter(word, gl) + + // weight equal character positions + ((lcs == commoncharacterpositions(word, gl, &is_swap)) ? 1: 0) + + // swap character (not neighboring) + ((is_swap) ? 
1000 : 0); + } + } + + bubblesort(&guess[0], &gscore[0], MAX_GUESS); + + // copy over + int ns = 0; + int same = 0; for (i=0; i < MAX_GUESS; i++) { if (guess[i]) { - int unique = 1; - for (j=i+1; j < MAX_GUESS; j++) - if (guess[j]) - if (!strcmp(guess[i], guess[j])) unique = 0; - if (unique) { - wlst[ns++] = guess[i]; - } else { - free(guess[i]); - } + if ((ns < maxngramsugs) && (ns < maxSug) && (!same || (gscore[i] > 1000))) { + int unique = 1; + // we have excellent suggestion(s) + if (gscore[i] > 1000) same = 1; + for (j=0; j < ns; j++) + // don't suggest previous suggestions or a previous suggestion with prefixes or affixes + if (strstr(guess[i], wlst[j]) || + // check forbidden words + !check(guess[i], strlen(guess[i]), 0, NULL, NULL)) unique = 0; + if (unique) wlst[ns++] = guess[i]; else free(guess[i]); + } else free(guess[i]); } } + return ns; } - - // see if a candidate suggestion is spelled correctly // needs to check both root words and words with affixes -int SuggestMgr::check(const char * word, int len) + +// obsolote MySpell-HU modifications: +// return value 2 and 3 marks compounding with hyphen (-) +// `3' marks roots without suffix +int SuggestMgr::check(const char * word, int len, int cpdsuggest, int * timer, time_t * timelimit) { struct hentry * rv=NULL; + int nosuffix = 0; + + // check time limit + if (timer) { + (*timer)--; + if (!(*timer) && timelimit) { + if (time(NULL) > *timelimit) { + *timelimit = 0; + return 0; + } + *timer = MAXPLUSTIMER; + } + } + if (pAMgr) { + if (cpdsuggest==1) { + if (pAMgr->get_compound()) { + rv = pAMgr->compound_check(word,len,0,0,0,0,NULL,0,NULL,NULL,1); + if (rv) return 3; // XXX obsolote categorisation + } + return 0; + } + rv = pAMgr->lookup(word); - if (rv == NULL) rv = pAMgr->affix_check(word,len); + + if (rv) { + if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) + || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0; + if (rv->astr && 
(TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || + TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; + } else rv = pAMgr->prefix_check(word, len, 0); // only prefix, and prefix + suffix XXX + + if (rv) { + nosuffix=1; + } else { + rv = pAMgr->suffix_check(word, len, 0, NULL, NULL, 0, NULL); // only suffix + } + + if (!rv && pAMgr->have_contclass()) { + rv = pAMgr->suffix_check_twosfx(word, len, 0, NULL, FLAG_NULL); + if (!rv) rv = pAMgr->prefix_check_twosfx(word, len, 1, FLAG_NULL); + } + + // check forbidden words + if ((rv) && (rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) + || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen) || + TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) return 0; + + if (rv) { // XXX obsolote + if ((pAMgr->get_compoundflag()) && + TESTAFF(rv->astr, pAMgr->get_compoundflag(), rv->alen)) return 2 + nosuffix; + return 1; + } } - if (rv) return 1; return 0; } +int SuggestMgr::check_forbidden(const char * word, int len) +{ + struct hentry * rv = NULL; + + if (pAMgr) { + rv = pAMgr->lookup(word); + if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || + TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; + if (!(pAMgr->prefix_check(word,len,1))) + rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix + // check forbidden words + if ((rv) && (rv->astr) && TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen)) return 1; + } + return 0; +} + +// suggest stems, XXX experimental code +int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug) +{ + char buf[MAXSWUTF8L]; + char ** wlst; + int prevnsug = nsug; + + char w2[MAXWORDUTF8LEN]; + const char * word = w; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + strcpy(w2, w); + if (utf8) reverseword_utf(w2); else reverseword(w2); + word = w2; + } + + if (*slst) { + wlst = *slst; + } else { + wlst = (char **) calloc(maxSug, sizeof(char *)); 
+ if (wlst == NULL) return -1; + } + // perhaps there are a fix stem in the dictionary + if ((nsug < maxSug) && (nsug > -1)) { + + nsug = fixstems(wlst, word, nsug); + if (nsug == prevnsug) { + char * s = mystrdup(word); + char * p = s + strlen(s); + while ((*p != '-') && (p != s)) p--; + if (*p == '-') { + *p = '\0'; + nsug = fixstems(wlst, s, nsug); + if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) { + char * t; + buf[0] = '\0'; + for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number? + if (*t != '\0') strcpy(buf, "# "); + strcat(buf, s); + wlst[nsug] = mystrdup(buf); + if (wlst[nsug] == NULL) return -1; + nsug++; + } + p++; + nsug = fixstems(wlst, p, nsug); + } + + free(s); + } + } + + if (nsug < 0) { + for (int i=0;i<maxSug; i++) + if (wlst[i] != NULL) free(wlst[i]); + free(wlst); + return -1; + } + + *slst = wlst; + return nsug; +} + + +// there are fix stems in dictionary +int SuggestMgr::fixstems(char ** wlst, const char * word, int ns) +{ + char fix[MAXSWUTF8L]; + char buf[MAXSWUTF8L]; + char prefix[MAXSWUTF8L] = ""; + + char * p; + int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound + int cpdindex = 0; + struct hentry * rv = NULL; + struct hentry * rv2 = NULL; + + int wl = strlen(word); + int cmpdstemnum; + int cmpdstem[MAXCOMPOUND]; + + if (pAMgr) { + rv = pAMgr->lookup(word); + if (rv) { + dicstem = 0; + } else { + // try stripping off affixes + rv = pAMgr->affix_check(word, wl); + + // else try check compound word + if (!rv && pAMgr->get_compound()) { + rv = pAMgr->compound_check(word, wl, + 0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1); + + if (rv) { + dicstem = 2; + for (int j = 0; j < cmpdstemnum; j++) { + cpdindex += cmpdstem[j]; + } + if(! 
(pAMgr->lookup(word + cpdindex))) + pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix + } + } + + + if (pAMgr->get_prefix()) { + strcpy(prefix, pAMgr->get_prefix()); + } + + // XXX obsolote, will be a general solution for stemming + if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU) + } + + } + + + + if ((rv) && (ns < maxSug)) { + + // check fixstem flag and not_valid_stem flag + // first word + if ((ns < maxSug) && (dicstem < 2)) { + strcpy(buf, prefix); + if ((dicstem > 0) && pAMgr->get_derived()) { + // XXX obsolote + if (strlen(prefix) == 1) { + strcat(buf, (pAMgr->get_derived()) + 1); + } else { + strcat(buf, pAMgr->get_derived()); + } + } else { + // special stem in affix description + const char * wordchars = pAMgr->get_wordchars(); + if (rv->description && + (strchr(wordchars, *(rv->description)))) { + char * desc = (rv->description) + 1; + while (strchr(wordchars, *desc)) desc++; + strncat(buf, rv->description, desc - (rv->description)); + } else { + strcat(buf, rv->word); + } + } + wlst[ns] = mystrdup(buf); + if (wlst[ns] == NULL) return -1; + ns++; + } + + if (dicstem == 2) { + + // compound stem + +// if (rv->astr && (strchr(rv->astr, '0') == NULL)) { + if (rv->astr) { + strcpy(buf, word); + buf[cpdindex] = '\0'; + if (prefix) strcat(buf, prefix); + if (pAMgr->get_derived()) { + strcat(buf, pAMgr->get_derived()); + } else { + // special stem in affix description + const char * wordchars = pAMgr->get_wordchars(); + if (rv->description && + (strchr(wordchars, *(rv->description)))) { + char * desc = (rv->description) + 1; + while (strchr(wordchars, *desc)) desc++; + strncat(buf, rv->description, desc - (rv->description)); + } else { + strcat(buf, rv->word); + } + } + if (ns < maxSug) { + wlst[ns] = mystrdup(buf); + if (wlst[ns] == NULL) return -1; + ns++; + } + } + } + } +while (rv) { + if (0) { // obsolote + if ((p[1] > '0') && (p[1] <= '9')) { + if ((ns < maxSug) && (dicstem != 2)) { + int split = p[1] - '0'; + if 
(rv->wlen <= split) break; + + strcpy(fix, rv->word); + + // checking verbs ending with `ik' + + fix[rv->wlen - split] = 'i'; + fix[rv->wlen - split + 1] = 'k'; + fix[rv->wlen - split + 2] = '\0'; + + if (! (rv2 = pAMgr->lookup(fix))) { + fix[strlen(fix) - 2] = '\0'; + rv2 = pAMgr->lookup(fix); + if ((!rv2)) { + *fix = csconv[((unsigned char) *fix)].cupper; + rv2 = pAMgr->lookup(fix); + if (! rv2) return ns; + } + + } + + if (0) { + strcpy(buf, prefix); + strcat(buf, fix); + wlst[ns] = mystrdup(buf); + if (wlst[ns] == NULL) return -1; + ns++; + } + + rv = rv2; + + } else return ns; + } else { + strcpy(fix, "__"); + strcat(fix, rv->word); + rv = NULL; + rv2 = pAMgr->lookup(fix); + if ((rv2) && (rv2->astr) && (ns < maxSug)) + if ((rv2) && (rv2->astr) && (ns < maxSug)) + if (0) { + char buf2[MAXSWUTF8L]; + + strcpy(buf2, prefix); + + if (*(rv2->astr) == '-') { + strcat(buf2, ""); + } else { + strcat(buf2, ""); + } + + if (dicstem != 2) { + wlst[ns] = mystrdup(buf2); + if (wlst[ns] == NULL) return -1; + ns++; + } + + if ((dicstem == 2) && (ns < maxSug)) { + strcpy(buf, word); + buf[cpdindex] = '\0'; + strcat(buf + cpdindex, buf2); + + if (pAMgr->get_compound() && + (pAMgr->compound_check(buf, strlen(buf), + 0,0,100,0,NULL,0,NULL,NULL,1))) { + wlst[ns] = mystrdup(buf); + if (wlst[ns] == NULL) return -1; + ns++; + } + } + // many stems + } else { + char * str = mystrdup(""); + char * pos = str; + char * pos2; + do { + int suggest = 1; + pos2 = strchr(pos, '|'); + if (pos2) *pos2 = '\0'; + // ignore `-xxx' suggestion, when exists prefix + if (*pos == '-') { + pos++; + if (*prefix != '\0') suggest = 0; + } + // ignore `xxx-' suggestion, when word is not root + if ((strlen(pos) > 0) && (pos[strlen(pos)-1] == '-')) { + pos[strlen(pos)-1] = '\0'; + strcpy(buf, prefix); + strcat(buf, fix + 2); + if ((dicstem != 0) && (strcmp(buf, word) != 0)) suggest = 0; + } + if ((suggest) && (ns < maxSug) && (strlen(pos) > 0)) { + strcpy(buf, prefix); + strcat(buf, pos); + wlst[ns] = 
mystrdup(buf); + if (wlst[ns] == NULL) return -1; + ns++; + } + if (pos2) pos = pos2 + 1; + } while (pos2); + free(str); + } + } + } else return ns; + +} + +return ns; + +} + +// suggest possible stems +int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) +{ + char ** wlst; + + struct hentry * rv = NULL; + + char w2[MAXSWUTF8L]; + const char * word = w; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + strcpy(w2, w); + if (utf8) reverseword_utf(w2); else reverseword(w2); + word = w2; + } + + int wl = strlen(word); + + + if (*slst) { + wlst = *slst; + } else { + wlst = (char **) calloc(maxSug, sizeof(char *)); + if (wlst == NULL) return -1; + } + + rv = pAMgr->suffix_check(word, wl, 0, NULL, wlst, maxSug, &nsug); + + // delete dash from end of word + if (nsug > 0) { + for (int j=0; j < nsug; j++) { + if (wlst[j][strlen(wlst[j]) - 1] == '-') wlst[j][strlen(wlst[j]) - 1] = '\0'; + } + } + + *slst = wlst; + return nsug; +} + + +char * SuggestMgr::suggest_morph(const char * w) +{ + char result[MAXLNLEN]; + char * r = (char *) result; + char * st; + + struct hentry * rv = NULL; + + *result = '\0'; + + if (! 
pAMgr) return NULL; + + char w2[MAXSWUTF8L]; + const char * word = w; + + // word reversing wrapper for complex prefixes + if (complexprefixes) { + strcpy(w2, w); + if (utf8) reverseword_utf(w2); else reverseword(w2); + word = w2; + } + + rv = pAMgr->lookup(word); + + while (rv) { + if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) || + TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) || + TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { + if (rv->description && ((!rv->astr) || + !TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen))) + strcat(result, word); + if (rv->description) strcat(result, rv->description); + strcat(result, "\n"); + } + rv = rv->next_homonym; + } + + st = pAMgr->affix_check_morph(word,strlen(word)); + if (st) { + strcat(result, st); + free(st); + } + + if (pAMgr->get_compound() && (*result == '\0')) + pAMgr->compound_check_morph(word, strlen(word), + 0, 0, 100, 0,NULL, 0, &r, NULL); + + return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL; +} + +char * SuggestMgr::suggest_morph_for_spelling_error(const char * word) +{ + char * p = NULL; + char ** wlst = (char **) calloc(maxSug, sizeof(char *)); + // we will use only the first suggestion + for (int i = 0; i < maxSug - 1; i++) wlst[i] = ""; + int ns = suggest(&wlst, word, maxSug - 1); + if (ns == maxSug) { + p = suggest_morph(wlst[maxSug - 1]); + free(wlst[maxSug - 1]); + } + if (wlst) free(wlst); + return p; +} // generate an n-gram score comparing s1 and s2 int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) { int nscore = 0; - int l1 = strlen(s1); - int l2 = strlen(s2); int ns; - for (int j=1;j<=n;j++) { - ns = 0; - for (int i=0;i<=(l1-j);i++) { - char c = *(s1 + i + j); - *(s1 + i + j) = '\0'; - if (strstr(s2,(s1+i))) ns++; - *(s1 + i + j ) = c; - } - nscore = nscore + ns; - if (ns < 2) break; + int l1; + int l2; + + if (utf8) { + w_char su1[MAXSWL]; + w_char su2[MAXSWL]; + l1 = u8_u16(su1, MAXSWL, s1); + l2 = 
u8_u16(su2, MAXSWL, s2); + if (!l2) return 0; + // decapitalize dictionary word + if (complexprefixes) { + mkallsmall_utf(su2+l2-1, 1, utfconv); + } else { + mkallsmall_utf(su2, 1, utfconv); + } + for (int j = 1; j <= n; j++) { + ns = 0; + for (int i = 0; i <= (l1-j); i++) { + for (int l = 0; l <= (l2-j); l++) { + int k; + for (k = 0; (k < j); k++) { + w_char * c1 = su1 + i + k; + w_char * c2 = su2 + l + k; + if ((c1->l != c2->l) || (c1->h != c2->h)) break; + } + if (k == j) { + ns++; + break; + } + } + } + nscore = nscore + ns; + if (ns < 2) break; + } + } else { + char t[MAXSWUTF8L]; + l1 = strlen(s1); + l2 = strlen(s2); + if (!l2) return 0; + strcpy(t, s2); + if (complexprefixes) { + *(t+l2-1) = csconv[((unsigned char)*(t+l2-1))].clower; + } else { + mkallsmall(t, csconv); +/// *t = csconv[((unsigned char)*t)].clower; + } + for (int j = 1; j <= n; j++) { + ns = 0; + for (int i = 0; i <= (l1-j); i++) { + char c = *(s1 + i + j); + *(s1 + i + j) = '\0'; + if (strstr(t,(s1+i))) ns++; + *(s1 + i + j ) = c; + } + nscore = nscore + ns; + if (ns < 2) break; + } } + ns = 0; if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2; if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; return (nscore - ((ns > 0) ? 
ns : 0)); } +int SuggestMgr::equalfirstletter(char * s1, const char * s2) { + if (utf8) { + w_char su1[MAXSWL]; + w_char su2[MAXSWL]; + // decapitalize dictionary word + if (complexprefixes) { + int l1 = u8_u16(su1, MAXSWL, s1); + int l2 = u8_u16(su2, MAXSWL, s2); + if (*((short *)su1+l1-1) == *((short *)su2+l2-1)) return 1; + } else { + u8_u16(su1, 1, s1); + u8_u16(su2, 1, s2); + if (*((short *)su1) == *((short *)su2)) return 1; + } + } else { + if (complexprefixes) { + int l1 = strlen(s1); + int l2 = strlen(s2); + if (*(s2+l1-1) == *(s2+l2-1)) return 1; + } else { + if (*s1 == *s2) return 1; + } + } + return 0; +} + +int SuggestMgr::commoncharacterpositions(char * s1, const char * s2, int * is_swap) { + int num = 0; + int diff = 0; + int diffpos[2]; + *is_swap = 0; + if (utf8) { + w_char su1[MAXSWL]; + w_char su2[MAXSWL]; + int l1 = u8_u16(su1, MAXSWL, s1); + int l2 = u8_u16(su2, MAXSWL, s2); + for (int i = 0; (i < l1) && (i < l2); i++) { + if (((short *) su1)[i] == ((short *) su2)[i]) { + num++; + } else { + if (diff < 2) diffpos[diff] = i; + diff++; + } + } + if ((diff == 2) && (l1 == l2) && + (((short *) su1)[diffpos[0]] == ((short *) su2)[diffpos[1]]) && + (((short *) su1)[diffpos[1]] == ((short *) su2)[diffpos[0]])) *is_swap = 1; + } else { + int i; + for (i = 0; (*(s1+i) != 0) && (*(s2+i) != 0); i++) { + if (*(s1+i) == *(s2+i)) { + num++; + } else { + if (diff < 2) diffpos[diff] = i; + diff++; + } + } + if ((diff == 2) && (*(s1+i) == 0) && (*(s2+i) == 0) && + (*(s1+diffpos[0]) == *(s2+diffpos[1])) && + (*(s1+diffpos[1]) == *(s2+diffpos[0]))) *is_swap = 1; + } + return num; +} + +int SuggestMgr::mystrlen(const char * word) { + if (utf8) { + w_char w[MAXSWL]; + return u8_u16(w, MAXSWL, word); + } else return strlen(word); +} // sort in decreasing order of score void SuggestMgr::bubblesort(char** rword, int* rsc, int n ) @@ -544,3 +1593,66 @@ void SuggestMgr::bubblesort(char** rword, int* rsc, int n ) return; } +// longest common subsequence +void 
SuggestMgr::lcs(const char * s, const char * s2, int * l1, int * l2, char ** result) { + int n, m; + w_char su[MAXSWL]; + w_char su2[MAXSWL]; + char * b; + char * c; + int i; + int j; + if (utf8) { + m = u8_u16(su, MAXSWL, s); + n = u8_u16(su2, MAXSWL, s2); + } else { + m = strlen(s); + n = strlen(s2); + } + c = (char *) malloc((m + 1) * (n + 1)); + b = (char *) malloc((m + 1) * (n + 1)); + for (i = 1; i <= m; i++) c[i*(n+1)] = 0; + for (j = 0; j <= n; j++) c[j] = 0; + for (i = 1; i <= m; i++) { + for (j = 1; j <= n; j++) { + if ((utf8) && (*((short *) su+i-1) == *((short *)su2+j-1)) + || (!utf8) && ((*(s+i-1)) == (*(s2+j-1)))) { + c[i*(n+1) + j] = c[(i-1)*(n+1) + j-1]+1; + b[i*(n+1) + j] = LCS_UPLEFT; + } else if (c[(i-1)*(n+1) + j] >= c[i*(n+1) + j-1]) { + c[i*(n+1) + j] = c[(i-1)*(n+1) + j]; + b[i*(n+1) + j] = LCS_UP; + } else { + c[i*(n+1) + j] = c[i*(n+1) + j-1]; + b[i*(n+1) + j] = LCS_LEFT; + } + } + } + *result = b; + free(c); + *l1 = m; + *l2 = n; +} + +int SuggestMgr::lcslen(const char * s, const char* s2) { + int m; + int n; + int i; + int j; + char * result; + int len = 0; + lcs(s, s2, &m, &n, &result); + i = m; + j = n; + while ((i != 0) && (j != 0)) { + if (result[i*(n+1) + j] == LCS_UPLEFT) { + len++; + i--; + j--; + } else if (result[i*(n+1) + j] == LCS_UP) { + i--; + } else j--; + } + if (result) free(result); + return len; +} diff --git a/src/myspell/suggestmgr.hxx b/src/myspell/suggestmgr.hxx index 7c5a6e2..5bc64bb 100644 --- a/src/myspell/suggestmgr.hxx +++ b/src/myspell/suggestmgr.hxx @@ -2,46 +2,85 @@ #define _SUGGESTMGR_HXX_ #define MAXSWL 100 -#define MAX_ROOTS 10 -#define MAX_WORDS 500 -#define MAX_GUESS 10 +#define MAXSWUTF8L (MAXSWL * 4) +#define MAX_ROOTS 50 +#define MAX_WORDS 200 +#define MAX_GUESS 200 +#define MAXNGRAMSUGS 5 + +#define MINTIMER 500 +#define MAXPLUSTIMER 500 #define NGRAM_IGNORE_LENGTH 0 #define NGRAM_LONGER_WORSE 1 #define NGRAM_ANY_MISMATCH 2 - #include "atypes.hxx" #include "affixmgr.hxx" #include "hashmgr.hxx" 
+#include "langnum.hxx" +#include <time.h> + +enum { LCS_UP, LCS_LEFT, LCS_UPLEFT }; class SuggestMgr { char * ctry; int ctryl; + w_char * ctry_utf; + AffixMgr* pAMgr; int maxSug; - bool nosplitsugs; + struct cs_info * csconv; + struct unicode_info2 * utfconv; + int utf8; + int nosplitsugs; + int maxngramsugs; + int complexprefixes; + public: SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr); ~SuggestMgr(); - int suggest(char** wlst, int ns, const char * word); - int check(const char *, int); + int suggest(char*** slst, const char * word, int nsug); int ngsuggest(char ** wlst, char * word, HashMgr* pHMgr); + int suggest_auto(char*** slst, const char * word, int nsug); + int suggest_stems(char*** slst, const char * word, int nsug); + int suggest_pos_stems(char*** slst, const char * word, int nsug); + + char * suggest_morph(const char * word); + char * suggest_morph_for_spelling_error(const char * word); private: - int replchars(char**, const char *, int); - int mapchars(char**, const char *, int); - int map_related(const char *, int, char ** wlst, int, const mapentry*, int); - int forgotchar(char **, const char *, int); - int swapchar(char **, const char *, int); - int extrachar(char **, const char *, int); - int badchar(char **, const char *, int); - int twowords(char **, const char *, int); + int check(const char *, int, int, int *, time_t *); + int check_forbidden(const char *, int); + + int replchars(char**, const char *, int, int); + int doubledsyllable(char**, const char *, int, int); + int forgotchar(char **, const char *, int, int); + int swapchar(char **, const char *, int, int); + int extrachar(char **, const char *, int, int); + int badchar(char **, const char *, int, int); + int twowords(char **, const char *, int, int); + int fixstems(char **, const char *, int); + + int forgotchar_utf(char**, const w_char *, int wl, int, int); + int extrachar_utf(char**, const w_char *, int wl, int, int); + int badchar_utf(char **, const w_char *, int wl, int, 
int); + int swapchar_utf(char **, const w_char *, int wl, int, int); + + int mapchars(char**, const char *, int, int); + int map_related(const char *, int, char ** wlst, int, const mapentry*, int, int *, time_t *); + int map_related_utf(w_char *, int, int, char ** wlst, int, const mapentry*, int, int *, time_t *); int ngram(int n, char * s1, const char * s2, int uselen); + int mystrlen(const char * word); + int equalfirstletter(char * s1, const char * s2); + int commoncharacterpositions(char * s1, const char * s2, int * is_swap); void bubblesort( char ** rwd, int * rsc, int n); + void lcs(const char * s, const char * s2, int * l1, int * l2, char ** result); + int lcslen(const char * s, const char* s2); + }; #endif |