diff options
author | Dom Lachowicz <domlachowicz@gmail.com> | 2008-01-06 15:02:36 +0000 |
---|---|---|
committer | Dom Lachowicz <domlachowicz@gmail.com> | 2008-01-06 15:02:36 +0000 |
commit | 0e6694c9b1812f3deac32122f969c50d89bf05ad (patch) | |
tree | 341a860489bdc5d338712e5aacaf3cd18e923674 | |
parent | d7a2136697545d7ec7e6a2328d5377e32811e188 (diff) | |
download | enchant-0e6694c9b1812f3deac32122f969c50d89bf05ad.tar.gz |
upgrade to hunspell 1.2.1, released in November, 2007
git-svn-id: svn+ssh://svn.abisource.com/svnroot/enchant/trunk@22579 bcba8976-2d24-0410-9c9c-aab3bd5fdfd6
-rw-r--r-- | src/myspell/README | 21 | ||||
-rw-r--r-- | src/myspell/affentry.cxx | 300 | ||||
-rw-r--r-- | src/myspell/affentry.hxx | 3 | ||||
-rw-r--r-- | src/myspell/affixmgr.cxx | 704 | ||||
-rw-r--r-- | src/myspell/affixmgr.hxx | 66 | ||||
-rw-r--r-- | src/myspell/atypes.hxx | 30 | ||||
-rw-r--r-- | src/myspell/baseaffix.hxx | 37 | ||||
-rw-r--r-- | src/myspell/csutil.cxx | 326 | ||||
-rw-r--r-- | src/myspell/csutil.hxx | 58 | ||||
-rw-r--r-- | src/myspell/hashmgr.cxx | 133 | ||||
-rw-r--r-- | src/myspell/hashmgr.hxx | 10 | ||||
-rw-r--r-- | src/myspell/htypes.hxx | 13 | ||||
-rw-r--r-- | src/myspell/hunspell.cxx | 533 | ||||
-rw-r--r-- | src/myspell/hunspell.h | 54 | ||||
-rw-r--r-- | src/myspell/hunspell.hxx | 71 | ||||
-rw-r--r-- | src/myspell/license.hunspell | 26 | ||||
-rw-r--r-- | src/myspell/makefile.mk | 113 | ||||
-rw-r--r-- | src/myspell/suggestmgr.cxx | 373 | ||||
-rw-r--r-- | src/myspell/suggestmgr.hxx | 2 |
19 files changed, 1657 insertions, 1216 deletions
diff --git a/src/myspell/README b/src/myspell/README new file mode 100644 index 0000000..b452096 --- /dev/null +++ b/src/myspell/README @@ -0,0 +1,21 @@ +Hunspell spell checker and morphological analyser library + +Documentation, tests, examples: http://hunspell.sourceforge.net + +Author of Hunspell: +László Németh (nemethl (at) gyorsposta.hu) + +Hunspell based on OpenOffice.org's Myspell. MySpell's author: +Kevin Hendricks (kevin.hendricks (at) sympatico.ca) + +License: GPL 2.0/LGPL 2.1/MPL 1.1 tri-license + +The contents of this library may be used under the terms of +the GNU General Public License Version 2 or later (the "GPL"), or +the GNU Lesser General Public License Version 2.1 or later (the "LGPL", +see http://gnu.org/copyleft/lesser.html) or the Mozilla Public License +Version 1.1 or later (the "MPL", see http://mozilla.org/MPL/MPL-1.1.html). + +Software distributed under these licenses is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the licences +for the specific language governing rights and limitations under the licenses. diff --git a/src/myspell/affentry.cxx b/src/myspell/affentry.cxx index 741496b..0ffe557 100644 --- a/src/myspell/affentry.cxx +++ b/src/myspell/affentry.cxx @@ -35,16 +35,17 @@ PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp) appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string appndl = dp->appndl; // length of append string - numconds = dp->numconds; // number of conditions to match - opts = dp->opts; // cross product flag + numconds = dp->numconds; // length of the condition + opts = dp->opts; // cross product flag // then copy over all of the conditions - memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0])); + if (opts & aeLONGCOND) { + memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1); + c.l.conds2 = dp->c.l.conds2; + } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); next = NULL; nextne = NULL; nexteq = NULL; -#ifdef HUNSPELL_EXPERIMENTAL morphcode = dp->morphcode; -#endif contclass = dp->contclass; contclasslen = dp->contclasslen; } @@ -58,14 +59,8 @@ PfxEntry::~PfxEntry() pmyMgr = NULL; appnd = NULL; strip = NULL; - if (opts & aeUTF8) { - for (int i = 0; i < numconds; i++) { - if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]); - } - } -#ifdef HUNSPELL_EXPERIMENTAL + if (opts & aeLONGCOND) free(c.l.conds2); if (morphcode && !(opts & aeALIASM)) free(morphcode); -#endif if (contclass && !(opts & aeALIASF)) free(contclass); } @@ -89,47 +84,71 @@ char * PfxEntry::add(const char * word, int len) return NULL; } +inline char * PfxEntry::nextchar(char * p) { + if (p) { + p++; + if (opts & aeLONGCOND) { + // jump to the 2nd part of the condition + if (p == c.conds + MAXCONDLEN_1) return c.l.conds2; + // end of the MAXCONDLEN length condition + } else if (p == c.conds + MAXCONDLEN) return NULL; + } + return p; +} inline int PfxEntry::test_condition(const char * st) { - int cond; - unsigned char * cp = (unsigned char *)st; - if (!(opts & aeUTF8)) { // 256-character codepage - for (cond = 0; cond < numconds; cond++) { - if ((conds.base[*cp++] & (1 << cond)) == 0) return 0; - } - } else { // UTF-8 encoding - unsigned short wc; - for (cond = 0; cond < numconds; cond++) { - // a simple 7-bit ASCII character in UTF-8 - if ((*cp >> 7) == 0) { - // also check limit (end of word) - if ((!*cp) || ((conds.utf8.ascii[*cp++] & (1 << cond)) == 0)) return 0; - // UTF-8 multibyte character - } else { - // not dot wildcard in rule - if (!conds.utf8.all[cond]) { - if (conds.utf8.neg[cond]) { - u8_u16((w_char *) &wc, 1, (char *) cp); - if (conds.utf8.wchars[cond] && - flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short) conds.utf8.wlen[cond])) return 0; - } else { - if (!conds.utf8.wchars[cond]) return 0; - u8_u16((w_char *) &wc, 1, (char *) cp); - if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short)conds.utf8.wlen[cond])) return 0; - } + const char * pos = NULL; // group with pos input position + bool neg = false; // complementer + bool ingroup = false; // character in the group + if (numconds == 0) return 1; + char * p = c.conds; + while (1) { + switch (*p) { + case '\0': return 1; + case '[': { p = nextchar(p); pos = st; break; } + case '^': { p = nextchar(p); neg = true; break; } + case ']': { if ((neg && ingroup) || (!neg && !ingroup)) return 0; + pos = NULL; + neg = false; + ingroup = false; + p = nextchar(p); + st++; + if (*st == '\0' && p && *p != '\0') return 0; // word <= condition + break; + } + case '.': if (!pos) { // dots are not metacharacters in groups: [.] + p = nextchar(p); + // skip the next character + for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++); + if (*st == '\0') return 0; // word <= condition + break; + } + default: { + if (*st == *p) { + st++; + p = nextchar(p); + if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte + while (p && (*p & 0xc0) == 0x80) { // character + if (*p != *st) { + if (!pos) return 0; + st = pos; + break; + } + p = nextchar(p); + st++; + } + if (st != pos) ingroup = true; + } else if (pos) ingroup = true; + } else if (pos) { // group + p = nextchar(p); + } else return 0; } - // jump to next UTF-8 character - for(cp++; (*cp & 0xc0) == 0x80; cp++); - } } + if (!p) return 1; } - return 1; } - // check if this prefix entry matches struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag) { @@ -144,14 +163,15 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { +// if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if (tmpl > 0) { // generate new root word by removing prefix and adding // back any characters that would have been stripped if (stripl) strcpy (tmpword, strip); strcpy ((tmpword + stripl), (word + appndl)); - + // now make sure all of the conditions on characters // are met. Please see the appendix at the end of // this file for more info on exactly what is being @@ -165,8 +185,8 @@ struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound if ((he = pmyMgr->lookup(tmpword)) != NULL) { do { if (TESTAFF(he->astr, aflag, he->alen) && - // forbid single prefixes with pseudoroot flag - ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) && + // forbid single prefixes with needaffix flag + ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && // needflag ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || (contclass && TESTAFF(contclass, needflag, contclasslen)))) @@ -205,7 +225,8 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { +// if (tmpl > 0) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -237,7 +258,6 @@ struct hentry * PfxEntry::check_twosfx(const char * word, int len, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL // check if this prefix entry matches char * PfxEntry::check_twosfx_morph(const char * word, int len, char in_compound, const FLAG needflag) @@ -252,7 +272,8 @@ char * PfxEntry::check_twosfx_morph(const char * word, int len, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { +// if (tmpl > 0) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -302,7 +323,8 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { +// if (tmpl > 0) { // generate new root word by removing prefix and adding // back any characters that would have been stripped @@ -323,15 +345,25 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const if ((he = pmyMgr->lookup(tmpword)) != NULL) { do { if (TESTAFF(he->astr, aflag, he->alen) && - // forbid single prefixes with pseudoroot flag - ! TESTAFF(contclass, pmyMgr->get_pseudoroot(), contclasslen) && + // forbid single prefixes with needaffix flag + ! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) && // needflag ((!needflag) || TESTAFF(he->astr, needflag, he->alen) || - (contclass && TESTAFF(contclass, needflag, contclasslen)))) { - if (morphcode) strcat(result, morphcode); else strcat(result,getKey()); - if (he->description) { - if ((*(he->description)=='[')||(*(he->description)=='<')) strcat(result,he->word); - strcat(result,he->description); + (contclass && TESTAFF(contclass, needflag, contclasslen)))) { + if (morphcode) { + strcat(result, " "); + strcat(result, morphcode); + } else strcat(result,getKey()); + if (!HENTRY_FIND(he, MORPH_STEM)) { + strcat(result, " "); + strcat(result, MORPH_STEM); + strcat(result,HENTRY_WORD(he)); + } + // store the pointer of the hash entry +// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, he); + if (HENTRY_DATA(he)) { + strcat(result, " "); + strcat(result,HENTRY_DATA(he)); } strcat(result, "\n"); } @@ -357,7 +389,6 @@ char * PfxEntry::check_morph(const char * word, int len, char in_compound, const if (*result) return mystrdup(result); return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) { @@ -370,17 +401,17 @@ SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp) appnd = dp->appnd; // string to append stripl = dp->stripl; // length of strip string appndl = dp->appndl; // length of append string - numconds = dp->numconds; // number of conditions to match - opts = dp->opts; // cross product flag + numconds = dp->numconds; // length of the condition + opts = dp->opts; // cross product flag // then copy over all of the conditions - memcpy(&conds.base[0],&dp->conds.base[0],SETSIZE*sizeof(conds.base[0])); + if (opts & aeLONGCOND) { + memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1); + c.l.conds2 = dp->c.l.conds2; + } else memcpy(c.conds, dp->c.conds, MAXCONDLEN); rappnd = myrevstrdup(appnd); - -#ifdef HUNSPELL_EXPERIMENTAL morphcode = dp->morphcode; -#endif contclass = dp->contclass; contclasslen = dp->contclasslen; } @@ -395,14 +426,8 @@ SfxEntry::~SfxEntry() pmyMgr = NULL; appnd = NULL; strip = NULL; - if (opts & aeUTF8) { - for (int i = 0; i < numconds; i++) { - if (conds.utf8.wchars[i]) free(conds.utf8.wchars[i]); - } - } -#ifdef HUNSPELL_EXPERIMENTAL + if (opts & aeLONGCOND) free(c.l.conds2); if (morphcode && !(opts & aeALIASM)) free(morphcode); -#endif if (contclass && !(opts & aeALIASF)) free(contclass); } @@ -427,50 +452,92 @@ char * SfxEntry::add(const char * word, int len) return NULL; } +inline char * SfxEntry::nextchar(char * p) { + p++; + if (opts & aeLONGCOND) { + // jump to the 2nd part of the condition + if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2; + // end of the MAXCONDLEN length condition + } else if (p == c.conds + MAXCONDLEN) return NULL; + return p; +} inline int SfxEntry::test_condition(const char * st, const char * beg) { - int cond; - unsigned char * cp = (unsigned char *) st; - if (!(opts & aeUTF8)) { // 256-character codepage - // Domolki affix algorithm - for (cond = numconds; --cond >= 0; ) { - if ((conds.base[*--cp] & (1 << cond)) == 0) return 0; - } - } else { // UTF-8 encoding - unsigned short wc; - for (cond = numconds; --cond >= 0; ) { - // go to next character position and check limit - if ((char *) --cp < beg) return 0; - // a simple 7-bit ASCII character in UTF-8 - if ((*cp >> 7) == 0) { - if ((conds.utf8.ascii[*cp] & (1 << cond)) == 0) return 0; - // UTF-8 multibyte character - } else { - // go to first character of UTF-8 multibyte character - for (; (*cp & 0xc0) == 0x80; cp--); - // not dot wildcard in rule - if (!conds.utf8.all[cond]) { - if (conds.utf8.neg[cond]) { - u8_u16((w_char *) &wc, 1, (char *) cp); - if (conds.utf8.wchars[cond] && - flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short) conds.utf8.wlen[cond])) return 0; - } else { - if (!conds.utf8.wchars[cond]) return 0; - u8_u16((w_char *) &wc, 1, (char *) cp); - if (!flag_bsearch((unsigned short *)conds.utf8.wchars[cond], - wc, (short)conds.utf8.wlen[cond])) return 0; + const char * pos = NULL; // group with pos input position + bool neg = false; // complementer + bool ingroup = false; // character in the group + if (numconds == 0) return 1; + char * p = c.conds; + st--; + int c = 1; + while (1) { + switch (*p) { + case '\0': return 1; + case '[': { p = nextchar(p); pos = st; break; } + case '^': { p = nextchar(p); neg = true; break; } + case ']': { if (!neg && !ingroup) return 0; + c++; + pos = NULL; + neg = false; + ingroup = false; + p = nextchar(p); + st--; + if (st < beg && p && *p != '\0') return 0; // word <= condition + break; + } + case '.': if (!pos) { // dots are not metacharacters in groups: [.] + p = nextchar(p); + // skip the next character + for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--); + if (st < beg) return 0; // word <= condition + if (*st & 0x80) { // head of the UTF-8 character + st--; + if (st < beg) return 0; // word <= condition } + break; + } + default: { + if (*st == *p) { + p = nextchar(p); + if ((opts & aeUTF8) && (*st & 0x80)) { + st--; + while (p && (st >= beg)) { + if (*p != *st) { + if (!pos) return 0; + st = pos; + break; + } + // first byte of the UTF-8 multibyte character + if ((*p & 0xc0) != 0x80) break; + p = nextchar(p); + st--; + } + if (pos && st != pos) { + if (neg) return 0; + else if (c == numconds) return 1; + ingroup = true; + } + if (p && *p != '\0') p = nextchar(p); + } else if (pos) { + if (neg) return 0; + else if (c == numconds) return 1; + ingroup = true; + } + if (!pos) { + c++; + st--; + if (st < beg && p && *p != '\0') return 0; // word <= condition + } + } else if (pos) { // group + p = nextchar(p); + } else return 0; } - } } + if (!p) return 1; } - return 1; } - - // see if this suffix is present in the word struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, AffEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag, @@ -497,7 +564,8 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, // the second condition is not enough for UTF-8 strings // it checked in test_condition() - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { +// if (tmpl > 0) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -513,7 +581,8 @@ struct hentry * SfxEntry::checkword(const char * word, int len, int optflags, // now make sure all of the conditions on characters // are met. Please see the appendix at the end of - // this file for more info on exactly what is being // tested + // this file for more info on exactly what is being + // tested // if all conditions are met then check if resulting // root word in the dictionary @@ -595,7 +664,8 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { +// if (tmpl > 0) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -632,7 +702,6 @@ struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL // see if two-level suffix is present in the word char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, AffEntry* ppfx, const FLAG needflag) @@ -660,7 +729,8 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, tmpl = len - appndl; - if ((tmpl > 0) && (tmpl + stripl >= numconds)) { + if ((tmpl > 0) && (tmpl + stripl >= numconds)) { +// if (tmpl > 0) { // generate new root word by removing suffix and adding // back any characters that would have been stripped or @@ -689,6 +759,7 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, if (st) { if (((PfxEntry *) ppfx)->getMorph()) { strcat(result, ((PfxEntry *) ppfx)->getMorph()); + strcat(result, " "); } strcat(result,st); free(st); @@ -715,7 +786,6 @@ char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags, } return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE // get next homonym with same affix struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, AffEntry* ppfx, diff --git a/src/myspell/affentry.hxx b/src/myspell/affentry.hxx index bb21773..ef1f86d 100644 --- a/src/myspell/affentry.hxx +++ b/src/myspell/affentry.hxx @@ -54,6 +54,7 @@ public: inline void setNextEQ(PfxEntry * ptr) { nexteq = ptr; } inline void setFlgNxt(PfxEntry * ptr) { flgnxt = ptr; } + inline char * nextchar(char * p); inline int test_condition(const char * st); }; @@ -123,7 +124,9 @@ public: inline void setNextEQ(SfxEntry * ptr) { nexteq = ptr; } inline void setFlgNxt(SfxEntry * ptr) { flgnxt = ptr; } + inline char * nextchar(char * p); inline int test_condition(const char * st, const char * begin); + }; #endif diff --git a/src/myspell/affixmgr.cxx b/src/myspell/affixmgr.cxx index a853f82..d3e36be 100644 --- a/src/myspell/affixmgr.cxx +++ b/src/myspell/affixmgr.cxx @@ -56,11 +56,11 @@ AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) checkcompoundrep = 0; // forbid bad compounds (may be non compound word with a REP substitution) checkcompoundcase = 0; // forbid upper and lowercase combinations at word bounds checkcompoundtriple = 0; // forbid compounds with triple letters - forbiddenword = FLAG_NULL; // forbidden word signing flag + forbiddenword = FORBIDDENWORD; // forbidden word signing flag nosuggest = FLAG_NULL; // don't suggest words signed with NOSUGGEST flag lang = NULL; // language langnum = 0; // language code (see http://l10n.openoffice.org/languages.html) - pseudoroot = FLAG_NULL; // forbidden root, allowed only with suffixes + needaffix = FLAG_NULL; // forbidden root, allowed only with suffixes cpdwordmax = -1; // default: unlimited wordcount in compound words cpdmin = -1; // undefined cpdmaxsyllable = 0; // default: unlimited syllablecount in compound words @@ -90,6 +90,7 @@ AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) sugswithdots = 0; keepcase = 0; checksharps = 0; + substandard = FLAG_NULL; derived = NULL; // XXX not threadsafe variable for experimental stemming sfx = NULL; @@ -218,7 +219,7 @@ AffixMgr::~AffixMgr() FREE_FLAG(compoundroot); FREE_FLAG(forbiddenword); FREE_FLAG(nosuggest); - FREE_FLAG(pseudoroot); + FREE_FLAG(needaffix); FREE_FLAG(lemma_present); FREE_FLAG(circumfix); FREE_FLAG(onlyincompound); @@ -453,17 +454,17 @@ int AffixMgr::parse_file(const char * affpath) } } - /* parse in the flag used by `pseudoroots' */ + /* parse in the flag used by `needaffixs' */ if (strncmp(line,"PSEUDOROOT",10) == 0) { - if (parse_flag(line, &pseudoroot, "PSEUDOROOT")) { + if (parse_flag(line, &needaffix, "PSEUDOROOT")) { fclose(afflst); return 1; } } - /* parse in the flag used by `pseudoroots' */ + /* parse in the flag used by `needaffixs' */ if (strncmp(line,"NEEDAFFIX",9) == 0) { - if (parse_flag(line, &pseudoroot, "NEEDAFFIX")) { + if (parse_flag(line, &needaffix, "NEEDAFFIX")) { fclose(afflst); return 1; } @@ -602,6 +603,14 @@ int AffixMgr::parse_file(const char * affpath) } } + /* parse in the flag used by the affix generator */ + if (strncmp(line,"SUBSTANDARD",11) == 0) { + if (parse_flag(line, &substandard, "SUBSTANDARD")) { + fclose(afflst); + return 1; + } + } + if (strncmp(line,"CHECKSHARPS",11) == 0) { checksharps=1; } @@ -941,191 +950,40 @@ int AffixMgr::process_sfx_order() return 0; } - - -// takes aff file condition string and creates the -// conds array - please see the appendix at the end of the -// file affentry.cxx which describes what is going on here -// in much more detail - -int AffixMgr::encodeit(struct affentry * ptr, char * cs) +// calculate the character length of the condition +int AffixMgr::condlen(char * st) { - unsigned char c; - int i, j, k; - unsigned char mbr[MAXLNLEN]; - w_char wmbr[MAXLNLEN]; - w_char * wpos = wmbr; - - // now clear the conditions array */ - for (i=0;i<SETSIZE;i++) ptr->conds.base[i] = (unsigned char) 0; - - // now parse the string to create the conds array */ - int nc = strlen(cs); - unsigned char neg = 0; // complement indicator - int grp = 0; // group indicator - unsigned char n = 0; // number of conditions - int ec = 0; // end condition indicator - int nm = 0; // number of member in group - - // if no condition just return - if (strcmp(cs,".")==0) { - ptr->numconds = 0; - return 0; + int l = 0; + bool group = false; + for(; *st; st++) { + if (*st == '[') { + group = true; + l++; + } else if (*st == ']') group = false; + else if (!group && (!utf8 || + (!(*st & 0x80) || ((*st & 0xc0) == 0x80)))) l++; } + return l; +} - i = 0; - while (i < nc) { - c = *((unsigned char *)(cs + i)); - - // start group indicator - if (c == '[') { - grp = 1; - c = 0; - } - - // complement flag - if ((grp == 1) && (c == '^')) { - neg = 1; - c = 0; - } - - // end goup indicator - if (c == ']') { - ec = 1; - c = 0; - } - - // add character of group to list - if ((grp == 1) && (c != 0)) { - *(mbr + nm) = c; - nm++; - c = 0; - } - - // end of condition - if (c != 0) { - ec = 1; +int AffixMgr::encodeit(struct affentry * ptr, char * cs) +{ + if (strcmp(cs,".") != 0) { + ptr->numconds = condlen(cs); + strncpy(ptr->c.conds, cs, MAXCONDLEN); + // long condition (end of conds padded by strncpy) + if (ptr->c.conds[MAXCONDLEN - 1] && cs[MAXCONDLEN]) { + ptr->opts += aeLONGCOND; + ptr->c.l.conds2 = mystrdup(cs + MAXCONDLEN_1); } - - if (ec) { - if (!utf8) { - if (grp == 1) { - if (neg == 0) { - // set the proper bits in the condition array vals for those chars - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - ptr->conds.base[k] = ptr->conds.base[k] | ((unsigned char)1 << n); - } - } else { - // complement so set all of them and then unset indicated ones - for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n); - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - ptr->conds.base[k] = ptr->conds.base[k] & ~((unsigned char)1 << n); - } - } - neg = 0; - grp = 0; - nm = 0; - } else { - // not a group so just set the proper bit for this char - // but first handle special case of . inside condition - if (c == '.') { - // wild card character so set them all - for (j=0;j<SETSIZE;j++) ptr->conds.base[j] = ptr->conds.base[j] | ((unsigned char)1 << n); - } else { - ptr->conds.base[(unsigned int) c] = ptr->conds.base[(unsigned int)c] | ((unsigned char)1 << n); - } - } - n++; - ec = 0; - } else { // UTF-8 character set - if (grp == 1) { - ptr->conds.utf8.neg[n] = neg; - if (neg == 0) { - // set the proper bits in the condition array vals for those chars - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - if (k >> 7) { - u8_u16(wpos, 1, (char *) mbr + j); - wpos++; - if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character - } else { - ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] | ((unsigned char)1 << n); - } - } - } else { // neg == 1 - // complement so set all of them and then unset indicated ones - for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n); - for (j=0;j<nm;j++) { - k = (unsigned int) mbr[j]; - if (k >> 7) { - u8_u16(wpos, 1, (char *) mbr + j); - wpos++; - if ((k & 0xe0) == 0xe0) j+=2; else j++; // 3-byte UTF-8 character - } else { - ptr->conds.utf8.ascii[k] = ptr->conds.utf8.ascii[k] & ~((unsigned char)1 << n); - } - } - } - neg = 0; - grp = 0; - nm = 0; - ptr->conds.utf8.wlen[n] = wpos - wmbr; - if ((wpos - wmbr) != 0) { - ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char) * (wpos - wmbr)); - if (!ptr->conds.utf8.wchars[n]) return 1; - memcpy(ptr->conds.utf8.wchars[n], wmbr, sizeof(w_char) * (wpos - wmbr)); - flag_qsort((unsigned short *) ptr->conds.utf8.wchars[n], 0, ptr->conds.utf8.wlen[n]); - wpos = wmbr; - } - } else { // grp == 0 - // is UTF-8 character? - if (c >> 7) { - ptr->conds.utf8.wchars[n] = (w_char *) malloc(sizeof(w_char)); - if (!ptr->conds.utf8.wchars[n]) return 1; - ptr->conds.utf8.wlen[n] = 1; - u8_u16(ptr->conds.utf8.wchars[n], 1, cs + i); - if ((c & 0xe0) == 0xe0) i+=2; else i++; // 3-byte UFT-8 character - } else { - ptr->conds.utf8.wchars[n] = NULL; - // not a group so just set the proper bit for this char - // but first handle special case of . inside condition - if (c == '.') { - ptr->conds.utf8.all[n] = 1; - // wild card character so set them all - for (j=0;j<(SETSIZE/2);j++) ptr->conds.utf8.ascii[j] = ptr->conds.utf8.ascii[j] | ((unsigned char)1 << n); - } else { - ptr->conds.utf8.all[n] = 0; - ptr->conds.utf8.ascii[(unsigned int) c] = ptr->conds.utf8.ascii[(unsigned int)c] | ((unsigned char)1 << n); - } - } - neg = 0; - } - n++; - ec = 0; - neg = 0; - } - } - - i++; + } else { + ptr->numconds = 0; + ptr->c.conds[0] = '\0'; } - ptr->numconds = n; return 0; } - // return 1 if s1 is a leading subset of s2 -/* inline int AffixMgr::isSubset(const char * s1, const char * s2) - { - while ((*s1 == *s2) && *s1) { - s1++; - s2++; - } - return (*s1 == '\0'); - } -*/ - - // return 1 if s1 is a leading subset of s2 (dots are for infixes) +// return 1 if s1 is a leading subset of s2 (dots are for infixes) inline int AffixMgr::isSubset(const char * s1, const char * s2) { while (((*s1 == *s2) || (*s1 == '.')) && (*s1 != '\0')) { @@ -1235,7 +1093,6 @@ struct hentry * AffixMgr::prefix_check_twosfx(const char * word, int len, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL // check word for prefixes char * AffixMgr::prefix_check_morph(const char * word, int len, char in_compound, const FLAG needflag) @@ -1331,8 +1188,6 @@ char * AffixMgr::prefix_check_twosfx_morph(const char * word, int len, if (*result) return mystrdup(result); return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE - // Is word a non compound with a REP substitution (see checkcompoundrep)? int AffixMgr::cpdrep_check(const char * word, int wl) @@ -1579,7 +1434,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, // search homonym with compound flag while ((rv) && !hu_mov_rule && - ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundbegin && !wordnum && TESTAFF(rv->astr, compoundbegin, rv->alen)) || @@ -1613,9 +1468,9 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) ) checked_prefix = 1; - // else check forbiddenwords and pseudoroot + // else check forbiddenwords and needaffix } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || - TESTAFF(rv->astr, pseudoroot, rv->alen) || + TESTAFF(rv->astr, needaffix, rv->alen) || (is_sug && nosuggest && TESTAFF(rv->astr, nosuggest, rv->alen)) )) { st[i] = ch; @@ -1728,7 +1583,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, rv = lookup((word+i)); // perhaps without prefix // search homonym with compound flag - while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || (numdefcpd && words && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { @@ -1768,7 +1623,7 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || ((cpdmaxsyllable==0) || - (numsyllable + get_syllable(&(rv->word), rv->clen)<=cpdmaxsyllable)) + (numsyllable + get_syllable(HENTRY_WORD(rv), rv->clen)<=cpdmaxsyllable)) ) && ( (!checkcompounddup || (rv != rv_first)) @@ -1901,7 +1756,6 @@ struct hentry * AffixMgr::compound_check(const char * word, int len, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL // check if compound word is correctly spelled // hu_mov_rule = spec. Hungarian rule (XXX) int AffixMgr::compound_check_morph(const char * word, int len, @@ -1963,7 +1817,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, // search homonym with compound flag while ((rv) && !hu_mov_rule && - ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundbegin && !wordnum && TESTAFF(rv->astr, compoundbegin, rv->alen)) || @@ -1977,13 +1831,16 @@ int AffixMgr::compound_check_morph(const char * word, int len, } if (rv) { - if (rv->description) { - if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen)) - strcat(presult, st); - strcat(presult, rv->description); + sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_PART, st); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + sprintf(presult + strlen(presult), "%c%s%s", MSEP_FLD, MORPH_STEM, st); } - } - + // store the pointer of the hash entry +// sprintf(presult + strlen(presult), "%c%s%p", MSEP_FLD, MORPH_HENTRY, rv); + if (HENTRY_DATA(rv)) { + sprintf(presult + strlen(presult), "%c%s", MSEP_FLD, HENTRY_DATA(rv)); + } + } if (!rv) { if (compoundflag && !(rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundflag))) { @@ -2006,35 +1863,28 @@ int AffixMgr::compound_check_morph(const char * word, int len, ((rv = suffix_check(st, i, 0, NULL, NULL, 0, NULL, FLAG_NULL, compoundmiddle, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) || (rv = prefix_check(st, i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN, compoundmiddle))))) ) { - //char * p = prefix_check_morph(st, i, 0, compound); + // char * p = prefix_check_morph(st, i, 0, compound); char * p = NULL; if (compoundflag) p = affix_check_morph(st, i, compoundflag); if (!p || (*p == '\0')) { + if (p) free(p); + p = NULL; if ((wordnum == 0) && compoundbegin) { p = affix_check_morph(st, i, compoundbegin); } else if ((wordnum > 0) && compoundmiddle) { p = affix_check_morph(st, i, compoundmiddle); } } - if (*p != '\0') { - line_uniq(p); - if (strchr(p, '\n')) { - strcat(presult, "("); - strcat(presult, line_join(p, '|')); - strcat(presult, ")"); - } else { - strcat(presult, p); - } - } - if (presult[strlen(presult) - 1] == '\n') { - presult[strlen(presult) - 1] = '\0'; + if (p && (*p != '\0')) { + sprintf(presult + strlen(presult), "%c%s%s%s", MSEP_FLD, + MORPH_PART, st, line_uniq_app(&p, MSEP_REC)); } + if (p) free(p); checked_prefix = 1; - //strcat(presult, "+"); } // else check forbiddenwords } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) || - TESTAFF(rv->astr, pseudoroot, rv->alen))) { + TESTAFF(rv->astr, needaffix, rv->alen))) { st[i] = ch; continue; } @@ -2137,7 +1987,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, rv = lookup((word+i)); // perhaps without prefix // search homonym with compound flag - while ((rv) && ((pseudoroot && TESTAFF(rv->astr, pseudoroot, rv->alen)) || + while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) || !((compoundflag && !words && TESTAFF(rv->astr, compoundflag, rv->alen)) || (compoundend && !words && TESTAFF(rv->astr, compoundend, rv->alen)) || (numdefcpd && defcpd_check(&words, wnum + 1, rv, NULL,1))))) { @@ -2146,11 +1996,21 @@ int AffixMgr::compound_check_morph(const char * word, int len, if (rv && words && words[wnum + 1]) { strcat(*result, presult); - if (complexprefixes && rv->description) strcat(*result, rv->description); - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, lemma_present, rv->alen))) - strcat(*result, &(rv->word)); - if (!complexprefixes && rv->description) strcat(*result, rv->description); + strcat(*result, " "); + strcat(*result, MORPH_PART); + strcat(*result, word+i); + if (complexprefixes && HENTRY_DATA(rv)) strcat(*result, HENTRY_DATA(rv)); + if (!HENTRY_FIND(rv, MORPH_STEM)) { + strcat(*result, " "); + strcat(*result, MORPH_STEM); + strcat(*result, HENTRY_WORD(rv)); + } + // store the pointer of the hash entry +// sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); + if (!complexprefixes && HENTRY_DATA(rv)) { + strcat(*result, " "); + strcat(*result, HENTRY_DATA(rv)); + } strcat(*result, "\n"); ok = 1; return 0; @@ -2187,7 +2047,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, && ( ((cpdwordmax==-1) || (wordnum+1<cpdwordmax)) || ((cpdmaxsyllable==0) || - (numsyllable+get_syllable(&(rv->word),rv->wlen)<=cpdmaxsyllable)) + (numsyllable+get_syllable(HENTRY_WORD(rv),rv->blen)<=cpdmaxsyllable)) ) && ( (!checkcompounddup || (rv != rv_first)) @@ -2196,12 +2056,23 @@ int AffixMgr::compound_check_morph(const char * word, int len, { // bad compound word strcat(*result, presult); + strcat(*result, " "); + strcat(*result, MORPH_PART); + strcat(*result, word+i); - if (rv->description) { - if (complexprefixes) strcat(*result, rv->description); - if ((!rv->astr) || !TESTAFF(rv->astr, lemma_present, rv->alen)) - strcat(*result, &(rv->word)); - if (!complexprefixes) strcat(*result, rv->description); + if (HENTRY_DATA(rv)) { + if (complexprefixes) strcat(*result, HENTRY_DATA(rv)); + if (! HENTRY_FIND(rv, MORPH_STEM)) { + strcat(*result, " "); + strcat(*result, MORPH_STEM); + strcat(*result, HENTRY_WORD(rv)); + } + // store the pointer of the hash entry +// sprintf(*result + strlen(*result), " %s%p", MORPH_HENTRY, rv); + if (!complexprefixes) { + strcat(*result, " "); + strcat(*result, HENTRY_DATA(rv)); + } } strcat(*result, "\n"); ok = 1; @@ -2227,20 +2098,16 @@ int AffixMgr::compound_check_morph(const char * word, int len, if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) { char * m = NULL; if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); - if ((!m || *m == '\0') && compoundend) + if ((!m || *m == '\0') && compoundend) { + if (m) free(m); m = affix_check_morph((word+i),strlen(word+i), compoundend); + } strcat(*result, presult); - if (m) { - line_uniq(m); - if (strchr(m, '\n')) { - strcat(*result, "("); - strcat(*result, line_join(m, '|')); - strcat(*result, ")"); - } else { - strcat(*result, m); - } - free(m); + if (m || (*m != '\0')) { + sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, + MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); } + if (m) free(m); strcat(*result, "\n"); ok = 1; } @@ -2259,7 +2126,7 @@ int AffixMgr::compound_check_morph(const char * word, int len, // check forbiddenwords if ((rv) && (rv->astr) && (TESTAFF(rv->astr,forbiddenword,rv->alen)) - && (! TESTAFF(rv->astr, pseudoroot, rv->alen))) { + && (! TESTAFF(rv->astr, needaffix, rv->alen))) { st[i] = ch; continue; } @@ -2311,21 +2178,17 @@ int AffixMgr::compound_check_morph(const char * word, int len, )) { char * m = NULL; if (compoundflag) m = affix_check_morph((word+i),strlen(word+i), compoundflag); - if ((!m || *m == '\0') && compoundend) + if ((!m || *m == '\0') && compoundend) { + if (m) free(m); m = affix_check_morph((word+i),strlen(word+i), compoundend); + } strcat(*result, presult); - if (m) { - line_uniq(m); - if (strchr(m, '\n')) { - strcat(*result, "("); - strcat(*result, line_join(m, '|')); - strcat(*result, ")"); - } else { - strcat(*result, m); - } - free(m); + if (m && (*m != '\0')) { + sprintf(*result + strlen(*result), "%c%s%s%s", MSEP_FLD, + MORPH_PART, word + i, line_uniq_app(&m, MSEP_REC)); } - strcat(*result, "\n"); + if (m) free(m); + sprintf(*result + strlen(*result), "%c", MSEP_REC); ok = 1; } @@ -2346,7 +2209,6 @@ int AffixMgr::compound_check_morph(const char * word, int len, } return 0; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE // return 1 if s1 (reversed) is a leading subset of end of s2 /* inline int AffixMgr::isRevSubset(const char * s1, const char * end_of_s2, int len) @@ -2402,11 +2264,11 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, // fogemorpheme (in_compound || !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && - // pseudoroot on prefix or first suffix + // needaffix on prefix or first suffix (cclass || - !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) || + !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), pseudoroot, + TESTAFF(ep->getCont(), needaffix, ep->getContLen()))) ) ) { @@ -2444,11 +2306,11 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, // fogemorpheme (in_compound || !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && - // pseudoroot on prefix or first suffix + // needaffix on prefix or first suffix (cclass || - !(sptr->getCont() && TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) || + !(sptr->getCont() && TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), pseudoroot, + TESTAFF(ep->getCont(), needaffix, ep->getContLen()))) ) ) { @@ -2462,9 +2324,15 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, if (!derived) { derived = mystrdup(word); } else { + strcat(result, " "); + strcpy(result, MORPH_STEM); strcpy(result, derived); // XXX check size strcat(result, "\n"); + strcat(result, " "); + strcat(result, MORPH_STEM); strcat(result, word); + // store the pointer of the hash entry +// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); free(derived); derived = mystrdup(result); } @@ -2523,7 +2391,6 @@ struct hentry * AffixMgr::suffix_check_twosfx(const char * word, int len, return NULL; } -#ifdef HUNSPELL_EXPERIMENTAL char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, int sfxopts, AffEntry * ppfx, const FLAG needflag) { @@ -2545,11 +2412,17 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, st = se->check_twosfx_morph(word,len, sfxopts, ppfx, needflag); if (st) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + if (((PfxEntry *) ppfx)->getMorph()) { + strcat(result, ((PfxEntry *) ppfx)->getMorph()); + strcat(result, " "); + } } strcat(result, st); free(st); - if (se->getMorph()) strcat(result, se->getMorph()); + if (se->getMorph()) { + strcat(result, " "); + strcat(result, se->getMorph()); + } strcat(result, "\n"); } } @@ -2581,7 +2454,10 @@ char * AffixMgr::suffix_check_twosfx_morph(const char * word, int len, } else sprintf(result3, "<%c>", flag); strcat(result3, ":"); #endif - if (sptr->getMorph()) strcat(result3, sptr->getMorph()); + if (sptr->getMorph()) { + strcat(result3, " "); + strcat(result3, sptr->getMorph()); + } strlinecat(result2, result3); strcat(result2, "\n"); strcat(result, result2); @@ -2627,25 +2503,39 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, // fogemorpheme (in_compound || !((se->getCont() && (TESTAFF(se->getCont(), onlyincompound, se->getContLen()))))) && - // pseudoroot on prefix or first suffix + // needaffix on prefix or first suffix (cclass || - !(se->getCont() && TESTAFF(se->getCont(), pseudoroot, se->getContLen())) || + !(se->getCont() && TESTAFF(se->getCont(), needaffix, se->getContLen())) || (ppfx && !((ep->getCont()) && - TESTAFF(ep->getCont(), pseudoroot, + TESTAFF(ep->getCont(), needaffix, ep->getContLen()))) ) )) rv = se->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); while (rv) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + if (((PfxEntry *) ppfx)->getMorph()) { + strcat(result, ((PfxEntry *) ppfx)->getMorph()); + strcat(result, " "); + } + } + if (complexprefixes && HENTRY_DATA(rv)) strcat(result, HENTRY_DATA(rv)); + if (! HENTRY_FIND(rv, MORPH_STEM)) { + strcat(result, " "); + strcat(result, MORPH_STEM); + strcat(result, HENTRY_WORD(rv)); + } + // store the pointer of the hash entry +// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); + + if (!complexprefixes && HENTRY_DATA(rv)) { + strcat(result, " "); + strcat(result, HENTRY_DATA(rv)); + } + if (se->getMorph()) { + strcat(result, " "); + strcat(result, se->getMorph()); } - if (complexprefixes && rv->description) strcat(result, rv->description); - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, lemma_present, rv->alen))) - strcat(result, &(rv->word)); - if (!complexprefixes && rv->description) strcat(result, rv->description); - if (se->getMorph()) strcat(result, se->getMorph()); strcat(result, "\n"); rv = se->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } @@ -2676,18 +2566,30 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, // fogemorpheme (in_compound || !((sptr->getCont() && (TESTAFF(sptr->getCont(), onlyincompound, sptr->getContLen()))))) && - // pseudoroot on first suffix + // needaffix on first suffix (cclass || !(sptr->getCont() && - TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen()))) + TESTAFF(sptr->getCont(), needaffix, sptr->getContLen()))) )) rv = sptr->checkword(word,len, sfxopts, ppfx, NULL, 0, 0, cclass, needflag); while (rv) { if (ppfx) { - if (((PfxEntry *) ppfx)->getMorph()) strcat(result, ((PfxEntry *) ppfx)->getMorph()); + if (((PfxEntry *) ppfx)->getMorph()) { + strcat(result, ((PfxEntry *) ppfx)->getMorph()); + strcat(result, " "); + } } - if (complexprefixes && rv->description) strcat(result, rv->description); - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, lemma_present, rv->alen))) strcat(result, &(rv->word)); - if (!complexprefixes && rv->description) strcat(result, rv->description); + if (complexprefixes && HENTRY_DATA(rv)) strcat(result, HENTRY_DATA(rv)); + if (! HENTRY_FIND(rv, MORPH_STEM)) { + strcat(result, " "); + strcat(result, MORPH_STEM); + strcat(result, HENTRY_WORD(rv)); + } + // store the pointer of the hash entry +// sprintf(result + strlen(result), " %s%p", MORPH_HENTRY, rv); + + if (!complexprefixes && HENTRY_DATA(rv)) { + strcat(result, " "); + strcat(result, HENTRY_DATA(rv)); + } #ifdef DEBUG unsigned short flag = sptr->getFlag(); if (flag_mode == FLAG_NUM) { @@ -2698,7 +2600,10 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, strcat(result, ":"); #endif - if (sptr->getMorph()) strcat(result, sptr->getMorph()); + if (sptr->getMorph()) { + strcat(result, " "); + strcat(result, sptr->getMorph()); + } strcat(result, "\n"); rv = sptr->get_next_homonym(rv, sfxopts, ppfx, cclass, needflag); } @@ -2711,8 +2616,6 @@ char * AffixMgr::suffix_check_morph(const char * word, int len, if (*result) return mystrdup(result); return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE - // check if word with affixes is correctly spelled struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG needflag, char in_compound) @@ -2741,7 +2644,6 @@ struct hentry * AffixMgr::affix_check (const char * word, int len, const FLAG ne return rv; } -#ifdef HUNSPELL_EXPERIMENTAL // check if word with affixes is correctly spelled char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needflag, char in_compound) { @@ -2781,20 +2683,95 @@ char * AffixMgr::affix_check_morph(const char * word, int len, const FLAG needfl free(st); } } - + return mystrdup(result); } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE + +char * AffixMgr::morphgen(char * ts, int wl, const unsigned short * ap, + unsigned short al, char * morph, char * targetmorph, int level) +{ + // handle suffixes + char * stemmorph; + char * stemmorphcatpos; + char mymorph[MAXLNLEN]; + + if (!morph && !targetmorph) return NULL; + + // check substandard flag + if (TESTAFF(ap, substandard, al)) return NULL; + + if (morphcmp(morph, targetmorph) == 0) return mystrdup(ts); + +// int targetcount = get_sfxcount(targetmorph); + + // use input suffix fields, if exist + if (strstr(morph, MORPH_INFL_SFX) || strstr(morph, MORPH_DERI_SFX)) { + stemmorph = mymorph; + strcpy(stemmorph, morph); + strcat(stemmorph, " "); + stemmorphcatpos = stemmorph + strlen(stemmorph); + } else { + stemmorph = morph; + stemmorphcatpos = NULL; + } + + for (int i = 0; i < al; i++) { + const unsigned char c = (unsigned char) (ap[i] & 0x00FF); + SfxEntry * sptr = (SfxEntry *)sFlag[c]; + while (sptr) { + if (sptr->getFlag() == ap[i] && ((sptr->getContLen() == 0) || + // don't generate forms with substandard affixes + !TESTAFF(sptr->getCont(), substandard, sptr->getContLen()))) { + + if (stemmorphcatpos) strcpy(stemmorphcatpos, sptr->getMorph()); + else stemmorph = (char *) sptr->getMorph(); + + int cmp = morphcmp(stemmorph, targetmorph); + + if (cmp == 0) { + char * newword = sptr->add(ts, wl); + if (newword) { + hentry * check = pHMgr->lookup(newword); + if (!check || !check->astr || + !TESTAFF(check->astr, forbiddenword, check->alen)) { + return newword; + } + free(newword); + } + } + + // recursive call for secondary suffixes + if ((level == 0) && (cmp == 1) && (sptr->getContLen() > 0) && +// (get_sfxcount(stemmorph) < targetcount) && + !TESTAFF(sptr->getCont(), substandard, sptr->getContLen())) { + char * newword = sptr->add(ts, wl); + if (newword) { + char * newword2 = morphgen(newword, strlen(newword), sptr->getCont(), + sptr->getContLen(), stemmorph, targetmorph, 1); + + if (newword2) { + free(newword); + return newword2; + } + free(newword); + newword = NULL; + } + } + } + sptr = (SfxEntry *)sptr ->getFlgNxt(); + } + } + return NULL; +} int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts, int wl, const unsigned short * ap, unsigned short al, char * bad, int badl, char * phone) { - int nh=0; // first add root word to list - if ((nh < maxn) && !(al && ((pseudoroot && TESTAFF(ap, pseudoroot, al)) || + if ((nh < maxn) && !(al && ((needaffix && TESTAFF(ap, needaffix, al)) || (onlyincompound && TESTAFF(ap, onlyincompound, al))))) { wlst[nh].word = mystrdup(ts); wlst[nh].allow = (1 == 0); @@ -2816,9 +2793,9 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts while (sptr) { if ((sptr->getFlag() == ap[i]) && (!sptr->getKeyLen() || ((badl > sptr->getKeyLen()) && (strcmp(sptr->getAffix(), bad + badl - sptr->getKeyLen()) == 0))) && - // check pseudoroot flag - !(sptr->getCont() && ((pseudoroot && - TESTAFF(sptr->getCont(), pseudoroot, sptr->getContLen())) || + // check needaffix flag + !(sptr->getCont() && ((needaffix && + TESTAFF(sptr->getCont(), needaffix, sptr->getContLen())) || (circumfix && TESTAFF(sptr->getCont(), circumfix, sptr->getContLen())) || (onlyincompound && @@ -2888,9 +2865,9 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts while (ptr) { if ((ptr->getFlag() == ap[m]) && (!ptr->getKeyLen() || ((badl > ptr->getKeyLen()) && (strncmp(ptr->getKey(), bad, ptr->getKeyLen()) == 0))) && - // check pseudoroot flag - !(ptr->getCont() && ((pseudoroot && - TESTAFF(ptr->getCont(), pseudoroot, ptr->getContLen())) || + // check needaffix flag + !(ptr->getCont() && ((needaffix && + TESTAFF(ptr->getCont(), needaffix, ptr->getContLen())) || (circumfix && TESTAFF(ptr->getCont(), circumfix, ptr->getContLen())) || (onlyincompound && @@ -2915,8 +2892,6 @@ int AffixMgr::expand_rootword(struct guessword * wlst, int maxn, const char * ts return nh; } - - // return length of replacing table int AffixMgr::get_numrep() { @@ -3059,9 +3034,9 @@ FLAG AffixMgr::get_nosuggest() } // return the forbidden words flag modify flag -FLAG AffixMgr::get_pseudoroot() +FLAG AffixMgr::get_needaffix() { - return pseudoroot; + return needaffix; } // return the onlyincompound flag @@ -3159,7 +3134,7 @@ int AffixMgr::get_sugswithdots(void) /* parse flag */ int AffixMgr::parse_flag(char * line, unsigned short * out, const char * name) { char * s = NULL; - if (*out != FLAG_NULL) { + if (*out != FLAG_NULL && !(*out >= DEFAULTFLAGS)) { HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); return 1; } @@ -3216,7 +3191,7 @@ int AffixMgr::parse_cpdsyllable(char * line) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np < 2) { @@ -3247,7 +3222,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af) numrep = atoi(piece); if (numrep < 1) { HUNSPELL_WARNING(stderr, "incorrect number of entries in replacement table\n"); - free(piece); + // free(piece); return 1; } reptable = (replentry *) malloc(numrep * sizeof(struct replentry)); @@ -3259,7 +3234,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -3284,7 +3259,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af) if (strncmp(piece,"REP",3) != 0) { HUNSPELL_WARNING(stderr, "error: replacement table is corrupt\n"); numrep = 0; - free(piece); + // free(piece); return 1; } break; @@ -3295,7 +3270,7 @@ int AffixMgr::parse_reptable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if ((!(reptable[j].pattern)) || (!(reptable[j].pattern2))) { @@ -3331,7 +3306,7 @@ int AffixMgr::parse_phonetable(char * line, FILE * af) if (!phone) return 1; if (phone->num < 1) { HUNSPELL_WARNING(stderr, "incorrect number of entries in phonelacement table\n"); - free(piece); + // free(piece); return 1; } phone->rules = (char * *) malloc(2 * (phone->num + 1) * sizeof(char *)); @@ -3343,7 +3318,7 @@ int AffixMgr::parse_phonetable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -3368,7 +3343,7 @@ int AffixMgr::parse_phonetable(char * line, FILE * af) if (strncmp(piece,"PHONE",5) != 0) { HUNSPELL_WARNING(stderr, "error: PHONE table is corrupt\n"); phone->num = 0; - free(piece); + // free(piece); return 1; } break; @@ -3379,7 +3354,7 @@ int AffixMgr::parse_phonetable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if ((!(phone->rules[j * 2])) || (!(phone->rules[j * 2 + 1]))) { @@ -3414,7 +3389,7 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) numcheckcpd = atoi(piece); if (numcheckcpd < 1) { HUNSPELL_WARNING(stderr, "incorrect number of entries in compound pattern table\n"); - free(piece); + // free(piece); return 1; } checkcpdtable = (replentry *) malloc(numcheckcpd * sizeof(struct replentry)); @@ -3426,7 +3401,7 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -3451,7 +3426,7 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) if (strncmp(piece,"CHECKCOMPOUNDPATTERN",20) != 0) { HUNSPELL_WARNING(stderr, "error: compound pattern table is corrupt\n"); numcheckcpd = 0; - free(piece); + // free(piece); return 1; } break; @@ -3462,7 +3437,7 @@ int AffixMgr::parse_checkcpdtable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if ((!(checkcpdtable[j].pattern)) || (!(checkcpdtable[j].pattern2))) { @@ -3494,7 +3469,7 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) numdefcpd = atoi(piece); if (numdefcpd < 1) { HUNSPELL_WARNING(stderr, "incorrect number of entries in compound rule table\n"); - free(piece); + // free(piece); return 1; } defcpdtable = (flagentry *) malloc(numdefcpd * sizeof(flagentry)); @@ -3506,7 +3481,7 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -3529,7 +3504,7 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) case 0: { if (strncmp(piece, "COMPOUNDRULE", 12) != 0) { HUNSPELL_WARNING(stderr, "error: compound rule table is corrupt\n"); - free(piece); + // free(piece); numdefcpd = 0; return 1; } @@ -3544,7 +3519,7 @@ int AffixMgr::parse_defcpdtable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (!defcpdtable[j].len) { @@ -3577,7 +3552,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) nummap = atoi(piece); if (nummap < 1) { HUNSPELL_WARNING(stderr, "incorrect number of entries in map table\n"); - free(piece); + // free(piece); return 1; } maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry)); @@ -3589,7 +3564,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -3614,7 +3589,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) if (strncmp(piece,"MAP",3) != 0) { HUNSPELL_WARNING(stderr, "error: map table is corrupt\n"); nummap = 0; - free(piece); + // free(piece); return 1; } break; @@ -3642,7 +3617,7 @@ int AffixMgr::parse_maptable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if ((!(maptable[j].set || maptable[j].set_utf16)) || (!(maptable[j].len))) { @@ -3674,7 +3649,7 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) numbreak = atoi(piece); if (numbreak < 1) { HUNSPELL_WARNING(stderr, "incorrect number of entries in BREAK table\n"); - free(piece); + // free(piece); return 1; } breaktable = (char **) malloc(numbreak * sizeof(char *)); @@ -3686,7 +3661,7 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -3708,7 +3683,7 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) case 0: { if (strncmp(piece,"BREAK",5) != 0) { HUNSPELL_WARNING(stderr, "error: BREAK table is corrupt\n"); - free(piece); + // free(piece); numbreak = 0; return 1; } @@ -3722,7 +3697,7 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (!breaktable) { @@ -3734,6 +3709,31 @@ int AffixMgr::parse_breaktable(char * line, FILE * af) return 0; } +void AffixMgr::reverse_condition(char * piece) { + int neg = 0; + for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { + switch(*k) { + case '[': { + if (neg) *(k+1) = '['; else *k = ']'; + break; + } + case ']': { + *k = '['; + if (neg) *(k+1) = '^'; + neg = 0; + break; + } + case '^': { + if (*(k+1) == ']') neg = 1; else *(k+1) = *k; + break; + } + default: { + if (neg) *(k+1) = *k; + } + } + } +} + int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflags) { int numents = 0; // number of affentry structures to parse @@ -3795,9 +3795,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag ptr->opts = ff; if (utf8) ptr->opts += aeUTF8; if (pHMgr->is_aliasf()) ptr->opts += aeALIASF; -#ifdef HUNSPELL_EXPERIMENTAL if (pHMgr->is_aliasm()) ptr->opts += aeALIASM; -#endif ptr->aflag = aflag; } @@ -3805,7 +3803,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } // check to make sure we parsed enough pieces @@ -3836,7 +3834,8 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag // piece 1 - is type case 0: { np++; - if (nptr != ptr) nptr->opts = ptr->opts; + if (nptr != ptr) nptr->opts = ptr->opts & + (aeXPRODUCT + aeUTF8 + aeALIASF + aeALIASM); break; } @@ -3848,7 +3847,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag HUNSPELL_WARNING(stderr, "error: affix %s is corrupt near line %s\n", err, nl); HUNSPELL_WARNING(stderr, "error: possible incorrect count\n"); free(err); - free(piece); + // free(piece); return 1; } @@ -3875,9 +3874,7 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag // piece 4 - is affix string or 0 for null case 3: { char * dash; -#ifdef HUNSPELL_EXPERIMENTAL nptr->morphcode = NULL; -#endif nptr->contclass = NULL; nptr->contclasslen = 0; np++; @@ -3939,59 +3936,44 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag case 4: { np++; if (complexprefixes) { - int neg = 0; if (utf8) reverseword_utf(piece); else reverseword(piece); - // reverse condition - for (char * k = piece + strlen(piece) - 1; k >= piece; k--) { - switch(*k) { - case '[': { - if (neg) *(k+1) = '['; else *k = ']'; - break; - } - case ']': { - *k = '['; - if (neg) *(k+1) = '^'; - neg = 0; - break; - } - case '^': { - if (*(k+1) == ']') neg = 1; else *(k+1) = *k; - break; - } - default: { - if (neg) *(k+1) = *k; - } - } - } + reverse_condition(piece); } if (nptr->stripl && (strcmp(piece, ".") != 0) && redundant_condition(at, nptr->strip, nptr->stripl, piece, nl)) strcpy(piece, "."); - if (encodeit(nptr,piece)) return 1; + if (at == 'S') { + reverseword(piece); + reverse_condition(piece); + } + if (encodeit(nptr, piece)) return 1; break; } -#ifdef HUNSPELL_EXPERIMENTAL case 5: { np++; if (pHMgr->is_aliasm()) { int index = atoi(piece); nptr->morphcode = pHMgr->get_aliasm(index); } else { - if (complexprefixes) { + if (complexprefixes) { // XXX - fix me for morph. gen. if (utf8) reverseword_utf(piece); else reverseword(piece); } - nptr->morphcode = mystrdup(piece); + // add the remaining of the line + if (*tp) { + *(tp - 1) = ' '; + tp = tp + strlen(tp); + } + nptr->morphcode = (char *) malloc(strlen(piece)+1); + strcpy(nptr->morphcode, piece); } break; } -#endif - default: break; } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } // check to make sure we parsed enough pieces @@ -4004,7 +3986,6 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag } #ifdef DEBUG -#ifdef HUNSPELL_EXPERIMENTAL // detect unnecessary fields, excepting comments if (basefieldnum) { int fieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6); @@ -4014,7 +3995,6 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af, char * dupflag basefieldnum = !(nptr->morphcode) ? 5 : ((*(nptr->morphcode)=='#') ? 5 : 6); } #endif -#endif nptr++; } diff --git a/src/myspell/affixmgr.hxx b/src/myspell/affixmgr.hxx index 37ae20e..644d2c9 100644 --- a/src/myspell/affixmgr.hxx +++ b/src/myspell/affixmgr.hxx @@ -46,7 +46,7 @@ class AffixMgr int checkcompoundtriple; FLAG forbiddenword; FLAG nosuggest; - FLAG pseudoroot; + FLAG needaffix; int cpdmin; int numrep; replentry * reptable; @@ -88,6 +88,7 @@ class AffixMgr FLAG circumfix; FLAG onlyincompound; FLAG keepcase; + FLAG substandard; int checksharps; int havecontclass; // boolean variable @@ -99,48 +100,55 @@ public: AffixMgr(const char * affpath, HashMgr * ptr); ~AffixMgr(); struct hentry * affix_check(const char * word, int len, - const unsigned short needflag = (unsigned short) 0, char in_compound = IN_CPD_NOT); + const unsigned short needflag = (unsigned short) 0, + char in_compound = IN_CPD_NOT); struct hentry * prefix_check(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); inline int isSubset(const char * s1, const char * s2); struct hentry * prefix_check_twosfx(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); inline int isRevSubset(const char * s1, const char * end_of_s2, int len); - struct hentry * suffix_check(const char * word, int len, int sfxopts, AffEntry* ppfx, - char ** wlst, int maxSug, int * ns, const FLAG cclass = FLAG_NULL, - const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + struct hentry * suffix_check(const char * word, int len, int sfxopts, + AffEntry* ppfx, char ** wlst, int maxSug, int * ns, + const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, + char in_compound = IN_CPD_NOT); struct hentry * suffix_check_twosfx(const char * word, int len, int sfxopts, AffEntry* ppfx, const FLAG needflag = FLAG_NULL); char * affix_check_morph(const char * word, int len, - const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); char * prefix_check_morph(const char * word, int len, - char in_compound, const FLAG needflag = FLAG_NULL); - char * suffix_check_morph (const char * word, int len, int sfxopts, AffEntry * ppfx, - const FLAG cclass = FLAG_NULL, const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); + char in_compound, const FLAG needflag = FLAG_NULL); + char * suffix_check_morph (const char * word, int len, int sfxopts, + AffEntry * ppfx, const FLAG cclass = FLAG_NULL, + const FLAG needflag = FLAG_NULL, char in_compound = IN_CPD_NOT); char * prefix_check_twosfx_morph(const char * word, int len, char in_compound, const FLAG needflag = FLAG_NULL); char * suffix_check_twosfx_morph(const char * word, int len, int sfxopts, AffEntry * ppfx, const FLAG needflag = FLAG_NULL); - int expand_rootword(struct guessword * wlst, int maxn, const char * ts, - int wl, const unsigned short * ap, unsigned short al, char * bad, int, - char *); + char * morphgen(char * ts, int wl, const unsigned short * ap, + unsigned short al, char * morph, char * targetmorph, int level); + + int expand_rootword(struct guessword * wlst, int maxn, const char * ts, + int wl, const unsigned short * ap, unsigned short al, char * bad, + int, char *); - short get_syllable (const char * word, int wlen); - int cpdrep_check(const char * word, int len); - int cpdpat_check(const char * word, int len); - int defcpd_check(hentry *** words, short wnum, hentry * rv, hentry ** rwords, char all); - int cpdcase_check(const char * word, int len); - inline int candidate_check(const char * word, int len); - struct hentry * compound_check(const char * word, int len, - short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, - char hu_mov_rule, int * cmpdstemnum, int * cmpdstem, char is_sug); + short get_syllable (const char * word, int wlen); + int cpdrep_check(const char * word, int len); + int cpdpat_check(const char * word, int len); + int defcpd_check(hentry *** words, short wnum, hentry * rv, + hentry ** rwords, char all); + int cpdcase_check(const char * word, int len); + inline int candidate_check(const char * word, int len); + struct hentry * compound_check(const char * word, int len, short wordnum, + short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule, int * cmpdstemnum, int * cmpdstem, char is_sug); - int compound_check_morph(const char * word, int len, - short wordnum, short numsyllable, short maxwordnum, short wnum, hentry ** words, - char hu_mov_rule, char ** result, char * partresult); + int compound_check_morph(const char * word, int len, short wordnum, + short numsyllable, short maxwordnum, short wnum, hentry ** words, + char hu_mov_rule, char ** result, char * partresult); struct hentry * lookup(const char * word); int get_numrep(); @@ -164,7 +172,7 @@ public: FLAG get_forbiddenword(); FLAG get_nosuggest(); // FLAG get_circumfix(); - FLAG get_pseudoroot(); + FLAG get_needaffix(); FLAG get_onlyincompound(); FLAG get_compoundroot(); FLAG get_lemma_present(); @@ -186,11 +194,8 @@ public: private: int parse_file(const char * affpath); -// int parse_string(char * line, char ** out, const char * name); int parse_flag(char * line, unsigned short * out, const char * name); int parse_num(char * line, int * out, const char * name); -// int parse_array(char * line, char ** out, unsigned short ** out_utf16, -// int * out_utf16_len, const char * name); int parse_cpdsyllable(char * line); int parse_reptable(char * line, FILE * af); int parse_phonetable(char * line, FILE * af); @@ -200,6 +205,8 @@ private: int parse_defcpdtable(char * line, FILE * af); int parse_affix(char * line, const char at, FILE * af, char * dupflags); + void reverse_condition(char *); + int condlen(char *); int encodeit(struct affentry * ptr, char * cs); int build_pfxtree(AffEntry* pfxptr); int build_sfxtree(AffEntry* sfxptr); @@ -209,7 +216,8 @@ private: AffEntry * process_sfx_in_order(AffEntry * ptr, AffEntry * nptr); int process_pfx_tree_to_list(); int process_sfx_tree_to_list(); - int redundant_condition(char, char * strip, int stripl, const char * cond, char *); + int redundant_condition(char, char * strip, int stripl, + const char * cond, char *); }; #endif diff --git a/src/myspell/atypes.hxx b/src/myspell/atypes.hxx index 009e85a..0afb345 100644 --- a/src/myspell/atypes.hxx +++ b/src/myspell/atypes.hxx @@ -26,7 +26,7 @@ static inline void HUNSPELL_WARNING(FILE *, const char *, ...) {} #define aeUTF8 (1 << 1) #define aeALIASF (1 << 2) #define aeALIASM (1 << 3) -#define aeINFIX (1 << 4) +#define aeLONGCOND (1 << 4) // compound options #define IN_CPD_NOT 0 @@ -38,6 +38,8 @@ static inline void HUNSPELL_WARNING(FILE *, const char *, ...) {} #define MINCPDLEN 3 #define MAXCOMPOUND 10 +#define MAXCONDLEN 20 +#define MAXCONDLEN_1 (MAXCONDLEN - sizeof(char *)) #define MAXACC 1000 @@ -56,21 +58,16 @@ struct affentry char numconds; char opts; unsigned short aflag; - union { - char base[SETSIZE]; - struct { - char ascii[SETSIZE/2]; - char neg[8]; - char all[8]; - w_char * wchars[8]; - int wlen[8]; - } utf8; - } conds; -#ifdef HUNSPELL_EXPERIMENTAL - char * morphcode; -#endif unsigned short * contclass; short contclasslen; + union { + char conds[MAXCONDLEN]; + struct { + char conds1[MAXCONDLEN_1]; + char * conds2; + } l; + } c; + char * morphcode; }; struct mapentry { @@ -91,8 +88,3 @@ struct guessword { }; #endif - - - - - diff --git a/src/myspell/baseaffix.hxx b/src/myspell/baseaffix.hxx index d6a5cd6..03a876d 100644 --- a/src/myspell/baseaffix.hxx +++ b/src/myspell/baseaffix.hxx @@ -6,26 +6,23 @@ class AffEntry public: protected: - char * appnd; - char * strip; - unsigned char appndl; - unsigned char stripl; - char numconds; - char opts; - unsigned short aflag; - union { - char base[SETSIZE]; - struct { - char ascii[SETSIZE/2]; - char neg[8]; - char all[8]; - w_char * wchars[8]; - int wlen[8]; - } utf8; - } conds; - char * morphcode; - unsigned short * contclass; - short contclasslen; + char * appnd; + char * strip; + unsigned char appndl; + unsigned char stripl; + char numconds; + char opts; + unsigned short aflag; + union { + char conds[MAXCONDLEN]; + struct { + char conds1[MAXCONDLEN_1]; + char * conds2; + } l; + } c; + char * morphcode; + unsigned short * contclass; + short contclasslen; }; #endif diff --git a/src/myspell/csutil.cxx b/src/myspell/csutil.cxx index c56f493..6914957 100644 --- a/src/myspell/csutil.cxx +++ b/src/myspell/csutil.cxx @@ -125,7 +125,7 @@ int u8_u16(w_char * dest, int size, const char * src) { case 0x90: case 0xa0: case 0xb0: { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %d. character position\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %ld. character position\n%s\n", static_cast<long>(u8 - src), src); u2->h = 0xff; u2->l = 0xfd; break; @@ -137,7 +137,7 @@ int u8_u16(w_char * dest, int size, const char * src) { u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); u8++; } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - src), src); u2->h = 0xff; u2->l = 0xfd; } @@ -151,12 +151,12 @@ int u8_u16(w_char * dest, int size, const char * src) { u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); u8++; } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - src), src); u2->h = 0xff; u2->l = 0xfd; } } else { - HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); + HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - src), src); u2->h = 0xff; u2->l = 0xfd; } @@ -221,13 +221,11 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { char * mystrsep(char ** stringp, const char delim) { - char * rv = NULL; char * mp = *stringp; - int n = strlen(mp); - if (n > 0) { + if (*mp != '\0') { char * dp; if (delim) { - dp = (char *)memchr(mp,(int)((unsigned char)delim),n); + dp = strchr(mp, delim); } else { // don't use isspace() here, the string can be in some random charset // that's way different than the locale's @@ -237,20 +235,11 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { if (dp) { *stringp = dp+1; int nc = (int)((unsigned long)dp - (unsigned long)mp); - rv = (char *) malloc(nc+1); - if (rv) { - memcpy(rv,mp,nc); - *(rv+nc) = '\0'; - return rv; - } + *(mp+nc) = '\0'; + return mp; } else { - rv = (char *) malloc(n+1); - if (rv) { - memcpy(rv, mp, n); - *(rv+n) = '\0'; - *stringp = mp + n; - return rv; - } + *stringp = mp + strlen(mp); + return mp; } } return NULL; @@ -296,112 +285,248 @@ int flag_bsearch(unsigned short flags[], unsigned short flag, int length) { return d; } -#ifdef HUNSPELL_EXPERIMENTAL - // append s to ends of every lines in text - void strlinecat(char * dest, const char * s) - { - char * dup = mystrdup(dest); - char * source = dup; - int len = strlen(s); - while (*source) { - if (*source == '\n') { - strncpy(dest, s, len); - dest += len; - } - *dest = *source; - source++; dest++; - } - strcpy(dest, s); - free(dup); - } - // break text to lines // return number of lines -int line_tok(const char * text, char *** lines) { +int line_tok(const char * text, char *** lines, char breakchar) { int linenum = 0; char * dup = mystrdup(text); - char * p = strchr(dup, '\n'); + char * p = strchr(dup, breakchar); while (p) { linenum++; *p = '\0'; p++; - p = strchr(p, '\n'); + p = strchr(p, breakchar); + } + linenum++; + *lines = (char **) malloc(linenum * sizeof(char *)); + if (!(*lines)) { + free(dup); + return 0; } - *lines = (char **) calloc(linenum + 1, sizeof(char *)); - if (!(*lines)) return -1; - p = dup; - for (int i = 0; i < linenum + 1; i++) { - (*lines)[i] = mystrdup(p); + p = dup; + int l = 0; + for (int i = 0; i < linenum; i++) { + if (*p != '\0') { + (*lines)[l] = mystrdup(p); + l++; + } p += strlen(p) + 1; } free(dup); - return linenum; + if (!l) free(*lines); + return l; } // uniq line in place -char * line_uniq(char * text) { +char * line_uniq(char * text, char breakchar) { char ** lines; - int linenum = line_tok(text, &lines); + int linenum = line_tok(text, &lines, breakchar); int i; strcpy(text, lines[0]); - for ( i = 1; i<=linenum; i++ ) { + for ( i = 1; i < linenum; i++ ) { int dup = 0; for (int j = 0; j < i; j++) { if (strcmp(lines[i], lines[j]) == 0) dup = 1; } if (!dup) { - if ((i > 1) || (*(lines[0]) != '\0')) strcat(text, "\n"); + if ((i > 1) || (*(lines[0]) != '\0')) { + sprintf(text + strlen(text), "%c", breakchar); + } strcat(text, lines[i]); } } - for ( i = 0; i<=linenum; i++ ) { + for ( i = 0; i < linenum; i++ ) { if (lines[i]) free(lines[i]); } if (lines) free(lines); return text; } +// uniq and boundary for compound analysis: "1\n\2\n\1" -> " ( \1 | \2 ) " +char * line_uniq_app(char ** text, char breakchar) { + if (!strchr(*text, breakchar)) { + return *text; + } + + char ** lines; + int linenum = line_tok(*text, &lines, breakchar); + int dup = 0; + for (int i = 0; i < linenum; i++) { + for (int j = 0; j < (i - 1); j++) { + if (strcmp(lines[i], lines[j]) == 0) { + *(lines[i]) = '\0'; + dup++; + break; + } + } + } + if ((linenum - dup) == 1) { + strcpy(*text, lines[0]); + freelist(&lines, linenum); + return *text; + } + char * newtext = (char *) malloc(strlen(*text) + 2 * linenum + 3 + 1); + if (newtext) { + free(*text); + *text = newtext; + } else { + freelist(&lines, linenum); + return *text; + } + strcpy(*text," ( "); + for (int i = 0; i < linenum; i++) if (*(lines[i])) { + sprintf(*text + strlen(*text), "%s%s", lines[i], " | "); + } + (*text)[strlen(*text) - 2] = ')'; // " ) " + freelist(&lines, linenum); + return *text; +} + + // append s to ends of every lines in text + void strlinecat(char * dest, const char * s) + { + char * dup = mystrdup(dest); + char * source = dup; + int len = strlen(s); + while (*source) { + if (*source == '\n') { + strncpy(dest, s, len); + dest += len; + } + *dest = *source; + source++; dest++; + } + strcpy(dest, s); + free(dup); + } + // change \n to char c -char * line_join(char * text, char c) { +char * tr(char * text, char oldc, char newc) { char * p; - for (p = text; *p; p++) if (*p == '\n') *p = c; + for (p = text; *p; p++) if (*p == oldc) *p = newc; return text; } -// leave only last {[^}]*} substring for handling zero morphemes -char * delete_zeros(char * morphout) { - char * p = morphout; - char * q = p; - char * q2 = NULL; - int suffix = 0; - - for (;*p && *(p+1);) { - switch (*p) { - case '{': - q2 = q; - q--; - break; - case '}': - if (q2) { - suffix = 1; - q--; - } - break; - default: - if (suffix) { - q = q2; - } - suffix = 0; - *q = *p; +// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields +// in the first line of the inputs +// return 0, if inputs equal +// return 1, if inputs may equal with a secondary suffix +// otherwise return -1 +int morphcmp(const char * s, const char * t) +{ + int se; + int te; + char * sl; + char * tl; + const char * olds; + const char * oldt; + if (!s || !t) return 1; + olds = s; + sl = strchr(s, '\n'); + s = strstr(s, MORPH_DERI_SFX); + if (!s || (sl && sl < s)) s = strstr(olds, MORPH_INFL_SFX); + if (!s || (sl && sl < s)) { + s= strstr(olds, MORPH_TERM_SFX); + olds = NULL; + } + oldt = t; + tl = strchr(t, '\n'); + t = strstr(t, MORPH_DERI_SFX); + if (!t || (tl && tl < t)) t = strstr(oldt, MORPH_INFL_SFX); + if (!t || (tl && tl < t)) { + t = strstr(oldt, MORPH_TERM_SFX); + oldt = NULL; + } + while (s && t && (!sl || sl > s) && (!tl || tl > t)) { + s += MORPH_TAG_LEN; + t += MORPH_TAG_LEN; + se = 0; + te = 0; + while ((*s == *t) && !se && !te) { + s++; + t++; + switch(*s) { + case ' ': + case '\n': + case '\t': + case '\0': se = 1; + } + switch(*t) { + case ' ': + case '\n': + case '\t': + case '\0': te = 1; + } } - p++; - q++; + if (!se || !te) { + // not terminal suffix difference + if (olds) return -1; + return 1; + } + olds = s; + s = strstr(s, MORPH_DERI_SFX); + if (!s || (sl && sl < s)) s = strstr(olds, MORPH_INFL_SFX); + if (!s || (sl && sl < s)) { + s = strstr(olds, MORPH_TERM_SFX); + olds = NULL; + } + oldt = t; + t = strstr(t, MORPH_DERI_SFX); + if (!t || (tl && tl < t)) t = strstr(oldt, MORPH_INFL_SFX); + if (!t || (tl && tl < t)) { + t = strstr(oldt, MORPH_TERM_SFX); + oldt = NULL; + } + } + if (!s && !t && se && te) return 0; + return 1; +} + +int get_sfxcount(const char * morph) +{ + if (!morph || !*morph) return 0; + int n = 0; + const char * old = morph; + morph = strstr(morph, MORPH_DERI_SFX); + if (!morph) morph = strstr(old, MORPH_INFL_SFX); + if (!morph) morph = strstr(old, MORPH_TERM_SFX); + while (morph) { + n++; + old = morph; + morph = strstr(morph + 1, MORPH_DERI_SFX); + if (!morph) morph = strstr(old + 1, MORPH_INFL_SFX); + if (!morph) morph = strstr(old + 1, MORPH_TERM_SFX); + } + return n; +} + + +int fieldlen(const char * r) +{ + int n = 0; + while (r && *r != '\t' && *r != '\0' && *r != '\n' && *r != ' ') { + r++; + n++; } - *q = '\0'; - return morphout; + return n; +} + +char * copy_field(char * dest, const char * morph, const char * var) +{ + if (!morph) return NULL; + char * beg = strstr(morph, var); + if (beg) { + char * d = dest; + for (beg += MORPH_TAG_LEN; *beg != ' ' && *beg != '\t' && + *beg != '\n' && *beg != '\0'; d++, beg++) { + *d = *beg; + } + *d = '\0'; + return dest; + } + return NULL; } -#endif // END OF HUNSPELL_EXPERIMENTAL CODE char * mystrrep(char * word, const char * pat, const char * rep) { char * pos = strstr(word, pat); @@ -452,6 +577,33 @@ char * mystrrep(char * word, const char * pat, const char * rep) { u16_u8(word, MAXWORDUTF8LEN, w, l); return 0; } + + int uniqlist(char ** list, int n) { + if (n < 2) return n; + for (int i = 0; i < n; i++) { + for (int j = 0; j < i; j++) { + if (list[j] && list[i] && (strcmp(list[j], list[i]) == 0)) { + free(list[i]); + list[i] = NULL; + break; + } + } + } + int m = 1; + for (int i = 1; i < n; i++) if (list[i]) { + list[m] = list[i]; + m++; + } + return m; + } + + void freelist(char *** list, int n) { + if (list && (n > 0)) { + for (int i = 0; i < n; i++) if ((*list)[i]) free((*list)[i]); + free(*list); + *list = NULL; + } + } // convert null terminated string to all caps void mkallcap(char * p, const struct cs_info * csconv) @@ -5319,7 +5471,7 @@ int parse_string(char * line, char ** out, const char * warnvar) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { diff --git a/src/myspell/csutil.hxx b/src/myspell/csutil.hxx index e1ba94e..df7979b 100644 --- a/src/myspell/csutil.hxx +++ b/src/myspell/csutil.hxx @@ -9,17 +9,32 @@ #define HUHCAP 3 #define HUHINITCAP 4 -#define FIELD_STEM "st:" -#define FIELD_POS "po:" -#define FIELD_SUFF "su:" -#define FIELD_PREF "pr:" -#define FIELD_FREQ "fr:" -#define FIELD_PHON "ph:" -#define FIELD_HYPH "hy:" -#define FIELD_COMP "co:" +#define MORPH_STEM "st:" +#define MORPH_ALLOMORPH "al:" +#define MORPH_POS "po:" +#define MORPH_DERI_PFX "dp:" +#define MORPH_INFL_PFX "ip:" +#define MORPH_TERM_PFX "tp:" +#define MORPH_DERI_SFX "ds:" +#define MORPH_INFL_SFX "is:" +#define MORPH_TERM_SFX "ts:" +#define MORPH_SURF_PFX "sp:" +#define MORPH_FREQ "fr:" +#define MORPH_PHON "ph:" +#define MORPH_HYPH "hy:" +#define MORPH_PART "pa:" +#define MORPH_HENTRY "_H:" +#define MORPH_TAG_LEN strlen(MORPH_STEM) + +#define MSEP_FLD ' ' +#define MSEP_REC '\n' +#define MSEP_ALT '\v' + // default flags -#define ONLYUPCASEFLAG 65535 +#define DEFAULTFLAGS 65510 +#define FORBIDDENWORD 65510 +#define ONLYUPCASEFLAG 65511 typedef struct { unsigned char l; @@ -61,16 +76,14 @@ char * mystrrep(char *, const char *, const char *); void strlinecat(char * lines, const char * s); // tokenize into lines with new line - int line_tok(const char * text, char *** lines); + int line_tok(const char * text, char *** lines, char breakchar); // tokenize into lines with new line and uniq in place - char * line_uniq(char * text); - -// change \n to c in place - char * line_join(char * text, char c); + char * line_uniq(char * text, char breakchar); + char * line_uniq_app(char ** text, char breakchar); -// leave only last {[^}]*} pattern in string - char * delete_zeros(char * morphout); +// change oldchar to newchar in place + char * tr(char * text, char oldc, char newc); // reverse word int reverseword(char *); @@ -78,6 +91,12 @@ void strlinecat(char * lines, const char * s); // reverse word int reverseword_utf(char *); +// remove duplicates + int uniqlist(char ** list, int n); + +// free character array list + void freelist(char *** list, int n); + // character encoding information struct cs_info { unsigned char ccase; @@ -174,4 +193,11 @@ int parse_string(char * line, char ** out, const char * name); int parse_array(char * line, char ** out, unsigned short ** out_utf16, int * out_utf16_len, const char * name, int utf8); +int fieldlen(const char * r); +char * copy_field(char * dest, const char * morph, const char * var); + +int morphcmp(const char * s, const char * t); + +int get_sfxcount(const char * morph); + #endif diff --git a/src/myspell/hashmgr.cxx b/src/myspell/hashmgr.cxx index 656fb85..08e061c 100644 --- a/src/myspell/hashmgr.cxx +++ b/src/myspell/hashmgr.cxx @@ -47,7 +47,7 @@ HashMgr::HashMgr(const char * tpath, const char * apath) aliasf = NULL; numaliasm = 0; aliasm = NULL; - forbiddenword = FLAG_NULL; // forbidden word signing flag + forbiddenword = FORBIDDENWORD; // forbidden word signing flag load_config(apath); int ec = load_tables(tpath); if (ec) { @@ -70,20 +70,9 @@ HashMgr::~HashMgr() for (int i=0; i < tablesize; i++) { struct hentry * pt = tableptr[i]; struct hentry * nt = NULL; -/* if (pt) { - if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr); -#ifdef HUNSPELL_EXPERIMENTAL - if (pt->description && !aliasm) free(pt->description); -#endif - pt = pt->next; - } -*/ while(pt) { nt = pt->next; if (pt->astr && (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen))) free(pt->astr); -#ifdef HUNSPELL_EXPERIMENTAL - if (pt->description && !aliasm) free(pt->description); -#endif free(pt); pt = nt; } @@ -140,20 +129,13 @@ int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff, int al, const char * desc, bool onlyupcase) { bool upcasehomonym = false; - int descl = (desc) ? strlen(desc) : 0; + int descl = desc ? (aliasm ? sizeof(char *) : strlen(desc) + 1) : 0; // variable-length hash record with word and optional fields - // instead of mmap implementation temporarily struct hentry* hp = - (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl + 1); + (struct hentry *) malloc (sizeof(struct hentry) + wbl + descl); if (!hp) return 1; char * hpw = &(hp->word); strcpy(hpw, word); - if (desc && strncmp(desc, FIELD_PHON, strlen(FIELD_PHON)) == 0) { - strcpy(hpw + wbl + 1, desc + strlen(FIELD_PHON)); - hp->var = 1; - } else { - hp->var = 0; - } if (ignorechars != NULL) { if (utf8) { remove_ignored_chars_utf(hpw, ignorechars_utf16, ignorechars_utf16_len); @@ -167,29 +149,29 @@ int HashMgr::add_word(const char * word, int wbl, int wcl, unsigned short * aff, int i = hash(hpw); - hp->blen = (unsigned char) wbl; - hp->clen = (unsigned char) wcl; - hp->alen = (short) al; - hp->astr = aff; - hp->next = NULL; - hp->next_homonym = NULL; -#ifdef HUNSPELL_EXPERIMENTAL - if (aliasm) { - hp->description = (desc) ? get_aliasm(atoi(desc)) : mystrdup(desc); - } else { - hp->description = mystrdup(desc); - if (desc && !hp->description) - { - free(hp->astr); - free(hp); - return 1; - } - if (hp->description && complexprefixes) { - if (utf8) reverseword_utf(hp->description); else reverseword(hp->description); + hp->blen = (unsigned char) wbl; + hp->clen = (unsigned char) wcl; + hp->alen = (short) al; + hp->astr = aff; + hp->next = NULL; + hp->next_homonym = NULL; + + // store the description string or its pointer + if (desc) { + hp->var = H_OPT; + if (aliasm) { + hp->var += H_OPT_ALIASM; + *((char **) (hpw + wbl + 1)) = get_aliasm(atoi(desc)); + } else { + strcpy(hpw + wbl + 1, desc); + if (complexprefixes) { + if (utf8) reverseword_utf(HENTRY_DATA(hp)); + else reverseword(HENTRY_DATA(hp)); } - } -#endif - + } + if (strstr(HENTRY_DATA(hp), MORPH_PHON)) hp->var += H_OPT_PHON; + } else hp->var = 0; + struct hentry * dp = tableptr[i]; if (!dp) { tableptr[i] = hp; @@ -284,8 +266,31 @@ int HashMgr::get_clen_and_captype(const char * word, int wbl, int * captype) { return len; } +// remove word with FORBIDDENWORD flag (not implemented) +int HashMgr::remove(const char * word) +{ + struct hentry * dp = lookup(word); +/* + if (!word || (!dp->astr || !TESTAFF(dp->astr, forbiddenword, pt->alen))) { + int wbl = strlen(word); + int wcl = get_clen_and_captype(word, wbl, &captype); + if (aliasf) { + add_word(word, wbl, wcl, dp->astr, dp->alen, NULL, false); + } else { + unsigned short * flags = (unsigned short *) malloc (dp->alen * sizeof(short)); + if (flags) { + memcpy((void *) flags, (void *) dp->astr, dp->alen * sizeof(short)); + add_word(word, wbl, wcl, flags, dp->alen, NULL, false); + } else return 1; + } + return add_hidden_capitalized_word((char *) word, wbl, wcl, dp->astr, dp->alen, NULL, captype); + } +*/ + return 1; +} + // add a custom dic. word to the hash table (public) -int HashMgr::put_word(const char * word, char * aff) +int HashMgr::add(const char * word, char * aff) { unsigned short * flags; int al = 0; @@ -303,10 +308,10 @@ int HashMgr::put_word(const char * word, char * aff) return add_hidden_capitalized_word((char *) word, wbl, wcl, flags, al, NULL, captype); } -int HashMgr::put_word_pattern(const char * word, const char * pattern) +int HashMgr::add_with_affix(const char * word, const char * example) { // detect captype and modify word length for UTF-8 encoding - struct hentry * dp = lookup(pattern); + struct hentry * dp = lookup(example); if (dp && dp->astr) { int captype; int wbl = strlen(word); @@ -389,6 +394,8 @@ int HashMgr::load_tables(const char * tpath) mychomp(ts); // split each line into word and morphological description dp = strchr(ts,'\t'); + char * dp2 = strchr(ts,' '); + if (dp2 && (!dp || (dp2 < dp))) dp = dp2; if (dp) { *dp = '\0'; @@ -644,16 +651,15 @@ int HashMgr::load_config(const char * affpath) } } -#ifdef HUNSPELL_EXPERIMENTAL if ((strncmp(line,"AM",2) == 0) && isspace(line[2])) { if (parse_aliasm(line, afflst)) { fclose(afflst); return 1; } } -#endif - if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; - if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; + + if (strncmp(line,"COMPLEXPREFIXES",15) == 0) complexprefixes = 1; + if (((strncmp(line,"SFX",3) == 0) || (strncmp(line,"PFX",3) == 0)) && isspace(line[3])) break; } if (csconv == NULL) csconv = get_current_cs("ISO8859-1"); fclose(afflst); @@ -683,7 +689,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) aliasf = NULL; aliasflen = NULL; HUNSPELL_WARNING(stderr, "incorrect number of entries in AF table\n"); - free(piece); + // free(piece); return 1; } aliasf = (unsigned short **) malloc(numaliasf * sizeof(unsigned short *)); @@ -703,7 +709,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -737,7 +743,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) aliasf = NULL; aliasflen = NULL; HUNSPELL_WARNING(stderr, "error: AF table is corrupt\n"); - free(piece); + // free(piece); return 1; } break; @@ -751,7 +757,7 @@ int HashMgr::parse_aliasf(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (!aliasf[j]) { @@ -781,7 +787,6 @@ int HashMgr::get_aliasf(int index, unsigned short ** fvec) { return 0; } -#ifdef HUNSPELL_EXPERIMENTAL /* parse morph alias definitions */ int HashMgr::parse_aliasm(char * line, FILE * af) { @@ -802,7 +807,7 @@ int HashMgr::parse_aliasm(char * line, FILE * af) numaliasm = atoi(piece); if (numaliasm < 1) { HUNSPELL_WARNING(stderr, "incorrect number of entries in AM table\n"); - free(piece); + // free(piece); return 1; } aliasm = (char **) malloc(numaliasm * sizeof(char *)); @@ -817,7 +822,7 @@ int HashMgr::parse_aliasm(char * line, FILE * af) } i++; } - free(piece); + // free(piece); piece = mystrsep(&tp, 0); } if (np != 2) { @@ -836,14 +841,14 @@ int HashMgr::parse_aliasm(char * line, FILE * af) tp = nl; i = 0; aliasm[j] = NULL; - piece = mystrsep(&tp, 0); + piece = mystrsep(&tp, ' '); while (piece) { if (*piece != '\0') { switch(i) { case 0: { if (strncmp(piece,"AM",2) != 0) { HUNSPELL_WARNING(stderr, "error: AM table is corrupt\n"); - free(piece); + // free(piece); numaliasm = 0; free(aliasm); aliasm = NULL; @@ -852,6 +857,11 @@ int HashMgr::parse_aliasm(char * line, FILE * af) break; } case 1: { + // add the remaining of the line + if (*tp) { + *(tp - 1) = ' '; + tp = tp + strlen(tp); + } if (complexprefixes) { if (utf8) reverseword_utf(piece); else reverseword(piece); @@ -862,8 +872,8 @@ int HashMgr::parse_aliasm(char * line, FILE * af) } i++; } - free(piece); - piece = mystrsep(&tp, 0); + // free(piece); + piece = mystrsep(&tp, ' '); } if (!aliasm[j]) { numaliasm = 0; @@ -885,4 +895,3 @@ char * HashMgr::get_aliasm(int index) { HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index); return NULL; } -#endif diff --git a/src/myspell/hashmgr.hxx b/src/myspell/hashmgr.hxx index cf5148f..d88de48 100644 --- a/src/myspell/hashmgr.hxx +++ b/src/myspell/hashmgr.hxx @@ -42,18 +42,16 @@ public: int hash(const char *) const; struct hentry * walk_hashtable(int & col, struct hentry * hp) const; - int put_word(const char * word, char * ap); - int put_word_pattern(const char * word, const char * pattern); + int add(const char * word, char * aff); + int add_with_affix(const char * word, const char * pattern); + int remove(const char * word); int decode_flags(unsigned short ** result, char * flags); unsigned short decode_flag(const char * flag); char * encode_flag(unsigned short flag); int is_aliasf(); int get_aliasf(int index, unsigned short ** fvec); -#ifdef HUNSPELL_EXPERIMENTAL int is_aliasm(); char * get_aliasm(int index); -#endif - private: int get_clen_and_captype(const char * word, int wbl, int * captype); @@ -64,9 +62,7 @@ private: int parse_aliasf(char * line, FILE * af); int add_hidden_capitalized_word(char * word, int wbl, int wcl, unsigned short * flags, int al, char * dp, int captype); -#ifdef HUNSPELL_EXPERIMENTAL int parse_aliasm(char * line, FILE * af); -#endif }; diff --git a/src/myspell/htypes.hxx b/src/myspell/htypes.hxx index ea43730..bc078c3 100644 --- a/src/myspell/htypes.hxx +++ b/src/myspell/htypes.hxx @@ -8,6 +8,16 @@ #define ROTATE(v,q) \ (v) = ((v) << (q)) | (((v) >> (32 - q)) & ((1 << (q))-1)); +// hentry options +#define H_OPT (1 << 0) +#define H_OPT_ALIASM (1 << 1) +#define H_OPT_PHON (1 << 2) + +#define HENTRY_WORD(h) &(h->word) +#define HENTRY_DATA(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \ + *((char **) (&(h->word) + h->blen + 1)) : &(h->word) + h->blen + 1) : NULL) +#define HENTRY_FIND(h,p) (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL) + // approx. number of user defined words #define USERWORD 1000 @@ -19,9 +29,6 @@ struct hentry unsigned short * astr; // affix flag vector struct hentry * next; // next word with same hash code struct hentry * next_homonym; // next homonym word (with same hash code) -#ifdef HUNSPELL_EXPERIMENTAL - char * description; // morphological data (optional) -#endif char var; // variable fields (only for special pronounciation yet) char word; // variable-length word (8-bit or UTF-8 encoding) }; diff --git a/src/myspell/hunspell.cxx b/src/myspell/hunspell.cxx index cd31a64..3f114c9 100644 --- a/src/myspell/hunspell.cxx +++ b/src/myspell/hunspell.cxx @@ -14,6 +14,10 @@ #include "hunspell.hxx" #include "hunspell.h" +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + #ifndef MOZILLA_CLIENT #ifndef W32 using namespace std; @@ -59,7 +63,7 @@ Hunspell::~Hunspell() pAMgr = NULL; pHMgr = NULL; #ifdef MOZILLA_CLIENT - delete csconv; + free(csconv); #endif csconv= NULL; if (encoding) free(encoding); @@ -117,7 +121,6 @@ int Hunspell::cleanword2(char * dest, const char * src, return nl; } -#ifdef HUNSPELL_EXPERIMENTAL int Hunspell::cleanword(char * dest, const char * src, int * pcaptype, int * pabbrev) { @@ -190,7 +193,6 @@ int Hunspell::cleanword(char * dest, const char * src, } return strlen(dest); } -#endif void Hunspell::mkallcap(char * p) { @@ -602,9 +604,9 @@ struct hentry * Hunspell::checkword(const char * w, int * info, char ** root) return NULL; } - // he = next not pseudoroot, onlyincompound homonym or onlyupcase word + // he = next not needaffix, onlyincompound homonym or onlyupcase word while (he && (he->astr) && - ((pAMgr->get_pseudoroot() && TESTAFF(he->astr, pAMgr->get_pseudoroot(), he->alen)) || + ((pAMgr->get_needaffix() && TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) || (pAMgr->get_onlyincompound() && TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) || (info && (*info & SPELL_INITCAP) && TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)) )) he = he->next_homonym; @@ -1025,95 +1027,78 @@ int Hunspell::suggest_auto(char*** slst, const char * word) // END OF LANG_hu section return ns; } +#endif -// XXX need UTF-8 support -int Hunspell::stem(char*** slst, const char * word) +int Hunspell::stem(char*** slst, char ** desc, int n) { - char cw[MAXWORDUTF8LEN]; - char wspace[MAXWORDUTF8LEN]; - if (! pSMgr) return 0; - int wl = strlen(word); - if (utf8) { - if (wl >= MAXWORDUTF8LEN) return 0; - } else { - if (wl >= MAXWORDLEN) return 0; - } - int captype = 0; - int abbv = 0; - wl = cleanword(cw, word, &captype, &abbv); - if (wl == 0) return 0; - - int ns = 0; - - *slst = NULL; // HU, nsug in pSMgr->suggest - - switch(captype) { - case HUHCAP: - case NOCAP: { - ns = pSMgr->suggest_stems(slst, cw, ns); - - if ((abbv) && (ns == 0)) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - ns = pSMgr->suggest_stems(slst, wspace, ns); - } - - break; - } - - case INITCAP: { - - ns = pSMgr->suggest_stems(slst, cw, ns); - - if (ns == 0) { - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace); - ns = pSMgr->suggest_stems(slst, wspace, ns); - - } - - if ((abbv) && (ns == 0)) { - memcpy(wspace,cw,wl); - mkallsmall(wspace); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - ns = pSMgr->suggest_stems(slst, wspace, ns); - } - - break; - - } - - case ALLCAP: { - ns = pSMgr->suggest_stems(slst, cw, ns); - if (ns != 0) break; - - memcpy(wspace,cw,(wl+1)); - mkallsmall(wspace); - ns = pSMgr->suggest_stems(slst, wspace, ns); - - if (ns == 0) { - mkinitcap(wspace); - ns = pSMgr->suggest_stems(slst, wspace, ns); - } - - if ((abbv) && (ns == 0)) { - memcpy(wspace,cw,wl); - mkallsmall(wspace); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - ns = pSMgr->suggest_stems(slst, wspace, ns); - } - + char result[MAXLNLEN]; + char result2[MAXLNLEN]; + if (n == 0) return 0; + *result2 = '\0'; + for (int i = 0; i < n; i++) { + *result = '\0'; + // add compound word parts (except the last one) + char * s = (char *) desc[i]; + char * part = strstr(s, MORPH_PART); + if (part) { + char * nextpart = strstr(part + 1, MORPH_PART); + while (nextpart) { + copy_field(result + strlen(result), part, MORPH_PART); + part = nextpart; + nextpart = strstr(part + 1, MORPH_PART); + } + s = part; + } - break; - } + char **pl; + char tok[MAXLNLEN]; + strcpy(tok, s); + char * alt = strstr(tok, " | "); + while (alt) { + alt[1] = MSEP_ALT; + alt = strstr(alt, " | "); + } + int pln = line_tok(tok, &pl, MSEP_ALT); + for (int i = 0; i < pln; i++) { + // add derivational suffixes + if (strstr(pl[i], MORPH_DERI_SFX)) { + // remove inflectional suffixes + char * is = strstr(pl[i], MORPH_INFL_SFX); + if (is) *is = '\0'; + char * sg = pSMgr->suggest_gen(&(pl[i]), 1, pl[i]); + if (sg) { + char ** gen; + int genl = line_tok(sg, &gen, MSEP_REC); + free(sg); + for (int j = 0; j < genl; j++) { + sprintf(result2 + strlen(result2), "%c%s%s", + MSEP_REC, result, gen[j]); + } + freelist(&gen, genl); + } + } else { + sprintf(result2 + strlen(result2), "%c%s", MSEP_REC, result); + if (strstr(pl[i], MORPH_SURF_PFX)) { + copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX); + } + copy_field(result2 + strlen(result2), pl[i], MORPH_STEM); + } + } + freelist(&pl, pln); } - - return ns; + return uniqlist(*slst, line_tok(result2, slst, MSEP_REC)); } +int Hunspell::stem(char*** slst, const char * word) +{ + char ** pl; + int pln = analyze(&pl, word); + int pln2 = stem(slst, pl, pln); + freelist(&pl, pln); + return pln2; +} + +#ifdef HUNSPELL_EXPERIMENTAL int Hunspell::suggest_pos_stems(char*** slst, const char * word) { char cw[MAXWORDUTF8LEN]; @@ -1236,15 +1221,23 @@ int Hunspell::mkinitsmall2(char * p, w_char * u, int nc) return nc; } -int Hunspell::put_word(const char * word) +int Hunspell::add(const char * word) { - if (pHMgr) return pHMgr->put_word(word, NULL); + if (pHMgr) return pHMgr->add(word, NULL); return 0; } -int Hunspell::put_word_pattern(const char * word, const char * pattern) +int Hunspell::add_with_affix(const char * word, const char * example) { - if (pHMgr) return pHMgr->put_word_pattern(word, pattern); + if (pHMgr) return pHMgr->add_with_affix(word, example); + return 0; +} + +/* XXX not implemented yet */ + +int Hunspell::remove(const char * word) +{ + if (pHMgr) return pHMgr->remove(word); return 0; } @@ -1258,9 +1251,16 @@ struct cs_info * Hunspell::get_csconv() return csconv; } -#ifdef HUNSPELL_EXPERIMENTAL -// XXX need UTF-8 support -char * Hunspell::morph(const char * word) +char * Hunspell::cat_result(char * result, char * st) +{ + if (st) { + if (*result) strcat(result, "\n"); + strcat(result, st); + free(st); + } +} + +int Hunspell::analyze(char*** slst, const char * word) { char cw[MAXWORDUTF8LEN]; char wspace[MAXWORDUTF8LEN]; @@ -1305,156 +1305,77 @@ char * Hunspell::morph(const char * word) } } - if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return NULL; + if ((n == wl) && (n3 > 0) && (n - n3 > 3)) return 0; if ((n == wl) || ((n>0) && ((cw[n]=='%') || (cw[n]=='\xB0')) && checkword(cw+n, NULL, NULL))) { strcat(result, cw); result[n - 1] = '\0'; - if (n == wl) { - st = pSMgr->suggest_morph(cw + n - 1); - if (st) { - strcat(result, st); - free(st); - } - } else { + if (n == wl) cat_result(result, pSMgr->suggest_morph(cw + n - 1)); + else { char sign = cw[n]; cw[n] = '\0'; - st = pSMgr->suggest_morph(cw + n - 1); - if (st) { - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(cw + n - 1)); strcat(result, "+"); // XXX SPEC. MORPHCODE cw[n] = sign; - st = pSMgr->suggest_morph(cw + n); - if (st) { - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(cw + n)); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } // END OF LANG_hu section switch(captype) { - case NOCAP: { - st = pSMgr->suggest_morph(cw); - if (st) { - strcat(result, st); - free(st); - } - if (abbv) { - memcpy(wspace,cw,wl); - *(wspace+wl) = '.'; - *(wspace+wl+1) = '\0'; - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - } - break; - } + case NOCAP: { + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + cat_result(result, pSMgr->suggest_morph(wspace)); + } + break; + } case INITCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - strcat(result, st); - free(st); - } - st = pSMgr->suggest_morph(cw); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - if (abbv) { - memcpy(wspace,cw,wl); + cat_result(result, pSMgr->suggest_morph(wspace)); + cat_result(result, pSMgr->suggest_morph(cw)); + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(wspace)); mkinitcap(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(wspace)); } break; } case HUHCAP: { - st = pSMgr->suggest_morph(cw); - if (st) { - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(cw)); #if 0 memcpy(wspace,cw,(wl+1)); mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(wspace)); #endif break; } case ALLCAP: { memcpy(wspace,cw,(wl+1)); - st = pSMgr->suggest_morph(wspace); - if (st) { - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(wspace)); mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - if (abbv) { + cat_result(result, pSMgr->suggest_morph(wspace)); + mkinitcap(wspace); + cat_result(result, pSMgr->suggest_morph(wspace)); + if (abbv) { memcpy(wspace,cw,(wl+1)); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; - if (*result) strcat(result, "\n"); - st = pSMgr->suggest_morph(wspace); - if (st) { - strcat(result, st); - free(st); - } + cat_result(result, pSMgr->suggest_morph(wspace)); mkallsmall(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - mkinitcap(wspace); - st = pSMgr->suggest_morph(wspace); - if (st) { - if (*result) strcat(result, "\n"); - strcat(result, st); - free(st); - } - } + cat_result(result, pSMgr->suggest_morph(wspace)); + mkinitcap(wspace); + cat_result(result, pSMgr->suggest_morph(wspace)); + } break; } } @@ -1464,7 +1385,8 @@ char * Hunspell::morph(const char * word) if (complexprefixes) { if (utf8) reverseword_utf(result); else reverseword(result); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); + } // compound word with dash (HU) I18n @@ -1476,7 +1398,7 @@ char * Hunspell::morph(const char * word) *dash='\0'; // examine 2 sides of the dash if (dash[1] == '\0') { // base word ending with dash - if (spell(cw)) return pSMgr->suggest_morph(cw); + if (spell(cw)) return line_tok(pSMgr->suggest_morph(cw), slst, MSEP_REC); } else if ((dash[1] == 'e') && (dash[2] == '\0')) { // XXX (HU) -e hat. if (spell(cw) && (spell("-e"))) { st = pSMgr->suggest_morph(cw); @@ -1490,7 +1412,7 @@ char * Hunspell::morph(const char * word) strcat(result, st); free(st); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } else { // first word ending with dash: word- XXX ??? @@ -1502,18 +1424,18 @@ char * Hunspell::morph(const char * word) dash[0]='\0'; if (nresult && spell(dash+1) && ((strlen(dash+1) > 1) || ((dash[1] > '0') && (dash[1] < '9')))) { - st = morph(cw); + st = pSMgr->suggest_morph(cw); if (st) { strcat(result, st); free(st); strcat(result,"+"); // XXX spec. separator in MORPHCODE } - st = morph(dash+1); + st = pSMgr->suggest_morph(dash+1); if (st) { strcat(result, st); free(st); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } // affixed number in correct word @@ -1539,30 +1461,89 @@ char * Hunspell::morph(const char * word) strcat(result, st); free(st); } - return mystrdup(result); + return line_tok(result, slst, MSEP_REC); } } } } - return NULL; + return 0; } +int Hunspell::generate(char*** slst, const char * word, char ** pl, int pln) +{ + if (!pSMgr || !pln) return 0; + char **pl2; + char pl2n = analyze(&pl2, word); + int captype = 0; + int abbv = 0; + char cw[MAXWORDUTF8LEN]; + cleanword(cw, word, &captype, &abbv); + char result[MAXLNLEN]; + *result = '\0'; + + for (int i = 0; i < pln; i++) { + cat_result(result, pSMgr->suggest_gen(pl2, pl2n, pl[i])); + } + freelist(&pl2, pl2n); + + if (*result) { + // allcap + if (captype == ALLCAP) mkallcap(result); + + // line split + int linenum = line_tok(result, slst, MSEP_REC); + + // capitalize + if (captype == INITCAP || captype == HUHINITCAP) { + for (int j=0; j < linenum; j++) mkinitcap((*slst)[j]); + } + + // temporary filtering of prefix related errors (eg. + // generate("undrinkable", "eats") --> "undrinkables" and "*undrinks") + + int r = 0; + for (int j=0; j < linenum; j++) { + if (!spell((*slst)[j])) { + free((*slst)[j]); + (*slst)[j] = NULL; + } else { + if (r < j) (*slst)[r] = (*slst)[j]; + r++; + } + } + if (r > 0) return r; + free(*slst); + *slst = NULL; + } + return 0; +} + +int Hunspell::generate(char*** slst, const char * word, const char * pattern) +{ + char **pl; + char pln = analyze(&pl, pattern); + int n = generate(slst, word, pl, pln); + freelist(&pl, pln); + return uniqlist(*slst, n); +} + +#ifdef HUNSPELL_EXPERIMENTAL // XXX need UTF-8 support char * Hunspell::morph_with_correction(const char * word) { char cw[MAXWORDUTF8LEN]; char wspace[MAXWORDUTF8LEN]; - if (! pSMgr) return 0; + if (! pSMgr) return NULL; int wl = strlen(word); if (utf8) { - if (wl >= MAXWORDUTF8LEN) return 0; + if (wl >= MAXWORDUTF8LEN) return NULL; } else { - if (wl >= MAXWORDLEN) return 0; + if (wl >= MAXWORDLEN) return NULL; } int captype = 0; int abbv = 0; wl = cleanword(cw, word, &captype, &abbv); - if (wl == 0) return 0; + if (wl == 0) return NULL; char result[MAXLNLEN]; char * st = NULL; @@ -1577,8 +1558,8 @@ char * Hunspell::morph_with_correction(const char * word) strcat(result, st); free(st); } - if (abbv) { - memcpy(wspace,cw,wl); + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; st = pSMgr->suggest_morph_for_spelling_error(wspace); @@ -1598,14 +1579,14 @@ char * Hunspell::morph_with_correction(const char * word) strcat(result, st); free(st); } - st = pSMgr->suggest_morph_for_spelling_error(cw); + st = pSMgr->suggest_morph_for_spelling_error(cw); if (st) { if (*result) strcat(result, "\n"); strcat(result, st); free(st); } - if (abbv) { - memcpy(wspace,cw,wl); + if (abbv) { + memcpy(wspace,cw,wl); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; mkallsmall(wspace); @@ -1614,14 +1595,14 @@ char * Hunspell::morph_with_correction(const char * word) if (*result) strcat(result, "\n"); strcat(result, st); free(st); - } + } mkinitcap(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { if (*result) strcat(result, "\n"); strcat(result, st); free(st); - } + } } break; } @@ -1655,22 +1636,22 @@ char * Hunspell::morph_with_correction(const char * word) strcat(result, st); free(st); } - mkinitcap(wspace); - st = pSMgr->suggest_morph_for_spelling_error(wspace); + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { if (*result) strcat(result, "\n"); strcat(result, st); free(st); } - if (abbv) { + if (abbv) { memcpy(wspace,cw,(wl+1)); *(wspace+wl) = '.'; *(wspace+wl+1) = '\0'; if (*result) strcat(result, "\n"); st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { - strcat(result, st); - free(st); + strcat(result, st); + free(st); } mkallsmall(wspace); st = pSMgr->suggest_morph_for_spelling_error(wspace); @@ -1679,14 +1660,14 @@ char * Hunspell::morph_with_correction(const char * word) strcat(result, st); free(st); } - mkinitcap(wspace); - st = pSMgr->suggest_morph_for_spelling_error(wspace); + mkinitcap(wspace); + st = pSMgr->suggest_morph_for_spelling_error(wspace); if (st) { if (*result) strcat(result, "\n"); strcat(result, st); free(st); } - } + } break; } } @@ -1695,37 +1676,6 @@ char * Hunspell::morph_with_correction(const char * word) return NULL; } -/* analyze word - * return line count - * XXX need a better data structure for morphological analysis */ -int Hunspell::analyze(char ***out, const char *word) { - int n = 0; - if (!word) return 0; - char * m = morph(word); - if(!m) return 0; - if (!out) - { - n = line_tok(m, out); - free(m); - return n; - } - - // without memory allocation - /* BUG missing buffer size checking */ - int i, p; - for(p = 0, i = 0; m[i]; i++) { - if(m[i] == '\n' || !m[i+1]) { - n++; - strncpy((*out)[n++], m + p, i - p + 1); - if (m[i] == '\n') (*out)[n++][i - p] = '\0'; - if(!m[i+1]) break; - p = i + 1; - } - } - free(m); - return n; -} - #endif // END OF HUNSPELL_EXPERIMENTAL CODE Hunhandle *Hunspell_create(const char * affpath, const char * dpath) @@ -1753,3 +1703,54 @@ int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word) return ((Hunspell*)pHunspell)->suggest(slst, word); } +int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word) +{ + return ((Hunspell*)pHunspell)->analyze(slst, word); +} + +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word) +{ + return ((Hunspell*)pHunspell)->stem(slst, word); +} + +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, char** desc, int n) +{ + return ((Hunspell*)pHunspell)->stem(slst, desc, n); +} + +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, + const char * word2) +{ + return ((Hunspell*)pHunspell)->generate(slst, word, word2); +} + +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, + char** desc, int n) +{ + return ((Hunspell*)pHunspell)->generate(slst, word, desc, n); +} + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ + +int Hunspell_add(Hunhandle *pHunspell, const char * word) { + return ((Hunspell*)pHunspell)->add(word); +} + + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. + */ + +int Hunspell_add_with_affix(Hunhandle *pHunspell, const char * word, + const char * example) { + return ((Hunspell*)pHunspell)->add_with_affix(word, example); +} + + /* remove word from the run-time dictionary */ + /* NOTE: not implemented yet */ + +int Hunspell_remove(Hunhandle *pHunspell, const char * word) { + return ((Hunspell*)pHunspell)->remove(word); +} diff --git a/src/myspell/hunspell.h b/src/myspell/hunspell.h index b04b83a..452599c 100644 --- a/src/myspell/hunspell.h +++ b/src/myspell/hunspell.h @@ -26,6 +26,60 @@ char *Hunspell_get_dic_encoding(Hunhandle *pHunspell); */ int Hunspell_suggest(Hunhandle *pHunspell, char*** slst, const char * word); + /* morphological functions */ + + /* analyze(result, word) - morphological analysis of the word */ + +int Hunspell_analyze(Hunhandle *pHunspell, char*** slst, const char * word); + + /* stem(result, word) - stemmer function */ + +int Hunspell_stem(Hunhandle *pHunspell, char*** slst, const char * word); + + /* stem(result, analysis, n) - get stems from a morph. analysis + * example: + * char ** result, result2; + * int n1 = Hunspell_analyze(result, "words"); + * int n2 = Hunspell_stem2(result2, result, n1); + */ + +int Hunspell_stem2(Hunhandle *pHunspell, char*** slst, char** desc, int n); + + /* generate(result, word, word2) - morphological generation by example(s) */ + +int Hunspell_generate(Hunhandle *pHunspell, char*** slst, const char * word, + const char * word2); + + /* generate(result, word, desc, n) - generation by morph. description(s) + * example: + * char ** result; + * char * affix = "is:plural"; // description depends from dictionaries, too + * int n = Hunspell_generate2(result, "word", &affix, 1); + * for (int i = 0; i < n; i++) printf("%s\n", result[i]); + */ + +int Hunspell_generate2(Hunhandle *pHunspell, char*** slst, const char * word, + char** desc, int n); + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ + +int Hunspell_add(const char * word); + + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. + */ + +int Hunspell_add_with_affix(const char * word, const char * example); + + /* remove word from the run-time dictionary */ + /* NOTE: not implemented yet */ + +int Hunspell_remove(const char * word); + + #ifdef __cplusplus } #endif diff --git a/src/myspell/hunspell.hxx b/src/myspell/hunspell.hxx index 6d3f092..38c141e 100644 --- a/src/myspell/hunspell.hxx +++ b/src/myspell/hunspell.hxx @@ -73,17 +73,59 @@ public: */ int suggest(char*** slst, const char * word); + char * get_dic_encoding(); - /* handling custom dictionary */ + /* morphological functions */ + + /* analyze(result, word) - morphological analysis of the word */ + + int analyze(char*** slst, const char * word); + + /* stem(result, word) - stemmer function */ + + int stem(char*** slst, const char * word); + + /* stem(result, analysis, n) - get stems from a morph. analysis + * example: + * char ** result, result2; + * int n1 = analyze(result, "words"); + * int n2 = stem(result2, result, n1); + */ + + int stem(char*** slst, char ** morph, int n); + + /* generate(result, word, word2) - morphological generation by example(s) */ + + int generate(char*** slst, const char * word, const char * word2); + + /* generate(result, word, desc, n) - generation by morph. description(s) + * example: + * char ** result; + * char * affix = "is:plural"; // description depends from dictionaries, too + * int n = generate(result, "word", &affix, 1); + * for (int i = 0; i < n; i++) printf("%s\n", result[i]); + */ + + int generate(char*** slst, const char * word, char ** desc, int n); + + /* functions for run-time modification of the dictionary */ + + /* add word to the run-time dictionary */ - int put_word(const char * word); + int add(const char * word); - /* pattern is a sample dictionary word - * put word into custom dictionary with affix flags of pattern word + /* add word to the run-time dictionary with affix flags of + * the example (a dictionary word): Hunspell will recognize + * affixed forms of the new word, too. */ - int put_word_pattern(const char * word, const char * pattern); + int add_with_affix(const char * word, const char * example); + + /* remove word from the run-time dictionary */ + /* NOTE: not implemented yet */ + + int remove(const char * word); /* other */ @@ -93,25 +135,14 @@ public: struct cs_info * get_csconv(); const char * get_version(); - - /* experimental functions */ + + /* experimental and deprecated functions */ #ifdef HUNSPELL_EXPERIMENTAL - /* suffix is an affix flag string, similarly in dictionary files */ - + /* suffix is an affix flag string, similarly in dictionary files */ int put_word_suffix(const char * word, const char * suffix); - - /* morphological analysis */ - - char * morph(const char * word); - int analyze(char*** out, const char *word); - char * morph_with_correction(const char * word); - /* stemmer function */ - - int stem(char*** slst, const char * word); - /* spec. suggestions */ int suggest_auto(char*** slst, const char * word); int suggest_pos_stems(char*** slst, const char * word); @@ -133,6 +164,8 @@ private: hentry * spellsharps(char * base, char *, int, int, char * tmp, int * info, char **root); int is_keepcase(const hentry * rv); int insert_sug(char ***slst, char * word, int ns); + char * cat_result(char * result, char * st); + char * stem_description(const char * desc); }; diff --git a/src/myspell/license.hunspell b/src/myspell/license.hunspell index 92c531c..81ffad8 100644 --- a/src/myspell/license.hunspell +++ b/src/myspell/license.hunspell @@ -14,7 +14,7 @@ * The Original Code is Hunspell, based on MySpell. * * The Initial Developers of the Original Code are - * Kevin Hendricks (MySpell) and Németh László (Hunspell). + * Kevin Hendricks (MySpell) and Laszlo Nemeth (Hunspell). * Portions created by the Initial Developers are Copyright (C) 2002-2005 * the Initial Developers. All Rights Reserved. * @@ -24,22 +24,22 @@ * Giuseppe Modugno * Gianluca Turconi * Simon Brouwer - * Noll János - * Bíró Árpád - * Goldman Eleonóra - * Sarlós Tamás - * Bencsáth Boldizsár - * Halácsy Péter - * Dvornik László - * Gefferth András + * Noll Janos + * Biro Arpad + * Goldman Eleonora + * Sarlos Tamas + * Bencsath Boldizsar + * Halacsy Peter + * Dvornik Laszlo + * Gefferth Andras * Nagy Viktor - * Varga Dániel + * Varga Daniel * Chris Halls * Rene Engelhard * Bram Moolenaar * Dafydd Jones - * Harri Pitkänen - * András Tímár + * Harri Pitkanen + * Andras Timar * Tor Lillqvist * * Alternatively, the contents of this file may be used under the terms of @@ -58,4 +58,4 @@ #ifdef HAVE_CONFIG_H #include "config.h" -#endif
\ No newline at end of file +#endif diff --git a/src/myspell/makefile.mk b/src/myspell/makefile.mk new file mode 100644 index 0000000..8eeaebe --- /dev/null +++ b/src/myspell/makefile.mk @@ -0,0 +1,113 @@ +#************************************************************************* +# +# $RCSfile: makefile.mk,v $ +# +# $Revision: 1.7 $ +# +# last change: $Author: vg $ $Date: 2003/06/12 10:38:24 $ +# +# The Contents of this file are made available subject to the terms of +# either of the following licenses +# +# - GNU Lesser General Public License Version 2.1 +# - Sun Industry Standards Source License Version 1.1 +# +# Sun Microsystems Inc., October, 2000 +# +# GNU Lesser General Public License Version 2.1 +# ============================================= +# Copyright 2000 by Sun Microsystems, Inc. +# 901 San Antonio Road, Palo Alto, CA 94303, USA +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1, as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307 USA +# +# +# Sun Industry Standards Source License Version 1.1 +# ================================================= +# The contents of this file are subject to the Sun Industry Standards +# Source License Version 1.1 (the "License"); You may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at http://www.openoffice.org/license.html. +# +# Software provided under this License is provided on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, +# WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, +# MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. +# See the License for the specific provisions governing your rights and +# obligations concerning the Software. +# +# The Initial Developer of the Original Code is: Sun Microsystems, Inc. +# +# Copyright: 2000 by Sun Microsystems, Inc. +# +# All Rights Reserved. +# +# Contributor(s): _______________________________________ +# +# +# +#************************************************************************* + +PRJ = .. + +PRJNAME = hunspell +TARGET = hunspell +LIBTARGET=NO + +#----- Settings --------------------------------------------------------- + +.INCLUDE : settings.mk + +# --- Files -------------------------------------------------------- + +# all_target: ALLTAR DICTIONARY +all_target: ALLTAR + +##CXXFLAGS += -I..$/..$/lingutil +##CFLAGSCXX += -I..$/..$/lingutil +##CFLAGSCC += -I..$/..$/lingutil + +CDEFS+=-DOPENOFFICEORG + +SLOFILES= \ + $(SLO)$/affentry.obj \ + $(SLO)$/affixmgr.obj \ + $(SLO)$/dictmgr.obj \ + $(SLO)$/csutil.obj \ + $(SLO)$/utf_info.obj \ + $(SLO)$/hashmgr.obj \ + $(SLO)$/suggestmgr.obj \ + $(SLO)$/hunspell.obj + +LIB1TARGET= $(SLB)$/lib$(TARGET).lib +LIB1ARCHIV= $(LB)/lib$(TARGET).a +LIB1OBJFILES= $(SLOFILES) + +# DIC2BIN= \ +# en_US.aff \ +# en_US.dic +# +# de_DE.aff \ +# de_DE.dic + + +# DICTIONARY : +# +$(COPY) $(foreach,i,$(DIC2BIN) $i) $(BIN) + + +# --- Targets ------------------------------------------------------ + +.INCLUDE : target.mk + diff --git a/src/myspell/suggestmgr.cxx b/src/myspell/suggestmgr.cxx index 964004c..b1a58f3 100644 --- a/src/myspell/suggestmgr.cxx +++ b/src/myspell/suggestmgr.cxx @@ -14,6 +14,7 @@ #endif #include "suggestmgr.hxx" +#include "htypes.hxx" #ifndef MOZILLA_CLIENT #ifndef W32 @@ -278,7 +279,7 @@ int SuggestMgr::suggest_auto(char*** slst, const char * w, int nsug) if ((nsug < maxSug) && (nsug > -1)) nsug = mapchars(wlst, word, nsug, cpdsuggest); - if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; else * + if ((cpdsuggest==0) && (nsug>0)) nocompoundtwowords=1; // perhaps we forgot to hit space and two words ran together @@ -1055,6 +1056,7 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr* pHMgr) scphon = scoresphon[MAX_ROOTS-1]; char w2[MAXWORDUTF8LEN]; + char f[MAXSWUTF8L]; char * word = w; // word reversing wrapper for complex prefixes @@ -1085,7 +1087,6 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr* pHMgr) strcpy(candidate, word); mkallcap(candidate, csconv); phonet(candidate, target, n, *ph); -// fprintf(stderr, "Tip: %s->%s\n", candidate, target); } while ((hp = pHMgr->walk_hashtable(col, hp))) { @@ -1095,19 +1096,19 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr* pHMgr) TESTAFF(hp->astr, pAMgr->get_nosuggest(), hp->alen) || TESTAFF(hp->astr, pAMgr->get_onlyincompound(), hp->alen))) continue; - sc = ngram(3, word, &(hp->word), NGRAM_LONGER_WORSE + NGRAM_LOWERING) + - leftcommonsubstring(word, &(hp->word)); + sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + NGRAM_LOWERING) + + leftcommonsubstring(word, HENTRY_WORD(hp)); // check special pronounciation - if (hp->var) { - int sc2 = ngram(3, word, &(hp->word) + hp->blen + 1, NGRAM_LONGER_WORSE + NGRAM_LOWERING) + - leftcommonsubstring(word, &(hp->word) + hp->blen + 1); + if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { + int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + NGRAM_LOWERING) + + leftcommonsubstring(word, f); if (sc2 > sc) sc = sc2; } if (ph && (sc > 2) && (abs(n - (int) hp->clen) <= 3)) { char target2[MAXSWUTF8L]; - strcpy(candidate, &(hp->word)); + strcpy(candidate, HENTRY_WORD(hp)); mkallcap(candidate, csconv); phonet(candidate, target2, -1, *ph); scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); @@ -1126,7 +1127,7 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr* pHMgr) if (scphon > scoresphon[lpphon]) { scoresphon[lpphon] = scphon; - rootsphon[lpphon] = &(hp->word); + rootsphon[lpphon] = HENTRY_WORD(hp); lval = scphon; for (j=0; j < MAX_ROOTS; j++) if (scoresphon[j] < lval) { @@ -1178,9 +1179,9 @@ int SuggestMgr::ngsuggest(char** wlst, char * w, int ns, HashMgr* pHMgr) for (i = 0; i < MAX_ROOTS; i++) { if (roots[i]) { struct hentry * rp = roots[i]; - int nw = pAMgr->expand_rootword(glst, MAX_WORDS, &(rp->word), rp->blen, + int nw = pAMgr->expand_rootword(glst, MAX_WORDS, HENTRY_WORD(rp), rp->blen, rp->astr, rp->alen, word, nc, - ((rp->var) ? &(rp->word) + rp->blen + 1 : NULL)); + ((rp->var & H_OPT_PHON) ? copy_field(f, HENTRY_DATA(rp), MORPH_PHON) : NULL)); for (int k = 0; k < nw ; k++) { sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + NGRAM_LOWERING) + @@ -1383,7 +1384,7 @@ int SuggestMgr::checkword(const char * word, int len, int cpdsuggest, int * time if ((rv->astr) && (TESTAFF(rv->astr,pAMgr->get_forbiddenword(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_nosuggest(),rv->alen))) return 0; while (rv) { - if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || + if (rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { rv = rv->next_homonym; @@ -1423,7 +1424,7 @@ int SuggestMgr::check_forbidden(const char * word, int len) if (pAMgr) { rv = pAMgr->lookup(word); - if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_pseudoroot(),rv->alen) || + if (rv && rv->astr && (TESTAFF(rv->astr,pAMgr->get_needaffix(),rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) rv = NULL; if (!(pAMgr->prefix_check(word,len,1))) rv = pAMgr->suffix_check(word,len, 0, NULL, NULL, 0, NULL); // prefix+suffix, suffix @@ -1434,184 +1435,6 @@ int SuggestMgr::check_forbidden(const char * word, int len) } #ifdef HUNSPELL_EXPERIMENTAL -// suggest stems, XXX experimental code -int SuggestMgr::suggest_stems(char*** slst, const char * w, int nsug) -{ - char buf[MAXSWUTF8L]; - char ** wlst; - int prevnsug = nsug; - - char w2[MAXWORDUTF8LEN]; - const char * word = w; - - // word reversing wrapper for complex prefixes - if (complexprefixes) { - strcpy(w2, w); - if (utf8) reverseword_utf(w2); else reverseword(w2); - word = w2; - } - - if (*slst) { - wlst = *slst; - } else { - wlst = (char **) calloc(maxSug, sizeof(char *)); - if (wlst == NULL) return -1; - } - // perhaps there are a fix stem in the dictionary - if ((nsug < maxSug) && (nsug > -1)) { - - nsug = fixstems(wlst, word, nsug); - if (nsug == prevnsug) { - char * s = mystrdup(word); - char * p = s + strlen(s); - while ((*p != '-') && (p != s)) p--; - if (*p == '-') { - *p = '\0'; - nsug = fixstems(wlst, s, nsug); - if ((nsug == prevnsug) && (nsug < maxSug) && (nsug >= 0)) { - char * t; - buf[0] = '\0'; - for (t = s; (t[0] != '\0') && ((t[0] >= '0') || (t[0] <= '9')); t++); // is a number? - if (*t != '\0') strcpy(buf, "# "); - strcat(buf, s); - wlst[nsug] = mystrdup(buf); - if (wlst[nsug] == NULL) return -1; - nsug++; - } - p++; - nsug = fixstems(wlst, p, nsug); - } - - free(s); - } - } - - if (nsug < 0) { - for (int i=0;i<maxSug; i++) - if (wlst[i] != NULL) free(wlst[i]); - free(wlst); - return -1; - } - - *slst = wlst; - return nsug; -} - - -// there are fix stems in dictionary -int SuggestMgr::fixstems(char ** wlst, const char * word, int ns) -{ - char buf[MAXSWUTF8L]; - char prefix[MAXSWUTF8L] = ""; - - int dicstem = 1; // 0 = lookup, 1= affix, 2 = compound - int cpdindex = 0; - struct hentry * rv = NULL; - - int wl = strlen(word); - int cmpdstemnum; - int cmpdstem[MAXCOMPOUND]; - - if (pAMgr) { - rv = pAMgr->lookup(word); - if (rv) { - dicstem = 0; - } else { - // try stripping off affixes - rv = pAMgr->affix_check(word, wl); - - // else try check compound word - if (!rv && pAMgr->get_compound()) { - rv = pAMgr->compound_check(word, wl, - 0, 0, 100, 0, NULL, 0, &cmpdstemnum, cmpdstem,1); - - if (rv) { - dicstem = 2; - for (int j = 0; j < cmpdstemnum; j++) { - cpdindex += cmpdstem[j]; - } - if(! (pAMgr->lookup(word + cpdindex))) - pAMgr->affix_check(word + cpdindex, wl - cpdindex); // for prefix - } - } - - - if (pAMgr->get_prefix()) { - strcpy(prefix, pAMgr->get_prefix()); - } - - // XXX obsolete, will be a general solution for stemming - if ((prefix) && (strncmp(prefix, "leg", 3)==0)) prefix[0] = '\0'; // (HU) - } - - } - - - - if ((rv) && (ns < maxSug)) { - - // check fixstem flag and not_valid_stem flag - // first word - if ((ns < maxSug) && (dicstem < 2)) { - strcpy(buf, prefix); - if ((dicstem > 0) && pAMgr->get_derived()) { - // XXX obsolote - if (strlen(prefix) == 1) { - strcat(buf, (pAMgr->get_derived()) + 1); - } else { - strcat(buf, pAMgr->get_derived()); - } - } else { - // special stem in affix description - const char * wordchars = pAMgr->get_wordchars(); - if (rv->description && - (strchr(wordchars, *(rv->description)))) { - char * desc = (rv->description) + 1; - while (strchr(wordchars, *desc)) desc++; - strncat(buf, rv->description, desc - (rv->description)); - } else { - strcat(buf, rv->word); - } - } - wlst[ns] = mystrdup(buf); - if (wlst[ns] == NULL) return -1; - ns++; - } - - if (dicstem == 2) { - - // compound stem - -// if (rv->astr && (strchr(rv->astr, '0') == NULL)) { - if (rv->astr) { - strcpy(buf, word); - buf[cpdindex] = '\0'; - if (prefix) strcat(buf, prefix); - if (pAMgr->get_derived()) { - strcat(buf, pAMgr->get_derived()); - } else { - // special stem in affix description - const char * wordchars = pAMgr->get_wordchars(); - if (rv->description && - (strchr(wordchars, *(rv->description)))) { - char * desc = (rv->description) + 1; - while (strchr(wordchars, *desc)) desc++; - strncat(buf, rv->description, desc - (rv->description)); - } else { - strcat(buf, rv->word); - } - } - if (ns < maxSug) { - wlst[ns] = mystrdup(buf); - if (wlst[ns] == NULL) return -1; - ns++; - } - } - } - } - return ns; -} - // suggest possible stems int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) { @@ -1651,6 +1474,7 @@ int SuggestMgr::suggest_pos_stems(char*** slst, const char * w, int nsug) *slst = wlst; return nsug; } +#endif // END OF HUNSPELL_EXPERIMENTAL CODE char * SuggestMgr::suggest_morph(const char * w) @@ -1679,13 +1503,18 @@ char * SuggestMgr::suggest_morph(const char * w) while (rv) { if ((!rv->astr) || !(TESTAFF(rv->astr, pAMgr->get_forbiddenword(), rv->alen) || - TESTAFF(rv->astr, pAMgr->get_pseudoroot(), rv->alen) || + TESTAFF(rv->astr, pAMgr->get_needaffix(), rv->alen) || TESTAFF(rv->astr,pAMgr->get_onlyincompound(),rv->alen))) { - if (rv->description && ((!rv->astr) || - !TESTAFF(rv->astr, pAMgr->get_lemma_present(), rv->alen))) + if (!HENTRY_FIND(rv, MORPH_STEM)) { + strcat(result, " "); + strcat(result, MORPH_STEM); strcat(result, word); - if (rv->description) strcat(result, rv->description); - strcat(result, "\n"); + } + if (HENTRY_DATA(rv)) { + strcat(result, " "); + strcat(result, HENTRY_DATA(rv)); + } + strcat(result, "\n"); } rv = rv->next_homonym; } @@ -1700,9 +1529,10 @@ char * SuggestMgr::suggest_morph(const char * w) pAMgr->compound_check_morph(word, strlen(word), 0, 0, 100, 0,NULL, 0, &r, NULL); - return (*result) ? mystrdup(line_uniq(delete_zeros(result))) : NULL; + return (*result) ? mystrdup(line_uniq(result, MSEP_REC)) : NULL; } +#ifdef HUNSPELL_EXPERIMENTAL char * SuggestMgr::suggest_morph_for_spelling_error(const char * word) { char * p = NULL; @@ -1710,7 +1540,7 @@ char * SuggestMgr::suggest_morph_for_spelling_error(const char * word) if (!**wlst) return NULL; // we will use only the first suggestion for (int i = 0; i < maxSug - 1; i++) wlst[i] = ""; - int ns = suggest(&wlst, word, maxSug - 1); + int ns = suggest(&wlst, word, maxSug - 1, NULL); if (ns == maxSug) { p = suggest_morph(wlst[maxSug - 1]); free(wlst[maxSug - 1]); @@ -1720,6 +1550,153 @@ char * SuggestMgr::suggest_morph_for_spelling_error(const char * word) } #endif // END OF HUNSPELL_EXPERIMENTAL CODE +/* affixation */ +char * SuggestMgr::suggest_hentry_gen(hentry * rv, char * pattern) +{ + char result[MAXLNLEN]; + *result = '\0'; + int sfxcount = get_sfxcount(pattern); + +// if (get_sfxcount(HENTRY_DATA(rv)) > sfxcount) return NULL; + + if (HENTRY_DATA(rv)) { + char * aff = pAMgr->morphgen(HENTRY_WORD(rv), rv->blen, rv->astr, rv->alen, + HENTRY_DATA(rv), pattern, 0); + if (aff) { + strcat(result, aff); + strcat(result, "\n"); + free(aff); + } + } + + // check all allomorphs + char allomorph[MAXLNLEN]; + char * p = NULL; + if (HENTRY_DATA(rv)) p = strstr(HENTRY_DATA(rv), MORPH_ALLOMORPH); + while (p) { + struct hentry * rv2 = NULL; + p += MORPH_TAG_LEN; + int plen = fieldlen(p); + strncpy(allomorph, p, plen); + allomorph[plen] = '\0'; + rv2 = pAMgr->lookup(allomorph); + while (rv2) { +// if (HENTRY_DATA(rv2) && get_sfxcount(HENTRY_DATA(rv2)) <= sfxcount) { + if (HENTRY_DATA(rv2)) { + char * st = strstr(HENTRY_DATA(rv2), MORPH_STEM); + if (st && (strncmp(st + MORPH_TAG_LEN, + HENTRY_WORD(rv), fieldlen(st + MORPH_TAG_LEN)) == 0)) { + char * aff = pAMgr->morphgen(HENTRY_WORD(rv2), rv2->blen, rv2->astr, rv2->alen, + HENTRY_DATA(rv2), pattern, 0); + if (aff) { + strcat(result, aff); + strcat(result, "\n"); + free(aff); + } + } + } + rv2 = rv2->next_homonym; + } + p = strstr(p + plen, MORPH_ALLOMORPH); + } + + return (*result) ? mystrdup(result) : NULL; +} + +char * SuggestMgr::suggest_gen(char ** desc, int n, char * pattern) { + char result[MAXLNLEN]; + char result2[MAXLNLEN]; + char newpattern[MAXLNLEN]; + *newpattern = '\0'; + if (n == 0) return 0; + *result2 = '\0'; + struct hentry * rv = NULL; + if (!pAMgr) return NULL; + +// search affixed forms with and without derivational suffixes + while(1) { + + for (int k = 0; k < n; k++) { + *result = '\0'; + // add compound word parts (except the last one) + char * s = (char *) desc[k]; + char * part = strstr(s, MORPH_PART); + if (part) { + char * nextpart = strstr(part + 1, MORPH_PART); + while (nextpart) { + copy_field(result + strlen(result), part, MORPH_PART); + part = nextpart; + nextpart = strstr(part + 1, MORPH_PART); + } + s = part; + } + + char **pl; + char tok[MAXLNLEN]; + strcpy(tok, s); + char * alt = strstr(tok, " | "); + while (alt) { + alt[1] = MSEP_ALT; + alt = strstr(alt, " | "); + } + int pln = line_tok(tok, &pl, MSEP_ALT); + for (int i = 0; i < pln; i++) { + // remove inflectional and terminal suffixes + char * is = strstr(pl[i], MORPH_INFL_SFX); + if (is) *is = '\0'; + char * ts = strstr(pl[i], MORPH_TERM_SFX); + while (ts) { + *ts = '_'; + ts = strstr(pl[i], MORPH_TERM_SFX); + } + char * st = strstr(s, MORPH_STEM); + if (st) { + copy_field(tok, st, MORPH_STEM); + rv = pAMgr->lookup(tok); + while (rv) { + char newpat[MAXLNLEN]; + strcpy(newpat, pl[i]); + strcat(newpat, pattern); + char * sg = suggest_hentry_gen(rv, newpat); + if (!sg) sg = suggest_hentry_gen(rv, pattern); + if (sg) { + char ** gen; + int genl = line_tok(sg, &gen, MSEP_REC); + free(sg); + sg = NULL; + for (int j = 0; j < genl; j++) { + if (strstr(pl[i], MORPH_SURF_PFX)) { + int r2l = strlen(result2); + result2[r2l] = MSEP_REC; + strcpy(result2 + r2l + 1, result); + copy_field(result2 + strlen(result2), pl[i], MORPH_SURF_PFX); + strcat(result2, gen[j]); + } else { + sprintf(result2 + strlen(result2), "%c%s%s", + MSEP_REC, result, gen[j]); + } + } + freelist(&gen, genl); + } + rv = rv->next_homonym; + } + } + } + freelist(&pl, pln); + } + + if (*result2 || !strstr(pattern, MORPH_DERI_SFX)) break; + strcpy(newpattern, pattern); + pattern = newpattern; + char * ds = strstr(pattern, MORPH_DERI_SFX); + while (ds) { + strncpy(ds, MORPH_TERM_SFX, MORPH_TAG_LEN); + ds = strstr(pattern, MORPH_DERI_SFX); + } + } + return (*result2 ? mystrdup(result2) : NULL); +} + // generate an n-gram score comparing s1 and s2 int SuggestMgr::ngram(int n, char * s1, const char * s2, int opt) diff --git a/src/myspell/suggestmgr.hxx b/src/myspell/suggestmgr.hxx index 28fa1ac..d22884f 100644 --- a/src/myspell/suggestmgr.hxx +++ b/src/myspell/suggestmgr.hxx @@ -57,6 +57,7 @@ public: int suggest_pos_stems(char*** slst, const char * word, int nsug); char * suggest_morph(const char * word); + char * suggest_gen(char ** pl, int pln, char * pattern); char * suggest_morph_for_spelling_error(const char * word); private: @@ -98,6 +99,7 @@ private: void bubblesort( char ** rwd, char ** rwd2, int * rsc, int n); void lcs(const char * s, const char * s2, int * l1, int * l2, char ** result); int lcslen(const char * s, const char* s2); + char * suggest_hentry_gen(hentry * rv, char * pattern); }; |