diff options
author | Dom Lachowicz <domlachowicz@gmail.com> | 2004-01-12 04:09:01 +0000 |
---|---|---|
committer | Dom Lachowicz <domlachowicz@gmail.com> | 2004-01-12 04:09:01 +0000 |
commit | 19ca5d04f57a178d77287b0a303adb293491ef20 (patch) | |
tree | 93ee3d237f3a83ad2c9060c602e099491292a70d | |
parent | 8b23ed4ec5e4769f66296b4e87ccbc6f4bfb9142 (diff) | |
download | enchant-19ca5d04f57a178d77287b0a303adb293491ef20.tar.gz |
upgrade to myspell 3.1
git-svn-id: svn+ssh://svn.abisource.com/svnroot/enchant/trunk@20903 bcba8976-2d24-0410-9c9c-aab3bd5fdfd6
-rw-r--r-- | src/myspell/affentry.cxx | 2 | ||||
-rw-r--r-- | src/myspell/affixmgr.cxx | 259 | ||||
-rw-r--r-- | src/myspell/affixmgr.hxx | 16 | ||||
-rw-r--r-- | src/myspell/atypes.hxx | 5 | ||||
-rw-r--r-- | src/myspell/csutil.cxx | 27 | ||||
-rw-r--r-- | src/myspell/csutil.hxx | 3 | ||||
-rw-r--r-- | src/myspell/hashmgr.cxx | 4 | ||||
-rw-r--r-- | src/myspell/myspell.cxx | 39 | ||||
-rw-r--r-- | src/myspell/myspell.hxx | 13 | ||||
-rw-r--r-- | src/myspell/suggestmgr.cxx | 133 | ||||
-rw-r--r-- | src/myspell/suggestmgr.hxx | 14 |
11 files changed, 443 insertions, 72 deletions
diff --git a/src/myspell/affentry.cxx b/src/myspell/affentry.cxx index cfd54d3..603616d 100644 --- a/src/myspell/affentry.cxx +++ b/src/myspell/affentry.cxx @@ -8,7 +8,9 @@ #include "affentry.hxx" +#ifndef WINDOWS using namespace std; +#endif extern char * mystrdup(const char * s); extern char * myrevstrdup(const char * s); diff --git a/src/myspell/affixmgr.cxx b/src/myspell/affixmgr.cxx index 9ae79ca..87ca583 100644 --- a/src/myspell/affixmgr.cxx +++ b/src/myspell/affixmgr.cxx @@ -7,7 +7,9 @@ #include "affixmgr.hxx" #include "affentry.hxx" +#ifndef WINDOWS using namespace std; +#endif // First some base level utility routines @@ -16,6 +18,7 @@ extern char * mystrdup(const char * s); extern char * myrevstrdup(const char * s); extern char * mystrsep(char ** sptr, const char delim); extern int isSubset(const char * s1, const char * s2); +extern int isRevSubset(const char * s1, const char * end_of_s2, int len_s2); AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) @@ -26,7 +29,11 @@ AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) encoding=NULL; reptable = NULL; numrep = 0; + maptable = NULL; + nummap = 0; compound=NULL; + nosplitsugs= (0==1); + cpdmin = 3; // default value for (int i=0; i < SETSIZE; i++) { pStart[i] = NULL; @@ -74,6 +81,16 @@ AffixMgr::~AffixMgr() trystring=NULL; if (encoding) free(encoding); encoding=NULL; + if (maptable) { + for (int j=0; j < nummap; j++) { + free(maptable[j].set); + maptable[j].set = NULL; + maptable[j].len = 0; + } + free(maptable); + maptable = NULL; + } + nummap = 0; if (reptable) { for (int j=0; j < numrep; j++) { free(reptable[j].pattern); @@ -155,6 +172,13 @@ int AffixMgr::parse_file(const char * affpath) } } + /* parse in the related character map table */ + if (strncmp(line,"MAP",3) == 0) { + if (parse_maptable(line, afflst)) { + return 1; + } + } + // parse this affix: P - prefix, S - suffix ft = ' '; if (strncmp(line,"PFX",3) == 0) ft = 'P'; @@ -165,9 +189,17 @@ int AffixMgr::parse_file(const char * affpath) } } + // handle NOSPLITSUGS + if (strncmp(line,"NOSPLITSUGS",11) == 0) + nosplitsugs=(0==0); + } fclose(afflst); + // convert affix trees to sorted list + process_pfx_tree_to_list(); + process_sfx_tree_to_list(); + // now we can speed up performance greatly taking advantage of the // relationship between the affixes and the idea of "subsets". @@ -197,11 +229,12 @@ int AffixMgr::parse_file(const char * affpath) return 0; } + // we want to be able to quickly access prefix information // both by prefix flag, and sorted by prefix string itself // so we need to set up two indexes -int AffixMgr::build_pfxlist(AffEntry* pfxptr) +int AffixMgr::build_pfxtree(AffEntry* pfxptr) { PfxEntry * ptr; PfxEntry * pptr; @@ -217,8 +250,6 @@ int AffixMgr::build_pfxlist(AffEntry* pfxptr) pFlag[flg] = (AffEntry *) ep; - // next index by affix string - // handle the special case of null affix string if (strlen(key) == 0) { // always inset them at head of list at element 0 @@ -228,25 +259,39 @@ int AffixMgr::build_pfxlist(AffEntry* pfxptr) return 0; } - // now handle the general case + // now handle the normal case + ep->setNextEQ(NULL); + ep->setNextNE(NULL); + unsigned char sp = *((const unsigned char *)key); ptr = (PfxEntry*)pStart[sp]; - /* handle the insert at top of list case */ - if ((!ptr) || ( strcmp( ep->getKey() , ptr->getKey() ) <= 0)) { - ep->setNext(ptr); + // handle the first insert + if (!ptr) { pStart[sp] = (AffEntry*)ep; return 0; } - /* otherwise find where it fits in order and insert it */ + + // otherwise use binary tree insertion so that a sorted + // list can easily be generated later pptr = NULL; - for (; ptr != NULL; ptr = ptr->getNext()) { - if (strcmp( ep->getKey() , ptr->getKey() ) <= 0) break; + for (;;) { pptr = ptr; + if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) { + ptr = ptr->getNextEQ(); + if (!ptr) { + pptr->setNextEQ(ep); + break; + } + } else { + ptr = ptr->getNextNE(); + if (!ptr) { + pptr->setNextNE(ep); + break; + } + } } - pptr->setNext(ep); - ep->setNext(ptr); return 0; } @@ -255,7 +300,7 @@ int AffixMgr::build_pfxlist(AffEntry* pfxptr) // we want to be able to quickly access suffix information // both by suffix flag, and sorted by the reverse of the // suffix string itself; so we need to set up two indexes -int AffixMgr::build_sfxlist(AffEntry* sfxptr) +int AffixMgr::build_sfxtree(AffEntry* sfxptr) { SfxEntry * ptr; SfxEntry * pptr; @@ -283,30 +328,86 @@ int AffixMgr::build_sfxlist(AffEntry* sfxptr) } // now handle the normal case + ep->setNextEQ(NULL); + ep->setNextNE(NULL); + unsigned char sp = *((const unsigned char *)key); ptr = (SfxEntry*)sStart[sp]; - /* handle the insert at top of list case */ - if ((!ptr) || ( strcmp( ep->getKey() , ptr->getKey() ) <= 0)) { - ep->setNext(ptr); + // handle the first insert + if (!ptr) { sStart[sp] = (AffEntry*)ep; return 0; } - /* otherwise find where it fits in order and insert it */ + + // otherwise use binary tree insertion so that a sorted + // list can easily be generated later pptr = NULL; - for (; ptr != NULL; ptr = ptr->getNext()) { - if (strcmp( ep->getKey(), ptr->getKey() ) <= 0) break; + for (;;) { pptr = ptr; + if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) { + ptr = ptr->getNextEQ(); + if (!ptr) { + pptr->setNextEQ(ep); + break; + } + } else { + ptr = ptr->getNextNE(); + if (!ptr) { + pptr->setNextNE(ep); + break; + } + } + } + return 0; +} + + +// convert from binary tree to sorted list +int AffixMgr::process_pfx_tree_to_list() +{ + for (int i=1; i< SETSIZE; i++) { + pStart[i] = process_pfx_in_order(pStart[i],NULL); } - pptr->setNext(ep); - ep->setNext(ptr); return 0; } +AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr) +{ + if (ptr) { + nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr); + ((PfxEntry*) ptr)->setNext((PfxEntry*) nptr); + nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr); + } + return nptr; +} + + +// convert from binary tree to sorted list +int AffixMgr:: process_sfx_tree_to_list() +{ + for (int i=1; i< SETSIZE; i++) { + sStart[i] = process_sfx_in_order(sStart[i],NULL); + } + return 0; +} + +AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr) +{ + if (ptr) { + nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr); + ((SfxEntry*) ptr)->setNext((SfxEntry*) nptr); + nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr); + } + return nptr; +} -// initialize the PfxEntry links NextEQ and NextNE to speed searching + + +// reinitialize the PfxEntry links NextEQ and NextNE to speed searching +// using the idea of leading subsets this time int AffixMgr::process_pfx_order() { PfxEntry* ptr; @@ -356,7 +457,8 @@ int AffixMgr::process_pfx_order() -// initialize the SfxEntry links NextEQ and NextNE to speed searching +// reinitialize the SfxEntry links NextEQ and NextNE to speed searching +// using the idea of leading subsets this time int AffixMgr::process_sfx_order() { SfxEntry* ptr; @@ -602,15 +704,15 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, } // now handle the general case - char * tmpword = myrevstrdup(word); - unsigned char sp = *((const unsigned char *)tmpword); + unsigned char sp = *((const unsigned char *)(word + len - 1)); + + SfxEntry * sptr = (SfxEntry *) sStart[sp]; while (sptr) { - if (isSubset(sptr->getKey(),tmpword)) { + if (isRevSubset(sptr->getKey(),(word+len-1), len)) { rv = sptr->check(word,len, sfxopts, ppfx); if (rv) { - free(tmpword); return rv; } sptr = sptr->getNextEQ(); @@ -618,8 +720,6 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len, sptr = sptr->getNextNE(); } } - - free(tmpword); return NULL; } @@ -737,6 +837,20 @@ struct replentry * AffixMgr::get_reptable() return reptable; } + +// return length of character map table +int AffixMgr::get_nummap() +{ + return nummap; +} + +// return character map table +struct mapentry * AffixMgr::get_maptable() +{ + if (! maptable ) return NULL; + return maptable; +} + // return text encoding of dictionary char * AffixMgr::get_encoding() { @@ -768,6 +882,11 @@ struct hentry * AffixMgr::lookup(const char * word) return pHMgr->lookup(word); } +// return nosplitsugs +bool AffixMgr::get_nosplitsugs(void) +{ + return nosplitsugs; +} /* parse in the try string */ int AffixMgr::parse_try(char * line) @@ -960,6 +1079,84 @@ int AffixMgr::parse_reptable(char * line, FILE * af) } + +/* parse in the character map table */ +int AffixMgr::parse_maptable(char * line, FILE * af) +{ + if (nummap != 0) { + fprintf(stderr,"error: duplicate MAP tables used\n"); + return 1; + } + char * tp = line; + char * piece; + int i = 0; + int np = 0; + while ((piece=mystrsep(&tp,' '))) { + if (*piece != '\0') { + switch(i) { + case 0: { np++; break; } + case 1: { + nummap = atoi(piece); + if (nummap < 1) { + fprintf(stderr,"incorrect number of entries in map table\n"); + free(piece); + return 1; + } + maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry)); + np++; + break; + } + default: break; + } + i++; + } + free(piece); + } + if (np != 2) { + fprintf(stderr,"error: missing map table information\n"); + return 1; + } + + /* now parse the nummap lines to read in the remainder of the table */ + char * nl = line; + for (int j=0; j < nummap; j++) { + fgets(nl,MAXLNLEN,af); + mychomp(nl); + tp = nl; + i = 0; + maptable[j].set = NULL; + maptable[j].len = 0; + while ((piece=mystrsep(&tp,' '))) { + if (*piece != '\0') { + switch(i) { + case 0: { + if (strncmp(piece,"MAP",3) != 0) { + fprintf(stderr,"error: map table is corrupt\n"); + free(piece); + return 1; + } + break; + } + case 1: { maptable[j].set = mystrdup(piece); + maptable[j].len = strlen(maptable[j].set); + break; } + default: break; + } + i++; + } + free(piece); + } + if ((!(maptable[j].set)) || (!(maptable[j].len))) { + fprintf(stderr,"error: map table is corrupt\n"); + return 1; + } + } + return 0; +} + + + + int AffixMgr::parse_affix(char * line, const char at, FILE * af) { int numents = 0; // number of affentry structures to parse @@ -1097,10 +1294,10 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af) for (int k = 0; k < numents; k++) { if (at == 'P') { PfxEntry * pfxptr = new PfxEntry(this,nptr); - build_pfxlist((AffEntry *)pfxptr); + build_pfxtree((AffEntry *)pfxptr); } else { SfxEntry * sfxptr = new SfxEntry(this,nptr); - build_sfxlist((AffEntry *)sfxptr); + build_sfxtree((AffEntry *)sfxptr); } nptr++; } diff --git a/src/myspell/affixmgr.hxx b/src/myspell/affixmgr.hxx index 9abbf26..6cbd112 100644 --- a/src/myspell/affixmgr.hxx +++ b/src/myspell/affixmgr.hxx @@ -20,6 +20,10 @@ class AffixMgr int cpdmin; int numrep; replentry * reptable; + int nummap; + mapentry * maptable; + bool nosplitsugs; + public: @@ -34,9 +38,12 @@ public: struct hentry * lookup(const char * word); int get_numrep(); struct replentry * get_reptable(); + int get_nummap(); + struct mapentry * get_maptable(); char * get_encoding(); char * get_try_string(); char * get_compound(); + bool get_nosplitsugs(); private: int parse_file(const char * affpath); @@ -45,11 +52,16 @@ private: int parse_cpdflag(char * line); int parse_cpdmin(char * line); int parse_reptable(char * line, FILE * af); + int parse_maptable(char * line, FILE * af); int parse_affix(char * line, const char at, FILE * af); void encodeit(struct affentry * ptr, char * cs); - int build_pfxlist(AffEntry* pfxptr); - int build_sfxlist(AffEntry* sfxptr); + int build_pfxtree(AffEntry* pfxptr); + int build_sfxtree(AffEntry* sfxptr); + AffEntry* process_sfx_in_order(AffEntry* ptr, AffEntry* nptr); + AffEntry* process_pfx_in_order(AffEntry* ptr, AffEntry* nptr); + int process_pfx_tree_to_list(); + int process_sfx_tree_to_list(); int process_pfx_order(); int process_sfx_order(); }; diff --git a/src/myspell/atypes.hxx b/src/myspell/atypes.hxx index 4c67ba2..a10c69d 100644 --- a/src/myspell/atypes.hxx +++ b/src/myspell/atypes.hxx @@ -27,6 +27,11 @@ struct replentry { char * replacement; }; +struct mapentry { + char * set; + int len; +}; + struct guessword { char * word; bool allow; diff --git a/src/myspell/csutil.cxx b/src/myspell/csutil.cxx index 498cf7c..0ecafda 100644 --- a/src/myspell/csutil.cxx +++ b/src/myspell/csutil.cxx @@ -3,7 +3,9 @@ #include <cstdio> #include "csutil.hxx" +#ifndef WINDOWS using namespace std; +#endif // strip strings into token based on single char delimiter // acts like strsep() but only uses a delim char and not @@ -74,7 +76,7 @@ char * myrevstrdup(const char * s) return d; } - +#if 0 // return 1 if s1 is a leading subset of s2 int isSubset(const char * s1, const char * s2) { @@ -84,7 +86,30 @@ int isSubset(const char * s1, const char * s2) if (strncmp(s2,s1,l1) == 0) return 1; return 0; } +#endif + + +// return 1 if s1 is a leading subset of s2 +int isSubset(const char * s1, const char * s2) +{ + while( *s1 && *s2 && (*s1 == *s2) ) { + s1++; + s2++; + } + return (*s1 == '\0'); +} + +// return 1 if s1 (reversed) is a leading subset of end of s2 +int isRevSubset(const char * s1, const char * end_of_s2, int len) +{ + while( (len > 0) && *s1 && (*s1 == *end_of_s2) ) { + s1++; + end_of_s2--; + len --; + } + return (*s1 == '\0'); +} // convert null terminated string to all caps using encoding diff --git a/src/myspell/csutil.hxx b/src/myspell/csutil.hxx index eb5be3b..037eab9 100644 --- a/src/myspell/csutil.hxx +++ b/src/myspell/csutil.hxx @@ -19,6 +19,9 @@ char * mystrsep(char ** sptr, const char delim); // is one string a leading subset of another int isSubset(const char * s1, const char * s2); +// is one reverse string a leading subset of the end of another +int isRevSubset(const char * s1, const char * end_of_s2, int s2_len); + // character encoding information diff --git a/src/myspell/hashmgr.cxx b/src/myspell/hashmgr.cxx index 223674a..d7b4ec8 100644 --- a/src/myspell/hashmgr.cxx +++ b/src/myspell/hashmgr.cxx @@ -1,9 +1,7 @@ #include "license.readme" -#include <unistd.h> #include <cstdlib> #include <cstring> -#include <fcntl.h> #include <cstdio> #include "hashmgr.hxx" @@ -11,7 +9,9 @@ extern void mychomp(char * s); extern char * mystrdup(const char *); +#ifndef WINDOWS using namespace std; +#endif // build a hash table from a munched word list diff --git a/src/myspell/myspell.cxx b/src/myspell/myspell.cxx index 264d1a5..6209898 100644 --- a/src/myspell/myspell.cxx +++ b/src/myspell/myspell.cxx @@ -6,7 +6,9 @@ #include "myspell.hxx" +#ifndef WINDOWS using namespace std; +#endif MySpell::MySpell(const char * affpath, const char * dpath) @@ -138,7 +140,23 @@ int MySpell::spell(const char * word) break; } - case ALLCAP: + case ALLCAP: { + memcpy(wspace,cw,(wl+1)); + mkallsmall(wspace, csconv); + rv = check(wspace); + if (!rv) { + mkinitcap(wspace, csconv); + rv = check(wspace); + } + if (!rv) rv = check(cw); + if ((abbv) && !(rv)) { + memcpy(wspace,cw,wl); + *(wspace+wl) = '.'; + *(wspace+wl+1) = '\0'; + rv = check(wspace); + } + break; + } case INITCAP: { memcpy(wspace,cw,(wl+1)); mkallsmall(wspace, csconv); @@ -247,8 +265,23 @@ int MySpell::suggest(char*** slst, const char * word) if (ns == 0) { ns = pSMgr->ngsuggest(wlst, cw, pHMgr); if (ns) { - *slst = wlst; - return ns; + switch(captype) { + case NOCAP: break; + case HUHCAP: break; + case INITCAP: { + for (int j=0; j < ns; j++) + mkinitcap(wlst[j], csconv); + } + break; + + case ALLCAP: { + for (int j=0; j < ns; j++) + mkallcap(wlst[j], csconv); + } + break; + } + *slst = wlst; + return ns; } } if (ns < 0) { diff --git a/src/myspell/myspell.hxx b/src/myspell/myspell.hxx index 20b955c..0c18549 100644 --- a/src/myspell/myspell.hxx +++ b/src/myspell/myspell.hxx @@ -1,3 +1,6 @@ +#ifndef _MYSPELLMGR_HXX_ +#define _MYSPELLMGR_HXX_ + #include "hashmgr.hxx" #include "affixmgr.hxx" #include "suggestmgr.hxx" @@ -8,11 +11,13 @@ #define ALLCAP 2 #define HUHCAP 3 +#ifdef WINDOWS +#define DLLSUPPORT __declspec(dllexport) +#else +#define DLLSUPPORT +#endif -#ifndef _MYSPELLMGR_HXX_ -#define _MYSPELLMGR_HXX_ - -class MySpell +class DLLSUPPORT MySpell { AffixMgr* pAMgr; HashMgr* pHMgr; diff --git a/src/myspell/suggestmgr.cxx b/src/myspell/suggestmgr.cxx index dc8e646..4a756b3 100644 --- a/src/myspell/suggestmgr.cxx +++ b/src/myspell/suggestmgr.cxx @@ -7,7 +7,9 @@ #include "suggestmgr.hxx" +#ifndef WINDOWS using namespace std; +#endif extern char * mystrdup(const char *); @@ -24,6 +26,8 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn, if (ctry) ctryl = strlen(ctry); maxSug = maxn; + nosplitsugs=(0==1); + if (pAMgr) pAMgr->get_nosplitsugs(); } @@ -46,6 +50,10 @@ int SuggestMgr::suggest(char** wlst, int ns, const char * word) int nsug = ns; + // perhaps we made chose the wrong char from a related set + if ((nsug < maxSug) && (nsug > -1)) + nsug = mapchars(wlst, word, nsug); + // perhaps we made a typical fault of spelling if ((nsug < maxSug) && (nsug > -1)) nsug = replchars(wlst, word, nsug); @@ -67,14 +75,66 @@ int SuggestMgr::suggest(char** wlst, int ns, const char * word) nsug = badchar(wlst, word, nsug); // perhaps we forgot to hit space and two words ran together - if ((nsug < maxSug) && (nsug > -1)) - nsug = twowords(wlst, word, nsug); - + if (!nosplitsugs) { + if ((nsug < maxSug) && (nsug > -1)) + nsug = twowords(wlst, word, nsug); + } return nsug; } +// suggestions for when chose the wrong char out of a related set +int SuggestMgr::mapchars(char** wlst, const char * word, int ns) +{ + int wl = strlen(word); + if (wl < 2 || ! pAMgr) return ns; + + int nummap = pAMgr->get_nummap(); + struct mapentry* maptable = pAMgr->get_maptable(); + if (maptable==NULL) return ns; + ns = map_related(word, 0, wlst, ns, maptable, nummap); + return ns; +} + + +int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const mapentry* maptable, int nummap) +{ + char c = *(word + i); + if (c == 0) { + int cwrd = 1; + for (int m=0; m < ns; m++) + if (strcmp(word,wlst[m]) == 0) cwrd = 0; + if ((cwrd) && check(word,strlen(word))) { + if (ns < maxSug) { + wlst[ns] = mystrdup(word); + if (wlst[ns] == NULL) return -1; + ns++; + } + } + return ns; + } + int in_map = 0; + for (int j = 0; j < nummap; j++) { + if (strchr(maptable[j].set,c) != 0) { + in_map = 1; + char * newword = strdup(word); + for (int k = 0; k < maptable[j].len; k++) { + *(newword + i) = *(maptable[j].set + k); + ns = map_related(newword, (i+1), wlst, ns, maptable, nummap); + } + free(newword); + } + } + if (!in_map) { + i++; + ns = map_related(word, i, wlst, ns, maptable, nummap); + } + return ns; +} + + + // suggestions for a typical fault of spelling, that // differs with more, than 1 letter from the right form. int SuggestMgr::replchars(char** wlst, const char * word, int ns) @@ -85,11 +145,11 @@ int SuggestMgr::replchars(char** wlst, const char * word, int ns) int cwrd; int wl = strlen(word); - if (wl < 2 || ! pAMgr) return 0; + if (wl < 2 || ! pAMgr) return ns; int numrep = pAMgr->get_numrep(); struct replentry* reptable = pAMgr->get_reptable(); - if (reptable==NULL) return 0; + if (reptable==NULL) return ns; for (int i=0; i < numrep; i++ ) { r = word; @@ -161,7 +221,7 @@ int SuggestMgr::extrachar(char** wlst, const char * word, int ns) int cwrd; int wl = strlen(word); - if (wl < 2) return 0; + if (wl < 2) return ns; // try omitting one char of word at a time strcpy (candidate, word + 1); @@ -314,10 +374,12 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr) } lp = MAX_ROOTS - 1; + int n = strlen(word); + struct hentry* hp = NULL; int col = -1; while ((hp = pHMgr->walk_hashtable(col, hp))) { - sc = ngram(3, word, hp->word, (1 == 0)); + sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE); if (sc > scores[lp]) { scores[lp] = sc; roots[lp] = hp; @@ -330,6 +392,21 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr) } } + // find minimum threshhold for a passable suggestion + // mangle original word three differnt ways + // and score them to generate a minimum acceptable score + int thresh = 0; + char * mw = NULL; + for (int sp = 1; sp < 4; sp++) { + mw = strdup(word); + for (int k=sp; k < n; k+=4) *(mw + k) = '*'; + thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH); + free(mw); + } + mw = NULL; + thresh = thresh / 3; + thresh--; + // now expand affixes on each of these root words and // and use length adjusted ngram scores to select // possible suggestions @@ -353,28 +430,30 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr) int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen, rp->astr, rp->alen); for (int k = 0; k < nw; k++) { - sc = ngram(3, word, glst[k].word, (1==1)); - if (sc > gscore[lp]) { - if (guess[lp]) free (guess[lp]); - gscore[lp] = sc; - guess[lp] = glst[k].word; - lval = sc; - for (j=0; j < MAX_GUESS; j++) - if (gscore[j] < lval) { - lp = j; - lval = gscore[j]; - } - } else { - free (glst[k].word); - } + sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH); + if (sc > thresh) { + if (sc > gscore[lp]) { + if (guess[lp]) free (guess[lp]); + gscore[lp] = sc; + guess[lp] = glst[k].word; + lval = sc; + for (j=0; j < MAX_GUESS; j++) + if (gscore[j] < lval) { + lp = j; + lval = gscore[j]; + } + } else { + free (glst[k].word); + } + } } } - } if (glst) free(glst); // now we are done generating guesses // sort in order of decreasing score and copy over + bubblesort(&guess[0], &gscore[0], MAX_GUESS); int ns = 0; for (i=0; i < MAX_GUESS; i++) { @@ -412,12 +491,11 @@ int SuggestMgr::check(const char * word, int len) // generate an n-gram score comparing s1 and s2 -int SuggestMgr::ngram(int n, char * s1, const char * s2, bool uselen) +int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen) { int nscore = 0; int l1 = strlen(s1); - int l2 = l1; - if (uselen) l2 = strlen(s2); + int l2 = strlen(s2); int ns; for (int j=1;j<=n;j++) { ns = 0; @@ -430,7 +508,9 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, bool uselen) nscore = nscore + ns; if (ns < 2) break; } - ns = abs(l1-l2) - 2; + ns = 0; + if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2; + if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2; return (nscore - ((ns > 0) ? ns : 0)); } @@ -456,3 +536,4 @@ void SuggestMgr::bubblesort(char** rword, int* rsc, int n ) } return; } + diff --git a/src/myspell/suggestmgr.hxx b/src/myspell/suggestmgr.hxx index f78d94d..7c5a6e2 100644 --- a/src/myspell/suggestmgr.hxx +++ b/src/myspell/suggestmgr.hxx @@ -4,7 +4,12 @@ #define MAXSWL 100 #define MAX_ROOTS 10 #define MAX_WORDS 500 -#define MAX_GUESS 5 +#define MAX_GUESS 10 + +#define NGRAM_IGNORE_LENGTH 0 +#define NGRAM_LONGER_WORSE 1 +#define NGRAM_ANY_MISMATCH 2 + #include "atypes.hxx" #include "affixmgr.hxx" @@ -16,7 +21,8 @@ class SuggestMgr int ctryl; AffixMgr* pAMgr; int maxSug; - + bool nosplitsugs; + public: SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr); ~SuggestMgr(); @@ -27,12 +33,14 @@ public: private: int replchars(char**, const char *, int); + int mapchars(char**, const char *, int); + int map_related(const char *, int, char ** wlst, int, const mapentry*, int); int forgotchar(char **, const char *, int); int swapchar(char **, const char *, int); int extrachar(char **, const char *, int); int badchar(char **, const char *, int); int twowords(char **, const char *, int); - int ngram(int n, char * s1, const char * s2, bool uselen); + int ngram(int n, char * s1, const char * s2, int uselen); void bubblesort( char ** rwd, int * rsc, int n); }; |