summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDom Lachowicz <domlachowicz@gmail.com>2004-01-12 04:09:01 +0000
committerDom Lachowicz <domlachowicz@gmail.com>2004-01-12 04:09:01 +0000
commit19ca5d04f57a178d77287b0a303adb293491ef20 (patch)
tree93ee3d237f3a83ad2c9060c602e099491292a70d
parent8b23ed4ec5e4769f66296b4e87ccbc6f4bfb9142 (diff)
downloadenchant-19ca5d04f57a178d77287b0a303adb293491ef20.tar.gz
upgrade to myspell 3.1
git-svn-id: svn+ssh://svn.abisource.com/svnroot/enchant/trunk@20903 bcba8976-2d24-0410-9c9c-aab3bd5fdfd6
-rw-r--r--src/myspell/affentry.cxx2
-rw-r--r--src/myspell/affixmgr.cxx259
-rw-r--r--src/myspell/affixmgr.hxx16
-rw-r--r--src/myspell/atypes.hxx5
-rw-r--r--src/myspell/csutil.cxx27
-rw-r--r--src/myspell/csutil.hxx3
-rw-r--r--src/myspell/hashmgr.cxx4
-rw-r--r--src/myspell/myspell.cxx39
-rw-r--r--src/myspell/myspell.hxx13
-rw-r--r--src/myspell/suggestmgr.cxx133
-rw-r--r--src/myspell/suggestmgr.hxx14
11 files changed, 443 insertions, 72 deletions
diff --git a/src/myspell/affentry.cxx b/src/myspell/affentry.cxx
index cfd54d3..603616d 100644
--- a/src/myspell/affentry.cxx
+++ b/src/myspell/affentry.cxx
@@ -8,7 +8,9 @@
#include "affentry.hxx"
+#ifndef WINDOWS
using namespace std;
+#endif
extern char * mystrdup(const char * s);
extern char * myrevstrdup(const char * s);
diff --git a/src/myspell/affixmgr.cxx b/src/myspell/affixmgr.cxx
index 9ae79ca..87ca583 100644
--- a/src/myspell/affixmgr.cxx
+++ b/src/myspell/affixmgr.cxx
@@ -7,7 +7,9 @@
#include "affixmgr.hxx"
#include "affentry.hxx"
+#ifndef WINDOWS
using namespace std;
+#endif
// First some base level utility routines
@@ -16,6 +18,7 @@ extern char * mystrdup(const char * s);
extern char * myrevstrdup(const char * s);
extern char * mystrsep(char ** sptr, const char delim);
extern int isSubset(const char * s1, const char * s2);
+extern int isRevSubset(const char * s1, const char * end_of_s2, int len_s2);
AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr)
@@ -26,7 +29,11 @@ AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr)
encoding=NULL;
reptable = NULL;
numrep = 0;
+ maptable = NULL;
+ nummap = 0;
compound=NULL;
+ nosplitsugs= (0==1);
+
cpdmin = 3; // default value
for (int i=0; i < SETSIZE; i++) {
pStart[i] = NULL;
@@ -74,6 +81,16 @@ AffixMgr::~AffixMgr()
trystring=NULL;
if (encoding) free(encoding);
encoding=NULL;
+ if (maptable) {
+ for (int j=0; j < nummap; j++) {
+ free(maptable[j].set);
+ maptable[j].set = NULL;
+ maptable[j].len = 0;
+ }
+ free(maptable);
+ maptable = NULL;
+ }
+ nummap = 0;
if (reptable) {
for (int j=0; j < numrep; j++) {
free(reptable[j].pattern);
@@ -155,6 +172,13 @@ int AffixMgr::parse_file(const char * affpath)
}
}
+ /* parse in the related character map table */
+ if (strncmp(line,"MAP",3) == 0) {
+ if (parse_maptable(line, afflst)) {
+ return 1;
+ }
+ }
+
// parse this affix: P - prefix, S - suffix
ft = ' ';
if (strncmp(line,"PFX",3) == 0) ft = 'P';
@@ -165,9 +189,17 @@ int AffixMgr::parse_file(const char * affpath)
}
}
+ // handle NOSPLITSUGS
+ if (strncmp(line,"NOSPLITSUGS",11) == 0)
+ nosplitsugs=(0==0);
+
}
fclose(afflst);
+ // convert affix trees to sorted list
+ process_pfx_tree_to_list();
+ process_sfx_tree_to_list();
+
// now we can speed up performance greatly taking advantage of the
// relationship between the affixes and the idea of "subsets".
@@ -197,11 +229,12 @@ int AffixMgr::parse_file(const char * affpath)
return 0;
}
+
// we want to be able to quickly access prefix information
// both by prefix flag, and sorted by prefix string itself
// so we need to set up two indexes
-int AffixMgr::build_pfxlist(AffEntry* pfxptr)
+int AffixMgr::build_pfxtree(AffEntry* pfxptr)
{
PfxEntry * ptr;
PfxEntry * pptr;
@@ -217,8 +250,6 @@ int AffixMgr::build_pfxlist(AffEntry* pfxptr)
pFlag[flg] = (AffEntry *) ep;
- // next index by affix string
-
// handle the special case of null affix string
if (strlen(key) == 0) {
// always inset them at head of list at element 0
@@ -228,25 +259,39 @@ int AffixMgr::build_pfxlist(AffEntry* pfxptr)
return 0;
}
- // now handle the general case
+ // now handle the normal case
+ ep->setNextEQ(NULL);
+ ep->setNextNE(NULL);
+
unsigned char sp = *((const unsigned char *)key);
ptr = (PfxEntry*)pStart[sp];
- /* handle the insert at top of list case */
- if ((!ptr) || ( strcmp( ep->getKey() , ptr->getKey() ) <= 0)) {
- ep->setNext(ptr);
+ // handle the first insert
+ if (!ptr) {
pStart[sp] = (AffEntry*)ep;
return 0;
}
- /* otherwise find where it fits in order and insert it */
+
+ // otherwise use binary tree insertion so that a sorted
+ // list can easily be generated later
pptr = NULL;
- for (; ptr != NULL; ptr = ptr->getNext()) {
- if (strcmp( ep->getKey() , ptr->getKey() ) <= 0) break;
+ for (;;) {
pptr = ptr;
+ if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
+ ptr = ptr->getNextEQ();
+ if (!ptr) {
+ pptr->setNextEQ(ep);
+ break;
+ }
+ } else {
+ ptr = ptr->getNextNE();
+ if (!ptr) {
+ pptr->setNextNE(ep);
+ break;
+ }
+ }
}
- pptr->setNext(ep);
- ep->setNext(ptr);
return 0;
}
@@ -255,7 +300,7 @@ int AffixMgr::build_pfxlist(AffEntry* pfxptr)
// we want to be able to quickly access suffix information
// both by suffix flag, and sorted by the reverse of the
// suffix string itself; so we need to set up two indexes
-int AffixMgr::build_sfxlist(AffEntry* sfxptr)
+int AffixMgr::build_sfxtree(AffEntry* sfxptr)
{
SfxEntry * ptr;
SfxEntry * pptr;
@@ -283,30 +328,86 @@ int AffixMgr::build_sfxlist(AffEntry* sfxptr)
}
// now handle the normal case
+ ep->setNextEQ(NULL);
+ ep->setNextNE(NULL);
+
unsigned char sp = *((const unsigned char *)key);
ptr = (SfxEntry*)sStart[sp];
- /* handle the insert at top of list case */
- if ((!ptr) || ( strcmp( ep->getKey() , ptr->getKey() ) <= 0)) {
- ep->setNext(ptr);
+ // handle the first insert
+ if (!ptr) {
sStart[sp] = (AffEntry*)ep;
return 0;
}
- /* otherwise find where it fits in order and insert it */
+
+ // otherwise use binary tree insertion so that a sorted
+ // list can easily be generated later
pptr = NULL;
- for (; ptr != NULL; ptr = ptr->getNext()) {
- if (strcmp( ep->getKey(), ptr->getKey() ) <= 0) break;
+ for (;;) {
pptr = ptr;
+ if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
+ ptr = ptr->getNextEQ();
+ if (!ptr) {
+ pptr->setNextEQ(ep);
+ break;
+ }
+ } else {
+ ptr = ptr->getNextNE();
+ if (!ptr) {
+ pptr->setNextNE(ep);
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
+
+// convert from binary tree to sorted list
+int AffixMgr::process_pfx_tree_to_list()
+{
+ for (int i=1; i< SETSIZE; i++) {
+ pStart[i] = process_pfx_in_order(pStart[i],NULL);
}
- pptr->setNext(ep);
- ep->setNext(ptr);
return 0;
}
+// Walk the temporary prefix tree rooted at ptr and thread its nodes onto
+// the Next chain.  nptr is the entry that must follow everything in this
+// subtree; the return value is the first entry of the resulting chain.
+AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
+{
+    // empty subtree: the chain simply starts at whatever follows it
+    if (!ptr) return nptr;
+
+    PfxEntry* node = (PfxEntry*) ptr;
+
+    // the NextNE subtree holds the keys that compared greater on insertion,
+    // so its flattened chain becomes this node's successor
+    AffEntry* successor = process_pfx_in_order(node->getNextNE(), nptr);
+    node->setNext((PfxEntry*) successor);
+
+    // the NextEQ subtree (keys that compared <=) is placed ahead of this node
+    return process_pfx_in_order(node->getNextEQ(), ptr);
+}
+
+
+// Flatten each per-first-character suffix tree into a list chained via Next.
+// Slot 0 is deliberately left untouched, exactly as the loop always started
+// at 1 (presumably slot 0 holds the empty-key affixes, which are never put
+// into a tree -- confirm against build_sfxtree).
+int AffixMgr:: process_sfx_tree_to_list()
+{
+    int slot = 1;
+    while (slot < SETSIZE) {
+        sStart[slot] = process_sfx_in_order(sStart[slot], NULL);
+        ++slot;
+    }
+    return 0;
+}
+
+// Walk the temporary suffix tree rooted at ptr and thread its nodes onto
+// the Next chain.  nptr is the entry that must follow everything in this
+// subtree; the return value is the first entry of the resulting chain.
+AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
+{
+    // empty subtree: the chain simply starts at whatever follows it
+    if (!ptr) return nptr;
+
+    SfxEntry* node = (SfxEntry*) ptr;
+
+    // the NextNE subtree holds the keys that compared greater on insertion,
+    // so its flattened chain becomes this node's successor
+    AffEntry* successor = process_sfx_in_order(node->getNextNE(), nptr);
+    node->setNext((SfxEntry*) successor);
+
+    // the NextEQ subtree (keys that compared <=) is placed ahead of this node
+    return process_sfx_in_order(node->getNextEQ(), ptr);
+}
-// initialize the PfxEntry links NextEQ and NextNE to speed searching
+
+
+// reinitialize the PfxEntry links NextEQ and NextNE to speed searching
+// using the idea of leading subsets this time
int AffixMgr::process_pfx_order()
{
PfxEntry* ptr;
@@ -356,7 +457,8 @@ int AffixMgr::process_pfx_order()
-// initialize the SfxEntry links NextEQ and NextNE to speed searching
+// reinitialize the SfxEntry links NextEQ and NextNE to speed searching
+// using the idea of leading subsets this time
int AffixMgr::process_sfx_order()
{
SfxEntry* ptr;
@@ -602,15 +704,15 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len,
}
// now handle the general case
- char * tmpword = myrevstrdup(word);
- unsigned char sp = *((const unsigned char *)tmpword);
+ unsigned char sp = *((const unsigned char *)(word + len - 1));
+
+
SfxEntry * sptr = (SfxEntry *) sStart[sp];
while (sptr) {
- if (isSubset(sptr->getKey(),tmpword)) {
+ if (isRevSubset(sptr->getKey(),(word+len-1), len)) {
rv = sptr->check(word,len, sfxopts, ppfx);
if (rv) {
- free(tmpword);
return rv;
}
sptr = sptr->getNextEQ();
@@ -618,8 +720,6 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len,
sptr = sptr->getNextNE();
}
}
-
- free(tmpword);
return NULL;
}
@@ -737,6 +837,20 @@ struct replentry * AffixMgr::get_reptable()
return reptable;
}
+
+// number of entries held in the related-character (MAP) table
+int AffixMgr::get_nummap()
+{
+    return this->nummap;
+}
+
+// related-character (MAP) table, or NULL when none has been parsed
+struct mapentry * AffixMgr::get_maptable()
+{
+    // a NULL table is handed back unchanged, so no separate branch is needed
+    return this->maptable;
+}
+
// return text encoding of dictionary
char * AffixMgr::get_encoding()
{
@@ -768,6 +882,11 @@ struct hentry * AffixMgr::lookup(const char * word)
return pHMgr->lookup(word);
}
+// true once a NOSPLITSUGS line has been seen in the affix file
+bool AffixMgr::get_nosplitsugs(void)
+{
+    return this->nosplitsugs;
+}
/* parse in the try string */
int AffixMgr::parse_try(char * line)
@@ -960,6 +1079,84 @@ int AffixMgr::parse_reptable(char * line, FILE * af)
}
+
+/* parse in the character map table
+ *
+ * line : the "MAP <count>" header line already read from the affix file;
+ *        the same buffer is reused to read the following rows
+ * af   : affix file, positioned just after the header line
+ *
+ * Reads <count> further lines of the form "MAP <chars>" and stores each
+ * character set in maptable[].  Returns 0 on success and 1 on any error.
+ *
+ * Every entry is cleared as soon as the table is allocated, so that the
+ * destructor (which frees maptable[j].set for all j < nummap) never sees
+ * an uninitialised pointer when parsing stops early.
+ */
+int AffixMgr::parse_maptable(char * line, FILE * af)
+{
+    if (nummap != 0) {
+        fprintf(stderr,"error: duplicate MAP tables used\n");
+        return 1;
+    }
+    char * tp = line;
+    char * piece;
+    int i = 0;
+    int np = 0;
+    while ((piece=mystrsep(&tp,' '))) {
+        if (*piece != '\0') {
+            switch(i) {
+                case 0: { np++; break; }
+                case 1: {
+                    nummap = atoi(piece);
+                    if (nummap < 1) {
+                        fprintf(stderr,"incorrect number of entries in map table\n");
+                        free(piece);
+                        nummap = 0;   // no table exists; keep the count consistent
+                        return 1;
+                    }
+                    maptable = (mapentry *) malloc(nummap * sizeof(struct mapentry));
+                    if (!maptable) {
+                        fprintf(stderr,"error: out of memory allocating map table\n");
+                        free(piece);
+                        nummap = 0;
+                        return 1;
+                    }
+                    // clear all rows up front so cleanup is safe at any point
+                    for (int k = 0; k < nummap; k++) {
+                        maptable[k].set = NULL;
+                        maptable[k].len = 0;
+                    }
+                    np++;
+                    break;
+                }
+                default: break;
+            }
+            i++;
+        }
+        free(piece);
+    }
+    if (np != 2) {
+        fprintf(stderr,"error: missing map table information\n");
+        return 1;
+    }
+
+    /* now parse the nummap lines to read in the remainder of the table */
+    char * nl = line;
+    for (int j=0; j < nummap; j++) {
+        // a short file must not be parsed from stale buffer contents
+        if (!fgets(nl,MAXLNLEN,af)) {
+            fprintf(stderr,"error: map table is corrupt\n");
+            return 1;
+        }
+        mychomp(nl);
+        tp = nl;
+        i = 0;
+        while ((piece=mystrsep(&tp,' '))) {
+            if (*piece != '\0') {
+                switch(i) {
+                    case 0: {
+                        if (strncmp(piece,"MAP",3) != 0) {
+                            fprintf(stderr,"error: map table is corrupt\n");
+                            free(piece);
+                            return 1;
+                        }
+                        break;
+                    }
+                    case 1: {
+                        maptable[j].set = mystrdup(piece);
+                        // a failed duplication is reported by the check below
+                        maptable[j].len = maptable[j].set ? (int) strlen(maptable[j].set) : 0;
+                        break;
+                    }
+                    default: break;
+                }
+                i++;
+            }
+            free(piece);
+        }
+        if ((!(maptable[j].set)) || (!(maptable[j].len))) {
+            fprintf(stderr,"error: map table is corrupt\n");
+            return 1;
+        }
+    }
+    return 0;
+}
+
+
+
+
int AffixMgr::parse_affix(char * line, const char at, FILE * af)
{
int numents = 0; // number of affentry structures to parse
@@ -1097,10 +1294,10 @@ int AffixMgr::parse_affix(char * line, const char at, FILE * af)
for (int k = 0; k < numents; k++) {
if (at == 'P') {
PfxEntry * pfxptr = new PfxEntry(this,nptr);
- build_pfxlist((AffEntry *)pfxptr);
+ build_pfxtree((AffEntry *)pfxptr);
} else {
SfxEntry * sfxptr = new SfxEntry(this,nptr);
- build_sfxlist((AffEntry *)sfxptr);
+ build_sfxtree((AffEntry *)sfxptr);
}
nptr++;
}
diff --git a/src/myspell/affixmgr.hxx b/src/myspell/affixmgr.hxx
index 9abbf26..6cbd112 100644
--- a/src/myspell/affixmgr.hxx
+++ b/src/myspell/affixmgr.hxx
@@ -20,6 +20,10 @@ class AffixMgr
int cpdmin;
int numrep;
replentry * reptable;
+ int nummap;
+ mapentry * maptable;
+ bool nosplitsugs;
+
public:
@@ -34,9 +38,12 @@ public:
struct hentry * lookup(const char * word);
int get_numrep();
struct replentry * get_reptable();
+ int get_nummap();
+ struct mapentry * get_maptable();
char * get_encoding();
char * get_try_string();
char * get_compound();
+ bool get_nosplitsugs();
private:
int parse_file(const char * affpath);
@@ -45,11 +52,16 @@ private:
int parse_cpdflag(char * line);
int parse_cpdmin(char * line);
int parse_reptable(char * line, FILE * af);
+ int parse_maptable(char * line, FILE * af);
int parse_affix(char * line, const char at, FILE * af);
void encodeit(struct affentry * ptr, char * cs);
- int build_pfxlist(AffEntry* pfxptr);
- int build_sfxlist(AffEntry* sfxptr);
+ int build_pfxtree(AffEntry* pfxptr);
+ int build_sfxtree(AffEntry* sfxptr);
+ AffEntry* process_sfx_in_order(AffEntry* ptr, AffEntry* nptr);
+ AffEntry* process_pfx_in_order(AffEntry* ptr, AffEntry* nptr);
+ int process_pfx_tree_to_list();
+ int process_sfx_tree_to_list();
int process_pfx_order();
int process_sfx_order();
};
diff --git a/src/myspell/atypes.hxx b/src/myspell/atypes.hxx
index 4c67ba2..a10c69d 100644
--- a/src/myspell/atypes.hxx
+++ b/src/myspell/atypes.hxx
@@ -27,6 +27,11 @@ struct replentry {
char * replacement;
};
+struct mapentry {  // one row of the related-character (MAP) table
+ char * set;  // characters of this row; a mystrdup'ed copy released with free()
+ int len;  // strlen(set)
+};
+
struct guessword {
char * word;
bool allow;
diff --git a/src/myspell/csutil.cxx b/src/myspell/csutil.cxx
index 498cf7c..0ecafda 100644
--- a/src/myspell/csutil.cxx
+++ b/src/myspell/csutil.cxx
@@ -3,7 +3,9 @@
#include <cstdio>
#include "csutil.hxx"
+#ifndef WINDOWS
using namespace std;
+#endif
// strip strings into token based on single char delimiter
// acts like strsep() but only uses a delim char and not
@@ -74,7 +76,7 @@ char * myrevstrdup(const char * s)
return d;
}
-
+#if 0
// return 1 if s1 is a leading subset of s2
int isSubset(const char * s1, const char * s2)
{
@@ -84,7 +86,30 @@ int isSubset(const char * s1, const char * s2)
if (strncmp(s2,s1,l1) == 0) return 1;
return 0;
}
+#endif
+
+
+// return 1 if s1 is a leading subset (prefix) of s2, otherwise 0;
+// an empty s1 is a prefix of every string
+int isSubset(const char * s1, const char * s2)
+{
+    for (;;) {
+        if (*s1 == '\0') return 1;   // all of s1 was matched
+        if (*s1 != *s2) return 0;    // mismatch, or s2 ran out first
+        ++s1;
+        ++s2;
+    }
+}
+
+// return 1 if s1 (a reversed suffix key) matches the tail of s2 read
+// backwards, otherwise 0.  end_of_s2 points at the last character of s2
+// and len is the number of characters of s2 that may be examined.
+int isRevSubset(const char * s1, const char * end_of_s2, int len)
+{
+    for (int remaining = len; ; --remaining) {
+        if (*s1 == '\0') return 1;          // every key character matched
+        if (remaining <= 0) return 0;       // s2 exhausted before the key
+        if (*s1 != *end_of_s2) return 0;    // characters differ
+        ++s1;
+        --end_of_s2;
+    }
+}
// convert null terminated string to all caps using encoding
diff --git a/src/myspell/csutil.hxx b/src/myspell/csutil.hxx
index eb5be3b..037eab9 100644
--- a/src/myspell/csutil.hxx
+++ b/src/myspell/csutil.hxx
@@ -19,6 +19,9 @@ char * mystrsep(char ** sptr, const char delim);
// is one string a leading subset of another
int isSubset(const char * s1, const char * s2);
+// is one reverse string a leading subset of the end of another
+int isRevSubset(const char * s1, const char * end_of_s2, int s2_len);
+
// character encoding information
diff --git a/src/myspell/hashmgr.cxx b/src/myspell/hashmgr.cxx
index 223674a..d7b4ec8 100644
--- a/src/myspell/hashmgr.cxx
+++ b/src/myspell/hashmgr.cxx
@@ -1,9 +1,7 @@
#include "license.readme"
-#include <unistd.h>
#include <cstdlib>
#include <cstring>
-#include <fcntl.h>
#include <cstdio>
#include "hashmgr.hxx"
@@ -11,7 +9,9 @@
extern void mychomp(char * s);
extern char * mystrdup(const char *);
+#ifndef WINDOWS
using namespace std;
+#endif
// build a hash table from a munched word list
diff --git a/src/myspell/myspell.cxx b/src/myspell/myspell.cxx
index 264d1a5..6209898 100644
--- a/src/myspell/myspell.cxx
+++ b/src/myspell/myspell.cxx
@@ -6,7 +6,9 @@
#include "myspell.hxx"
+#ifndef WINDOWS
using namespace std;
+#endif
MySpell::MySpell(const char * affpath, const char * dpath)
@@ -138,7 +140,23 @@ int MySpell::spell(const char * word)
break;
}
- case ALLCAP:
+ case ALLCAP: {
+ memcpy(wspace,cw,(wl+1));
+ mkallsmall(wspace, csconv);
+ rv = check(wspace);
+ if (!rv) {
+ mkinitcap(wspace, csconv);
+ rv = check(wspace);
+ }
+ if (!rv) rv = check(cw);
+ if ((abbv) && !(rv)) {
+ memcpy(wspace,cw,wl);
+ *(wspace+wl) = '.';
+ *(wspace+wl+1) = '\0';
+ rv = check(wspace);
+ }
+ break;
+ }
case INITCAP: {
memcpy(wspace,cw,(wl+1));
mkallsmall(wspace, csconv);
@@ -247,8 +265,23 @@ int MySpell::suggest(char*** slst, const char * word)
if (ns == 0) {
ns = pSMgr->ngsuggest(wlst, cw, pHMgr);
if (ns) {
- *slst = wlst;
- return ns;
+ switch(captype) {
+ case NOCAP: break;
+ case HUHCAP: break;
+ case INITCAP: {
+ for (int j=0; j < ns; j++)
+ mkinitcap(wlst[j], csconv);
+ }
+ break;
+
+ case ALLCAP: {
+ for (int j=0; j < ns; j++)
+ mkallcap(wlst[j], csconv);
+ }
+ break;
+ }
+ *slst = wlst;
+ return ns;
}
}
if (ns < 0) {
diff --git a/src/myspell/myspell.hxx b/src/myspell/myspell.hxx
index 20b955c..0c18549 100644
--- a/src/myspell/myspell.hxx
+++ b/src/myspell/myspell.hxx
@@ -1,3 +1,6 @@
+#ifndef _MYSPELLMGR_HXX_
+#define _MYSPELLMGR_HXX_
+
#include "hashmgr.hxx"
#include "affixmgr.hxx"
#include "suggestmgr.hxx"
@@ -8,11 +11,13 @@
#define ALLCAP 2
#define HUHCAP 3
+#ifdef WINDOWS
+#define DLLSUPPORT __declspec(dllexport)
+#else
+#define DLLSUPPORT
+#endif
-#ifndef _MYSPELLMGR_HXX_
-#define _MYSPELLMGR_HXX_
-
-class MySpell
+class DLLSUPPORT MySpell
{
AffixMgr* pAMgr;
HashMgr* pHMgr;
diff --git a/src/myspell/suggestmgr.cxx b/src/myspell/suggestmgr.cxx
index dc8e646..4a756b3 100644
--- a/src/myspell/suggestmgr.cxx
+++ b/src/myspell/suggestmgr.cxx
@@ -7,7 +7,9 @@
#include "suggestmgr.hxx"
+#ifndef WINDOWS
using namespace std;
+#endif
extern char * mystrdup(const char *);
@@ -24,6 +26,8 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn,
if (ctry)
ctryl = strlen(ctry);
maxSug = maxn;
+ nosplitsugs=(0==1);
+ if (pAMgr) pAMgr->get_nosplitsugs();
}
@@ -46,6 +50,10 @@ int SuggestMgr::suggest(char** wlst, int ns, const char * word)
int nsug = ns;
+ // perhaps we chose the wrong char from a related set
+ if ((nsug < maxSug) && (nsug > -1))
+ nsug = mapchars(wlst, word, nsug);
+
// perhaps we made a typical fault of spelling
if ((nsug < maxSug) && (nsug > -1))
nsug = replchars(wlst, word, nsug);
@@ -67,14 +75,66 @@ int SuggestMgr::suggest(char** wlst, int ns, const char * word)
nsug = badchar(wlst, word, nsug);
// perhaps we forgot to hit space and two words ran together
- if ((nsug < maxSug) && (nsug > -1))
- nsug = twowords(wlst, word, nsug);
-
+ if (!nosplitsugs) {
+ if ((nsug < maxSug) && (nsug > -1))
+ nsug = twowords(wlst, word, nsug);
+ }
return nsug;
}
+// suggestions for when the wrong character was chosen out of a related set:
+// hands the word to map_related() together with the affix manager's MAP
+// table and returns the updated suggestion count (ns unchanged when the
+// word is shorter than 2 characters or no table is available)
+int SuggestMgr::mapchars(char** wlst, const char * word, int ns)
+{
+    const int length = strlen(word);
+    if (length < 2) return ns;
+    if (!pAMgr) return ns;
+
+    const int entries = pAMgr->get_nummap();
+    struct mapentry* table = pAMgr->get_maptable();
+    if (!table) return ns;
+
+    return map_related(word, 0, wlst, ns, table, entries);
+}
+
+
+// Recursively try every combination of related characters (as listed in the
+// MAP table) for the characters of word from position i onwards.
+//
+//   word     : candidate being built (characters before i are already chosen)
+//   i        : index of the character currently being varied
+//   wlst, ns : suggestion list and its current length
+//   maptable : related-character sets, nummap entries
+//
+// When the end of the word is reached the candidate is appended to wlst if
+// it is not already present, passes check(), and the list is not full.
+// Returns the new suggestion count, or -1 when an allocation fails.  A
+// failure is propagated immediately: continuing with ns == -1 would let a
+// later call store into wlst[-1].
+int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const mapentry* maptable, int nummap)
+{
+    if (ns < 0) return ns;   // an earlier step already failed
+
+    char c = *(word + i);
+    if (c == 0) {
+        int cwrd = 1;
+        for (int m=0; m < ns; m++)
+            if (strcmp(word,wlst[m]) == 0) cwrd = 0;
+        if ((cwrd) && check(word,strlen(word))) {
+            if (ns < maxSug) {
+                wlst[ns] = mystrdup(word);
+                if (wlst[ns] == NULL) return -1;
+                ns++;
+            }
+        }
+        return ns;
+    }
+    int in_map = 0;
+    for (int j = 0; j < nummap; j++) {
+        if (strchr(maptable[j].set,c) != 0) {
+            in_map = 1;
+            char * newword = strdup(word);
+            if (newword == NULL) return -1;
+            for (int k = 0; k < maptable[j].len; k++) {
+                *(newword + i) = *(maptable[j].set + k);
+                ns = map_related(newword, (i+1), wlst, ns, maptable, nummap);
+                if (ns < 0) break;   // stop on failure, but still release newword
+            }
+            free(newword);
+            if (ns < 0) return ns;
+        }
+    }
+    if (!in_map) {
+        // character belongs to no set: keep it and move on
+        ns = map_related(word, (i+1), wlst, ns, maptable, nummap);
+    }
+    return ns;
+}
+
+
+
// suggestions for a typical fault of spelling, that
// differs with more, than 1 letter from the right form.
int SuggestMgr::replchars(char** wlst, const char * word, int ns)
@@ -85,11 +145,11 @@ int SuggestMgr::replchars(char** wlst, const char * word, int ns)
int cwrd;
int wl = strlen(word);
- if (wl < 2 || ! pAMgr) return 0;
+ if (wl < 2 || ! pAMgr) return ns;
int numrep = pAMgr->get_numrep();
struct replentry* reptable = pAMgr->get_reptable();
- if (reptable==NULL) return 0;
+ if (reptable==NULL) return ns;
for (int i=0; i < numrep; i++ ) {
r = word;
@@ -161,7 +221,7 @@ int SuggestMgr::extrachar(char** wlst, const char * word, int ns)
int cwrd;
int wl = strlen(word);
- if (wl < 2) return 0;
+ if (wl < 2) return ns;
// try omitting one char of word at a time
strcpy (candidate, word + 1);
@@ -314,10 +374,12 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
}
lp = MAX_ROOTS - 1;
+ int n = strlen(word);
+
struct hentry* hp = NULL;
int col = -1;
while ((hp = pHMgr->walk_hashtable(col, hp))) {
- sc = ngram(3, word, hp->word, (1 == 0));
+ sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);
if (sc > scores[lp]) {
scores[lp] = sc;
roots[lp] = hp;
@@ -330,6 +392,21 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
}
}
+ // find minimum threshold for a passable suggestion
+ // mangle original word three different ways
+ // and score them to generate a minimum acceptable score
+ int thresh = 0;
+ char * mw = NULL;
+ for (int sp = 1; sp < 4; sp++) {
+ mw = strdup(word);
+ for (int k=sp; k < n; k+=4) *(mw + k) = '*';
+ thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
+ free(mw);
+ }
+ mw = NULL;
+ thresh = thresh / 3;
+ thresh--;
+
// now expand affixes on each of these root words and
// and use length adjusted ngram scores to select
// possible suggestions
@@ -353,28 +430,30 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,
rp->astr, rp->alen);
for (int k = 0; k < nw; k++) {
- sc = ngram(3, word, glst[k].word, (1==1));
- if (sc > gscore[lp]) {
- if (guess[lp]) free (guess[lp]);
- gscore[lp] = sc;
- guess[lp] = glst[k].word;
- lval = sc;
- for (j=0; j < MAX_GUESS; j++)
- if (gscore[j] < lval) {
- lp = j;
- lval = gscore[j];
- }
- } else {
- free (glst[k].word);
- }
+ sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);
+ if (sc > thresh) {
+ if (sc > gscore[lp]) {
+ if (guess[lp]) free (guess[lp]);
+ gscore[lp] = sc;
+ guess[lp] = glst[k].word;
+ lval = sc;
+ for (j=0; j < MAX_GUESS; j++)
+ if (gscore[j] < lval) {
+ lp = j;
+ lval = gscore[j];
+ }
+ } else {
+ free (glst[k].word);
+ }
+ }
}
}
-
}
if (glst) free(glst);
// now we are done generating guesses
// sort in order of decreasing score and copy over
+
bubblesort(&guess[0], &gscore[0], MAX_GUESS);
int ns = 0;
for (i=0; i < MAX_GUESS; i++) {
@@ -412,12 +491,11 @@ int SuggestMgr::check(const char * word, int len)
// generate an n-gram score comparing s1 and s2
-int SuggestMgr::ngram(int n, char * s1, const char * s2, bool uselen)
+int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)
{
int nscore = 0;
int l1 = strlen(s1);
- int l2 = l1;
- if (uselen) l2 = strlen(s2);
+ int l2 = strlen(s2);
int ns;
for (int j=1;j<=n;j++) {
ns = 0;
@@ -430,7 +508,9 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, bool uselen)
nscore = nscore + ns;
if (ns < 2) break;
}
- ns = abs(l1-l2) - 2;
+ ns = 0;
+ if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
+ if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
return (nscore - ((ns > 0) ? ns : 0));
}
@@ -456,3 +536,4 @@ void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
}
return;
}
+
diff --git a/src/myspell/suggestmgr.hxx b/src/myspell/suggestmgr.hxx
index f78d94d..7c5a6e2 100644
--- a/src/myspell/suggestmgr.hxx
+++ b/src/myspell/suggestmgr.hxx
@@ -4,7 +4,12 @@
#define MAXSWL 100
#define MAX_ROOTS 10
#define MAX_WORDS 500
-#define MAX_GUESS 5
+#define MAX_GUESS 10
+
+#define NGRAM_IGNORE_LENGTH 0
+#define NGRAM_LONGER_WORSE 1
+#define NGRAM_ANY_MISMATCH 2
+
#include "atypes.hxx"
#include "affixmgr.hxx"
@@ -16,7 +21,8 @@ class SuggestMgr
int ctryl;
AffixMgr* pAMgr;
int maxSug;
-
+ bool nosplitsugs;
+
public:
SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr);
~SuggestMgr();
@@ -27,12 +33,14 @@ public:
private:
int replchars(char**, const char *, int);
+ int mapchars(char**, const char *, int);
+ int map_related(const char *, int, char ** wlst, int, const mapentry*, int);
int forgotchar(char **, const char *, int);
int swapchar(char **, const char *, int);
int extrachar(char **, const char *, int);
int badchar(char **, const char *, int);
int twowords(char **, const char *, int);
- int ngram(int n, char * s1, const char * s2, bool uselen);
+ int ngram(int n, char * s1, const char * s2, int uselen);
void bubblesort( char ** rwd, int * rsc, int n);
};