upgrade to myspell 3.1

git-svn-id: svn+ssh://svn.abisource.com/svnroot/enchant/trunk@20903 bcba8976-2d24-0410-9c9c-aab3bd5fdfd6
author: Dom Lachowicz <domlachowicz@gmail.com> 2004-01-12 04:09:01 +0000
committer: Dom Lachowicz <domlachowicz@gmail.com> 2004-01-12 04:09:01 +0000
commit: 19ca5d04f57a178d77287b0a303adb293491ef20 (patch)
tree: 93ee3d237f3a83ad2c9060c602e099491292a70d
parent: 8b23ed4ec5e4769f66296b4e87ccbc6f4bfb9142 (diff)
download: enchant-19ca5d04f57a178d77287b0a303adb293491ef20.tar.gz
11 files changed, 443 insertions, 72 deletions
diff --git a/src/myspell/affentry.cxx b/src/myspell/affentry.cxx
index cfd54d3..603616d 100644
--- a/src/myspell/affentry.cxx
+++ b/src/myspell/affentry.cxx
@@ -8,7 +8,9 @@
 
 #include "affentry.hxx"
 
+#ifndef WINDOWS
 using namespace std;
+#endif
 
 extern char * mystrdup(const char * s);
 extern char *  myrevstrdup(const char * s);
diff --git a/src/myspell/affixmgr.cxx b/src/myspell/affixmgr.cxx
index 9ae79ca..87ca583 100644
--- a/src/myspell/affixmgr.cxx
+++ b/src/myspell/affixmgr.cxx
@@ -7,7 +7,9 @@
 #include "affixmgr.hxx"
 #include "affentry.hxx"
 
+#ifndef WINDOWS
 using namespace std;
+#endif
 
 
 // First some base level utility routines
@@ -16,6 +18,7 @@ extern char * mystrdup(const char * s);
 extern char * myrevstrdup(const char * s);
 extern char * mystrsep(char ** sptr, const char delim);
 extern int    isSubset(const char * s1, const char * s2); 
+extern int    isRevSubset(const char * s1, const char * end_of_s2, int len_s2); 
 
 
 AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr) 
@@ -26,7 +29,11 @@ AffixMgr::AffixMgr(const char * affpath, HashMgr* ptr)
   encoding=NULL;
   reptable = NULL;
   numrep = 0;
+  maptable = NULL;
+  nummap = 0;
   compound=NULL;
+  nosplitsugs= (0==1);
+
   cpdmin = 3;  // default value
   for (int i=0; i < SETSIZE; i++) {
      pStart[i] = NULL;
@@ -74,6 +81,16 @@ AffixMgr::~AffixMgr()
   trystring=NULL;
   if (encoding) free(encoding);
   encoding=NULL;
+  if (maptable) {  
+     for (int j=0; j < nummap; j++) {
+        free(maptable[j].set);
+        maptable[j].set = NULL;
+        maptable[j].len = 0;
+     }
+     free(maptable);  
+     maptable = NULL;
+  }
+  nummap = 0;
   if (reptable) {  
      for (int j=0; j < numrep; j++) {
         free(reptable[j].pattern);
@@ -155,6 +172,13 @@ int  AffixMgr::parse_file(const char * affpath)
           }
        }
 
+       /* parse in the related character map table */
+       if (strncmp(line,"MAP",3) == 0) {
+          if (parse_maptable(line, afflst)) {
+             return 1;
+          }
+       }
+
        // parse this affix: P - prefix, S - suffix
        ft = ' ';
        if (strncmp(line,"PFX",3) == 0) ft = 'P';
@@ -165,9 +189,17 @@ int  AffixMgr::parse_file(const char * affpath)
           }
        }
 
+       // handle NOSPLITSUGS
+       if (strncmp(line,"NOSPLITSUGS",11) == 0)
+		   nosplitsugs=(0==0);
+
     }
     fclose(afflst);
 
+    // convert affix trees to sorted list
+    process_pfx_tree_to_list();
+    process_sfx_tree_to_list();
+
     // now we can speed up performance greatly taking advantage of the 
     // relationship between the affixes and the idea of "subsets".
 
@@ -197,11 +229,12 @@ int  AffixMgr::parse_file(const char * affpath)
     return 0;
 }
 
+
 // we want to be able to quickly access prefix information
 // both by prefix flag, and sorted by prefix string itself 
 // so we need to set up two indexes
 
-int AffixMgr::build_pfxlist(AffEntry* pfxptr)
+int AffixMgr::build_pfxtree(AffEntry* pfxptr)
 {
   PfxEntry * ptr;
   PfxEntry * pptr;
@@ -217,8 +250,6 @@ int AffixMgr::build_pfxlist(AffEntry* pfxptr)
   pFlag[flg] = (AffEntry *) ep;
 
 
-  // next index by affix string
-
   // handle the special case of null affix string
   if (strlen(key) == 0) {
     // always inset them at head of list at element 0
@@ -228,25 +259,39 @@ int AffixMgr::build_pfxlist(AffEntry* pfxptr)
      return 0;
   }
 
-  // now handle the general case
+  // now handle the normal case
+  ep->setNextEQ(NULL);
+  ep->setNextNE(NULL);
+
   unsigned char sp = *((const unsigned char *)key);
   ptr = (PfxEntry*)pStart[sp];
   
-  /* handle the insert at top of list case */
-  if ((!ptr) || ( strcmp( ep->getKey() , ptr->getKey() ) <= 0)) {
-     ep->setNext(ptr);
+  // handle the first insert 
+  if (!ptr) {
      pStart[sp] = (AffEntry*)ep;
      return 0;
   }
 
-  /* otherwise find where it fits in order and insert it */
+
+  // otherwise use binary tree insertion so that a sorted
+  // list can easily be generated later
   pptr = NULL;
-  for (; ptr != NULL; ptr = ptr->getNext()) {
-    if (strcmp( ep->getKey() , ptr->getKey() ) <= 0) break;
+  for (;;) {
     pptr = ptr;
+    if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
+       ptr = ptr->getNextEQ();
+       if (!ptr) {
+	  pptr->setNextEQ(ep);
+          break;
+       }
+    } else {
+       ptr = ptr->getNextNE();
+       if (!ptr) {
+	  pptr->setNextNE(ep);
+          break;
+       }
+    }
   }
-  pptr->setNext(ep);
-  ep->setNext(ptr);
   return 0;
 }
 
@@ -255,7 +300,7 @@ int AffixMgr::build_pfxlist(AffEntry* pfxptr)
 // we want to be able to quickly access suffix information
 // both by suffix flag, and sorted by the reverse of the
 // suffix string itself; so we need to set up two indexes
-int AffixMgr::build_sfxlist(AffEntry* sfxptr)
+int AffixMgr::build_sfxtree(AffEntry* sfxptr)
 {
   SfxEntry * ptr;
   SfxEntry * pptr;
@@ -283,30 +328,86 @@ int AffixMgr::build_sfxlist(AffEntry* sfxptr)
   }
 
   // now handle the normal case
+  ep->setNextEQ(NULL);
+  ep->setNextNE(NULL);
+
   unsigned char sp = *((const unsigned char *)key);
   ptr = (SfxEntry*)sStart[sp];
   
-  /* handle the insert at top of list case */
-  if ((!ptr) || ( strcmp( ep->getKey() , ptr->getKey() ) <= 0)) {
-     ep->setNext(ptr);
+  // handle the first insert 
+  if (!ptr) {
      sStart[sp] = (AffEntry*)ep;
      return 0;
   }
 
-  /* otherwise find where it fits in order and insert it */
+
+  // otherwise use binary tree insertion so that a sorted
+  // list can easily be generated later
   pptr = NULL;
-  for (; ptr != NULL; ptr = ptr->getNext()) {
-    if (strcmp( ep->getKey(), ptr->getKey() ) <= 0) break;
+  for (;;) {
     pptr = ptr;
+    if (strcmp(ep->getKey(), ptr->getKey() ) <= 0) {
+       ptr = ptr->getNextEQ();
+       if (!ptr) {
+	  pptr->setNextEQ(ep);
+          break;
+       }
+    } else {
+       ptr = ptr->getNextNE();
+       if (!ptr) {
+	  pptr->setNextNE(ep);
+          break;
+       }
+    }
+  }
+  return 0;
+}
+
+
+// convert from binary tree to sorted list
+int AffixMgr::process_pfx_tree_to_list()
+{
+  for (int i=1; i< SETSIZE; i++) {
+    pStart[i] = process_pfx_in_order(pStart[i],NULL);
   }
-  pptr->setNext(ep);
-  ep->setNext(ptr);
   return 0;
 }
 
 
+AffEntry* AffixMgr::process_pfx_in_order(AffEntry* ptr, AffEntry* nptr)
+{
+  if (ptr) {
+    nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextNE(), nptr);
+    ((PfxEntry*) ptr)->setNext((PfxEntry*) nptr);
+    nptr = process_pfx_in_order(((PfxEntry*) ptr)->getNextEQ(), ptr);
+  }
+  return nptr;
+}
+
+
+// convert from binary tree to sorted list
+int AffixMgr:: process_sfx_tree_to_list()
+{
+  for (int i=1; i< SETSIZE; i++) {
+    sStart[i] = process_sfx_in_order(sStart[i],NULL);
+  }
+  return 0;
+}
+
+AffEntry* AffixMgr::process_sfx_in_order(AffEntry* ptr, AffEntry* nptr)
+{
+  if (ptr) {
+    nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextNE(), nptr);
+    ((SfxEntry*) ptr)->setNext((SfxEntry*) nptr);
+    nptr = process_sfx_in_order(((SfxEntry*) ptr)->getNextEQ(), ptr);
+  }
+  return nptr;
+}
 
-// initialize the PfxEntry links NextEQ and NextNE to speed searching
+
+
+// reinitialize the PfxEntry links NextEQ and NextNE to speed searching
+// using the idea of leading subsets this time
 int AffixMgr::process_pfx_order()
 {
     PfxEntry* ptr;
@@ -356,7 +457,8 @@ int AffixMgr::process_pfx_order()
 
 
 
-// initialize the SfxEntry links NextEQ and NextNE to speed searching
+// reinitialize the SfxEntry links NextEQ and NextNE to speed searching
+// using the idea of leading subsets this time
 int AffixMgr::process_sfx_order()
 {
     SfxEntry* ptr;
@@ -602,15 +704,15 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len,
     }
   
     // now handle the general case
-    char * tmpword = myrevstrdup(word);
-    unsigned char sp = *((const unsigned char *)tmpword);
+    unsigned char sp = *((const unsigned char *)(word + len - 1));
+
+
     SfxEntry * sptr = (SfxEntry *) sStart[sp];
 
     while (sptr) {
-        if (isSubset(sptr->getKey(),tmpword)) {
+        if (isRevSubset(sptr->getKey(),(word+len-1), len)) {
 	     rv = sptr->check(word,len, sfxopts, ppfx);
              if (rv) {
-                  free(tmpword);
                   return rv;
              }
              sptr = sptr->getNextEQ();
@@ -618,8 +720,6 @@ struct hentry * AffixMgr::suffix_check (const char * word, int len,
 	     sptr = sptr->getNextNE();
         }
     }
-    
-    free(tmpword);
     return NULL;
 }
 
@@ -737,6 +837,20 @@ struct replentry * AffixMgr::get_reptable()
   return reptable;
 }
 
+
+// return length of character map table
+int AffixMgr::get_nummap()
+{
+  return nummap;
+}
+
+// return character map table
+struct mapentry * AffixMgr::get_maptable()
+{
+  if (! maptable ) return NULL;
+  return maptable;
+}
+
 // return text encoding of dictionary
 char * AffixMgr::get_encoding()
 {
@@ -768,6 +882,11 @@ struct hentry * AffixMgr::lookup(const char * word)
   return pHMgr->lookup(word);
 }
 
+// return nosplitsugs
+bool AffixMgr::get_nosplitsugs(void)
+{
+  return nosplitsugs;
+}
 
 /* parse in the try string */
 int  AffixMgr::parse_try(char * line)
@@ -960,6 +1079,84 @@ int  AffixMgr::parse_reptable(char * line, FILE * af)
 }
 
 
+
+/* parse in the character map table: a "MAP <n>" header (passed in line)
+ * followed by n "MAP <chars>" lines read from af; returns 0 if ok, else 1 */
+int  AffixMgr::parse_maptable(char * line, FILE * af)
+{
+   if (nummap != 0) {
+      fprintf(stderr,"error: duplicate MAP tables used\n");
+      return 1;
+   }
+   char * tp = line;
+   char * piece;
+   int i = 0, np = 0;   // i: token index on the line, np: header fields seen
+   while ((piece=mystrsep(&tp,' '))) {
+       if (*piece != '\0') {
+          switch(i) {
+	     case 0: { np++; break; }
+             case 1: { 
+                       nummap = atoi(piece);
+	               if (nummap < 1) {
+			  fprintf(stderr,"incorrect number of entries in map table\n");
+			  free(piece);
+                          return 1;
+                       }
+                       // zero-filled so a partly read table is safe to free later
+                       maptable = (mapentry *) calloc(nummap, sizeof(struct mapentry));
+                       if (!maptable) { nummap = 0; free(piece); return 1; }
+                       np++;
+                       break;
+	             }
+	     default: break;
+          }
+          i++;
+       }
+       free(piece);
+   }
+   if (np != 2) {
+      fprintf(stderr,"error: missing map table information\n");
+      return 1;
+   } 
+   /* now parse the nummap lines to read in the remainder of the table */
+   for (int j=0; j < nummap; j++) {
+        if (!fgets(line,MAXLNLEN,af)) *line = '\0';  // EOF: empty row => corrupt
+        mychomp(line);
+        tp = line;
+        i = 0;
+        maptable[j].set = NULL;
+        maptable[j].len = 0;
+        while ((piece=mystrsep(&tp,' '))) {
+           if (*piece != '\0') {
+               switch(i) {
+                  case 0: {
+		             if (strncmp(piece,"MAP",3) != 0) {
+		                 fprintf(stderr,"error: map table is corrupt\n");
+                                 free(piece);
+                                 return 1;
+                             }
+                             break;
+		          }
+                  case 1: { maptable[j].set = mystrdup(piece); 
+		            maptable[j].len = maptable[j].set ? strlen(maptable[j].set) : 0;
+                            break; }
+		  default: break;
+               }
+               i++;
+           }
+           free(piece);
+        }
+	if ((!(maptable[j].set)) || (!(maptable[j].len))) {
+	     fprintf(stderr,"error: map table is corrupt\n");
+             return 1;
+        }
+   }
+   return 0;
+}
+
+
+
+
 int  AffixMgr::parse_affix(char * line, const char at, FILE * af)
 {
    int numents = 0;      // number of affentry structures to parse
@@ -1097,10 +1294,10 @@ int  AffixMgr::parse_affix(char * line, const char at, FILE * af)
    for (int k = 0; k < numents; k++) {
       if (at == 'P') {
 	  PfxEntry * pfxptr = new PfxEntry(this,nptr);
-          build_pfxlist((AffEntry *)pfxptr);
+          build_pfxtree((AffEntry *)pfxptr);
       } else {
 	  SfxEntry * sfxptr = new SfxEntry(this,nptr);
-          build_sfxlist((AffEntry *)sfxptr); 
+          build_sfxtree((AffEntry *)sfxptr); 
       }
       nptr++;
    }      
diff --git a/src/myspell/affixmgr.hxx b/src/myspell/affixmgr.hxx
index 9abbf26..6cbd112 100644
--- a/src/myspell/affixmgr.hxx
+++ b/src/myspell/affixmgr.hxx
@@ -20,6 +20,10 @@ class AffixMgr
   int                 cpdmin;
   int                 numrep;
   replentry *         reptable;
+  int                 nummap;
+  mapentry *          maptable;
+  bool                nosplitsugs;
+
 
 public:
  
@@ -34,9 +38,12 @@ public:
   struct hentry *     lookup(const char * word);
   int                 get_numrep();
   struct replentry *  get_reptable();
+  int                 get_nummap();
+  struct mapentry *   get_maptable();
   char *              get_encoding();
   char *              get_try_string();
   char *              get_compound();
+  bool                get_nosplitsugs();
              
 private:
   int  parse_file(const char * affpath);
@@ -45,11 +52,16 @@ private:
   int  parse_cpdflag(char * line);
   int  parse_cpdmin(char * line);
   int  parse_reptable(char * line, FILE * af);
+  int  parse_maptable(char * line, FILE * af);
   int  parse_affix(char * line, const char at, FILE * af);
 
   void encodeit(struct affentry * ptr, char * cs);
-  int build_pfxlist(AffEntry* pfxptr);
-  int build_sfxlist(AffEntry* sfxptr);
+  int build_pfxtree(AffEntry* pfxptr);
+  int build_sfxtree(AffEntry* sfxptr);
+  AffEntry* process_sfx_in_order(AffEntry* ptr, AffEntry* nptr);
+  AffEntry* process_pfx_in_order(AffEntry* ptr, AffEntry* nptr);
+  int process_pfx_tree_to_list();
+  int process_sfx_tree_to_list();
   int process_pfx_order();
   int process_sfx_order();
 };
diff --git a/src/myspell/atypes.hxx b/src/myspell/atypes.hxx
index 4c67ba2..a10c69d 100644
--- a/src/myspell/atypes.hxx
+++ b/src/myspell/atypes.hxx
@@ -27,6 +27,11 @@ struct replentry {
   char * replacement;
 };
 
+struct mapentry {
+  char * set;
+  int len;
+};
+
 struct guessword {
   char * word;
   bool allow;
diff --git a/src/myspell/csutil.cxx b/src/myspell/csutil.cxx
index 498cf7c..0ecafda 100644
--- a/src/myspell/csutil.cxx
+++ b/src/myspell/csutil.cxx
@@ -3,7 +3,9 @@
 #include <cstdio>
 #include "csutil.hxx"
 
+#ifndef WINDOWS
 using namespace std;
+#endif
 
 // strip strings into token based on single char delimiter
 // acts like strsep() but only uses a delim char and not 
@@ -74,7 +76,7 @@ char * myrevstrdup(const char * s)
     return d; 
 }
 
-
+#if 0
 // return 1 if s1 is a leading subset of s2
 int isSubset(const char * s1, const char * s2)
 {
@@ -84,7 +86,30 @@ int isSubset(const char * s1, const char * s2)
   if (strncmp(s2,s1,l1) == 0) return 1;
   return 0;
 }
+#endif
+
+
+// return 1 if s1 is a leading subset of s2
+int isSubset(const char * s1, const char * s2)
+{
+  while( *s1 && *s2 && (*s1 == *s2) ) {
+    s1++;
+    s2++;
+  }
+  return (*s1 == '\0');
+}
+
 
+// return 1 if s1 (reversed) is a leading subset of end of s2
+int isRevSubset(const char * s1, const char * end_of_s2, int len)
+{
+  while( (len > 0) && *s1 && (*s1 == *end_of_s2) ) {
+    s1++;
+    end_of_s2--;
+    len --;
+  }
+  return (*s1 == '\0');
+}
 
 
 // convert null terminated string to all caps using encoding 
diff --git a/src/myspell/csutil.hxx b/src/myspell/csutil.hxx
index eb5be3b..037eab9 100644
--- a/src/myspell/csutil.hxx
+++ b/src/myspell/csutil.hxx
@@ -19,6 +19,9 @@ char * mystrsep(char ** sptr, const char delim);
 // is one string a leading subset of another   
 int    isSubset(const char * s1, const char * s2);
 
+// is one reverse string a leading subset of the end of another   
+int    isRevSubset(const char * s1, const char * end_of_s2, int s2_len);
+
 
 // character encoding information
 
diff --git a/src/myspell/hashmgr.cxx b/src/myspell/hashmgr.cxx
index 223674a..d7b4ec8 100644
--- a/src/myspell/hashmgr.cxx
+++ b/src/myspell/hashmgr.cxx
@@ -1,9 +1,7 @@
 #include "license.readme"
 
-#include <unistd.h>
 #include <cstdlib>
 #include <cstring>
-#include <fcntl.h>
 #include <cstdio>
 
 #include "hashmgr.hxx"
@@ -11,7 +9,9 @@
 extern void mychomp(char * s);
 extern char * mystrdup(const char *);
 
+#ifndef WINDOWS
 using namespace std;
+#endif
 
 
 // build a hash table from a munched word list
diff --git a/src/myspell/myspell.cxx b/src/myspell/myspell.cxx
index 264d1a5..6209898 100644
--- a/src/myspell/myspell.cxx
+++ b/src/myspell/myspell.cxx
@@ -6,7 +6,9 @@
 
 #include "myspell.hxx"
 
+#ifndef WINDOWS
 using namespace std;
+#endif
 
 
 MySpell::MySpell(const char * affpath, const char * dpath)
@@ -138,7 +140,23 @@ int MySpell::spell(const char * word)
                      break;
                    }
 
-     case ALLCAP:
+     case ALLCAP:  {
+                     memcpy(wspace,cw,(wl+1));
+                     mkallsmall(wspace, csconv);
+                     rv = check(wspace);
+                     if (!rv) {
+                        mkinitcap(wspace, csconv);
+                        rv = check(wspace);
+                     }
+                     if (!rv) rv = check(cw);
+                     if ((abbv) && !(rv)) {
+		         memcpy(wspace,cw,wl);
+                         *(wspace+wl) = '.';
+                         *(wspace+wl+1) = '\0';
+                         rv = check(wspace);
+                     }
+                     break; 
+                   }
      case INITCAP: { 
                      memcpy(wspace,cw,(wl+1));
                      mkallsmall(wspace, csconv);
@@ -247,8 +265,23 @@ int MySpell::suggest(char*** slst, const char * word)
   if (ns == 0) { 
      ns = pSMgr->ngsuggest(wlst, cw, pHMgr);
      if (ns) {
-       *slst = wlst;
-       return ns;
+         switch(captype) {
+	    case NOCAP:  break;
+            case HUHCAP: break; 
+            case INITCAP: { 
+                            for (int j=0; j < ns; j++)
+                              mkinitcap(wlst[j], csconv);
+                          }
+                          break;
+
+            case ALLCAP: { 
+                            for (int j=0; j < ns; j++)
+                              mkallcap(wlst[j], csconv);
+                         } 
+                         break;
+	 }
+         *slst = wlst;
+         return ns;
      }
   }
   if (ns < 0) {
diff --git a/src/myspell/myspell.hxx b/src/myspell/myspell.hxx
index 20b955c..0c18549 100644
--- a/src/myspell/myspell.hxx
+++ b/src/myspell/myspell.hxx
@@ -1,3 +1,6 @@
+#ifndef _MYSPELLMGR_HXX_
+#define _MYSPELLMGR_HXX_
+
 #include "hashmgr.hxx"
 #include "affixmgr.hxx"
 #include "suggestmgr.hxx"
@@ -8,11 +11,13 @@
 #define ALLCAP  2
 #define HUHCAP  3
 
+#ifdef WINDOWS
+#define DLLSUPPORT __declspec(dllexport)
+#else
+#define DLLSUPPORT
+#endif
 
-#ifndef _MYSPELLMGR_HXX_
-#define _MYSPELLMGR_HXX_
-
-class MySpell
+class DLLSUPPORT MySpell
 {
   AffixMgr*       pAMgr;
   HashMgr*        pHMgr;
diff --git a/src/myspell/suggestmgr.cxx b/src/myspell/suggestmgr.cxx
index dc8e646..4a756b3 100644
--- a/src/myspell/suggestmgr.cxx
+++ b/src/myspell/suggestmgr.cxx
@@ -7,7 +7,9 @@
 
 #include "suggestmgr.hxx"
 
+#ifndef WINDOWS
 using namespace std;
+#endif
 
 extern char * mystrdup(const char *);
 
@@ -24,6 +26,8 @@ SuggestMgr::SuggestMgr(const char * tryme, int maxn,
   if (ctry)
     ctryl = strlen(ctry);
   maxSug = maxn;
+  nosplitsugs=(0==1);   // default: two-word (split) suggestions stay enabled
+  if (pAMgr) nosplitsugs = pAMgr->get_nosplitsugs();   // result used to be discarded
 }
 
 
@@ -46,6 +50,10 @@ int SuggestMgr::suggest(char** wlst, int ns, const char * word)
     
     int nsug = ns;
 
+    // perhaps we chose the wrong char from a related set
+    if ((nsug < maxSug) && (nsug > -1))
+      nsug = mapchars(wlst, word, nsug);
+
     // perhaps we made a typical fault of spelling
     if ((nsug < maxSug) && (nsug > -1))
       nsug = replchars(wlst, word, nsug);
@@ -67,14 +75,66 @@ int SuggestMgr::suggest(char** wlst, int ns, const char * word)
       nsug = badchar(wlst, word, nsug);
 
     // perhaps we forgot to hit space and two words ran together
-    if ((nsug < maxSug) && (nsug > -1))
-      nsug = twowords(wlst, word, nsug);
-
+    if (!nosplitsugs) {
+        if ((nsug < maxSug) && (nsug > -1))
+           nsug = twowords(wlst, word, nsug);
+    }
     return nsug;
 }
 
 
 
+// suggestions for when the wrong char was chosen out of a related set
+int SuggestMgr::mapchars(char** wlst, const char * word, int ns)
+{
+  int wl = strlen(word);
+  if (wl < 2 || ! pAMgr) return ns;
+
+  int nummap = pAMgr->get_nummap();
+  struct mapentry* maptable = pAMgr->get_maptable();
+  if (maptable==NULL) return ns;
+  ns = map_related(word, 0, wlst, ns, maptable, nummap);
+  return ns;
+}
+
+
+// try every combination of related-character substitutions from position i
+// onward; appends accepted words to wlst, returns new count or -1 on error
+int SuggestMgr::map_related(const char * word, int i, char** wlst, int ns, const mapentry* maptable, int nummap) 
+{
+  char c = *(word + i);
+  if (c == 0) {
+      // end of word: keep it if it is new, spelled correctly and there is room
+      int cwrd = 1;
+      for (int m=0; m < ns; m++)
+	  if (strcmp(word,wlst[m]) == 0) cwrd = 0;
+      if ((cwrd) && check(word,strlen(word)) && (ns < maxSug)) {
+	  wlst[ns] = mystrdup(word);
+	  if (wlst[ns] == NULL) return -1;
+	  ns++;
+      }
+      return ns;
+  } 
+  int in_map = 0;
+  for (int j = 0; (j < nummap) && (ns > -1); j++) {
+    if (strchr(maptable[j].set,c) != 0) {
+      in_map = 1;
+      char * newword = strdup(word);
+      if (newword == NULL) return -1;
+      // stop at the first failure: ns == -1 must never be used to index wlst
+      for (int k = 0; (k < maptable[j].len) && (ns > -1); k++) {
+	*(newword + i) = *(maptable[j].set + k);
+	ns = map_related(newword, (i+1), wlst, ns, maptable, nummap);
+      }
+      free(newword);
+    }
+  }
+  if (!in_map) ns = map_related(word, i+1, wlst, ns, maptable, nummap);
+  return ns;
+}
+
+
+
 // suggestions for a typical fault of spelling, that
 // differs with more, than 1 letter from the right form.
 int SuggestMgr::replchars(char** wlst, const char * word, int ns)
@@ -85,11 +145,11 @@ int SuggestMgr::replchars(char** wlst, const char * word, int ns)
   int cwrd;
 
   int wl = strlen(word);
-  if (wl < 2 || ! pAMgr) return 0;
+  if (wl < 2 || ! pAMgr) return ns;
 
   int numrep = pAMgr->get_numrep();
   struct replentry* reptable = pAMgr->get_reptable();
-  if (reptable==NULL) return 0;
+  if (reptable==NULL) return ns;
 
   for (int i=0; i < numrep; i++ ) {
       r = word;
@@ -161,7 +221,7 @@ int SuggestMgr::extrachar(char** wlst, const char * word, int ns)
    int cwrd;
 
    int wl = strlen(word);
-   if (wl < 2) return 0;
+   if (wl < 2) return ns;
 
    // try omitting one char of word at a time
    strcpy (candidate, word + 1);
@@ -314,10 +374,12 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
   }
   lp = MAX_ROOTS - 1;
 
+  int n = strlen(word);
+
   struct hentry* hp = NULL;
   int col = -1;
   while ((hp = pHMgr->walk_hashtable(col, hp))) {
-    sc = ngram(3, word, hp->word, (1 == 0));
+    sc = ngram(3, word, hp->word, NGRAM_LONGER_WORSE);
     if (sc > scores[lp]) {
       scores[lp] = sc;
       roots[lp] = hp;
@@ -330,6 +392,21 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
     }  
   }
 
+  // find minimum threshold for a passable suggestion
+  // mangle original word three different ways
+  // and score them to generate a minimum acceptable score
+  int thresh = 0;
+  char * mw = NULL;
+  for (int sp = 1; sp < 4; sp++) {
+     mw = strdup(word);
+     for (int k=sp; k < n; k+=4) *(mw + k) = '*';
+     thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH);
+     free(mw);
+  }
+  mw = NULL;
+  thresh = thresh / 3;
+  thresh--;
+
   // now expand affixes on each of these root words and
   // and use length adjusted ngram scores to select
   // possible suggestions
@@ -353,28 +430,30 @@ int SuggestMgr::ngsuggest(char** wlst, char * word, HashMgr* pHMgr)
 	int nw = pAMgr->expand_rootword(glst, MAX_WORDS, rp->word, rp->wlen,
                                         rp->astr, rp->alen);
         for (int k = 0; k < nw; k++) {
-           sc = ngram(3, word, glst[k].word, (1==1));
-           if (sc > gscore[lp]) {
-	      if (guess[lp]) free (guess[lp]);
-              gscore[lp] = sc;
-              guess[lp] = glst[k].word;
-              lval = sc;
-              for (j=0; j < MAX_GUESS; j++)
-	         if (gscore[j] < lval) {
-	            lp = j;
-                    lval = gscore[j];
-	         }
-	   } else {
-              free (glst[k].word);  
-           }            
+           sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH);
+           // keep the candidate only if it passes the threshold and beats the
+           // weakest stored guess; every rejected candidate must be freed here
+           if ((sc > thresh) && (sc > gscore[lp])) {
+	      if (guess[lp]) free (guess[lp]);
+              gscore[lp] = sc;
+              guess[lp] = glst[k].word;
+              lval = sc;
+              for (j=0; j < MAX_GUESS; j++)
+	         if (gscore[j] < lval) {
+	            lp = j;
+                    lval = gscore[j];
+	         }
+	   } else {
+              free (glst[k].word);  
+           }
 	}
       }
-
   }
   if (glst) free(glst);
 
   // now we are done generating guesses
   // sort in order of decreasing score and copy over
+  
   bubblesort(&guess[0], &gscore[0], MAX_GUESS);
   int ns = 0;
   for (i=0; i < MAX_GUESS; i++) {
@@ -412,12 +491,11 @@ int SuggestMgr::check(const char * word, int len)
 
 
 // generate an n-gram score comparing s1 and s2
-int SuggestMgr::ngram(int n, char * s1, const char * s2, bool uselen)
+int SuggestMgr::ngram(int n, char * s1, const char * s2, int uselen)
 {
   int nscore = 0;
   int l1 = strlen(s1);
-  int l2 = l1;
-  if (uselen) l2 = strlen(s2);
+  int l2 = strlen(s2);
   int ns;
   for (int j=1;j<=n;j++) {
     ns = 0;
@@ -430,7 +508,9 @@ int SuggestMgr::ngram(int n, char * s1, const char * s2, bool uselen)
     nscore = nscore + ns;
     if (ns < 2) break;
   }
-  ns = abs(l1-l2) - 2;
+  ns = 0;
+  if (uselen == NGRAM_LONGER_WORSE) ns = (l2-l1)-2;
+  if (uselen == NGRAM_ANY_MISMATCH) ns = abs(l2-l1)-2;
   return (nscore - ((ns > 0) ? ns : 0));
 }
 
@@ -456,3 +536,4 @@ void SuggestMgr::bubblesort(char** rword, int* rsc, int n )
       }
       return;
 }
+
diff --git a/src/myspell/suggestmgr.hxx b/src/myspell/suggestmgr.hxx
index f78d94d..7c5a6e2 100644
--- a/src/myspell/suggestmgr.hxx
+++ b/src/myspell/suggestmgr.hxx
@@ -4,7 +4,12 @@
 #define MAXSWL 100
 #define MAX_ROOTS 10
 #define MAX_WORDS 500
-#define MAX_GUESS 5
+#define MAX_GUESS 10
+
+#define NGRAM_IGNORE_LENGTH 0
+#define NGRAM_LONGER_WORSE  1
+#define NGRAM_ANY_MISMATCH  2
+
 
 #include "atypes.hxx"
 #include "affixmgr.hxx"
@@ -16,7 +21,8 @@ class SuggestMgr
   int             ctryl;
   AffixMgr*       pAMgr;
   int             maxSug;
- 
+  bool            nosplitsugs;
+
 public:
   SuggestMgr(const char * tryme, int maxn, AffixMgr *aptr);
   ~SuggestMgr();
@@ -27,12 +33,14 @@ public:
 
 private:
    int replchars(char**, const char *, int);
+   int mapchars(char**, const char *, int);
+   int map_related(const char *, int, char ** wlst, int, const mapentry*, int);
    int forgotchar(char **, const char *, int);
    int swapchar(char **, const char *, int);
    int extrachar(char **, const char *, int);
    int badchar(char **, const char *, int);
    int twowords(char **, const char *, int);
-   int ngram(int n, char * s1, const char * s2, bool uselen);
+   int ngram(int n, char * s1, const char * s2, int uselen);
    void bubblesort( char ** rwd, int * rsc, int n);
 };
author	Dom Lachowicz <domlachowicz@gmail.com>	2004-01-12 04:09:01 +0000
committer	Dom Lachowicz <domlachowicz@gmail.com>	2004-01-12 04:09:01 +0000
commit	19ca5d04f57a178d77287b0a303adb293491ef20 (patch)
tree	93ee3d237f3a83ad2c9060c602e099491292a70d
parent	8b23ed4ec5e4769f66296b4e87ccbc6f4bfb9142 (diff)
download	enchant-19ca5d04f57a178d77287b0a303adb293491ef20.tar.gz