8 files changed, 110 insertions, 20 deletions
diff --git a/Makefile.in b/Makefile.in
index 37a01c6a4..a8d0f932a 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -117,6 +117,11 @@ COMPILE_INITS=@COMPILE_INITS@
 
 GS_LIB_DEFAULT=$(gsdatadir)/Resource/Init:$(gsdatadir)/lib:$(gsdatadir)/Resource/Font:$(gsdir)/fonts:@fontpath@
 
+# Define the default search path for Tesseract data files.  Separate multiple
+# directories with a ':'.
+
+TESSDATA=@tessdata@
+
 # Define the default directory for cached data files
 # this must be a single path.
 
diff --git a/base/lib.mak b/base/lib.mak
index 5030a0141..ad39310bc 100644
--- a/base/lib.mak
+++ b/base/lib.mak
@@ -3359,7 +3359,7 @@ $(GLD)romfs0.dev :  $(LIB_MAK) $(ECHOGS_XE) $(LIB_MAK) $(MAKEDIRS)
 $(GLGEN)gsromfs1_.c : $(MKROMFS_XE) $(PS_ROMFS_DEPS) $(LIB_MAK) $(MAKEDIRS)
 	$(EXP)$(MKROMFS_XE) -o $(GLGEN)gsromfs1_.c \
 	$(MKROMFS_FLAGS) -X .svn -X CVS -P $(GLSRCDIR)$(D)..$(D) iccprofiles$(D)* \
-	$(PS_ROMFS_ARGS) $(PS_FONT_ROMFS_ARGS) $(GL_ROMFS_ARGS) $(TESS_ROMFS_ARGS)
+	$(TESS_ROMFS_ARGS) $(PS_ROMFS_ARGS) $(PS_FONT_ROMFS_ARGS) $(GL_ROMFS_ARGS)
 
 $(GLGEN)gsromfs1_1.c : $(MKROMFS_XE) $(PS_ROMFS_DEPS) $(LIB_MAK) $(MAKEDIRS)
 	$(EXP)$(MKROMFS_XE) -o $(GLGEN)gsromfs1_1.c \
diff --git a/base/ocr.mak b/base/ocr.mak
index 20acc85e4..3f1b32d0d 100644
--- a/base/ocr.mak
+++ b/base/ocr.mak
@@ -28,7 +28,7 @@ $(GLGEN)libocr.dev : $(LIBOCR_MAK) $(ECHOGS_XE)$(MAKEDIRS)\
 # Tesseract veneer.
 $(GLGEN)tessocr.$(OBJ) : $(GLSRC)tessocr.cpp $(GLSRC)tessocr.h $(LIBOCR_MAK) \
 	$(gsmemory_h) $(gxiodev_h) $(stream_h) $(TESSDEPS)
-	$(TESSCXX) $(D_)LEPTONICA_INTERCEPT_MALLOC=1$(_D) $(I_)$(LEPTONICADIR)$(D)src$(_I) $(GLO_)tessocr.$(OBJ) $(C_) $(GLSRC)tessocr.cpp
+	$(TESSCXX) $(D_)LEPTONICA_INTERCEPT_MALLOC=1$(_D) $(I_)$(LEPTONICADIR)$(D)src$(_I) $(GLO_)tessocr.$(OBJ) $(C_) $(D_)TESSDATA="$(TESSDATA)"$(_D) $(GLSRC)tessocr.cpp
 
 # 0 = No version.
 
diff --git a/base/tesseract.mak b/base/tesseract.mak
index b499cc79a..c2bc1fb5a 100644
--- a/base/tesseract.mak
+++ b/base/tesseract.mak
@@ -1165,4 +1165,4 @@ TESSERACT_LEGACY_OBJS=\
 TESSERACT_LEGACY=
 
 TESS_ROMFS_ARGS=\
-	-c -d Resource/ -P .$(D)Resource$(D) Tesseract$(D)*
+	-c -P $(GLSRCDIR)$(D)..$(D) tessdata$(D)*
diff --git a/base/tessocr.cpp b/base/tessocr.cpp
index 225b75764..8ce19a14e 100644
--- a/base/tessocr.cpp
+++ b/base/tessocr.cpp
@@ -147,22 +147,71 @@ fail:
 }
 
 static bool
+load_file_from_path(const char *path, const char *file, GenericVector<char> *out)
+{
+    /* Try to load 'file' from each directory in 'path' (a list separated by
+     * gp_file_name_list_separator) in order. Returns the result of the first
+     * load_file() call that succeeds, or false if none does (or on OOM). */
+    const char *sep = gp_file_name_directory_separator();
+    size_t seplen = strlen(sep);
+    size_t bufsize = strlen(path) + seplen + strlen(file) + 1; /* Longest case. */
+    const char *s, *e;
+    bool ret = false;
+    char *buf = (char *)gs_alloc_bytes(leptonica_mem, bufsize, "load_file_from_path");
+    if (buf == NULL)
+        return false;
+    s = path;
+    do {
+        e = s; /* Scan from the current entry, not the list start, so e >= s. */
+        while (*e && *e != gp_file_name_list_separator)
+            e++;
+        memcpy(buf, s, e-s);
+        memcpy(&buf[e-s], sep, seplen);
+        strcpy(&buf[e-s+seplen], file);
+        ret = load_file(buf, out);
+        if (ret)
+            break;
+        s = e;
+        while (*s == gp_file_name_list_separator)
+            s++;
+    } while (*s != 0);
+    gs_free_object(leptonica_mem, buf, "load_file_from_path");
+    return ret;
+}
+
+#ifndef TESSDATA
+#define TESSDATA tessdata
+#endif
+#define STRINGIFY2(S) #S
+#define STRINGIFY(S) STRINGIFY2(S) /* Extra level so TESSDATA is expanded before '#'. */
+static const char *tessdata_prefix = STRINGIFY(TESSDATA); /* Built-in search path. */
+
+static bool
 tess_file_reader(const char *fname, GenericVector<char> *out)
 {
     const char *file = fname;
     const char *s;
     char text[PATH_MAX];
     int code = 0;
+    bool found;
     stream *ps;
     gx_io_device *iodev;
 
+    /* fname, as supplied to us by Tesseract has TESSDATA_PREFIX prepended
+     * to it. Check that first. */
+    found = load_file(fname, out);
+    if (found)
+            return found;
+
+    /* Find file, fname with any prefix removed, and use that in
+     * the rest of the searches. */
     for (s = fname; *s; s++)
         if (*s == '\\' || *s == '/')
             file = s+1;
 
-    /* FIXME: Try loading 'file' from gs specific paths */
+    /* Next look in romfs in the tessdata directory. */
     iodev = gs_findiodevice(leptonica_mem, (const byte *)"%rom", 4);
-    gs_snprintf(text, sizeof(text), "Resource/Tesseract/%s", file);
+    gs_snprintf(text, sizeof(text), "tessdata/%s", file);
     if (iodev) {
         long size;
         long i;
@@ -195,12 +244,13 @@ tess_file_reader(const char *fname, GenericVector<char> *out)
         }
     }
 
-    /* Fall back to gp_file access, first under Resource/Tesseract */
-    if (load_file(text, out))
-        return true;
+    /* Fall back to gp_file access under our configured tessdata path. */
+    found = load_file_from_path(tessdata_prefix, file, out);
+    if (found)
+        return found;
 
-    /* Then under TESSDATA */
-    return load_file(fname, out);
+    /* If all else fails, look in the current directory. */
+    return load_file(file, out);
 }
 
 int
diff --git a/configure.ac b/configure.ac
index 8bd7de3c7..640aca7f0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3198,6 +3198,16 @@ fi
 
 AC_SUBST(fontpath)
 
+dnl look for default tessdata...
+AC_ARG_WITH([tessdata],  AC_HELP_STRING([--with-tessdata],
+    [set tesseract data search path]), tessdata="$withval", tessdata="")
+
+if test "x$tessdata" = "x"; then
+        tessdata="${datadir}/tessdata"
+fi
+
+AC_SUBST(tessdata)
+
 dnl --------------------------------------------------
 dnl Check for library functions
 dnl --------------------------------------------------
diff --git a/doc/Devices.htm b/doc/Devices.htm
index b4dee7e62..ec21350e9 100644
--- a/doc/Devices.htm
+++ b/doc/Devices.htm
@@ -70,13 +70,17 @@
 <li><a href="#BMP">BMP file format</a></li>
 <li><a href="#PCX">PCX file format</a></li>
 <li><a href="#PSD">PSD file format (DeviceN color model)</a></li>
+<li><a href="#PDFimage">Bitmap PDF output, PCLm output</a></li>
+</ul>
+<li><a href="#OCR-Devices">OCR Devices</a></li>
+<ul>
+<li><a href="#OCR">OCR text output</a></li>
+<li><a href="#PDFocr">Bitmap PDF output (with OCR text)</a></li>
 </ul>
 <li><a href="#High-level">High level formats</a></li>
 <ul>
 <li><a href="#PDF">PDF file output</a></li>
-<li><a href="#PDFimage">Bitmap PDF output, PCLm output</a></li>
 <li><a href="#OCR">OCR devices</a></li>
-<li><a href="#PDFocr">Bitmap PDF output (with OCR text)</a></li>
 <li><a href="#PS">PostScript file output</a></li>
 <li><a href="#EPS">EPS file output</a></li>
 <li><a href="#PXL">PCL-XL file output</a></li>
@@ -954,9 +958,11 @@ of 'high-level' formats. These allow Ghostscript to preserve (as much as
 possible) the drawing elements of the input file maintaining flexibility,
 resolution independence, and editability.</p>
 
-<h2><a name="High-level"></a>High-level devices</h2>
+<hr>
 
-<h3><a name="OCR"></a>Optical Character Recognition (OCR) output</h3>
+<h2><a name="OCR-Devices"></a>Optical Character Recognition (OCR) devices</h2>
+
+<h3><a name="OCR"></a>OCR text output</h3>
 
 <p>
   These devices render internally in 8 bit greyscale, and then
@@ -974,12 +980,23 @@ resolution independence, and editability.</p>
   standard Tesseract tools.
 </p>
 <p>
-  These files are looked for from a variety of places. Firstly,
-  any files placed in &quot;Resource/Tesseract/&quot; will be
-  included in the binary for any standard (COMPILE_INITS=1) build.
-  Secondly, files will be searched for in the current directory.
-  Thirdly, files will be searched for in the directory given by
-  the environment variable TESSDATA_PREFIX.
+  These files are searched for in a variety of places, in the following order:
+</p>
+<ul>
+  <li>Firstly, files will be searched for in the directory given by the
+    environment variable TESSDATA_PREFIX.</li>
+  <li>Next, they will be searched for within the ROM filing system. Any
+    files placed in &quot;tessdata&quot; will be included within the ROM
+    filing system in the binary for any standard (COMPILE_INITS=1) build.</li>
+  <li>Next, files will be searched for in the configured 'tessdata' path. On
+    Unix, this can be specified at the configure stage using
+    '--with-tessdata=&lt;path&gt;' (where &lt;path&gt; is a list of
+    directories to search, separated by ':' (on Unix) or ';' (on Windows)).</li>
+  <li>Finally, we resort to searching the current directory.</li>
+</ul>
+<p>
+  Please note, this pattern of directory searching differs from the original
+  release of the OCR devices.
 </p>
 <p>
   By default, the OCR process defaults to looking for English text,
@@ -1042,6 +1059,8 @@ resolution independence, and editability.</p>
 </p>
 <p>
 
+<hr>
+
 <h2><a name="High-level"></a>High-level devices</h2>
 
 <h3><a name="PDF"></a>PDF writer</h3>
diff --git a/psi/msvc.mak b/psi/msvc.mak
index f9f4d6d65..09983671b 100644
--- a/psi/msvc.mak
+++ b/psi/msvc.mak
@@ -308,6 +308,12 @@ AROOTDIR=c:/gs
 GSROOTDIR=$(AROOTDIR)/gs$(GS_DOT_VERSION)
 !endif
 
+# Define the directory to look in for tesseract data.
+
+!ifndef TESSDATA
+TESSDATA=$(GSROOTDIR)/tessdata
+!endif
+
 # Define the directory that will hold documentation at runtime.
 
 !ifndef GS_DOCDIR