diff options
-rw-r--r-- | Makefile.in | 5 | ||||
-rw-r--r-- | base/lib.mak | 2 | ||||
-rw-r--r-- | base/ocr.mak | 2 | ||||
-rw-r--r-- | base/tesseract.mak | 2 | ||||
-rw-r--r-- | base/tessocr.cpp | 64 | ||||
-rw-r--r-- | configure.ac | 10 | ||||
-rw-r--r-- | doc/Devices.htm | 39 | ||||
-rw-r--r-- | psi/msvc.mak | 6 |
8 files changed, 110 insertions, 20 deletions
diff --git a/Makefile.in b/Makefile.in index 37a01c6a4..a8d0f932a 100644 --- a/Makefile.in +++ b/Makefile.in @@ -117,6 +117,11 @@ COMPILE_INITS=@COMPILE_INITS@ GS_LIB_DEFAULT=$(gsdatadir)/Resource/Init:$(gsdatadir)/lib:$(gsdatadir)/Resource/Font:$(gsdir)/fonts:@fontpath@ +# Define the default search path for Tesseract. Separate multiple directories +# with a :. + +TESSDATA=@tessdata@ + # Define the default directory for cached data files # this must be a single path. diff --git a/base/lib.mak b/base/lib.mak index 5030a0141..ad39310bc 100644 --- a/base/lib.mak +++ b/base/lib.mak @@ -3359,7 +3359,7 @@ $(GLD)romfs0.dev : $(LIB_MAK) $(ECHOGS_XE) $(LIB_MAK) $(MAKEDIRS) $(GLGEN)gsromfs1_.c : $(MKROMFS_XE) $(PS_ROMFS_DEPS) $(LIB_MAK) $(MAKEDIRS) $(EXP)$(MKROMFS_XE) -o $(GLGEN)gsromfs1_.c \ $(MKROMFS_FLAGS) -X .svn -X CVS -P $(GLSRCDIR)$(D)..$(D) iccprofiles$(D)* \ - $(PS_ROMFS_ARGS) $(PS_FONT_ROMFS_ARGS) $(GL_ROMFS_ARGS) $(TESS_ROMFS_ARGS) + $(TESS_ROMFS_ARGS) $(PS_ROMFS_ARGS) $(PS_FONT_ROMFS_ARGS) $(GL_ROMFS_ARGS) $(GLGEN)gsromfs1_1.c : $(MKROMFS_XE) $(PS_ROMFS_DEPS) $(LIB_MAK) $(MAKEDIRS) $(EXP)$(MKROMFS_XE) -o $(GLGEN)gsromfs1_1.c \ diff --git a/base/ocr.mak b/base/ocr.mak index 20acc85e4..3f1b32d0d 100644 --- a/base/ocr.mak +++ b/base/ocr.mak @@ -28,7 +28,7 @@ $(GLGEN)libocr.dev : $(LIBOCR_MAK) $(ECHOGS_XE)$(MAKEDIRS)\ # Tesseract veneer. $(GLGEN)tessocr.$(OBJ) : $(GLSRC)tessocr.cpp $(GLSRC)tessocr.h $(LIBOCR_MAK) \ $(gsmemory_h) $(gxiodev_h) $(stream_h) $(TESSDEPS) - $(TESSCXX) $(D_)LEPTONICA_INTERCEPT_MALLOC=1$(_D) $(I_)$(LEPTONICADIR)$(D)src$(_I) $(GLO_)tessocr.$(OBJ) $(C_) $(GLSRC)tessocr.cpp + $(TESSCXX) $(D_)LEPTONICA_INTERCEPT_MALLOC=1$(_D) $(I_)$(LEPTONICADIR)$(D)src$(_I) $(GLO_)tessocr.$(OBJ) $(C_) $(D_)TESSDATA="$(TESSDATA)"$(_D) $(GLSRC)tessocr.cpp # 0 = No version. 
diff --git a/base/tesseract.mak b/base/tesseract.mak index b499cc79a..c2bc1fb5a 100644 --- a/base/tesseract.mak +++ b/base/tesseract.mak @@ -1165,4 +1165,4 @@ TESSERACT_LEGACY_OBJS=\ TESSERACT_LEGACY= TESS_ROMFS_ARGS=\ - -c -d Resource/ -P .$(D)Resource$(D) Tesseract$(D)* + -c -P $(GLSRCDIR)$(D)..$(D) tessdata$(D)* diff --git a/base/tessocr.cpp b/base/tessocr.cpp index 225b75764..8ce19a14e 100644 --- a/base/tessocr.cpp +++ b/base/tessocr.cpp @@ -147,22 +147,71 @@ fail: } static bool +load_file_from_path(const char *path, const char *file, GenericVector<char> *out) +{ + const char *sep = gp_file_name_directory_separator(); + size_t seplen = strlen(sep); + size_t bufsize = strlen(path) + seplen + strlen(file) + 1; + const char *s, *e; /* s: start of the current directory entry in path; e: one past its end */ + bool ret = 0; + char *buf = (char *)gs_alloc_bytes(leptonica_mem, bufsize, "load_file_from_path"); + if (buf == NULL) + return 0; + + s = path; + do { /* try each directory of the separator-delimited list in turn, stopping at the first hit */ + e = s; /* scan from the current entry; restarting at path made e-s negative from the second entry on */ + while (*e && *e != gp_file_name_list_separator) + e++; + memcpy(buf, s, e-s); + memcpy(&buf[e-s], sep, seplen); + strcpy(&buf[e-s+seplen], file); + ret = load_file(buf, out); /* buf now holds <directory><sep><file> */ + if (ret) + break; + s = e; + while (*s == gp_file_name_list_separator) /* step over the separator(s) to the next entry */ + s++; + } while (*s != 0); + + gs_free_object(leptonica_mem, buf, "load_file_from_path"); + + return ret; +} + +#ifndef TESSDATA +#define TESSDATA tessdata +#endif +#define STRINGIFY2(S) #S +#define STRINGIFY(S) STRINGIFY2(S) +static const char *tessdata_prefix = STRINGIFY(TESSDATA); /* const: points at a string literal */ + +static bool tess_file_reader(const char *fname, GenericVector<char> *out) { const char *file = fname; const char *s; char text[PATH_MAX]; int code = 0; + bool found; stream *ps; gx_io_device *iodev; + /* fname, as supplied to us by Tesseract has TESSDATA_PREFIX prepended + * to it. Check that first. */ + found = load_file(fname, out); + if (found) + return found; + + /* Find file, fname with any prefix removed, and use that in + * the rest of the searches. 
*/ for (s = fname; *s; s++) if (*s == '\\' || *s == '/') file = s+1; - /* FIXME: Try loading 'file' from gs specific paths */ + /* Next look in romfs in the tessdata directory. */ iodev = gs_findiodevice(leptonica_mem, (const byte *)"%rom", 4); - gs_snprintf(text, sizeof(text), "Resource/Tesseract/%s", file); + gs_snprintf(text, sizeof(text), "tessdata/%s", file); if (iodev) { long size; long i; @@ -195,12 +244,13 @@ tess_file_reader(const char *fname, GenericVector<char> *out) } } - /* Fall back to gp_file access, first under Resource/Tesseract */ - if (load_file(text, out)) - return true; + /* Fall back to gp_file access under our configured tessdata path. */ + found = load_file_from_path(tessdata_prefix, file, out); + if (found) + return found; - /* Then under TESSDATA */ - return load_file(fname, out); + /* If all else fails, look in the current directory. */ + return load_file(file, out); } int diff --git a/configure.ac b/configure.ac index 8bd7de3c7..640aca7f0 100644 --- a/configure.ac +++ b/configure.ac @@ -3198,6 +3198,16 @@ fi AC_SUBST(fontpath) +dnl look for default tessdata... 
+AC_ARG_WITH([tessdata], AC_HELP_STRING([--with-tessdata], + [set tesseract data search path]), tessdata="$withval", tessdata="") + +if test "x$tessdata" = "x"; then + tessdata="${datadir}/tessdata" +fi + +AC_SUBST(tessdata) + dnl -------------------------------------------------- dnl Check for library functions dnl -------------------------------------------------- diff --git a/doc/Devices.htm b/doc/Devices.htm index b4dee7e62..ec21350e9 100644 --- a/doc/Devices.htm +++ b/doc/Devices.htm @@ -70,13 +70,17 @@ <li><a href="#BMP">BMP file format</a></li> <li><a href="#PCX">PCX file format</a></li> <li><a href="#PSD">PSD file format (DeviceN color model)</a></li> +<li><a href="#PDFimage">Bitmap PDF output, PCLm output</a></li> +</ul> +<li><a href="#OCR-Devices">OCR Devices</a></li> +<ul> +<li><a href="#OCR">OCR text output</a></li> +<li><a href="#PDFocr">Bitmap PDF output (with OCR text)</a></li> </ul> <li><a href="#High-level">High level formats</a></li> <ul> <li><a href="#PDF">PDF file output</a></li> -<li><a href="#PDFimage">Bitmap PDF output, PCLm output</a></li> <li><a href="#OCR">OCR devices</a></li> -<li><a href="#PDFocr">Bitmap PDF output (with OCR text)</a></li> <li><a href="#PS">PostScript file output</a></li> <li><a href="#EPS">EPS file output</a></li> <li><a href="#PXL">PCL-XL file output</a></li> @@ -954,9 +958,11 @@ of 'high-level' formats. These allow Ghostscript to preserve (as much as possible) the drawing elements of the input file maintaining flexibility, resolution independence, and editability.</p> -<h2><a name="High-level"></a>High-level devices</h2> +<hr> -<h3><a name="OCR"></a>Optical Character Recognition (OCR) output</h3> +<h2><a name="OCR-Devices"></a>Optical Character Recognition (OCR) devices</h2> + +<h3><a name="OCR"></a>OCR text output</h3> <p> These devices render internally in 8 bit greyscale, and then @@ -974,12 +980,23 @@ resolution independence, and editability.</p> standard Tesseract tools. 
</p> <p> - These files are looked for from a variety of places. Firstly, - any files placed in "Resource/Tesseract/" will be - included in the binary for any standard (COMPILE_INITS=1) build. - Secondly, files will be searched for in the current directory. - Thirdly, files will be searched for in the directory given by - the environment variable TESSDATA_PREFIX. + These files are looked for from a variety of places. +</p> +<ul> + <li>Firstly, files will be searched for in the directory given by the + environment variable TESSDATA_PREFIX. + <li>Next, they will be searched for within the ROM filing system. Any + files placed in "tessdata" will be included within the ROM + filing system in the binary for any standard (COMPILE_INITS=1) build. + <li>Next, files will be searched for in the configured 'tessdata' path. On + Unix, this can be specified at the configure stage using + '--with-tessdata=<path>' (where <path> is a list of + directories to search, separated by ':' (on Unix) or ';' (on Windows)). + <li>Finally, we resort to searching the current directory. +</ul> +<p> + Please note, this pattern of directory searching differs from the original + release of the OCR devices. </p> <p> By default, the OCR process defaults to looking for English text, @@ -1042,6 +1059,8 @@ resolution independence, and editability.</p> </p> <p> +<hr> + <h2><a name="High-level"></a>High-level devices</h2> <h3><a name="PDF"></a>PDF writer</h3> diff --git a/psi/msvc.mak b/psi/msvc.mak index f9f4d6d65..09983671b 100644 --- a/psi/msvc.mak +++ b/psi/msvc.mak @@ -308,6 +308,12 @@ AROOTDIR=c:/gs GSROOTDIR=$(AROOTDIR)/gs$(GS_DOT_VERSION) !endif +# Define the directory to look in for tesseract data. + +!ifndef TESSDATA +TESSDATA=$(GSROOTDIR)/tessdata +!endif + # Define the directory that will hold documentation at runtime. !ifndef GS_DOCDIR |