summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobin Watts <Robin.Watts@artifex.com>2020-08-25 15:17:45 +0100
committerRobin Watts <Robin.Watts@artifex.com>2020-08-27 12:34:23 +0100
commit367c3858886d1c43b75e3ea48f770d80db800d15 (patch)
tree1674654577ea0f0e6176e4be3df2ce217cd7b4e9
parent30c56c669b05dd09518fad60a8f0ae8de4bcf186 (diff)
downloadghostpdl-367c3858886d1c43b75e3ea48f770d80db800d15.tar.gz
Add OCREngine parameter.
-rw-r--r--base/tessocr.cpp37
-rw-r--r--base/tessocr.h15
-rw-r--r--devices/gdevocr.c20
-rw-r--r--devices/gdevpdfimg.h1
-rw-r--r--devices/gdevpdfocr.c17
-rw-r--r--doc/Devices.htm22
6 files changed, 97 insertions, 15 deletions
diff --git a/base/tessocr.cpp b/base/tessocr.cpp
index 26e5432c7..057a4a369 100644
--- a/base/tessocr.cpp
+++ b/base/tessocr.cpp
@@ -186,9 +186,10 @@ tess_file_reader(const char *fname, GenericVector<char> *out)
}
int
-ocr_init_api(gs_memory_t *mem, const char *language, void **state)
+ocr_init_api(gs_memory_t *mem, const char *language, int engine, void **state)
{
tesseract::TessBaseAPI *api;
+ enum tesseract::OcrEngineMode mode;
leptonica_mem = mem->non_gc_memory;
setPixMemoryManager(my_leptonica_malloc, my_leptonica_free);
@@ -202,10 +203,28 @@ ocr_init_api(gs_memory_t *mem, const char *language, void **state)
return_error(gs_error_VMerror);
}
+ switch (engine)
+ {
+ case OCR_ENGINE_DEFAULT:
+ mode = tesseract::OcrEngineMode::OEM_DEFAULT;
+ break;
+ case OCR_ENGINE_LSTM:
+ mode = tesseract::OcrEngineMode::OEM_LSTM_ONLY;
+ break;
+ case OCR_ENGINE_LEGACY:
+ mode = tesseract::OcrEngineMode::OEM_TESSERACT_ONLY;
+ break;
+ case OCR_ENGINE_BOTH:
+ mode = tesseract::OcrEngineMode::OEM_TESSERACT_LSTM_COMBINED;
+ break;
+ default:
+ return_error(gs_error_rangecheck);
+ }
+
// Initialize tesseract-ocr with English, without specifying tessdata path
if (api->Init(NULL, 0, /* data, data_size */
language,
- tesseract::OcrEngineMode::OEM_DEFAULT,
+ mode,
NULL, 0, /* configs, configs_size */
NULL, NULL, /* vars_vec */
false, /* set_only_non_debug_params */
@@ -265,8 +284,7 @@ do_ocr_image(gs_memory_t *mem,
int w, int h, int bpp, int raster,
int xres, int yres, void *data, int restore,
int hocr, int pagecount,
- const char *language,
- char **out)
+ const char *language, int engine, char **out)
{
char *outText;
tesseract::TessBaseAPI *api;
@@ -277,7 +295,7 @@ do_ocr_image(gs_memory_t *mem,
if (language == NULL || *language == 0)
language = "eng";
- code = ocr_init_api(mem, language, (void **)&api);
+ code = ocr_init_api(mem, language, engine, (void **)&api);
if (code < 0)
return code;
@@ -328,19 +346,20 @@ do_ocr_image(gs_memory_t *mem,
int ocr_image_to_hocr(gs_memory_t *mem,
int w, int h, int bpp, int raster,
int xres, int yres, void *data, int restore,
- int pagecount, const char *language, char **out)
+ int pagecount, const char *language,
+ int engine, char **out)
{
return do_ocr_image(mem, w, h, bpp, raster, xres, yres, data,
- restore, 1, pagecount, language, out);
+ restore, 1, pagecount, language, engine, out);
}
int ocr_image_to_utf8(gs_memory_t *mem,
int w, int h, int bpp, int raster,
int xres, int yres, void *data, int restore,
- const char *language, char **out)
+ const char *language, int engine, char **out)
{
return do_ocr_image(mem, w, h, bpp, raster, xres, yres, data,
- restore, 0, 0, language, out);
+ restore, 0, 0, language, engine, out);
}
int
diff --git a/base/tessocr.h b/base/tessocr.h
index c5e7967b9..78c30a0d4 100644
--- a/base/tessocr.h
+++ b/base/tessocr.h
@@ -21,18 +21,27 @@
#include "gsmemory.h"
+enum
+{
+ OCR_ENGINE_DEFAULT = 0,
+ OCR_ENGINE_LSTM = 1,
+ OCR_ENGINE_LEGACY = 2,
+ OCR_ENGINE_BOTH = 3
+};
+
int ocr_image_to_utf8(gs_memory_t *mem,
int w, int h, int bpp, int raster,
int xres, int yres,
void *data, int restore_data,
- const char *language, char **out);
+ const char *language, int engine, char **out);
int ocr_image_to_hocr(gs_memory_t *mem,
int w, int h, int bpp, int raster,
int xres, int yres, void *data, int restore,
- int pagecount, const char *language, char **out);
+ int pagecount, const char *language,
+ int engine, char **out);
-int ocr_init_api(gs_memory_t *mem, const char *language, void **state);
+int ocr_init_api(gs_memory_t *mem, const char *language, int engine, void **state);
void ocr_fin_api(gs_memory_t *mem, void *api_);
diff --git a/devices/gdevocr.c b/devices/gdevocr.c
index ddfaa809e..44cc72341 100644
--- a/devices/gdevocr.c
+++ b/devices/gdevocr.c
@@ -43,6 +43,7 @@ struct gx_device_ocr_s {
gx_prn_device_common;
gx_downscaler_params downscale;
char language[1024];
+ int engine;
int page_count;
};
@@ -127,6 +128,9 @@ ocr_get_params(gx_device * dev, gs_param_list * plist)
if ((code = param_write_string(plist, "OCRLanguage", &langstr)) < 0)
ecode = code;
+ if ((code = param_write_int(plist, "OCREngine", &pdev->engine)) < 0) /* int param; read back with param_read_int in ocr_put_params */
+ ecode = code;
+
if ((code = gx_downscaler_write_params(plist, &pdev->downscale,
GX_DOWNSCALER_PARAMS_MFS)) < 0)
ecode = code;
@@ -146,6 +150,7 @@ ocr_put_params(gx_device *dev, gs_param_list *plist)
gs_param_string langstr;
const char *param_name;
size_t len;
+ int engine;
switch (code = param_read_string(plist, (param_name = "OCRLanguage"), &langstr)) {
case 0:
@@ -162,6 +167,17 @@ ocr_put_params(gx_device *dev, gs_param_list *plist)
param_signal_error(plist, param_name, ecode);
}
+ switch (code = param_read_int(plist, (param_name = "OCREngine"), &engine)) {
+ case 0:
+ pdev->engine = engine;
+ break;
+ case 1:
+ break;
+ default:
+ ecode = code;
+ param_signal_error(plist, param_name, ecode);
+ }
+
code = gx_downscaler_read_params(plist, &pdev->downscale,
GX_DOWNSCALER_PARAMS_MFS);
if (code < 0)
@@ -225,14 +241,14 @@ do_ocr_print_page(gx_device_ocr * pdev, gp_file * file, int hocr)
(int)pdev->HWResolution[0],
(int)pdev->HWResolution[1],
data, 0, pdev->page_count,
- "eng", &out);
+ "eng", pdev->engine, &out);
else
code = ocr_image_to_utf8(pdev->memory,
width, height,
8, raster,
(int)pdev->HWResolution[0],
(int)pdev->HWResolution[1],
- data, 0, "eng", &out);
+ data, 0, "eng", pdev->engine, &out);
if (code < 0)
goto done;
if (out)
diff --git a/devices/gdevpdfimg.h b/devices/gdevpdfimg.h
index baf8175e0..3fe7aac22 100644
--- a/devices/gdevpdfimg.h
+++ b/devices/gdevpdfimg.h
@@ -64,6 +64,7 @@ typedef struct gx_device_pdf_image_s {
/* OCR data */
struct {
char language[1024];
+ int engine;
void *state;
/* Number of "file level" objects - i.e. the number of objects
diff --git a/devices/gdevpdfocr.c b/devices/gdevpdfocr.c
index 95d358b18..b03116945 100644
--- a/devices/gdevpdfocr.c
+++ b/devices/gdevpdfocr.c
@@ -63,6 +63,7 @@ pdfocr_put_some_params(gx_device * dev, gs_param_list * plist)
gs_param_string langstr;
const char *param_name;
size_t len;
+ int engine;
switch (code = param_read_string(plist, (param_name = "OCRLanguage"), &langstr)) {
case 0:
@@ -79,6 +80,17 @@ pdfocr_put_some_params(gx_device * dev, gs_param_list * plist)
param_signal_error(plist, param_name, ecode);
}
+ switch (code = param_read_int(plist, (param_name = "OCREngine"), &engine)) {
+ case 0:
+ pdf_dev->ocr.engine = engine;
+ break;
+ case 1:
+ break;
+ default:
+ ecode = code;
+ param_signal_error(plist, param_name, ecode);
+ }
+
return code;
}
@@ -120,6 +132,9 @@ pdfocr_get_some_params(gx_device * dev, gs_param_list * plist)
if ((code = param_write_string(plist, "OCRLanguage", &langstr)) < 0)
ecode = code;
+ if ((code = param_write_int(plist, "OCREngine", &pdf_dev->ocr.engine)) < 0)
+ ecode = code;
+
return ecode;
}
@@ -416,7 +431,7 @@ ocr_file_init(gx_device_pdf_image *dev)
stream_write(dev->strm, funky_font6a, sizeof(funky_font6a));
stream_write(dev->strm, funky_font6b, sizeof(funky_font6b)-1);
- return ocr_init_api(dev->memory, language, &dev->ocr.state);
+ return ocr_init_api(dev->memory, language, dev->ocr.engine, &dev->ocr.state);
}
static void
diff --git a/doc/Devices.htm b/doc/Devices.htm
index 1a64c92a0..d6a656af3 100644
--- a/doc/Devices.htm
+++ b/doc/Devices.htm
@@ -999,6 +999,28 @@ resolution independence, and editability.</p>
</blockquote>
</blockquote>
<p>
+ The system is designed to allow different OCR engines to be used.
+ Even with Tesseract, there are 2 different engines, the &quot;legacy&quot;
+ engine, and the &quot;LSTM&quot; engine. These have different tradeoffs
+ regarding speed, accuracy, sensitivity to different fonts, size of
+ data etc. The engines in use can be changed by using the
+ <code>-dOCREngine=</code> switch:
+</p>
+<blockquote>
+<dl>
+<dt><code>-dOCREngine=</code><b><em>integer</em></b></dt>
+<dd>This sets the engine to use for OCR. It is the caller's responsibility
+ to ensure that the traineddata supplied is appropriate for the selected
+ engine.
+ <dl>
+ <dt>0<dd>Default engine.
+ <dt>1<dd>LSTM engine only.
+ <dt>2<dd>Tesseract &quot;legacy&quot; engine only.
+ <dt>3<dd>Legacy and LSTM Engine combined.
+ </dl>
+</dd></dl>
+</blockquote>
+<p>
The first device is named ocr. It extracts data as unicode codepoints
and outputs them to the device as a stream of UTF-8 bytes.
</p>