diff options
author | Robin Watts <Robin.Watts@artifex.com> | 2020-08-25 15:17:45 +0100 |
---|---|---|
committer | Robin Watts <Robin.Watts@artifex.com> | 2020-08-27 12:34:23 +0100 |
commit | 367c3858886d1c43b75e3ea48f770d80db800d15 (patch) | |
tree | 1674654577ea0f0e6176e4be3df2ce217cd7b4e9 | |
parent | 30c56c669b05dd09518fad60a8f0ae8de4bcf186 (diff) | |
download | ghostpdl-367c3858886d1c43b75e3ea48f770d80db800d15.tar.gz |
Add OCREngine parameter.
-rw-r--r-- | base/tessocr.cpp | 37 | ||||
-rw-r--r-- | base/tessocr.h | 15 | ||||
-rw-r--r-- | devices/gdevocr.c | 20 | ||||
-rw-r--r-- | devices/gdevpdfimg.h | 1 | ||||
-rw-r--r-- | devices/gdevpdfocr.c | 17 | ||||
-rw-r--r-- | doc/Devices.htm | 22 |
6 files changed, 97 insertions, 15 deletions
diff --git a/base/tessocr.cpp b/base/tessocr.cpp index 26e5432c7..057a4a369 100644 --- a/base/tessocr.cpp +++ b/base/tessocr.cpp @@ -186,9 +186,10 @@ tess_file_reader(const char *fname, GenericVector<char> *out) } int -ocr_init_api(gs_memory_t *mem, const char *language, void **state) +ocr_init_api(gs_memory_t *mem, const char *language, int engine, void **state) { tesseract::TessBaseAPI *api; + enum tesseract::OcrEngineMode mode; leptonica_mem = mem->non_gc_memory; setPixMemoryManager(my_leptonica_malloc, my_leptonica_free); @@ -202,10 +203,28 @@ ocr_init_api(gs_memory_t *mem, const char *language, void **state) return_error(gs_error_VMerror); } + switch (engine) + { + case OCR_ENGINE_DEFAULT: + mode = tesseract::OcrEngineMode::OEM_DEFAULT; + break; + case OCR_ENGINE_LSTM: + mode = tesseract::OcrEngineMode::OEM_LSTM_ONLY; + break; + case OCR_ENGINE_LEGACY: + mode = tesseract::OcrEngineMode::OEM_TESSERACT_ONLY; + break; + case OCR_ENGINE_BOTH: + mode = tesseract::OcrEngineMode::OEM_TESSERACT_LSTM_COMBINED; + break; + default: + return_error(gs_error_rangecheck); + } + // Initialize tesseract-ocr with English, without specifying tessdata path if (api->Init(NULL, 0, /* data, data_size */ language, - tesseract::OcrEngineMode::OEM_DEFAULT, + mode, NULL, 0, /* configs, configs_size */ NULL, NULL, /* vars_vec */ false, /* set_only_non_debug_params */ @@ -265,8 +284,7 @@ do_ocr_image(gs_memory_t *mem, int w, int h, int bpp, int raster, int xres, int yres, void *data, int restore, int hocr, int pagecount, - const char *language, - char **out) + const char *language, int engine, char **out) { char *outText; tesseract::TessBaseAPI *api; @@ -277,7 +295,7 @@ do_ocr_image(gs_memory_t *mem, if (language == NULL || *language == 0) language = "eng"; - code = ocr_init_api(mem, language, (void **)&api); + code = ocr_init_api(mem, language, engine, (void **)&api); if (code < 0) return code; @@ -328,19 +346,20 @@ do_ocr_image(gs_memory_t *mem, int ocr_image_to_hocr(gs_memory_t *mem, 
int w, int h, int bpp, int raster, int xres, int yres, void *data, int restore, - int pagecount, const char *language, char **out) + int pagecount, const char *language, + int engine, char **out) { return do_ocr_image(mem, w, h, bpp, raster, xres, yres, data, - restore, 1, pagecount, language, out); + restore, 1, pagecount, language, engine, out); } int ocr_image_to_utf8(gs_memory_t *mem, int w, int h, int bpp, int raster, int xres, int yres, void *data, int restore, - const char *language, char **out) + const char *language, int engine, char **out) { return do_ocr_image(mem, w, h, bpp, raster, xres, yres, data, - restore, 0, 0, language, out); + restore, 0, 0, language, engine, out); } int diff --git a/base/tessocr.h b/base/tessocr.h index c5e7967b9..78c30a0d4 100644 --- a/base/tessocr.h +++ b/base/tessocr.h @@ -21,18 +21,27 @@ #include "gsmemory.h" +enum +{ + OCR_ENGINE_DEFAULT = 0, + OCR_ENGINE_LSTM = 1, + OCR_ENGINE_LEGACY = 2, + OCR_ENGINE_BOTH = 3 +}; + int ocr_image_to_utf8(gs_memory_t *mem, int w, int h, int bpp, int raster, int xres, int yres, void *data, int restore_data, - const char *language, char **out); + const char *language, int engine, char **out); int ocr_image_to_hocr(gs_memory_t *mem, int w, int h, int bpp, int raster, int xres, int yres, void *data, int restore, - int pagecount, const char *language, char **out); + int pagecount, const char *language, + int engine, char **out); -int ocr_init_api(gs_memory_t *mem, const char *language, void **state); +int ocr_init_api(gs_memory_t *mem, const char *language, int engine, void **state); void ocr_fin_api(gs_memory_t *mem, void *api_); diff --git a/devices/gdevocr.c b/devices/gdevocr.c index ddfaa809e..44cc72341 100644 --- a/devices/gdevocr.c +++ b/devices/gdevocr.c @@ -43,6 +43,7 @@ struct gx_device_ocr_s { gx_prn_device_common; gx_downscaler_params downscale; char language[1024]; + int engine; int page_count; }; @@ -127,6 +128,9 @@ ocr_get_params(gx_device * dev, gs_param_list * plist) if ((code = 
param_write_string(plist, "OCRLanguage", &langstr)) < 0) ecode = code; + if ((code = param_write_string(plist, "OCREngine", &pdev->engine)) < 0) + ecode = code; + if ((code = gx_downscaler_write_params(plist, &pdev->downscale, GX_DOWNSCALER_PARAMS_MFS)) < 0) ecode = code; @@ -146,6 +150,7 @@ ocr_put_params(gx_device *dev, gs_param_list *plist) gs_param_string langstr; const char *param_name; size_t len; + int engine; switch (code = param_read_string(plist, (param_name = "OCRLanguage"), &langstr)) { case 0: @@ -162,6 +167,17 @@ ocr_put_params(gx_device *dev, gs_param_list *plist) param_signal_error(plist, param_name, ecode); } + switch (code = param_read_int(plist, (param_name = "OCREngine"), &engine)) { + case 0: + pdev->engine = engine; + break; + case 1: + break; + default: + ecode = code; + param_signal_error(plist, param_name, ecode); + } + code = gx_downscaler_read_params(plist, &pdev->downscale, GX_DOWNSCALER_PARAMS_MFS); if (code < 0) @@ -225,14 +241,14 @@ do_ocr_print_page(gx_device_ocr * pdev, gp_file * file, int hocr) (int)pdev->HWResolution[0], (int)pdev->HWResolution[1], data, 0, pdev->page_count, - "eng", &out); + "eng", pdev->engine, &out); else code = ocr_image_to_utf8(pdev->memory, width, height, 8, raster, (int)pdev->HWResolution[0], (int)pdev->HWResolution[1], - data, 0, "eng", &out); + data, 0, "eng", pdev->engine, &out); if (code < 0) goto done; if (out) diff --git a/devices/gdevpdfimg.h b/devices/gdevpdfimg.h index baf8175e0..3fe7aac22 100644 --- a/devices/gdevpdfimg.h +++ b/devices/gdevpdfimg.h @@ -64,6 +64,7 @@ typedef struct gx_device_pdf_image_s { /* OCR data */ struct { char language[1024]; + int engine; void *state; /* Number of "file level" objects - i.e. 
the number of objects diff --git a/devices/gdevpdfocr.c b/devices/gdevpdfocr.c index 95d358b18..b03116945 100644 --- a/devices/gdevpdfocr.c +++ b/devices/gdevpdfocr.c @@ -63,6 +63,7 @@ pdfocr_put_some_params(gx_device * dev, gs_param_list * plist) gs_param_string langstr; const char *param_name; size_t len; + int engine; switch (code = param_read_string(plist, (param_name = "OCRLanguage"), &langstr)) { case 0: @@ -79,6 +80,17 @@ pdfocr_put_some_params(gx_device * dev, gs_param_list * plist) param_signal_error(plist, param_name, ecode); } + switch (code = param_read_int(plist, (param_name = "OCREngine"), &engine)) { + case 0: + pdf_dev->ocr.engine = engine; + break; + case 1: + break; + default: + ecode = code; + param_signal_error(plist, param_name, ecode); + } + return code; } @@ -120,6 +132,9 @@ pdfocr_get_some_params(gx_device * dev, gs_param_list * plist) if ((code = param_write_string(plist, "OCRLanguage", &langstr)) < 0) ecode = code; + if ((code = param_write_int(plist, "OCREngine", &pdf_dev->ocr.engine)) < 0) + ecode = code; + return ecode; } @@ -416,7 +431,7 @@ ocr_file_init(gx_device_pdf_image *dev) stream_write(dev->strm, funky_font6a, sizeof(funky_font6a)); stream_write(dev->strm, funky_font6b, sizeof(funky_font6b)-1); - return ocr_init_api(dev->memory, language, &dev->ocr.state); + return ocr_init_api(dev->memory, language, dev->ocr.engine, &dev->ocr.state); } static void diff --git a/doc/Devices.htm b/doc/Devices.htm index 1a64c92a0..d6a656af3 100644 --- a/doc/Devices.htm +++ b/doc/Devices.htm @@ -999,6 +999,28 @@ resolution independence, and editability.</p> </blockquote> </blockquote> <p> + The system is designed to allow different OCR engines to be used. + Even with Tesseract, there are 2 different engines, the "legacy" + engine, and the "LSTM" engine. These have different tradeoffs + regarding speed, accuracy, sensitivity to different fonts, size of + data etc. 
The engines in use can be changed by using the + <code>-dOCREngine=</code> switch: +</p> +<blockquote> +<dl> +<dt><code>-dOCREngine=</code><b><em>integer</em></b></dt> +<dd>This sets the engine to use for OCR. It is the caller's responsibility + to ensure that the traineddata supplied is appropriate for the selected + engine. + <dl> + <dt>0<dd>Default engine. + <dt>1<dd>LSTM engine only. + <dt>2<dd>Tesseract "legacy" engine only. + <dt>3<dd>Legacy and LSTM engines combined. + </dl> +</dd></dl> +</blockquote> +<p> The first device is named ocr. It extracts data as Unicode codepoints and outputs them to the device as a stream of UTF-8 bytes. </p> |