diff options
-rw-r--r-- | base/gsdparam.c | 2 | ||||
-rw-r--r-- | base/gsfcmap.c | 13 | ||||
-rw-r--r-- | base/gsfcmap.h | 2 | ||||
-rw-r--r-- | base/gsparam.h | 4 | ||||
-rw-r--r-- | base/gsparaml.c | 28 | ||||
-rw-r--r-- | base/tesseract.mak | 7 | ||||
-rw-r--r-- | base/tessocr.cpp | 205 | ||||
-rw-r--r-- | base/tessocr.h | 10 | ||||
-rw-r--r-- | devices/devs.mak | 2 | ||||
-rw-r--r-- | devices/vector/gdevpdfb.c | 46 | ||||
-rw-r--r-- | devices/vector/gdevpdfb.h | 11 | ||||
-rw-r--r-- | devices/vector/gdevpdfp.c | 162 | ||||
-rw-r--r-- | devices/vector/gdevpdfx.h | 37 | ||||
-rw-r--r-- | devices/vector/gdevpdte.c | 270 | ||||
-rw-r--r-- | devices/vector/gdevpdtt.c | 74 | ||||
-rw-r--r-- | doc/Devices.htm | 32 | ||||
-rw-r--r-- | doc/VectorDevices.htm | 53 |
17 files changed, 933 insertions, 25 deletions
diff --git a/base/gsdparam.c b/base/gsdparam.c
index 3af8e856f..36729692c 100644
--- a/base/gsdparam.c
+++ b/base/gsdparam.c
@@ -1010,6 +1010,8 @@ gs_putdeviceparams(gx_device * dev, gs_param_list * plist)
     bool was_open = dev->is_open;
     int code;
 
+    /* gs_param_list_dump(plist); */
+
     gx_device_set_procs(dev);
     fill_dev_proc(dev, put_params, gx_default_put_params);
     fill_dev_proc(dev, get_alpha_bits, gx_default_get_alpha_bits);
diff --git a/base/gsfcmap.c b/base/gsfcmap.c
index 32a32fdff..3040d390f 100644
--- a/base/gsfcmap.c
+++ b/base/gsfcmap.c
@@ -643,6 +643,24 @@ gs_cmap_ToUnicode_realloc(gs_memory_t *mem, int new_value_size, gs_cmap_t **ppcm
     return 0;
 }
 
+/*
+ * Check whether a ToUnicode CMap already holds an entry for code0.
+ * Returns 1 if the first two bytes of the entry are not both zero, and
+ * 0 if they are or if code0 lies outside 0 .. num_codes - 1.
+ */
+int gs_cmap_ToUnicode_check_pair(gs_cmap_t *pcmap, int code0)
+{
+    gs_cmap_ToUnicode_t *cmap = (gs_cmap_ToUnicode_t *)pcmap;
+    uchar *map = pcmap->glyph_name_data;
+    const int num_codes = cmap->num_codes;
+
+    if (code0 < 0 || code0 >= num_codes)
+        return 0; /* must not happen; report it as "no entry". */
+    if (map[code0 * (cmap->value_size + 2)] == 0 && map[code0 * (cmap->value_size + 2) + 1] == 0)
+        return 0;
+    return 1;
+}
+
 /*
  * Write a code pair to ToUnicode CMap.
  */
diff --git a/base/gsfcmap.h b/base/gsfcmap.h
index 73516c3e2..2507865cc 100644
--- a/base/gsfcmap.h
+++ b/base/gsfcmap.h
@@ -66,4 +66,6 @@ int gs_cmap_ToUnicode_realloc(gs_memory_t *mem, int new_value_size, gs_cmap_t **
  */
 void gs_cmap_ToUnicode_add_pair(gs_cmap_t *pcmap, int code0, ushort *unicode, unsigned int length);
 
+int gs_cmap_ToUnicode_check_pair(gs_cmap_t *pcmap, int code0);
+
 #endif /* gsfcmap_INCLUDED */
diff --git a/base/gsparam.h b/base/gsparam.h
index 3b0aaa21d..456a8a532 100644
--- a/base/gsparam.h
+++ b/base/gsparam.h
@@ -577,4 +577,8 @@ int gs_param_list_add_parsed_value(gs_param_list *plist, gs_param_name key, cons
  * address pointed to be len. */
 int gs_param_list_to_string(gs_param_list *plist, gs_param_name key, char *value, int *len);
 
+/* Debug function to dump a list of params. Do NOT use in production
+ * code! 
*/
+int gs_param_list_dump(gs_param_list *plist);
+
 #endif /* gsparam_INCLUDED */
diff --git a/base/gsparaml.c b/base/gsparaml.c
index d7e5fcdbf..3d7e49b14 100644
--- a/base/gsparaml.c
+++ b/base/gsparaml.c
@@ -1046,3 +1046,31 @@ int gs_param_list_to_string(gs_param_list *plist, gs_param_name key, char *value
         *value = 0;
     return to_string(plist, key, &out);
 }
+/* Debug aid: print each key in plist followed by its value, via dlprintf. */
+int gs_param_list_dump(gs_param_list *plist)
+{
+    gs_param_enumerator_t enumerator;
+    gs_param_key_t key;
+    int code;
+    char buffer[4096]; /* NOTE(review): size is not passed to gs_param_list_to_string() - confirm a long value cannot overrun it */
+    int len; /* written by gs_param_list_to_string(); not otherwise used here */
+
+    param_init_enumerator(&enumerator);
+    while ((code = param_get_next_key(plist, &enumerator, &key)) == 0) {
+        char string_key[256]; /* big enough for any reasonable key */
+
+        if (key.size > sizeof(string_key) - 1) {
+            code = gs_note_error(gs_error_rangecheck);
+            break;
+        }
+        memcpy(string_key, key.data, key.size);
+        string_key[key.size] = 0; /* key.data is copied and terminated before being used as a C string */
+        dlprintf1("%s ", string_key);
+        code = gs_param_list_to_string(plist, string_key, buffer, &len);
+        if (code < 0)
+            break;
+        dlprintf1("%s ", buffer);
+    }
+    dlprintf("\n");
+    return code; /* the value that ended the loop: always non-zero, negative on the error paths above */
+}
diff --git a/base/tesseract.mak b/base/tesseract.mak
index c2bc1fb5a..43e234e75 100644
--- a/base/tesseract.mak
+++ b/base/tesseract.mak
@@ -24,7 +24,8 @@ TESSINCLUDES=\
 # add -DDISABLED_LEGACY_ENGINE to TESSCXX
 # empty TESSERACT_LEGACY
 
-TESSCXX = $(CXX) $(TESSINCLUDES) $(TESSCXXFLAGS) $(CCFLAGS) -DTESSERACT_IMAGEDATA_AS_PIX -DTESSERACT_DISABLE_DEBUG_FONTS -DGRAPHICS_DISABLED -DDISABLED_LEGACY_ENGINE
+TESSCXX = $(CXX) $(TESSINCLUDES) $(TESSCXXFLAGS) $(CCFLAGS) -DTESSERACT_IMAGEDATA_AS_PIX -DTESSERACT_DISABLE_DEBUG_FONTS -DGRAPHICS_DISABLED
+#-DDISABLED_LEGACY_ENGINE
 
 TESSOBJ = $(GLOBJDIR)$(D)tesseract_
 TESSO_ = $(O_)$(TESSOBJ)
@@ -1161,8 +1162,8 @@ TESSERACT_LEGACY_OBJS=\
 	$(TESSOBJ)wordrec_wordclass.$(OBJ)
 
 
-#TESSERACT_LEGACY=$(TESSERACT_LEGACY_OBJS)
-TESSERACT_LEGACY=
+TESSERACT_LEGACY=$(TESSERACT_LEGACY_OBJS)
+#TESSERACT_LEGACY=
 
 TESS_ROMFS_ARGS=\
 	-c -P $(GLSRCDIR)$(D)..$(D) tessdata$(D)*
diff --git a/base/tessocr.cpp 
b/base/tessocr.cpp
index 8ce19a14e..f8161ac3c 100644
--- a/base/tessocr.cpp
+++ b/base/tessocr.cpp
@@ -504,6 +504,223 @@ ocr_recognise(void *api_, int w, int h, void *data,
     return code;
 }
 
+/* Build an 8bpp Leptonica Pix from a 1bpp bitmap (read bottom row first),
+ * hand it to Tesseract via SetImage() and return it. The caller releases
+ * it with ocr_clear_bitmap(). Returns NULL on allocation failure. */
+static Pix *
+ocr_set_bitmap(tesseract::TessBaseAPI *api,
+               int w, int h,
+               const unsigned char *data, int data_x, int raster,
+               int xres, int yres)
+{
+    /* Tesseract prefers a border around things, so we add an 8 pixel
+     * border all around. */
+#define BORDER_SIZE 8
+    int r = (w+BORDER_SIZE*2+3)&~3;
+    Pix *image = pixCreateHeader(r, h+BORDER_SIZE*2, 8);
+    unsigned char *pdata, *d;
+    const unsigned char *s;
+    int x, y;
+
+    if (image == NULL)
+        return NULL;
+
+    pdata = gs_alloc_bytes(leptonica_mem, r * (h+BORDER_SIZE*2), "ocr_set_bitmap");
+    if (pdata == NULL) {
+        pixDestroy(&image);
+        return NULL;
+    }
+    pixSetData(image, (l_uint32 *)pdata);
+    pixSetPadBits(image, 1);
+    pixSetXRes(image, xres);
+    pixSetYRes(image, yres);
+
+    s = &data[data_x>>3] + raster*(h-1);
+    d = pdata;
+    memset(d, 255, r * (h+BORDER_SIZE*2));
+    d += r*BORDER_SIZE + BORDER_SIZE;
+    for (y = 0; y < h; y++) {
+        int b = 128>>(data_x & 7);
+        for (x = 0; x < w; x++) {
+            if (s[x>>3] & b)
+                d[x^3] = 0;
+            else
+                d[x^3] = 255;
+            b >>= 1;
+            if (b == 0)
+                b = 128;
+        }
+        s -= raster;
+        d += r;
+    }
+
+    api->SetImage(image);
+//    pixWrite("test.pnm", image, IFF_PNM);
+
+    return image;
+}
+
+/* Free the pixel data and header allocated by ocr_set_bitmap(). */
+static void
+ocr_clear_bitmap(Pix *image)
+{
+    gs_free_object(leptonica_mem, pixGetData(image), "ocr_clear_bitmap");
+    pixSetData(image, NULL);
+    pixDestroy(&image);
+}
+
+/* Decode the first UTF-8 sequence in s (1 to 4 bytes) to a Unicode code
+ * point. s must be NUL terminated; an invalid lead byte, or a sequence
+ * cut short by the terminator, yields U+FFFD. */
+static int
+ocr_utf8_to_unicode(const unsigned char *s)
+{
+    int len, i, cp;
+
+    if (s[0] < 0x80)
+        return s[0];
+    if ((s[0] & 0xE0) == 0xC0) {
+        len = 2;
+        cp = s[0] & 0x1F;
+    } else if ((s[0] & 0xF0) == 0xE0) {
+        len = 3;
+        cp = s[0] & 0x0F;
+    } else if ((s[0] & 0xF8) == 0xF0) {
+        len = 4;
+        cp = s[0] & 0x07;
+    } else
+        return 0xFFFD;
+    for (i = 1; i < len; i++) {
+        if ((s[i] & 0xC0) != 0x80)
+            return 0xFFFD;
+        cp = (cp << 6) | (s[i] & 0x3F);
+    }
+    return cp;
+}
+
+/* OCR a 1bpp bitmap holding a run of glyphs. On entry *char_count is the
+ * capacity of unicode[]; on return it is the number of code points
+ * stored (never more than the capacity). Returns the Recognize() result,
+ * 0 without touching *char_count if state is NULL, or gs_error_VMerror. */
+int ocr_bitmap_to_unicodes(void *state,
+                      const void *data, int data_x,
+                      int w, int h, int raster,
+                      int xres, int yres, int *unicode, int *char_count)
+{
+    tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)state;
+    Pix *image;
+    int code, max_chars = *char_count, count = 0;
+
+    if (api == NULL)
+        return 0;
+
+    image = ocr_set_bitmap(api, w, h, (const unsigned char *)data,
+                           data_x, raster, xres, yres);
+    if (image == NULL)
+        return_error(gs_error_VMerror);
+
+    code = api->Recognize(NULL);
+    if (code >= 0) {
+        /* Bingo! */
+        tesseract::ResultIterator *res_it = api->GetIterator();
+
+        /* Guard against a NULL iterator and stop once unicode[] is full. */
+        while (res_it != NULL && count < max_chars &&
+               !res_it->Empty(tesseract::RIL_BLOCK)) {
+            if (res_it->Empty(tesseract::RIL_WORD)) {
+                res_it->Next(tesseract::RIL_WORD);
+                continue;
+            }
+
+            do {
+#if FUTURE_DEVELOPMENT
+                int word_bbox[4];
+                int char_bbox[4];
+                int line_bbox[4];
+#endif
+                /* GetUTF8Text() hands back a new[] array that we must free. */
+                char *graph = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
+
+                if (graph && graph[0] != 0) {
+                    unicode[count] = ocr_utf8_to_unicode((const unsigned char *)graph);
+                    count++;
+#if FUTURE_DEVELOPMENT
+                    res_it->BoundingBox(tesseract::RIL_TEXTLINE,
+                        line_bbox,line_bbox + 1,
+                        line_bbox + 2,line_bbox + 3);
+                    res_it->BoundingBox(tesseract::RIL_WORD,
+                        word_bbox,word_bbox + 1,
+                        word_bbox + 2,word_bbox + 3);
+                    res_it->BoundingBox(tesseract::RIL_SYMBOL,
+                        char_bbox,char_bbox + 1,
+                        char_bbox + 2,char_bbox + 3);
+#endif
+                }
+                delete[] graph;
+                res_it->Next(tesseract::RIL_SYMBOL);
+            } while (!res_it->Empty(tesseract::RIL_BLOCK) &&
+                     !res_it->IsAtBeginningOf(tesseract::RIL_WORD) && count < max_chars);
+        }
+        delete res_it;
+    }
+
+    ocr_clear_bitmap(image);
+    *char_count = count;
+
+    return code;
+}
+
+/* OCR a 1bpp bitmap expected to hold a single glyph. *unicode receives
+ * the code point of the last symbol recognised and is left untouched if
+ * there is none. Returns the Recognize() result, 0 if state is NULL, or
+ * gs_error_VMerror. */
+int ocr_bitmap_to_unicode(void *state,
+                      const void *data, int data_x,
+                      int w, int h, int raster,
+                      int xres, int yres, int *unicode)
+{
+    tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)state;
+    Pix *image;
+    int code;
+
+    if (api == NULL)
+        return 0;
+
+    image = ocr_set_bitmap(api, w, h, (const unsigned char *)data,
+                           data_x, raster, xres, yres);
+    if (image == NULL)
+        return_error(gs_error_VMerror);
+
+    code = api->Recognize(NULL);
+    if (code >= 0) {
+        /* Bingo! */
+        tesseract::ResultIterator *res_it = api->GetIterator();
+
+        while (res_it != NULL && !res_it->Empty(tesseract::RIL_BLOCK)) {
+            if (res_it->Empty(tesseract::RIL_WORD)) {
+                res_it->Next(tesseract::RIL_WORD);
+                continue;
+            }
+
+            do {
+                /* GetUTF8Text() hands back a new[] array that we must free. */
+                char *graph = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
+
+                if (graph && graph[0] != 0)
+                    *unicode = ocr_utf8_to_unicode((const unsigned char *)graph);
+                delete[] graph;
+                res_it->Next(tesseract::RIL_SYMBOL);
+            } while (!res_it->Empty(tesseract::RIL_BLOCK) &&
+                     !res_it->IsAtBeginningOf(tesseract::RIL_WORD));
+        }
+        delete res_it;
+    }
+
+    ocr_clear_bitmap(image);
+
+    return code;
+}
+
 };
 
 /* Currently tesseract is the only C++ lib we have.
diff --git a/base/tessocr.h b/base/tessocr.h
index 78c30a0d4..8beaa8915 100644
--- a/base/tessocr.h
+++ b/base/tessocr.h
@@ -50,5 +50,15 @@ int ocr_recognise(void *api_, int w, int h, void *data,
                   int (*callback)(void *, const char *, const int *, const int *, const int *, int),
                   void *arg);
 
+int ocr_bitmap_to_unicodes(void* state,
+    const void* data,int data_x,
+    int w,int h,int raster,
+    int xres,int yres,int* unicode, int* char_count);
+
+int ocr_bitmap_to_unicode(void *state,
+                          const void *data, int data_x,
+                          int w, int h, int raster,
+                          int xres, int yres, int *unicode);
+
 #endif
 
diff --git a/devices/devs.mak b/devices/devs.mak
index a5f3750fa..222c02708 100644
--- a/devices/devs.mak
+++ b/devices/devs.mak
@@ -24,7 +24,7 @@ DEVVECSRC=$(DEVVEC)$(D)
 
 DEVI_=$(DEVGENDIR) $(II)$(GLSRCDIR) $(II)$(GLGENDIR) $(II)$(DEVSRCDIR)
 DEVF_=
 
-DEVCCFLAGS=$(I_)$(DEVI_)$(_I) $(I_)$(DEVVEC)$(_I) $(DEVF_)
+DEVCCFLAGS=$(I_)$(DEVI_)$(_I) $(I_)$(DEVVEC)$(_I) $(D_)OCR_VERSION=$(OCR_VERSION)$(_D) $(DEVF_)
 DEVCC=$(CC_) $(DEVCCFLAGS)
 XPSDEVCC=$(CC_) $(XPSPRINTCFLAGS) $(DEVCCFLAGS)
 
diff --git a/devices/vector/gdevpdfb.c b/devices/vector/gdevpdfb.c
index 203390384..059c79c32 100644
--- a/devices/vector/gdevpdfb.c
+++ b/devices/vector/gdevpdfb.c
@@ -496,7 +496,7 @@ gdev_pdf_copy_color(gx_device * dev, const byte * base, int sourcex,
 /* Fill a mask. */
 int
 gdev_pdf_fill_mask(gx_device * dev,
-                 const byte * data, int data_x, int raster, gx_bitmap_id id,
+                   const byte * data, int data_x, int raster, gx_bitmap_id id,
                    int x, int y, int width, int height,
                    const gx_drawing_color * pdcolor, int depth,
                    gs_logical_operation_t lop, const gx_clip_path * pcpath)
@@ -505,6 +505,52 @@ gdev_pdf_fill_mask(gx_device * dev,
 
     if (width <= 0 || height <= 0)
         return 0;
+
+    /* If OCRStage is 'OCR_Rendering' then we are handling an image which is a rendered glyph
+     * that we want to have OCR software process and return a Unicode code point for.
+     * We specifically do *not* want to send the image to the output PDF file!
+     */
+    if (pdev->OCRStage == OCR_Rendering) {
+        ocr_glyph_t *new_glyph = NULL;
+        int index;
+
+        new_glyph = (ocr_glyph_t *)gs_alloc_bytes(pdev->pdf_memory, sizeof(ocr_glyph_t), "");
+        if (new_glyph == NULL)
+            return_error(gs_error_VMerror);
+        new_glyph->data = gs_alloc_bytes(pdev->pdf_memory, raster*height, "");
+        if (new_glyph->data == NULL) {
+            /* Don't leak the list node when its bitmap can't be allocated. */
+            gs_free_object(pdev->pdf_memory, new_glyph, "");
+            return_error(gs_error_VMerror);
+        }
+        memcpy(new_glyph->data, data, raster * height);
+        new_glyph->height = height;
+        new_glyph->width = width;
+        new_glyph->raster = raster;
+        new_glyph->x = x;
+        new_glyph->y = y;
+        new_glyph->char_code = pdev->OCR_char_code;
+        new_glyph->glyph = pdev->OCR_glyph;
+        new_glyph->next = NULL;
+        new_glyph->is_space = true;
+        for(index = 0; index < height * raster;index++){
+            if(data[index] != 0x00) {
+                new_glyph->is_space = false;
+                break;
+            }
+        }
+        if (pdev->ocr_glyphs == NULL)
+            pdev->ocr_glyphs = new_glyph;
+        else {
+            ocr_glyph_t *next = pdev->ocr_glyphs;
+
+            while (next->next != NULL)
+                next = next->next;
+            next->next = new_glyph;
+        }
+        return 0;
+    }
+
     if (depth > 1 || (!gx_dc_is_pure(pdcolor) != 0 && !(gx_dc_is_pattern1_color(pdcolor))))
         return gx_default_fill_mask(dev, data, data_x, raster, id,
                                     x, y, width, height, pdcolor, depth, lop,
diff --git a/devices/vector/gdevpdfb.h b/devices/vector/gdevpdfb.h
index d8d596959..b2808a3ae 100644
--- a/devices/vector/gdevpdfb.h
+++ b/devices/vector/gdevpdfb.h
@@ -128,6 +128,8 @@ const gx_device_pdf PDF_DEVICE_IDENT =
  {0,0}, /* PDFXTrimBoxToMediaBoxOffset */
  {0,0}, /* PDFXBleedBoxToTrimBoxOffset */
  1 /* true */, /* PDFXSetBleedBoxToMediaBox */
+ "", /* ocr_language */
+ 0, /* ocr_engine */
  1 /*true*/, /* ReAssignCharacters */
  1 /*true*/, /* ReEncodeCharacters */
  1, /* FirstObjectNumber */
@@ -300,7 +302,14 @@ const gx_device_pdf PDF_DEVICE_IDENT =
  0, /* ExtensionMetadata */
  0, /* PDFFormName */
  0, /* PassThroughWriter */
- 1.0 /* UserUnit */
+ 1.0, /* UserUnit */
+ 0, /* UseOCR */
+ NULL, /* OCRSaved */
+ 0, /* OCRStage */
+ NULL, /* 
OCRUnicode */
+ 0, /* OCR_char_code */
+ 0, /* OCR_glyph */
+ NULL /* ocr_glyphs */
 };
 
 #else
diff --git a/devices/vector/gdevpdfp.c b/devices/vector/gdevpdfp.c
index 3a371aada..9cc63644b 100644
--- a/devices/vector/gdevpdfp.c
+++ b/devices/vector/gdevpdfp.c
@@ -255,6 +255,48 @@ gdev_pdf_get_param(gx_device *dev, char *Param, void *list)
             return(param_write_null(plist, "DSC"));
         }
     }
+
+#if OCR_VERSION > 0
+    if (strcmp(Param, "OCRLanguage") == 0) {
+        gs_param_string langstr;
+        if (pdev->ocr_language[0]) {
+            langstr.data = (const byte *)pdev->ocr_language;
+            langstr.size = strlen(pdev->ocr_language);
+            langstr.persistent = false;
+        } else {
+            langstr.data = (const byte *)"eng";
+            langstr.size = 3;
+            langstr.persistent = false;
+        }
+        return param_write_string(plist, "OCRLanguage", &langstr);
+    }
+    if (strcmp(Param, "OCREngine") == 0)
+        return param_write_int(plist, "OCREngine", &pdev->ocr_engine);
+
+    if (strcmp(Param, "UseOCR") == 0) {
+        gs_param_string ocrstr;
+
+        switch(pdev->UseOCR) {
+            case UseOCRAsNeeded:
+                ocrstr.data = (const byte *)"AsNeeded";
+                ocrstr.size = 8;
+                break;
+            case UseOCRAlways:
+                ocrstr.data = (const byte *)"Always";
+                ocrstr.size = 6;
+                break;
+            case UseOCRNever:
+            default:
+                /* Any unexpected value is reported as Never. */
+                ocrstr.data = (const byte *)"Never";
+                ocrstr.size = 5;
+                break;
+        }
+        ocrstr.persistent = false;
+        return param_write_string(plist, "UseOCR", &ocrstr);
+    }
+#endif
+
     return gdev_psdf_get_param(dev, Param, list);
 }
 
@@ -269,6 +311,49 @@ gdev_pdf_get_params(gx_device * dev, gs_param_list * plist)
     int code;
     int cdv = CoreDistVersion;
+#if OCR_VERSION > 0
+    gs_param_string langstr;
+
+    if (pdev->ocr_language[0]) {
+        langstr.data = (const byte *)pdev->ocr_language;
+        langstr.size = strlen(pdev->ocr_language);
+        langstr.persistent = false;
+    } else {
+        langstr.data = (const byte *)"eng";
+        langstr.size = 3;
+        langstr.persistent = false;
+    }
+
+    {
+        gs_param_string ocrstr;
+
+        switch(pdev->UseOCR) {
+            case UseOCRAsNeeded:
+                ocrstr.data = (const byte *)"AsNeeded";
+                ocrstr.size = 8;
+                break;
+            case UseOCRAlways:
+                ocrstr.data = (const byte *)"Always";
+                ocrstr.size = 6;
+                break;
+            case UseOCRNever:
+            default:
+                /* Any unexpected value is reported as Never. */
+                ocrstr.data = (const byte *)"Never";
+                ocrstr.size = 5;
+                break;
+        }
+        ocrstr.persistent = false;
+        code = param_write_string(plist, "UseOCR", &ocrstr);
+    }
+    if (code >= 0)
+        code = param_write_string(plist, "OCRLanguage", &langstr);
+    if (code >= 0)
+        code = param_write_int(plist, "OCREngine", &pdev->ocr_engine);
+    if (code < 0)
+        return code;
+#endif
+
 
     pdev->ParamCompatibilityLevel = cl;
     code = gdev_psdf_get_params(dev, plist);
     if (code < 0 ||
@@ -367,6 +452,64 @@ gdev_pdf_put_params_impl(gx_device * dev, const gx_device_pdf * save_dev, gs_par
         }
     }
 
+#if OCR_VERSION > 0
+    {
+        int len;
+        gs_param_string langstr;
+        switch (code = param_read_string(plist, (param_name = "OCRLanguage"), &langstr)) {
+            case 0:
+                len = langstr.size;
+                if (len >= sizeof(pdev->ocr_language))
+                    len = sizeof(pdev->ocr_language)-1;
+                memcpy(pdev->ocr_language, langstr.data, len);
+                pdev->ocr_language[len] = 0;
+                break;
+            case 1:
+                break;
+            default:
+                ecode = code;
+                param_signal_error(plist, param_name, ecode);
+        }
+    }
+
+    {
+        int engine;
+        switch (code = param_read_int(plist, (param_name = "OCREngine"), &engine)) {
+            case 0:
+                pdev->ocr_engine = engine;
+                break;
+            case 1:
+                break;
+            default:
+                ecode = code;
+                param_signal_error(plist, param_name, ecode);
+        }
+    }
+
+    {
+        gs_param_string ocrstr;
+
+        code = param_read_string(plist, (param_name = "UseOCR"), &ocrstr);
+        switch(code) {
+            case 0:
+                /* Unrecognised values are ignored and leave UseOCR unchanged. */
+                if (ocrstr.size == 5 && memcmp(ocrstr.data, "Never", 5) == 0)
+                    pdev->UseOCR = UseOCRNever;
+                else if (ocrstr.size == 8 && memcmp(ocrstr.data, "AsNeeded", 8) == 0)
+                    pdev->UseOCR = UseOCRAsNeeded;
+                else if (ocrstr.size == 6 && memcmp(ocrstr.data, "Always", 6) == 0)
+                    pdev->UseOCR = UseOCRAlways;
+                break;
+            case 1:
+                break;
+            default:
+                ecode = code;
+                param_signal_error(plist, param_name, ecode);
+                break;
+        }
+    }
+#endif
+
     /*
      * Check for LockDistillerParams before doing anything else.
      * If LockDistillerParams is true and is not being set to false,
diff --git a/devices/vector/gdevpdfx.h b/devices/vector/gdevpdfx.h
index 16515fcdd..6123c652c 100644
--- a/devices/vector/gdevpdfx.h
+++ b/devices/vector/gdevpdfx.h
@@ -547,6 +547,33 @@ typedef enum {
     pdf_compress_Flate
 } pdf_compression_type;
 
+typedef enum {
+    OCR_UnInit,
+    OCR_Rendering,
+    OCR_Rendered,
+    OCR_UnicodeAvailable,
+    OCR_Failed
+} pdf_OCR_stage;
+
+typedef enum {
+    UseOCRNever,
+    UseOCRAsNeeded,
+    UseOCRAlways
+} pdf_OCR_usage;
+
+typedef struct ocr_glyph_s{
+    byte *data;
+    int x;
+    int y;
+    int width;
+    int height;
+    int raster;
+    struct ocr_glyph_s *next; /* next record in the singly linked list, NULL at the end */
+    gs_char char_code;
+    gs_glyph glyph;
+    bool is_space;
+} ocr_glyph_t;
+
 /* Define the device structure. */
 struct gx_device_pdf_s {
     gx_device_psdf_common;
@@ -572,6 +599,9 @@ struct gx_device_pdf_s {
     gs_param_float_array PDFXTrimBoxToMediaBoxOffset;
     gs_param_float_array PDFXBleedBoxToTrimBoxOffset;
     bool PDFXSetBleedBoxToMediaBox;
+    /* OCR Parameters */
+    char ocr_language[1024];
+    int ocr_engine;
     /* Other parameters */
     bool ReAssignCharacters;
     bool ReEncodeCharacters;
@@ -909,6 +939,13 @@ struct gx_device_pdf_s {
      * anything in the image processing routines. 
*/
     float UserUnit;
+    pdf_OCR_usage UseOCR; /* Never, AsNeeded or Always */
+    gs_text_enum_t* OCRSaved; /* Saved state of the text enumerator before rendering glyph bitmaps for later OCR */
+    pdf_OCR_stage OCRStage; /* Used to control a (sort of) state machine when using OCR to get a Unicode value for a glyph */
+    int *OCRUnicode; /* Used to pass back the Unicode value from the OCR engine to the text processing */
+    gs_char OCR_char_code; /* Passes the current character code from text processing to the image processing code when rendering glyph bitmaps for OCR */
+    gs_glyph OCR_glyph; /* Passes the current glyph code from text processing to the image processing code when rendering glyph bitmaps for OCR */
+    ocr_glyph_t *ocr_glyphs; /* Records bitmaps and other data from text processing when doing OCR */
 };
 
 #define is_in_page(pdev)\
diff --git a/devices/vector/gdevpdte.c b/devices/vector/gdevpdte.c
index 6f0eb158a..0310f54cf 100644
--- a/devices/vector/gdevpdte.c
+++ b/devices/vector/gdevpdte.c
@@ -43,6 +43,7 @@
 #include "gxcpath.h"
 
 #include "gsfcmap.h"
+#include "tessocr.h"
 
 static int pdf_char_widths(gx_device_pdf *const pdev,
                            pdf_font_resource_t *pdfont, int ch,
@@ -80,6 +81,216 @@ pdf_process_string_aux(pdf_text_enum_t *penum, gs_string *pstr,
     return pdf_process_string(penum, pstr, pfmat, ppts, gdata);
 }
 
+/* Obtain a Unicode value for glyph/ch by OCR. When OCRStage is OCR_Rendered
+ * the stored glyph bitmaps are first OCR'ed as one strip. On a match *unicode
+ * is a newly allocated 4 byte big-endian value and *length is set to 4. */
+static int OCRText(gx_device_pdf *pdev, gs_glyph glyph, gs_char ch, gs_char *length, byte **unicode)
+{
+#if OCR_VERSION > 0
+    int code = 0;
+
+    if(pdev->OCRStage == OCR_Rendered) {
+        int llx, lly, urx, ury, char_count = 0, returned_count = 0, *returned;
+        ocr_glyph_t *next_glyph = pdev->ocr_glyphs;
+        int rows, stride, row, column;
+        byte *bitmap = NULL, *src, *dest, *rowptr, srcmask, destmask;
+        void *state;
+        const char *language = pdev->ocr_language;
+        gp_file *DbgFile;
+
+        if(language == NULL || language[0] == 0)
+            language = "eng";
+
+        /* We should already have rendered a bitmap for all the glyphs in the
+         * text operation, so this should be redundant, but best to be safe.
+         */
+        if(next_glyph == NULL)
+            return_error(gs_error_unknownerror);
+
+        /* Identify the bounding box of the returned glyphs by examining the bounds and position
+         * of each glyph. At the same time count the number of expected returned characters.
+         * We treat any empty bitmap (all 0x00 bytes) as a space because, obviously, the
+         * OCR engine can't differentiate between a space character and no character at all.
+         */
+        llx = next_glyph->x;
+        lly = next_glyph->y;
+        urx = llx + next_glyph->width;
+        ury = lly + next_glyph->height;
+        if(!next_glyph->is_space)
+            char_count++;
+        next_glyph = (ocr_glyph_t *)next_glyph->next;
+        while(next_glyph) {
+            if(!next_glyph->is_space)
+                char_count++;
+            if(next_glyph->x < llx)
+                llx = next_glyph->x;
+            if(next_glyph->y < lly)
+                lly = next_glyph->y;
+            if(next_glyph->x + next_glyph->width > urx)
+                urx = next_glyph->x + next_glyph->width;
+            if(next_glyph->y + next_glyph->height > ury)
+                ury = next_glyph->y + next_glyph->height;
+            next_glyph = next_glyph->next;
+        }
+
+        /* Allocate and initialise the 'strip' bitmap which will receive all the
+         * individual glyph bitmaps.
+         */
+        rows = ury - lly;
+        stride = (((urx - llx) + 7) / 8) + 1;
+        bitmap = gs_alloc_bytes(pdev->memory, rows * stride, "working OCR memory");
+        if(bitmap == NULL)
+            return_error(gs_error_VMerror);
+        memset(bitmap, 0x00, rows * stride);
+
+        /* Allocate a buffer for the OCR engine to return the Unicode code points. This needs work,
+         * we might want more information returned (bounding boxes and confidence levels) and we
+         * need to think about the possibility that the OCR engine finds more characters than we
+         * expected (eg fi ligatures returned as 'f' and 'i').
+         */
+        returned = (int *)gs_alloc_bytes(pdev->memory, char_count * sizeof(int), "returned unicodes");
+        if(returned == NULL) {
+            gs_free_object(pdev->memory, bitmap, "working OCR memory");
+            return_error(gs_error_VMerror);
+        }
+        memset(returned, 0x00, char_count * sizeof(int));
+
+        /* Now copy each glyph bitmap to the correct position in the strip. This is complicated
+         * by the fact that bitmaps are monochrome packed into bytes and so the destination
+         * may not be aligned on a byte boundary.
+         */
+        next_glyph = (ocr_glyph_t *)pdev->ocr_glyphs;
+        while(next_glyph) {
+            rowptr = bitmap + ((next_glyph->y - lly) * stride) + (next_glyph->x - llx) / 8;
+            for(row = 0;row < next_glyph->height;row++) {
+                dest = rowptr + row * stride;
+                src = next_glyph->data + (row * next_glyph->raster);
+                destmask = 0x80 >> (next_glyph->x - llx) % 8;
+                srcmask = 0x80;
+                for(column = 0; column < next_glyph->width;column++) {
+                    if(*src & srcmask)
+                        *dest = *dest | destmask;
+                    srcmask = srcmask >> 1;
+                    if(srcmask == 0) {
+                        srcmask = 0x80;
+                        src++;
+                    }
+                    destmask = destmask >> 1;
+                    if(destmask == 0) {
+                        destmask = 0x80;
+                        dest++;
+                    }
+                }
+            }
+            next_glyph = next_glyph->next;
+        }
+
+#if 0
+        DbgFile = gp_fopen(pdev->memory, "d:/temp/bits.txt", "wb+");
+        for(row = 0;row < rows;row++) {
+            for(column = 0;column < stride;column++) {
+                dest = bitmap + (row * stride);
+                gp_fprintf(DbgFile, "%02x", dest[column]);
+            }
+            gp_fprintf(DbgFile, "\n");
+        }
+        gp_fclose(DbgFile);
+#endif
+        /* Initialise the OCR engine */
+        code = ocr_init_api(pdev->memory->non_gc_memory, language,
+            pdev->ocr_engine, &state);
+        if(code < 0) {
+            gs_free_object(pdev->memory, bitmap, "working OCR memory");
+            gs_free_object(pdev->memory, returned, "returned unicodes");
+            return code;
+        }
+        returned_count = char_count;
+
+        /* Pass our strip to the OCR engine */
+        code = ocr_bitmap_to_unicodes(state,
+            bitmap, 0, stride * 8, rows, stride,
+            (int)pdev->HWResolution[0],
+            (int)pdev->HWResolution[1],
+            returned, &returned_count);
+
+        /* and close the engine back down again */
+        ocr_fin_api(pdev->memory->non_gc_memory, state);
+        gs_free_object(pdev->memory, bitmap, "working OCR memory");
+
+        if(code < 0) {
+            pdev->OCRStage = OCR_Failed;
+            gs_free_object(pdev->memory, returned, "returned unicodes");
+            return code;
+        }
+
+        /* Future enhancement we should fall back to trying the individual bitmap here */
+        if(returned_count != char_count) {
+            pdev->OCRStage = OCR_Failed;
+            gs_free_object(pdev->memory, returned, "returned unicodes");
+            return 0;
+        }
+        pdev->OCRUnicode = returned;
+
+        /* The stored bitmaps have been OCR'ed; the results can now be looked up */
+        pdev->OCRStage = OCR_UnicodeAvailable;
+    }
+
+    if(pdev->OCRStage == OCR_UnicodeAvailable) {
+        /* We've OCR'ed the bitmaps already, find the unicode value */
+        ocr_glyph_t *new_glyph = (ocr_glyph_t *)pdev->ocr_glyphs;
+        int ocr_index = 0;
+        uint mask = 0xFF;
+        int ix;
+        char *u;
+
+        /* Find the bitmap which matches the character/glyph we are processing */
+        while(new_glyph) {
+            if(new_glyph->char_code == ch || new_glyph->glyph == glyph) {
+                ocr_glyph_t *g1 = pdev->ocr_glyphs;
+
+                /* Spaces are handled specially, so just jump out now */
+                if(new_glyph->is_space)
+                    break;
+
+                /* Otherwise, find all the bitmaps which lie to the left of the
+                 * one we found (we are assuming for now that the returned
+                 * Unicode values are left to right)
+                 */
+                while(g1) {
+                    if(!g1->is_space && g1->x < new_glyph->x)
+                        ocr_index++;
+                    g1 = g1->next;
+                }
+                break;
+            }
+            new_glyph = new_glyph->next;
+        }
+
+        /* If we found a matching bitmap, get the corresponding unicode code point from
+         * the stored values returned by the OCR engine.
+         */
+        if(new_glyph) {
+            *unicode = (byte *)gs_alloc_bytes(pdev->memory, 2 * sizeof(ushort), "temporary Unicode array");
+            if(*unicode == NULL)
+                return_error(gs_error_VMerror);
+            u = (char *)(*unicode);
+            if(new_glyph->is_space) {
+                memset(u, 0x00, 3);
+                u[3] = 0x20;
+            }
+            else {
+                for(ix = 0;ix < 4;ix++) {
+                    u[3 - ix] = (pdev->OCRUnicode[ocr_index] & mask) >> (8 * ix);
+                    mask = mask << 8;
+                }
+            }
+            *length = 4;
+        }
+    }
+ #endif
+    return 0;
+}
+
 /*
  * Add char code pair to ToUnicode CMap,
  * creating the CMap on neccessity.
@@ -87,27 +298,43 @@ pdf_process_string_aux(pdf_text_enum_t *penum, gs_string *pstr,
 int
 pdf_add_ToUnicode(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_t *pdfont,
                   gs_glyph glyph, gs_char ch, const gs_const_string *gnstr)
-{   int code;
-    gs_char length;
+{   int code = 0;
+    gs_char length = 0;
     ushort *unicode = 0;
 
     if (glyph == GS_NO_GLYPH)
         return 0;
-    length = font->procs.decode_glyph((gs_font *)font, glyph, ch, NULL, 0);
-    if ((length == 0 || length == GS_NO_CHAR) && gnstr != NULL && gnstr->size == 7) {
-        if (!memcmp(gnstr->data, "uni", 3)) {
-            static const char *hexdigits = "0123456789ABCDEF";
-            char *d0 = strchr(hexdigits, gnstr->data[3]);
-            char *d1 = strchr(hexdigits, gnstr->data[4]);
-            char *d2 = strchr(hexdigits, gnstr->data[5]);
-            char *d3 = strchr(hexdigits, gnstr->data[6]);
-
-            unicode = (ushort *)gs_alloc_bytes(pdev->memory, sizeof(ushort), "temporary Unicode array");
-            if (d0 != NULL && d1 != NULL && d2 != NULL && d3 != NULL) {
-                char *u = (char *)unicode;
-                u[0] = ((d0 - hexdigits) << 4) + ((d1 - hexdigits));
-                u[1] = ((d2 - hexdigits) << 4) + ((d3 - hexdigits));
-                length = 2;
+    if(pdev->UseOCR == UseOCRAlways) {
+        code = OCRText(pdev, glyph, ch, &length, (byte **)&unicode);
+        if(code < 0)
+            return code;
+    }
+    else {
+        length = font->procs.decode_glyph((gs_font *)font, glyph, ch, NULL, 0);
+        if(length == 0 || length == GS_NO_CHAR) {
+            if(gnstr != NULL && gnstr->size == 7) {
+                if(!memcmp(gnstr->data, "uni", 3)) {
+                    static const 
char *hexdigits = "0123456789ABCDEF"; + char *d0 = strchr(hexdigits, gnstr->data[3]); + char *d1 = strchr(hexdigits, gnstr->data[4]); + char *d2 = strchr(hexdigits, gnstr->data[5]); + char *d3 = strchr(hexdigits, gnstr->data[6]); + + unicode = (ushort *)gs_alloc_bytes(pdev->memory, sizeof(ushort), "temporary Unicode array"); + if(d0 != NULL && d1 != NULL && d2 != NULL && d3 != NULL) { + char *u = (char *)unicode; + u[0] = ((d0 - hexdigits) << 4) + ((d1 - hexdigits)); + u[1] = ((d2 - hexdigits) << 4) + ((d3 - hexdigits)); + length = 2; + } + } + } + else { + if(pdev->UseOCR != UseOCRNever) { + code = OCRText(pdev, glyph, ch, &length, (byte **)&unicode); + if(code < 0) + return code; + } } } } @@ -163,6 +390,7 @@ pdf_add_ToUnicode(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_t *pdfon if (length > 2 && pdfont->u.simple.Encoding != NULL) pdfont->TwoByteToUnicode = 0; } + if (unicode) gs_free_object(pdev->memory, unicode, "temporary Unicode array"); return 0; @@ -255,8 +483,11 @@ pdf_encode_string_element(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_ pet = &pdfont->u.simple.Encoding[ch]; glyph = (gdata == NULL ? font->procs.encode_char(font, ch, GLYPH_SPACE_NAME) : *gdata); - if (glyph == GS_NO_GLYPH || glyph == pet->glyph) + if (glyph == GS_NO_GLYPH || glyph == pet->glyph) { + if((pdfont->cmap_ToUnicode == NULL || !gs_cmap_ToUnicode_check_pair(pdfont->cmap_ToUnicode, ch)) && pdev->UseOCR != UseOCRNever) + (void)pdf_add_ToUnicode(pdev, font, pdfont, glyph, ch, &gnstr); return 0; + } if (pet->glyph != GS_NO_GLYPH) { /* encoding conflict */ return_error(gs_error_rangecheck); /* Must not happen because pdf_obtain_font_resource @@ -358,7 +589,7 @@ pdf_encode_string_element(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_ * The decision about writing it out is deferred until pdf_write_font_resource. 
*/ code = pdf_add_ToUnicode(pdev, font, pdfont, glyph, ch, &gnstr); - if (code < 0) + if(code < 0) return code; pet->glyph = glyph; pet->str = gnstr; @@ -1035,6 +1266,7 @@ process_text_return_width(const pdf_text_enum_t *pte, gs_font_base *font, { const gs_glyph *gdata_i = (gdata != NULL ? gdata + i : 0); code = pdf_encode_string_element(pdev, (gs_font *)font, pdfont, ch, gdata_i); + if (code < 0) return code; } diff --git a/devices/vector/gdevpdtt.c b/devices/vector/gdevpdtt.c index 6a89fedf7..e4c1d1471 100644 --- a/devices/vector/gdevpdtt.c +++ b/devices/vector/gdevpdtt.c @@ -299,13 +299,29 @@ static void pdf_text_release(gs_text_enum_t *pte, client_name_t cname) { pdf_text_enum_t *const penum = (pdf_text_enum_t *)pte; + gx_device_pdf *pdev = (gx_device_pdf *)penum->dev; + ocr_glyph_t *next; if (penum->pte_default) { gs_text_release(NULL, penum->pte_default, cname); penum->pte_default = 0; } pdf_text_release_cgp(penum); + + while (pdev->ocr_glyphs != NULL) + { + next = pdev->ocr_glyphs->next; + + gs_free_object(pdev->memory, pdev->ocr_glyphs->data, "free bitmap"); + gs_free_object(pdev->memory, pdev->ocr_glyphs, "free bitmap"); + pdev->ocr_glyphs = next; + } + if (pdev->OCRUnicode != NULL) + gs_free_object(pdev->memory, pdev->OCRUnicode, "free returned unicodes"); + pdev->OCRUnicode = NULL; + gx_default_text_release(pte, cname); + pdev->OCRStage = 0; } void pdf_text_release_cgp(pdf_text_enum_t *penum) @@ -3152,6 +3168,57 @@ static int pdf_query_purge_cached_char(const gs_memory_t *mem, cached_char *cc, return 0; } +static int ProcessTextForOCR(gs_text_enum_t *pte) +{ + pdf_text_enum_t *const penum = (pdf_text_enum_t *)pte; + gx_device_pdf *pdev = (gx_device_pdf *)penum->dev; + gs_text_enum_t *pte_default; + int code; + + if (pdev->OCRStage == OCR_UnInit) { + gs_gsave(pte->pgs); + pdev->OCRSaved = (gs_text_enum_t*)gs_alloc_bytes(pdev->memory,sizeof(gs_text_enum_t),"saved enumerator for OCR"); + if(pdev->OCRSaved == NULL) + return_error(gs_error_VMerror); + 
*(pdev->OCRSaved) = *pte; + gs_text_enum_copy_dynamic(pdev->OCRSaved,pte,true); + + code = pdf_default_text_begin(pte, &pte->text, &pte_default); + if (code < 0) + return code; + penum->pte_default = pte_default; + gs_text_enum_copy_dynamic(pte_default, pte, false); + pdev->OCRStage = OCR_Rendering; + } + + if (pdev->OCRStage == OCR_Rendering) { + penum->pte_default->can_cache = 0; + code = gs_text_process(penum->pte_default); + pdev->OCR_char_code = penum->pte_default->returned.current_char; + pdev->OCR_glyph = penum->pte_default->returned.current_glyph; + gs_text_enum_copy_dynamic(pte, penum->pte_default, true); + if (code == TEXT_PROCESS_RENDER) + return code; + if (code != 0) { + gs_free_object(pdev->memory, pdev->OCRSaved,"saved enumerator for OCR"); + pdev->OCRSaved = NULL; + gs_grestore(pte->pgs); + gs_text_release(pte->pgs, penum->pte_default, "pdf_text_process"); + penum->pte_default = NULL; + return code; + } + gs_grestore(pte->pgs); + *pte = *(pdev->OCRSaved); + gs_text_enum_copy_dynamic(pte, pdev->OCRSaved, true); + gs_free_object(pdev->memory, pdev->OCRSaved,"saved enumerator for OCR"); + pdev->OCRSaved = NULL; + gs_text_release(pte->pgs, penum->pte_default, "pdf_text_process"); + penum->pte_default = NULL; + pdev->OCRStage = OCR_Rendered; + } + return 0; +} + /* * Continue processing text. This is the 'process' procedure in the text * enumerator. 
Per the check in pdf_text_begin, we know the operation is @@ -3207,6 +3274,12 @@ pdf_text_process(gs_text_enum_t *pte) goto default_impl; } + if (pdev->UseOCR != UseOCRNever) { + code = ProcessTextForOCR(pte); + if (code != 0) + return code; + } + code = -1; /* to force default implementation */ /* @@ -3547,6 +3620,7 @@ pdf_text_process(gs_text_enum_t *pte) } gs_text_enum_copy_dynamic(pte, pte_default, true); + if (code) return code; gs_text_release(NULL, pte_default, "pdf_text_process"); diff --git a/doc/Devices.htm b/doc/Devices.htm index 91994c82d..87e015f2e 100644 --- a/doc/Devices.htm +++ b/doc/Devices.htm @@ -76,6 +76,7 @@ <ul> <li><a href="#OCR">OCR text output</a></li> <li><a href="#PDFocr">Bitmap PDF output (with OCR text)</a></li> +<li><a href="#PDFwriteocr">Vector PDF output (with OCR Unicode CMaps)</a></li> </ul> <li><a href="#High-level">High level formats</a></li> <ul> @@ -1059,6 +1060,37 @@ resolution independence, and editability.</p> </p> <p> +<h3><a name="PDFwriteocr"></a>Vector PDF output (with OCR Unicode CMaps)</h3> +<p> +The pdfwrite device has been augmented to use the OCR engine to analyse text +(not images!) in the input stream, and derive Unicode code points for it. +That information can then be used to create ToUnicode CMaps which are attached +to the Font (or CIDFont) objects embedded in the PDF file. +</p> +<p> +Fonts which have ToUnicode CMaps can be reliably (limited by the accuracy of +the CMap) used in search and copy/paste functions, as well as text extraction +from PDF files. Note that OCR is not a 100% perfect process; it is possible +that some text might be misidentified. +</p> +<p> +OCR is a slow operation! In addition it can (for Latin text at least) sometimes +be preferable not to add ToUnicode information which may be incorrect, but instead +to use the existing font Encoding. For English text this may give better results. 
+</p> +<p>For these reasons the OCR functionality of pdfwrite can be controlled by using a new +parameter <code>-sUseOCR</code>. This has three possible values: +</p> +<dt><code>-sUseOCR=</code><b><em>string</em></b></dt> +<dd> + <dl> + <dt>Never<dd>Default - don't use OCR at all even if support is built-in. + <dt>AsNeeded<dd>If there is no existing ToUnicode information, use OCR. + <dt>Always<dd>Ignore any existing information and always use OCR. + </dl> +</dd> +</p> + <hr> <h2><a name="High-level"></a>High-level devices</h2> diff --git a/doc/VectorDevices.htm b/doc/VectorDevices.htm index f188e44df..2e9e2dcde 100644 --- a/doc/VectorDevices.htm +++ b/doc/VectorDevices.htm @@ -986,6 +986,59 @@ displaying document's properties, so we recommend this value. </dl> +<dl> +<dt><code>-sUseOCR=</code><em>string</em> +<dd>Controls the use of OCR in pdfwrite. If enabled this will use an OCR +engine to analyse the glyph bitmaps used to draw text in a PDF file, and +the resulting Unicode code points are then used to construct a ToUnicode +CMap. +<p> +PDF files containing ToUnicode CMaps can be searched, use copy/paste and +extract the text, subject to the accuracy of the ToUnicode CMap. Since not all +PDF files contain these it can be beneficial to create them. +</p> +<p> +Note that, for English text, it is possible that the existing standard character +encoding (which most PDF consumers will fall back to in the absence of Unicode +information) is better than using OCR, as OCR is not a 100% reliable process. +OCR processing is also comparatively slow. +</p> +<p> +For the reasons above it is useful to be able to exercise some control over the +action of pdfwrite when OCR processing is available, and the <code>UseOCR</code> +parameter provides that control. There are three possible values: +</p> +<li><code>Never</code> Default - don't use OCR at all even if support is built-in. +<li><code>AsNeeded</code> If there is no existing ToUnicode information, use OCR. 
+<li><code>Always</code> Ignore any existing information and always use OCR. +<p> +Our experimentation with the Tesseract OCR engine has shown that the more text we +can supply for the engine to look at, the better the result we get. We are, unfortunately, +limited to the graphics library operations for text as follows. +</p> +<p> +The code works on text 'fragments'; these are the text sequences sent to the text +operators of the source language. Generally most input languages will try to send +text in its simplest form, e.g. "Abc", but the requirements of justification, kerning +and so on mean that sometimes each character is positioned independently on the page. +</p> +<p> +So pdfwrite renders all the bitmaps for every character in the text document, when +set up to use OCR. Later, if any character in the font does not have a Unicode +value already we use the bitmaps to assemble a 'strip' of text which we then send +to the OCR engine. If the engine returns a different number of recognised characters +than we expected then we ignore that result. We've found that (for English text) +constructions such as ". T" tend to ignore the full stop, presumably because the OCR +engine thinks that it is simply noise. In contrast "text." does identify the full +stop correctly. So by ignoring the failed result we can get a better result later. +</p> +<p> +Obviously this is all heuristic and undoubtedly there is more we can do to improve the +functionality here, but we need concrete examples to work from. +</p> +</dd> +</dt> + <h3><a name="PS"></a>PostScript file output</h3> <p> The <code>ps2write</code> device handles the same set of distiller |