summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--base/gsdparam.c2
-rw-r--r--base/gsfcmap.c13
-rw-r--r--base/gsfcmap.h2
-rw-r--r--base/gsparam.h4
-rw-r--r--base/gsparaml.c28
-rw-r--r--base/tesseract.mak7
-rw-r--r--base/tessocr.cpp205
-rw-r--r--base/tessocr.h10
-rw-r--r--devices/devs.mak2
-rw-r--r--devices/vector/gdevpdfb.c46
-rw-r--r--devices/vector/gdevpdfb.h11
-rw-r--r--devices/vector/gdevpdfp.c162
-rw-r--r--devices/vector/gdevpdfx.h37
-rw-r--r--devices/vector/gdevpdte.c270
-rw-r--r--devices/vector/gdevpdtt.c74
-rw-r--r--doc/Devices.htm32
-rw-r--r--doc/VectorDevices.htm53
17 files changed, 933 insertions, 25 deletions
diff --git a/base/gsdparam.c b/base/gsdparam.c
index 3af8e856f..36729692c 100644
--- a/base/gsdparam.c
+++ b/base/gsdparam.c
@@ -1010,6 +1010,8 @@ gs_putdeviceparams(gx_device * dev, gs_param_list * plist)
bool was_open = dev->is_open;
int code;
+ /* gs_param_list_dump(plist); */
+
gx_device_set_procs(dev);
fill_dev_proc(dev, put_params, gx_default_put_params);
fill_dev_proc(dev, get_alpha_bits, gx_default_get_alpha_bits);
diff --git a/base/gsfcmap.c b/base/gsfcmap.c
index 32a32fdff..3040d390f 100644
--- a/base/gsfcmap.c
+++ b/base/gsfcmap.c
@@ -643,6 +643,19 @@ gs_cmap_ToUnicode_realloc(gs_memory_t *mem, int new_value_size, gs_cmap_t **ppcm
return 0;
}
+/*
+ * Report whether a ToUnicode CMap already holds a value for a code.
+ *
+ * pcmap must be a gs_cmap_ToUnicode_t. Each code owns a record of
+ * (value_size + 2) bytes in glyph_name_data; the record is treated as
+ * empty when its first two bytes are both zero (presumably the length
+ * prefix written by gs_cmap_ToUnicode_add_pair - confirm).
+ *
+ * Returns 1 if an entry is present, 0 if the record is empty or code0
+ * is outside the range of the map.
+ */
+int gs_cmap_ToUnicode_check_pair(gs_cmap_t *pcmap, int code0)
+{
+    gs_cmap_ToUnicode_t *cmap = (gs_cmap_ToUnicode_t *)pcmap;
+    uchar *map = pcmap->glyph_name_data;
+    int entry;
+
+    /* Out of range codes have no record to inspect: report "no entry"
+     * rather than falling off the function without a return value. */
+    if (code0 < 0 || code0 >= cmap->num_codes)
+        return 0; /* must not happen. */
+    entry = code0 * (cmap->value_size + 2);
+    if (map[entry] == 0 && map[entry + 1] == 0)
+        return 0;
+    return 1;
+}
+
/*
* Write a code pair to ToUnicode CMap.
*/
diff --git a/base/gsfcmap.h b/base/gsfcmap.h
index 73516c3e2..2507865cc 100644
--- a/base/gsfcmap.h
+++ b/base/gsfcmap.h
@@ -66,4 +66,6 @@ int gs_cmap_ToUnicode_realloc(gs_memory_t *mem, int new_value_size, gs_cmap_t **
*/
void gs_cmap_ToUnicode_add_pair(gs_cmap_t *pcmap, int code0, ushort *unicode, unsigned int length);
+int gs_cmap_ToUnicode_check_pair(gs_cmap_t *pcmap, int code0);
+
#endif /* gsfcmap_INCLUDED */
diff --git a/base/gsparam.h b/base/gsparam.h
index 3b0aaa21d..456a8a532 100644
--- a/base/gsparam.h
+++ b/base/gsparam.h
@@ -577,4 +577,8 @@ int gs_param_list_add_parsed_value(gs_param_list *plist, gs_param_name key, cons
* address pointed to be len. */
int gs_param_list_to_string(gs_param_list *plist, gs_param_name key, char *value, int *len);
+/* Debug function to dump a list of params. Do NOT use in production
+ * code! */
+int gs_param_list_dump(gs_param_list *plist);
+
#endif /* gsparam_INCLUDED */
diff --git a/base/gsparaml.c b/base/gsparaml.c
index d7e5fcdbf..3d7e49b14 100644
--- a/base/gsparaml.c
+++ b/base/gsparaml.c
@@ -1046,3 +1046,31 @@ int gs_param_list_to_string(gs_param_list *plist, gs_param_name key, char *value
*value = 0;
return to_string(plist, key, &out);
}
+
+/* Debug helper: print every key of plist, followed by its value rendered
+ * as a string, on a single line of debug output. Returns the code that
+ * ended the enumeration (non-zero from param_get_next_key, or a negative
+ * error from the key length check or the string conversion). */
+int gs_param_list_dump(gs_param_list *plist)
+{
+    gs_param_enumerator_t iter;
+    gs_param_key_t key;
+    char name[256]; /* big enough for any reasonable key */
+    char text[4096];
+    int text_len;
+    int code;
+
+    param_init_enumerator(&iter);
+    for (;;) {
+        code = param_get_next_key(plist, &iter, &key);
+        if (code != 0)
+            break;
+
+        /* Keys are not NUL terminated; make a terminated copy. */
+        if (key.size >= sizeof(name)) {
+            code = gs_note_error(gs_error_rangecheck);
+            break;
+        }
+        memcpy(name, key.data, key.size);
+        name[key.size] = 0;
+        dlprintf1("%s ", name);
+
+        code = gs_param_list_to_string(plist, name, text, &text_len);
+        if (code < 0)
+            break;
+        dlprintf1("%s ", text);
+    }
+    dlprintf("\n");
+    return code;
+}
diff --git a/base/tesseract.mak b/base/tesseract.mak
index c2bc1fb5a..43e234e75 100644
--- a/base/tesseract.mak
+++ b/base/tesseract.mak
@@ -24,7 +24,8 @@ TESSINCLUDES=\
# add -DDISABLED_LEGACY_ENGINE to TESSCXX
# empty TESSERACT_LEGACY
-TESSCXX = $(CXX) $(TESSINCLUDES) $(TESSCXXFLAGS) $(CCFLAGS) -DTESSERACT_IMAGEDATA_AS_PIX -DTESSERACT_DISABLE_DEBUG_FONTS -DGRAPHICS_DISABLED -DDISABLED_LEGACY_ENGINE
+TESSCXX = $(CXX) $(TESSINCLUDES) $(TESSCXXFLAGS) $(CCFLAGS) -DTESSERACT_IMAGEDATA_AS_PIX -DTESSERACT_DISABLE_DEBUG_FONTS -DGRAPHICS_DISABLED
+#-DDISABLED_LEGACY_ENGINE
TESSOBJ = $(GLOBJDIR)$(D)tesseract_
TESSO_ = $(O_)$(TESSOBJ)
@@ -1161,8 +1162,8 @@ TESSERACT_LEGACY_OBJS=\
$(TESSOBJ)wordrec_wordclass.$(OBJ)
-#TESSERACT_LEGACY=$(TESSERACT_LEGACY_OBJS)
-TESSERACT_LEGACY=
+TESSERACT_LEGACY=$(TESSERACT_LEGACY_OBJS)
+#TESSERACT_LEGACY=
TESS_ROMFS_ARGS=\
-c -P $(GLSRCDIR)$(D)..$(D) tessdata$(D)*
diff --git a/base/tessocr.cpp b/base/tessocr.cpp
index 8ce19a14e..f8161ac3c 100644
--- a/base/tessocr.cpp
+++ b/base/tessocr.cpp
@@ -504,6 +504,211 @@ ocr_recognise(void *api_, int w, int h, void *data,
return code;
}
+/*
+ * Build an 8 bit per pixel Leptonica image from a 1 bit per pixel bitmap
+ * and hand it to Tesseract with api->SetImage().
+ *
+ *  w, h      size of the source bitmap in pixels
+ *  data      source bits, MSB first, 'raster' bytes per row
+ *  data_x    bit offset of the first pixel within each source row
+ *  xres,yres resolution recorded in the image
+ *
+ * Set source bits become 0 and clear bits become 255 in the image. The
+ * source rows are read from the last row to the first, so the image is
+ * vertically flipped relative to 'data'. The pixel buffer is allocated
+ * from leptonica_mem; release the result with ocr_clear_bitmap().
+ *
+ * Returns the image, or NULL if an allocation failed.
+ */
+static Pix *
+ocr_set_bitmap(tesseract::TessBaseAPI *api,
+               int w, int h,
+               const unsigned char *data, int data_x, int raster,
+               int xres, int yres)
+{
+    /* Tesseract prefers a border around things, so we add an 8 pixel
+     * border all around. */
+#define BORDER_SIZE 8
+    /* Row width in bytes, rounded up to a multiple of 4. */
+    int r = (w+BORDER_SIZE*2+3)&~3;
+    Pix *image = pixCreateHeader(r, h+BORDER_SIZE*2, 8);
+    unsigned char *pdata, *d;
+    const unsigned char *s;
+    int x, y;
+    const int bit_offset = data_x & 7;
+
+    if (image == NULL)
+        return NULL;
+
+    pdata = gs_alloc_bytes(leptonica_mem, r * (h+BORDER_SIZE*2), "ocr_set_bitmap");
+    if (pdata == NULL) {
+        pixDestroy(&image);
+        return NULL;
+    }
+    pixSetData(image, (l_uint32 *)pdata);
+    pixSetPadBits(image, 1);
+    pixSetXRes(image, xres);
+    pixSetYRes(image, yres);
+
+    /* Start at the last source row and walk backwards. */
+    s = &data[data_x>>3] + raster*(h-1);
+    d = pdata;
+    /* Everything, including the border, starts out as 255. */
+    memset(d, 255, r * (h+BORDER_SIZE*2));
+    d += r*BORDER_SIZE + BORDER_SIZE;
+    for (y = 0; y < h; y++) {
+        for (x = 0; x < w; x++) {
+            /* Position of this pixel counted in bits from the start of
+             * s[0]. The byte index must advance whenever the bit mask
+             * wraps, which is not at multiples of 8 in x when data_x is
+             * not byte aligned. */
+            int sx = x + bit_offset;
+
+            /* NOTE(review): x^3 reverses the byte order inside each
+             * group of four; presumably this matches Leptonica's 32 bit
+             * word storage on little-endian hosts - confirm. */
+            if (s[sx>>3] & (128 >> (sx & 7)))
+                d[x^3] = 0;
+            else
+                d[x^3] = 255;
+        }
+        s -= raster;
+        d += r;
+    }
+
+    api->SetImage(image);
+//    pixWrite("test.pnm", image, IFF_PNM);
+
+    return image;
+}
+
+/* Release an image created by ocr_set_bitmap(): free the pixel buffer
+ * through leptonica_mem, detach it from the Pix by setting the data
+ * pointer to NULL, then destroy the Pix itself. */
+static void
+ocr_clear_bitmap(Pix *image)
+{
+ gs_free_object(leptonica_mem, pixGetData(image), "ocr_clear_bitmap");
+ pixSetData(image, NULL);
+ pixDestroy(&image);
+}
+
+/*
+ * Decode the UTF-8 sequence at the start of s into a code point.
+ *
+ * The length of the sequence is taken from the lead byte (1 to 4 bytes).
+ * Decoding stops early if the terminating NUL is reached, so a truncated
+ * sequence never causes a read beyond the end of the string. No further
+ * validation is performed.
+ */
+static int
+ocr_decode_utf8_symbol(const unsigned char *s)
+{
+    int len, i, cp;
+
+    if (s[0] < 0x80)
+        return s[0];
+
+    if (s[0] < 0xE0) {
+        len = 2;
+        cp = s[0] & 0x1f;
+    } else if (s[0] < 0xF0) {
+        len = 3;
+        cp = s[0] & 0x0f;
+    } else {
+        len = 4;
+        cp = s[0] & 0x07;
+    }
+    /* Each continuation byte contributes its low 6 bits. */
+    for (i = 1; i < len && s[i] != 0; i++)
+        cp = (cp << 6) | (s[i] & 0x3f);
+    return cp;
+}
+
+/*
+ * Run OCR over a 1 bit per pixel bitmap and return one Unicode code point
+ * per recognised symbol.
+ *
+ *  state       the OCR engine (a tesseract::TessBaseAPI *); if NULL the
+ *              function does nothing and returns 0
+ *  data..yres  the bitmap, as for ocr_set_bitmap()
+ *  unicode     receives the code points
+ *  char_count  on entry the capacity of 'unicode'; on return the number
+ *              of symbols recognised. Symbols beyond the capacity are
+ *              counted but not stored, so a returned value larger than
+ *              the capacity tells the caller that more symbols were
+ *              found than expected.
+ *
+ * Returns the (non-negative) result of Recognize() on success or a
+ * negative error code.
+ */
+int ocr_bitmap_to_unicodes(void *state,
+                           const void *data, int data_x,
+                           int w, int h, int raster,
+                           int xres, int yres, int *unicode, int *char_count)
+{
+    tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)state;
+    Pix *image;
+    int code, max_chars = *char_count, count = 0;
+
+    if (api == NULL)
+        return 0;
+
+    image = ocr_set_bitmap(api, w, h, (const unsigned char *)data,
+                           data_x, raster, xres, yres);
+    if (image == NULL)
+        return_error(gs_error_VMerror);
+
+    code = api->Recognize(NULL);
+    if (code >= 0) {
+        /* Bingo! */
+        tesseract::ResultIterator *res_it = api->GetIterator();
+
+        /* GetIterator() may return NULL when there are no results. */
+        while (res_it != NULL && !res_it->Empty(tesseract::RIL_BLOCK)) {
+            if (res_it->Empty(tesseract::RIL_WORD)) {
+                res_it->Next(tesseract::RIL_WORD);
+                continue;
+            }
+
+            do {
+#if FUTURE_DEVELOPMENT
+                int word_bbox[4];
+                int char_bbox[4];
+                int line_bbox[4];
+#endif
+                char *graph = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
+
+                if (graph && graph[0] != 0) {
+                    /* Never write beyond the caller's buffer, but keep
+                     * counting so that an overrun can be detected. */
+                    if (count < max_chars)
+                        unicode[count] = ocr_decode_utf8_symbol((const unsigned char *)graph);
+                    count++;
+#if FUTURE_DEVELOPMENT
+                    res_it->BoundingBox(tesseract::RIL_TEXTLINE,
+                                        line_bbox,line_bbox + 1,
+                                        line_bbox + 2,line_bbox + 3);
+                    res_it->BoundingBox(tesseract::RIL_WORD,
+                                        word_bbox,word_bbox + 1,
+                                        word_bbox + 2,word_bbox + 3);
+                    res_it->BoundingBox(tesseract::RIL_SYMBOL,
+                                        char_bbox,char_bbox + 1,
+                                        char_bbox + 2,char_bbox + 3);
+#endif
+                }
+                /* The text returned by GetUTF8Text belongs to us. */
+                delete[] graph;
+                res_it->Next(tesseract::RIL_SYMBOL);
+            } while (!res_it->Empty(tesseract::RIL_BLOCK) &&
+                     !res_it->IsAtBeginningOf(tesseract::RIL_WORD));
+        }
+        delete res_it;
+    }
+
+    ocr_clear_bitmap(image);
+    *char_count = count;
+
+    return code;
+}
+
+/*
+ * Run OCR over a 1 bit per pixel bitmap that is expected to hold a single
+ * glyph and return its Unicode code point.
+ *
+ * Arguments are as for ocr_bitmap_to_unicodes(), except that only one
+ * value is returned: if several symbols are recognised, *unicode holds
+ * the last one; if none are recognised *unicode is left untouched.
+ *
+ * Returns the (non-negative) result of Recognize() on success or a
+ * negative error code; returns 0 without doing anything if state is NULL.
+ */
+int ocr_bitmap_to_unicode(void *state,
+                          const void *data, int data_x,
+                          int w, int h, int raster,
+                          int xres, int yres, int *unicode)
+{
+    tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)state;
+    Pix *image;
+    int code;
+
+    if (api == NULL)
+        return 0;
+
+    image = ocr_set_bitmap(api, w, h, (const unsigned char *)data,
+                           data_x, raster, xres, yres);
+    if (image == NULL)
+        return_error(gs_error_VMerror);
+
+    code = api->Recognize(NULL);
+    if (code >= 0) {
+        /* Bingo! */
+        tesseract::ResultIterator *res_it = api->GetIterator();
+
+        /* GetIterator() may return NULL when there are no results. */
+        while (res_it != NULL && !res_it->Empty(tesseract::RIL_BLOCK)) {
+            if (res_it->Empty(tesseract::RIL_WORD)) {
+                res_it->Next(tesseract::RIL_WORD);
+                continue;
+            }
+
+            do {
+                char *graph = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
+
+                if (graph && graph[0] != 0)
+                    *unicode = ocr_decode_utf8_symbol((const unsigned char *)graph);
+                /* The text returned by GetUTF8Text belongs to us. */
+                delete[] graph;
+                res_it->Next(tesseract::RIL_SYMBOL);
+            } while (!res_it->Empty(tesseract::RIL_BLOCK) &&
+                     !res_it->IsAtBeginningOf(tesseract::RIL_WORD));
+        }
+        delete res_it;
+    }
+
+    ocr_clear_bitmap(image);
+
+    return code;
+}
+
+
};
/* Currently tesseract is the only C++ lib we have.
diff --git a/base/tessocr.h b/base/tessocr.h
index 78c30a0d4..8beaa8915 100644
--- a/base/tessocr.h
+++ b/base/tessocr.h
@@ -50,5 +50,15 @@ int ocr_recognise(void *api_, int w, int h, void *data,
int (*callback)(void *, const char *, const int *, const int *, const int *, int),
void *arg);
+/* OCR a 1 bit per pixel bitmap (w x h pixels, 'raster' bytes per row,
+ * first pixel at bit offset data_x) and return the recognised symbols as
+ * Unicode code points in 'unicode'. On entry *char_count is the number
+ * of entries available in 'unicode'; on return it holds the number of
+ * symbols found. 'state' is the OCR engine instance; if it is NULL the
+ * call does nothing and returns 0. */
+int ocr_bitmap_to_unicodes(void* state,
+ const void* data,int data_x,
+ int w,int h,int raster,
+ int xres,int yres,int* unicode, int* char_count);
+
+/* As ocr_bitmap_to_unicodes, but returns a single code point in
+ * *unicode. */
+int ocr_bitmap_to_unicode(void *state,
+ const void *data, int data_x,
+ int w, int h, int raster,
+ int xres, int yres, int *unicode);
+
#endif
diff --git a/devices/devs.mak b/devices/devs.mak
index a5f3750fa..222c02708 100644
--- a/devices/devs.mak
+++ b/devices/devs.mak
@@ -24,7 +24,7 @@ DEVVECSRC=$(DEVVEC)$(D)
DEVI_=$(DEVGENDIR) $(II)$(GLSRCDIR) $(II)$(GLGENDIR) $(II)$(DEVSRCDIR)
DEVF_=
-DEVCCFLAGS=$(I_)$(DEVI_)$(_I) $(I_)$(DEVVEC)$(_I) $(DEVF_)
+DEVCCFLAGS=$(I_)$(DEVI_)$(_I) $(I_)$(DEVVEC)$(_I) $(D_)OCR_VERSION=$(OCR_VERSION)$(_D) $(DEVF_)
DEVCC=$(CC_) $(DEVCCFLAGS)
XPSDEVCC=$(CC_) $(XPSPRINTCFLAGS) $(DEVCCFLAGS)
diff --git a/devices/vector/gdevpdfb.c b/devices/vector/gdevpdfb.c
index 203390384..059c79c32 100644
--- a/devices/vector/gdevpdfb.c
+++ b/devices/vector/gdevpdfb.c
@@ -496,7 +496,7 @@ gdev_pdf_copy_color(gx_device * dev, const byte * base, int sourcex,
/* Fill a mask. */
int
gdev_pdf_fill_mask(gx_device * dev,
- const byte * data, int data_x, int raster, gx_bitmap_id id,
+ const byte * data, int data_x, int raster, gx_bitmap_id id,
int x, int y, int width, int height,
const gx_drawing_color * pdcolor, int depth,
gs_logical_operation_t lop, const gx_clip_path * pcpath)
@@ -505,6 +505,50 @@ gdev_pdf_fill_mask(gx_device * dev,
if (width <= 0 || height <= 0)
return 0;
+
+    /* If OCRStage is 'OCR_Rendering' then we are handling an image which is a rendered glyph
+     * that we want to have OCR software process and return a Unicode code point for.
+     * We specifically do *not* want to send the image to the output PDF file!
+     * Instead a copy of the bitmap, its position and the character code / glyph
+     * currently being processed are appended to the pdev->ocr_glyphs list.
+     */
+    if (pdev->OCRStage == OCR_Rendering) {
+        ocr_glyph_t *new_glyph = NULL;
+        int index;
+
+        new_glyph = (ocr_glyph_t *)gs_alloc_bytes(pdev->pdf_memory, sizeof(ocr_glyph_t), "");
+        if (new_glyph == NULL)
+            return_error(gs_error_VMerror);
+        new_glyph->data = gs_alloc_bytes(pdev->pdf_memory, raster*height, "");
+        if (new_glyph->data == NULL) {
+            /* Don't leak the record if we can't get memory for the bitmap. */
+            gs_free_object(pdev->pdf_memory, new_glyph, "");
+            return_error(gs_error_VMerror);
+        }
+        memcpy(new_glyph->data, data, raster * height);
+        new_glyph->height = height;
+        new_glyph->width = width;
+        new_glyph->raster = raster;
+        new_glyph->x = x;
+        new_glyph->y = y;
+        new_glyph->char_code = pdev->OCR_char_code;
+        new_glyph->glyph = pdev->OCR_glyph;
+        new_glyph->next = NULL;
+
+        /* A bitmap with no bits set at all is recorded as a space. */
+        new_glyph->is_space = true;
+        for (index = 0; index < height * raster; index++) {
+            if (data[index] != 0x00) {
+                new_glyph->is_space = false;
+                break;
+            }
+        }
+
+        /* Append to the end of the list, preserving rendering order. */
+        if (pdev->ocr_glyphs == NULL)
+            pdev->ocr_glyphs = new_glyph;
+        else {
+            ocr_glyph_t *next = pdev->ocr_glyphs;
+
+            while (next->next != NULL)
+                next = next->next;
+            next->next = new_glyph;
+        }
+        return 0;
+    }
+
if (depth > 1 || (!gx_dc_is_pure(pdcolor) != 0 && !(gx_dc_is_pattern1_color(pdcolor))))
return gx_default_fill_mask(dev, data, data_x, raster, id,
x, y, width, height, pdcolor, depth, lop,
diff --git a/devices/vector/gdevpdfb.h b/devices/vector/gdevpdfb.h
index d8d596959..b2808a3ae 100644
--- a/devices/vector/gdevpdfb.h
+++ b/devices/vector/gdevpdfb.h
@@ -128,6 +128,8 @@ const gx_device_pdf PDF_DEVICE_IDENT =
{0,0}, /* PDFXTrimBoxToMediaBoxOffset */
{0,0}, /* PDFXBleedBoxToTrimBoxOffset */
1 /* true */, /* PDFXSetBleedBoxToMediaBox */
+ "", /* ocr_language */
+ 0, /* ocr_engine */
1 /*true*/, /* ReAssignCharacters */
1 /*true*/, /* ReEncodeCharacters */
1, /* FirstObjectNumber */
@@ -300,7 +302,14 @@ const gx_device_pdf PDF_DEVICE_IDENT =
0, /* ExtensionMetadata */
0, /* PDFFormName */
0, /* PassThroughWriter */
- 1.0 /* UserUnit */
+ 1.0, /* UserUnit */
+ 0, /* UseOCR */
+ NULL, /* OCRSaved */
+ 0, /* OCRStage */
+ NULL, /* OCRUnicode */
+ 0, /* OCR_char_code */
+ 0, /* OCR_glyph */
+ NULL /* ocr_glyphs */
};
#else
diff --git a/devices/vector/gdevpdfp.c b/devices/vector/gdevpdfp.c
index 3a371aada..9cc63644b 100644
--- a/devices/vector/gdevpdfp.c
+++ b/devices/vector/gdevpdfp.c
@@ -255,6 +255,48 @@ gdev_pdf_get_param(gx_device *dev, char *Param, void *list)
return(param_write_null(plist, "DSC"));
}
}
+
+#if OCR_VERSION > 0
+    /* OCR parameters. An unset language is reported as "eng". */
+    if (strcmp(Param, "OCRLanguage") == 0) {
+        gs_param_string langstr;
+
+        if (pdev->ocr_language[0]) {
+            langstr.data = (const byte *)pdev->ocr_language;
+            langstr.size = strlen(pdev->ocr_language);
+        } else {
+            langstr.data = (const byte *)"eng";
+            langstr.size = 3;
+        }
+        langstr.persistent = false;
+        return param_write_string(plist, "OCRLanguage", &langstr);
+    }
+    if (strcmp(Param, "OCREngine") == 0)
+        return param_write_int(plist, "OCREngine", &pdev->ocr_engine);
+
+    if (strcmp(Param, "UseOCR") == 0) {
+        gs_param_string ocrstr;
+
+        /* Every enumerator needs the 'case' keyword: without it the name
+         * is just a goto label and ocrstr would be written out
+         * uninitialised. */
+        switch(pdev->UseOCR) {
+            case UseOCRAsNeeded:
+                ocrstr.data = (const byte *)"AsNeeded";
+                ocrstr.size = 8;
+                break;
+            case UseOCRAlways:
+                ocrstr.data = (const byte *)"Always";
+                ocrstr.size = 6;
+                break;
+            case UseOCRNever:
+            default:
+                ocrstr.data = (const byte *)"Never";
+                ocrstr.size = 5;
+                break;
+        }
+        ocrstr.persistent = false;
+        return param_write_string(plist, "UseOCR", &ocrstr);
+    }
+#endif
+
return gdev_psdf_get_param(dev, Param, list);
}
@@ -269,6 +311,49 @@ gdev_pdf_get_params(gx_device * dev, gs_param_list * plist)
int code;
int cdv = CoreDistVersion;
+#if OCR_VERSION > 0
+    gs_param_string langstr;
+
+    /* OCR parameters. An unset language is reported as "eng". */
+    if (pdev->ocr_language[0]) {
+        langstr.data = (const byte *)pdev->ocr_language;
+        langstr.size = strlen(pdev->ocr_language);
+    } else {
+        langstr.data = (const byte *)"eng";
+        langstr.size = 3;
+    }
+    langstr.persistent = false;
+
+    {
+        gs_param_string ocrstr;
+
+        /* Every enumerator needs the 'case' keyword: without it the name
+         * is just a goto label and ocrstr would be written out
+         * uninitialised. */
+        switch(pdev->UseOCR) {
+            case UseOCRAsNeeded:
+                ocrstr.data = (const byte *)"AsNeeded";
+                ocrstr.size = 8;
+                break;
+            case UseOCRAlways:
+                ocrstr.data = (const byte *)"Always";
+                ocrstr.size = 6;
+                break;
+            case UseOCRNever:
+            default:
+                ocrstr.data = (const byte *)"Never";
+                ocrstr.size = 5;
+                break;
+        }
+        ocrstr.persistent = false;
+        code = param_write_string(plist, "UseOCR", &ocrstr);
+        if(code < 0)
+            return code;
+    }
+    code = param_write_string(plist, "OCRLanguage", &langstr);
+    if(code < 0)
+        return code;
+    code = param_write_int(plist, "OCREngine", &pdev->ocr_engine);
+    if(code < 0)
+        return code;
+#endif
+
pdev->ParamCompatibilityLevel = cl;
code = gdev_psdf_get_params(dev, plist);
if (code < 0 ||
@@ -367,6 +452,83 @@ gdev_pdf_put_params_impl(gx_device * dev, const gx_device_pdf * save_dev, gs_par
}
}
+#if OCR_VERSION > 0
+    /* OCRLanguage: copied into the fixed size pdev->ocr_language buffer,
+     * silently truncated if too long, and always NUL terminated. */
+    {
+        int len;
+        gs_param_string langstr;
+        switch (code = param_read_string(plist, (param_name = "OCRLanguage"), &langstr)) {
+            case 0:
+                len = langstr.size;
+                if (len >= sizeof(pdev->ocr_language))
+                    len = sizeof(pdev->ocr_language)-1;
+                memcpy(pdev->ocr_language, langstr.data, len);
+                pdev->ocr_language[len] = 0;
+                break;
+            case 1:
+                break;
+            default:
+                ecode = code;
+                param_signal_error(plist, param_name, ecode);
+                break;
+        }
+    }
+
+    /* OCREngine */
+    {
+        int engine;
+        switch (code = param_read_int(plist, (param_name = "OCREngine"), &engine)) {
+            case 0:
+                pdev->ocr_engine = engine;
+                break;
+            case 1:
+                break;
+            default:
+                ecode = code;
+                param_signal_error(plist, param_name, ecode);
+                break;
+        }
+    }
+
+    /* UseOCR: one of "Never", "AsNeeded" or "Always". Any other value
+     * leaves the current setting unchanged.
+     * NOTE(review): unlike the two parameters above, a read error is
+     * signalled on the list but not recorded in ecode - confirm that
+     * this is intended. */
+    {
+        gs_param_string ocrstr;
+
+        code = param_read_string(plist, (param_name = "UseOCR"), &ocrstr);
+        switch(code) {
+            case 0:
+                if (ocrstr.size == 5 && memcmp(ocrstr.data, "Never", 5) == 0)
+                    pdev->UseOCR = UseOCRNever;
+                if (ocrstr.size == 8 && memcmp(ocrstr.data, "AsNeeded", 8) == 0)
+                    pdev->UseOCR = UseOCRAsNeeded;
+                if (ocrstr.size == 6 && memcmp(ocrstr.data, "Always", 6) == 0)
+                    pdev->UseOCR = UseOCRAlways;
+                break;
+            case 1:
+                break;
+            default:
+                param_signal_error(plist, param_name, code);
+                break;
+        }
+    }
+#endif
+
/*
* Check for LockDistillerParams before doing anything else.
* If LockDistillerParams is true and is not being set to false,
diff --git a/devices/vector/gdevpdfx.h b/devices/vector/gdevpdfx.h
index 16515fcdd..6123c652c 100644
--- a/devices/vector/gdevpdfx.h
+++ b/devices/vector/gdevpdfx.h
@@ -547,6 +547,33 @@ typedef enum {
pdf_compress_Flate
} pdf_compression_type;
+/* Stages of the state machine used when obtaining Unicode values by OCR. */
+typedef enum {
+ OCR_UnInit, /* Nothing done yet for the current text operation */
+ OCR_Rendering, /* Glyph bitmaps are being rendered; fill_mask captures them instead of writing images */
+ OCR_Rendered, /* All glyph bitmaps have been captured, OCR not yet run */
+ OCR_UnicodeAvailable, /* OCR has run and its results are stored in OCRUnicode */
+ OCR_Failed /* OCR failed, or returned an unexpected number of characters */
+} pdf_OCR_stage;
+
+/* Values of the UseOCR device parameter ("Never", "AsNeeded", "Always"). */
+typedef enum {
+ UseOCRNever,
+ UseOCRAsNeeded,
+ UseOCRAlways
+} pdf_OCR_usage;
+
+/* One captured glyph bitmap; records form a singly linked list. */
+typedef struct ocr_glyph_s{
+ byte *data; /* Copy of the bitmap, raster * height bytes */
+ int x; /* Position passed to fill_mask */
+ int y;
+ int width; /* Size of the bitmap */
+ int height;
+ int raster; /* Bytes per row of data */
+ void *next; /* Next ocr_glyph_t in the list, or NULL */
+ gs_char char_code; /* Character code being processed when the bitmap was captured */
+ gs_glyph glyph; /* Glyph being processed when the bitmap was captured */
+ bool is_space; /* True if every byte of the bitmap is 0x00 */
+} ocr_glyph_t;
+
/* Define the device structure. */
struct gx_device_pdf_s {
gx_device_psdf_common;
@@ -572,6 +599,9 @@ struct gx_device_pdf_s {
gs_param_float_array PDFXTrimBoxToMediaBoxOffset;
gs_param_float_array PDFXBleedBoxToTrimBoxOffset;
bool PDFXSetBleedBoxToMediaBox;
+ /* OCR Parameters */
+ char ocr_language[1024];
+ int ocr_engine;
/* Other parameters */
bool ReAssignCharacters;
bool ReEncodeCharacters;
@@ -909,6 +939,13 @@ struct gx_device_pdf_s {
* anything in the image processing routines.
*/
float UserUnit;
+ pdf_OCR_usage UseOCR; /* Never, AsNeeded or Always */
+ gs_text_enum_t* OCRSaved; /* Saved state of the text enumerator before rendering glyph bitmaps for later OCR */
+ pdf_OCR_stage OCRStage; /* Used to control a (sort of) state machine when using OCR to get a Unicode value for a glyph */
+ int *OCRUnicode; /* Used to pass back the Unicode value from the OCR engine to the text processing */
+ gs_char OCR_char_code; /* Passes the current character code from text processing to the image processing code when rendering glyph bitmaps for OCR */
+ gs_glyph OCR_glyph; /* Passes the current glyph code from text processing to the image processing code when rendering glyph bitmaps for OCR */
+ ocr_glyph_t *ocr_glyphs; /* Records bitmaps and other data from text processing when doing OCR */
};
#define is_in_page(pdev)\
diff --git a/devices/vector/gdevpdte.c b/devices/vector/gdevpdte.c
index 6f0eb158a..0310f54cf 100644
--- a/devices/vector/gdevpdte.c
+++ b/devices/vector/gdevpdte.c
@@ -43,6 +43,7 @@
#include "gxcpath.h"
#include "gsfcmap.h"
+#include "tessocr.h"
static int pdf_char_widths(gx_device_pdf *const pdev,
pdf_font_resource_t *pdfont, int ch,
@@ -80,6 +81,216 @@ pdf_process_string_aux(pdf_text_enum_t *penum, gs_string *pstr,
return pdf_process_string(penum, pstr, pfmat, ppts, gdata);
}
+/*
+ * Obtain a Unicode value for a character by running OCR over the glyph
+ * bitmaps previously captured in pdev->ocr_glyphs.
+ *
+ * If pdev->OCRStage is OCR_Rendered, all the stored bitmaps are combined
+ * into a single strip which is passed to the OCR engine; the returned code
+ * points are kept in pdev->OCRUnicode and the stage becomes
+ * OCR_UnicodeAvailable. If the engine reports an error, or returns a
+ * different number of characters from the number of non-space bitmaps,
+ * the stage becomes OCR_Failed.
+ *
+ * If the stage is (now) OCR_UnicodeAvailable, the stored bitmap matching
+ * 'ch' or 'glyph' is located and *unicode is set to a 4 byte buffer,
+ * allocated from pdev->memory, holding the code point most significant
+ * byte first; *length is set to 4. If no bitmap matches, *unicode and
+ * *length are left untouched.
+ *
+ * In any other stage, or when built with OCR_VERSION <= 0, nothing is done.
+ *
+ * Returns 0, or a negative error code.
+ */
+static int OCRText(gx_device_pdf *pdev, gs_glyph glyph, gs_char ch, gs_char *length, byte **unicode)
+{
+#if OCR_VERSION > 0
+ int code = 0;
+
+ if(pdev->OCRStage == OCR_Rendered) {
+ int llx, lly, urx, ury, char_count = 0, returned_count = 0, *returned;
+ ocr_glyph_t *next_glyph = pdev->ocr_glyphs;
+ int rows, stride, row, column;
+ byte *bitmap = NULL, *src, *dest, *rowptr, srcmask, destmask;
+ void *state;
+ const char *language = pdev->ocr_language;
+ gp_file *DbgFile;
+
+ if(language == NULL || language[0] == 0)
+ language = "eng";
+
+ /* We should already have rendered a bitmap for all the glyphs in the
+ * text operation, so this should be redundant, but best to be safe.
+ */
+ if(next_glyph == NULL)
+ return_error(gs_error_unknownerror);
+
+ /* Identify the bounding box of the returned glyphs by examining the bounds and position
+ * of each glyph. At the same time count the number of expected returned characters.
+ * We treat any empty bitmap (all 0x00 bytes) as a space because, obviously, the
+ * OCR engine can't differentiate between a space character and no character at all.
+ */
+ llx = next_glyph->x;
+ lly = next_glyph->y;
+ urx = llx + next_glyph->width;
+ ury = lly + next_glyph->height;
+ if(next_glyph != NULL && !next_glyph->is_space)
+ char_count++;
+ next_glyph = (ocr_glyph_t *)next_glyph->next;
+ while(next_glyph) {
+ if(!next_glyph->is_space)
+ char_count++;
+ if(next_glyph->x < llx)
+ llx = next_glyph->x;
+ if(next_glyph->y < lly)
+ lly = next_glyph->y;
+ if(next_glyph->x + next_glyph->width > urx)
+ urx = next_glyph->x + next_glyph->width;
+ if(next_glyph->y + next_glyph->height > ury)
+ ury = next_glyph->y + next_glyph->height;
+ next_glyph = next_glyph->next;
+ }
+
+ /* Allocate and initialise the 'strip' bitmap which will receive all the
+ * individual glyph bitmaps. The stride is the width in bytes, rounded up,
+ * plus one spare byte.
+ */
+ rows = ury - lly;
+ stride = (((urx - llx) + 7) / 8) + 1;
+ bitmap = gs_alloc_bytes(pdev->memory, rows * stride, "working OCR memory");
+ if(bitmap == NULL)
+ return_error(gs_error_VMerror);
+ memset(bitmap, 0x00, rows * stride);
+
+ /* Allocate a buffer for the OCR engine to return the Unicode code points. This needs work,
+ * we might want more information returned (bounding boxes and confidence levels) and we
+ * need to think about the possibility that the OCR engine finds more characters than we
+ * expected (eg fi ligatures returned as 'f' and 'i').
+ */
+ returned = (int *)gs_alloc_bytes(pdev->memory, char_count * sizeof(int), "returned unicodes");
+ if(returned == NULL) {
+ gs_free_object(pdev->memory, bitmap, "working OCR memory");
+ return_error(gs_error_VMerror);
+ }
+ memset(returned, 0x00, char_count * sizeof(int));
+
+ /* Now copy each glyph bitmap to the correct position in the strip. This is complicated
+ * by the fact that bitmaps are monochrome packed into bytes and so the destination
+ * may not be aligned on a byte boundary.
+ */
+ next_glyph = (ocr_glyph_t *)pdev->ocr_glyphs;
+ while(next_glyph) {
+ /* First destination byte of the glyph's top row within the strip */
+ rowptr = bitmap + ((next_glyph->y - lly) * stride) + (int)floor((next_glyph->x - llx) / 8);
+ for(row = 0;row < next_glyph->height;row++) {
+ dest = rowptr + row * stride;
+ src = next_glyph->data + (row * next_glyph->raster);
+ /* The source row starts on a byte boundary, the destination may not */
+ destmask = 0x80 >> (next_glyph->x - llx) % 8;
+ srcmask = 0x80;
+ for(column = 0; column < next_glyph->width;column++) {
+ if(*src & srcmask) {
+ *dest = *dest | destmask;
+ }
+ srcmask = srcmask >> 1;
+ if(srcmask == 0) {
+ srcmask = 0x80;
+ src++;
+ }
+ destmask = destmask >> 1;
+ if(destmask == 0) {
+ destmask = 0x80;
+ dest++;
+ }
+ }
+ }
+ next_glyph = next_glyph->next;
+ }
+
+#if 0
+ DbgFile = gp_fopen(pdev->memory, "d:/temp/bits.txt", "wb+");
+ for(row = 0;row < rows;row++) {
+ for(column = 0;column < stride;column++) {
+ dest = bitmap + (row * stride);
+ gp_fprintf(DbgFile, "%02x", dest[column]);
+ }
+ gp_fprintf(DbgFile, "\n");
+ }
+ gp_fclose(DbgFile);
+#endif
+ /* Initialise the OCR engine */
+ code = ocr_init_api(pdev->memory->non_gc_memory, language,
+ pdev->ocr_engine, &state);
+ if(code < 0) {
+ gs_free_object(pdev->memory, bitmap, "working OCR memory");
+ gs_free_object(pdev->memory, returned, "returned unicodes");
+ return code;
+ }
+ /* In: capacity of 'returned'. Out: number of code points found. */
+ returned_count = char_count;
+
+ /* Pass our strip to the OCR engine */
+ code = ocr_bitmap_to_unicodes(state,
+ bitmap, 0, stride * 8, rows, stride,
+ (int)pdev->HWResolution[0],
+ (int)pdev->HWResolution[1],
+ returned, &returned_count);
+
+ /* and close the engine back down again */
+ ocr_fin_api(pdev->memory->non_gc_memory, state);
+ gs_free_object(pdev->memory, bitmap, "working OCR memory");
+
+ if(code < 0) {
+ pdev->OCRStage = OCR_Failed;
+ gs_free_object(pdev->memory, returned, "returned unicodes");
+ return code;
+ }
+
+ /* Future enhancement we should fall back to trying the individual bitmap here */
+ if(returned_count != char_count) {
+ pdev->OCRStage = OCR_Failed;
+ gs_free_object(pdev->memory, returned, "returned unicodes");
+ return 0;
+ }
+ pdev->OCRUnicode = returned;
+
+ /* OCR succeeded; the code points can now be looked up below */
+ pdev->OCRStage = OCR_UnicodeAvailable;
+ }
+
+ if(pdev->OCRStage == OCR_UnicodeAvailable) {
+ /* We've OCR'ed the bitmaps already, find the unicode value */
+ ocr_glyph_t *new_glyph = (ocr_glyph_t *)pdev->ocr_glyphs;
+ int ocr_index = 0;
+ uint mask = 0xFF;
+ int ix;
+ char *u;
+
+ /* Find the bitmap which matches the character/glyph we are processing */
+ while(new_glyph) {
+ if(new_glyph->char_code == ch || new_glyph->glyph == glyph) {
+ ocr_glyph_t *g1 = pdev->ocr_glyphs;
+
+ /* Spaces are handled specially, so just jump out now */
+ if(new_glyph->is_space)
+ break;
+
+ /* Otherwise, find all the bitmaps which lie to the left of the
+ * one we found (we are assuming for now that the returned
+ * Unicode values are left to right). The number of non-space
+ * bitmaps to the left is the index into the OCR results.
+ */
+ while(g1) {
+ if(!g1->is_space) {
+ if(g1->x < new_glyph->x)
+ ocr_index++;
+ }
+ g1 = g1->next;
+ }
+ break;
+ }
+ new_glyph = new_glyph->next;
+ }
+
+ /* If we found a matching bitmap, get the corresponding unicode code point from
+ * the stored values returned by the OCR engine.
+ */
+ if(new_glyph) {
+ *unicode = (byte *)gs_alloc_bytes(pdev->memory, 2 * sizeof(ushort), "temporary Unicode array");
+ if(*unicode == NULL)
+ return_error(gs_error_VMerror);
+ u = (char *)(*unicode);
+ if(new_glyph->is_space) {
+ /* An empty bitmap is reported as U+0020 */
+ memset(u, 0x00, 3);
+ u[3] = 0x20;
+ }
+ else {
+ /* Store the code point most significant byte first */
+ for(ix = 0;ix < 4;ix++) {
+ u[3 - ix] = (pdev->OCRUnicode[ocr_index] & mask) >> (8 * ix);
+ mask = mask << 8;
+ }
+ }
+ *length = 4;
+ }
+ }
+ #endif
+ return 0;
+}
+
/*
* Add char code pair to ToUnicode CMap,
* creating the CMap on neccessity.
@@ -87,27 +298,43 @@ pdf_process_string_aux(pdf_text_enum_t *penum, gs_string *pstr,
int
pdf_add_ToUnicode(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_t *pdfont,
gs_glyph glyph, gs_char ch, const gs_const_string *gnstr)
-{ int code;
- gs_char length;
+{ int code = 0;
+ gs_char length = 0;
ushort *unicode = 0;
if (glyph == GS_NO_GLYPH)
return 0;
- length = font->procs.decode_glyph((gs_font *)font, glyph, ch, NULL, 0);
- if ((length == 0 || length == GS_NO_CHAR) && gnstr != NULL && gnstr->size == 7) {
- if (!memcmp(gnstr->data, "uni", 3)) {
- static const char *hexdigits = "0123456789ABCDEF";
- char *d0 = strchr(hexdigits, gnstr->data[3]);
- char *d1 = strchr(hexdigits, gnstr->data[4]);
- char *d2 = strchr(hexdigits, gnstr->data[5]);
- char *d3 = strchr(hexdigits, gnstr->data[6]);
-
- unicode = (ushort *)gs_alloc_bytes(pdev->memory, sizeof(ushort), "temporary Unicode array");
- if (d0 != NULL && d1 != NULL && d2 != NULL && d3 != NULL) {
- char *u = (char *)unicode;
- u[0] = ((d0 - hexdigits) << 4) + ((d1 - hexdigits));
- u[1] = ((d2 - hexdigits) << 4) + ((d3 - hexdigits));
- length = 2;
+ if(pdev->UseOCR == UseOCRAlways) {
+ code = OCRText(pdev, glyph, ch, &length, (byte **)&unicode);
+ if(code < 0)
+ return code;
+ }
+ else {
+ length = font->procs.decode_glyph((gs_font *)font, glyph, ch, NULL, 0);
+ if(length == 0 || length == GS_NO_CHAR) {
+ if(gnstr != NULL && gnstr->size == 7) {
+ if(!memcmp(gnstr->data, "uni", 3)) {
+ static const char *hexdigits = "0123456789ABCDEF";
+ char *d0 = strchr(hexdigits, gnstr->data[3]);
+ char *d1 = strchr(hexdigits, gnstr->data[4]);
+ char *d2 = strchr(hexdigits, gnstr->data[5]);
+ char *d3 = strchr(hexdigits, gnstr->data[6]);
+
+ unicode = (ushort *)gs_alloc_bytes(pdev->memory, sizeof(ushort), "temporary Unicode array");
+ if(d0 != NULL && d1 != NULL && d2 != NULL && d3 != NULL) {
+ char *u = (char *)unicode;
+ u[0] = ((d0 - hexdigits) << 4) + ((d1 - hexdigits));
+ u[1] = ((d2 - hexdigits) << 4) + ((d3 - hexdigits));
+ length = 2;
+ }
+ }
+ }
+ else {
+ if(pdev->UseOCR != UseOCRNever) {
+ code = OCRText(pdev, glyph, ch, &length, (byte **)&unicode);
+ if(code < 0)
+ return code;
+ }
}
}
}
@@ -163,6 +390,7 @@ pdf_add_ToUnicode(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_t *pdfon
if (length > 2 && pdfont->u.simple.Encoding != NULL)
pdfont->TwoByteToUnicode = 0;
}
+
if (unicode)
gs_free_object(pdev->memory, unicode, "temporary Unicode array");
return 0;
@@ -255,8 +483,11 @@ pdf_encode_string_element(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_
pet = &pdfont->u.simple.Encoding[ch];
glyph = (gdata == NULL ? font->procs.encode_char(font, ch, GLYPH_SPACE_NAME)
: *gdata);
- if (glyph == GS_NO_GLYPH || glyph == pet->glyph)
+ if (glyph == GS_NO_GLYPH || glyph == pet->glyph) {
+ if((pdfont->cmap_ToUnicode == NULL || !gs_cmap_ToUnicode_check_pair(pdfont->cmap_ToUnicode, ch)) && pdev->UseOCR != UseOCRNever)
+ (void)pdf_add_ToUnicode(pdev, font, pdfont, glyph, ch, &gnstr);
return 0;
+ }
if (pet->glyph != GS_NO_GLYPH) { /* encoding conflict */
return_error(gs_error_rangecheck);
/* Must not happen because pdf_obtain_font_resource
@@ -358,7 +589,7 @@ pdf_encode_string_element(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_
* The decision about writing it out is deferred until pdf_write_font_resource.
*/
code = pdf_add_ToUnicode(pdev, font, pdfont, glyph, ch, &gnstr);
- if (code < 0)
+ if(code < 0)
return code;
pet->glyph = glyph;
pet->str = gnstr;
@@ -1035,6 +1266,7 @@ process_text_return_width(const pdf_text_enum_t *pte, gs_font_base *font,
{ const gs_glyph *gdata_i = (gdata != NULL ? gdata + i : 0);
code = pdf_encode_string_element(pdev, (gs_font *)font, pdfont, ch, gdata_i);
+
if (code < 0)
return code;
}
diff --git a/devices/vector/gdevpdtt.c b/devices/vector/gdevpdtt.c
index 6a89fedf7..e4c1d1471 100644
--- a/devices/vector/gdevpdtt.c
+++ b/devices/vector/gdevpdtt.c
@@ -299,13 +299,29 @@ static void
pdf_text_release(gs_text_enum_t *pte, client_name_t cname)
{
pdf_text_enum_t *const penum = (pdf_text_enum_t *)pte;
+ gx_device_pdf *pdev = (gx_device_pdf *)penum->dev;
+ ocr_glyph_t *next;
if (penum->pte_default) {
gs_text_release(NULL, penum->pte_default, cname);
penum->pte_default = 0;
}
pdf_text_release_cgp(penum);
+
+    /* Discard any glyph bitmaps captured for OCR. These records are
+     * allocated from pdev->pdf_memory when they are captured, so they
+     * must be returned to the same allocator. */
+    while (pdev->ocr_glyphs != NULL)
+    {
+        next = pdev->ocr_glyphs->next;
+
+        gs_free_object(pdev->pdf_memory, pdev->ocr_glyphs->data, "free bitmap");
+        gs_free_object(pdev->pdf_memory, pdev->ocr_glyphs, "free bitmap");
+        pdev->ocr_glyphs = next;
+    }
+    /* The OCR results are allocated from pdev->memory. */
+    if (pdev->OCRUnicode != NULL)
+        gs_free_object(pdev->memory, pdev->OCRUnicode, "free returned unicodes");
+    pdev->OCRUnicode = NULL;
+
gx_default_text_release(pte, cname);
+    /* Ready for the next text operation. */
+    pdev->OCRStage = OCR_UnInit;
}
void
pdf_text_release_cgp(pdf_text_enum_t *penum)
@@ -3152,6 +3168,57 @@ static int pdf_query_purge_cached_char(const gs_memory_t *mem, cached_char *cc,
return 0;
}
+/*
+ * Run the OCR pre-pass for the text operation described by pte.
+ *
+ * OCR_UnInit stage: push a graphics state, keep a copy of the enumerator in
+ * pdev->OCRSaved, start a default text enumerator for the same text and
+ * move to OCR_Rendering.
+ *
+ * OCR_Rendering stage: process the default enumerator with caching disabled,
+ * record the returned character code and glyph on the device and copy the
+ * dynamic state back into pte. TEXT_PROCESS_RENDER is passed straight back
+ * to the caller. Any other non-zero code tears down the saved state and is
+ * returned. On 0 the graphics state and the enumerator are restored from the
+ * saved copy, the default enumerator is released and the stage becomes
+ * OCR_Rendered.
+ *
+ * Returns 0 when there is nothing (more) to do, otherwise the code described
+ * above or a negative error code.
+ */
+static int ProcessTextForOCR(gs_text_enum_t *pte)
+{
+    pdf_text_enum_t *const penum = (pdf_text_enum_t *)pte;
+    gx_device_pdf *pdev = (gx_device_pdf *)penum->dev;
+    gs_text_enum_t *pte_default;
+    int code;
+
+    if (pdev->OCRStage == OCR_UnInit) {
+        /* Do not continue without a saved graphics state to restore later. */
+        code = gs_gsave(pte->pgs);
+        if (code < 0)
+            return code;
+        pdev->OCRSaved = (gs_text_enum_t*)gs_alloc_bytes(pdev->memory,sizeof(gs_text_enum_t),"saved enumerator for OCR");
+        if (pdev->OCRSaved == NULL) {
+            /* Undo the gsave above so the graphics state stack stays balanced. */
+            gs_grestore(pte->pgs);
+            return_error(gs_error_VMerror);
+        }
+        *(pdev->OCRSaved) = *pte;
+        gs_text_enum_copy_dynamic(pdev->OCRSaved,pte,true);
+
+        code = pdf_default_text_begin(pte, &pte->text, &pte_default);
+        if (code < 0) {
+            /* Same cleanup as the rendering-stage error path: free the copy, pop the gstate. */
+            gs_free_object(pdev->memory, pdev->OCRSaved,"saved enumerator for OCR");
+            pdev->OCRSaved = NULL;
+            gs_grestore(pte->pgs);
+            return code;
+        }
+        penum->pte_default = pte_default;
+        gs_text_enum_copy_dynamic(pte_default, pte, false);
+        pdev->OCRStage = OCR_Rendering;
+    }
+
+    if (pdev->OCRStage == OCR_Rendering) {
+        penum->pte_default->can_cache = 0;
+        code = gs_text_process(penum->pte_default);
+        pdev->OCR_char_code = penum->pte_default->returned.current_char;
+        pdev->OCR_glyph = penum->pte_default->returned.current_glyph;
+        gs_text_enum_copy_dynamic(pte, penum->pte_default, true);
+        if (code == TEXT_PROCESS_RENDER)
+            return code;
+        if (code != 0) {
+            /*
+             * NOTE(review): OCRStage stays at OCR_Rendering here while
+             * pte_default is cleared; confirm callers always release the
+             * enumerator after a non-zero return before processing again.
+             */
+            gs_free_object(pdev->memory, pdev->OCRSaved,"saved enumerator for OCR");
+            pdev->OCRSaved = NULL;
+            gs_grestore(pte->pgs);
+            gs_text_release(pte->pgs, penum->pte_default, "pdf_text_process");
+            penum->pte_default = NULL;
+            return code;
+        }
+        /* Rendering finished: put the graphics state and enumerator back as they were. */
+        gs_grestore(pte->pgs);
+        *pte = *(pdev->OCRSaved);
+        gs_text_enum_copy_dynamic(pte, pdev->OCRSaved, true);
+        gs_free_object(pdev->memory, pdev->OCRSaved,"saved enumerator for OCR");
+        pdev->OCRSaved = NULL;
+        gs_text_release(pte->pgs, penum->pte_default, "pdf_text_process");
+        penum->pte_default = NULL;
+        pdev->OCRStage = OCR_Rendered;
+    }
+    return 0;
+}
+
/*
* Continue processing text. This is the 'process' procedure in the text
* enumerator. Per the check in pdf_text_begin, we know the operation is
@@ -3207,6 +3274,12 @@ pdf_text_process(gs_text_enum_t *pte)
goto default_impl;
}
+ if (pdev->UseOCR != UseOCRNever) {
+ code = ProcessTextForOCR(pte);
+ if (code != 0)
+ return code;
+ }
+
code = -1; /* to force default implementation */
/*
@@ -3547,6 +3620,7 @@ pdf_text_process(gs_text_enum_t *pte)
}
gs_text_enum_copy_dynamic(pte, pte_default, true);
+
if (code)
return code;
gs_text_release(NULL, pte_default, "pdf_text_process");
diff --git a/doc/Devices.htm b/doc/Devices.htm
index 91994c82d..87e015f2e 100644
--- a/doc/Devices.htm
+++ b/doc/Devices.htm
@@ -76,6 +76,7 @@
<ul>
<li><a href="#OCR">OCR text output</a></li>
<li><a href="#PDFocr">Bitmap PDF output (with OCR text)</a></li>
+<li><a href="#PDFwriteocr">Vector PDF output (with OCR Unicode CMaps)</a></li>
</ul>
<li><a href="#High-level">High level formats</a></li>
<ul>
@@ -1059,6 +1060,37 @@ resolution independence, and editability.</p>
</p>
<p>
+<h3><a name="PDFwriteocr"></a>Vector PDF output (with OCR Unicode CMaps)</h3>
+<p>
+The pdfwrite device has been augmented to use the OCR engine to analyse text
+(not images!) in the input stream, and derive Unicode code points for it.
+That information can then be used to create ToUnicode CMaps which are attached
+to the Font (or CIDFont) objects embedded in the PDF file.
+</p>
+<p>
+Fonts which have ToUnicode CMaps can be reliably (limited by the accuracy of
+the CMap) used in search and copy/paste functions, as well as text extraction
+from PDF files. Note that OCR is not a 100% perfect process; it is possible
+that some text might be misidentified.
+</p>
+<p>
+OCR is a slow operation! In addition it can (for Latin text at least) sometimes
+be preferable not to add ToUnicode information which may be incorrect, but instead
+to use the existing font Encoding. For English text this may give better results.
+</p>
+<p>For these reasons the OCR functionality of pdfwrite can be controlled by using a new
+parameter <code>-sUseOCR</code>. This has three possible values:
+</p>
+<dt><code>-sUseOCR=</code><b><em>string</em></b></dt>
+<dd>
+ <dl>
+ <dt>Never<dd>Default - don't use OCR at all even if support is built-in.
+ <dt>AsNeeded<dd>If there is no existing ToUnicode information, use OCR.
+ <dt>Always<dd>Ignore any existing information and always use OCR.
+ </dl>
+</dd>
+</p>
+
<hr>
<h2><a name="High-level"></a>High-level devices</h2>
diff --git a/doc/VectorDevices.htm b/doc/VectorDevices.htm
index f188e44df..2e9e2dcde 100644
--- a/doc/VectorDevices.htm
+++ b/doc/VectorDevices.htm
@@ -986,6 +986,59 @@ displaying document's properties,
so we recommend this value.
</dl>
+<dl>
+<dt><code>-sUseOCR=</code><em>string</em>
+<dd>Controls the use of OCR in pdfwrite. If enabled this will use an OCR
+engine to analyse the glyph bitmaps used to draw text in a PDF file, and
+the resulting Unicode code points are then used to construct a ToUnicode
+CMap.
+<p>
+PDF files containing ToUnicode CMaps support searching, copy/paste and text
+extraction, subject to the accuracy of the ToUnicode CMap. Since not all
+PDF files contain these it can be beneficial to create them.
+</p>
+<p>
+Note that, for English text, it is possible that the existing standard character
+encoding (which most PDF consumers will fall back to in the absence of Unicode
+information) is better than using OCR, as OCR is not a 100% reliable process.
+OCR processing is also comparatively slow.
+</p>
+<p>
+For the reasons above it is useful to be able to exercise some control over the
+action of pdfwrite when OCR processing is available, and the <code>UseOCR</code>
+parameter provides that control. There are three possible values:
+</p>
+<li><code>Never</code> Default - don't use OCR at all even if support is built-in.
+<li><code>AsNeeded</code> If there is no existing ToUnicode information, use OCR.
+<li><code>Always</code> Ignore any existing information and always use OCR.
+<p>
+Our experimentation with the Tesseract OCR engine has shown that the more text we
+can supply for the engine to look at, the better the result we get. We are, unfortunately,
+limited to the graphics library operations for text as follows.
+</p>
+<p>
+The code works on text 'fragments'; these are the text sequences sent to the text
+operators of the source language. Generally most input languages will try to send
+text in its simplest form, eg "Abc", but the requirements of justification, kerning
+and so on mean that sometimes each character is positioned independently on the page.
+</p>
+<p>
+So pdfwrite renders all the bitmaps for every character in the text document, when
+set up to use OCR. Later, if any character in the font does not have a Unicode
+value already we use the bitmaps to assemble a 'strip' of text which we then send
+to the OCR engine. If the engine returns a different number of recognised characters
+than we expected then we ignore that result. We've found that (for English text)
+constructions such as ". T" tend to ignore the full stop, presumably because the OCR
+engine thinks that it is simply noise. In contrast "text." does identify the full
+stop correctly. So by ignoring the failed result we can get a better result later.
+</p>
+<p>
+Obviously this is all heuristic and undoubtedly there is more we can do to improve the
+functionality here, but we need concrete examples to work from.
+</p>
+</dd>
+</dl>
+
<h3><a name="PS"></a>PostScript file output</h3>
<p>
The <code>ps2write</code> device handles the same set of distiller