17 files changed, 933 insertions, 25 deletions
diff --git a/base/gsdparam.c b/base/gsdparam.c
index 3af8e856f..36729692c 100644
--- a/base/gsdparam.c
+++ b/base/gsdparam.c
@@ -1010,6 +1010,8 @@ gs_putdeviceparams(gx_device * dev, gs_param_list * plist)
     bool was_open = dev->is_open;
     int code;
 
+    /* gs_param_list_dump(plist); */
+
     gx_device_set_procs(dev);
     fill_dev_proc(dev, put_params, gx_default_put_params);
     fill_dev_proc(dev, get_alpha_bits, gx_default_get_alpha_bits);
diff --git a/base/gsfcmap.c b/base/gsfcmap.c
index 32a32fdff..3040d390f 100644
--- a/base/gsfcmap.c
+++ b/base/gsfcmap.c
@@ -643,6 +643,19 @@ gs_cmap_ToUnicode_realloc(gs_memory_t *mem, int new_value_size, gs_cmap_t **ppcm
     return 0;
 }
 
+/* Return 1 if the slot for code0 is populated (its first two bytes are not both 0), else 0. */
+int gs_cmap_ToUnicode_check_pair(gs_cmap_t *pcmap, int code0)
+{
+    gs_cmap_ToUnicode_t *cmap = (gs_cmap_ToUnicode_t *)pcmap;
+    const uchar *entry;
+
+    if (code0 < 0 || code0 >= cmap->num_codes)
+        return 0; /* must not happen; an out-of-range code is reported as "no pair" */
+    /* Each slot occupies value_size + 2 bytes. */
+    entry = pcmap->glyph_name_data + code0 * (cmap->value_size + 2);
+    return entry[0] != 0 || entry[1] != 0;
+}
+
 /*
  * Write a code pair to ToUnicode CMap.
  */
diff --git a/base/gsfcmap.h b/base/gsfcmap.h
index 73516c3e2..2507865cc 100644
--- a/base/gsfcmap.h
+++ b/base/gsfcmap.h
@@ -66,4 +66,6 @@ int gs_cmap_ToUnicode_realloc(gs_memory_t *mem, int new_value_size, gs_cmap_t **
  */
 void gs_cmap_ToUnicode_add_pair(gs_cmap_t *pcmap, int code0, ushort *unicode, unsigned int length);
 
+int gs_cmap_ToUnicode_check_pair(gs_cmap_t *pcmap, int code0);
+
 #endif /* gsfcmap_INCLUDED */
diff --git a/base/gsparam.h b/base/gsparam.h
index 3b0aaa21d..456a8a532 100644
--- a/base/gsparam.h
+++ b/base/gsparam.h
@@ -577,4 +577,8 @@ int gs_param_list_add_parsed_value(gs_param_list *plist, gs_param_name key, cons
  * address pointed to be len. */
 int gs_param_list_to_string(gs_param_list *plist, gs_param_name key, char *value, int *len);
 
+/* Debug function to dump a list of params. Do NOT use in production
+ * code! */
+int gs_param_list_dump(gs_param_list *plist);
+
 #endif /* gsparam_INCLUDED */
diff --git a/base/gsparaml.c b/base/gsparaml.c
index d7e5fcdbf..3d7e49b14 100644
--- a/base/gsparaml.c
+++ b/base/gsparaml.c
@@ -1046,3 +1046,31 @@ int gs_param_list_to_string(gs_param_list *plist, gs_param_name key, char *value
         *value = 0;
     return to_string(plist, key, &out);
 }
+
+/* Debug aid: print every key in plist followed by its value rendered as a string. */
+int gs_param_list_dump(gs_param_list *plist)
+{
+    gs_param_enumerator_t iter;
+    gs_param_key_t key;
+    char name[256];	/* big enough for any reasonable key */
+    char text[4096];
+    int code, len;
+    param_init_enumerator(&iter);
+    for (;;) {
+        if ((code = param_get_next_key(plist, &iter, &key)) != 0)
+            break;
+        if (key.size >= sizeof(name)) {
+            code = gs_note_error(gs_error_rangecheck);
+            break;
+        }
+        memcpy(name, key.data, key.size);
+        name[key.size] = 0;
+        dlprintf1("%s ", name);
+        code = gs_param_list_to_string(plist, name, text, &len);
+        if (code < 0)
+            break;
+        dlprintf1("%s ", text);
+    }
+    dlprintf("\n");
+    return code;
+}
diff --git a/base/tesseract.mak b/base/tesseract.mak
index c2bc1fb5a..43e234e75 100644
--- a/base/tesseract.mak
+++ b/base/tesseract.mak
@@ -24,7 +24,8 @@ TESSINCLUDES=\
 #   add  -DDISABLED_LEGACY_ENGINE to TESSCXX
 #   empty TESSERACT_LEGACY
 
-TESSCXX = $(CXX) $(TESSINCLUDES) $(TESSCXXFLAGS) $(CCFLAGS) -DTESSERACT_IMAGEDATA_AS_PIX -DTESSERACT_DISABLE_DEBUG_FONTS -DGRAPHICS_DISABLED -DDISABLED_LEGACY_ENGINE
+TESSCXX = $(CXX) $(TESSINCLUDES) $(TESSCXXFLAGS) $(CCFLAGS) -DTESSERACT_IMAGEDATA_AS_PIX -DTESSERACT_DISABLE_DEBUG_FONTS -DGRAPHICS_DISABLED
+#-DDISABLED_LEGACY_ENGINE
 TESSOBJ = $(GLOBJDIR)$(D)tesseract_
 TESSO_ = $(O_)$(TESSOBJ)
 
@@ -1161,8 +1162,8 @@ TESSERACT_LEGACY_OBJS=\
 	$(TESSOBJ)wordrec_wordclass.$(OBJ)
 
 
-#TESSERACT_LEGACY=$(TESSERACT_LEGACY_OBJS)
-TESSERACT_LEGACY=
+TESSERACT_LEGACY=$(TESSERACT_LEGACY_OBJS)
+#TESSERACT_LEGACY=
 
 TESS_ROMFS_ARGS=\
 	-c -P $(GLSRCDIR)$(D)..$(D) tessdata$(D)*
diff --git a/base/tessocr.cpp b/base/tessocr.cpp
index 8ce19a14e..f8161ac3c 100644
--- a/base/tessocr.cpp
+++ b/base/tessocr.cpp
@@ -504,6 +504,211 @@ ocr_recognise(void *api_, int w, int h, void *data,
     return code;
 }
 
+static Pix *
+ocr_set_bitmap(tesseract::TessBaseAPI *api,
+               int w, int h,
+               const unsigned char *data, int data_x, int raster,
+               int xres, int yres)
+{
+    /* Expand a 1bpp bitmap into a bordered 8bpp Pix and pass it to api->SetImage().
+     * Tesseract prefers a border around things, so we add 8 pixels all around. */
+#define BORDER_SIZE 8
+    int r = (w+BORDER_SIZE*2+3)&~3; /* bordered row width, rounded up to a multiple of 4 */
+    Pix *image = pixCreateHeader(r, h+BORDER_SIZE*2, 8);
+    unsigned char *pdata, *d;
+    const unsigned char *s;
+    int x, y;
+
+    if (image == NULL)
+        return NULL;
+
+    pdata = gs_alloc_bytes(leptonica_mem, r * (h+BORDER_SIZE*2), "ocr_set_bitmap");
+    if (pdata == NULL) {
+        pixDestroy(&image);
+        return NULL;
+    }
+    pixSetData(image, (l_uint32 *)pdata);
+    pixSetPadBits(image, 1);
+    pixSetXRes(image, xres);
+    pixSetYRes(image, yres);
+
+    s = &data[data_x>>3] + raster*(h-1); /* start at the last source row; rows are copied in reverse */
+    d = pdata;
+    memset(d, 255, r * (h+BORDER_SIZE*2)); /* whole image, border included, starts as 255 */
+    d += r*BORDER_SIZE + BORDER_SIZE; /* skip the top border rows and the left border */
+    for (y = 0; y < h; y++) {
+        const int first_bit = data_x & 7; /* bit offset of pixel 0 within s[0] */
+        for (x = 0; x < w; x++) {
+            /* Take the byte index and the bit mask from the same offset, so a
+             * data_x that is not a multiple of 8 moves on to the next byte correctly. */
+            const int bit = x + first_bit;
+            /* x^3 reverses byte order within each group of 4 - presumably to
+             * match leptonica's 32-bit word layout; TODO confirm. */
+            if (s[bit>>3] & (128 >> (bit & 7)))
+                d[x^3] = 0;
+            else
+                d[x^3] = 255;
+        }
+        s -= raster;
+        d += r;
+    }
+
+    api->SetImage(image);
+    /* The pixel buffer is ours: release image and buffer with ocr_clear_bitmap(). */
+
+    return image;
+}
+
+static void
+ocr_clear_bitmap(Pix *image)
+{
+    gs_free_object(leptonica_mem, pixGetData(image), "ocr_clear_bitmap"); /* free the pixel buffer through leptonica_mem */
+    pixSetData(image, NULL); /* detach it first - presumably so pixDestroy() does not free it again */
+    pixDestroy(&image);
+}
+
+/* OCR a 1bpp bitmap. On entry *char_count is the capacity of unicode[]; on
+ * return it is the number of symbols found, which may exceed that capacity
+ * (only the first 'capacity' code points are stored). */
+int ocr_bitmap_to_unicodes(void *state,
+                          const void *data, int data_x,
+                          int w, int h, int raster,
+                          int xres, int yres, int *unicode, int *char_count)
+{
+    tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)state;
+    Pix *image;
+    int code, max_chars = *char_count, count = 0;
+
+    if (api == NULL)
+        return 0;
+
+    image = ocr_set_bitmap(api, w, h, (const unsigned char *)data,
+                           data_x, raster, xres, yres);
+    if (image == NULL)
+        return_error(gs_error_VMerror);
+
+    code = api->Recognize(NULL);
+    if (code >= 0) {
+        tesseract::ResultIterator *res_it = api->GetIterator();
+
+        while (res_it != NULL && !res_it->Empty(tesseract::RIL_BLOCK)) {
+            if (res_it->Empty(tesseract::RIL_WORD)) {
+                res_it->Next(tesseract::RIL_WORD);
+                continue;
+            }
+            do {
+#if FUTURE_DEVELOPMENT
+                int word_bbox[4];
+                int char_bbox[4];
+                int line_bbox[4];
+#endif
+                /* GetUTF8Text() returns a new[] buffer that we must delete[]. */
+                char *text = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
+                const unsigned char *graph = (const unsigned char *)text;
+                if (graph && graph[0] != 0) {
+                    int cp;
+                    /* Decode the first UTF-8 sequence (assumed well formed). */
+                    if (graph[0] < 0x80)
+                        cp = graph[0];
+                    else if (graph[0] < 0xE0)
+                        cp = ((graph[0] & 0x1f) << 6) | (graph[1] & 0x3f);
+                    else if (graph[0] < 0xF0)
+                        cp = ((graph[0] & 0x0f) << 12) | ((graph[1] & 0x3f) << 6) |
+                             (graph[2] & 0x3f);
+                    else
+                        cp = ((graph[0] & 0x07) << 18) | ((graph[1] & 0x3f) << 12) |
+                             ((graph[2] & 0x3f) << 6) | (graph[3] & 0x3f);
+                    /* Never write past the caller's buffer, but keep counting
+                     * so the caller can tell that extra symbols were found. */
+                    if (count < max_chars)
+                        unicode[count] = cp;
+                    count++;
+#if FUTURE_DEVELOPMENT
+                    res_it->BoundingBox(tesseract::RIL_TEXTLINE,
+                        line_bbox,line_bbox + 1,
+                        line_bbox + 2,line_bbox + 3);
+                    res_it->BoundingBox(tesseract::RIL_WORD,
+                        word_bbox,word_bbox + 1,
+                        word_bbox + 2,word_bbox + 3);
+                    res_it->BoundingBox(tesseract::RIL_SYMBOL,
+                        char_bbox,char_bbox + 1,
+                        char_bbox + 2,char_bbox + 3);
+#endif
+                }
+                delete[] text;
+                res_it->Next(tesseract::RIL_SYMBOL);
+             } while (!res_it->Empty(tesseract::RIL_BLOCK) &&
+                      !res_it->IsAtBeginningOf(tesseract::RIL_WORD));
+        }
+        delete res_it;
+    }
+    ocr_clear_bitmap(image);
+    *char_count = count;
+
+    return code;
+}
+
+/* OCR a 1bpp bitmap. *unicode receives the code point of the last symbol
+ * recognised and is left untouched when none is found. Returns 0 if state is
+ * NULL, VMerror if the image cannot be built, else the Recognize() result. */
+int ocr_bitmap_to_unicode(void *state,
+                          const void *data, int data_x,
+                          int w, int h, int raster,
+                          int xres, int yres, int *unicode)
+{
+    tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)state;
+    Pix *image;
+    int code;
+
+    if (api == NULL)
+        return 0;
+
+    image = ocr_set_bitmap(api, w, h, (const unsigned char *)data,
+                           data_x, raster, xres, yres);
+    if (image == NULL)
+        return_error(gs_error_VMerror);
+
+    code = api->Recognize(NULL);
+    if (code >= 0) {
+        tesseract::ResultIterator *res_it = api->GetIterator();
+
+        while (res_it != NULL && !res_it->Empty(tesseract::RIL_BLOCK)) {
+            if (res_it->Empty(tesseract::RIL_WORD)) {
+                res_it->Next(tesseract::RIL_WORD);
+                continue;
+            }
+
+            do {
+                /* GetUTF8Text() returns a new[] buffer that we must delete[]. */
+                char *text = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
+                const unsigned char *graph = (const unsigned char *)text;
+
+                if (graph && graph[0] != 0) {
+                    /* Decode the first UTF-8 sequence (assumed well formed):
+                     * the lead byte selects a 1, 2, 3 or 4 byte form. */
+                    if (graph[0] < 0x80)
+                        *unicode = graph[0];
+                    else if (graph[0] < 0xE0)
+                        *unicode = ((graph[0] & 0x1f) << 6) | (graph[1] & 0x3f);
+                    else if (graph[0] < 0xF0)
+                        *unicode = ((graph[0] & 0x0f) << 12) | ((graph[1] & 0x3f) << 6) |
+                                   (graph[2] & 0x3f);
+                    else
+                        *unicode = ((graph[0] & 0x07) << 18) | ((graph[1] & 0x3f) << 12) |
+                                   ((graph[2] & 0x3f) << 6) | (graph[3] & 0x3f);
+                }
+                delete[] text;
+                res_it->Next(tesseract::RIL_SYMBOL);
+             } while (!res_it->Empty(tesseract::RIL_BLOCK) &&
+                      !res_it->IsAtBeginningOf(tesseract::RIL_WORD));
+        }
+        delete res_it;
+    }
+
+    ocr_clear_bitmap(image);
+
+    return code;
+}
+
+
 };
 
 /* Currently tesseract is the only C++ lib we have.
diff --git a/base/tessocr.h b/base/tessocr.h
index 78c30a0d4..8beaa8915 100644
--- a/base/tessocr.h
+++ b/base/tessocr.h
@@ -50,5 +50,15 @@ int ocr_recognise(void *api_, int w, int h, void *data,
                   int (*callback)(void *, const char *, const int *, const int *, const int *, int),
                   void *arg);
 
+int ocr_bitmap_to_unicodes(void* state,
+    const void* data,int data_x,
+    int w,int h,int raster,
+    int xres,int yres,int* unicode, int* char_count);
+
+int ocr_bitmap_to_unicode(void *state,
+                          const void *data, int data_x,
+                          int w, int h, int raster,
+                          int xres, int yres, int *unicode);
+
 #endif
 
diff --git a/devices/devs.mak b/devices/devs.mak
index a5f3750fa..222c02708 100644
--- a/devices/devs.mak
+++ b/devices/devs.mak
@@ -24,7 +24,7 @@ DEVVECSRC=$(DEVVEC)$(D)
 DEVI_=$(DEVGENDIR) $(II)$(GLSRCDIR) $(II)$(GLGENDIR) $(II)$(DEVSRCDIR)
 DEVF_=
 
-DEVCCFLAGS=$(I_)$(DEVI_)$(_I) $(I_)$(DEVVEC)$(_I) $(DEVF_)
+DEVCCFLAGS=$(I_)$(DEVI_)$(_I) $(I_)$(DEVVEC)$(_I) $(D_)OCR_VERSION=$(OCR_VERSION)$(_D) $(DEVF_)
 DEVCC=$(CC_) $(DEVCCFLAGS)
 XPSDEVCC=$(CC_) $(XPSPRINTCFLAGS) $(DEVCCFLAGS)
 
diff --git a/devices/vector/gdevpdfb.c b/devices/vector/gdevpdfb.c
index 203390384..059c79c32 100644
--- a/devices/vector/gdevpdfb.c
+++ b/devices/vector/gdevpdfb.c
@@ -496,7 +496,7 @@ gdev_pdf_copy_color(gx_device * dev, const byte * base, int sourcex,
 /* Fill a mask. */
 int
 gdev_pdf_fill_mask(gx_device * dev,
-                 const byte * data, int data_x, int raster, gx_bitmap_id id,
+                   const byte * data, int data_x, int raster, gx_bitmap_id id,
                    int x, int y, int width, int height,
                    const gx_drawing_color * pdcolor, int depth,
                    gs_logical_operation_t lop, const gx_clip_path * pcpath)
@@ -505,6 +505,50 @@ gdev_pdf_fill_mask(gx_device * dev,
 
     if (width <= 0 || height <= 0)
         return 0;
+
+    /* If OCRStage is 'OCR_Rendering' then we are handling an image which is a rendered glyph
+     * that we want to have OCR software process and return a Unicode code point for.
+     * We specifically do *not* want to send the image to the output PDF file!
+     */
+    if (pdev->OCRStage == OCR_Rendering) {
+        int index;
+        ocr_glyph_t *new_glyph = (ocr_glyph_t *)gs_alloc_bytes(pdev->pdf_memory, sizeof(ocr_glyph_t), "");
+
+        if (new_glyph == NULL)
+            return_error(gs_error_VMerror);
+        new_glyph->data = gs_alloc_bytes(pdev->pdf_memory, raster*height, "");
+        if (new_glyph->data == NULL) {
+            /* Don't leak the list node when the bitmap copy can't be allocated. */
+            gs_free_object(pdev->pdf_memory, new_glyph, "");
+            return_error(gs_error_VMerror);
+        }
+        memcpy(new_glyph->data, data, raster * height);
+        new_glyph->height = height;
+        new_glyph->width = width;
+        new_glyph->raster = raster;
+        new_glyph->x = x;
+        new_glyph->y = y;
+        new_glyph->char_code = pdev->OCR_char_code;
+        new_glyph->glyph = pdev->OCR_glyph;
+        new_glyph->next = NULL;
+        new_glyph->is_space = true; /* cleared below if any byte of the bitmap is non-zero */
+        for(index = 0; index < height * raster;index++){
+            if(data[index] != 0x00) {
+                new_glyph->is_space = false;
+                break;
+            }
+        }
+        if (pdev->ocr_glyphs == NULL) /* append at the tail so the list keeps capture order */
+            pdev->ocr_glyphs = new_glyph;
+        else {
+            ocr_glyph_t *next = pdev->ocr_glyphs;
+            while (next->next != NULL)
+                next = next->next;
+            next->next = new_glyph;
+        }
+        return 0; /* the bitmap is only recorded, never written to the PDF */
+    }
+
     if (depth > 1 || (!gx_dc_is_pure(pdcolor) != 0 && !(gx_dc_is_pattern1_color(pdcolor))))
         return gx_default_fill_mask(dev, data, data_x, raster, id,
                                     x, y, width, height, pdcolor, depth, lop,
diff --git a/devices/vector/gdevpdfb.h b/devices/vector/gdevpdfb.h
index d8d596959..b2808a3ae 100644
--- a/devices/vector/gdevpdfb.h
+++ b/devices/vector/gdevpdfb.h
@@ -128,6 +128,8 @@ const gx_device_pdf PDF_DEVICE_IDENT =
  {0,0},				/* PDFXTrimBoxToMediaBoxOffset */
  {0,0},				/* PDFXBleedBoxToTrimBoxOffset */
  1 /* true */,			/* PDFXSetBleedBoxToMediaBox */
+ "",                            /* ocr_language */
+ 0,                             /* ocr_engine */
  1 /*true*/,			/* ReAssignCharacters */
  1 /*true*/,			/* ReEncodeCharacters */
  1,				/* FirstObjectNumber */
@@ -300,7 +302,14 @@ const gx_device_pdf PDF_DEVICE_IDENT =
  0,                     /* ExtensionMetadata */
  0,                     /* PDFFormName */
  0,                     /* PassThroughWriter */
- 1.0                    /* UserUnit */
+ 1.0,                   /* UserUnit */
+ 0,                     /* UseOCR */
+ NULL,                  /* OCRSaved */
+ 0,                     /* OCRStage */
+ NULL,                  /* OCRUnicode */
+ 0,                     /* OCR_char_code */
+ 0,                     /* OCR_glyph */
+ NULL                   /* ocr_glyphs */
 };
 
 #else
diff --git a/devices/vector/gdevpdfp.c b/devices/vector/gdevpdfp.c
index 3a371aada..9cc63644b 100644
--- a/devices/vector/gdevpdfp.c
+++ b/devices/vector/gdevpdfp.c
@@ -255,6 +255,48 @@ gdev_pdf_get_param(gx_device *dev, char *Param, void *list)
             return(param_write_null(plist, "DSC"));
         }
     }
+
+#if OCR_VERSION > 0
+    if (strcmp(Param, "OCRLanguage") == 0) {
+        gs_param_string langstr;
+        if (pdev->ocr_language[0]) {
+            langstr.data = (const byte *)pdev->ocr_language;
+            langstr.size = strlen(pdev->ocr_language);
+            langstr.persistent = false;
+        } else {
+            langstr.data = (const byte *)"eng";
+            langstr.size = 3;
+            langstr.persistent = false;
+        }
+        return param_write_string(plist, "OCRLanguage", &langstr);
+    }
+    if (strcmp(Param, "OCREngine") == 0)
+        return param_write_int(plist, "OCREngine", &pdev->ocr_engine);
+
+    if (strcmp(Param, "UseOCR") == 0) {
+        gs_param_string ocrstr;
+        /* Report the UseOCR enum using its parameter-string spelling. */
+        switch(pdev->UseOCR) {
+            case UseOCRAsNeeded:
+                ocrstr.data = (const byte *)"AsNeeded";
+                ocrstr.size = 8;
+                break;
+            case UseOCRAlways:
+                ocrstr.data = (const byte *)"Always";
+                ocrstr.size = 6;
+                break;
+            case UseOCRNever:
+            default:
+                /* Unknown values report "Never" so ocrstr is always initialised. */
+                ocrstr.data = (const byte *)"Never";
+                ocrstr.size = 5;
+                break;
+        }
+        ocrstr.persistent = false;
+        return param_write_string(plist, "UseOCR", &ocrstr);
+    }
+#endif
+
     return gdev_psdf_get_param(dev, Param, list);
 }
 
@@ -269,6 +311,49 @@ gdev_pdf_get_params(gx_device * dev, gs_param_list * plist)
     int code;
     int cdv = CoreDistVersion;
 
+#if OCR_VERSION > 0
+    gs_param_string langstr;
+
+    if (pdev->ocr_language[0]) {
+        langstr.data = (const byte *)pdev->ocr_language;
+        langstr.size = strlen(pdev->ocr_language);
+        langstr.persistent = false;
+    } else {
+        langstr.data = (const byte *)"eng";
+        langstr.size = 3;
+        langstr.persistent = false;
+    }
+
+    {
+        gs_param_string ocrstr;
+        switch(pdev->UseOCR) { /* report the enum using its parameter-string spelling */
+            case UseOCRAsNeeded:
+                ocrstr.data = (const byte *)"AsNeeded";
+                ocrstr.size = 8;
+                break;
+            case UseOCRAlways:
+                ocrstr.data = (const byte *)"Always";
+                ocrstr.size = 6;
+                break;
+            case UseOCRNever:
+            default: /* unknown values report "Never" so ocrstr is always initialised */
+                ocrstr.data = (const byte *)"Never";
+                ocrstr.size = 5;
+                break;
+        }
+        ocrstr.persistent = false;
+        code = param_write_string(plist, "UseOCR", &ocrstr);
+        if (code < 0) /* don't let the next write overwrite a failure */
+            return code;
+    }
+    code = param_write_string(plist, "OCRLanguage", &langstr);
+    if(code < 0)
+        return code;
+    code = param_write_int(plist, "OCREngine", &pdev->ocr_engine);
+    if(code < 0)
+        return code;
+#endif
+
     pdev->ParamCompatibilityLevel = cl;
     code = gdev_psdf_get_params(dev, plist);
     if (code < 0 ||
@@ -367,6 +452,68 @@ gdev_pdf_put_params_impl(gx_device * dev, const gx_device_pdf * save_dev, gs_par
         }
     }
 
+#if OCR_VERSION > 0
+    /* OCRLanguage: copied into pdev->ocr_language, truncated to fit and NUL terminated. */
+    {
+        int len;
+        gs_param_string langstr;
+        switch (code = param_read_string(plist, (param_name = "OCRLanguage"), &langstr)) {
+            case 0:
+                len = langstr.size;
+                if (len >= sizeof(pdev->ocr_language))
+                    len = sizeof(pdev->ocr_language)-1;
+                memcpy(pdev->ocr_language, langstr.data, len);
+                pdev->ocr_language[len] = 0;
+                break;
+            case 1:
+                break;
+            default:
+                ecode = code;
+                param_signal_error(plist, param_name, ecode);
+        }
+    }
+
+    /* OCREngine: stored as given. */
+    {
+        int engine;
+        switch (code = param_read_int(plist, (param_name = "OCREngine"), &engine)) {
+            case 0:
+                pdev->ocr_engine = engine;
+                break;
+            case 1:
+                break;
+            default:
+                ecode = code;
+                param_signal_error(plist, param_name, ecode);
+        }
+    }
+
+    /* UseOCR: "Never", "AsNeeded" or "Always"; any other string leaves the
+     * current setting untouched. */
+    {
+        gs_param_string ocrstr;
+
+        code = param_read_string(plist, (param_name = "UseOCR"), &ocrstr);
+        switch(code) {
+            case 0:
+                if (ocrstr.size == 5 && memcmp(ocrstr.data, "Never", 5) == 0)
+                    pdev->UseOCR = UseOCRNever;
+                else if (ocrstr.size == 8 && memcmp(ocrstr.data, "AsNeeded", 8) == 0)
+                    pdev->UseOCR = UseOCRAsNeeded;
+                else if (ocrstr.size == 6 && memcmp(ocrstr.data, "Always", 6) == 0)
+                    pdev->UseOCR = UseOCRAlways;
+                break;
+            case 1:
+                break;
+            default:
+                /* Record the failure the same way the other OCR parameters do. */
+                ecode = code;
+                param_signal_error(plist, param_name, ecode);
+                break;
+        }
+    }
+#endif
+
     /*
      * Check for LockDistillerParams before doing anything else.
      * If LockDistillerParams is true and is not being set to false,
diff --git a/devices/vector/gdevpdfx.h b/devices/vector/gdevpdfx.h
index 16515fcdd..6123c652c 100644
--- a/devices/vector/gdevpdfx.h
+++ b/devices/vector/gdevpdfx.h
@@ -547,6 +547,33 @@ typedef enum {
     pdf_compress_Flate
 } pdf_compression_type;
 
+typedef enum {
+    OCR_UnInit,             /* 0: the value the device prototype starts with */
+    OCR_Rendering,          /* fill_mask records glyph bitmaps instead of writing them to the PDF */
+    OCR_Rendered,           /* bitmaps recorded; OCRText runs the OCR engine in this stage */
+    OCR_UnicodeAvailable,   /* OCR succeeded; results are held in OCRUnicode */
+    OCR_Failed              /* the OCR engine failed or returned an unexpected symbol count */
+} pdf_OCR_stage;
+
+typedef enum {
+    UseOCRNever,            /* parameter string "Never"; 0, the prototype default */
+    UseOCRAsNeeded,         /* parameter string "AsNeeded" */
+    UseOCRAlways            /* parameter string "Always" */
+} pdf_OCR_usage;
+
+typedef struct ocr_glyph_s{
+    byte *data;                 /* private copy of the glyph bitmap (raster * height bytes) */
+    int x;                      /* position passed to fill_mask */
+    int y;
+    int width;                  /* bitmap size in pixels */
+    int height;
+    int raster;                 /* bytes per bitmap row */
+    struct ocr_glyph_s *next;   /* next glyph in the list, NULL at the tail */
+    gs_char char_code;          /* pdev->OCR_char_code when the bitmap was recorded */
+    gs_glyph glyph;             /* pdev->OCR_glyph when the bitmap was recorded */
+    bool is_space;              /* true when every byte of the bitmap is zero */
+} ocr_glyph_t;
+
 /* Define the device structure. */
 struct gx_device_pdf_s {
     gx_device_psdf_common;
@@ -572,6 +599,9 @@ struct gx_device_pdf_s {
     gs_param_float_array PDFXTrimBoxToMediaBoxOffset;
     gs_param_float_array PDFXBleedBoxToTrimBoxOffset;
     bool PDFXSetBleedBoxToMediaBox;
+    /* OCR Parameters */
+    char ocr_language[1024];
+    int ocr_engine;
     /* Other parameters */
     bool ReAssignCharacters;
     bool ReEncodeCharacters;
@@ -909,6 +939,13 @@ struct gx_device_pdf_s {
                                      * anything in the image processing routines.
                                      */
     float UserUnit;
+    pdf_OCR_usage UseOCR;                     /* Never, AsNeeded or Always */
+    gs_text_enum_t* OCRSaved;       /* Saved state of the text enumerator before rendering glyph bitmaps for later OCR */
+    pdf_OCR_stage OCRStage;         /* Used to control a (sort of) state machine when using OCR to get a Unicode value for a glyph */
+    int *OCRUnicode;                /* Used to pass back the Unicode value from the OCR engine to the text processing */
+    gs_char OCR_char_code;          /* Passes the current character code from text processing to the image processing code when rendering glyph bitmaps for OCR */
+    gs_glyph OCR_glyph;             /* Passes the current glyph code from text processing to the image processing code when rendering glyph bitmaps for OCR */
+    ocr_glyph_t *ocr_glyphs;        /* Records bitmaps and other data from text processing when doing OCR */
 };
 
 #define is_in_page(pdev)\
diff --git a/devices/vector/gdevpdte.c b/devices/vector/gdevpdte.c
index 6f0eb158a..0310f54cf 100644
--- a/devices/vector/gdevpdte.c
+++ b/devices/vector/gdevpdte.c
@@ -43,6 +43,7 @@
 #include "gxcpath.h"
 
 #include "gsfcmap.h"
+#include "tessocr.h"
 
 static int pdf_char_widths(gx_device_pdf *const pdev,
                             pdf_font_resource_t *pdfont, int ch,
@@ -80,6 +81,216 @@ pdf_process_string_aux(pdf_text_enum_t *penum, gs_string *pstr,
     return pdf_process_string(penum, pstr, pfmat, ppts, gdata);
 }
 
+static int OCRText(gx_device_pdf *pdev, gs_glyph glyph, gs_char ch, gs_char *length, byte **unicode)
+{
+#if OCR_VERSION > 0
+    int code = 0;
+    /* Two steps: OCR the recorded glyph bitmaps (once), then look up the result for glyph/ch. */
+    if(pdev->OCRStage == OCR_Rendered) {
+        int llx, lly, urx, ury, char_count = 0, returned_count = 0, *returned;
+        ocr_glyph_t *next_glyph = pdev->ocr_glyphs;
+        int rows, stride, row, column;
+        byte *bitmap = NULL, *src, *dest, *rowptr, srcmask, destmask;
+        void *state;
+        const char *language = pdev->ocr_language;
+        gp_file *DbgFile;
+
+        if(language == NULL || language[0] == 0)
+            language = "eng";
+
+        /* We should already have rendered a bitmap for all the glyphs in the
+         * text operation, so this should be redundant, but best to be safe.
+         */
+        if(next_glyph == NULL)
+            return_error(gs_error_unknownerror);
+
+        /* Identify the bounding box of the recorded glyphs by examining the bounds and position
+         * of each glyph. At the same time count the number of expected returned characters.
+         * We treat any empty bitmap (all 0x00 bytes) as a space because, obviously, the
+         * OCR engine can't differentiate between a space character and no character at all.
+         */
+        llx = next_glyph->x;
+        lly = next_glyph->y;
+        urx = llx + next_glyph->width;
+        ury = lly + next_glyph->height;
+        if(next_glyph != NULL && !next_glyph->is_space)
+            char_count++;
+        next_glyph = (ocr_glyph_t *)next_glyph->next;
+        while(next_glyph) {
+            if(!next_glyph->is_space)
+                char_count++;
+            if(next_glyph->x < llx)
+                llx = next_glyph->x;
+            if(next_glyph->y < lly)
+                lly = next_glyph->y;
+            if(next_glyph->x + next_glyph->width > urx)
+                urx = next_glyph->x + next_glyph->width;
+            if(next_glyph->y + next_glyph->height > ury)
+                ury = next_glyph->y + next_glyph->height;
+            next_glyph = next_glyph->next;
+        }
+
+        /* Allocate and initialise the 'strip' bitmap which will receive all the
+         * individual glyph bitmaps.
+         */
+        rows = ury - lly;
+        stride = (((urx - llx) + 7) / 8) + 1; /* bytes per strip row, with one spare byte */
+        bitmap = gs_alloc_bytes(pdev->memory, rows * stride, "working OCR memory");
+        if(bitmap == NULL)
+            return_error(gs_error_VMerror);
+        memset(bitmap, 0x00, rows * stride);
+
+        /* Allocate a buffer for the OCR engine to return the Unicode code points. This needs work,
+         * we might want more information returned (bounding boxes and confidence levels) and we
+         * need to think about the possibility that the OCR engine finds more characters than we
+         * expected (e.g. fi ligatures returned as 'f' and 'i').
+         */
+        returned = (int *)gs_alloc_bytes(pdev->memory, char_count * sizeof(int), "returned unicodes");
+        if(returned == NULL) {
+            gs_free_object(pdev->memory, bitmap, "working OCR memory");
+            return_error(gs_error_VMerror);
+        }
+        memset(returned, 0x00, char_count * sizeof(int));
+
+        /* Now copy each glyph bitmap to the correct position in the strip. This is complicated
+         * by the fact that bitmaps are monochrome packed into bytes and so the destination
+         * may not be aligned on a byte boundary.
+         */
+        next_glyph = (ocr_glyph_t *)pdev->ocr_glyphs;
+        while(next_glyph) {
+            rowptr = bitmap + ((next_glyph->y - lly) * stride) + (int)floor((next_glyph->x - llx) / 8);
+            for(row = 0;row < next_glyph->height;row++) {
+                dest = rowptr + row * stride;
+                src = next_glyph->data + (row * next_glyph->raster);
+                destmask = 0x80 >> (next_glyph->x - llx) % 8; /* first destination bit within *dest */
+                srcmask = 0x80; /* source rows always start at the top bit of a byte */
+                for(column = 0; column < next_glyph->width;column++) {
+                    if(*src & srcmask) {
+                        *dest = *dest | destmask; /* OR in, so overlapping glyphs accumulate */
+                    }
+                    srcmask = srcmask >> 1;
+                    if(srcmask == 0) {
+                        srcmask = 0x80;
+                        src++;
+                    }
+                    destmask = destmask >> 1;
+                    if(destmask == 0) {
+                        destmask = 0x80;
+                        dest++;
+                    }
+                }
+            }
+            next_glyph = next_glyph->next;
+        }
+
+#if 0
+        DbgFile = gp_fopen(pdev->memory, "d:/temp/bits.txt", "wb+");
+        for(row = 0;row < rows;row++) {
+            for(column = 0;column < stride;column++) {
+                dest = bitmap + (row * stride);
+                gp_fprintf(DbgFile, "%02x", dest[column]);
+            }
+            gp_fprintf(DbgFile, "\n");
+        }
+        gp_fclose(DbgFile);
+#endif
+        /* Initialise the OCR engine */
+        code = ocr_init_api(pdev->memory->non_gc_memory, language,
+            pdev->ocr_engine, &state);
+        if(code < 0) {
+            gs_free_object(pdev->memory, bitmap, "working OCR memory");
+            gs_free_object(pdev->memory, returned, "returned unicodes");
+            return code;
+        }
+        returned_count = char_count; /* in: capacity of 'returned'; out: symbols found */
+
+        /* Pass our strip to the OCR engine */
+        code = ocr_bitmap_to_unicodes(state,
+            bitmap, 0, stride * 8, rows, stride,
+            (int)pdev->HWResolution[0],
+            (int)pdev->HWResolution[1],
+            returned, &returned_count);
+
+        /* and close the engine back down again */
+        ocr_fin_api(pdev->memory->non_gc_memory, state);
+        gs_free_object(pdev->memory, bitmap, "working OCR memory");
+
+        if(code < 0) {
+            pdev->OCRStage = OCR_Failed;
+            gs_free_object(pdev->memory, returned, "returned unicodes");
+            return code;
+        }
+
+        /* Future enhancement we should fall back to trying the individual bitmap here */
+        if(returned_count != char_count) {
+            pdev->OCRStage = OCR_Failed;
+            gs_free_object(pdev->memory, returned, "returned unicodes");
+            return 0;
+        }
+        pdev->OCRUnicode = returned;
+
+        /* The stored bitmaps have been OCR'ed; the lookup below can now use the results */
+        pdev->OCRStage = OCR_UnicodeAvailable;
+    }
+
+    if(pdev->OCRStage == OCR_UnicodeAvailable) {
+        /* We've OCR'ed the bitmaps already, find the unicode value */
+        ocr_glyph_t *new_glyph = (ocr_glyph_t *)pdev->ocr_glyphs;
+        int ocr_index = 0;
+        uint mask = 0xFF;
+        int ix;
+        char *u;
+
+        /* Find the bitmap which matches the character/glyph we are processing */
+        while(new_glyph) {
+            if(new_glyph->char_code == ch || new_glyph->glyph == glyph) {
+                ocr_glyph_t *g1 = pdev->ocr_glyphs;
+
+                /* Spaces are handled specially, so just jump out now */
+                if(new_glyph->is_space)
+                    break;
+
+                /* Otherwise, find all the bitmaps which lie to the left of the
+                 * one we found (we are assuming for now that the returned
+                 * Unicode values are left to right)
+                 */
+                while(g1) {
+                    if(!g1->is_space) {
+                        if(g1->x < new_glyph->x)
+                            ocr_index++;
+                    }
+                    g1 = g1->next;
+                }
+                break;
+            }
+            new_glyph = new_glyph->next;
+        }
+
+        /* If we found a matching bitmap, get the corresponding unicode code point from
+         * the stored values returned by the OCR engine.
+         */
+        if(new_glyph) {
+            *unicode = (byte *)gs_alloc_bytes(pdev->memory, 2 * sizeof(ushort), "temporary Unicode array");
+            if(*unicode == NULL)
+                return_error(gs_error_VMerror);
+            u = (char *)(*unicode);
+            if(new_glyph->is_space) {
+                memset(u, 0x00, 3);
+                u[3] = 0x20; /* 00 00 00 20: a space, most significant byte first */
+            }
+            else {
+                for(ix = 0;ix < 4;ix++) {
+                    u[3 - ix] = (pdev->OCRUnicode[ocr_index] & mask) >> (8 * ix); /* least significant byte goes last */
+                    mask = mask << 8;
+                }
+            }
+            *length = 4;
+        }
+    }
+    #endif
+    return 0;
+}
+
 /*
  * Add char code pair to ToUnicode CMap,
  * creating the CMap on neccessity.
@@ -87,27 +298,43 @@ pdf_process_string_aux(pdf_text_enum_t *penum, gs_string *pstr,
 int
 pdf_add_ToUnicode(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_t *pdfont,
                   gs_glyph glyph, gs_char ch, const gs_const_string *gnstr)
-{   int code;
-    gs_char length;
+{   int code = 0;
+    gs_char length = 0;
     ushort *unicode = 0;
 
     if (glyph == GS_NO_GLYPH)
         return 0;
-    length = font->procs.decode_glyph((gs_font *)font, glyph, ch, NULL, 0);
-    if ((length == 0 || length == GS_NO_CHAR) && gnstr != NULL && gnstr->size == 7) {
-        if (!memcmp(gnstr->data, "uni", 3)) {
-            static const char *hexdigits = "0123456789ABCDEF";
-            char *d0 = strchr(hexdigits, gnstr->data[3]);
-            char *d1 = strchr(hexdigits, gnstr->data[4]);
-            char *d2 = strchr(hexdigits, gnstr->data[5]);
-            char *d3 = strchr(hexdigits, gnstr->data[6]);
-
-            unicode = (ushort *)gs_alloc_bytes(pdev->memory, sizeof(ushort), "temporary Unicode array");
-            if (d0 != NULL && d1 != NULL && d2 != NULL && d3 != NULL) {
-                char *u = (char *)unicode;
-                u[0] = ((d0 - hexdigits) << 4) + ((d1 - hexdigits));
-                u[1] = ((d2 - hexdigits) << 4) + ((d3 - hexdigits));
-                length = 2;
+    if(pdev->UseOCR == UseOCRAlways) {
+        code = OCRText(pdev, glyph, ch, &length, (byte **)&unicode);
+        if(code < 0)
+            return code;
+    }
+    else {
+        length = font->procs.decode_glyph((gs_font *)font, glyph, ch, NULL, 0);
+        if(length == 0 || length == GS_NO_CHAR) {
+            if(gnstr != NULL && gnstr->size == 7) {
+                if(!memcmp(gnstr->data, "uni", 3)) {
+                    static const char *hexdigits = "0123456789ABCDEF";
+                    char *d0 = strchr(hexdigits, gnstr->data[3]);
+                    char *d1 = strchr(hexdigits, gnstr->data[4]);
+                    char *d2 = strchr(hexdigits, gnstr->data[5]);
+                    char *d3 = strchr(hexdigits, gnstr->data[6]);
+
+                    unicode = (ushort *)gs_alloc_bytes(pdev->memory, sizeof(ushort), "temporary Unicode array");
+                    if(d0 != NULL && d1 != NULL && d2 != NULL && d3 != NULL) {
+                        char *u = (char *)unicode;
+                        u[0] = ((d0 - hexdigits) << 4) + ((d1 - hexdigits));
+                        u[1] = ((d2 - hexdigits) << 4) + ((d3 - hexdigits));
+                        length = 2;
+                    }
+                }
+            }
+            else {
+                if(pdev->UseOCR != UseOCRNever) {
+                    code = OCRText(pdev, glyph, ch, &length, (byte **)&unicode);
+                    if(code < 0)
+                        return code;
+                }
             }
         }
     }
@@ -163,6 +390,7 @@ pdf_add_ToUnicode(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_t *pdfon
         if (length > 2 && pdfont->u.simple.Encoding != NULL)
             pdfont->TwoByteToUnicode = 0;
     }
+
     if (unicode)
         gs_free_object(pdev->memory, unicode, "temporary Unicode array");
     return 0;
@@ -255,8 +483,11 @@ pdf_encode_string_element(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_
     pet = &pdfont->u.simple.Encoding[ch];
     glyph = (gdata == NULL ? font->procs.encode_char(font, ch, GLYPH_SPACE_NAME)
                            : *gdata);
-    if (glyph == GS_NO_GLYPH || glyph == pet->glyph)
+    if (glyph == GS_NO_GLYPH || glyph == pet->glyph) {
+        if((pdfont->cmap_ToUnicode == NULL || !gs_cmap_ToUnicode_check_pair(pdfont->cmap_ToUnicode, ch)) && pdev->UseOCR != UseOCRNever)
+            (void)pdf_add_ToUnicode(pdev, font, pdfont, glyph, ch, &gnstr);
         return 0;
+    }
     if (pet->glyph != GS_NO_GLYPH) { /* encoding conflict */
         return_error(gs_error_rangecheck);
         /* Must not happen because pdf_obtain_font_resource
@@ -358,7 +589,7 @@ pdf_encode_string_element(gx_device_pdf *pdev, gs_font *font, pdf_font_resource_
         * The decision about writing it out is deferred until pdf_write_font_resource.
         */
     code = pdf_add_ToUnicode(pdev, font, pdfont, glyph, ch, &gnstr);
-    if (code < 0)
+    if(code < 0)
         return code;
     pet->glyph = glyph;
     pet->str = gnstr;
@@ -1035,6 +1266,7 @@ process_text_return_width(const pdf_text_enum_t *pte, gs_font_base *font,
         {  const gs_glyph *gdata_i = (gdata != NULL ? gdata + i : 0);
 
             code = pdf_encode_string_element(pdev, (gs_font *)font, pdfont, ch, gdata_i);
+
             if (code < 0)
                 return code;
         }
diff --git a/devices/vector/gdevpdtt.c b/devices/vector/gdevpdtt.c
index 6a89fedf7..e4c1d1471 100644
--- a/devices/vector/gdevpdtt.c
+++ b/devices/vector/gdevpdtt.c
@@ -299,13 +299,29 @@ static void
 pdf_text_release(gs_text_enum_t *pte, client_name_t cname)
 {
     pdf_text_enum_t *const penum = (pdf_text_enum_t *)pte;
+    gx_device_pdf *pdev = (gx_device_pdf *)penum->dev;
+    ocr_glyph_t *next;
 
     if (penum->pte_default) {
         gs_text_release(NULL, penum->pte_default, cname);
         penum->pte_default = 0;
     }
     pdf_text_release_cgp(penum);
+    /* Free every node on the device's ocr_glyphs list, together with each node's data buffer. */
+    while (pdev->ocr_glyphs != NULL)
+    {
+        next = pdev->ocr_glyphs->next; /* read the link before the node is freed */
+
+        gs_free_object(pdev->memory, pdev->ocr_glyphs->data, "free bitmap");
+        gs_free_object(pdev->memory, pdev->ocr_glyphs, "free bitmap");
+        pdev->ocr_glyphs = next;
+    }
+    if (pdev->OCRUnicode != NULL)
+        gs_free_object(pdev->memory, pdev->OCRUnicode, "free returned unicodes");
+    pdev->OCRUnicode = NULL;
+
     gx_default_text_release(pte, cname);
+    pdev->OCRStage = 0; /* NOTE(review): presumably the same value as OCR_UnInit - confirm against the enum */
 }
 void
 pdf_text_release_cgp(pdf_text_enum_t *penum)
@@ -3152,6 +3168,83 @@ static int pdf_query_purge_cached_char(const gs_memory_t *mem, cached_char *cc,
     return 0;
 }
 
+/*
+ * Run the text through the default text implementation so that its glyphs
+ * are rendered for OCR before pdfwrite processes the text itself.
+ *
+ * First call (OCRStage == OCR_UnInit): save the graphics state, stash a copy
+ * of the enumerator in pdev->OCRSaved and begin a default enumerator.
+ * While OCRStage == OCR_Rendering: step the default enumerator; once it
+ * completes, restore the graphics state and the stashed enumerator and move
+ * to OCR_Rendered.
+ *
+ * Returns 0 when rendering is complete (or was completed earlier); otherwise
+ * the code the caller must return: TEXT_PROCESS_RENDER, any other non-zero
+ * result of gs_text_process(), or an error raised while setting up.  Every
+ * exit taken because of a failure releases pdev->OCRSaved and balances the gsave.
+ */
+static int ProcessTextForOCR(gs_text_enum_t *pte)
+{
+    pdf_text_enum_t *const penum = (pdf_text_enum_t *)pte;
+    gx_device_pdf *pdev = (gx_device_pdf *)penum->dev;
+    gs_text_enum_t *pte_default;
+    int code;
+
+    if (pdev->OCRStage == OCR_UnInit) {
+        code = gs_gsave(pte->pgs);
+        if (code < 0)
+            return code;
+        pdev->OCRSaved = (gs_text_enum_t*)gs_alloc_bytes(pdev->memory,sizeof(gs_text_enum_t),"saved enumerator for OCR");
+        if(pdev->OCRSaved == NULL) {
+            /* Do not leave the gsave above unbalanced. */
+            gs_grestore(pte->pgs);
+            return_error(gs_error_VMerror);
+        }
+        *(pdev->OCRSaved) = *pte;
+        gs_text_enum_copy_dynamic(pdev->OCRSaved,pte,true);
+
+        code = pdf_default_text_begin(pte, &pte->text, &pte_default);
+        if (code < 0) {
+            /* No default enumerator was started: drop the stashed copy and the gsave. */
+            gs_free_object(pdev->memory, pdev->OCRSaved,"saved enumerator for OCR");
+            pdev->OCRSaved = NULL;
+            gs_grestore(pte->pgs);
+            return code;
+        }
+        penum->pte_default = pte_default;
+        gs_text_enum_copy_dynamic(pte_default, pte, false);
+        pdev->OCRStage = OCR_Rendering;
+    }
+
+    if (pdev->OCRStage == OCR_Rendering) {
+        penum->pte_default->can_cache = 0;
+        code = gs_text_process(penum->pte_default);
+        pdev->OCR_char_code = penum->pte_default->returned.current_char;
+        pdev->OCR_glyph = penum->pte_default->returned.current_glyph;
+        gs_text_enum_copy_dynamic(pte, penum->pte_default, true);
+        /* Keep all OCR state on TEXT_PROCESS_RENDER: the stage stays OCR_Rendering for the next call. */
+        if (code == TEXT_PROCESS_RENDER)
+            return code;
+        if (code != 0) {
+            gs_free_object(pdev->memory, pdev->OCRSaved,"saved enumerator for OCR");
+            pdev->OCRSaved = NULL;
+            gs_grestore(pte->pgs);
+            gs_text_release(pte->pgs, penum->pte_default, "pdf_text_process");
+            penum->pte_default = NULL;
+            return code;
+        }
+        gs_grestore(pte->pgs);
+        *pte = *(pdev->OCRSaved);
+        gs_text_enum_copy_dynamic(pte, pdev->OCRSaved, true);
+        gs_free_object(pdev->memory, pdev->OCRSaved,"saved enumerator for OCR");
+        pdev->OCRSaved = NULL;
+        gs_text_release(pte->pgs, penum->pte_default, "pdf_text_process");
+        penum->pte_default = NULL;
+        pdev->OCRStage = OCR_Rendered;
+    }
+    return 0;
+}
+
 /*
  * Continue processing text.  This is the 'process' procedure in the text
  * enumerator.  Per the check in pdf_text_begin, we know the operation is
@@ -3207,6 +3274,12 @@ pdf_text_process(gs_text_enum_t *pte)
             goto default_impl;
     }
 
+    if (pdev->UseOCR != UseOCRNever) {
+        code = ProcessTextForOCR(pte);
+        if (code != 0)
+            return code;
+    }
+
     code = -1;                /* to force default implementation */
 
     /*
@@ -3547,6 +3620,7 @@ pdf_text_process(gs_text_enum_t *pte)
         }
 
         gs_text_enum_copy_dynamic(pte, pte_default, true);
+
         if (code)
             return code;
         gs_text_release(NULL, pte_default, "pdf_text_process");
diff --git a/doc/Devices.htm b/doc/Devices.htm
index 91994c82d..87e015f2e 100644
--- a/doc/Devices.htm
+++ b/doc/Devices.htm
@@ -76,6 +76,7 @@
 <ul>
 <li><a href="#OCR">OCR text output</a></li>
 <li><a href="#PDFocr">Bitmap PDF output (with OCR text)</a></li>
+<li><a href="#PDFwriteocr">Vector PDF output (with OCR Unicode CMaps)</a></li>
 </ul>
 <li><a href="#High-level">High level formats</a></li>
 <ul>
@@ -1059,6 +1060,37 @@ resolution independence, and editability.</p>
 </p>
 <p>
 
+<h3><a name="PDFwriteocr"></a>Vector PDF output (with OCR Unicode CMaps)</h3>
+<p>
+The pdfwrite device has been augmented to use the OCR engine to analyse text
+(not images!) in the input stream, and derive Unicode code points for it.
+That information can then be used to create ToUnicode CMaps which are attached
+to the Font (or CIDFont) objects embedded in the PDF file.
+</p>
+<p>
+Fonts which have ToUnicode CMaps can be reliably (limited by the accuracy of
+the CMap) used in search and copy/paste functions, as well as text extraction
+from PDF files. Note that OCR is not a 100% perfect process; it is possible
+that some text might be misidentified.
+</p>
+<p>
+OCR is a slow operation! In addition it can (for Latin text at least) sometimes
+be preferable not to add ToUnicode information which may be incorrect, but instead
+to use the existing font Encoding. For English text this may give better results.
+</p>
+<p>For these reasons the OCR functionality of pdfwrite can be controlled by using a new
+parameter <code>-sUseOCR</code>. This has three possible values:
+</p>
+<dl>
+<dt><code>-sUseOCR=</code><b><em>string</em></b></dt>
+<dd>
+  <dl>
+    <dt>Never<dd>Default - don't use OCR at all even if support is built-in.
+    <dt>AsNeeded<dd>If there is no existing ToUnicode information, use OCR.
+    <dt>Always<dd>Ignore any existing information and always use OCR.
+  </dl>
+</dd></dl>
+
 <hr>
 
 <h2><a name="High-level"></a>High-level devices</h2>
diff --git a/doc/VectorDevices.htm b/doc/VectorDevices.htm
index f188e44df..2e9e2dcde 100644
--- a/doc/VectorDevices.htm
+++ b/doc/VectorDevices.htm
@@ -986,6 +986,59 @@ displaying document's properties,
 so we recommend this value.
 </dl>
 
+<dl>
+<dt><code>-sUseOCR=</code><em>string</em>
+<dd>Controls the use of OCR in pdfwrite. If enabled this will use an OCR
+engine to analyse the glyph bitmaps used to draw text in a PDF file, and
+the resulting Unicode code points are then used to construct a ToUnicode
+CMap.
+<p>
+The text of a PDF file containing ToUnicode CMaps can be searched, copied and
+pasted, and extracted, subject to the accuracy of the ToUnicode CMap. Since not
+all PDF files contain these it can be beneficial to create them.
+</p>
+<p>
+Note that, for English text, it is possible that the existing standard character
+encoding (which most PDF consumers will fall back to in the absence of Unicode
+information) is better than using OCR, as OCR is not a 100% reliable process.
+OCR processing is also comparatively slow.
+</p>
+<p>
+For the reasons above it is useful to be able to exercise some control over the
+action of pdfwrite when OCR processing is available, and the <code>UseOCR</code>
+parameter provides that control. There are three possible values:
+</p>
+<ul><li><code>Never</code> Default - don't use OCR at all even if support is built-in.
+<li><code>AsNeeded</code> If there is no existing ToUnicode information, use OCR.
+<li><code>Always</code> Ignore any existing information and always use OCR.</ul>
+<p>
+Our experimentation with the Tesseract OCR engine has shown that the more text we
+can supply for the engine to look at, the better the result we get. We are, unfortunately,
+limited to the graphics library operations for text as follows.
+</p>
+<p>
+The code works on text 'fragments'; these are the text sequences sent to the text
+operators of the source language. Generally most input languages will try to send
+text in its simplest form, eg "Abc", but the requirements of justification, kerning
+and so on mean that sometimes each character is positioned independently on the page.
+</p>
+<p>
+So pdfwrite renders all the bitmaps for every character in the text document, when
+set up to use OCR. Later, if any character in the font does not have a Unicode
+value already we use the bitmaps to assemble a 'strip' of text which we then send
+to the OCR engine. If the engine returns a different number of recognised characters
+than we expected then we ignore that result. We've found that (for English text)
+constructions such as ". T" tend to ignore the full stop, presumably because the OCR
+engine thinks that it is simply noise. In contrast "text." does identify the full
+stop correctly. So by ignoring the failed result we can get a better result later.
+</p>
+<p>
+Obviously this is all heuristic and undoubtedly there is more we can do to improve the
+functionality here, but we need concrete examples to work from.
+</p>
+</dd>
+</dl>
+
 <h3><a name="PS"></a>PostScript file output</h3>
 <p>
 The <code>ps2write</code> device handles the same set of distiller