Use ToUnicode for substituted CIDFonts

When we substitute a CIDFont in a PDF file with a TTF from disk, use the ToUnicode CMap (if available) to extract Unicode values. In many cases, this gives a more accurate character code to glyph mapping than the previous reliance on the low-level glyph ordering from the TTF. This also meant becoming more robust against broken ToUnicode CMaps. Also, add an option (-dIgnoreToUnicode) to skip parsing of the ToUnicode CMap, for cases where a correctly formed ToUnicode CMap is semantically wrong.
author: Chris Liddell <chris.liddell@artifex.com> 2016-07-19 08:43:35 +0100
committer: Chris Liddell <chris.liddell@artifex.com> 2016-10-05 16:47:41 +0100
commit: d609a3d4c1b8583d1c22db6b4d3bce4b239cf88c (patch)
tree: 6429eca837ecbcf2104cac0e80e87e8cf63defc6
parent: 3089131dc90ec008ff540d41df0f1a9fbc2dd47b (diff)
download: ghostpdl-d609a3d4c1b8583d1c22db6b4d3bce4b239cf88c.tar.gz
6 files changed, 177 insertions, 95 deletions
diff --git a/Resource/Init/pdf_font.ps b/Resource/Init/pdf_font.ps
index 9e05e0916..57dbb5ea7 100644
--- a/Resource/Init/pdf_font.ps
+++ b/Resource/Init/pdf_font.ps
@@ -582,104 +582,148 @@ currentdict end readonly def
   } if
 } bind def
 
+/.DoToUnicode?
+{
+  /IgnoreToUnicode where
+  {/IgnoreToUnicode get not}
+  {//true} ifelse
+} bind def
+
 /.processToUnicode   % <font-resource> <font-dict> <encoding|null> .processToUnicode -
 {
-  % Currently pdfwrite is only device which can handle GlyphNames2Unicoide to
-  % generate a ToUnicode CMaps. So don't bother with other devices.
-  /WantsToUnicode /GetDeviceParam .special_op {
-      exch pop
-  }{
-      //false
-  }ifelse
+  //.DoToUnicode? exec
   {
-    PDFDEBUG {
-      (.processToUnicode beg) =
-    } if
-    2 index /ToUnicode knownoget {
-      dup type /dicttype eq { dup /File known not } { //true } ifelse {
-        % We undefine wrong /Length and define /File in stream dictionaries.
-        % Bug687351.pdf defines /ToUnicode /Identity-H, what is incorrect.
-        (   **** Warning: Ignoring bad ToUnicode CMap.\n)  pdfformatwarning
-        pop
-      } {
-        /PDFScanRules .getuserparam dup //null eq {
-          pop //PDFScanRules_null
-        } {
-          1 dict dup /PDFScanRules 4 -1 roll put
-        } ifelse
-        //PDFScanRules_true setuserparams
-        PDFfile fileposition 
-        3 -1 roll
-        count 1 sub
-        countdictstack
-        { //false resolvestream
-          % Following Acrobat we ignore everything outside
-          %   begincodespacerange .. endcmap.
-          dup 0 (begincodespacerange) /SubFileDecode filter flushfile
-          /CIDInit /ProcSet findresource begin
-          //ToUnicodeCMapReader begin
-          12 dict begin
-          /CMapType 2 def
-          mark exch % emulate 'begincodespacerange'
-          0 (endcmap) /SubFileDecode filter cvx /begincmap cvx exch 2 .execn
-          endcmap
-          userdict /.lastToUnicode currentdict put
-          end end end
-        }
-
-        PDFSTOPONERROR {
-          { exec } 0 get
-          //false
-          5 -2 roll
-          5
-        } {
-          { stopped } 0 get
-          4 2 roll
-          4
-        } ifelse
-        array astore cvx exec
-
-        countdictstack exch sub 0 .max { end } repeat
-        count exch sub 2 sub 0 .max { exch pop } repeat
-        3 1 roll                     % Stach the stop flag.
-        PDFfile exch setfileposition
-        setuserparams
+    currentdict count 1 sub /.stackdepth exch .forceput
+    currentdict countdictstack /.dstackdepth exch .forceput
+    {
+      1 index /FontType get 0 eq
+      {
+        1 index /FDepVector get 0 get
+        dup /FontType .knownget not
         {
-          (   **** Warning: Failed to read ToUnicode CMap.\n)  pdfformatwarning
-        } {
-          1 index /FontInfo .knownget not {
-            currentglobal 2 index dup gcheck setglobal
-            /FontInfo 5 dict dup 5 1 roll .forceput
-            setglobal
-          } if
-          dup /GlyphNames2Unicode .knownget not {
-            //true			    % No existing G2U, make one
+          dup /CIDFontType .knownget
+          { dup 2 eq {pop 11} if }
+          {-1} % just some value that's not a valid font type
+          ifelse
+        }if
+        11 eq
+        { /Path known}
+       {pop //false}
+        ifelse
+      }
+      {//false} ifelse
+      % Currently pdfwrite is only device which can handle GlyphNames2Unicoide to
+      % generate a ToUnicode CMaps. So don't bother with other devices.
+      /WantsToUnicode /GetDeviceParam .special_op {
+        exch pop
+      }{
+        //false
+      }ifelse
+      or
+      {
+        PDFDEBUG {
+          (.processToUnicode beg) =
+        } if
+        2 index /ToUnicode knownoget {
+          dup type /dicttype eq { dup /File known not } { //true } ifelse {
+            % We undefine wrong /Length and define /File in stream dictionaries.
+            % Bug687351.pdf defines /ToUnicode /Identity-H, what is incorrect.
+            (   **** Warning: Ignoring bad ToUnicode CMap.\n)  pdfformatwarning
+            pop
           } {
-            dup wcheck {
-              //false			    % Existing, writeable G2U, don't make new one
+            /PDFScanRules .getuserparam dup //null eq {
+              pop //PDFScanRules_null
+            } {
+              1 dict dup /PDFScanRules 4 -1 roll put
+            } ifelse
+            //PDFScanRules_true setuserparams
+            PDFfile fileposition
+            3 -1 roll
+            count 1 sub
+            countdictstack
+            { //false resolvestream
+              % Following Acrobat we ignore everything outside
+              %   begincodespacerange .. endcmap.
+              dup 0 (begincodespacerange) /SubFileDecode filter flushfile
+              /CIDInit /ProcSet findresource begin
+              //ToUnicodeCMapReader begin
+              12 dict begin
+              /CMapType 2 def
+              mark exch % emulate 'begincodespacerange'
+              0 (endcmap) /SubFileDecode filter cvx /begincmap cvx exch 2 .execn
+              endcmap
+              userdict /.lastToUnicode currentdict put
+              end end end
+            }
+
+            PDFSTOPONERROR {
+              { exec } 0 get
+              //false
+              5 -2 roll
+              5
+            } {
+              { stopped } 0 get
+              4 2 roll
+              4
+            } ifelse
+            array astore cvx exec
+
+            countdictstack exch sub 0 .max { end } repeat
+            count exch sub 2 sub 0 .max { exch pop } repeat
+            3 1 roll                     % Stach the stop flag.
+            PDFfile exch setfileposition
+            setuserparams
+            {
+              (   **** Warning: Failed to read ToUnicode CMap.\n)  pdfformatwarning
             } {
-              pop //true			    % Existing read only G2U, make new one
+              1 index /FontInfo .knownget not {
+                currentglobal 2 index dup gcheck setglobal
+                /FontInfo 5 dict dup 5 1 roll .forceput
+                setglobal
+              } if
+              dup /GlyphNames2Unicode .knownget not {
+                //true                        % No existing G2U, make one
+              } {
+                dup wcheck {
+                  //false                     % Existing, writeable G2U, don't make new one
+                } {
+                  pop //true                          % Existing read only G2U, make new one
+                } ifelse
+              } ifelse
+              {
+                currentglobal exch dup gcheck setglobal
+                dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
+                3 2 roll setglobal
+              } if                                 % font-res font-dict encoding|null font-info g2u
+              exch pop exch                        % font-res font-dict g2u encoding|null
+              userdict /.lastToUnicode get         % font-res font-dict g2u Encoding|null CMap
+              .convert_ToUnicode-into-g2u          % font-res font-dict
+              //null                               % font-res font-dict null
             } ifelse
           } ifelse
-          {
-            currentglobal exch dup gcheck setglobal
-            dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
-            3 2 roll setglobal
-          } if                                 % font-res font-dict encoding|null font-info g2u
-          exch pop exch                        % font-res font-dict g2u encoding|null
-          userdict /.lastToUnicode get         % font-res font-dict g2u Encoding|null CMap
-          .convert_ToUnicode-into-g2u          % font-res font-dict
-          //null                               % font-res font-dict null
-        } ifelse
-      } ifelse
-    } if
-    PDFDEBUG {
-      (.processToUnicode end) =
+        } if
+        PDFDEBUG {
+          (.processToUnicode end) =
+        } if
+      } if
+    } stopped
+    {
+      .dstackdepth 1 countdictstack 1 sub
+      {pop end} for
+      .stackdepth 1 count 3 sub
+      {pop pop} for
     } if
-  } if
-  pop pop pop
+    pop pop pop
+    currentdict /.stackdepth .forceundef
+    currentdict /.dstackdepth .forceundef
+  }
+  {pop pop pop}
+  ifelse
+
 } bind def
 
+currentdict /.DoToUnicode? .forceundef
+
 % ---------------- Descriptors ---------------- %
 
 % Partial descriptors for the 14 built-in fonts.  Note that
@@ -1445,6 +1489,7 @@ currentdict /CMap_read_dict undef
                     % Stack: fontres name font
   3 copy exch pop //null .processToUnicode
   exch pop .completefont % Stack: fontres font
+
   1 index /FontMatrix knownoget {
     dup aload pop //true {0 0 1 0 0 1} {3 -1 roll eq and} forall {
       1 index exch makefont exch /FontName get exch
diff --git a/base/gstext.c b/base/gstext.c
index 409869d97..bc1c3722f 100644
--- a/base/gstext.c
+++ b/base/gstext.c
@@ -227,6 +227,7 @@ gs_text_enum_copy_dynamic(gs_text_enum_t *pto, const gs_text_enum_t *pfrom,
 
     pto->current_font = pfrom->current_font;
     pto->index = pfrom->index;
+    pto->bytes_decoded = pfrom->bytes_decoded;
     pto->xy_index = pfrom->xy_index;
     pto->fstack.depth = depth;
     pto->FontBBox_as_Metrics2 = pfrom->FontBBox_as_Metrics2;
diff --git a/base/gxfapi.c b/base/gxfapi.c
index 7ab812189..e3d6a69ea 100644
--- a/base/gxfapi.c
+++ b/base/gxfapi.c
@@ -1391,7 +1391,7 @@ gs_fapi_do_char(gs_font *pfont, gs_gstate *pgs, gs_text_enum_t *penum, char *fon
 
     if (I->ff.get_glyphname_or_cid) {
         if ((code =
-             I->ff.get_glyphname_or_cid(pbfont, charstring, glyphname, index,
+             I->ff.get_glyphname_or_cid(penum, pbfont, charstring, glyphname, index,
                                         &enc_char_name_string, font_file_path,
                                         &cr, bCID)) < 0)
             return (code);
diff --git a/base/gxfapi.h b/base/gxfapi.h
index 242bc7e97..87a9ef40f 100644
--- a/base/gxfapi.h
+++ b/base/gxfapi.h
@@ -184,7 +184,7 @@ struct gs_fapi_font_s
                                            byte *buf, ushort buf_length);
     int (*get_glyphdirectory_data) (gs_fapi_font *ff, int char_code,
                                     const byte **ptr);
-    int (*get_glyphname_or_cid) (gs_font_base *pbfont,
+    int (*get_glyphname_or_cid) (gs_text_enum_t *penum, gs_font_base *pbfont,
                                  gs_string *charstring, gs_string *name,
                                  int ccode, gs_string *enc_char_name,
                                  char *font_file_path, gs_fapi_char_ref *cr,
diff --git a/pcl/pl/plfapi.c b/pcl/pl/plfapi.c
index e2cb86cfe..2e55a35bd 100644
--- a/pcl/pl/plfapi.c
+++ b/pcl/pl/plfapi.c
@@ -65,7 +65,7 @@ static ulong
 pl_fapi_get_long(gs_fapi_font * ff, gs_fapi_font_feature var_id, int index);
 
 static int
-pl_fapi_get_cid(gs_font_base * pbfont, gs_string * charstring,
+pl_fapi_get_cid(gs_text_enum_t *penum, gs_font_base * pbfont, gs_string * charstring,
                 gs_string * name, int ccode, gs_string * enc_char_name,
                 char *font_file_path, gs_fapi_char_ref * cr, bool bCID);
 
@@ -146,7 +146,7 @@ pl_fapi_get_long(gs_fapi_font * ff, gs_fapi_font_feature var_id, int index)
 }
 
 static int
-pl_fapi_get_cid(gs_font_base * pbfont, gs_string * charstring,
+pl_fapi_get_cid(gs_text_enum_t *penum, gs_font_base * pbfont, gs_string * charstring,
                 gs_string * name, int ccode, gs_string * enc_char_name,
                 char *font_file_path, gs_fapi_char_ref * cr, bool bCID)
 {
@@ -157,6 +157,7 @@ pl_fapi_get_cid(gs_font_base * pbfont, gs_string * charstring,
     (void)enc_char_name;
     (void)font_file_path;
     (void)bCID;
+    (void)penum;
 
     if (plfont->allow_vertical_substitutes) {
         vertical = pl_font_vertical_glyph(ccode, plfont);
diff --git a/psi/zfapi.c b/psi/zfapi.c
index c186b99b8..3448423ef 100644
--- a/psi/zfapi.c
+++ b/psi/zfapi.c
@@ -1322,7 +1322,8 @@ ps_fapi_get_metrics(gs_fapi_font *ff, gs_string *char_name, int cid,
 
 
 /* forward declaration for the ps_ff_stub assignment */
-static int ps_get_glyphname_or_cid(gs_font_base *pbfont,
+static int ps_get_glyphname_or_cid(gs_text_enum_t *penum,
+                                   gs_font_base *pbfont,
                                    gs_string *charstring, gs_string *name,
                                    int ccode, gs_string *enc_char_name,
                                    char *font_file_path,
@@ -1809,7 +1810,8 @@ find_substring(const byte *where, int length, const char *what)
 }
 
 static int
-ps_get_glyphname_or_cid(gs_font_base *pbfont, gs_string *charstring,
+ps_get_glyphname_or_cid(gs_text_enum_t *penum,
+                        gs_font_base *pbfont, gs_string *charstring,
                         gs_string *name, int ccode,
                         gs_string *enc_char_name, char *font_file_path,
                         gs_fapi_char_ref *cr, bool bCID)
@@ -1825,9 +1827,35 @@ ps_get_glyphname_or_cid(gs_font_base *pbfont, gs_string *charstring,
         ((pbfont->FontType == ft_encrypted
           || pbfont->FontType == ft_encrypted2) && font_file_path == NULL);
     i_ctx_t *i_ctx_p = (i_ctx_t *) I->client_ctx_p;
+    bool unicode_cp = false;
 
     /* Obtain the character name : */
     if (bCID) {
+        if (pbfont->FontType == ft_CID_TrueType && font_file_path) {
+            ref *pdr2, *fidr, *dummy;
+            pdr2 = pfont_dict(gs_rootfont(igs));
+            if (dict_find_string(pdr2, "FontInfo", &fidr) &&
+                dict_find_string(fidr, "GlyphNames2Unicode", &dummy))
+            {
+                unsigned char uc[4] = {0};
+                unsigned int cc = 0;
+                int i, l;
+                byte *c = (byte *)&penum->text.data.bytes[penum->index - penum->bytes_decoded];
+
+                for (i = 0; i < penum->bytes_decoded ; i++) {
+                  cc |= c[i] << ((penum->bytes_decoded - 1) - i) * 8;
+                }
+                l = ((gs_font_base *)gs_rootfont(igs))->procs.decode_glyph(gs_rootfont(igs), cc + GS_MIN_CID_GLYPH, ccode, (unsigned short *)uc, sizeof(uc));
+                if (l > 0 && l < sizeof(uc)) {
+                    cc = 0;
+                    for (i = 0; i < l; i++) {
+                        cc |= uc[i] << ((penum->bytes_decoded - 1) - i) * 8;
+                    }
+                    ccode = cc;
+                    unicode_cp = true;
+                }
+            }
+        }
         client_char_code = ccode;
         make_null(&char_name);
         enc_char_name->data = NULL;
@@ -1877,7 +1905,7 @@ ps_get_glyphname_or_cid(gs_font_base *pbfont, gs_string *charstring,
     cr->char_codes_count = 1;
     if (bCID) {
         if (font_file_path != NULL) {
-            ref *Decoding, *TT_cmap, *SubstNWP;
+            ref *Decoding, *TT_cmap = NULL, *SubstNWP;
             ref src_type, dst_type;
             uint c = 0;
 
@@ -1949,7 +1977,7 @@ ps_get_glyphname_or_cid(gs_font_base *pbfont, gs_string *charstring,
                 }
                 /* We only have to lookup the char code if we're *not* using an identity ordering 
                    with the exception of Identity-UTF16 which is a different beast altogether */
-                if ((cmapnmlen > 0 && !strncmp(cmapnm, utfcmap, cmapnmlen > utfcmaplen ? utfcmaplen : cmapnmlen))
+                if (unicode_cp || (cmapnmlen > 0 && !strncmp(cmapnm, utfcmap, cmapnmlen > utfcmaplen ? utfcmaplen : cmapnmlen))
                     || (dict_find_string(pdr, "CIDSystemInfo", &CIDSystemInfo) >= 0
                     && r_has_type(CIDSystemInfo, t_dictionary)
                     && dict_find_string(CIDSystemInfo, "Ordering",
@@ -1975,6 +2003,13 @@ ps_get_glyphname_or_cid(gs_font_base *pbfont, gs_string *charstring,
                     }
                 }
             }
+            if (pbfont->FontType == ft_CID_TrueType && c == 0 && TT_cmap) {
+                ref cc32;
+                ref *gid;
+                make_int(&cc32, 32);
+                if (dict_find(TT_cmap, &cc32, &gid) >= 0)
+                    c = gid->value.intval;
+            }
             cr->char_codes[0] = c;
             cr->is_glyph_index = is_glyph_index;
             /* fixme : process the narrow/wide/proportional mapping type,
author	Chris Liddell <chris.liddell@artifex.com>	2016-07-19 08:43:35 +0100
committer	Chris Liddell <chris.liddell@artifex.com>	2016-10-05 16:47:41 +0100
commit	d609a3d4c1b8583d1c22db6b4d3bce4b239cf88c (patch)
tree	6429eca837ecbcf2104cac0e80e87e8cf63defc6
parent	3089131dc90ec008ff540d41df0f1a9fbc2dd47b (diff)
download	ghostpdl-d609a3d4c1b8583d1c22db6b4d3bce4b239cf88c.tar.gz