summaryrefslogtreecommitdiff
path: root/psi
diff options
context:
space:
mode:
authorKen Sharp <ken.sharp@artifex.com>2022-05-08 15:13:16 +0100
committerKen Sharp <ken.sharp@artifex.com>2022-05-10 11:27:36 +0100
commit398bfc844bde6e2b2a4f6552ce326ad619471316 (patch)
tree88a4a82e5f1b00ac5654186c16d7c0afe1058d3f /psi
parentbdc105a686f0c8fa1e29312302091685d27a9464 (diff)
downloadghostpdl-398bfc844bde6e2b2a4f6552ce326ad619471316.tar.gz
GhostPDF - revamp PDF information extraction
A customer requested that we make pdf_info.ps work with the new PDF interpreter, and generate the same information. This commit modifies the way we extract information on a page-by-page basis to potentially include the names of spot inks and information about fonts used on the page. This is now returned to the PostScript environment using a PDF dictionary instead of a C structure. The pdf_info.ps program has been updated so that it use the new information in broadly the same way as the information from the old PDF interpreter. There are differences; pdf_info.ps extracts font information itself, rather than having the interpreter do it. This is not possible with the new interpreter which is why we have the PDF interpreter do it for us. In addition the pdf_info.ps program only descended to the page level whereas the new PDF interpreter evaluates all objects on the page, potentially meaning that more fonts (and technically spot inks) might be detected. We now have an additional PostScript operator '.PDFPageInfoExt' which returns 'extended' information about a page. This is the same as .PDFPageInfo but includes the font and spot ink information. Running with -dPDFINFO using either Ghostscript or GhostPDF will print more information than before, including the spot inks and considerably more information about fonts than the pdf_info.ps program emits, including embedding status, descendant fonts (and their membedding status) and the presence of ToUnicode CMaps. Updated documentation for all of the above.
Diffstat (limited to 'psi')
-rw-r--r--psi/zpdfops.c355
1 files changed, 230 insertions, 125 deletions
diff --git a/psi/zpdfops.c b/psi/zpdfops.c
index 1b0e07140..c76ca556d 100644
--- a/psi/zpdfops.c
+++ b/psi/zpdfops.c
@@ -22,6 +22,9 @@
#include "gzht.h"
#include "gsrefct.h"
#include "pdf_misc.h"
+#include "pdf_stack.h"
+#include "pdf_dict.h"
+#include "pdf_array.h"
#include "iminst.h"
#include "dstack.h"
@@ -624,6 +627,164 @@ static int zPDFclose(i_ctx_t *i_ctx_p)
return code;
}
+static int PDFobj_to_PSobj(i_ctx_t *i_ctx_p, pdfctx_t *pdfctx, pdf_obj *PDFobj, ref *PSobj);
+
+static int PDFdict_to_PSdict(i_ctx_t *i_ctx_p, pdfctx_t *pdfctx, pdf_dict *PDFdict, ref *PSdict)
+{
+ int code = 0;
+ uint64_t index = 0;
+ pdf_name *Key = NULL;
+ pdf_obj *Value = NULL;
+ ref nameref, valueref;
+ char *str = NULL;
+ int len = 0;
+
+ code = dict_create(pdfi_dict_entries(PDFdict), PSdict);
+ if (code < 0)
+ return code;
+
+ code = pdfi_dict_first(pdfctx->ctx, PDFdict, (pdf_obj **)&Key, &Value, &index);
+ if (code == gs_error_undefined)
+ return 0;
+
+ while (code >= 0) {
+ code = pdfi_string_from_name(pdfctx->ctx, Key, &str, &len);
+ if (code < 0)
+ goto exit;
+
+ code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)str, len, &nameref, 1);
+ if (code < 0)
+ goto exit;
+
+ code = PDFobj_to_PSobj(i_ctx_p, pdfctx, Value, &valueref);
+ if (code < 0)
+ goto exit;
+
+ code = dict_put(PSdict, &nameref, &valueref, &i_ctx_p->dict_stack);
+ if (code < 0)
+ goto exit;
+
+ pdfi_countdown(Key);
+ pdfi_countdown(Value);
+ Key = NULL;
+ Value = NULL;
+ (void)pdfi_free_string_from_name(pdfctx->ctx, str);
+ str = NULL;
+ code = pdfi_dict_next(pdfctx->ctx, PDFdict, (pdf_obj **)&Key, &Value, &index);
+ if (code == gs_error_undefined) {
+ code = 0;
+ break;
+ }
+ }
+exit:
+ (void)pdfi_free_string_from_name(pdfctx->ctx, str);
+ pdfi_countdown(Key);
+ pdfi_countdown(Value);
+ return code;
+}
+
+static int PDFarray_to_PSarray(i_ctx_t *i_ctx_p, pdfctx_t *pdfctx, pdf_array *PDFarray, ref *PSarray)
+{
+ int code = 0, i = 0;
+ ref PS_ref;
+ pdf_obj *array_obj = NULL;
+ ref *eltp = NULL;
+
+ code = ialloc_ref_array(PSarray, a_all, pdfi_array_size(PDFarray), "zPDFInfo");
+ if (code < 0)
+ return code;
+
+ for (i = 0;i < pdfi_array_size(PDFarray); i++) {
+ code = pdfi_array_get(pdfctx->ctx, PDFarray, i, &array_obj);
+ if (code < 0)
+ goto exit;
+ code = PDFobj_to_PSobj(i_ctx_p, pdfctx, array_obj, &PS_ref);
+ if (code < 0) {
+ pdfi_countdown(array_obj);
+ goto exit;
+ }
+
+ eltp = PSarray->value.refs + i;
+ ref_assign_old((const ref *)PSarray, eltp, &PS_ref, "zPDFInfo");
+
+ pdfi_countdown(array_obj);
+ array_obj = NULL;
+ }
+exit:
+ return code;
+}
+
+static int PDFobj_to_PSobj(i_ctx_t *i_ctx_p, pdfctx_t *pdfctx, pdf_obj *PDFobj, ref *PSobj)
+{
+ int code = 0;
+
+ switch(pdfi_type_of(PDFobj)) {
+ case PDF_NAME:
+ {
+ char *str = NULL;
+ int len;
+
+ code = pdfi_string_from_name(pdfctx->ctx, (pdf_name *)PDFobj, &str, &len);
+ if (code < 0)
+ return code;
+ code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)str, len, PSobj, 1);
+ (void)pdfi_free_string_from_name(pdfctx->ctx, str);
+ }
+ break;
+ case PDF_STRING:
+ {
+ byte *sbody;
+ uint size = ((pdf_name *)PDFobj)->length;
+
+ sbody = ialloc_string(size, "string");
+ if (sbody == 0) {
+ code = gs_note_error(gs_error_VMerror);
+ } else {
+ make_string(PSobj, a_all | icurrent_space, size, sbody);
+ memcpy(sbody, ((pdf_name *)PDFobj)->data, size);
+ }
+ }
+ break;
+ case PDF_INT:
+ {
+ int64_t i;
+
+ code = pdfi_obj_to_int(pdfctx->ctx, PDFobj, &i);
+ if (code < 0)
+ return code;
+ make_int(PSobj, i);
+ }
+ break;
+ case PDF_BOOL:
+ if (PDFobj == PDF_TRUE_OBJ)
+ make_bool(PSobj, 1);
+ else
+ make_bool(PSobj, 0);
+ break;
+ case PDF_REAL:
+ {
+ double d;
+
+ code = pdfi_obj_to_real(pdfctx->ctx, PDFobj, &d);
+ if (code < 0)
+ return code;
+ make_real(PSobj, d);
+ }
+ break;
+ case PDF_DICT:
+ code = PDFdict_to_PSdict(i_ctx_p, pdfctx, (pdf_dict *)PDFobj, PSobj);
+ break;
+ case PDF_ARRAY:
+ code = PDFarray_to_PSarray(i_ctx_p, pdfctx, (pdf_array *)PDFobj, PSobj);
+ break;
+ default:
+ gs_note_error(gs_error_typecheck);
+ break;
+ }
+
+ return code;
+}
+
static int zPDFinfo(i_ctx_t *i_ctx_p)
{
os_ptr op = osp;
@@ -704,6 +865,36 @@ static int zPDFinfo(i_ctx_t *i_ctx_p)
}
gs_free_object(pdfctx->ctx->memory, names_array, "free collection temporary filenames");
code = 0;
+ } else {
+ if (pdfctx->ctx->Info != NULL) {
+ code = PDFobj_to_PSobj(i_ctx_p, pdfctx, (pdf_obj *)pdfctx->ctx->Info, (ref *)op);
+ if (code < 0)
+ return code;
+
+ code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"NumPages", 8, &nameref, 1);
+ if (code < 0)
+ return code;
+
+ make_int(&intref, pdfctx->ctx->num_pages);
+
+ code = dict_put(op, &nameref, &intref, &i_ctx_p->dict_stack);
+ if (code < 0)
+ return code;
+ } else {
+ code = dict_create(1, op);
+ if (code < 0)
+ return code;
+
+ code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"NumPages", 8, &nameref, 1);
+ if (code < 0)
+ return code;
+
+ make_int(&intref, pdfctx->ctx->num_pages);
+
+ code = dict_put(op, &nameref, &intref, &i_ctx_p->dict_stack);
+ if (code < 0)
+ return code;
+ }
}
}
else {
@@ -721,10 +912,9 @@ error:
static int zPDFpageinfo(i_ctx_t *i_ctx_p)
{
os_ptr op = osp;
- ref aref, boolref, nameref, numref, *eltp;
- int page = 0, code = 0, i;
+ int page = 0, code = 0;
pdfctx_t *pdfctx;
- pdf_info_t info;
+ pdf_dict *InfoDict = NULL;
pdfi_switch_t i_switch;
check_op(2);
@@ -739,7 +929,7 @@ static int zPDFpageinfo(i_ctx_t *i_ctx_p)
code = pdfi_gstate_from_PS(pdfctx->ctx, igs, &i_switch, pdfctx->profile_cache);
if (code >= 0) {
- code = pdfi_page_info(pdfctx->ctx, (uint64_t)page, &info);
+ code = pdfi_page_info(pdfctx->ctx, (uint64_t)page, &InfoDict, false);
pdfi_gstate_to_PS(pdfctx->ctx, igs, &i_switch);
}
@@ -750,141 +940,52 @@ static int zPDFpageinfo(i_ctx_t *i_ctx_p)
pop(1);
op = osp;
- code = dict_create(4, op);
- if (code < 0)
- return code;
-
- code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"HasAnnots", 9, &nameref, 1);
- if (code < 0)
- return code;
- make_bool(&boolref, false);
- code = dict_put(op, &nameref, &boolref, &i_ctx_p->dict_stack);
+ code = PDFobj_to_PSobj(i_ctx_p, pdfctx, (pdf_obj *)InfoDict, op);
+ pdfi_countdown(InfoDict);
if (code < 0)
return code;
+ }
+ else {
+ return_error(gs_error_ioerror);
+ }
+ return 0;
+}
- code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"UsesTransparency", 16, &nameref, 1);
- if (code < 0)
- return code;
- make_bool(&boolref, info.HasTransparency);
- code = dict_put(op, &nameref, &boolref, &i_ctx_p->dict_stack);
- if (code < 0)
- return code;
+static int zPDFpageinfoExt(i_ctx_t *i_ctx_p)
+{
+ os_ptr op = osp;
+ int page = 0, code = 0;
+ pdfctx_t *pdfctx;
+ pdf_dict *InfoDict = NULL;
+ pdfi_switch_t i_switch;
- code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"NumSpots", 8, &nameref, 1);
- if (code < 0)
- return code;
- make_int(&numref, info.NumSpots);
- code = dict_put(op, &nameref, &numref, &i_ctx_p->dict_stack);
- if (code < 0)
- return code;
+ check_op(2);
- if (info.boxes & MEDIA_BOX) {
- code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"MediaBox", 8, &nameref, 1);
- if (code < 0)
- return code;
- code = ialloc_ref_array(&aref, a_all, 4, "array");
- if (code < 0)
- return code;
- refset_null(aref.value.refs, 4);
- for (i=0;i < 4;i++) {
- make_real(&numref, info.MediaBox[i]);
- eltp = aref.value.refs + i;
- ref_assign_old(&aref, eltp, &numref, "put");
- }
- code = dict_put(op, &nameref, &aref, &i_ctx_p->dict_stack);
- if (code < 0)
- return code;
- }
+ check_type(*op, t_integer);
+ page = op->value.intval;
- if (info.boxes & CROP_BOX) {
- code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"CropBox", 7, &nameref, 1);
- if (code < 0)
- return code;
- code = ialloc_ref_array(&aref, a_all, 4, "array");
- if (code < 0)
- return code;
- refset_null(aref.value.refs, 4);
- for (i=0;i < 4;i++) {
- make_real(&numref, info.CropBox[i]);
- eltp = aref.value.refs + i;
- ref_assign_old(&aref, eltp, &numref, "put");
- }
- code = dict_put(op, &nameref, &aref, &i_ctx_p->dict_stack);
- if (code < 0)
- return code;
- }
+ check_type(*(op - 1), t_pdfctx);
+ pdfctx = r_ptr(op - 1, pdfctx_t);
- if (info.boxes & TRIM_BOX) {
- code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"TrimBox", 7, &nameref, 1);
- if (code < 0)
- return code;
- code = ialloc_ref_array(&aref, a_all, 4, "array");
- if (code < 0)
- return code;
- refset_null(aref.value.refs, 4);
- for (i=0;i < 4;i++) {
- make_real(&numref, info.TrimBox[i]);
- eltp = aref.value.refs + i;
- ref_assign_old(&aref, eltp, &numref, "put");
- }
- code = dict_put(op, &nameref, &aref, &i_ctx_p->dict_stack);
- if (code < 0)
- return code;
- }
+ if (pdfctx->pdf_stream != NULL || pdfctx->UsingPDFFile) {
+ code = pdfi_gstate_from_PS(pdfctx->ctx, igs, &i_switch, pdfctx->profile_cache);
- if (info.boxes & ART_BOX) {
- code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"ArtBox", 6, &nameref, 1);
- if (code < 0)
- return code;
- code = ialloc_ref_array(&aref, a_all, 4, "array");
- if (code < 0)
- return code;
- refset_null(aref.value.refs, 4);
- for (i=0;i < 4;i++) {
- make_real(&numref, info.ArtBox[i]);
- eltp = aref.value.refs + i;
- ref_assign_old(&aref, eltp, &numref, "put");
- }
- code = dict_put(op, &nameref, &aref, &i_ctx_p->dict_stack);
- if (code < 0)
- return code;
- }
+ if (code >= 0) {
+ code = pdfi_page_info(pdfctx->ctx, (uint64_t)page, &InfoDict, true);
- if (info.boxes & BLEED_BOX) {
- code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"BleedBox", 8, &nameref, 1);
- if (code < 0)
- return code;
- code = ialloc_ref_array(&aref, a_all, 4, "array");
- if (code < 0)
- return code;
- refset_null(aref.value.refs, 4);
- for (i=0;i < 4;i++) {
- make_real(&numref, info.BleedBox[i]);
- eltp = aref.value.refs + i;
- ref_assign_old(&aref, eltp, &numref, "put");
- }
- code = dict_put(op, &nameref, &aref, &i_ctx_p->dict_stack);
- if (code < 0)
- return code;
+ pdfi_gstate_to_PS(pdfctx->ctx, igs, &i_switch);
}
- code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"Rotate", 6, &nameref, 1);
if (code < 0)
return code;
- make_real(&numref, info.Rotate);
- code = dict_put(op, &nameref, &numref, &i_ctx_p->dict_stack);
+
+ pop(1);
+ op = osp;
+
+ code = PDFobj_to_PSobj(i_ctx_p, pdfctx, (pdf_obj *)InfoDict, op);
+ pdfi_countdown(InfoDict);
if (code < 0)
return code;
-
- if (info.UserUnit != 1) {
- code = names_ref(imemory->gs_lib_ctx->gs_name_table, (const byte *)"UserUnit", 8, &nameref, 1);
- if (code < 0)
- return code;
- make_real(&numref, info.UserUnit);
- code = dict_put(op, &nameref, &numref, &i_ctx_p->dict_stack);
- if (code < 0)
- return code;
- }
}
else {
return_error(gs_error_ioerror);
@@ -929,7 +1030,10 @@ static int zPDFdrawpage(i_ctx_t *i_ctx_p)
code = pdfi_gstate_from_PS(pdfctx->ctx, igs, &i_switch, pdfctx->profile_cache);
if (code >= 0) {
- code = pdfi_page_render(pdfctx->ctx, page, false);
+ if (pdfctx->ctx->args.pdfinfo)
+ code = pdfi_output_page_info(pdfctx->ctx, page);
+ else
+ code = pdfi_page_render(pdfctx->ctx, page, false);
if (code >= 0)
pop(2);
@@ -1336,6 +1440,7 @@ const op_def zpdfops_op_defs[] =
{"1.PDFClose", zPDFclose},
{"1.PDFInfo", zPDFinfo},
{"1.PDFPageInfo", zPDFpageinfo},
+ {"1.PDFPageInfoExt", zPDFpageinfoExt},
{"1.PDFMetadata", zPDFmetadata},
{"1.PDFDrawPage", zPDFdrawpage},
{"1.PDFDrawAnnots", zPDFdrawannots},