Add pdfi_read_bare_keyword function.

This function behaves exactly like reading a token, except that it does not allocate a pdf_obj for the keyword; it simply returns the corresponding enum value. This saves allocating and deallocating keyword objects. There are still places in the code that read keywords as objects, which is why pdf_keyword still exists. These will be dealt with in future commits.
author: Robin Watts <Robin.Watts@artifex.com> 2022-02-28 15:14:45 +0000
committer: Robin Watts <Robin.Watts@artifex.com> 2022-05-05 14:48:01 +0100
commit: 37bfab6a555c0de04d72d7b2ceefc3018c40fb55 (patch)
tree: 827dd57ab8ecbe48b84e94ed7bed08d231e5792a /pdf
parent: c20f3914daf63feee4cc1df9bf766b8045095f22 (diff)
download: ghostpdl-37bfab6a555c0de04d72d7b2ceefc3018c40fb55.tar.gz
5 files changed, 91 insertions, 87 deletions
diff --git a/pdf/pdf_deref.c b/pdf/pdf_deref.c
index dc27074fb..ac93800af 100644
--- a/pdf/pdf_deref.c
+++ b/pdf/pdf_deref.c
@@ -200,7 +200,6 @@ static int pdfi_read_stream_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_
 {
     int code = 0;
     int64_t i;
-    pdf_keyword *keyword = NULL;
     pdf_dict *dict = NULL;
     gs_offset_t offset;
     pdf_stream *stream_obj = NULL;
@@ -305,8 +304,8 @@ static int pdfi_read_stream_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_
         stream_obj->Length = 0;
         stream_obj->length_valid = false;
 
-        code = pdfi_read_token(ctx, ctx->main_stream, objnum, gen);
-        if (code < 0 || pdfi_count_stack(ctx) < 2) {
+        code = pdfi_read_bare_keyword(ctx, ctx->main_stream);
+        if (code == 0) {
             char extra_info[gp_file_name_sizeof];
 
             gs_snprintf(extra_info, sizeof(extra_info), "Failed to find a valid object at end of stream object %u.\n", objnum);
@@ -318,34 +317,27 @@ static int pdfi_read_stream_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_
                 pdfi_countdown(stream_obj); /* get rid of extra ref */
                 return code;
             }
-        }
-        else {
-            if (((pdf_obj *)ctx->stack_top[-1])->type != PDF_KEYWORD) {
-                char extra_info[gp_file_name_sizeof];
+        } else if (code < 0) {
+            char extra_info[gp_file_name_sizeof];
 
-                gs_snprintf(extra_info, sizeof(extra_info), "Failed to find 'endstream' keyword at end of stream object %u.\n", objnum);
-                pdfi_set_error(ctx, 0, NULL, E_PDF_MISSINGENDOBJ, "pdfi_read_stream_object", extra_info);
-            } else {
-                keyword = ((pdf_keyword *)ctx->stack_top[-1]);
-                if (keyword->key != TOKEN_ENDSTREAM) {
-                    char extra_info[gp_file_name_sizeof];
-
-                    gs_snprintf(extra_info, sizeof(extra_info), "Stream object %u has an incorrect /Length of %"PRIu64"\n", objnum, i);
-                    pdfi_log_info(ctx, "pdfi_read_stream_object", extra_info);
-                } else {
-                    /* Cache the Length in the stream object and mark it valid */
-                    stream_obj->Length = i;
-                    stream_obj->length_valid = true;
-                }
-            }
-            pdfi_pop(ctx, 1);
+            gs_snprintf(extra_info, sizeof(extra_info), "Failed to find 'endstream' keyword at end of stream object %u.\n", objnum);
+            pdfi_set_error(ctx, 0, NULL, E_PDF_MISSINGENDOBJ, "pdfi_read_stream_object", extra_info);
+        } else if (code != TOKEN_ENDSTREAM) {
+            char extra_info[gp_file_name_sizeof];
+
+            gs_snprintf(extra_info, sizeof(extra_info), "Stream object %u has an incorrect /Length of %"PRIu64"\n", objnum, i);
+            pdfi_log_info(ctx, "pdfi_read_stream_object", extra_info);
+        } else {
+            /* Cache the Length in the stream object and mark it valid */
+            stream_obj->Length = i;
+            stream_obj->length_valid = true;
         }
     }
 
     /* If we failed to find a valid object, or the object wasn't a keyword, or the
      * keywrod wasn't 'endstream' then the Length is wrong. We need to have the correct
      * Length for streams if we have encrypted files, because we must install a
-     * SubFileDecode filter iwth a Length (EODString is incompatible with AES encryption)
+     * SubFileDecode filter with a Length (EODString is incompatible with AES encryption)
      * Rather than mess about checking for encryption, we'll choose to just correctly
      * calculate the Length of all streams. Although this takes time, it will only
      * happen for files which are invalid.
@@ -393,7 +385,7 @@ static int pdfi_read_stream_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_
         return 0;
     }
 
-    code = pdfi_read_token(ctx, ctx->main_stream, objnum, gen);
+    code = pdfi_read_bare_keyword(ctx, ctx->main_stream);
     if (code < 0) {
         pdfi_countdown(stream_obj); /* get rid of extra ref */
         if (ctx->args.pdfstoponerror)
@@ -406,14 +398,13 @@ static int pdfi_read_stream_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_
         return 0;
     }
 
-    if (pdfi_count_stack(ctx) < 2) {
+    if (code == 0) {
         pdfi_countdown(stream_obj); /* get rid of extra ref */
         return_error(gs_error_stackunderflow);
     }
 
-    if (((pdf_obj *)ctx->stack_top[-1])->type != PDF_KEYWORD) {
+    if (code != TOKEN_ENDOBJ) {
         pdfi_countdown(stream_obj); /* get rid of extra ref */
-        pdfi_pop(ctx, 1);
         if (ctx->args.pdfstoponerror)
             return_error(gs_error_typecheck);
         pdfi_set_error(ctx, 0, NULL, E_PDF_MISSINGENDOBJ, "pdfi_read_stream_object", NULL);
@@ -424,12 +415,6 @@ static int pdfi_read_stream_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_
     }
     pdfi_countdown(stream_obj); /* get rid of extra ref */
 
-    keyword = ((pdf_keyword *)ctx->stack_top[-1]);
-    if (keyword->key != TOKEN_ENDOBJ) {
-        pdfi_pop(ctx, 2);
-        return_error(gs_error_typecheck);
-    }
-    pdfi_pop(ctx, 1);
     return 0;
 }
 
@@ -535,9 +520,8 @@ int pdfi_read_bare_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_t stream_
 
 static int pdfi_read_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_t stream_offset)
 {
-    int code = 0, stack_size = pdfi_count_stack(ctx);
+    int code = 0;
     int objnum = 0, gen = 0;
-    pdf_keyword *keyword = NULL;
 
     /* An object consists of 'num gen obj' followed by a token, follwed by an endobj
      * A stream dictionary might have a 'stream' instead of an 'endobj', in which case we
@@ -556,21 +540,14 @@ static int pdfi_read_object(pdf_context *ctx, pdf_c_stream *s, gs_offset_t strea
     if (code == 0)
         return_error(gs_error_syntaxerror);
 
-    code = pdfi_read_token(ctx, s, 0, 0);
+    code = pdfi_read_bare_keyword(ctx, s);
     if (code < 0)
         return code;
-    if (stack_size >= pdfi_count_stack(ctx))
+    if (code == 0)
         return gs_note_error(gs_error_ioerror);
-    if (((pdf_obj *)ctx->stack_top[-1])->type != PDF_KEYWORD) {
-        pdfi_pop(ctx, 1);
-        return_error(gs_error_typecheck);
-    }
-    keyword = ((pdf_keyword *)ctx->stack_top[-1]);
-    if (keyword->key != TOKEN_OBJ) {
-        pdfi_pop(ctx, 1);
+    if (code != TOKEN_OBJ) {
         return_error(gs_error_syntaxerror);
     }
-    pdfi_pop(ctx, 1);
 
     return pdfi_read_bare_object(ctx, s, stream_offset, objnum, gen);
 }
diff --git a/pdf/pdf_int.c b/pdf/pdf_int.c
index 53167b6be..d4605d219 100644
--- a/pdf/pdf_int.c
+++ b/pdf/pdf_int.c
@@ -36,6 +36,7 @@
 #include "pdf_trans.h"
 #include "pdf_optcontent.h"
 #include "pdf_sec.h"
+#include <stdlib.h>
 
 #include "gsstate.h"    /* for gs_gstate_free */
 
@@ -783,6 +784,56 @@ int pdfi_skip_comment(pdf_context *ctx, pdf_c_stream *s)
     return 0;
 }
 
+#define PARAM1(A) # A,  /* stringify the single argument and append a comma */
+#define PARAM2(A,B) A,  /* keep only the first argument and append a comma */
+static const char pdf_token_strings[][10] = {  /* table of token strings; each row is a fixed 10-byte char array */
+#include "pdf_tokens.h"  /* presumably invokes PARAM1/PARAM2 once per token -- confirm against pdf_tokens.h */
+};
+
+#define nelems(A) (sizeof(A)/sizeof(A[0]))  /* element count; only meaningful for true arrays, not pointers */
+
+typedef int (*bsearch_comparator)(const void *, const void *);  /* comparator signature expected by bsearch() */
+
+int pdfi_read_bare_keyword(pdf_context *ctx, pdf_c_stream *s)  /* read one keyword from s; returns its index in pdf_token_strings, TOKEN_INVALID_KEY, or a syntax error code */
+{
+    byte Buffer[256];  /* holds up to 255 keyword bytes plus the terminating 0x00 */
+    int index = 0;  /* number of bytes stored in Buffer so far */
+    int c;
+    void *t;  /* bsearch() result: the matching table row, or NULL */
+
+    pdfi_skip_white(ctx, s);
+
+    do {
+        c = pdfi_read_byte(ctx, s);
+        if (c < 0)  /* negative result from pdfi_read_byte: stop collecting bytes */
+            break;
+
+        if (iswhite(c) || isdelimiter(c)) {  /* whitespace or a delimiter ends the keyword */
+            pdfi_unread_byte(ctx, s, (byte)c);  /* hand the terminating byte back to the stream */
+            break;
+        }
+        Buffer[index] = (byte)c;
+        index++;
+    } while (index < 255);  /* stop at 255 so the Buffer[index] terminator write below stays in bounds */
+
+    if (index >= 255 || index == 0) {  /* keyword filled the buffer, or no bytes were collected at all */
+        if (ctx->args.pdfstoponerror)  /* only reported as an error when pdfstoponerror is set */
+            return_error(gs_error_syntaxerror);
+        return TOKEN_INVALID_KEY;
+    }
+
+    Buffer[index] = 0x00;  /* NUL-terminate so the bytes can be compared with strcmp */
+    t = bsearch((const void *)Buffer,
+                (const void *)pdf_token_strings[TOKEN_INVALID_KEY+1],  /* search starts at the row after TOKEN_INVALID_KEY */
+                nelems(pdf_token_strings)-(TOKEN_INVALID_KEY+1),  /* number of rows from that starting row to the end */
+                sizeof(pdf_token_strings[0]),  /* NOTE(review): bsearch needs the searched rows in strcmp order -- confirm pdf_tokens.h ordering */
+                (bsearch_comparator)&strcmp);  /* NOTE(review): strcmp is called through a cast function-pointer type; not strictly conforming C */
+    if (t == NULL)  /* the bytes read do not match any row in the table */
+        return TOKEN_INVALID_KEY;
+
+    return (((const char *)t) - pdf_token_strings[0]) / sizeof(pdf_token_strings[0]);  /* byte offset / row size = row index of the match */
+}
+
 /* This function is slightly misnamed, for some keywords we do
  * indeed read the keyword and return a PDF_KEYWORD object, but
  * for null, true, false and R we create an appropriate object
diff --git a/pdf/pdf_int.h b/pdf/pdf_int.h
index a2ab28b3f..4cf1dcb64 100644
--- a/pdf/pdf_int.h
+++ b/pdf/pdf_int.h
@@ -30,6 +30,7 @@ int pdfi_name_alloc(pdf_context *ctx, byte *key, uint32_t size, pdf_obj **o);
 int pdfi_read_dict(pdf_context *ctx, pdf_c_stream *s, uint32_t indirect_num, uint32_t indirect_gen);
 
 int pdfi_read_bare_int(pdf_context *ctx, pdf_c_stream *s, int *parsed_int);
+int pdfi_read_bare_keyword(pdf_context *ctx, pdf_c_stream *s);
 
 void local_save_stream_state(pdf_context *ctx, stream_save *local_save);
 void local_restore_stream_state(pdf_context *ctx, stream_save *local_save);
diff --git a/pdf/pdf_repair.c b/pdf/pdf_repair.c
index 7e02dcb9a..cc44e13d7 100644
--- a/pdf/pdf_repair.c
+++ b/pdf/pdf_repair.c
@@ -245,27 +245,16 @@ int pdfi_repair_file(pdf_context *ctx)
                                                 index = 0;
                                         } while (index < 9);
                                         do {
-                                            code = pdfi_read_token(ctx, ctx->main_stream, 0, 0);
-                                            if (code < 0) {
-                                                if (code != gs_error_VMerror && code != gs_error_ioerror)
-                                                    continue;
+                                            code = pdfi_read_bare_keyword(ctx, ctx->main_stream);
+                                            if (code == gs_error_VMerror || code == gs_error_ioerror)
                                                 goto exit;
+                                            if (code == TOKEN_ENDOBJ) {
+                                                code = pdfi_repair_add_object(ctx, object_num, generation_num, offset);
+                                                if (code == gs_error_VMerror || code == gs_error_ioerror)
+                                                    goto exit;
+                                                break;
                                             }
-                                            if (code > 0) {
-                                                if (ctx->stack_top[-1]->type == PDF_KEYWORD){
-                                                    pdf_keyword *k = (pdf_keyword *)ctx->stack_top[-1];
-                                                    if (k->key == TOKEN_ENDOBJ) {
-                                                        code = pdfi_repair_add_object(ctx, object_num, generation_num, offset);
-                                                        if (code < 0) {
-                                                            if (code != gs_error_VMerror && code != gs_error_ioerror)
-                                                                break;
-                                                            goto exit;
-                                                        }
-                                                        break;
-                                                    }
-                                                }
-                                            }
-                                        }while(ctx->main_stream->eof == false);
+                                        } while(ctx->main_stream->eof == false);
 
                                         pdfi_clearstack(ctx);
                                         break;
diff --git a/pdf/pdf_xref.c b/pdf/pdf_xref.c
index e060ef7f1..71f4b76b7 100644
--- a/pdf/pdf_xref.c
+++ b/pdf/pdf_xref.c
@@ -667,7 +667,6 @@ static int write_offset(byte *B, gs_offset_t o, unsigned int g, unsigned char fr
 static int read_xref_section(pdf_context *ctx, pdf_c_stream *s, uint64_t *section_start, uint64_t *section_size)
 {
     int code = 0, i, j;
-    pdf_obj *o = NULL;
     int start = 0;
     int size = 0;
     int64_t bytes = 0;
@@ -681,17 +680,15 @@ static int read_xref_section(pdf_context *ctx, pdf_c_stream *s, uint64_t *sectio
     code = pdfi_read_bare_int(ctx, ctx->main_stream, &start);
     if (code < 0) {
         /* Not an int, might be a keyword */
-        code = pdfi_read_token(ctx, ctx->main_stream, 0, 0);
+        code = pdfi_read_bare_keyword(ctx, ctx->main_stream);
         if (code < 0)
             return code;
 
-        o = ctx->stack_top[-1];
-        if (o->type == PDF_KEYWORD)
-            return 0;
-
-        /* element is not an integer, and not a keyword - not a valid xref */
-        pdfi_pop(ctx, 1);
-        return_error(gs_error_typecheck);
+        if (code != TOKEN_TRAILER) {
+            /* element is not an integer, and not a keyword - not a valid xref */
+            return_error(gs_error_typecheck);
+        }
+        return 1;
     }
 
     *section_start = start;
@@ -798,8 +795,6 @@ static int read_xref_section(pdf_context *ctx, pdf_c_stream *s, uint64_t *sectio
 static int read_xref(pdf_context *ctx, pdf_c_stream *s)
 {
     int code = 0;
-    pdf_obj **o = NULL;
-    pdf_keyword *k;
     pdf_dict *d = NULL;
     uint64_t size = 0, max_obj = 0;
     int64_t num;
@@ -807,7 +802,6 @@ static int read_xref(pdf_context *ctx, pdf_c_stream *s)
     do {
         uint64_t section_start, section_size;
 
-        o = ctx->stack_top;
         code = read_xref_section(ctx, s, &section_start, &section_size);
         if (code < 0)
             return code;
@@ -815,16 +809,8 @@ static int read_xref(pdf_context *ctx, pdf_c_stream *s)
         if (section_size > 0 && section_start + section_size - 1 > max_obj)
             max_obj = section_start + section_size - 1;
 
-        if (ctx->stack_top - o > 0) {
-            k = (pdf_keyword *)ctx->stack_top[-1];
-            if(k->type != PDF_KEYWORD || k->key != TOKEN_TRAILER)
-                return_error(gs_error_syntaxerror);
-            else {
-                pdfi_pop(ctx, 1);
-                break;
-            }
-        }
-    } while (1);
+        /* code == 1 => read_xref_section ended with a trailer. */
+    } while (code != 1);
 
     code = pdfi_read_dict(ctx, ctx->main_stream, 0, 0);
     if (code < 0)
author	Robin Watts <Robin.Watts@artifex.com>	2022-02-28 15:14:45 +0000
committer	Robin Watts <Robin.Watts@artifex.com>	2022-05-05 14:48:01 +0100
commit	37bfab6a555c0de04d72d7b2ceefc3018c40fb55 (patch)
tree	827dd57ab8ecbe48b84e94ed7bed08d231e5792a /pdf
parent	c20f3914daf63feee4cc1df9bf766b8045095f22 (diff)
download	ghostpdl-37bfab6a555c0de04d72d7b2ceefc3018c40fb55.tar.gz