Stage 5: add notion of quote age

author: Eric Blake <ebb9@byu.net> 2007-10-24 08:36:26 -0600
committer: Eric Blake <ebb9@byu.net> 2007-12-06 10:25:55 -0700
commit: 8b5b3b7a74f452fed795c063965966934a68755d (patch)
tree: 64c05d239d202a1696a074f4938f1c0638012671
parent: ab7d5ea40dd30e38cdafdfa69e868390ff6f72ab (diff)
download: m4-8b5b3b7a74f452fed795c063965966934a68755d.tar.gz
6 files changed, 355 insertions, 124 deletions
diff --git a/doc/m4.texinfo b/doc/m4.texinfo
index 3da16fc9..803dbf05 100644
--- a/doc/m4.texinfo
+++ b/doc/m4.texinfo
@@ -2635,6 +2635,47 @@ ifelse(`foo', `bar', `3', `gnu', `gnats', `6', `7', `8')
 @result{}7
 @end example
 
+@ignore
+@comment Stress tests, not worth documenting.
+@comment It would be nice to pass builtin tokens through ifelse, m4wrap,
+@comment user macros; hence the fixmes.
+@example
+define(`e', `$@@')define(`q', ``$@@'')define(`u', `$*')
+@result{}
+define(`cmp', `ifelse($1, $2, `yes', `no')')define(`d', defn(`defn'))
+@result{}
+cmp(`defn(`defn')', `defn(`d')')
+@result{}yes
+cmp(`defn(`defn')', ``<defn>'')
+@result{}no
+cmp(`q(defn(`defn'))', `q(defn(`d'))')
+@result{}yes
+cmp(`q(defn(`defn'))', `q(`<defn>')')
+@result{}no
+cmp(`q(defn(`defn'))', ``'')
+@result{}no
+cmp(`q(`1', `2', defn(`defn'))', `q(`1', `2', defn(`d'))')
+@result{}yes
+cmp(`q(`1', `2', defn(`defn'))', `q(`1', `2', `<defn>')')
+@result{}no
+cmp(`q(`1', `2', defn(`defn'))', ```1',`2',<defn>'')
+@result{}no
+cmp(`q(`1', `2', defn(`defn'))', ```1',`2',`''')-fixme
+@result{}yes-fixme
+define(`cat', `$1`'ifelse(`$#', `1', `', `$0(shift($@@))')')
+@result{}
+cat(`define(`foo',', defn(`divnum'), `)foo')-fixme
+@result{}-fixme
+cat(e(`define(`bar',', defn(`divnum'), `)bar'))-fixme
+@result{}-fixme
+m4wrap(`u('q(`cat(`define(`baz','', defn(`divnum'), ``)baz')')`)-fixme
+')
+@result{}
+^D
+@result{}-fixme
+@end example
+@end ignore
+
 Naturally, the normal case will be slightly more advanced than these
 examples.  A common use of @code{ifelse} is in macros implementing loops
 of various kinds.
@@ -3714,6 +3755,18 @@ changequote(`"', `"')
 @result{}hiHIhi
 @end example
 
+@ignore
+@comment And another stress test, not worth documenting in the manual.
+@example
+define(`aaaaaaaaaaaaaaaaaaaa', `A')define(`q', `"$@@"')
+@result{}
+changequote(`"', `"')
+@result{}
+q(q("aaaaaaaaaaaaaaaaaaaa", "a"))
+@result{}A,a
+@end example
+@end ignore
+
 It is an error if the end of file occurs within a quoted string.
 
 @comment status: 1
@@ -6490,7 +6543,7 @@ of @samp{-} on the command line.
 @acronym{POSIX} requires @code{m4wrap} (@pxref{M4wrap}) to act in FIFO
 (first-in, first-out) order, but @acronym{GNU} @code{m4} currently uses
 LIFO order.  Furthermore, @acronym{POSIX} states that only the first
-argument to @code{m4wrap} is saved for later evaluation, bug
+argument to @code{m4wrap} is saved for later evaluation, but
 @acronym{GNU} @code{m4} saves and processes all arguments, with output
 separated by spaces.
 
diff --git a/examples/Makefile.am b/examples/Makefile.am
index b1ef68a0..c1dc5227 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -58,4 +58,5 @@ translit.m4 \
 undivert.incl \
 undivert.m4 \
 wrap.m4 \
-wrapfifo.m4
+wrapfifo.m4 \
+wraplifo.m4
diff --git a/examples/wraplifo.m4 b/examples/wraplifo.m4
new file mode 100644
index 00000000..bdbf3fb6
--- /dev/null
+++ b/examples/wraplifo.m4
@@ -0,0 +1,10 @@
+dnl Redefine m4wrap to have LIFO semantics.
+define(`_m4wrap_level', `0')dnl
+define(`_m4wrap', defn(`m4wrap'))dnl
+define(`m4wrap',
+`ifdef(`m4wrap'_m4wrap_level,
+       `define(`m4wrap'_m4wrap_level,
+               `$1'defn(`m4wrap'_m4wrap_level))',
+       `_m4wrap(`define(`_m4wrap_level', incr(_m4wrap_level))dnl
+m4wrap'_m4wrap_level)dnl
+define(`m4wrap'_m4wrap_level, `$1')')')dnl
diff --git a/src/input.c b/src/input.c
index 0aa60367..551b43d6 100644
--- a/src/input.c
+++ b/src/input.c
@@ -23,12 +23,13 @@
 
 #include "m4.h"
 
-/* Unread input can be either files, that should be read (eg. included
-   files), strings, which should be rescanned (eg. macro expansion text),
-   or quoted macro definitions (as returned by the builtin "defn").
-   Unread input are organised in a stack, implemented with an obstack.
-   Each input source is described by a "struct input_block".  The obstack
-   is "current_input".  The top of the input stack is "isp".
+/* Unread input can be either files to be read (command line,
+   "include", "sinclude"), strings which should be rescanned (macro
+   expansion text), or quoted macro definitions (as returned by the
+   builtin "defn").  Unread input is organized in a stack, implemented
+   with an obstack.  Each input source is described by a "struct
+   input_block".  The obstack is "current_input".  The top of the
+   input stack is "isp".
 
    The macro "m4wrap" places the text to be saved on another input
    stack, on the obstack "wrapup_stack", whose top is "wsp".  When EOF
@@ -42,12 +43,13 @@
 
    Pushing new input on the input stack is done by push_file (),
    push_string (), push_wrapup () (for wrapup text), and push_macro ()
-   (for macro definitions).  Because macro expansion needs direct access
-   to the current input obstack (for optimisation), push_string () are
-   split in two functions, push_string_init (), which returns a pointer
-   to the current input stack, and push_string_finish (), which return a
-   pointer to the final text.  The input_block *next is used to manage
-   the coordination between the different push routines.
+   (for macro definitions).  Because macro expansion needs direct
+   access to the current input obstack (for optimization), push_string
+   () is split in two functions, push_string_init (), which returns a
+   pointer to the current input stack, and push_string_finish (),
+   which returns a pointer to the final text.  The input_block *next
+   is used to manage the coordination between the different push
+   routines.
 
    The current file and line number are stored in two global
    variables, for use by the error handling functions in m4.c.  Macro
@@ -62,6 +64,7 @@
 # include "regex.h"
 #endif /* ENABLE_CHANGEWORD */
 
+/* Type of an input block.  */
 enum input_type
 {
   INPUT_STRING,		/* String resulting from macro expansion.  */
@@ -71,28 +74,29 @@ enum input_type
 
 typedef enum input_type input_type;
 
+/* A block of input to be scanned.  */
 struct input_block
 {
-  struct input_block *prev;	/* previous input_block on the input stack */
-  input_type type;		/* see enum values */
-  const char *file;		/* file where this input is from */
-  int line;			/* line where this input is from */
+  struct input_block *prev;	/* Previous input_block on the input stack.  */
+  input_type type;		/* See enum values.  */
+  const char *file;		/* File where this input is from.  */
+  int line;			/* Line where this input is from.  */
   union
     {
       struct
 	{
-	  char *string;		/* remaining string value */
+	  char *string;		/* Remaining string value.  */
 	}
 	u_s;	/* INPUT_STRING */
       struct
 	{
-	  FILE *fp;		     /* input file handle */
-	  bool_bitfield end : 1;     /* true if peek has seen EOF */
-	  bool_bitfield close : 1;   /* true if we should close file on pop */
-	  bool_bitfield advance : 1; /* track previous start_of_input_line */
+	  FILE *fp;		     /* Input file handle.  */
+	  bool_bitfield end : 1;     /* True if peek has seen EOF.  */
+	  bool_bitfield close : 1;   /* True to close file on pop.  */
+	  bool_bitfield advance : 1; /* Track previous start_of_input_line.  */
 	}
 	u_f;	/* INPUT_FILE */
-      builtin_func *func;	/* pointer to macro's function */
+      builtin_func *func;	/* Pointer to macro's function.  */
     }
   u;
 };
@@ -136,8 +140,8 @@ static bool start_of_input_line;
 /* Flag for next_char () to recognize change in input block.  */
 static bool input_change;
 
-#define CHAR_EOF	256	/* character return on EOF */
-#define CHAR_MACRO	257	/* character return for MACRO token */
+#define CHAR_EOF	256	/* Character return on EOF.  */
+#define CHAR_MACRO	257	/* Character return for MACRO token.  */
 
 /* Quote chars.  */
 STRING rquote;
@@ -151,16 +155,30 @@ STRING ecomm;
 
 # define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*"
 
+/* Table of characters that can start a word.  */
 static char *word_start;
+
+/* Current regular expression for detecting words.  */
 static struct re_pattern_buffer word_regexp;
-static int default_word_regexp;
+
+/* True if changeword is not active.  */
+static bool default_word_regexp;
+
+/* Reused memory for detecting matches in word detection.  */
 static struct re_registers regs;
 
 #else /* !ENABLE_CHANGEWORD */
-# define default_word_regexp 1
+# define default_word_regexp true
 #endif /* !ENABLE_CHANGEWORD */
 
+/* Track the current quote age, determined by all significant
+   changequote, changecom, and changeword calls, since any one of
+   these can alter the rescan of a prior parameter in a quoted
+   context.  */
+static unsigned int current_quote_age;
+
 static bool pop_input (bool);
+static void set_quote_age (void);
 
 #ifdef DEBUG_INPUT
 static const char *token_type_string (token_type);
@@ -172,7 +190,8 @@ static const char *token_type_string (token_type);
 | current file name and line number.  If next is non-NULL, this push |
 | invalidates a call to push_string_init (), whose storage is        |
 | consequently released.  If CLOSE, then close FP after EOF is       |
-| detected.                                                          |
+| detected.  TITLE is used as the location for text parsed from the  |
+| file (not necessarily the file name).                              |
 `-------------------------------------------------------------------*/
 
 void
@@ -206,11 +225,11 @@ push_file (FILE *fp, const char *title, bool close)
   isp = i;
 }
 
-/*---------------------------------------------------------------.
-| push_macro () pushes a builtin macro's definition on the input |
-| stack.  If next is non-NULL, this push invalidates a call to   |
-| push_string_init (), whose storage is consequently released.   |
-`---------------------------------------------------------------*/
+/*-----------------------------------------------------------------.
+| push_macro () pushes the builtin macro FUNC on the input stack.  |
+| If next is non-NULL, this push invalidates a call to             |
+| push_string_init (), whose storage is consequently released.     |
+`-----------------------------------------------------------------*/
 
 void
 push_macro (builtin_func *func)
@@ -235,10 +254,10 @@ push_macro (builtin_func *func)
   isp = i;
 }
 
-/*------------------------------------------------------------------.
-| First half of push_string ().  The pointer next points to the new |
-| input_block.							    |
-`------------------------------------------------------------------*/
+/*--------------------------------------------------------------.
+| First half of push_string ().  The return value points to the |
+| obstack where expansion text should be placed.                |
+`--------------------------------------------------------------*/
 
 struct obstack *
 push_string_init (void)
@@ -257,14 +276,15 @@ push_string_init (void)
   return current_input;
 }
 
-/*------------------------------------------------------------------------.
-| Last half of push_string ().  If next is now NULL, a call to push_file  |
-| () has invalidated the previous call to push_string_init (), so we just |
-| give up.  If the new object is void, we do not push it.  The function	  |
-| push_string_finish () returns a pointer to the finished object.  This	  |
-| pointer is only for temporary use, since reading the next token might	  |
-| release the memory used for the object.				  |
-`------------------------------------------------------------------------*/
+/*-------------------------------------------------------------------.
+| Last half of push_string ().  If next is now NULL, a call to       |
+| push_file () or push_macro () has invalidated the previous call to |
+| push_string_init (), so we just give up.  If the new object is     |
+| void, we do not push it.  The function push_string_finish ()       |
+| returns a pointer to the finished object.  This pointer is only    |
+| for temporary use, since reading the next token might release the  |
+| memory used for the object.                                        |
+`-------------------------------------------------------------------*/
 
 const char *
 push_string_finish (void)
@@ -413,7 +433,7 @@ pop_wrapup (void)
 
 /*-------------------------------------------------------------------.
 | When a MACRO token is seen, next_token () uses init_macro_token () |
-| to retrieve the value of the function pointer.                     |
+| to retrieve the value of the function pointer and store it in TD.  |
 `-------------------------------------------------------------------*/
 
 static void
@@ -425,12 +445,14 @@ init_macro_token (token_data *td)
 }
 
 
-/*------------------------------------------------------------------------.
-| Low level input is done a character at a time.  The function peek_input |
-| () is used to look at the next character in the input stream.  At any	  |
-| given time, it reads from the input_block on the top of the current	  |
-| input stack.								  |
-`------------------------------------------------------------------------*/
+/*-----------------------------------------------------------------.
+| Low level input is done a character at a time.  The function     |
+| peek_input () is used to look at the next character in the input |
+| stream.  At any given time, it reads from the input_block on the |
+| top of the current input stack.  The return value is an unsigned |
+| char, or CHAR_EOF if there is no more input, or CHAR_MACRO if a  |
+| builtin token occurs next.                                       |
+`-----------------------------------------------------------------*/
 
 static int
 peek_input (void)
@@ -556,7 +578,8 @@ next_char_1 (void)
 
 /*-------------------------------------------------------------------.
 | skip_line () simply discards all immediately following characters, |
-| up to the first newline.  It is only used from m4_dnl ().          |
+| up to the first newline.  It is only used from m4_dnl ().  Report  |
+| warnings on behalf of NAME.                                        |
 `-------------------------------------------------------------------*/
 
 void
@@ -585,7 +608,7 @@ skip_line (const char *name)
 
 /*------------------------------------------------------------------.
 | This function is for matching a string against a prefix of the    |
-| input stream.  If the string matches the input and consume is     |
+| input stream.  If the string S matches the input and CONSUME is   |
 | true, the input is discarded; otherwise any characters read are   |
 | pushed back again.  The function is used only when multicharacter |
 | quotes or comment delimiters are used.                            |
@@ -637,7 +660,7 @@ match_input (const char *s, bool consume)
 | will not hurt efficiency too much when single character quotes and  |
 | comment delimiters are used.  If CONSUME, then CH is the result of  |
 | next_char, and a successful match will discard the matched string.  |
-| Otherwise, CH is the result of peek_char, and the input stream is   |
+| Otherwise, CH is the result of peek_input, and the input stream is  |
 | effectively unchanged.                                              |
 `--------------------------------------------------------------------*/
 
@@ -648,7 +671,7 @@ match_input (const char *s, bool consume)
 
 
 /*----------------------------------------------------------.
-| Inititialise input stacks, and quote/comment characters.  |
+| Initialize input stacks, and quote/comment characters.    |
 `----------------------------------------------------------*/
 
 void
@@ -689,21 +712,20 @@ input_init (void)
 #ifdef ENABLE_CHANGEWORD
   set_word_regexp (NULL, user_word_regexp);
 #endif /* ENABLE_CHANGEWORD */
+
+  set_quote_age ();
 }
 
 
-/*------------------------------------------------------------------.
-| Functions for setting quotes and comment delimiters.  Used by	    |
-| m4_changecom () and m4_changequote ().  Pass NULL if the argument |
-| was not present, to distinguish from an explicit empty string.    |
-`------------------------------------------------------------------*/
+/*--------------------------------------------------------------------.
+| Set the quote delimiters to LQ and RQ.  Used by m4_changequote ().  |
+| Pass NULL if the argument was not present, to distinguish from an   |
+| explicit empty string.                                              |
+`--------------------------------------------------------------------*/
 
 void
 set_quotes (const char *lq, const char *rq)
 {
-  free (lquote.string);
-  free (rquote.string);
-
   /* POSIX states that with 0 arguments, the default quotes are used.
      POSIX XCU ERN 112 states that behavior is implementation-defined
      if there was only one argument, or if there is an empty string in
@@ -719,18 +741,27 @@ set_quotes (const char *lq, const char *rq)
   else if (!rq || (*lq && !*rq))
     rq = DEF_RQUOTE;
 
+  if (strcmp (lquote.string, lq) == 0 && strcmp (rquote.string, rq) == 0)
+    return;
+
+  free (lquote.string);
+  free (rquote.string);
   lquote.string = xstrdup (lq);
   lquote.length = strlen (lquote.string);
   rquote.string = xstrdup (rq);
   rquote.length = strlen (rquote.string);
+  set_quote_age ();
 }
 
+/*--------------------------------------------------------------------.
+| Set the comment delimiters to BC and EC.  Used by m4_changecom ().  |
+| Pass NULL if the argument was not present, to distinguish from an   |
+| explicit empty string.                                              |
+`--------------------------------------------------------------------*/
+
 void
 set_comment (const char *bc, const char *ec)
 {
-  free (bcomm.string);
-  free (ecomm.string);
-
   /* POSIX requires no arguments to disable comments.  It requires
      empty arguments to be used as-is, but this is counter to
      traditional behavior, because a non-null begin and null end makes
@@ -743,14 +774,26 @@ set_comment (const char *bc, const char *ec)
   else if (!ec || (*bc && !*ec))
     ec = DEF_ECOMM;
 
+  if (strcmp (bcomm.string, bc) == 0 && strcmp (ecomm.string, ec) == 0)
+    return;
+
+  free (bcomm.string);
+  free (ecomm.string);
   bcomm.string = xstrdup (bc);
   bcomm.length = strlen (bcomm.string);
   ecomm.string = xstrdup (ec);
   ecomm.length = strlen (ecomm.string);
+  set_quote_age ();
 }
 
 #ifdef ENABLE_CHANGEWORD
 
+/*-------------------------------------------------------------------.
+| Set the regular expression for recognizing words to REGEXP, and    |
+| report errors on behalf of CALLER.  If REGEXP is NULL, revert back |
+| to the default parsing rules.                                      |
+`-------------------------------------------------------------------*/
+
 void
 set_word_regexp (const char *caller, const char *regexp)
 {
@@ -762,6 +805,7 @@ set_word_regexp (const char *caller, const char *regexp)
   if (!*regexp || !strcmp (regexp, DEFAULT_WORD_REGEXP))
     {
       default_word_regexp = true;
+      set_quote_age ();
       return;
     }
 
@@ -772,7 +816,6 @@ set_word_regexp (const char *caller, const char *regexp)
 
   if (msg != NULL)
     {
-      /* FIXME - report on behalf of macro caller.  */
       m4_warn (0, caller, _("bad regular expression `%s': %s"), regexp, msg);
       return;
     }
@@ -785,6 +828,7 @@ set_word_regexp (const char *caller, const char *regexp)
   re_set_registers (&word_regexp, &regs, regs.num_regs, regs.start, regs.end);
 
   default_word_regexp = false;
+  set_quote_age ();
 
   if (word_start == NULL)
     word_start = (char *) xmalloc (256);
@@ -799,6 +843,82 @@ set_word_regexp (const char *caller, const char *regexp)
 }
 
 #endif /* ENABLE_CHANGEWORD */
+
+/* Call this when changing anything that might impact the quote age,
+   so that quote_age and safe_quotes will reflect the change.  */
+static void
+set_quote_age (void)
+{
+  /* Multi-character quotes are inherently unsafe, since concatenation
+     of individual characters can result in a quote delimiter,
+     consider:
+
+     define(echo,``$1'')define(a,A)changequote(<[,]>)echo(<[]]><[>a]>)
+     => A]> (not ]>a)
+
+   Also, unquoted close delimiters are unsafe, consider:
+
+     define(echo,``$1'')define(a,A)echo(`a''`a')
+     => aA' (not a'a)
+
+   Comment delimiters that overlap with quote delimiters or active
+   characters also present a problem, consider:
+
+     define(echo,$*)echo(a,a,a`'define(a,A)changecom(`,',`,'))
+     => A,a,A (not A,A,A)
+
+   And let's not even think about the impact of changeword, since it
+   will disappear for M4 2.0.
+
+   So rather than check every token for an unquoted delimiter, we
+   merely encode current_quote_age to 0 when things are unsafe, and
+   non-zero when safe (namely, to the 16-bit value composed of the
+   single-character start and end quote delimiters).  There may be
+   other situations which are safe even when this algorithm sets the
+   quote_age to zero, but at least a quote_age of zero always produces
+   correct results (although it may take more time in doing so).  */
+
+  /* Heuristic of characters that might impact rescan if they appear in
+     a quote delimiter.  */
+#define Letters "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+  static const char unsafe[] = Letters "_0123456789(,) \t\n\r\f\v";
+#undef Letters
+
+  if (lquote.length == 1 && rquote.length == 1
+      && strpbrk(lquote.string, unsafe) == NULL
+      && strpbrk(rquote.string, unsafe) == NULL
+      && default_word_regexp && *lquote.string != *rquote.string
+      && *bcomm.string != '(' && *bcomm.string != ','
+      && *bcomm.string != ')' && *bcomm.string != *lquote.string)
+    current_quote_age = (((*lquote.string & 0xff) << 8)
+			 | (*rquote.string & 0xff));
+  else
+    current_quote_age = 0;
+}
+
+/* Return the current quote age.  Each non-trivial changequote alters
+   this value; the idea is that if quoting hasn't changed, then we can
+   skip parsing a single argument, quoted or unquoted, within the
+   context of a quoted string, as well as skip parsing a series of
+   quoted arguments within the context of argument collection.  */
+unsigned int
+quote_age (void)
+{
+  /* This accessor is a function, so that the implementation can
+     change if needed.  See set_quote_age for the current
+     implementation.  */
+  return current_quote_age;
+}
+
+/* Return true if the current quote delimiters guarantee that
+   reparsing the current token in the context of a quoted string will
+   be safe.  This could always return false and behavior would still
+   be correct, just slower.  */
+bool
+safe_quotes (void)
+{
+  return current_quote_age != 0;
+}
 
 
 /*--------------------------------------------------------------------.
@@ -835,7 +955,7 @@ next_token (token_data *td, int *line, const char *caller)
   if (!line)
     line = &dummy;
 
- /* Can't consume character until after CHAR_MACRO is handled.  */
+  /* Can't consume character until after CHAR_MACRO is handled.  */
   ch = peek_input ();
   if (ch == CHAR_EOF)
     {
@@ -868,7 +988,7 @@ next_token (token_data *td, int *line, const char *caller)
       if (ch != CHAR_EOF)
 	obstack_grow (&token_stack, ecomm.string, ecomm.length);
       else
-	/* current_file changed to "" if we see CHAR_EOF, use the
+	/* Since current_file is changed to "" when we see CHAR_EOF, use the
 	   previous value we stored earlier.  */
 	m4_error_at_line (EXIT_FAILURE, 0, file, *line, caller,
 			  _("end of file in comment"));
@@ -951,7 +1071,7 @@ next_token (token_data *td, int *line, const char *caller)
 	{
 	  ch = next_char ();
 	  if (ch == CHAR_EOF)
-	    /* current_file changed to "" if we see CHAR_EOF, use
+	    /* Since current_file is changed to "" when we see CHAR_EOF, use
 	       the previous value we stored earlier.  */
 	    m4_error_at_line (EXIT_FAILURE, 0, file, *line, caller,
 			      _("end of file in string"));
@@ -977,6 +1097,7 @@ next_token (token_data *td, int *line, const char *caller)
   TOKEN_DATA_LEN (td) = obstack_object_size (&token_stack);
   obstack_1grow (&token_stack, '\0');
   TOKEN_DATA_TEXT (td) = (char *) obstack_finish (&token_stack);
+  TOKEN_DATA_QUOTE_AGE (td) = current_quote_age;
 #ifdef ENABLE_CHANGEWORD
   if (orig_text == NULL)
     TOKEN_DATA_ORIG_TEXT (td) = TOKEN_DATA_TEXT (td);
diff --git a/src/m4.h b/src/m4.h
index ac819987..d7b6e088 100644
--- a/src/m4.h
+++ b/src/m4.h
@@ -299,6 +299,13 @@ struct token_data
 	     support NUL.  */
 	  size_t len;
 	  char *text;
+	  /* The value of quote_age when this token was scanned.  If
+	     this token is later encountered in the context of
+	     scanning a quoted string, and quote_age has not changed,
+	     then rescanning this string is provably unnecessary.  If
+	     zero, then this string potentially contains content that
+	     might change the parse on rescan.  Ignored for 0 len.  */
+	  unsigned int quote_age;
 #ifdef ENABLE_CHANGEWORD
 	  char *original_text;
 #endif
@@ -316,6 +323,7 @@ struct token_data
 #define TOKEN_DATA_TYPE(Td)		((Td)->type)
 #define TOKEN_DATA_LEN(Td)		((Td)->u.u_t.len)
 #define TOKEN_DATA_TEXT(Td)		((Td)->u.u_t.text)
+#define TOKEN_DATA_QUOTE_AGE(Td)	((Td)->u.u_t.quote_age)
 #ifdef ENABLE_CHANGEWORD
 # define TOKEN_DATA_ORIG_TEXT(Td)	((Td)->u.u_t.original_text)
 #endif
@@ -355,6 +363,8 @@ void set_comment (const char *, const char *);
 #ifdef ENABLE_CHANGEWORD
 void set_word_regexp (const char *, const char *);
 #endif
+unsigned int quote_age (void);
+bool safe_quotes (void);
 
 /* File: output.c --- output functions.  */
 extern int current_diversion;
diff --git a/src/macro.c b/src/macro.c
index a59a1f08..56a8571d 100644
--- a/src/macro.c
+++ b/src/macro.c
@@ -41,6 +41,10 @@ struct macro_arguments
   bool_bitfield has_ref : 1;
   const char *argv0; /* The macro name being expanded.  */
   size_t argv0_len; /* Length of argv0.  */
+  /* The value of quote_age used when parsing all arguments in this
+     object, or 0 if quote_age changed during parsing or if any of the
+     arguments might contain content that can affect rescan.  */
+  unsigned int quote_age;
   size_t arraylen; /* True length of allocated elements in array.  */
   /* Used as a variable-length array, storing information about each
      argument.  */
@@ -48,7 +52,8 @@ struct macro_arguments
 };
 
 static void expand_macro (symbol *);
-static void expand_token (struct obstack *, token_type, token_data *, int);
+static bool expand_token (struct obstack *, token_type, token_data *, int,
+			  bool);
 
 /* Current recursion level in expand_macro ().  */
 int expansion_level = 0;
@@ -95,37 +100,64 @@ expand_input (void)
 #endif
 
   while ((t = next_token (&td, &line, NULL)) != TOKEN_EOF)
-    expand_token ((struct obstack *) NULL, t, &td, line);
+    expand_token ((struct obstack *) NULL, t, &td, line, true);
 
   obstack_free (&arg_stack, NULL);
   obstack_free (&argv_stack, NULL);
 }
 
 
-/*------------------------------------------------------------------------.
-| Expand one token, according to its type.  Potential macro names	  |
-| (TOKEN_WORD) are looked up in the symbol table, to see if they have a	  |
-| macro definition.  If they have, they are expanded as macros, otherwise |
-| the text are just copied to the output.				  |
-`------------------------------------------------------------------------*/
+/*-------------------------------------------------------------------.
+| Expand one token TD onto the stack OBS, according to its type T,   |
+| which began parsing on the specified LINE.  If OBS is NULL, output |
+| the data.  If FIRST, there is no previous text in the current	     |
+| argument.  Potential macro names (TOKEN_WORD) are looked up in the |
+| symbol table, to see if they have a macro definition.  If they     |
+| have, they are expanded as macros, otherwise the text is just	     |
+| copied to the output.  Return true if the result is guaranteed to  |
+| give the same parse on rescan in a quoted context, provided	     |
+| quoting doesn't change.  Returning false is always safe, although  |
+| it may lead to slower performance.				     |
+`-------------------------------------------------------------------*/
 
-static void
-expand_token (struct obstack *obs, token_type t, token_data *td, int line)
+static bool
+expand_token (struct obstack *obs, token_type t, token_data *td, int line,
+	      bool first)
 {
   symbol *sym;
+  bool result;
+  int ch;
 
   switch (t)
     {				/* TOKSW */
     case TOKEN_EOF:
     case TOKEN_MACDEF:
+      /* Always safe, since there is no text to rescan.  */
+      return true;
+
+    case TOKEN_STRING:
+      /* Strings and comments are safe in isolation (since quote_age()
+	 detects any change in delimiters).  But if other text is
+	 already present, multi-character delimiters could be an
+	 issue, so use a conservative heuristic.  */
+      result = first || safe_quotes ();
       break;
 
     case TOKEN_OPEN:
     case TOKEN_COMMA:
     case TOKEN_CLOSE:
+      /* Conservative heuristic; thanks to multi-character delimiter
+	 concatenation.  */
+      result = safe_quotes ();
+      break;
+
     case TOKEN_SIMPLE:
-    case TOKEN_STRING:
-      shipout_text (obs, TOKEN_DATA_TEXT (td), TOKEN_DATA_LEN (td), line);
+      /* Conservative heuristic; if these characters are whitespace or
+	 numeric, then behavior of safe_quotes is applicable.
+	 Otherwise, assume these characters have a high likelihood of
+	 use in quote delimiters.  */
+      ch = to_uchar (*TOKEN_DATA_TEXT (td));
+      result = (isspace (ch) || isdigit (ch)) && safe_quotes ();
       break;
 
     case TOKEN_WORD:
@@ -141,15 +173,22 @@ expand_token (struct obstack *obs, token_type t, token_data *td, int line)
 #else
 	  shipout_text (obs, TOKEN_DATA_TEXT (td), TOKEN_DATA_LEN (td), line);
 #endif /* !ENABLE_CHANGEWORD */
+	  /* The word just appended is unquoted, but the heuristics of
+	     safe_quotes are applicable.  */
+	  return safe_quotes();
 	}
-      else
-	expand_macro (sym);
-      break;
+      expand_macro (sym);
+      /* Expanding a macro creates new tokens to scan, and those new
+	 tokens may append unsafe text later; but we did not append
+	 any text now.  */
+      return true;
 
     default:
       assert (!"expand_token");
       abort ();
     }
+  shipout_text (obs, TOKEN_DATA_TEXT (td), TOKEN_DATA_LEN (td), line);
+  return result;
 }
 
 
@@ -184,6 +223,8 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller)
   int paren_level;
   const char *file = current_file;
   int line = current_line;
+  unsigned int age = quote_age ();
+  bool first = true;
 
   TOKEN_DATA_TYPE (argp) = TOKEN_VOID;
 
@@ -211,10 +252,11 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller)
 		    return t == TOKEN_COMMA;
 		  warn_builtin_concat (caller, TOKEN_DATA_FUNC (argp));
 		}
+	      TOKEN_DATA_TYPE (argp) = TOKEN_TEXT;
 	      TOKEN_DATA_LEN (argp) = obstack_object_size (obs);
 	      obstack_1grow (obs, '\0');
-	      TOKEN_DATA_TYPE (argp) = TOKEN_TEXT;
 	      TOKEN_DATA_TEXT (argp) = (char *) obstack_finish (obs);
+	      TOKEN_DATA_QUOTE_AGE (argp) = age;
 	      return t == TOKEN_COMMA;
 	    }
 	  /* fallthru */
@@ -224,11 +266,12 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller)
 	    paren_level++;
 	  else if (t == TOKEN_CLOSE)
 	    paren_level--;
-	  expand_token (obs, t, &td, line);
+	  if (!expand_token (obs, t, &td, line, first))
+	    age = 0;
 	  break;
 
 	case TOKEN_EOF:
-	  /* current_file changed to "" if we see TOKEN_EOF, use the
+	  /* Since current_file is changed to "" when we see TOKEN_EOF, use the
 	     previous value we stored earlier.  */
 	  m4_error_at_line (EXIT_FAILURE, 0, file, line, caller,
 			    _("end of file in argument list"));
@@ -236,7 +279,8 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller)
 
 	case TOKEN_WORD:
 	case TOKEN_STRING:
-	  expand_token (obs, t, &td, line);
+	  if (!expand_token (obs, t, &td, line, first))
+	    age = 0;
 	  break;
 
 	case TOKEN_MACDEF:
@@ -260,6 +304,8 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller)
 	  abort ();
 	}
 
+      if (TOKEN_DATA_TYPE (argp) != TOKEN_VOID || obstack_object_size (obs))
+	first = false;
       t = next_token (&td, NULL, caller);
     }
 }
@@ -285,6 +331,7 @@ collect_arguments (symbol *sym, struct obstack *arguments)
   args.has_ref = false;
   args.argv0 = SYMBOL_NAME (sym);
   args.argv0_len = strlen (args.argv0);
+  args.quote_age = quote_age ();
   args.arraylen = 0;
   obstack_grow (&argv_stack, &args, offsetof (macro_arguments, array));
 
@@ -303,24 +350,31 @@ collect_arguments (symbol *sym, struct obstack *arguments)
 	  obstack_ptr_grow (&argv_stack, tdp);
 	  args.arraylen++;
 	  args.argc++;
+	  /* Be conservative - any change in quoting while collecting
+	     arguments, or any argument that consists of unsafe text,
+	     will require a rescan if $@ is reused.  */
+	  if (TOKEN_DATA_TYPE (tdp) == TOKEN_TEXT
+	      && TOKEN_DATA_LEN (tdp) > 0
+	      && TOKEN_DATA_QUOTE_AGE (tdp) != args.quote_age)
+	    args.quote_age = 0;
 	}
       while (more_args);
     }
   argv = (macro_arguments *) obstack_finish (&argv_stack);
   argv->argc = args.argc;
+  if (args.quote_age != quote_age ())
+    argv->quote_age = 0;
   argv->arraylen = args.arraylen;
   return argv;
 }
 
 
-/*------------------------------------------------------------------------.
-| The actual call of a macro is handled by call_macro ().  call_macro ()  |
-| is passed a symbol SYM, whose type is used to call either a builtin	  |
-| function, or the user macro expansion function expand_user_macro ()	  |
-| (lives in builtin.c).  There are ARGC arguments to the call, stored in  |
-| the ARGV table.  The expansion is left on the obstack EXPANSION.  Macro |
-| tracing is also handled here.						  |
-`------------------------------------------------------------------------*/
+/*-----------------------------------------------------------------.
+| Call the macro SYM, which is either a builtin function or a user |
+| macro (via the expansion function expand_user_macro () in        |
+| builtin.c).  There are ARGC arguments to the call, stored in the |
+| ARGV table.  The expansion is left on the obstack EXPANSION.     |
+`-----------------------------------------------------------------*/
 
 void
 call_macro (symbol *sym, int argc, macro_arguments *argv,
@@ -436,6 +490,7 @@ expand_macro (symbol *sym)
     obstack_free (&argv_stack, argv);
 }
 
+
 /* Given ARGV, return the token_data that contains argument INDEX;
    INDEX must be > 0, < argv->argc.  */
 static token_data *
@@ -472,7 +527,6 @@ arg_token (macro_arguments *argv, unsigned int index)
   return token;
 }
 
-
 /* Given ARGV, return how many arguments it refers to.  */
 unsigned int
 arg_argc (macro_arguments *argv)
@@ -496,7 +550,7 @@ arg_type (macro_arguments *argv, unsigned int index)
   return type;
 }
 
-/* Given ARGV, return the text at argument INDEX, or NULL if the
+/* Given ARGV, return the text at argument INDEX.  Abort if the
    argument is not text.  Index 0 is always text, and indices beyond
    argc return the empty string.  */
 const char *
@@ -513,8 +567,6 @@ arg_text (macro_arguments *argv, unsigned int index)
     {
     case TOKEN_TEXT:
       return TOKEN_DATA_TEXT (token);
-    case TOKEN_FUNC:
-      return NULL;
     case TOKEN_COMP:
       // TODO - how to concatenate multiple arguments?  For now, we expect
       // only one element in the chain, and arg_token dereferences it...
@@ -557,7 +609,7 @@ arg_empty (macro_arguments *argv, unsigned int index)
   return arg_token (argv, index) == &empty_token;
 }
 
-/* Given ARGV, return the length of argument INDEX, or SIZE_MAX if the
+/* Given ARGV, return the length of argument INDEX.  Abort if the
    argument is not text.  Indices beyond argc return 0.  */
 size_t
 arg_len (macro_arguments *argv, unsigned int index)
@@ -574,8 +626,6 @@ arg_len (macro_arguments *argv, unsigned int index)
     case TOKEN_TEXT:
       assert ((token == &empty_token) == (TOKEN_DATA_LEN (token) == 0));
       return TOKEN_DATA_LEN (token);
-    case TOKEN_FUNC:
-      return SIZE_MAX;
     case TOKEN_COMP:
       // TODO - how to concatenate multiple arguments?  For now, we expect
       // only one element in the chain, and arg_token dereferences it...
@@ -587,30 +637,15 @@ arg_len (macro_arguments *argv, unsigned int index)
 }
 
 /* Given ARGV, return the builtin function referenced by argument
-   INDEX, or NULL if it is not a builtin.  Index 0, and indices beyond
-   argc, return NULL.  */
+   INDEX.  Abort if it is not a builtin in isolation.  */
 builtin_func *
 arg_func (macro_arguments *argv, unsigned int index)
 {
   token_data *token;
 
-  if (index == 0 || index >= argv->argc)
-    return NULL;
   token = arg_token (argv, index);
-  switch (TOKEN_DATA_TYPE (token))
-    {
-    case TOKEN_FUNC:
-      return TOKEN_DATA_FUNC (token);
-    case TOKEN_TEXT:
-      return NULL;
-    case TOKEN_COMP:
-      // TODO - how to concatenate multiple arguments?  For now, we expect
-      // only one element in the chain...
-    default:
-      break;
-    }
-  assert(!"arg_func");
-  abort ();
+  assert (TOKEN_DATA_TYPE (token) == TOKEN_FUNC);
+  return TOKEN_DATA_FUNC (token);
 }
 
 /* Create a new argument object using the same obstack as ARGV; thus,
@@ -673,5 +708,6 @@ make_argv_ref (macro_arguments *argv, const char *argv0, size_t argv0_len,
   new_argv->inuse = false;
   new_argv->argv0 = argv0;
   new_argv->argv0_len = argv0_len;
+  new_argv->quote_age = argv->quote_age;
   return new_argv;
 }
author	Eric Blake <ebb9@byu.net>	2007-10-24 08:36:26 -0600
committer	Eric Blake <ebb9@byu.net>	2007-12-06 10:25:55 -0700
commit	8b5b3b7a74f452fed795c063965966934a68755d (patch)
tree	64c05d239d202a1696a074f4938f1c0638012671
parent	ab7d5ea40dd30e38cdafdfa69e868390ff6f72ab (diff)
download	m4-8b5b3b7a74f452fed795c063965966934a68755d.tar.gz