summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Eric Blake <ebb9@byu.net> 2007-10-24 08:36:26 -0600
committer Eric Blake <ebb9@byu.net> 2007-12-06 10:25:55 -0700
commit 8b5b3b7a74f452fed795c063965966934a68755d (patch)
tree 64c05d239d202a1696a074f4938f1c0638012671
parent ab7d5ea40dd30e38cdafdfa69e868390ff6f72ab (diff)
download m4-8b5b3b7a74f452fed795c063965966934a68755d.tar.gz
Stage 5: add notion of quote age
-rw-r--r-- doc/m4.texinfo 55
-rw-r--r-- examples/Makefile.am 3
-rw-r--r-- examples/wraplifo.m4 10
-rw-r--r-- src/input.c 261
-rw-r--r-- src/m4.h 10
-rw-r--r-- src/macro.c 140
6 files changed, 355 insertions, 124 deletions
diff --git a/doc/m4.texinfo b/doc/m4.texinfo
index 3da16fc9..803dbf05 100644
--- a/doc/m4.texinfo
+++ b/doc/m4.texinfo
@@ -2635,6 +2635,47 @@ ifelse(`foo', `bar', `3', `gnu', `gnats', `6', `7', `8')
@result{}7
@end example
+@ignore
+@comment Stress tests, not worth documenting.
+@comment It would be nice to pass builtin tokens through ifelse, m4wrap,
+@comment user macros; hence the fixmes.
+@example
+define(`e', `$@@')define(`q', ``$@@'')define(`u', `$*')
+@result{}
+define(`cmp', `ifelse($1, $2, `yes', `no')')define(`d', defn(`defn'))
+@result{}
+cmp(`defn(`defn')', `defn(`d')')
+@result{}yes
+cmp(`defn(`defn')', ``<defn>'')
+@result{}no
+cmp(`q(defn(`defn'))', `q(defn(`d'))')
+@result{}yes
+cmp(`q(defn(`defn'))', `q(`<defn>')')
+@result{}no
+cmp(`q(defn(`defn'))', ``'')
+@result{}no
+cmp(`q(`1', `2', defn(`defn'))', `q(`1', `2', defn(`d'))')
+@result{}yes
+cmp(`q(`1', `2', defn(`defn'))', `q(`1', `2', `<defn>')')
+@result{}no
+cmp(`q(`1', `2', defn(`defn'))', ```1',`2',<defn>'')
+@result{}no
+cmp(`q(`1', `2', defn(`defn'))', ```1',`2',`''')-fixme
+@result{}yes-fixme
+define(`cat', `$1`'ifelse(`$#', `1', `', `$0(shift($@@))')')
+@result{}
+cat(`define(`foo',', defn(`divnum'), `)foo')-fixme
+@result{}-fixme
+cat(e(`define(`bar',', defn(`divnum'), `)bar'))-fixme
+@result{}-fixme
+m4wrap(`u('q(`cat(`define(`baz','', defn(`divnum'), ``)baz')')`)-fixme
+')
+@result{}
+^D
+@result{}-fixme
+@end example
+@end ignore
+
Naturally, the normal case will be slightly more advanced than these
examples. A common use of @code{ifelse} is in macros implementing loops
of various kinds.
@@ -3714,6 +3755,18 @@ changequote(`"', `"')
@result{}hiHIhi
@end example
+@ignore
+@comment And another stress test, not worth documenting in the manual.
+@example
+define(`aaaaaaaaaaaaaaaaaaaa', `A')define(`q', `"$@@"')
+@result{}
+changequote(`"', `"')
+@result{}
+q(q("aaaaaaaaaaaaaaaaaaaa", "a"))
+@result{}A,a
+@end example
+@end ignore
+
It is an error if the end of file occurs within a quoted string.
@comment status: 1
@@ -6490,7 +6543,7 @@ of @samp{-} on the command line.
@acronym{POSIX} requires @code{m4wrap} (@pxref{M4wrap}) to act in FIFO
(first-in, first-out) order, but @acronym{GNU} @code{m4} currently uses
LIFO order. Furthermore, @acronym{POSIX} states that only the first
-argument to @code{m4wrap} is saved for later evaluation, bug
+argument to @code{m4wrap} is saved for later evaluation, but
@acronym{GNU} @code{m4} saves and processes all arguments, with output
separated by spaces.
diff --git a/examples/Makefile.am b/examples/Makefile.am
index b1ef68a0..c1dc5227 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -58,4 +58,5 @@ translit.m4 \
undivert.incl \
undivert.m4 \
wrap.m4 \
-wrapfifo.m4
+wrapfifo.m4 \
+wraplifo.m4
diff --git a/examples/wraplifo.m4 b/examples/wraplifo.m4
new file mode 100644
index 00000000..bdbf3fb6
--- /dev/null
+++ b/examples/wraplifo.m4
@@ -0,0 +1,10 @@
+dnl Redefine m4wrap to have LIFO semantics.
+define(`_m4wrap_level', `0')dnl
+define(`_m4wrap', defn(`m4wrap'))dnl
+define(`m4wrap',
+`ifdef(`m4wrap'_m4wrap_level,
+ `define(`m4wrap'_m4wrap_level,
+ `$1'defn(`m4wrap'_m4wrap_level))',
+ `_m4wrap(`define(`_m4wrap_level', incr(_m4wrap_level))dnl
+m4wrap'_m4wrap_level)dnl
+define(`m4wrap'_m4wrap_level, `$1')')')dnl
diff --git a/src/input.c b/src/input.c
index 0aa60367..551b43d6 100644
--- a/src/input.c
+++ b/src/input.c
@@ -23,12 +23,13 @@
#include "m4.h"
-/* Unread input can be either files, that should be read (eg. included
- files), strings, which should be rescanned (eg. macro expansion text),
- or quoted macro definitions (as returned by the builtin "defn").
- Unread input are organised in a stack, implemented with an obstack.
- Each input source is described by a "struct input_block". The obstack
- is "current_input". The top of the input stack is "isp".
+/* Unread input can be either files to be read (command line,
+ "include", "sinclude"), strings which should be rescanned (macro
+ expansion text), or quoted macro definitions (as returned by the
+ builtin "defn"). Unread input is organized in a stack, implemented
+ with an obstack. Each input source is described by a "struct
+ input_block". The obstack is "current_input". The top of the
+ input stack is "isp".
The macro "m4wrap" places the text to be saved on another input
stack, on the obstack "wrapup_stack", whose top is "wsp". When EOF
@@ -42,12 +43,13 @@
Pushing new input on the input stack is done by push_file (),
push_string (), push_wrapup () (for wrapup text), and push_macro ()
- (for macro definitions). Because macro expansion needs direct access
- to the current input obstack (for optimisation), push_string () are
- split in two functions, push_string_init (), which returns a pointer
- to the current input stack, and push_string_finish (), which return a
- pointer to the final text. The input_block *next is used to manage
- the coordination between the different push routines.
+ (for macro definitions). Because macro expansion needs direct
+ access to the current input obstack (for optimization), push_string
+ () is split in two functions, push_string_init (), which returns a
+ pointer to the current input stack, and push_string_finish (),
+ which returns a pointer to the final text. The input_block *next
+ is used to manage the coordination between the different push
+ routines.
The current file and line number are stored in two global
variables, for use by the error handling functions in m4.c. Macro
@@ -62,6 +64,7 @@
# include "regex.h"
#endif /* ENABLE_CHANGEWORD */
+/* Type of an input block. */
enum input_type
{
INPUT_STRING, /* String resulting from macro expansion. */
@@ -71,28 +74,29 @@ enum input_type
typedef enum input_type input_type;
+/* A block of input to be scanned. */
struct input_block
{
- struct input_block *prev; /* previous input_block on the input stack */
- input_type type; /* see enum values */
- const char *file; /* file where this input is from */
- int line; /* line where this input is from */
+ struct input_block *prev; /* Previous input_block on the input stack. */
+ input_type type; /* See enum values. */
+ const char *file; /* File where this input is from. */
+ int line; /* Line where this input is from. */
union
{
struct
{
- char *string; /* remaining string value */
+ char *string; /* Remaining string value. */
}
u_s; /* INPUT_STRING */
struct
{
- FILE *fp; /* input file handle */
- bool_bitfield end : 1; /* true if peek has seen EOF */
- bool_bitfield close : 1; /* true if we should close file on pop */
- bool_bitfield advance : 1; /* track previous start_of_input_line */
+ FILE *fp; /* Input file handle. */
+ bool_bitfield end : 1; /* True if peek has seen EOF. */
+ bool_bitfield close : 1; /* True to close file on pop. */
+ bool_bitfield advance : 1; /* Track previous start_of_input_line. */
}
u_f; /* INPUT_FILE */
- builtin_func *func; /* pointer to macro's function */
+ builtin_func *func; /* Pointer to macro's function. */
}
u;
};
@@ -136,8 +140,8 @@ static bool start_of_input_line;
/* Flag for next_char () to recognize change in input block. */
static bool input_change;
-#define CHAR_EOF 256 /* character return on EOF */
-#define CHAR_MACRO 257 /* character return for MACRO token */
+#define CHAR_EOF 256 /* Character return on EOF. */
+#define CHAR_MACRO 257 /* Character return for MACRO token. */
/* Quote chars. */
STRING rquote;
@@ -151,16 +155,30 @@ STRING ecomm;
# define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*"
+/* Table of characters that can start a word. */
static char *word_start;
+
+/* Current regular expression for detecting words. */
static struct re_pattern_buffer word_regexp;
-static int default_word_regexp;
+
+/* True if changeword is not active. */
+static bool default_word_regexp;
+
+/* Reused memory for detecting matches in word detection. */
static struct re_registers regs;
#else /* !ENABLE_CHANGEWORD */
-# define default_word_regexp 1
+# define default_word_regexp true
#endif /* !ENABLE_CHANGEWORD */
+/* Track the current quote age, determined by all significant
+ changequote, changecom, and changeword calls, since any one of
+ these can alter the rescan of a prior parameter in a quoted
+ context. */
+static unsigned int current_quote_age;
+
static bool pop_input (bool);
+static void set_quote_age (void);
#ifdef DEBUG_INPUT
static const char *token_type_string (token_type);
@@ -172,7 +190,8 @@ static const char *token_type_string (token_type);
| current file name and line number. If next is non-NULL, this push |
| invalidates a call to push_string_init (), whose storage is |
| consequently released. If CLOSE, then close FP after EOF is |
-| detected. |
+| detected. TITLE is used as the location for text parsed from the |
+| file (not necessarily the file name). |
`-------------------------------------------------------------------*/
void
@@ -206,11 +225,11 @@ push_file (FILE *fp, const char *title, bool close)
isp = i;
}
-/*---------------------------------------------------------------.
-| push_macro () pushes a builtin macro's definition on the input |
-| stack. If next is non-NULL, this push invalidates a call to |
-| push_string_init (), whose storage is consequently released. |
-`---------------------------------------------------------------*/
+/*-----------------------------------------------------------------.
+| push_macro () pushes the builtin macro FUNC on the input stack. |
+| If next is non-NULL, this push invalidates a call to |
+| push_string_init (), whose storage is consequently released. |
+`-----------------------------------------------------------------*/
void
push_macro (builtin_func *func)
@@ -235,10 +254,10 @@ push_macro (builtin_func *func)
isp = i;
}
-/*------------------------------------------------------------------.
-| First half of push_string (). The pointer next points to the new |
-| input_block. |
-`------------------------------------------------------------------*/
+/*--------------------------------------------------------------.
+| First half of push_string (). The return value points to the |
+| obstack where expansion text should be placed. |
+`--------------------------------------------------------------*/
struct obstack *
push_string_init (void)
@@ -257,14 +276,15 @@ push_string_init (void)
return current_input;
}
-/*------------------------------------------------------------------------.
-| Last half of push_string (). If next is now NULL, a call to push_file |
-| () has invalidated the previous call to push_string_init (), so we just |
-| give up. If the new object is void, we do not push it. The function |
-| push_string_finish () returns a pointer to the finished object. This |
-| pointer is only for temporary use, since reading the next token might |
-| release the memory used for the object. |
-`------------------------------------------------------------------------*/
+/*-------------------------------------------------------------------.
+| Last half of push_string (). If next is now NULL, a call to |
+| push_file () or push_macro () has invalidated the previous call to |
+| push_string_init (), so we just give up. If the new object is |
+| void, we do not push it. The function push_string_finish () |
+| returns a pointer to the finished object. This pointer is only |
+| for temporary use, since reading the next token might release the |
+| memory used for the object. |
+`-------------------------------------------------------------------*/
const char *
push_string_finish (void)
@@ -413,7 +433,7 @@ pop_wrapup (void)
/*-------------------------------------------------------------------.
| When a MACRO token is seen, next_token () uses init_macro_token () |
-| to retrieve the value of the function pointer. |
+| to retrieve the value of the function pointer and store it in TD. |
`-------------------------------------------------------------------*/
static void
@@ -425,12 +445,14 @@ init_macro_token (token_data *td)
}
-/*------------------------------------------------------------------------.
-| Low level input is done a character at a time. The function peek_input |
-| () is used to look at the next character in the input stream. At any |
-| given time, it reads from the input_block on the top of the current |
-| input stack. |
-`------------------------------------------------------------------------*/
+/*-----------------------------------------------------------------.
+| Low level input is done a character at a time. The function |
+| peek_input () is used to look at the next character in the input |
+| stream. At any given time, it reads from the input_block on the |
+| top of the current input stack. The return value is an unsigned |
+| char, or CHAR_EOF if there is no more input, or CHAR_MACRO if a |
+| builtin token occurs next. |
+`-----------------------------------------------------------------*/
static int
peek_input (void)
@@ -556,7 +578,8 @@ next_char_1 (void)
/*-------------------------------------------------------------------.
| skip_line () simply discards all immediately following characters, |
-| up to the first newline. It is only used from m4_dnl (). |
+| up to the first newline. It is only used from m4_dnl (). Report |
+| warnings on behalf of NAME. |
`-------------------------------------------------------------------*/
void
@@ -585,7 +608,7 @@ skip_line (const char *name)
/*------------------------------------------------------------------.
| This function is for matching a string against a prefix of the |
-| input stream. If the string matches the input and consume is |
+| input stream. If the string S matches the input and CONSUME is |
| true, the input is discarded; otherwise any characters read are |
| pushed back again. The function is used only when multicharacter |
| quotes or comment delimiters are used. |
@@ -637,7 +660,7 @@ match_input (const char *s, bool consume)
| will not hurt efficiency too much when single character quotes and |
| comment delimiters are used. If CONSUME, then CH is the result of |
| next_char, and a successful match will discard the matched string. |
-| Otherwise, CH is the result of peek_char, and the input stream is |
+| Otherwise, CH is the result of peek_input, and the input stream is |
| effectively unchanged. |
`--------------------------------------------------------------------*/
@@ -648,7 +671,7 @@ match_input (const char *s, bool consume)
/*----------------------------------------------------------.
-| Inititialise input stacks, and quote/comment characters. |
+| Initialize input stacks, and quote/comment characters.   |
`----------------------------------------------------------*/
void
@@ -689,21 +712,20 @@ input_init (void)
#ifdef ENABLE_CHANGEWORD
set_word_regexp (NULL, user_word_regexp);
#endif /* ENABLE_CHANGEWORD */
+
+ set_quote_age ();
}
-/*------------------------------------------------------------------.
-| Functions for setting quotes and comment delimiters. Used by |
-| m4_changecom () and m4_changequote (). Pass NULL if the argument |
-| was not present, to distinguish from an explicit empty string. |
-`------------------------------------------------------------------*/
+/*--------------------------------------------------------------------.
+| Set the quote delimiters to LQ and RQ. Used by m4_changequote (). |
+| Pass NULL if the argument was not present, to distinguish from an |
+| explicit empty string. |
+`--------------------------------------------------------------------*/
void
set_quotes (const char *lq, const char *rq)
{
- free (lquote.string);
- free (rquote.string);
-
/* POSIX states that with 0 arguments, the default quotes are used.
POSIX XCU ERN 112 states that behavior is implementation-defined
if there was only one argument, or if there is an empty string in
@@ -719,18 +741,27 @@ set_quotes (const char *lq, const char *rq)
else if (!rq || (*lq && !*rq))
rq = DEF_RQUOTE;
+ if (strcmp (lquote.string, lq) == 0 && strcmp (rquote.string, rq) == 0)
+ return;
+
+ free (lquote.string);
+ free (rquote.string);
lquote.string = xstrdup (lq);
lquote.length = strlen (lquote.string);
rquote.string = xstrdup (rq);
rquote.length = strlen (rquote.string);
+ set_quote_age ();
}
+/*--------------------------------------------------------------------.
+| Set the comment delimiters to BC and EC. Used by m4_changecom (). |
+| Pass NULL if the argument was not present, to distinguish from an |
+| explicit empty string. |
+`--------------------------------------------------------------------*/
+
void
set_comment (const char *bc, const char *ec)
{
- free (bcomm.string);
- free (ecomm.string);
-
/* POSIX requires no arguments to disable comments. It requires
empty arguments to be used as-is, but this is counter to
traditional behavior, because a non-null begin and null end makes
@@ -743,14 +774,26 @@ set_comment (const char *bc, const char *ec)
else if (!ec || (*bc && !*ec))
ec = DEF_ECOMM;
+ if (strcmp (bcomm.string, bc) == 0 && strcmp (ecomm.string, ec) == 0)
+ return;
+
+ free (bcomm.string);
+ free (ecomm.string);
bcomm.string = xstrdup (bc);
bcomm.length = strlen (bcomm.string);
ecomm.string = xstrdup (ec);
ecomm.length = strlen (ecomm.string);
+ set_quote_age ();
}
#ifdef ENABLE_CHANGEWORD
+/*-------------------------------------------------------------------.
+| Set the regular expression for recognizing words to REGEXP, and |
+| report errors on behalf of CALLER. If REGEXP is NULL, revert back |
+| to the default parsing rules. |
+`-------------------------------------------------------------------*/
+
void
set_word_regexp (const char *caller, const char *regexp)
{
@@ -762,6 +805,7 @@ set_word_regexp (const char *caller, const char *regexp)
if (!*regexp || !strcmp (regexp, DEFAULT_WORD_REGEXP))
{
default_word_regexp = true;
+ set_quote_age ();
return;
}
@@ -772,7 +816,6 @@ set_word_regexp (const char *caller, const char *regexp)
if (msg != NULL)
{
- /* FIXME - report on behalf of macro caller. */
m4_warn (0, caller, _("bad regular expression `%s': %s"), regexp, msg);
return;
}
@@ -785,6 +828,7 @@ set_word_regexp (const char *caller, const char *regexp)
re_set_registers (&word_regexp, &regs, regs.num_regs, regs.start, regs.end);
default_word_regexp = false;
+ set_quote_age ();
if (word_start == NULL)
word_start = (char *) xmalloc (256);
@@ -799,6 +843,82 @@ set_word_regexp (const char *caller, const char *regexp)
}
#endif /* ENABLE_CHANGEWORD */
+
+/* Call this when changing anything that might impact the quote age,
+ so that quote_age and safe_quotes will reflect the change. */
+static void
+set_quote_age (void)
+{
+ /* Multi-character quotes are inherently unsafe, since concatenation
+ of individual characters can result in a quote delimiter,
+ consider:
+
+ define(echo,``$1'')define(a,A)changequote(<[,]>)echo(<[]]><[>a]>)
+ => A]> (not ]>a)
+
+ Also, unquoted close delimiters are unsafe, consider:
+
+ define(echo,``$1'')define(a,A)echo(`a''`a')
+ => aA' (not a'a)
+
+ Comment delimiters that overlap with quote delimiters or active
+ characters also present a problem, consider:
+
+ define(echo,$*)echo(a,a,a`'define(a,A)changecom(`,',`,'))
+ => A,a,A (not A,A,A)
+
+ And let's not even think about the impact of changeword, since it
+ will disappear for M4 2.0.
+
+ So rather than check every token for an unquoted delimiter, we
+ merely encode current_quote_age to 0 when things are unsafe, and
+ non-zero when safe (namely, to the 16-bit value composed of the
+ single-character start and end quote delimiters). There may be
+ other situations which are safe even when this algorithm sets the
+ quote_age to zero, but at least a quote_age of zero always produces
+ correct results (although it may take more time in doing so). */
+
+ /* Heuristic of characters that might impact rescan if they appear in
+ a quote delimiter. */
+#define Letters "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ static const char unsafe[] = Letters "_0123456789(,) \t\n\r\f\v";
+#undef Letters
+
+ if (lquote.length == 1 && rquote.length == 1
+ && strpbrk(lquote.string, unsafe) == NULL
+ && strpbrk(rquote.string, unsafe) == NULL
+ && default_word_regexp && *lquote.string != *rquote.string
+ && *bcomm.string != '(' && *bcomm.string != ','
+ && *bcomm.string != ')' && *bcomm.string != *lquote.string)
+ current_quote_age = (((*lquote.string & 0xff) << 8)
+ | (*rquote.string & 0xff));
+ else
+ current_quote_age = 0;
+}
+
+/* Return the current quote age. Each non-trivial changequote alters
+ this value; the idea is that if quoting hasn't changed, then we can
+ skip parsing a single argument, quoted or unquoted, within the
+ context of a quoted string, as well as skip parsing a series of
+ quoted arguments within the context of argument collection. */
+unsigned int
+quote_age (void)
+{
+ /* This accessor is a function, so that the implementation can
+ change if needed. See set_quote_age for the current
+ implementation. */
+ return current_quote_age;
+}
+
+/* Return true if the current quote delimiters guarantee that
+ reparsing the current token in the context of a quoted string will
+ be safe. This could always return false and behavior would still
+ be correct, just slower. */
+bool
+safe_quotes (void)
+{
+ return current_quote_age != 0;
+}
/*--------------------------------------------------------------------.
@@ -835,7 +955,7 @@ next_token (token_data *td, int *line, const char *caller)
if (!line)
line = &dummy;
- /* Can't consume character until after CHAR_MACRO is handled. */
+ /* Can't consume character until after CHAR_MACRO is handled. */
ch = peek_input ();
if (ch == CHAR_EOF)
{
@@ -868,7 +988,7 @@ next_token (token_data *td, int *line, const char *caller)
if (ch != CHAR_EOF)
obstack_grow (&token_stack, ecomm.string, ecomm.length);
else
- /* current_file changed to "" if we see CHAR_EOF, use the
+ /* Current_file changed to "" if we see CHAR_EOF, use the
previous value we stored earlier. */
m4_error_at_line (EXIT_FAILURE, 0, file, *line, caller,
_("end of file in comment"));
@@ -951,7 +1071,7 @@ next_token (token_data *td, int *line, const char *caller)
{
ch = next_char ();
if (ch == CHAR_EOF)
- /* current_file changed to "" if we see CHAR_EOF, use
+ /* Current_file changed to "" if we see CHAR_EOF, use
the previous value we stored earlier. */
m4_error_at_line (EXIT_FAILURE, 0, file, *line, caller,
_("end of file in string"));
@@ -977,6 +1097,7 @@ next_token (token_data *td, int *line, const char *caller)
TOKEN_DATA_LEN (td) = obstack_object_size (&token_stack);
obstack_1grow (&token_stack, '\0');
TOKEN_DATA_TEXT (td) = (char *) obstack_finish (&token_stack);
+ TOKEN_DATA_QUOTE_AGE (td) = current_quote_age;
#ifdef ENABLE_CHANGEWORD
if (orig_text == NULL)
TOKEN_DATA_ORIG_TEXT (td) = TOKEN_DATA_TEXT (td);
diff --git a/src/m4.h b/src/m4.h
index ac819987..d7b6e088 100644
--- a/src/m4.h
+++ b/src/m4.h
@@ -299,6 +299,13 @@ struct token_data
support NUL. */
size_t len;
char *text;
+ /* The value of quote_age when this token was scanned. If
+ this token is later encountered in the context of
+ scanning a quoted string, and quote_age has not changed,
+ then rescanning this string is provably unnecessary. If
+ zero, then this string potentially contains content that
+ might change the parse on rescan. Ignored for 0 len. */
+ unsigned int quote_age;
#ifdef ENABLE_CHANGEWORD
char *original_text;
#endif
@@ -316,6 +323,7 @@ struct token_data
#define TOKEN_DATA_TYPE(Td) ((Td)->type)
#define TOKEN_DATA_LEN(Td) ((Td)->u.u_t.len)
#define TOKEN_DATA_TEXT(Td) ((Td)->u.u_t.text)
+#define TOKEN_DATA_QUOTE_AGE(Td) ((Td)->u.u_t.quote_age)
#ifdef ENABLE_CHANGEWORD
# define TOKEN_DATA_ORIG_TEXT(Td) ((Td)->u.u_t.original_text)
#endif
@@ -355,6 +363,8 @@ void set_comment (const char *, const char *);
#ifdef ENABLE_CHANGEWORD
void set_word_regexp (const char *, const char *);
#endif
+unsigned int quote_age (void);
+bool safe_quotes (void);
/* File: output.c --- output functions. */
extern int current_diversion;
diff --git a/src/macro.c b/src/macro.c
index a59a1f08..56a8571d 100644
--- a/src/macro.c
+++ b/src/macro.c
@@ -41,6 +41,10 @@ struct macro_arguments
bool_bitfield has_ref : 1;
const char *argv0; /* The macro name being expanded. */
size_t argv0_len; /* Length of argv0. */
+ /* The value of quote_age used when parsing all arguments in this
+ object, or 0 if quote_age changed during parsing or if any of the
+ arguments might contain content that can affect rescan. */
+ unsigned int quote_age;
size_t arraylen; /* True length of allocated elements in array. */
/* Used as a variable-length array, storing information about each
argument. */
@@ -48,7 +52,8 @@ struct macro_arguments
};
static void expand_macro (symbol *);
-static void expand_token (struct obstack *, token_type, token_data *, int);
+static bool expand_token (struct obstack *, token_type, token_data *, int,
+ bool);
/* Current recursion level in expand_macro (). */
int expansion_level = 0;
@@ -95,37 +100,64 @@ expand_input (void)
#endif
while ((t = next_token (&td, &line, NULL)) != TOKEN_EOF)
- expand_token ((struct obstack *) NULL, t, &td, line);
+ expand_token ((struct obstack *) NULL, t, &td, line, true);
obstack_free (&arg_stack, NULL);
obstack_free (&argv_stack, NULL);
}
-/*------------------------------------------------------------------------.
-| Expand one token, according to its type. Potential macro names |
-| (TOKEN_WORD) are looked up in the symbol table, to see if they have a |
-| macro definition. If they have, they are expanded as macros, otherwise |
-| the text are just copied to the output. |
-`------------------------------------------------------------------------*/
+/*-------------------------------------------------------------------.
+| Expand one token TD onto the stack OBS, according to its type T, |
+| which began parsing on the specified LINE. If OBS is NULL, output |
+| the data. If FIRST, there is no previous text in the current |
+| argument. Potential macro names (TOKEN_WORD) are looked up in the |
+| symbol table, to see if they have a macro definition. If they |
+| have, they are expanded as macros, otherwise the text is just |
+| copied to the output. Return true if the result is guaranteed to |
+| give the same parse on rescan in a quoted context, provided |
+| quoting doesn't change. Returning false is always safe, although |
+| it may lead to slower performance. |
+`-------------------------------------------------------------------*/
-static void
-expand_token (struct obstack *obs, token_type t, token_data *td, int line)
+static bool
+expand_token (struct obstack *obs, token_type t, token_data *td, int line,
+ bool first)
{
symbol *sym;
+ bool result;
+ int ch;
switch (t)
{ /* TOKSW */
case TOKEN_EOF:
case TOKEN_MACDEF:
+ /* Always safe, since there is no text to rescan. */
+ return true;
+
+ case TOKEN_STRING:
+ /* Strings and comments are safe in isolation (since quote_age()
+ detects any change in delimiters). But if other text is
+ already present, multi-character delimiters could be an
+ issue, so use a conservative heuristic. */
+ result = first || safe_quotes ();
break;
case TOKEN_OPEN:
case TOKEN_COMMA:
case TOKEN_CLOSE:
+ /* Conservative heuristic; thanks to multi-character delimiter
+ concatenation. */
+ result = safe_quotes ();
+ break;
+
case TOKEN_SIMPLE:
- case TOKEN_STRING:
- shipout_text (obs, TOKEN_DATA_TEXT (td), TOKEN_DATA_LEN (td), line);
+ /* Conservative heuristic; if these characters are whitespace or
+ numeric, then behavior of safe_quotes is applicable.
+ Otherwise, assume these characters have a high likelihood of
+ use in quote delimiters. */
+ ch = to_uchar (*TOKEN_DATA_TEXT (td));
+ result = (isspace (ch) || isdigit (ch)) && safe_quotes ();
break;
case TOKEN_WORD:
@@ -141,15 +173,22 @@ expand_token (struct obstack *obs, token_type t, token_data *td, int line)
#else
shipout_text (obs, TOKEN_DATA_TEXT (td), TOKEN_DATA_LEN (td), line);
#endif /* !ENABLE_CHANGEWORD */
+ /* The word just appended is unquoted, but the heuristics of
+ safe_quotes are applicable. */
+ return safe_quotes();
}
- else
- expand_macro (sym);
- break;
+ expand_macro (sym);
+ /* Expanding a macro creates new tokens to scan, and those new
+ tokens may append unsafe text later; but we did not append
+ any text now. */
+ return true;
default:
assert (!"expand_token");
abort ();
}
+ shipout_text (obs, TOKEN_DATA_TEXT (td), TOKEN_DATA_LEN (td), line);
+ return result;
}
@@ -184,6 +223,8 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller)
int paren_level;
const char *file = current_file;
int line = current_line;
+ unsigned int age = quote_age ();
+ bool first = true;
TOKEN_DATA_TYPE (argp) = TOKEN_VOID;
@@ -211,10 +252,11 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller)
return t == TOKEN_COMMA;
warn_builtin_concat (caller, TOKEN_DATA_FUNC (argp));
}
+ TOKEN_DATA_TYPE (argp) = TOKEN_TEXT;
TOKEN_DATA_LEN (argp) = obstack_object_size (obs);
obstack_1grow (obs, '\0');
- TOKEN_DATA_TYPE (argp) = TOKEN_TEXT;
TOKEN_DATA_TEXT (argp) = (char *) obstack_finish (obs);
+ TOKEN_DATA_QUOTE_AGE (argp) = age;
return t == TOKEN_COMMA;
}
/* fallthru */
@@ -224,11 +266,12 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller)
paren_level++;
else if (t == TOKEN_CLOSE)
paren_level--;
- expand_token (obs, t, &td, line);
+ if (!expand_token (obs, t, &td, line, first))
+ age = 0;
break;
case TOKEN_EOF:
- /* current_file changed to "" if we see TOKEN_EOF, use the
+ /* Current_file changed to "" if we see TOKEN_EOF, use the
previous value we stored earlier. */
m4_error_at_line (EXIT_FAILURE, 0, file, line, caller,
_("end of file in argument list"));
@@ -236,7 +279,8 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller)
case TOKEN_WORD:
case TOKEN_STRING:
- expand_token (obs, t, &td, line);
+ if (!expand_token (obs, t, &td, line, first))
+ age = 0;
break;
case TOKEN_MACDEF:
@@ -260,6 +304,8 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller)
abort ();
}
+ if (TOKEN_DATA_TYPE (argp) != TOKEN_VOID || obstack_object_size (obs))
+ first = false;
t = next_token (&td, NULL, caller);
}
}
@@ -285,6 +331,7 @@ collect_arguments (symbol *sym, struct obstack *arguments)
args.has_ref = false;
args.argv0 = SYMBOL_NAME (sym);
args.argv0_len = strlen (args.argv0);
+ args.quote_age = quote_age ();
args.arraylen = 0;
obstack_grow (&argv_stack, &args, offsetof (macro_arguments, array));
@@ -303,24 +350,31 @@ collect_arguments (symbol *sym, struct obstack *arguments)
obstack_ptr_grow (&argv_stack, tdp);
args.arraylen++;
args.argc++;
+ /* Be conservative - any change in quoting while collecting
+ arguments, or any argument that consists of unsafe text,
+ will require a rescan if $@ is reused. */
+ if (TOKEN_DATA_TYPE (tdp) == TOKEN_TEXT
+ && TOKEN_DATA_LEN (tdp) > 0
+ && TOKEN_DATA_QUOTE_AGE (tdp) != args.quote_age)
+ args.quote_age = 0;
}
while (more_args);
}
argv = (macro_arguments *) obstack_finish (&argv_stack);
argv->argc = args.argc;
+ if (args.quote_age != quote_age ())
+ argv->quote_age = 0;
argv->arraylen = args.arraylen;
return argv;
}
-/*------------------------------------------------------------------------.
-| The actual call of a macro is handled by call_macro (). call_macro () |
-| is passed a symbol SYM, whose type is used to call either a builtin |
-| function, or the user macro expansion function expand_user_macro () |
-| (lives in builtin.c). There are ARGC arguments to the call, stored in |
-| the ARGV table. The expansion is left on the obstack EXPANSION. Macro |
-| tracing is also handled here. |
-`------------------------------------------------------------------------*/
+/*-----------------------------------------------------------------.
+| Call the macro SYM, which is either a builtin function or a user |
+| macro (via the expansion function expand_user_macro () in |
+| builtin.c). There are ARGC arguments to the call, stored in the |
+| ARGV table. The expansion is left on the obstack EXPANSION. |
+`-----------------------------------------------------------------*/
void
call_macro (symbol *sym, int argc, macro_arguments *argv,
@@ -436,6 +490,7 @@ expand_macro (symbol *sym)
obstack_free (&argv_stack, argv);
}
+
/* Given ARGV, return the token_data that contains argument INDEX;
INDEX must be > 0, < argv->argc. */
static token_data *
@@ -472,7 +527,6 @@ arg_token (macro_arguments *argv, unsigned int index)
return token;
}
-
/* Given ARGV, return how many arguments it refers to. */
unsigned int
arg_argc (macro_arguments *argv)
@@ -496,7 +550,7 @@ arg_type (macro_arguments *argv, unsigned int index)
return type;
}
-/* Given ARGV, return the text at argument INDEX, or NULL if the
+/* Given ARGV, return the text at argument INDEX. Abort if the
argument is not text. Index 0 is always text, and indices beyond
argc return the empty string. */
const char *
@@ -513,8 +567,6 @@ arg_text (macro_arguments *argv, unsigned int index)
{
case TOKEN_TEXT:
return TOKEN_DATA_TEXT (token);
- case TOKEN_FUNC:
- return NULL;
case TOKEN_COMP:
// TODO - how to concatenate multiple arguments? For now, we expect
// only one element in the chain, and arg_token dereferences it...
@@ -557,7 +609,7 @@ arg_empty (macro_arguments *argv, unsigned int index)
return arg_token (argv, index) == &empty_token;
}
-/* Given ARGV, return the length of argument INDEX, or SIZE_MAX if the
+/* Given ARGV, return the length of argument INDEX. Abort if the
argument is not text. Indices beyond argc return 0. */
size_t
arg_len (macro_arguments *argv, unsigned int index)
@@ -574,8 +626,6 @@ arg_len (macro_arguments *argv, unsigned int index)
case TOKEN_TEXT:
assert ((token == &empty_token) == (TOKEN_DATA_LEN (token) == 0));
return TOKEN_DATA_LEN (token);
- case TOKEN_FUNC:
- return SIZE_MAX;
case TOKEN_COMP:
// TODO - how to concatenate multiple arguments? For now, we expect
// only one element in the chain, and arg_token dereferences it...
@@ -587,30 +637,15 @@ arg_len (macro_arguments *argv, unsigned int index)
}
/* Given ARGV, return the builtin function referenced by argument
- INDEX, or NULL if it is not a builtin. Index 0, and indices beyond
- argc, return NULL. */
+ INDEX. Abort if it is not a builtin in isolation. */
builtin_func *
arg_func (macro_arguments *argv, unsigned int index)
{
token_data *token;
- if (index == 0 || index >= argv->argc)
- return NULL;
token = arg_token (argv, index);
- switch (TOKEN_DATA_TYPE (token))
- {
- case TOKEN_FUNC:
- return TOKEN_DATA_FUNC (token);
- case TOKEN_TEXT:
- return NULL;
- case TOKEN_COMP:
- // TODO - how to concatenate multiple arguments? For now, we expect
- // only one element in the chain...
- default:
- break;
- }
- assert(!"arg_func");
- abort ();
+ assert (TOKEN_DATA_TYPE (token) == TOKEN_FUNC);
+ return TOKEN_DATA_FUNC (token);
}
/* Create a new argument object using the same obstack as ARGV; thus,
@@ -673,5 +708,6 @@ make_argv_ref (macro_arguments *argv, const char *argv0, size_t argv0_len,
new_argv->inuse = false;
new_argv->argv0 = argv0;
new_argv->argv0_len = argv0_len;
+ new_argv->quote_age = argv->quote_age;
return new_argv;
}