diff options
author | Eric Blake <ebb9@byu.net> | 2007-10-24 08:36:26 -0600 |
---|---|---|
committer | Eric Blake <ebb9@byu.net> | 2007-12-06 10:25:55 -0700 |
commit | 8b5b3b7a74f452fed795c063965966934a68755d (patch) | |
tree | 64c05d239d202a1696a074f4938f1c0638012671 | |
parent | ab7d5ea40dd30e38cdafdfa69e868390ff6f72ab (diff) | |
download | m4-8b5b3b7a74f452fed795c063965966934a68755d.tar.gz |
Stage 5: add notion of quote age
-rw-r--r-- | doc/m4.texinfo | 55 | ||||
-rw-r--r-- | examples/Makefile.am | 3 | ||||
-rw-r--r-- | examples/wraplifo.m4 | 10 | ||||
-rw-r--r-- | src/input.c | 261 | ||||
-rw-r--r-- | src/m4.h | 10 | ||||
-rw-r--r-- | src/macro.c | 140 |
6 files changed, 355 insertions, 124 deletions
diff --git a/doc/m4.texinfo b/doc/m4.texinfo index 3da16fc9..803dbf05 100644 --- a/doc/m4.texinfo +++ b/doc/m4.texinfo @@ -2635,6 +2635,47 @@ ifelse(`foo', `bar', `3', `gnu', `gnats', `6', `7', `8') @result{}7 @end example +@ignore +@comment Stress tests, not worth documenting. +@comment It would be nice to pass builtin tokens through ifelse, m4wrap, +@comment user macros; hence the fixmes. +@example +define(`e', `$@@')define(`q', ``$@@'')define(`u', `$*') +@result{} +define(`cmp', `ifelse($1, $2, `yes', `no')')define(`d', defn(`defn')) +@result{} +cmp(`defn(`defn')', `defn(`d')') +@result{}yes +cmp(`defn(`defn')', ``<defn>'') +@result{}no +cmp(`q(defn(`defn'))', `q(defn(`d'))') +@result{}yes +cmp(`q(defn(`defn'))', `q(`<defn>')') +@result{}no +cmp(`q(defn(`defn'))', ``'') +@result{}no +cmp(`q(`1', `2', defn(`defn'))', `q(`1', `2', defn(`d'))') +@result{}yes +cmp(`q(`1', `2', defn(`defn'))', `q(`1', `2', `<defn>')') +@result{}no +cmp(`q(`1', `2', defn(`defn'))', ```1',`2',<defn>'') +@result{}no +cmp(`q(`1', `2', defn(`defn'))', ```1',`2',`''')-fixme +@result{}yes-fixme +define(`cat', `$1`'ifelse(`$#', `1', `', `$0(shift($@@))')') +@result{} +cat(`define(`foo',', defn(`divnum'), `)foo')-fixme +@result{}-fixme +cat(e(`define(`bar',', defn(`divnum'), `)bar'))-fixme +@result{}-fixme +m4wrap(`u('q(`cat(`define(`baz','', defn(`divnum'), ``)baz')')`)-fixme +') +@result{} +^D +@result{}-fixme +@end example +@end ignore + Naturally, the normal case will be slightly more advanced than these examples. A common use of @code{ifelse} is in macros implementing loops of various kinds. @@ -3714,6 +3755,18 @@ changequote(`"', `"') @result{}hiHIhi @end example +@ignore +@comment And another stress test, not worth documenting in the manual. 
+@example +define(`aaaaaaaaaaaaaaaaaaaa', `A')define(`q', `"$@@"') +@result{} +changequote(`"', `"') +@result{} +q(q("aaaaaaaaaaaaaaaaaaaa", "a")) +@result{}A,a +@end example +@end ignore + It is an error if the end of file occurs within a quoted string. @comment status: 1 @@ -6490,7 +6543,7 @@ of @samp{-} on the command line. @acronym{POSIX} requires @code{m4wrap} (@pxref{M4wrap}) to act in FIFO (first-in, first-out) order, but @acronym{GNU} @code{m4} currently uses LIFO order. Furthermore, @acronym{POSIX} states that only the first -argument to @code{m4wrap} is saved for later evaluation, bug +argument to @code{m4wrap} is saved for later evaluation, but @acronym{GNU} @code{m4} saves and processes all arguments, with output separated by spaces. diff --git a/examples/Makefile.am b/examples/Makefile.am index b1ef68a0..c1dc5227 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -58,4 +58,5 @@ translit.m4 \ undivert.incl \ undivert.m4 \ wrap.m4 \ -wrapfifo.m4 +wrapfifo.m4 \ +wraplifo.m4 diff --git a/examples/wraplifo.m4 b/examples/wraplifo.m4 new file mode 100644 index 00000000..bdbf3fb6 --- /dev/null +++ b/examples/wraplifo.m4 @@ -0,0 +1,10 @@ +dnl Redefine m4wrap to have LIFO semantics. +define(`_m4wrap_level', `0')dnl +define(`_m4wrap', defn(`m4wrap'))dnl +define(`m4wrap', +`ifdef(`m4wrap'_m4wrap_level, + `define(`m4wrap'_m4wrap_level, + `$1'defn(`m4wrap'_m4wrap_level))', + `_m4wrap(`define(`_m4wrap_level', incr(_m4wrap_level))dnl +m4wrap'_m4wrap_level)dnl +define(`m4wrap'_m4wrap_level, `$1')')')dnl diff --git a/src/input.c b/src/input.c index 0aa60367..551b43d6 100644 --- a/src/input.c +++ b/src/input.c @@ -23,12 +23,13 @@ #include "m4.h" -/* Unread input can be either files, that should be read (eg. included - files), strings, which should be rescanned (eg. macro expansion text), - or quoted macro definitions (as returned by the builtin "defn"). - Unread input are organised in a stack, implemented with an obstack. 
- Each input source is described by a "struct input_block". The obstack - is "current_input". The top of the input stack is "isp". +/* Unread input can be either files to be read (command line, + "include", "sinclude"), strings which should be rescanned (macro + expansion text), or quoted macro definitions (as returned by the + builtin "defn"). Unread input is organized in a stack, implemented + with an obstack. Each input source is described by a "struct + input_block". The obstack is "current_input". The top of the + input stack is "isp". The macro "m4wrap" places the text to be saved on another input stack, on the obstack "wrapup_stack", whose top is "wsp". When EOF @@ -42,12 +43,13 @@ Pushing new input on the input stack is done by push_file (), push_string (), push_wrapup () (for wrapup text), and push_macro () - (for macro definitions). Because macro expansion needs direct access - to the current input obstack (for optimisation), push_string () are - split in two functions, push_string_init (), which returns a pointer - to the current input stack, and push_string_finish (), which return a - pointer to the final text. The input_block *next is used to manage - the coordination between the different push routines. + (for macro definitions). Because macro expansion needs direct + access to the current input obstack (for optimization), push_string + () is split in two functions, push_string_init (), which returns a + pointer to the current input stack, and push_string_finish (), + which returns a pointer to the final text. The input_block *next + is used to manage the coordination between the different push + routines. The current file and line number are stored in two global variables, for use by the error handling functions in m4.c. Macro @@ -62,6 +64,7 @@ # include "regex.h" #endif /* ENABLE_CHANGEWORD */ +/* Type of an input block. */ enum input_type { INPUT_STRING, /* String resulting from macro expansion. 
*/ @@ -71,28 +74,29 @@ enum input_type typedef enum input_type input_type; +/* A block of input to be scanned. */ struct input_block { - struct input_block *prev; /* previous input_block on the input stack */ - input_type type; /* see enum values */ - const char *file; /* file where this input is from */ - int line; /* line where this input is from */ + struct input_block *prev; /* Previous input_block on the input stack. */ + input_type type; /* See enum values. */ + const char *file; /* File where this input is from. */ + int line; /* Line where this input is from. */ union { struct { - char *string; /* remaining string value */ + char *string; /* Remaining string value. */ } u_s; /* INPUT_STRING */ struct { - FILE *fp; /* input file handle */ - bool_bitfield end : 1; /* true if peek has seen EOF */ - bool_bitfield close : 1; /* true if we should close file on pop */ - bool_bitfield advance : 1; /* track previous start_of_input_line */ + FILE *fp; /* Input file handle. */ + bool_bitfield end : 1; /* True if peek has seen EOF. */ + bool_bitfield close : 1; /* True to close file on pop. */ + bool_bitfield advance : 1; /* Track previous start_of_input_line. */ } u_f; /* INPUT_FILE */ - builtin_func *func; /* pointer to macro's function */ + builtin_func *func; /* Pointer to macro's function. */ } u; }; @@ -136,8 +140,8 @@ static bool start_of_input_line; /* Flag for next_char () to recognize change in input block. */ static bool input_change; -#define CHAR_EOF 256 /* character return on EOF */ -#define CHAR_MACRO 257 /* character return for MACRO token */ +#define CHAR_EOF 256 /* Character return on EOF. */ +#define CHAR_MACRO 257 /* Character return for MACRO token. */ /* Quote chars. */ STRING rquote; @@ -151,16 +155,30 @@ STRING ecomm; # define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*" +/* Table of characters that can start a word. */ static char *word_start; + +/* Current regular expression for detecting words. 
*/ static struct re_pattern_buffer word_regexp; -static int default_word_regexp; + +/* True if changeword is not active. */ +static bool default_word_regexp; + +/* Reused memory for detecting matches in word detection. */ static struct re_registers regs; #else /* !ENABLE_CHANGEWORD */ -# define default_word_regexp 1 +# define default_word_regexp true #endif /* !ENABLE_CHANGEWORD */ +/* Track the current quote age, determined by all significant + changequote, changecom, and changeword calls, since any one of + these can alter the rescan of a prior parameter in a quoted + context. */ +static unsigned int current_quote_age; + static bool pop_input (bool); +static void set_quote_age (void); #ifdef DEBUG_INPUT static const char *token_type_string (token_type); @@ -172,7 +190,8 @@ static const char *token_type_string (token_type); | current file name and line number. If next is non-NULL, this push | | invalidates a call to push_string_init (), whose storage is | | consequently released. If CLOSE, then close FP after EOF is | -| detected. | +| detected. TITLE is used as the location for text parsed from the | +| file (not necessarily the file name). | `-------------------------------------------------------------------*/ void @@ -206,11 +225,11 @@ push_file (FILE *fp, const char *title, bool close) isp = i; } -/*---------------------------------------------------------------. -| push_macro () pushes a builtin macro's definition on the input | -| stack. If next is non-NULL, this push invalidates a call to | -| push_string_init (), whose storage is consequently released. | -`---------------------------------------------------------------*/ +/*-----------------------------------------------------------------. +| push_macro () pushes the builtin macro FUNC on the input stack. | +| If next is non-NULL, this push invalidates a call to | +| push_string_init (), whose storage is consequently released. 
| +`-----------------------------------------------------------------*/ void push_macro (builtin_func *func) @@ -235,10 +254,10 @@ push_macro (builtin_func *func) isp = i; } -/*------------------------------------------------------------------. -| First half of push_string (). The pointer next points to the new | -| input_block. | -`------------------------------------------------------------------*/ +/*--------------------------------------------------------------. +| First half of push_string (). The return value points to the | +| obstack where expansion text should be placed. | +`--------------------------------------------------------------*/ struct obstack * push_string_init (void) @@ -257,14 +276,15 @@ push_string_init (void) return current_input; } -/*------------------------------------------------------------------------. -| Last half of push_string (). If next is now NULL, a call to push_file | -| () has invalidated the previous call to push_string_init (), so we just | -| give up. If the new object is void, we do not push it. The function | -| push_string_finish () returns a pointer to the finished object. This | -| pointer is only for temporary use, since reading the next token might | -| release the memory used for the object. | -`------------------------------------------------------------------------*/ +/*-------------------------------------------------------------------. +| Last half of push_string (). If next is now NULL, a call to | +| push_file () or push_macro () has invalidated the previous call to | +| push_string_init (), so we just give up. If the new object is | +| void, we do not push it. The function push_string_finish () | +| returns a pointer to the finished object. This pointer is only | +| for temporary use, since reading the next token might release the | +| memory used for the object. 
| +`-------------------------------------------------------------------*/ const char * push_string_finish (void) @@ -413,7 +433,7 @@ pop_wrapup (void) /*-------------------------------------------------------------------. | When a MACRO token is seen, next_token () uses init_macro_token () | -| to retrieve the value of the function pointer. | +| to retrieve the value of the function pointer and store it in TD. | `-------------------------------------------------------------------*/ static void @@ -425,12 +445,14 @@ init_macro_token (token_data *td) } -/*------------------------------------------------------------------------. -| Low level input is done a character at a time. The function peek_input | -| () is used to look at the next character in the input stream. At any | -| given time, it reads from the input_block on the top of the current | -| input stack. | -`------------------------------------------------------------------------*/ +/*-----------------------------------------------------------------. +| Low level input is done a character at a time. The function | +| peek_input () is used to look at the next character in the input | +| stream. At any given time, it reads from the input_block on the | +| top of the current input stack. The return value is an unsigned | +| char, or CHAR_EOF if there is no more input, or CHAR_MACRO if a | +| builtin token occurs next. | +`-----------------------------------------------------------------*/ static int peek_input (void) @@ -556,7 +578,8 @@ next_char_1 (void) /*-------------------------------------------------------------------. | skip_line () simply discards all immediately following characters, | -| up to the first newline. It is only used from m4_dnl (). | +| up to the first newline. It is only used from m4_dnl (). Report | +| warnings on behalf of NAME. 
| `-------------------------------------------------------------------*/ void @@ -585,7 +608,7 @@ skip_line (const char *name) /*------------------------------------------------------------------. | This function is for matching a string against a prefix of the | -| input stream. If the string matches the input and consume is | +| input stream. If the string S matches the input and CONSUME is | | true, the input is discarded; otherwise any characters read are | | pushed back again. The function is used only when multicharacter | | quotes or comment delimiters are used. | @@ -637,7 +660,7 @@ match_input (const char *s, bool consume) | will not hurt efficiency too much when single character quotes and | | comment delimiters are used. If CONSUME, then CH is the result of | | next_char, and a successful match will discard the matched string. | -| Otherwise, CH is the result of peek_char, and the input stream is | +| Otherwise, CH is the result of peek_input, and the input stream is | | effectively unchanged. | `--------------------------------------------------------------------*/ @@ -648,7 +671,7 @@ match_input (const char *s, bool consume) /*----------------------------------------------------------. -| Inititialise input stacks, and quote/comment characters. | +| Inititialize input stacks, and quote/comment characters. | `----------------------------------------------------------*/ void @@ -689,21 +712,20 @@ input_init (void) #ifdef ENABLE_CHANGEWORD set_word_regexp (NULL, user_word_regexp); #endif /* ENABLE_CHANGEWORD */ + + set_quote_age (); } -/*------------------------------------------------------------------. -| Functions for setting quotes and comment delimiters. Used by | -| m4_changecom () and m4_changequote (). Pass NULL if the argument | -| was not present, to distinguish from an explicit empty string. | -`------------------------------------------------------------------*/ +/*--------------------------------------------------------------------. 
+| Set the quote delimiters to LQ and RQ. Used by m4_changequote (). | +| Pass NULL if the argument was not present, to distinguish from an | +| explicit empty string. | +`--------------------------------------------------------------------*/ void set_quotes (const char *lq, const char *rq) { - free (lquote.string); - free (rquote.string); - /* POSIX states that with 0 arguments, the default quotes are used. POSIX XCU ERN 112 states that behavior is implementation-defined if there was only one argument, or if there is an empty string in @@ -719,18 +741,27 @@ set_quotes (const char *lq, const char *rq) else if (!rq || (*lq && !*rq)) rq = DEF_RQUOTE; + if (strcmp (lquote.string, lq) == 0 && strcmp (rquote.string, rq) == 0) + return; + + free (lquote.string); + free (rquote.string); lquote.string = xstrdup (lq); lquote.length = strlen (lquote.string); rquote.string = xstrdup (rq); rquote.length = strlen (rquote.string); + set_quote_age (); } +/*--------------------------------------------------------------------. +| Set the comment delimiters to BC and EC. Used by m4_changecom (). | +| Pass NULL if the argument was not present, to distinguish from an | +| explicit empty string. | +`--------------------------------------------------------------------*/ + void set_comment (const char *bc, const char *ec) { - free (bcomm.string); - free (ecomm.string); - /* POSIX requires no arguments to disable comments. 
It requires empty arguments to be used as-is, but this is counter to traditional behavior, because a non-null begin and null end makes @@ -743,14 +774,26 @@ set_comment (const char *bc, const char *ec) else if (!ec || (*bc && !*ec)) ec = DEF_ECOMM; + if (strcmp (bcomm.string, bc) == 0 && strcmp (ecomm.string, ec) == 0) + return; + + free (bcomm.string); + free (ecomm.string); bcomm.string = xstrdup (bc); bcomm.length = strlen (bcomm.string); ecomm.string = xstrdup (ec); ecomm.length = strlen (ecomm.string); + set_quote_age (); } #ifdef ENABLE_CHANGEWORD +/*-------------------------------------------------------------------. +| Set the regular expression for recognizing words to REGEXP, and | +| report errors on behalf of CALLER. If REGEXP is NULL, revert back | +| to the default parsing rules. | +`-------------------------------------------------------------------*/ + void set_word_regexp (const char *caller, const char *regexp) { @@ -762,6 +805,7 @@ set_word_regexp (const char *caller, const char *regexp) if (!*regexp || !strcmp (regexp, DEFAULT_WORD_REGEXP)) { default_word_regexp = true; + set_quote_age (); return; } @@ -772,7 +816,6 @@ set_word_regexp (const char *caller, const char *regexp) if (msg != NULL) { - /* FIXME - report on behalf of macro caller. */ m4_warn (0, caller, _("bad regular expression `%s': %s"), regexp, msg); return; } @@ -785,6 +828,7 @@ set_word_regexp (const char *caller, const char *regexp) re_set_registers (&word_regexp, &regs, regs.num_regs, regs.start, regs.end); default_word_regexp = false; + set_quote_age (); if (word_start == NULL) word_start = (char *) xmalloc (256); @@ -799,6 +843,82 @@ set_word_regexp (const char *caller, const char *regexp) } #endif /* ENABLE_CHANGEWORD */ + +/* Call this when changing anything that might impact the quote age, + so that quote_age and safe_quotes will reflect the change.
*/ +static void +set_quote_age (void) +{ + /* Multi-character quotes are inherently unsafe, since concatenation + of individual characters can result in a quote delimiter, + consider: + + define(echo,``$1'')define(a,A)changequote(<[,]>)echo(<[]]><[>a]>) + => A]> (not ]>a) + + Also, unquoted close delimiters are unsafe, consider: + + define(echo,``$1'')define(a,A)echo(`a''`a') + => aA' (not a'a) + + Comment delimiters that overlap with quote delimiters or active + characters also present a problem, consider: + + define(echo,$*)echo(a,a,a`'define(a,A)changecom(`,',`,')) + => A,a,A (not A,A,A) + + And let's not even think about the impact of changeword, since it + will disappear for M4 2.0. + + So rather than check every token for an unquoted delimiter, we + merely encode current_quote_age to 0 when things are unsafe, and + non-zero when safe (namely, to the 16-bit value composed of the + single-character start and end quote delimiters). There may be + other situations which are safe even when this algorithm sets the + quote_age to zero, but at least a quote_age of zero always produces + correct results (although it may take more time in doing so). */ + + /* Heuristic of characters that might impact rescan if they appear in + a quote delimiter. */ +#define Letters "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + static const char unsafe[] = Letters "_0123456789(,) \t\n\r\f\v"; +#undef Letters + + if (lquote.length == 1 && rquote.length == 1 + && strpbrk(lquote.string, unsafe) == NULL + && strpbrk(rquote.string, unsafe) == NULL + && default_word_regexp && *lquote.string != *rquote.string + && *bcomm.string != '(' && *bcomm.string != ',' + && *bcomm.string != ')' && *bcomm.string != *lquote.string) + current_quote_age = (((*lquote.string & 0xff) << 8) + | (*rquote.string & 0xff)); + else + current_quote_age = 0; +} + +/* Return the current quote age.
Each non-trivial changequote alters + this value; the idea is that if quoting hasn't changed, then we can + skip parsing a single argument, quoted or unquoted, within the + context of a quoted string, as well as skip parsing a series of + quoted arguments within the context of argument collection. */ +unsigned int +quote_age (void) +{ + /* This accessor is a function, so that the implementation can + change if needed. See set_quote_age for the current + implementation. */ + return current_quote_age; +} + +/* Return true if the current quote delimiters guarantee that + reparsing the current token in the context of a quoted string will + be safe. This could always return false and behavior would still + be correct, just slower. */ +bool +safe_quotes (void) +{ + return current_quote_age != 0; +} /*--------------------------------------------------------------------. @@ -835,7 +955,7 @@ next_token (token_data *td, int *line, const char *caller) if (!line) line = &dummy; - /* Can't consume character until after CHAR_MACRO is handled. */ + /* Can't consume character until after CHAR_MACRO is handled. */ ch = peek_input (); if (ch == CHAR_EOF) { @@ -868,7 +988,7 @@ next_token (token_data *td, int *line, const char *caller) if (ch != CHAR_EOF) obstack_grow (&token_stack, ecomm.string, ecomm.length); else - /* current_file changed to "" if we see CHAR_EOF, use the + /* Current_file changed to "" if we see CHAR_EOF, use the previous value we stored earlier. */ m4_error_at_line (EXIT_FAILURE, 0, file, *line, caller, _("end of file in comment")); @@ -951,7 +1071,7 @@ next_token (token_data *td, int *line, const char *caller) { ch = next_char (); if (ch == CHAR_EOF) - /* current_file changed to "" if we see CHAR_EOF, use + /* Current_file changed to "" if we see CHAR_EOF, use the previous value we stored earlier. 
*/ m4_error_at_line (EXIT_FAILURE, 0, file, *line, caller, _("end of file in string")); @@ -977,6 +1097,7 @@ next_token (token_data *td, int *line, const char *caller) TOKEN_DATA_LEN (td) = obstack_object_size (&token_stack); obstack_1grow (&token_stack, '\0'); TOKEN_DATA_TEXT (td) = (char *) obstack_finish (&token_stack); + TOKEN_DATA_QUOTE_AGE (td) = current_quote_age; #ifdef ENABLE_CHANGEWORD if (orig_text == NULL) TOKEN_DATA_ORIG_TEXT (td) = TOKEN_DATA_TEXT (td); @@ -299,6 +299,13 @@ struct token_data support NUL. */ size_t len; char *text; + /* The value of quote_age when this token was scanned. If + this token is later encountered in the context of + scanning a quoted string, and quote_age has not changed, + then rescanning this string is provably unnecessary. If + zero, then this string potentially contains content that + might change the parse on rescan. Ignored for 0 len. */ + unsigned int quote_age; #ifdef ENABLE_CHANGEWORD char *original_text; #endif @@ -316,6 +323,7 @@ struct token_data #define TOKEN_DATA_TYPE(Td) ((Td)->type) #define TOKEN_DATA_LEN(Td) ((Td)->u.u_t.len) #define TOKEN_DATA_TEXT(Td) ((Td)->u.u_t.text) +#define TOKEN_DATA_QUOTE_AGE(Td) ((Td)->u.u_t.quote_age) #ifdef ENABLE_CHANGEWORD # define TOKEN_DATA_ORIG_TEXT(Td) ((Td)->u.u_t.original_text) #endif @@ -355,6 +363,8 @@ void set_comment (const char *, const char *); #ifdef ENABLE_CHANGEWORD void set_word_regexp (const char *, const char *); #endif +unsigned int quote_age (void); +bool safe_quotes (void); /* File: output.c --- output functions. */ extern int current_diversion; diff --git a/src/macro.c b/src/macro.c index a59a1f08..56a8571d 100644 --- a/src/macro.c +++ b/src/macro.c @@ -41,6 +41,10 @@ struct macro_arguments bool_bitfield has_ref : 1; const char *argv0; /* The macro name being expanded. */ size_t argv0_len; /* Length of argv0. 
*/ + /* The value of quote_age used when parsing all arguments in this + object, or 0 if quote_age changed during parsing or if any of the + arguments might contain content that can affect rescan. */ + unsigned int quote_age; size_t arraylen; /* True length of allocated elements in array. */ /* Used as a variable-length array, storing information about each argument. */ @@ -48,7 +52,8 @@ struct macro_arguments }; static void expand_macro (symbol *); -static void expand_token (struct obstack *, token_type, token_data *, int); +static bool expand_token (struct obstack *, token_type, token_data *, int, + bool); /* Current recursion level in expand_macro (). */ int expansion_level = 0; @@ -95,37 +100,64 @@ expand_input (void) #endif while ((t = next_token (&td, &line, NULL)) != TOKEN_EOF) - expand_token ((struct obstack *) NULL, t, &td, line); + expand_token ((struct obstack *) NULL, t, &td, line, true); obstack_free (&arg_stack, NULL); obstack_free (&argv_stack, NULL); } -/*------------------------------------------------------------------------. -| Expand one token, according to its type. Potential macro names | -| (TOKEN_WORD) are looked up in the symbol table, to see if they have a | -| macro definition. If they have, they are expanded as macros, otherwise | -| the text are just copied to the output. | -`------------------------------------------------------------------------*/ +/*-------------------------------------------------------------------. +| Expand one token TD onto the stack OBS, according to its type T, | +| which began parsing on the specified LINE. If OBS is NULL, output | +| the data. If FIRST, there is no previous text in the current | +| argument. Potential macro names (TOKEN_WORD) are looked up in the | +| symbol table, to see if they have a macro definition. If they | +| have, they are expanded as macros, otherwise the text is just | +| copied to the output. 
Return true if the result is guaranteed to | +| give the same parse on rescan in a quoted context, provided | +| quoting doesn't change. Returning false is always safe, although | +| it may lead to slower performance. | +`-------------------------------------------------------------------*/ -static void -expand_token (struct obstack *obs, token_type t, token_data *td, int line) +static bool +expand_token (struct obstack *obs, token_type t, token_data *td, int line, + bool first) { symbol *sym; + bool result; + int ch; switch (t) { /* TOKSW */ case TOKEN_EOF: case TOKEN_MACDEF: + /* Always safe, since there is no text to rescan. */ + return true; + + case TOKEN_STRING: + /* Tokens and comments are safe in isolation (since quote_age() + detects any change in delimiters). But if other text is + already present, multi-character delimiters could be an + issue, so use a conservative heuristic. */ + result = first || safe_quotes (); break; case TOKEN_OPEN: case TOKEN_COMMA: case TOKEN_CLOSE: + /* Conservative heuristic; thanks to multi-character delimiter + concatenation. */ + result = safe_quotes (); + break; + case TOKEN_SIMPLE: - case TOKEN_STRING: - shipout_text (obs, TOKEN_DATA_TEXT (td), TOKEN_DATA_LEN (td), line); + /* Conservative heuristic; if these characters are whitespace or + numeric, then behavior of safe_quotes is applicable. + Otherwise, assume these characters have a high likelihood of + use in quote delimiters. */ + ch = to_uchar (*TOKEN_DATA_TEXT (td)); + result = (isspace (ch) || isdigit (ch)) && safe_quotes (); break; case TOKEN_WORD: @@ -141,15 +173,22 @@ expand_token (struct obstack *obs, token_type t, token_data *td, int line) #else shipout_text (obs, TOKEN_DATA_TEXT (td), TOKEN_DATA_LEN (td), line); #endif /* !ENABLE_CHANGEWORD */ + /* The word just appended is unquoted, but the heuristics of + safe_quotes are applicable.
*/ + return safe_quotes(); } - else - expand_macro (sym); - break; + expand_macro (sym); + /* Expanding a macro creates new tokens to scan, and those new + tokens may append unsafe text later; but we did not append + any text now. */ + return true; default: assert (!"expand_token"); abort (); } + shipout_text (obs, TOKEN_DATA_TEXT (td), TOKEN_DATA_LEN (td), line); + return result; } @@ -184,6 +223,8 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller) int paren_level; const char *file = current_file; int line = current_line; + unsigned int age = quote_age (); + bool first = true; TOKEN_DATA_TYPE (argp) = TOKEN_VOID; @@ -211,10 +252,11 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller) return t == TOKEN_COMMA; warn_builtin_concat (caller, TOKEN_DATA_FUNC (argp)); } + TOKEN_DATA_TYPE (argp) = TOKEN_TEXT; TOKEN_DATA_LEN (argp) = obstack_object_size (obs); obstack_1grow (obs, '\0'); - TOKEN_DATA_TYPE (argp) = TOKEN_TEXT; TOKEN_DATA_TEXT (argp) = (char *) obstack_finish (obs); + TOKEN_DATA_QUOTE_AGE (argp) = age; return t == TOKEN_COMMA; } /* fallthru */ @@ -224,11 +266,12 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller) paren_level++; else if (t == TOKEN_CLOSE) paren_level--; - expand_token (obs, t, &td, line); + if (!expand_token (obs, t, &td, line, first)) + age = 0; break; case TOKEN_EOF: - /* current_file changed to "" if we see TOKEN_EOF, use the + /* Current_file changed to "" if we see TOKEN_EOF, use the previous value we stored earlier. 
*/ m4_error_at_line (EXIT_FAILURE, 0, file, line, caller, _("end of file in argument list")); @@ -236,7 +279,8 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller) case TOKEN_WORD: case TOKEN_STRING: - expand_token (obs, t, &td, line); + if (!expand_token (obs, t, &td, line, first)) + age = 0; break; case TOKEN_MACDEF: @@ -260,6 +304,8 @@ expand_argument (struct obstack *obs, token_data *argp, const char *caller) abort (); } + if (TOKEN_DATA_TYPE (argp) != TOKEN_VOID || obstack_object_size (obs)) + first = false; t = next_token (&td, NULL, caller); } } @@ -285,6 +331,7 @@ collect_arguments (symbol *sym, struct obstack *arguments) args.has_ref = false; args.argv0 = SYMBOL_NAME (sym); args.argv0_len = strlen (args.argv0); + args.quote_age = quote_age (); args.arraylen = 0; obstack_grow (&argv_stack, &args, offsetof (macro_arguments, array)); @@ -303,24 +350,31 @@ collect_arguments (symbol *sym, struct obstack *arguments) obstack_ptr_grow (&argv_stack, tdp); args.arraylen++; args.argc++; + /* Be conservative - any change in quoting while collecting + arguments, or any argument that consists of unsafe text, + will require a rescan if $@ is reused. */ + if (TOKEN_DATA_TYPE (tdp) == TOKEN_TEXT + && TOKEN_DATA_LEN (tdp) > 0 + && TOKEN_DATA_QUOTE_AGE (tdp) != args.quote_age) + args.quote_age = 0; } while (more_args); } argv = (macro_arguments *) obstack_finish (&argv_stack); argv->argc = args.argc; + if (args.quote_age != quote_age ()) + argv->quote_age = 0; argv->arraylen = args.arraylen; return argv; } -/*------------------------------------------------------------------------. -| The actual call of a macro is handled by call_macro (). call_macro () | -| is passed a symbol SYM, whose type is used to call either a builtin | -| function, or the user macro expansion function expand_user_macro () | -| (lives in builtin.c). There are ARGC arguments to the call, stored in | -| the ARGV table. The expansion is left on the obstack EXPANSION. 
Macro | -| tracing is also handled here. | -`------------------------------------------------------------------------*/ +/*-----------------------------------------------------------------. +| Call the macro SYM, which is either a builtin function or a user | +| macro (via the expansion function expand_user_macro () in | +| builtin.c). There are ARGC arguments to the call, stored in the | +| ARGV table. The expansion is left on the obstack EXPANSION. | +`-----------------------------------------------------------------*/ void call_macro (symbol *sym, int argc, macro_arguments *argv, @@ -436,6 +490,7 @@ expand_macro (symbol *sym) obstack_free (&argv_stack, argv); } + /* Given ARGV, return the token_data that contains argument INDEX; INDEX must be > 0, < argv->argc. */ static token_data * @@ -472,7 +527,6 @@ arg_token (macro_arguments *argv, unsigned int index) return token; } - /* Given ARGV, return how many arguments it refers to. */ unsigned int arg_argc (macro_arguments *argv) @@ -496,7 +550,7 @@ arg_type (macro_arguments *argv, unsigned int index) return type; } -/* Given ARGV, return the text at argument INDEX, or NULL if the +/* Given ARGV, return the text at argument INDEX. Abort if the argument is not text. Index 0 is always text, and indices beyond argc return the empty string. */ const char * @@ -513,8 +567,6 @@ arg_text (macro_arguments *argv, unsigned int index) { case TOKEN_TEXT: return TOKEN_DATA_TEXT (token); - case TOKEN_FUNC: - return NULL; case TOKEN_COMP: // TODO - how to concatenate multiple arguments? For now, we expect // only one element in the chain, and arg_token dereferences it... @@ -557,7 +609,7 @@ arg_empty (macro_arguments *argv, unsigned int index) return arg_token (argv, index) == &empty_token; } -/* Given ARGV, return the length of argument INDEX, or SIZE_MAX if the +/* Given ARGV, return the length of argument INDEX. Abort if the argument is not text. Indices beyond argc return 0. 
*/ size_t arg_len (macro_arguments *argv, unsigned int index) @@ -574,8 +626,6 @@ arg_len (macro_arguments *argv, unsigned int index) case TOKEN_TEXT: assert ((token == &empty_token) == (TOKEN_DATA_LEN (token) == 0)); return TOKEN_DATA_LEN (token); - case TOKEN_FUNC: - return SIZE_MAX; case TOKEN_COMP: // TODO - how to concatenate multiple arguments? For now, we expect // only one element in the chain, and arg_token dereferences it... @@ -587,30 +637,15 @@ arg_len (macro_arguments *argv, unsigned int index) } /* Given ARGV, return the builtin function referenced by argument - INDEX, or NULL if it is not a builtin. Index 0, and indices beyond - argc, return NULL. */ + INDEX. Abort if it is not a builtin in isolation. */ builtin_func * arg_func (macro_arguments *argv, unsigned int index) { token_data *token; - if (index == 0 || index >= argv->argc) - return NULL; token = arg_token (argv, index); - switch (TOKEN_DATA_TYPE (token)) - { - case TOKEN_FUNC: - return TOKEN_DATA_FUNC (token); - case TOKEN_TEXT: - return NULL; - case TOKEN_COMP: - // TODO - how to concatenate multiple arguments? For now, we expect - // only one element in the chain... - default: - break; - } - assert(!"arg_func"); - abort (); + assert (TOKEN_DATA_TYPE (token) == TOKEN_FUNC); + return TOKEN_DATA_FUNC (token); } /* Create a new argument object using the same obstack as ARGV; thus, @@ -673,5 +708,6 @@ make_argv_ref (macro_arguments *argv, const char *argv0, size_t argv0_len, new_argv->inuse = false; new_argv->argv0 = argv0; new_argv->argv0_len = argv0_len; + new_argv->quote_age = argv->quote_age; return new_argv; } |