/* GNU m4 -- A simple macro processor
Copyright (C) 1989-1994, 2002, 2004, 2006-2010, 2013-2014, 2017 Free
Software Foundation, Inc.
This file is part of GNU M4.
GNU M4 is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
GNU M4 is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include
#include "m4private.h"
/* Define this to see runtime debug info. Implied by DEBUG. */
/*#define DEBUG_SYNTAX */
/* THE SYNTAX TABLE
The input is read character by character and grouped together
according to a syntax table. The character groups are (definitions
are all in m4module.h, those marked with a * are not yet in use):
Basic (all characters fall in one of these mutually exclusive bins)
M4_SYNTAX_IGNORE *Character to be deleted from input as if not present
M4_SYNTAX_OTHER Any character with no special meaning to m4
M4_SYNTAX_SPACE Whitespace (ignored when leading macro arguments)
M4_SYNTAX_OPEN Open list of macro arguments
M4_SYNTAX_CLOSE Close list of macro arguments
M4_SYNTAX_COMMA Separates macro arguments
M4_SYNTAX_ACTIVE This character is a macro name by itself
M4_SYNTAX_ESCAPE Use this character to prefix all macro names
M4_SYNTAX_ALPHA Alphabetic characters (can start macro names)
M4_SYNTAX_NUM Numeric characters (can form macro names)
M4_SYNTAX_LQUOTE A single character left quote
M4_SYNTAX_BCOMM A single character begin comment delimiter
Attribute (these are context sensitive, and exist in addition to basic)
M4_SYNTAX_RQUOTE A single character right quote
M4_SYNTAX_ECOMM A single character end comment delimiter
M4_SYNTAX_DOLLAR Indicates macro argument in user macros
M4_SYNTAX_LBRACE *Indicates start of extended macro argument
M4_SYNTAX_RBRACE *Indicates end of extended macro argument
Besides adding new facilities, the use of a syntax table will reduce
the number of calls to next_token (). Now groups of OTHER, NUM and
SPACE characters can be returned as a single token, since next_token
() knows they have no special syntactical meaning to m4. This is,
however, only possible if only single character quotes and
comments are used, because otherwise the quote and comment characters
will not show up in the syntax-table.
Having a syntax table allows new facilities. The new builtin
"changesyntax" allows the user to change the category of any
character.
By default, '\n' is both ECOMM and SPACE, depending on the context.
Hence we have basic categories (mutually exclusive, can introduce a
context, and can be empty sets), and attribute categories
(additive, only recognized in context, and will never be empty).
The precedence as implemented by next_token () is:
M4_SYNTAX_IGNORE *Filtered out below next_token ()
M4_SYNTAX_ESCAPE Reads macro name iff set, else next character
M4_SYNTAX_ALPHA Reads M4_SYNTAX_ALPHA and M4_SYNTAX_NUM as macro name
M4_SYNTAX_LQUOTE Reads all until balanced M4_SYNTAX_RQUOTE
M4_SYNTAX_BCOMM Reads all until M4_SYNTAX_ECOMM
M4_SYNTAX_OTHER } Reads all M4_SYNTAX_OTHER, M4_SYNTAX_NUM
M4_SYNTAX_NUM }
M4_SYNTAX_SPACE Reads all M4_SYNTAX_SPACE, depending on buffering
M4_SYNTAX_ACTIVE Returns a single char as a macro name
M4_SYNTAX_OPEN } Returned as a single char
M4_SYNTAX_CLOSE }
M4_SYNTAX_COMMA }
M4_SYNTAX_RQUOTE and M4_SYNTAX_ECOMM are context-sensitive, and
close out M4_SYNTAX_LQUOTE and M4_SYNTAX_BCOMM, respectively.
Also, M4_SYNTAX_DOLLAR, M4_SYNTAX_LBRACE, and M4_SYNTAX_RBRACE are
context-sensitive, only mattering when expanding macro definitions.
There are several optimizations that can be performed depending on
known states of the syntax table. For example, when searching for
quotes, if there is only a single start quote and end quote
delimiter, we can use memchr2 and search a word at a time, instead
of performing a table lookup a byte at a time. The is_single_*
flags track whether quotes and comments have a single delimiter
(always the case if changequote/changecom were used, and
potentially the case after changesyntax). Since we frequently need
to access quotes, we store the oldest valid quote outside the
lookup table; the suspect flag tracks whether a cleanup pass is
needed to restore our invariants. On the other hand, coalescing
multiple M4_SYNTAX_OTHER bytes could form a delimiter, so many
optimizations must be disabled if a multi-byte delimiter exists;
this is handled by m4__safe_quotes. Meanwhile, quotes and comments
can be disabled if the leading delimiter is length 0. */
static int add_syntax_attribute (m4_syntax_table *, char, int);
static int remove_syntax_attribute (m4_syntax_table *, char, int);
static void set_quote_age (m4_syntax_table *, bool, bool);
/* Allocate a zero-filled syntax table, fill in its default (orig)
   category for every byte value, make the current table match those
   defaults, and point the cached single-character quote pair at its
   backing storage.  Return the new table.  */
m4_syntax_table *
m4_syntax_create (void)
{
  m4_syntax_table *table = (m4_syntax_table *) xzalloc (sizeof *table);
  int byte;

  /* Populate the default table.  It never changes during operation,
     and carries no context attributes.  */
  for (byte = UCHAR_MAX; byte >= 0; byte--)
    {
      if (byte == '(')
        table->orig[byte] = M4_SYNTAX_OPEN;
      else if (byte == ')')
        table->orig[byte] = M4_SYNTAX_CLOSE;
      else if (byte == ',')
        table->orig[byte] = M4_SYNTAX_COMMA;
      else if (byte == '`')
        table->orig[byte] = M4_SYNTAX_LQUOTE;
      else if (byte == '#')
        table->orig[byte] = M4_SYNTAX_BCOMM;
      else if (isspace (byte))
        table->orig[byte] = M4_SYNTAX_SPACE;
      else if (isalpha (byte) || byte == '_')
        table->orig[byte] = M4_SYNTAX_ALPHA;
      else if (isdigit (byte))
        table->orig[byte] = M4_SYNTAX_NUM;
      else
        table->orig[byte] = M4_SYNTAX_OTHER;
    }

  /* Bring the live table in line with the defaults just computed.  */
  m4_reset_syntax (table);

  /* The cached simple pair always describes one-byte delimiters.  */
  table->cached_simple.len1 = 1;
  table->cached_simple.len2 = 1;
  table->cached_simple.str1 = table->cached_lquote;
  table->cached_simple.str2 = table->cached_rquote;
  return table;
}
/* Release SYNTAX, which must not be NULL, together with the four
   delimiter strings it owns.  */
void
m4_syntax_delete (m4_syntax_table *syntax)
{
  assert (syntax);

  /* Comment delimiters.  */
  free (syntax->comm.str2);
  free (syntax->comm.str1);

  /* Quote delimiters.  */
  free (syntax->quote.str2);
  free (syntax->quote.str1);

  /* Finally the table itself.  */
  free (syntax);
}
/* Translate the single-character category key CH into the matching
   M4_SYNTAX_* code.  Letter keys are accepted in either case.  Return
   -1 when CH does not name a category.  */
int
m4_syntax_code (char ch)
{
  switch (ch)
    {
      /* Sorted according to the order of M4_SYNTAX_* in m4module.h.  */

      /* FIXME - revisit the ignore syntax attribute.  */
    case 'I':
    case 'i':
      return M4_SYNTAX_IGNORE;

      /* Basic categories.  */
    case '@':
      return M4_SYNTAX_ESCAPE;
    case 'W':
    case 'w':
      return M4_SYNTAX_ALPHA;
    case 'L':
    case 'l':
      return M4_SYNTAX_LQUOTE;
    case 'B':
    case 'b':
      return M4_SYNTAX_BCOMM;
    case 'A':
    case 'a':
      return M4_SYNTAX_ACTIVE;
    case 'D':
    case 'd':
      return M4_SYNTAX_NUM;
    case 'S':
    case 's':
      return M4_SYNTAX_SPACE;
    case '(':
      return M4_SYNTAX_OPEN;
    case ')':
      return M4_SYNTAX_CLOSE;
    case ',':
      return M4_SYNTAX_COMMA;
    case 'O':
    case 'o':
      return M4_SYNTAX_OTHER;

      /* Context categories.  */
    case '$':
      return M4_SYNTAX_DOLLAR;
    case '{':
      return M4_SYNTAX_LBRACE;
    case '}':
      return M4_SYNTAX_RBRACE;
    case 'R':
    case 'r':
      return M4_SYNTAX_RQUOTE;
    case 'E':
    case 'e':
      return M4_SYNTAX_ECOMM;

    default:
      break;
    }

  /* Unknown key.  */
  return -1;
}
/* Functions to manipulate the syntax table. */
/* Give byte CH the syntax CODE in SYNTAX and return the resulting
   table entry.  When CODE lies within M4_SYNTAX_MASKS it is OR-ed
   into the entry; otherwise CODE replaces everything in the entry
   except the M4_SYNTAX_MASKS bits.  The suspect flag is raised
   whenever an attribute is added, or when either the old or the new
   basic category is one of M4_SYNTAX_SUSPECT.  */
static int
add_syntax_attribute (m4_syntax_table *syntax, char ch, int code)
{
  int idx = to_uchar (ch);

  if ((code & M4_SYNTAX_MASKS) == 0)
    {
      /* Basic category: swap it in, preserving the attribute bits.  */
      if (m4_has_syntax (syntax, idx, M4_SYNTAX_SUSPECT)
          || (code & (M4_SYNTAX_SUSPECT)) != 0)
        syntax->suspect = true;
      syntax->table[idx] = ((syntax->table[idx] & M4_SYNTAX_MASKS) | code);
    }
  else
    {
      /* Attribute: purely additive.  */
      syntax->suspect = true;
      syntax->table[idx] |= code;
    }

#ifdef DEBUG_SYNTAX
  xfprintf(stderr, "Set syntax %o %c = %04X\n", idx, isprint(idx) ? idx : '-',
           syntax->table[idx]);
#endif

  return syntax->table[idx];
}
/* Clear the attribute bits CODE from the entry for byte CH in SYNTAX,
   raise the suspect flag, and return the resulting table entry.  CODE
   must intersect M4_SYNTAX_MASKS.  */
static int
remove_syntax_attribute (m4_syntax_table *syntax, char ch, int code)
{
  int idx = to_uchar (ch);

  assert (code & M4_SYNTAX_MASKS);

  syntax->suspect = true;
  syntax->table[idx] &= ~code;

#ifdef DEBUG_SYNTAX
  xfprintf(stderr, "Unset syntax %o %c = %04X\n", idx, isprint(idx) ? idx : '-',
           syntax->table[idx]);
#endif

  return syntax->table[idx];
}
/* Put each of the LEN bytes in CHARS into syntax category CODE, via
   add_syntax_attribute, visiting them in order.  */
static void
add_syntax_set (m4_syntax_table *syntax, const char *chars, size_t len,
                int code)
{
  size_t i;

  for (i = 0; i < len; i++)
    add_syntax_attribute (syntax, chars[i], code);
}
/* Take each of the LEN bytes in CHARS out of syntax category CODE.
   For an attribute code (one within M4_SYNTAX_MASKS) the attribute
   is simply cleared; for a basic category, bytes currently in that
   category are moved to M4_SYNTAX_OTHER and other bytes are left
   alone.  */
static void
subtract_syntax_set (m4_syntax_table *syntax, const char *chars, size_t len,
                     int code)
{
  const bool is_attribute = (code & M4_SYNTAX_MASKS) != 0;
  size_t i;

  for (i = 0; i < len; i++)
    {
      char ch = chars[i];

      if (is_attribute)
        remove_syntax_attribute (syntax, ch, code);
      else if (m4_has_syntax (syntax, ch, code))
        add_syntax_attribute (syntax, ch, M4_SYNTAX_OTHER);
    }
}
/* Make syntax category CODE consist of exactly the LEN bytes in
   CHARS.  First every byte value is evicted from CODE (attributes are
   cleared; bytes in a basic category go to M4_SYNTAX_OTHER), then the
   bytes of CHARS are installed in order.  */
static void
set_syntax_set (m4_syntax_table *syntax, const char *chars, size_t len,
                int code)
{
  const bool is_attribute = (code & M4_SYNTAX_MASKS) != 0;
  size_t i;
  int ch;

  /* Pass 1: empty the category.  */
  for (ch = UCHAR_MAX; ch >= 0; ch--)
    {
      if (is_attribute)
        remove_syntax_attribute (syntax, ch, code);
      else if (m4_has_syntax (syntax, ch, code))
        add_syntax_attribute (syntax, ch, M4_SYNTAX_OTHER);
    }

  /* Pass 2: install the explicit members.  */
  for (i = 0; i < len; i++)
    add_syntax_attribute (syntax, chars[i], code);
}
/* Restore syntax category CODE to its default membership.  For the
   context attributes RQUOTE, ECOMM, DOLLAR, LBRACE and RBRACE the
   default is a single fixed byte: that byte gains the attribute and
   every other byte loses it.  For any other code, each byte whose
   default category is CODE, or which currently has CODE, is returned
   to its default category from SYNTAX->orig.  */
static void
reset_syntax_set (m4_syntax_table *syntax, int code)
{
  bool single_home = true;
  int home = 0;
  int ch;

  /* Work out the lone default byte for the context attributes.  */
  if (code == M4_SYNTAX_RQUOTE)
    home = '\'';
  else if (code == M4_SYNTAX_ECOMM)
    home = '\n';
  else if (code == M4_SYNTAX_DOLLAR)
    home = '$';
  else if (code == M4_SYNTAX_LBRACE)
    home = '{';
  else if (code == M4_SYNTAX_RBRACE)
    home = '}';
  else
    single_home = false;

  for (ch = UCHAR_MAX; ch >= 0; ch--)
    {
      if (single_home)
        {
          if (ch == home)
            add_syntax_attribute (syntax, ch, code);
          else
            remove_syntax_attribute (syntax, ch, code);
        }
      else if (syntax->orig[ch] == code || m4_has_syntax (syntax, ch, code))
        add_syntax_attribute (syntax, ch, syntax->orig[ch]);
    }
}
/* Put SYNTAX back into its default state: the live table is copied
   from the default table, the quote and comment delimiters become the
   one-byte defaults, the context attributes are re-applied to their
   default bytes, and the quote age is recomputed as a reset.  */
void
m4_reset_syntax (m4_syntax_table *syntax)
{
  /* Discard whatever delimiters were in force.  */
  free (syntax->comm.str2);
  free (syntax->comm.str1);
  free (syntax->quote.str2);
  free (syntax->quote.str1);

  /* Install the default delimiters.  The use of xmemdup0 is exploited
     by input.c.  */
  syntax->quote.len1 = 1;
  syntax->quote.str1 = xmemdup0 (DEF_LQUOTE, 1);
  syntax->quote.len2 = 1;
  syntax->quote.str2 = xmemdup0 (DEF_RQUOTE, 1);
  syntax->comm.len1 = 1;
  syntax->comm.str1 = xmemdup0 (DEF_BCOMM, 1);
  syntax->comm.len2 = 1;
  syntax->comm.str2 = xmemdup0 (DEF_ECOMM, 1);
  syntax->dollar = '$';

  /* Restore the default syntax, which has known quote and comment
     properties, then layer the context attributes on top of it.  */
  memcpy (syntax->table, syntax->orig, sizeof syntax->orig);
  add_syntax_attribute (syntax, syntax->quote.str2[0], M4_SYNTAX_RQUOTE);
  add_syntax_attribute (syntax, syntax->comm.str2[0], M4_SYNTAX_ECOMM);
  add_syntax_attribute (syntax, '$', M4_SYNTAX_DOLLAR);
  add_syntax_attribute (syntax, '{', M4_SYNTAX_LBRACE);
  add_syntax_attribute (syntax, '}', M4_SYNTAX_RBRACE);

  /* Every default delimiter is a single byte, and nothing is an
     escape.  */
  syntax->is_macro_escaped = false;
  syntax->is_single_dollar = true;
  syntax->is_single_comments = true;
  syntax->is_single_quotes = true;

  set_quote_age (syntax, true, false);
}
/* Alter the syntax for category KEY, according to ACTION: '+' to add,
   '-' to subtract, '=' to set, or '\0' to reset.  The array CHARS of
   length LEN describes the characters to modify; it is ignored if
   ACTION is '\0' (although CHARS must still be non-NULL and LEN must
   be 0 in that case, as both are asserted).  Return -1 if KEY is
   invalid, otherwise return the syntax category matching KEY.

   If the table helpers flag the change as suspect, a cleanup pass
   re-derives the stored quote/comment delimiter strings, the dollar
   byte, and the is_single_* / is_macro_escaped flags from the table.
   The quote age is then advanced and the quote cache dropped, for
   every valid KEY.  */
int
m4_set_syntax (m4_syntax_table *syntax, char key, char action,
               const char *chars, size_t len)
{
  int code;
  assert (syntax && chars);
  code = m4_syntax_code (key);
  if (code < 0)
    {
      /* Unknown KEY: leave SYNTAX completely untouched.  */
      return -1;
    }
  /* Clear the flag so that, after the switch, it tells us whether
     add_syntax_attribute/remove_syntax_attribute flagged this
     particular change.  */
  syntax->suspect = false;
  switch (action)
    {
    case '+':
      add_syntax_set (syntax, chars, len, code);
      break;
    case '-':
      subtract_syntax_set (syntax, chars, len, code);
      break;
    case '=':
      set_syntax_set (syntax, chars, len, code);
      break;
    case '\0':
      assert (!len);
      reset_syntax_set (syntax, code);
      break;
    default:
      assert (false);
    }
  /* Check for any cleanup needed. */
  if (syntax->suspect)
    {
      int ch;
      /* Chosen single-byte delimiter for each role, or -1 if no byte
         currently has that category.  */
      int lquote = -1;
      int rquote = -1;
      int bcomm = -1;
      int ecomm = -1;
      /* Stay true only while at most one distinct byte per role is
         seen.  */
      bool single_quote_possible = true;
      bool single_comm_possible = true;
      int dollar = -1;
      /* Give priority to the delimiters already stored: if the first
         byte of a stored delimiter still carries the matching
         category in the table, it is the preferred candidate.  */
      if (m4_has_syntax (syntax, syntax->quote.str1[0], M4_SYNTAX_LQUOTE))
        {
          assert (syntax->quote.len1 == 1);
          lquote = to_uchar (syntax->quote.str1[0]);
        }
      if (m4_has_syntax (syntax, syntax->quote.str2[0], M4_SYNTAX_RQUOTE))
        {
          assert (syntax->quote.len2 == 1);
          rquote = to_uchar (syntax->quote.str2[0]);
        }
      if (m4_has_syntax (syntax, syntax->comm.str1[0], M4_SYNTAX_BCOMM))
        {
          assert (syntax->comm.len1 == 1);
          bcomm = to_uchar (syntax->comm.str1[0]);
        }
      if (m4_has_syntax (syntax, syntax->comm.str2[0], M4_SYNTAX_ECOMM))
        {
          assert (syntax->comm.len2 == 1);
          ecomm = to_uchar (syntax->comm.str2[0]);
        }
      /* These two flags are recomputed from scratch by the scan.  */
      syntax->is_single_dollar = false;
      syntax->is_macro_escaped = false;
      /* Find candidates for each category. */
      /* The scan runs from UCHAR_MAX down to 0, so when no stored
         delimiter was preferred above, the highest-valued byte in a
         category becomes the candidate.  Any further, different byte
         in the same role rules out the single-delimiter case.  */
      for (ch = UCHAR_MAX + 1; --ch >= 0; )
        {
          if (m4_has_syntax (syntax, ch, M4_SYNTAX_LQUOTE))
            {
              if (lquote == -1)
                lquote = ch;
              else if (lquote != ch)
                single_quote_possible = false;
            }
          if (m4_has_syntax (syntax, ch, M4_SYNTAX_RQUOTE))
            {
              if (rquote == -1)
                rquote = ch;
              else if (rquote != ch)
                single_quote_possible = false;
            }
          if (m4_has_syntax (syntax, ch, M4_SYNTAX_BCOMM))
            {
              if (bcomm == -1)
                bcomm = ch;
              else if (bcomm != ch)
                single_comm_possible = false;
            }
          if (m4_has_syntax (syntax, ch, M4_SYNTAX_ECOMM))
            {
              if (ecomm == -1)
                ecomm = ch;
              else if (ecomm != ch)
                single_comm_possible = false;
            }
          if (m4_has_syntax (syntax, ch, M4_SYNTAX_DOLLAR))
            {
              /* The first DOLLAR byte met is recorded; meeting a
                 second one clears is_single_dollar again.  */
              if (dollar == -1)
                {
                  syntax->dollar = dollar = ch;
                  syntax->is_single_dollar = true;
                }
              else
                syntax->is_single_dollar = false;
            }
          /* Any byte in the ESCAPE category turns on escaped-macro
             mode.  */
          if (m4_has_syntax (syntax, ch, M4_SYNTAX_ESCAPE))
            syntax->is_macro_escaped = true;
        }
      /* Disable multi-character delimiters if we discovered
         delimiters. */
      if (!single_quote_possible)
        syntax->is_single_quotes = false;
      if (!single_comm_possible)
        syntax->is_single_comments = false;
      /* A multi-byte quote delimiter is stored, yet either the
         single-quote flag is off or some byte carries a quote
         category.  Cut each non-empty stored delimiter down to one
         byte when its first byte is the chosen candidate, or to zero
         bytes otherwise, and move the terminating NUL to match.  */
      if ((1 < syntax->quote.len1 || 1 < syntax->quote.len2)
          && (!syntax->is_single_quotes || lquote != -1 || rquote != -1))
        {
          if (syntax->quote.len1)
            {
              syntax->quote.len1 = lquote == to_uchar (syntax->quote.str1[0]);
              syntax->quote.str1[syntax->quote.len1] = '\0';
            }
          if (syntax->quote.len2)
            {
              syntax->quote.len2 = rquote == to_uchar (syntax->quote.str2[0]);
              syntax->quote.str2[syntax->quote.len2] = '\0';
            }
        }
      /* Same truncation, applied to the comment delimiters.  */
      if ((1 < syntax->comm.len1 || 1 < syntax->comm.len2)
          && (!syntax->is_single_comments || bcomm != -1 || ecomm != -1))
        {
          if (syntax->comm.len1)
            {
              syntax->comm.len1 = bcomm == to_uchar (syntax->comm.str1[0]);
              syntax->comm.str1[syntax->comm.len1] = '\0';
            }
          if (syntax->comm.len2)
            {
              syntax->comm.len2 = ecomm == to_uchar (syntax->comm.str2[0]);
              syntax->comm.str2[syntax->comm.len2] = '\0';
            }
        }
      /* Update the strings. */
      /* When some byte is a left quote, the stored quote pair becomes
         the one-byte strings LQUOTE and RQUOTE.  An empty stored
         string is replaced by a fresh two-byte buffer; a non-empty
         one is overwritten in place.  */
      if (lquote != -1)
        {
          if (single_quote_possible)
            syntax->is_single_quotes = true;
          if (syntax->quote.len1)
            assert (syntax->quote.len1 == 1);
          else
            {
              free (syntax->quote.str1);
              syntax->quote.str1 = xcharalloc (2);
              syntax->quote.str1[1] = '\0';
              syntax->quote.len1 = 1;
            }
          syntax->quote.str1[0] = lquote;
          /* A left quote with no right quote anywhere in the table:
             fall back to the apostrophe and mark it RQUOTE.  */
          if (rquote == -1)
            {
              rquote = '\'';
              add_syntax_attribute (syntax, rquote, M4_SYNTAX_RQUOTE);
            }
          if (!syntax->quote.len2)
            {
              free (syntax->quote.str2);
              syntax->quote.str2 = xcharalloc (2);
            }
          syntax->quote.str2[0] = rquote;
          syntax->quote.str2[1] = '\0';
          syntax->quote.len2 = 1;
        }
      /* Likewise for comments; a begin-comment byte with no
         end-comment byte falls back to newline as the terminator.  */
      if (bcomm != -1)
        {
          if (single_comm_possible)
            syntax->is_single_comments = true;
          if (syntax->comm.len1)
            assert (syntax->comm.len1 == 1);
          else
            {
              free (syntax->comm.str1);
              syntax->comm.str1 = xcharalloc (2);
              syntax->comm.str1[1] = '\0';
              syntax->comm.len1 = 1;
            }
          syntax->comm.str1[0] = bcomm;
          if (ecomm == -1)
            {
              ecomm = '\n';
              add_syntax_attribute (syntax, ecomm, M4_SYNTAX_ECOMM);
            }
          if (!syntax->comm.len2)
            {
              free (syntax->comm.str2);
              syntax->comm.str2 = xcharalloc (2);
            }
          syntax->comm.str2[0] = ecomm;
          syntax->comm.str2[1] = '\0';
          syntax->comm.len2 = 1;
        }
    }
  /* Done whether or not the cleanup pass ran: record that arbitrary
     syntax changed.  NOTE(review): m4__quote_uncache is defined
     outside this file; presumably it invalidates the quote pair cached
     by m4__quote_cache -- confirm.  */
  set_quote_age (syntax, false, true);
  m4__quote_uncache (syntax);
  return code;
}
/* Functions for setting quotes and comment delimiters. Used by
m4_changecom () and m4_changequote (). Both functions override the
syntax table to maintain compatibility. */
/* Install LQ (length LQ_LEN) and RQ (length RQ_LEN) as the quote
   delimiters of SYNTAX.  A NULL pointer means the argument was
   absent, which is distinct from an explicit empty string.  Nothing
   happens when the requested pair equals the current one; otherwise
   the stored strings, the LQUOTE/RQUOTE table entries and the quote
   age are all brought up to date.  */
void
m4_set_quotes (m4_syntax_table *syntax, const char *lq, size_t lq_len,
               const char *rq, size_t rq_len)
{
  int ch;

  assert (syntax);

  /* POSIX states that with 0 arguments, the default quotes are used.
     POSIX XCU ERN 112 states that behavior is implementation-defined
     if there was only one argument, or if there is an empty string in
     either position when there are two arguments.  We allow an empty
     left quote to disable quoting, but a non-empty left quote will
     always create a non-empty right quote.  See the texinfo for what
     some other implementations do.  */
  if (lq == NULL)
    {
      lq = DEF_LQUOTE;
      rq = DEF_RQUOTE;
      lq_len = rq_len = 1;
    }
  else if (rq == NULL || (lq_len != 0 && rq_len == 0))
    {
      rq = DEF_RQUOTE;
      rq_len = 1;
    }

  /* Short-circuit when the delimiters would not actually change.  */
  if (lq_len == syntax->quote.len1 && rq_len == syntax->quote.len2
      && memcmp (syntax->quote.str1, lq, lq_len) == 0
      && memcmp (syntax->quote.str2, rq, rq_len) == 0)
    return;

  /* Swap in copies of the new strings.  The use of xmemdup0 is
     exploited by input.c.  */
  free (syntax->quote.str1);
  free (syntax->quote.str2);
  syntax->quote.str1 = xmemdup0 (lq, lq_len);
  syntax->quote.len1 = lq_len;
  syntax->quote.str2 = xmemdup0 (rq, rq_len);
  syntax->quote.len2 = rq_len;

  /* changequote overrides syntax_table, but be careful when it is
     used to select a start-quote sequence that is effectively
     disabled.  Start by stripping every existing quote byte: a left
     quote returns to its default category (or OTHER when that default
     is itself LQUOTE), and a right quote loses the attribute.  */
  syntax->is_single_quotes = true;
  for (ch = UCHAR_MAX; ch >= 0; ch--)
    {
      if (m4_has_syntax (syntax, ch, M4_SYNTAX_LQUOTE))
        {
          int fallback = syntax->orig[ch];

          if (fallback == M4_SYNTAX_LQUOTE)
            fallback = M4_SYNTAX_OTHER;
          add_syntax_attribute (syntax, ch, fallback);
        }
      if (m4_has_syntax (syntax, ch, M4_SYNTAX_RQUOTE))
        remove_syntax_attribute (syntax, ch, M4_SYNTAX_RQUOTE);
    }

  /* Register one-byte delimiters in the table, unless the first byte
     of the left quote is in a category that takes precedence over
     quoting.  */
  if (!m4_has_syntax (syntax, *syntax->quote.str1,
                      (M4_SYNTAX_IGNORE | M4_SYNTAX_ESCAPE | M4_SYNTAX_ALPHA
                       | M4_SYNTAX_NUM)))
    {
      if (syntax->quote.len1 == 1)
        add_syntax_attribute (syntax, syntax->quote.str1[0],
                              M4_SYNTAX_LQUOTE);
      if (syntax->quote.len2 == 1)
        add_syntax_attribute (syntax, syntax->quote.str2[0],
                              M4_SYNTAX_RQUOTE);
    }

  set_quote_age (syntax, false, false);
}
/* Install BC (length BC_LEN) and EC (length EC_LEN) as the comment
   delimiters of SYNTAX.  A NULL pointer means the argument was
   absent, which is distinct from an explicit empty string.  Nothing
   happens when the requested pair equals the current one; otherwise
   the stored strings, the BCOMM/ECOMM table entries and the quote age
   are all brought up to date.  */
void
m4_set_comment (m4_syntax_table *syntax, const char *bc, size_t bc_len,
                const char *ec, size_t ec_len)
{
  int ch;

  assert (syntax);

  /* POSIX requires no arguments to disable comments, and that one
     argument use newline as the close-comment.  POSIX XCU ERN 131
     states that empty arguments invoke implementation-defined
     behavior.  We allow an empty begin comment to disable comments,
     and a non-empty begin comment will always create a non-empty end
     comment.  See the texinfo for what some other implementations
     do.  */
  if (bc == NULL)
    {
      bc = ec = "";
      bc_len = ec_len = 0;
    }
  else if (ec == NULL || (bc_len != 0 && ec_len == 0))
    {
      ec = DEF_ECOMM;
      ec_len = 1;
    }

  /* Short-circuit when the delimiters would not actually change.  */
  if (bc_len == syntax->comm.len1 && ec_len == syntax->comm.len2
      && memcmp (syntax->comm.str1, bc, bc_len) == 0
      && memcmp (syntax->comm.str2, ec, ec_len) == 0)
    return;

  /* Swap in copies of the new strings.  The use of xmemdup0 is
     exploited by input.c.  */
  free (syntax->comm.str1);
  free (syntax->comm.str2);
  syntax->comm.str1 = xmemdup0 (bc, bc_len);
  syntax->comm.len1 = bc_len;
  syntax->comm.str2 = xmemdup0 (ec, ec_len);
  syntax->comm.len2 = ec_len;

  /* changecom overrides syntax_table, but be careful when it is used
     to select a start-comment sequence that is effectively disabled.
     Start by stripping every existing comment byte: a begin-comment
     returns to its default category (or OTHER when that default is
     itself BCOMM), and an end-comment loses the attribute.  */
  syntax->is_single_comments = true;
  for (ch = UCHAR_MAX; ch >= 0; ch--)
    {
      if (m4_has_syntax (syntax, ch, M4_SYNTAX_BCOMM))
        {
          int fallback = syntax->orig[ch];

          if (fallback == M4_SYNTAX_BCOMM)
            fallback = M4_SYNTAX_OTHER;
          add_syntax_attribute (syntax, ch, fallback);
        }
      if (m4_has_syntax (syntax, ch, M4_SYNTAX_ECOMM))
        remove_syntax_attribute (syntax, ch, M4_SYNTAX_ECOMM);
    }

  /* Register one-byte delimiters in the table, unless the first byte
     of the begin-comment is in a category that takes precedence over
     comments.  */
  if (!m4_has_syntax (syntax, *syntax->comm.str1,
                      (M4_SYNTAX_IGNORE | M4_SYNTAX_ESCAPE | M4_SYNTAX_ALPHA
                       | M4_SYNTAX_NUM | M4_SYNTAX_LQUOTE)))
    {
      if (syntax->comm.len1 == 1)
        add_syntax_attribute (syntax, syntax->comm.str1[0],
                              M4_SYNTAX_BCOMM);
      if (syntax->comm.len2 == 1)
        add_syntax_attribute (syntax, syntax->comm.str2[0],
                              M4_SYNTAX_ECOMM);
    }

  set_quote_age (syntax, false, false);
}
/* Call this when changing anything that might impact the quote age,
   so that m4__quote_age and m4__safe_quotes will reflect the change.
   If RESET, changesyntax was reset to its default state; if CHANGE,
   arbitrary syntax has changed; otherwise, just quotes or comment
   delimiters have changed.

   On return, SYNTAX->quote_age is 0 when the current settings are
   considered unsafe.  Otherwise it is the syntax age in the upper 16
   bits, the left quote byte in bits 8-15 and the right quote byte in
   bits 0-7.  SYNTAX->syntax_age is incremented only when CHANGE is
   set, RESET is not, and it has not yet saturated at 0xffff.  */
static void
set_quote_age (m4_syntax_table *syntax, bool reset, bool change)
{
  /* Multi-character quotes are inherently unsafe, since concatenation
     of individual characters can result in a quote delimiter,
     consider:

     define(echo,``$1'')define(a,A)changequote(<[,]>)echo(<[]]><[>a]>)
     => A]> (not ]>a)

     Also, unquoted close delimiters are unsafe, consider:

     define(echo,``$1'')define(a,A)echo(`a''`a')
     => aA' (not a'a)

     Duplicated start and end quote delimiters, as well as comment
     delimiters that overlap with quote delimiters or active characters,
     also present a problem, consider:

     define(echo,$*)echo(a,a,a`'define(a,A)changecom(`,',`,'))
     => A,a,A (not A,A,A)

     The impact of arbitrary changesyntax is difficult to characterize.
     So if things are in their default state, we use 0 for the upper 16
     bits of quote_age; otherwise we increment syntax_age for each
     changesyntax, but saturate it at 0xffff rather than wrapping
     around.  Perhaps a cache of other frequently used states is
     warranted, if changesyntax becomes more popular.

     Perhaps someday we will fix $@ expansion to use the current
     settings of the comma category, or even allow multi-character
     argument separators via changesyntax.  Until then, we use a literal
     `,' in $@ expansion, therefore we must insist that `,' be an
     argument separator for quote_age to be non-zero.

     Rather than check every token for an unquoted delimiter, we merely
     encode current_quote_age to 0 when things are unsafe, and non-zero
     when safe (namely, the syntax_age in the upper 16 bits, coupled
     with the 16-bit value composed of the single-character start and
     end quote delimiters).  There may be other situations which are
     safe even when this algorithm sets the quote_age to zero, but at
     least a quote_age of zero always produces correct results (although
     it may take more time in doing so).  */

  unsigned short local_syntax_age;

  if (reset)
    local_syntax_age = 0;
  else if (change && syntax->syntax_age < 0xffff)
    local_syntax_age = ++syntax->syntax_age;
  else
    local_syntax_age = syntax->syntax_age;

  if (local_syntax_age < 0xffff && syntax->is_single_quotes
      && syntax->quote.len1 == 1 && syntax->quote.len2 == 1
      && !m4_has_syntax (syntax, *syntax->quote.str1,
                         (M4_SYNTAX_ALPHA | M4_SYNTAX_NUM | M4_SYNTAX_OPEN
                          | M4_SYNTAX_COMMA | M4_SYNTAX_CLOSE
                          | M4_SYNTAX_SPACE))
      && !m4_has_syntax (syntax, *syntax->quote.str2,
                         (M4_SYNTAX_ALPHA | M4_SYNTAX_NUM | M4_SYNTAX_OPEN
                          | M4_SYNTAX_COMMA | M4_SYNTAX_CLOSE
                          | M4_SYNTAX_SPACE))
      && *syntax->quote.str1 != *syntax->quote.str2
      && (!syntax->comm.len1
          || (*syntax->comm.str1 != *syntax->quote.str2
              && !m4_has_syntax (syntax, *syntax->comm.str1,
                                 (M4_SYNTAX_OPEN | M4_SYNTAX_COMMA
                                  | M4_SYNTAX_CLOSE))))
      && m4_has_syntax (syntax, ',', M4_SYNTAX_COMMA))
    {
      /* Shift in unsigned arithmetic.  An unsigned short promotes to
         (signed) int, so once the age reaches 0x8000 a plain
         `local_syntax_age << 16' would overflow a 32-bit int, which
         is undefined behavior.  */
      syntax->quote_age = (((unsigned int) local_syntax_age << 16)
                           | ((*syntax->quote.str1 & 0xff) << 8)
                           | (*syntax->quote.str2 & 0xff));
    }
  else
    syntax->quote_age = 0;
}
/* Interface for caching frequently used quote pairs, independently of
   the current quote delimiters (for example, consider a text macro
   expansion that includes several copies of $@), and using AGE for
   optimization.  If QUOTES is NULL, don't use quoting.  If OBS is
   non-NULL, AGE should be the current quote age, and QUOTES should be
   m4_get_syntax_quotes; the return value will be a cached quote pair,
   where the pointer is valid at least as long as OBS is not reset,
   but whose contents are only guaranteed until the next changequote
   or quote_cache.  Otherwise, OBS is NULL, AGE should be the same as
   before, and QUOTES should be a previously returned cache value;
   used to refresh the contents of the result.  */
const m4_string_pair *
m4__quote_cache (m4_syntax_table *syntax, m4_obstack *obs, unsigned int age,
                 const m4_string_pair *quotes)
{
  m4_string_pair *copy;

  /* No quoting requested.  */
  if (quotes == NULL)
    return NULL;

  /* A non-zero AGE encodes both single-byte delimiters (see
     set_quote_age), so the pair can be rebuilt on the fly in the
     table's own scratch storage.  The contents must be used
     immediately.  */
  if (age != 0)
    {
      syntax->cached_lquote[0] = (age >> 8) & 0xff;
      syntax->cached_rquote[0] = age & 0xff;
      return &syntax->cached_simple;
    }

  /* Refresh request: with AGE zero the earlier value is still good.  */
  if (obs == NULL)
    return quotes;

  /* AGE is zero, so QUOTES must be copied onto OBS; but keep that copy
     around for subsequent callers.  */
  assert (quotes == &syntax->quote);
  if (syntax->cached_quote == NULL)
    {
      assert (obstack_object_size (obs) == 0);
      copy = (m4_string_pair *) obstack_copy (obs, quotes, sizeof *quotes);
      copy->str1 = (char *) obstack_copy0 (obs, quotes->str1, quotes->len1);
      copy->str2 = (char *) obstack_copy0 (obs, quotes->str2, quotes->len2);
      syntax->cached_quote = copy;
    }
  return syntax->cached_quote;
}
/* Define these functions at the end, so that calls in the file use the
faster macro version from m4module.h. */
/* Out-of-line counterpart of the macro of the same name: return the
   left quote delimiter string stored in SYNTAX (non-NULL).  */
#undef m4_get_syntax_lquote
const char *
m4_get_syntax_lquote (m4_syntax_table *syntax)
{
  const m4_string_pair *quotes;

  assert (syntax);
  quotes = &syntax->quote;
  return quotes->str1;
}
/* Out-of-line counterpart of the macro of the same name: return the
   right quote delimiter string stored in SYNTAX (non-NULL).  */
#undef m4_get_syntax_rquote
const char *
m4_get_syntax_rquote (m4_syntax_table *syntax)
{
  const m4_string_pair *quotes;

  assert (syntax);
  quotes = &syntax->quote;
  return quotes->str2;
}
/* Out-of-line counterpart of the macro of the same name: return a
   pointer to the quote delimiter pair held inside SYNTAX
   (non-NULL).  */
#undef m4_get_syntax_quotes
const m4_string_pair *
m4_get_syntax_quotes (m4_syntax_table *syntax)
{
  const m4_string_pair *quotes;

  assert (syntax);
  quotes = &syntax->quote;
  return quotes;
}
/* Out-of-line counterpart of the macro of the same name: report the
   is_single_quotes flag of SYNTAX (non-NULL).  */
#undef m4_is_syntax_single_quotes
bool
m4_is_syntax_single_quotes (m4_syntax_table *syntax)
{
  bool flag;

  assert (syntax);
  flag = syntax->is_single_quotes;
  return flag;
}
/* Out-of-line counterpart of the macro of the same name: return the
   begin-comment delimiter string stored in SYNTAX (non-NULL).  */
#undef m4_get_syntax_bcomm
const char *
m4_get_syntax_bcomm (m4_syntax_table *syntax)
{
  const m4_string_pair *comments;

  assert (syntax);
  comments = &syntax->comm;
  return comments->str1;
}
/* Out-of-line counterpart of the macro of the same name: return the
   end-comment delimiter string stored in SYNTAX (non-NULL).  */
#undef m4_get_syntax_ecomm
const char *
m4_get_syntax_ecomm (m4_syntax_table *syntax)
{
  const m4_string_pair *comments;

  assert (syntax);
  comments = &syntax->comm;
  return comments->str2;
}
/* Out-of-line counterpart of the macro of the same name: return a
   pointer to the comment delimiter pair held inside SYNTAX
   (non-NULL).  */
#undef m4_get_syntax_comments
const m4_string_pair *
m4_get_syntax_comments (m4_syntax_table *syntax)
{
  const m4_string_pair *comments;

  assert (syntax);
  comments = &syntax->comm;
  return comments;
}
/* Out-of-line counterpart of the macro of the same name: report the
   is_single_comments flag of SYNTAX (non-NULL).  */
#undef m4_is_syntax_single_comments
bool
m4_is_syntax_single_comments (m4_syntax_table *syntax)
{
  bool flag;

  assert (syntax);
  flag = syntax->is_single_comments;
  return flag;
}
/* Out-of-line counterpart of the macro of the same name: report the
   is_single_dollar flag of SYNTAX (non-NULL).  */
#undef m4_is_syntax_single_dollar
bool
m4_is_syntax_single_dollar (m4_syntax_table *syntax)
{
  bool flag;

  assert (syntax);
  flag = syntax->is_single_dollar;
  return flag;
}
/* Out-of-line counterpart of the macro of the same name: report the
   is_macro_escaped flag of SYNTAX (non-NULL).  */
#undef m4_is_syntax_macro_escaped
bool
m4_is_syntax_macro_escaped (m4_syntax_table *syntax)
{
  bool flag;

  assert (syntax);
  flag = syntax->is_macro_escaped;
  return flag;
}