Preparations for refactoring syntax tables to allow reverse lookups [fetch me a M4_SYNTAX_OPEN], without compromising the speed of normal lookups in an array of unsigned short.
author: Gary V. Vaughan <gary@gnu.org> 2002-05-29 18:06:42 +0000
committer: Eric Blake <ebb9@byu.net> 2007-10-05 21:58:22 -0600
commit: de11cb28a57e7d1196a587f24dc83f1b492570b3 (patch)
tree: 7b76cf261009fc6bbcb298ee21c67e7bf1dd2b78
parent: 86b1ec4e3585a53151d88f754a342819d3dc6b06 (diff)
download: m4-de11cb28a57e7d1196a587f24dc83f1b492570b3.tar.gz
7 files changed, 385 insertions, 314 deletions
diff --git a/ChangeLog b/ChangeLog
index 9b552735..0d2ea823 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,27 @@
 2002-05-29  Gary V. Vaughan  <gary@gnu.org>
 
+	Preparations for refactoring syntax tables to allow reverse
+	lookups [fetch me a M4_SYNTAX_OPEN], without compromising the
+	speed of normal lookups in an array of unsigned short.
+
+	* m4/input.c (single_quotes, single_comments, use_macro_escape):
+	Moved from here...
+	* m4/m4private.h (m4__single_quotes, m4__single_comments,
+	m4__use_macro_escape): ...to here, and renamed.  The `m4__' prefix
+	is for internal symbols which unavoidably pollute the global
+	namespace, but are not published APIs.
+	Adjusted all callers.
+	* m4/input.c (m4_syntax_init, m4_syntax_code, m4_set_quotes,
+	m4_set_comment, m4_set_syntax, set_syntax_internal,
+	unset_syntax_attribute): Moved from this file...
+	* m4/syntax.c: New. ...to this file.
+	Also added an m4_syntax_exit stub for orthogonality.
+	* src/main.c (main): Use it.
+	* m4/Makefile.am (libm4_la_SOURCES): Add syntax.c.
+	* m4/m4module.h: Reformatting.  New prototypes.
+
+2002-05-29  Gary V. Vaughan  <gary@gnu.org>
+
 	* bootstrap (aclocal_apiversion): The aclocal apiversion is
 	distinct from the automake release number (in that the apiversion
 	apparently has no micro-version-component).
diff --git a/m4/Makefile.am b/m4/Makefile.am
index 85eb91eb..48656011 100644
--- a/m4/Makefile.am
+++ b/m4/Makefile.am
@@ -38,7 +38,7 @@ EXTRA_DIST = $(EXTRA_HEADERS) obstack.c
 lib_LTLIBRARIES		= libm4.la
 libm4_la_SOURCES 	= builtin.c debug.c error.c hash.c \
 			  input.c ltdl.c macro.c module.c output.c \
-			  path.c regex.c symtab.c utility.c
+			  path.c regex.c symtab.c syntax.c utility.c
 libm4_la_LIBADD		= $(LTLIBOBJS) $(LIBADD_DL)
 
 module.o module.lo: $(srcdir)/module.c pathconf.h
diff --git a/m4/input.c b/m4/input.c
index fd43d823..2f8425ac 100644
--- a/m4/input.c
+++ b/m4/input.c
@@ -26,9 +26,6 @@
 #define DEBUG_INPUT
 #undef DEBUG_INPUT
 
-#define DEBUG_SYNTAX
-#undef DEBUG_SYNTAX
-
 /*
    Unread input can be either files, that should be read (eg. included
    files), strings, which should be rescanned (eg. macro expansion
@@ -67,77 +64,8 @@
    for use by the error handling functions in m4.c.  Whenever a file
    input_block is pushed, the current file name and line number is saved
    in the input_block, and the two variables are reset to match the new
-   input file.
-
-   THE SYNTAX TABLE
-
-   The input is read character by character and grouped together
-   according to a syntax table.  The character groups are (definitions
-   are all in m4.h, those marked with a * are not yet in use):
-
-   M4_SYNTAX_IGNORE	*Character to be deleted from input as if not present
-   M4_SYNTAX_OTHER	Any character with no special meaning to m4
-   M4_SYNTAX_SPACE	Whitespace (ignored when leading macro arguments)
-   M4_SYNTAX_OPEN	Open list of macro arguments
-   M4_SYNTAX_CLOSE	Close list of macro arguments
-   M4_SYNTAX_COMMA	Separates macro arguments
-   M4_SYNTAX_DOLLAR	*Indicates macro argument in user macros
-   M4_SYNTAX_ACTIVE	This caracter is a macro name by itself
-
-   M4_SYNTAX_ESCAPE	Use this character to prefix all macro names
-   M4_SYNTAX_ALPHA	Alphabetic characters (can start macro names)
-   M4_SYNTAX_NUM	Numeric characters
-   M4_SYNTAX_ALNUM	Alphanumeric characters (can form macro names)
-
-   (These are bit masks)
-   M4_SYNTAX_LQUOTE	A single characters left quote
-   M4_SYNTAX_RQUOTE	A single characters right quote
-   M4_SYNTAX_BCOMM	A single characters begin comment delimiter
-   M4_SYNTAX_ECOMM	A single characters end comment delimiter
-
-   Besides adding new facilities, the use of a syntax table will reduce
-   the number of calls to next_token ().  Now groups of OTHER, NUM and
-   SPACE characters can be returned as a single token, since next_token
-   () knows they have no special syntactical meaning to m4.  This is,
-   however, only possible if only single character quotes comments
-   comments are used, because otherwise the quote and comment characters
-   will not show up in the syntax-table.
-
-   Having a syntax table allows new facilities.  The new builtin
-   "changesyntax" allows the the user to change the category of any
-   character.
-
-   Default '\n' is both ECOMM and SPACE, depending on the context.  To
-   solve the problem of quotes and comments that have diffent syntax
-   code based on the context, the [LR]QUOTE and [BE]COMM codes are bit
-   masks to add to an ordinary code.  If a character is made a quote it
-   will be recognised if the basis code does not have precedence.
-
-   When changing quotes and comment delimiters only the bits are
-   removed, and the characters are therefore reverted to its old
-   category code.
-
-   The precedence as implemented by next_token () is:
-
-   M4_SYNTAX_IGNORE	*Filtered out below next_token ()
-   M4_SYNTAX_BCOMM	Reads all until M4_SYNTAX_ECOMM
-   M4_SYNTAX_ESCAPE	Reads macro name iff set, else next
-   M4_SYNTAX_ALPHA	Reads macro name
-   M4_SYNTAX_LQUOTE	Reads all until balanced M4_SYNTAX_RQUOTE
-
-   M4_SYNTAX_OTHER	and M4_SYNTAX_NUM
-			Reads all M4_SYNTAX_OTHER and M4_SYNTAX_NUM
-   M4_SYNTAX_SPACE	Reads all M4_SYNTAX_SPACE
-   M4_SYNTAX_ACTIVE	Returns a single char as a word
-   the rest		Returned as a single char
-
-   M4_SYNTAX_DOLLAR is not currently used.  The character $ is treated as a
-   M4_SYNTAX_OTHER.  It could be done, but it will slow next_token () down
-   a bit.  The $ is not really a part of m4's input syntax in the sense
-   that a string is parsed equally whether there is a $ or not.  The
-   character $ is used by convention in user macros.  */
-
-static	void  check_use_macro_escape	(void);
+   input file.  */
+
 static	int   file_peek			(void);
 static	int   file_read			(void);
 static	void  file_unget		(int ch);
@@ -148,14 +76,12 @@ static	int   macro_read		(void);
 static	int   match_input		(const unsigned char *s);
 static	int   next_char			(void);
 static	void  pop_input			(void);
-static	void  set_syntax_internal	(int code, int ch);
 static	int   single_peek		(void);
 static	int   single_read		(void);
 static	int   string_peek		(void);
 static	int   string_read		(void);
 static	void  string_unget		(int ch);
 static	void  unget_input		(int ch);
-static	void  unset_syntax_attribute	(int code, int ch);
 
 struct input_funcs
 {
@@ -243,22 +169,10 @@ static input_block *next;
 /* Flag for next_char () to increment m4_current_line.  */
 static boolean start_of_input_line;
 
-/* Input syntax table */
-/* unsigned short syntax_table[256];  moved to m4module.c. */
-
 #define CHAR_EOF	256	/* character return on EOF */
 #define CHAR_MACRO	257	/* character return for MACRO token */
 #define CHAR_RETRY	258	/* character return for end of input block */
 
-/* TRUE iff strlen(rquote) == strlen(lquote) == 1 */
-static boolean single_quotes;
-
-/* TRUE iff strlen(bcomm) == strlen(ecomm) == 1 */
-static boolean single_comments;
-
-/* TRUE iff some character has M4_SYNTAX_ESCAPE */
-static boolean use_macro_escape;
-
 
 
 /* push_file () pushes an input file on the input stack, saving the
@@ -760,9 +674,6 @@ match_input (const unsigned char *s)
 
 
 /* Inititialise input stacks, and quote/comment characters.  */
-static void set_syntax_internal	    (int code, int ch);
-static void unset_syntax_attribute  (int code, int ch);
-
 void
 m4_input_init (void)
 {
@@ -788,15 +699,15 @@ m4_input_init (void)
   lquote.length = strlen (lquote.string);
   rquote.string = xstrdup (DEF_RQUOTE);
   rquote.length = strlen (rquote.string);
-  single_quotes = TRUE;
+  m4__single_quotes = TRUE;
 
   bcomm.string = xstrdup (DEF_BCOMM);
   bcomm.length = strlen (bcomm.string);
   ecomm.string = xstrdup (DEF_ECOMM);
   ecomm.length = strlen (ecomm.string);
-  single_comments = TRUE;
+  m4__single_comments = TRUE;
 
-  use_macro_escape = FALSE;
+  m4__use_macro_escape = FALSE;
 }
 
  void
@@ -811,198 +722,8 @@ m4_input_exit (void)
   obstack_free (&token_stack, NULL);
 }
 
-void
-m4_syntax_init (void)
-{
-  int ch;
-
-  for (ch = 256; --ch > 0;)
-    {
-      if (ch == '(')
-	set_syntax_internal (M4_SYNTAX_OPEN, ch);
-      else if (ch == ')')
-	set_syntax_internal (M4_SYNTAX_CLOSE, ch);
-      else if (ch == ',')
-	set_syntax_internal (M4_SYNTAX_COMMA, ch);
-      else if (isspace (ch))
-	set_syntax_internal (M4_SYNTAX_SPACE, ch);
-      else if (isalpha (ch) || ch == '_')
-	set_syntax_internal (M4_SYNTAX_ALPHA, ch);
-      else if (isdigit (ch))
-	set_syntax_internal (M4_SYNTAX_NUM, ch);
-      else
-	set_syntax_internal (M4_SYNTAX_OTHER, ch);
-    }
-  /* set_syntax_internal(M4_SYNTAX_IGNORE, 0); */
-
-  /* Default quotes and comment delimiters are always one char */
-  set_syntax_internal (M4_SYNTAX_LQUOTE, lquote.string[0]);
-  set_syntax_internal (M4_SYNTAX_RQUOTE, rquote.string[0]);
-  set_syntax_internal (M4_SYNTAX_BCOMM, bcomm.string[0]);
-  set_syntax_internal (M4_SYNTAX_ECOMM, ecomm.string[0]);
-}
-
-int
-m4_syntax_code (char ch)
-{
-  int code;
-
-  switch (ch)
-    {
-    case 'I': case 'i': code = M4_SYNTAX_IGNORE; break;
-    case 'O': case 'o': code = M4_SYNTAX_OTHER;  break;
-    case 'S': case 's': code = M4_SYNTAX_SPACE;  break;
-    case 'W': case 'w': code = M4_SYNTAX_ALPHA;  break;
-    case 'D': case 'd': code = M4_SYNTAX_NUM;    break;
-
-    case '(': code = M4_SYNTAX_OPEN;   break;
-    case ')': code = M4_SYNTAX_CLOSE;  break;
-    case ',': code = M4_SYNTAX_COMMA;  break;
-    case '@': code = M4_SYNTAX_ESCAPE; break;
-#if 0				/* not yet used */
-    case '$': code = M4_SYNTAX_DOLLAR; break;
-#endif
-
-    case 'L': case 'l': code = M4_SYNTAX_LQUOTE; break;
-    case 'R': case 'r': code = M4_SYNTAX_RQUOTE; break;
-    case 'B': case 'b': code = M4_SYNTAX_BCOMM;  break;
-    case 'E': case 'e': code = M4_SYNTAX_ECOMM;  break;
-    case 'A': case 'a': code = M4_SYNTAX_ACTIVE;  break;
-
-    default: code = -1;  break;
-    }
-
-  return code;
-}
-
-static void
-check_use_macro_escape (void)
-{
-  int ch;
-
-  use_macro_escape = FALSE;
-  for (ch = 256; --ch >= 0; )
-    if (M4_IS_ESCAPE (ch))
-      use_macro_escape = TRUE;
-}
 
 
-
-/* Functions for setting quotes and comment delimiters.  Used by
-   m4_changecom () and m4_changequote ().  Both functions overrides the
-   syntax_table to maintain compatibility.  */
-void
-m4_set_quotes (const char *lq, const char *rq)
-{
-  int ch;
-  for (ch = 256; --ch >= 0;)	/* changequote overrides syntax_table */
-    if (M4_IS_LQUOTE (ch) || M4_IS_RQUOTE (ch))
-      unset_syntax_attribute (M4_SYNTAX_LQUOTE | M4_SYNTAX_RQUOTE, ch);
-
-  xfree (lquote.string);
-  xfree (rquote.string);
-
-  lquote.string = xstrdup (lq ? lq : DEF_LQUOTE);
-  lquote.length = strlen (lquote.string);
-  rquote.string = xstrdup (rq ? rq : DEF_RQUOTE);
-  rquote.length = strlen (rquote.string);
-
-  single_quotes = (lquote.length == 1 && rquote.length == 1);
-
-  if (single_quotes)
-    {
-      set_syntax_internal (M4_SYNTAX_LQUOTE, lquote.string[0]);
-      set_syntax_internal (M4_SYNTAX_RQUOTE, rquote.string[0]);
-    }
-
-  if (use_macro_escape)
-    check_use_macro_escape ();
-}
-
-void
-m4_set_comment (const char *bc, const char *ec)
-{
-  int ch;
-  for (ch = 256; --ch >= 0;)	/* changecom overrides syntax_table */
-    if (M4_IS_BCOMM (ch) || M4_IS_ECOMM (ch))
-      unset_syntax_attribute (M4_SYNTAX_BCOMM | M4_SYNTAX_ECOMM, ch);
-
-  xfree (bcomm.string);
-  xfree (ecomm.string);
-
-  bcomm.string = xstrdup (bc ? bc : DEF_BCOMM);
-  bcomm.length = strlen (bcomm.string);
-  ecomm.string = xstrdup (ec ? ec : DEF_ECOMM);
-  ecomm.length = strlen (ecomm.string);
-
-  single_comments = (bcomm.length == 1 && ecomm.length == 1);
-
-  if (single_comments)
-    {
-      set_syntax_internal (M4_SYNTAX_BCOMM, bcomm.string[0]);
-      set_syntax_internal (M4_SYNTAX_ECOMM, ecomm.string[0]);
-    }
-
-  if (use_macro_escape)
-    check_use_macro_escape ();
-}
-
-/* Functions to manipulate the syntax table.  */
-static void
-set_syntax_internal (int code, int ch)
-{
-  if (code & M4_SYNTAX_MASKS)
-    m4_syntax_table[ch] |= code;
-  else
-    m4_syntax_table[ch] = code;
-
-#ifdef DEBUG_SYNTAX
-  fprintf(stderr, "Set syntax %o %c = %04X\n",
-	  ch, isprint(ch) ? ch : '-',
-	  m4_syntax_table[ch]);
-#endif
-}
-
-static void
-unset_syntax_attribute (int code, int ch)
-{
-  if (code & M4_SYNTAX_MASKS)
-    m4_syntax_table[ch] &= ~code;
-
-#ifdef DEBUG_SYNTAX
-  fprintf(stderr, "Unset syntax %o %c = %04X\n",
-	  ch, isprint(ch) ? ch : '-',
-	  m4_syntax_table[ch]);
-#endif
-}
-
-void
-m4_set_syntax (char key, const unsigned char *chars)
-{
-  int ch, code;
-
-  code = m4_syntax_code (key);
-
-  if ((code < 0) && (key != '\0'))
-    {
-      M4ERROR ((warning_status, 0,
-		_("Undefined syntax code %c"), key));
-      return;
-    }
-
-  if (*chars != '\0')
-    while ((ch = *chars++))
-      set_syntax_internal (code, ch);
-  else
-    for (ch = 256; --ch > 0; )
-      set_syntax_internal (code, ch);
-
-  if (use_macro_escape || code == M4_SYNTAX_ESCAPE)
-    check_use_macro_escape();
-}
-
-
-
 /* Parse and return a single token from the input stream.  A token can
    either be TOKEN_EOF, if the input_stack is empty; it can be TOKEN_STRING
    for a quoted string; TOKEN_WORD for something that is a potential macro
@@ -1056,7 +777,7 @@ m4_next_token (m4_token *td)
 	type = discard_comments ? M4_TOKEN_NONE : M4_TOKEN_STRING;
       }
 					/* COMMENT, LONGER DELIM */
-    else if (!single_comments && MATCH (ch, bcomm.string))
+    else if (!m4__single_comments && MATCH (ch, bcomm.string))
       {
 	obstack_grow (&token_stack, bcomm.string, bcomm.length);
 	while ((ch = next_char ()) != CHAR_EOF && !MATCH (ch, ecomm.string))
@@ -1103,7 +824,7 @@ m4_next_token (m4_token *td)
 	if (ch != CHAR_EOF)
 	  unget_input(ch);
 
-	type = use_macro_escape ? M4_TOKEN_STRING : M4_TOKEN_WORD;
+	type = m4__use_macro_escape ? M4_TOKEN_STRING : M4_TOKEN_WORD;
       }
     else if (M4_IS_LQUOTE(ch))		/* QUOTED STRING, SINGLE QUOTES */
       {
@@ -1135,7 +856,7 @@ m4_next_token (m4_token *td)
 	type = M4_TOKEN_STRING;
       }
 					/* QUOTED STRING, LONGER QUOTES */
-    else if (!single_quotes && MATCH (ch, lquote.string))
+    else if (!m4__single_quotes && MATCH (ch, lquote.string))
       {
 	const char *current_file = m4_current_file;
 	int current_line = m4_current_line;
@@ -1163,7 +884,7 @@ m4_next_token (m4_token *td)
 	  }
 	type = M4_TOKEN_STRING;
       }
-    else if (single_quotes && single_comments) /* EVERYTHING ELSE */
+    else if (m4__single_quotes && m4__single_comments) /* EVERYTHING ELSE */
       {
 	obstack_1grow (&token_stack, ch);
 
diff --git a/m4/m4module.h b/m4/m4module.h
index 741ea86c..cd9b9c58 100644
--- a/m4/m4module.h
+++ b/m4/m4module.h
@@ -57,6 +57,12 @@ typedef struct {
 } m4_builtin;
 
 
+
+/* --- MODULE MANAGEMENT --- */
+
+typedef void m4_module_init_func   (lt_dlhandle, struct obstack*);
+typedef void m4_module_finish_func (lt_dlhandle, struct obstack*);
+
 extern void	    m4_module_init   (void);
 extern lt_dlhandle  m4_module_load   (const char*, struct obstack*);
 extern void	    m4_module_unload (const char*, struct obstack*);
@@ -72,6 +78,9 @@ extern m4_macro	   *m4_module_macros   (lt_dlhandle);
 extern lt_dlhandle  m4_module_find_by_builtin (const m4_builtin*);
 
 
+
+/* --- MACRO (and builtin) MANAGEMENT --- */
+
 extern m4_symbol *m4_macro_pushdef	(const char *name, lt_dlhandle handle,
 					 const char *text, int flags,
 					 int min_args, int max_args);
@@ -95,6 +104,10 @@ extern const m4_builtin *m4_builtin_find_by_name (
 extern const m4_builtin *m4_builtin_find_by_func (
 				const m4_builtin *, m4_builtin_func *);
 
+
+
+/* --- SYMBOL TABLE MANAGEMENT --- */
+
 extern m4_hash *m4_symtab;
 
 extern void	m4_symtab_init		(void);
@@ -133,9 +146,6 @@ typedef enum {
   M4_TOKEN_FUNC
 } m4_data_t;
 
-typedef void m4_module_init_func   (lt_dlhandle, struct obstack*);
-typedef void m4_module_finish_func (lt_dlhandle, struct obstack*);
-
 extern m4_token_t	m4_token_type	  (m4_token *);
 extern char	       *m4_token_text	  (m4_token *);
 extern m4_builtin_func *m4_token_func	  (m4_token *);
@@ -208,6 +218,9 @@ void m4_shipout_string (struct obstack*, const char*, int, boolean);
 void m4_dump_args (struct obstack *obs, int argc, m4_token **argv, const char *sep, boolean quoted);
 
 
+
+/* --- RUNTIME DEBUGGING --- */
+
 FILE *m4_debug;
 
 /* The value of debug_level is a bitmask of the following.  */
@@ -323,6 +336,13 @@ void m4_process_macro (struct obstack *obs, m4_symbol *symbol, int argc, m4_toke
 
 /* --- SYNTAX TABLE DEFINITIONS --- */
 
+/* Please read the comment at the top of syntax.c for details */
+unsigned short m4_syntax_table[256];
+
+extern	void	m4_syntax_init	(void);
+extern	void	m4_syntax_exit	(void);
+extern	int	m4_syntax_code	(char ch);
+
 /* These are simple values, not bit masks.  There is no overlap. */
 #define M4_SYNTAX_OTHER		(0x0000)
 
@@ -352,40 +372,38 @@ void m4_process_macro (struct obstack *obs, m4_symbol *symbol, int argc, m4_toke
 #define M4_SYNTAX_VALUE		(0x00FF|M4_SYNTAX_LQUOTE|M4_SYNTAX_BCOMM)
 #define M4_SYNTAX_MASKS		(0xFF00)
 
-
-#define m4_syntax(ch)	m4_syntax_table[(int)(ch)]
+#define m4__syntax(ch)	m4_syntax_table[(int)(ch)]
 
-#define M4_IS_OTHER(ch)  ((m4_syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_OTHER)
-#define M4_IS_IGNORE(ch) ((m4_syntax(ch)) == M4_SYNTAX_IGNORE)
-#define M4_IS_SPACE(ch)  ((m4_syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_SPACE)
+#define M4_IS_OTHER(ch)  ((m4__syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_OTHER)
+#define M4_IS_IGNORE(ch) ((m4__syntax(ch)) == M4_SYNTAX_IGNORE)
+#define M4_IS_SPACE(ch)  ((m4__syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_SPACE)
 
-#define M4_IS_OPEN(ch)   ((m4_syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_OPEN)
-#define M4_IS_CLOSE(ch)  ((m4_syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_CLOSE)
-#define M4_IS_COMMA(ch)  ((m4_syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_COMMA)
-#define M4_IS_DOLLAR(ch) ((m4_syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_DOLLAR)
-#define M4_IS_ACTIVE(ch) ((m4_syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_ACTIVE)
+#define M4_IS_OPEN(ch)   ((m4__syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_OPEN)
+#define M4_IS_CLOSE(ch)  ((m4__syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_CLOSE)
+#define M4_IS_COMMA(ch)  ((m4__syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_COMMA)
+#define M4_IS_DOLLAR(ch) ((m4__syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_DOLLAR)
+#define M4_IS_ACTIVE(ch) ((m4__syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_ACTIVE)
 
-#define M4_IS_ESCAPE(ch) ((m4_syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_ESCAPE)
-#define M4_IS_ALPHA(ch)  ((m4_syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_ALPHA)
-#define M4_IS_NUM(ch)    ((m4_syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_NUM)
-#define M4_IS_ALNUM(ch)  (((m4_syntax(ch)) & M4_SYNTAX_ALNUM) != 0)
+#define M4_IS_ESCAPE(ch) ((m4__syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_ESCAPE)
+#define M4_IS_ALPHA(ch)  ((m4__syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_ALPHA)
+#define M4_IS_NUM(ch)    ((m4__syntax(ch)&M4_SYNTAX_VALUE) == M4_SYNTAX_NUM)
+#define M4_IS_ALNUM(ch)  (((m4__syntax(ch)) & M4_SYNTAX_ALNUM) != 0)
 
-#define M4_IS_LQUOTE(ch) (m4_syntax(ch) & M4_SYNTAX_LQUOTE)
-#define M4_IS_RQUOTE(ch) (m4_syntax(ch) & M4_SYNTAX_RQUOTE)
-#define M4_IS_BCOMM(ch)  (m4_syntax(ch) & M4_SYNTAX_BCOMM)
-#define M4_IS_ECOMM(ch)  (m4_syntax(ch) & M4_SYNTAX_ECOMM)
+#define M4_IS_LQUOTE(ch) (m4__syntax(ch) & M4_SYNTAX_LQUOTE)
+#define M4_IS_RQUOTE(ch) (m4__syntax(ch) & M4_SYNTAX_RQUOTE)
+#define M4_IS_BCOMM(ch)  (m4__syntax(ch) & M4_SYNTAX_BCOMM)
+#define M4_IS_ECOMM(ch)  (m4__syntax(ch) & M4_SYNTAX_ECOMM)
 
-/* Please read the comment at the top of input.c for details */
-unsigned short m4_syntax_table[256];
+
+
+/* --- TOKENISATION AND INPUT --- */
 
 /* current input file, and line */
 const char *m4_current_file;
 int m4_current_line;
 
-extern	int	m4_syntax_code	(char ch);
 extern	void	m4_input_init	(void);
 extern	void	m4_input_exit	(void);
-extern	void	m4_syntax_init	(void);
 extern	int	m4_peek_input	(void);
 extern	m4_token_t m4_next_token (m4_token *);
 extern	void	m4_token_copy	(m4_token *dest, m4_token *src);
diff --git a/m4/m4private.h b/m4/m4private.h
index 4d6094e0..645e3da8 100644
--- a/m4/m4private.h
+++ b/m4/m4private.h
@@ -27,6 +27,15 @@
 #include <assert.h>
 #include <m4module.h>
 
+/* TRUE iff strlen(rquote) == strlen(lquote) == 1 */
+extern boolean m4__single_quotes;
+
+/* TRUE iff strlen(bcomm) == strlen(ecomm) == 1 */
+extern boolean m4__single_comments;
+
+/* TRUE iff some character has M4_SYNTAX_ESCAPE */
+extern boolean m4__use_macro_escape;
+
 struct m4_module_data {
   m4_builtin	    *bp;	/* `m4_builtin_table' address */
   m4_macro	    *mp;	/* `m4_macro_table' address */
diff --git a/m4/syntax.c b/m4/syntax.c
new file mode 100644
index 00000000..be4c7a02
--- /dev/null
+++ b/m4/syntax.c
@@ -0,0 +1,300 @@
+/* GNU m4 -- A simple macro processor
+   Copyright 1989, 90, 91, 92, 93, 94, 2002 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307  USA
+*/
+
+#include "m4private.h"
+
+#define DEBUG_SYNTAX
+#undef DEBUG_SYNTAX
+
+/* THE SYNTAX TABLE
+
+   The input is read character by character and grouped together
+   according to a syntax table.  The character groups are (definitions
+   are all in m4module.h; those marked with a * are not yet in use):
+
+   M4_SYNTAX_IGNORE	*Character to be deleted from input as if not present
+   M4_SYNTAX_OTHER	Any character with no special meaning to m4
+   M4_SYNTAX_SPACE	Whitespace (ignored when leading macro arguments)
+   M4_SYNTAX_OPEN	Open list of macro arguments
+   M4_SYNTAX_CLOSE	Close list of macro arguments
+   M4_SYNTAX_COMMA	Separates macro arguments
+   M4_SYNTAX_DOLLAR	*Indicates macro argument in user macros
+   M4_SYNTAX_ACTIVE	This character is a macro name by itself
+
+   M4_SYNTAX_ESCAPE	Use this character to prefix all macro names
+   M4_SYNTAX_ALPHA	Alphabetic characters (can start macro names)
+   M4_SYNTAX_NUM	Numeric characters
+   M4_SYNTAX_ALNUM	Alphanumeric characters (can form macro names)
+
+   (These are bit masks)
+   M4_SYNTAX_LQUOTE	A single-character left quote
+   M4_SYNTAX_RQUOTE	A single-character right quote
+   M4_SYNTAX_BCOMM	A single-character begin comment delimiter
+   M4_SYNTAX_ECOMM	A single-character end comment delimiter
+
+   Besides adding new facilities, the use of a syntax table will reduce
+   the number of calls to next_token ().  Now groups of OTHER, NUM and
+   SPACE characters can be returned as a single token, since next_token
+   () knows they have no special syntactical meaning to m4.  This is,
+   however, only possible if only single character quotes and
+   comments are used, because otherwise the quote and comment characters
+   will not show up in the syntax-table.
+
+   Having a syntax table allows new facilities.  The new builtin
+   "changesyntax" allows the user to change the category of any
+   character.
+
+   Default '\n' is both ECOMM and SPACE, depending on the context.  To
+   solve the problem of quotes and comments that have different syntax
+   code based on the context, the [LR]QUOTE and [BE]COMM codes are bit
+   masks to add to an ordinary code.  If a character is made a quote it
+   will be recognised if the basis code does not have precedence.
+
+   When changing quotes and comment delimiters only the bits are
+   removed, and the characters are therefore reverted to their old
+   category codes.
+
+   The precedence as implemented by next_token () is:
+
+   M4_SYNTAX_IGNORE	*Filtered out below next_token ()
+   M4_SYNTAX_BCOMM	Reads all until M4_SYNTAX_ECOMM
+   M4_SYNTAX_ESCAPE	Reads macro name iff set, else next
+   M4_SYNTAX_ALPHA	Reads macro name
+   M4_SYNTAX_LQUOTE	Reads all until balanced M4_SYNTAX_RQUOTE
+
+   M4_SYNTAX_OTHER	and M4_SYNTAX_NUM
+			Reads all M4_SYNTAX_OTHER and M4_SYNTAX_NUM
+   M4_SYNTAX_SPACE	Reads all M4_SYNTAX_SPACE
+   M4_SYNTAX_ACTIVE	Returns a single char as a word
+   the rest		Returned as a single char
+
+   M4_SYNTAX_DOLLAR is not currently used.  The character $ is treated as a
+   M4_SYNTAX_OTHER.  It could be done, but it will slow next_token () down
+   a bit.  The $ is not really a part of m4's input syntax in the sense
+   that a string is parsed equally whether there is a $ or not.  The
+   character $ is used by convention in user macros.  */
+
+static	void  check_use_macro_escape	(void);
+static	void  set_syntax_internal	(int code, int ch);
+static	void  unset_syntax_attribute	(int code, int ch);
+
+/* TRUE iff strlen(rquote) == strlen(lquote) == 1 */
+boolean m4__single_quotes;
+
+/* TRUE iff strlen(bcomm) == strlen(ecomm) == 1 */
+boolean m4__single_comments;
+
+/* TRUE iff some character has M4_SYNTAX_ESCAPE */
+boolean m4__use_macro_escape;
+
+void
+m4_syntax_init (void)
+{
+  int ch;
+
+  for (ch = 256; --ch > 0;)
+    {
+      if (ch == '(')
+	set_syntax_internal (M4_SYNTAX_OPEN, ch);
+      else if (ch == ')')
+	set_syntax_internal (M4_SYNTAX_CLOSE, ch);
+      else if (ch == ',')
+	set_syntax_internal (M4_SYNTAX_COMMA, ch);
+      else if (isspace (ch))
+	set_syntax_internal (M4_SYNTAX_SPACE, ch);
+      else if (isalpha (ch) || ch == '_')
+	set_syntax_internal (M4_SYNTAX_ALPHA, ch);
+      else if (isdigit (ch))
+	set_syntax_internal (M4_SYNTAX_NUM, ch);
+      else
+	set_syntax_internal (M4_SYNTAX_OTHER, ch);
+    }
+  /* set_syntax_internal(M4_SYNTAX_IGNORE, 0); */
+
+  /* Default quotes and comment delimiters are always one char */
+  set_syntax_internal (M4_SYNTAX_LQUOTE, lquote.string[0]);
+  set_syntax_internal (M4_SYNTAX_RQUOTE, rquote.string[0]);
+  set_syntax_internal (M4_SYNTAX_BCOMM, bcomm.string[0]);
+  set_syntax_internal (M4_SYNTAX_ECOMM, ecomm.string[0]);
+}
+
+void
+m4_syntax_exit (void)
+{
+  return;
+}
+
+int
+m4_syntax_code (char ch)
+{
+  int code;
+
+  switch (ch)
+    {
+    case 'I': case 'i': code = M4_SYNTAX_IGNORE; break;
+    case 'O': case 'o': code = M4_SYNTAX_OTHER;  break;
+    case 'S': case 's': code = M4_SYNTAX_SPACE;  break;
+    case 'W': case 'w': code = M4_SYNTAX_ALPHA;  break;
+    case 'D': case 'd': code = M4_SYNTAX_NUM;    break;
+
+    case '(': code = M4_SYNTAX_OPEN;   break;
+    case ')': code = M4_SYNTAX_CLOSE;  break;
+    case ',': code = M4_SYNTAX_COMMA;  break;
+    case '@': code = M4_SYNTAX_ESCAPE; break;
+#if 0				/* not yet used */
+    case '$': code = M4_SYNTAX_DOLLAR; break;
+#endif
+
+    case 'L': case 'l': code = M4_SYNTAX_LQUOTE; break;
+    case 'R': case 'r': code = M4_SYNTAX_RQUOTE; break;
+    case 'B': case 'b': code = M4_SYNTAX_BCOMM;  break;
+    case 'E': case 'e': code = M4_SYNTAX_ECOMM;  break;
+    case 'A': case 'a': code = M4_SYNTAX_ACTIVE;  break;
+
+    default: code = -1;  break;
+    }
+
+  return code;
+}
+
+
+
+/* Functions for setting quotes and comment delimiters.  Used by
+   m4_changecom () and m4_changequote ().  Both functions override the
+   syntax_table to maintain compatibility.  */
+void
+m4_set_quotes (const char *lq, const char *rq)
+{
+  int ch;
+  for (ch = 256; --ch >= 0;)	/* changequote overrides syntax_table */
+    if (M4_IS_LQUOTE (ch) || M4_IS_RQUOTE (ch))
+      unset_syntax_attribute (M4_SYNTAX_LQUOTE | M4_SYNTAX_RQUOTE, ch);
+
+  xfree (lquote.string);
+  xfree (rquote.string);
+
+  lquote.string = xstrdup (lq ? lq : DEF_LQUOTE);
+  lquote.length = strlen (lquote.string);
+  rquote.string = xstrdup (rq ? rq : DEF_RQUOTE);
+  rquote.length = strlen (rquote.string);
+
+  m4__single_quotes = (lquote.length == 1 && rquote.length == 1);
+
+  if (m4__single_quotes)
+    {
+      set_syntax_internal (M4_SYNTAX_LQUOTE, lquote.string[0]);
+      set_syntax_internal (M4_SYNTAX_RQUOTE, rquote.string[0]);
+    }
+
+  if (m4__use_macro_escape)
+    check_use_macro_escape ();
+}
+
+void
+m4_set_comment (const char *bc, const char *ec)
+{
+  int ch;
+  for (ch = 256; --ch >= 0;)	/* changecom overrides syntax_table */
+    if (M4_IS_BCOMM (ch) || M4_IS_ECOMM (ch))
+      unset_syntax_attribute (M4_SYNTAX_BCOMM | M4_SYNTAX_ECOMM, ch);
+
+  xfree (bcomm.string);
+  xfree (ecomm.string);
+
+  bcomm.string = xstrdup (bc ? bc : DEF_BCOMM);
+  bcomm.length = strlen (bcomm.string);
+  ecomm.string = xstrdup (ec ? ec : DEF_ECOMM);
+  ecomm.length = strlen (ecomm.string);
+
+  m4__single_comments = (bcomm.length == 1 && ecomm.length == 1);
+
+  if (m4__single_comments)
+    {
+      set_syntax_internal (M4_SYNTAX_BCOMM, bcomm.string[0]);
+      set_syntax_internal (M4_SYNTAX_ECOMM, ecomm.string[0]);
+    }
+
+  if (m4__use_macro_escape)
+    check_use_macro_escape ();
+}
+
+/* Functions to manipulate the syntax table.  */
+static void
+set_syntax_internal (int code, int ch)
+{
+  if (code & M4_SYNTAX_MASKS)
+    m4_syntax_table[ch] |= code;
+  else
+    m4_syntax_table[ch] = code;
+
+#ifdef DEBUG_SYNTAX
+  fprintf(stderr, "Set syntax %o %c = %04X\n",
+	  ch, isprint(ch) ? ch : '-',
+	  m4_syntax_table[ch]);
+#endif
+}
+
+static void
+unset_syntax_attribute (int code, int ch)
+{
+  if (code & M4_SYNTAX_MASKS)
+    m4_syntax_table[ch] &= ~code;
+
+#ifdef DEBUG_SYNTAX
+  fprintf(stderr, "Unset syntax %o %c = %04X\n",
+	  ch, isprint(ch) ? ch : '-',
+	  m4_syntax_table[ch]);
+#endif
+}
+
+void
+m4_set_syntax (char key, const unsigned char *chars)
+{
+  int ch, code;
+
+  code = m4_syntax_code (key);
+
+  if ((code < 0) && (key != '\0'))
+    {
+      M4ERROR ((warning_status, 0,
+		_("Undefined syntax code %c"), key));
+      return;
+    }
+
+  if (*chars != '\0')
+    while ((ch = *chars++))
+      set_syntax_internal (code, ch);
+  else
+    for (ch = 256; --ch > 0; )
+      set_syntax_internal (code, ch);
+
+  if (m4__use_macro_escape || code == M4_SYNTAX_ESCAPE)
+    check_use_macro_escape();
+}
+
+static void
+check_use_macro_escape (void)
+{
+  int ch;
+
+  m4__use_macro_escape = FALSE;
+  for (ch = 256; --ch >= 0; )
+    if (M4_IS_ESCAPE (ch))
+      m4__use_macro_escape = TRUE;
+}
diff --git a/src/main.c b/src/main.c
index 4c69059b..f19689b5 100644
--- a/src/main.c
+++ b/src/main.c
@@ -524,6 +524,7 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"),
      Strictly, we don't need to do this, but it makes leak detection
      a whole lot easier!  */
   m4_symtab_exit ();
+  m4_syntax_exit ();
   m4_output_exit ();
   m4_input_exit ();
   m4_debug_exit ();
author	Gary V. Vaughan <gary@gnu.org>	2002-05-29 18:06:42 +0000
committer	Eric Blake <ebb9@byu.net>	2007-10-05 21:58:22 -0600
commit	de11cb28a57e7d1196a587f24dc83f1b492570b3 (patch)
tree	7b76cf261009fc6bbcb298ee21c67e7bf1dd2b78
parent	86b1ec4e3585a53151d88f754a342819d3dc6b06 (diff)
download	m4-de11cb28a57e7d1196a587f24dc83f1b492570b3.tar.gz