summaryrefslogtreecommitdiff
path: root/gettext-tools/src/x-lisp.c
diff options
context:
space:
mode:
Diffstat (limited to 'gettext-tools/src/x-lisp.c')
-rw-r--r--gettext-tools/src/x-lisp.c1425
1 files changed, 1425 insertions, 0 deletions
diff --git a/gettext-tools/src/x-lisp.c b/gettext-tools/src/x-lisp.c
new file mode 100644
index 0000000..20d88a2
--- /dev/null
+++ b/gettext-tools/src/x-lisp.c
@@ -0,0 +1,1425 @@
+/* xgettext Lisp backend.
+ Copyright (C) 2001-2003, 2005-2009 Free Software Foundation, Inc.
+
+ This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+/* Specification. */
+#include "x-lisp.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "message.h"
+#include "xgettext.h"
+#include "error.h"
+#include "xalloc.h"
+#include "hash.h"
+#include "gettext.h"
+
+#define _(s) gettext(s)
+
+
+/* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
+ Since we are interested only in strings and in forms similar to
+ (gettext msgid ...)
+ or (ngettext msgid msgid_plural ...)
+ we make the following simplifications:
+
+ - Assume the keywords and strings are in an ASCII compatible encoding.
+ This means we can read the input file one byte at a time, instead of
+ one character at a time. No need to worry about multibyte characters:
+ If they occur as part of identifiers, they most probably act as
+ constituent characters, and the byte based approach will do the same.
+
+ - Assume the read table is the standard Common Lisp read table.
+ Non-standard read tables are mostly used to read data, not programs.
+
+ - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
+
+ - Don't interpret #n= and #n#, they usually don't appear in programs.
+
+ - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
+
+ The remaining syntax rules are:
+
+ - The syntax code assigned to each character, and how tokens are built
+ up from characters (single escape, multiple escape etc.).
+
+ - Comment syntax: ';' and '#| ... |#'.
+
+ - String syntax: "..." with single escapes.
+
+ - Read macros and dispatch macro character '#'. Needed to be able to
+ tell which is the n-th argument of a function call.
+
+ */
+
+
+/* ========================= Lexer customization. ========================= */
+
+/* 'readtable_case' is the case conversion that is applied to non-escaped
+ parts of symbol tokens. In Common Lisp: (readtable-case *readtable*). */
+
+enum rtcase
+{
+ case_upcase,
+ case_downcase,
+ case_preserve,
+ case_invert
+};
+
+static enum rtcase readtable_case = case_upcase;
+
+/* 'read_base' is the assumed radix of integers and rational numbers.
+ In Common Lisp: *read-base*. */
+static int read_base = 10;
+
+/* 'read_preserve_whitespace' specifies whether a whitespace character
+ that terminates a token must be pushed back on the input stream.
+ We set it to true, because the special newline side effect in read_object()
+ requires that read_object() sees every newline not inside a token. */
+static bool read_preserve_whitespace = true;
+
+
+/* ====================== Keyword set customization. ====================== */
+
+/* If true extract all strings. */
+static bool extract_all = false;
+
+static hash_table keywords;
+static bool default_keywords = true;
+
+
+void
+x_lisp_extract_all ()
+{
+ extract_all = true;
+}
+
+
+void
+x_lisp_keyword (const char *name)
+{
+ if (name == NULL)
+ default_keywords = false;
+ else
+ {
+ const char *end;
+ struct callshape shape;
+ const char *colon;
+ size_t len;
+ char *symname;
+ size_t i;
+
+ if (keywords.table == NULL)
+ hash_init (&keywords, 100);
+
+ split_keywordspec (name, &end, &shape);
+
+ /* The characters between name and end should form a valid Lisp symbol.
+ Extract the symbol name part. */
+ colon = strchr (name, ':');
+ if (colon != NULL && colon < end)
+ {
+ name = colon + 1;
+ if (name < end && *name == ':')
+ name++;
+ colon = strchr (name, ':');
+ if (colon != NULL && colon < end)
+ return;
+ }
+
+ /* Uppercase it. */
+ len = end - name;
+ symname = XNMALLOC (len, char);
+ for (i = 0; i < len; i++)
+ symname[i] =
+ (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
+
+ insert_keyword_callshape (&keywords, symname, len, &shape);
+ }
+}
+
+/* Finish initializing the keywords hash table.
+ Called after argument processing, before each file is processed. */
+static void
+init_keywords ()
+{
+ if (default_keywords)
+ {
+ /* When adding new keywords here, also update the documentation in
+ xgettext.texi! */
+ x_lisp_keyword ("gettext"); /* I18N:GETTEXT */
+ x_lisp_keyword ("ngettext:1,2"); /* I18N:NGETTEXT */
+ x_lisp_keyword ("gettext-noop");
+ default_keywords = false;
+ }
+}
+
+void
+init_flag_table_lisp ()
+{
+ xgettext_record_flag ("gettext:1:pass-lisp-format");
+ xgettext_record_flag ("ngettext:1:pass-lisp-format");
+ xgettext_record_flag ("ngettext:2:pass-lisp-format");
+ xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
+ xgettext_record_flag ("format:2:lisp-format");
+}
+
+
+/* ======================== Reading of characters. ======================== */
+
+/* Real filename, used in error messages about the input file. */
+static const char *real_file_name;
+
+/* Logical filename and line number, used to label the extracted messages. */
+static char *logical_file_name;
+static int line_number;
+
+/* The input file stream. */
+static FILE *fp;
+
+
+/* Fetch the next character from the input file. */
+static int
+do_getc ()
+{
+ int c = getc (fp);
+
+ if (c == EOF)
+ {
+ if (ferror (fp))
+ error (EXIT_FAILURE, errno, _("\
+error while reading \"%s\""), real_file_name);
+ }
+ else if (c == '\n')
+ line_number++;
+
+ return c;
+}
+
+/* Put back the last fetched character, not EOF. */
+static void
+do_ungetc (int c)
+{
+ if (c == '\n')
+ line_number--;
+ ungetc (c, fp);
+}
+
+
+/* ========= Reading of tokens. See CLHS 2.2 "Reader Algorithm". ========= */
+
+
+/* Syntax code. See CLHS 2.1.4 "Character Syntax Types". */
+
+enum syntax_code
+{
+ syntax_illegal, /* non-printable, except whitespace */
+ syntax_single_esc, /* '\' (single escape) */
+ syntax_multi_esc, /* '|' (multiple escape) */
+ syntax_constituent, /* everything else (constituent) */
+ syntax_whitespace, /* TAB,LF,FF,CR,' ' (whitespace) */
+ syntax_eof, /* EOF */
+ syntax_t_macro, /* '()'"' (terminating macro) */
+ syntax_nt_macro /* '#' (non-terminating macro) */
+};
+
+/* Returns the syntax code of a character. */
+static enum syntax_code
+syntax_code_of (unsigned char c)
+{
+ switch (c)
+ {
+ case '\\':
+ return syntax_single_esc;
+ case '|':
+ return syntax_multi_esc;
+ case '\t': case '\n': case '\f': case '\r': case ' ':
+ return syntax_whitespace;
+ case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
+ return syntax_t_macro;
+ case '#':
+ return syntax_nt_macro;
+ default:
+ if (c < ' ' && c != '\b')
+ return syntax_illegal;
+ else
+ return syntax_constituent;
+ }
+}
+
+struct char_syntax
+{
+ int ch; /* character */
+ enum syntax_code scode; /* syntax code */
+};
+
+/* Returns the next character and its syntax code. */
+static void
+read_char_syntax (struct char_syntax *p)
+{
+ int c = do_getc ();
+
+ p->ch = c;
+ p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
+}
+
+/* Every character in a token has an attribute assigned. The attributes
+ help during interpretation of the token. See
+ CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
+ and CLHS 2.1.4.2 "Constituent Traits". */
+
+enum attribute
+{
+ a_illg, /* invalid constituent */
+ a_pack_m, /* ':' package marker */
+ a_alpha, /* normal alphabetic */
+ a_escaped, /* alphabetic but not subject to case conversion */
+ a_ratio, /* '/' */
+ a_dot, /* '.' */
+ a_sign, /* '+-' */
+ a_extens, /* '_^' extension characters */
+ a_digit, /* '0123456789' */
+ a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
+ a_expodigit, /* 'esfdlESFDL' below base */
+ a_letter, /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
+ a_expo /* 'esfdlESFDL' */
+};
+
+#define is_letter_attribute(a) ((a) >= a_letter)
+#define is_number_attribute(a) ((a) >= a_ratio)
+
+/* Returns the attribute of a character, assuming base 10. */
+static enum attribute
+attribute_of (unsigned char c)
+{
+ switch (c)
+ {
+ case ':':
+ return a_pack_m;
+ case '/':
+ return a_ratio;
+ case '.':
+ return a_dot;
+ case '+': case '-':
+ return a_sign;
+ case '_': case '^':
+ return a_extens;
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ return a_digit;
+ case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
+ case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+ case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
+ case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
+ case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+ case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
+ return a_letter;
+ case 'e': case 's': case 'd': case 'f': case 'l':
+ case 'E': case 'S': case 'D': case 'F': case 'L':
+ return a_expo;
+ default:
+ /* Treat everything as valid. Never return a_illg. */
+ return a_alpha;
+ }
+}
+
+struct token_char
+{
+ unsigned char ch; /* character */
+ unsigned char attribute; /* attribute */
+};
+
+/* A token consists of a sequence of characters with associated attribute. */
+struct token
+{
+ int allocated; /* number of allocated 'token_char's */
+ int charcount; /* number of used 'token_char's */
+ struct token_char *chars; /* the token's constituents */
+ bool with_escape; /* whether single-escape or multiple escape occurs */
+};
+
+/* Initialize a 'struct token'. */
+static inline void
+init_token (struct token *tp)
+{
+ tp->allocated = 10;
+ tp->chars = XNMALLOC (tp->allocated, struct token_char);
+ tp->charcount = 0;
+}
+
+/* Free the memory pointed to by a 'struct token'. */
+static inline void
+free_token (struct token *tp)
+{
+ free (tp->chars);
+}
+
+/* Ensure there is enough room in the token for one more character. */
+static inline void
+grow_token (struct token *tp)
+{
+ if (tp->charcount == tp->allocated)
+ {
+ tp->allocated *= 2;
+ tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
+ }
+}
+
+/* Read the next token. If 'first' is given, it points to the first
+ character, which has already been read.
+ The algorithm follows CLHS 2.2 "Reader Algorithm". */
+static void
+read_token (struct token *tp, const struct char_syntax *first)
+{
+ bool multiple_escape_flag;
+ struct char_syntax curr;
+
+ init_token (tp);
+ tp->with_escape = false;
+
+ multiple_escape_flag = false;
+ if (first)
+ curr = *first;
+ else
+ read_char_syntax (&curr);
+
+ for (;; read_char_syntax (&curr))
+ {
+ switch (curr.scode)
+ {
+ case syntax_illegal:
+ /* Invalid input. Be tolerant, no error message. */
+ do_ungetc (curr.ch);
+ return;
+
+ case syntax_single_esc:
+ tp->with_escape = true;
+ read_char_syntax (&curr);
+ if (curr.scode == syntax_eof)
+ /* Invalid input. Be tolerant, no error message. */
+ return;
+ grow_token (tp);
+ tp->chars[tp->charcount].ch = curr.ch;
+ tp->chars[tp->charcount].attribute = a_escaped;
+ tp->charcount++;
+ break;
+
+ case syntax_multi_esc:
+ multiple_escape_flag = !multiple_escape_flag;
+ tp->with_escape = true;
+ break;
+
+ case syntax_constituent:
+ case syntax_nt_macro:
+ grow_token (tp);
+ if (multiple_escape_flag)
+ {
+ tp->chars[tp->charcount].ch = curr.ch;
+ tp->chars[tp->charcount].attribute = a_escaped;
+ tp->charcount++;
+ }
+ else
+ {
+ tp->chars[tp->charcount].ch = curr.ch;
+ tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
+ tp->charcount++;
+ }
+ break;
+
+ case syntax_whitespace:
+ case syntax_t_macro:
+ if (multiple_escape_flag)
+ {
+ grow_token (tp);
+ tp->chars[tp->charcount].ch = curr.ch;
+ tp->chars[tp->charcount].attribute = a_escaped;
+ tp->charcount++;
+ }
+ else
+ {
+ if (curr.scode != syntax_whitespace || read_preserve_whitespace)
+ do_ungetc (curr.ch);
+ return;
+ }
+ break;
+
+ case syntax_eof:
+ if (multiple_escape_flag)
+ /* Invalid input. Be tolerant, no error message. */
+ ;
+ return;
+ }
+ }
+}
+
+/* A potential number is a token which
+ 1. consists only of digits, '+','-','/','^','_','.' and number markers.
+ The base for digits is context dependent, but always 10 if a dot '.'
+ occurs. A number marker is a non-digit letter which is not adjacent
+ to a non-digit letter.
+ 2. has at least one digit.
+ 3. starts with a digit, '+','-','.','^' or '_'.
+ 4. does not end with '+' or '-'.
+ See CLHS 2.3.1.1 "Potential Numbers as Tokens".
+ */
+
+static inline bool
+has_a_dot (const struct token *tp)
+{
+ int n = tp->charcount;
+ int i;
+
+ for (i = 0; i < n; i++)
+ if (tp->chars[i].attribute == a_dot)
+ return true;
+ return false;
+}
+
+static inline bool
+all_a_number (const struct token *tp)
+{
+ int n = tp->charcount;
+ int i;
+
+ for (i = 0; i < n; i++)
+ if (!is_number_attribute (tp->chars[i].attribute))
+ return false;
+ return true;
+}
+
+static inline void
+a_letter_to_digit (const struct token *tp, int base)
+{
+ int n = tp->charcount;
+ int i;
+
+ for (i = 0; i < n; i++)
+ if (is_letter_attribute (tp->chars[i].attribute))
+ {
+ int c = tp->chars[i].ch;
+
+ if (c >= 'a')
+ c -= 'a' - 'A';
+ if (c - 'A' + 10 < base)
+ tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
+ a_expo -> a_expodigit */
+ }
+}
+
+static inline bool
+has_a_digit (const struct token *tp)
+{
+ int n = tp->charcount;
+ int i;
+
+ for (i = 0; i < n; i++)
+ if (tp->chars[i].attribute == a_digit
+ || tp->chars[i].attribute == a_letterdigit
+ || tp->chars[i].attribute == a_expodigit)
+ return true;
+ return false;
+}
+
+static inline bool
+has_adjacent_letters (const struct token *tp)
+{
+ int n = tp->charcount;
+ int i;
+
+ for (i = 1; i < n; i++)
+ if (is_letter_attribute (tp->chars[i-1].attribute)
+ && is_letter_attribute (tp->chars[i].attribute))
+ return true;
+ return false;
+}
+
+static bool
+is_potential_number (const struct token *tp, int *basep)
+{
+ /* CLHS 2.3.1.1.1:
+ "A potential number cannot contain any escape characters." */
+ if (tp->with_escape)
+ return false;
+
+ if (has_a_dot (tp))
+ *basep = 10;
+
+ if (!all_a_number (tp))
+ return false;
+
+ a_letter_to_digit (tp, *basep);
+
+ if (!has_a_digit (tp))
+ return false;
+
+ if (has_adjacent_letters (tp))
+ return false;
+
+ if (!(tp->chars[0].attribute >= a_dot
+ && tp->chars[0].attribute <= a_expodigit))
+ return false;
+
+ if (tp->chars[tp->charcount - 1].attribute == a_sign)
+ return false;
+
+ return true;
+}
+
+/* A number is one of integer, ratio, float. Each has a particular syntax.
+ See CLHS 2.3.1 "Numbers as Tokens".
+ But note a mistake: The exponent rule should read:
+ exponent ::= exponent-marker [sign] {decimal-digit}+
+ (see 22.1.3.1.3 "Printing Floats"). */
+
+enum number_type
+{
+ n_none,
+ n_integer,
+ n_ratio,
+ n_float
+};
+
+static enum number_type
+is_number (const struct token *tp, int *basep)
+{
+ struct token_char *ptr_limit;
+ struct token_char *ptr1;
+
+ if (!is_potential_number (tp, basep))
+ return n_none;
+
+ /* is_potential_number guarantees
+ - all attributes are >= a_ratio,
+ - there is at least one a_digit or a_letterdigit or a_expodigit, and
+ - if there is an a_dot, then *basep = 10. */
+
+ ptr1 = &tp->chars[0];
+ ptr_limit = &tp->chars[tp->charcount];
+
+ if (ptr1->attribute == a_sign)
+ ptr1++;
+
+ /* Test for syntax
+ * { a_sign | }
+ * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
+ */
+ {
+ bool seen_a_ratio = false;
+ bool seen_a_digit = false; /* seen a digit in last digit block? */
+ struct token_char *ptr;
+
+ for (ptr = ptr1;; ptr++)
+ {
+ if (ptr >= ptr_limit)
+ {
+ if (!seen_a_digit)
+ break;
+ if (seen_a_ratio)
+ return n_ratio;
+ else
+ return n_integer;
+ }
+ if (ptr->attribute == a_digit
+ || ptr->attribute == a_letterdigit
+ || ptr->attribute == a_expodigit)
+ {
+ int c = ptr->ch;
+
+ c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
+ if (c >= *basep)
+ break;
+ seen_a_digit = true;
+ }
+ else if (ptr->attribute == a_ratio)
+ {
+ if (seen_a_ratio || !seen_a_digit)
+ break;
+ seen_a_ratio = true;
+ seen_a_digit = false;
+ }
+ else
+ break;
+ }
+ }
+
+ /* Test for syntax
+ * { a_sign | }
+ * { a_digit }* { a_dot { a_digit }* | }
+ * { a_expo { a_sign | } { a_digit }+ | }
+ *
+ * If there is an exponent part, there must be digits before the dot or
+ * after the dot. The result is a float.
+ * If there is no exponen:
+ * If there is no dot, it would an integer in base 10, but is has already
+ * been verified to not be an integer in the current base.
+ * If there is a dot:
+ * If there are digits after the dot, it's a float.
+ * Otherwise, if there are digits before the dot, it's an integer.
+ */
+ *basep = 10;
+ {
+ bool seen_a_dot = false;
+ bool seen_a_dot_with_leading_digits = false;
+ bool seen_a_digit = false; /* seen a digit in last digit block? */
+ struct token_char *ptr;
+
+ for (ptr = ptr1;; ptr++)
+ {
+ if (ptr >= ptr_limit)
+ {
+ /* no exponent */
+ if (!seen_a_dot)
+ return n_none;
+ if (seen_a_digit)
+ return n_float;
+ if (seen_a_dot_with_leading_digits)
+ return n_integer;
+ else
+ return n_none;
+ }
+ if (ptr->attribute == a_digit)
+ {
+ seen_a_digit = true;
+ }
+ else if (ptr->attribute == a_dot)
+ {
+ if (seen_a_dot)
+ return n_none;
+ seen_a_dot = true;
+ if (seen_a_digit)
+ seen_a_dot_with_leading_digits = true;
+ seen_a_digit = false;
+ }
+ else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
+ break;
+ else
+ return n_none;
+ }
+ ptr++;
+ if (!seen_a_dot_with_leading_digits || !seen_a_digit)
+ return n_none;
+ if (ptr >= ptr_limit)
+ return n_none;
+ if (ptr->attribute == a_sign)
+ ptr++;
+ seen_a_digit = false;
+ for (;; ptr++)
+ {
+ if (ptr >= ptr_limit)
+ break;
+ if (ptr->attribute != a_digit)
+ return n_none;
+ seen_a_digit = true;
+ }
+ if (!seen_a_digit)
+ return n_none;
+ return n_float;
+ }
+}
+
+/* A token representing a symbol must be case converted.
+ For portability, we convert only ASCII characters here. */
+
+static void
+upcase_token (struct token *tp)
+{
+ int n = tp->charcount;
+ int i;
+
+ for (i = 0; i < n; i++)
+ if (tp->chars[i].attribute != a_escaped)
+ {
+ unsigned char c = tp->chars[i].ch;
+ if (c >= 'a' && c <= 'z')
+ tp->chars[i].ch = c - 'a' + 'A';
+ }
+}
+
+static void
+downcase_token (struct token *tp)
+{
+ int n = tp->charcount;
+ int i;
+
+ for (i = 0; i < n; i++)
+ if (tp->chars[i].attribute != a_escaped)
+ {
+ unsigned char c = tp->chars[i].ch;
+ if (c >= 'A' && c <= 'Z')
+ tp->chars[i].ch = c - 'A' + 'a';
+ }
+}
+
+static void
+case_convert_token (struct token *tp)
+{
+ int n = tp->charcount;
+ int i;
+
+ switch (readtable_case)
+ {
+ case case_upcase:
+ upcase_token (tp);
+ break;
+
+ case case_downcase:
+ downcase_token (tp);
+ break;
+
+ case case_preserve:
+ break;
+
+ case case_invert:
+ {
+ bool seen_uppercase = false;
+ bool seen_lowercase = false;
+ for (i = 0; i < n; i++)
+ if (tp->chars[i].attribute != a_escaped)
+ {
+ unsigned char c = tp->chars[i].ch;
+ if (c >= 'a' && c <= 'z')
+ seen_lowercase = true;
+ if (c >= 'A' && c <= 'Z')
+ seen_uppercase = true;
+ }
+ if (seen_uppercase)
+ {
+ if (!seen_lowercase)
+ downcase_token (tp);
+ }
+ else
+ {
+ if (seen_lowercase)
+ upcase_token (tp);
+ }
+ }
+ break;
+ }
+}
+
+
+/* ========================= Accumulating comments ========================= */
+
+
+static char *buffer;
+static size_t bufmax;
+static size_t buflen;
+
+static inline void
+comment_start ()
+{
+ buflen = 0;
+}
+
+static inline void
+comment_add (int c)
+{
+ if (buflen >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[buflen++] = c;
+}
+
+static inline void
+comment_line_end (size_t chars_to_remove)
+{
+ buflen -= chars_to_remove;
+ while (buflen >= 1
+ && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
+ --buflen;
+ if (chars_to_remove == 0 && buflen >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[buflen] = '\0';
+ savable_comment_add (buffer);
+}
+
+
+/* These are for tracking whether comments count as immediately before
+ keyword. */
+static int last_comment_line;
+static int last_non_comment_line;
+
+
+/* ========================= Accumulating messages ========================= */
+
+
+static message_list_ty *mlp;
+
+
+/* ============== Reading of objects. See CLHS 2 "Syntax". ============== */
+
+
+/* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
+ Other objects need not to be represented precisely. */
+enum object_type
+{
+ t_symbol, /* symbol */
+ t_string, /* string */
+ t_other, /* other kind of real object */
+ t_dot, /* '.' pseudo object */
+ t_close, /* ')' pseudo object */
+ t_eof /* EOF marker */
+};
+
+struct object
+{
+ enum object_type type;
+ struct token *token; /* for t_symbol and t_string */
+ int line_number_at_start; /* for t_string */
+};
+
+/* Free the memory pointed to by a 'struct object'. */
+static inline void
+free_object (struct object *op)
+{
+ if (op->type == t_symbol || op->type == t_string)
+ {
+ free_token (op->token);
+ free (op->token);
+ }
+}
+
+/* Convert a t_symbol/t_string token to a char*. */
+static char *
+string_of_object (const struct object *op)
+{
+ char *str;
+ const struct token_char *p;
+ char *q;
+ int n;
+
+ if (!(op->type == t_symbol || op->type == t_string))
+ abort ();
+ n = op->token->charcount;
+ str = XNMALLOC (n + 1, char);
+ q = str;
+ for (p = op->token->chars; n > 0; p++, n--)
+ *q++ = p->ch;
+ *q = '\0';
+ return str;
+}
+
+/* Context lookup table. */
+static flag_context_list_table_ty *flag_context_list_table;
+
+/* Read the next object. */
+static void
+read_object (struct object *op, flag_context_ty outer_context)
+{
+ for (;;)
+ {
+ struct char_syntax curr;
+
+ read_char_syntax (&curr);
+
+ switch (curr.scode)
+ {
+ case syntax_eof:
+ op->type = t_eof;
+ return;
+
+ case syntax_whitespace:
+ if (curr.ch == '\n')
+ /* Comments assumed to be grouped with a message must immediately
+ precede it, with no non-whitespace token on a line between
+ both. */
+ if (last_non_comment_line > last_comment_line)
+ savable_comment_reset ();
+ continue;
+
+ case syntax_illegal:
+ op->type = t_other;
+ return;
+
+ case syntax_single_esc:
+ case syntax_multi_esc:
+ case syntax_constituent:
+ /* Start reading a token. */
+ op->token = XMALLOC (struct token);
+ read_token (op->token, &curr);
+ last_non_comment_line = line_number;
+
+ /* Interpret the token. */
+
+ /* Dots. */
+ if (!op->token->with_escape
+ && op->token->charcount == 1
+ && op->token->chars[0].attribute == a_dot)
+ {
+ free_token (op->token);
+ free (op->token);
+ op->type = t_dot;
+ return;
+ }
+ /* Tokens consisting entirely of dots are illegal, but be tolerant
+ here. */
+
+ /* Number. */
+ {
+ int base = read_base;
+
+ if (is_number (op->token, &base) != n_none)
+ {
+ free_token (op->token);
+ free (op->token);
+ op->type = t_other;
+ return;
+ }
+ }
+
+ /* We interpret all other tokens as symbols (including 'reserved
+ tokens', i.e. potential numbers which are not numbers). */
+ case_convert_token (op->token);
+ op->type = t_symbol;
+ return;
+
+ case syntax_t_macro:
+ case syntax_nt_macro:
+ /* Read a macro. */
+ switch (curr.ch)
+ {
+ case '(':
+ {
+ int arg = 0; /* Current argument number. */
+ flag_context_list_iterator_ty context_iter;
+ const struct callshapes *shapes = NULL;
+ struct arglist_parser *argparser = NULL;
+
+ for (;; arg++)
+ {
+ struct object inner;
+ flag_context_ty inner_context;
+
+ if (arg == 0)
+ inner_context = null_context;
+ else
+ inner_context =
+ inherited_context (outer_context,
+ flag_context_list_iterator_advance (
+ &context_iter));
+
+ read_object (&inner, inner_context);
+
+ /* Recognize end of list. */
+ if (inner.type == t_close)
+ {
+ op->type = t_other;
+ /* Don't bother converting "()" to "NIL". */
+ last_non_comment_line = line_number;
+ if (argparser != NULL)
+ arglist_parser_done (argparser, arg);
+ return;
+ }
+
+ /* Dots are not allowed in every position.
+ But be tolerant. */
+
+ /* EOF inside list is illegal.
+ But be tolerant. */
+ if (inner.type == t_eof)
+ break;
+
+ if (arg == 0)
+ {
+ /* This is the function position. */
+ if (inner.type == t_symbol)
+ {
+ char *symbol_name = string_of_object (&inner);
+ int i;
+ int prefix_len;
+ void *keyword_value;
+
+ /* Omit any package name. */
+ i = inner.token->charcount;
+ while (i > 0
+ && inner.token->chars[i-1].attribute != a_pack_m)
+ i--;
+ prefix_len = i;
+
+ if (hash_find_entry (&keywords,
+ symbol_name + prefix_len,
+ strlen (symbol_name + prefix_len),
+ &keyword_value)
+ == 0)
+ shapes = (const struct callshapes *) keyword_value;
+
+ argparser = arglist_parser_alloc (mlp, shapes);
+
+ context_iter =
+ flag_context_list_iterator (
+ flag_context_list_table_lookup (
+ flag_context_list_table,
+ symbol_name, strlen (symbol_name)));
+
+ free (symbol_name);
+ }
+ else
+ context_iter = null_context_list_iterator;
+ }
+ else
+ {
+ /* These are the argument positions. */
+ if (argparser != NULL && inner.type == t_string)
+ arglist_parser_remember (argparser, arg,
+ string_of_object (&inner),
+ inner_context,
+ logical_file_name,
+ inner.line_number_at_start,
+ savable_comment);
+ }
+
+ free_object (&inner);
+ }
+
+ if (argparser != NULL)
+ arglist_parser_done (argparser, arg);
+ }
+ op->type = t_other;
+ last_non_comment_line = line_number;
+ return;
+
+ case ')':
+ /* Tell the caller about the end of list.
+ Unmatched closing parenthesis is illegal.
+ But be tolerant. */
+ op->type = t_close;
+ last_non_comment_line = line_number;
+ return;
+
+ case ',':
+ {
+ int c = do_getc ();
+ /* The ,@ handling inside lists is wrong anyway, because
+ ,@form expands to an unknown number of elements. */
+ if (c != EOF && c != '@' && c != '.')
+ do_ungetc (c);
+ }
+ /*FALLTHROUGH*/
+ case '\'':
+ case '`':
+ {
+ struct object inner;
+
+ read_object (&inner, null_context);
+
+ /* Dots and EOF are not allowed here. But be tolerant. */
+
+ free_object (&inner);
+
+ op->type = t_other;
+ last_non_comment_line = line_number;
+ return;
+ }
+
+ case ';':
+ {
+ bool all_semicolons = true;
+
+ last_comment_line = line_number;
+ comment_start ();
+ for (;;)
+ {
+ int c = do_getc ();
+ if (c == EOF || c == '\n')
+ break;
+ if (c != ';')
+ all_semicolons = false;
+ if (!all_semicolons)
+ {
+ /* We skip all leading white space, but not EOLs. */
+ if (!(buflen == 0 && (c == ' ' || c == '\t')))
+ comment_add (c);
+ }
+ }
+ comment_line_end (0);
+ continue;
+ }
+
+ case '"':
+ {
+ op->token = XMALLOC (struct token);
+ init_token (op->token);
+ op->line_number_at_start = line_number;
+ for (;;)
+ {
+ int c = do_getc ();
+ if (c == EOF)
+ /* Invalid input. Be tolerant, no error message. */
+ break;
+ if (c == '"')
+ break;
+ if (c == '\\') /* syntax_single_esc */
+ {
+ c = do_getc ();
+ if (c == EOF)
+ /* Invalid input. Be tolerant, no error message. */
+ break;
+ }
+ grow_token (op->token);
+ op->token->chars[op->token->charcount++].ch = c;
+ }
+ op->type = t_string;
+
+ if (extract_all)
+ {
+ lex_pos_ty pos;
+
+ pos.file_name = logical_file_name;
+ pos.line_number = op->line_number_at_start;
+ remember_a_message (mlp, NULL, string_of_object (op),
+ null_context, &pos,
+ NULL, savable_comment);
+ }
+ last_non_comment_line = line_number;
+ return;
+ }
+
+ case '#':
+ /* Dispatch macro handling. */
+ {
+ int c;
+
+ for (;;)
+ {
+ c = do_getc ();
+ if (c == EOF)
+ /* Invalid input. Be tolerant, no error message. */
+ {
+ op->type = t_other;
+ return;
+ }
+ if (!(c >= '0' && c <= '9'))
+ break;
+ }
+
+ switch (c)
+ {
+ case '(':
+ case '"':
+ do_ungetc (c);
+ /*FALLTHROUGH*/
+ case '\'':
+ case ':':
+ case '.':
+ case ',':
+ case 'A': case 'a':
+ case 'C': case 'c':
+ case 'P': case 'p':
+ case 'S': case 's':
+ {
+ struct object inner;
+ read_object (&inner, null_context);
+ /* Dots and EOF are not allowed here.
+ But be tolerant. */
+ free_object (&inner);
+ op->type = t_other;
+ last_non_comment_line = line_number;
+ return;
+ }
+
+ case '|':
+ {
+ int depth = 0;
+ int c;
+
+ comment_start ();
+ c = do_getc ();
+ for (;;)
+ {
+ if (c == EOF)
+ break;
+ if (c == '|')
+ {
+ c = do_getc ();
+ if (c == EOF)
+ break;
+ if (c == '#')
+ {
+ if (depth == 0)
+ {
+ comment_line_end (0);
+ break;
+ }
+ depth--;
+ comment_add ('|');
+ comment_add ('#');
+ c = do_getc ();
+ }
+ else
+ comment_add ('|');
+ }
+ else if (c == '#')
+ {
+ c = do_getc ();
+ if (c == EOF)
+ break;
+ comment_add ('#');
+ if (c == '|')
+ {
+ depth++;
+ comment_add ('|');
+ c = do_getc ();
+ }
+ }
+ else
+ {
+ /* We skip all leading white space. */
+ if (!(buflen == 0 && (c == ' ' || c == '\t')))
+ comment_add (c);
+ if (c == '\n')
+ {
+ comment_line_end (1);
+ comment_start ();
+ }
+ c = do_getc ();
+ }
+ }
+ if (c == EOF)
+ {
+ /* EOF not allowed here. But be tolerant. */
+ op->type = t_eof;
+ return;
+ }
+ last_comment_line = line_number;
+ continue;
+ }
+
+ case '\\':
+ {
+ struct token token;
+ struct char_syntax first;
+ first.ch = '\\';
+ first.scode = syntax_single_esc;
+ read_token (&token, &first);
+ free_token (&token);
+ op->type = t_other;
+ last_non_comment_line = line_number;
+ return;
+ }
+
+ case 'B': case 'b':
+ case 'O': case 'o':
+ case 'X': case 'x':
+ case 'R': case 'r':
+ case '*':
+ {
+ struct token token;
+ read_token (&token, NULL);
+ free_token (&token);
+ op->type = t_other;
+ last_non_comment_line = line_number;
+ return;
+ }
+
+ case '=':
+ /* Ignore read labels. */
+ continue;
+
+ case '#':
+ /* Don't bother looking up the corresponding object. */
+ op->type = t_other;
+ last_non_comment_line = line_number;
+ return;
+
+ case '+':
+ case '-':
+ /* Simply assume every feature expression is true. */
+ {
+ struct object inner;
+ read_object (&inner, null_context);
+ /* Dots and EOF are not allowed here.
+ But be tolerant. */
+ free_object (&inner);
+ continue;
+ }
+
+ default:
+ op->type = t_other;
+ last_non_comment_line = line_number;
+ return;
+ }
+ /*NOTREACHED*/
+ abort ();
+ }
+
+ default:
+ /*NOTREACHED*/
+ abort ();
+ }
+
+ default:
+ /*NOTREACHED*/
+ abort ();
+ }
+ }
+}
+
+
+void
+extract_lisp (FILE *f,
+ const char *real_filename, const char *logical_filename,
+ flag_context_list_table_ty *flag_table,
+ msgdomain_list_ty *mdlp)
+{
+ mlp = mdlp->item[0]->messages;
+
+ fp = f;
+ real_file_name = real_filename;
+ logical_file_name = xstrdup (logical_filename);
+ line_number = 1;
+
+ last_comment_line = -1;
+ last_non_comment_line = -1;
+
+ flag_context_list_table = flag_table;
+
+ init_keywords ();
+
+ /* Eat tokens until eof is seen. When read_object returns
+ due to an unbalanced closing parenthesis, just restart it. */
+ do
+ {
+ struct object toplevel_object;
+
+ read_object (&toplevel_object, null_context);
+
+ if (toplevel_object.type == t_eof)
+ break;
+
+ free_object (&toplevel_object);
+ }
+ while (!feof (fp));
+
+ /* Close scanner. */
+ fp = NULL;
+ real_file_name = NULL;
+ logical_file_name = NULL;
+ line_number = 0;
+}