diff options
Diffstat (limited to 'gettext-tools/src/x-lisp.c')
-rw-r--r-- | gettext-tools/src/x-lisp.c | 1425 |
1 files changed, 1425 insertions, 0 deletions
diff --git a/gettext-tools/src/x-lisp.c b/gettext-tools/src/x-lisp.c new file mode 100644 index 0000000..20d88a2 --- /dev/null +++ b/gettext-tools/src/x-lisp.c @@ -0,0 +1,1425 @@ +/* xgettext Lisp backend. + Copyright (C) 2001-2003, 2005-2009 Free Software Foundation, Inc. + + This file was written by Bruno Haible <haible@clisp.cons.org>, 2001. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +/* Specification. */ +#include "x-lisp.h" + +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "message.h" +#include "xgettext.h" +#include "error.h" +#include "xalloc.h" +#include "hash.h" +#include "gettext.h" + +#define _(s) gettext(s) + + +/* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2. + Since we are interested only in strings and in forms similar to + (gettext msgid ...) + or (ngettext msgid msgid_plural ...) + we make the following simplifications: + + - Assume the keywords and strings are in an ASCII compatible encoding. + This means we can read the input file one byte at a time, instead of + one character at a time. No need to worry about multibyte characters: + If they occur as part of identifiers, they most probably act as + constituent characters, and the byte based approach will do the same. + + - Assume the read table is the standard Common Lisp read table. + Non-standard read tables are mostly used to read data, not programs. + + - Assume the read table case is :UPCASE, and *READ-BASE* is 10. + + - Don't interpret #n= and #n#, they usually don't appear in programs. + + - Don't interpret #+, #-, they are unlikely to appear in a gettext form. + + The remaining syntax rules are: + + - The syntax code assigned to each character, and how tokens are built + up from characters (single escape, multiple escape etc.). + + - Comment syntax: ';' and '#| ... |#'. + + - String syntax: "..." with single escapes. + + - Read macros and dispatch macro character '#'. Needed to be able to + tell which is the n-th argument of a function call. + + */ + + +/* ========================= Lexer customization. ========================= */ + +/* 'readtable_case' is the case conversion that is applied to non-escaped + parts of symbol tokens. In Common Lisp: (readtable-case *readtable*). */ + +enum rtcase +{ + case_upcase, + case_downcase, + case_preserve, + case_invert +}; + +static enum rtcase readtable_case = case_upcase; + +/* 'read_base' is the assumed radix of integers and rational numbers. + In Common Lisp: *read-base*. */ +static int read_base = 10; + +/* 'read_preserve_whitespace' specifies whether a whitespace character + that terminates a token must be pushed back on the input stream. + We set it to true, because the special newline side effect in read_object() + requires that read_object() sees every newline not inside a token. */ +static bool read_preserve_whitespace = true; + + +/* ====================== Keyword set customization. ====================== */ + +/* If true extract all strings. */ +static bool extract_all = false; + +static hash_table keywords; +static bool default_keywords = true; + + +void +x_lisp_extract_all () +{ + extract_all = true; +} + + +void +x_lisp_keyword (const char *name) +{ + if (name == NULL) + default_keywords = false; + else + { + const char *end; + struct callshape shape; + const char *colon; + size_t len; + char *symname; + size_t i; + + if (keywords.table == NULL) + hash_init (&keywords, 100); + + split_keywordspec (name, &end, &shape); + + /* The characters between name and end should form a valid Lisp symbol. + Extract the symbol name part. */ + colon = strchr (name, ':'); + if (colon != NULL && colon < end) + { + name = colon + 1; + if (name < end && *name == ':') + name++; + colon = strchr (name, ':'); + if (colon != NULL && colon < end) + return; + } + + /* Uppercase it. */ + len = end - name; + symname = XNMALLOC (len, char); + for (i = 0; i < len; i++) + symname[i] = + (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]); + + insert_keyword_callshape (&keywords, symname, len, &shape); + } +} + +/* Finish initializing the keywords hash table. + Called after argument processing, before each file is processed. */ +static void +init_keywords () +{ + if (default_keywords) + { + /* When adding new keywords here, also update the documentation in + xgettext.texi! */ + x_lisp_keyword ("gettext"); /* I18N:GETTEXT */ + x_lisp_keyword ("ngettext:1,2"); /* I18N:NGETTEXT */ + x_lisp_keyword ("gettext-noop"); + default_keywords = false; + } +} + +void +init_flag_table_lisp () +{ + xgettext_record_flag ("gettext:1:pass-lisp-format"); + xgettext_record_flag ("ngettext:1:pass-lisp-format"); + xgettext_record_flag ("ngettext:2:pass-lisp-format"); + xgettext_record_flag ("gettext-noop:1:pass-lisp-format"); + xgettext_record_flag ("format:2:lisp-format"); +} + + +/* ======================== Reading of characters. ======================== */ + +/* Real filename, used in error messages about the input file. */ +static const char *real_file_name; + +/* Logical filename and line number, used to label the extracted messages. */ +static char *logical_file_name; +static int line_number; + +/* The input file stream. */ +static FILE *fp; + + +/* Fetch the next character from the input file. */ +static int +do_getc () +{ + int c = getc (fp); + + if (c == EOF) + { + if (ferror (fp)) + error (EXIT_FAILURE, errno, _("\ +error while reading \"%s\""), real_file_name); + } + else if (c == '\n') + line_number++; + + return c; +} + +/* Put back the last fetched character, not EOF. */ +static void +do_ungetc (int c) +{ + if (c == '\n') + line_number--; + ungetc (c, fp); +} + + +/* ========= Reading of tokens. See CLHS 2.2 "Reader Algorithm". ========= */ + + +/* Syntax code. See CLHS 2.1.4 "Character Syntax Types". */ + +enum syntax_code +{ + syntax_illegal, /* non-printable, except whitespace */ + syntax_single_esc, /* '\' (single escape) */ + syntax_multi_esc, /* '|' (multiple escape) */ + syntax_constituent, /* everything else (constituent) */ + syntax_whitespace, /* TAB,LF,FF,CR,' ' (whitespace) */ + syntax_eof, /* EOF */ + syntax_t_macro, /* '()'"' (terminating macro) */ + syntax_nt_macro /* '#' (non-terminating macro) */ +}; + +/* Returns the syntax code of a character. */ +static enum syntax_code +syntax_code_of (unsigned char c) +{ + switch (c) + { + case '\\': + return syntax_single_esc; + case '|': + return syntax_multi_esc; + case '\t': case '\n': case '\f': case '\r': case ' ': + return syntax_whitespace; + case '(': case ')': case '\'': case '"': case ',': case ';': case '`': + return syntax_t_macro; + case '#': + return syntax_nt_macro; + default: + if (c < ' ' && c != '\b') + return syntax_illegal; + else + return syntax_constituent; + } +} + +struct char_syntax +{ + int ch; /* character */ + enum syntax_code scode; /* syntax code */ +}; + +/* Returns the next character and its syntax code. */ +static void +read_char_syntax (struct char_syntax *p) +{ + int c = do_getc (); + + p->ch = c; + p->scode = (c == EOF ? syntax_eof : syntax_code_of (c)); +} + +/* Every character in a token has an attribute assigned. The attributes + help during interpretation of the token. See + CLHS 2.3 "Interpretation of Tokens" for the possible interpretations, + and CLHS 2.1.4.2 "Constituent Traits". */ + +enum attribute +{ + a_illg, /* invalid constituent */ + a_pack_m, /* ':' package marker */ + a_alpha, /* normal alphabetic */ + a_escaped, /* alphabetic but not subject to case conversion */ + a_ratio, /* '/' */ + a_dot, /* '.' */ + a_sign, /* '+-' */ + a_extens, /* '_^' extension characters */ + a_digit, /* '0123456789' */ + a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */ + a_expodigit, /* 'esfdlESFDL' below base */ + a_letter, /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */ + a_expo /* 'esfdlESFDL' */ +}; + +#define is_letter_attribute(a) ((a) >= a_letter) +#define is_number_attribute(a) ((a) >= a_ratio) + +/* Returns the attribute of a character, assuming base 10. */ +static enum attribute +attribute_of (unsigned char c) +{ + switch (c) + { + case ':': + return a_pack_m; + case '/': + return a_ratio; + case '.': + return a_dot; + case '+': case '-': + return a_sign; + case '_': case '^': + return a_extens; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return a_digit; + case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': + case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': + return a_letter; + case 'e': case 's': case 'd': case 'f': case 'l': + case 'E': case 'S': case 'D': case 'F': case 'L': + return a_expo; + default: + /* Treat everything as valid. Never return a_illg. */ + return a_alpha; + } +} + +struct token_char +{ + unsigned char ch; /* character */ + unsigned char attribute; /* attribute */ +}; + +/* A token consists of a sequence of characters with associated attribute. */ +struct token +{ + int allocated; /* number of allocated 'token_char's */ + int charcount; /* number of used 'token_char's */ + struct token_char *chars; /* the token's constituents */ + bool with_escape; /* whether single-escape or multiple escape occurs */ +}; + +/* Initialize a 'struct token'. */ +static inline void +init_token (struct token *tp) +{ + tp->allocated = 10; + tp->chars = XNMALLOC (tp->allocated, struct token_char); + tp->charcount = 0; +} + +/* Free the memory pointed to by a 'struct token'. */ +static inline void +free_token (struct token *tp) +{ + free (tp->chars); +} + +/* Ensure there is enough room in the token for one more character. */ +static inline void +grow_token (struct token *tp) +{ + if (tp->charcount == tp->allocated) + { + tp->allocated *= 2; + tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char)); + } +} + +/* Read the next token. If 'first' is given, it points to the first + character, which has already been read. + The algorithm follows CLHS 2.2 "Reader Algorithm". */ +static void +read_token (struct token *tp, const struct char_syntax *first) +{ + bool multiple_escape_flag; + struct char_syntax curr; + + init_token (tp); + tp->with_escape = false; + + multiple_escape_flag = false; + if (first) + curr = *first; + else + read_char_syntax (&curr); + + for (;; read_char_syntax (&curr)) + { + switch (curr.scode) + { + case syntax_illegal: + /* Invalid input. Be tolerant, no error message. */ + do_ungetc (curr.ch); + return; + + case syntax_single_esc: + tp->with_escape = true; + read_char_syntax (&curr); + if (curr.scode == syntax_eof) + /* Invalid input. Be tolerant, no error message. */ + return; + grow_token (tp); + tp->chars[tp->charcount].ch = curr.ch; + tp->chars[tp->charcount].attribute = a_escaped; + tp->charcount++; + break; + + case syntax_multi_esc: + multiple_escape_flag = !multiple_escape_flag; + tp->with_escape = true; + break; + + case syntax_constituent: + case syntax_nt_macro: + grow_token (tp); + if (multiple_escape_flag) + { + tp->chars[tp->charcount].ch = curr.ch; + tp->chars[tp->charcount].attribute = a_escaped; + tp->charcount++; + } + else + { + tp->chars[tp->charcount].ch = curr.ch; + tp->chars[tp->charcount].attribute = attribute_of (curr.ch); + tp->charcount++; + } + break; + + case syntax_whitespace: + case syntax_t_macro: + if (multiple_escape_flag) + { + grow_token (tp); + tp->chars[tp->charcount].ch = curr.ch; + tp->chars[tp->charcount].attribute = a_escaped; + tp->charcount++; + } + else + { + if (curr.scode != syntax_whitespace || read_preserve_whitespace) + do_ungetc (curr.ch); + return; + } + break; + + case syntax_eof: + if (multiple_escape_flag) + /* Invalid input. Be tolerant, no error message. */ + ; + return; + } + } +} + +/* A potential number is a token which + 1. consists only of digits, '+','-','/','^','_','.' and number markers. + The base for digits is context dependent, but always 10 if a dot '.' + occurs. A number marker is a non-digit letter which is not adjacent + to a non-digit letter. + 2. has at least one digit. + 3. starts with a digit, '+','-','.','^' or '_'. + 4. does not end with '+' or '-'. + See CLHS 2.3.1.1 "Potential Numbers as Tokens". + */ + +static inline bool +has_a_dot (const struct token *tp) +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (tp->chars[i].attribute == a_dot) + return true; + return false; +} + +static inline bool +all_a_number (const struct token *tp) +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (!is_number_attribute (tp->chars[i].attribute)) + return false; + return true; +} + +static inline void +a_letter_to_digit (const struct token *tp, int base) +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (is_letter_attribute (tp->chars[i].attribute)) + { + int c = tp->chars[i].ch; + + if (c >= 'a') + c -= 'a' - 'A'; + if (c - 'A' + 10 < base) + tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit, + a_expo -> a_expodigit */ + } +} + +static inline bool +has_a_digit (const struct token *tp) +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (tp->chars[i].attribute == a_digit + || tp->chars[i].attribute == a_letterdigit + || tp->chars[i].attribute == a_expodigit) + return true; + return false; +} + +static inline bool +has_adjacent_letters (const struct token *tp) +{ + int n = tp->charcount; + int i; + + for (i = 1; i < n; i++) + if (is_letter_attribute (tp->chars[i-1].attribute) + && is_letter_attribute (tp->chars[i].attribute)) + return true; + return false; +} + +static bool +is_potential_number (const struct token *tp, int *basep) +{ + /* CLHS 2.3.1.1.1: + "A potential number cannot contain any escape characters." */ + if (tp->with_escape) + return false; + + if (has_a_dot (tp)) + *basep = 10; + + if (!all_a_number (tp)) + return false; + + a_letter_to_digit (tp, *basep); + + if (!has_a_digit (tp)) + return false; + + if (has_adjacent_letters (tp)) + return false; + + if (!(tp->chars[0].attribute >= a_dot + && tp->chars[0].attribute <= a_expodigit)) + return false; + + if (tp->chars[tp->charcount - 1].attribute == a_sign) + return false; + + return true; +} + +/* A number is one of integer, ratio, float. Each has a particular syntax. + See CLHS 2.3.1 "Numbers as Tokens". + But note a mistake: The exponent rule should read: + exponent ::= exponent-marker [sign] {decimal-digit}+ + (see 22.1.3.1.3 "Printing Floats"). */ + +enum number_type +{ + n_none, + n_integer, + n_ratio, + n_float +}; + +static enum number_type +is_number (const struct token *tp, int *basep) +{ + struct token_char *ptr_limit; + struct token_char *ptr1; + + if (!is_potential_number (tp, basep)) + return n_none; + + /* is_potential_number guarantees + - all attributes are >= a_ratio, + - there is at least one a_digit or a_letterdigit or a_expodigit, and + - if there is an a_dot, then *basep = 10. */ + + ptr1 = &tp->chars[0]; + ptr_limit = &tp->chars[tp->charcount]; + + if (ptr1->attribute == a_sign) + ptr1++; + + /* Test for syntax + * { a_sign | } + * { a_digit < base }+ { a_ratio { a_digit < base }+ | } + */ + { + bool seen_a_ratio = false; + bool seen_a_digit = false; /* seen a digit in last digit block? */ + struct token_char *ptr; + + for (ptr = ptr1;; ptr++) + { + if (ptr >= ptr_limit) + { + if (!seen_a_digit) + break; + if (seen_a_ratio) + return n_ratio; + else + return n_integer; + } + if (ptr->attribute == a_digit + || ptr->attribute == a_letterdigit + || ptr->attribute == a_expodigit) + { + int c = ptr->ch; + + c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10); + if (c >= *basep) + break; + seen_a_digit = true; + } + else if (ptr->attribute == a_ratio) + { + if (seen_a_ratio || !seen_a_digit) + break; + seen_a_ratio = true; + seen_a_digit = false; + } + else + break; + } + } + + /* Test for syntax + * { a_sign | } + * { a_digit }* { a_dot { a_digit }* | } + * { a_expo { a_sign | } { a_digit }+ | } + * + * If there is an exponent part, there must be digits before the dot or + * after the dot. The result is a float. + * If there is no exponen: + * If there is no dot, it would an integer in base 10, but is has already + * been verified to not be an integer in the current base. + * If there is a dot: + * If there are digits after the dot, it's a float. + * Otherwise, if there are digits before the dot, it's an integer. + */ + *basep = 10; + { + bool seen_a_dot = false; + bool seen_a_dot_with_leading_digits = false; + bool seen_a_digit = false; /* seen a digit in last digit block? */ + struct token_char *ptr; + + for (ptr = ptr1;; ptr++) + { + if (ptr >= ptr_limit) + { + /* no exponent */ + if (!seen_a_dot) + return n_none; + if (seen_a_digit) + return n_float; + if (seen_a_dot_with_leading_digits) + return n_integer; + else + return n_none; + } + if (ptr->attribute == a_digit) + { + seen_a_digit = true; + } + else if (ptr->attribute == a_dot) + { + if (seen_a_dot) + return n_none; + seen_a_dot = true; + if (seen_a_digit) + seen_a_dot_with_leading_digits = true; + seen_a_digit = false; + } + else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit) + break; + else + return n_none; + } + ptr++; + if (!seen_a_dot_with_leading_digits || !seen_a_digit) + return n_none; + if (ptr >= ptr_limit) + return n_none; + if (ptr->attribute == a_sign) + ptr++; + seen_a_digit = false; + for (;; ptr++) + { + if (ptr >= ptr_limit) + break; + if (ptr->attribute != a_digit) + return n_none; + seen_a_digit = true; + } + if (!seen_a_digit) + return n_none; + return n_float; + } +} + +/* A token representing a symbol must be case converted. + For portability, we convert only ASCII characters here. */ + +static void +upcase_token (struct token *tp) +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (tp->chars[i].attribute != a_escaped) + { + unsigned char c = tp->chars[i].ch; + if (c >= 'a' && c <= 'z') + tp->chars[i].ch = c - 'a' + 'A'; + } +} + +static void +downcase_token (struct token *tp) +{ + int n = tp->charcount; + int i; + + for (i = 0; i < n; i++) + if (tp->chars[i].attribute != a_escaped) + { + unsigned char c = tp->chars[i].ch; + if (c >= 'A' && c <= 'Z') + tp->chars[i].ch = c - 'A' + 'a'; + } +} + +static void +case_convert_token (struct token *tp) +{ + int n = tp->charcount; + int i; + + switch (readtable_case) + { + case case_upcase: + upcase_token (tp); + break; + + case case_downcase: + downcase_token (tp); + break; + + case case_preserve: + break; + + case case_invert: + { + bool seen_uppercase = false; + bool seen_lowercase = false; + for (i = 0; i < n; i++) + if (tp->chars[i].attribute != a_escaped) + { + unsigned char c = tp->chars[i].ch; + if (c >= 'a' && c <= 'z') + seen_lowercase = true; + if (c >= 'A' && c <= 'Z') + seen_uppercase = true; + } + if (seen_uppercase) + { + if (!seen_lowercase) + downcase_token (tp); + } + else + { + if (seen_lowercase) + upcase_token (tp); + } + } + break; + } +} + + +/* ========================= Accumulating comments ========================= */ + + +static char *buffer; +static size_t bufmax; +static size_t buflen; + +static inline void +comment_start () +{ + buflen = 0; +} + +static inline void +comment_add (int c) +{ + if (buflen >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen++] = c; +} + +static inline void +comment_line_end (size_t chars_to_remove) +{ + buflen -= chars_to_remove; + while (buflen >= 1 + && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) + --buflen; + if (chars_to_remove == 0 && buflen >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen] = '\0'; + savable_comment_add (buffer); +} + + +/* These are for tracking whether comments count as immediately before + keyword. */ +static int last_comment_line; +static int last_non_comment_line; + + +/* ========================= Accumulating messages ========================= */ + + +static message_list_ty *mlp; + + +/* ============== Reading of objects. See CLHS 2 "Syntax". ============== */ + + +/* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings. + Other objects need not to be represented precisely. */ +enum object_type +{ + t_symbol, /* symbol */ + t_string, /* string */ + t_other, /* other kind of real object */ + t_dot, /* '.' pseudo object */ + t_close, /* ')' pseudo object */ + t_eof /* EOF marker */ +}; + +struct object +{ + enum object_type type; + struct token *token; /* for t_symbol and t_string */ + int line_number_at_start; /* for t_string */ +}; + +/* Free the memory pointed to by a 'struct object'. */ +static inline void +free_object (struct object *op) +{ + if (op->type == t_symbol || op->type == t_string) + { + free_token (op->token); + free (op->token); + } +} + +/* Convert a t_symbol/t_string token to a char*. */ +static char * +string_of_object (const struct object *op) +{ + char *str; + const struct token_char *p; + char *q; + int n; + + if (!(op->type == t_symbol || op->type == t_string)) + abort (); + n = op->token->charcount; + str = XNMALLOC (n + 1, char); + q = str; + for (p = op->token->chars; n > 0; p++, n--) + *q++ = p->ch; + *q = '\0'; + return str; +} + +/* Context lookup table. */ +static flag_context_list_table_ty *flag_context_list_table; + +/* Read the next object. */ +static void +read_object (struct object *op, flag_context_ty outer_context) +{ + for (;;) + { + struct char_syntax curr; + + read_char_syntax (&curr); + + switch (curr.scode) + { + case syntax_eof: + op->type = t_eof; + return; + + case syntax_whitespace: + if (curr.ch == '\n') + /* Comments assumed to be grouped with a message must immediately + precede it, with no non-whitespace token on a line between + both. */ + if (last_non_comment_line > last_comment_line) + savable_comment_reset (); + continue; + + case syntax_illegal: + op->type = t_other; + return; + + case syntax_single_esc: + case syntax_multi_esc: + case syntax_constituent: + /* Start reading a token. */ + op->token = XMALLOC (struct token); + read_token (op->token, &curr); + last_non_comment_line = line_number; + + /* Interpret the token. */ + + /* Dots. */ + if (!op->token->with_escape + && op->token->charcount == 1 + && op->token->chars[0].attribute == a_dot) + { + free_token (op->token); + free (op->token); + op->type = t_dot; + return; + } + /* Tokens consisting entirely of dots are illegal, but be tolerant + here. */ + + /* Number. */ + { + int base = read_base; + + if (is_number (op->token, &base) != n_none) + { + free_token (op->token); + free (op->token); + op->type = t_other; + return; + } + } + + /* We interpret all other tokens as symbols (including 'reserved + tokens', i.e. potential numbers which are not numbers). */ + case_convert_token (op->token); + op->type = t_symbol; + return; + + case syntax_t_macro: + case syntax_nt_macro: + /* Read a macro. */ + switch (curr.ch) + { + case '(': + { + int arg = 0; /* Current argument number. */ + flag_context_list_iterator_ty context_iter; + const struct callshapes *shapes = NULL; + struct arglist_parser *argparser = NULL; + + for (;; arg++) + { + struct object inner; + flag_context_ty inner_context; + + if (arg == 0) + inner_context = null_context; + else + inner_context = + inherited_context (outer_context, + flag_context_list_iterator_advance ( + &context_iter)); + + read_object (&inner, inner_context); + + /* Recognize end of list. */ + if (inner.type == t_close) + { + op->type = t_other; + /* Don't bother converting "()" to "NIL". */ + last_non_comment_line = line_number; + if (argparser != NULL) + arglist_parser_done (argparser, arg); + return; + } + + /* Dots are not allowed in every position. + But be tolerant. */ + + /* EOF inside list is illegal. + But be tolerant. */ + if (inner.type == t_eof) + break; + + if (arg == 0) + { + /* This is the function position. */ + if (inner.type == t_symbol) + { + char *symbol_name = string_of_object (&inner); + int i; + int prefix_len; + void *keyword_value; + + /* Omit any package name. */ + i = inner.token->charcount; + while (i > 0 + && inner.token->chars[i-1].attribute != a_pack_m) + i--; + prefix_len = i; + + if (hash_find_entry (&keywords, + symbol_name + prefix_len, + strlen (symbol_name + prefix_len), + &keyword_value) + == 0) + shapes = (const struct callshapes *) keyword_value; + + argparser = arglist_parser_alloc (mlp, shapes); + + context_iter = + flag_context_list_iterator ( + flag_context_list_table_lookup ( + flag_context_list_table, + symbol_name, strlen (symbol_name))); + + free (symbol_name); + } + else + context_iter = null_context_list_iterator; + } + else + { + /* These are the argument positions. */ + if (argparser != NULL && inner.type == t_string) + arglist_parser_remember (argparser, arg, + string_of_object (&inner), + inner_context, + logical_file_name, + inner.line_number_at_start, + savable_comment); + } + + free_object (&inner); + } + + if (argparser != NULL) + arglist_parser_done (argparser, arg); + } + op->type = t_other; + last_non_comment_line = line_number; + return; + + case ')': + /* Tell the caller about the end of list. + Unmatched closing parenthesis is illegal. + But be tolerant. */ + op->type = t_close; + last_non_comment_line = line_number; + return; + + case ',': + { + int c = do_getc (); + /* The ,@ handling inside lists is wrong anyway, because + ,@form expands to an unknown number of elements. */ + if (c != EOF && c != '@' && c != '.') + do_ungetc (c); + } + /*FALLTHROUGH*/ + case '\'': + case '`': + { + struct object inner; + + read_object (&inner, null_context); + + /* Dots and EOF are not allowed here. But be tolerant. */ + + free_object (&inner); + + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case ';': + { + bool all_semicolons = true; + + last_comment_line = line_number; + comment_start (); + for (;;) + { + int c = do_getc (); + if (c == EOF || c == '\n') + break; + if (c != ';') + all_semicolons = false; + if (!all_semicolons) + { + /* We skip all leading white space, but not EOLs. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + } + } + comment_line_end (0); + continue; + } + + case '"': + { + op->token = XMALLOC (struct token); + init_token (op->token); + op->line_number_at_start = line_number; + for (;;) + { + int c = do_getc (); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + break; + if (c == '"') + break; + if (c == '\\') /* syntax_single_esc */ + { + c = do_getc (); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + break; + } + grow_token (op->token); + op->token->chars[op->token->charcount++].ch = c; + } + op->type = t_string; + + if (extract_all) + { + lex_pos_ty pos; + + pos.file_name = logical_file_name; + pos.line_number = op->line_number_at_start; + remember_a_message (mlp, NULL, string_of_object (op), + null_context, &pos, + NULL, savable_comment); + } + last_non_comment_line = line_number; + return; + } + + case '#': + /* Dispatch macro handling. */ + { + int c; + + for (;;) + { + c = do_getc (); + if (c == EOF) + /* Invalid input. Be tolerant, no error message. */ + { + op->type = t_other; + return; + } + if (!(c >= '0' && c <= '9')) + break; + } + + switch (c) + { + case '(': + case '"': + do_ungetc (c); + /*FALLTHROUGH*/ + case '\'': + case ':': + case '.': + case ',': + case 'A': case 'a': + case 'C': case 'c': + case 'P': case 'p': + case 'S': case 's': + { + struct object inner; + read_object (&inner, null_context); + /* Dots and EOF are not allowed here. + But be tolerant. */ + free_object (&inner); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case '|': + { + int depth = 0; + int c; + + comment_start (); + c = do_getc (); + for (;;) + { + if (c == EOF) + break; + if (c == '|') + { + c = do_getc (); + if (c == EOF) + break; + if (c == '#') + { + if (depth == 0) + { + comment_line_end (0); + break; + } + depth--; + comment_add ('|'); + comment_add ('#'); + c = do_getc (); + } + else + comment_add ('|'); + } + else if (c == '#') + { + c = do_getc (); + if (c == EOF) + break; + comment_add ('#'); + if (c == '|') + { + depth++; + comment_add ('|'); + c = do_getc (); + } + } + else + { + /* We skip all leading white space. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + if (c == '\n') + { + comment_line_end (1); + comment_start (); + } + c = do_getc (); + } + } + if (c == EOF) + { + /* EOF not allowed here. But be tolerant. */ + op->type = t_eof; + return; + } + last_comment_line = line_number; + continue; + } + + case '\\': + { + struct token token; + struct char_syntax first; + first.ch = '\\'; + first.scode = syntax_single_esc; + read_token (&token, &first); + free_token (&token); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case 'B': case 'b': + case 'O': case 'o': + case 'X': case 'x': + case 'R': case 'r': + case '*': + { + struct token token; + read_token (&token, NULL); + free_token (&token); + op->type = t_other; + last_non_comment_line = line_number; + return; + } + + case '=': + /* Ignore read labels. */ + continue; + + case '#': + /* Don't bother looking up the corresponding object. */ + op->type = t_other; + last_non_comment_line = line_number; + return; + + case '+': + case '-': + /* Simply assume every feature expression is true. */ + { + struct object inner; + read_object (&inner, null_context); + /* Dots and EOF are not allowed here. + But be tolerant. */ + free_object (&inner); + continue; + } + + default: + op->type = t_other; + last_non_comment_line = line_number; + return; + } + /*NOTREACHED*/ + abort (); + } + + default: + /*NOTREACHED*/ + abort (); + } + + default: + /*NOTREACHED*/ + abort (); + } + } +} + + +void +extract_lisp (FILE *f, + const char *real_filename, const char *logical_filename, + flag_context_list_table_ty *flag_table, + msgdomain_list_ty *mdlp) +{ + mlp = mdlp->item[0]->messages; + + fp = f; + real_file_name = real_filename; + logical_file_name = xstrdup (logical_filename); + line_number = 1; + + last_comment_line = -1; + last_non_comment_line = -1; + + flag_context_list_table = flag_table; + + init_keywords (); + + /* Eat tokens until eof is seen. When read_object returns + due to an unbalanced closing parenthesis, just restart it. */ + do + { + struct object toplevel_object; + + read_object (&toplevel_object, null_context); + + if (toplevel_object.type == t_eof) + break; + + free_object (&toplevel_object); + } + while (!feof (fp)); + + /* Close scanner. */ + fp = NULL; + real_file_name = NULL; + logical_file_name = NULL; + line_number = 0; +} |