diff options
Diffstat (limited to 'gettext-tools/src/x-awk.c')
-rw-r--r-- | gettext-tools/src/x-awk.c | 887 |
1 files changed, 887 insertions, 0 deletions
diff --git a/gettext-tools/src/x-awk.c b/gettext-tools/src/x-awk.c new file mode 100644 index 0000000..6a6a9dc --- /dev/null +++ b/gettext-tools/src/x-awk.c @@ -0,0 +1,887 @@ +/* xgettext awk backend. + Copyright (C) 2002-2003, 2005-2009 Free Software Foundation, Inc. + + This file was written by Bruno Haible <haible@clisp.cons.org>, 2002. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +/* Specification. */ +#include "x-awk.h" + +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "message.h" +#include "xgettext.h" +#include "error.h" +#include "error-progname.h" +#include "xalloc.h" +#include "gettext.h" + +#define _(s) gettext(s) + + +/* The awk syntax is defined in the gawk manual page and documentation. + See also gawk/awkgram.y. */ + + +/* ====================== Keyword set customization. ====================== */ + +/* If true extract all strings. */ +static bool extract_all = false; + +static hash_table keywords; +static bool default_keywords = true; + + +void +x_awk_extract_all () +{ + extract_all = true; +} + + +void +x_awk_keyword (const char *name) +{ + if (name == NULL) + default_keywords = false; + else + { + const char *end; + struct callshape shape; + const char *colon; + + if (keywords.table == NULL) + hash_init (&keywords, 100); + + split_keywordspec (name, &end, &shape); + + /* The characters between name and end should form a valid C identifier. + A colon means an invalid parse in split_keywordspec(). */ + colon = strchr (name, ':'); + if (colon == NULL || colon >= end) + insert_keyword_callshape (&keywords, name, end - name, &shape); + } +} + +/* Finish initializing the keywords hash table. + Called after argument processing, before each file is processed. */ +static void +init_keywords () +{ + if (default_keywords) + { + /* When adding new keywords here, also update the documentation in + xgettext.texi! */ + x_awk_keyword ("dcgettext"); + x_awk_keyword ("dcngettext:1,2"); + default_keywords = false; + } +} + +void +init_flag_table_awk () +{ + xgettext_record_flag ("dcgettext:1:pass-awk-format"); + xgettext_record_flag ("dcngettext:1:pass-awk-format"); + xgettext_record_flag ("dcngettext:2:pass-awk-format"); + xgettext_record_flag ("printf:1:awk-format"); +} + + +/* ======================== Reading of characters. ======================== */ + +/* Real filename, used in error messages about the input file. */ +static const char *real_file_name; + +/* Logical filename and line number, used to label the extracted messages. */ +static char *logical_file_name; +static int line_number; + +/* The input file stream. */ +static FILE *fp; + +/* These are for tracking whether comments count as immediately before + keyword. */ +static int last_comment_line; +static int last_non_comment_line; + + +/* 1. line_number handling. */ + +static int +phase1_getc () +{ + int c = getc (fp); + + if (c == EOF) + { + if (ferror (fp)) + error (EXIT_FAILURE, errno, _("error while reading \"%s\""), + real_file_name); + return EOF; + } + + if (c == '\n') + line_number++; + + return c; +} + +/* Supports only one pushback character. */ +static void +phase1_ungetc (int c) +{ + if (c != EOF) + { + if (c == '\n') + --line_number; + + ungetc (c, fp); + } +} + + +/* 2. Replace each comment that is not inside a string literal or regular + expression with a newline character. We need to remember the comment + for later, because it may be attached to a keyword string. */ + +static int +phase2_getc () +{ + static char *buffer; + static size_t bufmax; + size_t buflen; + int lineno; + int c; + + c = phase1_getc (); + if (c == '#') + { + buflen = 0; + lineno = line_number; + for (;;) + { + c = phase1_getc (); + if (c == '\n' || c == EOF) + break; + /* We skip all leading white space, but not EOLs. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + { + if (buflen >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen++] = c; + } + } + if (buflen >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[buflen] = '\0'; + savable_comment_add (buffer); + last_comment_line = lineno; + } + return c; +} + +/* Supports only one pushback character. */ +static void +phase2_ungetc (int c) +{ + if (c != EOF) + phase1_ungetc (c); +} + + +/* ========================== Reading of tokens. ========================== */ + + +enum token_type_ty +{ + token_type_eof, + token_type_lparen, /* ( */ + token_type_rparen, /* ) */ + token_type_comma, /* , */ + token_type_string, /* "abc" */ + token_type_i18nstring, /* _"abc" */ + token_type_symbol, /* symbol, number */ + token_type_semicolon, /* ; */ + token_type_other /* regexp, misc. operator */ +}; +typedef enum token_type_ty token_type_ty; + +typedef struct token_ty token_ty; +struct token_ty +{ + token_type_ty type; + char *string; /* for token_type_{symbol,string,i18nstring} */ + int line_number; +}; + + +/* 7. Replace escape sequences within character strings with their + single character equivalents. */ + +#define P7_QUOTES (1000 + '"') + +static int +phase7_getc () +{ + int c; + + for (;;) + { + /* Use phase 1, because phase 2 elides comments. */ + c = phase1_getc (); + + if (c == EOF || c == '\n') + break; + if (c == '"') + return P7_QUOTES; + if (c != '\\') + return c; + c = phase1_getc (); + if (c == EOF) + break; + if (c != '\n') + switch (c) + { + case 'a': + return '\a'; + case 'b': + return '\b'; + case 'f': + return '\f'; + case 'n': + return '\n'; + case 'r': + return '\r'; + case 't': + return '\t'; + case 'v': + return '\v'; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + { + int n = c - '0'; + + c = phase1_getc (); + if (c != EOF) + { + if (c >= '0' && c <= '7') + { + n = (n << 3) + (c - '0'); + c = phase1_getc (); + if (c != EOF) + { + if (c >= '0' && c <= '7') + n = (n << 3) + (c - '0'); + else + phase1_ungetc (c); + } + } + else + phase1_ungetc (c); + } + return (unsigned char) n; + } + case 'x': + { + int n = 0; + + for (;;) + { + c = phase1_getc (); + if (c == EOF) + break; + else if (c >= '0' && c <= '9') + n = (n << 4) + (c - '0'); + else if (c >= 'A' && c <= 'F') + n = (n << 4) + (c - 'A' + 10); + else if (c >= 'a' && c <= 'f') + n = (n << 4) + (c - 'a' + 10); + else + { + phase1_ungetc (c); + break; + } + } + return (unsigned char) n; + } + default: + return c; + } + } + + phase1_ungetc (c); + error_with_progname = false; + error (0, 0, _("%s:%d: warning: unterminated string"), logical_file_name, + line_number); + error_with_progname = true; + return P7_QUOTES; +} + + +/* Free the memory pointed to by a 'struct token_ty'. */ +static inline void +free_token (token_ty *tp) +{ + switch (tp->type) + { + case token_type_string: + case token_type_i18nstring: + case token_type_symbol: + free (tp->string); + break; + default: + break; + } +} + + +/* Combine characters into tokens. Discard whitespace. */ + +/* There is an ambiguity about '/': It can start a division operator ('/' or + '/=') or it can start a regular expression. The distinction is important + because inside regular expressions, '#' and '"' lose its special meanings. + If you look at the awk grammar, you see that the operator is only allowed + right after a 'variable' or 'simp_exp' nonterminal, and these nonterminals + can only end in the NAME, LENGTH, YSTRING, YNUMBER, ')', ']' terminals. + So we prefer the division operator interpretation only right after + symbol, string, number, ')', ']', with whitespace but no newline allowed + in between. */ +static bool prefer_division_over_regexp; + +static void +x_awk_lex (token_ty *tp) +{ + static char *buffer; + static int bufmax; + int bufpos; + int c; + + for (;;) + { + tp->line_number = line_number; + c = phase2_getc (); + + switch (c) + { + case EOF: + tp->type = token_type_eof; + return; + + case '\n': + if (last_non_comment_line > last_comment_line) + savable_comment_reset (); + /* Newline is not allowed inside expressions. It usually + introduces a fresh statement. + FIXME: Newlines after any of ',' '{' '?' ':' '||' '&&' 'do' 'else' + does *not* introduce a fresh statement. */ + prefer_division_over_regexp = false; + /* FALLTHROUGH */ + case '\t': + case ' ': + /* Ignore whitespace and comments. */ + continue; + + case '\\': + /* Backslash ought to be immediately followed by a newline. */ + continue; + } + + last_non_comment_line = tp->line_number; + + switch (c) + { + case '.': + { + int c2 = phase2_getc (); + phase2_ungetc (c2); + if (!(c2 >= '0' && c2 <= '9')) + { + + tp->type = token_type_other; + prefer_division_over_regexp = false; + return; + } + } + /* FALLTHROUGH */ + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + /* Symbol, or part of a number. */ + bufpos = 0; + for (;;) + { + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + c = phase2_getc (); + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + continue; + default: + if (bufpos == 1 && buffer[0] == '_' && c == '"') + { + tp->type = token_type_i18nstring; + goto case_string; + } + phase2_ungetc (c); + break; + } + break; + } + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = '\0'; + tp->string = xstrdup (buffer); + tp->type = token_type_symbol; + /* Most identifiers can be variable names; after them we must + interpret '/' as division operator. But for awk's builtin + keywords we have three cases: + (a) Must interpret '/' as division operator. "length". + (b) Must interpret '/' as start of a regular expression. + "do", "exit", "print", "printf", "return". + (c) '/' after this keyword in invalid anyway. All others. + I used the following script for the distinction. + for k in $awk_keywords; do + echo; echo $k; awk "function foo () { $k / 10 }" < /dev/null + done + */ + if (strcmp (buffer, "do") == 0 + || strcmp (buffer, "exit") == 0 + || strcmp (buffer, "print") == 0 + || strcmp (buffer, "printf") == 0 + || strcmp (buffer, "return") == 0) + prefer_division_over_regexp = false; + else + prefer_division_over_regexp = true; + return; + + case '"': + tp->type = token_type_string; + case_string: + bufpos = 0; + for (;;) + { + c = phase7_getc (); + if (c == EOF || c == P7_QUOTES) + break; + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + } + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = '\0'; + tp->string = xstrdup (buffer); + prefer_division_over_regexp = true; + return; + + case '(': + tp->type = token_type_lparen; + prefer_division_over_regexp = false; + return; + + case ')': + tp->type = token_type_rparen; + prefer_division_over_regexp = true; + return; + + case ',': + tp->type = token_type_comma; + prefer_division_over_regexp = false; + return; + + case ';': + tp->type = token_type_semicolon; + prefer_division_over_regexp = false; + return; + + case ']': + tp->type = token_type_other; + prefer_division_over_regexp = true; + return; + + case '/': + if (!prefer_division_over_regexp) + { + /* Regular expression. + Counting brackets is non-trivial. [[] is balanced, and so is + [\]]. Also, /[/]/ is balanced and ends at the third slash. + Do not count [ or ] if either one is preceded by a \. + A '[' should be counted if + a) it is the first one so far (brackets == 0), or + b) it is the '[' in '[:'. + A ']' should be counted if not preceded by a \. + According to POSIX, []] is how you put a ] into a set. + Try to handle that too. + */ + int brackets = 0; + bool pos0 = true; /* true at start of regexp */ + bool pos1_open = false; /* true after [ at start of regexp */ + bool pos2_open_not = false; /* true after [^ at start of regexp */ + + for (;;) + { + c = phase1_getc (); + + if (c == EOF || c == '\n') + { + phase1_ungetc (c); + error_with_progname = false; + error (0, 0, _("%s:%d: warning: unterminated regular expression"), + logical_file_name, line_number); + error_with_progname = true; + break; + } + else if (c == '[') + { + if (brackets == 0) + brackets++; + else + { + c = phase1_getc (); + if (c == ':') + brackets++; + phase1_ungetc (c); + } + if (pos0) + { + pos0 = false; + pos1_open = true; + continue; + } + } + else if (c == ']') + { + if (!(pos1_open || pos2_open_not)) + brackets--; + } + else if (c == '^') + { + if (pos1_open) + { + pos1_open = false; + pos2_open_not = true; + continue; + } + } + else if (c == '\\') + { + c = phase1_getc (); + /* Backslash-newline is valid and ignored. */ + } + else if (c == '/') + { + if (brackets <= 0) + break; + } + + pos0 = false; + pos1_open = false; + pos2_open_not = false; + } + + tp->type = token_type_other; + prefer_division_over_regexp = false; + return; + } + /* FALLTHROUGH */ + + default: + /* We could carefully recognize each of the 2 and 3 character + operators, but it is not necessary, as we only need to recognize + gettext invocations. Don't bother. */ + tp->type = token_type_other; + prefer_division_over_regexp = false; + return; + } + } +} + + +/* ========================= Extracting strings. ========================== */ + + +/* Context lookup table. */ +static flag_context_list_table_ty *flag_context_list_table; + + +/* The file is broken into tokens. Scan the token stream, looking for + a keyword, followed by a left paren, followed by a string. When we + see this sequence, we have something to remember. We assume we are + looking at a valid C or C++ program, and leave the complaints about + the grammar to the compiler. + + Normal handling: Look for + keyword ( ... msgid ... ) + Plural handling: Look for + keyword ( ... msgid ... msgid_plural ... ) + + We use recursion because the arguments before msgid or between msgid + and msgid_plural can contain subexpressions of the same form. */ + + +/* Extract messages until the next balanced closing parenthesis. + Extracted messages are added to MLP. + Return true upon eof, false upon closing parenthesis. */ +static bool +extract_parenthesized (message_list_ty *mlp, + flag_context_ty outer_context, + flag_context_list_iterator_ty context_iter, + struct arglist_parser *argparser) +{ + /* Current argument number. */ + int arg = 1; + /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ + int state; + /* Parameters of the keyword just seen. Defined only in state 1. */ + const struct callshapes *next_shapes = NULL; + /* Whether to implicitly assume the next tokens are arguments even without + a '('. */ + bool next_is_argument = false; + /* Context iterator that will be used if the next token is a '('. */ + flag_context_list_iterator_ty next_context_iter = + passthrough_context_list_iterator; + /* Current context. */ + flag_context_ty inner_context = + inherited_context (outer_context, + flag_context_list_iterator_advance (&context_iter)); + + /* Start state is 0. */ + state = 0; + + for (;;) + { + token_ty token; + + x_awk_lex (&token); + + if (next_is_argument && token.type != token_type_lparen) + { + /* An argument list starts, even though there is no '('. */ + context_iter = next_context_iter; + outer_context = inner_context; + inner_context = + inherited_context (outer_context, + flag_context_list_iterator_advance ( + &context_iter)); + } + + switch (token.type) + { + case token_type_symbol: + { + void *keyword_value; + + if (hash_find_entry (&keywords, token.string, strlen (token.string), + &keyword_value) + == 0) + { + next_shapes = (const struct callshapes *) keyword_value; + state = 1; + } + else + state = 0; + } + next_is_argument = + (strcmp (token.string, "print") == 0 + || strcmp (token.string, "printf") == 0); + next_context_iter = + flag_context_list_iterator ( + flag_context_list_table_lookup ( + flag_context_list_table, + token.string, strlen (token.string))); + free (token.string); + continue; + + case token_type_lparen: + if (extract_parenthesized (mlp, inner_context, next_context_iter, + arglist_parser_alloc (mlp, + state ? next_shapes : NULL))) + { + arglist_parser_done (argparser, arg); + return true; + } + next_is_argument = false; + next_context_iter = null_context_list_iterator; + state = 0; + continue; + + case token_type_rparen: + arglist_parser_done (argparser, arg); + return false; + + case token_type_comma: + arg++; + inner_context = + inherited_context (outer_context, + flag_context_list_iterator_advance ( + &context_iter)); + next_is_argument = false; + next_context_iter = passthrough_context_list_iterator; + state = 0; + continue; + + case token_type_string: + { + lex_pos_ty pos; + pos.file_name = logical_file_name; + pos.line_number = token.line_number; + + if (extract_all) + remember_a_message (mlp, NULL, token.string, inner_context, &pos, + NULL, savable_comment); + else + arglist_parser_remember (argparser, arg, token.string, + inner_context, + pos.file_name, pos.line_number, + savable_comment); + } + next_is_argument = false; + next_context_iter = null_context_list_iterator; + state = 0; + continue; + + case token_type_i18nstring: + { + lex_pos_ty pos; + pos.file_name = logical_file_name; + pos.line_number = token.line_number; + + remember_a_message (mlp, NULL, token.string, inner_context, &pos, + NULL, savable_comment); + } + next_is_argument = false; + next_context_iter = null_context_list_iterator; + state = 0; + continue; + + case token_type_semicolon: + /* An argument list ends, and a new statement begins. */ + /* FIXME: Should handle newline that acts as statement separator + in the same way. */ + /* FIXME: Instead of resetting outer_context here, it may be better + to recurse in the next_is_argument handling above, waiting for + the next semicolon or other statement terminator. */ + outer_context = null_context; + context_iter = null_context_list_iterator; + next_is_argument = false; + next_context_iter = passthrough_context_list_iterator; + inner_context = + inherited_context (outer_context, + flag_context_list_iterator_advance ( + &context_iter)); + state = 0; + continue; + + case token_type_eof: + arglist_parser_done (argparser, arg); + return true; + + case token_type_other: + next_is_argument = false; + next_context_iter = null_context_list_iterator; + state = 0; + continue; + + default: + abort (); + } + } +} + + +void +extract_awk (FILE *f, + const char *real_filename, const char *logical_filename, + flag_context_list_table_ty *flag_table, + msgdomain_list_ty *mdlp) +{ + message_list_ty *mlp = mdlp->item[0]->messages; + + fp = f; + real_file_name = real_filename; + logical_file_name = xstrdup (logical_filename); + line_number = 1; + + last_comment_line = -1; + last_non_comment_line = -1; + + prefer_division_over_regexp = false; + + flag_context_list_table = flag_table; + + init_keywords (); + + /* Eat tokens until eof is seen. When extract_parenthesized returns + due to an unbalanced closing parenthesis, just restart it. */ + while (!extract_parenthesized (mlp, null_context, null_context_list_iterator, + arglist_parser_alloc (mlp, NULL))) + ; + + fp = NULL; + real_file_name = NULL; + logical_file_name = NULL; + line_number = 0; +} |