summaryrefslogtreecommitdiff
path: root/gettext-tools/src/x-perl.c
diff options
context:
space:
mode:
Diffstat (limited to 'gettext-tools/src/x-perl.c')
-rw-r--r--gettext-tools/src/x-perl.c3592
1 files changed, 3592 insertions, 0 deletions
diff --git a/gettext-tools/src/x-perl.c b/gettext-tools/src/x-perl.c
new file mode 100644
index 0000000..571f6de
--- /dev/null
+++ b/gettext-tools/src/x-perl.c
@@ -0,0 +1,3592 @@
+/* xgettext Perl backend.
+ Copyright (C) 2002-2010 Free Software Foundation, Inc.
+
+ This file was written by Guido Flohr <guido@imperia.net>, 2002-2010.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+/* Specification. */
+#include "x-perl.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "message.h"
+#include "xgettext.h"
+#include "error.h"
+#include "error-progname.h"
+#include "xalloc.h"
+#include "po-charset.h"
+#include "unistr.h"
+#include "uniname.h"
+#include "gettext.h"
+
+#define _(s) gettext(s)
+
+/* The Perl syntax is defined in perlsyn.pod. Try the command
+ "man perlsyn" or "perldoc perlsyn".
+ Also, the syntax after the 'sub' keyword is specified in perlsub.pod.
+ Try the command "man perlsub" or "perldoc perlsub".
+ Perl 5.10 has new operators '//' and '//=', see
+ <http://perldoc.perl.org/perldelta.html#Defined-or-operator>. */
+
+#define DEBUG_PERL 0
+
+
+/* ====================== Keyword set customization. ====================== */
+
+/* If true extract all strings. */
+static bool extract_all = false;
+
+static hash_table keywords;
+static bool default_keywords = true;
+
+
+void
+x_perl_extract_all ()
+{
+ extract_all = true;
+}
+
+
+void
+x_perl_keyword (const char *name)
+{
+ if (name == NULL)
+ default_keywords = false;
+ else
+ {
+ const char *end;
+ struct callshape shape;
+ const char *colon;
+
+ if (keywords.table == NULL)
+ hash_init (&keywords, 100);
+
+ split_keywordspec (name, &end, &shape);
+
+ /* The characters between name and end should form a valid C identifier.
+ A colon means an invalid parse in split_keywordspec(). */
+ colon = strchr (name, ':');
+ if (colon == NULL || colon >= end)
+ insert_keyword_callshape (&keywords, name, end - name, &shape);
+ }
+}
+
+/* Finish initializing the keywords hash table.
+ Called after argument processing, before each file is processed. */
+static void
+init_keywords ()
+{
+ if (default_keywords)
+ {
+ /* When adding new keywords here, also update the documentation in
+ xgettext.texi! */
+ x_perl_keyword ("gettext");
+ x_perl_keyword ("%gettext");
+ x_perl_keyword ("$gettext");
+ x_perl_keyword ("dgettext:2");
+ x_perl_keyword ("dcgettext:2");
+ x_perl_keyword ("ngettext:1,2");
+ x_perl_keyword ("dngettext:2,3");
+ x_perl_keyword ("dcngettext:2,3");
+ x_perl_keyword ("gettext_noop");
+#if 0
+ x_perl_keyword ("__");
+ x_perl_keyword ("$__");
+ x_perl_keyword ("%__");
+ x_perl_keyword ("__x");
+ x_perl_keyword ("__n:1,2");
+ x_perl_keyword ("__nx:1,2");
+ x_perl_keyword ("__xn:1,2");
+ x_perl_keyword ("N__");
+#endif
+ default_keywords = false;
+ }
+}
+
+void
+init_flag_table_perl ()
+{
+ xgettext_record_flag ("gettext:1:pass-perl-format");
+ xgettext_record_flag ("gettext:1:pass-perl-brace-format");
+ xgettext_record_flag ("%gettext:1:pass-perl-format");
+ xgettext_record_flag ("%gettext:1:pass-perl-brace-format");
+ xgettext_record_flag ("$gettext:1:pass-perl-format");
+ xgettext_record_flag ("$gettext:1:pass-perl-brace-format");
+ xgettext_record_flag ("dgettext:2:pass-perl-format");
+ xgettext_record_flag ("dgettext:2:pass-perl-brace-format");
+ xgettext_record_flag ("dcgettext:2:pass-perl-format");
+ xgettext_record_flag ("dcgettext:2:pass-perl-brace-format");
+ xgettext_record_flag ("ngettext:1:pass-perl-format");
+ xgettext_record_flag ("ngettext:2:pass-perl-format");
+ xgettext_record_flag ("ngettext:1:pass-perl-brace-format");
+ xgettext_record_flag ("ngettext:2:pass-perl-brace-format");
+ xgettext_record_flag ("dngettext:2:pass-perl-format");
+ xgettext_record_flag ("dngettext:3:pass-perl-format");
+ xgettext_record_flag ("dngettext:2:pass-perl-brace-format");
+ xgettext_record_flag ("dngettext:3:pass-perl-brace-format");
+ xgettext_record_flag ("dcngettext:2:pass-perl-format");
+ xgettext_record_flag ("dcngettext:3:pass-perl-format");
+ xgettext_record_flag ("dcngettext:2:pass-perl-brace-format");
+ xgettext_record_flag ("dcngettext:3:pass-perl-brace-format");
+ xgettext_record_flag ("gettext_noop:1:pass-perl-format");
+ xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format");
+ xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */
+ xgettext_record_flag ("sprintf:1:perl-format");
+#if 0
+ xgettext_record_flag ("__:1:pass-perl-format");
+ xgettext_record_flag ("__:1:pass-perl-brace-format");
+ xgettext_record_flag ("%__:1:pass-perl-format");
+ xgettext_record_flag ("%__:1:pass-perl-brace-format");
+ xgettext_record_flag ("$__:1:pass-perl-format");
+ xgettext_record_flag ("$__:1:pass-perl-brace-format");
+ xgettext_record_flag ("__x:1:perl-brace-format");
+ xgettext_record_flag ("__n:1:pass-perl-format");
+ xgettext_record_flag ("__n:2:pass-perl-format");
+ xgettext_record_flag ("__n:1:pass-perl-brace-format");
+ xgettext_record_flag ("__n:2:pass-perl-brace-format");
+ xgettext_record_flag ("__nx:1:perl-brace-format");
+ xgettext_record_flag ("__nx:2:perl-brace-format");
+ xgettext_record_flag ("__xn:1:perl-brace-format");
+ xgettext_record_flag ("__xn:2:perl-brace-format");
+ xgettext_record_flag ("N__:1:pass-perl-format");
+ xgettext_record_flag ("N__:1:pass-perl-brace-format");
+#endif
+}
+
+
+/* ======================== Reading of characters. ======================== */
+
+/* Real filename, used in error messages about the input file. */
+static const char *real_file_name;
+
+/* Logical filename and line number, used to label the extracted messages. */
+static char *logical_file_name;
+static int line_number;
+
+/* The input file stream. */
+static FILE *fp;
+
+/* The current line buffer. */
+static char *linebuf;
+
+/* The size of the current line. */
+static int linesize;
+
+/* The position in the current line. */
+static int linepos;
+
+/* The size of the input buffer. */
+static size_t linebuf_size;
+
+/* Number of lines eaten for here documents. */
+static int eaten_here;
+
+/* Paranoia: EOF marker for __END__ or __DATA__. */
+static bool end_of_file;
+
+
+/* 1. line_number handling. */
+
+/* Returns the next character from the input stream or EOF. */
+static int
+phase1_getc ()
+{
+ line_number += eaten_here;
+ eaten_here = 0;
+
+ if (end_of_file)
+ return EOF;
+
+ if (linepos >= linesize)
+ {
+ linesize = getline (&linebuf, &linebuf_size, fp);
+
+ if (linesize < 0)
+ {
+ if (ferror (fp))
+ error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
+ real_file_name);
+ end_of_file = true;
+ return EOF;
+ }
+
+ linepos = 0;
+ ++line_number;
+
+ /* Undosify. This is important for catching the end of <<EOF and
+ <<'EOF'. We could rely on stdio doing this for us but
+ it is not uncommon to to come across Perl scripts with CRLF
+ newline conventions on systems that do not follow this
+ convention. */
+ if (linesize >= 2 && linebuf[linesize - 1] == '\n'
+ && linebuf[linesize - 2] == '\r')
+ {
+ linebuf[linesize - 2] = '\n';
+ linebuf[linesize - 1] = '\0';
+ --linesize;
+ }
+ }
+
+ return linebuf[linepos++];
+}
+
+/* Supports only one pushback character. */
+static void
+phase1_ungetc (int c)
+{
+ if (c != EOF)
+ {
+ if (linepos == 0)
+ /* Attempt to ungetc across line boundary. Shouldn't happen.
+ No two phase1_ungetc calls are permitted in a row. */
+ abort ();
+
+ --linepos;
+ }
+}
+
+/* Read a here document and return its contents.
+ The delimiter is an UTF-8 encoded string; the resulting string is UTF-8
+ encoded as well. */
+
+static char *
+get_here_document (const char *delimiter)
+{
+ /* Accumulator for the entire here document, including a NUL byte
+ at the end. */
+ static char *buffer;
+ static size_t bufmax = 0;
+ size_t bufpos = 0;
+ /* Current line being appended. */
+ static char *my_linebuf = NULL;
+ static size_t my_linebuf_size = 0;
+
+ /* Allocate the initial buffer. Later on, bufmax > 0. */
+ if (bufmax == 0)
+ {
+ buffer = XNMALLOC (1, char);
+ buffer[0] = '\0';
+ bufmax = 1;
+ }
+
+ for (;;)
+ {
+ int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
+ char *my_line_utf8;
+ bool chomp;
+
+ if (read_bytes < 0)
+ {
+ if (ferror (fp))
+ {
+ error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
+ real_file_name);
+ }
+ else
+ {
+ error_with_progname = false;
+ error (EXIT_SUCCESS, 0, _("\
+%s:%d: can't find string terminator \"%s\" anywhere before EOF"),
+ real_file_name, line_number, delimiter);
+ error_with_progname = true;
+
+ break;
+ }
+ }
+
+ ++eaten_here;
+
+ /* Convert to UTF-8. */
+ my_line_utf8 =
+ from_current_source_encoding (my_linebuf, lc_string, logical_file_name,
+ line_number + eaten_here);
+ if (my_line_utf8 != my_linebuf)
+ {
+ if (strlen (my_line_utf8) >= my_linebuf_size)
+ {
+ my_linebuf_size = strlen (my_line_utf8) + 1;
+ my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
+ }
+ strcpy (my_linebuf, my_line_utf8);
+ free (my_line_utf8);
+ }
+
+ /* Undosify. This is important for catching the end of <<EOF and
+ <<'EOF'. We could rely on stdio doing this for us but you
+ it is not uncommon to to come across Perl scripts with CRLF
+ newline conventions on systems that do not follow this
+ convention. */
+ if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n'
+ && my_linebuf[read_bytes - 2] == '\r')
+ {
+ my_linebuf[read_bytes - 2] = '\n';
+ my_linebuf[read_bytes - 1] = '\0';
+ --read_bytes;
+ }
+
+ /* Temporarily remove the trailing newline from my_linebuf. */
+ chomp = false;
+ if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n')
+ {
+ chomp = true;
+ my_linebuf[read_bytes - 1] = '\0';
+ }
+
+ /* See whether this line terminates the here document. */
+ if (strcmp (my_linebuf, delimiter) == 0)
+ break;
+
+ /* Add back the trailing newline to my_linebuf. */
+ if (chomp)
+ my_linebuf[read_bytes - 1] = '\n';
+
+ /* Ensure room for read_bytes + 1 bytes. */
+ if (bufpos + read_bytes >= bufmax)
+ {
+ do
+ bufmax = 2 * bufmax + 10;
+ while (bufpos + read_bytes >= bufmax);
+ buffer = xrealloc (buffer, bufmax);
+ }
+ /* Append this line to the accumulator. */
+ strcpy (buffer + bufpos, my_linebuf);
+ bufpos += read_bytes;
+ }
+
+ /* Done accumulating the here document. */
+ return xstrdup (buffer);
+}
+
+/* Skips pod sections. */
+static void
+skip_pod ()
+{
+ line_number += eaten_here;
+ eaten_here = 0;
+ linepos = 0;
+
+ for (;;)
+ {
+ linesize = getline (&linebuf, &linebuf_size, fp);
+
+ if (linesize < 0)
+ {
+ if (ferror (fp))
+ error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
+ real_file_name);
+ return;
+ }
+
+ ++line_number;
+
+ if (strncmp ("=cut", linebuf, 4) == 0)
+ {
+ /* Force reading of a new line on next call to phase1_getc(). */
+ linepos = linesize;
+ return;
+ }
+ }
+}
+
+
+/* These are for tracking whether comments count as immediately before
+ keyword. */
+static int last_comment_line;
+static int last_non_comment_line;
+
+
+/* 2. Replace each comment that is not inside a string literal or regular
+ expression with a newline character. We need to remember the comment
+ for later, because it may be attached to a keyword string. */
+
+static int
+phase2_getc ()
+{
+ static char *buffer;
+ static size_t bufmax;
+ size_t buflen;
+ int lineno;
+ int c;
+ char *utf8_string;
+
+ c = phase1_getc ();
+ if (c == '#')
+ {
+ buflen = 0;
+ lineno = line_number;
+ /* Skip leading whitespace. */
+ for (;;)
+ {
+ c = phase1_getc ();
+ if (c == EOF)
+ break;
+ if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
+ {
+ phase1_ungetc (c);
+ break;
+ }
+ }
+ /* Accumulate the comment. */
+ for (;;)
+ {
+ c = phase1_getc ();
+ if (c == '\n' || c == EOF)
+ break;
+ if (buflen >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[buflen++] = c;
+ }
+ if (buflen >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[buflen] = '\0';
+ /* Convert it to UTF-8. */
+ utf8_string =
+ from_current_source_encoding (buffer, lc_comment, logical_file_name,
+ lineno);
+ /* Save it until we encounter the corresponding string. */
+ savable_comment_add (utf8_string);
+ last_comment_line = lineno;
+ }
+ return c;
+}
+
+/* Supports only one pushback character. */
+static void
+phase2_ungetc (int c)
+{
+ if (c != EOF)
+ phase1_ungetc (c);
+}
+
+/* Whitespace recognition. */
+
+#define case_whitespace \
+ case ' ': case '\t': case '\r': case '\n': case '\f'
+
+static inline bool
+is_whitespace (int c)
+{
+ return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f');
+}
+
+
+/* ========================== Reading of tokens. ========================== */
+
+
+enum token_type_ty
+{
+ token_type_eof,
+ token_type_lparen, /* ( */
+ token_type_rparen, /* ) */
+ token_type_comma, /* , */
+ token_type_fat_comma, /* => */
+ token_type_dereference, /* -> */
+ token_type_semicolon, /* ; */
+ token_type_lbrace, /* { */
+ token_type_rbrace, /* } */
+ token_type_lbracket, /* [ */
+ token_type_rbracket, /* ] */
+ token_type_string, /* quote-like */
+ token_type_number, /* starting with a digit o dot */
+ token_type_named_op, /* if, unless, while, ... */
+ token_type_variable, /* $... */
+ token_type_object, /* A dereferenced variable, maybe a blessed
+ object. */
+ token_type_symbol, /* symbol, number */
+ token_type_regex_op, /* s, tr, y, m. */
+ token_type_dot, /* . */
+ token_type_other, /* regexp, misc. operator */
+ /* The following are not really token types, but variants used by
+ the parser. */
+ token_type_keyword_symbol /* keyword symbol */
+};
+typedef enum token_type_ty token_type_ty;
+
+/* Subtypes for strings, important for interpolation. */
+enum string_type_ty
+{
+ string_type_verbatim, /* "<<'EOF'", "m'...'", "s'...''...'",
+ "tr/.../.../", "y/.../.../". */
+ string_type_q, /* "'..'", "q/.../". */
+ string_type_qq, /* '"..."', "`...`", "qq/.../", "qx/.../",
+ "<file*glob>". */
+ string_type_qr /* Not supported. */
+};
+
+/* Subtypes for symbols, important for dollar interpretation. */
+enum symbol_type_ty
+{
+ symbol_type_none, /* Nothing special. */
+ symbol_type_sub, /* 'sub'. */
+ symbol_type_function /* Function name after 'sub'. */
+};
+
+typedef struct token_ty token_ty;
+struct token_ty
+{
+ token_type_ty type;
+ token_type_ty last_type;
+ int sub_type; /* for token_type_string, token_type_symbol */
+ char *string; /* for: in encoding:
+ token_type_named_op ASCII
+ token_type_string UTF-8
+ token_type_symbol ASCII
+ token_type_variable global_source_encoding
+ token_type_object global_source_encoding
+ */
+ refcounted_string_list_ty *comment; /* for token_type_string */
+ int line_number;
+};
+
+#if DEBUG_PERL
+static const char *
+token2string (const token_ty *token)
+{
+ switch (token->type)
+ {
+ case token_type_eof:
+ return "token_type_eof";
+ case token_type_lparen:
+ return "token_type_lparen";
+ case token_type_rparen:
+ return "token_type_rparen";
+ case token_type_comma:
+ return "token_type_comma";
+ case token_type_fat_comma:
+ return "token_type_fat_comma";
+ case token_type_dereference:
+ return "token_type_dereference";
+ case token_type_semicolon:
+ return "token_type_semicolon";
+ case token_type_lbrace:
+ return "token_type_lbrace";
+ case token_type_rbrace:
+ return "token_type_rbrace";
+ case token_type_lbracket:
+ return "token_type_lbracket";
+ case token_type_rbracket:
+ return "token_type_rbracket";
+ case token_type_string:
+ return "token_type_string";
+ case token_type_number:
+ return "token type number";
+ case token_type_named_op:
+ return "token_type_named_op";
+ case token_type_variable:
+ return "token_type_variable";
+ case token_type_object:
+ return "token_type_object";
+ case token_type_symbol:
+ return "token_type_symbol";
+ case token_type_regex_op:
+ return "token_type_regex_op";
+ case token_type_dot:
+ return "token_type_dot";
+ case token_type_other:
+ return "token_type_other";
+ default:
+ return "unknown";
+ }
+}
+#endif
+
+/* Free the memory pointed to by a 'struct token_ty'. */
+static inline void
+free_token (token_ty *tp)
+{
+ switch (tp->type)
+ {
+ case token_type_named_op:
+ case token_type_string:
+ case token_type_symbol:
+ case token_type_variable:
+ case token_type_object:
+ free (tp->string);
+ break;
+ default:
+ break;
+ }
+ if (tp->type == token_type_string)
+ drop_reference (tp->comment);
+ free (tp);
+}
+
+/* Pass 1 of extracting quotes: Find the end of the string, regardless
+ of the semantics of the construct. Return the complete string,
+ including the starting and the trailing delimiter, with backslashes
+ removed where appropriate. */
+static char *
+extract_quotelike_pass1 (int delim)
+{
+ /* This function is called recursively. No way to allocate stuff
+ statically. Also alloca() is inappropriate due to limited stack
+ size on some platforms. So we use malloc(). */
+ int bufmax = 10;
+ char *buffer = XNMALLOC (bufmax, char);
+ int bufpos = 0;
+ bool nested = true;
+ int counter_delim;
+
+ buffer[bufpos++] = delim;
+
+ /* Find the closing delimiter. */
+ switch (delim)
+ {
+ case '(':
+ counter_delim = ')';
+ break;
+ case '{':
+ counter_delim = '}';
+ break;
+ case '[':
+ counter_delim = ']';
+ break;
+ case '<':
+ counter_delim = '>';
+ break;
+ default: /* "..." or '...' or |...| etc. */
+ nested = false;
+ counter_delim = delim;
+ break;
+ }
+
+ for (;;)
+ {
+ int c = phase1_getc ();
+
+ /* This round can produce 1 or 2 bytes. Ensure room for 2 bytes. */
+ if (bufpos + 2 > bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+
+ if (c == counter_delim || c == EOF)
+ {
+ buffer[bufpos++] = counter_delim; /* will be stripped off later */
+ buffer[bufpos++] = '\0';
+#if DEBUG_PERL
+ fprintf (stderr, "PASS1: %s\n", buffer);
+#endif
+ return buffer;
+ }
+
+ if (nested && c == delim)
+ {
+ char *inner = extract_quotelike_pass1 (delim);
+ size_t len = strlen (inner);
+
+ /* Ensure room for len + 1 bytes. */
+ if (bufpos + len >= bufmax)
+ {
+ do
+ bufmax = 2 * bufmax + 10;
+ while (bufpos + len >= bufmax);
+ buffer = xrealloc (buffer, bufmax);
+ }
+ strcpy (buffer + bufpos, inner);
+ free (inner);
+ bufpos += len;
+ }
+ else if (c == '\\')
+ {
+ c = phase1_getc ();
+ if (c == '\\')
+ {
+ buffer[bufpos++] = '\\';
+ buffer[bufpos++] = '\\';
+ }
+ else if (c == delim || c == counter_delim)
+ {
+ /* This is pass2 in Perl. */
+ buffer[bufpos++] = c;
+ }
+ else
+ {
+ buffer[bufpos++] = '\\';
+ phase1_ungetc (c);
+ }
+ }
+ else
+ {
+ buffer[bufpos++] = c;
+ }
+ }
+}
+
+/* Like extract_quotelike_pass1, but return the complete string in UTF-8
+ encoding. */
+static char *
+extract_quotelike_pass1_utf8 (int delim)
+{
+ char *string = extract_quotelike_pass1 (delim);
+ char *utf8_string =
+ from_current_source_encoding (string, lc_string, logical_file_name,
+ line_number);
+ if (utf8_string != string)
+ free (string);
+ return utf8_string;
+}
+
+
+/* ========= Reading of tokens and commands. Extracting strings. ========= */
+
+
+/* Context lookup table. */
+static flag_context_list_table_ty *flag_context_list_table;
+
+
+/* Forward declaration of local functions. */
+static void interpolate_keywords (message_list_ty *mlp, const char *string,
+ int lineno);
+static token_ty *x_perl_lex (message_list_ty *mlp);
+static void x_perl_unlex (token_ty *tp);
+static bool extract_balanced (message_list_ty *mlp,
+ token_type_ty delim, bool eat_delim,
+ bool comma_delim,
+ flag_context_ty outer_context,
+ flag_context_list_iterator_ty context_iter,
+ int arg, struct arglist_parser *argparser);
+
+
+/* Extract an unsigned hexadecimal number from STRING, considering at
+ most LEN bytes and place the result in *RESULT. Returns a pointer
+ to the first character past the hexadecimal number. */
+static const char *
+extract_hex (const char *string, size_t len, unsigned int *result)
+{
+ size_t i;
+
+ *result = 0;
+
+ for (i = 0; i < len; i++)
+ {
+ char c = string[i];
+ int number;
+
+ if (c >= 'A' && c <= 'F')
+ number = c - 'A' + 10;
+ else if (c >= 'a' && c <= 'f')
+ number = c - 'a' + 10;
+ else if (c >= '0' && c <= '9')
+ number = c - '0';
+ else
+ break;
+
+ *result <<= 4;
+ *result |= number;
+ }
+
+ return string + i;
+}
+
+/* Extract an unsigned octal number from STRING, considering at
+ most LEN bytes and place the result in *RESULT. Returns a pointer
+ to the first character past the octal number. */
+static const char *
+extract_oct (const char *string, size_t len, unsigned int *result)
+{
+ size_t i;
+
+ *result = 0;
+
+ for (i = 0; i < len; i++)
+ {
+ char c = string[i];
+ int number;
+
+ if (c >= '0' && c <= '7')
+ number = c - '0';
+ else
+ break;
+
+ *result <<= 3;
+ *result |= number;
+ }
+
+ return string + i;
+}
+
+/* Extract the various quotelike constructs except for <<EOF. See the
+ section "Gory details of parsing quoted constructs" in perlop.pod.
+ Return the resulting token in *tp; tp->type == token_type_string. */
+static void
+extract_quotelike (token_ty *tp, int delim)
+{
+ char *string = extract_quotelike_pass1_utf8 (delim);
+ size_t len = strlen (string);
+
+ tp->type = token_type_string;
+ /* Take the string without the delimiters at the start and at the end. */
+ if (!(len >= 2))
+ abort ();
+ string[len - 1] = '\0';
+ tp->string = xstrdup (string + 1);
+ free (string);
+ tp->comment = add_reference (savable_comment);
+}
+
+/* Extract the quotelike constructs with double delimiters, like
+ s/[SEARCH]/[REPLACE]/. This function does not eat up trailing
+ modifiers (left to the caller).
+ Return the resulting token in *tp; tp->type == token_type_regex_op. */
+static void
+extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
+ bool interpolate)
+{
+ char *string;
+
+ tp->type = token_type_regex_op;
+
+ string = extract_quotelike_pass1_utf8 (delim);
+ if (interpolate)
+ interpolate_keywords (mlp, string, line_number);
+ free (string);
+
+ if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
+ {
+ /* The delimiter for the second string can be different, e.g.
+ s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/. See "man perlrequick". */
+ delim = phase1_getc ();
+ while (is_whitespace (delim))
+ {
+ /* The hash-sign is not a valid delimiter after whitespace, ergo
+ use phase2_getc() and not phase1_getc() now. */
+ delim = phase2_getc ();
+ }
+ }
+ string = extract_quotelike_pass1_utf8 (delim);
+ if (interpolate)
+ interpolate_keywords (mlp, string, line_number);
+ free (string);
+}
+
+/* Perform pass 3 of quotelike extraction (interpolation).
+ *tp is a token of type token_type_string.
+ This function replaces tp->string.
+ This function does not access tp->comment. */
+/* FIXME: Currently may writes null-bytes into the string. */
+static void
+extract_quotelike_pass3 (token_ty *tp, int error_level)
+{
+ static char *buffer;
+ static int bufmax = 0;
+ int bufpos = 0;
+ const char *crs;
+ bool uppercase;
+ bool lowercase;
+ bool quotemeta;
+
+#if DEBUG_PERL
+ switch (tp->sub_type)
+ {
+ case string_type_verbatim:
+ fprintf (stderr, "Interpolating string_type_verbatim:\n");
+ break;
+ case string_type_q:
+ fprintf (stderr, "Interpolating string_type_q:\n");
+ break;
+ case string_type_qq:
+ fprintf (stderr, "Interpolating string_type_qq:\n");
+ break;
+ case string_type_qr:
+ fprintf (stderr, "Interpolating string_type_qr:\n");
+ break;
+ }
+ fprintf (stderr, "%s\n", tp->string);
+ if (tp->sub_type == string_type_verbatim)
+ fprintf (stderr, "---> %s\n", tp->string);
+#endif
+
+ if (tp->sub_type == string_type_verbatim)
+ return;
+
+ /* Loop over tp->string, accumulating the expansion in buffer. */
+ crs = tp->string;
+ uppercase = false;
+ lowercase = false;
+ quotemeta = false;
+ while (*crs)
+ {
+ bool backslashed;
+
+ /* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash
+ if \Q modifier is present. */
+ if (bufpos + 7 > bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+
+ if (tp->sub_type == string_type_q)
+ {
+ switch (*crs)
+ {
+ case '\\':
+ if (crs[1] == '\\')
+ {
+ crs += 2;
+ buffer[bufpos++] = '\\';
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ buffer[bufpos++] = *crs++;
+ break;
+ }
+ continue;
+ }
+
+ /* We only get here for double-quoted strings or regular expressions.
+ Unescape escape sequences. */
+ if (*crs == '\\')
+ {
+ switch (crs[1])
+ {
+ case 't':
+ crs += 2;
+ buffer[bufpos++] = '\t';
+ continue;
+ case 'n':
+ crs += 2;
+ buffer[bufpos++] = '\n';
+ continue;
+ case 'r':
+ crs += 2;
+ buffer[bufpos++] = '\r';
+ continue;
+ case 'f':
+ crs += 2;
+ buffer[bufpos++] = '\f';
+ continue;
+ case 'b':
+ crs += 2;
+ buffer[bufpos++] = '\b';
+ continue;
+ case 'a':
+ crs += 2;
+ buffer[bufpos++] = '\a';
+ continue;
+ case 'e':
+ crs += 2;
+ buffer[bufpos++] = 0x1b;
+ continue;
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ {
+ unsigned int oct_number;
+ int length;
+
+ crs = extract_oct (crs + 1, 3, &oct_number);
+
+ /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
+ true, the character should be converted to its uppercase
+ resp. lowercase equivalent. I don't know if the necessary
+ facilities are already included in gettext. For US-Ascii
+ the conversion can be already be done, however. */
+ if (uppercase && oct_number >= 'a' && oct_number <= 'z')
+ {
+ oct_number = oct_number - 'a' + 'A';
+ }
+ else if (lowercase && oct_number >= 'A' && oct_number <= 'Z')
+ {
+ oct_number = oct_number - 'A' + 'a';
+ }
+
+
+ /* Yes, octal escape sequences in the range 0x100..0x1ff are
+ valid. */
+ length = u8_uctomb ((unsigned char *) (buffer + bufpos),
+ oct_number, 2);
+ if (length > 0)
+ bufpos += length;
+ }
+ continue;
+ case 'x':
+ {
+ unsigned int hex_number = 0;
+ int length;
+
+ crs += 2;
+ if (*crs == '{')
+ {
+ const char *end = strchr (crs, '}');
+ if (end == NULL)
+ {
+ error_with_progname = false;
+ error (error_level, 0, _("\
+%s:%d: missing right brace on \\x{HEXNUMBER}"), real_file_name, line_number);
+ error_with_progname = true;
+ ++crs;
+ continue;
+ }
+ else
+ {
+ ++crs;
+ (void) extract_hex (crs, end - crs, &hex_number);
+ crs = end + 1;
+ }
+ }
+ else
+ {
+ crs = extract_hex (crs, 2, &hex_number);
+ }
+
+ /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
+ true, the character should be converted to its uppercase
+ resp. lowercase equivalent. I don't know if the necessary
+ facilities are already included in gettext. For US-Ascii
+ the conversion can be already be done, however. */
+ if (uppercase && hex_number >= 'a' && hex_number <= 'z')
+ {
+ hex_number = hex_number - 'a' + 'A';
+ }
+ else if (lowercase && hex_number >= 'A' && hex_number <= 'Z')
+ {
+ hex_number = hex_number - 'A' + 'a';
+ }
+
+ length = u8_uctomb ((unsigned char *) (buffer + bufpos),
+ hex_number, 6);
+
+ if (length > 0)
+ bufpos += length;
+ }
+ continue;
+ case 'c':
+ /* Perl's notion of control characters. */
+ crs += 2;
+ if (*crs)
+ {
+ int the_char = (unsigned char) *crs;
+ if (the_char >= 'a' && the_char <= 'z')
+ the_char = the_char - 'a' + 'A';
+ buffer[bufpos++] = the_char ^ 0x40;
+ }
+ continue;
+ case 'N':
+ crs += 2;
+ if (*crs == '{')
+ {
+ const char *end = strchr (crs + 1, '}');
+ if (end != NULL)
+ {
+ char *name;
+ unsigned int unicode;
+
+ name = XNMALLOC (end - (crs + 1) + 1, char);
+ memcpy (name, crs + 1, end - (crs + 1));
+ name[end - (crs + 1)] = '\0';
+
+ unicode = unicode_name_character (name);
+ if (unicode != UNINAME_INVALID)
+ {
+ /* FIXME: Convert to upper/lowercase if the
+ corresponding flag is set to true. */
+ int length =
+ u8_uctomb ((unsigned char *) (buffer + bufpos),
+ unicode, 6);
+ if (length > 0)
+ bufpos += length;
+ }
+
+ free (name);
+
+ crs = end + 1;
+ }
+ }
+ continue;
+ }
+ }
+
+ /* No escape sequence, go on. */
+ if (*crs == '\\')
+ {
+ ++crs;
+ switch (*crs)
+ {
+ case 'E':
+ uppercase = false;
+ lowercase = false;
+ quotemeta = false;
+ ++crs;
+ continue;
+ case 'L':
+ uppercase = false;
+ lowercase = true;
+ ++crs;
+ continue;
+ case 'U':
+ uppercase = true;
+ lowercase = false;
+ ++crs;
+ continue;
+ case 'Q':
+ quotemeta = true;
+ ++crs;
+ continue;
+ case 'l':
+ ++crs;
+ if (*crs >= 'A' && *crs <= 'Z')
+ {
+ buffer[bufpos++] = *crs - 'A' + 'a';
+ }
+ else if ((unsigned char) *crs >= 0x80)
+ {
+ error_with_progname = false;
+ error (error_level, 0, _("\
+%s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""),
+ real_file_name, line_number, *crs);
+ error_with_progname = true;
+ }
+ else
+ {
+ buffer[bufpos++] = *crs;
+ }
+ ++crs;
+ continue;
+ case 'u':
+ ++crs;
+ if (*crs >= 'a' && *crs <= 'z')
+ {
+ buffer[bufpos++] = *crs - 'a' + 'A';
+ }
+ else if ((unsigned char) *crs >= 0x80)
+ {
+ error_with_progname = false;
+ error (error_level, 0, _("\
+%s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""),
+ real_file_name, line_number, *crs);
+ error_with_progname = true;
+ }
+ else
+ {
+ buffer[bufpos++] = *crs;
+ }
+ ++crs;
+ continue;
+ case '\\':
+ buffer[bufpos++] = *crs;
+ ++crs;
+ continue;
+ default:
+ backslashed = true;
+ break;
+ }
+ }
+ else
+ backslashed = false;
+
+ if (quotemeta
+ && !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z')
+ || (*crs >= '0' && *crs <= '9') || *crs == '_'))
+ {
+ buffer[bufpos++] = '\\';
+ backslashed = true;
+ }
+
+ if (!backslashed && !extract_all && (*crs == '$' || *crs == '@'))
+ {
+ error_with_progname = false;
+ error (error_level, 0, _("\
+%s:%d: invalid variable interpolation at \"%c\""),
+ real_file_name, line_number, *crs);
+ error_with_progname = true;
+ ++crs;
+ }
+ else if (lowercase)
+ {
+ if (*crs >= 'A' && *crs <= 'Z')
+ buffer[bufpos++] = *crs - 'A' + 'a';
+ else if ((unsigned char) *crs >= 0x80)
+ {
+ error_with_progname = false;
+ error (error_level, 0, _("\
+%s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""),
+ real_file_name, line_number, *crs);
+ error_with_progname = true;
+ buffer[bufpos++] = *crs;
+ }
+ else
+ buffer[bufpos++] = *crs;
+ ++crs;
+ }
+ else if (uppercase)
+ {
+ if (*crs >= 'a' && *crs <= 'z')
+ buffer[bufpos++] = *crs - 'a' + 'A';
+ else if ((unsigned char) *crs >= 0x80)
+ {
+ error_with_progname = false;
+ error (error_level, 0, _("\
+%s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""),
+ real_file_name, line_number, *crs);
+ error_with_progname = true;
+ buffer[bufpos++] = *crs;
+ }
+ else
+ buffer[bufpos++] = *crs;
+ ++crs;
+ }
+ else
+ {
+ buffer[bufpos++] = *crs++;
+ }
+ }
+
+ /* Ensure room for 1 more byte. */
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+
+ buffer[bufpos++] = '\0';
+
+#if DEBUG_PERL
+ fprintf (stderr, "---> %s\n", buffer);
+#endif
+
+ /* Replace tp->string. */
+ free (tp->string);
+ tp->string = xstrdup (buffer);
+}
+
+/* Parse a variable. This is done in several steps:
+ 1) Consume all leading occurencies of '$', '@', '%', and '*'.
+ 2) Determine the name of the variable from the following input.
+ 3) Parse possible following hash keys or array indexes.
+ */
+static void
+extract_variable (message_list_ty *mlp, token_ty *tp, int first)
+{
+ static char *buffer;
+ static int bufmax = 0;
+ int bufpos = 0;
+ int c = first;
+ size_t varbody_length = 0;
+ bool maybe_hash_deref = false;
+ bool maybe_hash_value = false;
+
+ tp->type = token_type_variable;
+
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: extracting variable type '%c'\n",
+ real_file_name, line_number, first);
+#endif
+
+ /*
+ * 1) Consume dollars and so on (not euros ...). Unconditionally
+ * accepting the hash sign (#) will maybe lead to inaccurate
+ * results. FIXME!
+ */
+ while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%')
+ {
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos++] = c;
+ c = phase1_getc ();
+ }
+
+ if (c == EOF)
+ {
+ tp->type = token_type_eof;
+ return;
+ }
+
+ /* Hash references are treated in a special way, when looking for
+ our keywords. */
+ if (buffer[0] == '$')
+ {
+ if (bufpos == 1)
+ maybe_hash_value = true;
+ else if (bufpos == 2 && buffer[1] == '$')
+ {
+ if (!(c == '{'
+ || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
+ || (c >= '0' && c <= '9')
+ || c == '_' || c == ':' || c == '\'' || c >= 0x80))
+ {
+ /* Special variable $$ for pid. */
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos++] = '\0';
+ tp->string = xstrdup (buffer);
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: is PID ($$)\n",
+ real_file_name, line_number);
+#endif
+
+ phase1_ungetc (c);
+ return;
+ }
+
+ maybe_hash_deref = true;
+ bufpos = 1;
+ }
+ }
+
+ /*
+ * 2) Get the name of the variable. The first character is practically
+ * arbitrary. Punctuation and numbers automagically put a variable
+ * in the global namespace but that subtle difference is not interesting
+ * for us.
+ */
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ if (c == '{')
+ {
+ /* Yuck, we cannot accept ${gettext} as a keyword... Except for
+ * debugging purposes it is also harmless, that we suppress the
+ * real name of the variable.
+ */
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: braced {variable_name}\n",
+ real_file_name, line_number);
+#endif
+
+ if (extract_balanced (mlp, token_type_rbrace, true, false,
+ null_context, null_context_list_iterator,
+ 1, arglist_parser_alloc (mlp, NULL)))
+ {
+ tp->type = token_type_eof;
+ return;
+ }
+ buffer[bufpos++] = c;
+ }
+ else
+ {
+ while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
+ || (c >= '0' && c <= '9')
+ || c == '_' || c == ':' || c == '\'' || c >= 0x80)
+ {
+ ++varbody_length;
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos++] = c;
+ c = phase1_getc ();
+ }
+ phase1_ungetc (c);
+ }
+
+ /* Probably some strange Perl variable like $`. */
+ if (varbody_length == 0)
+ {
+ c = phase1_getc ();
+ if (c == EOF || is_whitespace (c))
+ phase1_ungetc (c); /* Loser. */
+ else
+ {
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos++] = c;
+ }
+ }
+
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos++] = '\0';
+
+ tp->string = xstrdup (buffer);
+
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: complete variable name: %s\n",
+ real_file_name, line_number, tp->string);
+#endif
+
+ /*
+ * 3) If the following looks strange to you, this is valid Perl syntax:
+ *
+ * $var = $$hashref # We can place a
+ * # comment here and then ...
+ * {key_into_hashref};
+ *
+ * POD sections are not allowed but we leave complaints about
+ * that to the compiler/interpreter.
+ */
+ /* We only extract strings from the first hash key (if present). */
+
+ if (maybe_hash_deref || maybe_hash_value)
+ {
+ bool is_dereference = false;
+ int c;
+
+ do
+ c = phase2_getc ();
+ while (is_whitespace (c));
+
+ if (c == '-')
+ {
+ int c2 = phase1_getc ();
+
+ if (c2 == '>')
+ {
+ is_dereference = true;
+
+ do
+ c = phase2_getc ();
+ while (is_whitespace (c));
+ }
+ else if (c2 != '\n')
+ {
+ /* Discarding the newline is harmless here. The only
+ special character recognized after a minus is greater-than
+ for dereference. However, the sequence "-\n>" that we
+ treat incorrectly here, is a syntax error. */
+ phase1_ungetc (c2);
+ }
+ }
+
+ if (maybe_hash_value && is_dereference)
+ {
+ tp->type = token_type_object;
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n",
+ real_file_name, line_number);
+#endif
+ }
+ else if (maybe_hash_value)
+ {
+ /* Fake it into a hash. */
+ tp->string[0] = '%';
+ }
+
+ /* Do NOT change that into else if (see above). */
+ if ((maybe_hash_value || maybe_hash_deref) && c == '{')
+ {
+ void *keyword_value;
+
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: first keys preceded by '{'\n",
+ real_file_name, line_number);
+#endif
+
+ if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
+ &keyword_value) == 0)
+ {
+ /* TODO: Shouldn't we use the shapes of the keyword, instead
+ of hardwiring argnum1 = 1 ?
+ const struct callshapes *shapes =
+ (const struct callshapes *) keyword_value;
+ */
+ struct callshapes shapes;
+ shapes.keyword = tp->string; /* XXX storage duration? */
+ shapes.keyword_len = strlen (tp->string);
+ shapes.nshapes = 1;
+ shapes.shapes[0].argnum1 = 1;
+ shapes.shapes[0].argnum2 = 0;
+ shapes.shapes[0].argnumc = 0;
+ shapes.shapes[0].argnum1_glib_context = false;
+ shapes.shapes[0].argnum2_glib_context = false;
+ shapes.shapes[0].argtotal = 0;
+ string_list_init (&shapes.shapes[0].xcomments);
+
+ {
+ /* Extract a possible string from the key. Before proceeding
+ we check whether the open curly is followed by a symbol and
+ then by a right curly. */
+ flag_context_list_iterator_ty context_iter =
+ flag_context_list_iterator (
+ flag_context_list_table_lookup (
+ flag_context_list_table,
+ tp->string, strlen (tp->string)));
+ token_ty *t1 = x_perl_lex (mlp);
+
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: extracting string key\n",
+ real_file_name, line_number);
+#endif
+
+ if (t1->type == token_type_symbol
+ || t1->type == token_type_named_op)
+ {
+ token_ty *t2 = x_perl_lex (mlp);
+ if (t2->type == token_type_rbrace)
+ {
+ flag_context_ty context;
+ lex_pos_ty pos;
+
+ context =
+ inherited_context (null_context,
+ flag_context_list_iterator_advance (
+ &context_iter));
+
+ pos.line_number = line_number;
+ pos.file_name = logical_file_name;
+
+ xgettext_current_source_encoding = po_charset_utf8;
+ remember_a_message (mlp, NULL, xstrdup (t1->string),
+ context, &pos, NULL, savable_comment);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ free_token (t2);
+ free_token (t1);
+ }
+ else
+ {
+ x_perl_unlex (t2);
+ }
+ }
+ else
+ {
+ x_perl_unlex (t1);
+ if (extract_balanced (mlp, token_type_rbrace, true, false,
+ null_context, context_iter,
+ 1, arglist_parser_alloc (mlp, &shapes)))
+ return;
+ }
+ }
+ }
+ else
+ {
+ phase2_ungetc (c);
+ }
+ }
+ else
+ {
+ phase2_ungetc (c);
+ }
+ }
+
+ /* Now consume "->", "[...]", and "{...}". */
+ for (;;)
+ {
+ int c = phase2_getc ();
+ int c2;
+
+ switch (c)
+ {
+ case '{':
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n",
+ real_file_name, line_number);
+#endif
+ extract_balanced (mlp, token_type_rbrace, true, false,
+ null_context, null_context_list_iterator,
+ 1, arglist_parser_alloc (mlp, NULL));
+ break;
+
+ case '[':
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n",
+ real_file_name, line_number);
+#endif
+ extract_balanced (mlp, token_type_rbracket, true, false,
+ null_context, null_context_list_iterator,
+ 1, arglist_parser_alloc (mlp, NULL));
+ break;
+
+ case '-':
+ c2 = phase1_getc ();
+ if (c2 == '>')
+ {
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: another \"->\" after varname\n",
+ real_file_name, line_number);
+#endif
+ break;
+ }
+ else if (c2 != '\n')
+ {
+ /* Discarding the newline is harmless here. The only
+ special character recognized after a minus is greater-than
+ for dereference. However, the sequence "-\n>" that we
+ treat incorrectly here, is a syntax error. */
+ phase1_ungetc (c2);
+ }
+ /* FALLTHROUGH */
+
+ default:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: variable finished\n",
+ real_file_name, line_number);
+#endif
+ phase2_ungetc (c);
+ return;
+ }
+ }
+}
+
+/* Actually a simplified version of extract_variable(). It searches for
+ variables inside a double-quoted string that may interpolate to
+ some keyword hash (reference). The string is UTF-8 encoded. */
+static void
+interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
+{
+ static char *buffer;
+ static int bufmax = 0;
+ int bufpos = 0;
+ flag_context_ty context;
+ int c;
+ bool maybe_hash_deref = false;
+ enum parser_state
+ {
+ initial,
+ one_dollar,
+ two_dollars,
+ identifier,
+ minus,
+ wait_lbrace,
+ wait_quote,
+ dquote,
+ squote,
+ barekey,
+ wait_rbrace
+ } state;
+ token_ty token;
+
+ lex_pos_ty pos;
+
+ /* States are:
+ *
+ * initial: initial
+ * one_dollar: dollar sign seen in state INITIAL
+ * two_dollars: another dollar-sign has been seen in state ONE_DOLLAR
+ * identifier: a valid identifier character has been seen in state
+ * ONE_DOLLAR or TWO_DOLLARS
+ * minus: a minus-sign has been seen in state IDENTIFIER
+ * wait_lbrace: a greater-than has been seen in state MINUS
+ * wait_quote: a left brace has been seen in state IDENTIFIER or in
+ * state WAIT_LBRACE
+ * dquote: a double-quote has been seen in state WAIT_QUOTE
+ * squote: a single-quote has been seen in state WAIT_QUOTE
+ * barekey: an bareword character has been seen in state WAIT_QUOTE
+ * wait_rbrace: closing quote has been seen in state DQUOTE or SQUOTE
+ *
+ * In the states initial...identifier the context is null_context; in the
+ * states minus...wait_rbrace the context is the one suitable for the first
+ * argument of the last seen identifier.
+ */
+ state = initial;
+ context = null_context;
+
+ token.type = token_type_string;
+ token.sub_type = string_type_qq;
+ token.line_number = line_number;
+ /* No need for token.comment = add_reference (savable_comment); here.
+ We can let token.comment uninitialized here, and use savable_comment
+ directly, because this function only parses the given string and does
+ not call phase2_getc. */
+ pos.file_name = logical_file_name;
+ pos.line_number = lineno;
+
+ while ((c = (unsigned char) *string++) != '\0')
+ {
+ void *keyword_value;
+
+ if (state == initial)
+ bufpos = 0;
+
+ if (c == '\n')
+ lineno++;
+
+ if (bufpos + 1 >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+
+ switch (state)
+ {
+ case initial:
+ switch (c)
+ {
+ case '\\':
+ c = (unsigned char) *string++;
+ if (c == '\0')
+ return;
+ break;
+ case '$':
+ buffer[bufpos++] = '$';
+ maybe_hash_deref = false;
+ state = one_dollar;
+ break;
+ default:
+ break;
+ }
+ break;
+ case one_dollar:
+ switch (c)
+ {
+ case '$':
+ /*
+ * This is enough to make us believe later that we dereference
+ * a hash reference.
+ */
+ maybe_hash_deref = true;
+ state = two_dollars;
+ break;
+ default:
+ if (c == '_' || c == ':' || c == '\'' || c >= 0x80
+ || (c >= 'A' && c <= 'Z')
+ || (c >= 'a' && c <= 'z')
+ || (c >= '0' && c <= '9'))
+ {
+ buffer[bufpos++] = c;
+ state = identifier;
+ }
+ else
+ state = initial;
+ break;
+ }
+ break;
+ case two_dollars:
+ if (c == '_' || c == ':' || c == '\'' || c >= 0x80
+ || (c >= 'A' && c <= 'Z')
+ || (c >= 'a' && c <= 'z')
+ || (c >= '0' && c <= '9'))
+ {
+ buffer[bufpos++] = c;
+ state = identifier;
+ }
+ else
+ state = initial;
+ break;
+ case identifier:
+ switch (c)
+ {
+ case '-':
+ if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
+ == 0)
+ {
+ flag_context_list_iterator_ty context_iter =
+ flag_context_list_iterator (
+ flag_context_list_table_lookup (
+ flag_context_list_table,
+ buffer, bufpos));
+ context =
+ inherited_context (null_context,
+ flag_context_list_iterator_advance (
+ &context_iter));
+ state = minus;
+ }
+ else
+ state = initial;
+ break;
+ case '{':
+ if (!maybe_hash_deref)
+ buffer[0] = '%';
+ if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
+ == 0)
+ {
+ flag_context_list_iterator_ty context_iter =
+ flag_context_list_iterator (
+ flag_context_list_table_lookup (
+ flag_context_list_table,
+ buffer, bufpos));
+ context =
+ inherited_context (null_context,
+ flag_context_list_iterator_advance (
+ &context_iter));
+ state = wait_quote;
+ }
+ else
+ state = initial;
+ break;
+ default:
+ if (c == '_' || c == ':' || c == '\'' || c >= 0x80
+ || (c >= 'A' && c <= 'Z')
+ || (c >= 'a' && c <= 'z')
+ || (c >= '0' && c <= '9'))
+ {
+ buffer[bufpos++] = c;
+ }
+ else
+ state = initial;
+ break;
+ }
+ break;
+ case minus:
+ switch (c)
+ {
+ case '>':
+ state = wait_lbrace;
+ break;
+ default:
+ context = null_context;
+ state = initial;
+ break;
+ }
+ break;
+ case wait_lbrace:
+ switch (c)
+ {
+ case '{':
+ state = wait_quote;
+ break;
+ default:
+ context = null_context;
+ state = initial;
+ break;
+ }
+ break;
+ case wait_quote:
+ switch (c)
+ {
+ case_whitespace:
+ break;
+ case '\'':
+ pos.line_number = lineno;
+ bufpos = 0;
+ state = squote;
+ break;
+ case '"':
+ pos.line_number = lineno;
+ bufpos = 0;
+ state = dquote;
+ break;
+ default:
+ if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
+ || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
+ {
+ pos.line_number = lineno;
+ bufpos = 0;
+ buffer[bufpos++] = c;
+ state = barekey;
+ }
+ else
+ {
+ context = null_context;
+ state = initial;
+ }
+ break;
+ }
+ break;
+ case dquote:
+ switch (c)
+ {
+ case '"':
+ /* The resulting string has to be interpolated twice. */
+ buffer[bufpos] = '\0';
+ token.string = xstrdup (buffer);
+ extract_quotelike_pass3 (&token, EXIT_FAILURE);
+ /* The string can only shrink with interpolation (because
+ we ignore \Q). */
+ if (!(strlen (token.string) <= bufpos))
+ abort ();
+ strcpy (buffer, token.string);
+ free (token.string);
+ state = wait_rbrace;
+ break;
+ case '\\':
+ if (string[0] == '\"')
+ {
+ buffer[bufpos++] = string++[0];
+ }
+ else if (string[0])
+ {
+ buffer[bufpos++] = '\\';
+ buffer[bufpos++] = string++[0];
+ }
+ else
+ {
+ context = null_context;
+ state = initial;
+ }
+ break;
+ default:
+ buffer[bufpos++] = c;
+ break;
+ }
+ break;
+ case squote:
+ switch (c)
+ {
+ case '\'':
+ state = wait_rbrace;
+ break;
+ case '\\':
+ if (string[0] == '\'')
+ {
+ buffer[bufpos++] = string++[0];
+ }
+ else if (string[0])
+ {
+ buffer[bufpos++] = '\\';
+ buffer[bufpos++] = string++[0];
+ }
+ else
+ {
+ context = null_context;
+ state = initial;
+ }
+ break;
+ default:
+ buffer[bufpos++] = c;
+ break;
+ }
+ break;
+ case barekey:
+ if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
+ || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
+ {
+ buffer[bufpos++] = c;
+ break;
+ }
+ else if (is_whitespace (c))
+ {
+ state = wait_rbrace;
+ break;
+ }
+ else if (c != '}')
+ {
+ context = null_context;
+ state = initial;
+ break;
+ }
+ /* Must be right brace. */
+ /* FALLTHROUGH */
+ case wait_rbrace:
+ switch (c)
+ {
+ case_whitespace:
+ break;
+ case '}':
+ buffer[bufpos] = '\0';
+ token.string = xstrdup (buffer);
+ extract_quotelike_pass3 (&token, EXIT_FAILURE);
+ xgettext_current_source_encoding = po_charset_utf8;
+ remember_a_message (mlp, NULL, token.string, context, &pos,
+ NULL, savable_comment);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ /* FALLTHROUGH */
+ default:
+ context = null_context;
+ state = initial;
+ break;
+ }
+ break;
+ }
+ }
+}
+
+/* There is an ambiguity about '/' and '?': They can start an operator
+ (division operator '/' or '/=' or the conditional operator '?'), or they can
+ start a regular expression. The distinction is important because inside
+ regular expressions, '#' loses its special meaning. This function helps
+ making the decision (a heuristic). See the documentation for details. */
+static bool
+prefer_regexp_over_division (token_type_ty type)
+{
+ bool retval = true;
+
+ switch (type)
+ {
+ case token_type_eof:
+ retval = true;
+ break;
+ case token_type_lparen:
+ retval = true;
+ break;
+ case token_type_rparen:
+ retval = false;
+ break;
+ case token_type_comma:
+ retval = true;
+ break;
+ case token_type_fat_comma:
+ retval = true;
+ break;
+ case token_type_dereference:
+ retval = true;
+ break;
+ case token_type_semicolon:
+ retval = true;
+ break;
+ case token_type_lbrace:
+ retval = true;
+ break;
+ case token_type_rbrace:
+ retval = false;
+ break;
+ case token_type_lbracket:
+ retval = true;
+ break;
+ case token_type_rbracket:
+ retval = false;
+ break;
+ case token_type_string:
+ retval = false;
+ break;
+ case token_type_number:
+ retval = false;
+ break;
+ case token_type_named_op:
+ retval = true;
+ break;
+ case token_type_variable:
+ retval = false;
+ break;
+ case token_type_object:
+ retval = false;
+ break;
+ case token_type_symbol:
+ case token_type_keyword_symbol:
+ retval = true;
+ break;
+ case token_type_regex_op:
+ retval = false;
+ break;
+ case token_type_dot:
+ retval = true;
+ break;
+ case token_type_other:
+ retval = true;
+ break;
+ }
+
+#if DEBUG_PERL
+ token_ty ty;
+ ty.type = type;
+ fprintf (stderr, "Prefer regexp over division after %s: %s\n",
+ token2string (&ty), retval ? "true" : "false");
+#endif
+
+ return retval;
+}
+
+/* Last token type seen in the stream. Important for the interpretation
+ of slash and question mark. */
+static token_type_ty last_token_type;
+
+/* Combine characters into tokens. Discard whitespace. */
+
+static void
+x_perl_prelex (message_list_ty *mlp, token_ty *tp)
+{
+ static char *buffer;
+ static int bufmax;
+ int bufpos;
+ int c;
+
+ for (;;)
+ {
+ c = phase2_getc ();
+ tp->line_number = line_number;
+ tp->last_type = last_token_type;
+
+ switch (c)
+ {
+ case EOF:
+ tp->type = token_type_eof;
+ return;
+
+ case '\n':
+ if (last_non_comment_line > last_comment_line)
+ savable_comment_reset ();
+ /* FALLTHROUGH */
+ case '\t':
+ case ' ':
+ /* Ignore whitespace. */
+ continue;
+
+ case '%':
+ case '@':
+ case '*':
+ case '$':
+ if (!extract_all)
+ {
+ extract_variable (mlp, tp, c);
+ return;
+ }
+ break;
+ }
+
+ last_non_comment_line = tp->line_number;
+
+ switch (c)
+ {
+ case '.':
+ {
+ int c2 = phase1_getc ();
+ phase1_ungetc (c2);
+ if (c2 == '.')
+ {
+ tp->type = token_type_other;
+ return;
+ }
+ else if (!(c2 >= '0' && c2 <= '9'))
+ {
+ tp->type = token_type_dot;
+ return;
+ }
+ }
+ /* FALLTHROUGH */
+ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+ case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
+ case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+ case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+ case 'Y': case 'Z':
+ case '_':
+ case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+ case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+ case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+ case 's': case 't': case 'u': case 'v': case 'w': case 'x':
+ case 'y': case 'z':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ /* Symbol, or part of a number. */
+ bufpos = 0;
+ for (;;)
+ {
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos++] = c;
+ c = phase1_getc ();
+ switch (c)
+ {
+ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+ case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
+ case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+ case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+ case 'Y': case 'Z':
+ case '_':
+ case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+ case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
+ case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
+ case 's': case 't': case 'u': case 'v': case 'w': case 'x':
+ case 'y': case 'z':
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ continue;
+
+ default:
+ phase1_ungetc (c);
+ break;
+ }
+ break;
+ }
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos] = '\0';
+
+ if (strcmp (buffer, "__END__") == 0
+ || strcmp (buffer, "__DATA__") == 0)
+ {
+ end_of_file = true;
+ tp->type = token_type_eof;
+ return;
+ }
+ else if (strcmp (buffer, "and") == 0
+ || strcmp (buffer, "cmp") == 0
+ || strcmp (buffer, "eq") == 0
+ || strcmp (buffer, "if") == 0
+ || strcmp (buffer, "ge") == 0
+ || strcmp (buffer, "gt") == 0
+ || strcmp (buffer, "le") == 0
+ || strcmp (buffer, "lt") == 0
+ || strcmp (buffer, "ne") == 0
+ || strcmp (buffer, "not") == 0
+ || strcmp (buffer, "or") == 0
+ || strcmp (buffer, "unless") == 0
+ || strcmp (buffer, "while") == 0
+ || strcmp (buffer, "xor") == 0)
+ {
+ tp->type = token_type_named_op;
+ tp->string = xstrdup (buffer);
+ return;
+ }
+ else if (strcmp (buffer, "s") == 0
+ || strcmp (buffer, "y") == 0
+ || strcmp (buffer, "tr") == 0)
+ {
+ int delim = phase1_getc ();
+
+ while (is_whitespace (delim))
+ delim = phase2_getc ();
+
+ if (delim == EOF)
+ {
+ tp->type = token_type_eof;
+ return;
+ }
+ if ((delim >= '0' && delim <= '9')
+ || (delim >= 'A' && delim <= 'Z')
+ || (delim >= 'a' && delim <= 'z'))
+ {
+ /* False positive. */
+ phase2_ungetc (delim);
+ tp->type = token_type_symbol;
+ tp->sub_type = symbol_type_none;
+ tp->string = xstrdup (buffer);
+ return;
+ }
+ extract_triple_quotelike (mlp, tp, delim,
+ buffer[0] == 's' && delim != '\'');
+
+ /* Eat the following modifiers. */
+ do
+ c = phase1_getc ();
+ while (c >= 'a' && c <= 'z');
+ phase1_ungetc (c);
+ return;
+ }
+ else if (strcmp (buffer, "m") == 0)
+ {
+ int delim = phase1_getc ();
+
+ while (is_whitespace (delim))
+ delim = phase2_getc ();
+
+ if (delim == EOF)
+ {
+ tp->type = token_type_eof;
+ return;
+ }
+ if ((delim >= '0' && delim <= '9')
+ || (delim >= 'A' && delim <= 'Z')
+ || (delim >= 'a' && delim <= 'z'))
+ {
+ /* False positive. */
+ phase2_ungetc (delim);
+ tp->type = token_type_symbol;
+ tp->sub_type = symbol_type_none;
+ tp->string = xstrdup (buffer);
+ return;
+ }
+ extract_quotelike (tp, delim);
+ if (delim != '\'')
+ interpolate_keywords (mlp, tp->string, line_number);
+ free (tp->string);
+ drop_reference (tp->comment);
+ tp->type = token_type_regex_op;
+
+ /* Eat the following modifiers. */
+ do
+ c = phase1_getc ();
+ while (c >= 'a' && c <= 'z');
+ phase1_ungetc (c);
+ return;
+ }
+ else if (strcmp (buffer, "qq") == 0
+ || strcmp (buffer, "q") == 0
+ || strcmp (buffer, "qx") == 0
+ || strcmp (buffer, "qw") == 0
+ || strcmp (buffer, "qr") == 0)
+ {
+ /* The qw (...) construct is not really a string but we
+ can treat in the same manner and then pretend it is
+ a symbol. Rationale: Saying "qw (foo bar)" is the
+ same as "my @list = ('foo', 'bar'); @list;". */
+
+ int delim = phase1_getc ();
+
+ while (is_whitespace (delim))
+ delim = phase2_getc ();
+
+ if (delim == EOF)
+ {
+ tp->type = token_type_eof;
+ return;
+ }
+
+ if ((delim >= '0' && delim <= '9')
+ || (delim >= 'A' && delim <= 'Z')
+ || (delim >= 'a' && delim <= 'z'))
+ {
+ /* False positive. */
+ phase2_ungetc (delim);
+ tp->type = token_type_symbol;
+ tp->sub_type = symbol_type_none;
+ tp->string = xstrdup (buffer);
+ return;
+ }
+
+ extract_quotelike (tp, delim);
+
+ switch (buffer[1])
+ {
+ case 'q':
+ case 'x':
+ tp->type = token_type_string;
+ tp->sub_type = string_type_qq;
+ interpolate_keywords (mlp, tp->string, line_number);
+ break;
+ case 'r':
+ drop_reference (tp->comment);
+ tp->type = token_type_regex_op;
+ break;
+ case 'w':
+ drop_reference (tp->comment);
+ tp->type = token_type_symbol;
+ tp->sub_type = symbol_type_none;
+ break;
+ case '\0':
+ tp->type = token_type_string;
+ tp->sub_type = string_type_q;
+ break;
+ default:
+ abort ();
+ }
+ return;
+ }
+ else if ((buffer[0] >= '0' && buffer[0] <= '9') || buffer[0] == '.')
+ {
+ tp->type = token_type_number;
+ return;
+ }
+ tp->type = token_type_symbol;
+ tp->sub_type = (strcmp (buffer, "sub") == 0
+ ? symbol_type_sub
+ : symbol_type_none);
+ tp->string = xstrdup (buffer);
+ return;
+
+ case '"':
+ extract_quotelike (tp, c);
+ tp->sub_type = string_type_qq;
+ interpolate_keywords (mlp, tp->string, line_number);
+ return;
+
+ case '`':
+ extract_quotelike (tp, c);
+ tp->sub_type = string_type_qq;
+ interpolate_keywords (mlp, tp->string, line_number);
+ return;
+
+ case '\'':
+ extract_quotelike (tp, c);
+ tp->sub_type = string_type_q;
+ return;
+
+ case '(':
+ c = phase2_getc ();
+ if (c == ')')
+ /* Ignore empty list. */
+ continue;
+ else
+ phase2_ungetc (c);
+ tp->type = token_type_lparen;
+ return;
+
+ case ')':
+ tp->type = token_type_rparen;
+ return;
+
+ case '{':
+ tp->type = token_type_lbrace;
+ return;
+
+ case '}':
+ tp->type = token_type_rbrace;
+ return;
+
+ case '[':
+ tp->type = token_type_lbracket;
+ return;
+
+ case ']':
+ tp->type = token_type_rbracket;
+ return;
+
+ case ';':
+ tp->type = token_type_semicolon;
+ return;
+
+ case ',':
+ tp->type = token_type_comma;
+ return;
+
+ case '=':
+ /* Check for fat comma. */
+ c = phase1_getc ();
+ if (c == '>')
+ {
+ tp->type = token_type_fat_comma;
+ return;
+ }
+ else if (linepos == 2
+ && (last_token_type == token_type_semicolon
+ || last_token_type == token_type_rbrace)
+ && ((c >= 'A' && c <='Z')
+ || (c >= 'a' && c <= 'z')))
+ {
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: start pod section\n",
+ real_file_name, line_number);
+#endif
+ skip_pod ();
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: end pod section\n",
+ real_file_name, line_number);
+#endif
+ continue;
+ }
+ phase1_ungetc (c);
+ tp->type = token_type_other;
+ return;
+
+ case '<':
+ /* Check for <<EOF and friends. */
+ c = phase1_getc ();
+ if (c == '<')
+ {
+ c = phase1_getc ();
+ if (c == '\'')
+ {
+ char *string;
+ extract_quotelike (tp, c);
+ string = get_here_document (tp->string);
+ free (tp->string);
+ tp->string = string;
+ tp->type = token_type_string;
+ tp->sub_type = string_type_verbatim;
+ tp->line_number = line_number + 1;
+ return;
+ }
+ else if (c == '"')
+ {
+ char *string;
+ extract_quotelike (tp, c);
+ string = get_here_document (tp->string);
+ free (tp->string);
+ tp->string = string;
+ tp->type = token_type_string;
+ tp->sub_type = string_type_qq;
+ tp->line_number = line_number + 1;
+ interpolate_keywords (mlp, tp->string, tp->line_number);
+ return;
+ }
+ else if ((c >= 'A' && c <= 'Z')
+ || (c >= 'a' && c <= 'z')
+ || c == '_')
+ {
+ bufpos = 0;
+ while ((c >= 'A' && c <= 'Z')
+ || (c >= 'a' && c <= 'z')
+ || (c >= '0' && c <= '9')
+ || c == '_' || c >= 0x80)
+ {
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos++] = c;
+ c = phase1_getc ();
+ }
+ if (c == EOF)
+ {
+ tp->type = token_type_eof;
+ return;
+ }
+ else
+ {
+ char *string;
+ phase1_ungetc (c);
+ if (bufpos >= bufmax)
+ {
+ bufmax = 2 * bufmax + 10;
+ buffer = xrealloc (buffer, bufmax);
+ }
+ buffer[bufpos++] = '\0';
+ string = get_here_document (buffer);
+ tp->string = string;
+ tp->type = token_type_string;
+ tp->sub_type = string_type_qq;
+ tp->comment = add_reference (savable_comment);
+ tp->line_number = line_number + 1;
+ interpolate_keywords (mlp, tp->string, tp->line_number);
+ return;
+ }
+ }
+ else
+ {
+ tp->type = token_type_other;
+ return;
+ }
+ }
+ else
+ {
+ phase1_ungetc (c);
+ tp->type = token_type_other;
+ }
+ return; /* End of case '>'. */
+
+ case '-':
+ /* Check for dereferencing operator. */
+ c = phase1_getc ();
+ if (c == '>')
+ {
+ tp->type = token_type_dereference;
+ return;
+ }
+ else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
+ {
+ /* One of the -X (filetest) functions. We play safe
+ and accept all alphabetical characters here. */
+ tp->type = token_type_other;
+ return;
+ }
+ phase1_ungetc (c);
+ tp->type = token_type_other;
+ return;
+
+ case '/':
+ case '?':
+ if (prefer_regexp_over_division (tp->last_type))
+ {
+ extract_quotelike (tp, c);
+ interpolate_keywords (mlp, tp->string, line_number);
+ free (tp->string);
+ drop_reference (tp->comment);
+ tp->type = token_type_regex_op;
+ /* Eat the following modifiers. */
+ do
+ c = phase1_getc ();
+ while (c >= 'a' && c <= 'z');
+ phase1_ungetc (c);
+ return;
+ }
+ /* Recognize operator '//'. */
+ if (c == '/')
+ {
+ c = phase1_getc ();
+ if (c != '/')
+ phase1_ungetc (c);
+ }
+ /* FALLTHROUGH */
+
+ default:
+ /* We could carefully recognize each of the 2 and 3 character
+ operators, but it is not necessary, except for the '//' operator,
+ as we only need to recognize gettext invocations. Don't
+ bother. */
+ tp->type = token_type_other;
+ return;
+ }
+ }
+}
+
+
+/* A token stack used as a lookahead buffer. */
+
+typedef struct token_stack_ty token_stack_ty;
+struct token_stack_ty
+{
+ token_ty **items;
+ size_t nitems;
+ size_t nitems_max;
+};
+
+static struct token_stack_ty token_stack;
+
+#if DEBUG_PERL
+/* Dumps all resources allocated by stack STACK. */
+static int
+token_stack_dump (token_stack_ty *stack)
+{
+ size_t i;
+
+ fprintf (stderr, "BEGIN STACK DUMP\n");
+ for (i = 0; i < stack->nitems; i++)
+ {
+ token_ty *token = stack->items[i];
+ fprintf (stderr, " [%s]\n", token2string (token));
+ switch (token->type)
+ {
+ case token_type_named_op:
+ case token_type_string:
+ case token_type_symbol:
+ case token_type_variable:
+ fprintf (stderr, " string: %s\n", token->string);
+ break;
+ case token_type_object:
+ fprintf (stderr, " string: %s->\n", token->string);
+ default:
+ break;
+ }
+ }
+ fprintf (stderr, "END STACK DUMP\n");
+ return 0;
+}
+#endif
+
+/* Pushes the token TOKEN onto the stack STACK. */
+static inline void
+token_stack_push (token_stack_ty *stack, token_ty *token)
+{
+ if (stack->nitems >= stack->nitems_max)
+ {
+ size_t nbytes;
+
+ stack->nitems_max = 2 * stack->nitems_max + 4;
+ nbytes = stack->nitems_max * sizeof (token_ty *);
+ stack->items = xrealloc (stack->items, nbytes);
+ }
+ stack->items[stack->nitems++] = token;
+}
+
+/* Pops the most recently pushed token from the stack STACK and returns it.
+ Returns NULL if the stack is empty. */
+static inline token_ty *
+token_stack_pop (token_stack_ty *stack)
+{
+ if (stack->nitems > 0)
+ return stack->items[--(stack->nitems)];
+ else
+ return NULL;
+}
+
+/* Return the top of the stack without removing it from the stack, or
+ NULL if the stack is empty. */
+static inline token_ty *
+token_stack_peek (const token_stack_ty *stack)
+{
+ if (stack->nitems > 0)
+ return stack->items[stack->nitems - 1];
+ else
+ return NULL;
+}
+
+/* Frees all resources allocated by stack STACK. */
+static inline void
+token_stack_free (token_stack_ty *stack)
+{
+ size_t i;
+
+ for (i = 0; i < stack->nitems; i++)
+ free_token (stack->items[i]);
+ free (stack->items);
+}
+
+
+static token_ty *
+x_perl_lex (message_list_ty *mlp)
+{
+#if DEBUG_PERL
+ int dummy = token_stack_dump (&token_stack);
+#endif
+ token_ty *tp = token_stack_pop (&token_stack);
+
+ if (!tp)
+ {
+ tp = XMALLOC (token_ty);
+ x_perl_prelex (mlp, tp);
+ tp->last_type = last_token_type;
+ last_token_type = tp->type;
+
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n",
+ real_file_name, line_number, token2string (tp));
+#endif
+
+ /* The interpretation of a slash or question mark after a function call
+ depends on the prototype of that function. If the function expects
+ at least one argument, a regular expression is preferred, otherwise
+ an operator. With our limited means, we can only guess here. If
+ the function is a builtin that takes no arguments, we prefer an
+ operator by silently turning the last symbol into a variable instead
+ of a symbol.
+
+ Method calls without parentheses are not ambiguous. After them, an
+ operator must follow. Due to some ideosyncrasies in this parser
+ they are treated in two different manners. If the call is
+ chained ($foo->bar->baz) the token left of the symbol is a
+ dereference operator. If it is not chained ($foo->bar) the
+ dereference operator is consumed with the extracted variable. The
+ latter case is handled below. */
+ if (tp->type == token_type_symbol)
+ {
+ if (tp->last_type == token_type_dereference)
+ {
+ /* Class method call or chained method call (with at least
+ two arrow operators). */
+ last_token_type = token_type_variable;
+ }
+ else if (tp->last_type == token_type_object)
+ {
+ /* Instance method, not chained. */
+ last_token_type = token_type_variable;
+ }
+ else if (strcmp (tp->string, "wantarray") == 0
+ || strcmp (tp->string, "fork") == 0
+ || strcmp (tp->string, "getlogin") == 0
+ || strcmp (tp->string, "getppid") == 0
+ || strcmp (tp->string, "getpwent") == 0
+ || strcmp (tp->string, "getgrent") == 0
+ || strcmp (tp->string, "gethostent") == 0
+ || strcmp (tp->string, "getnetent") == 0
+ || strcmp (tp->string, "getprotoent") == 0
+ || strcmp (tp->string, "getservent") == 0
+ || strcmp (tp->string, "setpwent") == 0
+ || strcmp (tp->string, "setgrent") == 0
+ || strcmp (tp->string, "endpwent") == 0
+ || strcmp (tp->string, "endgrent") == 0
+ || strcmp (tp->string, "endhostent") == 0
+ || strcmp (tp->string, "endnetent") == 0
+ || strcmp (tp->string, "endprotoent") == 0
+ || strcmp (tp->string, "endservent") == 0
+ || strcmp (tp->string, "time") == 0
+ || strcmp (tp->string, "times") == 0
+ || strcmp (tp->string, "wait") == 0
+ || strcmp (tp->string, "wantarray") == 0)
+ {
+ /* A Perl built-in function that does not accept arguments. */
+ last_token_type = token_type_variable;
+ }
+ }
+ }
+#if DEBUG_PERL
+ else
+ {
+ fprintf (stderr, "%s:%d: %s recycled from stack\n",
+ real_file_name, line_number, token2string (tp));
+ }
+#endif
+
+ /* A symbol followed by a fat comma is really a single-quoted string.
+ Function definitions or forward declarations also need a special
+ handling because the dollars and at signs inside the parentheses
+ must not be interpreted as the beginning of a variable ')'. */
+ if (tp->type == token_type_symbol || tp->type == token_type_named_op)
+ {
+ token_ty *next = token_stack_peek (&token_stack);
+
+ if (!next)
+ {
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: pre-fetching next token\n",
+ real_file_name, line_number);
+#endif
+ next = x_perl_lex (mlp);
+ x_perl_unlex (next);
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: unshifted next token\n",
+ real_file_name, line_number);
+#endif
+ }
+
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: next token is %s\n",
+ real_file_name, line_number, token2string (next));
+#endif
+
+ if (next->type == token_type_fat_comma)
+ {
+ tp->type = token_type_string;
+ tp->sub_type = string_type_q;
+ tp->comment = add_reference (savable_comment);
+#if DEBUG_PERL
+ fprintf (stderr,
+ "%s:%d: token %s mutated to token_type_string\n",
+ real_file_name, line_number, token2string (tp));
+#endif
+ }
+ else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub
+ && next->type == token_type_symbol)
+ {
+ /* Start of a function declaration or definition. Mark this
+ symbol as a function name, so that we can later eat up
+ possible prototype information. */
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n",
+ real_file_name, line_number, next->string);
+#endif
+ next->sub_type = symbol_type_function;
+ }
+ else if (tp->type == token_type_symbol
+ && (tp->sub_type == symbol_type_sub
+ || tp->sub_type == symbol_type_function)
+ && next->type == token_type_lparen)
+ {
+ /* For simplicity we simply consume everything up to the
+ closing parenthesis. Actually only a limited set of
+ characters is allowed inside parentheses but we leave
+ complaints to the interpreter and are prepared for
+ future extensions to the Perl syntax. */
+ int c;
+
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: consuming prototype information\n",
+ real_file_name, line_number);
+#endif
+
+ do
+ {
+ c = phase1_getc ();
+#if DEBUG_PERL
+ fprintf (stderr, " consuming character '%c'\n", c);
+#endif
+ }
+ while (c != EOF && c != ')');
+ phase1_ungetc (c);
+ }
+ }
+
+ return tp;
+}
+
+static void
+x_perl_unlex (token_ty *tp)
+{
+ token_stack_push (&token_stack, tp);
+}
+
+
+/* ========================= Extracting strings. ========================== */
+
+/* Assuming TP is a string token, this function accumulates all subsequent
+ . string2 . string3 ... to the string. (String concatenation.) */
+
+static char *
+collect_message (message_list_ty *mlp, token_ty *tp, int error_level)
+{
+ char *string;
+ size_t len;
+
+ extract_quotelike_pass3 (tp, error_level);
+ string = xstrdup (tp->string);
+ len = strlen (tp->string) + 1;
+
+ for (;;)
+ {
+ int c;
+
+ do
+ c = phase2_getc ();
+ while (is_whitespace (c));
+
+ if (c != '.')
+ {
+ phase2_ungetc (c);
+ return string;
+ }
+
+ do
+ c = phase2_getc ();
+ while (is_whitespace (c));
+
+ phase2_ungetc (c);
+
+ if (c == '"' || c == '\'' || c == '`'
+ || ((c == '/' || c == '?')
+ && prefer_regexp_over_division (tp->last_type))
+ || c == 'q')
+ {
+ token_ty *qstring = x_perl_lex (mlp);
+ if (qstring->type != token_type_string)
+ {
+ /* assert (qstring->type == token_type_symbol) */
+ x_perl_unlex (qstring);
+ return string;
+ }
+
+ extract_quotelike_pass3 (qstring, error_level);
+ len += strlen (qstring->string);
+ string = xrealloc (string, len);
+ strcat (string, qstring->string);
+ free_token (qstring);
+ }
+ }
+}
+
+/* The file is broken into tokens. Scan the token stream, looking for
+ a keyword, followed by a left paren, followed by a string. When we
+ see this sequence, we have something to remember. We assume we are
+ looking at a valid Perl program, and leave the complaints about
+ the grammar to the compiler.
+
+ Normal handling: Look for
+ keyword ( ... msgid ... )
+ Plural handling: Look for
+ keyword ( ... msgid ... msgid_plural ... )
+
+ We use recursion because the arguments before msgid or between msgid
+ and msgid_plural can contain subexpressions of the same form.
+
+ In Perl, parentheses around function arguments can be omitted.
+
+ The general rules are:
+ 1) Functions declared with a prototype take exactly the specified number
+ of arguments.
+ sub one_arg ($) { ... }
+ sub two_args ($$) { ... }
+ 2) When a function name is immediately followed by an opening parenthesis,
+ the argument list ends at the corresponding closing parenthesis.
+
+ If rule 1 and rule 2 are contradictory, i.e. when the program calls a
+ function with an explicit argument list and the wrong number of arguments,
+ the program is invalid:
+ sub two_args ($$) { ... }
+ foo two_args (x), y - invalid due to rules 1 and 2
+
+ Ambiguities are resolved as follows:
+ 3) Some built-ins, such as 'abs', 'sqrt', 'sin', 'cos', ..., and functions
+ declared with a prototype of exactly one argument take exactly one
+ argument:
+ foo sin x, y ==> foo (sin (x), y)
+ sub one_arg ($) { ... }
+ foo one_arg x, y, z ==> foo (one_arg (x), y, z)
+ 4) Other identifiers, if not immediately followed by an opening
+ parenthesis, consume the entire remaining argument list:
+ foo bar x, y ==> foo (bar (x, y))
+ sub two_args ($$) { ... }
+ foo two_args x, y ==> foo (two_args (x, y))
+
+ Other series of comma separated expressions without a function name at
+ the beginning are comma expressions:
+ sub two_args ($$) { ... }
+ foo two_args x, (y, z) ==> foo (two_args (x, (y, z)))
+ Note that the evaluation of comma expressions returns a list of values
+ when in list context (e.g. inside the argument list of a function without
+ prototype) but only one value when inside the argument list of a function
+ with a prototype:
+ sub print3 ($$$) { print @_ }
+ print3 5, (6, 7), 8 ==> 578
+ print 5, (6, 7), 8 ==> 5678
+
+ Where rule 3 or 4 contradict rule 1 or 2, the program is invalid:
+ sin (x, y) - invalid due to rules 2 and 3
+ sub one_arg ($) { ... }
+ one_arg (x, y) - invalid due to rules 2 and 3
+ sub two_args ($$) { ... }
+ foo two_args x, y, z - invalid due to rules 1 and 4
+ */
+
+/* Extract messages until the next balanced closing parenthesis.
+ Extracted messages are added to MLP.
+
+ DELIM can be either token_type_rbrace, token_type_rbracket,
+ token_type_rparen. Additionally, if COMMA_DELIM is true, parsing
+ stops at the next comma outside parentheses.
+
+ ARG is the current argument list position, starts with 1.
+ ARGPARSER is the corresponding argument list parser.
+
+ Returns true for EOF, false otherwise. */
+
+static bool
+extract_balanced (message_list_ty *mlp,
+ token_type_ty delim, bool eat_delim, bool comma_delim,
+ flag_context_ty outer_context,
+ flag_context_list_iterator_ty context_iter,
+ int arg, struct arglist_parser *argparser)
+{
+ /* Whether to implicitly assume the next tokens are arguments even without
+ a '('. */
+ bool next_is_argument = false;
+ /* Parameters of the keyword just seen. Defined only when next_is_argument
+ is true. */
+ const struct callshapes *next_shapes = NULL;
+ struct arglist_parser *next_argparser = NULL;
+
+ /* Whether to not consider strings until the next comma. */
+ bool skip_until_comma = false;
+
+ /* Context iterator that will be used if the next token is a '('. */
+ flag_context_list_iterator_ty next_context_iter =
+ passthrough_context_list_iterator;
+ /* Current context. */
+ flag_context_ty inner_context =
+ inherited_context (outer_context,
+ flag_context_list_iterator_advance (&context_iter));
+
+#if DEBUG_PERL
+ static int nesting_level = 0;
+
+ ++nesting_level;
+#endif
+
+ for (;;)
+ {
+ /* The current token. */
+ token_ty *tp;
+
+ tp = x_perl_lex (mlp);
+
+ if (delim == tp->type)
+ {
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ if (next_argparser != NULL)
+ free (next_argparser);
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n",
+ logical_file_name, tp->line_number, --nesting_level);
+#endif
+ if (eat_delim)
+ free_token (tp);
+ else
+ /* Preserve the delimiter for the caller. */
+ x_perl_unlex (tp);
+ return false;
+ }
+
+ if (comma_delim && tp->type == token_type_comma)
+ {
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ if (next_argparser != NULL)
+ free (next_argparser);
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: extract_balanced finished at comma (%d)\n",
+ logical_file_name, tp->line_number, --nesting_level);
+#endif
+ x_perl_unlex (tp);
+ return false;
+ }
+
+ if (next_is_argument && tp->type != token_type_lparen)
+ {
+ /* An argument list starts, even though there is no '('. */
+ bool next_comma_delim;
+
+ x_perl_unlex (tp);
+
+ if (next_shapes != NULL)
+ /* We know something about the function being called. Assume
+ that it consumes only one argument if no argument number or
+ total > 1 is specified. */
+ {
+ size_t i;
+
+ next_comma_delim = true;
+ for (i = 0; i < next_shapes->nshapes; i++)
+ {
+ const struct callshape *shape = &next_shapes->shapes[i];
+
+ if (shape->argnum1 > 1
+ || shape->argnum2 > 1
+ || shape->argnumc > 1
+ || shape->argtotal > 1)
+ next_comma_delim = false;
+ }
+ }
+ else
+ /* We know nothing about the function being called. It could be
+ a function prototyped to take only one argument, or on the other
+ hand it could be prototyped to take more than one argument or an
+ arbitrary argument list or it could be unprototyped. Due to
+ the way the parser works, assuming the first case gives the
+ best results. */
+ next_comma_delim = true;
+
+ if (extract_balanced (mlp, delim, false, next_comma_delim,
+ inner_context, next_context_iter,
+ 1, next_argparser))
+ {
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ return true;
+ }
+
+ next_is_argument = false;
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ continue;
+ }
+
+ switch (tp->type)
+ {
+ case token_type_symbol:
+ case token_type_keyword_symbol:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n",
+ logical_file_name, tp->line_number, nesting_level,
+ tp->string);
+#endif
+
+ {
+ void *keyword_value;
+
+ if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
+ &keyword_value) == 0)
+ {
+ const struct callshapes *shapes =
+ (const struct callshapes *) keyword_value;
+
+ next_shapes = shapes;
+ next_argparser = arglist_parser_alloc (mlp, shapes);
+ }
+ else
+ {
+ next_shapes = NULL;
+ next_argparser = arglist_parser_alloc (mlp, NULL);
+ }
+ }
+ next_is_argument = true;
+ next_context_iter =
+ flag_context_list_iterator (
+ flag_context_list_table_lookup (
+ flag_context_list_table,
+ tp->string, strlen (tp->string)));
+ break;
+
+ case token_type_variable:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n",
+ logical_file_name, tp->line_number, nesting_level,
+ tp->string);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_object:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type object (%d) \"%s->\"\n",
+ logical_file_name, tp->line_number, nesting_level,
+ tp->string);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_lparen:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type left parenthesis (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ if (next_is_argument)
+ {
+ /* Parse the argument list of a function call. */
+ if (extract_balanced (mlp, token_type_rparen, true, false,
+ inner_context, next_context_iter,
+ 1, next_argparser))
+ {
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ return true;
+ }
+ next_is_argument = false;
+ next_argparser = NULL;
+ }
+ else
+ {
+ /* Parse a parenthesized expression or comma expression. */
+ if (extract_balanced (mlp, token_type_rparen, true, false,
+ inner_context, next_context_iter,
+ arg, arglist_parser_clone (argparser)))
+ {
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ free_token (tp);
+ return true;
+ }
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ }
+ skip_until_comma = true;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_rparen:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type right parenthesis (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ skip_until_comma = true;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_comma:
+ case token_type_fat_comma:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type comma (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ if (arglist_parser_decidedp (argparser, arg))
+ {
+ /* We have missed the argument. */
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ argparser = arglist_parser_alloc (mlp, NULL);
+ arg = 0;
+ }
+ arg++;
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: arg: %d\n",
+ real_file_name, tp->line_number, arg);
+#endif
+ inner_context =
+ inherited_context (outer_context,
+ flag_context_list_iterator_advance (
+ &context_iter));
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ skip_until_comma = false;
+ next_context_iter = passthrough_context_list_iterator;
+ break;
+
+ case token_type_string:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n",
+ logical_file_name, tp->line_number, nesting_level,
+ tp->string);
+#endif
+
+ if (extract_all)
+ {
+ char *string = collect_message (mlp, tp, EXIT_SUCCESS);
+ lex_pos_ty pos;
+
+ pos.file_name = logical_file_name;
+ pos.line_number = tp->line_number;
+ xgettext_current_source_encoding = po_charset_utf8;
+ remember_a_message (mlp, NULL, string, inner_context, &pos,
+ NULL, tp->comment);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ }
+ else if (!skip_until_comma)
+ {
+ /* Need to collect the complete string, with error checking,
+ only if the argument ARG is used in ARGPARSER. */
+ bool must_collect = false;
+ {
+ size_t nalternatives = argparser->nalternatives;
+ size_t i;
+
+ for (i = 0; i < nalternatives; i++)
+ {
+ struct partial_call *cp = &argparser->alternative[i];
+
+ if (arg == cp->argnumc
+ || arg == cp->argnum1 || arg == cp->argnum2)
+ must_collect = true;
+ }
+ }
+
+ if (must_collect)
+ {
+ char *string = collect_message (mlp, tp, EXIT_FAILURE);
+
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_remember (argparser, arg,
+ string, inner_context,
+ logical_file_name, tp->line_number,
+ tp->comment);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ }
+ }
+
+ if (arglist_parser_decidedp (argparser, arg))
+ {
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ argparser = arglist_parser_alloc (mlp, NULL);
+ }
+
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_number:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type number (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_eof:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type EOF (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ free_token (tp);
+ return true;
+
+ case token_type_lbrace:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type lbrace (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ if (extract_balanced (mlp, token_type_rbrace, true, false,
+ null_context, null_context_list_iterator,
+ 1, arglist_parser_alloc (mlp, NULL)))
+ {
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ free_token (tp);
+ return true;
+ }
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_rbrace:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type rbrace (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_lbracket:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type lbracket (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ if (extract_balanced (mlp, token_type_rbracket, true, false,
+ null_context, null_context_list_iterator,
+ 1, arglist_parser_alloc (mlp, NULL)))
+ {
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ free_token (tp);
+ return true;
+ }
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_rbracket:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type rbracket (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_semicolon:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type semicolon (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+
+ /* The ultimate sign. */
+ xgettext_current_source_encoding = po_charset_utf8;
+ arglist_parser_done (argparser, arg);
+ xgettext_current_source_encoding = xgettext_global_source_encoding;
+ argparser = arglist_parser_alloc (mlp, NULL);
+
+ /* FIXME: Instead of resetting outer_context here, it may be better
+ to recurse in the next_is_argument handling above, waiting for
+ the next semicolon or other statement terminator. */
+ outer_context = null_context;
+ context_iter = null_context_list_iterator;
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = passthrough_context_list_iterator;
+ inner_context =
+ inherited_context (outer_context,
+ flag_context_list_iterator_advance (
+ &context_iter));
+ break;
+
+ case token_type_dereference:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type dereference (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_dot:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type dot (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_named_op:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type named operator (%d): %s\n",
+ logical_file_name, tp->line_number, nesting_level,
+ tp->string);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_regex_op:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type regex operator (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ case token_type_other:
+#if DEBUG_PERL
+ fprintf (stderr, "%s:%d: type other (%d)\n",
+ logical_file_name, tp->line_number, nesting_level);
+#endif
+ next_is_argument = false;
+ if (next_argparser != NULL)
+ free (next_argparser);
+ next_argparser = NULL;
+ next_context_iter = null_context_list_iterator;
+ break;
+
+ default:
+ fprintf (stderr, "%s:%d: unknown token type %d\n",
+ real_file_name, tp->line_number, tp->type);
+ abort ();
+ }
+
+ free_token (tp);
+ }
+}
+
+void
+extract_perl (FILE *f, const char *real_filename, const char *logical_filename,
+ flag_context_list_table_ty *flag_table,
+ msgdomain_list_ty *mdlp)
+{
+ message_list_ty *mlp = mdlp->item[0]->messages;
+
+ fp = f;
+ real_file_name = real_filename;
+ logical_file_name = xstrdup (logical_filename);
+ line_number = 0;
+
+ last_comment_line = -1;
+ last_non_comment_line = -1;
+
+ flag_context_list_table = flag_table;
+
+ init_keywords ();
+
+ token_stack.items = NULL;
+ token_stack.nitems = 0;
+ token_stack.nitems_max = 0;
+ linesize = 0;
+ linepos = 0;
+ eaten_here = 0;
+ end_of_file = false;
+
+ /* Safe assumption. */
+ last_token_type = token_type_semicolon;
+
+ /* Eat tokens until eof is seen. When extract_balanced returns
+ due to an unbalanced closing brace, just restart it. */
+ while (!extract_balanced (mlp, token_type_rbrace, true, false,
+ null_context, null_context_list_iterator,
+ 1, arglist_parser_alloc (mlp, NULL)))
+ ;
+
+ fp = NULL;
+ real_file_name = NULL;
+ free (logical_file_name);
+ logical_file_name = NULL;
+ line_number = 0;
+ last_token_type = token_type_semicolon;
+ token_stack_free (&token_stack);
+ eaten_here = 0;
+ end_of_file = true;
+}