diff options
author | Lorry Tar Creator <lorry-tar-importer@baserock.org> | 2014-12-24 07:38:37 +0000 |
---|---|---|
committer | <> | 2015-02-02 12:02:29 +0000 |
commit | 482840e61f86ca321838a91e902c41d40c098bbb (patch) | |
tree | 01ea2e242fd2792d19fe192476601587901db794 /gettext-tools/src/x-python.c | |
download | gettext-tarball-482840e61f86ca321838a91e902c41d40c098bbb.tar.gz |
Imported from /home/lorry/working-area/delta_gettext-tarball/gettext-0.19.4.tar.xz.gettext-0.19.4
Diffstat (limited to 'gettext-tools/src/x-python.c')
-rw-r--r-- | gettext-tools/src/x-python.c | 1779 |
1 files changed, 1779 insertions, 0 deletions
diff --git a/gettext-tools/src/x-python.c b/gettext-tools/src/x-python.c new file mode 100644 index 0000000..d781ef2 --- /dev/null +++ b/gettext-tools/src/x-python.c @@ -0,0 +1,1779 @@ +/* xgettext Python backend. + Copyright (C) 2002-2003, 2005-2013 Free Software Foundation, Inc. + + This file was written by Bruno Haible <haible@clisp.cons.org>, 2002. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +/* Specification. */ +#include "x-python.h" + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "message.h" +#include "xgettext.h" +#include "error.h" +#include "error-progname.h" +#include "progname.h" +#include "basename.h" +#include "xerror.h" +#include "xvasprintf.h" +#include "xalloc.h" +#include "c-strstr.h" +#include "c-ctype.h" +#include "po-charset.h" +#include "uniname.h" +#include "unistr.h" +#include "gettext.h" + +#define _(s) gettext(s) + +#define max(a,b) ((a) > (b) ? (a) : (b)) + +#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) + + +/* The Python syntax is defined in the Python Reference Manual + /usr/share/doc/packages/python/html/ref/index.html. + See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c, + Python-2.0/Objects/unicodeobject.c. */ + + +/* ====================== Keyword set customization. 
====================== */ + +/* If true extract all strings. */ +static bool extract_all = false; + +static hash_table keywords; +static bool default_keywords = true; + + +void +x_python_extract_all () +{ + extract_all = true; +} + + +void +x_python_keyword (const char *name) +{ + if (name == NULL) + default_keywords = false; + else + { + const char *end; + struct callshape shape; + const char *colon; + + if (keywords.table == NULL) + hash_init (&keywords, 100); + + split_keywordspec (name, &end, &shape); + + /* The characters between name and end should form a valid C identifier. + A colon means an invalid parse in split_keywordspec(). */ + colon = strchr (name, ':'); + if (colon == NULL || colon >= end) + insert_keyword_callshape (&keywords, name, end - name, &shape); + } +} + +/* Finish initializing the keywords hash table. + Called after argument processing, before each file is processed. */ +static void +init_keywords () +{ + if (default_keywords) + { + /* When adding new keywords here, also update the documentation in + xgettext.texi! 
*/ + x_python_keyword ("gettext"); + x_python_keyword ("ugettext"); + x_python_keyword ("dgettext:2"); + x_python_keyword ("ngettext:1,2"); + x_python_keyword ("ungettext:1,2"); + x_python_keyword ("dngettext:2,3"); + x_python_keyword ("_"); + default_keywords = false; + } +} + +void +init_flag_table_python () +{ + xgettext_record_flag ("gettext:1:pass-python-format"); + xgettext_record_flag ("ugettext:1:pass-python-format"); + xgettext_record_flag ("dgettext:2:pass-python-format"); + xgettext_record_flag ("ngettext:1:pass-python-format"); + xgettext_record_flag ("ngettext:2:pass-python-format"); + xgettext_record_flag ("ungettext:1:pass-python-format"); + xgettext_record_flag ("ungettext:2:pass-python-format"); + xgettext_record_flag ("dngettext:2:pass-python-format"); + xgettext_record_flag ("dngettext:3:pass-python-format"); + xgettext_record_flag ("_:1:pass-python-format"); + /* xgettext_record_flag ("%:1:python-format"); // % is an infix operator! */ + + xgettext_record_flag ("gettext:1:pass-python-brace-format"); + xgettext_record_flag ("ugettext:1:pass-python-brace-format"); + xgettext_record_flag ("dgettext:2:pass-python-brace-format"); + xgettext_record_flag ("ngettext:1:pass-python-brace-format"); + xgettext_record_flag ("ngettext:2:pass-python-brace-format"); + xgettext_record_flag ("ungettext:1:pass-python-brace-format"); + xgettext_record_flag ("ungettext:2:pass-python-brace-format"); + xgettext_record_flag ("dngettext:2:pass-python-brace-format"); + xgettext_record_flag ("dngettext:3:pass-python-brace-format"); + xgettext_record_flag ("_:1:pass-python-brace-format"); + /* xgettext_record_flag ("format:1:python-brace-format"); */ +} + + +/* ======================== Reading of characters. ======================== */ + +/* Real filename, used in error messages about the input file. */ +static const char *real_file_name; + +/* Logical filename and line number, used to label the extracted messages. 
*/ +static char *logical_file_name; +static int line_number; + +/* The input file stream. */ +static FILE *fp; + + +/* 0. Terminate line by \n, regardless whether the external + representation of a line terminator is CR (Mac), and CR/LF + (DOS/Windows), as Python treats them equally. */ +static int +phase0_getc () +{ + int c; + + c = getc (fp); + if (c == EOF) + { + if (ferror (fp)) + error (EXIT_FAILURE, errno, _("error while reading \"%s\""), + real_file_name); + return EOF; + } + + if (c == '\r') + { + int c1 = getc (fp); + + if (c1 != EOF && c1 != '\n') + ungetc (c1, fp); + + /* Seen line terminator CR or CR/LF. */ + return '\n'; + } + + return c; +} + +/* Supports only one pushback character, and not '\n'. */ +static inline void +phase0_ungetc (int c) +{ + if (c != EOF) + ungetc (c, fp); +} + + +/* 1. line_number handling. */ + +/* Maximum used, roughly a safer MB_LEN_MAX. */ +#define MAX_PHASE1_PUSHBACK 16 +static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK]; +static int phase1_pushback_length; + +/* Read the next single byte from the input file. */ +static int +phase1_getc () +{ + int c; + + if (phase1_pushback_length) + c = phase1_pushback[--phase1_pushback_length]; + else + c = phase0_getc (); + + if (c == '\n') + ++line_number; + + return c; +} + +/* Supports MAX_PHASE1_PUSHBACK characters of pushback. */ +static void +phase1_ungetc (int c) +{ + if (c != EOF) + { + if (c == '\n') + --line_number; + + if (phase1_pushback_length == SIZEOF (phase1_pushback)) + abort (); + phase1_pushback[phase1_pushback_length++] = c; + } +} + + +/* Phase 2: Conversion to Unicode. + This is done early because PEP 0263 specifies that conversion to Unicode + conceptually occurs before tokenization. A test case where it matters + is with encodings like BIG5: when a double-byte character ending in 0x5C + is followed by '\' or 'u0021', the tokenizer must not treat the second + half of the double-byte character as a backslash. 
*/ + +/* End-of-file indicator for functions returning an UCS-4 character. */ +#define UEOF -1 + +static lexical_context_ty lexical_context; + +static int phase2_pushback[max (9, UNINAME_MAX + 3)]; +static int phase2_pushback_length; + +/* Read the next Unicode UCS-4 character from the input file. */ +static int +phase2_getc () +{ + if (phase2_pushback_length) + return phase2_pushback[--phase2_pushback_length]; + + if (xgettext_current_source_encoding == po_charset_ascii) + { + int c = phase1_getc (); + if (c == EOF) + return UEOF; + if (!c_isascii (c)) + { + multiline_error (xstrdup (""), + xasprintf ("%s\n%s\n", + non_ascii_error_message (lexical_context, + real_file_name, + line_number), + _("\ +Please specify the source encoding through --from-code or through a comment\n\ +as specified in http://www.python.org/peps/pep-0263.html.\n"))); + exit (EXIT_FAILURE); + } + return c; + } + else if (xgettext_current_source_encoding != po_charset_utf8) + { +#if HAVE_ICONV + /* Use iconv on an increasing number of bytes. Read only as many bytes + through phase1_getc as needed. This is needed to give reasonable + interactive behaviour when fp is connected to an interactive tty. */ + unsigned char buf[MAX_PHASE1_PUSHBACK]; + size_t bufcount; + int c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[0] = (unsigned char) c; + bufcount = 1; + + for (;;) + { + unsigned char scratchbuf[6]; + const char *inptr = (const char *) &buf[0]; + size_t insize = bufcount; + char *outptr = (char *) &scratchbuf[0]; + size_t outsize = sizeof (scratchbuf); + + size_t res = iconv (xgettext_current_source_iconv, + (ICONV_CONST char **) &inptr, &insize, + &outptr, &outsize); + /* We expect that a character has been produced if and only if + some input bytes have been consumed. */ + if ((insize < bufcount) != (outsize < sizeof (scratchbuf))) + abort (); + if (outsize == sizeof (scratchbuf)) + { + /* No character has been produced. Must be an error. 
*/ + if (res != (size_t)(-1)) + abort (); + + if (errno == EILSEQ) + { + /* An invalid multibyte sequence was encountered. */ + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Invalid multibyte sequence.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number)); + exit (EXIT_FAILURE); + } + else if (errno == EINVAL) + { + /* An incomplete multibyte character. */ + int c; + + if (bufcount == MAX_PHASE1_PUSHBACK) + { + /* An overlong incomplete multibyte sequence was + encountered. */ + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Long incomplete multibyte sequence.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number)); + exit (EXIT_FAILURE); + } + + /* Read one more byte and retry iconv. */ + c = phase1_getc (); + if (c == EOF) + { + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Incomplete multibyte sequence at end of file.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number)); + exit (EXIT_FAILURE); + } + if (c == '\n') + { + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Incomplete multibyte sequence at end of line.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number - 1)); + exit (EXIT_FAILURE); + } + buf[bufcount++] = (unsigned char) c; + } + else + error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"), + real_file_name, line_number); + } + else + { + size_t outbytes = sizeof (scratchbuf) - outsize; + size_t bytes = bufcount - insize; + ucs4_t uc; + + /* We expect that one character has been produced. 
*/ + if (bytes == 0) + abort (); + if (outbytes == 0) + abort (); + /* Push back the unused bytes. */ + while (insize > 0) + phase1_ungetc (buf[--insize]); + /* Convert the character from UTF-8 to UCS-4. */ + if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes) + { + /* scratchbuf contains an out-of-range Unicode character + (> 0x10ffff). */ + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Invalid multibyte sequence.\n\ +Please specify the source encoding through --from-code or through a comment\n\ +as specified in http://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number)); + exit (EXIT_FAILURE); + } + return uc; + } + } +#else + /* If we don't have iconv(), the only supported values for + xgettext_global_source_encoding and thus also for + xgettext_current_source_encoding are ASCII and UTF-8. */ + abort (); +#endif + } + else + { + /* Read an UTF-8 encoded character. */ + unsigned char buf[6]; + unsigned int count; + int c; + ucs4_t uc; + + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[0] = c; + count = 1; + + if (buf[0] >= 0xc0) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[1] = c; + count = 2; + } + + if (buf[0] >= 0xe0 + && ((buf[1] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[2] = c; + count = 3; + } + + if (buf[0] >= 0xf0 + && ((buf[1] ^ 0x80) < 0x40) + && ((buf[2] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[3] = c; + count = 4; + } + + if (buf[0] >= 0xf8 + && ((buf[1] ^ 0x80) < 0x40) + && ((buf[2] ^ 0x80) < 0x40) + && ((buf[3] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[4] = c; + count = 5; + } + + if (buf[0] >= 0xfc + && ((buf[1] ^ 0x80) < 0x40) + && ((buf[2] ^ 0x80) < 0x40) + && ((buf[3] ^ 0x80) < 0x40) + && ((buf[4] ^ 0x80) < 0x40)) + { + c = phase1_getc (); + if (c == EOF) + return UEOF; + buf[5] = c; + count = 6; + } + + u8_mbtouc (&uc, buf, count); + return uc; + } +} + +/* Supports 
max (9, UNINAME_MAX + 3) pushback characters. */ +static void +phase2_ungetc (int c) +{ + if (c != UEOF) + { + if (phase2_pushback_length == SIZEOF (phase2_pushback)) + abort (); + phase2_pushback[phase2_pushback_length++] = c; + } +} + + +/* ========================= Accumulating strings. ======================== */ + +/* A string buffer type that allows appending Unicode characters. + Returns the entire string in UTF-8 encoding. */ + +struct unicode_string_buffer +{ + /* The part of the string that has already been converted to UTF-8. */ + char *utf8_buffer; + size_t utf8_buflen; + size_t utf8_allocated; +}; + +/* Initialize a 'struct unicode_string_buffer' to empty. */ +static inline void +init_unicode_string_buffer (struct unicode_string_buffer *bp) +{ + bp->utf8_buffer = NULL; + bp->utf8_buflen = 0; + bp->utf8_allocated = 0; +} + +/* Auxiliary function: Ensure count more bytes are available in bp->utf8. */ +static inline void +unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp, + size_t count) +{ + if (bp->utf8_buflen + count > bp->utf8_allocated) + { + size_t new_allocated = 2 * bp->utf8_allocated + 10; + if (new_allocated < bp->utf8_buflen + count) + new_allocated = bp->utf8_buflen + count; + bp->utf8_allocated = new_allocated; + bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated); + } +} + +/* Auxiliary function: Append a Unicode character to bp->utf8. + uc must be < 0x110000. */ +static inline void +unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp, + unsigned int uc) +{ + unsigned char utf8buf[6]; + int count = u8_uctomb (utf8buf, uc, 6); + + if (count < 0) + /* The caller should have ensured that uc is not out-of-range. */ + abort (); + + unicode_string_buffer_append_unicode_grow (bp, count); + memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count); + bp->utf8_buflen += count; +} + +/* Return the string buffer's contents. 
*/ +static char * +unicode_string_buffer_result (struct unicode_string_buffer *bp) +{ + /* NUL-terminate it. */ + unicode_string_buffer_append_unicode_grow (bp, 1); + bp->utf8_buffer[bp->utf8_buflen] = '\0'; + /* Return it. */ + return bp->utf8_buffer; +} + +/* Free the memory pointed to by a 'struct unicode_string_buffer'. */ +static inline void +free_unicode_string_buffer (struct unicode_string_buffer *bp) +{ + free (bp->utf8_buffer); +} + + +/* ======================== Accumulating comments. ======================== */ + + +/* Accumulating a single comment line. */ + +static struct unicode_string_buffer comment_buffer; + +static inline void +comment_start () +{ + lexical_context = lc_comment; + comment_buffer.utf8_buflen = 0; +} + +static inline bool +comment_at_start () +{ + return (comment_buffer.utf8_buflen == 0); +} + +static inline void +comment_add (int c) +{ + unicode_string_buffer_append_unicode (&comment_buffer, c); +} + +static inline const char * +comment_line_end () +{ + char *buffer = unicode_string_buffer_result (&comment_buffer); + size_t buflen = strlen (buffer); + + while (buflen >= 1 + && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) + --buflen; + buffer[buflen] = '\0'; + savable_comment_add (buffer); + lexical_context = lc_outside; + return buffer; +} + + +/* These are for tracking whether comments count as immediately before + keyword. */ +static int last_comment_line; +static int last_non_comment_line; + + +/* ======================== Recognizing comments. ======================== */ + + +/* Recognizing the "coding" comment. + As specified in PEP 0263, it takes the form + "coding" [":"|"="] {alphanumeric or "-" or "_" or "*"}* + or + "set" "fileencoding" "=" {alphanumeric or "-" or "_" or "*"}* + and is located in a comment in a line that + - is either the first or second line, + - is not a continuation line, + - in the first form, contains no other tokens except this comment. 
*/ + +/* Canonicalized encoding name for the current input file. */ +static const char *xgettext_current_file_source_encoding; + +#if HAVE_ICONV +/* Converter from xgettext_current_file_source_encoding to UTF-8 (except from + ASCII or UTF-8, when this conversion is a no-op). */ +static iconv_t xgettext_current_file_source_iconv; +#endif + +static inline void +set_current_file_source_encoding (const char *canon_encoding) +{ + xgettext_current_file_source_encoding = canon_encoding; + + if (xgettext_current_file_source_encoding != po_charset_ascii + && xgettext_current_file_source_encoding != po_charset_utf8) + { +#if HAVE_ICONV + iconv_t cd; + + /* Avoid glibc-2.1 bug with EUC-KR. */ +# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \ + && !defined _LIBICONV_VERSION + if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0) + cd = (iconv_t)(-1); + else +# endif + cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding); + if (cd == (iconv_t)(-1)) + error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\ +Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \ +and iconv() does not support this conversion."), + xgettext_current_file_source_encoding, po_charset_utf8, + basename (program_name)); + xgettext_current_file_source_iconv = cd; +#else + error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\ +Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). 
\ +This version was built without iconv()."), + xgettext_global_source_encoding, po_charset_utf8, + basename (program_name)); +#endif + } + + xgettext_current_source_encoding = xgettext_current_file_source_encoding; +#if HAVE_ICONV + xgettext_current_source_iconv = xgettext_current_file_source_iconv; +#endif +} + +static inline void +try_to_extract_coding (const char *comment) +{ + const char *p = c_strstr (comment, "coding"); + + if (p != NULL) + { + p += 6; + if (*p == ':' || *p == '=') + { + p++; + while (*p == ' ' || *p == '\t') + p++; + { + const char *encoding_start = p; + + while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.') + p++; + { + const char *encoding_end = p; + + if (encoding_end > encoding_start) + { + /* Extract the encoding string. */ + size_t encoding_len = encoding_end - encoding_start; + char *encoding = XNMALLOC (encoding_len + 1, char); + + memcpy (encoding, encoding_start, encoding_len); + encoding[encoding_len] = '\0'; + + { + /* Canonicalize it. */ + const char *canon_encoding = po_charset_canonicalize (encoding); + if (canon_encoding == NULL) + { + error_at_line (0, 0, + logical_file_name, line_number - 1, _("\ +Unknown encoding \"%s\". Proceeding with ASCII instead."), + encoding); + canon_encoding = po_charset_ascii; + } + + /* Activate it. */ + set_current_file_source_encoding (canon_encoding); + } + + free (encoding); + } + } + } + } + } +} + +/* Tracking whether the current line is a continuation line or contains a + non-blank character. */ +static bool continuation_or_nonblank_line = false; + + +/* Phase 3: Outside strings, replace backslash-newline with nothing and a + comment with nothing. */ + +static int +phase3_getc () +{ + int c; + + for (;;) + { + c = phase2_getc (); + if (c == '\\') + { + c = phase2_getc (); + if (c != '\n') + { + phase2_ungetc (c); + /* This shouldn't happen usually, because "A backslash is + illegal elsewhere on a line outside a string literal." */ + return '\\'; + } + /* Eat backslash-newline. 
*/ + continuation_or_nonblank_line = true; + } + else if (c == '#') + { + /* Eat a comment. */ + const char *comment; + + last_comment_line = line_number; + comment_start (); + for (;;) + { + c = phase2_getc (); + if (c == UEOF || c == '\n') + break; + /* We skip all leading white space, but not EOLs. */ + if (!(comment_at_start () && (c == ' ' || c == '\t'))) + comment_add (c); + } + comment = comment_line_end (); + if (line_number - 1 <= 2 && !continuation_or_nonblank_line) + try_to_extract_coding (comment); + continuation_or_nonblank_line = false; + return c; + } + else + { + if (c == '\n') + continuation_or_nonblank_line = false; + else if (!(c == ' ' || c == '\t' || c == '\f')) + continuation_or_nonblank_line = true; + return c; + } + } +} + +/* Supports only one pushback character. */ +static void +phase3_ungetc (int c) +{ + phase2_ungetc (c); +} + + +/* ========================= Accumulating strings. ======================== */ + +/* Return value of phase7_getuc when EOF is reached. */ +#define P7_EOF (-1) +#define P7_STRING_END (-2) + +/* Convert an UTF-16 or UTF-32 code point to a return value that can be + distinguished from a single-byte return value. */ +#define UNICODE(code) (0x100 + (code)) + +/* Test a return value of phase7_getuc whether it designates an UTF-16 or + UTF-32 code point. */ +#define IS_UNICODE(p7_result) ((p7_result) >= 0x100) + +/* Extract the UTF-16 or UTF-32 code of a return value that satisfies + IS_UNICODE. */ +#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100) + + +/* ========================== Reading of tokens. ========================== */ + + +enum token_type_ty +{ + token_type_eof, + token_type_lparen, /* ( */ + token_type_rparen, /* ) */ + token_type_comma, /* , */ + token_type_lbracket, /* [ */ + token_type_rbracket, /* ] */ + token_type_string, /* "abc", 'abc', """abc""", '''abc''' */ + token_type_symbol, /* symbol, number */ + token_type_plus, /* + */ + token_type_other /* misc. 
operator */ +}; +typedef enum token_type_ty token_type_ty; + +typedef struct token_ty token_ty; +struct token_ty +{ + token_type_ty type; + char *string; /* for token_type_string, token_type_symbol */ + refcounted_string_list_ty *comment; /* for token_type_string */ + int line_number; +}; + +/* Free the memory pointed to by a 'struct token_ty'. */ +static inline void +free_token (token_ty *tp) +{ + if (tp->type == token_type_string || tp->type == token_type_symbol) + free (tp->string); + if (tp->type == token_type_string) + drop_reference (tp->comment); +} + + +/* There are two different input syntaxes for strings, "abc" and r"abc", + and two different input syntaxes for Unicode strings, u"abc" and ur"abc". + Which escape sequences are understood, i.e. what is interpreted specially + after backslash? + "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn + r"abc" + u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...} + ur"abc" \unnnn + The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two + \unnnn items. The \ooo and \xnn values are in the current source encoding + for byte strings, and Unicode code points for Unicode strings. + */ + +static int +phase7_getuc (int quote_char, + bool triple, bool interpret_ansic, bool interpret_unicode, + unsigned int *backslash_counter) +{ + int c; + + for (;;) + { + /* Use phase 2, because phase 3 elides comments. */ + c = phase2_getc (); + + if (c == UEOF) + return P7_EOF; + + if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0)) + { + if (triple) + { + int c1 = phase2_getc (); + if (c1 == quote_char) + { + int c2 = phase2_getc (); + if (c2 == quote_char) + return P7_STRING_END; + phase2_ungetc (c2); + } + phase2_ungetc (c1); + return UNICODE (c); + } + else + return P7_STRING_END; + } + + if (c == '\n') + { + if (triple) + { + *backslash_counter = 0; + return UNICODE ('\n'); + } + /* In r"..." and ur"..." 
strings, newline is only allowed + immediately after an odd number of backslashes (although the + backslashes are not interpreted!). */ + if (!(interpret_ansic || (*backslash_counter & 1) == 0)) + { + *backslash_counter = 0; + return UNICODE ('\n'); + } + phase2_ungetc (c); + error_with_progname = false; + error (0, 0, _("%s:%d: warning: unterminated string"), + logical_file_name, line_number); + error_with_progname = true; + return P7_STRING_END; + } + + if (c != '\\') + { + *backslash_counter = 0; + return UNICODE (c); + } + + /* Backslash handling. */ + + if (!interpret_ansic && !interpret_unicode) + { + ++*backslash_counter; + return UNICODE ('\\'); + } + + /* Dispatch according to the character following the backslash. */ + c = phase2_getc (); + if (c == UEOF) + { + ++*backslash_counter; + return UNICODE ('\\'); + } + + if (interpret_ansic) + switch (c) + { + case '\n': + continue; + case '\\': + ++*backslash_counter; + return UNICODE (c); + case '\'': case '"': + *backslash_counter = 0; + return UNICODE (c); + case 'a': + *backslash_counter = 0; + return UNICODE ('\a'); + case 'b': + *backslash_counter = 0; + return UNICODE ('\b'); + case 'f': + *backslash_counter = 0; + return UNICODE ('\f'); + case 'n': + *backslash_counter = 0; + return UNICODE ('\n'); + case 'r': + *backslash_counter = 0; + return UNICODE ('\r'); + case 't': + *backslash_counter = 0; + return UNICODE ('\t'); + case 'v': + *backslash_counter = 0; + return UNICODE ('\v'); + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + { + int n = c - '0'; + + c = phase2_getc (); + if (c != UEOF) + { + if (c >= '0' && c <= '7') + { + n = (n << 3) + (c - '0'); + c = phase2_getc (); + if (c != UEOF) + { + if (c >= '0' && c <= '7') + n = (n << 3) + (c - '0'); + else + phase2_ungetc (c); + } + } + else + phase2_ungetc (c); + } + *backslash_counter = 0; + if (interpret_unicode) + return UNICODE (n); + else + return (unsigned char) n; + } + case 'x': + { + int c1 = 
phase2_getc (); + int n1; + + if (c1 >= '0' && c1 <= '9') + n1 = c1 - '0'; + else if (c1 >= 'A' && c1 <= 'F') + n1 = c1 - 'A' + 10; + else if (c1 >= 'a' && c1 <= 'f') + n1 = c1 - 'a' + 10; + else + n1 = -1; + + if (n1 >= 0) + { + int c2 = phase2_getc (); + int n2; + + if (c2 >= '0' && c2 <= '9') + n2 = c2 - '0'; + else if (c2 >= 'A' && c2 <= 'F') + n2 = c2 - 'A' + 10; + else if (c2 >= 'a' && c2 <= 'f') + n2 = c2 - 'a' + 10; + else + n2 = -1; + + if (n2 >= 0) + { + int n = (n1 << 4) + n2; + *backslash_counter = 0; + if (interpret_unicode) + return UNICODE (n); + else + return (unsigned char) n; + } + + phase2_ungetc (c2); + } + phase2_ungetc (c1); + phase2_ungetc (c); + ++*backslash_counter; + return UNICODE ('\\'); + } + } + + if (interpret_unicode) + { + if (c == 'u') + { + unsigned char buf[4]; + unsigned int n = 0; + int i; + + for (i = 0; i < 4; i++) + { + int c1 = phase2_getc (); + + if (c1 >= '0' && c1 <= '9') + n = (n << 4) + (c1 - '0'); + else if (c1 >= 'A' && c1 <= 'F') + n = (n << 4) + (c1 - 'A' + 10); + else if (c1 >= 'a' && c1 <= 'f') + n = (n << 4) + (c1 - 'a' + 10); + else + { + phase2_ungetc (c1); + while (--i >= 0) + phase2_ungetc (buf[i]); + phase2_ungetc (c); + ++*backslash_counter; + return UNICODE ('\\'); + } + + buf[i] = c1; + } + *backslash_counter = 0; + return UNICODE (n); + } + + if (interpret_ansic) + { + if (c == 'U') + { + unsigned char buf[8]; + unsigned int n = 0; + int i; + + for (i = 0; i < 8; i++) + { + int c1 = phase2_getc (); + + if (c1 >= '0' && c1 <= '9') + n = (n << 4) + (c1 - '0'); + else if (c1 >= 'A' && c1 <= 'F') + n = (n << 4) + (c1 - 'A' + 10); + else if (c1 >= 'a' && c1 <= 'f') + n = (n << 4) + (c1 - 'a' + 10); + else + { + phase2_ungetc (c1); + while (--i >= 0) + phase2_ungetc (buf[i]); + phase2_ungetc (c); + ++*backslash_counter; + return UNICODE ('\\'); + } + + buf[i] = c1; + } + if (n < 0x110000) + { + *backslash_counter = 0; + return UNICODE (n); + } + + error_with_progname = false; + error (0, 0, _("%s:%d: warning: 
invalid Unicode character"), + logical_file_name, line_number); + error_with_progname = true; + + while (--i >= 0) + phase2_ungetc (buf[i]); + phase2_ungetc (c); + ++*backslash_counter; + return UNICODE ('\\'); + } + + if (c == 'N') + { + int c1 = phase2_getc (); + if (c1 == '{') + { + unsigned char buf[UNINAME_MAX + 1]; + int i; + unsigned int n; + + for (i = 0; i < UNINAME_MAX; i++) + { + int c2 = phase2_getc (); + if (!(c2 >= ' ' && c2 <= '~')) + { + phase2_ungetc (c2); + while (--i >= 0) + phase2_ungetc (buf[i]); + phase2_ungetc (c1); + phase2_ungetc (c); + ++*backslash_counter; + return UNICODE ('\\'); + } + if (c2 == '}') + break; + buf[i] = c2; + } + buf[i] = '\0'; + + n = unicode_name_character ((char *) buf); + if (n != UNINAME_INVALID) + { + *backslash_counter = 0; + return UNICODE (n); + } + + phase2_ungetc ('}'); + while (--i >= 0) + phase2_ungetc (buf[i]); + } + phase2_ungetc (c1); + phase2_ungetc (c); + ++*backslash_counter; + return UNICODE ('\\'); + } + } + } + + phase2_ungetc (c); + ++*backslash_counter; + return UNICODE ('\\'); + } +} + + +/* Combine characters into tokens. Discard whitespace except newlines at + the end of logical lines. */ + +/* Number of pending open parentheses/braces/brackets. */ +static int open_pbb; + +static token_ty phase5_pushback[2]; +static int phase5_pushback_length; + +static void +phase5_get (token_ty *tp) +{ + int c; + + if (phase5_pushback_length) + { + *tp = phase5_pushback[--phase5_pushback_length]; + return; + } + + for (;;) + { + tp->line_number = line_number; + c = phase3_getc (); + + switch (c) + { + case UEOF: + tp->type = token_type_eof; + return; + + case ' ': + case '\t': + case '\f': + /* Ignore whitespace and comments. */ + continue; + + case '\n': + if (last_non_comment_line > last_comment_line) + savable_comment_reset (); + /* Ignore newline if and only if it is used for implicit line + joining. 
*/ + if (open_pbb > 0) + continue; + tp->type = token_type_other; + return; + } + + last_non_comment_line = tp->line_number; + + switch (c) + { + case '.': + { + int c1 = phase3_getc (); + phase3_ungetc (c1); + if (!(c1 >= '0' && c1 <= '9')) + { + + tp->type = token_type_other; + return; + } + } + /* FALLTHROUGH */ + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': + case 'S': case 'T': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': + case 's': case 't': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + symbol: + /* Symbol, or part of a number. */ + { + static char *buffer; + static int bufmax; + int bufpos; + + bufpos = 0; + for (;;) + { + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + c = phase3_getc (); + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + continue; + default: + phase3_ungetc (c); + break; + } + break; + } + if (bufpos >= bufmax) + { + bufmax = 
2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = '\0'; + tp->string = xstrdup (buffer); + tp->type = token_type_symbol; + return; + } + + /* Strings. */ + { + struct mixed_string_buffer *bp; + int quote_char; + bool interpret_ansic; + bool interpret_unicode; + bool triple; + unsigned int backslash_counter; + + case 'R': case 'r': + { + int c1 = phase2_getc (); + if (c1 == '"' || c1 == '\'') + { + quote_char = c1; + interpret_ansic = false; + interpret_unicode = false; + goto string; + } + phase2_ungetc (c1); + goto symbol; + } + + case 'U': case 'u': + { + int c1 = phase2_getc (); + if (c1 == '"' || c1 == '\'') + { + quote_char = c1; + interpret_ansic = true; + interpret_unicode = true; + goto string; + } + if (c1 == 'R' || c1 == 'r') + { + int c2 = phase2_getc (); + if (c2 == '"' || c2 == '\'') + { + quote_char = c2; + interpret_ansic = false; + interpret_unicode = true; + goto string; + } + phase2_ungetc (c2); + } + phase2_ungetc (c1); + goto symbol; + } + + case '"': case '\'': + quote_char = c; + interpret_ansic = true; + interpret_unicode = false; + string: + triple = false; + lexical_context = lc_string; + { + int c1 = phase2_getc (); + if (c1 == quote_char) + { + int c2 = phase2_getc (); + if (c2 == quote_char) + triple = true; + else + { + phase2_ungetc (c2); + phase2_ungetc (c1); + } + } + else + phase2_ungetc (c1); + } + backslash_counter = 0; + /* Start accumulating the string. */ + bp = mixed_string_buffer_alloc (lexical_context, + logical_file_name, + line_number); + for (;;) + { + int uc = phase7_getuc (quote_char, triple, interpret_ansic, + interpret_unicode, &backslash_counter); + + /* Keep line_number in sync. 
*/ + bp->line_number = line_number; + + if (uc == P7_EOF || uc == P7_STRING_END) + break; + + if (IS_UNICODE (uc)) + { + assert (UNICODE_VALUE (uc) >= 0 + && UNICODE_VALUE (uc) < 0x110000); + mixed_string_buffer_append_unicode (bp, + UNICODE_VALUE (uc)); + } + else + mixed_string_buffer_append_char (bp, uc); + } + tp->string = mixed_string_buffer_done (bp); + tp->comment = add_reference (savable_comment); + lexical_context = lc_outside; + tp->type = token_type_string; + return; + } + + case '(': + open_pbb++; + tp->type = token_type_lparen; + return; + + case ')': + if (open_pbb > 0) + open_pbb--; + tp->type = token_type_rparen; + return; + + case ',': + tp->type = token_type_comma; + return; + + case '[': case '{': + open_pbb++; + tp->type = (c == '[' ? token_type_lbracket : token_type_other); + return; + + case ']': case '}': + if (open_pbb > 0) + open_pbb--; + tp->type = (c == ']' ? token_type_rbracket : token_type_other); + return; + + case '+': + tp->type = token_type_plus; + return; + + default: + /* We could carefully recognize each of the 2 and 3 character + operators, but it is not necessary, as we only need to recognize + gettext invocations. Don't bother. */ + tp->type = token_type_other; + return; + } + } +} + +/* Push the token *TP back: a copy of it is stored at the top of the phase5_pushback array (presumably for phase5_get to hand out again before it reads further input). A token of type token_type_eof is never stored. The array holds at most SIZEOF (phase5_pushback) tokens; pushing onto a full array calls abort (). NOTE(review): x_python_lex pushes back two tokens in a row when a '+' is not followed by a string, so the array needs room for at least two tokens - confirm against its declaration. */ +static void +phase5_unget (token_ty *tp) +{ + if (tp->type != token_type_eof) + { + if (phase5_pushback_length == SIZEOF (phase5_pushback)) + abort (); + phase5_pushback[phase5_pushback_length++] = *tp; + } +} + + +/* Combine adjacent strings to form a single string. Note that the end + of a logical line appears as a token of its own, therefore strings that + belong to different logical lines will not be concatenated. 
*/ + +static void +x_python_lex (token_ty *tp) +{ + phase5_get (tp); + if (tp->type == token_type_string) + { + char *sum = tp->string; + size_t sum_len = strlen (sum); + + for (;;) + { + token_ty token2, *tp2 = NULL; + token_ty token3; + + phase5_get (&token2); + switch (token2.type) + { + case token_type_plus: + { + phase5_get (&token3); + if (token3.type == token_type_string) + { + free_token (&token2); + tp2 = &token3; + } + else + phase5_unget (&token3); + } + break; + case token_type_string: + tp2 = &token2; + break; + default: + break; + } + + if (tp2) + { + char *addend = tp2->string; + size_t addend_len = strlen (addend); + + sum = (char *) xrealloc (sum, sum_len + addend_len + 1); + memcpy (sum + sum_len, addend, addend_len + 1); + sum_len += addend_len; + + free_token (tp2); + continue; + } + phase5_unget (&token2); + break; + } + tp->string = sum; + } +} + + +/* ========================= Extracting strings. ========================== */ + + +/* Context lookup table. */ +static flag_context_list_table_ty *flag_context_list_table; + + +/* The file is broken into tokens. Scan the token stream, looking for + a keyword, followed by a left paren, followed by a string. When we + see this sequence, we have something to remember. We assume we are + looking at a valid C or C++ program, and leave the complaints about + the grammar to the compiler. + + Normal handling: Look for + keyword ( ... msgid ... ) + Plural handling: Look for + keyword ( ... msgid ... msgid_plural ... ) + + We use recursion because the arguments before msgid or between msgid + and msgid_plural can contain subexpressions of the same form. */ + + +/* Extract messages until the next balanced closing parenthesis or bracket. + Extracted messages are added to MLP. + DELIM can be either token_type_rparen or token_type_rbracket, or + token_type_eof to accept both. + Return true upon eof, false upon closing parenthesis or bracket. 
*/ +static bool +extract_balanced (message_list_ty *mlp, + token_type_ty delim, + flag_context_ty outer_context, + flag_context_list_iterator_ty context_iter, + struct arglist_parser *argparser) +{ + /* Current argument number. */ + int arg = 1; + /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ + int state; + /* Parameters of the keyword just seen. Defined only in state 1. */ + const struct callshapes *next_shapes = NULL; + /* Context iterator that will be used if the next token is a '('. */ + flag_context_list_iterator_ty next_context_iter = + passthrough_context_list_iterator; + /* Current context. */ + flag_context_ty inner_context = + inherited_context (outer_context, + flag_context_list_iterator_advance (&context_iter)); + + /* Start state is 0. */ + state = 0; + + for (;;) + { + token_ty token; + + x_python_lex (&token); + switch (token.type) + { + case token_type_symbol: + { + void *keyword_value; + + if (hash_find_entry (&keywords, token.string, strlen (token.string), + &keyword_value) + == 0) + { + next_shapes = (const struct callshapes *) keyword_value; + state = 1; + } + else + state = 0; + } + next_context_iter = + flag_context_list_iterator ( + flag_context_list_table_lookup ( + flag_context_list_table, + token.string, strlen (token.string))); + free (token.string); + continue; + + case token_type_lparen: + if (extract_balanced (mlp, token_type_rparen, + inner_context, next_context_iter, + arglist_parser_alloc (mlp, + state ? 
next_shapes : NULL))) + { + xgettext_current_source_encoding = po_charset_utf8; + arglist_parser_done (argparser, arg); + xgettext_current_source_encoding = xgettext_current_file_source_encoding; + return true; + } + next_context_iter = null_context_list_iterator; + state = 0; + continue; + + case token_type_rparen: + if (delim == token_type_rparen || delim == token_type_eof) + { + xgettext_current_source_encoding = po_charset_utf8; + arglist_parser_done (argparser, arg); + xgettext_current_source_encoding = xgettext_current_file_source_encoding; + return false; + } + next_context_iter = null_context_list_iterator; + state = 0; + continue; + + case token_type_comma: + arg++; + inner_context = + inherited_context (outer_context, + flag_context_list_iterator_advance ( + &context_iter)); + next_context_iter = passthrough_context_list_iterator; + state = 0; + continue; + + case token_type_lbracket: + if (extract_balanced (mlp, token_type_rbracket, + null_context, null_context_list_iterator, + arglist_parser_alloc (mlp, NULL))) + { + xgettext_current_source_encoding = po_charset_utf8; + arglist_parser_done (argparser, arg); + xgettext_current_source_encoding = xgettext_current_file_source_encoding; + return true; + } + next_context_iter = null_context_list_iterator; + state = 0; + continue; + + case token_type_rbracket: + if (delim == token_type_rbracket || delim == token_type_eof) + { + xgettext_current_source_encoding = po_charset_utf8; + arglist_parser_done (argparser, arg); + xgettext_current_source_encoding = xgettext_current_file_source_encoding; + return false; + } + next_context_iter = null_context_list_iterator; + state = 0; + continue; + + case token_type_string: + { + lex_pos_ty pos; + pos.file_name = logical_file_name; + pos.line_number = token.line_number; + + xgettext_current_source_encoding = po_charset_utf8; + if (extract_all) + remember_a_message (mlp, NULL, token.string, inner_context, + &pos, NULL, token.comment); + else + arglist_parser_remember 
(argparser, arg, token.string, + inner_context, + pos.file_name, pos.line_number, + token.comment); + xgettext_current_source_encoding = xgettext_current_file_source_encoding; + } + drop_reference (token.comment); + next_context_iter = null_context_list_iterator; + state = 0; + continue; + + case token_type_eof: + xgettext_current_source_encoding = po_charset_utf8; + arglist_parser_done (argparser, arg); + xgettext_current_source_encoding = xgettext_current_file_source_encoding; + return true; + + case token_type_plus: + case token_type_other: + next_context_iter = null_context_list_iterator; + state = 0; + continue; + + default: + abort (); + } + } +} + + +void +extract_python (FILE *f, + const char *real_filename, const char *logical_filename, + flag_context_list_table_ty *flag_table, + msgdomain_list_ty *mdlp) +{ + message_list_ty *mlp = mdlp->item[0]->messages; + + fp = f; + real_file_name = real_filename; + logical_file_name = xstrdup (logical_filename); + line_number = 1; + + lexical_context = lc_outside; + + last_comment_line = -1; + last_non_comment_line = -1; + + xgettext_current_file_source_encoding = xgettext_global_source_encoding; +#if HAVE_ICONV + xgettext_current_file_source_iconv = xgettext_global_source_iconv; +#endif + + xgettext_current_source_encoding = xgettext_current_file_source_encoding; +#if HAVE_ICONV + xgettext_current_source_iconv = xgettext_current_file_source_iconv; +#endif + + continuation_or_nonblank_line = false; + + open_pbb = 0; + + flag_context_list_table = flag_table; + + init_keywords (); + + /* Eat tokens until eof is seen. When extract_balanced returns + due to an unbalanced closing parenthesis, just restart it. */ + while (!extract_balanced (mlp, token_type_eof, + null_context, null_context_list_iterator, + arglist_parser_alloc (mlp, NULL))) + ; + + fp = NULL; + real_file_name = NULL; + logical_file_name = NULL; + line_number = 0; +} |