summaryrefslogtreecommitdiff
path: root/gcc
diff options
context:
space:
mode:
author zack <zack@138bc75d-0d04-0410-961f-82ee72b054a4> 2003-07-05 00:24:00 +0000
committer zack <zack@138bc75d-0d04-0410-961f-82ee72b054a4> 2003-07-05 00:24:00 +0000
commit ebc0381062ddc4bc6408feb6516dcbc6c3525e92 (patch)
tree 34c7734f7acee49beff2b3d99cbdf53576456697 /gcc
parent c472286c1a11cc726c1f0365e5805197dd96e41e (diff)
download gcc-ebc0381062ddc4bc6408feb6516dcbc6c3525e92.tar.gz
* cpplib.h (CPP_AT_NAME, CPP_OBJC_STRING): New token types.
(struct cpp_options): Add narrow_charset, wide_charset, bytes_big_endian fields. Remove EBCDIC field. (cpp_init_iconv, cpp_interpret_string): New external interfaces. * cpphash.h: Include <iconv.h> if we have it, otherwise provide a dummy definition of iconv_t. (struct cpp_reader): Add narrow_cset_desc and wide_cset_desc fields. (_cpp_valid_ucn): Update prototype. (_cpp_destroy_iconv): New prototype. * doc/cpp.texi: Document character set handling. * doc/cppopts.texi: Document -fexec-charset= and -fexec-wide-charset=. * doc/extend.texi: Delete entire section on multiline strings. Rewrite section on __FUNCTION__ etc now that these are variables in C. * cppucnid.tab, cppucnid.pl: New files. * cppucnid.h: New generated file. * cppcharset.c: Include cppucnid.h. Lots of commentary added. (iconv_open, iconv, iconv_close): Provide dummy definitions if !HAVE_ICONV. (SOURCE_CHARSET, struct strbuf, init_iconv_desc, cpp_init_iconv, _cpp_destroy_iconv, convert_cset, width_to_mask, convert_ucn, emit_numeric_escape, convert_hex, convert_oct, convert_escape, cpp_interpret_string, narrow_str_to_charconst, wide_str_to_charconst): New. (ucn_valid_in_identifier): Use a binary search through the ucnranges table defined in cppucnid.h, not a long chain of if statements. (_cpp_valid_ucn): Add a limit pointer. Downgrade "universal character names are only valid in C++ and C99" to a warning. Issue the "meaning of \[uU] is different in traditional C" warning here. Take care not to let iconv see an invalid UCS value if we get a malformed UCN. Issue an error if we don't have iconv. (cpp_interpret_charconst): Moved here from cpplex.c. Use cpp_interpret_string to do the heavy lifting. * cppinit.c (cpp_create_reader): Initialize bytes_big_endian, narrow_charset, wide_charset fields of options structure. (cpp_destroy): Call _cpp_destroy_iconv. * cpplex.c (forms_identifier_p): Adjust call to _cpp_valid_ucn. (maybe_read_ucn, hex_digit_value, cpp_parse_escape): Delete. 
(cpp_interpret_charconst): Moved to cppcharset.c. * cpplib.c (dequote_string): Delete. (interpret_string_notranslate): New. (do_line, do_linemarker): Use interpret_string_notranslate. * Makefile.in (cppcharset.o): Depend on cppucnid.h. * c-common.c (fname_string, combine_strings): Delete. * c-common.h (fname_string, combine_strings): Delete prototypes. * c-lex.c (ignore_escape_flag): Delete. (cb_ident): Use cpp_interpret_string, not lex_string. (get_nonpadding_token): New function. (c_lex): Handle Objective-C @-prefixed identifiers and strings here. Adjust calls to lex_string. Don't write *value twice. (lex_string): Now handles string constant concatenation. Most of the work handed off to cpp_interpret_string. Call fix_string_type here. * c-parse.in (STRING_FUNC_NAME, VAR_FUNC_NAME): Replace with FUNC_NAME, throughout. (OBJC_STRING): New token type. (primary:STRING): No need to call fix_string_type here. (primary:objc_string): Make that OBJC_STRING. (objc_string nonterminal): Delete. (yylexname): Delete code to handle fake string constants. (yylexstring): Delete entirely. (_yylex): Handle CPP_AT_NAME and CPP_OBJC_STRING. No need to handle CPP_ATSIGN. * c.opt (-fexec-charset=, -fwide-exec-charset=): New options. * c-opts.c (missing_arg, c_common_handle_option): Handle OPT_fexec_charset_ and OPT_fwide_exec_charset_. (c_common_init): Set cpp_opts->bytes_big_endian, not cpp_opts->EBCDIC. Call cpp_init_iconv. (print_help): Document -fexec-charset= and -fexec-wide-charset=. (TARGET_EBCDIC): Delete default definition. * objc/objc-act.c (build_objc_string_object): No need to handle string constant concatenation. cp: * parser.c (cp_lexer_read_token): No need to handle string constant concatenation. testsuite: * gcc.c-torture/execute/wchar_t-1.x: New file; XFAIL wchar_t-1.c everywhere. * gcc.dg/concat.c: Concatenation of string constants with __FUNCTION__ / __PRETTY_FUNCTION__ is now a hard error. * gcc.dg/wtr-strcat-1.c: Loosen dg-warning regexp. 
* gcc.dg/cpp/escape-2.c: Use wide character constants where necessary to avoid multi-character character constant warning. * gcc.dg/cpp/escape.c: Likewise. * gcc.dg/cpp/ucs.c: Likewise. Remove backslashes from dg-bogus comments, as they confuse Tcl. Fix a typo. libstdc++-v3: * testsuite/22_locale/collate/compare/wchar_t/2.cc * testsuite/22_locale/collate/compare/wchar_t/wrapped_env.cc * testsuite/22_locale/collate/compare/wchar_t/wrapped_locale.cc * testsuite/22_locale/collate/hash/wchar_t/2.cc * testsuite/22_locale/collate/hash/wchar_t/wrapped_env.cc * testsuite/22_locale/collate/hash/wchar_t/wrapped_locale.cc * testsuite/22_locale/collate/transform/wchar_t/2.cc * testsuite/22_locale/collate/transform/wchar_t/wrapped_env.cc * testsuite/22_locale/collate/transform/wchar_t/wrapped_locale.cc: XFAIL on all targets. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@68952 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc')
-rw-r--r--gcc/ChangeLog85
-rw-r--r--gcc/Makefile.in2
-rw-r--r--gcc/c-common.c119
-rw-r--r--gcc/c-common.h2
-rw-r--r--gcc/c-lex.c195
-rw-r--r--gcc/c-opts.c26
-rw-r--r--gcc/c-parse.in138
-rw-r--r--gcc/c.opt6
-rw-r--r--gcc/cp/ChangeLog43
-rw-r--r--gcc/cp/parser.c74
-rw-r--r--gcc/cppcharset.c1238
-rw-r--r--gcc/cpphash.h19
-rw-r--r--gcc/cppinit.c6
-rw-r--r--gcc/cpplex.c289
-rw-r--r--gcc/cpplib.c50
-rw-r--r--gcc/cpplib.h20
-rw-r--r--gcc/cppucnid.h336
-rw-r--r--gcc/cppucnid.pl130
-rw-r--r--gcc/cppucnid.tab239
-rw-r--r--gcc/doc/cpp.texi86
-rw-r--r--gcc/doc/cppopts.texi15
-rw-r--r--gcc/doc/extend.texi115
-rw-r--r--gcc/objc/objc-act.c26
-rw-r--r--gcc/testsuite/ChangeLog110
-rw-r--r--gcc/testsuite/gcc.c-torture/execute/wchar_t-1.x3
-rw-r--r--gcc/testsuite/gcc.dg/concat.c12
-rw-r--r--gcc/testsuite/gcc.dg/cpp/escape-2.c4
-rw-r--r--gcc/testsuite/gcc.dg/cpp/escape.c4
-rw-r--r--gcc/testsuite/gcc.dg/cpp/ucs.c8
-rw-r--r--gcc/testsuite/gcc.dg/wtr-strcat-1.c2
30 files changed, 2060 insertions, 1342 deletions
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 4cf39f31324..0b077b0b041 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,88 @@
+2003-07-04 Zack Weinberg <zack@codesourcery.com>
+
+ * cpplib.h (CPP_AT_NAME, CPP_OBJC_STRING): New token types.
+ (struct cpp_options): Add narrow_charset, wide_charset,
+ bytes_big_endian fields. Remove EBCDIC field.
+ (cpp_init_iconv, cpp_interpret_string): New external interfaces.
+
+ * cpphash.h: Include <iconv.h> if we have it, otherwise
+ provide a dummy definition of iconv_t.
+ (struct cpp_reader): Add narrow_cset_desc and wide_cset_desc fields.
+ (_cpp_valid_ucn): Update prototype.
+ (_cpp_destroy_iconv): New prototype.
+
+ * doc/cpp.texi: Document character set handling.
+ * doc/cppopts.texi: Document -fexec-charset= and -fwide-exec-charset=.
+ * doc/extend.texi: Delete entire section on multiline strings.
+ Rewrite section on __FUNCTION__ etc now that these are
+ variables in C.
+
+ * cppucnid.tab, cppucnid.pl: New files.
+ * cppucnid.h: New generated file.
+ * cppcharset.c: Include cppucnid.h. Lots of commentary added.
+ (iconv_open, iconv, iconv_close): Provide dummy definitions
+ if !HAVE_ICONV.
+ (SOURCE_CHARSET, struct strbuf, init_iconv_desc, cpp_init_iconv,
+ _cpp_destroy_iconv, convert_cset, width_to_mask, convert_ucn,
+ emit_numeric_escape, convert_hex, convert_oct, convert_escape,
+ cpp_interpret_string, narrow_str_to_charconst,
+ wide_str_to_charconst): New.
+ (ucn_valid_in_identifier): Use a binary search through the
+ ucnranges table defined in cppucnid.h, not a long chain of if
+ statements.
+ (_cpp_valid_ucn): Add a limit pointer. Downgrade "universal
+ character names are only valid in C++ and C99" to a warning.
+ Issue the "meaning of \[uU] is different in traditional C"
+ warning here. Take care not to let iconv see an invalid UCS
+ value if we get a malformed UCN. Issue an error if we don't
+ have iconv.
+ (cpp_interpret_charconst): Moved here from cpplex.c. Use
+ cpp_interpret_string to do the heavy lifting.
+
+ * cppinit.c (cpp_create_reader): Initialize bytes_big_endian,
+ narrow_charset, wide_charset fields of options structure.
+ (cpp_destroy): Call _cpp_destroy_iconv.
+ * cpplex.c (forms_identifier_p): Adjust call to _cpp_valid_ucn.
+ (maybe_read_ucn, hex_digit_value, cpp_parse_escape): Delete.
+ (cpp_interpret_charconst): Moved to cppcharset.c.
+ * cpplib.c (dequote_string): Delete.
+ (interpret_string_notranslate): New.
+ (do_line, do_linemarker): Use interpret_string_notranslate.
+
+ * Makefile.in (cppcharset.o): Depend on cppucnid.h.
+
+ * c-common.c (fname_string, combine_strings): Delete.
+ * c-common.h (fname_string, combine_strings): Delete prototypes.
+ * c-lex.c (ignore_escape_flag): Delete.
+ (cb_ident): Use cpp_interpret_string, not lex_string.
+ (get_nonpadding_token): New function.
+ (c_lex): Handle Objective-C @-prefixed identifiers and strings here.
+ Adjust calls to lex_string. Don't write *value twice.
+ (lex_string): Now handles string constant concatenation.
+ Most of the work handed off to cpp_interpret_string.
+ Call fix_string_type here.
+ * c-parse.in (STRING_FUNC_NAME, VAR_FUNC_NAME): Replace with
+ FUNC_NAME, throughout.
+ (OBJC_STRING): New token type.
+ (primary:STRING): No need to call fix_string_type here.
+ (primary:objc_string): Make that OBJC_STRING.
+ (objc_string nonterminal): Delete.
+ (yylexname): Delete code to handle fake string constants.
+ (yylexstring): Delete entirely.
+ (_yylex): Handle CPP_AT_NAME and CPP_OBJC_STRING. No need
+ to handle CPP_ATSIGN.
+
+ * c.opt (-fexec-charset=, -fwide-exec-charset=): New options.
+ * c-opts.c (missing_arg, c_common_handle_option): Handle
+ OPT_fexec_charset_ and OPT_fwide_exec_charset_.
+ (c_common_init): Set cpp_opts->bytes_big_endian, not
+ cpp_opts->EBCDIC. Call cpp_init_iconv.
+ (print_help): Document -fexec-charset= and -fwide-exec-charset=.
+ (TARGET_EBCDIC): Delete default definition.
+
+ * objc/objc-act.c (build_objc_string_object): No need to
+ handle string constant concatenation.
+
2003-07-04 Kazu Hirata <kazu@cs.umass.edu>
* doc/install.texi: Fix typos.
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 7190dacda2f..7b475735f66 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -2351,7 +2351,7 @@ libcpp.a: $(LIBCPP_OBJS)
$(AR) $(AR_FLAGS) libcpp.a $(LIBCPP_OBJS)
-$(RANLIB) libcpp.a
-cppcharset.o: cppcharset.c $(LIBCPP_DEPS)
+cppcharset.o: cppcharset.c $(LIBCPP_DEPS) cppucnid.h
cpperror.o: cpperror.c $(LIBCPP_DEPS)
cppexp.o: cppexp.c $(LIBCPP_DEPS)
cpplex.o: cpplex.c $(LIBCPP_DEPS)
diff --git a/gcc/c-common.c b/gcc/c-common.c
index 6513ca81c0d..341018c0e5b 100644
--- a/gcc/c-common.c
+++ b/gcc/c-common.c
@@ -1084,20 +1084,6 @@ fname_as_string (int pretty_p)
return name;
}
-/* Return the text name of the current function, formatted as
- required by the supplied RID value. */
-
-const char *
-fname_string (unsigned int rid)
-{
- unsigned ix;
-
- for (ix = 0; fname_vars[ix].decl; ix++)
- if (fname_vars[ix].rid == rid)
- break;
- return fname_as_string (fname_vars[ix].pretty);
-}
-
/* Return the VAR_DECL for a const char array naming the current
function. If the VAR_DECL has not yet been created, create it
now. RID indicates how it should be formatted and IDENTIFIER_NODE
@@ -1190,111 +1176,6 @@ fix_string_type (tree value)
TREE_STATIC (value) = 1;
return value;
}
-
-/* Given a VARRAY of STRING_CST nodes, concatenate them into one
- STRING_CST. */
-
-tree
-combine_strings (varray_type strings)
-{
- const int wchar_bytes = TYPE_PRECISION (wchar_type_node) / BITS_PER_UNIT;
- const int nstrings = VARRAY_ACTIVE_SIZE (strings);
- tree value, t;
- int length = 1;
- int wide_length = 0;
- int wide_flag = 0;
- int i;
- char *p, *q;
-
- /* Don't include the \0 at the end of each substring. Count wide
- strings and ordinary strings separately. */
- for (i = 0; i < nstrings; ++i)
- {
- t = VARRAY_TREE (strings, i);
-
- if (TREE_TYPE (t) == wchar_array_type_node)
- {
- wide_length += TREE_STRING_LENGTH (t) - wchar_bytes;
- wide_flag = 1;
- }
- else
- {
- length += (TREE_STRING_LENGTH (t) - 1);
- if (C_ARTIFICIAL_STRING_P (t) && !in_system_header)
- warning ("concatenation of string literals with __FUNCTION__ is deprecated");
- }
- }
-
- /* If anything is wide, the non-wides will be converted,
- which makes them take more space. */
- if (wide_flag)
- length = length * wchar_bytes + wide_length;
-
- p = xmalloc (length);
-
- /* Copy the individual strings into the new combined string.
- If the combined string is wide, convert the chars to ints
- for any individual strings that are not wide. */
-
- q = p;
- for (i = 0; i < nstrings; ++i)
- {
- int len, this_wide;
-
- t = VARRAY_TREE (strings, i);
- this_wide = TREE_TYPE (t) == wchar_array_type_node;
- len = TREE_STRING_LENGTH (t) - (this_wide ? wchar_bytes : 1);
- if (this_wide == wide_flag)
- {
- memcpy (q, TREE_STRING_POINTER (t), len);
- q += len;
- }
- else
- {
- const int nzeros = (TYPE_PRECISION (wchar_type_node)
- / BITS_PER_UNIT) - 1;
- int j, k;
-
- if (BYTES_BIG_ENDIAN)
- {
- for (k = 0; k < len; k++)
- {
- for (j = 0; j < nzeros; j++)
- *q++ = 0;
- *q++ = TREE_STRING_POINTER (t)[k];
- }
- }
- else
- {
- for (k = 0; k < len; k++)
- {
- *q++ = TREE_STRING_POINTER (t)[k];
- for (j = 0; j < nzeros; j++)
- *q++ = 0;
- }
- }
- }
- }
-
- /* Nul terminate the string. */
- if (wide_flag)
- {
- for (i = 0; i < wchar_bytes; i++)
- *q++ = 0;
- }
- else
- *q = 0;
-
- value = build_string (length, p);
- free (p);
-
- if (wide_flag)
- TREE_TYPE (value) = wchar_array_type_node;
- else
- TREE_TYPE (value) = char_array_type_node;
-
- return value;
-}
static int is_valid_printf_arglist (tree);
static rtx c_expand_builtin (tree, rtx, enum machine_mode,
diff --git a/gcc/c-common.h b/gcc/c-common.h
index f4dc8f7c4c1..d1c3e5a0fff 100644
--- a/gcc/c-common.h
+++ b/gcc/c-common.h
@@ -883,7 +883,6 @@ extern void start_fname_decls (void);
extern void finish_fname_decls (void);
extern const char *fname_as_string (int);
extern tree fname_decl (unsigned, tree);
-extern const char *fname_string (unsigned);
extern void check_function_arguments (tree, tree);
extern void check_function_arguments_recurse (void (*)
@@ -922,7 +921,6 @@ extern void c_expand_end_cond (void);
extern tree check_case_value (tree);
extern tree fix_string_type (tree);
struct varray_head_tag;
-extern tree combine_strings (struct varray_head_tag *);
extern void constant_expression_warning (tree);
extern tree convert_and_check (tree, tree);
extern void overflow_warning (tree);
diff --git a/gcc/c-lex.c b/gcc/c-lex.c
index 2cca2313c2f..f5733604a5a 100644
--- a/gcc/c-lex.c
+++ b/gcc/c-lex.c
@@ -61,16 +61,13 @@ static splay_tree file_info_tree;
int pending_lang_change; /* If we need to switch languages - C++ only */
int c_header_level; /* depth in C headers - C++ only */
-/* Nonzero tells yylex to ignore \ in string constants. */
-static int ignore_escape_flag;
-
static tree interpret_integer (const cpp_token *, unsigned int);
static tree interpret_float (const cpp_token *, unsigned int);
static enum integer_type_kind
narrowest_unsigned_type (tree, unsigned int);
static enum integer_type_kind
narrowest_signed_type (tree, unsigned int);
-static tree lex_string (const cpp_string *);
+static enum cpp_ttype lex_string (const cpp_token *, tree *, bool);
static tree lex_charconst (const cpp_token *);
static void update_header_times (const char *);
static int dump_one_header (splay_tree_node, void *);
@@ -184,8 +181,12 @@ cb_ident (cpp_reader *pfile ATTRIBUTE_UNUSED,
if (! flag_no_ident)
{
/* Convert escapes in the string. */
- tree value ATTRIBUTE_UNUSED = lex_string (str);
- ASM_OUTPUT_IDENT (asm_out_file, TREE_STRING_POINTER (value));
+ cpp_string cstr = { 0, 0 };
+ if (cpp_interpret_string (pfile, str, 1, &cstr, false))
+ {
+ ASM_OUTPUT_IDENT (asm_out_file, cstr.text);
+ free ((void *)cstr.text);
+ }
}
#endif
}
@@ -296,12 +297,10 @@ cb_undef (cpp_reader *pfile ATTRIBUTE_UNUSED, unsigned int line,
(const char *) NODE_NAME (node));
}
-int
-c_lex (tree *value)
+static inline const cpp_token *
+get_nonpadding_token (void)
{
const cpp_token *tok;
-
- retry:
timevar_push (TV_CPP);
do
tok = cpp_get_token (parse_in);
@@ -310,10 +309,22 @@ c_lex (tree *value)
/* The C++ front end does horrible things with the current line
number. To ensure an accurate line number, we must reset it
- every time we return a token. */
+ every time we advance a token. */
input_line = src_lineno;
- *value = NULL_TREE;
+ return tok;
+}
+
+int
+c_lex (tree *value)
+{
+ const cpp_token *tok;
+ location_t atloc;
+
+ retry:
+ tok = get_nonpadding_token ();
+
+ retry_after_at:
switch (tok->type)
{
case CPP_NAME:
@@ -345,6 +356,37 @@ c_lex (tree *value)
}
break;
+ case CPP_ATSIGN:
+ /* An @ may give the next token special significance in Objective-C. */
+ atloc = input_location;
+ tok = get_nonpadding_token ();
+ if (c_dialect_objc ())
+ {
+ tree val;
+ switch (tok->type)
+ {
+ case CPP_NAME:
+ val = HT_IDENT_TO_GCC_IDENT (HT_NODE (tok->val.node));
+ if (C_IS_RESERVED_WORD (val)
+ && OBJC_IS_AT_KEYWORD (C_RID_CODE (val)))
+ {
+ *value = val;
+ return CPP_AT_NAME;
+ }
+ break;
+
+ case CPP_STRING:
+ case CPP_WSTRING:
+ return lex_string (tok, value, true);
+
+ default: break;
+ }
+ }
+
+ /* ... or not. */
+ error ("%Hstray '@' in program", &atloc);
+ goto retry_after_at;
+
case CPP_OTHER:
{
cppchar_t c = tok->val.str.text[0];
@@ -365,7 +407,7 @@ c_lex (tree *value)
case CPP_STRING:
case CPP_WSTRING:
- *value = lex_string (&tok->val.str);
+ return lex_string (tok, value, false);
break;
/* These tokens should not be visible outside cpplib. */
@@ -374,7 +416,9 @@ c_lex (tree *value)
case CPP_MACRO_ARG:
abort ();
- default: break;
+ default:
+ *value = NULL_TREE;
+ break;
}
return tok->type;
@@ -571,75 +615,100 @@ interpret_float (const cpp_token *token, unsigned int flags)
return value;
}
-static tree
-lex_string (const cpp_string *str)
+/* Convert a series of STRING and/or WSTRING tokens into a tree,
+ performing string constant concatenation. TOK is the first of
+ these. VALP is the location to write the string into. OBJC_STRING
+ indicates whether an '@' token preceded the incoming token.
+ Returns the CPP token type of the result (CPP_STRING, CPP_WSTRING,
+ or CPP_OBJC_STRING).
+
+ This is unfortunately more work than it should be. If any of the
+ strings in the series has an L prefix, the result is a wide string
+ (6.4.5p4). Whether or not the result is a wide string affects the
+ meaning of octal and hexadecimal escapes (6.4.4.4p6,9). But escape
+ sequences do not continue across the boundary between two strings in
+ a series (6.4.5p7), so we must not lose the boundaries. Therefore
+ cpp_interpret_string takes a vector of cpp_string structures, which
+ we must arrange to provide. */
+
+static enum cpp_ttype
+lex_string (const cpp_token *tok, tree *valp, bool objc_string)
{
- bool wide;
tree value;
- char *buf, *q;
- cppchar_t c;
- const unsigned char *p, *limit;
+ bool wide = false;
+ size_t count = 1;
+ struct obstack str_ob;
+ cpp_string istr;
- wide = str->text[0] == 'L';
- p = str->text + 1 + wide;
- limit = str->text + str->len - 1;
- q = buf = alloca ((str->len + 1) * (wide ? WCHAR_BYTES : 1));
+ /* Try to avoid the overhead of creating and destroying an obstack
+ for the common case of just one string. */
+ cpp_string str = tok->val.str;
+ cpp_string *strs = &str;
- while (p < limit)
- {
- c = *p++;
+ if (tok->type == CPP_WSTRING)
+ wide = true;
- if (c == '\\' && !ignore_escape_flag)
- c = cpp_parse_escape (parse_in, &p, limit, wide);
+ tok = get_nonpadding_token ();
+ if (c_dialect_objc () && tok->type == CPP_ATSIGN)
+ {
+ objc_string = true;
+ tok = get_nonpadding_token ();
+ }
+ if (tok->type == CPP_STRING || tok->type == CPP_WSTRING)
+ {
+ gcc_obstack_init (&str_ob);
+ obstack_grow (&str_ob, &str, sizeof (cpp_string));
- /* Add this single character into the buffer either as a wchar_t,
- a multibyte sequence, or as a single byte. */
- if (wide)
+ do
{
- unsigned charwidth = TYPE_PRECISION (char_type_node);
- unsigned bytemask = (1 << charwidth) - 1;
- int byte;
-
- for (byte = 0; byte < WCHAR_BYTES; ++byte)
+ count++;
+ if (tok->type == CPP_WSTRING)
+ wide = true;
+ obstack_grow (&str_ob, &tok->val.str, sizeof (cpp_string));
+
+ tok = get_nonpadding_token ();
+ if (c_dialect_objc () && tok->type == CPP_ATSIGN)
{
- int n;
- if (byte >= (int) sizeof (c))
- n = 0;
- else
- n = (c >> (byte * charwidth)) & bytemask;
- if (BYTES_BIG_ENDIAN)
- q[WCHAR_BYTES - byte - 1] = n;
- else
- q[byte] = n;
+ objc_string = true;
+ tok = get_nonpadding_token ();
}
- q += WCHAR_BYTES;
- }
- else
- {
- *q++ = c;
}
+ while (tok->type == CPP_STRING || tok->type == CPP_WSTRING);
+ strs = obstack_finish (&str_ob);
}
- /* Terminate the string value, either with a single byte zero
- or with a wide zero. */
+ /* We have read one more token than we want. */
+ _cpp_backup_tokens (parse_in, 1);
+
+ if (count > 1 && !objc_string && warn_traditional && !in_system_header)
+ warning ("traditional C rejects string constant concatenation");
- if (wide)
+ if (cpp_interpret_string (parse_in, strs, count, &istr, wide))
{
- memset (q, 0, WCHAR_BYTES);
- q += WCHAR_BYTES;
+ value = build_string (istr.len, (char *)istr.text);
+ free ((void *)istr.text);
}
else
{
- *q++ = '\0';
+ /* Callers cannot generally handle error_mark_node in this context,
+ so return the empty string instead. cpp_interpret_string has
+ issued an error. */
+ if (wide)
+ value = build_string (TYPE_PRECISION (wchar_type_node)
+ / TYPE_PRECISION (char_type_node),
+ "\0\0\0"); /* widest supported wchar_t
+ is 32 bits */
+ else
+ value = build_string (1, "");
}
- value = build_string (q - buf, buf);
+ TREE_TYPE (value) = wide ? wchar_array_type_node : char_array_type_node;
+ *valp = fix_string_type (value);
- if (wide)
- TREE_TYPE (value) = wchar_array_type_node;
- else
- TREE_TYPE (value) = char_array_type_node;
- return value;
+ if (strs != &str)
+ obstack_free (&str_ob, 0);
+
+ return objc_string ? CPP_OBJC_STRING : wide ? CPP_WSTRING : CPP_STRING;
}
/* Converts a (possibly wide) character constant token into a tree. */
diff --git a/gcc/c-opts.c b/gcc/c-opts.c
index b89b68467c7..52a7536140e 100644
--- a/gcc/c-opts.c
+++ b/gcc/c-opts.c
@@ -46,10 +46,6 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
# define TARGET_SYSTEM_ROOT NULL
#endif
-#ifndef TARGET_EBCDIC
-# define TARGET_EBCDIC 0
-#endif
-
static int saved_lineno;
/* CPP's options. */
@@ -143,6 +139,8 @@ missing_arg (enum opt_code code)
case OPT_fdump_:
case OPT_fname_mangling_version_:
case OPT_ftabstop_:
+ case OPT_fexec_charset_:
+ case OPT_fwide_exec_charset_:
case OPT_ftemplate_depth_:
case OPT_iprefix:
case OPT_iwithprefix:
@@ -892,6 +890,14 @@ c_common_handle_option (size_t scode, const char *arg, int value)
cpp_opts->tabstop = value;
break;
+ case OPT_fexec_charset_:
+ cpp_opts->narrow_charset = arg;
+ break;
+
+ case OPT_fwide_exec_charset_:
+ cpp_opts->wide_charset = arg;
+ break;
+
case OPT_ftemplate_depth_:
max_tinst_depth = value;
break;
@@ -1145,7 +1151,11 @@ c_common_init (void)
cpp_opts->int_precision = TYPE_PRECISION (integer_type_node);
cpp_opts->wchar_precision = TYPE_PRECISION (wchar_type_node);
cpp_opts->unsigned_wchar = TREE_UNSIGNED (wchar_type_node);
- cpp_opts->EBCDIC = TARGET_EBCDIC;
+ cpp_opts->bytes_big_endian = BYTES_BIG_ENDIAN;
+
+ /* This can't happen until after wchar_precision and bytes_big_endian
+ are known. */
+ cpp_init_iconv (parse_in);
if (flag_preprocess_only)
{
@@ -1571,6 +1581,12 @@ Switches:\n\
fputs (_("\
-f[no-]preprocessed Treat the input file as already preprocessed\n\
-ftabstop=<number> Distance between tab stops for column reporting\n\
+ -ftarget-charset=<c> Convert all strings and character constants\n\
+ to character set <c>\n\
+ -ftarget-wide-charset=<c> Convert all wide strings and character constants\n\
+ to character set <c>\n\
+"), stdout);
+ fputs (_("\
-isysroot <dir> Set <dir> to be the system root directory\n\
-P Do not generate #line directives\n\
-remap Remap file names when including files\n\
diff --git a/gcc/c-parse.in b/gcc/c-parse.in
index d5750138487..b62f2ff7294 100644
--- a/gcc/c-parse.in
+++ b/gcc/c-parse.in
@@ -151,9 +151,7 @@ do { \
%token ATTRIBUTE EXTENSION LABEL
%token REALPART IMAGPART VA_ARG CHOOSE_EXPR TYPES_COMPATIBLE_P
%token PTR_VALUE PTR_BASE PTR_EXTENT
-
-/* function name can be a string const or a var decl. */
-%token STRING_FUNC_NAME VAR_FUNC_NAME
+%token FUNC_NAME
/* Add precedence rules to solve dangling else s/r conflict */
%nonassoc IF
@@ -183,6 +181,7 @@ do { \
Objective C, so that the token codes are the same in both. */
%token INTERFACE IMPLEMENTATION END SELECTOR DEFS ENCODE
%token CLASSNAME PUBLIC PRIVATE PROTECTED PROTOCOL OBJECTNAME CLASS ALIAS
+%token OBJC_STRING
%type <code> unop
%type <ttype> ENUM STRUCT UNION IF ELSE WHILE DO FOR SWITCH CASE DEFAULT
@@ -249,9 +248,9 @@ ifobjc
%type <ttype> keywordexpr keywordarglist keywordarg
%type <ttype> myparms myparm optparmlist reservedwords objcselectorexpr
%type <ttype> selectorarg keywordnamelist keywordname objcencodeexpr
-%type <ttype> objc_string non_empty_protocolrefs protocolrefs identifier_list objcprotocolexpr
+%type <ttype> non_empty_protocolrefs protocolrefs identifier_list objcprotocolexpr
-%type <ttype> CLASSNAME OBJECTNAME
+%type <ttype> CLASSNAME OBJECTNAME OBJC_STRING
end ifobjc
%{
@@ -340,7 +339,6 @@ static bool parsing_iso_function_signature;
static void yyprint PARAMS ((FILE *, int, YYSTYPE));
static void yyerror PARAMS ((const char *));
static int yylexname PARAMS ((void));
-static int yylexstring PARAMS ((void));
static inline int _yylex PARAMS ((void));
static int yylex PARAMS ((void));
static void init_reswords PARAMS ((void));
@@ -657,8 +655,7 @@ primary:
}
| CONSTANT
| STRING
- { $$ = fix_string_type ($$); }
- | VAR_FUNC_NAME
+ | FUNC_NAME
{ $$ = fname_decl (C_RID_CODE ($$), $$); }
| '(' typename ')' '{'
{ start_init (NULL_TREE, NULL, 0);
@@ -763,22 +760,11 @@ ifobjc
{ $$ = build_protocol_expr ($1); }
| objcencodeexpr
{ $$ = build_encode_expr ($1); }
- | objc_string
+ | OBJC_STRING
{ $$ = build_objc_string_object ($1); }
end ifobjc
;
-ifobjc
-/* Produces an STRING_CST with perhaps more STRING_CSTs chained
- onto it, which is to be read as an ObjC string object. */
-objc_string:
- '@' STRING
- { $$ = $2; }
- | objc_string '@' STRING
- { $$ = chainon ($1, $3); }
- ;
-end ifobjc
-
old_style_parm_decls:
old_style_parm_decls_1
{
@@ -3494,9 +3480,9 @@ static const short rid_to_yy[RID_MAX] =
/* RID_CHOOSE_EXPR */ CHOOSE_EXPR,
/* RID_TYPES_COMPATIBLE_P */ TYPES_COMPATIBLE_P,
- /* RID_FUNCTION_NAME */ STRING_FUNC_NAME,
- /* RID_PRETTY_FUNCTION_NAME */ STRING_FUNC_NAME,
- /* RID_C99_FUNCTION_NAME */ VAR_FUNC_NAME,
+ /* RID_FUNCTION_NAME */ FUNC_NAME,
+ /* RID_PRETTY_FUNCTION_NAME */ FUNC_NAME,
+ /* RID_C99_FUNCTION_NAME */ FUNC_NAME,
/* C++ */
/* RID_BOOL */ TYPESPEC,
@@ -3627,22 +3613,9 @@ ifobjc
&& (!OBJC_IS_PQ_KEYWORD (rid_code) || objc_pq_context))
end ifobjc
{
- int yycode = rid_to_yy[(int) rid_code];
- if (yycode == STRING_FUNC_NAME)
- {
- /* __FUNCTION__ and __PRETTY_FUNCTION__ get converted
- to string constants. */
- const char *name = fname_string (rid_code);
-
- yylval.ttype = build_string (strlen (name) + 1, name);
- C_ARTIFICIAL_STRING_P (yylval.ttype) = 1;
- last_token = CPP_STRING; /* so yyerror won't choke */
- return STRING;
- }
-
/* Return the canonical spelling for this keyword. */
yylval.ttype = ridpointers[(int) rid_code];
- return yycode;
+ return rid_to_yy[(int) rid_code];
}
}
@@ -3671,57 +3644,6 @@ end ifobjc
return IDENTIFIER;
}
-/* Concatenate strings before returning them to the parser. This isn't quite
- as good as having it done in the lexer, but it's better than nothing. */
-
-static int
-yylexstring ()
-{
- enum cpp_ttype next_type;
- tree orig = yylval.ttype;
-
- next_type = c_lex (&yylval.ttype);
- if (next_type == CPP_STRING
- || next_type == CPP_WSTRING
- || (next_type == CPP_NAME && yylexname () == STRING))
- {
- varray_type strings;
-
-ifc
- static location_t last_location;
- if (warn_traditional && !in_system_header
- && (input_location.line != last_location.line
- || !last_location.file ||
- strcmp (last_location.file, input_location.file)))
- {
- warning ("traditional C rejects string concatenation");
- last_location = input_location;
- }
-end ifc
-
- VARRAY_TREE_INIT (strings, 32, "strings");
- VARRAY_PUSH_TREE (strings, orig);
-
- do
- {
- VARRAY_PUSH_TREE (strings, yylval.ttype);
- next_type = c_lex (&yylval.ttype);
- }
- while (next_type == CPP_STRING
- || next_type == CPP_WSTRING
- || (next_type == CPP_NAME && yylexname () == STRING));
-
- yylval.ttype = combine_strings (strings);
- }
- else
- yylval.ttype = orig;
-
- /* We will have always read one token too many. */
- _cpp_backup_tokens (parse_in, 1);
-
- return STRING;
-}
-
static inline int
_yylex ()
{
@@ -3787,13 +3709,11 @@ _yylex ()
return 0;
case CPP_NAME:
- {
- int ret = yylexname ();
- if (ret == STRING)
- return yylexstring ();
- else
- return ret;
- }
+ return yylexname ();
+
+ case CPP_AT_NAME:
+ /* This only happens in Objective-C; it must be a keyword. */
+ return rid_to_yy [(int) C_RID_CODE (yylval.ttype)];
case CPP_NUMBER:
case CPP_CHAR:
@@ -3802,30 +3722,10 @@ _yylex ()
case CPP_STRING:
case CPP_WSTRING:
- return yylexstring ();
-
- /* This token is Objective-C specific. It gives the next token
- special significance. */
- case CPP_ATSIGN:
-ifobjc
- {
- tree after_at;
- enum cpp_ttype after_at_type;
-
- after_at_type = c_lex (&after_at);
-
- if (after_at_type == CPP_NAME
- && C_IS_RESERVED_WORD (after_at)
- && OBJC_IS_AT_KEYWORD (C_RID_CODE (after_at)))
- {
- yylval.ttype = after_at;
- last_token = after_at_type;
- return rid_to_yy [(int) C_RID_CODE (after_at)];
- }
- _cpp_backup_tokens (parse_in, 1);
- return '@';
- }
-end ifobjc
+ return STRING;
+
+ case CPP_OBJC_STRING:
+ return OBJC_STRING;
/* These tokens are C++ specific (and will not be generated
in C mode, but let's be cautious). */
diff --git a/gcc/c.opt b/gcc/c.opt
index 15c344b887e..e8f61df9c9a 100644
--- a/gcc/c.opt
+++ b/gcc/c.opt
@@ -368,6 +368,9 @@ C++ ObjC++
fenum-int-equiv
C++ ObjC++
+fexec-charset=
+C ObjC C++ ObjC++ Joined RejectNegative
+
fexternal-templates
C++ ObjC++
@@ -509,6 +512,9 @@ C++ ObjC++
fweak
C++ ObjC++
+fwide-exec-charset=
+C ObjC C++ ObjC++ Joined RejectNegative
+
fxref
C++ ObjC++
diff --git a/gcc/cp/ChangeLog b/gcc/cp/ChangeLog
index 076dcbd300e..8776631ee27 100644
--- a/gcc/cp/ChangeLog
+++ b/gcc/cp/ChangeLog
@@ -1,3 +1,8 @@
+2003-07-04 Zack Weinberg <zack@codesourcery.com>
+
+ * parser.c (cp_lexer_read_token): No need to handle string
+ constant concatenation.
+
2003-07-03 Kaveh R. Ghazi <ghazi@caip.rutgers.edu>
* cp-tree.h (GCC_DIAG_STYLE, ATTRIBUTE_GCC_CXXDIAG): Define.
@@ -51,7 +56,7 @@
(convert_for_initialization): Likewise.
* typeck2.c (build_x_arrow): Likewise.
(build_m_component_ref): Simplify.
-
+
* call.c (build_scoped_method_call): Use convert_to_void.
(build_method_call): Likewise.
* class.c (check_field_decls): Remove dead code.
@@ -63,7 +68,7 @@
(build_vec_delete_1): Use convert_to_void.
* mangle.c (write_type): Avoid relying on POINTER_TYPE over OFFSET_TYPE
as pointer-to-member representation.
-
+
2003-07-03 Nathan Sidwell <nathan@codesourcery.com>
PR c++/9162
@@ -145,10 +150,10 @@ Wed Jul 2 00:36:48 CEST 2003 Jan Hubicka <jh@suse.cz>
is a class type.
2003-07-01 Giovanni Bajo <giovannibajo@libero.it>
-
- PR c++/8046
- * error.c (dump_decl): Handle BIT_NOT_EXPR as
- pseudo destructor calls.
+
+ PR c++/8046
+ * error.c (dump_decl): Handle BIT_NOT_EXPR as
+ pseudo destructor calls.
2003-07-01 Nathan Sidwell <nathan@codesourcery.com>
@@ -176,28 +181,28 @@ Wed Jul 2 00:36:48 CEST 2003 Jan Hubicka <jh@suse.cz>
2003-06-30 Giovanni Bajo <giovannibajo@libero.it>
- PR c++/4933
- * error.c (dump_expr): Support correctly the COMPOUND_EXPR
- tree generated within a template. Use dump_expr to dump an
- expression sizeof.
+ PR c++/4933
+ * error.c (dump_expr): Support correctly the COMPOUND_EXPR
+ tree generated within a template. Use dump_expr to dump an
+ expression sizeof.
2003-06-30 Giovanni Bajo <giovannibajo@libero.it>
- * mangle.c (write_expression): Exit gracefully when trying to
- mangle a CALL_EXPR.
+ * mangle.c (write_expression): Exit gracefully when trying to
+ mangle a CALL_EXPR.
2003-06-30 Giovanni Bajo <giovannibajo@libero.it>
- PR c++/10750
- * parser.c (cp_parser_primary_expression): A VAR_DECL with a
- (value- or type-) dependent expression as DECL_INITIAL is a
- valid constant-expression (at parser time).
+ PR c++/10750
+ * parser.c (cp_parser_primary_expression): A VAR_DECL with a
+ (value- or type-) dependent expression as DECL_INITIAL is a
+ valid constant-expression (at parser time).
2003-06-30 Giovanni Bajo <giovannibajo@libero.it>
- PR c++/11106
- * error.c (dump_decl): Call dump_decl to dump the DECL_NAME for a
- USING_DECL, instead of print_tree_identifier.
+ PR c++/11106
+ * error.c (dump_decl): Call dump_decl to dump the DECL_NAME for a
+ USING_DECL, instead of print_tree_identifier.
2003-06-29 Gabriel Dos Reis <gdr@integrable-solutions.net>
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index d19e40383e5..1f484663e2c 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -479,66 +479,22 @@ cp_lexer_read_token (cp_lexer* lexer)
/* Increment LAST_TOKEN. */
lexer->last_token = cp_lexer_next_token (lexer, token);
- /* The preprocessor does not yet do translation phase six, i.e., the
- combination of adjacent string literals. Therefore, we do it
- here. */
- if (token->type == CPP_STRING || token->type == CPP_WSTRING)
- {
- ptrdiff_t delta;
- int i;
-
- /* When we grow the buffer, we may invalidate TOKEN. So, save
- the distance from the beginning of the BUFFER so that we can
- recaulate it. */
- delta = cp_lexer_token_difference (lexer, lexer->buffer, token);
- /* Make sure there is room in the buffer for another token. */
- cp_lexer_maybe_grow_buffer (lexer);
- /* Restore TOKEN. */
- token = lexer->buffer;
- for (i = 0; i < delta; ++i)
- token = cp_lexer_next_token (lexer, token);
-
- VARRAY_PUSH_TREE (lexer->string_tokens, token->value);
- while (true)
- {
- /* Read the token after TOKEN. */
- cp_lexer_get_preprocessor_token (lexer, lexer->last_token);
- /* See whether it's another string constant. */
- if (lexer->last_token->type != token->type)
- {
- /* If not, then it will be the next real token. */
- lexer->last_token = cp_lexer_next_token (lexer,
- lexer->last_token);
- break;
- }
-
- /* Chain the strings together. */
- VARRAY_PUSH_TREE (lexer->string_tokens,
- lexer->last_token->value);
- }
-
- /* Create a single STRING_CST. Curiously we have to call
- combine_strings even if there is only a single string in
- order to get the type set correctly. */
- token->value = combine_strings (lexer->string_tokens);
- VARRAY_CLEAR (lexer->string_tokens);
- token->value = fix_string_type (token->value);
- /* Strings should have type `const char []'. Right now, we will
- have an ARRAY_TYPE that is constant rather than an array of
- constant elements. */
- if (flag_const_strings)
- {
- tree type;
+ /* Strings should have type `const char []'. Right now, we will
+ have an ARRAY_TYPE that is constant rather than an array of
+ constant elements.
+ FIXME: Make fix_string_type get this right in the first place. */
+ if ((token->type == CPP_STRING || token->type == CPP_WSTRING)
+ && flag_const_strings)
+ {
+ tree type;
- /* Get the current type. It will be an ARRAY_TYPE. */
- type = TREE_TYPE (token->value);
- /* Use build_cplus_array_type to rebuild the array, thereby
- getting the right type. */
- type = build_cplus_array_type (TREE_TYPE (type),
- TYPE_DOMAIN (type));
- /* Reset the type of the token. */
- TREE_TYPE (token->value) = type;
- }
+ /* Get the current type. It will be an ARRAY_TYPE. */
+ type = TREE_TYPE (token->value);
+ /* Use build_cplus_array_type to rebuild the array, thereby
+ getting the right type. */
+ type = build_cplus_array_type (TREE_TYPE (type), TYPE_DOMAIN (type));
+ /* Reset the type of the token. */
+ TREE_TYPE (token->value) = type;
}
return token;
diff --git a/gcc/cppcharset.c b/gcc/cppcharset.c
index f506ba2bc1b..0ba7e930ab0 100644
--- a/gcc/cppcharset.c
+++ b/gcc/cppcharset.c
@@ -24,8 +24,278 @@ Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
#include "tm.h"
#include "cpplib.h"
#include "cpphash.h"
+#include "cppucnid.h"
+
+/* Character set handling for C-family languages.
+
+ Terminological note: In what follows, "charset" or "character set"
+ will be taken to mean both an abstract set of characters and an
+ encoding for that set.
+
+ The C99 standard discusses two character sets: source and execution.
+ The source character set is used for internal processing in translation
+ phases 1 through 4; the execution character set is used thereafter.
+ Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
+ character encodings (see 3.7.2, 3.7.3 for the standardese meanings
+ of these terms). Furthermore, the "basic character set" (listed in
+ 5.2.1p3) is to be encoded in each with values one byte wide, and is
+ to appear in the initial shift state.
+
+ It is not explicitly mentioned, but there is also a "wide execution
+ character set" used to encode wide character constants and wide
+ string literals; this is supposed to be the result of applying the
+ standard library function mbstowcs() to an equivalent narrow string
+ (6.4.5p5). However, the behavior of hexadecimal and octal
+ \-escapes is at odds with this; they are supposed to be translated
+ directly to wchar_t values (6.4.4.4p5,6).
+
+ The source character set is not necessarily the character set used
+ to encode physical source files on disk; translation phase 1 converts
+ from whatever that encoding is to the source character set.
+
+ The presence of universal character names in C99 (6.4.3 et seq.)
+ forces the source character set to be isomorphic to ISO 10646,
+ that is, Unicode. There is no such constraint on the execution
+ character set; note also that the conversion from source to
+ execution character set does not occur for identifiers (5.1.1.2p1#5).
+
+ For convenience of implementation, the source character set's
+ encoding of the basic character set should be identical to the
+ execution character set OF THE HOST SYSTEM's encoding of the basic
+ character set, and it should not be a state-dependent encoding.
+
+ cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
+ depending on whether the host is based on ASCII or EBCDIC (see
+ respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
+ Technical Report #16). It relies on the system library's iconv()
+ primitive to do charset conversion (specified in SUSv2). If this
+ primitive is not present, the source and execution character sets
+ must be identical and are limited to the basic ASCII or EBCDIC
+ range, and wide characters are implemented by padding narrow
+ characters to the size of wchar_t. */
+
+#if !HAVE_ICONV
+/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
+ below, which are guarded only by if statements with compile-time
+ constant conditions, do not cause link errors. Each stub yields the
+ type of the function it stands in for: iconv_open() gives an iconv_t,
+ whereas iconv() gives a size_t, which convert_cset compares against
+ (size_t)-1. */
+#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
+#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
+#define iconv_close(x) 0
+#endif
+
+/* Name of the source character set, in the form passed to iconv_open()
+ as the conversion source: UTF-8 on ASCII-based hosts, UTF-EBCDIC on
+ EBCDIC-based hosts. Any other host charset is a build error. */
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+#define SOURCE_CHARSET "UTF-8"
+#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
+#define SOURCE_CHARSET "UTF-EBCDIC"
+#else
+#error "Unrecognized basic host character set"
+#endif
+
+/* This structure is used for a resizable string buffer, mostly by
+ convert_cset and cpp_interpret_string. */
+struct strbuf
+{
+ uchar *text; /* Storage; enlarged with xrealloc as needed. */
+ size_t asize; /* Number of bytes currently allocated for TEXT. */
+ size_t len; /* Number of bytes of TEXT already filled in. */
+};
+
+/* Increment by which output buffers are grown. One block is enough
+ to hold a 64-byte string even if iconv quadruples its size (e.g.
+ conversion from ASCII to UCS-4); longer output simply costs
+ additional reallocations. */
+#define OUTBUF_BLOCK_SIZE 256
+
+/* Subroutine of cpp_init_iconv: initialize and return an iconv
+ descriptor for conversion from FROM to TO. If iconv_open() fails,
+ issue an error and return (iconv_t) -1. Silently return
+ (iconv_t) -1 if FROM and TO are identical. Note that the
+ parameters are ordered (TO, FROM), the same order iconv_open()
+ takes them in. */
+static iconv_t
+init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
+{
+ iconv_t dsc;
+
+ if (!strcmp (to, from)) /* Identical names: no descriptor is opened. */
+ return (iconv_t) -1;
+
+ dsc = iconv_open (to, from);
+ if (dsc == (iconv_t) -1)
+ {
+ if (errno == EINVAL) /* Reported as an unsupported conversion pair. */
+ cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
+ "conversion from %s to %s not supported by iconv",
+ from, to);
+ else
+ cpp_errno (pfile, DL_ERROR, "iconv_open");
+ }
+ return dsc;
+}
+
+/* If charset conversion is requested, initialize iconv(3) descriptors
+ for conversion from the source character set to the execution
+ character sets. If iconv is not present in the C library, and
+ conversion is requested, issue an error.
+
+ With iconv available, an unset narrow charset defaults to the source
+ charset, and an unset wide charset to UCS-4 or UCS-2 (selected by
+ wchar_precision) in the byte order given by bytes_big_endian. */
+
+void
+cpp_init_iconv (cpp_reader *pfile)
+{
+ const char *ncset = CPP_OPTION (pfile, narrow_charset);
+ const char *wcset = CPP_OPTION (pfile, wide_charset);
+ const char *default_wcset;
+
+ bool be = CPP_OPTION (pfile, bytes_big_endian);
+
+ if (CPP_OPTION (pfile, wchar_precision) >= 32)
+ default_wcset = be ? "UCS-4BE" : "UCS-4LE";
+ else if (CPP_OPTION (pfile, wchar_precision) >= 16)
+ default_wcset = be ? "UCS-2BE" : "UCS-2LE";
+ else
+ /* This effectively means that wide strings are not supported,
+ so don't do any conversion at all. */
+ default_wcset = SOURCE_CHARSET;
+
+ if (!HAVE_ICONV) /* Only diagnose; no descriptor is assigned on this path. */
+ {
+ if (ncset && strcmp (ncset, SOURCE_CHARSET))
+ cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
+ "no iconv implementation, cannot convert to %s", ncset);
+
+ if (wcset && strcmp (wcset, default_wcset))
+ cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
+ "no iconv implementation, cannot convert to %s", wcset);
+ }
+ else
+ {
+ if (!ncset)
+ ncset = SOURCE_CHARSET;
+ if (!wcset)
+ wcset = default_wcset;
+
+ pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
+ pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
+ }
+}
+
+/* Close the narrow and wide iconv descriptors held in PFILE, skipping
+ any that is (iconv_t) -1. Does nothing when iconv is unavailable. */
+void
+_cpp_destroy_iconv (cpp_reader *pfile)
+{
+ if (HAVE_ICONV)
+ {
+ if (pfile->narrow_cset_desc != (iconv_t) -1)
+ iconv_close (pfile->narrow_cset_desc);
+ if (pfile->wide_cset_desc != (iconv_t) -1)
+ iconv_close (pfile->wide_cset_desc);
+ }
+}
+
+/* iconv(3) utility wrapper. Convert the string FROM, of length FLEN,
+ according to the iconv descriptor CD. The result is appended to
+ the string buffer TO. If CD is (iconv_t)-1 or iconv is not
+ available, the string is simply copied into TO.
+
+ Returns true on success, false on error. */
+
+static bool
+convert_cset (iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
+{
+ if (!HAVE_ICONV || cd == (iconv_t)-1)
+ {
+ if (to->len + flen > to->asize) /* Grow to exactly the size needed. */
+ {
+ to->asize = to->len + flen;
+ to->text = xrealloc (to->text, to->asize);
+ }
+ memcpy (to->text + to->len, from, flen);
+ to->len += flen;
+ return true;
+ }
+ else
+ {
+ char *inbuf, *outbuf;
+ size_t inbytesleft, outbytesleft;
+
+ /* Reset conversion descriptor and check that it is valid. */
+ if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
+ return false;
+
+ inbuf = (char *)from;
+ inbytesleft = flen;
+ outbuf = (char *)to->text + to->len;
+ outbytesleft = to->asize - to->len;
+
+ for (;;)
+ {
+ iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+ if (__builtin_expect (inbytesleft == 0, 1))
+ {
+ to->len = to->asize - outbytesleft; /* All but the unused tail is filled. */
+ return true;
+ }
+ if (errno != E2BIG) /* Only "output buffer full" is recoverable. */
+ return false;
+
+ outbytesleft += OUTBUF_BLOCK_SIZE; /* Add one block of room and retry. */
+ to->asize += OUTBUF_BLOCK_SIZE;
+ to->text = xrealloc (to->text, to->asize);
+ outbuf = (char *)to->text + to->asize - outbytesleft; /* Recomputed from the
+ pointer xrealloc returned. */
+ }
+ }
+}
+
+/* Utility routine that computes a mask of the form 0000...111... with
+ WIDTH 1-bits. WIDTH is first clamped to BITS_PER_CPPCHAR_T. A
+ width that fills a whole size_t yields all ones directly, so that
+ the shift below is never by the full width of the type. */
+static inline size_t
+width_to_mask (size_t width)
+{
+ width = MIN (width, BITS_PER_CPPCHAR_T);
+ if (width >= CHAR_BIT * sizeof (size_t))
+ return ~(size_t) 0;
+ else
+ return ((size_t) 1 << width) - 1;
+}
+
+
+
+/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
+ the start of an identifier, and 0 if C is not valid in an
+ identifier. We assume C has already gone through the checks of
+ _cpp_valid_ucn. The algorithm is a simple binary search on the
+ table defined in cppucnid.h. */
+
+static int
+ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
+{
+ int mn, mx, md;
+
+ mn = -1; /* MN and MX are exclusive bounds on the candidate index. */
+ mx = ARRAY_SIZE (ucnranges);
+ while (mx - mn > 1)
+ {
+ md = (mn + mx) / 2;
+ if (c < ucnranges[md].lo)
+ mx = md;
+ else if (c > ucnranges[md].hi)
+ mn = md;
+ else
+ goto found; /* C lies within [lo, hi] of entry MD. */
+ }
+ return 0;
-static int ucn_valid_in_identifier (cpp_reader *, cppchar_t);
+ found:
+ /* When -pedantic, we require the character to have been listed by
+ the standard for the current language. Otherwise, we accept the
+ union of the acceptable sets for C++98 and C99. */
+ if (CPP_PEDANTIC (pfile)
+ && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
+ || (CPP_OPTION (pfile, cplusplus)
+ && !(ucnranges[md].flags & CXX))))
+ return 0;
+
+ /* In C99, UCN digits may not begin identifiers. */
+ if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
+ return 2;
+
+ return 1;
+}
/* [lex.charset]: The character designated by the universal character
name \UNNNNNNNN is that character whose character short name in
@@ -52,20 +322,21 @@ static int ucn_valid_in_identifier (cpp_reader *, cppchar_t);
*/
cppchar_t
-_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, int identifier_pos)
+_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
+ const uchar *limit, int identifier_pos)
{
cppchar_t result, c;
unsigned int length;
const uchar *str = *pstr;
const uchar *base = str - 2;
- /* Only attempt to interpret a UCS for C++ and C99. */
if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
- return 0;
-
- /* We don't accept UCNs for an EBCDIC target. */
- if (CPP_OPTION (pfile, EBCDIC))
- return 0;
+ cpp_error (pfile, DL_WARNING,
+ "universal character names are only valid in C++ and C99");
+ else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
+ cpp_error (pfile, DL_WARNING,
+ "the meaning of '\\%c' is different in traditional C",
+ (int) str[-1]);
if (str[-1] == 'u')
length = 4;
@@ -83,13 +354,16 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, int identifier_pos)
str++;
result = (result << 4) + hex_value (c);
}
- while (--length);
+ while (--length && str < limit);
*pstr = str;
if (length)
- /* We'll error when we try it out as the start of an identifier. */
- cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s",
- (int) (str - base), base);
+ {
+ /* We'll error when we try it out as the start of an identifier. */
+ cpp_error (pfile, DL_ERROR, "incomplete universal character name %.*s",
+ (int) (str - base), base);
+ result = 1;
+ }
/* The standard permits $, @ and ` to be specified as UCNs. We use
hex escapes so that this also works with EBCDIC hosts. */
else if ((result < 0xa0
@@ -99,6 +373,7 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, int identifier_pos)
{
cpp_error (pfile, DL_ERROR, "%.*s is not a valid universal character",
(int) (str - base), base);
+ result = 1;
}
else if (identifier_pos)
{
@@ -113,6 +388,15 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, int identifier_pos)
"universal character %.*s is not valid at the start of an identifier",
(int) (str - base), base);
}
+ /* We don't accept UCNs if iconv is not available or will not
+ convert to the target wide character set. */
+ else if (!HAVE_ICONV || pfile->wide_cset_desc == (iconv_t) -1)
+ {
+ /* XXX should be DL_SORRY */
+ cpp_error (pfile, DL_ERROR,
+ "universal character names are not supported in this configuration");
+ }
+
if (result == 0)
result = 1;
@@ -120,467 +404,487 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, int identifier_pos)
return result;
}
-/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
- the start of an identifier, and 0 if C is not valid in an
- identifier. We assume C has already gone through the checks of
- _cpp_valid_ucn. */
-static int
-ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
+/* Convert a UCN, pointed to by FROM, to UTF-8 encoding, then translate
+ it to the execution character set and write the result into TBUF.
+ An advanced pointer is returned. Issues all relevant diagnostics.
+ FROM points at the 'u' or 'U'; scanning does not go past LIMIT.
+ WIDE selects the wide rather than the narrow execution charset.
+
+ UTF-8 encoding looks like this:
+
+ value range encoded as
+ 00000000-0000007F 0xxxxxxx
+ 00000080-000007FF 110xxxxx 10xxxxxx
+ 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
+ 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+ Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
+ which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
+ never occur. Note also that any value that can be encoded by a
+ given row of the table can also be encoded by all successive rows,
+ but this is not done; only the shortest possible encoding for any
+ given value is valid. For instance, the character 07C0 could be
+ encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
+ FC 80 80 80 9F 80. Only the first is valid. */
+
+static const uchar *
+convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
+ struct strbuf *tbuf, bool wide)
 {
- /* None of the valid chars are outside the Basic Multilingual Plane (the
- low 16 bits). */
- if (c > 0xffff)
- return 0;
+ int nbytes;
+ uchar buf[6], *p = &buf[6];
+ /* Marker bits of the leading byte of an N-byte sequence, at index N-1. */
+ static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+ /* Bits that must all be clear in the value left over for the leading
+ byte of an N-byte sequence, at index N-1. If any is set the value
+ does not fit beside the marker bits and another continuation byte
+ is required. */
+ static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
+ cppchar_t ucn;
+
+ from++; /* skip u/U */
+ ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
+ if (!ucn)
+ return from;
+
+ /* The sequence is built backwards from the end of BUF: continuation
+ bytes first, six payload bits at a time, then the leading byte. */
+ nbytes = 1;
+ if (ucn < 0x80)
+ *--p = ucn;
+ else
+ {
+ do
+ {
+ *--p = ((ucn & 0x3F) | 0x80);
+ ucn >>= 6;
+ nbytes++;
+ }
+ while (ucn >= 0x3F || (ucn & limits[nbytes-1]));
+ *--p = (ucn | masks[nbytes-1]);
+ }
+
+ if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
+ p, nbytes, tbuf))
+ cpp_errno (pfile, DL_ERROR, "converting UCN to execution character set");
+
+ return from;
+}
- if (CPP_OPTION (pfile, c99) || !CPP_PEDANTIC (pfile))
+static void
+emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
+ struct strbuf *tbuf, bool wide)
+{
+ if (wide)
{
- /* Latin. */
- if (c == 0x0aa || c == 0x00ba || c == 0x207f || c == 0x1e9b)
- return 1;
-
- /* Greek. */
- if (c == 0x0386)
- return 1;
-
- /* Cyrillic. */
- if (c == 0x040c)
- return 1;
-
- /* Hebrew. */
- if ((c >= 0x05b0 && c <= 0x05b9)
- || (c >= 0x05bb && c <= 0x005bd)
- || c == 0x05bf
- || (c >= 0x05c1 && c <= 0x05c2))
- return 1;
-
- /* Arabic. */
- if ((c >= 0x06d0 && c <= 0x06dc)
- || c == 0x06e8
- || (c >= 0x06ea && c <= 0x06ed))
- return 1;
-
- /* Devanagari */
- if ((c >= 0x0901 && c <= 0x0903)
- || (c >= 0x093e && c <= 0x094d)
- || (c >= 0x0950 && c <= 0x0952)
- || c == 0x0963)
- return 1;
-
- /* Bengali */
- if ((c >= 0x0981 && c <= 0x0983)
- || (c >= 0x09be && c <= 0x09c4)
- || (c >= 0x09c7 && c <= 0x09c8)
- || (c >= 0x09cb && c <= 0x09cd)
- || (c >= 0x09e2 && c <= 0x09e3))
- return 1;
-
- /* Gurmukhi */
- if (c == 0x0a02
- || (c >= 0x0a3e && c <= 0x0a42)
- || (c >= 0x0a47 && c <= 0x0a48)
- || (c >= 0x0a4b && c <= 0x0a4d)
- || (c == 0x0a74))
- return 1;
-
- /* Gujarati */
- if ((c >= 0x0a81 && c <= 0x0a83)
- || (c >= 0x0abd && c <= 0x0ac5)
- || (c >= 0x0ac7 && c <= 0x0ac9)
- || (c >= 0x0acb && c <= 0x0acd)
- || (c == 0x0ad0))
- return 1;
-
- /* Oriya */
- if ((c >= 0x0b01 && c <= 0x0b03)
- || (c >= 0x0b3e && c <= 0x0b43)
- || (c >= 0x0b47 && c <= 0x0b48)
- || (c >= 0x0b4b && c <= 0x0b4d))
- return 1;
-
- /* Tamil */
- if ((c >= 0x0b82 && c <= 0x0b83)
- || (c >= 0x0bbe && c <= 0x0bc2)
- || (c >= 0x0bc6 && c <= 0x0bc8)
- || (c >= 0x0bc8 && c <= 0x0bcd))
- return 1;
-
- /* Telugu */
- if ((c >= 0x0c01 && c <= 0x0c03)
- || (c >= 0x0c3e && c <= 0x0c44)
- || (c >= 0x0c46 && c <= 0x0c48)
- || (c >= 0x0c4a && c <= 0x0c4d))
- return 1;
-
- /* Kannada */
- if ((c >= 0x0c82 && c <= 0x0c83)
- || (c >= 0x0cbe && c <= 0x0cc4)
- || (c >= 0x0cc6 && c <= 0x0cc8)
- || (c >= 0x0cca && c <= 0x0ccd)
- || c == 0x0cde)
- return 1;
-
- /* Malayalam */
- if ((c >= 0x0d02 && c <= 0x0d03)
- || (c >= 0x0d3e && c <= 0x0d43)
- || (c >= 0x0d46 && c <= 0x0d48)
- || (c >= 0x0d4a && c <= 0x0d4d))
- return 1;
-
- /* Thai */
- if ((c >= 0x0e01 && c <= 0x0e3a)
- || (c >= 0x0e40 && c <= 0x0e5b))
- return 1;
-
- /* Lao */
- if ((c >= 0x0ead && c <= 0x0eae)
- || (c >= 0x0eb0 && c <= 0x0eb9)
- || (c >= 0x0ebb && c <= 0x0ebd)
- || (c >= 0x0ec0 && c <= 0x0ec4)
- || c == 0x0ec6
- || (c >= 0x0ec8 && c <= 0x0ecd)
- || (c >= 0x0edc && c <= 0x0ed))
- return 1;
-
- /* Tibetan. */
- if (c == 0x0f00
- || (c >= 0x0f18 && c <= 0x0f19)
- || c == 0x0f35
- || c == 0x0f37
- || c == 0x0f39
- || (c >= 0x0f3e && c <= 0x0f47)
- || (c >= 0x0f49 && c <= 0x0f69)
- || (c >= 0x0f71 && c <= 0x0f84)
- || (c >= 0x0f86 && c <= 0x0f8b)
- || (c >= 0x0f90 && c <= 0x0f95)
- || c == 0x0f97
- || (c >= 0x0f99 && c <= 0x0fad)
- || (c >= 0x0fb1 && c <= 0x0fb7)
- || c == 0x0fb9)
- return 1;
-
- /* Katakana */
- if ((c >= 0x30a1 && c <= 0x30f6)
- || (c >= 0x30fb && c <= 0x30fc))
- return 1;
-
- /* CJK Unified Ideographs. */
- if (c >= 0x4e00 && c <= 0x9fa5)
- return 1;
-
- /* Hangul. */
- if (c >= 0xac00 && c <= 0xd7a3)
- return 1;
-
- /* Digits. */
- if ((c >= 0x0660 && c <= 0x0669)
- || (c >= 0x06f0 && c <= 0x06f9)
- || (c >= 0x0966 && c <= 0x096f)
- || (c >= 0x09e6 && c <= 0x09ef)
- || (c >= 0x0a66 && c <= 0x0a6f)
- || (c >= 0x0ae6 && c <= 0x0aef)
- || (c >= 0x0b66 && c <= 0x0b6f)
- || (c >= 0x0be7 && c <= 0x0bef)
- || (c >= 0x0c66 && c <= 0x0c6f)
- || (c >= 0x0ce6 && c <= 0x0cef)
- || (c >= 0x0d66 && c <= 0x0d6f)
- || (c >= 0x0e50 && c <= 0x0e59)
- || (c >= 0x0ed0 && c <= 0x0ed9)
- || (c >= 0x0f20 && c <= 0x0f33))
- return 2;
-
- /* Special characters. */
- if (c == 0x00b5
- || c == 0x00b7
- || (c >= 0x02b0 && c <= 0x02b8)
- || c == 0x02bb
- || (c >= 0x02bd && c <= 0x02c1)
- || (c >= 0x02d0 && c <= 0x02d1)
- || (c >= 0x02e0 && c <= 0x02e4)
- || c == 0x037a
- || c == 0x0559
- || c == 0x093d
- || c == 0x0b3d
- || c == 0x1fbe
- || (c >= 0x203f && c <= 0x2040)
- || c == 0x2102
- || c == 0x2107
- || (c >= 0x210a && c <= 0x2113)
- || c == 0x2115
- || (c >= 0x2118 && c <= 0x211d)
- || c == 0x2124
- || c == 0x2126
- || c == 0x2128
- || (c >= 0x212a && c <= 0x2131)
- || (c >= 0x2133 && c <= 0x2138)
- || (c >= 0x2160 && c <= 0x2182)
- || (c >= 0x3005 && c <= 0x3007)
- || (c >= 0x3021 && c <= 0x3029))
- return 1;
+ /* We have to render this into the target byte order, which may not
+ be our byte order. */
+ bool bigend = CPP_OPTION (pfile, bytes_big_endian);
+ size_t width = CPP_OPTION (pfile, wchar_precision);
+ size_t cwidth = CPP_OPTION (pfile, char_precision);
+ size_t cmask = width_to_mask (cwidth);
+ size_t nbwc = width / cwidth;
+ size_t i;
+ size_t off = tbuf->len;
+ cppchar_t c;
+
+ if (tbuf->len + nbwc > tbuf->asize)
+ {
+ tbuf->asize += OUTBUF_BLOCK_SIZE;
+ tbuf->text = xrealloc (tbuf->text, tbuf->asize);
+ }
+
+ for (i = 0; i < nbwc; i++)
+ {
+ c = n & cmask;
+ n >>= cwidth;
+ tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
+ }
+ tbuf->len += nbwc;
}
-
- if (CPP_OPTION (pfile, cplusplus) || !CPP_PEDANTIC (pfile))
+ else
{
- /* Greek. */
- if (c == 0x0384)
- return 1;
-
- /* Cyrillic. */
- if (c == 0x040d)
- return 1;
-
- /* Hebrew. */
- if (c >= 0x05f3 && c <= 0x05f4)
- return 1;
-
- /* Lao. */
- if ((c >= 0x0ead && c <= 0x0eb0)
- || (c == 0x0eb2)
- || (c == 0x0eb3)
- || (c == 0x0ebd)
- || (c >= 0x0ec0 && c <= 0x0ec4)
- || (c == 0x0ec6))
- return 1;
-
- /* Hiragana */
- if (c == 0x3094
- || (c >= 0x309d && c <= 0x309e))
- return 1;
-
- /* Katakana */
- if ((c >= 0x30a1 && c <= 0x30fe))
- return 1;
-
- /* Hangul */
- if ((c >= 0x1100 && c <= 0x1159)
- || (c >= 0x1161 && c <= 0x11a2)
- || (c >= 0x11a8 && c <= 0x11f9))
- return 1;
-
- /* CJK Unified Ideographs */
- if ((c >= 0xf900 && c <= 0xfa2d)
- || (c >= 0xfb1f && c <= 0xfb36)
- || (c >= 0xfb38 && c <= 0xfb3c)
- || (c == 0xfb3e)
- || (c >= 0xfb40 && c <= 0xfb41)
- || (c >= 0xfb42 && c <= 0xfb44)
- || (c >= 0xfb46 && c <= 0xfbb1)
- || (c >= 0xfbd3 && c <= 0xfd3f)
- || (c >= 0xfd50 && c <= 0xfd8f)
- || (c >= 0xfd92 && c <= 0xfdc7)
- || (c >= 0xfdf0 && c <= 0xfdfb)
- || (c >= 0xfe70 && c <= 0xfe72)
- || (c == 0xfe74)
- || (c >= 0xfe76 && c <= 0xfefc)
- || (c >= 0xff21 && c <= 0xff3a)
- || (c >= 0xff41 && c <= 0xff5a)
- || (c >= 0xff66 && c <= 0xffbe)
- || (c >= 0xffc2 && c <= 0xffc7)
- || (c >= 0xffca && c <= 0xffcf)
- || (c >= 0xffd2 && c <= 0xffd7)
- || (c >= 0xffda && c <= 0xffdc)
- || (c >= 0x4e00 && c <= 0x9fa5))
- return 1;
+ if (tbuf->len + 1 > tbuf->asize)
+ {
+ tbuf->asize += OUTBUF_BLOCK_SIZE;
+ tbuf->text = xrealloc (tbuf->text, tbuf->asize);
+ }
+ tbuf->text[tbuf->len++] = n;
}
+}
- /* Latin */
- if ((c >= 0x00c0 && c <= 0x00d6)
- || (c >= 0x00d8 && c <= 0x00f6)
- || (c >= 0x00f8 && c <= 0x01f5)
- || (c >= 0x01fa && c <= 0x0217)
- || (c >= 0x0250 && c <= 0x02a8)
- || (c >= 0x1e00 && c <= 0x1e9a)
- || (c >= 0x1ea0 && c <= 0x1ef9))
- return 1;
-
- /* Greek */
- if ((c >= 0x0388 && c <= 0x038a)
- || (c == 0x038c)
- || (c >= 0x038e && c <= 0x03a1)
- || (c >= 0x03a3 && c <= 0x03ce)
- || (c >= 0x03d0 && c <= 0x03d6)
- || (c == 0x03da)
- || (c == 0x03dc)
- || (c == 0x03de)
- || (c == 0x03e0)
- || (c >= 0x03e2 && c <= 0x03f3)
- || (c >= 0x1f00 && c <= 0x1f15)
- || (c >= 0x1f18 && c <= 0x1f1d)
- || (c >= 0x1f20 && c <= 0x1f45)
- || (c >= 0x1f48 && c <= 0x1f4d)
- || (c >= 0x1f50 && c <= 0x1f57)
- || (c == 0x1f59)
- || (c == 0x1f5b)
- || (c == 0x1f5d)
- || (c >= 0x1f5f && c <= 0x1f7d)
- || (c >= 0x1f80 && c <= 0x1fb4)
- || (c >= 0x1fb6 && c <= 0x1fbc)
- || (c >= 0x1fc2 && c <= 0x1fc4)
- || (c >= 0x1fc6 && c <= 0x1fcc)
- || (c >= 0x1fd0 && c <= 0x1fd3)
- || (c >= 0x1fd6 && c <= 0x1fdb)
- || (c >= 0x1fe0 && c <= 0x1fec)
- || (c >= 0x1ff2 && c <= 0x1ff4)
- || (c >= 0x1ff6 && c <= 0x1ffc))
- return 1;
-
- /* Cyrillic */
- if ((c >= 0x0401 && c <= 0x040c)
- || (c >= 0x040f && c <= 0x044f)
- || (c >= 0x0451 && c <= 0x045c)
- || (c >= 0x045e && c <= 0x0481)
- || (c >= 0x0490 && c <= 0x04c4)
- || (c >= 0x04c7 && c <= 0x04c8)
- || (c >= 0x04cb && c <= 0x04cc)
- || (c >= 0x04d0 && c <= 0x04eb)
- || (c >= 0x04ee && c <= 0x04f5)
- || (c >= 0x04f8 && c <= 0x04f9))
- return 1;
-
- /* Armenian */
- if ((c >= 0x0531 && c <= 0x0556)
- || (c >= 0x0561 && c <= 0x0587))
- return 1;
-
- /* Hebrew */
- if ((c >= 0x05d0 && c <= 0x05ea)
- || (c >= 0x05f0 && c <= 0x05f2))
- return 1;
-
- /* Arabic */
- if ((c >= 0x0621 && c <= 0x063a)
- || (c >= 0x0640 && c <= 0x0652)
- || (c >= 0x0670 && c <= 0x06b7)
- || (c >= 0x06ba && c <= 0x06be)
- || (c >= 0x06c0 && c <= 0x06ce)
- || (c >= 0x06e5 && c <= 0x06e7))
- return 1;
-
- /* Devanagari */
- if ((c >= 0x0905 && c <= 0x0939)
- || (c >= 0x0958 && c <= 0x0962))
- return 1;
-
- /* Bengali */
- if ((c >= 0x0985 && c <= 0x098c)
- || (c >= 0x098f && c <= 0x0990)
- || (c >= 0x0993 && c <= 0x09a8)
- || (c >= 0x09aa && c <= 0x09b0)
- || (c == 0x09b2)
- || (c >= 0x09b6 && c <= 0x09b9)
- || (c >= 0x09dc && c <= 0x09dd)
- || (c >= 0x09df && c <= 0x09e1)
- || (c >= 0x09f0 && c <= 0x09f1))
- return 1;
-
- /* Gurmukhi */
- if ((c >= 0x0a05 && c <= 0x0a0a)
- || (c >= 0x0a0f && c <= 0x0a10)
- || (c >= 0x0a13 && c <= 0x0a28)
- || (c >= 0x0a2a && c <= 0x0a30)
- || (c >= 0x0a32 && c <= 0x0a33)
- || (c >= 0x0a35 && c <= 0x0a36)
- || (c >= 0x0a38 && c <= 0x0a39)
- || (c >= 0x0a59 && c <= 0x0a5c)
- || (c == 0x0a5e))
- return 1;
-
- /* Gujarati */
- if ((c >= 0x0a85 && c <= 0x0a8b)
- || (c == 0x0a8d)
- || (c >= 0x0a8f && c <= 0x0a91)
- || (c >= 0x0a93 && c <= 0x0aa8)
- || (c >= 0x0aaa && c <= 0x0ab0)
- || (c >= 0x0ab2 && c <= 0x0ab3)
- || (c >= 0x0ab5 && c <= 0x0ab9)
- || (c == 0x0ae0))
- return 1;
-
- /* Oriya */
- if ((c >= 0x0b05 && c <= 0x0b0c)
- || (c >= 0x0b0f && c <= 0x0b10)
- || (c >= 0x0b13 && c <= 0x0b28)
- || (c >= 0x0b2a && c <= 0x0b30)
- || (c >= 0x0b32 && c <= 0x0b33)
- || (c >= 0x0b36 && c <= 0x0b39)
- || (c >= 0x0b5c && c <= 0x0b5d)
- || (c >= 0x0b5f && c <= 0x0b61))
- return 1;
-
- /* Tamil */
- if ((c >= 0x0b85 && c <= 0x0b8a)
- || (c >= 0x0b8e && c <= 0x0b90)
- || (c >= 0x0b92 && c <= 0x0b95)
- || (c >= 0x0b99 && c <= 0x0b9a)
- || (c == 0x0b9c)
- || (c >= 0x0b9e && c <= 0x0b9f)
- || (c >= 0x0ba3 && c <= 0x0ba4)
- || (c >= 0x0ba8 && c <= 0x0baa)
- || (c >= 0x0bae && c <= 0x0bb5)
- || (c >= 0x0bb7 && c <= 0x0bb9))
- return 1;
-
- /* Telugu */
- if ((c >= 0x0c05 && c <= 0x0c0c)
- || (c >= 0x0c0e && c <= 0x0c10)
- || (c >= 0x0c12 && c <= 0x0c28)
- || (c >= 0x0c2a && c <= 0x0c33)
- || (c >= 0x0c35 && c <= 0x0c39)
- || (c >= 0x0c60 && c <= 0x0c61))
- return 1;
-
- /* Kannada */
- if ((c >= 0x0c85 && c <= 0x0c8c)
- || (c >= 0x0c8e && c <= 0x0c90)
- || (c >= 0x0c92 && c <= 0x0ca8)
- || (c >= 0x0caa && c <= 0x0cb3)
- || (c >= 0x0cb5 && c <= 0x0cb9)
- || (c >= 0x0ce0 && c <= 0x0ce1))
- return 1;
-
- /* Malayalam */
- if ((c >= 0x0d05 && c <= 0x0d0c)
- || (c >= 0x0d0e && c <= 0x0d10)
- || (c >= 0x0d12 && c <= 0x0d28)
- || (c >= 0x0d2a && c <= 0x0d39)
- || (c >= 0x0d60 && c <= 0x0d61))
- return 1;
-
- /* Thai */
- if ((c >= 0x0e01 && c <= 0x0e30)
- || (c >= 0x0e32 && c <= 0x0e33)
- || (c >= 0x0e40 && c <= 0x0e46)
- || (c >= 0x0e4f && c <= 0x0e5b))
- return 1;
-
- /* Lao */
- if ((c >= 0x0e81 && c <= 0x0e82)
- || (c == 0x0e84)
- || (c == 0x0e87)
- || (c == 0x0e88)
- || (c == 0x0e8a)
- || (c == 0x0e8d)
- || (c >= 0x0e94 && c <= 0x0e97)
- || (c >= 0x0e99 && c <= 0x0e9f)
- || (c >= 0x0ea1 && c <= 0x0ea3)
- || (c == 0x0ea5)
- || (c == 0x0ea7)
- || (c == 0x0eaa)
- || (c == 0x0eab))
- return 1;
-
- /* Georgian */
- if ((c >= 0x10a0 && c <= 0x10c5)
- || (c >= 0x10d0 && c <= 0x10f6))
- return 1;
-
- /* Hiragana */
- if ((c >= 0x3041 && c <= 0x3093)
- || (c >= 0x309b && c <= 0x309c))
- return 1;
-
- /* Bopmofo */
- if ((c >= 0x3105 && c <= 0x312c))
- return 1;
+/* Convert a hexadecimal escape, pointed to by FROM, to the execution
+ character set and write it into the string buffer TBUF. Returns an
+ advanced pointer, and issues diagnostics as necessary.
+ No character set translation occurs; this routine always produces the
+ execution-set character with numeric value equal to the given hex
+ number. You can, e.g. generate surrogate pairs this way.
+ FROM points at the 'x'; digits are consumed up to LIMIT. WIDE
+ selects wchar_precision rather than char_precision as the width
+ against which the value is range-checked. */
+static const uchar *
+convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
+ struct strbuf *tbuf, bool wide)
+{
+ cppchar_t c, n = 0, overflow = 0;
+ int digits_found = 0;
+ size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
+ : CPP_OPTION (pfile, char_precision));
+ size_t mask = width_to_mask (width);
+
+ if (CPP_WTRADITIONAL (pfile))
+ cpp_error (pfile, DL_WARNING,
+ "the meaning of '\\x' is different in traditional C");
+
+ from++; /* skip 'x' */
+ while (from < limit)
+ {
+ c = *from;
+ if (! hex_p (c))
+ break;
+ from++;
+ overflow |= n ^ (n << 4 >> 4); /* Nonzero if the shift below drops high bits of N. */
+ n = (n << 4) + hex_value (c);
+ digits_found = 1;
+ }
- return 0;
+ if (!digits_found)
+ {
+ cpp_error (pfile, DL_ERROR,
+ "\\x used with no following hex digits");
+ return from; /* Nothing is appended to TBUF in this case. */
+ }
+
+ if (overflow | (n != (n & mask)))
+ {
+ cpp_error (pfile, DL_PEDWARN,
+ "hex escape sequence out of range");
+ n &= mask; /* Keep only the low WIDTH bits. */
+ }
+
+ emit_numeric_escape (pfile, n, tbuf, wide);
+
+ return from;
+}
+
+/* Convert an octal escape, pointed to by FROM, to the execution
+ character set and write it into the string buffer TBUF. Returns an
+ advanced pointer, and issues diagnostics as necessary.
+ No character set translation occurs; this routine always produces the
+ execution-set character with numeric value equal to the given octal
+ number.
+ FROM points at the first octal digit; at most three digits are
+ consumed, and none at or beyond LIMIT. WIDE selects wchar_precision
+ rather than char_precision as the width against which the value is
+ range-checked. */
+static const uchar *
+convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
+ struct strbuf *tbuf, bool wide)
+{
+ size_t count = 0;
+ cppchar_t c, n = 0;
+ size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
+ : CPP_OPTION (pfile, char_precision));
+ size_t mask = width_to_mask (width);
+
+ /* Three octal digits give at most 0777, so accumulating N cannot
+ lose high bits; only the comparison against MASK below is needed
+ to detect an out-of-range value. */
+ while (from < limit && count++ < 3)
+ {
+ c = *from;
+ if (c < '0' || c > '7')
+ break;
+ from++;
+ n = (n << 3) + c - '0';
+ }
+
+ if (n != (n & mask))
+ {
+ cpp_error (pfile, DL_PEDWARN,
+ "octal escape sequence out of range");
+ n &= mask; /* Keep only the low WIDTH bits. */
+ }
+
+ emit_numeric_escape (pfile, n, tbuf, wide);
+
+ return from;
+}
+
+/* Convert an escape sequence (pointed to by FROM) to its value on
+   the target, and to the execution character set.  Do not scan past
+   LIMIT.  Write the converted value into TBUF.  Returns an advanced
+   pointer.  Handles all relevant diagnostics.
+
+   FROM points at the character following the backslash.  UCNs, hex
+   escapes and octal escapes are handed to convert_ucn, convert_hex
+   and convert_oct, whose return value is passed straight back.  Every
+   other escape is a single character: its host-charset value is
+   converted through the narrow or wide descriptor (selected by WIDE)
+   and FROM + 1 is returned.  An unrecognized escape gets a pedwarn and
+   then stands for the character itself.  */
+static const uchar *
+convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
+		struct strbuf *tbuf, bool wide)
+{
+  /* Values of \a \b \e \f \n \r \t \v respectively.  */
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+  static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
+#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
+  static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
+#else
+#error "unknown host character set"
+#endif
+
+  uchar c;
+
+  c = *from;
+  switch (c)
+    {
+      /* UCNs, hex escapes, and octal escapes are processed separately.  */
+    case 'u': case 'U':
+      return convert_ucn (pfile, from, limit, tbuf, wide);
+
+    case 'x':
+      return convert_hex (pfile, from, limit, tbuf, wide);
+
+    case '0':  case '1':  case '2':  case '3':
+    case '4':  case '5':  case '6':  case '7':
+      return convert_oct (pfile, from, limit, tbuf, wide);
+
+      /* Various letter escapes.  Get the appropriate host-charset
+	 value into C.  */
+    case '\\': case '\'': case '"': case '?': break;
+
+    case '(': case '{': case '[': case '%':
+      /* '\(', etc, can be used at the beginning of a line in a long
+	 string split onto multiple lines with \-newline, to prevent
+	 Emacs or other text editors from getting confused.  '\%' can
+	 be used to prevent SCCS from mangling printf format strings.  */
+      if (CPP_PEDANTIC (pfile))
+	goto unknown;
+      break;
+
+    case 'b': c = charconsts[1];  break;
+    case 'f': c = charconsts[3];  break;
+    case 'n': c = charconsts[4];  break;
+    case 'r': c = charconsts[5];  break;
+    case 't': c = charconsts[6];  break;
+    case 'v': c = charconsts[7];  break;
+
+    case 'a':
+      if (CPP_WTRADITIONAL (pfile))
+	cpp_error (pfile, DL_WARNING,
+		   "the meaning of '\\a' is different in traditional C");
+      c = charconsts[0];
+      break;
+
+    case 'e': case 'E':
+      if (CPP_PEDANTIC (pfile))
+	cpp_error (pfile, DL_PEDWARN,
+		   "non-ISO-standard escape sequence, '\\%c'", (int) c);
+      c = charconsts[2];
+      break;
+
+    default:
+    unknown:
+      /* C is left unchanged, so after the diagnostic the escape is
+	 treated as the character that followed the backslash.  */
+      if (ISGRAPH (c))
+	cpp_error (pfile, DL_PEDWARN,
+		   "unknown escape sequence '\\%c'", (int) c);
+      else
+	cpp_error (pfile, DL_PEDWARN,
+		   "unknown escape sequence: '\\%03o'", (int) c);
+    }
+
+  /* Now convert what we have to the execution character set.  */
+  if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
+		     &c, 1, tbuf))
+    cpp_errno (pfile, DL_ERROR,
+	       "converting escape sequence to execution character set");
+
+  return from + 1;
+}
+
+/* Interpret COUNT string literals, given as the array FROM, and
+   concatenate the results into TO.  Each literal still carries its
+   quotes (and an 'L' prefix, if any); these are stripped, ordinary
+   characters are converted from the source to the execution character
+   set, and escape sequences are translated.  WIDE says whether a wide
+   string is to be produced.  A terminating NUL is appended to the
+   result.  Returns true on success; on a conversion failure an error
+   is issued, the partial result is freed, and false is returned.  */
+bool
+cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
+		      cpp_string *to, bool wide)
+{
+  struct strbuf tbuf;
+  const uchar *cur, *run, *end;
+  size_t idx;
+  iconv_t cd;
+
+  if (wide)
+    cd = pfile->wide_cset_desc;
+  else
+    cd = pfile->narrow_cset_desc;
+
+  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
+  tbuf.text = xmalloc (tbuf.asize);
+  tbuf.len = 0;
+
+  for (idx = 0; idx < count; idx++)
+    {
+      /* Step over an optional 'L' and the opening quote; END is the
+	 closing quote.  */
+      cur = from[idx].text;
+      if (*cur == 'L')
+	cur++;
+      cur++;
+      end = from[idx].text + from[idx].len - 1;
+
+      while (1)
+	{
+	  /* Gather a run of characters containing no backslash and
+	     hand it to convert_cset in one piece.  */
+	  run = cur;
+	  while (cur < end && *cur != '\\')
+	    cur++;
+	  if (cur > run && !convert_cset (cd, run, cur - run, &tbuf))
+	    goto fail;
+
+	  if (cur == end)
+	    break;
+
+	  /* CUR is at a backslash; the escape begins just after it.  */
+	  cur = convert_escape (pfile, cur + 1, end, &tbuf, wide);
+	}
+    }
+
+  /* Append the terminating NUL, trim the buffer to the size actually
+     used, and hand it over as a cpp_string.  */
+  emit_numeric_escape (pfile, 0, &tbuf, wide);
+  tbuf.text = xrealloc (tbuf.text, tbuf.len);
+  to->text = tbuf.text;
+  to->len = tbuf.len;
+  return true;
+
+ fail:
+  cpp_errno (pfile, DL_ERROR, "converting to execution character set");
+  free (tbuf.text);
+  return false;
+}
+
+/* Subroutine of cpp_interpret_charconst which performs the conversion
+   to a number, for narrow strings.  STR is the string structure returned
+   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
+   cpp_interpret_charconst: *PCHARS_SEEN receives the number of
+   characters in the constant (capped at the number that fit in an
+   int), and *UNSIGNEDP whether the result is unsigned.  Returns the
+   value, sign- or zero-extended to the width of cppchar_t.  */
+static cppchar_t
+narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
+			 unsigned int *pchars_seen, int *unsignedp)
+{
+  size_t width = CPP_OPTION (pfile, char_precision);
+  size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
+  size_t mask = width_to_mask (width);
+  size_t i;
+  cppchar_t result, c;
+  bool unsigned_p;
+
+  /* The value of a multi-character character constant, or a
+     single-character character constant whose representation in the
+     execution character set is more than one byte long, is
+     implementation defined.  This implementation defines it to be the
+     number formed by interpreting the byte sequence in memory as a
+     big-endian binary number.  If overflow occurs, the high bytes are
+     lost, and a warning is issued.
+
+     We don't want to process the NUL terminator handed back by
+     cpp_interpret_string.  */
+  result = 0;
+  for (i = 0; i < str.len - 1; i++)
+    {
+      c = str.text[i] & mask;
+      if (width < BITS_PER_CPPCHAR_T)
+	result = (result << width) | c;
+      else
+	result = c;
+    }
+
+  if (i > max_chars)
+    {
+      i = max_chars;
+      cpp_error (pfile, DL_WARNING, "character constant too long for its type");
+    }
+  else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
+    cpp_error (pfile, DL_WARNING, "multi-character character constant");
+
+  /* Multichar constants are of type int and therefore signed.  */
+  if (i > 1)
+    unsigned_p = 0;
+  else
+    unsigned_p = CPP_OPTION (pfile, unsigned_char);
+
+  /* Truncate the constant to its natural width, and simultaneously
+     sign- or zero-extend to the full width of cppchar_t.
+     For single-character constants, the value is WIDTH bits wide.
+     For multi-character constants, the value is INT_PRECISION bits wide.  */
+  if (i > 1)
+    width = CPP_OPTION (pfile, int_precision);
+  if (width < BITS_PER_CPPCHAR_T)
+    {
+      mask = ((cppchar_t) 1 << width) - 1;
+      /* Probe the sign bit with a cppchar_t-typed shift, the same type
+	 used for MASK above, so the shift is never performed in a
+	 narrower signed int.  */
+      if (unsigned_p || !(result & ((cppchar_t) 1 << (width - 1))))
+	result &= mask;
+      else
+	result |= ~mask;
+    }
+  *pchars_seen = i;
+  *unsignedp = unsigned_p;
+  return result;
+}
+
+/* Subroutine of cpp_interpret_charconst which performs the conversion
+   to a number, for wide strings.  STR is the string structure returned
+   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
+   cpp_interpret_charconst.  Only the last wide character before the
+   terminator contributes to the value; if more than one is present a
+   warning is issued.  *PCHARS_SEEN is set to 1, or to 0 if STR holds
+   no character at all, and *UNSIGNEDP to the unsigned_wchar option.
+   Returns the value, sign- or zero-extended to the width of
+   cppchar_t.  */
+static cppchar_t
+wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
+		       unsigned int *pchars_seen, int *unsignedp)
+{
+  bool bigend = CPP_OPTION (pfile, bytes_big_endian);
+  size_t width = CPP_OPTION (pfile, wchar_precision);
+  size_t cwidth = CPP_OPTION (pfile, char_precision);
+  size_t mask = width_to_mask (width);
+  size_t cmask = width_to_mask (cwidth);
+  size_t nbwc = width / cwidth;
+  size_t off, i;
+  cppchar_t result = 0, c;
+
+  /* STR must hold at least one wide character plus the terminator.
+     It can be shorter when the constant's only content was an escape
+     that was diagnosed and produced no output (e.g. "\x" with no
+     digits); the subtraction below would then wrap around and index
+     outside the buffer.  Report zero characters instead.  */
+  if (str.len < nbwc * 2)
+    {
+      *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
+      *pchars_seen = 0;
+      return 0;
+    }
+
+  /* This is finicky because the string is in the target's byte order,
+     which may not be our byte order.  Only the last character, ignoring
+     the NUL terminator, is relevant.  */
+  off = str.len - (nbwc * 2);
+  result = 0;
+  for (i = 0; i < nbwc; i++)
+    {
+      c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
+      result = (result << cwidth) | (c & cmask);
+    }
+
+  /* Wide character constants have type wchar_t, and a single
+     character exactly fills a wchar_t, so a multi-character wide
+     character constant is guaranteed to overflow.  */
+  if (off > 0)
+    cpp_error (pfile, DL_WARNING, "character constant too long for its type");
+
+  /* Truncate the constant to its natural width, and simultaneously
+     sign- or zero-extend to the full width of cppchar_t.  The sign
+     bit is probed with a cppchar_t-typed shift so that the shift is
+     never performed in a narrower signed int.  */
+  if (width < BITS_PER_CPPCHAR_T)
+    {
+      if (CPP_OPTION (pfile, unsigned_wchar)
+	  || !(result & ((cppchar_t) 1 << (width - 1))))
+	result &= mask;
+      else
+	result |= ~mask;
+    }
+
+  *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
+  *pchars_seen = 1;
+  return result;
+}
+
+/* Interpret a (possibly wide) character constant in TOKEN.
+   PCHARS_SEEN points to a variable that is filled in with the number
+   of characters seen, and UNSIGNEDP to a variable that indicates
+   whether the result has unsigned type.  Both are always stored, even
+   when an error is diagnosed; in that case *PCHARS_SEEN is 0,
+   *UNSIGNEDP reflects the signedness option for the constant's
+   character type, and the return value is 0.  */
+cppchar_t
+cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
+			 unsigned int *pchars_seen, int *unsignedp)
+{
+  cpp_string str = { 0, 0 };
+  bool wide = (token->type == CPP_WCHAR);
+  cppchar_t result;
+
+  /* Give the outputs defined values up front so that callers never
+     read indeterminate data after one of the early error returns
+     below.  The success paths overwrite both.  */
+  *pchars_seen = 0;
+  *unsignedp = (wide ? CPP_OPTION (pfile, unsigned_wchar)
+		: CPP_OPTION (pfile, unsigned_char));
+
+  /* An empty constant will appear as L'' or ''.  */
+  if (token->val.str.len == (size_t) (2 + wide))
+    {
+      cpp_error (pfile, DL_ERROR, "empty character constant");
+      return 0;
+    }
+  else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
+    return 0;
+
+  if (wide)
+    result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
+  else
+    result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
+
+  /* Release the interpreted copy unless it aliases the token's own
+     spelling.  */
+  if (str.text != token->val.str.text)
+    free ((void *)str.text);
+
+  return result;
}
diff --git a/gcc/cpphash.h b/gcc/cpphash.h
index 32fa1aaae1e..f4a7cfcde28 100644
--- a/gcc/cpphash.h
+++ b/gcc/cpphash.h
@@ -25,6 +25,13 @@ Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
#include "hashtable.h"
+#ifdef HAVE_ICONV
+#include <iconv.h>
+#else
+#define HAVE_ICONV 0
+typedef int iconv_t; /* dummy */
+#endif
+
struct directive; /* Deliberately incomplete. */
struct pending_option;
struct op;
@@ -362,6 +369,15 @@ struct cpp_reader
unsigned char *macro_buffer;
unsigned int macro_buffer_len;
+ /* Iconv descriptor for converting from the source character set
+ to the execution character set. (iconv_t)-1 for no conversion. */
+ iconv_t narrow_cset_desc;
+
+ /* Iconv descriptor for converting from the execution character set
+ to the wide execution character set. (iconv_t)-1 for no conversion
+ other than zero-extending each character to the width of wchar_t. */
+ iconv_t wide_cset_desc;
+
/* Tree of other included files. See cppfiles.c. */
struct splay_tree_s *all_include_files;
@@ -539,7 +555,8 @@ extern uchar *_cpp_copy_replacement_text (const cpp_macro *, uchar *);
extern size_t _cpp_replacement_text_len (const cpp_macro *);
/* In cppcharset.c. */
-cppchar_t _cpp_valid_ucn (cpp_reader *, const uchar **, int identifer_p);
+cppchar_t _cpp_valid_ucn (cpp_reader *, const uchar **, const uchar *, int);
+void _cpp_destroy_iconv (cpp_reader *);
/* Utility routines and macros. */
#define DSC(str) (const uchar *)str, sizeof str - 1
diff --git a/gcc/cppinit.c b/gcc/cppinit.c
index 1792ddd8a07..cc1faecf966 100644
--- a/gcc/cppinit.c
+++ b/gcc/cppinit.c
@@ -157,6 +157,11 @@ cpp_create_reader (enum c_lang lang, hash_table *table)
CPP_OPTION (pfile, int_precision) = CHAR_BIT * sizeof (int);
CPP_OPTION (pfile, unsigned_char) = 0;
CPP_OPTION (pfile, unsigned_wchar) = 1;
+ CPP_OPTION (pfile, bytes_big_endian) = 1; /* does not matter */
+
+ /* Default to no charset conversion. */
+ CPP_OPTION (pfile, narrow_charset) = 0;
+ CPP_OPTION (pfile, wide_charset) = 0;
/* Initialize the line map. Start at logical line 1, so we can use
a line number of zero for special states. */
@@ -227,6 +232,7 @@ cpp_destroy (cpp_reader *pfile)
_cpp_destroy_hashtable (pfile);
_cpp_cleanup_includes (pfile);
+ _cpp_destroy_iconv (pfile);
_cpp_free_buff (pfile->a_buff);
_cpp_free_buff (pfile->u_buff);
diff --git a/gcc/cpplex.c b/gcc/cpplex.c
index c536c768813..edb765dc61b 100644
--- a/gcc/cpplex.c
+++ b/gcc/cpplex.c
@@ -64,10 +64,8 @@ static void create_literal (cpp_reader *, cpp_token *, const uchar *,
unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
-static cppchar_t maybe_read_ucn (cpp_reader *, const uchar **);
static tokenrun *next_tokenrun (tokenrun *);
-static unsigned int hex_digit_value (unsigned int);
static _cpp_buff *new_buff (size_t);
@@ -397,7 +395,7 @@ forms_identifier_p (cpp_reader *pfile, int first)
&& (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
{
buffer->cur += 2;
- if (_cpp_valid_ucn (pfile, &buffer->cur, 1 + !first))
+ if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first))
return true;
buffer->cur -= 2;
}
@@ -1316,291 +1314,6 @@ cpp_output_line (cpp_reader *pfile, FILE *fp)
putc ('\n', fp);
}
-/* Returns the value of a hexadecimal digit. */
-static unsigned int
-hex_digit_value (unsigned int c)
-{
- if (hex_p (c))
- return hex_value (c);
- else
- abort ();
-}
-
-/* Read a possible universal character name starting at *PSTR. */
-static cppchar_t
-maybe_read_ucn (cpp_reader *pfile, const uchar **pstr)
-{
- cppchar_t result, c = (*pstr)[-1];
-
- result = _cpp_valid_ucn (pfile, pstr, false);
- if (result)
- {
- if (CPP_WTRADITIONAL (pfile))
- cpp_error (pfile, DL_WARNING,
- "the meaning of '\\%c' is different in traditional C",
- (int) c);
-
- if (CPP_OPTION (pfile, EBCDIC))
- {
- cpp_error (pfile, DL_ERROR,
- "universal character with an EBCDIC target");
- result = 0x3f; /* EBCDIC invalid character */
- }
- }
-
- return result;
-}
-
-/* Returns the value of an escape sequence, truncated to the correct
- target precision. PSTR points to the input pointer, which is just
- after the backslash. LIMIT is how much text we have. WIDE is true
- if the escape sequence is part of a wide character constant or
- string literal. Handles all relevant diagnostics. */
-cppchar_t
-cpp_parse_escape (cpp_reader *pfile, const unsigned char **pstr,
- const unsigned char *limit, int wide)
-{
- /* Values of \a \b \e \f \n \r \t \v respectively. */
- static const uchar ascii[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
- static const uchar ebcdic[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
-
- int unknown = 0;
- const unsigned char *str = *pstr, *charconsts;
- cppchar_t c, ucn, mask;
- unsigned int width;
-
- if (CPP_OPTION (pfile, EBCDIC))
- charconsts = ebcdic;
- else
- charconsts = ascii;
-
- if (wide)
- width = CPP_OPTION (pfile, wchar_precision);
- else
- width = CPP_OPTION (pfile, char_precision);
- if (width < BITS_PER_CPPCHAR_T)
- mask = ((cppchar_t) 1 << width) - 1;
- else
- mask = ~0;
-
- c = *str++;
- switch (c)
- {
- case '\\': case '\'': case '"': case '?': break;
- case 'b': c = charconsts[1]; break;
- case 'f': c = charconsts[3]; break;
- case 'n': c = charconsts[4]; break;
- case 'r': c = charconsts[5]; break;
- case 't': c = charconsts[6]; break;
- case 'v': c = charconsts[7]; break;
-
- case '(': case '{': case '[': case '%':
- /* '\(', etc, are used at beginning of line to avoid confusing Emacs.
- '\%' is used to prevent SCCS from getting confused. */
- unknown = CPP_PEDANTIC (pfile);
- break;
-
- case 'a':
- if (CPP_WTRADITIONAL (pfile))
- cpp_error (pfile, DL_WARNING,
- "the meaning of '\\a' is different in traditional C");
- c = charconsts[0];
- break;
-
- case 'e': case 'E':
- if (CPP_PEDANTIC (pfile))
- cpp_error (pfile, DL_PEDWARN,
- "non-ISO-standard escape sequence, '\\%c'", (int) c);
- c = charconsts[2];
- break;
-
- case 'u': case 'U':
- ucn = maybe_read_ucn (pfile, &str);
- if (ucn)
- c = ucn;
- else
- unknown = true;
- break;
-
- case 'x':
- if (CPP_WTRADITIONAL (pfile))
- cpp_error (pfile, DL_WARNING,
- "the meaning of '\\x' is different in traditional C");
-
- {
- cppchar_t i = 0, overflow = 0;
- int digits_found = 0;
-
- while (str < limit)
- {
- c = *str;
- if (! ISXDIGIT (c))
- break;
- str++;
- overflow |= i ^ (i << 4 >> 4);
- i = (i << 4) + hex_digit_value (c);
- digits_found = 1;
- }
-
- if (!digits_found)
- cpp_error (pfile, DL_ERROR,
- "\\x used with no following hex digits");
-
- if (overflow | (i != (i & mask)))
- {
- cpp_error (pfile, DL_PEDWARN,
- "hex escape sequence out of range");
- i &= mask;
- }
- c = i;
- }
- break;
-
- case '0': case '1': case '2': case '3':
- case '4': case '5': case '6': case '7':
- {
- size_t count = 0;
- cppchar_t i = c - '0';
-
- while (str < limit && ++count < 3)
- {
- c = *str;
- if (c < '0' || c > '7')
- break;
- str++;
- i = (i << 3) + c - '0';
- }
-
- if (i != (i & mask))
- {
- cpp_error (pfile, DL_PEDWARN,
- "octal escape sequence out of range");
- i &= mask;
- }
- c = i;
- }
- break;
-
- default:
- unknown = 1;
- break;
- }
-
- if (unknown)
- {
- if (ISGRAPH (c))
- cpp_error (pfile, DL_PEDWARN,
- "unknown escape sequence '\\%c'", (int) c);
- else
- cpp_error (pfile, DL_PEDWARN,
- "unknown escape sequence: '\\%03o'", (int) c);
- }
-
- if (c > mask)
- {
- cpp_error (pfile, DL_PEDWARN,
- "escape sequence out of range for its type");
- c &= mask;
- }
-
- *pstr = str;
- return c;
-}
-
-/* Interpret a (possibly wide) character constant in TOKEN.
- WARN_MULTI warns about multi-character charconsts. PCHARS_SEEN
- points to a variable that is filled in with the number of
- characters seen, and UNSIGNEDP to a variable that indicates whether
- the result has signed type. */
-cppchar_t
-cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
- unsigned int *pchars_seen, int *unsignedp)
-{
- const unsigned char *str, *limit;
- unsigned int chars_seen = 0;
- size_t width, max_chars;
- cppchar_t c, mask, result = 0;
- bool unsigned_p;
-
- str = token->val.str.text + 1 + (token->type == CPP_WCHAR);
- limit = token->val.str.text + token->val.str.len - 1;
-
- if (token->type == CPP_CHAR)
- {
- width = CPP_OPTION (pfile, char_precision);
- max_chars = CPP_OPTION (pfile, int_precision) / width;
- unsigned_p = CPP_OPTION (pfile, unsigned_char);
- }
- else
- {
- width = CPP_OPTION (pfile, wchar_precision);
- max_chars = 1;
- unsigned_p = CPP_OPTION (pfile, unsigned_wchar);
- }
-
- if (width < BITS_PER_CPPCHAR_T)
- mask = ((cppchar_t) 1 << width) - 1;
- else
- mask = ~0;
-
- while (str < limit)
- {
- c = *str++;
-
- if (c == '\\')
- c = cpp_parse_escape (pfile, &str, limit, token->type == CPP_WCHAR);
-
-#ifdef MAP_CHARACTER
- if (ISPRINT (c))
- c = MAP_CHARACTER (c);
-#endif
-
- chars_seen++;
-
- /* Truncate the character, scale the result and merge the two. */
- c &= mask;
- if (width < BITS_PER_CPPCHAR_T)
- result = (result << width) | c;
- else
- result = c;
- }
-
- if (chars_seen == 0)
- cpp_error (pfile, DL_ERROR, "empty character constant");
- else if (chars_seen > 1)
- {
- /* Multichar charconsts are of type int and therefore signed. */
- unsigned_p = 0;
-
- if (chars_seen > max_chars)
- {
- chars_seen = max_chars;
- cpp_error (pfile, DL_WARNING,
- "character constant too long for its type");
- }
- else if (CPP_OPTION (pfile, warn_multichar))
- cpp_error (pfile, DL_WARNING, "multi-character character constant");
- }
-
- /* Sign-extend or truncate the constant to cppchar_t. The value is
- in WIDTH bits, but for multi-char charconsts it's value is the
- full target type's width. */
- if (chars_seen > 1)
- width *= max_chars;
- if (width < BITS_PER_CPPCHAR_T)
- {
- mask = ((cppchar_t) 1 << width) - 1;
- if (unsigned_p || !(result & (1 << (width - 1))))
- result &= mask;
- else
- result |= ~mask;
- }
-
- *pchars_seen = chars_seen;
- *unsignedp = unsigned_p;
- return result;
-}
-
/* Memory buffers. Changing these three constants can have a dramatic
effect on performance. The values here are reasonable defaults,
but might be tuned. If you adjust them, be sure to test across a
diff --git a/gcc/cpplib.c b/gcc/cpplib.c
index af32705856f..2fac44e62fd 100644
--- a/gcc/cpplib.c
+++ b/gcc/cpplib.c
@@ -106,7 +106,6 @@ static char *glue_header_name (cpp_reader *);
static const char *parse_include (cpp_reader *, int *);
static void push_conditional (cpp_reader *, int, int, const cpp_hashnode *);
static unsigned int read_flag (cpp_reader *, unsigned int);
-static uchar *dequote_string (cpp_reader *, const uchar *, unsigned int);
static int strtoul_for_line (const uchar *, unsigned int, unsigned long *);
static void do_diagnostic (cpp_reader *, int, int);
static cpp_hashnode *lex_macro_node (cpp_reader *);
@@ -714,29 +713,6 @@ read_flag (cpp_reader *pfile, unsigned int last)
return 0;
}
-/* Subroutine of do_line and do_linemarker. Returns a version of STR
- which has a NUL terminator and all escape sequences converted to
- their equivalents. Temporary, hopefully. */
-static uchar *
-dequote_string (cpp_reader *pfile, const uchar *str, unsigned int len)
-{
- uchar *result = _cpp_unaligned_alloc (pfile, len + 1);
- uchar *dst = result;
- const uchar *limit = str + len;
- cppchar_t c;
-
- while (str < limit)
- {
- c = *str++;
- if (c != '\\')
- *dst++ = c;
- else
- *dst++ = cpp_parse_escape (pfile, &str, limit, 0);
- }
- *dst++ = '\0';
- return result;
-}
-
/* Subroutine of do_line and do_linemarker. Convert a number in STR,
of length LEN, to binary; store it in NUMP, and return 0 if the
number was well-formed, 1 if not. Temporary, hopefully. */
@@ -757,6 +733,21 @@ strtoul_for_line (const uchar *str, unsigned int len, long unsigned int *nump)
return 0;
}
+/* Subroutine of do_line and do_linemarker.  Interpret the escape
+   sequences of the single string IN into OUT, with the narrow
+   conversion descriptor temporarily replaced by (iconv_t) -1 so that
+   no character set conversion takes place.  The reader's descriptor
+   is restored before returning.  Returns what cpp_interpret_string
+   returned.  */
+static bool
+interpret_string_notranslate (cpp_reader *pfile, const cpp_string *in,
+			      cpp_string *out)
+{
+  bool ok;
+  iconv_t saved_cd;
+
+  saved_cd = pfile->narrow_cset_desc;
+  pfile->narrow_cset_desc = (iconv_t) -1;
+
+  ok = cpp_interpret_string (pfile, in, 1, out, false);
+
+  pfile->narrow_cset_desc = saved_cd;
+  return ok;
+}
+
/* Interpret #line command.
Note that the filename string (if any) is a true string constant
(escapes are interpreted), unlike in #line. */
@@ -788,8 +779,9 @@ do_line (cpp_reader *pfile)
token = cpp_get_token (pfile);
if (token->type == CPP_STRING)
{
- new_file = (const char *) dequote_string (pfile, token->val.str.text + 1,
- token->val.str.len - 2);
+ cpp_string s = { 0, 0 };
+ if (interpret_string_notranslate (pfile, &token->val.str, &s))
+ new_file = (const char *)s.text;
check_eol (pfile);
}
else if (token->type != CPP_EOF)
@@ -836,8 +828,10 @@ do_linemarker (cpp_reader *pfile)
token = cpp_get_token (pfile);
if (token->type == CPP_STRING)
{
- new_file = (const char *) dequote_string (pfile, token->val.str.text + 1,
- token->val.str.len - 2);
+ cpp_string s = { 0, 0 };
+ if (interpret_string_notranslate (pfile, &token->val.str, &s))
+ new_file = (const char *)s.text;
+
new_sysp = 0;
flag = read_flag (pfile, 0);
if (flag == 1)
diff --git a/gcc/cpplib.h b/gcc/cpplib.h
index aad2841ac62..fb3cc78ec67 100644
--- a/gcc/cpplib.h
+++ b/gcc/cpplib.h
@@ -124,6 +124,7 @@ struct file_name_map_list;
OP(CPP_ATSIGN, "@") /* used in Objective-C */ \
\
TK(CPP_NAME, SPELL_IDENT) /* word */ \
+ TK(CPP_AT_NAME, SPELL_IDENT) /* @word - Objective-C */ \
TK(CPP_NUMBER, SPELL_LITERAL) /* 34_be+ta */ \
\
TK(CPP_CHAR, SPELL_LITERAL) /* 'char' */ \
@@ -132,6 +133,7 @@ struct file_name_map_list;
\
TK(CPP_STRING, SPELL_LITERAL) /* "string" */ \
TK(CPP_WSTRING, SPELL_LITERAL) /* L"string" */ \
+ TK(CPP_OBJC_STRING, SPELL_LITERAL) /* @"string" - Objective-C */ \
TK(CPP_HEADER_NAME, SPELL_LITERAL) /* <stdio.h> in #include */ \
\
TK(CPP_COMMENT, SPELL_LITERAL) /* Only if output comments. */ \
@@ -332,6 +334,12 @@ struct cpp_options
/* True for traditional preprocessing. */
unsigned char traditional;
+ /* Holds the name of the target (execution) character set. */
+ const char *narrow_charset;
+
+ /* Holds the name of the target wide character set. */
+ const char *wide_charset;
+
/* True to warn about precompiled header files we couldn't use. */
bool warn_invalid_pch;
@@ -364,8 +372,9 @@ struct cpp_options
/* True means chars (wide chars) are unsigned. */
bool unsigned_char, unsigned_wchar;
- /* True if target is EBCDIC. */
- bool EBCDIC;
+ /* True if the most significant byte in a word has the lowest
+ address in memory. */
+ bool bytes_big_endian;
/* Nonzero means __STDC__ should have the value 0 in system headers. */
unsigned char stdc_0_in_system_headers;
@@ -529,6 +538,9 @@ extern const char *cpp_read_main_file (cpp_reader *, const char *);
/* Set up built-ins like __FILE__. */
extern void cpp_init_builtins (cpp_reader *, int);
+/* Set up translation to the target character set. */
+extern void cpp_init_iconv (cpp_reader *);
+
/* Call this to finish preprocessing. If you requested dependency
generation, pass an open stream to write the information to,
otherwise NULL. It is your responsibility to close the stream.
@@ -560,6 +572,10 @@ extern void _cpp_backup_tokens (cpp_reader *, unsigned int);
/* Evaluate a CPP_CHAR or CPP_WCHAR token. */
extern cppchar_t cpp_interpret_charconst (cpp_reader *, const cpp_token *,
unsigned int *, int *);
+/* Evaluate a vector of CPP_STRING or CPP_WSTRING tokens. */
+extern bool cpp_interpret_string (cpp_reader *,
+ const cpp_string *, size_t,
+ cpp_string *, bool);
/* Used to register macros and assertions, perhaps from the command line.
The text is the same as the command line argument. */
diff --git a/gcc/cppucnid.h b/gcc/cppucnid.h
new file mode 100644
index 00000000000..1cac7df0a94
--- /dev/null
+++ b/gcc/cppucnid.h
@@ -0,0 +1,336 @@
+/* Table of UCNs which are valid in identifiers.
+ Copyright (C) 2003 Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+
+/* Automatically generated from cppucnid.tab, do not edit */
+
+/* This file reproduces the table in ISO/IEC 9899:1999 (C99) Annex
+ D, which is itself a reproduction from ISO/IEC TR 10176:1998, and
+ the similar table from ISO/IEC 14882:1998 (C++98) Annex E, which is
+ a reproduction of ISO/IEC PDTR 10176. Unfortunately these tables
+ are not identical. */
+
+#ifndef CPPUCNID_H
+#define CPPUCNID_H
+
+#define C99 1
+#define CXX 2
+#define DIG 4
+
+struct ucnrange
+{
+ unsigned short lo, hi;
+ unsigned short flags;
+};
+
+static const struct ucnrange ucnranges[] = {
+ { 0x00aa, 0x00aa, C99 }, /* Latin */
+ { 0x00b5, 0x00b5, C99 }, /* Special characters */
+ { 0x00b7, 0x00b7, C99 },
+ { 0x00ba, 0x00ba, C99 }, /* Latin */
+ { 0x00c0, 0x00d6, CXX|C99 },
+ { 0x00d8, 0x00f6, CXX|C99 },
+ { 0x00f8, 0x01f5, CXX|C99 },
+ { 0x01fa, 0x0217, CXX|C99 },
+ { 0x0250, 0x02a8, CXX|C99 },
+ { 0x02b0, 0x02b8, C99 }, /* Special characters */
+ { 0x02bb, 0x02bb, C99 },
+ { 0x02bd, 0x02c1, C99 },
+ { 0x02d0, 0x02d1, C99 },
+ { 0x02e0, 0x02e4, C99 },
+ { 0x037a, 0x037a, C99 },
+ { 0x0384, 0x0384, CXX }, /* Greek */
+ { 0x0386, 0x0386, C99 },
+ { 0x0388, 0x038a, CXX|C99 },
+ { 0x038c, 0x038c, CXX|C99 },
+ { 0x038e, 0x03a1, CXX|C99 },
+ { 0x03a3, 0x03ce, CXX|C99 },
+ { 0x03d0, 0x03d6, CXX|C99 },
+ { 0x03da, 0x03da, CXX|C99 },
+ { 0x03dc, 0x03dc, CXX|C99 },
+ { 0x03de, 0x03de, CXX|C99 },
+ { 0x03e0, 0x03e0, CXX|C99 },
+ { 0x03e2, 0x03f3, CXX|C99 },
+ { 0x0401, 0x040c, CXX|C99 }, /* Cyrillic */
+ { 0x040d, 0x040d, CXX },
+ { 0x040e, 0x040e, C99 },
+ { 0x040f, 0x044f, CXX|C99 },
+ { 0x0451, 0x045c, CXX|C99 },
+ { 0x045e, 0x0481, CXX|C99 },
+ { 0x0490, 0x04c4, CXX|C99 },
+ { 0x04c7, 0x04c8, CXX|C99 },
+ { 0x04cb, 0x04cc, CXX|C99 },
+ { 0x04d0, 0x04eb, CXX|C99 },
+ { 0x04ee, 0x04f5, CXX|C99 },
+ { 0x04f8, 0x04f9, CXX|C99 },
+ { 0x0531, 0x0556, CXX|C99 }, /* Armenian */
+ { 0x0559, 0x0559, C99 }, /* Special characters */
+ { 0x0561, 0x0587, CXX|C99 }, /* Armenian */
+ { 0x05b0, 0x05b9, C99 }, /* Hebrew */
+ { 0x05bb, 0x05bd, C99 },
+ { 0x05bf, 0x05bf, C99 },
+ { 0x05c1, 0x05c2, C99 },
+ { 0x05d0, 0x05ea, CXX|C99 },
+ { 0x05f0, 0x05f2, CXX|C99 },
+ { 0x05f3, 0x05f4, CXX },
+ { 0x0621, 0x063a, CXX|C99 }, /* Arabic */
+ { 0x0640, 0x0652, CXX|C99 },
+ { 0x0660, 0x0669, C99|DIG }, /* Digits */
+ { 0x0670, 0x06b7, CXX|C99 }, /* Arabic */
+ { 0x06ba, 0x06be, CXX|C99 },
+ { 0x06c0, 0x06ce, CXX|C99 },
+ { 0x06d0, 0x06dc, C99 },
+ { 0x06e5, 0x06e7, CXX|C99 },
+ { 0x06e8, 0x06e8, C99 },
+ { 0x06ea, 0x06ed, C99 },
+ { 0x06f0, 0x06f9, C99|DIG }, /* Digits */
+ { 0x0901, 0x0903, C99 }, /* Devanagari */
+ { 0x0905, 0x0939, CXX|C99 },
+ { 0x093d, 0x093d, C99 }, /* Special characters */
+ { 0x093e, 0x094d, C99 }, /* Devanagari */
+ { 0x0950, 0x0952, C99 },
+ { 0x0958, 0x0962, CXX|C99 },
+ { 0x0963, 0x0963, C99 },
+ { 0x0966, 0x096f, C99|DIG }, /* Digits */
+ { 0x0981, 0x0983, C99 }, /* Bengali */
+ { 0x0985, 0x098c, CXX|C99 },
+ { 0x098f, 0x0990, CXX|C99 },
+ { 0x0993, 0x09a8, CXX|C99 },
+ { 0x09aa, 0x09b0, CXX|C99 },
+ { 0x09b2, 0x09b2, CXX|C99 },
+ { 0x09b6, 0x09b9, CXX|C99 },
+ { 0x09be, 0x09c4, C99 },
+ { 0x09c7, 0x09c8, C99 },
+ { 0x09cb, 0x09cd, C99 },
+ { 0x09dc, 0x09dd, CXX|C99 },
+ { 0x09df, 0x09e1, CXX|C99 },
+ { 0x09e2, 0x09e3, C99 },
+ { 0x09e6, 0x09ef, C99|DIG }, /* Digits */
+ { 0x09f0, 0x09f1, CXX|C99 }, /* Bengali */
+ { 0x0a02, 0x0a02, C99 }, /* Gurmukhi */
+ { 0x0a05, 0x0a0a, CXX|C99 },
+ { 0x0a0f, 0x0a10, CXX|C99 },
+ { 0x0a13, 0x0a28, CXX|C99 },
+ { 0x0a2a, 0x0a30, CXX|C99 },
+ { 0x0a32, 0x0a33, CXX|C99 },
+ { 0x0a35, 0x0a36, CXX|C99 },
+ { 0x0a38, 0x0a39, CXX|C99 },
+ { 0x0a3e, 0x0a42, C99 },
+ { 0x0a47, 0x0a48, C99 },
+ { 0x0a4b, 0x0a4d, C99 },
+ { 0x0a59, 0x0a5c, CXX|C99 },
+ { 0x0a5e, 0x0a5e, CXX|C99 },
+ { 0x0a66, 0x0a6f, C99|DIG }, /* Digits */
+ { 0x0a74, 0x0a74, C99 }, /* Gurmukhi */
+ { 0x0a81, 0x0a83, C99 }, /* Gujarati */
+ { 0x0a85, 0x0a8b, CXX|C99 },
+ { 0x0a8d, 0x0a8d, CXX|C99 },
+ { 0x0a8f, 0x0a91, CXX|C99 },
+ { 0x0a93, 0x0aa8, CXX|C99 },
+ { 0x0aaa, 0x0ab0, CXX|C99 },
+ { 0x0ab2, 0x0ab3, CXX|C99 },
+ { 0x0ab5, 0x0ab9, CXX|C99 },
+ { 0x0abd, 0x0ac5, C99 },
+ { 0x0ac7, 0x0ac9, C99 },
+ { 0x0acb, 0x0acd, C99 },
+ { 0x0ad0, 0x0ad0, C99 },
+ { 0x0ae0, 0x0ae0, CXX|C99 },
+ { 0x0ae6, 0x0aef, C99|DIG }, /* Digits */
+ { 0x0b01, 0x0b03, C99 }, /* Oriya */
+ { 0x0b05, 0x0b0c, CXX|C99 },
+ { 0x0b0f, 0x0b10, CXX|C99 },
+ { 0x0b13, 0x0b28, CXX|C99 },
+ { 0x0b2a, 0x0b30, CXX|C99 },
+ { 0x0b32, 0x0b33, CXX|C99 },
+ { 0x0b36, 0x0b39, CXX|C99 },
+ { 0x0b3d, 0x0b3d, C99 }, /* Special characters */
+ { 0x0b3e, 0x0b43, C99 }, /* Oriya */
+ { 0x0b47, 0x0b48, C99 },
+ { 0x0b4b, 0x0b4d, C99 },
+ { 0x0b5c, 0x0b5d, CXX|C99 },
+ { 0x0b5f, 0x0b61, CXX|C99 },
+ { 0x0b66, 0x0b6f, C99|DIG }, /* Digits */
+ { 0x0b82, 0x0b83, C99 }, /* Tamil */
+ { 0x0b85, 0x0b8a, CXX|C99 },
+ { 0x0b8e, 0x0b90, CXX|C99 },
+ { 0x0b92, 0x0b95, CXX|C99 },
+ { 0x0b99, 0x0b9a, CXX|C99 },
+ { 0x0b9c, 0x0b9c, CXX|C99 },
+ { 0x0b9e, 0x0b9f, CXX|C99 },
+ { 0x0ba3, 0x0ba4, CXX|C99 },
+ { 0x0ba8, 0x0baa, CXX|C99 },
+ { 0x0bae, 0x0bb5, CXX|C99 },
+ { 0x0bb7, 0x0bb9, CXX|C99 },
+ { 0x0bbe, 0x0bc2, C99 },
+ { 0x0bc6, 0x0bc8, C99 },
+ { 0x0bca, 0x0bcd, C99 },
+ { 0x0be7, 0x0bef, C99|DIG }, /* Digits */
+ { 0x0c01, 0x0c03, C99 }, /* Telugu */
+ { 0x0c05, 0x0c0c, CXX|C99 },
+ { 0x0c0e, 0x0c10, CXX|C99 },
+ { 0x0c12, 0x0c28, CXX|C99 },
+ { 0x0c2a, 0x0c33, CXX|C99 },
+ { 0x0c35, 0x0c39, CXX|C99 },
+ { 0x0c3e, 0x0c44, C99 },
+ { 0x0c46, 0x0c48, C99 },
+ { 0x0c4a, 0x0c4d, C99 },
+ { 0x0c60, 0x0c61, CXX|C99 },
+ { 0x0c66, 0x0c6f, C99|DIG }, /* Digits */
+ { 0x0c82, 0x0c83, C99 }, /* Kannada */
+ { 0x0c85, 0x0c8c, CXX|C99 },
+ { 0x0c8e, 0x0c90, CXX|C99 },
+ { 0x0c92, 0x0ca8, CXX|C99 },
+ { 0x0caa, 0x0cb3, CXX|C99 },
+ { 0x0cb5, 0x0cb9, CXX|C99 },
+ { 0x0cbe, 0x0cc4, C99 },
+ { 0x0cc6, 0x0cc8, C99 },
+ { 0x0cca, 0x0ccd, C99 },
+ { 0x0cde, 0x0cde, C99 },
+ { 0x0ce0, 0x0ce1, CXX|C99 },
+ { 0x0ce6, 0x0cef, C99|DIG }, /* Digits */
+ { 0x0d02, 0x0d03, C99 }, /* Malayalam */
+ { 0x0d05, 0x0d0c, CXX|C99 },
+ { 0x0d0e, 0x0d10, CXX|C99 },
+ { 0x0d12, 0x0d28, CXX|C99 },
+ { 0x0d2a, 0x0d39, CXX|C99 },
+ { 0x0d3e, 0x0d43, C99 },
+ { 0x0d46, 0x0d48, C99 },
+ { 0x0d4a, 0x0d4d, C99 },
+ { 0x0d60, 0x0d61, CXX|C99 },
+ { 0x0d66, 0x0d6f, C99|DIG }, /* Digits */
+ { 0x0e01, 0x0e30, CXX|C99 }, /* Thai */
+ { 0x0e31, 0x0e31, C99 },
+ { 0x0e32, 0x0e33, CXX|C99 },
+ { 0x0e34, 0x0e3a, C99 },
+ { 0x0e40, 0x0e46, CXX|C99 },
+ { 0x0e47, 0x0e49, C99 },
+ { 0x0e50, 0x0e59, CXX|C99|DIG }, /* Digits */
+ { 0x0e5a, 0x0e5b, CXX|C99 }, /* Thai */
+ { 0x0e81, 0x0e82, CXX|C99 }, /* Lao */
+ { 0x0e84, 0x0e84, CXX|C99 },
+ { 0x0e87, 0x0e88, CXX|C99 },
+ { 0x0e8a, 0x0e8a, CXX|C99 },
+ { 0x0e8d, 0x0e8d, CXX|C99 },
+ { 0x0e94, 0x0e97, CXX|C99 },
+ { 0x0e99, 0x0e9f, CXX|C99 },
+ { 0x0ea1, 0x0ea3, CXX|C99 },
+ { 0x0ea5, 0x0ea5, CXX|C99 },
+ { 0x0ea7, 0x0ea7, CXX|C99 },
+ { 0x0eaa, 0x0eab, CXX|C99 },
+ { 0x0ead, 0x0eae, CXX|C99 },
+ { 0x0eaf, 0x0eaf, CXX },
+ { 0x0eb0, 0x0eb0, CXX|C99 },
+ { 0x0eb1, 0x0eb1, C99 },
+ { 0x0eb2, 0x0eb3, CXX|C99 },
+ { 0x0eb4, 0x0eb9, C99 },
+ { 0x0ebb, 0x0ebc, C99 },
+ { 0x0ebd, 0x0ebd, CXX|C99 },
+ { 0x0ec0, 0x0ec4, CXX|C99 },
+ { 0x0ec6, 0x0ec6, CXX|C99 },
+ { 0x0ec8, 0x0ecd, C99 },
+ { 0x0ed0, 0x0ed9, C99|DIG }, /* Digits */
+ { 0x0edc, 0x0edd, C99 }, /* Lao */
+ { 0x0f00, 0x0f00, C99 }, /* Tibetan */
+ { 0x0f18, 0x0f19, C99 },
+ { 0x0f20, 0x0f33, C99|DIG }, /* Digits */
+ { 0x0f35, 0x0f35, C99 }, /* Tibetan */
+ { 0x0f37, 0x0f37, C99 },
+ { 0x0f39, 0x0f39, C99 },
+ { 0x0f3e, 0x0f47, C99 },
+ { 0x0f49, 0x0f69, C99 },
+ { 0x0f71, 0x0f84, C99 },
+ { 0x0f86, 0x0f8b, C99 },
+ { 0x0f90, 0x0f95, C99 },
+ { 0x0f97, 0x0f97, C99 },
+ { 0x0f99, 0x0fad, C99 },
+ { 0x0fb1, 0x0fb7, C99 },
+ { 0x0fb9, 0x0fb9, C99 },
+ { 0x10a0, 0x10c5, CXX|C99 }, /* Georgian */
+ { 0x10d0, 0x10f6, CXX|C99 },
+ { 0x1100, 0x1159, CXX }, /* Hangul */
+ { 0x1161, 0x11a2, CXX },
+ { 0x11a8, 0x11f9, CXX },
+ { 0x1e00, 0x1e9a, CXX|C99 }, /* Latin */
+ { 0x1e9b, 0x1e9b, C99 },
+ { 0x1ea0, 0x1ef9, CXX|C99 },
+ { 0x1f00, 0x1f15, CXX|C99 }, /* Greek */
+ { 0x1f18, 0x1f1d, CXX|C99 },
+ { 0x1f20, 0x1f45, CXX|C99 },
+ { 0x1f48, 0x1f4d, CXX|C99 },
+ { 0x1f50, 0x1f57, CXX|C99 },
+ { 0x1f59, 0x1f59, CXX|C99 },
+ { 0x1f5b, 0x1f5b, CXX|C99 },
+ { 0x1f5d, 0x1f5d, CXX|C99 },
+ { 0x1f5f, 0x1f7d, CXX|C99 },
+ { 0x1f80, 0x1fb4, CXX|C99 },
+ { 0x1fb6, 0x1fbc, CXX|C99 },
+ { 0x1fbe, 0x1fbe, C99 }, /* Special characters */
+ { 0x1fc2, 0x1fc4, CXX|C99 }, /* Greek */
+ { 0x1fc6, 0x1fcc, CXX|C99 },
+ { 0x1fd0, 0x1fd3, CXX|C99 },
+ { 0x1fd6, 0x1fdb, CXX|C99 },
+ { 0x1fe0, 0x1fec, CXX|C99 },
+ { 0x1ff2, 0x1ff4, CXX|C99 },
+ { 0x1ff6, 0x1ffc, CXX|C99 },
+ { 0x203f, 0x2040, C99 }, /* Special characters */
+ { 0x207f, 0x207f, C99 }, /* Latin */
+ { 0x2102, 0x2102, C99 }, /* Special characters */
+ { 0x2107, 0x2107, C99 },
+ { 0x210a, 0x2113, C99 },
+ { 0x2115, 0x2115, C99 },
+ { 0x2118, 0x211d, C99 },
+ { 0x2124, 0x2124, C99 },
+ { 0x2126, 0x2126, C99 },
+ { 0x2128, 0x2128, C99 },
+ { 0x212a, 0x2131, C99 },
+ { 0x2133, 0x2138, C99 },
+ { 0x2160, 0x2182, C99 },
+ { 0x3005, 0x3007, C99 },
+ { 0x3021, 0x3029, C99 },
+ { 0x3041, 0x3093, CXX|C99 }, /* Hiragana */
+ { 0x3094, 0x3094, CXX },
+ { 0x309b, 0x309c, CXX|C99 },
+ { 0x309d, 0x309e, CXX },
+ { 0x30a1, 0x30f6, CXX|C99 }, /* Katakana */
+ { 0x30f7, 0x30fa, CXX },
+ { 0x30fb, 0x30fc, CXX|C99 },
+ { 0x30fd, 0x30fe, CXX },
+ { 0x3105, 0x312c, CXX|C99 }, /* Bopomofo */
+ { 0x4e00, 0x9fa5, CXX|C99 }, /* CJK Unified Ideographs */
+ { 0xac00, 0xd7a3, C99 }, /* Hangul */
+ { 0xf900, 0xfa2d, CXX }, /* CJK Unified Ideographs */
+ { 0xfb1f, 0xfb36, CXX },
+ { 0xfb38, 0xfb3c, CXX },
+ { 0xfb3e, 0xfb3e, CXX },
+ { 0xfb40, 0xfb44, CXX },
+ { 0xfb46, 0xfbb1, CXX },
+ { 0xfbd3, 0xfd3f, CXX },
+ { 0xfd50, 0xfd8f, CXX },
+ { 0xfd92, 0xfdc7, CXX },
+ { 0xfdf0, 0xfdfb, CXX },
+ { 0xfe70, 0xfe72, CXX },
+ { 0xfe74, 0xfe74, CXX },
+ { 0xfe76, 0xfefc, CXX },
+ { 0xff21, 0xff3a, CXX },
+ { 0xff41, 0xff5a, CXX },
+ { 0xff66, 0xffbe, CXX },
+ { 0xffc2, 0xffc7, CXX },
+ { 0xffca, 0xffcf, CXX },
+ { 0xffd2, 0xffd7, CXX },
+ { 0xffda, 0xffdc, CXX },
+};
+
+#endif /* cppucnid.h */
diff --git a/gcc/cppucnid.pl b/gcc/cppucnid.pl
new file mode 100644
index 00000000000..eb8bbcac627
--- /dev/null
+++ b/gcc/cppucnid.pl
@@ -0,0 +1,130 @@
+#! /usr/bin/perl -w
+use strict;
+
+# Convert cppucnid.tab to cppucnid.h. We use two arrays of length
+# 65536 to represent the table, since this is nice and simple. The
+# first array holds the tags indicating which ranges are valid in
+# which contexts. The second array holds the language name associated
+# with each element.
+
+our(@tags, @names);
+@tags = ("") x 65536;
+@names = ("") x 65536;
+
+
+# Array mapping tag numbers to standard #defines
+our @stds;
+
+# Current standard and language
+our($curstd, $curlang);
+
+# First block of the file (up to a line consisting of %%) is a template to be saved for later.
+our @template;
+
+while (<>) {
+ chomp;
+ last if $_ eq '%%'; # separator line: template ends, table section follows
+ push @template, $_;
+};
+
+# Second block of the file is the UCN tables.
+# The format looks like this:
+#
+# [std]
+#
+# ; language
+# xxxx-xxxx xxxx xxxx-xxxx ....
+#
+# with comment lines starting with #.
+
+while (<>) {
+ chomp;
+ /^#/ and next;
+ /^\s*$/ and next;
+ /^\[(.+)\]$/ and do {
+ $curstd = $1; # tag text applied to every range until the next [std] line
+ next;
+ };
+ /^; (.+)$/ and do {
+ $curlang = $1; # language name recorded for the ranges that follow
+ next;
+ };
+
+ process_range(split); # any other line is a whitespace-separated list of ranges
+}
+
+# Print out the template, inserting as requested.
+$\ = "\n"; # print appends a newline to each template line; printf in print_table supplies its own
+for (@template) {
+ print("/* Automatically generated from cppucnid.tab, do not edit */"),
+ next if $_ eq "[dne]"; # [dne] placeholder line is replaced by the banner above
+ print_table(), next if $_ eq "[table]"; # [table] placeholder line is replaced by the generated rows
+ print;
+}
+
+# Write the rows of the ucnranges initializer.  @tags and @names are
+# scanned for maximal runs of consecutive code points that share both
+# tag and language name.  Each run with a non-empty tag becomes one
+# row; a row carries a trailing comment naming its language whenever
+# that name differs from the previously printed row's.
+sub print_table {
+    my $last_lang = "";
+    my $count = scalar(@tags);
+    my $first = 0;
+
+    while ($first < $count) {
+        my $run_tag = $tags[$first];
+        my $run_lang = $names[$first];
+
+        # Find the index one past the end of the run starting at $first.
+        my $past = $first + 1;
+        while ($past < $count
+               && $tags[$past] eq $run_tag
+               && $names[$past] eq $run_lang) {
+            $past++;
+        }
+
+        # Runs with an empty tag are not valid idchars: no row for them,
+        # and they do not reset the remembered language name.
+        if ($run_tag ne "") {
+            # Tags beginning with C99 get a leading pad (presumably so
+            # the flags column lines up with the CXX| entries).
+            my $cell = ($run_tag =~ /^C99/) ? " " . $run_tag : $run_tag;
+
+            if ($run_lang eq $last_lang) {
+                printf(" { 0x%04x, 0x%04x, %-11s },\n",
+                       $first, $past - 1, $cell);
+            } else {
+                printf(" { 0x%04x, 0x%04x, %-11s }, /* %s */\n",
+                       $first, $past - 1, $cell, $run_lang);
+            }
+            $last_lang = $run_lang;
+        }
+
+        $first = $past;
+    }
+}
+
+# Record code point I as a valid identifier character under the
+# current standard ($curstd) and language ($curlang).  The standard is
+# prepended to any tag already stored for I.  If I was already claimed
+# by a different language, warn and keep the earlier language name.
+sub mark_codepoint {
+    my ($i) = @_;
+
+    if ($tags[$i] eq "") {
+        $tags[$i] = $curstd;
+    } else {
+        $tags[$i] = $curstd . "|" . $tags[$i];
+    }
+
+    if ($names[$i] ne "" && $names[$i] ne $curlang) {
+        # The tag is a string such as "CXX|C99", so it is formatted
+        # with %s; %d would print 0 and raise a "not numeric" warning.
+        warn sprintf ("language overlap: %s/%s at %x (tag %s)",
+                      $names[$i], $curlang, $i, $tags[$i]);
+        return;
+    }
+    $names[$i] = $curlang;
+}
+
+# Each argument is a four-digit lowercase hexadecimal number or a
+# pair of such numbers joined by '-'.  Each number, and every number
+# in each inclusive range, is a valid identifier character from the
+# current language, under the current standard.  Arguments that do
+# not match either form, and ranges whose start exceeds their end
+# (which would otherwise be silently ignored), produce a warning.
+sub process_range {
+    for my $range (@_) {
+        if ($range =~ /^[0-9a-f]{4}$/) {
+            mark_codepoint(hex($range));
+        } elsif ($range =~ /^ ([0-9a-f]{4}) - ([0-9a-f]{4}) $/x) {
+            my ($start, $end) = (hex($1), hex($2));
+            if ($start > $end) {
+                # An empty range is almost certainly a typo in the table.
+                warn "reversed range expression $range";
+                next;
+            }
+            for my $i ($start .. $end) {
+                mark_codepoint($i);
+            }
+        } else {
+            warn "malformed range expression $range";
+        }
+    }
+}
diff --git a/gcc/cppucnid.tab b/gcc/cppucnid.tab
new file mode 100644
index 00000000000..4a7a0f4094c
--- /dev/null
+++ b/gcc/cppucnid.tab
@@ -0,0 +1,239 @@
+/* Table of UCNs which are valid in identifiers.
+ Copyright (C) 2003 Free Software Foundation, Inc.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
+
+[dne]
+
+/* This file reproduces the table in ISO/IEC 9899:1999 (C99) Annex
+ D, which is itself a reproduction from ISO/IEC TR 10176:1998, and
+ the similar table from ISO/IEC 14882:1998 (C++98) Annex E, which is
+ a reproduction of ISO/IEC PDTR 10176. Unfortunately these tables
+ are not identical. */
+
+#ifndef CPPUCNID_H
+#define CPPUCNID_H
+
+#define C99 1
+#define CXX 2
+#define DIG 4
+
+struct ucnrange
+{
+ unsigned short lo, hi;
+ unsigned short flags;
+};
+
+static const struct ucnrange ucnranges[] = {
+[table]
+};
+
+#endif /* cppucnid.h */
+%%
+
+[C99]
+
+; Latin
+00aa 00ba 00c0-00d6 00d8-00f6 00f8-01f5 01fa-0217 0250-02a8 1e00-1e9b
+1ea0-1ef9 207f
+
+; Greek
+0386 0388-038a 038c 038e-03a1 03a3-03ce 03d0-03d6 03da 03dc 03de 03e0
+03e2-03f3 1f00-1f15 1f18-1f1d 1f20-1f45 1f48-1f4d 1f50-1f57 1f59 1f5b
+1f5d 1f5f-1f7d 1f80-1fb4 1fb6-1fbc 1fc2-1fc4 1fc6-1fcc 1fd0-1fd3
+1fd6-1fdb 1fe0-1fec 1ff2-1ff4 1ff6-1ffc
+
+; Cyrillic
+0401-040c 040e-044f 0451-045c 045e-0481 0490-04c4 04c7-04c8 04cb-04cc
+04d0-04eb 04ee-04f5 04f8-04f9
+
+; Armenian
+0531-0556 0561-0587
+
+; Hebrew
+05b0-05b9 05bb-05bd 05bf 05c1-05c2 05d0-05ea 05f0-05f2
+
+; Arabic
+0621-063a 0640-0652 0670-06b7 06ba-06be 06c0-06ce 06d0-06dc 06e5-06e8
+06ea-06ed
+
+; Devanagari
+0901-0903 0905-0939 093e-094d 0950-0952 0958-0963
+
+; Bengali
+0981-0983 0985-098c 098f-0990 0993-09a8 09aa-09b0 09b2 09b6-09b9
+09be-09c4 09c7-09c8 09cb-09cd 09dc-09dd 09df-09e3 09f0-09f1
+
+; Gurmukhi
+0a02 0a05-0a0a 0a0f-0a10 0a13-0a28 0a2a-0a30 0a32-0a33 0a35-0a36
+0a38-0a39 0a3e-0a42 0a47-0a48 0a4b-0a4d 0a59-0a5c 0a5e 0a74
+
+; Gujarati
+0a81-0a83 0a85-0a8b 0a8d 0a8f-0a91 0a93-0aa8 0aaa-0ab0 0ab2-0ab3
+0ab5-0ab9 0abd-0ac5 0ac7-0ac9 0acb-0acd 0ad0 0ae0
+
+; Oriya
+0b01-0b03 0b05-0b0c 0b0f-0b10 0b13-0b28 0b2a-0b30 0b32-0b33 0b36-0b39
+0b3e-0b43 0b47-0b48 0b4b-0b4d 0b5c-0b5d 0b5f-0b61
+
+; Tamil
+0b82-0b83 0b85-0b8a 0b8e-0b90 0b92-0b95 0b99-0b9a 0b9c 0b9e-0b9f
+0ba3-0ba4 0ba8-0baa 0bae-0bb5 0bb7-0bb9 0bbe-0bc2 0bc6-0bc8 0bca-0bcd
+
+; Telugu
+0c01-0c03 0c05-0c0c 0c0e-0c10 0c12-0c28 0c2a-0c33 0c35-0c39 0c3e-0c44
+0c46-0c48 0c4a-0c4d 0c60-0c61
+
+; Kannada
+0c82-0c83 0c85-0c8c 0c8e-0c90 0c92-0ca8 0caa-0cb3 0cb5-0cb9 0cbe-0cc4
+0cc6-0cc8 0cca-0ccd 0cde 0ce0-0ce1
+
+; Malayalam
+0d02-0d03 0d05-0d0c 0d0e-0d10 0d12-0d28 0d2a-0d39 0d3e-0d43 0d46-0d48
+0d4a-0d4d 0d60-0d61
+
+# CORRECTION: exclude 0e50-0e59 from the Thai range as it also appears
+# in the Digits range below.
+; Thai
+0e01-0e3a 0e40-0e49 0e5a-0e5b
+
+; Lao
+0e81-0e82 0e84 0e87-0e88 0e8a 0e8d 0e94-0e97 0e99-0e9f 0ea1-0ea3 0ea5
+0ea7 0eaa-0eab 0ead-0eae 0eb0-0eb9 0ebb-0ebd 0ec0-0ec4 0ec6 0ec8-0ecd
+0edc-0edd
+
+; Tibetan
+0f00 0f18-0f19 0f35 0f37 0f39 0f3e-0f47 0f49-0f69 0f71-0f84 0f86-0f8b
+0f90-0f95 0f97 0f99-0fad 0fb1-0fb7 0fb9
+
+; Georgian
+10a0-10c5 10d0-10f6
+
+; Hiragana
+3041-3093 309b-309c
+
+; Katakana
+30a1-30f6 30fb-30fc
+
+; Bopomofo
+3105-312c
+
+; CJK Unified Ideographs
+4e00-9fa5
+
+; Hangul
+ac00-d7a3
+
+; Special characters
+00b5 00b7 02b0-02b8 02bb 02bd-02c1 02d0-02d1 02e0-02e4 037a 0559 093d
+0b3d 1fbe 203f-2040 2102 2107 210a-2113 2115 2118-211d 2124 2126 2128
+212a-2131 2133-2138 2160-2182 3005-3007 3021-3029
+
+[C99|DIG]
+; Digits
+0660-0669 06f0-06f9 0966-096f 09e6-09ef 0a66-0a6f 0ae6-0aef 0b66-0b6f
+0be7-0bef 0c66-0c6f 0ce6-0cef 0d66-0d6f 0e50-0e59 0ed0-0ed9 0f20-0f33
+
+[CXX]
+
+; Latin
+00c0-00d6 00d8-00f6 00f8-01f5 01fa-0217 0250-02a8 1e00-1e9a 1ea0-1ef9
+
+; Greek
+0384 0388-038a 038c 038e-03a1 03a3-03ce 03d0-03d6 03da 03dc 03de 03e0
+03e2-03f3 1f00-1f15 1f18-1f1d 1f20-1f45 1f48-1f4d 1f50-1f57 1f59 1f5b
+1f5d 1f5f-1f7d 1f80-1fb4 1fb6-1fbc 1fc2-1fc4 1fc6-1fcc 1fd0-1fd3
+1fd6-1fdb 1fe0-1fec 1ff2-1ff4 1ff6-1ffc
+
+; Cyrillic
+0401-040d 040f-044f 0451-045c 045e-0481 0490-04c4 04c7-04c8 04cb-04cc
+04d0-04eb 04ee-04f5 04f8-04f9
+
+; Armenian
+0531-0556 0561-0587
+
+; Hebrew
+05d0-05ea 05f0-05f4
+
+; Arabic
+0621-063a 0640-0652 0670-06b7 06ba-06be 06c0-06ce 06e5-06e7
+
+; Devanagari
+0905-0939 0958-0962
+
+; Bengali
+0985-098c 098f-0990 0993-09a8 09aa-09b0 09b2 09b6-09b9 09dc-09dd
+09df-09e1 09f0-09f1
+
+; Gurmukhi
+0a05-0a0a 0a0f-0a10 0a13-0a28 0a2a-0a30 0a32-0a33 0a35-0a36 0a38-0a39
+0a59-0a5c 0a5e
+
+; Gujarati
+0a85-0a8b 0a8d 0a8f-0a91 0a93-0aa8 0aaa-0ab0 0ab2-0ab3 0ab5-0ab9 0ae0
+
+; Oriya
+0b05-0b0c 0b0f-0b10 0b13-0b28 0b2a-0b30 0b32-0b33 0b36-0b39 0b5c-0b5d
+0b5f-0b61
+
+; Tamil
+0b85-0b8a 0b8e-0b90 0b92-0b95 0b99-0b9a 0b9c 0b9e-0b9f 0ba3-0ba4
+0ba8-0baa 0bae-0bb5 0bb7-0bb9
+
+; Telugu
+0c05-0c0c 0c0e-0c10 0c12-0c28 0c2a-0c33 0c35-0c39 0c60-0c61
+
+; Kannada
+0c85-0c8c 0c8e-0c90 0c92-0ca8 0caa-0cb3 0cb5-0cb9 0ce0-0ce1
+
+; Malayalam
+0d05-0d0c 0d0e-0d10 0d12-0d28 0d2a-0d39 0d60-0d61
+
+# CORRECTION: Exclude 0e50-0e59 from the Thai range and make a fake
+# Digits range for it, to match C99. cppcharset.c knows that C++
+# doesn't distinguish digits from other UCNs valid in identifiers.
+; Thai
+0e01-0e30 0e32-0e33 0e40-0e46 0e4f-0e49 0e5a-0e5b
+
+; Digits
+0e50-0e59
+
+# CORRECTION: Change 0e0d to 0e8d (typo in standard; see C++ DR 131)
+; Lao
+0e81-0e82 0e84 0e87-0e88 0e8a 0e8d 0e94-0e97 0e99-0e9f 0ea1-0ea3 0ea5
+0ea7 0eaa-0eab 0ead-0eb0 0eb2 0eb3 0ebd 0ec0-0ec4 0ec6
+
+; Georgian
+10a0-10c5 10d0-10f6
+
+; Hiragana
+3041-3094 309b-309e
+
+; Katakana
+30a1-30fe
+
+# CORRECTION: language spelled "Bopmofo" in C++98.
+; Bopomofo
+3105-312c
+
+; Hangul
+1100-1159 1161-11a2 11a8-11f9
+
+; CJK Unified Ideographs
+f900-fa2d fb1f-fb36 fb38-fb3c fb3e fb40-fb41 fb42-fb44 fb46-fbb1
+fbd3-fd3f fd50-fd8f fd92-fdc7 fdf0-fdfb fe70-fe72 fe74 fe76-fefc
+ff21-ff3a ff41-ff5a ff66-ffbe ffc2-ffc7 ffca-ffcf ffd2-ffd7
+ffda-ffdc 4e00-9fa5
+
diff --git a/gcc/doc/cpp.texi b/gcc/doc/cpp.texi
index 4a41a462a18..c043b88b9ee 100644
--- a/gcc/doc/cpp.texi
+++ b/gcc/doc/cpp.texi
@@ -104,6 +104,7 @@ useful on its own.
Overview
+* Character sets::
* Initial processing::
* Tokenization::
* The preprocessing language::
@@ -233,11 +234,62 @@ manual refer to GNU CPP.
@c man end
@menu
+* Character sets::
* Initial processing::
* Tokenization::
* The preprocessing language::
@end menu
+@node Character sets
+@section Character sets
+
+Source code character set processing in C and related languages is
+rather complicated. The C standard discusses two character sets, but
+there are really at least four.
+
+The files input to CPP might be in any character set at all. CPP's
+very first action, before it even looks for line boundaries, is to
+convert the file into the character set it uses for internal
+processing. That set is what the C standard calls the @dfn{source}
+character set. It must be isomorphic with ISO 10646, also known as
+Unicode. CPP uses the UTF-8 encoding of Unicode.
+
+At present, GNU CPP does not implement conversion from arbitrary file
+encodings to the source character set. Use of any encoding other than
+plain ASCII or UTF-8, except in comments, will cause errors. Use of
+encodings that are not strict supersets of ASCII, such as Shift JIS,
+may cause errors even if non-ASCII characters appear only in comments.
+We plan to fix this in the near future.
+
+All preprocessing work (the subject of the rest of this manual) is
+carried out in the source character set. If you request textual
+output from the preprocessor with the @option{-E} option, it will be
+in UTF-8.
+
+After preprocessing is complete, string and character constants are
+converted again, into the @dfn{execution} character set. This
+character set is under control of the user; the default is UTF-8,
+matching the source character set. Wide string and character
+constants have their own character set, which is not called out
+specifically in the standard. Again, it is under control of the user.
+The default is UTF-16 or UTF-32, whichever fits in the target's
+@code{wchar_t} type, in the target machine's byte
+order.@footnote{UTF-16 does not meet the requirements of the C
+standard for a wide character set, but the choice of 16-bit
+@code{wchar_t} is enshrined in some system ABIs so we cannot fix
+this.} Octal and hexadecimal escape sequences do not undergo
+conversion; @t{'\x12'} has the value 0x12 regardless of the currently
+selected execution character set. All other escapes are replaced by
+the character in the source character set that they represent, then
+converted to the execution character set, just like unescaped
+characters.
+
+GCC does not permit the use of characters outside the ASCII range, nor
+@samp{\u} and @samp{\U} escapes, in identifiers. We hope this will
+change eventually, but there are problems with the standard semantics
+of such ``extended identifiers'' which must be resolved through the
+ISO C and C++ committees first.
+
@node Initial processing
@section Initial processing
@@ -251,27 +303,19 @@ standard.
@enumerate
@item
-@cindex character sets
@cindex line endings
The input file is read into memory and broken into lines.
-CPP expects its input to be a text file, that is, an unstructured
-stream of ASCII characters, with some characters indicating the end of a
-line of text. Extended ASCII character sets, such as ISO Latin-1 or
-Unicode encoded in UTF-8, are also acceptable. Character sets that are
-not strict supersets of seven-bit ASCII will not work. We plan to add
-complete support for international character sets in a future release.
-
Different systems use different conventions to indicate the end of a
line. GCC accepts the ASCII control sequences @kbd{LF}, @kbd{@w{CR
-LF}} and @kbd{CR} as end-of-line markers. These
-are the canonical sequences used by Unix, DOS and VMS, and the
-classic Mac OS (before OSX) respectively. You may therefore safely copy
-source code written on any of those systems to a different one and use
-it without conversion. (GCC may lose track of the current line number
-if a file doesn't consistently use one convention, as sometimes happens
-when it is edited on computers with different conventions that share a
-network file system.)
+LF}} and @kbd{CR} as end-of-line markers. These are the canonical
+sequences used by Unix, DOS and VMS, and the classic Mac OS (before
+OSX) respectively. You may therefore safely copy source code written
+on any of those systems to a different one and use it without
+conversion. (GCC may lose track of the current line number if a file
+doesn't consistently use one convention, as sometimes happens when it
+is edited on computers with different conventions that share a network
+file system.)
If the last line of any input file lacks an end-of-line marker, the end
of the file is considered to implicitly supply one. The C standard says
@@ -378,8 +422,9 @@ comment.
@end group
@end example
-Comments are not recognized within string literals. @t{@w{"/* blah
-*/"}} is the string constant @samp{@w{/* blah */}}, not an empty string.
+Comments are not recognized within string literals.
+@t{@w{"/* blah */"}} is the string constant @samp{@w{/* blah */}}, not
+an empty string.
Line comments are not in the 1989 edition of the C standard, but they
are recognized by GCC as an extension. In C++ and in the 1999 edition
@@ -3706,8 +3751,9 @@ and stick to it.
@item The mapping of physical source file multi-byte characters to the
execution character set.
-Currently, GNU cpp only supports character sets that are strict supersets
-of ASCII, and performs no translation of characters.
+Currently, CPP requires its input to be ASCII or UTF-8. The execution
+character set may be controlled by the user, with the
+@code{-fexec-charset} and @code{-fwide-exec-charset} options.
@item Identifier characters.
@anchor{Identifier characters}
diff --git a/gcc/doc/cppopts.texi b/gcc/doc/cppopts.texi
index 70116399dc7..8096763e6ed 100644
--- a/gcc/doc/cppopts.texi
+++ b/gcc/doc/cppopts.texi
@@ -498,6 +498,21 @@ correct column numbers in warnings or errors, even if tabs appear on the
line. If the value is less than 1 or greater than 100, the option is
ignored. The default is 8.
+@item -fexec-charset=@var{charset}
+@opindex fexec-charset
+Set the execution character set, used for string and character
+constants. The default is UTF-8. @var{charset} can be any encoding
+supported by the system's @code{iconv} library routine.
+
+@item -fwide-exec-charset=@var{charset}
+@opindex fwide-exec-charset
+Set the wide execution character set, used for wide string and
+character constants. The default is UTF-32 or UTF-16, whichever
+corresponds to the width of @code{wchar_t}. As with
+@option{-fexec-charset}, @var{charset} can be any encoding supported
+by the system's @code{iconv} library routine; however, you will have
+problems with encodings that do not fit exactly in @code{wchar_t}.
+
@item -fno-show-column
@opindex fno-show-column
Do not print column numbers in diagnostics. This may be necessary if
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index dcefcdc89f1..b8fefede871 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -439,7 +439,6 @@ extensions, accepted by GCC in C89 mode and in C++.
* Empty Structures:: Structures with no members.
* Variadic Macros:: Macros with a variable number of arguments.
* Escaped Newlines:: Slightly looser rules for escaped newlines.
-* Multi-line Strings:: String literals with embedded newlines.
* Subscripting:: Any array can be subscripted, even if not an lvalue.
* Pointer Arith:: Arithmetic on @code{void}-pointers and function pointers.
* Initializers:: Non-constant initializers.
@@ -1529,27 +1528,14 @@ argument, these arguments are not macro expanded.
Recently, the preprocessor has relaxed its treatment of escaped
newlines. Previously, the newline had to immediately follow a
-backslash. The current implementation allows whitespace in the form of
-spaces, horizontal and vertical tabs, and form feeds between the
+backslash. The current implementation allows whitespace in the form
+of spaces, horizontal and vertical tabs, and form feeds between the
backslash and the subsequent newline. The preprocessor issues a
warning, but treats it as a valid escaped newline and combines the two
lines to form a single logical line. This works within comments and
-tokens, including multi-line strings, as well as between tokens.
-Comments are @emph{not} treated as whitespace for the purposes of this
-relaxation, since they have not yet been replaced with spaces.
-
-@node Multi-line Strings
-@section String Literals with Embedded Newlines
-@cindex multi-line string literals
-
-As an extension, GNU CPP permits string literals to cross multiple lines
-without escaping the embedded newlines. Each embedded newline is
-replaced with a single @samp{\n} character in the resulting string
-literal, regardless of what form the newline took originally.
-
-CPP currently allows such strings in directives as well (other than the
-@samp{#include} family). This is deprecated and will eventually be
-removed.
+tokens, as well as between tokens. Comments are @emph{not} treated as
+whitespace for the purposes of this relaxation, since they have not
+yet been replaced with spaces.
@node Subscripting
@section Non-Lvalue Arrays May Have Subscripts
@@ -4437,18 +4423,47 @@ This extension is not supported by GNU C++.
@node Function Names
@section Function Names as Strings
+@cindex @code{__func__} identifier
@cindex @code{__FUNCTION__} identifier
@cindex @code{__PRETTY_FUNCTION__} identifier
-@cindex @code{__func__} identifier
-GCC predefines two magic identifiers to hold the name of the current
-function. The identifier @code{__FUNCTION__} holds the name of the function
-as it appears in the source. The identifier @code{__PRETTY_FUNCTION__}
-holds the name of the function pretty printed in a language specific
-fashion.
+GCC provides three magic variables which hold the name of the current
+function, as a string. The first of these is @code{__func__}, which
+is part of the C99 standard:
+
+@display
+The identifier @code{__func__} is implicitly declared by the translator
+as if, immediately following the opening brace of each function
+definition, the declaration
+
+@smallexample
+static const char __func__[] = "function-name";
+@end smallexample
-These names are always the same in a C function, but in a C++ function
-they may be different. For example, this program:
+appeared, where function-name is the name of the lexically-enclosing
+function. This name is the unadorned name of the function.
+@end display
+
+@code{__FUNCTION__} is another name for @code{__func__}. Older
+versions of GCC recognize only this name. However, it is not
+standardized. For maximum portability, we recommend you use
+@code{__func__}, but provide a fallback definition with the
+preprocessor:
+
+@smallexample
+#if __STDC_VERSION__ < 199901L
+# if __GNUC__ >= 2
+# define __func__ __FUNCTION__
+# else
+# define __func__ "<unknown>"
+# endif
+#endif
+@end smallexample
+
+In C, @code{__PRETTY_FUNCTION__} is yet another name for
+@code{__func__}. However, in C++, @code{__PRETTY_FUNCTION__} contains
+the type signature of the function as well as its bare name. For
+example, this program:
@smallexample
extern "C" @{
@@ -4478,46 +4493,16 @@ gives this output:
@smallexample
__FUNCTION__ = sub
-__PRETTY_FUNCTION__ = int a::sub (int)
-@end smallexample
-
-The compiler automagically replaces the identifiers with a string
-literal containing the appropriate name. Thus, they are neither
-preprocessor macros, like @code{__FILE__} and @code{__LINE__}, nor
-variables. This means that they catenate with other string literals, and
-that they can be used to initialize char arrays. For example
-
-@smallexample
-char here[] = "Function " __FUNCTION__ " in " __FILE__;
+__PRETTY_FUNCTION__ = void a::sub(int)
@end smallexample
-On the other hand, @samp{#ifdef __FUNCTION__} does not have any special
-meaning inside a function, since the preprocessor does not do anything
-special with the identifier @code{__FUNCTION__}.
-
-Note that these semantics are deprecated, and that GCC 3.2 will handle
-@code{__FUNCTION__} and @code{__PRETTY_FUNCTION__} the same way as
-@code{__func__}. @code{__func__} is defined by the ISO standard C99:
-
-@display
-The identifier @code{__func__} is implicitly declared by the translator
-as if, immediately following the opening brace of each function
-definition, the declaration
-
-@smallexample
-static const char __func__[] = "function-name";
-@end smallexample
-
-appeared, where function-name is the name of the lexically-enclosing
-function. This name is the unadorned name of the function.
-@end display
-
-By this definition, @code{__func__} is a variable, not a string literal.
-In particular, @code{__func__} does not catenate with other string
-literals.
-
-In @code{C++}, @code{__FUNCTION__} and @code{__PRETTY_FUNCTION__} are
-variables, declared in the same way as @code{__func__}.
+These identifiers are not preprocessor macros. In GCC 3.3 and
+earlier, in C only, @code{__FUNCTION__} and @code{__PRETTY_FUNCTION__}
+were treated as string literals; they could be used to initialize
+@code{char} arrays, and they could be concatenated with other string
+literals. GCC 3.4 and later treat them as variables, like
+@code{__func__}. In C++, @code{__FUNCTION__} and
+@code{__PRETTY_FUNCTION__} have always been variables.
@node Return Address
@section Getting the Return or Frame Address of a Function
diff --git a/gcc/objc/objc-act.c b/gcc/objc/objc-act.c
index d52e62ee75f..3d78b099941 100644
--- a/gcc/objc/objc-act.c
+++ b/gcc/objc/objc-act.c
@@ -1274,18 +1274,18 @@ my_build_string (len, str)
return fix_string_type (build_string (len, str));
}
-/* Given a chain of STRING_CST's, build a static instance of
- NXConstantString which points at the concatenation of those strings.
+/* Build a static instance of NXConstantString which points at the
+ string constant STRING.
We place the string object in the __string_objects section of the
__OBJC segment. The Objective-C runtime will initialize the isa
pointers of the string objects to point at the NXConstantString
class object. */
tree
-build_objc_string_object (strings)
- tree strings;
+build_objc_string_object (string)
+ tree string;
{
- tree string, initlist, constructor;
+ tree initlist, constructor;
int length;
if (lookup_interface (constant_string_id) == NULL_TREE)
@@ -1297,22 +1297,6 @@ build_objc_string_object (strings)
add_class_reference (constant_string_id);
- if (TREE_CHAIN (strings))
- {
- varray_type vstrings;
- VARRAY_TREE_INIT (vstrings, 32, "strings");
-
- for (; strings ; strings = TREE_CHAIN (strings))
- VARRAY_PUSH_TREE (vstrings, strings);
-
- string = combine_strings (vstrings);
- }
- else
- string = strings;
-
- string = fix_string_type (string);
-
- TREE_SET_CODE (string, STRING_CST);
length = TREE_STRING_LENGTH (string) - 1;
/* We could not properly create NXConstantString in synth_module_prologue,
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 3079c344bbf..d67d0b74693 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,17 @@
+2003-07-04 Zack Weinberg <zack@codesourcery.com>
+
+ * gcc.c-torture/execute/wchar_t-1.x: New file; XFAIL wchar_t-1.c
+ everywhere.
+ * gcc.dg/concat.c: Concatenation of string constants with
+ __FUNCTION__ / __PRETTY_FUNCTION__ is now a hard error.
+ * gcc.dg/wtr-strcat-1.c: Loosen dg-warning regexp.
+ * gcc.dg/cpp/escape-2.c: Use wide character constants where
+ necessary to avoid multi-character character constant warning.
+ * gcc.dg/cpp/escape.c: Likewise.
+ * gcc.dg/cpp/ucs.c: Likewise.
+ Remove backslashes from dg-bogus comments, as they confuse Tcl.
+ Fix a typo.
+
2003-07-04 Kazu Hirata <kazu@cs.umass.edu>
PR c/11428
@@ -117,7 +131,7 @@
PR c++/10219
* g++.dg/template/error1.C: New.
-
+
PR c++/9779
* g++.dg/template/dependent-expr1.C: New.
@@ -131,8 +145,8 @@
2003-07-01 Giovanni Bajo <giovannibajo@libero.it>
- PR c++/8046
- * g++.dg/other/error7.C: New test.
+ PR c++/8046
+ * g++.dg/other/error7.C: New test.
2003-07-01 Kazu Hirata <kazu@cs.umass.edu>
@@ -164,22 +178,22 @@
2003-06-30 Giovanni Bajo <giovannibajo@libero.it>
- PR c++/4933
- * g++.dg/template/sizeof4.C: New test.
+ PR c++/4933
+ * g++.dg/template/sizeof4.C: New test.
2003-06-30 Giovanni Bajo <giovannibajo@libero.it>
- * g++.dg/other/error6.C: New test.
+ * g++.dg/other/error6.C: New test.
2003-06-30 Giovanni Bajo <giovannibajo@libero.it>
- PR c++/10750
- * g++.dg/parse/constant2.C: New test.
+ PR c++/10750
+ * g++.dg/parse/constant2.C: New test.
2003-06-30 Giovanni Bajo <giovannibajo@libero.it>
- PR c++/11106
- * g++.dg/other/error5.C: New test.
+ PR c++/11106
+ * g++.dg/other/error5.C: New test.
2003-06-29 Kaveh R. Ghazi <ghazi@caip.rutgers.edu>
@@ -238,8 +252,8 @@
2003-06-26 Giovanni Bajo <giovannibajo@libero.it>
- PR c++/8266
- * g++.dg/template/explicit-instantiation3.C: New test.
+ PR c++/8266
+ * g++.dg/template/explicit-instantiation3.C: New test.
2003-06-26 Eric Botcazou <ebotcazou@libertysurf.fr>
@@ -263,7 +277,7 @@
PR c++/10931
* g++.dg/expr/static_cast1.C: New test.
-
+
2003-06-25 Josef Zlomek <zlomekj@suse.cz>
* gcc.dg/20030625-1.c: New test.
@@ -295,10 +309,10 @@
2003-06-21 Gabriel Dos Reis <gdr@integrable-solutions.net>
- * g++.old-deja/g++.benjamin/16077.C: Add -Wconversion option.
- * g++.old-deja/g++.other/conv7.C: Likewise
- * g++.old-deja/g++.other/overcnv2.C: Likewise.
- * g++.old-deja/g++.other/overload14.C: Likewise.
+ * g++.old-deja/g++.benjamin/16077.C: Add -Wconversion option.
+ * g++.old-deja/g++.other/conv7.C: Likewise
+ * g++.old-deja/g++.other/overcnv2.C: Likewise.
+ * g++.old-deja/g++.other/overload14.C: Likewise.
2003-06-21 Gabriel Dos Reis <gdr@integrable-solutions.net>
@@ -308,7 +322,7 @@
2003-06-20 Mark Mitchell <mark@codesourcery.com>
PR c++/10888
- * g++.dg/warn/Winline-3.C: New test.
+ * g++.dg/warn/Winline-3.C: New test.
2003-06-20 Mark Mitchell <mark@codesourcery.com>
@@ -319,8 +333,8 @@
* g++.dg/template/memclass2.C: New test.
2003-06-20 Mark Mitchell <mark@codesourcery.com>
- Eric Botcazou <ebotcazou@libertysurf.fr>
-
+ Eric Botcazou <ebotcazou@libertysurf.fr>
+
* lib/gcc-dg.exp (dg-xfail-if): Do not process conditional xfail
data for non-matching targets.
* gcc.c-torture/compile/simd-5.c: Fix typo in conditional xfail.
@@ -349,7 +363,7 @@
* g++.dg/anew2.C: New test.
* g++.dg/anew3.C: New test.
* g++.dg/anew4.C: New test.
-
+
2003-06-19 Kazu Hirata <kazu@cs.umass.edu>
* gcc.c-torture/compile/simd-5.c: Don't XFAIL on H8.
@@ -363,7 +377,7 @@
* lib/g++.exp (g++_include_flags): Tweak path to testsuite_flags.
Remove cruft.
-
+
2003-06-17 Kazu Hirata <kazu@cs.umass.edu>
* gcc.c-torture/compile/20020604-1.c: Use dg-xfail-if for h8300.
@@ -374,7 +388,7 @@
PR c++/10712
* g++.dg/lookup/using7.C: New test.
-
+
2003-06-17 Mark Mitchell <mark@codesourcery.com>
PR c++/11105
@@ -444,7 +458,7 @@
(dg-xfail-if): Likewise.
* gcc.c-torture/compile/compile.exp: Use dg rather than c-torture
driver.
-
+
* gcc.c-torture/compile/20000804-1.c: Convert to dg format.
* gcc.c-torture/compile/20001205-1.c: Likewise.
* gcc.c-torture/compile/20001226-1.c: Likewise.
@@ -775,7 +789,7 @@ Sun Jun 8 16:46:04 CEST 2003 Jan Hubicka <jh@suse.cz>
* lib/g++.exp (additional_sources): Remove.
(additional_files): Likewise.
(g++_target_compile): Use dg-additional-files-options.
-
+
* gcc.dg/special/special.exp: Add "ecos" tests. Remove complex
Tcl logic.
* gcc.dg/special/ecos.exp: Remove.
@@ -787,7 +801,7 @@ Sun Jun 8 16:46:04 CEST 2003 Jan Hubicka <jh@suse.cz>
* gcc.dg/special/weak-2.c: Likewise.
* gcc.dg/special/wkali-1.c: Likewise.
* gcc.dg/special/wkali-2.c: Likewise.
-
+
* g++.dg/special/conpr-2.C: Use dg-additional-*, not
dg-gpp-additional-*.
* g++.dg/special/conpr-3.C: Likewise.
@@ -819,13 +833,13 @@ Sun Jun 8 16:46:04 CEST 2003 Jan Hubicka <jh@suse.cz>
2003-06-04 Richard Henderson <rth@redhat.com>
- * gcc.dg/cleanup-1.c: New.
- * gcc.dg/cleanup-2.c: New.
- * gcc.dg/cleanup-3.c: New.
- * gcc.dg/cleanup-4.c: New.
- * gcc.dg/cleanup-5.c: New.
- * gcc.dg/cleanup-6.c: New.
- * gcc.dg/cleanup-7.c: New.
+ * gcc.dg/cleanup-1.c: New.
+ * gcc.dg/cleanup-2.c: New.
+ * gcc.dg/cleanup-3.c: New.
+ * gcc.dg/cleanup-4.c: New.
+ * gcc.dg/cleanup-5.c: New.
+ * gcc.dg/cleanup-6.c: New.
+ * gcc.dg/cleanup-7.c: New.
2003-06-04 Mark Mitchell <mark@codesourcery.com>
@@ -866,7 +880,7 @@ Sun Jun 8 16:46:04 CEST 2003 Jan Hubicka <jh@suse.cz>
2003-06-03 Aldy Hernandez <aldyh@redhat.com>
- * gcc.c-torture/compile/simd-5.x: Remove xfail for PPC64.
+ * gcc.c-torture/compile/simd-5.x: Remove xfail for PPC64.
2003-06-03 Kriang Lerdsuwanakij <lerdsuwa@users.sourceforge.net>
@@ -934,7 +948,7 @@ Sun Jun 8 16:46:04 CEST 2003 Jan Hubicka <jh@suse.cz>
* gcc.dg/cpp/cpp.exp: Remove scanning of ".cpp" files.
2003-06-01 Loren James Rittle <ljrittle@acm.org>
-
+
* gcc.dg/cpp/redef3.c: New file.
2003-06-01 Eric Botcazou <ebotcazou@libertysurf.fr>
@@ -964,7 +978,7 @@ Sun Jun 8 16:46:04 CEST 2003 Jan Hubicka <jh@suse.cz>
PR fortran/10843
* g77.dg/ffixed-form-1.f: New test
* g77.dg/ffixed-form-2.f: New test
- * g77.dg/ffree-form-2.f: New test - XFAIL pending fix
+ * g77.dg/ffree-form-2.f: New test - XFAIL pending fix
* g77.dg/ffree-form-3.f: New test
2003-05-26 Andreas Tobler <a.tobler@schweiz.ch>
@@ -982,21 +996,21 @@ Sun Jun 8 16:46:04 CEST 2003 Jan Hubicka <jh@suse.cz>
* g++.dg/template/access10.C: New test.
2003-05-24 Eric Botcazou <ebotcazou@libertysurf.fr>
- Kaveh R. Ghazi <ghazi@caip.rutgers.edu>
+ Kaveh R. Ghazi <ghazi@caip.rutgers.edu>
* gcc.c-torture/compile/simd-5.x: XFAIL on SPARC64.
2003-05-24 Andreas Tobler <a.tobler@schweiz.ch>
- * gcc.dg/torture/builtin-noret-1.c: Add dg-option -multiply_defined
- suppress for powerpc-*-darwin*.
+ * gcc.dg/torture/builtin-noret-1.c: Add dg-option -multiply_defined
+ suppress for powerpc-*-darwin*.
* gcc.dg/torture/builtin-noret-2.c: Likewise.
2003-05-24 Andreas Tobler <a.tobler@schweiz.ch>
* gcc.c-torture/execute/builtins/builtins.exp: Add -multiply_defined
suppress option for powerpc-*-darwin*.
-
+
2003-05-23 Roger Sayle <roger@eyesopen.com>
* gcc.dg/builtins-1.c: Add tests for tan and atan.
@@ -1123,7 +1137,7 @@ Sun Jun 8 16:46:04 CEST 2003 Jan Hubicka <jh@suse.cz>
* lib/g++-dg.exp: Use gcc-dg.exp to implement all functionality.
* lib/g77-dg.exp: Likewise.
* lib/obj-dg.exp: Likewise.
-
+
2003-05-16 Jakub Jelinek <jakub@redhat.com>
Merge from gcc-3_2-rhl8-branch:
@@ -1316,13 +1330,13 @@ Sun Jun 8 16:46:04 CEST 2003 Jan Hubicka <jh@suse.cz>
2003-05-07 Richard Henderson <rth@redhat.com>
- PR c++/10570
- * g++.dg/eh/forced1.C: Expect catch-all handlers to run.
- Verify exception_cleanup not called for rethrows.
- * g++.dg/eh/forced2.C: Test that exception_cleanup is called
- when exiting catch block without rethrowing.
- * g++.dg/eh/forced3.C: New.
- * g++.dg/eh/forced4.C: New.
+ PR c++/10570
+ * g++.dg/eh/forced1.C: Expect catch-all handlers to run.
+ Verify exception_cleanup not called for rethrows.
+ * g++.dg/eh/forced2.C: Test that exception_cleanup is called
+ when exiting catch block without rethrowing.
+ * g++.dg/eh/forced3.C: New.
+ * g++.dg/eh/forced4.C: New.
2003-05-07 Aldy Hernandez <aldyh@redhat.com>
diff --git a/gcc/testsuite/gcc.c-torture/execute/wchar_t-1.x b/gcc/testsuite/gcc.c-torture/execute/wchar_t-1.x
new file mode 100644
index 00000000000..38c693d2f00
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/wchar_t-1.x
@@ -0,0 +1,3 @@
+# Doesn't compile due to use of literal ISO8859.1 characters. PR 11439.
+set torture_compile_xfail "*-*-*"
+return 0
diff --git a/gcc/testsuite/gcc.dg/concat.c b/gcc/testsuite/gcc.dg/concat.c
index 0a77b99bb1f..17a80a7e41d 100644
--- a/gcc/testsuite/gcc.dg/concat.c
+++ b/gcc/testsuite/gcc.dg/concat.c
@@ -2,15 +2,15 @@
/* { dg-do compile } */
-/* Test we output a warning for concatenation of artificial strings.
+/* Test we output an error for concatenation of artificial strings.
Neil Booth, 10 Dec 2001. */
void foo ()
{
- char str1[] = __FUNCTION__ "."; /* { dg-warning "deprecated" } */
- char str2[] = __PRETTY_FUNCTION__ ".";/* { dg-warning "deprecated" } */
- char str3[] = "." __FUNCTION__; /* { dg-warning "deprecated" } */
- char str4[] = "." __PRETTY_FUNCTION__;/* { dg-warning "deprecated" } */
- char str5[] = "." "."; /* No warning. */
+ char s1[] = __FUNCTION__"."; /* { dg-error "(parse|syntax|invalid)" } */
+ char s2[] = __PRETTY_FUNCTION__".";/* { dg-error "(parse|syntax|invalid)" } */
+ char s3[] = "."__FUNCTION__; /* { dg-error "(parse|syntax|invalid)" } */
+ char s4[] = "."__PRETTY_FUNCTION__;/* { dg-error "(parse|syntax|invalid)" } */
+ char s5[] = ".""."; /* No error. */
}
diff --git a/gcc/testsuite/gcc.dg/cpp/escape-2.c b/gcc/testsuite/gcc.dg/cpp/escape-2.c
index 31bf882c721..e79fa91cbe9 100644
--- a/gcc/testsuite/gcc.dg/cpp/escape-2.c
+++ b/gcc/testsuite/gcc.dg/cpp/escape-2.c
@@ -10,11 +10,11 @@
#if '\e' /* { dg-warning "non-ISO" "non-ISO \\e" } */
#endif
-#if '\u00a0' /* { dg-bogus "unknown" "\\u is known in C99" } */
+#if L'\u00a0' /* { dg-bogus "unknown" "\\u is known in C99" } */
#endif
void foo ()
{
int c = '\E'; /* { dg-warning "non-ISO" "non-ISO \\E" } */
- c = '\u00a0'; /* { dg-bogus "unknown" "\\u is known in C99" } */
+ c = L'\u00a0'; /* { dg-bogus "unknown" "\\u is known in C99" } */
}
diff --git a/gcc/testsuite/gcc.dg/cpp/escape.c b/gcc/testsuite/gcc.dg/cpp/escape.c
index 44ad4c1d2bf..c9dd44e43e5 100644
--- a/gcc/testsuite/gcc.dg/cpp/escape.c
+++ b/gcc/testsuite/gcc.dg/cpp/escape.c
@@ -13,7 +13,7 @@
#if '\x1a' != 26 /* { dg-warning "traditional" "traditional hex" } */
#error bad hex /* { dg-bogus "bad" "bad hexadecimal evaluation" } */
#endif
-#if '\u' /* { dg-warning "unknown" "\u is unknown in C89" } */
+#if L'\u00a1' /* { dg-warning "only valid" "\u is unknown in C89" } */
#endif
void foo ()
@@ -21,5 +21,5 @@ void foo ()
int c = '\a'; /* { dg-warning "traditional" "traditional bell" } */
c = '\xa1'; /* { dg-warning "traditional" "traditional hex" } */
- c = '\u'; /* { dg-warning "unknown" "\u is unknown in C89" } */
+ c = L'\u00a1'; /* { dg-warning "only valid" "\u is unknown in C89" } */
}
diff --git a/gcc/testsuite/gcc.dg/cpp/ucs.c b/gcc/testsuite/gcc.dg/cpp/ucs.c
index d36e0dc517f..3f3d97edfa5 100644
--- a/gcc/testsuite/gcc.dg/cpp/ucs.c
+++ b/gcc/testsuite/gcc.dg/cpp/ucs.c
@@ -35,12 +35,12 @@
#undef long
#if L'\u1234' != 0x1234
-#error bad short ucs /* { dg-bogus "bad" "bad \u1234 evaluation" } */
+#error bad short ucs /* { dg-bogus "bad" "bad u1234 evaluation" } */
#endif
#if WCHAR_MAX >= 0x7ffffff
# if L'\U1234abcd' != 0x1234abcd
-# error bad long ucs /* { dg-bogus "bad" "bad \U1234abcd evaluation" } */
+# error bad long ucs /* { dg-bogus "bad" "bad U1234abcd evaluation" } */
# endif
#endif
@@ -48,7 +48,7 @@ void foo ()
{
int c;
- c = L'\ubad'; /* { dg-error "incomplete" "incompete UCN 1" } */
+ c = L'\ubad'; /* { dg-error "incomplete" "incomplete UCN 1" } */
c = L"\U1234"[0]; /* { dg-error "incomplete" "incompete UCN 2" } */
c = L'\u000x'; /* { dg-error "incomplete" "non-hex digit in UCN" } */
@@ -58,7 +58,7 @@ void foo ()
c = '\u0024'; /* { dg-bogus "invalid" "0024 is a valid UCN" } */
c = "\u0040"[0]; /* { dg-bogus "invalid" "0040 is a valid UCN" } */
- c = '\u00a0'; /* { dg-bogus "invalid" "00a0 is a valid UCN" } */
+ c = L'\u00a0'; /* { dg-bogus "invalid" "00a0 is a valid UCN" } */
c = '\U00000060'; /* { dg-bogus "invalid" "0060 is a valid UCN" } */
c = '\u0025'; /* { dg-error "not a valid" "0025 invalid UCN" } */
diff --git a/gcc/testsuite/gcc.dg/wtr-strcat-1.c b/gcc/testsuite/gcc.dg/wtr-strcat-1.c
index b3966529ad7..730a929c596 100644
--- a/gcc/testsuite/gcc.dg/wtr-strcat-1.c
+++ b/gcc/testsuite/gcc.dg/wtr-strcat-1.c
@@ -9,7 +9,7 @@ testfunc ()
{
const char *foo;
- foo = "hello" "hello"; /* { dg-warning "string concatenation" "string concatenation" } */
+ foo = "hello" "hello"; /* { dg-warning "concatenation" "string concatenation" } */
# 15 "sys-header.h" 3
/* We are in system headers now, no -Wtraditional warnings should issue. */