summaryrefslogtreecommitdiff
path: root/libcpp
diff options
context:
space:
mode:
author: Geoffrey Keating <geoffk@apple.com> 2005-03-12 10:44:06 +0000
committer: Geoffrey Keating <geoffk@gcc.gnu.org> 2005-03-12 10:44:06 +0000
commit: 47e204910a9a3e154e38121f55b9cafec0620b63 (patch)
tree: 96b619db02d90b96e5dc09601db8bd7a58e95367 /libcpp
parent: 5269bfe2809931ca62a0bcd8cad1bed7e78e5b32 (diff)
download: gcc-47e204910a9a3e154e38121f55b9cafec0620b63.tar.gz
Index: libcpp/ChangeLog
2005-03-12  Geoffrey Keating  <geoffk@apple.com>

	* directives.c (glue_header_name): Update call to cpp_spell_token.
	* internal.h (_cpp_interpret_identifier): New.
	* charset.c (_cpp_interpret_identifier): New.
	(_cpp_valid_ucn): Allow UCN version of '$'.
	* lex.c (lex_identifier): Add extra parameter to indicate if initial
	character was '$' or '\'.  Support identifiers with UCNs.
	(forms_identifier_p): Allow UCNs.
	(_cpp_lex_direct): Pass extra parameter to lex_identifier.
	(utf8_to_ucn): New.
	(cpp_spell_token): Add FORSTRING parameter.  Use it.
	(cpp_token_as_text): Update call to cpp_spell_token.
	(cpp_output_token): Write UCNs back out.
	(stringify_arg): Update call to cpp_spell_token.
	(paste_tokens): Likewise.
	(cpp_macro_definition): Likewise.
	* macro.c (stringify_arg): Likewise.
	(paste_tokens): Likewise.
	(cpp_macro_definition): Likewise.
	* include/cpplib.h: Add parameter to cpp_spell_token.

Index: gcc/ChangeLog
2005-03-12  Geoffrey Keating  <geoffk@apple.com>

	* c-lex.c (c_lex_with_flags): Add parameter to call to
	cpp_spell_token.

Index: gcc/testsuite/ChangeLog
2005-03-12  Geoffrey Keating  <geoffk@apple.com>

	* gcc.dg/ucnid-1.c: New.
	* gcc.dg/ucnid-2.c: New.
	* gcc.dg/ucnid-3.c: New.
	* gcc.dg/ucnid-4.c: New.
	* gcc.dg/ucnid-5.c: New.
	* gcc.dg/ucnid-6.c: New.
	* gcc.dg/cpp/ucnid-1.c: New.
	* gcc.dg/cpp/ucnid-2.c: New.
	* gcc.dg/cpp/ucnid-3.c: New.
	* g++.dg/other/ucnid-1.C: New.

From-SVN: r96333
Diffstat (limited to 'libcpp')
-rw-r--r-- libcpp/ChangeLog 22
-rw-r--r-- libcpp/charset.c 62
-rw-r--r-- libcpp/directives.c 3
-rw-r--r-- libcpp/include/cpplib.h 2
-rw-r--r-- libcpp/internal.h 3
-rw-r--r-- libcpp/lex.c 138
-rw-r--r-- libcpp/macro.c 10
7 files changed, 198 insertions, 42 deletions
diff --git a/libcpp/ChangeLog b/libcpp/ChangeLog
index b246de782da..5190599b9b0 100644
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@@ -1,3 +1,25 @@
+2005-03-11 Geoffrey Keating <geoffk@apple.com>
+
+ * directives.c (glue_header_name): Update call to cpp_spell_token.
+ * internal.h (_cpp_interpret_identifier): New.
+ * charset.c (_cpp_interpret_identifier): New.
+ (_cpp_valid_ucn): Allow UCN version of '$'.
+ * lex.c (lex_identifier): Add extra parameter to indicate if initial
+ character was '$' or '\'. Support identifiers with UCNs.
+ (forms_identifier_p): Allow UCNs.
+ (_cpp_lex_direct): Pass extra parameter to lex_identifier.
+ (utf8_to_ucn): New.
+ (cpp_spell_token): Add FORSTRING parameter. Use it.
+ (cpp_token_as_text): Update call to cpp_spell_token.
+ (cpp_output_token): Write UCNs back out.
+ (stringify_arg): Update call to cpp_spell_token.
+ (paste_tokens): Likewise.
+ (cpp_macro_definition): Likewise.
+ * macro.c (stringify_arg): Likewise.
+ (paste_tokens): Likewise.
+ (cpp_macro_definition): Likewise.
+ * include/cpplib.h: Add parameter to cpp_spell_token.
+
2005-03-04 Jakub Jelinek <jakub@redhat.com>
PR bootstrap/20282
diff --git a/libcpp/charset.c b/libcpp/charset.c
index 6b6c360f73d..cd25f10a2e6 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -907,6 +907,15 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
(int) (str - base), base);
result = 1;
}
+ else if (identifier_pos && result == 0x24
+ && CPP_OPTION (pfile, dollars_in_ident))
+ {
+ if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
+ {
+ CPP_OPTION (pfile, warn_dollars) = 0;
+ cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
+ }
+ }
else if (identifier_pos)
{
int validity = ucn_valid_in_identifier (pfile, result);
@@ -1414,7 +1423,60 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
return result;
}
+
+/* Convert the identifier spelled by the LEN bytes at ID, which might contain
+ UCN escapes (\u or \U), to the source character set, either UTF-8 or
+ UTF-EBCDIC, and return the node ht_lookup gives for the converted spelling in PFILE's hash table. Assumes that the identifier is actually a valid identifier. */
+cpp_hashnode *
+_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
+{
+ /* It turns out that a UCN escape always turns into fewer characters
+ than the escape itself, so a temporary of LEN + 1 bytes can be allocated in advance. */
+ uchar * buf = alloca (len + 1);
+ uchar * bufp = buf;
+ size_t idp;
+
+ for (idp = 0; idp < len; idp++)
+ if (id[idp] != '\\')
+ *bufp++ = id[idp];
+ else
+ {
+ unsigned length = id[idp+1] == 'u' ? 4 : 8; /* At most 4 hex digits after 'u', otherwise at most 8. */
+ cppchar_t value = 0;
+ size_t bufleft = len - (bufp - buf); /* Room left in BUF, counted against LEN. */
+ int rval;
+
+ idp += 2; /* Skip the backslash and the letter that follows it. */
+ while (length && idp < len && ISXDIGIT (id[idp]))
+ {
+ value = (value << 4) + hex_value (id[idp]);
+ idp++;
+ length--;
+ }
+ idp--; /* Back up onto the last digit; the for loop's idp++ steps past it. */
+
+ /* Special case for EBCDIC: if the identifier contains
+ a '$' specified using a UCN (value 0x24), store a literal '$' instead of calling one_cppchar_to_utf8. */
+ if (value == 0x24)
+ {
+ *bufp++ = '$';
+ continue;
+ }
+
+ rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
+ if (rval)
+ {
+ errno = rval;
+ cpp_errno (pfile, CPP_DL_ERROR,
+ "converting UCN to source character set");
+ break; /* Stop converting; whatever is in BUF so far is still looked up below. */
+ }
+ }
+ return CPP_HASHNODE (ht_lookup (pfile->hash_table,
+ buf, bufp - buf, HT_ALLOC));
+}
+
/* Convert an input buffer (containing the complete contents of one
source file) from INPUT_CHARSET to the source character set. INPUT
points to the input buffer, SIZE is its allocated size, and LEN is
diff --git a/libcpp/directives.c b/libcpp/directives.c
index 84065052630..957e879caec 100644
--- a/libcpp/directives.c
+++ b/libcpp/directives.c
@@ -608,7 +608,8 @@ glue_header_name (cpp_reader *pfile)
if (token->flags & PREV_WHITE)
buffer[total_len++] = ' ';
- total_len = (cpp_spell_token (pfile, token, (uchar *) &buffer[total_len])
+ total_len = (cpp_spell_token (pfile, token, (uchar *) &buffer[total_len],
+ true)
- (uchar *) buffer);
}
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index 70f8d895afd..ccf8bff47e4 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -637,7 +637,7 @@ extern unsigned int cpp_errors (cpp_reader *);
extern unsigned int cpp_token_len (const cpp_token *);
extern unsigned char *cpp_token_as_text (cpp_reader *, const cpp_token *);
extern unsigned char *cpp_spell_token (cpp_reader *, const cpp_token *,
- unsigned char *);
+ unsigned char *, bool);
extern void cpp_register_pragma (cpp_reader *, const char *, const char *,
void (*) (cpp_reader *), bool);
extern void cpp_handle_deferred_pragma (cpp_reader *, const cpp_string *);
diff --git a/libcpp/internal.h b/libcpp/internal.h
index 0ae13d58cb6..af823d766b3 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -571,6 +571,9 @@ extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
unsigned char *, size_t, size_t,
off_t *);
extern const char *_cpp_default_encoding (void);
+extern cpp_hashnode * _cpp_interpret_identifier (cpp_reader *pfile,
+ const unsigned char *id,
+ size_t len);
/* Utility routines and macros. */
#define DSC(str) (const unsigned char *)str, sizeof str - 1
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 62a28f81b87..8398c7ca061 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -53,7 +53,7 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
-static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *);
+static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *, bool);
static void lex_number (cpp_reader *, cpp_string *);
static bool forms_identifier_p (cpp_reader *, int);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
@@ -453,7 +453,7 @@ forms_identifier_p (cpp_reader *pfile, int first)
}
/* Is this a syntactically valid UCN? */
- if (0 && *buffer->cur == '\\'
+ if (*buffer->cur == '\\'
&& (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
{
buffer->cur += 2;
@@ -467,39 +467,39 @@ forms_identifier_p (cpp_reader *pfile, int first)
/* Lex an identifier starting at BUFFER->CUR - 1. */
static cpp_hashnode *
-lex_identifier (cpp_reader *pfile, const uchar *base)
+lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn)
{
cpp_hashnode *result;
- const uchar *cur, *limit;
+ const uchar *cur;
unsigned int len;
unsigned int hash = HT_HASHSTEP (0, *base);
cur = pfile->buffer->cur;
- for (;;)
+ if (! starts_ucn)
+ while (ISIDNUM (*cur))
+ {
+ hash = HT_HASHSTEP (hash, *cur);
+ cur++;
+ }
+ pfile->buffer->cur = cur;
+ if (starts_ucn || forms_identifier_p (pfile, false))
{
- /* N.B. ISIDNUM does not include $. */
- while (ISIDNUM (*cur))
- {
- hash = HT_HASHSTEP (hash, *cur);
- cur++;
- }
-
- pfile->buffer->cur = cur;
- if (!forms_identifier_p (pfile, false))
- break;
-
- limit = pfile->buffer->cur;
- while (cur < limit)
- {
- hash = HT_HASHSTEP (hash, *cur);
- cur++;
- }
+ /* Slower version for identifiers containing UCNs (or $). */
+ do {
+ while (ISIDNUM (*pfile->buffer->cur))
+ pfile->buffer->cur++;
+ } while (forms_identifier_p (pfile, false));
+ result = _cpp_interpret_identifier (pfile, base,
+ pfile->buffer->cur - base);
}
- len = cur - base;
- hash = HT_HASHFINISH (hash, len);
+ else
+ {
+ len = cur - base;
+ hash = HT_HASHFINISH (hash, len);
- result = (cpp_hashnode *)
- ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+ result = (cpp_hashnode *)
+ ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+ }
/* Rarely, identifiers require diagnostics when lexed. */
if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
@@ -922,7 +922,7 @@ _cpp_lex_direct (cpp_reader *pfile)
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
case 'Y': case 'Z':
result->type = CPP_NAME;
- result->val.node = lex_identifier (pfile, buffer->cur - 1);
+ result->val.node = lex_identifier (pfile, buffer->cur - 1, false);
/* Convert named operators to their proper types. */
if (result->val.node->flags & NODE_OPERATOR)
@@ -1155,7 +1155,7 @@ _cpp_lex_direct (cpp_reader *pfile)
if (forms_identifier_p (pfile, true))
{
result->type = CPP_NAME;
- result->val.node = lex_identifier (pfile, base);
+ result->val.node = lex_identifier (pfile, base, true);
break;
}
buffer->cur++;
@@ -1180,19 +1180,56 @@ cpp_token_len (const cpp_token *token)
{
default: len = 4; break;
case SPELL_LITERAL: len = token->val.str.len; break;
- case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
+ case SPELL_IDENT: len = NODE_LEN (token->val.node) * 10; break;
}
return len;
}
+/* Decode the single UTF-8 sequence starting at NAME and place a \U escape
+ (backslash, 'U', eight lowercase hex digits) in BUFFER. Return the number of bytes read out of NAME. (There are always
+ 10 bytes written to BUFFER, and no terminating NUL is added.) */
+
+static size_t
+utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
+{
+ int j;
+ int ucn_len = 0;
+ int ucn_len_c;
+ unsigned t;
+ unsigned long utf32;
+
+ /* Compute the length of the UTF-8 sequence by counting the leading 1 bits of its first byte. */
+ for (t = *name; t & 0x80; t <<= 1)
+ ucn_len++;
+
+ utf32 = *name & (0x7F >> ucn_len); /* Payload bits of the first byte. */
+ for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
+ {
+ utf32 = (utf32 << 6) | (*++name & 0x3F); /* Append the low 6 bits of the next byte. */
+
+ /* Ill-formed UTF-8: every byte after the first must have the form 10xxxxxx. */
+ if ((*name & ~0x3F) != 0x80)
+ abort ();
+ }
+
+ *buffer++ = '\\';
+ *buffer++ = 'U';
+ for (j = 7; j >= 0; j--) /* Most significant hex digit first. */
+ *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
+ return ucn_len;
+}
+
+
/* Write the spelling of a token TOKEN to BUFFER. The buffer must
already contain enough space to hold the token's spelling.
Returns a pointer to the character after the last character written.
+ FORSTRING is true if this is to be the spelling after translation
+ phase 1 (this is different for UCNs).
FIXME: Would be nice if we didn't need the PFILE argument. */
unsigned char *
cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
- unsigned char *buffer)
+ unsigned char *buffer, bool forstring)
{
switch (TOKEN_SPELL (token))
{
@@ -1216,8 +1253,26 @@ cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
spell_ident:
case SPELL_IDENT:
- memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
- buffer += NODE_LEN (token->val.node);
+ if (forstring)
+ {
+ memcpy (buffer, NODE_NAME (token->val.node),
+ NODE_LEN (token->val.node));
+ buffer += NODE_LEN (token->val.node);
+ }
+ else
+ {
+ size_t i;
+ const unsigned char * name = NODE_NAME (token->val.node);
+
+ for (i = 0; i < NODE_LEN (token->val.node); i++)
+ if (name[i] & ~0x7F)
+ {
+ i += utf8_to_ucn (buffer, name + i) - 1;
+ buffer += 10;
+ }
+ else
+ *buffer++ = NODE_NAME (token->val.node)[i];
+ }
break;
case SPELL_LITERAL:
@@ -1242,7 +1297,7 @@ cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
unsigned int len = cpp_token_len (token) + 1;
unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
- end = cpp_spell_token (pfile, token, start);
+ end = cpp_spell_token (pfile, token, start, false);
end[0] = '\0';
return start;
@@ -1286,8 +1341,21 @@ cpp_output_token (const cpp_token *token, FILE *fp)
spell_ident:
case SPELL_IDENT:
- fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
- break;
+ {
+ size_t i;
+ const unsigned char * name = NODE_NAME (token->val.node);
+
+ for (i = 0; i < NODE_LEN (token->val.node); i++)
+ if (name[i] & ~0x7F)
+ {
+ unsigned char buffer[10];
+ i += utf8_to_ucn (buffer, name + i) - 1;
+ fwrite (buffer, 1, 10, fp);
+ }
+ else
+ fputc (NODE_NAME (token->val.node)[i], fp);
+ }
+ break;
case SPELL_LITERAL:
fwrite (token->val.str.text, 1, token->val.str.len, fp);
diff --git a/libcpp/macro.c b/libcpp/macro.c
index 7d65886a390..441b3b32ed3 100644
--- a/libcpp/macro.c
+++ b/libcpp/macro.c
@@ -380,12 +380,12 @@ stringify_arg (cpp_reader *pfile, macro_arg *arg)
{
_cpp_buff *buff = _cpp_get_buff (pfile, len);
unsigned char *buf = BUFF_FRONT (buff);
- len = cpp_spell_token (pfile, token, buf) - buf;
+ len = cpp_spell_token (pfile, token, buf, true) - buf;
dest = cpp_quote_string (dest, buf, len);
_cpp_release_buff (pfile, buff);
}
else
- dest = cpp_spell_token (pfile, token, dest);
+ dest = cpp_spell_token (pfile, token, dest, true);
if (token->type == CPP_OTHER && token->val.str.text[0] == '\\')
backslash_count++;
@@ -422,7 +422,7 @@ paste_tokens (cpp_reader *pfile, const cpp_token **plhs, const cpp_token *rhs)
lhs = *plhs;
len = cpp_token_len (lhs) + cpp_token_len (rhs) + 1;
buf = alloca (len);
- end = cpp_spell_token (pfile, lhs, buf);
+ end = cpp_spell_token (pfile, lhs, buf, false);
/* Avoid comment headers, since they are still processed in stage 3.
It is simpler to insert a space here, rather than modifying the
@@ -430,7 +430,7 @@ paste_tokens (cpp_reader *pfile, const cpp_token **plhs, const cpp_token *rhs)
false doesn't work, since we want to clear the PASTE_LEFT flag. */
if (lhs->type == CPP_DIV && rhs->type != CPP_EQ)
*end++ = ' ';
- end = cpp_spell_token (pfile, rhs, end);
+ end = cpp_spell_token (pfile, rhs, end, false);
*end = '\n';
cpp_push_buffer (pfile, buf, end - buf, /* from_stage3 */ true);
@@ -1751,7 +1751,7 @@ cpp_macro_definition (cpp_reader *pfile, const cpp_hashnode *node)
buffer += NODE_LEN (macro->params[token->val.arg_no - 1]);
}
else
- buffer = cpp_spell_token (pfile, token, buffer);
+ buffer = cpp_spell_token (pfile, token, buffer, false);
if (token->flags & PASTE_LEFT)
{