diff options
| author | Michael Gran <spk121@yahoo.com> | 2009-04-26 09:02:43 -0700 |
|---|---|---|
| committer | Michael Gran <spk121@yahoo.com> | 2009-04-26 09:05:05 -0700 |
| commit | a84d5f1e3d290417d443d2a29b25e173bb8c3da3 (patch) | |
| tree | 8da0302d44fd6509c0f820db19d57067c24d0693 | |
| parent | 3aa3f19175fa199090632889ef874c5516209649 (diff) | |
| download | guile-a84d5f1e3d290417d443d2a29b25e173bb8c3da3.tar.gz | |
Refactor writing symbols; don't double-covert strings in VM
* vm-i-loader.c: Don't double-convert strings and symbols to the
locale.
* vm.c: Pick up symbols.h
* symbols.h: Add declaration
* symbols.c (scm_i_is_keywordish_symbol): Do kewordish
determination here and not in print.c
* strings.h: Add declarations for scm_i_symbol_name*
* strings.c (is_printable_ascii): New
(string_escape_char): New
(symbol_escape_char): New
(hex_escape_length): New
(hex_escape): New
(scm_i_to_write_string): Use new helper funcs to reduce complexity
(scm_i_symbol_name_to_write_string): Do this conversion here and
not in the print module. Use new helper funcs. Draw a distiction
between writing and displaying for symbols, where writing is 7-bit
ASCII always.
(scm_i_symbol_name_to_locale_string): Do this conversion here and
not in the print module. Use new help funcs
* print.c (quote_keywordish_symbol): Move determination of whether
a symbol is keywordish to symbols module.
(iprin1): Move symbol to c string conversion to strings module.
| -rw-r--r-- | libguile/print.c | 26 | ||||
| -rw-r--r-- | libguile/strings.c | 460 | ||||
| -rw-r--r-- | libguile/strings.h | 6 | ||||
| -rw-r--r-- | libguile/symbols.c | 22 | ||||
| -rw-r--r-- | libguile/symbols.h | 2 | ||||
| -rw-r--r-- | libguile/vm-i-loader.c | 6 | ||||
| -rw-r--r-- | libguile/vm.c | 1 |
7 files changed, 434 insertions, 89 deletions
diff --git a/libguile/print.c b/libguile/print.c index f3ed140d7..da0352334 100644 --- a/libguile/print.c +++ b/libguile/print.c @@ -316,12 +316,8 @@ static int quote_keywordish_symbol (SCM str) { SCM option; - size_t len = scm_i_symbol_length (str); - /* LEN is guaranteed to be > 0. - */ - if (!scm_i_symbol_ref_eq_char (str, 0, ':') - && !scm_i_symbol_ref_eq_char (str, len - 1, ':')) + if (!scm_i_is_keywordish_symbol (str)) return 0; option = SCM_PRINT_KEYWORD_STYLE; @@ -620,15 +616,31 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate) case scm_tc7_symbol: if (scm_i_symbol_is_interned (exp)) { - scm_i_print_symbol_name (exp, port); + char *str; + if (SCM_WRITINGP (pstate)) + str = scm_i_symbol_name_to_write_string (exp, quote_keywordish_symbol (exp)); + else + str = scm_i_symbol_name_to_locale_string (exp, quote_keywordish_symbol (exp)); + scm_lfwrite (str, strlen (str), port); + free (str); + scm_remember_upto_here_1 (exp); } else { + char *str; + scm_puts ("#<uninterned-symbol ", port); - scm_i_print_symbol_name (exp, port); + if (SCM_WRITINGP (pstate)) + str = scm_i_symbol_name_to_write_string (exp, quote_keywordish_symbol (exp)); + else + str = scm_i_symbol_name_to_locale_string (exp, quote_keywordish_symbol (exp)); + + scm_lfwrite (str, strlen (str), port); scm_putc (' ', port); scm_uintprint (SCM_UNPACK (exp), 16, port); scm_putc ('>', port); + free (str); + scm_remember_upto_here_1 (exp); } break; case scm_tc7_variable: diff --git a/libguile/strings.c b/libguile/strings.c index 4aa7fd6b1..bb1c17c8d 100644 --- a/libguile/strings.c +++ b/libguile/strings.c @@ -628,6 +628,9 @@ scm_i_c_take_symbol (char *name, size_t len, size_t scm_i_symbol_length (SCM sym) { + if (!scm_is_symbol (sym)) + scm_wrong_type_arg ("scm_i_symbol_length", 0, sym); + return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym)); } @@ -1170,6 +1173,109 @@ scm_i_symbol_strcmp (SCM sym, const char *str) } } + +/* Displaying and writing strings */ + +static int +is_printable_ascii (scm_t_uint32 ch) +{ + return (ch >= 0x20 && ch <= 0x7E); +} + +#define NO_MATCH ((scm_t_uint32)(-1)) + +static scm_t_uint32 +string_escape_char (scm_t_uint32 ch) +{ + if (ch == '"' || ch == '\\') + return ch; + else if (ch == '\0') + return '0'; + else if (ch == '\f') + return 'f'; + else if (ch == '\n') + return 'n'; + else if (ch == '\r') + return 'r'; + else if (ch == '\t') + return 't'; + else if (ch == '\007') + return 'a'; + else if (ch == '\010') + return 'b'; + else if (ch == '\v') + return 'v'; + else + return NO_MATCH; +} + +static scm_t_uint32 +symbol_escape_char (scm_t_uint32 c) +{ + if ( c == '(' || c == ')' || c == '"' || c == ';' || c == '#' +#ifdef BRACKETS_AS_PARENS + || c == '[' || c == ']' +#endif +#ifdef MSDOS + || c == SCM_ASCII_SUB_CHAR +#endif + || c == ' ' || c == '\t' + || c == '\n' || c == '\r' || c == '\f') + return c; + else + return NO_MATCH; +} + +static size_t +hex_escape_length (scm_t_uint32 ch) +{ + size_t olen; + + if (ch <= 0xF) /* \xN; */ + olen = 4; + else if (ch <= 0xFF) /* \xNN; */ + olen = 5; + else if (ch <= 0xFFF) /* \xNNN; */ + olen = 6; + else if (ch <= 0xFFFF) /* \xNNNN; */ + olen = 7; + else if (ch <= 0xFFFFF) /* \xNNNNN; */ + olen = 8; + else if (ch <= SCM_CODEPOINT_MAX) + olen = 9; /* \x10NNNN; */ + else + /* This should be impossible. */ + abort (); + + return olen; +} + +static const char * +hex_escape (scm_t_uint32 ch, size_t *plen) +{ + static char const hex[]="0123456789abcdef"; + static char odata[10]; + size_t j = 0; + + odata[j++] = '\\'; + odata[j++] = 'x'; + if (ch >= 0x100000) + odata[j++] = hex [(ch & 0xF00000) >> 20]; + if (ch >= 0x10000) + odata[j++] = hex [(ch & 0xF0000) >> 16]; + if (ch >= 0x1000) + odata[j++] = hex [(ch & 0xF000) >> 12]; + if (ch >= 0x100) + odata[j++] = hex [(ch & 0xF00) >> 8]; + if (ch >= 0x10) + odata[j++] = hex [(ch & 0xF0) >> 4]; + odata[j++] = hex [ch & 0xF]; + odata[j++] = ';'; + + odata[j] = '\0'; + *plen = j; + return odata; +} /* Encode STR as a machine-readable 7-bit string for use with the write command. */ @@ -1178,8 +1284,7 @@ scm_i_to_write_string (SCM str) { size_t i, j, ilen, olen; char *odata; - scm_t_uint32 ch; - static char const hex[]="0123456789abcdef"; + scm_t_uint32 ch, ch2; /* The output length begins at 3: 2 for quotation marks plus 1 for null terminator. */ @@ -1190,35 +1295,15 @@ scm_i_to_write_string (SCM str) for (i=0; i < ilen; i++) { ch = scm_i_string_ref_to_uint32 (str, i); - if (ch >= 0x20 && ch <= 0x7E && ch != '"' && ch != '\\') - { - /* These chars are printable ASCII */ - olen ++; - } - else if (ch == '"' || ch == '\\' || ch == '\0' || ch == '\f' - || ch == '\n' || ch == '\r' || ch == '\t' || ch == '\007' - || ch == '\010' || ch == '\v') - { - /* These chars have special escapes. */ - olen += 2; - } + if (string_escape_char (ch) != NO_MATCH) + olen += 2; + else if (is_printable_ascii (ch)) + olen ++; /* Otherwise, print it as a generic hex escape. */ - else if (ch <= 0xF) - olen += 4; /* \xN; */ - else if (ch <= 0xFF) /* \xNN; */ - olen += 5; - else if (ch <= 0xFFF) /* \xNNN; */ - olen += 6; - else if (ch <= 0xFFFF) /* \xNNNN; */ - olen += 7; - else if (ch <= 0xFFFFF) /* \xNNNNN; */ - olen += 8; - else if (ch <= SCM_CODEPOINT_MAX) - olen += 9; /* \x10NNNN; */ - else - /* This should be impossible. */ - abort (); + else + olen += hex_escape_length (ch); } + /* Pass 2: create the output string. */ odata = scm_malloc (olen); j = 0; @@ -1226,83 +1311,267 @@ scm_i_to_write_string (SCM str) for (i=0; i < ilen; i++) { ch = scm_i_string_ref_to_uint32 (str, i); - if (ch >= 0x20 && ch <= 0x7E && ch != '"' && ch != '\\') + if ((ch2 = string_escape_char (ch)) != NO_MATCH) { - /* These chars are printable ASCII. */ - odata[j++] = (char) ch; + odata[j++] = '\\'; + odata[j++] = (char) ch2; } - /* Else, check the special escapes. */ - else if (ch == '"' || ch == '\\') + else if (is_printable_ascii (ch)) { - odata[j++] = '\\'; + /* These chars are unescaped. */ odata[j++] = (char) ch; } - else if (ch == '\0') + else { - odata[j++] = '\\'; - odata[j++] = '0'; + /* Otherwise, print it in the form \xNNN; */ + size_t len; + const char *hex; + hex = hex_escape (ch, &len); + memcpy (odata + j, hex, len); + j += len; } - else if (ch == '\f') + } + odata[j++] = '"'; + odata[j++] = '\0'; + + if (j != olen) + scm_misc_error ("write string","internal string-size error~s vs ~s", + scm_list_2 (scm_from_size_t (j), + scm_from_size_t (olen))); + + scm_remember_upto_here_1 (str); + + return odata; +} + + +/* Encode SYM as a machine-readable 7-bit string for use with the + write command. */ +char * +scm_i_symbol_name_to_write_string (SCM sym, int quote_key_flag) +{ + /* If the name contains weird characters, we'll escape them with + * backslashes and set this flag; it indicates that we should surround the + * name with "#{" and "}#". */ + size_t i, j, ilen, olen; + char *odata; + scm_t_uint32 c, c2; + int weird = 0; + + /* Pass 1: Determine if it requires extended symbol syntax, aka if + it is weird. */ + ilen = scm_i_symbol_length (sym); + if (ilen == 0 + || scm_i_symbol_ref_eq_char (sym, 0, '\'') + || scm_i_symbol_ref_eq_char (sym, 0, '`') + || scm_i_symbol_ref_eq_char (sym, 0, ',') + || (scm_i_is_keywordish_symbol (sym) && quote_key_flag) + || (scm_i_symbol_ref_eq_char (sym, 0, '.') && ilen == 1) + || scm_is_true (scm_string_to_number (scm_symbol_to_string (sym), + scm_from_int (10)))) + { + weird = 1; + } + else + { + i = 0; + while (i < ilen) { - odata[j++] = '\\'; - odata[j++] = 'f'; + c = scm_i_symbol_ref_to_uint32 (sym, i++); + if (symbol_escape_char (c) != NO_MATCH) + { + weird = 1; + break; + } } - else if (ch == '\n') + } + + /* Pass 2: determine the length required. */ + if (weird) + olen = 5; /* "#{" plus "}#" plus terminating null */ + else + olen = 1; /* Terminating null */ + for (i = 0; i < ilen; i++) + { + c = scm_i_symbol_ref_to_uint32 (sym, i); + if (symbol_escape_char (c) != NO_MATCH) { - odata[j++] = '\\'; - odata[j++] = 'n'; + if (weird) + olen += 2; + else + olen ++; } - else if (ch == '\r') + else if (is_printable_ascii (c)) + olen ++; + else + olen += hex_escape_length (c); + } + + /* Pass 3: create the output string. */ + odata = scm_malloc (olen); + j = 0; + if (weird) + { + odata[j++] = '#'; + odata[j++] = '{'; + } + for (i=0; i < ilen; i++) + { + c = scm_i_symbol_ref_to_uint32 (sym, i); + if ((c2 = symbol_escape_char (c)) != NO_MATCH) { - odata[j++] = '\\'; - odata[j++] = 'r'; + if (weird) + { + odata[j++] = '\\'; + odata[j++] = c2; + } + else + odata[j++] = c; } - else if (ch == '\t') + else if (is_printable_ascii (c)) { - odata[j++] = '\\'; - odata[j++] = 't'; + /* These chars are printable ASCII */ + odata[j++] = (char) c; } - else if (ch == '\007') + else { - odata[j++] = '\\'; - odata[j++] = 'a'; + /* Otherwise, print it in the form \xNNN; */ + size_t len; + const char *hex; + hex = hex_escape (c, &len); + memcpy (odata + j, hex, len); + j += len; } - else if (ch == '\010') + } + if (weird) + { + odata[j++] = '}'; + odata[j++] = '#'; + } + odata[j++] = '\0'; + + if (j != olen) + scm_misc_error ("symbol-name->write-string","internal string-size error~s vs ~s", + scm_list_2 (scm_from_size_t (j), + scm_from_size_t (olen))); + + scm_remember_upto_here_1 (sym); + + return odata; +} + +/* Encode SYM as a machine-readable 7-bit string for use with the + write command. */ +char * +scm_i_symbol_name_to_locale_string (SCM sym, int quote_key_flag) +{ + /* If the name contains weird characters, we'll escape them with + * backslashes and set this flag; it indicates that we should surround the + * name with "#{" and "}#". */ + size_t i, j, ilen, olen, llen; + scm_t_uint32 *odata; + scm_t_uint32 c, c2; + char *ldata; + int weird = 0; + int ret; + + /* Pass 1: Determine if it requires extended symbol syntax, aka if + it is weird. */ + ilen = scm_i_symbol_length (sym); + if (ilen == 0 + || scm_i_symbol_ref_eq_char (sym, 0, '\'') + || scm_i_symbol_ref_eq_char (sym, 0, '`') + || scm_i_symbol_ref_eq_char (sym, 0, ',') + || (scm_i_is_keywordish_symbol (sym) && quote_key_flag) + || (scm_i_symbol_ref_eq_char (sym, 0, '.') && ilen == 1) + || scm_is_true (scm_string_to_number (scm_symbol_to_string (sym), + scm_from_int (10)))) + { + weird = 1; + } + else + { + i = 0; + while (i < ilen) { - odata[j++] = '\\'; - odata[j++] = 'b'; + c = scm_i_symbol_ref_to_uint32 (sym, i++); + if (symbol_escape_char (c) != NO_MATCH) + { + weird = 1; + break; + } } - else if (ch == '\v') + } + + /* Pass 2: determine the length required in codepoints. */ + if (weird) + olen = 5; /* "#{" plus "}#" plus NULL */ + else + olen = 1; /* NULL terminator */ + for (i = 0; i < ilen; i++) + { + c = scm_i_symbol_ref_to_uint32 (sym, i); + if (symbol_escape_char (c) != NO_MATCH) { - odata[j++] = '\\'; - odata[j++] = 'v'; + if (weird) + olen += 2; + else + olen ++; } else + olen ++; + } + + /* Pass 3: create the wide output string. */ + odata = scm_malloc (olen * sizeof(scm_t_uint32)); + j = 0; + if (weird) + { + odata[j++] = '#'; + odata[j++] = '{'; + } + for (i=0; i < ilen; i++) + { + c = scm_i_symbol_ref_to_uint32 (sym, i); + if ((c2 = symbol_escape_char (c)) != NO_MATCH) { - /* Otherwise, print it in the form \xNNN; */ - odata[j++] = '\\'; - odata[j++] = 'x'; - if (ch >= 0x100000) - odata[j++] = hex [(ch & 0xF00000) >> 20]; - if (ch >= 0x10000) - odata[j++] = hex [(ch & 0xF0000) >> 16]; - if (ch >= 0x1000) - odata[j++] = hex [(ch & 0xF000) >> 12]; - if (ch >= 0x100) - odata[j++] = hex [(ch & 0xF00) >> 8]; - if (ch >= 0x10) - odata[j++] = hex [(ch & 0xF0) >> 4]; - odata[j++] = hex [ch & 0xF]; - odata[j++] = ';'; + if (weird) + { + odata[j++] = '\\'; + odata[j++] = c2; + } + else + odata[j++] = c; } + else + odata[j++] = c; } - odata[j++] = '"'; - odata[j] = '\0'; - scm_remember_upto_here_1 (str); - - return odata; + if (weird) + { + odata[j++] = '}'; + odata[j++] = '#'; + } + odata[j++] = 0; + if (j != olen) + scm_misc_error ("symbol-name->locale-string","internal string-size error ~s vs ~s", + scm_list_2 (scm_from_size_t (j), + scm_from_size_t (olen))); + + + ldata = NULL; + llen = 0; + ret = u32_conv_to_encoding (locale_charset (), iconveh_question_mark, + odata, olen, NULL, &ldata, &llen); + if (ret != 0) + scm_misc_error (NULL, "conversion error", scm_list_1 (sym)); + + free (odata); + scm_remember_upto_here_1 (sym); + + return ldata; } + /* Create a null-terminated utf-8 C string. */ scm_t_uint8 * scm_i_to_utf8_string (SCM str) @@ -1564,6 +1833,36 @@ scm_i_c_is_native_8bit_codeset (const char *charset) } SCM +scm_from_latin1_stringn (const char *str, size_t len) +{ + SCM res; + char *dst; + + if (len == (size_t)-1) + len = strlen (str); + res = scm_i_make_string (len, &dst); + memcpy (dst, str, len); + return res; +} + +SCM +scm_from_utf32_stringn (const scm_t_uint32 *str, size_t len) +{ + SCM res; + scm_t_uint32 *dst; + +#if 0 + if (len == (size_t)-1) + len = u32_strlen (str); +#endif + res = scm_i_make_wide_string (len, &dst); + memcpy (dst, str, len); + return res; +} + + + +SCM scm_from_locale_stringn (const char *str, size_t len) { SCM res; @@ -1598,7 +1897,7 @@ scm_from_locale_stringn (const char *str, size_t len) { SCM errstr = scm_i_make_string (len, &dst); memcpy (dst, str, len); - scm_misc_error (NULL, "Invalid sequence in locale string: ~S in ~S", + scm_misc_error (NULL, "invalid sequence in locale string: ~S in ~S", scm_list_2 (errstr, scm_from_locale_string (locale_charset()))); } else @@ -1701,6 +2000,7 @@ scm_to_locale_stringn (SCM str, size_t *lenp) len = 0; if (scm_i_is_narrow_string (str)) { + ret = mem_iconveh (scm_i_string_chars (str), scm_i_string_length (str), "ISO-8859-1", locale_charset(), iconveh_question_mark, NULL, diff --git a/libguile/strings.h b/libguile/strings.h index 6c26a9610..5b7142d0a 100644 --- a/libguile/strings.h +++ b/libguile/strings.h @@ -99,6 +99,8 @@ SCM_API SCM scm_c_substring_shared (SCM str, size_t start, size_t end); SCM_API SCM scm_c_substring_copy (SCM str, size_t start, size_t end); SCM_API int scm_is_string (SCM x); +SCM_INTERNAL SCM scm_from_latin1_stringn (const char *str, size_t len); +SCM_INTERNAL SCM scm_from_utf32_stringn (const scm_t_uint32 *str, size_t len); SCM_API SCM scm_from_locale_string (const char *str); SCM_API SCM scm_from_locale_stringn (const char *str, size_t len); SCM_API SCM scm_take_locale_string (char *str); @@ -170,6 +172,10 @@ SCM_INTERNAL int scm_i_string_contains_char (SCM str, char ch); SCM_INTERNAL int scm_i_symbol_strcmp (SCM sym, const char *str); SCM_INTERNAL char *scm_i_to_write_string (SCM str); +SCM_INTERNAL char *scm_i_symbol_name_to_write_string (SCM sym, + int quote_key_flag); +SCM_INTERNAL char *scm_i_symbol_name_to_locale_string (SCM sym, + int quote_key_flag); SCM_INTERNAL scm_t_uint8 *scm_i_to_utf8_string (SCM str); SCM_INTERNAL SCM scm_i_from_utf8_string (const scm_t_uint8 *str); diff --git a/libguile/symbols.c b/libguile/symbols.c index 89401187c..fe6bc8467 100644 --- a/libguile/symbols.c +++ b/libguile/symbols.c @@ -364,6 +364,21 @@ SCM_DEFINE (scm_symbol_pset_x, "symbol-pset!", 2, 0, 0, } #undef FUNC_NAME +int +scm_i_is_keywordish_symbol (SCM str) +{ + size_t len = scm_i_symbol_length (str); + + if (len == 0) + return 0; + + if (!scm_i_symbol_ref_eq_char (str, 0, ':') + && !scm_i_symbol_ref_eq_char (str, len - 1, ':')) + return 0; + + return 1; +} + SCM scm_from_locale_symbol (const char *sym) { @@ -371,6 +386,13 @@ scm_from_locale_symbol (const char *sym) } SCM +scm_from_latin1_symboln (const char *sym, size_t len) +{ + SCM str = scm_from_latin1_stringn (sym, len); + return scm_i_mem2symbol (str); +} + +SCM scm_from_locale_symboln (const char *sym, size_t len) { SCM str = scm_from_locale_stringn (sym, len); diff --git a/libguile/symbols.h b/libguile/symbols.h index c2dc18363..50ff3f50e 100644 --- a/libguile/symbols.h +++ b/libguile/symbols.h @@ -54,6 +54,8 @@ SCM_API SCM scm_symbol_pset_x (SCM s, SCM val); SCM_API SCM scm_symbol_hash (SCM s); SCM_API SCM scm_gensym (SCM prefix); +SCM_INTERNAL int scm_i_is_keywordish_symbol (SCM str); +SCM_INTERNAL SCM scm_from_latin1_symboln (const char *str, size_t len); SCM_API SCM scm_from_locale_symbol (const char *str); SCM_API SCM scm_from_locale_symboln (const char *str, size_t len); SCM_API SCM scm_take_locale_symbol (char *sym); diff --git a/libguile/vm-i-loader.c b/libguile/vm-i-loader.c index 515001d61..eeae82f87 100644 --- a/libguile/vm-i-loader.c +++ b/libguile/vm-i-loader.c @@ -72,7 +72,8 @@ VM_DEFINE_LOADER (62, load_string, "load-string") size_t len; FETCH_LENGTH (len); SYNC_REGISTER (); - PUSH (scm_from_locale_stringn ((char *)ip, len)); + /* PUSH (scm_from_locale_stringn ((char *)ip, len)); */ + PUSH (scm_from_latin1_stringn ((char *)ip, len)); /* Was: scm_makfromstr (ip, len, 0) */ ip += len; NEXT; @@ -83,7 +84,8 @@ VM_DEFINE_LOADER (63, load_symbol, "load-symbol") size_t len; FETCH_LENGTH (len); SYNC_REGISTER (); - PUSH (scm_from_locale_symboln ((char *)ip, len)); + /* PUSH (scm_from_locale_symboln ((char *)ip, len)); */ + PUSH (scm_from_latin1_symboln ((char *)ip, len)); ip += len; NEXT; } diff --git a/libguile/vm.c b/libguile/vm.c index 38d085c99..55b801050 100644 --- a/libguile/vm.c +++ b/libguile/vm.c @@ -52,6 +52,7 @@ #include "objcodes.h" #include "programs.h" #include "lang.h" /* NULL_OR_NIL_P */ +#include "symbols.h" #include "vm.h" /* I sometimes use this for debugging. */ |
