diff options
author | Michael Gran <spk121@yahoo.com> | 2009-08-23 06:50:45 -0700 |
---|---|---|
committer | Michael Gran <spk121@yahoo.com> | 2009-08-23 09:29:45 -0700 |
commit | 587a33556fdef90025c1b7d4d172af649c8ebba8 (patch) | |
tree | 97792a8e7e1a2b9cb05596df2332294a6c316671 | |
parent | 27646f414e9350c2bf9f35982082bcabfb475c5d (diff) | |
download | guile-587a33556fdef90025c1b7d4d172af649c8ebba8.tar.gz |
Modify socket and time functions for wide strings
* libguile/socket.c (scm_recv): receive the message without holding the
stringbuf writing lock
(scm_send): try to narrow a string before using it
* libguile/stime.c (scm_strftime): convert the format string to UTF-8 so
that it can be safely passed to strftime
(scm_strptime): convert the format and input strings to UTF-8 so that
they can be safely passed through strptime
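* libguile/strings.c (narrow_stringbuf): new function
(scm_i_try_narrow_string): new function
(scm_from_stringn, scm_i_from_utf8_string, scm_i_to_utf8_string): new
functions
(scm_to_stringn): use the encoding argument, falling back to ISO-8859-1
when it is NULL
* libguile/strings.h: new declarations for scm_i_try_narrow_string,
scm_i_from_utf8_string, and scm_i_to_utf8_string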
-rw-r--r-- | libguile/socket.c | 32 | ||||
-rw-r--r-- | libguile/stime.c | 50 | ||||
-rw-r--r-- | libguile/strings.c | 143 | ||||
-rw-r--r-- | libguile/strings.h | 3 | ||||
-rw-r--r-- | test-suite/tests/time.test | 5 |
5 files changed, 200 insertions, 33 deletions
diff --git a/libguile/socket.c b/libguile/socket.c index 2e02e9082..d463d04f4 100644 --- a/libguile/socket.c +++ b/libguile/socket.c @@ -33,6 +33,7 @@ #include "libguile/strings.h" #include "libguile/vectors.h" #include "libguile/dynwind.h" +#include "libguile/srfi-13.h" #include "libguile/validate.h" #include "libguile/socket.h" @@ -1414,6 +1415,8 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0, "protocols, if a packet larger than this limit is encountered\n" "then some data\n" "will be irrevocably lost.\n\n" + "The data is assumed to be binary, and there is no decoding of\n" + "of locale-encoded strings.\n\n" "The optional @var{flags} argument is a value or\n" "bitwise OR of MSG_OOB, MSG_PEEK, MSG_DONTROUTE etc.\n\n" "The value returned is the number of bytes read from the\n" @@ -1428,6 +1431,7 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0, int flg; char *dest; size_t len; + SCM msg; SCM_VALIDATE_OPFPORT (1, sock); SCM_VALIDATE_STRING (2, buf); @@ -1437,16 +1441,16 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0, flg = scm_to_int (flags); fd = SCM_FPORT_FDES (sock); - len = scm_i_string_length (buf); - buf = scm_i_string_start_writing (buf); - dest = scm_i_string_writable_chars (buf); + len = scm_i_string_length (buf); + msg = scm_i_make_string (len, &dest); SCM_SYSCALL (rv = recv (fd, dest, len, flg)); - scm_i_string_stop_writing (); + scm_string_copy_x (buf, scm_from_int (0), + msg, scm_from_int (0), scm_from_size_t (len)); if (rv == -1) SCM_SYSERROR; - scm_remember_upto_here_1 (buf); + scm_remember_upto_here_2 (buf, msg); return scm_from_int (rv); } #undef FUNC_NAME @@ -1464,18 +1468,28 @@ SCM_DEFINE (scm_send, "send", 2, 1, 0, "bitwise OR of MSG_OOB, MSG_PEEK, MSG_DONTROUTE etc.\n\n" "Note that the data is written directly to the socket\n" "file descriptor:\n" - "any unflushed buffered port data is ignored.") + "any unflushed buffered port data is ignored.\n\n" + "This operation is defined only for strings containing codepoints\n" + "zero to 255.") #define FUNC_NAME 
s_scm_send { int rv; int fd; int flg; - const char *src; + char *src; size_t len; sock = SCM_COERCE_OUTPORT (sock); SCM_VALIDATE_OPFPORT (1, sock); SCM_VALIDATE_STRING (2, message); + + /* If the string is wide, see if it can be coerced into + a narrow string. */ + if (!scm_i_is_narrow_string (message) + || scm_i_try_narrow_string (message)) + SCM_MISC_ERROR ("the message string is not 8-bit: ~s", + scm_list_1 (message)); + if (SCM_UNBNDP (flags)) flg = 0; else @@ -1592,7 +1606,9 @@ SCM_DEFINE (scm_sendto, "sendto", 3, 1, 1, "set to be non-blocking.\n" "Note that the data is written directly to the socket\n" "file descriptor:\n" - "any unflushed buffered port data is ignored.") + "any unflushed buffered port data is ignored.\n" + "This operation is defined only for strings containing codepoints\n" + "zero to 255.") #define FUNC_NAME s_scm_sendto { int rv; diff --git a/libguile/stime.c b/libguile/stime.c index a6843377b..54022c296 100644 --- a/libguile/stime.c +++ b/libguile/stime.c @@ -46,6 +46,7 @@ #include <stdio.h> #include <errno.h> #include <strftime.h> +#include <unistr.h> #include "libguile/_scm.h" #include "libguile/async.h" @@ -53,6 +54,7 @@ #include "libguile/strings.h" #include "libguile/vectors.h" #include "libguile/dynwind.h" +#include "libguile/strings.h" #include "libguile/validate.h" #include "libguile/stime.h" @@ -624,18 +626,20 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0, { struct tm t; - char *tbuf; + scm_t_uint8 *tbuf; int size = 50; - const char *fmt; - char *myfmt; + scm_t_uint8 *fmt; + scm_t_uint8 *myfmt; int len; SCM result; SCM_VALIDATE_STRING (1, format); bdtime2c (stime, &t, SCM_ARG2, FUNC_NAME); - fmt = scm_i_string_chars (format); - len = scm_i_string_length (format); + /* Convert string to UTF-8 so that non-ASCII characters in the + format are passed through unchanged. 
*/ + fmt = scm_i_to_utf8_string (format); + len = strlen ((const char *) fmt); /* Ugly hack: strftime can return 0 if its buffer is too small, but some valid time strings (e.g. "%p") can sometimes produce @@ -643,9 +647,11 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0, character to the format string, so that valid returns are always nonzero. */ myfmt = scm_malloc (len+2); - *myfmt = 'x'; - strncpy(myfmt+1, fmt, len); - myfmt[len+1] = 0; + *myfmt = (scm_t_uint8) 'x'; + strncpy ((char *) myfmt + 1, (const char *) fmt, len); + myfmt[len + 1] = 0; + scm_remember_upto_here_1 (format); + free (fmt); tbuf = scm_malloc (size); { @@ -680,7 +686,8 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0, /* Use `nstrftime ()' from Gnulib, which supports all GNU extensions supported by glibc. */ - while ((len = nstrftime (tbuf, size, myfmt, &t, 0, 0)) == 0) + while ((len = nstrftime ((char *) tbuf, size, + (const char *) myfmt, &t, 0, 0)) == 0) { free (tbuf); size *= 2; @@ -696,7 +703,7 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0, #endif } - result = scm_from_locale_stringn (tbuf + 1, len - 1); + result = scm_i_from_utf8_string ((const scm_t_uint8 *) tbuf + 1); free (tbuf); free (myfmt); #if HAVE_STRUCT_TM_TM_ZONE @@ -722,14 +729,17 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0, #define FUNC_NAME s_scm_strptime { struct tm t; - const char *fmt, *str, *rest; + scm_t_uint8 *fmt, *str, *rest; + size_t used_len; long zoff; SCM_VALIDATE_STRING (1, format); SCM_VALIDATE_STRING (2, string); - fmt = scm_i_string_chars (format); - str = scm_i_string_chars (string); + /* Convert strings to UTF-8 so that non-ASCII characters are passed + through unchanged. */ + fmt = scm_i_to_utf8_string (format); + str = scm_i_to_utf8_string (string); /* initialize the struct tm */ #define tm_init(field) t.field = 0 @@ -751,7 +761,8 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0, fields, hence the use of SCM_CRITICAL_SECTION_START. 
*/ t.tm_isdst = -1; SCM_CRITICAL_SECTION_START; - rest = strptime (str, fmt, &t); + rest = (scm_t_uint8 *) strptime ((const char *) str, + (const char *) fmt, &t); SCM_CRITICAL_SECTION_END; if (rest == NULL) { @@ -759,6 +770,9 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0, instance it doesn't. Force a sensible value for our error message. */ errno = EINVAL; + scm_remember_upto_here_2 (format, string); + free (str); + free (fmt); SCM_SYSERROR; } @@ -770,8 +784,14 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0, zoff = 0; #endif + /* Compute the number of UTF-8 characters. */ + used_len = u8_strnlen (str, rest-str); + scm_remember_upto_here_2 (format, string); + free (str); + free (fmt); + return scm_cons (filltime (&t, zoff, NULL), - scm_from_signed_integer (rest - str)); + scm_from_signed_integer (used_len)); } #undef FUNC_NAME #endif /* HAVE_STRPTIME */ diff --git a/libguile/strings.c b/libguile/strings.c index c6464de6b..59487bd06 100644 --- a/libguile/strings.c +++ b/libguile/strings.c @@ -239,6 +239,36 @@ widen_stringbuf (SCM buf) } } +/* Convert a stringbuf of 32-bit UCS-4-encoded characters to one + containing 8-bit Latin-1-encoded characters, if possible. */ +static void +narrow_stringbuf (SCM buf) +{ + size_t i, len; + scm_t_wchar *wmem; + char *mem; + + if (!STRINGBUF_WIDE (buf)) + return; + + len = STRINGBUF_OUTLINE_LENGTH (buf); + i = 0; + wmem = STRINGBUF_WIDE_CHARS (buf); + while (i < len) + if (wmem[i++] > 0xFF) + return; + + mem = scm_gc_malloc (sizeof (char) * (len + 1), "string"); + for (i = 0; i < len; i++) + mem[i] = (unsigned char) wmem[i]; + + scm_gc_free (wmem, sizeof (scm_t_wchar) * (len + 1), "string"); + + SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_WIDE); + SCM_SET_CELL_WORD_1 (buf, mem); + SCM_SET_CELL_WORD_2 (buf, len); +} + scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER; /* Copy-on-write strings. 
@@ -459,6 +489,18 @@ scm_i_is_narrow_string (SCM str) return !STRINGBUF_WIDE (STRING_STRINGBUF (str)); } +/* Try to coerce a string to be narrow. It if is narrow already, do + nothing. If it is wide, shrink it to narrow if none of its + characters are above 0xFF. Return true if the string is narrow or + was made to be narrow. */ +int +scm_i_try_narrow_string (SCM str) +{ + narrow_stringbuf (STRING_STRINGBUF (str)); + + return scm_i_is_narrow_string (str); +} + /* Returns a pointer to the 8-bit Latin-1 encoded character array of STR. */ const char * @@ -623,7 +665,7 @@ scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr) if (scm_i_is_narrow_string (str)) { char *dst = scm_i_string_writable_chars (str); - dst[p] = (char) (unsigned char) chr; + dst[p] = chr; } else { @@ -633,7 +675,7 @@ scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr) } /* Symbols. - + Basic symbol creation and accessing is done here, the rest is in symbols.[hc]. This has been done to keep stringbufs and the internals of strings and string-like objects confined to this file. @@ -866,7 +908,7 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str), else e5 = scm_cons (scm_from_locale_symbol ("read-only"), SCM_BOOL_F); - + /* Stringbuf info */ if (!STRINGBUF_WIDE (buf)) { @@ -1426,6 +1468,80 @@ scm_from_locale_string (const char *str) return scm_from_locale_stringn (str, -1); } +static SCM +scm_from_stringn (const char *str, size_t len, const char *encoding, + scm_t_string_failed_conversion_handler handler) +{ + size_t u32len, i; + scm_t_wchar *u32; + int wide = 0; + SCM res; + + u32len = 0; + u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding, + (enum iconv_ilseq_handler) + handler, + str, len, + NULL, + NULL, &u32len); + + if (u32 == NULL) + { + if (errno == ENOMEM) + scm_memory_error ("locale string conversion"); + else + { + /* There are invalid sequences in the input string. 
Since + it is partially nonsense, what is the best strategy for + printing it in the error message? */ + SCM errstr; + char *dst; + /* We'll just print it unconverted and hope for the best. */ + errstr = scm_i_make_string (len, &dst); + memcpy (dst, str, len); + scm_misc_error (NULL, "input locale conversion error from ~s: ~s", + scm_list_2 (scm_from_locale_string (encoding), + errstr)); + scm_remember_upto_here_1 (errstr); + } + } + + i = 0; + while (i < u32len) + if (u32[i++] > 0xFF) + { + wide = 1; + break; + } + + if (!wide) + { + char *dst; + res = scm_i_make_string (u32len, &dst); + for (i = 0; i < u32len; i ++) + dst[i] = (unsigned char) u32[i]; + dst[u32len] = '\0'; + } + else + { + scm_t_wchar *wdst; + res = scm_i_make_wide_string (u32len, &wdst); + u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len); + wdst[u32len] = 0; + } + + free (u32); + return res; +} + +SCM +scm_i_from_utf8_string (const scm_t_uint8 *str) +{ + return scm_from_stringn ((const char *) str, + strlen ((char *) str), "UTF-8", + SCM_FAILED_CONVERSION_ERROR); +} + /* Create a new scheme string from the C string STR. The memory of STR may be used directly as storage for the new string. */ SCM @@ -1519,16 +1635,15 @@ scm_to_locale_stringn (SCM str, size_t * lenp) /* In the future, enc will hold the port's encoding. */ enc = NULL; - return scm_to_stringn (str, lenp, enc, + return scm_to_stringn (str, lenp, enc, SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE); } /* Low-level scheme to C string conversion function. 
*/ char * -scm_to_stringn (SCM str, size_t * lenp, const char *encoding, +scm_to_stringn (SCM str, size_t *lenp, const char *encoding, scm_t_string_failed_conversion_handler handler) { - static const char iso[11] = "ISO-8859-1"; char *buf; size_t ilen, len, i; @@ -1544,7 +1659,7 @@ scm_to_stringn (SCM str, size_t * lenp, const char *encoding, *lenp = 0; return buf; } - + if (lenp == NULL) for (i = 0; i < ilen; i++) if (scm_i_string_ref (str, i) == '\0') @@ -1570,16 +1685,16 @@ scm_to_stringn (SCM str, size_t * lenp, const char *encoding, } } - + buf = NULL; len = 0; - buf = u32_conv_to_encoding (iso, + buf = u32_conv_to_encoding (encoding ? encoding : "ISO-8859-1", (enum iconv_ilseq_handler) handler, (scm_t_uint32 *) scm_i_string_wide_chars (str), ilen, NULL, NULL, &len); if (buf == NULL) scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"", - scm_list_2 (scm_from_locale_string (iso), str)); + scm_list_2 (scm_from_locale_string (encoding), str)); if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE) unistring_escapes_to_guile_escapes (&buf, &len); @@ -1602,6 +1717,14 @@ scm_to_locale_string (SCM str) return scm_to_locale_stringn (str, NULL); } +scm_t_uint8 * +scm_i_to_utf8_string (SCM str) +{ + char *u8str; + u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR); + return (scm_t_uint8 *) u8str; +} + size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len) { diff --git a/libguile/strings.h b/libguile/strings.h index d0cbb8dd3..20726a3e7 100644 --- a/libguile/strings.h +++ b/libguile/strings.h @@ -124,6 +124,7 @@ SCM_API SCM scm_c_substring_copy (SCM str, size_t start, size_t end); SCM_API int scm_is_string (SCM x); SCM_API SCM scm_from_locale_string (const char *str); SCM_API SCM scm_from_locale_stringn (const char *str, size_t len); +SCM_INTERNAL SCM scm_i_from_utf8_string (const scm_t_uint8 *str); SCM_API SCM scm_take_locale_string (char *str); SCM_API SCM scm_take_locale_stringn (char *str, size_t len); SCM_API char 
*scm_to_locale_string (SCM str); @@ -132,6 +133,7 @@ SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp, const char *encoding, scm_t_string_failed_conversion_handler handler); +SCM_INTERNAL scm_t_uint8 *scm_i_to_utf8_string (SCM str); SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len); SCM_API SCM scm_makfromstrs (int argc, char **argv); @@ -168,6 +170,7 @@ SCM_INTERNAL const char *scm_i_symbol_chars (SCM sym); SCM_INTERNAL const scm_t_wchar *scm_i_symbol_wide_chars (SCM sym); SCM_INTERNAL size_t scm_i_symbol_length (SCM sym); SCM_INTERNAL int scm_i_is_narrow_symbol (SCM str); +SCM_INTERNAL int scm_i_try_narrow_string (SCM str); SCM_INTERNAL SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end); SCM_INTERNAL scm_t_wchar scm_i_symbol_ref (SCM sym, size_t x); diff --git a/test-suite/tests/time.test b/test-suite/tests/time.test index 38a49d384..da7a48c04 100644 --- a/test-suite/tests/time.test +++ b/test-suite/tests/time.test @@ -202,6 +202,11 @@ (string=? (strftime "%Z" t) "ZOW"))) + (pass-if "strftime passes wide characters" + (let ((t (localtime (current-time)))) + (string=? (substring (strftime "\u0100%Z" t) 0 1) + "\u0100"))) + (with-test-prefix "C99 %z format" ;; %z here is quite possibly affected by the same tm:gmtoff vs current |