Modify socket and time functions for wide strings

* libguile/socket.c (scm_recv): receive the message without holding the stringbuf writing lock (scm_send): try to narrow a string before using it * libguile/stime.c (strftime): convert string to UTF-8 so that it can be safely passed to strftime (strptime): convert input string to UTF-8 so that it can be safely passed through strptime * libguile/strings.c (narrow_stringbuf): new function (scm_i_try_narrow_string): new function * libguile/strings.h: new declaration for scm_i_try_narrow_string
author: Michael Gran <spk121@yahoo.com> 2009-08-23 06:50:45 -0700
committer: Michael Gran <spk121@yahoo.com> 2009-08-23 09:29:45 -0700
commit: 587a33556fdef90025c1b7d4d172af649c8ebba8 (patch)
tree: 97792a8e7e1a2b9cb05596df2332294a6c316671
parent: 27646f414e9350c2bf9f35982082bcabfb475c5d (diff)
download: guile-587a33556fdef90025c1b7d4d172af649c8ebba8.tar.gz
5 files changed, 200 insertions, 33 deletions
diff --git a/libguile/socket.c b/libguile/socket.c
index 2e02e9082..d463d04f4 100644
--- a/libguile/socket.c
+++ b/libguile/socket.c
@@ -33,6 +33,7 @@
 #include "libguile/strings.h"
 #include "libguile/vectors.h"
 #include "libguile/dynwind.h"
+#include "libguile/srfi-13.h"
 
 #include "libguile/validate.h"
 #include "libguile/socket.h"
@@ -1414,6 +1415,8 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
 	    "protocols, if a packet larger than this limit is encountered\n"
 	    "then some data\n"
 	    "will be irrevocably lost.\n\n"
+	    "The data is assumed to be binary, and there is no decoding of\n"
+	    "of locale-encoded strings.\n\n"
 	    "The optional @var{flags} argument is a value or\n"
 	    "bitwise OR of MSG_OOB, MSG_PEEK, MSG_DONTROUTE etc.\n\n"
 	    "The value returned is the number of bytes read from the\n"
@@ -1428,6 +1431,7 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
   int flg;
   char *dest;
   size_t len;
+  SCM msg;
 
   SCM_VALIDATE_OPFPORT (1, sock);
   SCM_VALIDATE_STRING (2, buf);
@@ -1437,16 +1441,16 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
     flg = scm_to_int (flags);
   fd = SCM_FPORT_FDES (sock);
 
-  len =  scm_i_string_length (buf);
-  buf = scm_i_string_start_writing (buf);
-  dest = scm_i_string_writable_chars (buf);
+  len = scm_i_string_length (buf);
+  msg = scm_i_make_string (len, &dest);
   SCM_SYSCALL (rv = recv (fd, dest, len, flg));
-  scm_i_string_stop_writing ();
+  scm_string_copy_x (buf, scm_from_int (0), 
+		     msg, scm_from_int (0), scm_from_size_t (len));
 
   if (rv == -1)
     SCM_SYSERROR;
 
-  scm_remember_upto_here_1 (buf);
+  scm_remember_upto_here_2 (buf, msg);
   return scm_from_int (rv);
 }
 #undef FUNC_NAME
@@ -1464,18 +1468,28 @@ SCM_DEFINE (scm_send, "send", 2, 1, 0,
 	    "bitwise OR of MSG_OOB, MSG_PEEK, MSG_DONTROUTE etc.\n\n"
 	    "Note that the data is written directly to the socket\n"
 	    "file descriptor:\n"
-	    "any unflushed buffered port data is ignored.")
+	    "any unflushed buffered port data is ignored.\n\n"
+	    "This operation is defined only for strings containing codepoints\n"
+	    "zero to 255.")
 #define FUNC_NAME s_scm_send
 {
   int rv;
   int fd;
   int flg;
-  const char *src;
+  char *src;
   size_t len;
 
   sock = SCM_COERCE_OUTPORT (sock);
   SCM_VALIDATE_OPFPORT (1, sock);
   SCM_VALIDATE_STRING (2, message);
+  
+  /* If the string is wide, see if it can be coerced into
+     a narrow string.  */
+  if (!scm_i_is_narrow_string (message)
+      || scm_i_try_narrow_string (message))
+    SCM_MISC_ERROR ("the message string is not 8-bit: ~s", 
+                        scm_list_1 (message));
+
   if (SCM_UNBNDP (flags))
     flg = 0;
   else
@@ -1592,7 +1606,9 @@ SCM_DEFINE (scm_sendto, "sendto", 3, 1, 1,
 	    "set to be non-blocking.\n"
 	    "Note that the data is written directly to the socket\n"
 	    "file descriptor:\n"
-	    "any unflushed buffered port data is ignored.")
+	    "any unflushed buffered port data is ignored.\n"
+	    "This operation is defined only for strings containing codepoints\n"
+	    "zero to 255.")
 #define FUNC_NAME s_scm_sendto
 {
   int rv;
diff --git a/libguile/stime.c b/libguile/stime.c
index a6843377b..54022c296 100644
--- a/libguile/stime.c
+++ b/libguile/stime.c
@@ -46,6 +46,7 @@
 #include <stdio.h>
 #include <errno.h>
 #include <strftime.h>
+#include <unistr.h>
 
 #include "libguile/_scm.h"
 #include "libguile/async.h"
@@ -53,6 +54,7 @@
 #include "libguile/strings.h"
 #include "libguile/vectors.h"
 #include "libguile/dynwind.h"
+#include "libguile/strings.h"
 
 #include "libguile/validate.h"
 #include "libguile/stime.h"
@@ -624,18 +626,20 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
 {
   struct tm t;
 
-  char *tbuf;
+  scm_t_uint8 *tbuf;
   int size = 50;
-  const char *fmt;
-  char *myfmt;
+  scm_t_uint8 *fmt;
+  scm_t_uint8 *myfmt;
   int len;
   SCM result;
 
   SCM_VALIDATE_STRING (1, format);
   bdtime2c (stime, &t, SCM_ARG2, FUNC_NAME);
 
-  fmt = scm_i_string_chars (format);
-  len = scm_i_string_length (format);
+  /* Convert string to UTF-8 so that non-ASCII characters in the
+     format are passed through unchanged.  */
+  fmt = scm_i_to_utf8_string (format);
+  len = strlen ((const char *) fmt);
 
   /* Ugly hack: strftime can return 0 if its buffer is too small,
      but some valid time strings (e.g. "%p") can sometimes produce
@@ -643,9 +647,11 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
      character to the format string, so that valid returns are always
      nonzero. */
   myfmt = scm_malloc (len+2);
-  *myfmt = 'x';
-  strncpy(myfmt+1, fmt, len);
-  myfmt[len+1] = 0;
+  *myfmt = (scm_t_uint8) 'x';
+  strncpy ((char *) myfmt + 1, (const char *) fmt, len);
+  myfmt[len + 1] = 0;
+  scm_remember_upto_here_1 (format);
+  free (fmt);
 
   tbuf = scm_malloc (size);
   {
@@ -680,7 +686,8 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
 
     /* Use `nstrftime ()' from Gnulib, which supports all GNU extensions
        supported by glibc.  */
-    while ((len = nstrftime (tbuf, size, myfmt, &t, 0, 0)) == 0)
+    while ((len = nstrftime ((char *) tbuf, size, 
+			     (const char *) myfmt, &t, 0, 0)) == 0)
       {
 	free (tbuf);
 	size *= 2;
@@ -696,7 +703,7 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
 #endif
     }
 
-  result = scm_from_locale_stringn (tbuf + 1, len - 1);
+  result = scm_i_from_utf8_string ((const scm_t_uint8 *) tbuf + 1);
   free (tbuf);
   free (myfmt);
 #if HAVE_STRUCT_TM_TM_ZONE
@@ -722,14 +729,17 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
 #define FUNC_NAME s_scm_strptime
 {
   struct tm t;
-  const char *fmt, *str, *rest;
+  scm_t_uint8 *fmt, *str, *rest;
+  size_t used_len;
   long zoff;
 
   SCM_VALIDATE_STRING (1, format);
   SCM_VALIDATE_STRING (2, string);
 
-  fmt = scm_i_string_chars (format);
-  str = scm_i_string_chars (string);
+  /* Convert strings to UTF-8 so that non-ASCII characters are passed
+     through unchanged.  */
+  fmt = scm_i_to_utf8_string (format);
+  str = scm_i_to_utf8_string (string);
 
   /* initialize the struct tm */
 #define tm_init(field) t.field = 0
@@ -751,7 +761,8 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
      fields, hence the use of SCM_CRITICAL_SECTION_START.  */
   t.tm_isdst = -1;
   SCM_CRITICAL_SECTION_START;
-  rest = strptime (str, fmt, &t);
+  rest = (scm_t_uint8 *) strptime ((const char *) str, 
+                                   (const char *) fmt, &t);
   SCM_CRITICAL_SECTION_END;
   if (rest == NULL)
     {
@@ -759,6 +770,9 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
          instance it doesn't.  Force a sensible value for our error
          message.  */
       errno = EINVAL;
+      scm_remember_upto_here_2 (format, string);
+      free (str);
+      free (fmt);
       SCM_SYSERROR;
     }
 
@@ -770,8 +784,14 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
   zoff = 0;
 #endif
 
+  /* Compute the number of UTF-8 characters.  */
+  used_len = u8_strnlen (str, rest-str);
+  scm_remember_upto_here_2 (format, string);
+  free (str);
+  free (fmt);
+
   return scm_cons (filltime (&t, zoff, NULL),
-		   scm_from_signed_integer (rest - str));
+		   scm_from_signed_integer (used_len));
 }
 #undef FUNC_NAME
 #endif /* HAVE_STRPTIME */
diff --git a/libguile/strings.c b/libguile/strings.c
index c6464de6b..59487bd06 100644
--- a/libguile/strings.c
+++ b/libguile/strings.c
@@ -239,6 +239,36 @@ widen_stringbuf (SCM buf)
     }
 }
 
+/* Convert a stringbuf of 32-bit UCS-4-encoded characters to one
+   containing 8-bit Latin-1-encoded characters, if possible.  */
+static void
+narrow_stringbuf (SCM buf)
+{
+  size_t i, len;
+  scm_t_wchar *wmem;
+  char *mem;
+
+  if (!STRINGBUF_WIDE (buf))
+    return;
+
+  len = STRINGBUF_OUTLINE_LENGTH (buf);
+  i = 0;
+  wmem = STRINGBUF_WIDE_CHARS (buf);
+  while (i < len)
+    if (wmem[i++] > 0xFF)
+      return;
+
+  mem = scm_gc_malloc (sizeof (char) * (len + 1), "string");
+  for (i = 0; i < len; i++)
+    mem[i] = (unsigned char) wmem[i];
+
+  scm_gc_free (wmem, sizeof (scm_t_wchar) * (len + 1), "string");
+
+  SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_WIDE);
+  SCM_SET_CELL_WORD_1 (buf, mem);
+  SCM_SET_CELL_WORD_2 (buf, len);
+}
+
 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 
 /* Copy-on-write strings.
@@ -459,6 +489,18 @@ scm_i_is_narrow_string (SCM str)
   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 }
 
+/* Try to coerce a string to be narrow.  It if is narrow already, do
+   nothing.  If it is wide, shrink it to narrow if none of its
+   characters are above 0xFF.  Return true if the string is narrow or
+   was made to be narrow.  */
+int
+scm_i_try_narrow_string (SCM str)
+{
+  narrow_stringbuf (STRING_STRINGBUF (str));
+
+  return scm_i_is_narrow_string (str);
+}
+
 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
    STR.  */
 const char *
@@ -623,7 +665,7 @@ scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
   if (scm_i_is_narrow_string (str))
     {
       char *dst = scm_i_string_writable_chars (str);
-      dst[p] = (char) (unsigned char) chr;
+      dst[p] = chr;
     }
   else
     {
@@ -633,7 +675,7 @@ scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 }
 
 /* Symbols.
- 
+
    Basic symbol creation and accessing is done here, the rest is in
    symbols.[hc].  This has been done to keep stringbufs and the
    internals of strings and string-like objects confined to this file.
@@ -866,7 +908,7 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
   else
     e5 = scm_cons (scm_from_locale_symbol ("read-only"),
                    SCM_BOOL_F);
-      
+
   /* Stringbuf info */
   if (!STRINGBUF_WIDE (buf))
     {
@@ -1426,6 +1468,80 @@ scm_from_locale_string (const char *str)
   return scm_from_locale_stringn (str, -1);
 }
 
+static SCM
+scm_from_stringn (const char *str, size_t len, const char *encoding,
+                  scm_t_string_failed_conversion_handler handler)
+{
+  size_t u32len, i;
+  scm_t_wchar *u32;
+  int wide = 0;
+  SCM res;
+
+  u32len = 0;
+  u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
+                                                (enum iconv_ilseq_handler)
+                                                handler,
+                                                str, len,
+                                                NULL,
+                                                NULL, &u32len);
+
+  if (u32 == NULL)
+    {
+      if (errno == ENOMEM)
+        scm_memory_error ("locale string conversion");
+      else
+        {
+          /* There are invalid sequences in the input string.  Since
+             it is partially nonsense, what is the best strategy for
+             printing it in the error message?  */
+          SCM errstr;
+          char *dst;
+          /* We'll just print it unconverted and hope for the best.  */
+          errstr = scm_i_make_string (len, &dst);
+          memcpy (dst, str, len);
+          scm_misc_error (NULL, "input locale conversion error from ~s: ~s",
+                          scm_list_2 (scm_from_locale_string (encoding),
+                                      errstr));
+          scm_remember_upto_here_1 (errstr);
+        }
+    }
+
+  i = 0;
+  while (i < u32len)
+    if (u32[i++] > 0xFF)
+      {
+        wide = 1;
+        break;
+      }
+
+  if (!wide)
+    {
+      char *dst;
+      res = scm_i_make_string (u32len, &dst);
+      for (i = 0; i < u32len; i ++)
+        dst[i] = (unsigned char) u32[i];
+      dst[u32len] = '\0';
+    }
+  else
+    {
+      scm_t_wchar *wdst;
+      res = scm_i_make_wide_string (u32len, &wdst);
+      u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
+      wdst[u32len] = 0;
+    }
+
+  free (u32);
+  return res;
+}
+
+SCM
+scm_i_from_utf8_string (const scm_t_uint8 *str)
+{
+  return scm_from_stringn ((const char *) str,
+                           strlen ((char *) str), "UTF-8",
+                           SCM_FAILED_CONVERSION_ERROR);
+}
+
 /* Create a new scheme string from the C string STR.  The memory of
    STR may be used directly as storage for the new string.  */
 SCM
@@ -1519,16 +1635,15 @@ scm_to_locale_stringn (SCM str, size_t * lenp)
   /* In the future, enc will hold the port's encoding.  */
   enc = NULL;
 
-  return scm_to_stringn (str, lenp, enc, 
+  return scm_to_stringn (str, lenp, enc,
                          SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
 }
 
 /* Low-level scheme to C string conversion function.  */
 char *
-scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
+scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
                 scm_t_string_failed_conversion_handler handler)
 {
-  static const char iso[11] = "ISO-8859-1";
   char *buf;
   size_t ilen, len, i;
 
@@ -1544,7 +1659,7 @@ scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
         *lenp = 0;
       return buf;
     }
-	
+
   if (lenp == NULL)
     for (i = 0; i < ilen; i++)
       if (scm_i_string_ref (str, i) == '\0')
@@ -1570,16 +1685,16 @@ scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
         }
     }
 
-  
+
   buf = NULL;
   len = 0;
-  buf = u32_conv_to_encoding (iso,
+  buf = u32_conv_to_encoding (encoding ? encoding : "ISO-8859-1",
                               (enum iconv_ilseq_handler) handler,
                               (scm_t_uint32 *) scm_i_string_wide_chars (str),
                               ilen, NULL, NULL, &len);
   if (buf == NULL)
     scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
-                    scm_list_2 (scm_from_locale_string (iso), str));
+                    scm_list_2 (scm_from_locale_string (encoding), str));
 
   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
     unistring_escapes_to_guile_escapes (&buf, &len);
@@ -1602,6 +1717,14 @@ scm_to_locale_string (SCM str)
   return scm_to_locale_stringn (str, NULL);
 }
 
+scm_t_uint8 *
+scm_i_to_utf8_string (SCM str)
+{
+  char *u8str;
+  u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
+  return (scm_t_uint8 *) u8str;
+}
+
 size_t
 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
 {
diff --git a/libguile/strings.h b/libguile/strings.h
index d0cbb8dd3..20726a3e7 100644
--- a/libguile/strings.h
+++ b/libguile/strings.h
@@ -124,6 +124,7 @@ SCM_API SCM scm_c_substring_copy (SCM str, size_t start, size_t end);
 SCM_API int scm_is_string (SCM x);
 SCM_API SCM scm_from_locale_string (const char *str);
 SCM_API SCM scm_from_locale_stringn (const char *str, size_t len);
+SCM_INTERNAL SCM scm_i_from_utf8_string (const scm_t_uint8 *str);
 SCM_API SCM scm_take_locale_string (char *str);
 SCM_API SCM scm_take_locale_stringn (char *str, size_t len);
 SCM_API char *scm_to_locale_string (SCM str);
@@ -132,6 +133,7 @@ SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp,
                                    const char *encoding,
                                    scm_t_string_failed_conversion_handler
                                    handler);
+SCM_INTERNAL scm_t_uint8 *scm_i_to_utf8_string (SCM str);
 SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len);
 
 SCM_API SCM scm_makfromstrs (int argc, char **argv);
@@ -168,6 +170,7 @@ SCM_INTERNAL const char *scm_i_symbol_chars (SCM sym);
 SCM_INTERNAL const scm_t_wchar *scm_i_symbol_wide_chars (SCM sym);
 SCM_INTERNAL size_t scm_i_symbol_length (SCM sym);
 SCM_INTERNAL int scm_i_is_narrow_symbol (SCM str);
+SCM_INTERNAL int scm_i_try_narrow_string (SCM str);
 SCM_INTERNAL SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end);
 SCM_INTERNAL scm_t_wchar scm_i_symbol_ref (SCM sym, size_t x);
 
diff --git a/test-suite/tests/time.test b/test-suite/tests/time.test
index 38a49d384..da7a48c04 100644
--- a/test-suite/tests/time.test
+++ b/test-suite/tests/time.test
@@ -202,6 +202,11 @@
       (string=? (strftime "%Z" t)
                 "ZOW")))
 
+  (pass-if "strftime passes wide characters"
+    (let ((t (localtime (current-time))))
+      (string=? (substring (strftime "\u0100%Z" t) 0 1)
+                "\u0100")))
+
   (with-test-prefix "C99 %z format"
 
     ;; %z here is quite possibly affected by the same tm:gmtoff vs current
author	Michael Gran <spk121@yahoo.com>	2009-08-23 06:50:45 -0700
committer	Michael Gran <spk121@yahoo.com>	2009-08-23 09:29:45 -0700
commit	587a33556fdef90025c1b7d4d172af649c8ebba8 (patch)
tree	97792a8e7e1a2b9cb05596df2332294a6c316671
parent	27646f414e9350c2bf9f35982082bcabfb475c5d (diff)
download	guile-587a33556fdef90025c1b7d4d172af649c8ebba8.tar.gz