summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndy Wingo <wingo@pobox.com>2009-08-25 21:43:00 +0200
committerAndy Wingo <wingo@pobox.com>2009-08-25 21:43:00 +0200
commitc6a1380bde978194ee5c533246285f6af4bbb554 (patch)
tree81bb98421030e83688331b8a12b52bd93de7b403
parent108e18b18abc066b2709a09283751e9138ccc935 (diff)
parent889975e51accb80491af76fc5db980aeb3edd342 (diff)
downloadguile-c6a1380bde978194ee5c533246285f6af4bbb554.tar.gz
Merge commit 'origin/master'
Conflicts: libguile/unif.c
-rw-r--r--libguile/chars.h6
-rw-r--r--libguile/deprecated.c5
-rw-r--r--libguile/fports.c4
-rw-r--r--libguile/goops.c65
-rw-r--r--libguile/inline.h25
-rw-r--r--libguile/load.c11
-rw-r--r--libguile/numbers.h6
-rw-r--r--libguile/ports.c557
-rw-r--r--libguile/ports.h20
-rw-r--r--libguile/posix.c46
-rw-r--r--libguile/posix.h1
-rw-r--r--libguile/print.c103
-rw-r--r--libguile/print.h3
-rw-r--r--libguile/rdelim.c6
-rw-r--r--libguile/read.c383
-rw-r--r--libguile/read.h2
-rw-r--r--libguile/socket.c32
-rw-r--r--libguile/stime.c50
-rw-r--r--libguile/strings.c264
-rw-r--r--libguile/strings.h4
-rw-r--r--libguile/strports.c137
-rw-r--r--libguile/strports.h6
-rw-r--r--libguile/struct.c76
-rw-r--r--libguile/throw.c9
-rw-r--r--module/system/base/compile.scm7
-rw-r--r--test-suite/tests/encoding-escapes.test139
-rw-r--r--test-suite/tests/encoding-iso88591.test135
-rw-r--r--test-suite/tests/encoding-iso88597.test136
-rw-r--r--test-suite/tests/encoding-utf8.test105
-rw-r--r--test-suite/tests/numbers.test1
-rw-r--r--test-suite/tests/ports.test3
-rw-r--r--test-suite/tests/r6rs-ports.test3
-rw-r--r--test-suite/tests/time.test5
33 files changed, 1980 insertions, 375 deletions
diff --git a/libguile/chars.h b/libguile/chars.h
index f75aeadd1..85b16739a 100644
--- a/libguile/chars.h
+++ b/libguile/chars.h
@@ -24,7 +24,11 @@
#include "libguile/__scm.h"
-#include "libguile/numbers.h"
+
+#ifndef SCM_T_WCHAR_DEFINED
+typedef scm_t_int32 scm_t_wchar;
+#define SCM_T_WCHAR_DEFINED
+#endif /* SCM_T_WCHAR_DEFINED */
/* Immediate Characters
diff --git a/libguile/deprecated.c b/libguile/deprecated.c
index 98ca5bdea..6ecef3b4b 100644
--- a/libguile/deprecated.c
+++ b/libguile/deprecated.c
@@ -1076,7 +1076,8 @@ SCM_DEFINE (scm_gentemp, "gentemp", 0, 2, 0,
{
char buf[MAX_PREFIX_LENGTH + SCM_INTBUFLEN];
char *name = buf;
- int len, n_digits;
+ int n_digits;
+ size_t len;
scm_c_issue_deprecation_warning ("`gentemp' is deprecated. "
"Use `gensym' instead.");
@@ -1090,7 +1091,7 @@ SCM_DEFINE (scm_gentemp, "gentemp", 0, 2, 0,
{
SCM_VALIDATE_STRING (1, prefix);
len = scm_i_string_length (prefix);
- name = scm_to_locale_stringn (prefix, (size_t *)(&len));
+ name = scm_to_locale_stringn (prefix, &len);
name = scm_realloc (name, len + SCM_INTBUFLEN);
}
diff --git a/libguile/fports.c b/libguile/fports.c
index cfb8b25b6..8e25ebdd5 100644
--- a/libguile/fports.c
+++ b/libguile/fports.c
@@ -587,7 +587,7 @@ static void fport_flush (SCM port);
/* fill a port's read-buffer with a single read. returns the first
char or EOF if end of file. */
-static int
+static scm_t_wchar
fport_fill_input (SCM port)
{
long count;
@@ -601,7 +601,7 @@ fport_fill_input (SCM port)
if (count == -1)
scm_syserror ("fport_fill_input");
if (count == 0)
- return EOF;
+ return (scm_t_wchar) EOF;
else
{
pt->read_pos = pt->read_buf;
diff --git a/libguile/goops.c b/libguile/goops.c
index 8145e4162..d1beab3d6 100644
--- a/libguile/goops.c
+++ b/libguile/goops.c
@@ -176,6 +176,8 @@ static SCM scm_unbound_p (SCM obj);
static SCM scm_assert_bound (SCM value, SCM obj);
static SCM scm_at_assert_bound_ref (SCM obj, SCM index);
static SCM scm_sys_goops_loaded (void);
+static SCM scm_make_extended_class_from_symbol (SCM type_name_sym,
+ int applicablep);
/* This function is used for efficient type dispatch. */
SCM_DEFINE (scm_class_of, "class-of", 1, 0, 0,
@@ -281,9 +283,9 @@ SCM_DEFINE (scm_class_of, "class-of", 1, 0, 0,
else
{
SCM name = SCM_STRUCT_TABLE_NAME (SCM_CDR (handle));
- SCM class = scm_make_extended_class (scm_is_true (name)
- ? scm_i_symbol_chars (name)
- : 0,
+ SCM class = scm_make_extended_class_from_symbol (scm_is_true (name)
+ ? name
+ : scm_nullstr,
SCM_I_OPERATORP (x));
SCM_SET_STRUCT_TABLE_CLASS (SCM_CDR (handle), class);
return class;
@@ -1526,11 +1528,11 @@ wrap_init (SCM class, SCM *m, long n)
{
long i;
scm_t_bits slayout = SCM_STRUCT_DATA (class)[scm_vtable_index_layout];
- const char *layout = scm_i_symbol_chars (SCM_PACK (slayout));
+ SCM layout = SCM_PACK (slayout);
/* Set all SCM-holding slots to unbound */
for (i = 0; i < n; i++)
- if (layout[i*2] == 'p')
+ if (scm_i_symbol_ref (layout, i*2) == 'p')
m[i] = SCM_GOOPS_UNBOUND;
else
m[i] = 0;
@@ -2680,6 +2682,34 @@ make_class_from_template (char const *template, char const *type_name, SCM super
return class;
}
+static SCM
+make_class_from_symbol (SCM type_name_sym, SCM supers, int applicablep)
+{
+ SCM class, name;
+ if (type_name_sym != SCM_BOOL_F)
+ {
+ name = scm_string_append (scm_list_3 (scm_from_locale_string ("<"),
+ scm_symbol_to_string (type_name_sym),
+ scm_from_locale_string (">")));
+ name = scm_string_to_symbol (name);
+ }
+ else
+ name = SCM_GOOPS_UNBOUND;
+
+ class = scm_permanent_object (scm_basic_make_class (applicablep
+ ? scm_class_procedure_class
+ : scm_class_class,
+ name,
+ supers,
+ SCM_EOL));
+
+ /* Only define name if doesn't already exist. */
+ if (!SCM_GOOPS_UNBOUNDP (name)
+ && scm_is_false (scm_module_variable (scm_module_goops, name)))
+ DEFVAR (name, class);
+ return class;
+}
+
SCM
scm_make_extended_class (char const *type_name, int applicablep)
{
@@ -2691,6 +2721,16 @@ scm_make_extended_class (char const *type_name, int applicablep)
applicablep);
}
+static SCM
+scm_make_extended_class_from_symbol (SCM type_name_sym, int applicablep)
+{
+ return make_class_from_symbol (type_name_sym,
+ scm_list_1 (applicablep
+ ? scm_class_applicable
+ : scm_class_top),
+ applicablep);
+}
+
void
scm_i_inherit_applicable (SCM c)
{
@@ -2783,11 +2823,16 @@ static SCM
make_struct_class (void *closure SCM_UNUSED,
SCM vtable, SCM data, SCM prev SCM_UNUSED)
{
- if (scm_is_true (SCM_STRUCT_TABLE_NAME (data)))
- SCM_SET_STRUCT_TABLE_CLASS (data,
- scm_make_extended_class
- (scm_i_symbol_chars (SCM_STRUCT_TABLE_NAME (data)),
- SCM_CLASS_FLAGS (vtable) & SCM_CLASSF_OPERATOR));
+ SCM sym = SCM_STRUCT_TABLE_NAME (data);
+ if (scm_is_true (sym))
+ {
+ int applicablep = SCM_CLASS_FLAGS (vtable) & SCM_CLASSF_OPERATOR;
+
+ SCM_SET_STRUCT_TABLE_CLASS (data,
+ scm_make_extended_class_from_symbol (sym, applicablep));
+ }
+
+ scm_remember_upto_here_2 (data, vtable);
return SCM_UNSPECIFIED;
}
diff --git a/libguile/inline.h b/libguile/inline.h
index f7a216d7d..574bbfcd3 100644
--- a/libguile/inline.h
+++ b/libguile/inline.h
@@ -87,7 +87,7 @@ SCM_API void scm_array_handle_set (scm_t_array_handle *h, ssize_t pos, SCM val);
SCM_API int scm_is_pair (SCM x);
-SCM_API int scm_getc (SCM port);
+SCM_API int scm_get_byte_or_eof (SCM port);
SCM_API void scm_putc (char c, SCM port);
SCM_API void scm_puts (const char *str_data, SCM port);
@@ -299,7 +299,7 @@ scm_is_pair (SCM x)
SCM_C_EXTERN_INLINE
#endif
int
-scm_getc (SCM port)
+scm_get_byte_or_eof (SCM port)
{
int c;
scm_t_port *pt = SCM_PTAB_ENTRY (port);
@@ -319,27 +319,6 @@ scm_getc (SCM port)
c = *(pt->read_pos++);
- switch (c)
- {
- case '\a':
- break;
- case '\b':
- SCM_DECCOL (port);
- break;
- case '\n':
- SCM_INCLINE (port);
- break;
- case '\r':
- SCM_ZEROCOL (port);
- break;
- case '\t':
- SCM_TABCOL (port);
- break;
- default:
- SCM_INCCOL (port);
- break;
- }
-
return c;
}
diff --git a/libguile/load.c b/libguile/load.c
index 9e54dfab9..e2c0dbf99 100644
--- a/libguile/load.c
+++ b/libguile/load.c
@@ -85,6 +85,7 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
#define FUNC_NAME s_scm_primitive_load
{
SCM hook = *scm_loc_load_hook;
+ char *encoding;
SCM_VALIDATE_STRING (1, filename);
if (scm_is_true (hook) && scm_is_false (scm_procedure_p (hook)))
SCM_MISC_ERROR ("value of %load-hook is neither a procedure nor #f",
@@ -97,7 +98,15 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
SCM port = scm_open_file (filename, scm_from_locale_string ("r"));
scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
scm_i_dynwind_current_load_port (port);
-
+ encoding = scm_scan_for_encoding (port);
+ if (encoding)
+ {
+ scm_i_set_port_encoding_x (port, encoding);
+ free (encoding);
+ }
+ else
+ /* The file has no encoding declaraed. We'll presume Latin-1. */
+ scm_i_set_port_encoding_x (port, NULL);
while (1)
{
SCM reader, form;
diff --git a/libguile/numbers.h b/libguile/numbers.h
index eaa57287a..9597afb8d 100644
--- a/libguile/numbers.h
+++ b/libguile/numbers.h
@@ -28,6 +28,11 @@
#include "libguile/__scm.h"
#include "libguile/print.h"
+#ifndef SCM_T_WCHAR_DEFINED
+typedef scm_t_int32 scm_t_wchar;
+#define SCM_T_WCHAR_DEFINED
+#endif /* SCM_T_WCHAR_DEFINED */
+
#if SCM_HAVE_FLOATINGPOINT_H
# include <floatingpoint.h>
#endif
@@ -174,7 +179,6 @@ typedef struct scm_t_complex
double imag;
} scm_t_complex;
-typedef scm_t_int32 scm_t_wchar;
diff --git a/libguile/ports.c b/libguile/ports.c
index 1ddeaa364..749d97533 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -30,6 +30,9 @@
#include <errno.h>
#include <fcntl.h> /* for chsize on mingw */
#include <assert.h>
+#include <uniconv.h>
+#include <unistr.h>
+#include <striconveh.h>
#include "libguile/_scm.h"
#include "libguile/async.h"
@@ -51,6 +54,7 @@
#include "libguile/vectors.h"
#include "libguile/weaks.h"
#include "libguile/fluids.h"
+#include "libguile/eq.h"
#ifdef HAVE_STRING_H
#include <string.h>
@@ -358,10 +362,10 @@ SCM_DEFINE (scm_drain_input, "drain-input", 1, 0, 0,
/* Standard ports --- current input, output, error, and more(!). */
-static SCM cur_inport_fluid;
-static SCM cur_outport_fluid;
-static SCM cur_errport_fluid;
-static SCM cur_loadport_fluid;
+static SCM cur_inport_fluid = 0;
+static SCM cur_outport_fluid = 0;
+static SCM cur_errport_fluid = 0;
+static SCM cur_loadport_fluid = 0;
SCM_DEFINE (scm_current_input_port, "current-input-port", 0, 0, 0,
(),
@@ -370,7 +374,10 @@ SCM_DEFINE (scm_current_input_port, "current-input-port", 0, 0, 0,
"returns the @dfn{standard input} in Unix and C terminology.")
#define FUNC_NAME s_scm_current_input_port
{
- return scm_fluid_ref (cur_inport_fluid);
+ if (cur_inport_fluid)
+ return scm_fluid_ref (cur_inport_fluid);
+ else
+ return SCM_BOOL_F;
}
#undef FUNC_NAME
@@ -382,7 +389,10 @@ SCM_DEFINE (scm_current_output_port, "current-output-port", 0, 0, 0,
"Unix and C terminology.")
#define FUNC_NAME s_scm_current_output_port
{
- return scm_fluid_ref (cur_outport_fluid);
+ if (cur_outport_fluid)
+ return scm_fluid_ref (cur_outport_fluid);
+ else
+ return SCM_BOOL_F;
}
#undef FUNC_NAME
@@ -392,7 +402,10 @@ SCM_DEFINE (scm_current_error_port, "current-error-port", 0, 0, 0,
"@dfn{standard error} in Unix and C terminology).")
#define FUNC_NAME s_scm_current_error_port
{
- return scm_fluid_ref (cur_errport_fluid);
+ if (cur_errport_fluid)
+ return scm_fluid_ref (cur_errport_fluid);
+ else
+ return SCM_BOOL_F;
}
#undef FUNC_NAME
@@ -509,10 +522,18 @@ scm_new_port_table_entry (scm_t_bits tag)
SCM z = scm_cons (SCM_EOL, SCM_EOL);
scm_t_port *entry = (scm_t_port *) scm_gc_calloc (sizeof (scm_t_port), "port");
+ const char *enc;
entry->file_name = SCM_BOOL_F;
entry->rw_active = SCM_PORT_NEITHER;
entry->port = z;
+ /* Initialize this port with the thread's current default
+ encoding. */
+ if ((enc = scm_i_get_port_encoding (SCM_BOOL_F)) == NULL)
+ entry->encoding = NULL;
+ else
+ entry->encoding = strdup (enc);
+ entry->ilseq_handler = scm_i_get_conversion_strategy (SCM_BOOL_F);
SCM_SET_CELL_TYPE (z, tag);
SCM_SETPTAB_ENTRY (z, entry);
@@ -549,6 +570,11 @@ scm_i_remove_port (SCM port)
scm_t_port *p = SCM_PTAB_ENTRY (port);
if (p->putback_buf)
scm_gc_free (p->putback_buf, p->putback_buf_size, "putback buffer");
+ if (p->encoding)
+ {
+ free (p->encoding);
+ p->encoding = NULL;
+ }
scm_gc_free (p, sizeof (scm_t_port), "port");
SCM_SETPTAB_ENTRY (port, 0);
@@ -632,21 +658,22 @@ SCM_DEFINE (scm_set_port_revealed_x, "set-port-revealed!", 2, 0, 0,
*/
static long
-scm_i_mode_bits_n (const char *modes, size_t n)
+scm_i_mode_bits_n (SCM modes)
{
return (SCM_OPN
- | (memchr (modes, 'r', n) || memchr (modes, '+', n) ? SCM_RDNG : 0)
- | ( memchr (modes, 'w', n)
- || memchr (modes, 'a', n)
- || memchr (modes, '+', n) ? SCM_WRTNG : 0)
- | (memchr (modes, '0', n) ? SCM_BUF0 : 0)
- | (memchr (modes, 'l', n) ? SCM_BUFLINE : 0));
+ | (scm_i_string_contains_char (modes, 'r')
+ || scm_i_string_contains_char (modes, '+') ? SCM_RDNG : 0)
+ | (scm_i_string_contains_char (modes, 'w')
+ || scm_i_string_contains_char (modes, 'a')
+ || scm_i_string_contains_char (modes, '+') ? SCM_WRTNG : 0)
+ | (scm_i_string_contains_char (modes, '0') ? SCM_BUF0 : 0)
+ | (scm_i_string_contains_char (modes, 'l') ? SCM_BUFLINE : 0));
}
long
scm_mode_bits (char *modes)
{
- return scm_i_mode_bits_n (modes, strlen (modes));
+ return scm_i_mode_bits (scm_from_locale_string (modes));
}
long
@@ -657,8 +684,7 @@ scm_i_mode_bits (SCM modes)
if (!scm_is_string (modes))
scm_wrong_type_arg_msg (NULL, 0, modes, "string");
- bits = scm_i_mode_bits_n (scm_i_string_chars (modes),
- scm_i_string_length (modes));
+ bits = scm_i_mode_bits_n (modes);
scm_remember_upto_here_1 (modes);
return bits;
}
@@ -929,7 +955,7 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
"characters are available, the end-of-file object is returned.")
#define FUNC_NAME s_scm_read_char
{
- int c;
+ scm_t_wchar c;
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
SCM_VALIDATE_OPINPORT (1, port);
@@ -940,6 +966,133 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
}
#undef FUNC_NAME
+#define SCM_MBCHAR_BUF_SIZE (4)
+
+/* Get one codepoint from a file, using the port's encoding. */
+scm_t_wchar
+scm_getc (SCM port)
+{
+ int c;
+ unsigned int bufcount = 0;
+ char buf[SCM_MBCHAR_BUF_SIZE];
+ scm_t_wchar codepoint = 0;
+ scm_t_uint32 *u32;
+ size_t u32len;
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+ c = scm_get_byte_or_eof (port);
+ if (c == EOF)
+ return (scm_t_wchar) EOF;
+
+ buf[0] = c;
+ bufcount++;
+
+ if (pt->encoding == NULL)
+ {
+ /* The encoding is Latin-1: bytes are characters. */
+ codepoint = buf[0];
+ goto success;
+ }
+
+ for (;;)
+ {
+ u32 = u32_conv_from_encoding (pt->encoding,
+ (enum iconv_ilseq_handler) pt->ilseq_handler,
+ buf, bufcount, NULL, NULL, &u32len);
+ if (u32 == NULL || u32len == 0)
+ {
+ if (errno == ENOMEM)
+ scm_memory_error ("Input decoding");
+
+ /* Otherwise errno is EILSEQ or EINVAL, so perhaps more
+ bytes are needed. Keep looping. */
+ }
+ else
+ {
+ /* Complete codepoint found. */
+ codepoint = u32[0];
+ free (u32);
+ goto success;
+ }
+
+ if (bufcount == SCM_MBCHAR_BUF_SIZE)
+ {
+ /* We've read several bytes and didn't find a good
+ codepoint. Give up. */
+ goto failure;
+ }
+
+ c = scm_get_byte_or_eof (port);
+
+ if (c == EOF)
+ {
+ /* EOF before a complete character was read. Push it all
+ back and return EOF. */
+ while (bufcount > 0)
+ {
+ /* FIXME: this will probably cause errors in the port column. */
+ scm_unget_byte (buf[bufcount-1], port);
+ bufcount --;
+ }
+ return EOF;
+ }
+
+ if (c == '\n')
+ {
+ /* It is always invalid to have EOL in the middle of a
+ multibyte character. */
+ scm_unget_byte ('\n', port);
+ goto failure;
+ }
+
+ buf[bufcount++] = c;
+ }
+
+ success:
+ switch (codepoint)
+ {
+ case '\a':
+ break;
+ case '\b':
+ SCM_DECCOL (port);
+ break;
+ case '\n':
+ SCM_INCLINE (port);
+ break;
+ case '\r':
+ SCM_ZEROCOL (port);
+ break;
+ case '\t':
+ SCM_TABCOL (port);
+ break;
+ default:
+ SCM_INCCOL (port);
+ break;
+ }
+
+ return codepoint;
+
+ failure:
+ {
+ char *err_buf;
+ SCM err_str = scm_i_make_string (bufcount, &err_buf);
+ memcpy (err_buf, buf, bufcount);
+
+ if (errno == EILSEQ)
+ scm_misc_error (NULL, "input encoding error for ~s: ~s",
+ scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
+ err_str));
+ else
+ scm_misc_error (NULL, "input encoding error (invalid) for ~s: ~s\n",
+ scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
+ err_str));
+ }
+
+ /* Never gets here. */
+ return 0;
+}
+
+
/* this should only be called when the read buffer is empty. it
tries to refill the read buffer. it returns the first char from
the port, which is either EOF or *(pt->read_pos). */
@@ -1027,7 +1180,7 @@ scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port)
stringbuf_write_mutex. So, one shouldn't use scm_lfwrite_substr
if the stringbuf write mutex may still be held elsewhere. */
buf = scm_to_stringn (scm_c_substring (str, start, end), &len,
- NULL, SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
+ pt->encoding, pt->ilseq_handler);
ptob->write (port, buf, len);
free (buf);
@@ -1056,7 +1209,7 @@ scm_lfwrite_str (SCM str, SCM port)
scm_end_input (port);
buf = scm_to_stringn (str, &len,
- NULL, SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
+ pt->encoding, pt->ilseq_handler);
ptob->write (port, buf, len);
free (buf);
@@ -1257,8 +1410,8 @@ scm_end_input (SCM port)
void
-scm_ungetc (int c, SCM port)
-#define FUNC_NAME "scm_ungetc"
+scm_unget_byte (int c, SCM port)
+#define FUNC_NAME "scm_unget_byte"
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
@@ -1317,6 +1470,25 @@ scm_ungetc (int c, SCM port)
if (pt->rw_random)
pt->rw_active = SCM_PORT_READ;
+}
+#undef FUNC_NAME
+
+void
+scm_ungetc (scm_t_wchar c, SCM port)
+#define FUNC_NAME "scm_ungetc"
+{
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ scm_t_wchar *wbuf;
+ SCM str = scm_i_make_wide_string (1, &wbuf);
+ char *buf;
+ size_t len;
+ int i;
+
+ wbuf[0] = c;
+ buf = scm_to_stringn (str, &len, pt->encoding, pt->ilseq_handler);
+
+ for (i = len - 1; i >= 0; i--)
+ scm_unget_byte (buf[i], port);
if (c == '\n')
{
@@ -1363,7 +1535,7 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
"to @code{read-char} would have hung.")
#define FUNC_NAME s_scm_peek_char
{
- int c, column;
+ scm_t_wchar c, column;
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
else
@@ -1409,13 +1581,17 @@ SCM_DEFINE (scm_unread_string, "unread-string", 2, 0, 0,
"@var{port} is not supplied, the current-input-port is used.")
#define FUNC_NAME s_scm_unread_string
{
+ int n;
SCM_VALIDATE_STRING (1, str);
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
else
SCM_VALIDATE_OPINPORT (2, port);
- scm_ungets (scm_i_string_chars (str), scm_i_string_length (str), port);
+ n = scm_i_string_length (str);
+
+ while (n--)
+ scm_ungetc (scm_i_string_ref (str, n), port);
return str;
}
@@ -1670,6 +1846,328 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0,
}
#undef FUNC_NAME
+/* The default port encoding for this locale. New ports will have this
+ encoding. If it is a string, that is the encoding. If it #f, it
+ is in the native (Latin-1) encoding. */
+SCM_GLOBAL_VARIABLE (scm_port_encoding_var, "%port-encoding");
+static int scm_port_encoding_init = 0;
+
+/* Return a C string representation of the current encoding. */
+const char *
+scm_i_get_port_encoding (SCM port)
+{
+ SCM encoding;
+
+ if (scm_is_false (port))
+ {
+ if (!scm_port_encoding_init)
+ return NULL;
+ else if (!scm_is_fluid (SCM_VARIABLE_REF (scm_port_encoding_var)))
+ return NULL;
+ else
+ {
+ encoding = scm_fluid_ref (SCM_VARIABLE_REF (scm_port_encoding_var));
+ if (!scm_is_string (encoding))
+ return NULL;
+ else
+ return scm_i_string_chars (encoding);
+ }
+ }
+ else
+ {
+ scm_t_port *pt;
+ pt = SCM_PTAB_ENTRY (port);
+ if (pt->encoding)
+ return pt->encoding;
+ else
+ return NULL;
+ }
+}
+
+/* Returns ENC is if is a recognized encoding. If it isn't, it tries
+ to find an alias of ENC that is valid. Otherwise, it returns
+ NULL. */
+static const char *
+find_valid_encoding (const char *enc)
+{
+ int isvalid = 0;
+ const char str[] = " ";
+ scm_t_uint32 *u32;
+ size_t u32len;
+
+ u32 = u32_conv_from_encoding (enc, iconveh_error, str, 1,
+ NULL, NULL, &u32len);
+ isvalid = (u32 != NULL);
+ free (u32);
+
+ if (isvalid)
+ return enc;
+
+ return NULL;
+}
+
+void
+scm_i_set_port_encoding_x (SCM port, const char *enc)
+{
+ const char *valid_enc;
+ scm_t_port *pt;
+
+ /* Null is shorthand for the native, Latin-1 encoding. */
+ if (enc == NULL)
+ valid_enc = NULL;
+ else
+ {
+ valid_enc = find_valid_encoding (enc);
+ if (valid_enc == NULL)
+ {
+ SCM err;
+ err = scm_from_locale_string (enc);
+ scm_misc_error (NULL, "invalid or unknown character encoding ~s",
+ scm_list_1 (err));
+ }
+ }
+
+ if (scm_is_false (port))
+ {
+ /* Set the default encoding for future ports. */
+ if (!scm_port_encoding_init
+ || !scm_is_fluid (SCM_VARIABLE_REF (scm_port_encoding_var)))
+ scm_misc_error (NULL, "tried to set port encoding fluid before it is initialized",
+ SCM_EOL);
+
+ if (valid_enc == NULL
+ || !strcmp (valid_enc, "ASCII")
+ || !strcmp (valid_enc, "ANSI_X3.4-1968")
+ || !strcmp (valid_enc, "ISO-8859-1"))
+ scm_fluid_set_x (SCM_VARIABLE_REF (scm_port_encoding_var), SCM_BOOL_F);
+ else
+ scm_fluid_set_x (SCM_VARIABLE_REF (scm_port_encoding_var),
+ scm_from_locale_string (valid_enc));
+ }
+ else
+ {
+ /* Set the character encoding for this port. */
+ pt = SCM_PTAB_ENTRY (port);
+ if (pt->encoding)
+ free (pt->encoding);
+ if (valid_enc == NULL)
+ pt->encoding = NULL;
+ else
+ pt->encoding = strdup (valid_enc);
+ }
+}
+
+SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
+ (SCM port),
+ "Returns, as a string, the character encoding that @var{port}\n"
+ "uses to interpret its input and output.\n")
+#define FUNC_NAME s_scm_port_encoding
+{
+ scm_t_port *pt;
+ const char *enc;
+
+ SCM_VALIDATE_PORT (1, port);
+
+ pt = SCM_PTAB_ENTRY (port);
+ enc = scm_i_get_port_encoding (port);
+ if (enc)
+ return scm_from_locale_string (pt->encoding);
+ else
+ return scm_from_locale_string ("NONE");
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_set_port_encoding_x, "set-port-encoding!", 2, 0, 0,
+ (SCM port, SCM enc),
+ "Sets the character encoding that will be used to interpret all\n"
+ "port I/O. New ports are created with the encoding\n"
+ "appropriate for the current locale if @code{setlocale} has \n"
+ "been called or ISO-8859-1 otherwise\n"
+ "and this procedure can be used to modify that encoding.\n")
+
+#define FUNC_NAME s_scm_set_port_encoding_x
+{
+ char *enc_str;
+ const char *valid_enc_str;
+
+ SCM_VALIDATE_PORT (1, port);
+ SCM_VALIDATE_STRING (2, enc);
+
+ enc_str = scm_to_locale_string (enc);
+ valid_enc_str = find_valid_encoding (enc_str);
+ if (valid_enc_str == NULL)
+ {
+ free (enc_str);
+ scm_misc_error (FUNC_NAME, "invalid or unknown character encoding ~s",
+ scm_list_1 (enc));
+ }
+ else
+ {
+ scm_i_set_port_encoding_x (port, valid_enc_str);
+ free (enc_str);
+ }
+ return SCM_UNSPECIFIED;
+}
+#undef FUNC_NAME
+
+
+/* This determines how conversions handle unconvertible characters. */
+SCM_GLOBAL_VARIABLE (scm_conversion_strategy, "%port-conversion-strategy");
+static int scm_conversion_strategy_init = 0;
+
+scm_t_string_failed_conversion_handler
+scm_i_get_conversion_strategy (SCM port)
+{
+ SCM encoding;
+
+ if (scm_is_false (port))
+ {
+ if (!scm_conversion_strategy_init
+ || !scm_is_fluid (SCM_VARIABLE_REF (scm_conversion_strategy)))
+ return SCM_FAILED_CONVERSION_QUESTION_MARK;
+ else
+ {
+ encoding = scm_fluid_ref (SCM_VARIABLE_REF (scm_conversion_strategy));
+ if (scm_is_false (encoding))
+ return SCM_FAILED_CONVERSION_QUESTION_MARK;
+ else
+ return (scm_t_string_failed_conversion_handler) scm_to_int (encoding);
+ }
+ }
+ else
+ {
+ scm_t_port *pt;
+ pt = SCM_PTAB_ENTRY (port);
+ return pt->ilseq_handler;
+ }
+
+}
+
+void
+scm_i_set_conversion_strategy_x (SCM port,
+ scm_t_string_failed_conversion_handler handler)
+{
+ SCM strategy;
+ scm_t_port *pt;
+
+ strategy = scm_from_int ((int) handler);
+
+ if (scm_is_false (port))
+ {
+ /* Set the default encoding for future ports. */
+ if (!scm_conversion_strategy
+ || !scm_is_fluid (SCM_VARIABLE_REF (scm_conversion_strategy)))
+ scm_misc_error (NULL, "tried to set conversion strategy fluid before it is initialized",
+ SCM_EOL);
+ scm_fluid_set_x (SCM_VARIABLE_REF (scm_conversion_strategy), strategy);
+ }
+ else
+ {
+ /* Set the character encoding for this port. */
+ pt = SCM_PTAB_ENTRY (port);
+ pt->ilseq_handler = handler;
+ }
+}
+
+SCM_DEFINE (scm_port_conversion_strategy, "port-conversion-strategy",
+ 1, 0, 0, (SCM port),
+ "Returns the behavior of the port when handling a character that\n"
+ "is not representable in the port's current encoding.\n"
+ "It returns the symbol @code{error} if unrepresentable characters\n"
+ "should cause exceptions, @code{substitute} if the port should\n"
+ "try to replace unrepresentable characters with question marks or\n"
+ "approximate characters, or @code{escape} if unrepresentable\n"
+ "characters should be converted to string escapes.\n"
+ "\n"
+ "If @var{port} is @code{#f}, then the current default behavior\n"
+ "will be returned. New ports will have this default behavior\n"
+ "when they are created.\n")
+#define FUNC_NAME s_scm_port_conversion_strategy
+{
+ scm_t_string_failed_conversion_handler h;
+
+ SCM_VALIDATE_OPPORT (1, port);
+
+ if (!scm_is_false (port))
+ {
+ SCM_VALIDATE_OPPORT (1, port);
+ }
+
+ h = scm_i_get_conversion_strategy (port);
+ if (h == SCM_FAILED_CONVERSION_ERROR)
+ return scm_from_locale_symbol ("error");
+ else if (h == SCM_FAILED_CONVERSION_QUESTION_MARK)
+ return scm_from_locale_symbol ("substitute");
+ else if (h == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
+ return scm_from_locale_symbol ("escape");
+ else
+ abort ();
+
+ /* Never gets here. */
+ return SCM_UNDEFINED;
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_set_port_conversion_strategy_x, "set-port-conversion-strategy!",
+ 2, 0, 0,
+ (SCM port, SCM sym),
+ "Sets the behavior of the interpreter when outputting a character\n"
+ "that is not representable in the port's current encoding.\n"
+ "@var{sym} can be either @code{'error}, @code{'substitute}, or\n"
+ "@code{'escape}. If it is @code{'error}, an error will be thrown\n"
+ "when an unconvertible character is encountered. If it is\n"
+ "@code{'substitute}, then unconvertible characters will \n"
+ "be replaced with approximate characters, or with question marks\n"
+ "if no approximately correct character is available.\n"
+ "If it is @code{'escape},\n"
+ "it will appear as a hex escape when output.\n"
+ "\n"
+ "If @var{port} is an open port, the conversion error behavior\n"
+ "is set for that port. If it is @code{#f}, it is set as the\n"
+ "default behavior for any future ports that get created in\n"
+ "this thread.\n")
+#define FUNC_NAME s_scm_set_port_conversion_strategy_x
+{
+ SCM err;
+ SCM qm;
+ SCM esc;
+
+ if (!scm_is_false (port))
+ {
+ SCM_VALIDATE_OPPORT (1, port);
+ }
+
+ err = scm_from_locale_symbol ("error");
+ if (scm_is_true (scm_eqv_p (sym, err)))
+ {
+ scm_i_set_conversion_strategy_x (port, SCM_FAILED_CONVERSION_ERROR);
+ return SCM_UNSPECIFIED;
+ }
+
+ qm = scm_from_locale_symbol ("substitute");
+ if (scm_is_true (scm_eqv_p (sym, qm)))
+ {
+ scm_i_set_conversion_strategy_x (port,
+ SCM_FAILED_CONVERSION_QUESTION_MARK);
+ return SCM_UNSPECIFIED;
+ }
+
+ esc = scm_from_locale_symbol ("escape");
+ if (scm_is_true (scm_eqv_p (sym, esc)))
+ {
+ scm_i_set_conversion_strategy_x (port,
+ SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
+ return SCM_UNSPECIFIED;
+ }
+
+ SCM_MISC_ERROR ("unknown conversion behavior ~s", scm_list_1 (sym));
+
+ return SCM_UNSPECIFIED;
+}
+#undef FUNC_NAME
+
+
+
void
scm_print_port_mode (SCM exp, SCM port)
{
@@ -1780,8 +2278,17 @@ scm_init_ports ()
cur_loadport_fluid = scm_permanent_object (scm_make_fluid ());
scm_i_port_weak_hash = scm_permanent_object (scm_make_weak_key_hash_table (SCM_I_MAKINUM(31)));
-
#include "libguile/ports.x"
+
+ SCM_VARIABLE_SET (scm_port_encoding_var, scm_make_fluid ());
+ scm_fluid_set_x (SCM_VARIABLE_REF (scm_port_encoding_var), SCM_BOOL_F);
+ scm_port_encoding_init = 1;
+
+ SCM_VARIABLE_SET (scm_conversion_strategy, scm_make_fluid ());
+ scm_fluid_set_x (SCM_VARIABLE_REF (scm_conversion_strategy),
+ scm_from_int ((int) SCM_FAILED_CONVERSION_QUESTION_MARK));
+ scm_conversion_strategy_init = 1;
+
}
/*
diff --git a/libguile/ports.h b/libguile/ports.h
index e5c0ffd84..c75f17d59 100644
--- a/libguile/ports.h
+++ b/libguile/ports.h
@@ -28,7 +28,7 @@
#include "libguile/print.h"
#include "libguile/struct.h"
#include "libguile/threads.h"
-
+#include "libguile/strings.h"
@@ -56,6 +56,10 @@ typedef struct
long line_number; /* debugging support. */
int column_number; /* debugging support. */
+ /* Character encoding support */
+ char *encoding;
+ scm_t_string_failed_conversion_handler ilseq_handler;
+
/* port buffers. the buffer(s) are set up for all ports.
in the case of string ports, the buffer is the string itself.
in the case of unbuffered file ports, the buffer is a
@@ -266,6 +270,7 @@ SCM_API SCM scm_eof_object_p (SCM x);
SCM_API SCM scm_force_output (SCM port);
SCM_API SCM scm_flush_all_ports (void);
SCM_API SCM scm_read_char (SCM port);
+SCM_API scm_t_wchar scm_getc (SCM port);
SCM_API size_t scm_c_read (SCM port, void *buffer, size_t size);
SCM_API void scm_c_write (SCM port, const void *buffer, size_t size);
SCM_API void scm_lfwrite (const char *ptr, size_t size, SCM port);
@@ -275,7 +280,8 @@ SCM_INTERNAL void scm_lfwrite_substr (SCM str, size_t start, size_t end,
SCM_API void scm_flush (SCM port);
SCM_API void scm_end_input (SCM port);
SCM_API int scm_fill_input (SCM port);
-SCM_API void scm_ungetc (int c, SCM port);
+SCM_INTERNAL void scm_unget_byte (int c, SCM port);
+SCM_API void scm_ungetc (scm_t_wchar c, SCM port);
SCM_API void scm_ungets (const char *s, int n, SCM port);
SCM_API SCM scm_peek_char (SCM port);
SCM_API SCM scm_unread_char (SCM cobj, SCM port);
@@ -288,6 +294,15 @@ SCM_API SCM scm_port_column (SCM port);
SCM_API SCM scm_set_port_column_x (SCM port, SCM line);
SCM_API SCM scm_port_filename (SCM port);
SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename);
+SCM_INTERNAL const char *scm_i_get_port_encoding (SCM port);
+SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str);
+SCM_API SCM scm_port_encoding (SCM port);
+SCM_API SCM scm_set_port_encoding_x (SCM port, SCM encoding);
+SCM_INTERNAL scm_t_string_failed_conversion_handler scm_i_get_conversion_strategy (SCM port);
+SCM_INTERNAL void scm_i_set_conversion_strategy_x (SCM port,
+ scm_t_string_failed_conversion_handler h);
+SCM_API SCM scm_port_conversion_strategy (SCM port);
+SCM_API SCM scm_set_port_conversion_strategy_x (SCM port, SCM behavior);
SCM_API int scm_port_print (SCM exp, SCM port, scm_print_state *);
SCM_API void scm_print_port_mode (SCM exp, SCM port);
SCM_API void scm_ports_prehistory (void);
@@ -295,7 +310,6 @@ SCM_API SCM scm_void_port (char * mode_str);
SCM_API SCM scm_sys_make_void_port (SCM mode);
SCM_INTERNAL void scm_init_ports (void);
-
#if SCM_ENABLE_DEPRECATED==1
SCM_API scm_t_port * scm_add_to_port_table (SCM port);
#endif
diff --git a/libguile/posix.c b/libguile/posix.c
index dafc5e996..09d53f212 100644
--- a/libguile/posix.c
+++ b/libguile/posix.c
@@ -25,6 +25,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
+#include <uniconv.h>
#include "libguile/_scm.h"
#include "libguile/dynwind.h"
@@ -1528,12 +1529,17 @@ SCM_DEFINE (scm_setlocale, "setlocale", 1, 1, 0,
"Otherwise the specified locale category is set to the string\n"
"@var{locale} and the new value is returned as a\n"
"system-dependent string. If @var{locale} is an empty string,\n"
- "the locale will be set using environment variables.")
+ "the locale will be set using environment variables.\n"
+ "\n"
+ "When the locale is changed, the character encoding of the new\n"
+ "locale (UTF-8, ISO-8859-1, etc.) is used for the current\n"
+ "input, output, and error ports\n")
#define FUNC_NAME s_scm_setlocale
{
int c_category;
char *clocale;
char *rv;
+ const char *enc;
scm_dynwind_begin (0);
@@ -1562,15 +1568,47 @@ SCM_DEFINE (scm_setlocale, "setlocale", 1, 1, 0,
SCM_SYSERROR;
}
- /* Recompute the standard SRFI-14 character sets in a locale-dependent
- (actually charset-dependent) way. */
- scm_srfi_14_compute_char_sets ();
+ enc = locale_charset ();
+ /* Set the default encoding for new ports. */
+ scm_i_set_port_encoding_x (SCM_BOOL_F, enc);
+ /* Set the encoding for the stdio ports. */
+ scm_i_set_port_encoding_x (scm_current_input_port (), enc);
+ scm_i_set_port_encoding_x (scm_current_output_port (), enc);
+ scm_i_set_port_encoding_x (scm_current_error_port (), enc);
scm_dynwind_end ();
return scm_from_locale_string (rv);
}
#undef FUNC_NAME
#endif /* HAVE_SETLOCALE */
+SCM_DEFINE (scm_setbinary, "setbinary", 0, 0, 0,
+ (void),
+ "Sets the encoding for the current input, output, and error\n"
+ "ports to ISO-8859-1. That character encoding allows\n"
+ "ports to operate on binary data.\n"
+ "\n"
+ "It also sets the default encoding for newly created ports\n"
+ "to ISO-8859-1.\n"
+ "\n"
+ "The previous default encoding for new ports is returned\n")
+#define FUNC_NAME s_scm_setbinary
+{
+ const char *enc = scm_i_get_port_encoding (SCM_BOOL_F);
+
+ /* Set the default encoding for new ports. */
+ scm_i_set_port_encoding_x (SCM_BOOL_F, NULL);
+ /* Set the encoding for the stdio ports. */
+ scm_i_set_port_encoding_x (scm_current_input_port (), NULL);
+ scm_i_set_port_encoding_x (scm_current_output_port (), NULL);
+ scm_i_set_port_encoding_x (scm_current_error_port (), NULL);
+
+ if (enc)
+ return scm_from_locale_string (enc);
+
+ return scm_from_locale_string ("ISO-8859-1");
+}
+#undef FUNC_NAME
+
#ifdef HAVE_MKNOD
SCM_DEFINE (scm_mknod, "mknod", 4, 0, 0,
diff --git a/libguile/posix.h b/libguile/posix.h
index 4d057643c..2d93300b8 100644
--- a/libguile/posix.h
+++ b/libguile/posix.h
@@ -74,6 +74,7 @@ SCM_API SCM scm_access (SCM path, SCM how);
SCM_API SCM scm_getpid (void);
SCM_API SCM scm_putenv (SCM str);
SCM_API SCM scm_setlocale (SCM category, SCM locale);
+SCM_API SCM scm_setbinary (void);
SCM_API SCM scm_mknod (SCM path, SCM type, SCM perms, SCM dev);
SCM_API SCM scm_nice (SCM incr);
SCM_API SCM scm_sync (void);
diff --git a/libguile/print.c b/libguile/print.c
index 3ce080c69..4d206eb5b 100644
--- a/libguile/print.c
+++ b/libguile/print.c
@@ -462,20 +462,45 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
| UC_CATEGORY_MASK_S))
/* Print the character if is graphic character. */
{
- if (i<256)
- /* Character is graphic. Print it. */
- scm_putc (i, port);
+ scm_t_wchar *wbuf;
+ SCM wstr = scm_i_make_wide_string (1, &wbuf);
+ char *buf;
+ size_t len;
+ const char *enc;
+
+ enc = scm_i_get_port_encoding (port);
+ wbuf[0] = i;
+ if (enc == NULL && i <= 0xFF)
+ {
+ /* Character is graphic and Latin-1. Print it */
+ scm_lfwrite_str (wstr, port);
+ }
else
- /* Character is graphic but unrepresentable in
- this port's encoding. */
- scm_intprint (i, 8, port);
+ {
+ buf = u32_conv_to_encoding (enc,
+ iconveh_error,
+ (scm_t_uint32 *) wbuf,
+ 1,
+ NULL,
+ NULL, &len);
+ if (buf != NULL)
+ {
+ /* Character is graphic. Print it. */
+ scm_lfwrite_str (wstr, port);
+ free (buf);
+ }
+ else
+ /* Character is graphic but unrepresentable in
+ this port's encoding. */
+ scm_intprint (i, 8, port);
+ }
}
else
/* Character is a non-graphical character. */
scm_intprint (i, 8, port);
}
else
- scm_putc (i, port);
+ scm_i_charprint (i, port);
}
else if (SCM_IFLAGP (exp)
&& ((size_t) SCM_IFLAGNUM (exp) < (sizeof iflagnames / sizeof (char *))))
@@ -607,21 +632,32 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
SCM wstr = scm_i_make_wide_string (1, &wbuf);
char *buf;
size_t len;
-
- wbuf[0] = ch;
-
- buf = u32_conv_to_encoding ("ISO-8859-1",
- iconveh_error,
- (scm_t_uint32 *) wbuf,
- 1, NULL, NULL, &len);
- if (buf != NULL)
+
+ if (scm_i_get_port_encoding (port))
{
- /* Character is graphic and representable in
- this encoding. Print it. */
- scm_lfwrite_str (wstr, port);
- free (buf);
- printed = 1;
+ wstr = scm_i_make_wide_string (1, &wbuf);
+ wbuf[0] = ch;
+ buf = u32_conv_to_encoding (scm_i_get_port_encoding (port),
+ iconveh_error,
+ (scm_t_uint32 *) wbuf,
+ 1 ,
+ NULL,
+ NULL, &len);
+ if (buf != NULL)
+ {
+ /* Character is graphic and representable in
+ this encoding. Print it. */
+ scm_lfwrite_str (wstr, port);
+ free (buf);
+ printed = 1;
+ }
}
+ else
+ if (ch <= 0xFF)
+ {
+ scm_putc (ch, port);
+ printed = 1;
+ }
}
if (!printed)
@@ -834,7 +870,7 @@ scm_prin1 (SCM exp, SCM port, int writingp)
/* Print a character.
*/
void
-scm_i_charprint (scm_t_uint32 ch, SCM port)
+scm_i_charprint (scm_t_wchar ch, SCM port)
{
scm_t_wchar *wbuf;
SCM wstr = scm_i_make_wide_string (1, &wbuf);
@@ -1056,9 +1092,7 @@ SCM_DEFINE (scm_simple_format, "simple-format", 2, 0, 1,
SCM port, answer = SCM_UNSPECIFIED;
int fReturnString = 0;
int writingp;
- const char *start;
- const char *end;
- const char *p;
+ size_t start, p, end;
if (scm_is_eq (destination, SCM_BOOL_T))
{
@@ -1081,15 +1115,16 @@ SCM_DEFINE (scm_simple_format, "simple-format", 2, 0, 1,
SCM_VALIDATE_STRING (2, message);
SCM_VALIDATE_REST_ARGUMENT (args);
- start = scm_i_string_chars (message);
- end = start + scm_i_string_length (message);
+ p = 0;
+ start = 0;
+ end = scm_i_string_length (message);
for (p = start; p != end; ++p)
- if (*p == '~')
+ if (scm_i_string_ref (message, p) == '~')
{
if (++p == end)
break;
- switch (*p)
+ switch (scm_i_string_ref (message, p))
{
case 'A': case 'a':
writingp = 0;
@@ -1098,33 +1133,33 @@ SCM_DEFINE (scm_simple_format, "simple-format", 2, 0, 1,
writingp = 1;
break;
case '~':
- scm_lfwrite (start, p - start, port);
+ scm_lfwrite_substr (message, start, p, port);
start = p + 1;
continue;
case '%':
- scm_lfwrite (start, p - start - 1, port);
+ scm_lfwrite_substr (message, start, p - 1, port);
scm_newline (port);
start = p + 1;
continue;
default:
SCM_MISC_ERROR ("FORMAT: Unsupported format option ~~~A - use (ice-9 format) instead",
- scm_list_1 (SCM_MAKE_CHAR (*p)));
+ scm_list_1 (SCM_MAKE_CHAR (scm_i_string_ref (message, p))));
}
if (!scm_is_pair (args))
SCM_MISC_ERROR ("FORMAT: Missing argument for ~~~A",
- scm_list_1 (SCM_MAKE_CHAR (*p)));
+ scm_list_1 (SCM_MAKE_CHAR (scm_i_string_ref (message, p))));
- scm_lfwrite (start, p - start - 1, port);
+ scm_lfwrite_substr (message, start, p - 1, port);
/* we pass destination here */
scm_prin1 (SCM_CAR (args), destination, writingp);
args = SCM_CDR (args);
start = p + 1;
}
- scm_lfwrite (start, p - start, port);
+ scm_lfwrite_substr (message, start, p, port);
if (!scm_is_eq (args, SCM_EOL))
SCM_MISC_ERROR ("FORMAT: ~A superfluous arguments",
scm_list_1 (scm_length (args)));
diff --git a/libguile/print.h b/libguile/print.h
index 3e2333ddd..ae2aaef54 100644
--- a/libguile/print.h
+++ b/libguile/print.h
@@ -25,6 +25,7 @@
#include "libguile/__scm.h"
+#include "libguile/chars.h"
#include "libguile/options.h"
@@ -77,7 +78,7 @@ SCM_API SCM scm_print_options (SCM setting);
SCM_API SCM scm_make_print_state (void);
SCM_API void scm_free_print_state (SCM print_state);
SCM_INTERNAL SCM scm_i_port_with_print_state (SCM port, SCM print_state);
-SCM_INTERNAL void scm_i_charprint (scm_t_uint32 c, SCM port);
+SCM_INTERNAL void scm_i_charprint (scm_t_wchar c, SCM port);
SCM_API void scm_intprint (scm_t_intmax n, int radix, SCM port);
SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port);
SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port);
diff --git a/libguile/rdelim.c b/libguile/rdelim.c
index 04a0944f4..1f46e5bf0 100644
--- a/libguile/rdelim.c
+++ b/libguile/rdelim.c
@@ -59,12 +59,10 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 3, 0,
size_t j;
size_t cstart;
size_t cend;
- int c;
- const char *cdelims;
+ scm_t_wchar c;
size_t num_delims;
SCM_VALIDATE_STRING (1, delims);
- cdelims = scm_i_string_chars (delims);
num_delims = scm_i_string_length (delims);
SCM_VALIDATE_STRING (2, str);
@@ -83,7 +81,7 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 3, 0,
c = scm_getc (port);
for (k = 0; k < num_delims; k++)
{
- if (cdelims[k] == c)
+ if (scm_i_string_ref (delims, k) == c)
{
if (scm_is_false (gobble))
scm_ungetc (c, port);
diff --git a/libguile/read.c b/libguile/read.c
index a8ec0a86d..821e01cea 100644
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -27,6 +27,8 @@
#include <stdio.h>
#include <ctype.h>
#include <string.h>
+#include <unistd.h>
+#include <unicase.h>
#include "libguile/_scm.h"
#include "libguile/bytevectors.h"
@@ -178,11 +180,6 @@ static SCM *scm_read_hash_procedures;
(((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
|| ((_chr) == 'd') || ((_chr) == 'l'))
-/* An inlinable version of `scm_c_downcase ()'. */
-#define CHAR_DOWNCASE(_chr) \
- (((_chr) <= UCHAR_MAX) ? tolower ((int) (_chr)) : (_chr))
-
-
/* Read an SCSH block comment. */
static inline SCM scm_read_scsh_block_comment (int chr, SCM port);
static SCM scm_read_commented_expression (int chr, SCM port);
@@ -190,41 +187,69 @@ static SCM scm_read_commented_expression (int chr, SCM port);
/* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
zero if the whole token fits in BUF, non-zero otherwise. */
static inline int
-read_token (SCM port, char *buf, size_t buf_size, size_t *read)
+read_token (SCM port, SCM buf, size_t *read)
{
+ scm_t_wchar chr;
*read = 0;
- while (*read < buf_size)
+ buf = scm_i_string_start_writing (buf);
+ while (*read < scm_i_string_length (buf))
{
- int chr;
-
chr = scm_getc (port);
- chr = (SCM_CASE_INSENSITIVE_P ? CHAR_DOWNCASE (chr) : chr);
if (chr == EOF)
- return 0;
- else if (CHAR_IS_DELIMITER (chr))
{
- scm_ungetc (chr, port);
+ scm_i_string_stop_writing ();
return 0;
}
- else
+
+ chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
+
+ if (CHAR_IS_DELIMITER (chr))
{
- *buf = (char) chr;
- buf++, (*read)++;
+ scm_i_string_stop_writing ();
+ scm_ungetc (chr, port);
+ return 0;
}
+
+ scm_i_string_set_x (buf, *read, chr);
+ (*read)++;
}
+ scm_i_string_stop_writing ();
return 1;
}
+static SCM
+read_complete_token (SCM port, size_t *read)
+{
+ SCM buffer, str = SCM_EOL;
+ size_t len;
+ int overflow;
+
+ buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
+ overflow = read_token (port, buffer, read);
+ if (!overflow)
+ return scm_i_substring (buffer, 0, *read);
+
+ str = scm_string_copy (buffer);
+ do
+ {
+ overflow = read_token (port, buffer, &len);
+ str = scm_string_append (scm_list_2 (str, buffer));
+ *read += len;
+ }
+ while (overflow);
+
+ return scm_i_substring (str, 0, *read);
+}
/* Skip whitespace from PORT and return the first non-whitespace character
read. Raise an error on end-of-file. */
static int
flush_ws (SCM port, const char *eoferr)
{
- register int c;
+ register scm_t_wchar c;
while (1)
switch (c = scm_getc (port))
{
@@ -293,7 +318,7 @@ static SCM recsexpr (SCM obj, long line, int column, SCM filename);
static SCM
-scm_read_sexp (int chr, SCM port)
+scm_read_sexp (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_i_lreadparen"
{
register int c;
@@ -554,107 +579,52 @@ scm_read_string (int chr, SCM port)
static SCM
-scm_read_number (int chr, SCM port)
+scm_read_number (scm_t_wchar chr, SCM port)
{
- SCM result, str = SCM_EOL;
- char buffer[READER_BUFFER_SIZE];
+ SCM result;
+ SCM buffer;
size_t read;
- int overflow = 0;
scm_ungetc (chr, port);
- do
- {
- overflow = read_token (port, buffer, sizeof (buffer), &read);
-
- if ((overflow) || (scm_is_pair (str)))
- str = scm_cons (scm_from_locale_stringn (buffer, read), str);
- }
- while (overflow);
-
- if (scm_is_pair (str))
- {
- /* The slow path. */
-
- str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
- result = scm_string_to_number (str, SCM_UNDEFINED);
- if (!scm_is_true (result))
- /* Return a symbol instead of a number. */
- result = scm_string_to_symbol (str);
- }
- else
- {
- result = scm_c_locale_stringn_to_number (buffer, read, 10);
- if (!scm_is_true (result))
- /* Return a symbol instead of a number. */
- result = scm_from_locale_symboln (buffer, read);
- }
+ buffer = read_complete_token (port, &read);
+ result = scm_string_to_number (buffer, SCM_UNDEFINED);
+ if (!scm_is_true (result))
+ /* Return a symbol instead of a number. */
+ result = scm_string_to_symbol (buffer);
return result;
}
static SCM
-scm_read_mixed_case_symbol (int chr, SCM port)
+scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
{
- SCM result, str = SCM_EOL;
- int overflow = 0, ends_with_colon = 0;
- char buffer[READER_BUFFER_SIZE];
+ SCM result;
+ int ends_with_colon = 0;
+ SCM buffer;
size_t read = 0;
int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
scm_ungetc (chr, port);
- do
- {
- overflow = read_token (port, buffer, sizeof (buffer), &read);
-
- if (read > 0)
- ends_with_colon = (buffer[read - 1] == ':');
+ buffer = read_complete_token (port, &read);
+ if (read > 0)
+ ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
- if ((overflow) || (scm_is_pair (str)))
- str = scm_cons (scm_from_locale_stringn (buffer, read), str);
- }
- while (overflow);
-
- if (scm_is_pair (str))
- {
- size_t len;
-
- str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
- len = scm_c_string_length (str);
-
- /* Per SRFI-88, `:' alone is an identifier, not a keyword. */
- if (postfix && ends_with_colon && (len > 1))
- {
- /* Strip off colon. */
- str = scm_c_substring (str, 0, len-1);
- result = scm_string_to_symbol (str);
- result = scm_symbol_to_keyword (result);
- }
- else
- result = scm_string_to_symbol (str);
- }
+ if (postfix && ends_with_colon && (read > 1))
+ result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
else
- {
- /* For symbols smaller than `sizeof (buffer)', we don't need to recur
- to Scheme strings. Therefore, we only create one Scheme object (a
- symbol) per symbol read. */
- if (postfix && ends_with_colon && (read > 1))
- result = scm_from_locale_keywordn (buffer, read - 1);
- else
- result = scm_from_locale_symboln (buffer, read);
- }
+ result = scm_string_to_symbol (buffer);
return result;
}
static SCM
-scm_read_number_and_radix (int chr, SCM port)
+scm_read_number_and_radix (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
- SCM result, str = SCM_EOL;
+ SCM result;
size_t read;
- char buffer[READER_BUFFER_SIZE];
+ SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
unsigned int radix;
- int overflow = 0;
switch (chr)
{
@@ -684,22 +654,8 @@ scm_read_number_and_radix (int chr, SCM port)
radix = 10;
}
- do
- {
- overflow = read_token (port, buffer, sizeof (buffer), &read);
-
- if ((overflow) || (scm_is_pair (str)))
- str = scm_cons (scm_from_locale_stringn (buffer, read), str);
- }
- while (overflow);
-
- if (scm_is_pair (str))
- {
- str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
- result = scm_string_to_number (str, scm_from_uint (radix));
- }
- else
- result = scm_c_locale_stringn_to_number (buffer, read, radix);
+ buffer = read_complete_token (port, &read);
+ result = scm_string_to_number (buffer, scm_from_uint (radix));
if (scm_is_true (result))
return result;
@@ -729,7 +685,7 @@ scm_read_quote (int chr, SCM port)
case ',':
{
- int c;
+ scm_t_wchar c;
c = scm_getc (port);
if ('@' == c)
@@ -828,7 +784,10 @@ scm_read_semicolon_comment (int chr, SCM port)
{
int c;
- for (c = scm_getc (port);
+ /* We use the get_byte here because there is no need to get the
+ locale correct with comment input. This presumes that newline
+ always represents itself no matter what the encoding is. */
+ for (c = scm_get_byte_or_eof (port);
(c != EOF) && (c != '\n');
c = scm_getc (port));
@@ -856,14 +815,19 @@ scm_read_boolean (int chr, SCM port)
}
static SCM
-scm_read_character (int chr, SCM port)
+scm_read_character (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
SCM ch;
- char charname[READER_CHAR_NAME_MAX_SIZE];
+ SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
size_t charname_len;
+ scm_t_wchar cp;
+ int overflow;
+
+ overflow = read_token (port, charname, &charname_len);
+ charname = scm_c_substring (charname, 0, charname_len);
- if (read_token (port, charname, sizeof (charname), &charname_len))
+ if (overflow)
goto char_error;
if (charname_len == 0)
@@ -878,28 +842,33 @@ scm_read_character (int chr, SCM port)
}
if (charname_len == 1)
- return SCM_MAKE_CHAR (charname[0]);
+ return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
- if (*charname >= '0' && *charname < '8')
+ cp = scm_i_string_ref (charname, 0);
+ if (cp >= '0' && cp < '8')
{
/* Dirk:FIXME:: This type of character syntax is not R5RS
* compliant. Further, it should be verified that the constant
* does only consist of octal digits. Finally, it should be
* checked whether the resulting fixnum is in the range of
* characters. */
- SCM p = scm_c_locale_stringn_to_number (charname, charname_len, 8);
+ SCM p = scm_string_to_number (charname, scm_from_uint (8));
if (SCM_I_INUMP (p))
return SCM_MAKE_CHAR (SCM_I_INUM (p));
}
- ch = scm_i_charname_to_char (charname, charname_len);
+ /* The names of characters should never have non-Latin1
+ characters. */
+ if (scm_i_is_narrow_string (charname)
+ || scm_i_try_narrow_string (charname))
+ ch = scm_i_charname_to_char (scm_i_string_chars (charname),
+ charname_len);
if (scm_is_true (ch))
return ch;
char_error:
scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
- scm_list_1 (scm_from_locale_stringn (charname,
- charname_len)));
+ scm_list_1 (charname));
return SCM_UNSPECIFIED;
}
@@ -941,7 +910,7 @@ scm_read_srfi4_vector (int chr, SCM port)
}
static SCM
-scm_read_bytevector (int chr, SCM port)
+scm_read_bytevector (scm_t_wchar chr, SCM port)
{
chr = scm_getc (port);
if (chr != 'u')
@@ -965,7 +934,7 @@ scm_read_bytevector (int chr, SCM port)
}
static SCM
-scm_read_guile_bit_vector (int chr, SCM port)
+scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
{
/* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
terribly inefficient but who cares? */
@@ -985,13 +954,17 @@ scm_read_guile_bit_vector (int chr, SCM port)
}
static inline SCM
-scm_read_scsh_block_comment (int chr, SCM port)
+scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
{
int bang_seen = 0;
+ /* We can use the get_byte here because there is no need to get the
+ locale correct when reading comments. This presumes that
+ hash and exclamation points always represent themselves no
+ matter what the source encoding is.*/
for (;;)
{
- int c = scm_getc (port);
+ int c = scm_get_byte_or_eof (port);
if (c == EOF)
scm_i_input_error ("skip_block_comment", port,
@@ -1009,9 +982,9 @@ scm_read_scsh_block_comment (int chr, SCM port)
}
static SCM
-scm_read_commented_expression (int chr, SCM port)
+scm_read_commented_expression (scm_t_wchar chr, SCM port)
{
- int c;
+ scm_t_wchar c;
c = flush_ws (port, (char *) NULL);
if (EOF == c)
@@ -1023,19 +996,18 @@ scm_read_commented_expression (int chr, SCM port)
}
static SCM
-scm_read_extended_symbol (int chr, SCM port)
+scm_read_extended_symbol (scm_t_wchar chr, SCM port)
{
/* Guile's extended symbol read syntax looks like this:
#{This is all a symbol name}#
So here, CHR is expected to be `{'. */
- SCM result;
int saw_brace = 0, finished = 0;
size_t len = 0;
- char buf[1024];
+ SCM buf = scm_i_make_string (1024, NULL);
- result = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
+ buf = scm_i_string_start_writing (buf);
while ((chr = scm_getc (port)) != EOF)
{
@@ -1049,32 +1021,30 @@ scm_read_extended_symbol (int chr, SCM port)
else
{
saw_brace = 0;
- buf[len++] = '}';
- buf[len++] = chr;
+ scm_i_string_set_x (buf, len++, '}');
+ scm_i_string_set_x (buf, len++, chr);
}
}
else if (chr == '}')
saw_brace = 1;
else
- buf[len++] = chr;
+ scm_i_string_set_x (buf, len++, chr);
- if (len >= sizeof (buf) - 2)
+ if (len >= scm_i_string_length (buf) - 2)
{
- scm_string_append (scm_list_2 (result,
- scm_from_locale_stringn (buf, len)));
+ scm_i_string_stop_writing ();
+ SCM addy = scm_i_make_string (1024, NULL);
+ buf = scm_string_append (scm_list_2 (buf, addy));
len = 0;
+ buf = scm_i_string_start_writing (buf);
}
if (finished)
break;
}
+ scm_i_string_stop_writing ();
- if (len)
- result = scm_string_append (scm_list_2
- (result,
- scm_from_locale_stringn (buf, len)));
-
- return (scm_string_to_symbol (result));
+ return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
}
@@ -1110,7 +1080,7 @@ scm_read_sharp_extension (int chr, SCM port)
/* The reader for the sharp `#' character. It basically dispatches reads
among the above token readers. */
static SCM
-scm_read_sharp (int chr, SCM port)
+scm_read_sharp (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
SCM result;
@@ -1162,7 +1132,7 @@ scm_read_sharp (int chr, SCM port)
{
/* When next char is '(', it really is an old-style
uniform array. */
- int next_c = scm_getc (port);
+ scm_t_wchar next_c = scm_getc (port);
if (next_c != EOF)
scm_ungetc (next_c, port);
if (next_c == '(')
@@ -1210,7 +1180,7 @@ scm_read_expression (SCM port)
{
while (1)
{
- register int chr;
+ register scm_t_wchar chr;
chr = scm_getc (port);
@@ -1421,6 +1391,127 @@ scm_get_hash_procedure (int c)
}
}
+#define SCM_ENCODING_SEARCH_SIZE (500)
+
+/* Search the first few hundred characters of a file for
+ an emacs-like coding declaration. */
+char *
+scm_scan_for_encoding (SCM port)
+{
+ char header[SCM_ENCODING_SEARCH_SIZE+1];
+ size_t bytes_read;
+ char *encoding = NULL;
+ int utf8_bom = 0;
+ char *pos;
+ int i;
+ int in_comment;
+
+ bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
+ scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
+
+ if (bytes_read > 3
+ && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
+ utf8_bom = 1;
+
+ /* search past "coding[:=]" */
+ pos = header;
+ while (1)
+ {
+ if ((pos = strstr(pos, "coding")) == NULL)
+ return NULL;
+
+ pos += strlen("coding");
+ if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
+ (*pos == ':' || *pos == '='))
+ {
+ pos ++;
+ break;
+ }
+ }
+
+ /* skip spaces */
+ while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
+ (*pos == ' ' || *pos == '\t'))
+ pos ++;
+
+ /* grab the next token */
+ i = 0;
+ while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
+ && (isalnum(pos[i]) || pos[i] == '_' || pos[i] == '-' || pos[i] == '.'))
+ i++;
+
+ if (i == 0)
+ return NULL;
+
+ encoding = scm_malloc (i+1);
+ memcpy (encoding, pos, i);
+ encoding[i] ='\0';
+ for (i = 0; i < strlen(encoding); i++)
+ encoding[i] = toupper(encoding[i]);
+
+ /* push backwards to make sure we were in a comment */
+ in_comment = 0;
+ while (pos - i - header > 0)
+ {
+ if (*(pos - i) == '\n')
+ {
+ /* This wasn't in a semicolon comment. Check for a
+ hash-bang comment. */
+ char *beg = strstr (header, "#!");
+ char *end = strstr (header, "!#");
+ if (beg < pos && pos < end)
+ in_comment = 1;
+ break;
+ }
+ if (*(pos - i) == ';')
+ {
+ in_comment = 1;
+ break;
+ }
+ i ++;
+ }
+ if (!in_comment)
+ {
+ /* This wasn't in a comment */
+ free (encoding);
+ return NULL;
+ }
+ if (utf8_bom && strcmp(encoding, "UTF-8"))
+ scm_misc_error (NULL,
+ "the port input declares the encoding ~s but is encoded as UTF-8",
+ scm_list_1 (scm_from_locale_string (encoding)));
+
+ return encoding;
+}
+
+SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
+ (SCM port),
+ "Scans the port for an EMACS-like character coding declaration\n"
+ "near the top of the contents of a port with random-acessible contents.\n"
+ "The coding declaration is of the form\n"
+ "@code{coding: XXXXX} and must appear in a scheme comment.\n"
+ "\n"
+ "Returns a string containing the character encoding of the file\n"
+ "if a declaration was found, or @code{#f} otherwise.\n")
+#define FUNC_NAME s_scm_file_encoding
+{
+ char *enc;
+ SCM s_enc;
+
+ enc = scm_scan_for_encoding (port);
+ if (enc == NULL)
+ return SCM_BOOL_F;
+ else
+ {
+ s_enc = scm_from_locale_string (enc);
+ free (enc);
+ return s_enc;
+ }
+
+ return SCM_BOOL_F;
+}
+#undef FUNC_NAME
+
void
scm_init_read ()
{
diff --git a/libguile/read.h b/libguile/read.h
index 20d3f4bf7..7bc4a0ba4 100644
--- a/libguile/read.h
+++ b/libguile/read.h
@@ -56,6 +56,8 @@ SCM_API SCM scm_read_options (SCM setting);
SCM_API SCM scm_read (SCM port);
SCM_API size_t scm_read_token (int ic, SCM * tok_buf, SCM port, int weird);
SCM_API SCM scm_read_hash_extend (SCM chr, SCM proc);
+SCM_INTERNAL char *scm_scan_for_encoding (SCM port);
+SCM_API SCM scm_file_encoding (SCM port);
SCM_INTERNAL void scm_i_input_error (const char *func, SCM port,
const char *message, SCM arg)
diff --git a/libguile/socket.c b/libguile/socket.c
index 144bb1090..3a81ed9d0 100644
--- a/libguile/socket.c
+++ b/libguile/socket.c
@@ -33,6 +33,7 @@
#include "libguile/strings.h"
#include "libguile/vectors.h"
#include "libguile/dynwind.h"
+#include "libguile/srfi-13.h"
#include "libguile/validate.h"
#include "libguile/socket.h"
@@ -1414,6 +1415,8 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
"protocols, if a packet larger than this limit is encountered\n"
"then some data\n"
"will be irrevocably lost.\n\n"
+ "The data is assumed to be binary, and there is no decoding of\n"
+ "of locale-encoded strings.\n\n"
"The optional @var{flags} argument is a value or\n"
"bitwise OR of MSG_OOB, MSG_PEEK, MSG_DONTROUTE etc.\n\n"
"The value returned is the number of bytes read from the\n"
@@ -1428,6 +1431,7 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
int flg;
char *dest;
size_t len;
+ SCM msg;
SCM_VALIDATE_OPFPORT (1, sock);
SCM_VALIDATE_STRING (2, buf);
@@ -1437,16 +1441,16 @@ SCM_DEFINE (scm_recv, "recv!", 2, 1, 0,
flg = scm_to_int (flags);
fd = SCM_FPORT_FDES (sock);
- len = scm_i_string_length (buf);
- buf = scm_i_string_start_writing (buf);
- dest = scm_i_string_writable_chars (buf);
+ len = scm_i_string_length (buf);
+ msg = scm_i_make_string (len, &dest);
SCM_SYSCALL (rv = recv (fd, dest, len, flg));
- scm_i_string_stop_writing ();
+ scm_string_copy_x (buf, scm_from_int (0),
+ msg, scm_from_int (0), scm_from_size_t (len));
if (rv == -1)
SCM_SYSERROR;
- scm_remember_upto_here_1 (buf);
+ scm_remember_upto_here_2 (buf, msg);
return scm_from_int (rv);
}
#undef FUNC_NAME
@@ -1464,18 +1468,28 @@ SCM_DEFINE (scm_send, "send", 2, 1, 0,
"bitwise OR of MSG_OOB, MSG_PEEK, MSG_DONTROUTE etc.\n\n"
"Note that the data is written directly to the socket\n"
"file descriptor:\n"
- "any unflushed buffered port data is ignored.")
+ "any unflushed buffered port data is ignored.\n\n"
+ "This operation is defined only for strings containing codepoints\n"
+ "zero to 255.")
#define FUNC_NAME s_scm_send
{
int rv;
int fd;
int flg;
- const char *src;
+ char *src;
size_t len;
sock = SCM_COERCE_OUTPORT (sock);
SCM_VALIDATE_OPFPORT (1, sock);
SCM_VALIDATE_STRING (2, message);
+
+ /* If the string is wide, see if it can be coerced into
+ a narrow string. */
+ if (!scm_i_is_narrow_string (message)
+ || scm_i_try_narrow_string (message))
+ SCM_MISC_ERROR ("the message string is not 8-bit: ~s",
+ scm_list_1 (message));
+
if (SCM_UNBNDP (flags))
flg = 0;
else
@@ -1592,7 +1606,9 @@ SCM_DEFINE (scm_sendto, "sendto", 3, 1, 1,
"set to be non-blocking.\n"
"Note that the data is written directly to the socket\n"
"file descriptor:\n"
- "any unflushed buffered port data is ignored.")
+ "any unflushed buffered port data is ignored.\n"
+ "This operation is defined only for strings containing codepoints\n"
+ "zero to 255.")
#define FUNC_NAME s_scm_sendto
{
int rv;
diff --git a/libguile/stime.c b/libguile/stime.c
index a6843377b..54022c296 100644
--- a/libguile/stime.c
+++ b/libguile/stime.c
@@ -46,6 +46,7 @@
#include <stdio.h>
#include <errno.h>
#include <strftime.h>
+#include <unistr.h>
#include "libguile/_scm.h"
#include "libguile/async.h"
@@ -53,6 +54,7 @@
#include "libguile/strings.h"
#include "libguile/vectors.h"
#include "libguile/dynwind.h"
+#include "libguile/strings.h"
#include "libguile/validate.h"
#include "libguile/stime.h"
@@ -624,18 +626,20 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
{
struct tm t;
- char *tbuf;
+ scm_t_uint8 *tbuf;
int size = 50;
- const char *fmt;
- char *myfmt;
+ scm_t_uint8 *fmt;
+ scm_t_uint8 *myfmt;
int len;
SCM result;
SCM_VALIDATE_STRING (1, format);
bdtime2c (stime, &t, SCM_ARG2, FUNC_NAME);
- fmt = scm_i_string_chars (format);
- len = scm_i_string_length (format);
+ /* Convert string to UTF-8 so that non-ASCII characters in the
+ format are passed through unchanged. */
+ fmt = scm_i_to_utf8_string (format);
+ len = strlen ((const char *) fmt);
/* Ugly hack: strftime can return 0 if its buffer is too small,
but some valid time strings (e.g. "%p") can sometimes produce
@@ -643,9 +647,11 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
character to the format string, so that valid returns are always
nonzero. */
myfmt = scm_malloc (len+2);
- *myfmt = 'x';
- strncpy(myfmt+1, fmt, len);
- myfmt[len+1] = 0;
+ *myfmt = (scm_t_uint8) 'x';
+ strncpy ((char *) myfmt + 1, (const char *) fmt, len);
+ myfmt[len + 1] = 0;
+ scm_remember_upto_here_1 (format);
+ free (fmt);
tbuf = scm_malloc (size);
{
@@ -680,7 +686,8 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
/* Use `nstrftime ()' from Gnulib, which supports all GNU extensions
supported by glibc. */
- while ((len = nstrftime (tbuf, size, myfmt, &t, 0, 0)) == 0)
+ while ((len = nstrftime ((char *) tbuf, size,
+ (const char *) myfmt, &t, 0, 0)) == 0)
{
free (tbuf);
size *= 2;
@@ -696,7 +703,7 @@ SCM_DEFINE (scm_strftime, "strftime", 2, 0, 0,
#endif
}
- result = scm_from_locale_stringn (tbuf + 1, len - 1);
+ result = scm_i_from_utf8_string ((const scm_t_uint8 *) tbuf + 1);
free (tbuf);
free (myfmt);
#if HAVE_STRUCT_TM_TM_ZONE
@@ -722,14 +729,17 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
#define FUNC_NAME s_scm_strptime
{
struct tm t;
- const char *fmt, *str, *rest;
+ scm_t_uint8 *fmt, *str, *rest;
+ size_t used_len;
long zoff;
SCM_VALIDATE_STRING (1, format);
SCM_VALIDATE_STRING (2, string);
- fmt = scm_i_string_chars (format);
- str = scm_i_string_chars (string);
+ /* Convert strings to UTF-8 so that non-ASCII characters are passed
+ through unchanged. */
+ fmt = scm_i_to_utf8_string (format);
+ str = scm_i_to_utf8_string (string);
/* initialize the struct tm */
#define tm_init(field) t.field = 0
@@ -751,7 +761,8 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
fields, hence the use of SCM_CRITICAL_SECTION_START. */
t.tm_isdst = -1;
SCM_CRITICAL_SECTION_START;
- rest = strptime (str, fmt, &t);
+ rest = (scm_t_uint8 *) strptime ((const char *) str,
+ (const char *) fmt, &t);
SCM_CRITICAL_SECTION_END;
if (rest == NULL)
{
@@ -759,6 +770,9 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
instance it doesn't. Force a sensible value for our error
message. */
errno = EINVAL;
+ scm_remember_upto_here_2 (format, string);
+ free (str);
+ free (fmt);
SCM_SYSERROR;
}
@@ -770,8 +784,14 @@ SCM_DEFINE (scm_strptime, "strptime", 2, 0, 0,
zoff = 0;
#endif
+ /* Compute the number of UTF-8 characters. */
+ used_len = u8_strnlen (str, rest-str);
+ scm_remember_upto_here_2 (format, string);
+ free (str);
+ free (fmt);
+
return scm_cons (filltime (&t, zoff, NULL),
- scm_from_signed_integer (rest - str));
+ scm_from_signed_integer (used_len));
}
#undef FUNC_NAME
#endif /* HAVE_STRPTIME */
diff --git a/libguile/strings.c b/libguile/strings.c
index 39dab3a38..4a8390d16 100644
--- a/libguile/strings.c
+++ b/libguile/strings.c
@@ -28,6 +28,8 @@
#include <unistr.h>
#include <uniconv.h>
+#include "striconveh.h"
+
#include "libguile/_scm.h"
#include "libguile/chars.h"
#include "libguile/root.h"
@@ -240,6 +242,36 @@ widen_stringbuf (SCM buf)
}
}
+/* Convert a stringbuf of 32-bit UCS-4-encoded characters to one
+ containing 8-bit Latin-1-encoded characters, if possible. */
+static void
+narrow_stringbuf (SCM buf)
+{
+ size_t i, len;
+ scm_t_wchar *wmem;
+ char *mem;
+
+ if (!STRINGBUF_WIDE (buf))
+ return;
+
+ len = STRINGBUF_OUTLINE_LENGTH (buf);
+ i = 0;
+ wmem = STRINGBUF_WIDE_CHARS (buf);
+ while (i < len)
+ if (wmem[i++] > 0xFF)
+ return;
+
+ mem = scm_gc_malloc (sizeof (char) * (len + 1), "string");
+ for (i = 0; i < len; i++)
+ mem[i] = (unsigned char) wmem[i];
+
+ scm_gc_free (wmem, sizeof (scm_t_wchar) * (len + 1), "string");
+
+ SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_WIDE);
+ SCM_SET_CELL_WORD_1 (buf, mem);
+ SCM_SET_CELL_WORD_2 (buf, len);
+}
+
scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
/* Copy-on-write strings.
@@ -460,6 +492,18 @@ scm_i_is_narrow_string (SCM str)
return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
}
+/* Try to coerce a string to be narrow. It if is narrow already, do
+ nothing. If it is wide, shrink it to narrow if none of its
+ characters are above 0xFF. Return true if the string is narrow or
+ was made to be narrow. */
+int
+scm_i_try_narrow_string (SCM str)
+{
+ narrow_stringbuf (STRING_STRINGBUF (str));
+
+ return scm_i_is_narrow_string (str);
+}
+
/* Returns a pointer to the 8-bit Latin-1 encoded character array of
STR. */
const char *
@@ -591,6 +635,37 @@ scm_i_string_ref (SCM str, size_t x)
return scm_i_string_wide_chars (str)[x];
}
+/* Returns index+1 of the first char in STR that matches C, or
+ 0 if the char is not found. */
+int
+scm_i_string_contains_char (SCM str, char ch)
+{
+ size_t i;
+ size_t len = scm_i_string_length (str);
+
+ i = 0;
+ if (scm_i_is_narrow_string (str))
+ {
+ while (i < len)
+ {
+ if (scm_i_string_chars (str)[i] == ch)
+ return i+1;
+ i++;
+ }
+ }
+ else
+ {
+ while (i < len)
+ {
+ if (scm_i_string_wide_chars (str)[i]
+ == (unsigned char) ch)
+ return i+1;
+ i++;
+ }
+ }
+ return 0;
+}
+
int
scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
{
@@ -624,7 +699,7 @@ scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
if (scm_i_is_narrow_string (str))
{
char *dst = scm_i_string_writable_chars (str);
- dst[p] = (char) (unsigned char) chr;
+ dst[p] = chr;
}
else
{
@@ -634,7 +709,7 @@ scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
}
/* Symbols.
-
+
Basic symbol creation and accessing is done here, the rest is in
symbols.[hc]. This has been done to keep stringbufs and the
internals of strings and string-like objects confined to this file.
@@ -867,7 +942,7 @@ SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
else
e5 = scm_cons (scm_from_locale_symbol ("read-only"),
SCM_BOOL_F);
-
+
/* Stringbuf info */
if (!STRINGBUF_WIDE (buf))
{
@@ -1402,20 +1477,105 @@ scm_is_string (SCM obj)
return IS_STRING (obj);
}
+static SCM
+scm_from_stringn (const char *str, size_t len, const char *encoding,
+ scm_t_string_failed_conversion_handler handler)
+{
+ size_t u32len, i;
+ scm_t_wchar *u32;
+ int wide = 0;
+ SCM res;
+
+ if (encoding == NULL)
+ {
+ /* If encoding is null, use Latin-1. */
+ char *buf;
+ res = scm_i_make_string (len, &buf);
+ memcpy (buf, str, len);
+ return res;
+ }
+
+ u32len = 0;
+ u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
+ (enum iconv_ilseq_handler)
+ handler,
+ str, len,
+ NULL,
+ NULL, &u32len);
+
+ if (u32 == NULL)
+ {
+ if (errno == ENOMEM)
+ scm_memory_error ("locale string conversion");
+ else
+ {
+ /* There are invalid sequences in the input string. */
+ SCM errstr;
+ char *dst;
+ errstr = scm_i_make_string (len, &dst);
+ memcpy (dst, str, len);
+ scm_misc_error (NULL, "input locale conversion error from ~s: ~s",
+ scm_list_2 (scm_from_locale_string (encoding),
+ errstr));
+ scm_remember_upto_here_1 (errstr);
+ }
+ }
+
+ i = 0;
+ while (i < u32len)
+ if (u32[i++] > 0xFF)
+ {
+ wide = 1;
+ break;
+ }
+
+ if (!wide)
+ {
+ char *dst;
+ res = scm_i_make_string (u32len, &dst);
+ for (i = 0; i < u32len; i ++)
+ dst[i] = (unsigned char) u32[i];
+ dst[u32len] = '\0';
+ }
+ else
+ {
+ scm_t_wchar *wdst;
+ res = scm_i_make_wide_string (u32len, &wdst);
+ u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
+ wdst[u32len] = 0;
+ }
+
+ free (u32);
+ return res;
+}
+
SCM
scm_from_locale_stringn (const char *str, size_t len)
{
- SCM res;
- char *dst;
+ const char *enc;
+ scm_t_string_failed_conversion_handler hndl;
+ SCM inport;
+ scm_t_port *pt;
if (len == (size_t) -1)
len = strlen (str);
if (len == 0)
return scm_nullstr;
- res = scm_i_make_string (len, &dst);
- memcpy (dst, str, len);
- return res;
+ inport = scm_current_input_port ();
+ if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
+ {
+ pt = SCM_PTAB_ENTRY (inport);
+ enc = pt->encoding;
+ hndl = pt->ilseq_handler;
+ }
+ else
+ {
+ enc = NULL;
+ hndl = SCM_FAILED_CONVERSION_ERROR;
+ }
+
+ return scm_from_stringn (str, len, enc, hndl);
}
SCM
@@ -1427,6 +1587,14 @@ scm_from_locale_string (const char *str)
return scm_from_locale_stringn (str, -1);
}
+SCM
+scm_i_from_utf8_string (const scm_t_uint8 *str)
+{
+ return scm_from_stringn ((const char *) str,
+ strlen ((char *) str), "UTF-8",
+ SCM_FAILED_CONVERSION_ERROR);
+}
+
/* Create a new scheme string from the C string STR. The memory of
STR may be used directly as storage for the new string. */
SCM
@@ -1515,23 +1683,33 @@ unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
char *
scm_to_locale_stringn (SCM str, size_t * lenp)
{
+ SCM outport;
+ scm_t_port *pt;
const char *enc;
- /* In the future, enc will hold the port's encoding. */
- enc = NULL;
+ outport = scm_current_output_port ();
+ if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
+ {
+ pt = SCM_PTAB_ENTRY (outport);
+ enc = pt->encoding;
+ }
+ else
+ enc = NULL;
- return scm_to_stringn (str, lenp, enc,
- SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
+ return scm_to_stringn (str, lenp,
+ enc,
+ scm_i_get_conversion_strategy (SCM_BOOL_F));
}
/* Low-level scheme to C string conversion function. */
char *
-scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
+scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
scm_t_string_failed_conversion_handler handler)
{
- static const char iso[11] = "ISO-8859-1";
char *buf;
size_t ilen, len, i;
+ int ret;
+ const char *enc;
if (!scm_is_string (str))
scm_wrong_type_arg_msg (NULL, 0, str, "string");
@@ -1545,7 +1723,7 @@ scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
*lenp = 0;
return buf;
}
-
+
if (lenp == NULL)
for (i = 0; i < ilen; i++)
if (scm_i_string_ref (str, i) == '\0')
@@ -1553,8 +1731,10 @@ scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
"string contains #\\nul character: ~S",
scm_list_1 (str));
- if (scm_i_is_narrow_string (str))
+ if (scm_i_is_narrow_string (str) && (encoding == NULL))
{
+ /* If using native Latin-1 encoding, just copy the string
+ contents. */
if (lenp)
{
buf = scm_malloc (ilen);
@@ -1571,20 +1751,44 @@ scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
}
}
-
+
buf = NULL;
len = 0;
- buf = u32_conv_to_encoding (iso,
- (enum iconv_ilseq_handler) handler,
- (scm_t_uint32 *) scm_i_string_wide_chars (str),
- ilen, NULL, NULL, &len);
- if (buf == NULL)
- scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
- scm_list_2 (scm_from_locale_string (iso), str));
+ enc = encoding;
+ if (enc == NULL)
+ enc = "ISO-8859-1";
+ if (scm_i_is_narrow_string (str))
+ {
+ ret = mem_iconveh (scm_i_string_chars (str), ilen,
+ "ISO-8859-1", enc,
+ (enum iconv_ilseq_handler) handler, NULL,
+ &buf, &len);
- if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
- unistring_escapes_to_guile_escapes (&buf, &len);
+ if (ret == 0 && handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
+ unistring_escapes_to_guile_escapes (&buf, &len);
+ if (ret != 0)
+ {
+ scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
+ scm_list_2 (scm_from_locale_string (enc),
+ str));
+ }
+ }
+ else
+ {
+ buf = u32_conv_to_encoding (enc,
+ (enum iconv_ilseq_handler) handler,
+ (scm_t_uint32 *) scm_i_string_wide_chars (str),
+ ilen,
+ NULL,
+ NULL, &len);
+ if (buf == NULL)
+ {
+ scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
+ scm_list_2 (scm_from_locale_string (enc),
+ str));
+ }
+ }
if (lenp)
*lenp = len;
else
@@ -1603,6 +1807,14 @@ scm_to_locale_string (SCM str)
return scm_to_locale_stringn (str, NULL);
}
+scm_t_uint8 *
+scm_i_to_utf8_string (SCM str)
+{
+ char *u8str;
+ u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
+ return (scm_t_uint8 *) u8str;
+}
+
size_t
scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
{
diff --git a/libguile/strings.h b/libguile/strings.h
index d0cbb8dd3..2393aae91 100644
--- a/libguile/strings.h
+++ b/libguile/strings.h
@@ -124,6 +124,7 @@ SCM_API SCM scm_c_substring_copy (SCM str, size_t start, size_t end);
SCM_API int scm_is_string (SCM x);
SCM_API SCM scm_from_locale_string (const char *str);
SCM_API SCM scm_from_locale_stringn (const char *str, size_t len);
+SCM_INTERNAL SCM scm_i_from_utf8_string (const scm_t_uint8 *str);
SCM_API SCM scm_take_locale_string (char *str);
SCM_API SCM scm_take_locale_stringn (char *str, size_t len);
SCM_API char *scm_to_locale_string (SCM str);
@@ -132,6 +133,7 @@ SCM_INTERNAL char *scm_to_stringn (SCM str, size_t *lenp,
const char *encoding,
scm_t_string_failed_conversion_handler
handler);
+SCM_INTERNAL scm_t_uint8 *scm_i_to_utf8_string (SCM str);
SCM_API size_t scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len);
SCM_API SCM scm_makfromstrs (int argc, char **argv);
@@ -152,6 +154,7 @@ SCM_INTERNAL SCM scm_i_string_start_writing (SCM str);
SCM_INTERNAL void scm_i_string_stop_writing (void);
SCM_INTERNAL int scm_i_is_narrow_string (SCM str);
SCM_INTERNAL scm_t_wchar scm_i_string_ref (SCM str, size_t x);
+SCM_INTERNAL int scm_i_string_contains_char (SCM str, char c);
SCM_INTERNAL int scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr);
SCM_INTERNAL void scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr);
/* internal functions related to symbols. */
@@ -168,6 +171,7 @@ SCM_INTERNAL const char *scm_i_symbol_chars (SCM sym);
SCM_INTERNAL const scm_t_wchar *scm_i_symbol_wide_chars (SCM sym);
SCM_INTERNAL size_t scm_i_symbol_length (SCM sym);
SCM_INTERNAL int scm_i_is_narrow_symbol (SCM str);
+SCM_INTERNAL int scm_i_try_narrow_string (SCM str);
SCM_INTERNAL SCM scm_i_symbol_substring (SCM sym, size_t start, size_t end);
SCM_INTERNAL scm_t_wchar scm_i_symbol_ref (SCM sym, size_t x);
diff --git a/libguile/strports.c b/libguile/strports.c
index 619d9d750..490a15f8b 100644
--- a/libguile/strports.c
+++ b/libguile/strports.c
@@ -39,6 +39,7 @@
#include "libguile/modules.h"
#include "libguile/validate.h"
#include "libguile/deprecation.h"
+#include "libguile/srfi-4.h"
#include "libguile/strports.h"
@@ -289,42 +290,33 @@ st_truncate (SCM port, scm_t_off length)
}
SCM
-scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
+scm_i_mkstrport (SCM pos, const char *locale_str, size_t str_len, long modes, const char *caller)
{
- SCM z;
+ SCM z, str;
scm_t_port *pt;
- size_t str_len, c_pos;
-
- SCM_ASSERT (scm_is_string (str), str, SCM_ARG1, caller);
+ size_t c_pos;
+ char *buf;
+
+ /* Because ports are inherently 8-bit, strings need to be converted
+ to a locale representation for storage. But, since string ports
+ rely on string functionality for their memory management, we need
+ to create a new string that has the 8-bit locale representation
+ of the underlying string. This violates the guideline that the
+ internal encoding of characters in strings is in unicode
+ codepoints. */
+ str = scm_i_make_string (str_len, &buf);
+ memcpy (buf, locale_str, str_len);
- str_len = scm_i_string_length (str);
c_pos = scm_to_unsigned_integer (pos, 0, str_len);
if (!((modes & SCM_WRTNG) || (modes & SCM_RDNG)))
scm_misc_error ("scm_mkstrport", "port must read or write", SCM_EOL);
- /* XXX
-
- Make a new string to isolate us from changes to the original.
- This is done so that we can rely on scm_i_string_chars to stay in
- place even across SCM_TICKs.
-
- Additionally, when we are going to write to the string, we make a
- copy so that we can write to it without having to use
- scm_i_string_writable_chars.
- */
-
- if (modes & SCM_WRTNG)
- str = scm_c_substring_copy (str, 0, str_len);
- else
- str = scm_c_substring (str, 0, str_len);
-
scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
z = scm_new_port_table_entry (scm_tc16_strport);
pt = SCM_PTAB_ENTRY(z);
SCM_SETSTREAM (z, SCM_UNPACK (str));
SCM_SET_CELL_TYPE(z, scm_tc16_strport|modes);
- /* see above why we can use scm_i_string_chars here. */
pt->write_buf = pt->read_buf = (unsigned char *) scm_i_string_chars (str);
pt->read_pos = pt->write_pos = pt->read_buf + c_pos;
pt->write_buf_size = pt->read_buf_size = str_len;
@@ -340,22 +332,60 @@ scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
return z;
}
+SCM
+scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
+{
+ SCM z;
+ size_t str_len;
+ char *buf;
+
+ SCM_ASSERT (scm_is_string (str), str, SCM_ARG1, caller);
+
+ /* Because ports are inherently 8-bit, strings need to be converted
+ to a locale representation for storage. But, since string ports
+ rely on string functionality for their memory management, we need
+ to create a new string that has the 8-bit locale representation
+ of the underlying string. This violates the guideline that the
+ internal encoding of characters in strings is in unicode
+ codepoints. */
+ buf = scm_to_locale_stringn (str, &str_len);
+ z = scm_i_mkstrport (pos, buf, str_len, modes, caller);
+ free (buf);
+ return z;
+}
+
/* create a new string from a string port's buffer. */
SCM scm_strport_to_string (SCM port)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
SCM str;
- char *dst;
if (pt->rw_active == SCM_PORT_WRITE)
st_flush (port);
- str = scm_i_make_string (pt->read_buf_size, &dst);
- memcpy (dst, (char *) pt->read_buf, pt->read_buf_size);
+ str = scm_from_locale_stringn ((char *)pt->read_buf, pt->read_buf_size);
scm_remember_upto_here_1 (port);
return str;
}
+/* Create a vector containing the locale representation of the string in the
+ port's buffer. */
+SCM scm_strport_to_locale_u8vector (SCM port)
+{
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ SCM vec;
+ char *buf;
+
+ if (pt->rw_active == SCM_PORT_WRITE)
+ st_flush (port);
+
+ buf = scm_malloc (pt->read_buf_size);
+ memcpy (buf, pt->read_buf, pt->read_buf_size);
+ vec = scm_take_u8vector ((unsigned char *) buf, pt->read_buf_size);
+ scm_remember_upto_here_1 (port);
+ return vec;
+}
+
SCM_DEFINE (scm_object_to_string, "object->string", 1, 1, 0,
(SCM obj, SCM printer),
"Return a Scheme string obtained by printing @var{obj}.\n"
@@ -380,6 +410,25 @@ SCM_DEFINE (scm_object_to_string, "object->string", 1, 1, 0,
}
#undef FUNC_NAME
+SCM_DEFINE (scm_call_with_output_locale_u8vector, "call-with-output-locale-u8vector", 1, 0, 0,
+ (SCM proc),
+ "Calls the one-argument procedure @var{proc} with a newly created output\n"
+ "port. When the function returns, a vector containing the bytes of a\n"
+ "locale representation of the characters written into the port is returned\n")
+#define FUNC_NAME s_scm_call_with_output_locale_u8vector
+{
+ SCM p;
+
+ p = scm_mkstrport (SCM_INUM0,
+ scm_make_string (SCM_INUM0, SCM_UNDEFINED),
+ SCM_OPN | SCM_WRTNG,
+ FUNC_NAME);
+ scm_call_1 (proc, p);
+
+ return scm_get_output_locale_u8vector (p);
+}
+#undef FUNC_NAME
+
SCM_DEFINE (scm_call_with_output_string, "call-with-output-string", 1, 0, 0,
(SCM proc),
"Calls the one-argument procedure @var{proc} with a newly created output\n"
@@ -424,6 +473,27 @@ SCM_DEFINE (scm_open_input_string, "open-input-string", 1, 0, 0,
}
#undef FUNC_NAME
+SCM_DEFINE (scm_open_input_locale_u8vector, "open-input-locale-u8vector", 1, 0, 0,
+ (SCM vec),
+ "Take a u8vector containing the bytes of a string encoded in the\n"
+ "current locale and return an input port that delivers characters\n"
+ "from the string. The port can be closed by\n"
+ "@code{close-input-port}, though its storage will be reclaimed\n"
+ "by the garbage collector if it becomes inaccessible.")
+#define FUNC_NAME s_scm_open_input_locale_u8vector
+{
+ scm_t_array_handle hnd;
+ ssize_t inc;
+ size_t len;
+ const scm_t_uint8 *buf;
+
+ buf = scm_u8vector_elements (vec, &hnd, &len, &inc);
+ SCM p = scm_i_mkstrport(SCM_INUM0, (const char *) buf, len, SCM_OPN | SCM_RDNG, FUNC_NAME);
+ scm_array_handle_release (&hnd);
+ return p;
+}
+#undef FUNC_NAME
+
SCM_DEFINE (scm_open_output_string, "open-output-string", 0, 0, 0,
(void),
"Return an output port that will accumulate characters for\n"
@@ -456,11 +526,26 @@ SCM_DEFINE (scm_get_output_string, "get-output-string", 1, 0, 0,
#undef FUNC_NAME
+SCM_DEFINE (scm_get_output_locale_u8vector, "get-output-locale-u8vector", 1, 0, 0,
+ (SCM port),
+ "Given an output port created by @code{open-output-string},\n"
+ "return a u8 vector containing the characters of the string\n"
+ "encoded in the current locale.")
+#define FUNC_NAME s_scm_get_output_locale_u8vector
+{
+ SCM_VALIDATE_OPOUTSTRPORT (1, port);
+ return scm_strport_to_locale_u8vector (port);
+}
+#undef FUNC_NAME
+
+
/* Given a null-terminated string EXPR containing a Scheme expression
read it, and return it as an SCM value. */
SCM
scm_c_read_string (const char *expr)
{
+ /* FIXME: the c string gets packed into a string, only to get
+ immediately unpacked in scm_mkstrport. */
SCM port = scm_mkstrport (SCM_INUM0,
scm_from_locale_string (expr),
SCM_OPN | SCM_RDNG,
diff --git a/libguile/strports.h b/libguile/strports.h
index 3129c03e2..b2ded01f1 100644
--- a/libguile/strports.h
+++ b/libguile/strports.h
@@ -44,13 +44,19 @@ SCM_API scm_t_bits scm_tc16_strport;
SCM_API SCM scm_mkstrport (SCM pos, SCM str, long modes, const char * caller);
+SCM_INTERNAL SCM scm_i_mkstrport (SCM pos, const char *locale_str, size_t str_len,
+ long modes, const char *caller);
SCM_API SCM scm_strport_to_string (SCM port);
+SCM_API SCM scm_strport_to_locale_u8vector (SCM port);
SCM_API SCM scm_object_to_string (SCM obj, SCM printer);
SCM_API SCM scm_call_with_output_string (SCM proc);
+SCM_API SCM scm_call_with_output_locale_u8vector (SCM proc);
SCM_API SCM scm_call_with_input_string (SCM str, SCM proc);
SCM_API SCM scm_open_input_string (SCM str);
+SCM_API SCM scm_open_input_locale_u8vector (SCM str);
SCM_API SCM scm_open_output_string (void);
SCM_API SCM scm_get_output_string (SCM port);
+SCM_API SCM scm_get_output_locale_u8vector (SCM port);
SCM_API SCM scm_c_read_string (const char *expr);
SCM_API SCM scm_c_eval_string (const char *expr);
SCM_API SCM scm_c_eval_string_in_module (const char *expr, SCM module);
diff --git a/libguile/struct.c b/libguile/struct.c
index 9cb165e2f..f78a812ab 100644
--- a/libguile/struct.c
+++ b/libguile/struct.c
@@ -30,6 +30,7 @@
#include "libguile/hashtab.h"
#include "libguile/ports.h"
#include "libguile/strings.h"
+#include "libguile/srfi-13.h"
#include "libguile/validate.h"
#include "libguile/struct.h"
@@ -61,9 +62,9 @@ SCM_DEFINE (scm_make_struct_layout, "make-struct-layout", 1, 0, 0,
{
SCM new_sym;
SCM_VALIDATE_STRING (1, fields);
+ scm_t_wchar c;
{ /* scope */
- const char * field_desc;
size_t len;
int x;
@@ -72,11 +73,9 @@ SCM_DEFINE (scm_make_struct_layout, "make-struct-layout", 1, 0, 0,
SCM_MISC_ERROR ("odd length field specification: ~S",
scm_list_1 (fields));
- field_desc = scm_i_string_chars (fields);
-
for (x = 0; x < len; x += 2)
{
- switch (field_desc[x])
+ switch (c = scm_i_string_ref (fields, x))
{
case 'u':
case 'p':
@@ -88,13 +87,13 @@ SCM_DEFINE (scm_make_struct_layout, "make-struct-layout", 1, 0, 0,
break;
default:
SCM_MISC_ERROR ("unrecognized field type: ~S",
- scm_list_1 (SCM_MAKE_CHAR (field_desc[x])));
+ scm_list_1 (SCM_MAKE_CHAR (c)));
}
- switch (field_desc[x + 1])
+ switch (c = scm_i_string_ref (fields, x + 1))
{
case 'w':
- if (field_desc[x] == 's')
+ if (scm_i_string_ref (fields, x) == 's')
SCM_MISC_ERROR ("self fields not writable", SCM_EOL);
case 'r':
case 'o':
@@ -102,7 +101,7 @@ SCM_DEFINE (scm_make_struct_layout, "make-struct-layout", 1, 0, 0,
case 'R':
case 'W':
case 'O':
- if (field_desc[x] == 's')
+ if (scm_i_string_ref (fields, x) == 's')
SCM_MISC_ERROR ("self fields not allowed in tail array",
SCM_EOL);
if (x != len - 2)
@@ -111,12 +110,12 @@ SCM_DEFINE (scm_make_struct_layout, "make-struct-layout", 1, 0, 0,
break;
default:
SCM_MISC_ERROR ("unrecognized ref specification: ~S",
- scm_list_1 (SCM_MAKE_CHAR (field_desc[x + 1])));
+ scm_list_1 (SCM_MAKE_CHAR (c)));
}
#if 0
- if (field_desc[x] == 'd')
+ if (scm_i_string_ref (fields, x, 'd'))
{
- if (field_desc[x + 2] != '-')
+ if (!scm_i_string_ref (fields, x+2, '-'))
SCM_MISC_ERROR ("missing dash field at position ~A",
scm_list_1 (scm_from_int (x / 2)));
x += 2;
@@ -138,18 +137,18 @@ SCM_DEFINE (scm_make_struct_layout, "make-struct-layout", 1, 0, 0,
static void
scm_struct_init (SCM handle, SCM layout, scm_t_bits * mem, int tail_elts, SCM inits)
{
- unsigned const char *fields_desc =
- (unsigned const char *) scm_i_symbol_chars (layout) - 2;
- unsigned char prot = 0;
+ scm_t_wchar prot = 0;
int n_fields = scm_i_symbol_length (layout) / 2;
int tailp = 0;
+ int i;
+ i = -2;
while (n_fields)
{
if (!tailp)
{
- fields_desc += 2;
- prot = fields_desc[1];
+ i += 2;
+ prot = scm_i_symbol_ref (layout, i+1);
if (SCM_LAYOUT_TAILP (prot))
{
tailp = 1;
@@ -160,8 +159,7 @@ scm_struct_init (SCM handle, SCM layout, scm_t_bits * mem, int tail_elts, SCM in
break;
}
}
-
- switch (*fields_desc)
+ switch (scm_i_symbol_ref (layout, i))
{
#if 0
case 'i':
@@ -237,7 +235,8 @@ SCM_DEFINE (scm_struct_vtable_p, "struct-vtable?", 1, 0, 0,
{
SCM layout;
scm_t_bits * mem;
- int tmp;
+ SCM tmp;
+ size_t len;
if (!SCM_STRUCTP (x))
return SCM_BOOL_F;
@@ -248,11 +247,14 @@ SCM_DEFINE (scm_struct_vtable_p, "struct-vtable?", 1, 0, 0,
< scm_i_string_length (required_vtable_fields))
return SCM_BOOL_F;
- tmp = strncmp (scm_i_symbol_chars (layout),
- scm_i_string_chars (required_vtable_fields),
- scm_i_string_length (required_vtable_fields));
- scm_remember_upto_here_1 (required_vtable_fields);
- if (tmp)
+ len = scm_i_string_length (required_vtable_fields);
+ tmp = scm_string_eq (scm_symbol_to_string (layout),
+ required_vtable_fields,
+ scm_from_size_t (0),
+ scm_from_size_t (len),
+ scm_from_size_t (0),
+ scm_from_size_t (len));
+ if (scm_is_false (tmp))
return SCM_BOOL_F;
mem = SCM_STRUCT_DATA (x);
@@ -646,8 +648,7 @@ SCM_DEFINE (scm_struct_ref, "struct-ref", 2, 0, 0,
size_t layout_len;
size_t p;
scm_t_bits n_fields;
- const char *fields_desc;
- char field_type = 0;
+ scm_t_wchar field_type = 0;
SCM_VALIDATE_STRUCT (1, handle);
@@ -656,7 +657,6 @@ SCM_DEFINE (scm_struct_ref, "struct-ref", 2, 0, 0,
data = SCM_STRUCT_DATA (handle);
p = scm_to_size_t (pos);
- fields_desc = scm_i_symbol_chars (layout);
layout_len = scm_i_symbol_length (layout);
if (SCM_STRUCT_VTABLE_FLAGS (handle) & SCM_STRUCTF_LIGHT)
/* no extra words */
@@ -668,9 +668,9 @@ SCM_DEFINE (scm_struct_ref, "struct-ref", 2, 0, 0,
if (p * 2 < layout_len)
{
- char ref;
- field_type = fields_desc[p * 2];
- ref = fields_desc[p * 2 + 1];
+ scm_t_wchar ref;
+ field_type = scm_i_symbol_ref (layout, p * 2);
+ ref = scm_i_symbol_ref (layout, p * 2 + 1);
if ((ref != 'r') && (ref != 'w'))
{
if ((ref == 'R') || (ref == 'W'))
@@ -679,8 +679,8 @@ SCM_DEFINE (scm_struct_ref, "struct-ref", 2, 0, 0,
SCM_MISC_ERROR ("ref denied for field ~A", scm_list_1 (pos));
}
}
- else if (fields_desc[layout_len - 1] != 'O')
- field_type = fields_desc[layout_len - 2];
+ else if (scm_i_symbol_ref (layout, layout_len - 1) != 'O')
+ field_type = scm_i_symbol_ref(layout, layout_len - 2);
else
SCM_MISC_ERROR ("ref denied for field ~A", scm_list_1 (pos));
@@ -728,8 +728,7 @@ SCM_DEFINE (scm_struct_set_x, "struct-set!", 3, 0, 0,
size_t layout_len;
size_t p;
int n_fields;
- const char *fields_desc;
- char field_type = 0;
+ scm_t_wchar field_type = 0;
SCM_VALIDATE_STRUCT (1, handle);
@@ -737,7 +736,6 @@ SCM_DEFINE (scm_struct_set_x, "struct-set!", 3, 0, 0,
data = SCM_STRUCT_DATA (handle);
p = scm_to_size_t (pos);
- fields_desc = scm_i_symbol_chars (layout);
layout_len = scm_i_symbol_length (layout);
if (SCM_STRUCT_VTABLE_FLAGS (handle) & SCM_STRUCTF_LIGHT)
/* no extra words */
@@ -750,13 +748,13 @@ SCM_DEFINE (scm_struct_set_x, "struct-set!", 3, 0, 0,
if (p * 2 < layout_len)
{
char set_x;
- field_type = fields_desc[p * 2];
- set_x = fields_desc [p * 2 + 1];
+ field_type = scm_i_symbol_ref (layout, p * 2);
+ set_x = scm_i_symbol_ref (layout, p * 2 + 1);
if (set_x != 'w')
SCM_MISC_ERROR ("set! denied for field ~A", scm_list_1 (pos));
}
- else if (fields_desc[layout_len - 1] == 'W')
- field_type = fields_desc[layout_len - 2];
+ else if (scm_i_symbol_ref (layout, layout_len - 1) == 'W')
+ field_type = scm_i_symbol_ref (layout, layout_len - 2);
else
SCM_MISC_ERROR ("set! denied for field ~A", scm_list_1 (pos));
diff --git a/libguile/throw.c b/libguile/throw.c
index 4413efadf..cf6ea4a49 100644
--- a/libguile/throw.c
+++ b/libguile/throw.c
@@ -23,6 +23,7 @@
#endif
#include <stdio.h>
+#include <unistdio.h>
#include "libguile/_scm.h"
#include "libguile/async.h"
#include "libguile/smob.h"
@@ -744,8 +745,12 @@ scm_ithrow (SCM key, SCM args, int noreturn SCM_UNUSED)
*/
fprintf (stderr, "throw from within critical section.\n");
if (scm_is_symbol (key))
- fprintf (stderr, "error key: %s\n", scm_i_symbol_chars (key));
-
+ {
+ if (scm_i_is_narrow_symbol (key))
+ fprintf (stderr, "error key: %s\n", scm_i_symbol_chars (key));
+ else
+ ulc_fprintf (stderr, "error key: %llU\n", scm_i_symbol_wide_chars (key));
+ }
for (; scm_is_pair (s); s = scm_cdr (s), i++)
{
diff --git a/module/system/base/compile.scm b/module/system/base/compile.scm
index 8470f39e2..26dd29e20 100644
--- a/module/system/base/compile.scm
+++ b/module/system/base/compile.scm
@@ -145,8 +145,11 @@
(from (current-language))
(to 'objcode)
(opts '()))
- (let ((comp (or output-file (compiled-file-name file)))
- (in (open-input-file file)))
+ (let* ((comp (or output-file (compiled-file-name file)))
+ (in (open-input-file file))
+ (enc (file-encoding in)))
+ (if enc
+ (set-port-encoding! in enc))
(ensure-writable-dir (dirname comp))
(call-with-output-file/atomic comp
(lambda (port)
diff --git a/test-suite/tests/encoding-escapes.test b/test-suite/tests/encoding-escapes.test
new file mode 100644
index 000000000..8859d2e83
--- /dev/null
+++ b/test-suite/tests/encoding-escapes.test
@@ -0,0 +1,139 @@
+;;;; encoding-escapes.test --- test suite for Guile's string encodings -*- mode: scheme; coding: utf-8 -*-
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;;
+;;;; This program is free software; you can redistribute it and/or modify
+;;;; it under the terms of the GNU General Public License as published by
+;;;; the Free Software Foundation; either version 2, or (at your option)
+;;;; any later version.
+;;;;
+;;;; This program is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;;; GNU General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU General Public License
+;;;; along with this software; see the file COPYING. If not, write to
+;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;;;; Boston, MA 02110-1301 USA
+
+(define-module (test-strings)
+ #:use-module (test-suite lib)
+ #:use-module (srfi srfi-1))
+
+(define exception:conversion
+ (cons 'misc-error "^cannot convert to output locale"))
+
+;; Create a string from integer char values, eg. (string-ints 65) => "A"
+(define (string-ints . args)
+ (apply string (map integer->char args)))
+
+(define s1 "última")
+(define s2 "cédula")
+(define s3 "años")
+(define s4 "羅生門")
+
+(with-test-prefix "internal encoding"
+
+ (pass-if "ultima"
+ (string=? s1 (string-ints #xfa #x6c #x74 #x69 #x6d #x61)))
+
+ (pass-if "cedula"
+ (string=? s2 (string-ints #x63 #xe9 #x64 #x75 #x6c #x61)))
+
+ (pass-if "anos"
+ (string=? s3 (string-ints #x61 #xf1 #x6f #x73)))
+
+ (pass-if "Rashomon"
+ (string=? s4 (string-ints #x7f85 #x751f #x9580))))
+
+(with-test-prefix "chars"
+
+ (pass-if "ultima"
+ (list= eqv? (string->list s1)
+ (list #\372 #\l #\t #\i #\m #\a)))
+
+ (pass-if "cedula"
+ (list= eqv? (string->list s2)
+ (list #\c #\351 #\d #\u #\l #\a)))
+
+ (pass-if "anos"
+ (list= eqv? (string->list s3)
+ (list #\a #\361 #\o #\s)))
+
+ (pass-if "Rashomon"
+ (list= eqv? (string->list s4)
+ (list #\77605 #\72437 #\112600))))
+
+
+;; Check that an error is flagged on display output when the output
+;; error strategy is 'error
+
+(with-test-prefix "display output errors"
+
+ (pass-if-exception "ultima"
+ exception:conversion
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'error)
+ (display s1 pt)))
+
+ (pass-if-exception "Rashomon"
+ exception:conversion
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'error)
+ (display s4 pt))))
+
+;; Check that questions marks or substitutions appear when the conversion
+;; mode is substitute
+(with-test-prefix "display output substitutions"
+
+ (pass-if "ultima"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'substitute)
+ (display s1 pt)
+ (string=? "?ltima"
+ (get-output-string pt))))
+
+ (pass-if "Rashomon"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'substitute)
+ (display s4 pt)
+ (string=? "???"
+ (get-output-string pt)))))
+
+
+;; Check that hex escapes appear in the write output and that no error
+;; is thrown. The output error strategy should be irrelevant here.
+(with-test-prefix "display output escapes"
+
+ (pass-if "ultima"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'escape)
+ (display s1 pt)
+ (string=? "\\xfaltima"
+ (get-output-string pt))))
+ (pass-if "Rashomon"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'escape)
+ (display s4 pt)
+ (string=? "\\u7F85\\u751F\\u9580"
+ (get-output-string pt)))))
+
+(setlocale LC_ALL "en_US.utf8")
+
+(with-test-prefix "input escapes"
+
+ (pass-if "última"
+ (string=? "última"
+ (with-input-from-string "\"\\xfaltima\"" read)))
+
+ (pass-if "羅生門"
+ (string=? "羅生門"
+ (with-input-from-string "\"\\u7F85\\u751F\\u9580\"" read))))
+
diff --git a/test-suite/tests/encoding-iso88591.test b/test-suite/tests/encoding-iso88591.test
new file mode 100644
index 000000000..edd573432
--- /dev/null
+++ b/test-suite/tests/encoding-iso88591.test
@@ -0,0 +1,135 @@
+;;;; strings.test --- test suite for Guile's string functions -*- mode: scheme; coding: iso-8859-1 -*-
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;;
+;;;; This program is free software; you can redistribute it and/or modify
+;;;; it under the terms of the GNU General Public License as published by
+;;;; the Free Software Foundation; either version 2, or (at your option)
+;;;; any later version.
+;;;;
+;;;; This program is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;;; GNU General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU General Public License
+;;;; along with this software; see the file COPYING. If not, write to
+;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;;;; Boston, MA 02110-1301 USA
+
+(define-module (test-strings)
+ #:use-module (test-suite lib)
+ #:use-module (srfi srfi-1))
+
+(define exception:conversion
+ (cons 'misc-error "^cannot convert to output locale"))
+
+;; Create a string from integer char values, eg. (string-ints 65) => "A"
+(define (string-ints . args)
+ (apply string (map integer->char args)))
+
+(setlocale LC_ALL "")
+
+(define s1 "ltima")
+(define s2 "cdula")
+(define s3 "aos")
+(define s4 "Cmo?")
+
+(with-test-prefix "string length"
+
+ (pass-if "ltima"
+ (eq? (string-length s1) 6))
+
+ (pass-if "cdula"
+ (eq? (string-length s2) 6))
+
+ (pass-if "aos"
+ (eq? (string-length s3) 4))
+
+ (pass-if "Cmo?"
+ (eq? (string-length s4) 6)))
+
+(with-test-prefix "internal encoding"
+
+ (pass-if "ltima"
+ (string=? s1 (string-ints #xfa #x6c #x74 #x69 #x6d #x61)))
+
+ (pass-if "cdula"
+ (string=? s2 (string-ints #x63 #xe9 #x64 #x75 #x6c #x61)))
+
+ (pass-if "aos"
+ (string=? s3 (string-ints #x61 #xf1 #x6f #x73)))
+
+ (pass-if "Cmo?"
+ (string=? s4 (string-ints #xbf #x43 #xf3 #x6d #x6f #x3f))))
+
+(with-test-prefix "chars"
+
+ (pass-if "ltima"
+ (list= eqv? (string->list s1)
+ (list #\ #\l #\t #\i #\m #\a)))
+
+ (pass-if "cdula"
+ (list= eqv? (string->list s2)
+ (list #\c #\ #\d #\u #\l #\a)))
+
+ (pass-if "aos"
+ (list= eqv? (string->list s3)
+ (list #\a #\ #\o #\s)))
+
+ (pass-if "Cmo?"
+ (list= eqv? (string->list s4)
+ (list #\ #\C #\ #\m #\o #\?))))
+
+;; Check that the output is in ISO-8859-1 encoding
+(with-test-prefix "display"
+
+ (pass-if "s1"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-1")
+ (display s1 pt)
+ (list= eqv?
+ (list #xfa #x6c #x74 #x69 #x6d #x61)
+ (u8vector->list
+ (get-output-locale-u8vector pt)))))
+
+ (pass-if "s2"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-1")
+ (display s2 pt)
+ (list= eqv?
+ (list #x63 #xe9 #x64 #x75 #x6c #x61)
+ (u8vector->list
+ (get-output-locale-u8vector pt))))))
+
+(with-test-prefix "symbols == strings"
+
+ (pass-if "ltima"
+ (eq? (string->symbol s1) 'ltima))
+
+ (pass-if "cdula"
+ (eq? (string->symbol s2) 'cdula))
+
+ (pass-if "aos"
+ (eq? (string->symbol s3) 'aos))
+
+ (pass-if "Cmo?"
+ (eq? (string->symbol s4) 'Cmo?)))
+
+(with-test-prefix "non-ascii variable names"
+
+ (pass-if "1"
+ (let (( 1)
+ ( 2))
+ (eq? (+ ) 3))))
+
+(with-test-prefix "output errors"
+
+ (pass-if-exception "char 256" exception:conversion
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-1")
+ (set-port-conversion-strategy! pt 'error)
+ (display (string-ints 256) pt))))
+
+;; Reset locales
+(setlocale LC_ALL "C") \ No newline at end of file
diff --git a/test-suite/tests/encoding-iso88597.test b/test-suite/tests/encoding-iso88597.test
new file mode 100644
index 000000000..8985042f4
--- /dev/null
+++ b/test-suite/tests/encoding-iso88597.test
@@ -0,0 +1,136 @@
+;;;; strings.test --- test suite for Guile's string functions -*- mode: scheme; coding: iso-8859-7 -*-
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;;
+;;;; This program is free software; you can redistribute it and/or modify
+;;;; it under the terms of the GNU General Public License as published by
+;;;; the Free Software Foundation; either version 2, or (at your option)
+;;;; any later version.
+;;;;
+;;;; This program is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;;; GNU General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU General Public License
+;;;; along with this software; see the file COPYING. If not, write to
+;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;;;; Boston, MA 02110-1301 USA
+
+(define-module (test-strings)
+ #:use-module (test-suite lib)
+ #:use-module (srfi srfi-1))
+
+(define exception:conversion
+ (cons 'misc-error "^cannot convert to output locale"))
+
+;; Create a string from integer char values, eg. (string-ints 65) => "A"
+(define (string-ints . args)
+ (apply string (map integer->char args)))
+
+(setlocale LC_ALL "")
+
+(define s1 "")
+(define s2 "")
+(define s3 "")
+(define s4 "")
+
+(with-test-prefix "string length"
+
+ (pass-if "s1"
+ (eq? (string-length s1) 4))
+
+ (pass-if "s2"
+ (eq? (string-length s2) 3))
+
+ (pass-if "s3"
+ (eq? (string-length s3) 8))
+
+ (pass-if "s4"
+ (eq? (string-length s4) 3)))
+
+(with-test-prefix "internal encoding"
+
+ (pass-if "s1"
+ (string=? s1 (string-ints #x03a0 #x03b5 #x03c1 #x03af)))
+
+ (pass-if "s2"
+ (string=? s2 (string-ints #x03c4 #x03b7 #x03c2)))
+
+ (pass-if "s3"
+ (string=? s3 (string-ints #x03ba #x03c1 #x03b9 #x03c4 #x03b9 #x03ba #x03ae #x03c2)))
+
+ (pass-if "s4"
+ (string=? s4 (string-ints #x03ba #x03b1 #x03b9))))
+
+(with-test-prefix "chars"
+
+ (pass-if "s1"
+ (list= eqv? (string->list s1)
+ (list #\ #\ #\ #\)))
+
+ (pass-if "s2"
+ (list= eqv? (string->list s2)
+ (list #\ #\ #\)))
+
+ (pass-if "s3"
+ (list= eqv? (string->list s3)
+ (list #\ #\ #\ #\ #\ #\ #\ #\)))
+
+ (pass-if "s4"
+ (list= eqv? (string->list s4)
+ (list #\ #\ #\))))
+
+;; Testing that the display of the string is output in the ISO-8859-7
+;; encoding
+(with-test-prefix "display"
+
+ (pass-if "s1"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-7")
+ (display s1 pt)
+ (list= eqv?
+ (list #xd0 #xe5 #xf1 #xdf)
+ (u8vector->list
+ (get-output-locale-u8vector pt)))))
+ (pass-if "s2"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-7")
+ (display s2 pt)
+ (list= eqv?
+ (list #xf4 #xe7 #xf2)
+ (u8vector->list
+ (get-output-locale-u8vector pt))))))
+
+(with-test-prefix "symbols == strings"
+
+ (pass-if ""
+ (eq? (string->symbol s1) '))
+
+ (pass-if ""
+ (eq? (string->symbol s2) '))
+
+ (pass-if ""
+ (eq? (string->symbol s3) '))
+
+ (pass-if ""
+ (eq? (string->symbol s4) ')))
+
+(with-test-prefix "non-ascii variable names"
+
+ (pass-if "1"
+ (let (( 1)
+ ( 2))
+ (eq? (+ ) 3))))
+
+(with-test-prefix "output errors"
+
+ (pass-if-exception "char #x0400"
+ exception:conversion
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-7")
+ (set-port-conversion-strategy! pt 'error)
+ (display (string-ints #x0400) pt))))
+
+;; Reset locale
+(setlocale LC_ALL "C") \ No newline at end of file
diff --git a/test-suite/tests/encoding-utf8.test b/test-suite/tests/encoding-utf8.test
new file mode 100644
index 000000000..83e7540f3
--- /dev/null
+++ b/test-suite/tests/encoding-utf8.test
@@ -0,0 +1,105 @@
+;;;; strings.test --- test suite for Guile's string functions -*- mode: scheme; coding: utf-8 -*-
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;;
+;;;; This program is free software; you can redistribute it and/or modify
+;;;; it under the terms of the GNU General Public License as published by
+;;;; the Free Software Foundation; either version 2, or (at your option)
+;;;; any later version.
+;;;;
+;;;; This program is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;;; GNU General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU General Public License
+;;;; along with this software; see the file COPYING. If not, write to
+;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;;;; Boston, MA 02110-1301 USA
+
+(define-module (test-strings)
+ #:use-module (test-suite lib)
+ #:use-module (srfi srfi-1))
+
+(define exception:conversion
+ (cons 'misc-error "^cannot convert to output locale"))
+
+;; Create a string from integer char values, eg. (string-ints 65) => "A"
+(define (string-ints . args)
+ (apply string (map integer->char args)))
+
+(setlocale LC_ALL "")
+
+(define s1 "última")
+(define s2 "cédula")
+(define s3 "años")
+(define s4 "羅生門")
+
+(with-test-prefix "string length"
+
+ (pass-if "última"
+ (eq? (string-length s1) 6))
+
+ (pass-if "cédula"
+ (eq? (string-length s2) 6))
+
+ (pass-if "años"
+ (eq? (string-length s3) 4))
+
+ (pass-if "羅生門"
+ (eq? (string-length s4) 3)))
+
+(with-test-prefix "internal encoding"
+
+ (pass-if "última"
+ (string=? s1 (string-ints #xfa #x6c #x74 #x69 #x6d #x61)))
+
+ (pass-if "cédula"
+ (string=? s2 (string-ints #x63 #xe9 #x64 #x75 #x6c #x61)))
+
+ (pass-if "años"
+ (string=? s3 (string-ints #x61 #xf1 #x6f #x73)))
+
+ (pass-if "羅生門"
+ (string=? s4 (string-ints #x7f85 #x751f #x9580))))
+
+(with-test-prefix "chars"
+
+ (pass-if "última"
+ (list= eqv? (string->list s1)
+ (list #\ú #\l #\t #\i #\m #\a)))
+
+ (pass-if "cédula"
+ (list= eqv? (string->list s2)
+ (list #\c #\é #\d #\u #\l #\a)))
+
+ (pass-if "años"
+ (list= eqv? (string->list s3)
+ (list #\a #\ñ #\o #\s)))
+
+ (pass-if "羅生門"
+ (list= eqv? (string->list s4)
+ (list #\羅 #\生 #\門))))
+
+(with-test-prefix "symbols == strings"
+
+ (pass-if "última"
+ (eq? (string->symbol s1) 'última))
+
+ (pass-if "cédula"
+ (eq? (string->symbol s2) 'cédula))
+
+ (pass-if "años"
+ (eq? (string->symbol s3) 'años))
+
+ (pass-if "羅生門"
+ (eq? (string->symbol s4) '羅生門)))
+
+(with-test-prefix "non-ascii variable names"
+
+ (pass-if "1"
+ (let ((芥川龍之介 1)
+ (ñ 2))
+ (eq? (+ 芥川龍之介 ñ) 3))))
+
+
diff --git a/test-suite/tests/numbers.test b/test-suite/tests/numbers.test
index 4a9476a52..774e228a7 100644
--- a/test-suite/tests/numbers.test
+++ b/test-suite/tests/numbers.test
@@ -22,6 +22,7 @@
;;;
;;; miscellaneous
;;;
+(setbinary)
(define exception:numerical-overflow
(cons 'numerical-overflow "^Numerical overflow"))
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test
index 67df5b979..76b3e5656 100644
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -33,6 +33,9 @@
;;;; Some general utilities for testing ports.
+;;; Make sure we are set up for 8-bit data
+(setbinary)
+
;;; Read from PORT until EOF, and return the result as a string.
(define (read-all port)
(let loop ((chars '()))
diff --git a/test-suite/tests/r6rs-ports.test b/test-suite/tests/r6rs-ports.test
index df12e5cbc..c2b0755f8 100644
--- a/test-suite/tests/r6rs-ports.test
+++ b/test-suite/tests/r6rs-ports.test
@@ -27,6 +27,9 @@
;;; All these tests assume Guile 1.8's port system, where characters are
;;; treated as octets.
+;;; Set the default encoding of future ports to be binary
+(setbinary)
+
(with-test-prefix "7.2.5 End-of-File Object"
diff --git a/test-suite/tests/time.test b/test-suite/tests/time.test
index 38a49d384..da7a48c04 100644
--- a/test-suite/tests/time.test
+++ b/test-suite/tests/time.test
@@ -202,6 +202,11 @@
(string=? (strftime "%Z" t)
"ZOW")))
+ (pass-if "strftime passes wide characters"
+ (let ((t (localtime (current-time))))
+ (string=? (substring (strftime "\u0100%Z" t) 0 1)
+ "\u0100")))
+
(with-test-prefix "C99 %z format"
;; %z here is quite possibly affected by the same tm:gmtoff vs current