summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Gran <spk121@yahoo.com>2009-08-25 07:54:37 -0700
committerMichael Gran <spk121@yahoo.com>2009-08-25 07:54:37 -0700
commit889975e51accb80491af76fc5db980aeb3edd342 (patch)
tree43b57c4388abcf86e97f8f413abc4ebbcaeac1a8
parent9db8cf1634ca9a91cb88b2532f7b87f2502b4abd (diff)
downloadguile-889975e51accb80491af76fc5db980aeb3edd342.tar.gz
Add full Unicode capability to ports and the default reader
Ports are given two additional properties: a character encoding and a conversion failure strategy. These properties have getters and setters. The new properties are used to convert any locale text to/from the internal representation of strings. If unspecified, ports use a default value. The default value of these properties is held in a fluid. The default character encoding can be modified by calling setlocale. ISO-8859-1 is treated specially. Since it is a native encoding of strings, it can be processed more quickly. Source code is assumed to be ISO-8859-1 unless otherwise specified. The encoding of a source code file can be given as 'coding: XXXXX' in a magic comment at the top of a file. The C functions that deal with encoding often use a null pointer as shorthand for the native Latin-1 encoding, for efficiency's sake. * test-suite/tests/encoding-iso88591.test: new tests * test-suite/tests/encoding-iso88597.test: new tests * test-suite/tests/encoding-utf8.test: new tests * test-suite/tests/encoding-escapes.test: new tests * test-suite/tests/numbers.test: declare 'binary' encoding * test-suite/tests/ports.test: declare 'binary' encoding * test-suite/tests/r6rs-ports.test: declare 'binary' encoding * module/system/base/compile.scm (compile-file): use source-code file's self-declared encoding when compiling files * libguile/strports.c: store string ports in locale encoding (scm_strport_to_locale_u8vector, scm_call_with_output_locale_u8vector) (scm_open_input_locale_u8vector, scm_get_output_locale_u8vector): new functions * libguile/strings.h: new declaration for scm_i_string_contains_char * libguile/strings.c (scm_i_string_contains_char): new function (scm_from_stringn, scm_to_stringn): use NULL for Latin-1 (scm_from_locale_stringn, scm_to_locale_stringn): respect character encoding of input and output ports * libguile/read.h: declaration for scm_scan_for_encoding * libguile/read.c: (read_token): now takes scheme string instead of C string/length (read_complete_token): new function (scm_read_sexp, scm_read_number, scm_read_mixed_case_symbol) (scm_read_number_and_radix, scm_read_quote, scm_read_semicolon_comment) (scm_read_srfi4_vector, scm_read_bytevector, scm_read_guile_bit_vector) (scm_read_scsh_block_comment, scm_read_commented_expression) (scm_read_extended_symbol, scm_read_sharp_extension, scm_read_shart) (scm_read_expression): use scm_t_wchar for char type, use read_complete_token (scm_scan_for_encoding): new function to find a file's character encoding (scm_file_encoding): new function to find a port's character encoding * libguile/rdelim.c: don't unpack strings * libguile/print.h: declaration for modified function scm_i_charprint * libguile/print.c: use locale when printing characters and strings (scm_i_charprint): input parameter is now scm_t_wchar (scm_simple_format): don't unpack strings * libguile/posix.h: new declaration for scm_setbinary. * libguile/posix.c (scm_setlocale): set default and stdio port encodings based on the locale's character encoding (scm_setbinary): new function * libguile/ports.h (scm_t_port): add encoding and failed conversion handler to port type. Declarations for new or modified functions scm_getc, scm_unget_byte, scm_ungetc, scm_i_get_port_encoding, scm_i_set_port_encoding_x, scm_port_encoding, scm_set_port_encoding_x, scm_i_get_conversion_strategy, scm_i_set_conversion_strategy_x, scm_port_conversion_strategy, scm_set_port_conversion_strategy_x. * libguile/ports.c: assign the current ports to zero on startup so we can see if they've been set. (scm_current_input_port, scm_current_output_port, scm_current_error_port): return #f if the port is not yet initialized (scm_new_port_table_entry): set up a new port's encoding and illegal sequence handler based on the thread's current defaults (scm_i_remove_port): free port encoding name when port is removed (scm_i_mode_bits_n): now takes a scheme string instead of a c string and length. All callers changed. (SCM_MBCHAR_BUF_SIZE): new const (scm_getc): new function, since the scm_getc in inline.h is now scm_get_byte_or_eof. This pulls one codepoint from a port. (scm_lfwrite_substr, scm_lfwrite_str): now uses port's encoding (scm_unget_byte): new function, incorportaing the low-level functionality of scm_ungetc (scm_ungetc): uses scm_unget_byte * libguile/numbers.h (scm_t_wchar): compilation order problem with scm_t_wchar being use in functions in multiple headers. Forward declare scm_t_wchar. * libguile/load.c (scm_primitive_load): scan for file encoding at top of file and use it to set the load port's encoding * libguile/inline.h (scm_get_byte_or_eof): new function incorporating most of the functionality of scm_getc. * libguile/fports.c (fport_fill_input): now returns scm_t_wchar * libguile/chars.h (scm_t_wchar): avoid compilation order problem with declaration of scm_t_wchar
-rw-r--r--libguile/chars.h6
-rw-r--r--libguile/fports.c4
-rw-r--r--libguile/inline.h25
-rw-r--r--libguile/load.c11
-rw-r--r--libguile/numbers.h6
-rw-r--r--libguile/ports.c557
-rw-r--r--libguile/ports.h20
-rw-r--r--libguile/posix.c46
-rw-r--r--libguile/posix.h1
-rw-r--r--libguile/print.c103
-rw-r--r--libguile/print.h3
-rw-r--r--libguile/rdelim.c6
-rw-r--r--libguile/read.c383
-rw-r--r--libguile/read.h2
-rw-r--r--libguile/strings.c175
-rw-r--r--libguile/strings.h1
-rw-r--r--libguile/strports.c137
-rw-r--r--libguile/strports.h6
-rw-r--r--module/system/base/compile.scm7
-rw-r--r--test-suite/tests/encoding-escapes.test139
-rw-r--r--test-suite/tests/encoding-iso88591.test135
-rw-r--r--test-suite/tests/encoding-iso88597.test136
-rw-r--r--test-suite/tests/encoding-utf8.test105
-rw-r--r--test-suite/tests/numbers.test1
-rw-r--r--test-suite/tests/ports.test3
-rw-r--r--test-suite/tests/r6rs-ports.test3
26 files changed, 1705 insertions, 316 deletions
diff --git a/libguile/chars.h b/libguile/chars.h
index f75aeadd1..85b16739a 100644
--- a/libguile/chars.h
+++ b/libguile/chars.h
@@ -24,7 +24,11 @@
#include "libguile/__scm.h"
-#include "libguile/numbers.h"
+
+#ifndef SCM_T_WCHAR_DEFINED
+typedef scm_t_int32 scm_t_wchar;
+#define SCM_T_WCHAR_DEFINED
+#endif /* SCM_T_WCHAR_DEFINED */
/* Immediate Characters
diff --git a/libguile/fports.c b/libguile/fports.c
index cfb8b25b6..8e25ebdd5 100644
--- a/libguile/fports.c
+++ b/libguile/fports.c
@@ -587,7 +587,7 @@ static void fport_flush (SCM port);
/* fill a port's read-buffer with a single read. returns the first
char or EOF if end of file. */
-static int
+static scm_t_wchar
fport_fill_input (SCM port)
{
long count;
@@ -601,7 +601,7 @@ fport_fill_input (SCM port)
if (count == -1)
scm_syserror ("fport_fill_input");
if (count == 0)
- return EOF;
+ return (scm_t_wchar) EOF;
else
{
pt->read_pos = pt->read_buf;
diff --git a/libguile/inline.h b/libguile/inline.h
index cb908581d..b37834598 100644
--- a/libguile/inline.h
+++ b/libguile/inline.h
@@ -86,7 +86,7 @@ SCM_API void scm_array_handle_set (scm_t_array_handle *h, ssize_t pos, SCM val);
SCM_API int scm_is_pair (SCM x);
-SCM_API int scm_getc (SCM port);
+SCM_API int scm_get_byte_or_eof (SCM port);
SCM_API void scm_putc (char c, SCM port);
SCM_API void scm_puts (const char *str_data, SCM port);
@@ -290,7 +290,7 @@ scm_is_pair (SCM x)
SCM_C_EXTERN_INLINE
#endif
int
-scm_getc (SCM port)
+scm_get_byte_or_eof (SCM port)
{
int c;
scm_t_port *pt = SCM_PTAB_ENTRY (port);
@@ -310,27 +310,6 @@ scm_getc (SCM port)
c = *(pt->read_pos++);
- switch (c)
- {
- case '\a':
- break;
- case '\b':
- SCM_DECCOL (port);
- break;
- case '\n':
- SCM_INCLINE (port);
- break;
- case '\r':
- SCM_ZEROCOL (port);
- break;
- case '\t':
- SCM_TABCOL (port);
- break;
- default:
- SCM_INCCOL (port);
- break;
- }
-
return c;
}
diff --git a/libguile/load.c b/libguile/load.c
index 9e54dfab9..e2c0dbf99 100644
--- a/libguile/load.c
+++ b/libguile/load.c
@@ -85,6 +85,7 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
#define FUNC_NAME s_scm_primitive_load
{
SCM hook = *scm_loc_load_hook;
+ char *encoding;
SCM_VALIDATE_STRING (1, filename);
if (scm_is_true (hook) && scm_is_false (scm_procedure_p (hook)))
SCM_MISC_ERROR ("value of %load-hook is neither a procedure nor #f",
@@ -97,7 +98,15 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
SCM port = scm_open_file (filename, scm_from_locale_string ("r"));
scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
scm_i_dynwind_current_load_port (port);
-
+ encoding = scm_scan_for_encoding (port);
+ if (encoding)
+ {
+ scm_i_set_port_encoding_x (port, encoding);
+ free (encoding);
+ }
+ else
+ /* The file has no encoding declaraed. We'll presume Latin-1. */
+ scm_i_set_port_encoding_x (port, NULL);
while (1)
{
SCM reader, form;
diff --git a/libguile/numbers.h b/libguile/numbers.h
index eaa57287a..9597afb8d 100644
--- a/libguile/numbers.h
+++ b/libguile/numbers.h
@@ -28,6 +28,11 @@
#include "libguile/__scm.h"
#include "libguile/print.h"
+#ifndef SCM_T_WCHAR_DEFINED
+typedef scm_t_int32 scm_t_wchar;
+#define SCM_T_WCHAR_DEFINED
+#endif /* SCM_T_WCHAR_DEFINED */
+
#if SCM_HAVE_FLOATINGPOINT_H
# include <floatingpoint.h>
#endif
@@ -174,7 +179,6 @@ typedef struct scm_t_complex
double imag;
} scm_t_complex;
-typedef scm_t_int32 scm_t_wchar;
diff --git a/libguile/ports.c b/libguile/ports.c
index 1ddeaa364..749d97533 100644
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -30,6 +30,9 @@
#include <errno.h>
#include <fcntl.h> /* for chsize on mingw */
#include <assert.h>
+#include <uniconv.h>
+#include <unistr.h>
+#include <striconveh.h>
#include "libguile/_scm.h"
#include "libguile/async.h"
@@ -51,6 +54,7 @@
#include "libguile/vectors.h"
#include "libguile/weaks.h"
#include "libguile/fluids.h"
+#include "libguile/eq.h"
#ifdef HAVE_STRING_H
#include <string.h>
@@ -358,10 +362,10 @@ SCM_DEFINE (scm_drain_input, "drain-input", 1, 0, 0,
/* Standard ports --- current input, output, error, and more(!). */
-static SCM cur_inport_fluid;
-static SCM cur_outport_fluid;
-static SCM cur_errport_fluid;
-static SCM cur_loadport_fluid;
+static SCM cur_inport_fluid = 0;
+static SCM cur_outport_fluid = 0;
+static SCM cur_errport_fluid = 0;
+static SCM cur_loadport_fluid = 0;
SCM_DEFINE (scm_current_input_port, "current-input-port", 0, 0, 0,
(),
@@ -370,7 +374,10 @@ SCM_DEFINE (scm_current_input_port, "current-input-port", 0, 0, 0,
"returns the @dfn{standard input} in Unix and C terminology.")
#define FUNC_NAME s_scm_current_input_port
{
- return scm_fluid_ref (cur_inport_fluid);
+ if (cur_inport_fluid)
+ return scm_fluid_ref (cur_inport_fluid);
+ else
+ return SCM_BOOL_F;
}
#undef FUNC_NAME
@@ -382,7 +389,10 @@ SCM_DEFINE (scm_current_output_port, "current-output-port", 0, 0, 0,
"Unix and C terminology.")
#define FUNC_NAME s_scm_current_output_port
{
- return scm_fluid_ref (cur_outport_fluid);
+ if (cur_outport_fluid)
+ return scm_fluid_ref (cur_outport_fluid);
+ else
+ return SCM_BOOL_F;
}
#undef FUNC_NAME
@@ -392,7 +402,10 @@ SCM_DEFINE (scm_current_error_port, "current-error-port", 0, 0, 0,
"@dfn{standard error} in Unix and C terminology).")
#define FUNC_NAME s_scm_current_error_port
{
- return scm_fluid_ref (cur_errport_fluid);
+ if (cur_errport_fluid)
+ return scm_fluid_ref (cur_errport_fluid);
+ else
+ return SCM_BOOL_F;
}
#undef FUNC_NAME
@@ -509,10 +522,18 @@ scm_new_port_table_entry (scm_t_bits tag)
SCM z = scm_cons (SCM_EOL, SCM_EOL);
scm_t_port *entry = (scm_t_port *) scm_gc_calloc (sizeof (scm_t_port), "port");
+ const char *enc;
entry->file_name = SCM_BOOL_F;
entry->rw_active = SCM_PORT_NEITHER;
entry->port = z;
+ /* Initialize this port with the thread's current default
+ encoding. */
+ if ((enc = scm_i_get_port_encoding (SCM_BOOL_F)) == NULL)
+ entry->encoding = NULL;
+ else
+ entry->encoding = strdup (enc);
+ entry->ilseq_handler = scm_i_get_conversion_strategy (SCM_BOOL_F);
SCM_SET_CELL_TYPE (z, tag);
SCM_SETPTAB_ENTRY (z, entry);
@@ -549,6 +570,11 @@ scm_i_remove_port (SCM port)
scm_t_port *p = SCM_PTAB_ENTRY (port);
if (p->putback_buf)
scm_gc_free (p->putback_buf, p->putback_buf_size, "putback buffer");
+ if (p->encoding)
+ {
+ free (p->encoding);
+ p->encoding = NULL;
+ }
scm_gc_free (p, sizeof (scm_t_port), "port");
SCM_SETPTAB_ENTRY (port, 0);
@@ -632,21 +658,22 @@ SCM_DEFINE (scm_set_port_revealed_x, "set-port-revealed!", 2, 0, 0,
*/
static long
-scm_i_mode_bits_n (const char *modes, size_t n)
+scm_i_mode_bits_n (SCM modes)
{
return (SCM_OPN
- | (memchr (modes, 'r', n) || memchr (modes, '+', n) ? SCM_RDNG : 0)
- | ( memchr (modes, 'w', n)
- || memchr (modes, 'a', n)
- || memchr (modes, '+', n) ? SCM_WRTNG : 0)
- | (memchr (modes, '0', n) ? SCM_BUF0 : 0)
- | (memchr (modes, 'l', n) ? SCM_BUFLINE : 0));
+ | (scm_i_string_contains_char (modes, 'r')
+ || scm_i_string_contains_char (modes, '+') ? SCM_RDNG : 0)
+ | (scm_i_string_contains_char (modes, 'w')
+ || scm_i_string_contains_char (modes, 'a')
+ || scm_i_string_contains_char (modes, '+') ? SCM_WRTNG : 0)
+ | (scm_i_string_contains_char (modes, '0') ? SCM_BUF0 : 0)
+ | (scm_i_string_contains_char (modes, 'l') ? SCM_BUFLINE : 0));
}
long
scm_mode_bits (char *modes)
{
- return scm_i_mode_bits_n (modes, strlen (modes));
+ return scm_i_mode_bits (scm_from_locale_string (modes));
}
long
@@ -657,8 +684,7 @@ scm_i_mode_bits (SCM modes)
if (!scm_is_string (modes))
scm_wrong_type_arg_msg (NULL, 0, modes, "string");
- bits = scm_i_mode_bits_n (scm_i_string_chars (modes),
- scm_i_string_length (modes));
+ bits = scm_i_mode_bits_n (modes);
scm_remember_upto_here_1 (modes);
return bits;
}
@@ -929,7 +955,7 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
"characters are available, the end-of-file object is returned.")
#define FUNC_NAME s_scm_read_char
{
- int c;
+ scm_t_wchar c;
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
SCM_VALIDATE_OPINPORT (1, port);
@@ -940,6 +966,133 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
}
#undef FUNC_NAME
+#define SCM_MBCHAR_BUF_SIZE (4)
+
+/* Get one codepoint from a file, using the port's encoding. */
+scm_t_wchar
+scm_getc (SCM port)
+{
+ int c;
+ unsigned int bufcount = 0;
+ char buf[SCM_MBCHAR_BUF_SIZE];
+ scm_t_wchar codepoint = 0;
+ scm_t_uint32 *u32;
+ size_t u32len;
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+
+ c = scm_get_byte_or_eof (port);
+ if (c == EOF)
+ return (scm_t_wchar) EOF;
+
+ buf[0] = c;
+ bufcount++;
+
+ if (pt->encoding == NULL)
+ {
+ /* The encoding is Latin-1: bytes are characters. */
+ codepoint = buf[0];
+ goto success;
+ }
+
+ for (;;)
+ {
+ u32 = u32_conv_from_encoding (pt->encoding,
+ (enum iconv_ilseq_handler) pt->ilseq_handler,
+ buf, bufcount, NULL, NULL, &u32len);
+ if (u32 == NULL || u32len == 0)
+ {
+ if (errno == ENOMEM)
+ scm_memory_error ("Input decoding");
+
+ /* Otherwise errno is EILSEQ or EINVAL, so perhaps more
+ bytes are needed. Keep looping. */
+ }
+ else
+ {
+ /* Complete codepoint found. */
+ codepoint = u32[0];
+ free (u32);
+ goto success;
+ }
+
+ if (bufcount == SCM_MBCHAR_BUF_SIZE)
+ {
+ /* We've read several bytes and didn't find a good
+ codepoint. Give up. */
+ goto failure;
+ }
+
+ c = scm_get_byte_or_eof (port);
+
+ if (c == EOF)
+ {
+ /* EOF before a complete character was read. Push it all
+ back and return EOF. */
+ while (bufcount > 0)
+ {
+ /* FIXME: this will probably cause errors in the port column. */
+ scm_unget_byte (buf[bufcount-1], port);
+ bufcount --;
+ }
+ return EOF;
+ }
+
+ if (c == '\n')
+ {
+ /* It is always invalid to have EOL in the middle of a
+ multibyte character. */
+ scm_unget_byte ('\n', port);
+ goto failure;
+ }
+
+ buf[bufcount++] = c;
+ }
+
+ success:
+ switch (codepoint)
+ {
+ case '\a':
+ break;
+ case '\b':
+ SCM_DECCOL (port);
+ break;
+ case '\n':
+ SCM_INCLINE (port);
+ break;
+ case '\r':
+ SCM_ZEROCOL (port);
+ break;
+ case '\t':
+ SCM_TABCOL (port);
+ break;
+ default:
+ SCM_INCCOL (port);
+ break;
+ }
+
+ return codepoint;
+
+ failure:
+ {
+ char *err_buf;
+ SCM err_str = scm_i_make_string (bufcount, &err_buf);
+ memcpy (err_buf, buf, bufcount);
+
+ if (errno == EILSEQ)
+ scm_misc_error (NULL, "input encoding error for ~s: ~s",
+ scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
+ err_str));
+ else
+ scm_misc_error (NULL, "input encoding error (invalid) for ~s: ~s\n",
+ scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
+ err_str));
+ }
+
+ /* Never gets here. */
+ return 0;
+}
+
+
/* this should only be called when the read buffer is empty. it
tries to refill the read buffer. it returns the first char from
the port, which is either EOF or *(pt->read_pos). */
@@ -1027,7 +1180,7 @@ scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port)
stringbuf_write_mutex. So, one shouldn't use scm_lfwrite_substr
if the stringbuf write mutex may still be held elsewhere. */
buf = scm_to_stringn (scm_c_substring (str, start, end), &len,
- NULL, SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
+ pt->encoding, pt->ilseq_handler);
ptob->write (port, buf, len);
free (buf);
@@ -1056,7 +1209,7 @@ scm_lfwrite_str (SCM str, SCM port)
scm_end_input (port);
buf = scm_to_stringn (str, &len,
- NULL, SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
+ pt->encoding, pt->ilseq_handler);
ptob->write (port, buf, len);
free (buf);
@@ -1257,8 +1410,8 @@ scm_end_input (SCM port)
void
-scm_ungetc (int c, SCM port)
-#define FUNC_NAME "scm_ungetc"
+scm_unget_byte (int c, SCM port)
+#define FUNC_NAME "scm_unget_byte"
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
@@ -1317,6 +1470,25 @@ scm_ungetc (int c, SCM port)
if (pt->rw_random)
pt->rw_active = SCM_PORT_READ;
+}
+#undef FUNC_NAME
+
+void
+scm_ungetc (scm_t_wchar c, SCM port)
+#define FUNC_NAME "scm_ungetc"
+{
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ scm_t_wchar *wbuf;
+ SCM str = scm_i_make_wide_string (1, &wbuf);
+ char *buf;
+ size_t len;
+ int i;
+
+ wbuf[0] = c;
+ buf = scm_to_stringn (str, &len, pt->encoding, pt->ilseq_handler);
+
+ for (i = len - 1; i >= 0; i--)
+ scm_unget_byte (buf[i], port);
if (c == '\n')
{
@@ -1363,7 +1535,7 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
"to @code{read-char} would have hung.")
#define FUNC_NAME s_scm_peek_char
{
- int c, column;
+ scm_t_wchar c, column;
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
else
@@ -1409,13 +1581,17 @@ SCM_DEFINE (scm_unread_string, "unread-string", 2, 0, 0,
"@var{port} is not supplied, the current-input-port is used.")
#define FUNC_NAME s_scm_unread_string
{
+ int n;
SCM_VALIDATE_STRING (1, str);
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
else
SCM_VALIDATE_OPINPORT (2, port);
- scm_ungets (scm_i_string_chars (str), scm_i_string_length (str), port);
+ n = scm_i_string_length (str);
+
+ while (n--)
+ scm_ungetc (scm_i_string_ref (str, n), port);
return str;
}
@@ -1670,6 +1846,328 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0,
}
#undef FUNC_NAME
+/* The default port encoding for this locale. New ports will have this
+ encoding. If it is a string, that is the encoding. If it #f, it
+ is in the native (Latin-1) encoding. */
+SCM_GLOBAL_VARIABLE (scm_port_encoding_var, "%port-encoding");
+static int scm_port_encoding_init = 0;
+
+/* Return a C string representation of the current encoding. */
+const char *
+scm_i_get_port_encoding (SCM port)
+{
+ SCM encoding;
+
+ if (scm_is_false (port))
+ {
+ if (!scm_port_encoding_init)
+ return NULL;
+ else if (!scm_is_fluid (SCM_VARIABLE_REF (scm_port_encoding_var)))
+ return NULL;
+ else
+ {
+ encoding = scm_fluid_ref (SCM_VARIABLE_REF (scm_port_encoding_var));
+ if (!scm_is_string (encoding))
+ return NULL;
+ else
+ return scm_i_string_chars (encoding);
+ }
+ }
+ else
+ {
+ scm_t_port *pt;
+ pt = SCM_PTAB_ENTRY (port);
+ if (pt->encoding)
+ return pt->encoding;
+ else
+ return NULL;
+ }
+}
+
+/* Returns ENC is if is a recognized encoding. If it isn't, it tries
+ to find an alias of ENC that is valid. Otherwise, it returns
+ NULL. */
+static const char *
+find_valid_encoding (const char *enc)
+{
+ int isvalid = 0;
+ const char str[] = " ";
+ scm_t_uint32 *u32;
+ size_t u32len;
+
+ u32 = u32_conv_from_encoding (enc, iconveh_error, str, 1,
+ NULL, NULL, &u32len);
+ isvalid = (u32 != NULL);
+ free (u32);
+
+ if (isvalid)
+ return enc;
+
+ return NULL;
+}
+
+void
+scm_i_set_port_encoding_x (SCM port, const char *enc)
+{
+ const char *valid_enc;
+ scm_t_port *pt;
+
+ /* Null is shorthand for the native, Latin-1 encoding. */
+ if (enc == NULL)
+ valid_enc = NULL;
+ else
+ {
+ valid_enc = find_valid_encoding (enc);
+ if (valid_enc == NULL)
+ {
+ SCM err;
+ err = scm_from_locale_string (enc);
+ scm_misc_error (NULL, "invalid or unknown character encoding ~s",
+ scm_list_1 (err));
+ }
+ }
+
+ if (scm_is_false (port))
+ {
+ /* Set the default encoding for future ports. */
+ if (!scm_port_encoding_init
+ || !scm_is_fluid (SCM_VARIABLE_REF (scm_port_encoding_var)))
+ scm_misc_error (NULL, "tried to set port encoding fluid before it is initialized",
+ SCM_EOL);
+
+ if (valid_enc == NULL
+ || !strcmp (valid_enc, "ASCII")
+ || !strcmp (valid_enc, "ANSI_X3.4-1968")
+ || !strcmp (valid_enc, "ISO-8859-1"))
+ scm_fluid_set_x (SCM_VARIABLE_REF (scm_port_encoding_var), SCM_BOOL_F);
+ else
+ scm_fluid_set_x (SCM_VARIABLE_REF (scm_port_encoding_var),
+ scm_from_locale_string (valid_enc));
+ }
+ else
+ {
+ /* Set the character encoding for this port. */
+ pt = SCM_PTAB_ENTRY (port);
+ if (pt->encoding)
+ free (pt->encoding);
+ if (valid_enc == NULL)
+ pt->encoding = NULL;
+ else
+ pt->encoding = strdup (valid_enc);
+ }
+}
+
+SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
+ (SCM port),
+ "Returns, as a string, the character encoding that @var{port}\n"
+ "uses to interpret its input and output.\n")
+#define FUNC_NAME s_scm_port_encoding
+{
+ scm_t_port *pt;
+ const char *enc;
+
+ SCM_VALIDATE_PORT (1, port);
+
+ pt = SCM_PTAB_ENTRY (port);
+ enc = scm_i_get_port_encoding (port);
+ if (enc)
+ return scm_from_locale_string (pt->encoding);
+ else
+ return scm_from_locale_string ("NONE");
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_set_port_encoding_x, "set-port-encoding!", 2, 0, 0,
+ (SCM port, SCM enc),
+ "Sets the character encoding that will be used to interpret all\n"
+ "port I/O. New ports are created with the encoding\n"
+ "appropriate for the current locale if @code{setlocale} has \n"
+ "been called or ISO-8859-1 otherwise\n"
+ "and this procedure can be used to modify that encoding.\n")
+
+#define FUNC_NAME s_scm_set_port_encoding_x
+{
+ char *enc_str;
+ const char *valid_enc_str;
+
+ SCM_VALIDATE_PORT (1, port);
+ SCM_VALIDATE_STRING (2, enc);
+
+ enc_str = scm_to_locale_string (enc);
+ valid_enc_str = find_valid_encoding (enc_str);
+ if (valid_enc_str == NULL)
+ {
+ free (enc_str);
+ scm_misc_error (FUNC_NAME, "invalid or unknown character encoding ~s",
+ scm_list_1 (enc));
+ }
+ else
+ {
+ scm_i_set_port_encoding_x (port, valid_enc_str);
+ free (enc_str);
+ }
+ return SCM_UNSPECIFIED;
+}
+#undef FUNC_NAME
+
+
+/* This determines how conversions handle unconvertible characters. */
+SCM_GLOBAL_VARIABLE (scm_conversion_strategy, "%port-conversion-strategy");
+static int scm_conversion_strategy_init = 0;
+
+scm_t_string_failed_conversion_handler
+scm_i_get_conversion_strategy (SCM port)
+{
+ SCM encoding;
+
+ if (scm_is_false (port))
+ {
+ if (!scm_conversion_strategy_init
+ || !scm_is_fluid (SCM_VARIABLE_REF (scm_conversion_strategy)))
+ return SCM_FAILED_CONVERSION_QUESTION_MARK;
+ else
+ {
+ encoding = scm_fluid_ref (SCM_VARIABLE_REF (scm_conversion_strategy));
+ if (scm_is_false (encoding))
+ return SCM_FAILED_CONVERSION_QUESTION_MARK;
+ else
+ return (scm_t_string_failed_conversion_handler) scm_to_int (encoding);
+ }
+ }
+ else
+ {
+ scm_t_port *pt;
+ pt = SCM_PTAB_ENTRY (port);
+ return pt->ilseq_handler;
+ }
+
+}
+
+void
+scm_i_set_conversion_strategy_x (SCM port,
+ scm_t_string_failed_conversion_handler handler)
+{
+ SCM strategy;
+ scm_t_port *pt;
+
+ strategy = scm_from_int ((int) handler);
+
+ if (scm_is_false (port))
+ {
+ /* Set the default encoding for future ports. */
+ if (!scm_conversion_strategy
+ || !scm_is_fluid (SCM_VARIABLE_REF (scm_conversion_strategy)))
+ scm_misc_error (NULL, "tried to set conversion strategy fluid before it is initialized",
+ SCM_EOL);
+ scm_fluid_set_x (SCM_VARIABLE_REF (scm_conversion_strategy), strategy);
+ }
+ else
+ {
+ /* Set the character encoding for this port. */
+ pt = SCM_PTAB_ENTRY (port);
+ pt->ilseq_handler = handler;
+ }
+}
+
+SCM_DEFINE (scm_port_conversion_strategy, "port-conversion-strategy",
+ 1, 0, 0, (SCM port),
+ "Returns the behavior of the port when handling a character that\n"
+ "is not representable in the port's current encoding.\n"
+ "It returns the symbol @code{error} if unrepresentable characters\n"
+ "should cause exceptions, @code{substitute} if the port should\n"
+ "try to replace unrepresentable characters with question marks or\n"
+ "approximate characters, or @code{escape} if unrepresentable\n"
+ "characters should be converted to string escapes.\n"
+ "\n"
+ "If @var{port} is @code{#f}, then the current default behavior\n"
+ "will be returned. New ports will have this default behavior\n"
+ "when they are created.\n")
+#define FUNC_NAME s_scm_port_conversion_strategy
+{
+ scm_t_string_failed_conversion_handler h;
+
+ SCM_VALIDATE_OPPORT (1, port);
+
+ if (!scm_is_false (port))
+ {
+ SCM_VALIDATE_OPPORT (1, port);
+ }
+
+ h = scm_i_get_conversion_strategy (port);
+ if (h == SCM_FAILED_CONVERSION_ERROR)
+ return scm_from_locale_symbol ("error");
+ else if (h == SCM_FAILED_CONVERSION_QUESTION_MARK)
+ return scm_from_locale_symbol ("substitute");
+ else if (h == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
+ return scm_from_locale_symbol ("escape");
+ else
+ abort ();
+
+ /* Never gets here. */
+ return SCM_UNDEFINED;
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_set_port_conversion_strategy_x, "set-port-conversion-strategy!",
+ 2, 0, 0,
+ (SCM port, SCM sym),
+ "Sets the behavior of the interpreter when outputting a character\n"
+ "that is not representable in the port's current encoding.\n"
+ "@var{sym} can be either @code{'error}, @code{'substitute}, or\n"
+ "@code{'escape}. If it is @code{'error}, an error will be thrown\n"
+ "when an unconvertible character is encountered. If it is\n"
+ "@code{'substitute}, then unconvertible characters will \n"
+ "be replaced with approximate characters, or with question marks\n"
+ "if no approximately correct character is available.\n"
+ "If it is @code{'escape},\n"
+ "it will appear as a hex escape when output.\n"
+ "\n"
+ "If @var{port} is an open port, the conversion error behavior\n"
+ "is set for that port. If it is @code{#f}, it is set as the\n"
+ "default behavior for any future ports that get created in\n"
+ "this thread.\n")
+#define FUNC_NAME s_scm_set_port_conversion_strategy_x
+{
+ SCM err;
+ SCM qm;
+ SCM esc;
+
+ if (!scm_is_false (port))
+ {
+ SCM_VALIDATE_OPPORT (1, port);
+ }
+
+ err = scm_from_locale_symbol ("error");
+ if (scm_is_true (scm_eqv_p (sym, err)))
+ {
+ scm_i_set_conversion_strategy_x (port, SCM_FAILED_CONVERSION_ERROR);
+ return SCM_UNSPECIFIED;
+ }
+
+ qm = scm_from_locale_symbol ("substitute");
+ if (scm_is_true (scm_eqv_p (sym, qm)))
+ {
+ scm_i_set_conversion_strategy_x (port,
+ SCM_FAILED_CONVERSION_QUESTION_MARK);
+ return SCM_UNSPECIFIED;
+ }
+
+ esc = scm_from_locale_symbol ("escape");
+ if (scm_is_true (scm_eqv_p (sym, esc)))
+ {
+ scm_i_set_conversion_strategy_x (port,
+ SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
+ return SCM_UNSPECIFIED;
+ }
+
+ SCM_MISC_ERROR ("unknown conversion behavior ~s", scm_list_1 (sym));
+
+ return SCM_UNSPECIFIED;
+}
+#undef FUNC_NAME
+
+
+
void
scm_print_port_mode (SCM exp, SCM port)
{
@@ -1780,8 +2278,17 @@ scm_init_ports ()
cur_loadport_fluid = scm_permanent_object (scm_make_fluid ());
scm_i_port_weak_hash = scm_permanent_object (scm_make_weak_key_hash_table (SCM_I_MAKINUM(31)));
-
#include "libguile/ports.x"
+
+ SCM_VARIABLE_SET (scm_port_encoding_var, scm_make_fluid ());
+ scm_fluid_set_x (SCM_VARIABLE_REF (scm_port_encoding_var), SCM_BOOL_F);
+ scm_port_encoding_init = 1;
+
+ SCM_VARIABLE_SET (scm_conversion_strategy, scm_make_fluid ());
+ scm_fluid_set_x (SCM_VARIABLE_REF (scm_conversion_strategy),
+ scm_from_int ((int) SCM_FAILED_CONVERSION_QUESTION_MARK));
+ scm_conversion_strategy_init = 1;
+
}
/*
diff --git a/libguile/ports.h b/libguile/ports.h
index e5c0ffd84..c75f17d59 100644
--- a/libguile/ports.h
+++ b/libguile/ports.h
@@ -28,7 +28,7 @@
#include "libguile/print.h"
#include "libguile/struct.h"
#include "libguile/threads.h"
-
+#include "libguile/strings.h"
@@ -56,6 +56,10 @@ typedef struct
long line_number; /* debugging support. */
int column_number; /* debugging support. */
+ /* Character encoding support */
+ char *encoding;
+ scm_t_string_failed_conversion_handler ilseq_handler;
+
/* port buffers. the buffer(s) are set up for all ports.
in the case of string ports, the buffer is the string itself.
in the case of unbuffered file ports, the buffer is a
@@ -266,6 +270,7 @@ SCM_API SCM scm_eof_object_p (SCM x);
SCM_API SCM scm_force_output (SCM port);
SCM_API SCM scm_flush_all_ports (void);
SCM_API SCM scm_read_char (SCM port);
+SCM_API scm_t_wchar scm_getc (SCM port);
SCM_API size_t scm_c_read (SCM port, void *buffer, size_t size);
SCM_API void scm_c_write (SCM port, const void *buffer, size_t size);
SCM_API void scm_lfwrite (const char *ptr, size_t size, SCM port);
@@ -275,7 +280,8 @@ SCM_INTERNAL void scm_lfwrite_substr (SCM str, size_t start, size_t end,
SCM_API void scm_flush (SCM port);
SCM_API void scm_end_input (SCM port);
SCM_API int scm_fill_input (SCM port);
-SCM_API void scm_ungetc (int c, SCM port);
+SCM_INTERNAL void scm_unget_byte (int c, SCM port);
+SCM_API void scm_ungetc (scm_t_wchar c, SCM port);
SCM_API void scm_ungets (const char *s, int n, SCM port);
SCM_API SCM scm_peek_char (SCM port);
SCM_API SCM scm_unread_char (SCM cobj, SCM port);
@@ -288,6 +294,15 @@ SCM_API SCM scm_port_column (SCM port);
SCM_API SCM scm_set_port_column_x (SCM port, SCM line);
SCM_API SCM scm_port_filename (SCM port);
SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename);
+SCM_INTERNAL const char *scm_i_get_port_encoding (SCM port);
+SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str);
+SCM_API SCM scm_port_encoding (SCM port);
+SCM_API SCM scm_set_port_encoding_x (SCM port, SCM encoding);
+SCM_INTERNAL scm_t_string_failed_conversion_handler scm_i_get_conversion_strategy (SCM port);
+SCM_INTERNAL void scm_i_set_conversion_strategy_x (SCM port,
+ scm_t_string_failed_conversion_handler h);
+SCM_API SCM scm_port_conversion_strategy (SCM port);
+SCM_API SCM scm_set_port_conversion_strategy_x (SCM port, SCM behavior);
SCM_API int scm_port_print (SCM exp, SCM port, scm_print_state *);
SCM_API void scm_print_port_mode (SCM exp, SCM port);
SCM_API void scm_ports_prehistory (void);
@@ -295,7 +310,6 @@ SCM_API SCM scm_void_port (char * mode_str);
SCM_API SCM scm_sys_make_void_port (SCM mode);
SCM_INTERNAL void scm_init_ports (void);
-
#if SCM_ENABLE_DEPRECATED==1
SCM_API scm_t_port * scm_add_to_port_table (SCM port);
#endif
diff --git a/libguile/posix.c b/libguile/posix.c
index dafc5e996..09d53f212 100644
--- a/libguile/posix.c
+++ b/libguile/posix.c
@@ -25,6 +25,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
+#include <uniconv.h>
#include "libguile/_scm.h"
#include "libguile/dynwind.h"
@@ -1528,12 +1529,17 @@ SCM_DEFINE (scm_setlocale, "setlocale", 1, 1, 0,
"Otherwise the specified locale category is set to the string\n"
"@var{locale} and the new value is returned as a\n"
"system-dependent string. If @var{locale} is an empty string,\n"
- "the locale will be set using environment variables.")
+ "the locale will be set using environment variables.\n"
+ "\n"
+ "When the locale is changed, the character encoding of the new\n"
+ "locale (UTF-8, ISO-8859-1, etc.) is used for the current\n"
+ "input, output, and error ports\n")
#define FUNC_NAME s_scm_setlocale
{
int c_category;
char *clocale;
char *rv;
+ const char *enc;
scm_dynwind_begin (0);
@@ -1562,15 +1568,47 @@ SCM_DEFINE (scm_setlocale, "setlocale", 1, 1, 0,
SCM_SYSERROR;
}
- /* Recompute the standard SRFI-14 character sets in a locale-dependent
- (actually charset-dependent) way. */
- scm_srfi_14_compute_char_sets ();
+ enc = locale_charset ();
+ /* Set the default encoding for new ports. */
+ scm_i_set_port_encoding_x (SCM_BOOL_F, enc);
+ /* Set the encoding for the stdio ports. */
+ scm_i_set_port_encoding_x (scm_current_input_port (), enc);
+ scm_i_set_port_encoding_x (scm_current_output_port (), enc);
+ scm_i_set_port_encoding_x (scm_current_error_port (), enc);
scm_dynwind_end ();
return scm_from_locale_string (rv);
}
#undef FUNC_NAME
#endif /* HAVE_SETLOCALE */
+SCM_DEFINE (scm_setbinary, "setbinary", 0, 0, 0,
+ (void),
+ "Sets the encoding for the current input, output, and error\n"
+ "ports to ISO-8859-1. That character encoding allows\n"
+ "ports to operate on binary data.\n"
+ "\n"
+ "It also sets the default encoding for newly created ports\n"
+ "to ISO-8859-1.\n"
+ "\n"
+ "The previous default encoding for new ports is returned\n")
+#define FUNC_NAME s_scm_setbinary
+{
+ const char *enc = scm_i_get_port_encoding (SCM_BOOL_F);
+
+ /* Set the default encoding for new ports. */
+ scm_i_set_port_encoding_x (SCM_BOOL_F, NULL);
+ /* Set the encoding for the stdio ports. */
+ scm_i_set_port_encoding_x (scm_current_input_port (), NULL);
+ scm_i_set_port_encoding_x (scm_current_output_port (), NULL);
+ scm_i_set_port_encoding_x (scm_current_error_port (), NULL);
+
+ if (enc)
+ return scm_from_locale_string (enc);
+
+ return scm_from_locale_string ("ISO-8859-1");
+}
+#undef FUNC_NAME
+
#ifdef HAVE_MKNOD
SCM_DEFINE (scm_mknod, "mknod", 4, 0, 0,
diff --git a/libguile/posix.h b/libguile/posix.h
index 4d057643c..2d93300b8 100644
--- a/libguile/posix.h
+++ b/libguile/posix.h
@@ -74,6 +74,7 @@ SCM_API SCM scm_access (SCM path, SCM how);
SCM_API SCM scm_getpid (void);
SCM_API SCM scm_putenv (SCM str);
SCM_API SCM scm_setlocale (SCM category, SCM locale);
+SCM_API SCM scm_setbinary (void);
SCM_API SCM scm_mknod (SCM path, SCM type, SCM perms, SCM dev);
SCM_API SCM scm_nice (SCM incr);
SCM_API SCM scm_sync (void);
diff --git a/libguile/print.c b/libguile/print.c
index 07bff47fd..520a2d95c 100644
--- a/libguile/print.c
+++ b/libguile/print.c
@@ -463,20 +463,45 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
| UC_CATEGORY_MASK_S))
/* Print the character if is graphic character. */
{
- if (i<256)
- /* Character is graphic. Print it. */
- scm_putc (i, port);
+ scm_t_wchar *wbuf;
+ SCM wstr = scm_i_make_wide_string (1, &wbuf);
+ char *buf;
+ size_t len;
+ const char *enc;
+
+ enc = scm_i_get_port_encoding (port);
+ wbuf[0] = i;
+ if (enc == NULL && i <= 0xFF)
+ {
+ /* Character is graphic and Latin-1. Print it */
+ scm_lfwrite_str (wstr, port);
+ }
else
- /* Character is graphic but unrepresentable in
- this port's encoding. */
- scm_intprint (i, 8, port);
+ {
+ buf = u32_conv_to_encoding (enc,
+ iconveh_error,
+ (scm_t_uint32 *) wbuf,
+ 1,
+ NULL,
+ NULL, &len);
+ if (buf != NULL)
+ {
+ /* Character is graphic. Print it. */
+ scm_lfwrite_str (wstr, port);
+ free (buf);
+ }
+ else
+ /* Character is graphic but unrepresentable in
+ this port's encoding. */
+ scm_intprint (i, 8, port);
+ }
}
else
/* Character is a non-graphical character. */
scm_intprint (i, 8, port);
}
else
- scm_putc (i, port);
+ scm_i_charprint (i, port);
}
else if (SCM_IFLAGP (exp)
&& ((size_t) SCM_IFLAGNUM (exp) < (sizeof iflagnames / sizeof (char *))))
@@ -608,21 +633,32 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
SCM wstr = scm_i_make_wide_string (1, &wbuf);
char *buf;
size_t len;
-
- wbuf[0] = ch;
-
- buf = u32_conv_to_encoding ("ISO-8859-1",
- iconveh_error,
- (scm_t_uint32 *) wbuf,
- 1, NULL, NULL, &len);
- if (buf != NULL)
+
+ if (scm_i_get_port_encoding (port))
{
- /* Character is graphic and representable in
- this encoding. Print it. */
- scm_lfwrite_str (wstr, port);
- free (buf);
- printed = 1;
+ wstr = scm_i_make_wide_string (1, &wbuf);
+ wbuf[0] = ch;
+ buf = u32_conv_to_encoding (scm_i_get_port_encoding (port),
+ iconveh_error,
+ (scm_t_uint32 *) wbuf,
+ 1 ,
+ NULL,
+ NULL, &len);
+ if (buf != NULL)
+ {
+ /* Character is graphic and representable in
+ this encoding. Print it. */
+ scm_lfwrite_str (wstr, port);
+ free (buf);
+ printed = 1;
+ }
}
+ else
+ if (ch <= 0xFF)
+ {
+ scm_putc (ch, port);
+ printed = 1;
+ }
}
if (!printed)
@@ -835,7 +871,7 @@ scm_prin1 (SCM exp, SCM port, int writingp)
/* Print a character.
*/
void
-scm_i_charprint (scm_t_uint32 ch, SCM port)
+scm_i_charprint (scm_t_wchar ch, SCM port)
{
scm_t_wchar *wbuf;
SCM wstr = scm_i_make_wide_string (1, &wbuf);
@@ -1057,9 +1093,7 @@ SCM_DEFINE (scm_simple_format, "simple-format", 2, 0, 1,
SCM port, answer = SCM_UNSPECIFIED;
int fReturnString = 0;
int writingp;
- const char *start;
- const char *end;
- const char *p;
+ size_t start, p, end;
if (scm_is_eq (destination, SCM_BOOL_T))
{
@@ -1082,15 +1116,16 @@ SCM_DEFINE (scm_simple_format, "simple-format", 2, 0, 1,
SCM_VALIDATE_STRING (2, message);
SCM_VALIDATE_REST_ARGUMENT (args);
- start = scm_i_string_chars (message);
- end = start + scm_i_string_length (message);
+ p = 0;
+ start = 0;
+ end = scm_i_string_length (message);
for (p = start; p != end; ++p)
- if (*p == '~')
+ if (scm_i_string_ref (message, p) == '~')
{
if (++p == end)
break;
- switch (*p)
+ switch (scm_i_string_ref (message, p))
{
case 'A': case 'a':
writingp = 0;
@@ -1099,33 +1134,33 @@ SCM_DEFINE (scm_simple_format, "simple-format", 2, 0, 1,
writingp = 1;
break;
case '~':
- scm_lfwrite (start, p - start, port);
+ scm_lfwrite_substr (message, start, p, port);
start = p + 1;
continue;
case '%':
- scm_lfwrite (start, p - start - 1, port);
+ scm_lfwrite_substr (message, start, p - 1, port);
scm_newline (port);
start = p + 1;
continue;
default:
SCM_MISC_ERROR ("FORMAT: Unsupported format option ~~~A - use (ice-9 format) instead",
- scm_list_1 (SCM_MAKE_CHAR (*p)));
+ scm_list_1 (SCM_MAKE_CHAR (scm_i_string_ref (message, p))));
}
if (!scm_is_pair (args))
SCM_MISC_ERROR ("FORMAT: Missing argument for ~~~A",
- scm_list_1 (SCM_MAKE_CHAR (*p)));
+ scm_list_1 (SCM_MAKE_CHAR (scm_i_string_ref (message, p))));
- scm_lfwrite (start, p - start - 1, port);
+ scm_lfwrite_substr (message, start, p - 1, port);
/* we pass destination here */
scm_prin1 (SCM_CAR (args), destination, writingp);
args = SCM_CDR (args);
start = p + 1;
}
- scm_lfwrite (start, p - start, port);
+ scm_lfwrite_substr (message, start, p, port);
if (!scm_is_eq (args, SCM_EOL))
SCM_MISC_ERROR ("FORMAT: ~A superfluous arguments",
scm_list_1 (scm_length (args)));
diff --git a/libguile/print.h b/libguile/print.h
index 3e2333ddd..ae2aaef54 100644
--- a/libguile/print.h
+++ b/libguile/print.h
@@ -25,6 +25,7 @@
#include "libguile/__scm.h"
+#include "libguile/chars.h"
#include "libguile/options.h"
@@ -77,7 +78,7 @@ SCM_API SCM scm_print_options (SCM setting);
SCM_API SCM scm_make_print_state (void);
SCM_API void scm_free_print_state (SCM print_state);
SCM_INTERNAL SCM scm_i_port_with_print_state (SCM port, SCM print_state);
-SCM_INTERNAL void scm_i_charprint (scm_t_uint32 c, SCM port);
+SCM_INTERNAL void scm_i_charprint (scm_t_wchar c, SCM port);
SCM_API void scm_intprint (scm_t_intmax n, int radix, SCM port);
SCM_API void scm_uintprint (scm_t_uintmax n, int radix, SCM port);
SCM_API void scm_ipruk (char *hdr, SCM ptr, SCM port);
diff --git a/libguile/rdelim.c b/libguile/rdelim.c
index 04a0944f4..1f46e5bf0 100644
--- a/libguile/rdelim.c
+++ b/libguile/rdelim.c
@@ -59,12 +59,10 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 3, 0,
size_t j;
size_t cstart;
size_t cend;
- int c;
- const char *cdelims;
+ scm_t_wchar c;
size_t num_delims;
SCM_VALIDATE_STRING (1, delims);
- cdelims = scm_i_string_chars (delims);
num_delims = scm_i_string_length (delims);
SCM_VALIDATE_STRING (2, str);
@@ -83,7 +81,7 @@ SCM_DEFINE (scm_read_delimited_x, "%read-delimited!", 3, 3, 0,
c = scm_getc (port);
for (k = 0; k < num_delims; k++)
{
- if (cdelims[k] == c)
+ if (scm_i_string_ref (delims, k) == c)
{
if (scm_is_false (gobble))
scm_ungetc (c, port);
diff --git a/libguile/read.c b/libguile/read.c
index 8efac67af..8abdf07bf 100644
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -27,6 +27,8 @@
#include <stdio.h>
#include <ctype.h>
#include <string.h>
+#include <unistd.h>
+#include <unicase.h>
#include "libguile/_scm.h"
#include "libguile/bytevectors.h"
@@ -177,11 +179,6 @@ static SCM *scm_read_hash_procedures;
(((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
|| ((_chr) == 'd') || ((_chr) == 'l'))
-/* An inlinable version of `scm_c_downcase ()'. */
-#define CHAR_DOWNCASE(_chr) \
- (((_chr) <= UCHAR_MAX) ? tolower ((int) (_chr)) : (_chr))
-
-
/* Read an SCSH block comment. */
static inline SCM scm_read_scsh_block_comment (int chr, SCM port);
static SCM scm_read_commented_expression (int chr, SCM port);
@@ -189,41 +186,69 @@ static SCM scm_read_commented_expression (int chr, SCM port);
/* Read from PORT until a delimiter (e.g., a whitespace) is read. Return
zero if the whole token fits in BUF, non-zero otherwise. */
static inline int
-read_token (SCM port, char *buf, size_t buf_size, size_t *read)
+read_token (SCM port, SCM buf, size_t *read)
{
+ scm_t_wchar chr;
*read = 0;
- while (*read < buf_size)
+ buf = scm_i_string_start_writing (buf);
+ while (*read < scm_i_string_length (buf))
{
- int chr;
-
chr = scm_getc (port);
- chr = (SCM_CASE_INSENSITIVE_P ? CHAR_DOWNCASE (chr) : chr);
if (chr == EOF)
- return 0;
- else if (CHAR_IS_DELIMITER (chr))
{
- scm_ungetc (chr, port);
+ scm_i_string_stop_writing ();
return 0;
}
- else
+
+ chr = (SCM_CASE_INSENSITIVE_P ? uc_tolower (chr) : chr);
+
+ if (CHAR_IS_DELIMITER (chr))
{
- *buf = (char) chr;
- buf++, (*read)++;
+ scm_i_string_stop_writing ();
+ scm_ungetc (chr, port);
+ return 0;
}
+
+ scm_i_string_set_x (buf, *read, chr);
+ (*read)++;
}
+ scm_i_string_stop_writing ();
return 1;
}
+static SCM
+read_complete_token (SCM port, size_t *read)
+{
+ SCM buffer, str = SCM_EOL;
+ size_t len;
+ int overflow;
+
+ buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
+ overflow = read_token (port, buffer, read);
+ if (!overflow)
+ return scm_i_substring (buffer, 0, *read);
+
+ str = scm_string_copy (buffer);
+ do
+ {
+ overflow = read_token (port, buffer, &len);
+ str = scm_string_append (scm_list_2 (str, buffer));
+ *read += len;
+ }
+ while (overflow);
+
+ return scm_i_substring (str, 0, *read);
+}
/* Skip whitespace from PORT and return the first non-whitespace character
read. Raise an error on end-of-file. */
static int
flush_ws (SCM port, const char *eoferr)
{
- register int c;
+ register scm_t_wchar c;
while (1)
switch (c = scm_getc (port))
{
@@ -292,7 +317,7 @@ static SCM recsexpr (SCM obj, long line, int column, SCM filename);
static SCM
-scm_read_sexp (int chr, SCM port)
+scm_read_sexp (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_i_lreadparen"
{
register int c;
@@ -553,107 +578,52 @@ scm_read_string (int chr, SCM port)
static SCM
-scm_read_number (int chr, SCM port)
+scm_read_number (scm_t_wchar chr, SCM port)
{
- SCM result, str = SCM_EOL;
- char buffer[READER_BUFFER_SIZE];
+ SCM result;
+ SCM buffer;
size_t read;
- int overflow = 0;
scm_ungetc (chr, port);
- do
- {
- overflow = read_token (port, buffer, sizeof (buffer), &read);
-
- if ((overflow) || (scm_is_pair (str)))
- str = scm_cons (scm_from_locale_stringn (buffer, read), str);
- }
- while (overflow);
-
- if (scm_is_pair (str))
- {
- /* The slow path. */
-
- str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
- result = scm_string_to_number (str, SCM_UNDEFINED);
- if (!scm_is_true (result))
- /* Return a symbol instead of a number. */
- result = scm_string_to_symbol (str);
- }
- else
- {
- result = scm_c_locale_stringn_to_number (buffer, read, 10);
- if (!scm_is_true (result))
- /* Return a symbol instead of a number. */
- result = scm_from_locale_symboln (buffer, read);
- }
+ buffer = read_complete_token (port, &read);
+ result = scm_string_to_number (buffer, SCM_UNDEFINED);
+ if (!scm_is_true (result))
+ /* Return a symbol instead of a number. */
+ result = scm_string_to_symbol (buffer);
return result;
}
static SCM
-scm_read_mixed_case_symbol (int chr, SCM port)
+scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port)
{
- SCM result, str = SCM_EOL;
- int overflow = 0, ends_with_colon = 0;
- char buffer[READER_BUFFER_SIZE];
+ SCM result;
+ int ends_with_colon = 0;
+ SCM buffer;
size_t read = 0;
int postfix = scm_is_eq (SCM_PACK (SCM_KEYWORD_STYLE), scm_keyword_postfix);
scm_ungetc (chr, port);
- do
- {
- overflow = read_token (port, buffer, sizeof (buffer), &read);
-
- if (read > 0)
- ends_with_colon = (buffer[read - 1] == ':');
+ buffer = read_complete_token (port, &read);
+ if (read > 0)
+ ends_with_colon = scm_i_string_ref (buffer, read - 1) == ':';
- if ((overflow) || (scm_is_pair (str)))
- str = scm_cons (scm_from_locale_stringn (buffer, read), str);
- }
- while (overflow);
-
- if (scm_is_pair (str))
- {
- size_t len;
-
- str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
- len = scm_c_string_length (str);
-
- /* Per SRFI-88, `:' alone is an identifier, not a keyword. */
- if (postfix && ends_with_colon && (len > 1))
- {
- /* Strip off colon. */
- str = scm_c_substring (str, 0, len-1);
- result = scm_string_to_symbol (str);
- result = scm_symbol_to_keyword (result);
- }
- else
- result = scm_string_to_symbol (str);
- }
+ if (postfix && ends_with_colon && (read > 1))
+ result = scm_symbol_to_keyword (scm_string_to_symbol (scm_i_substring (buffer, 0, read - 1)));
else
- {
- /* For symbols smaller than `sizeof (buffer)', we don't need to recur
- to Scheme strings. Therefore, we only create one Scheme object (a
- symbol) per symbol read. */
- if (postfix && ends_with_colon && (read > 1))
- result = scm_from_locale_keywordn (buffer, read - 1);
- else
- result = scm_from_locale_symboln (buffer, read);
- }
+ result = scm_string_to_symbol (buffer);
return result;
}
static SCM
-scm_read_number_and_radix (int chr, SCM port)
+scm_read_number_and_radix (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
- SCM result, str = SCM_EOL;
+ SCM result;
size_t read;
- char buffer[READER_BUFFER_SIZE];
+ SCM buffer = scm_i_make_string (READER_BUFFER_SIZE, NULL);
unsigned int radix;
- int overflow = 0;
switch (chr)
{
@@ -683,22 +653,8 @@ scm_read_number_and_radix (int chr, SCM port)
radix = 10;
}
- do
- {
- overflow = read_token (port, buffer, sizeof (buffer), &read);
-
- if ((overflow) || (scm_is_pair (str)))
- str = scm_cons (scm_from_locale_stringn (buffer, read), str);
- }
- while (overflow);
-
- if (scm_is_pair (str))
- {
- str = scm_string_concatenate (scm_reverse_x (str, SCM_EOL));
- result = scm_string_to_number (str, scm_from_uint (radix));
- }
- else
- result = scm_c_locale_stringn_to_number (buffer, read, radix);
+ buffer = read_complete_token (port, &read);
+ result = scm_string_to_number (buffer, scm_from_uint (radix));
if (scm_is_true (result))
return result;
@@ -728,7 +684,7 @@ scm_read_quote (int chr, SCM port)
case ',':
{
- int c;
+ scm_t_wchar c;
c = scm_getc (port);
if ('@' == c)
@@ -827,7 +783,10 @@ scm_read_semicolon_comment (int chr, SCM port)
{
int c;
- for (c = scm_getc (port);
+ /* We use the get_byte here because there is no need to get the
+ locale correct with comment input. This presumes that newline
+ always represents itself no matter what the encoding is. */
+ for (c = scm_get_byte_or_eof (port);
(c != EOF) && (c != '\n');
c = scm_getc (port));
@@ -855,14 +814,19 @@ scm_read_boolean (int chr, SCM port)
}
static SCM
-scm_read_character (int chr, SCM port)
+scm_read_character (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
SCM ch;
- char charname[READER_CHAR_NAME_MAX_SIZE];
+ SCM charname = scm_i_make_string (READER_CHAR_NAME_MAX_SIZE, NULL);
size_t charname_len;
+ scm_t_wchar cp;
+ int overflow;
+
+ overflow = read_token (port, charname, &charname_len);
+ charname = scm_c_substring (charname, 0, charname_len);
- if (read_token (port, charname, sizeof (charname), &charname_len))
+ if (overflow)
goto char_error;
if (charname_len == 0)
@@ -877,28 +841,33 @@ scm_read_character (int chr, SCM port)
}
if (charname_len == 1)
- return SCM_MAKE_CHAR (charname[0]);
+ return SCM_MAKE_CHAR (scm_i_string_ref (charname, 0));
- if (*charname >= '0' && *charname < '8')
+ cp = scm_i_string_ref (charname, 0);
+ if (cp >= '0' && cp < '8')
{
/* Dirk:FIXME:: This type of character syntax is not R5RS
* compliant. Further, it should be verified that the constant
* does only consist of octal digits. Finally, it should be
* checked whether the resulting fixnum is in the range of
* characters. */
- SCM p = scm_c_locale_stringn_to_number (charname, charname_len, 8);
+ SCM p = scm_string_to_number (charname, scm_from_uint (8));
if (SCM_I_INUMP (p))
return SCM_MAKE_CHAR (SCM_I_INUM (p));
}
- ch = scm_i_charname_to_char (charname, charname_len);
+ /* The names of characters should never have non-Latin1
+ characters. */
+ if (scm_i_is_narrow_string (charname)
+ || scm_i_try_narrow_string (charname))
+ ch = scm_i_charname_to_char (scm_i_string_chars (charname),
+ charname_len);
if (scm_is_true (ch))
return ch;
char_error:
scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
- scm_list_1 (scm_from_locale_stringn (charname,
- charname_len)));
+ scm_list_1 (charname));
return SCM_UNSPECIFIED;
}
@@ -940,7 +909,7 @@ scm_read_srfi4_vector (int chr, SCM port)
}
static SCM
-scm_read_bytevector (int chr, SCM port)
+scm_read_bytevector (scm_t_wchar chr, SCM port)
{
chr = scm_getc (port);
if (chr != 'u')
@@ -964,7 +933,7 @@ scm_read_bytevector (int chr, SCM port)
}
static SCM
-scm_read_guile_bit_vector (int chr, SCM port)
+scm_read_guile_bit_vector (scm_t_wchar chr, SCM port)
{
/* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
terribly inefficient but who cares? */
@@ -984,13 +953,17 @@ scm_read_guile_bit_vector (int chr, SCM port)
}
static inline SCM
-scm_read_scsh_block_comment (int chr, SCM port)
+scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
{
int bang_seen = 0;
+ /* We can use the get_byte here because there is no need to get the
+ locale correct when reading comments. This presumes that
+ hash and exclamation points always represent themselves no
+ matter what the source encoding is.*/
for (;;)
{
- int c = scm_getc (port);
+ int c = scm_get_byte_or_eof (port);
if (c == EOF)
scm_i_input_error ("skip_block_comment", port,
@@ -1008,9 +981,9 @@ scm_read_scsh_block_comment (int chr, SCM port)
}
static SCM
-scm_read_commented_expression (int chr, SCM port)
+scm_read_commented_expression (scm_t_wchar chr, SCM port)
{
- int c;
+ scm_t_wchar c;
c = flush_ws (port, (char *) NULL);
if (EOF == c)
@@ -1022,19 +995,18 @@ scm_read_commented_expression (int chr, SCM port)
}
static SCM
-scm_read_extended_symbol (int chr, SCM port)
+scm_read_extended_symbol (scm_t_wchar chr, SCM port)
{
/* Guile's extended symbol read syntax looks like this:
#{This is all a symbol name}#
So here, CHR is expected to be `{'. */
- SCM result;
int saw_brace = 0, finished = 0;
size_t len = 0;
- char buf[1024];
+ SCM buf = scm_i_make_string (1024, NULL);
- result = scm_c_make_string (0, SCM_MAKE_CHAR ('X'));
+ buf = scm_i_string_start_writing (buf);
while ((chr = scm_getc (port)) != EOF)
{
@@ -1048,32 +1020,30 @@ scm_read_extended_symbol (int chr, SCM port)
else
{
saw_brace = 0;
- buf[len++] = '}';
- buf[len++] = chr;
+ scm_i_string_set_x (buf, len++, '}');
+ scm_i_string_set_x (buf, len++, chr);
}
}
else if (chr == '}')
saw_brace = 1;
else
- buf[len++] = chr;
+ scm_i_string_set_x (buf, len++, chr);
- if (len >= sizeof (buf) - 2)
+ if (len >= scm_i_string_length (buf) - 2)
{
- scm_string_append (scm_list_2 (result,
- scm_from_locale_stringn (buf, len)));
+ scm_i_string_stop_writing ();
+ SCM addy = scm_i_make_string (1024, NULL);
+ buf = scm_string_append (scm_list_2 (buf, addy));
len = 0;
+ buf = scm_i_string_start_writing (buf);
}
if (finished)
break;
}
+ scm_i_string_stop_writing ();
- if (len)
- result = scm_string_append (scm_list_2
- (result,
- scm_from_locale_stringn (buf, len)));
-
- return (scm_string_to_symbol (result));
+ return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
}
@@ -1109,7 +1079,7 @@ scm_read_sharp_extension (int chr, SCM port)
/* The reader for the sharp `#' character. It basically dispatches reads
among the above token readers. */
static SCM
-scm_read_sharp (int chr, SCM port)
+scm_read_sharp (scm_t_wchar chr, SCM port)
#define FUNC_NAME "scm_lreadr"
{
SCM result;
@@ -1161,7 +1131,7 @@ scm_read_sharp (int chr, SCM port)
{
/* When next char is '(', it really is an old-style
uniform array. */
- int next_c = scm_getc (port);
+ scm_t_wchar next_c = scm_getc (port);
if (next_c != EOF)
scm_ungetc (next_c, port);
if (next_c == '(')
@@ -1209,7 +1179,7 @@ scm_read_expression (SCM port)
{
while (1)
{
- register int chr;
+ register scm_t_wchar chr;
chr = scm_getc (port);
@@ -1420,6 +1390,127 @@ scm_get_hash_procedure (int c)
}
}
+#define SCM_ENCODING_SEARCH_SIZE (500)
+
+/* Search the first few hundred characters of a file for
+ an emacs-like coding declaration. */
+char *
+scm_scan_for_encoding (SCM port)
+{
+ char header[SCM_ENCODING_SEARCH_SIZE+1];
+ size_t bytes_read;
+ char *encoding = NULL;
+ int utf8_bom = 0;
+ char *pos;
+ int i;
+ int in_comment;
+
+ bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
+ scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
+
+ if (bytes_read > 3
+ && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
+ utf8_bom = 1;
+
+ /* search past "coding[:=]" */
+ pos = header;
+ while (1)
+ {
+ if ((pos = strstr(pos, "coding")) == NULL)
+ return NULL;
+
+ pos += strlen("coding");
+ if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
+ (*pos == ':' || *pos == '='))
+ {
+ pos ++;
+ break;
+ }
+ }
+
+ /* skip spaces */
+ while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
+ (*pos == ' ' || *pos == '\t'))
+ pos ++;
+
+ /* grab the next token */
+ i = 0;
+ while (pos + i - header <= SCM_ENCODING_SEARCH_SIZE
+ && (isalnum(pos[i]) || pos[i] == '_' || pos[i] == '-' || pos[i] == '.'))
+ i++;
+
+ if (i == 0)
+ return NULL;
+
+ encoding = scm_malloc (i+1);
+ memcpy (encoding, pos, i);
+ encoding[i] ='\0';
+ for (i = 0; i < strlen(encoding); i++)
+ encoding[i] = toupper(encoding[i]);
+
+ /* push backwards to make sure we were in a comment */
+ in_comment = 0;
+ while (pos - i - header > 0)
+ {
+ if (*(pos - i) == '\n')
+ {
+ /* This wasn't in a semicolon comment. Check for a
+ hash-bang comment. */
+ char *beg = strstr (header, "#!");
+ char *end = strstr (header, "!#");
+ if (beg < pos && pos < end)
+ in_comment = 1;
+ break;
+ }
+ if (*(pos - i) == ';')
+ {
+ in_comment = 1;
+ break;
+ }
+ i ++;
+ }
+ if (!in_comment)
+ {
+ /* This wasn't in a comment */
+ free (encoding);
+ return NULL;
+ }
+ if (utf8_bom && strcmp(encoding, "UTF-8"))
+ scm_misc_error (NULL,
+ "the port input declares the encoding ~s but is encoded as UTF-8",
+ scm_list_1 (scm_from_locale_string (encoding)));
+
+ return encoding;
+}
+
+SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
+ (SCM port),
+ "Scans the port for an EMACS-like character coding declaration\n"
+ "near the top of the contents of a port with random-acessible contents.\n"
+ "The coding declaration is of the form\n"
+ "@code{coding: XXXXX} and must appear in a scheme comment.\n"
+ "\n"
+ "Returns a string containing the character encoding of the file\n"
+ "if a declaration was found, or @code{#f} otherwise.\n")
+#define FUNC_NAME s_scm_file_encoding
+{
+ char *enc;
+ SCM s_enc;
+
+ enc = scm_scan_for_encoding (port);
+ if (enc == NULL)
+ return SCM_BOOL_F;
+ else
+ {
+ s_enc = scm_from_locale_string (enc);
+ free (enc);
+ return s_enc;
+ }
+
+ return SCM_BOOL_F;
+}
+#undef FUNC_NAME
+
void
scm_init_read ()
{
diff --git a/libguile/read.h b/libguile/read.h
index 20d3f4bf7..7bc4a0ba4 100644
--- a/libguile/read.h
+++ b/libguile/read.h
@@ -56,6 +56,8 @@ SCM_API SCM scm_read_options (SCM setting);
SCM_API SCM scm_read (SCM port);
SCM_API size_t scm_read_token (int ic, SCM * tok_buf, SCM port, int weird);
SCM_API SCM scm_read_hash_extend (SCM chr, SCM proc);
+SCM_INTERNAL char *scm_scan_for_encoding (SCM port);
+SCM_API SCM scm_file_encoding (SCM port);
SCM_INTERNAL void scm_i_input_error (const char *func, SCM port,
const char *message, SCM arg)
diff --git a/libguile/strings.c b/libguile/strings.c
index 59487bd06..5f5e4c672 100644
--- a/libguile/strings.c
+++ b/libguile/strings.c
@@ -28,6 +28,8 @@
#include <unistr.h>
#include <uniconv.h>
+#include "striconveh.h"
+
#include "libguile/_scm.h"
#include "libguile/chars.h"
#include "libguile/root.h"
@@ -632,6 +634,37 @@ scm_i_string_ref (SCM str, size_t x)
return scm_i_string_wide_chars (str)[x];
}
+/* Returns index+1 of the first char in STR that matches C, or
+ 0 if the char is not found. */
+int
+scm_i_string_contains_char (SCM str, char ch)
+{
+ size_t i;
+ size_t len = scm_i_string_length (str);
+
+ i = 0;
+ if (scm_i_is_narrow_string (str))
+ {
+ while (i < len)
+ {
+ if (scm_i_string_chars (str)[i] == ch)
+ return i+1;
+ i++;
+ }
+ }
+ else
+ {
+ while (i < len)
+ {
+ if (scm_i_string_wide_chars (str)[i]
+ == (unsigned char) ch)
+ return i+1;
+ i++;
+ }
+ }
+ return 0;
+}
+
int
scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
{
@@ -1443,31 +1476,6 @@ scm_is_string (SCM obj)
return IS_STRING (obj);
}
-SCM
-scm_from_locale_stringn (const char *str, size_t len)
-{
- SCM res;
- char *dst;
-
- if (len == (size_t) -1)
- len = strlen (str);
- if (len == 0)
- return scm_nullstr;
-
- res = scm_i_make_string (len, &dst);
- memcpy (dst, str, len);
- return res;
-}
-
-SCM
-scm_from_locale_string (const char *str)
-{
- if (str == NULL)
- return scm_nullstr;
-
- return scm_from_locale_stringn (str, -1);
-}
-
static SCM
scm_from_stringn (const char *str, size_t len, const char *encoding,
scm_t_string_failed_conversion_handler handler)
@@ -1477,6 +1485,15 @@ scm_from_stringn (const char *str, size_t len, const char *encoding,
int wide = 0;
SCM res;
+ if (encoding == NULL)
+ {
+ /* If encoding is null, use Latin-1. */
+ char *buf;
+ res = scm_i_make_string (len, &buf);
+ memcpy (buf, str, len);
+ return res;
+ }
+
u32len = 0;
u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
(enum iconv_ilseq_handler)
@@ -1491,12 +1508,9 @@ scm_from_stringn (const char *str, size_t len, const char *encoding,
scm_memory_error ("locale string conversion");
else
{
- /* There are invalid sequences in the input string. Since
- it is partially nonsense, what is the best strategy for
- printing it in the error message? */
+ /* There are invalid sequences in the input string. */
SCM errstr;
char *dst;
- /* We'll just print it unconverted and hope for the best. */
errstr = scm_i_make_string (len, &dst);
memcpy (dst, str, len);
scm_misc_error (NULL, "input locale conversion error from ~s: ~s",
@@ -1535,6 +1549,44 @@ scm_from_stringn (const char *str, size_t len, const char *encoding,
}
SCM
+scm_from_locale_stringn (const char *str, size_t len)
+{
+ const char *enc;
+ scm_t_string_failed_conversion_handler hndl;
+ SCM inport;
+ scm_t_port *pt;
+
+ if (len == (size_t) -1)
+ len = strlen (str);
+ if (len == 0)
+ return scm_nullstr;
+
+ inport = scm_current_input_port ();
+ if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
+ {
+ pt = SCM_PTAB_ENTRY (inport);
+ enc = pt->encoding;
+ hndl = pt->ilseq_handler;
+ }
+ else
+ {
+ enc = NULL;
+ hndl = SCM_FAILED_CONVERSION_ERROR;
+ }
+
+ return scm_from_stringn (str, len, enc, hndl);
+}
+
+SCM
+scm_from_locale_string (const char *str)
+{
+ if (str == NULL)
+ return scm_nullstr;
+
+ return scm_from_locale_stringn (str, -1);
+}
+
+SCM
scm_i_from_utf8_string (const scm_t_uint8 *str)
{
return scm_from_stringn ((const char *) str,
@@ -1630,13 +1682,22 @@ unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
char *
scm_to_locale_stringn (SCM str, size_t * lenp)
{
+ SCM outport;
+ scm_t_port *pt;
const char *enc;
- /* In the future, enc will hold the port's encoding. */
- enc = NULL;
+ outport = scm_current_output_port ();
+ if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
+ {
+ pt = SCM_PTAB_ENTRY (outport);
+ enc = pt->encoding;
+ }
+ else
+ enc = NULL;
- return scm_to_stringn (str, lenp, enc,
- SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
+ return scm_to_stringn (str, lenp,
+ enc,
+ scm_i_get_conversion_strategy (SCM_BOOL_F));
}
/* Low-level scheme to C string conversion function. */
@@ -1646,6 +1707,8 @@ scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
{
char *buf;
size_t ilen, len, i;
+ int ret;
+ const char *enc;
if (!scm_is_string (str))
scm_wrong_type_arg_msg (NULL, 0, str, "string");
@@ -1667,8 +1730,10 @@ scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
"string contains #\\nul character: ~S",
scm_list_1 (str));
- if (scm_i_is_narrow_string (str))
+ if (scm_i_is_narrow_string (str) && (encoding == NULL))
{
+ /* If using native Latin-1 encoding, just copy the string
+ contents. */
if (lenp)
{
buf = scm_malloc (ilen);
@@ -1688,17 +1753,41 @@ scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
buf = NULL;
len = 0;
- buf = u32_conv_to_encoding (encoding ? encoding : "ISO-8859-1",
- (enum iconv_ilseq_handler) handler,
- (scm_t_uint32 *) scm_i_string_wide_chars (str),
- ilen, NULL, NULL, &len);
- if (buf == NULL)
- scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
- scm_list_2 (scm_from_locale_string (encoding), str));
+ enc = encoding;
+ if (enc == NULL)
+ enc = "ISO-8859-1";
+ if (scm_i_is_narrow_string (str))
+ {
+ ret = mem_iconveh (scm_i_string_chars (str), ilen,
+ "ISO-8859-1", enc,
+ (enum iconv_ilseq_handler) handler, NULL,
+ &buf, &len);
- if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
- unistring_escapes_to_guile_escapes (&buf, &len);
+ if (ret == 0 && handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
+ unistring_escapes_to_guile_escapes (&buf, &len);
+ if (ret != 0)
+ {
+ scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
+ scm_list_2 (scm_from_locale_string (enc),
+ str));
+ }
+ }
+ else
+ {
+ buf = u32_conv_to_encoding (enc,
+ (enum iconv_ilseq_handler) handler,
+ (scm_t_uint32 *) scm_i_string_wide_chars (str),
+ ilen,
+ NULL,
+ NULL, &len);
+ if (buf == NULL)
+ {
+ scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
+ scm_list_2 (scm_from_locale_string (enc),
+ str));
+ }
+ }
if (lenp)
*lenp = len;
else
diff --git a/libguile/strings.h b/libguile/strings.h
index 20726a3e7..2393aae91 100644
--- a/libguile/strings.h
+++ b/libguile/strings.h
@@ -154,6 +154,7 @@ SCM_INTERNAL SCM scm_i_string_start_writing (SCM str);
SCM_INTERNAL void scm_i_string_stop_writing (void);
SCM_INTERNAL int scm_i_is_narrow_string (SCM str);
SCM_INTERNAL scm_t_wchar scm_i_string_ref (SCM str, size_t x);
+SCM_INTERNAL int scm_i_string_contains_char (SCM str, char c);
SCM_INTERNAL int scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr);
SCM_INTERNAL void scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr);
/* internal functions related to symbols. */
diff --git a/libguile/strports.c b/libguile/strports.c
index 5c67bf9a8..ed6275b50 100644
--- a/libguile/strports.c
+++ b/libguile/strports.c
@@ -39,6 +39,7 @@
#include "libguile/modules.h"
#include "libguile/validate.h"
#include "libguile/deprecation.h"
+#include "libguile/srfi-4.h"
#include "libguile/strports.h"
@@ -289,42 +290,33 @@ st_truncate (SCM port, scm_t_off length)
}
SCM
-scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
+scm_i_mkstrport (SCM pos, const char *locale_str, size_t str_len, long modes, const char *caller)
{
- SCM z;
+ SCM z, str;
scm_t_port *pt;
- size_t str_len, c_pos;
-
- SCM_ASSERT (scm_is_string (str), str, SCM_ARG1, caller);
+ size_t c_pos;
+ char *buf;
+
+ /* Because ports are inherently 8-bit, strings need to be converted
+ to a locale representation for storage. But, since string ports
+ rely on string functionality for their memory management, we need
+ to create a new string that has the 8-bit locale representation
+ of the underlying string. This violates the guideline that the
+ internal encoding of characters in strings is in unicode
+ codepoints. */
+ str = scm_i_make_string (str_len, &buf);
+ memcpy (buf, locale_str, str_len);
- str_len = scm_i_string_length (str);
c_pos = scm_to_unsigned_integer (pos, 0, str_len);
if (!((modes & SCM_WRTNG) || (modes & SCM_RDNG)))
scm_misc_error ("scm_mkstrport", "port must read or write", SCM_EOL);
- /* XXX
-
- Make a new string to isolate us from changes to the original.
- This is done so that we can rely on scm_i_string_chars to stay in
- place even across SCM_TICKs.
-
- Additionally, when we are going to write to the string, we make a
- copy so that we can write to it without having to use
- scm_i_string_writable_chars.
- */
-
- if (modes & SCM_WRTNG)
- str = scm_c_substring_copy (str, 0, str_len);
- else
- str = scm_c_substring (str, 0, str_len);
-
scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
z = scm_new_port_table_entry (scm_tc16_strport);
pt = SCM_PTAB_ENTRY(z);
SCM_SETSTREAM (z, SCM_UNPACK (str));
SCM_SET_CELL_TYPE(z, scm_tc16_strport|modes);
- /* see above why we can use scm_i_string_chars here. */
pt->write_buf = pt->read_buf = (unsigned char *) scm_i_string_chars (str);
pt->read_pos = pt->write_pos = pt->read_buf + c_pos;
pt->write_buf_size = pt->read_buf_size = str_len;
@@ -340,22 +332,60 @@ scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
return z;
}
+SCM
+scm_mkstrport (SCM pos, SCM str, long modes, const char *caller)
+{
+ SCM z;
+ size_t str_len;
+ char *buf;
+
+ SCM_ASSERT (scm_is_string (str), str, SCM_ARG1, caller);
+
+ /* Because ports are inherently 8-bit, strings need to be converted
+ to a locale representation for storage. But, since string ports
+ rely on string functionality for their memory management, we need
+ to create a new string that has the 8-bit locale representation
+ of the underlying string. This violates the guideline that the
+ internal encoding of characters in strings is in unicode
+ codepoints. */
+ buf = scm_to_locale_stringn (str, &str_len);
+ z = scm_i_mkstrport (pos, buf, str_len, modes, caller);
+ free (buf);
+ return z;
+}
+
/* create a new string from a string port's buffer. */
SCM scm_strport_to_string (SCM port)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
SCM str;
- char *dst;
if (pt->rw_active == SCM_PORT_WRITE)
st_flush (port);
- str = scm_i_make_string (pt->read_buf_size, &dst);
- memcpy (dst, (char *) pt->read_buf, pt->read_buf_size);
+ str = scm_from_locale_stringn ((char *)pt->read_buf, pt->read_buf_size);
scm_remember_upto_here_1 (port);
return str;
}
+/* Create a vector containing the locale representation of the string in the
+ port's buffer. */
+SCM scm_strport_to_locale_u8vector (SCM port)
+{
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ SCM vec;
+ char *buf;
+
+ if (pt->rw_active == SCM_PORT_WRITE)
+ st_flush (port);
+
+ buf = scm_malloc (pt->read_buf_size);
+ memcpy (buf, pt->read_buf, pt->read_buf_size);
+ vec = scm_take_u8vector ((unsigned char *) buf, pt->read_buf_size);
+ scm_remember_upto_here_1 (port);
+ return vec;
+}
+
SCM_DEFINE (scm_object_to_string, "object->string", 1, 1, 0,
(SCM obj, SCM printer),
"Return a Scheme string obtained by printing @var{obj}.\n"
@@ -380,6 +410,25 @@ SCM_DEFINE (scm_object_to_string, "object->string", 1, 1, 0,
}
#undef FUNC_NAME
+SCM_DEFINE (scm_call_with_output_locale_u8vector, "call-with-output-locale-u8vector", 1, 0, 0,
+ (SCM proc),
+ "Calls the one-argument procedure @var{proc} with a newly created output\n"
+ "port. When the function returns, a vector containing the bytes of a\n"
+ "locale representation of the characters written into the port is returned\n")
+#define FUNC_NAME s_scm_call_with_output_locale_u8vector
+{
+ SCM p;
+
+ p = scm_mkstrport (SCM_INUM0,
+ scm_make_string (SCM_INUM0, SCM_UNDEFINED),
+ SCM_OPN | SCM_WRTNG,
+ FUNC_NAME);
+ scm_call_1 (proc, p);
+
+ return scm_get_output_locale_u8vector (p);
+}
+#undef FUNC_NAME
+
SCM_DEFINE (scm_call_with_output_string, "call-with-output-string", 1, 0, 0,
(SCM proc),
"Calls the one-argument procedure @var{proc} with a newly created output\n"
@@ -424,6 +473,27 @@ SCM_DEFINE (scm_open_input_string, "open-input-string", 1, 0, 0,
}
#undef FUNC_NAME
+SCM_DEFINE (scm_open_input_locale_u8vector, "open-input-locale-u8vector", 1, 0, 0,
+ (SCM vec),
+ "Take a u8vector containing the bytes of a string encoded in the\n"
+ "current locale and return an input port that delivers characters\n"
+ "from the string. The port can be closed by\n"
+ "@code{close-input-port}, though its storage will be reclaimed\n"
+ "by the garbage collector if it becomes inaccessible.")
+#define FUNC_NAME s_scm_open_input_locale_u8vector
+{
+ scm_t_array_handle hnd;
+ ssize_t inc;
+ size_t len;
+ const scm_t_uint8 *buf;
+
+ buf = scm_u8vector_elements (vec, &hnd, &len, &inc);
+ SCM p = scm_i_mkstrport(SCM_INUM0, (const char *) buf, len, SCM_OPN | SCM_RDNG, FUNC_NAME);
+ scm_array_handle_release (&hnd);
+ return p;
+}
+#undef FUNC_NAME
+
SCM_DEFINE (scm_open_output_string, "open-output-string", 0, 0, 0,
(void),
"Return an output port that will accumulate characters for\n"
@@ -456,11 +526,26 @@ SCM_DEFINE (scm_get_output_string, "get-output-string", 1, 0, 0,
#undef FUNC_NAME
+SCM_DEFINE (scm_get_output_locale_u8vector, "get-output-locale-u8vector", 1, 0, 0,
+ (SCM port),
+ "Given an output port created by @code{open-output-string},\n"
+ "return a u8 vector containing the characters of the string\n"
+ "encoded in the current locale.")
+#define FUNC_NAME s_scm_get_output_locale_u8vector
+{
+ SCM_VALIDATE_OPOUTSTRPORT (1, port);
+ return scm_strport_to_locale_u8vector (port);
+}
+#undef FUNC_NAME
+
+
/* Given a null-terminated string EXPR containing a Scheme expression
read it, and return it as an SCM value. */
SCM
scm_c_read_string (const char *expr)
{
+ /* FIXME: the c string gets packed into a string, only to get
+ immediately unpacked in scm_mkstrport. */
SCM port = scm_mkstrport (SCM_INUM0,
scm_from_locale_string (expr),
SCM_OPN | SCM_RDNG,
diff --git a/libguile/strports.h b/libguile/strports.h
index 3129c03e2..b2ded01f1 100644
--- a/libguile/strports.h
+++ b/libguile/strports.h
@@ -44,13 +44,19 @@ SCM_API scm_t_bits scm_tc16_strport;
SCM_API SCM scm_mkstrport (SCM pos, SCM str, long modes, const char * caller);
+SCM_INTERNAL SCM scm_i_mkstrport (SCM pos, const char *locale_str, size_t str_len,
+ long modes, const char *caller);
SCM_API SCM scm_strport_to_string (SCM port);
+SCM_API SCM scm_strport_to_locale_u8vector (SCM port);
SCM_API SCM scm_object_to_string (SCM obj, SCM printer);
SCM_API SCM scm_call_with_output_string (SCM proc);
+SCM_API SCM scm_call_with_output_locale_u8vector (SCM proc);
SCM_API SCM scm_call_with_input_string (SCM str, SCM proc);
SCM_API SCM scm_open_input_string (SCM str);
+SCM_API SCM scm_open_input_locale_u8vector (SCM str);
SCM_API SCM scm_open_output_string (void);
SCM_API SCM scm_get_output_string (SCM port);
+SCM_API SCM scm_get_output_locale_u8vector (SCM port);
SCM_API SCM scm_c_read_string (const char *expr);
SCM_API SCM scm_c_eval_string (const char *expr);
SCM_API SCM scm_c_eval_string_in_module (const char *expr, SCM module);
diff --git a/module/system/base/compile.scm b/module/system/base/compile.scm
index 8470f39e2..26dd29e20 100644
--- a/module/system/base/compile.scm
+++ b/module/system/base/compile.scm
@@ -145,8 +145,11 @@
(from (current-language))
(to 'objcode)
(opts '()))
- (let ((comp (or output-file (compiled-file-name file)))
- (in (open-input-file file)))
+ (let* ((comp (or output-file (compiled-file-name file)))
+ (in (open-input-file file))
+ (enc (file-encoding in)))
+ (if enc
+ (set-port-encoding! in enc))
(ensure-writable-dir (dirname comp))
(call-with-output-file/atomic comp
(lambda (port)
diff --git a/test-suite/tests/encoding-escapes.test b/test-suite/tests/encoding-escapes.test
new file mode 100644
index 000000000..8859d2e83
--- /dev/null
+++ b/test-suite/tests/encoding-escapes.test
@@ -0,0 +1,139 @@
+;;;; encoding-escapes.test --- test suite for Guile's string encodings -*- mode: scheme; coding: utf-8 -*-
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;;
+;;;; This program is free software; you can redistribute it and/or modify
+;;;; it under the terms of the GNU General Public License as published by
+;;;; the Free Software Foundation; either version 2, or (at your option)
+;;;; any later version.
+;;;;
+;;;; This program is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;;; GNU General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU General Public License
+;;;; along with this software; see the file COPYING. If not, write to
+;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;;;; Boston, MA 02110-1301 USA
+
+(define-module (test-strings)
+ #:use-module (test-suite lib)
+ #:use-module (srfi srfi-1))
+
+(define exception:conversion
+ (cons 'misc-error "^cannot convert to output locale"))
+
+;; Create a string from integer char values, eg. (string-ints 65) => "A"
+(define (string-ints . args)
+ (apply string (map integer->char args)))
+
+(define s1 "última")
+(define s2 "cédula")
+(define s3 "años")
+(define s4 "羅生門")
+
+(with-test-prefix "internal encoding"
+
+ (pass-if "ultima"
+ (string=? s1 (string-ints #xfa #x6c #x74 #x69 #x6d #x61)))
+
+ (pass-if "cedula"
+ (string=? s2 (string-ints #x63 #xe9 #x64 #x75 #x6c #x61)))
+
+ (pass-if "anos"
+ (string=? s3 (string-ints #x61 #xf1 #x6f #x73)))
+
+ (pass-if "Rashomon"
+ (string=? s4 (string-ints #x7f85 #x751f #x9580))))
+
+(with-test-prefix "chars"
+
+ (pass-if "ultima"
+ (list= eqv? (string->list s1)
+ (list #\372 #\l #\t #\i #\m #\a)))
+
+ (pass-if "cedula"
+ (list= eqv? (string->list s2)
+ (list #\c #\351 #\d #\u #\l #\a)))
+
+ (pass-if "anos"
+ (list= eqv? (string->list s3)
+ (list #\a #\361 #\o #\s)))
+
+ (pass-if "Rashomon"
+ (list= eqv? (string->list s4)
+ (list #\77605 #\72437 #\112600))))
+
+
+;; Check that an error is flagged on display output when the output
+;; error strategy is 'error
+
+(with-test-prefix "display output errors"
+
+ (pass-if-exception "ultima"
+ exception:conversion
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'error)
+ (display s1 pt)))
+
+ (pass-if-exception "Rashomon"
+ exception:conversion
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'error)
+ (display s4 pt))))
+
+;; Check that questions marks or substitutions appear when the conversion
+;; mode is substitute
+(with-test-prefix "display output substitutions"
+
+ (pass-if "ultima"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'substitute)
+ (display s1 pt)
+ (string=? "?ltima"
+ (get-output-string pt))))
+
+ (pass-if "Rashomon"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'substitute)
+ (display s4 pt)
+ (string=? "???"
+ (get-output-string pt)))))
+
+
+;; Check that hex escapes appear in the write output and that no error
+;; is thrown. The output error strategy should be irrelevant here.
+(with-test-prefix "display output escapes"
+
+ (pass-if "ultima"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'escape)
+ (display s1 pt)
+ (string=? "\\xfaltima"
+ (get-output-string pt))))
+ (pass-if "Rashomon"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ASCII")
+ (set-port-conversion-strategy! pt 'escape)
+ (display s4 pt)
+ (string=? "\\u7F85\\u751F\\u9580"
+ (get-output-string pt)))))
+
+(setlocale LC_ALL "en_US.utf8")
+
+(with-test-prefix "input escapes"
+
+ (pass-if "última"
+ (string=? "última"
+ (with-input-from-string "\"\\xfaltima\"" read)))
+
+ (pass-if "羅生門"
+ (string=? "羅生門"
+ (with-input-from-string "\"\\u7F85\\u751F\\u9580\"" read))))
+
diff --git a/test-suite/tests/encoding-iso88591.test b/test-suite/tests/encoding-iso88591.test
new file mode 100644
index 000000000..edd573432
--- /dev/null
+++ b/test-suite/tests/encoding-iso88591.test
@@ -0,0 +1,135 @@
+;;;; strings.test --- test suite for Guile's string functions -*- mode: scheme; coding: iso-8859-1 -*-
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;;
+;;;; This program is free software; you can redistribute it and/or modify
+;;;; it under the terms of the GNU General Public License as published by
+;;;; the Free Software Foundation; either version 2, or (at your option)
+;;;; any later version.
+;;;;
+;;;; This program is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;;; GNU General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU General Public License
+;;;; along with this software; see the file COPYING. If not, write to
+;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;;;; Boston, MA 02110-1301 USA
+
+(define-module (test-strings)
+ #:use-module (test-suite lib)
+ #:use-module (srfi srfi-1))
+
+(define exception:conversion
+ (cons 'misc-error "^cannot convert to output locale"))
+
+;; Create a string from integer char values, eg. (string-ints 65) => "A"
+(define (string-ints . args)
+ (apply string (map integer->char args)))
+
+(setlocale LC_ALL "")
+
+(define s1 "ltima")
+(define s2 "cdula")
+(define s3 "aos")
+(define s4 "Cmo?")
+
+(with-test-prefix "string length"
+
+ (pass-if "ltima"
+ (eq? (string-length s1) 6))
+
+ (pass-if "cdula"
+ (eq? (string-length s2) 6))
+
+ (pass-if "aos"
+ (eq? (string-length s3) 4))
+
+ (pass-if "Cmo?"
+ (eq? (string-length s4) 6)))
+
+(with-test-prefix "internal encoding"
+
+ (pass-if "ltima"
+ (string=? s1 (string-ints #xfa #x6c #x74 #x69 #x6d #x61)))
+
+ (pass-if "cdula"
+ (string=? s2 (string-ints #x63 #xe9 #x64 #x75 #x6c #x61)))
+
+ (pass-if "aos"
+ (string=? s3 (string-ints #x61 #xf1 #x6f #x73)))
+
+ (pass-if "Cmo?"
+ (string=? s4 (string-ints #xbf #x43 #xf3 #x6d #x6f #x3f))))
+
+(with-test-prefix "chars"
+
+ (pass-if "ltima"
+ (list= eqv? (string->list s1)
+ (list #\ #\l #\t #\i #\m #\a)))
+
+ (pass-if "cdula"
+ (list= eqv? (string->list s2)
+ (list #\c #\ #\d #\u #\l #\a)))
+
+ (pass-if "aos"
+ (list= eqv? (string->list s3)
+ (list #\a #\ #\o #\s)))
+
+ (pass-if "Cmo?"
+ (list= eqv? (string->list s4)
+ (list #\ #\C #\ #\m #\o #\?))))
+
+;; Check that the output is in ISO-8859-1 encoding
+(with-test-prefix "display"
+
+ (pass-if "s1"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-1")
+ (display s1 pt)
+ (list= eqv?
+ (list #xfa #x6c #x74 #x69 #x6d #x61)
+ (u8vector->list
+ (get-output-locale-u8vector pt)))))
+
+ (pass-if "s2"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-1")
+ (display s2 pt)
+ (list= eqv?
+ (list #x63 #xe9 #x64 #x75 #x6c #x61)
+ (u8vector->list
+ (get-output-locale-u8vector pt))))))
+
+(with-test-prefix "symbols == strings"
+
+ (pass-if "ltima"
+ (eq? (string->symbol s1) 'ltima))
+
+ (pass-if "cdula"
+ (eq? (string->symbol s2) 'cdula))
+
+ (pass-if "aos"
+ (eq? (string->symbol s3) 'aos))
+
+ (pass-if "Cmo?"
+ (eq? (string->symbol s4) 'Cmo?)))
+
+(with-test-prefix "non-ascii variable names"
+
+ (pass-if "1"
+ (let (( 1)
+ ( 2))
+ (eq? (+ ) 3))))
+
+(with-test-prefix "output errors"
+
+ (pass-if-exception "char 256" exception:conversion
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-1")
+ (set-port-conversion-strategy! pt 'error)
+ (display (string-ints 256) pt))))
+
+;; Reset locales
+(setlocale LC_ALL "C") \ No newline at end of file
diff --git a/test-suite/tests/encoding-iso88597.test b/test-suite/tests/encoding-iso88597.test
new file mode 100644
index 000000000..8985042f4
--- /dev/null
+++ b/test-suite/tests/encoding-iso88597.test
@@ -0,0 +1,136 @@
+;;;; strings.test --- test suite for Guile's string functions -*- mode: scheme; coding: iso-8859-7 -*-
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;;
+;;;; This program is free software; you can redistribute it and/or modify
+;;;; it under the terms of the GNU General Public License as published by
+;;;; the Free Software Foundation; either version 2, or (at your option)
+;;;; any later version.
+;;;;
+;;;; This program is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;;; GNU General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU General Public License
+;;;; along with this software; see the file COPYING. If not, write to
+;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;;;; Boston, MA 02110-1301 USA
+
+(define-module (test-strings)
+ #:use-module (test-suite lib)
+ #:use-module (srfi srfi-1))
+
+(define exception:conversion
+ (cons 'misc-error "^cannot convert to output locale"))
+
+;; Create a string from integer char values, eg. (string-ints 65) => "A"
+(define (string-ints . args)
+ (apply string (map integer->char args)))
+
+(setlocale LC_ALL "")
+
+(define s1 "")
+(define s2 "")
+(define s3 "")
+(define s4 "")
+
+(with-test-prefix "string length"
+
+ (pass-if "s1"
+ (eq? (string-length s1) 4))
+
+ (pass-if "s2"
+ (eq? (string-length s2) 3))
+
+ (pass-if "s3"
+ (eq? (string-length s3) 8))
+
+ (pass-if "s4"
+ (eq? (string-length s4) 3)))
+
+(with-test-prefix "internal encoding"
+
+ (pass-if "s1"
+ (string=? s1 (string-ints #x03a0 #x03b5 #x03c1 #x03af)))
+
+ (pass-if "s2"
+ (string=? s2 (string-ints #x03c4 #x03b7 #x03c2)))
+
+ (pass-if "s3"
+ (string=? s3 (string-ints #x03ba #x03c1 #x03b9 #x03c4 #x03b9 #x03ba #x03ae #x03c2)))
+
+ (pass-if "s4"
+ (string=? s4 (string-ints #x03ba #x03b1 #x03b9))))
+
+(with-test-prefix "chars"
+
+ (pass-if "s1"
+ (list= eqv? (string->list s1)
+ (list #\ #\ #\ #\)))
+
+ (pass-if "s2"
+ (list= eqv? (string->list s2)
+ (list #\ #\ #\)))
+
+ (pass-if "s3"
+ (list= eqv? (string->list s3)
+ (list #\ #\ #\ #\ #\ #\ #\ #\)))
+
+ (pass-if "s4"
+ (list= eqv? (string->list s4)
+ (list #\ #\ #\))))
+
+;; Testing that the display of the string is output in the ISO-8859-7
+;; encoding
+(with-test-prefix "display"
+
+ (pass-if "s1"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-7")
+ (display s1 pt)
+ (list= eqv?
+ (list #xd0 #xe5 #xf1 #xdf)
+ (u8vector->list
+ (get-output-locale-u8vector pt)))))
+ (pass-if "s2"
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-7")
+ (display s2 pt)
+ (list= eqv?
+ (list #xf4 #xe7 #xf2)
+ (u8vector->list
+ (get-output-locale-u8vector pt))))))
+
+(with-test-prefix "symbols == strings"
+
+ (pass-if ""
+ (eq? (string->symbol s1) '))
+
+ (pass-if ""
+ (eq? (string->symbol s2) '))
+
+ (pass-if ""
+ (eq? (string->symbol s3) '))
+
+ (pass-if ""
+ (eq? (string->symbol s4) ')))
+
+(with-test-prefix "non-ascii variable names"
+
+ (pass-if "1"
+ (let (( 1)
+ ( 2))
+ (eq? (+ ) 3))))
+
+(with-test-prefix "output errors"
+
+ (pass-if-exception "char #x0400"
+ exception:conversion
+ (let ((pt (open-output-string)))
+ (set-port-encoding! pt "ISO-8859-7")
+ (set-port-conversion-strategy! pt 'error)
+ (display (string-ints #x0400) pt))))
+
+;; Reset locale
+(setlocale LC_ALL "C") \ No newline at end of file
diff --git a/test-suite/tests/encoding-utf8.test b/test-suite/tests/encoding-utf8.test
new file mode 100644
index 000000000..83e7540f3
--- /dev/null
+++ b/test-suite/tests/encoding-utf8.test
@@ -0,0 +1,105 @@
+;;;; strings.test --- test suite for Guile's string functions -*- mode: scheme; coding: utf-8 -*-
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;;
+;;;; This program is free software; you can redistribute it and/or modify
+;;;; it under the terms of the GNU General Public License as published by
+;;;; the Free Software Foundation; either version 2, or (at your option)
+;;;; any later version.
+;;;;
+;;;; This program is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;;; GNU General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU General Public License
+;;;; along with this software; see the file COPYING. If not, write to
+;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;;;; Boston, MA 02110-1301 USA
+
+(define-module (test-strings)
+ #:use-module (test-suite lib)
+ #:use-module (srfi srfi-1))
+
+(define exception:conversion
+ (cons 'misc-error "^cannot convert to output locale"))
+
+;; Create a string from integer char values, eg. (string-ints 65) => "A"
+(define (string-ints . args)
+ (apply string (map integer->char args)))
+
+(setlocale LC_ALL "")
+
+(define s1 "última")
+(define s2 "cédula")
+(define s3 "años")
+(define s4 "羅生門")
+
+(with-test-prefix "string length"
+
+ (pass-if "última"
+ (eq? (string-length s1) 6))
+
+ (pass-if "cédula"
+ (eq? (string-length s2) 6))
+
+ (pass-if "años"
+ (eq? (string-length s3) 4))
+
+ (pass-if "羅生門"
+ (eq? (string-length s4) 3)))
+
+(with-test-prefix "internal encoding"
+
+ (pass-if "última"
+ (string=? s1 (string-ints #xfa #x6c #x74 #x69 #x6d #x61)))
+
+ (pass-if "cédula"
+ (string=? s2 (string-ints #x63 #xe9 #x64 #x75 #x6c #x61)))
+
+ (pass-if "años"
+ (string=? s3 (string-ints #x61 #xf1 #x6f #x73)))
+
+ (pass-if "羅生門"
+ (string=? s4 (string-ints #x7f85 #x751f #x9580))))
+
+(with-test-prefix "chars"
+
+ (pass-if "última"
+ (list= eqv? (string->list s1)
+ (list #\ú #\l #\t #\i #\m #\a)))
+
+ (pass-if "cédula"
+ (list= eqv? (string->list s2)
+ (list #\c #\é #\d #\u #\l #\a)))
+
+ (pass-if "años"
+ (list= eqv? (string->list s3)
+ (list #\a #\ñ #\o #\s)))
+
+ (pass-if "羅生門"
+ (list= eqv? (string->list s4)
+ (list #\羅 #\生 #\門))))
+
+(with-test-prefix "symbols == strings"
+
+ (pass-if "última"
+ (eq? (string->symbol s1) 'última))
+
+ (pass-if "cédula"
+ (eq? (string->symbol s2) 'cédula))
+
+ (pass-if "años"
+ (eq? (string->symbol s3) 'años))
+
+ (pass-if "羅生門"
+ (eq? (string->symbol s4) '羅生門)))
+
+(with-test-prefix "non-ascii variable names"
+
+ (pass-if "1"
+ (let ((芥川龍之介 1)
+ (ñ 2))
+ (eq? (+ 芥川龍之介 ñ) 3))))
+
+
diff --git a/test-suite/tests/numbers.test b/test-suite/tests/numbers.test
index 4a9476a52..774e228a7 100644
--- a/test-suite/tests/numbers.test
+++ b/test-suite/tests/numbers.test
@@ -22,6 +22,7 @@
;;;
;;; miscellaneous
;;;
+(setbinary)
(define exception:numerical-overflow
(cons 'numerical-overflow "^Numerical overflow"))
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test
index 67df5b979..76b3e5656 100644
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -33,6 +33,9 @@
;;;; Some general utilities for testing ports.
+;;; Make sure we are set up for 8-bit data
+(setbinary)
+
;;; Read from PORT until EOF, and return the result as a string.
(define (read-all port)
(let loop ((chars '()))
diff --git a/test-suite/tests/r6rs-ports.test b/test-suite/tests/r6rs-ports.test
index df12e5cbc..c2b0755f8 100644
--- a/test-suite/tests/r6rs-ports.test
+++ b/test-suite/tests/r6rs-ports.test
@@ -27,6 +27,9 @@
;;; All these tests assume Guile 1.8's port system, where characters are
;;; treated as octets.
+;;; Set the default encoding of future ports to be binary
+(setbinary)
+
(with-test-prefix "7.2.5 End-of-File Object"