summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJim Blandy <jimb@red-bean.com>1999-09-02 06:40:54 +0000
committerJim Blandy <jimb@red-bean.com>1999-09-02 06:40:54 +0000
commit755a7125790fab364e1f4747aa31b18c99adb398 (patch)
tree580c9dd8bcc954e1b859ee24849d4f6796d1af7c
parentb4b1660b473fb9d97020829485d32e243c1e587e (diff)
downloadguile-755a7125790fab364e1f4747aa31b18c99adb398.tar.gz
* mb.c, mb.h: New files.
* init.c: #include "mb.h". (scm_boot_guile_1): call scm_init_mb. * Makefile.am (libguile_la_SOURCES, BUILT_SOURCES, modinclude_HEADERS): Include the `mb' files in the lists.
-rw-r--r--libguile/Makefile.am36
-rw-r--r--libguile/init.c2
-rw-r--r--libguile/mb.c600
-rw-r--r--libguile/mb.h152
4 files changed, 772 insertions, 18 deletions
diff --git a/libguile/Makefile.am b/libguile/Makefile.am
index b94510604..364a9c76f 100644
--- a/libguile/Makefile.am
+++ b/libguile/Makefile.am
@@ -40,7 +40,7 @@ libguile_la_SOURCES = \
evalext.c feature.c filesys.c fluids.c fports.c gc.c gdbint.c \
gh_data.c gh_eval.c gh_funcs.c gh_init.c gh_io.c gh_list.c \
gh_predicates.c gsubr.c guardians.c hash.c hashtab.c init.c \
- ioext.c keywords.c lang.c list.c load.c macros.c mallocs.c \
+ ioext.c keywords.c lang.c list.c load.c macros.c mallocs.c mb.c \
modules.c net_db.c numbers.c objects.c objprop.c options.c pairs.c \
ports.c posix.c print.c procprop.c procs.c ramap.c random.c read.c \
root.c scmsigs.c script.c simpos.c smob.c socket.c sort.c \
@@ -52,15 +52,15 @@ BUILT_SOURCES = \
cpp_err_symbols.c cpp_sig_symbols.c libpath.h alist.x arbiters.x \
async.x backtrace.x boolean.x chars.x continuations.x debug.x \
dynl.x dynwind.x eq.x error.x eval.x evalext.x feature.x filesys.x \
- fluids.x fports.x gc.x gsubr.x \
- guardians.x hash.x hashtab.x init.x ioext.x iselect.x keywords.x \
- lang.x list.x load.x macros.x mallocs.x modules.x net_db.x \
- numbers.x objects.x objprop.x options.x pairs.x ports.x posix.x \
- print.x procprop.x procs.x random.x ramap.x read.x regex-posix.x \
- root.x scmsigs.x script.x simpos.x smob.x socket.x sort.x \
- srcprop.x stackchk.x stacks.x stime.x strings.x strop.x strorder.x \
- strports.x struct.x symbols.x tag.x threads.x throw.x unif.x \
- variable.x vectors.x version.x vports.x weaks.x
+ fluids.x fports.x gc.x gsubr.x guardians.x hash.x hashtab.x init.x \
+ ioext.x iselect.x keywords.x lang.x list.x load.x macros.x \
+ mallocs.x mb.x modules.x net_db.x numbers.x objects.x objprop.x \
+ options.x pairs.x ports.x posix.x print.x procprop.x procs.x \
+ random.x ramap.x read.x regex-posix.x root.x scmsigs.x script.x \
+ simpos.x smob.x socket.x sort.x srcprop.x stackchk.x stacks.x \
+ stime.x strings.x strop.x strorder.x strports.x struct.x symbols.x \
+ tag.x threads.x throw.x unif.x variable.x vectors.x version.x \
+ vports.x weaks.x
EXTRA_libguile_la_SOURCES = _scm.h \
strerror.c inet_aton.c putenv.c \
@@ -89,14 +89,14 @@ modinclude_HEADERS = \
continuations.h debug.h dynl.h dynwind.h eq.h error.h eval.h \
evalext.h feature.h filesys.h fports.h gc.h gdb_interface.h \
gdbint.h genio.h gsubr.h guardians.h hash.h hashtab.h init.h \
- ioext.h keywords.h kw.h lang.h list.h load.h macros.h mallocs.h \
- modules.h net_db.h numbers.h objects.h objprop.h options.h pairs.h \
- ports.h posix.h regex-posix.h print.h procprop.h procs.h random.h \
- ramap.h read.h root.h scmsigs.h script.h simpos.h smob.h socket.h \
- sort.h srcprop.h stackchk.h stacks.h stime.h strings.h strop.h \
- strorder.h strports.h struct.h symbols.h tag.h tags.h throw.h \
- unif.h variable.h vectors.h version.h vports.h weaks.h snarf.h \
- threads.h coop-defs.h fluids.h iselect.h
+ ioext.h keywords.h kw.h lang.h list.h load.h macros.h mallocs.h \
+ mb.h modules.h net_db.h numbers.h objects.h objprop.h options.h \
+ pairs.h ports.h posix.h regex-posix.h print.h procprop.h procs.h \
+ random.h ramap.h read.h root.h scmsigs.h script.h simpos.h smob.h \
+ socket.h sort.h srcprop.h stackchk.h stacks.h stime.h strings.h \
+ strop.h strorder.h strports.h struct.h symbols.h tag.h tags.h \
+ throw.h unif.h variable.h vectors.h version.h vports.h weaks.h \
+ snarf.h threads.h coop-defs.h fluids.h iselect.h
## This file is generated at configure time. That is why it is DATA
## and not a header -- headers are included in the distribution.
diff --git a/libguile/init.c b/libguile/init.c
index bf5c4e3f0..d9a8b2dca 100644
--- a/libguile/init.c
+++ b/libguile/init.c
@@ -46,6 +46,7 @@
#include "_scm.h"
/* Everybody has an init function. */
+#include "mb.h"
#include "alist.h"
#include "arbiters.h"
#include "async.h"
@@ -447,6 +448,7 @@ scm_boot_guile_1 (base, closure)
scm_init_threads (base);
#endif
start_stack (base);
+ scm_init_mb ();
scm_init_gsubr ();
scm_init_feature ();
scm_init_alist ();
diff --git a/libguile/mb.c b/libguile/mb.c
new file mode 100644
index 000000000..10c5b19e1
--- /dev/null
+++ b/libguile/mb.c
@@ -0,0 +1,600 @@
+/* Copyright (C) 1995,1996,1997,1998,1999 Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
+ *
+ * As a special exception, the Free Software Foundation gives permission
+ * for additional uses of the text contained in its release of GUILE.
+ *
+ * The exception is that, if you link the GUILE library with other files
+ * to produce an executable, this does not by itself cause the
+ * resulting executable to be covered by the GNU General Public License.
+ * Your use of that executable is in no way restricted on account of
+ * linking the GUILE library code into it.
+ *
+ * This exception does not however invalidate any other reasons why
+ * the executable file might be covered by the GNU General Public License.
+ *
+ * This exception applies only to the code released by the
+ * Free Software Foundation under the name GUILE. If you copy
+ * code from other Free Software Foundation releases into a copy of
+ * GUILE, as the General Public License permits, the exception does
+ * not apply to the code that you add in this way. To avoid misleading
+ * anyone as to the status of such modified files, you must delete
+ * this exception notice from them.
+ *
+ * If you write modifications of your own for GUILE, it is your choice
+ * whether to permit this exception to apply to your modifications.
+ * If you do not wish that, delete this exception notice. */
+
+/* Headers. */
+
+#include "_scm.h"
+#include "mb.h"
+
+
+/* Here is a description of the encoding.
+
+ ** THIS WILL CHANGE IN FUTURE VERSIONS OF GUILE --- IT IS NOT
+ CORRECT TO ASSUME THE USE OF THIS ENCODING OUTSIDE OF mb.c and
+ mb.h. **
+
+ If you can't accomplish what you want without this info, then the
+ multibyte API is flawed, and we need to extend it. If you spread
+ this knowledge around into other code, then it will break when we
+ change encodings.
+
+ You have been warned.
+
+
+ For all ASCII characters, the Guile scm_char_t code is equal to the
+ ASCII code. The Guile multi-byte encoding is a single byte whose
+ value is the character's ASCII code. Note that ASCII doesn't
+ contain any characters whose numbers are above 127.
+
+ For non-ASCII characters:
+
+ Each character is assigned a character set number from 0x81 to 0xFE
+ (except for 0x9A .. 0x9F), and a position within that character
+ set. A "position" within a character set is one or two bytes from
+ 0x20 to 0x7F.
+
+ For example:
+ - The Latin-1 character set is 0x81.
+ - The Japanese JISX0208.1983/1990 Kanji set is 0x92.
+ - The character á (a lower-case a with an acute accent) is part of
+ the Latin-1 character set. Its position is the byte 0x61.
+ - The Japanese Katakana character "ka" is part of the JISX0208
+ character set. Its position is the pair of bytes 0x25 0x2B.
+
+ Once we know a character's character set, we can determine whether
+ its position is one or two bytes, and the form of its encoding.
+
+ Character set number positions encoding byte sequence
+ ===========================================================================
+ from 0x81 to 0x8f 1 byte SET POS +0x80
+ from 0x90 to 0x99 2 bytes SET POS1+0x80 POS2+0x80
+ from 0xA0 to 0xDF 1 byte 0x9A SET POS +0x80
+ from 0xE0 to 0xEF 1 byte 0x9B SET POS +0x80
+ from 0xF0 to 0xF4 2 bytes 0x9C SET POS1+0x80 POS2+0x80
+ from 0xF5 to 0xFE 2 bytes 0x9D SET POS1+0x80 POS2+0x80
+
+ "SET" is the character set number;
+ "POS" is a one-byte position; and
+ "POS1" and "POS2" are a two-byte position,
+
+ Some examples:
+ - For the character á, SET is 0x81, and POS is 0x61, so it would be
+ encoded by the byte sequence 0x81 0xE1.
+ - For the Japanese Katakana character "ka", SET is 0x92, and POS1
+ and POS2 are 0x25 and 0x2B, so it would be encoded by the byte
+ sequence 0x92 0xA5 0xAB.
+
+ So the longest encoding is four bytes long.
+
+ It's easy to verify that this encoding meets the conditions
+ promised by mbapi.texi:
+
+ - Every ASCII character is encoded as a single byte from 0 to 127,
+ in the obvious way.
+ - The encodings of non-ASCII characters use only bytes between 0x80
+ and 0xFF.
+ - No character encoding is a subsequence of any other character
+ encoding, since bytes from 0x00 to 0x9f occur only at the
+ beginning of a sequence.
+ - You can always determine the full length of a character's
+ encoding from its first byte.
+ - Given an arbitrary byte position in a Guile string, you can
+ always find the beginning and end of the character containing
+ that byte without scanning too far in either direction, assuming
+ the string is null-terminated or followed by another valid
+ character (as substrings are).
+
+
+ How does Guile choose scm_char_t values for non-ASCII characters?
+
+ We divide a character value up into three fields:
+ FIELD1: bits 18 -- 14 (most significant bits)
+ FIELD2: bits 13 -- 7
+ FIELD3: bits 6 -- 0 (least significant bits)
+
+ If the character's position is one byte, then:
+ FIELD1 is zero.
+ FIELD2 is the character set number, minus 0x70.
+ FIELD3 is the character position.
+
+ If the character's position is two bytes, then:
+ FIELD2 is the first byte of the character's position.
+ FIELD3 is the second byte of the character's position.
+ If the character set number is from 0x90 to 0x99, then:
+ FIELD1 is the character set number, minus 0x8f.
+ (Thus, a number from 0x01 to 0x0A.)
+ If the character set number is from 0xF0 to 0xFE, then:
+ FIELD1 is the character set number, minus 0xE0.
+ (Thus, a number from 0x10 to 0x1E.)
+
+ For example:
+ - For the character á, FIELD1 would be zero, FIELD2 would be 0x11,
+ and FIELD3 would be 0x61. Thus, the full character code would be
+ (0x11 << 7) | 0x61, or 2273.
+ - For the Japanese Katakana character "ka", FIELD1 would be 0x3,
+ FIELD2 would be 0x25 and FIELD3 would be 0x2B. Thus, the full
+ character code would be (0x3 << 14) | (0x25 << 7) | 0x2B, or 53931.
+
+ Thus, character codes fall into the following ranges:
+
+ 0 .. 127 ASCII
+ 2208 .. 4095 "official" one-byte position character sets
+ 6176 .. 16383 "unofficial" one-byte position character sets
+ 20512 .. 180223 "official" two-byte position character sets
+ 266272 .. 507903 "unofficial" two-byte position character sets
+
+ It's hairy, but at the time this was designed, Unicode didn't exist
+ --- this encoding allowed Emacs to incorporate characters from all
+ kinds of character sets unchanged. It also allows Emacs to
+ distinguish between Japanese and Chinese character sets, which is
+ important to some users.
+
+ Even when we make the transition to Unicode, we will probably
+ retain some way of distinguishing Japanese and Chinese characters.
+ This is a highly controvertial issue. However, I think that the
+ opinions of people who do not use Chinese or Japanese regularly
+ should be discounted; once this is done, there is a substantial
+ body of users who say they need this distinction in the encoding
+ itself. So Guile will support it. */
+
+
+
+/* Exceptions. */
+
+SCM_SYMBOL (text_not_char_boundary, "text:not-char-boundary");
+SCM_SYMBOL (text_bad_encoding, "text:bad-encoding");
+static const char text_bad_encoding_msg[] =
+ "string contains byte sequence which is not a valid character encoding";
+SCM_SYMBOL (text_not_guile_char, "text:not-guile-char");
+
+
+
+/* Basic multibyte character processing. */
+
+/* Assembling and disassembling character codes.
+ A `CHAR1' is a character whose position is one byte.
+ A `CHAR2' is a character whose position is two bytes.
+ The suffix `O' refers to an "official" character set --- one
+ whose character set number is in the range 0x81 -- 0x99.
+ The suffix `P' refers to a "private" character set --- one
+ whose character set number is in the range 0xA0 -- 0xFE.
+*/
+
+#define BUILD_CHAR1(set, pos) ((((set) - 0x70) << 7) | (pos))
+#define BUILD_CHAR2(set, offset, pos1, pos2) \
+ ((((set) - (offset)) << 14) \
+ | ((pos1) << 7) \
+ | (pos2))
+
+#define IS_ASCII_CHAR(c) ((c) < 0x80)
+
+#define FIRST_CHAR1O (BUILD_CHAR1 (0x81, 0x20))
+#define LAST_CHAR1O (BUILD_CHAR1 (0x8f, 0x7F))
+
+#define FIRST_CHAR1P (BUILD_CHAR1 (0xA0, 0x20))
+#define LAST_CHAR1P (BUILD_CHAR1 (0xEF, 0x7F))
+
+#define FIRST_CHAR2O (BUILD_CHAR2 (0x90, 0x8F, 0x20, 0x20))
+#define LAST_CHAR2O (BUILD_CHAR2 (0x99, 0x8F, 0x7F, 0x7F))
+
+#define FIRST_CHAR2P (BUILD_CHAR2 (0xF0, 0xE0, 0x20, 0x20))
+#define LAST_CHAR2P (BUILD_CHAR2 (0xFE, 0xE0, 0x7F, 0x7F))
+
+#define CHAR1_SET(c) (((c) >> 7) + 0x70)
+#define CHAR1_POS(c) ((c) & 0x7F)
+
+#define CHAR2_SET(c, offset) (((c) >> 14) + (offset))
+#define CHAR2_POS1(c) (((c) >> 7) & 0x7f)
+#define CHAR2_POS2(c) ((c) & 0x7f)
+
+scm_char_t
+scm_mb_get_func (const unsigned char *p)
+{
+ unsigned char lead = *p;
+
+ if (IS_ASCII_CHAR (lead))
+ return lead;
+ else if (lead == 0x80)
+ /* Guile does *not* support composite characters, thank goodness. */
+ return -1;
+ else if (lead < 0x90)
+ {
+ unsigned char set = lead;
+ unsigned char pos = p[1] & 0x7f;
+
+ return BUILD_CHAR1 (set, pos);
+ }
+ else if (lead < 0x9A)
+ {
+ unsigned char set = load;
+ unsigned char pos_hi = p[1] & 0x7f;
+ unsigned char pos_lo = p[2] & 0x7f;
+
+ return BUILD_CHAR2 (set, 0x8F, pos_hi, pos_lo);
+ }
+ else if (lead < 0x9C)
+ {
+ unsigned char set = p[1];
+ unsigned char pos = p[2] & 0x7f;
+
+ return BUILD_CHAR1 (set, pos);
+ }
+ else if (lead < 0x9E)
+ {
+ unsigned char set = p[1];
+ unsigned char pos_hi = p[2] & 0x7f;
+ unsigned char pos_lo = p[3] & 0x7f;
+
+ return BUILD_CHAR2 (set, 0xE0, pos_hi, pos_lo);
+ }
+ else
+ return -1;
+}
+
+int
+scm_mb_put_func (scm_char_t c, unsigned char *p)
+{
+ if (c < 0)
+ return 0;
+ else if (IS_ASCII_CHAR (c))
+ {
+ *p = c;
+ return 1;
+ }
+ else if (c < FIRST_CHAR1O)
+ return 0;
+ else if (c <= LAST_CHAR1O)
+ {
+ /* encoding is SET; POS+0x80 */
+ *p++ = CHAR1_SET (c);
+ *p++ = CHAR1_POS (c) | 0x80;
+ return 2;
+ }
+ else if (c < FIRST_CHAR1P)
+ return 0;
+ else if (c <= LAST_CHAR1P)
+ {
+ /* encoding is: 0x9A or 0x9B; SET; POS+0x80 */
+ unsigned char set = CHAR1_SET (c);
+ *p++ = (set < 0xE0) ? 0x9A : 0x9B;
+ *p++ = set;
+ *p++ = CHAR1_POS (c) | 0x80;
+ return 3;
+ }
+ else if (c < FIRST_CHAR2O)
+ return 0;
+ else if (c <= LAST_CHAR2O)
+ {
+ /* encoding is: SET; POS1+0x80; POS2+0x80 */
+ *p++ = CHAR2_SET (c, 0x8f);
+ *p++ = CHAR2_POS1 (c) | 0x80;
+ *p++ = CHAR2_POS2 (c) | 0x80;
+ return 3;
+ }
+ else if (c < FIRST_CHAR2P)
+ return 0;
+ else if (c <= LAST_CHAR2P)
+ {
+ /* encoding is: 0x9C or 0x9D; SET; POS1+0x80; POS2+0x80 */
+ unsigned char set = CHAR2_SET (c, 0xE0);
+ *p++ = (set < 0xF5) ? 0x9C : 0x9D;
+ *p++ = set;
+ *p++ = CHAR2_POS1 (c) | 0x80;
+ *p++ = CHAR2_POS2 (c) | 0x80;
+ return 4;
+ }
+ else
+ return 0;
+}
+
+int
+scm_mb_len_func (unsigned char b)
+{
+ return scm_mb_len (b);
+}
+
+int
+scm_mb_len_char_func (scm_char_t c)
+{
+ return (IS_ASCII_CHAR (c) ? 1
+ : c < FIRST_CHAR1O ? 0 : c <= LAST_CHAR1O ? 2
+ : c < FIRST_CHAR1P ? 0 : c <= LAST_CHAR1P ? 3
+ : c < FIRST_CHAR2O ? 0 : c <= LAST_CHAR2O ? 3
+ : c < FIRST_CHAR2P ? 0 : c <= LAST_CHAR2P ? 4
+ : 0);
+}
+
+
+/* Finding character encoding boundaries. */
+
+const unsigned char *
+scm_mb_floor (const unsigned char *p)
+{
+ while (! scm_mb_boundary_p (p))
+ p--;
+
+ return p;
+}
+
+const unsigned char *
+scm_mb_ceiling (const unsigned char *p)
+{
+ while (! scm_mb_boundary_p (p))
+ p++;
+
+ return p;
+}
+
+
+/* Multibyte string functions. */
+
+/* Return the number of characters encoded by the LEN bytes at P. */
+int
+scm_mb_count (const unsigned char *p, int len)
+{
+ int count = 0;
+ const unsigned char *end = p + len;
+
+ /* If this turns out to be a big bottleneck, then we'll make it not
+ check every byte. But for now I think I want the sanity checking. */
+ while (p < end)
+ {
+ if (*p < 0x80)
+ count++, p++;
+ else if (! scm_mb_boundary_p (p))
+ /* At the top of the loop, p must always be pointing at the
+ beginning of a character encoding. */
+ goto error;
+ else
+ {
+ int n = scm_mb_len (*p);
+
+ /* Make sure this character's encoding fits within the string. */
+ if (p + n > end)
+ goto error;
+
+ p++, n--;
+ while (n > 0)
+ {
+ /* No character start bytes should occur within the
+ encoding. */
+ if (scm_mb_boundary_p (p))
+ goto error;
+ p++, n--;
+ }
+
+ count++;
+ }
+ }
+
+ return count;
+
+ error:
+ scm_error (text_bad_encoding, "scm_mb_count",
+ text_bad_encoding_msg, SCM_EOL, SCM_EOL);
+}
+
+
+/* Return the character at *PP, and advance *PP to the next character. */
+scm_char_t
+scm_mb_walk (const unsigned char **pp)
+{
+ const unsigned char *p = *p;
+ scm_char_t c = scm_mb_get (p);
+ *pp = p + scm_mb_len (*p);
+ return c;
+}
+
+
+/* Return the address of the character before P. */
+const unsigned char *
+scm_mb_prev (const unsigned char *p)
+{
+ p--;
+ while (! scm_mb_boundary_p (p))
+ p--;
+
+ return p;
+}
+
+
+/* Return the address of the character after P. */
+const unsigned char *
+scm_mb_next (const unsigned char *p)
+{
+ p++;
+ while (! scm_mb_boundary_p (p))
+ p++;
+
+ return p;
+}
+
+
+/* Return the location of the I'th character in LEN bytes at P. */
+const unsigned char *
+scm_mb_index (const unsigned char *p, int len, int i)
+{
+ struct scm_mb_cache cache;
+
+ cache.character = 0;
+ cache.byte = 0;
+
+ return scm_mb_index_cached_func (p, len, i, &cache);
+}
+
+
+const unsigned char *
+scm_mb_index_cached_func (const unsigned char *p, int len, int i,
+ struct scm_mb_cache *cache)
+{
+ int character = cache->character;
+ int byte = cache->byte;
+
+ SCM_ASSERT (i >= 0, i, SCM_OUTOFRANGE, "scm_mb_index");
+
+ /* If cache's character and byte offsets are the same, then that
+ means that all characters up to that position are a single byte
+ long, so that prefix of the string can be indexed normally. */
+ if (i <= character
+ && character == byte)
+ return &p[i];
+
+ /* We start from the beginning of the string or the cache position,
+ whichever is closer. */
+ if (i <= character / 2)
+ character = byte = 0;
+
+ if (i < character)
+ {
+ /* Scanning backwards! */
+
+ while (byte > 0 && i < character)
+ {
+ byte--;
+ if (scm_mb_boundary_p (&p[byte]))
+ character--;
+ }
+
+ /* We never got there! The cache and the string must have been
+ out of sync. */
+ if (i < character)
+ scm_misc_error ("scm_mb_index",
+ "multibyte position cache was inaccurate",
+ SCM_EOL);
+ }
+ else if (i > character)
+ {
+ /* Scanning forwards! */
+ while (byte < len && i > character)
+ {
+ if (! scm_mb_boundary_p (&p[byte]))
+ scm_error (text_bad_encoding, "scm_mb_index",
+ text_bad_encoding_msg, SCM_EOL, SCM_EOL);
+ byte += scm_mb_len (p[byte]);
+ character++;
+ }
+
+ /* We never got there! This could mean that 1) i was off the
+ end of the string, or 2) the cache and string were out of
+ sync. Assume the former. */
+ if (i > character)
+ SCM_ASSERT (0, i, SCM_OUTOFRANGE, "scm_mb_index");
+ }
+ else
+ /* Perfect cache hit! */
+ return &p[byte];
+
+ cache->character = character;
+ cache->byte = byte;
+ return &p[byte];
+}
+
+
+/* Convert a multibyte string to an array of scm_char_t's.
+ The caller is responsible for freeing the result. */
+scm_char_t *
+scm_mb_multibyte_to_fixed (const unsigned char *p, int len, int *result_len)
+{
+ const unsigned char *end = p + len;
+ scm_char_t *buf;
+ int buf_len;
+
+ buf = scm_must_malloc (len * sizeof (*buf), "scm_mb_multibyte_to_fixed");
+ buf_len = 0;
+
+ while (p < len)
+ {
+ scm_char_t c = scm_mb_get (p);
+ if (c < 0)
+ scm_error (text_bad_encoding, "scm_mb_multibyte_to_fixed",
+ text_bad_encoding_msg, SCM_EOL, SCM_EOL);
+ buf[buf_len++] = c;
+ p += scm_mb_len (*p);
+ }
+
+ buf = scm_must_realloc (buf, len * sizeof (*buf), buf_len * sizeof (*buf),
+ "scm_mb_multibyte_to_fixed");
+ *result_len = buf_len;
+ return buf;
+}
+
+/* Convert an array of scm_char_t's to a multibyte string.
+ The caller is responsible for freeing the result. */
+unsigned char *
+scm_mb_fixed_to_multibyte (const scm_char_t *fixed, int len, int *result_len)
+{
+ int i;
+ int buf_size;
+ unsigned char *buf, *p;
+
+ /* Compute the buffer size. I think it's faster to make two passes
+ over the string like this than to possibly recopy it. */
+ buf_size = 0;
+ for (i = 0; i < len; i++)
+ buf_size += scm_mb_len_char (fixed[i]);
+
+ buf = scm_must_malloc (buf_size + 1, "scm_mb_fixed_to_multibyte");
+ p = buf;
+ for (i = 0; i < len; i++)
+ {
+ scm_mb_put (fixed[i], p);
+ p += scm_mb_len (*p);
+ }
+
+ /* Was the size we computed actually correct? */
+ if (p != buf + buf_size)
+ abort ();
+
+ /* Null-terminate the string. */
+ *p = '\0';
+
+ *result_len = buf_size;
+ return buf;
+}
+
+
+/* Initialization. */
+
+void
+scm_init_mb ()
+{
+#include "mb.x"
+}
diff --git a/libguile/mb.h b/libguile/mb.h
new file mode 100644
index 000000000..725c5661d
--- /dev/null
+++ b/libguile/mb.h
@@ -0,0 +1,152 @@
+#ifndef SCM_MB_H
+#define SCM_MB_H
+
+/* Copyright (C) 1999 Free Software Foundation, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this software; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
+ *
+ * As a special exception, the Free Software Foundation gives permission
+ * for additional uses of the text contained in its release of GUILE.
+ *
+ * The exception is that, if you link the GUILE library with other files
+ * to produce an executable, this does not by itself cause the
+ * resulting executable to be covered by the GNU General Public License.
+ * Your use of that executable is in no way restricted on account of
+ * linking the GUILE library code into it.
+ *
+ * This exception does not however invalidate any other reasons why
+ * the executable file might be covered by the GNU General Public License.
+ *
+ * This exception applies only to the code released by the
+ * Free Software Foundation under the name GUILE. If you copy
+ * code from other Free Software Foundation releases into a copy of
+ * GUILE, as the General Public License permits, the exception does
+ * not apply to the code that you add in this way. To avoid misleading
+ * anyone as to the status of such modified files, you must delete
+ * this exception notice from them.
+ *
+ * If you write modifications of your own for GUILE, it is your choice
+ * whether to permit this exception to apply to your modifications.
+ * If you do not wish that, delete this exception notice. */
+
+#include "libguile/__scm.h"
+
+/* Here are macros and functions for working with Guile's multibyte
+ text representation. At present, Guile uses the same encoding as
+ GNU Emacs 20.4, but Guile and Emacs will hopefully switch to UTF-8
+ sometime soon; you should use these macros to insulate your code
+ from the details of the encoding, so when the switch occurs, your
+ code won't break. All knowledge of Guile's character set should be
+ in mb.h (here) or mb.c.
+
+ These are all documented in ref/mbapi.texi, which is part of the
+ guile-doc CVS module.
+
+ Actually, a lot of these definitions only rely on the ``Promised
+ Properties of the Guile Multibyte Encoding'', as described in
+ mbapi.texi --- mostly the promise that ASCII characters are encoded
+ as themselves. */
+
+typedef int scm_char_t;
+
+
+/* Retrieve the character whose encoding is at P. */
+#define scm_mb_get(p) \
+ (*(p) < 128 ? *(p) : scm_mb_get_func (p))
+extern scm_char_t scm_mb_get_func (const unsigned char *p);
+
+/* Store the encoding of the character C at P, and return the
+ encoding's length in bytes. */
+#define scm_mb_put(c, p) \
+ ((c) < 128 ? (*(p) = c, 1) : scm_mb_put_func ((c), (p)))
+extern int scm_mb_put_func (scm_char_t c, unsigned char *p);
+
+/* The length of the longest character encoding, in bytes. */
+#define scm_mb_max_len (4)
+
+/* Given an encoding's first byte, return its length. */
+#define scm_mb_len(b) \
+ ((b) < 0x80 ? 1 \
+ : (b) < 0x90 ? 2 \
+ : (b) < 0x9C ? 3 \
+ : (b) < 0x9E ? 4 \
+ : 1)
+extern int scm_mb_len_func (unsigned char b);
+
+/* Given a Guile character, return the length of its encoding. */
+#define scm_mb_len_char(c) (scm_mb_len_char_func(c))
+extern int scm_mb_len_char_func (scm_char_t c);
+
+
+
+/* Finding character encoding boundaries. */
+
+/* Return true if P points at the first byte of an encoding. */
+#define scm_mb_boundary_p(p) (*(p) < 0xA0)
+
+/* Round P to the previous/next character boundary. */
+extern unsigned char *scm_mb_floor (const unsigned char *p);
+extern unsigned char *scm_mb_ceiling (const unsigned char *p);
+
+
+/* Multibyte string functions. */
+
+/* Return the number of characters encoded by the LEN bytes at P. */
+extern int scm_mb_count (const unsigned char *p, int len);
+
+/* Return the character at *PP, and advance *PP to the next character. */
+extern scm_char_t scm_mb_walk (const unsigned char **pp);
+
+/* Return the address of the character before P. */
+extern unsigned char *scm_mb_prev (const unsigned char *p);
+
+/* Return the address of the character after P. */
+extern unsigned char *scm_mb_next (const unsigned char *p);
+
+/* Return the location of the I'th character in LEN bytes at P. */
+extern unsigned char *scm_mb_index (const unsigned char *p, int len, int i);
+
+/* A cache of information about the positions of characters in
+ strings. Initialize all elements to zero before using. */
+struct scm_mb_cache {
+ int character; /* a character index */
+ int byte; /* its byte offset in the string */
+};
+
+/* Return the location of the I'th character in LEN bytes at P.
+ Use and update CACHE, if possible. */
+#define scm_mb_index_cached(p, len, i, cache) \
+ ((i) <= (cache)->character && (cache)->character == (cache)->byte \
+ ? &(p)[(i)] \
+ : scm_mb_index_cached_func ((p), (len), (i), (cache)))
+extern unsigned char *scm_mb_index_cached_func (const unsigned char *p,
+ int len, int i,
+ struct scm_mb_cache *cache);
+
+/* Convert a multibyte string to an array of scm_char_t's.
+ The caller is responsible for freeing the result. */
+extern scm_char_t *scm_mb_multibyte_to_fixed (const unsigned char *p, int len,
+ int *result_len);
+
+/* Convert an array of scm_char_t's to a multibyte string.
+ The caller is responsible for freeing the result. */
+extern unsigned char *scm_mb_fixed_to_multibyte (const scm_char_t *fixed,
+ int len, int *result_len);
+
+/* Initialize the multibyte stuff. */
+extern void scm_init_mb (void);
+
+#endif /* SCM_MB_H */