* mb.c, mb.h: New files.

* init.c: #include "mb.h". (scm_boot_guile_1): call scm_init_mb. * Makefile.am (libguile_la_SOURCES, BUILT_SOURCES, modinclude_HEADERS): Include the `mb' files in the lists.
author: Jim Blandy <jimb@red-bean.com> 1999-09-02 06:40:54 +0000
committer: Jim Blandy <jimb@red-bean.com> 1999-09-02 06:40:54 +0000
commit: 755a7125790fab364e1f4747aa31b18c99adb398 (patch)
tree: 580c9dd8bcc954e1b859ee24849d4f6796d1af7c
parent: b4b1660b473fb9d97020829485d32e243c1e587e (diff)
download: guile-755a7125790fab364e1f4747aa31b18c99adb398.tar.gz
4 files changed, 772 insertions, 18 deletions
diff --git a/libguile/Makefile.am b/libguile/Makefile.am
index b94510604..364a9c76f 100644
--- a/libguile/Makefile.am
+++ b/libguile/Makefile.am
@@ -40,7 +40,7 @@ libguile_la_SOURCES = \
     evalext.c feature.c filesys.c fluids.c fports.c gc.c gdbint.c	\
     gh_data.c gh_eval.c gh_funcs.c gh_init.c gh_io.c gh_list.c		\
     gh_predicates.c gsubr.c guardians.c hash.c hashtab.c init.c		\
-    ioext.c keywords.c lang.c list.c load.c macros.c mallocs.c		\
+    ioext.c keywords.c lang.c list.c load.c macros.c mallocs.c mb.c	\
     modules.c net_db.c numbers.c objects.c objprop.c options.c pairs.c	\
     ports.c posix.c print.c procprop.c procs.c ramap.c random.c read.c	\
     root.c scmsigs.c script.c simpos.c smob.c socket.c sort.c		\
@@ -52,15 +52,15 @@ BUILT_SOURCES = \
     cpp_err_symbols.c cpp_sig_symbols.c libpath.h alist.x arbiters.x	\
     async.x backtrace.x boolean.x chars.x continuations.x debug.x	\
     dynl.x dynwind.x eq.x error.x eval.x evalext.x feature.x filesys.x	\
-    fluids.x fports.x gc.x gsubr.x					\
-    guardians.x hash.x hashtab.x init.x ioext.x iselect.x keywords.x	\
-    lang.x list.x load.x macros.x mallocs.x modules.x net_db.x		\
-    numbers.x objects.x objprop.x options.x pairs.x ports.x posix.x	\
-    print.x procprop.x procs.x random.x ramap.x read.x regex-posix.x	\
-    root.x scmsigs.x script.x simpos.x smob.x socket.x sort.x		\
-    srcprop.x stackchk.x stacks.x stime.x strings.x strop.x strorder.x	\
-    strports.x struct.x symbols.x tag.x threads.x throw.x unif.x	\
-    variable.x vectors.x version.x vports.x weaks.x
+    fluids.x fports.x gc.x gsubr.x guardians.x hash.x hashtab.x init.x	\
+    ioext.x iselect.x keywords.x lang.x list.x load.x macros.x		\
+    mallocs.x mb.x modules.x net_db.x numbers.x objects.x objprop.x	\
+    options.x pairs.x ports.x posix.x print.x procprop.x procs.x	\
+    random.x ramap.x read.x regex-posix.x root.x scmsigs.x script.x	\
+    simpos.x smob.x socket.x sort.x srcprop.x stackchk.x stacks.x	\
+    stime.x strings.x strop.x strorder.x strports.x struct.x symbols.x	\
+    tag.x threads.x throw.x unif.x variable.x vectors.x version.x	\
+    vports.x weaks.x
 
 EXTRA_libguile_la_SOURCES = _scm.h \
     strerror.c inet_aton.c putenv.c \
@@ -89,14 +89,14 @@ modinclude_HEADERS = \
     continuations.h debug.h dynl.h dynwind.h eq.h error.h eval.h	\
     evalext.h feature.h filesys.h fports.h gc.h gdb_interface.h		\
     gdbint.h genio.h gsubr.h guardians.h hash.h hashtab.h init.h	\
-    ioext.h keywords.h kw.h lang.h list.h load.h macros.h mallocs.h 	\
-    modules.h net_db.h numbers.h objects.h objprop.h options.h pairs.h	\
-    ports.h posix.h regex-posix.h print.h procprop.h procs.h random.h	\
-    ramap.h read.h root.h scmsigs.h script.h simpos.h smob.h socket.h	\
-    sort.h srcprop.h stackchk.h stacks.h stime.h strings.h strop.h	\
-    strorder.h strports.h struct.h symbols.h tag.h tags.h throw.h	\
-    unif.h variable.h vectors.h version.h vports.h weaks.h snarf.h	\
-    threads.h coop-defs.h fluids.h iselect.h
+    ioext.h keywords.h kw.h lang.h list.h load.h macros.h mallocs.h	\
+    mb.h modules.h net_db.h numbers.h objects.h objprop.h options.h	\
+    pairs.h ports.h posix.h regex-posix.h print.h procprop.h procs.h	\
+    random.h ramap.h read.h root.h scmsigs.h script.h simpos.h smob.h	\
+    socket.h sort.h srcprop.h stackchk.h stacks.h stime.h strings.h	\
+    strop.h strorder.h strports.h struct.h symbols.h tag.h tags.h	\
+    throw.h unif.h variable.h vectors.h version.h vports.h weaks.h	\
+    snarf.h threads.h coop-defs.h fluids.h iselect.h
 
 ## This file is generated at configure time.  That is why it is DATA
 ## and not a header -- headers are included in the distribution.
diff --git a/libguile/init.c b/libguile/init.c
index bf5c4e3f0..d9a8b2dca 100644
--- a/libguile/init.c
+++ b/libguile/init.c
@@ -46,6 +46,7 @@
 #include "_scm.h"
 
 /* Everybody has an init function.  */
+#include "mb.h"
 #include "alist.h"
 #include "arbiters.h"
 #include "async.h"
@@ -447,6 +448,7 @@ scm_boot_guile_1 (base, closure)
       scm_init_threads (base);
 #endif
       start_stack (base);
+      scm_init_mb ();
       scm_init_gsubr ();
       scm_init_feature ();
       scm_init_alist ();
diff --git a/libguile/mb.c b/libguile/mb.c
new file mode 100644
index 000000000..10c5b19e1
--- /dev/null
+++ b/libguile/mb.c
@@ -0,0 +1,600 @@
+/*	Copyright (C) 1995,1996,1997,1998,1999 Free Software Foundation, Inc.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this software; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
+ *
+ * As a special exception, the Free Software Foundation gives permission
+ * for additional uses of the text contained in its release of GUILE.
+ *
+ * The exception is that, if you link the GUILE library with other files
+ * to produce an executable, this does not by itself cause the
+ * resulting executable to be covered by the GNU General Public License.
+ * Your use of that executable is in no way restricted on account of
+ * linking the GUILE library code into it.
+ *
+ * This exception does not however invalidate any other reasons why
+ * the executable file might be covered by the GNU General Public License.
+ *
+ * This exception applies only to the code released by the
+ * Free Software Foundation under the name GUILE.  If you copy
+ * code from other Free Software Foundation releases into a copy of
+ * GUILE, as the General Public License permits, the exception does
+ * not apply to the code that you add in this way.  To avoid misleading
+ * anyone as to the status of such modified files, you must delete
+ * this exception notice from them.
+ *
+ * If you write modifications of your own for GUILE, it is your choice
+ * whether to permit this exception to apply to your modifications.
+ * If you do not wish that, delete this exception notice.  */
+
+/* Headers.  */
+
+#include "_scm.h"
+#include "mb.h"
+
+
+/* Here is a description of the encoding.
+
+   ** THIS WILL CHANGE IN FUTURE VERSIONS OF GUILE --- IT IS NOT
+   CORRECT TO ASSUME THE USE OF THIS ENCODING OUTSIDE OF mb.c and
+   mb.h. **
+
+   If you can't accomplish what you want without this info, then the
+   multibyte API is flawed, and we need to extend it.  If you spread
+   this knowledge around into other code, then it will break when we
+   change encodings.
+
+   You have been warned.
+
+
+   For all ASCII characters, the Guile scm_char_t code is equal to the
+   ASCII code.  The Guile multi-byte encoding is a single byte whose
+   value is the character's ASCII code.  Note that ASCII doesn't
+   contain any characters whose numbers are above 127.
+
+   For non-ASCII characters:
+
+   Each character is assigned a character set number from 0x81 to 0xFE
+   (except for 0x9A .. 0x9F), and a position within that character
+   set.  A "position" within a character set is one or two bytes from
+   0x20 to 0x7F.
+
+   For example:
+   - The Latin-1 character set is 0x81.
+   - The Japanese JISX0208.1983/1990 Kanji set is 0x92.
+   - The character á (a lower-case a with an acute accent) is part of
+     the Latin-1 character set.  Its position is the byte 0x61.
+   - The Japanese Katakana character "ka" is part of the JISX0208
+     character set.  Its position is the pair of bytes 0x25 0x2B.
+
+   Once we know a character's character set, we can determine whether
+   its position is one or two bytes, and the form of its encoding.
+
+   Character set number     positions 	 encoding byte sequence
+   ===========================================================================
+   from 0x81 to 0x8f        1 byte    	 SET   POS +0x80
+   from 0x90 to 0x99        2 bytes   	 SET   POS1+0x80  POS2+0x80
+   from 0xA0 to 0xDF        1 byte    	 0x9A  SET        POS +0x80
+   from 0xE0 to 0xEF        1 byte    	 0x9B  SET        POS +0x80
+   from 0xF0 to 0xF4        2 bytes   	 0x9C  SET        POS1+0x80  POS2+0x80
+   from 0xF5 to 0xFE        2 bytes   	 0x9D  SET        POS1+0x80  POS2+0x80
+
+   "SET" is the character set number;
+   "POS" is a one-byte position; and
+   "POS1" and "POS2" are a two-byte position.
+
+   Some examples:
+   - For the character á, SET is 0x81, and POS is 0x61, so it would be
+     encoded by the byte sequence 0x81 0xE1.
+   - For the Japanese Katakana character "ka", SET is 0x92, and POS1
+     and POS2 are 0x25 and 0x2B, so it would be encoded by the byte
+     sequence 0x92 0xA5 0xAB.
+
+   So the longest encoding is four bytes long.
+
+   It's easy to verify that this encoding meets the conditions
+   promised by mbapi.texi:
+
+   - Every ASCII character is encoded as a single byte from 0 to 127,
+     in the obvious way.
+   - The encodings of non-ASCII characters use only bytes between 0x80
+     and 0xFF.
+   - No character encoding is a subsequence of any other character
+     encoding, since bytes from 0x00 to 0x9f occur only at the
+     beginning of a sequence.
+   - You can always determine the full length of a character's
+     encoding from its first byte.
+   - Given an arbitrary byte position in a Guile string, you can
+     always find the beginning and end of the character containing
+     that byte without scanning too far in either direction, assuming
+     the string is null-terminated or followed by another valid
+     character (as substrings are).
+
+
+   How does Guile choose scm_char_t values for non-ASCII characters?
+
+   We divide a character value up into three fields:
+   FIELD1: bits 18 -- 14  (most significant bits)
+   FIELD2: bits 13 --  7
+   FIELD3: bits  6 --  0  (least significant bits)
+
+   If the character's position is one byte, then:
+     FIELD1 is zero.
+     FIELD2 is the character set number, minus 0x70.
+     FIELD3 is the character position.
+   
+   If the character's position is two bytes, then:
+     FIELD2 is the first byte of the character's position.
+     FIELD3 is the second byte of the character's position.
+     If the character set number is from 0x90 to 0x99, then:
+       FIELD1 is the character set number, minus 0x8f.
+       (Thus, a number from 0x01 to 0x0A.)
+     If the character set number is from 0xF0 to 0xFE, then:
+       FIELD1 is the character set number, minus 0xE0.
+       (Thus, a number from 0x10 to 0x1E.)
+
+   For example:
+   - For the character á, FIELD1 would be zero, FIELD2 would be 0x11,
+     and FIELD3 would be 0x61.  Thus, the full character code would be
+     (0x11 << 7) | 0x61, or 2273.
+   - For the Japanese Katakana character "ka", FIELD1 would be 0x3,
+     FIELD2 would be 0x25 and FIELD3 would be 0x2B.  Thus, the full
+     character code would be (0x3 << 14) | (0x25 << 7) | 0x2B, or 53931.
+
+   Thus, character codes fall into the following ranges:
+
+         0 ..    127    ASCII
+      2208 ..   4095    "official"   one-byte position character sets
+      6176 ..  16383    "unofficial" one-byte position character sets
+     20512 .. 180223    "official"   two-byte position character sets
+    266272 .. 507903    "unofficial" two-byte position character sets
+
+   It's hairy, but at the time this was designed, Unicode didn't exist
+   --- this encoding allowed Emacs to incorporate characters from all
+   kinds of character sets unchanged.  It also allows Emacs to
+   distinguish between Japanese and Chinese character sets, which is
+   important to some users.
+
+   Even when we make the transition to Unicode, we will probably
+   retain some way of distinguishing Japanese and Chinese characters.
+   This is a highly controversial issue.  However, I think that the
+   opinions of people who do not use Chinese or Japanese regularly
+   should be discounted; once this is done, there is a substantial
+   body of users who say they need this distinction in the encoding
+   itself.  So Guile will support it.  */
+
+
+
+/* Exceptions.  */
+
+SCM_SYMBOL (text_not_char_boundary, "text:not-char-boundary");
+SCM_SYMBOL (text_bad_encoding,      "text:bad-encoding");
+static const char text_bad_encoding_msg[] =
+  "string contains byte sequence which is not a valid character encoding";
+SCM_SYMBOL (text_not_guile_char,    "text:not-guile-char");
+
+
+
+/* Basic multibyte character processing.  */
+
+/* Assembling and disassembling character codes.
+   A `CHAR1' is a character whose position is one byte.
+   A `CHAR2' is a character whose position is two bytes.
+   The suffix `O' refers to an "official" character set --- one
+       whose character set number is in the range 0x81 -- 0x99.
+   The suffix `P' refers to a "private" character set --- one
+       whose character set number is in the range 0xA0 -- 0xFE.
+*/
+
+#define BUILD_CHAR1(set, pos) ((((set) - 0x70) << 7) | (pos))
+#define BUILD_CHAR2(set, offset, pos1, pos2)	\
+  ((((set) - (offset)) << 14) 			\
+   | ((pos1) << 7)				\
+   | (pos2))
+
+#define IS_ASCII_CHAR(c) ((c) < 0x80)	/* NB: also true for a negative C */
+
+#define FIRST_CHAR1O (BUILD_CHAR1 (0x81, 0x20))
+#define LAST_CHAR1O  (BUILD_CHAR1 (0x8f, 0x7F))
+
+#define FIRST_CHAR1P (BUILD_CHAR1 (0xA0, 0x20))
+#define LAST_CHAR1P  (BUILD_CHAR1 (0xEF, 0x7F))
+
+#define FIRST_CHAR2O (BUILD_CHAR2 (0x90, 0x8F, 0x20, 0x20))
+#define LAST_CHAR2O  (BUILD_CHAR2 (0x99, 0x8F, 0x7F, 0x7F))
+
+#define FIRST_CHAR2P (BUILD_CHAR2 (0xF0, 0xE0, 0x20, 0x20))
+#define LAST_CHAR2P  (BUILD_CHAR2 (0xFE, 0xE0, 0x7F, 0x7F))
+
+#define CHAR1_SET(c) (((c) >> 7) + 0x70)
+#define CHAR1_POS(c) ((c) & 0x7F)
+
+#define CHAR2_SET(c, offset) (((c) >> 14) + (offset))
+#define CHAR2_POS1(c) (((c) >> 7) & 0x7f)
+#define CHAR2_POS2(c) ((c) & 0x7f)
+
+/* Decode the character whose encoding starts at P; return -1 if the
+   lead byte (0x80, or 0x9E and above) begins no valid encoding.  */
+scm_char_t
+scm_mb_get_func (const unsigned char *p)
+{
+  unsigned char lead = *p;
+
+  if (IS_ASCII_CHAR (lead))
+    return lead;
+  else if (lead == 0x80)
+    /* Guile does *not* support composite characters, thank goodness.  */
+    return -1;
+  else if (lead < 0x90)
+    {
+      unsigned char pos = p[1] & 0x7f;
+
+      return BUILD_CHAR1 (lead, pos);
+    }
+  else if (lead < 0x9A)
+    {
+      unsigned char pos_hi = p[1] & 0x7f;
+      unsigned char pos_lo = p[2] & 0x7f;
+
+      return BUILD_CHAR2 (lead, 0x8F, pos_hi, pos_lo);
+    }
+  else if (lead < 0x9C)
+    {
+      unsigned char set = p[1];
+      unsigned char pos = p[2] & 0x7f;
+
+      return BUILD_CHAR1 (set, pos);
+    }
+  else if (lead < 0x9E)
+    {
+      unsigned char set = p[1];
+      unsigned char pos_hi = p[2] & 0x7f;
+      unsigned char pos_lo = p[3] & 0x7f;
+
+      return BUILD_CHAR2 (set, 0xE0, pos_hi, pos_lo);
+    }
+  else
+    return -1;
+}
+
+/* Store the encoding of C at P and return its length in bytes; store
+   nothing and return 0 if C is not a valid Guile character code.  */
+int
+scm_mb_put_func (scm_char_t c, unsigned char *p)
+{
+  if (c < 0)
+    return 0;
+
+  if (IS_ASCII_CHAR (c))
+    {
+      p[0] = c;
+      return 1;
+    }
+
+  if (c >= FIRST_CHAR1O && c <= LAST_CHAR1O)
+    {
+      /* Official, one-byte position: SET; POS+0x80 */
+      p[0] = CHAR1_SET (c);
+      p[1] = CHAR1_POS (c) | 0x80;
+      return 2;
+    }
+
+  if (c >= FIRST_CHAR1P && c <= LAST_CHAR1P)
+    {
+      /* Private, one-byte position: 0x9A or 0x9B; SET; POS+0x80 */
+      unsigned char charset = CHAR1_SET (c);
+      p[0] = (charset < 0xE0) ? 0x9A : 0x9B;
+      p[1] = charset;
+      p[2] = CHAR1_POS (c) | 0x80;
+      return 3;
+    }
+
+  if (c >= FIRST_CHAR2O && c <= LAST_CHAR2O)
+    {
+      /* Official, two-byte position: SET; POS1+0x80; POS2+0x80 */
+      p[0] = CHAR2_SET (c, 0x8f);
+      p[1] = CHAR2_POS1 (c) | 0x80;
+      p[2] = CHAR2_POS2 (c) | 0x80;
+      return 3;
+    }
+
+  if (c >= FIRST_CHAR2P && c <= LAST_CHAR2P)
+    {
+      /* Private, two-byte: 0x9C or 0x9D; SET; POS1+0x80; POS2+0x80 */
+      unsigned char charset = CHAR2_SET (c, 0xE0);
+      p[0] = (charset < 0xF5) ? 0x9C : 0x9D;
+      p[1] = charset;
+      p[2] = CHAR2_POS1 (c) | 0x80;
+      p[3] = CHAR2_POS2 (c) | 0x80;
+      return 4;
+    }
+
+  /* C falls in a gap between, or beyond, the valid code ranges.  */
+  return 0;
+}
+
+int
+scm_mb_len_func (unsigned char b)
+{
+  return scm_mb_len (b);	/* out-of-line form of the mb.h macro */
+}
+
+int
+scm_mb_len_char_func (scm_char_t c)
+{
+  return (IS_ASCII_CHAR (c) ? 1
+          : (c >= FIRST_CHAR1O && c <= LAST_CHAR1O) ? 2  /* SET POS */
+          : (c >= FIRST_CHAR1P && c <= LAST_CHAR1P) ? 3  /* 9A/9B SET POS */
+          : (c >= FIRST_CHAR2O && c <= LAST_CHAR2O) ? 3  /* SET POS1 POS2 */
+          : (c >= FIRST_CHAR2P && c <= LAST_CHAR2P) ? 4  /* 9C/9D SET P1 P2 */
+          : 0);
+}
+
+
+/* Finding character encoding boundaries.  */
+
+const unsigned char *
+scm_mb_floor (const unsigned char *p)
+{
+  /* Back up until P rests on the first byte of an encoding.  */
+  for (; ! scm_mb_boundary_p (p); p--)
+    ;
+  return p;
+}
+
+const unsigned char *
+scm_mb_ceiling (const unsigned char *p)
+{
+  /* Move forward until P rests on the first byte of an encoding.  */
+  for (; ! scm_mb_boundary_p (p); p++)
+    ;
+  return p;
+}
+
+
+/* Multibyte string functions.  */
+
+/* Count the characters encoded by the LEN bytes starting at P.  Signal
+   a text:bad-encoding error if the bytes are not a well-formed
+   sequence of complete character encodings.  */
+int
+scm_mb_count (const unsigned char *p, int len)
+{
+  const unsigned char *limit = p + len;
+  int total = 0;
+
+  /* If this turns out to be a big bottleneck, then we'll make it not
+     check every byte.  But for now I think I want the sanity checking.  */
+  while (p < limit)
+    {
+      const unsigned char *tail;
+      int width;
+
+      /* ASCII bytes are one-byte characters.  */
+      if (*p < 0x80)
+        {
+          total++;
+          p++;
+          continue;
+        }
+
+      /* Otherwise P must be at the start of an encoding that fits
+         entirely within the string.  */
+      width = scm_mb_len (*p);
+      if (! scm_mb_boundary_p (p) || p + width > limit)
+        goto error;
+
+      /* No character start bytes should occur within the encoding.  */
+      for (tail = p + 1; tail < p + width; tail++)
+        if (scm_mb_boundary_p (tail))
+          goto error;
+
+      p += width;
+      total++;
+    }
+
+  return total;
+
+ error:
+  scm_error (text_bad_encoding, "scm_mb_count",
+             text_bad_encoding_msg, SCM_EOL, SCM_EOL);
+}
+
+
+/* Return the character at *PP, and advance *PP to the next character.  */
+scm_char_t
+scm_mb_walk (const unsigned char **pp)
+{
+  const unsigned char *p = *pp;
+  scm_char_t c = scm_mb_get (p);
+  *pp = p + scm_mb_len (*p);	/* the lead byte determines the length */
+  return c;
+}
+
+
+/* Step back from P to the start of the preceding character.  */
+const unsigned char *
+scm_mb_prev (const unsigned char *p)
+{
+  /* Always move at least one byte, then continue to a boundary.  */
+  do
+    p--;
+  while (! scm_mb_boundary_p (p));
+  return p;
+}
+
+
+/* Step forward from P to the start of the following character.  */
+const unsigned char *
+scm_mb_next (const unsigned char *p)
+{
+  /* Always move at least one byte, then continue to a boundary.  */
+  do
+    p++;
+  while (! scm_mb_boundary_p (p));
+  return p;
+}
+
+
+/* Return the location of the I'th character in LEN bytes at P.
+   This is the uncached form: it hands scm_mb_index_cached_func a
+   fresh, zeroed cache, so any scan starts at the beginning of P.  */
+const unsigned char *
+scm_mb_index (const unsigned char *p, int len, int i)
+{
+  /* Both fields zero, as struct scm_mb_cache requires before use.  */
+  struct scm_mb_cache scratch = { 0, 0 };
+
+  return scm_mb_index_cached_func (p, len, i, &scratch);
+}
+
+
+/* Return the location of the I'th character in the LEN bytes at P,
+   using CACHE as a starting point and updating it after a scan.  */
+const unsigned char *
+scm_mb_index_cached_func (const unsigned char *p, int len, int i,
+                          struct scm_mb_cache *cache)
+{
+  int chars = cache->character;
+  int bytes = cache->byte;
+
+  SCM_ASSERT (i >= 0, i, SCM_OUTOFRANGE, "scm_mb_index");
+
+  /* If cache's character and byte offsets are the same, then that
+     means that all characters up to that position are a single byte
+     long, so that prefix of the string can be indexed normally.  */
+  if (chars == bytes && i <= chars)
+    return &p[i];
+
+  /* We start from the beginning of the string or the cache position,
+     whichever is closer.  */
+  if (i <= chars / 2)
+    chars = bytes = 0;
+
+  /* Perfect cache hit!  The cache is left untouched.  */
+  if (i == chars)
+    return &p[bytes];
+  else if (i < chars)
+    {
+      /* Scanning backwards!  Each boundary passed is one character.  */
+      while (bytes > 0 && i < chars)
+        {
+          bytes--;
+          if (scm_mb_boundary_p (&p[bytes]))
+            chars--;
+        }
+
+      /* We never got there!  The cache and the string must have been
+         out of sync.  */
+      if (i < chars)
+        scm_misc_error ("scm_mb_index",
+                        "multibyte position cache was inaccurate",
+                        SCM_EOL);
+    }
+  else
+    {
+      /* Scanning forwards!  */
+      while (bytes < len && i > chars)
+        {
+          if (! scm_mb_boundary_p (&p[bytes]))
+            scm_error (text_bad_encoding, "scm_mb_index",
+                       text_bad_encoding_msg, SCM_EOL, SCM_EOL);
+          bytes += scm_mb_len (p[bytes]);
+          chars++;
+        }
+
+      /* We never got there!  This could mean that 1) i was off the
+         end of the string, or 2) the cache and string were out of
+         sync.  Assume the former.  */
+      if (i > chars)
+        SCM_ASSERT (0, i, SCM_OUTOFRANGE, "scm_mb_index");
+    }
+
+  cache->character = chars;
+  cache->byte = bytes;
+  return &p[bytes];
+}
+
+
+/* Convert the LEN bytes of multibyte text at P to a newly allocated array
+   of scm_char_t's (length in *RESULT_LEN) that the caller must free.  */
+scm_char_t *
+scm_mb_multibyte_to_fixed (const unsigned char *p, int len, int *result_len)
+{
+  const unsigned char *end = p + len;
+  scm_char_t *buf;
+  int buf_len = 0;
+
+  buf = scm_must_malloc (len * sizeof (*buf), "scm_mb_multibyte_to_fixed");
+
+  while (p < end)
+    {
+      /* Reject an encoding that would run past END, as scm_mb_count does.  */
+      scm_char_t c = (p + scm_mb_len (*p) > end) ? -1 : scm_mb_get (p);
+      if (c < 0)
+        scm_error (text_bad_encoding, "scm_mb_multibyte_to_fixed",
+                   text_bad_encoding_msg, SCM_EOL, SCM_EOL);
+      buf[buf_len++] = c;
+      p += scm_mb_len (*p);
+    }
+
+  buf = scm_must_realloc (buf, len * sizeof (*buf), buf_len * sizeof (*buf),
+                          "scm_mb_multibyte_to_fixed");
+  *result_len = buf_len;
+  return buf;
+}
+
+/* Convert the LEN scm_char_t's at FIXED to a null-terminated multibyte
+   string of *RESULT_LEN bytes.  The caller must free the result.  */
+unsigned char *
+scm_mb_fixed_to_multibyte (const scm_char_t *fixed, int len, int *result_len)
+{
+  int i;
+  int buf_size;
+  unsigned char *buf, *p;
+
+  /* Compute the buffer size.  I think it's faster to make two passes
+     over the string like this than to possibly recopy it.  */
+  buf_size = 0;
+  for (i = 0; i < len; i++)
+    buf_size += scm_mb_len_char (fixed[i]);
+
+  buf = scm_must_malloc (buf_size + 1, "scm_mb_fixed_to_multibyte");
+  p = buf;
+  for (i = 0; i < len; i++)
+    {
+      /* scm_mb_put returns how many bytes it stored (possibly none).  */
+      p += scm_mb_put (fixed[i], p);
+    }
+
+  /* Was the size we computed actually correct?  */
+  if (p != buf + buf_size)
+    abort ();
+
+  /* Null-terminate the string.  */
+  *p = '\0';
+
+  *result_len = buf_size;
+  return buf;
+}
+
+
+/* Initialization.  */
+
+void
+scm_init_mb ()
+{
+#include "mb.x"		/* build-generated; see BUILT_SOURCES in Makefile.am */
+}
diff --git a/libguile/mb.h b/libguile/mb.h
new file mode 100644
index 000000000..725c5661d
--- /dev/null
+++ b/libguile/mb.h
@@ -0,0 +1,152 @@
+#ifndef SCM_MB_H
+#define SCM_MB_H
+
+/*	Copyright (C) 1999 Free Software Foundation, Inc.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this software; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
+ *
+ * As a special exception, the Free Software Foundation gives permission
+ * for additional uses of the text contained in its release of GUILE.
+ *
+ * The exception is that, if you link the GUILE library with other files
+ * to produce an executable, this does not by itself cause the
+ * resulting executable to be covered by the GNU General Public License.
+ * Your use of that executable is in no way restricted on account of
+ * linking the GUILE library code into it.
+ *
+ * This exception does not however invalidate any other reasons why
+ * the executable file might be covered by the GNU General Public License.
+ *
+ * This exception applies only to the code released by the
+ * Free Software Foundation under the name GUILE.  If you copy
+ * code from other Free Software Foundation releases into a copy of
+ * GUILE, as the General Public License permits, the exception does
+ * not apply to the code that you add in this way.  To avoid misleading
+ * anyone as to the status of such modified files, you must delete
+ * this exception notice from them.
+ *
+ * If you write modifications of your own for GUILE, it is your choice
+ * whether to permit this exception to apply to your modifications.
+ * If you do not wish that, delete this exception notice.  */
+
+#include "libguile/__scm.h"
+
+/* Here are macros and functions for working with Guile's multibyte
+   text representation.  At present, Guile uses the same encoding as
+   GNU Emacs 20.4, but Guile and Emacs will hopefully switch to UTF-8
+   sometime soon; you should use these macros to insulate your code
+   from the details of the encoding, so when the switch occurs, your
+   code won't break.  All knowledge of Guile's character set should be
+   in mb.h (here) or mb.c.
+
+   These are all documented in ref/mbapi.texi, which is part of the
+   guile-doc CVS module.
+
+   Actually, a lot of these definitions only rely on the ``Promised
+   Properties of the Guile Multibyte Encoding'', as described in
+   mbapi.texi --- mostly the promise that ASCII characters are encoded
+   as themselves.  */
+
+typedef int scm_char_t;
+
+
+/* Retrieve the character whose encoding is at P.  */
+#define scm_mb_get(p) \
+  (*(p) < 128 ? *(p) : scm_mb_get_func (p))
+extern scm_char_t scm_mb_get_func (const unsigned char *p);
+
+/* Store the encoding of the character C at P, and return the
+   encoding's length in bytes.  */
+#define scm_mb_put(c, p) \
+  ((c) < 128 ? (*(p) = c, 1) : scm_mb_put_func ((c), (p)))
+extern int scm_mb_put_func (scm_char_t c, unsigned char *p);
+
+/* The length of the longest character encoding, in bytes.  */
+#define scm_mb_max_len (4)
+
+/* Given an encoding's first byte, return its length.  */
+#define scm_mb_len(b)				\
+  ((b) < 0x80 ? 1				\
+   : (b) < 0x90 ? 2				\
+   : (b) < 0x9C ? 3				\
+   : (b) < 0x9E ? 4				\
+   : 1)
+extern int scm_mb_len_func (unsigned char b);
+
+/* Given a Guile character, return the length of its encoding.  */
+#define scm_mb_len_char(c) (scm_mb_len_char_func(c))
+extern int scm_mb_len_char_func (scm_char_t c);
+
+
+
+/* Finding character encoding boundaries.  */
+
+/* Return true if P points at the first byte of an encoding.  */
+#define scm_mb_boundary_p(p) (*(p) < 0xA0)
+
+/* Round P to the previous/next character boundary.  */
+extern const unsigned char *scm_mb_floor (const unsigned char *p);
+extern const unsigned char *scm_mb_ceiling (const unsigned char *p);
+
+
+/* Multibyte string functions.  */
+
+/* Return the number of characters encoded by the LEN bytes at P.  */
+extern int scm_mb_count (const unsigned char *p, int len);
+
+/* Return the character at *PP, and advance *PP to the next character.  */
+extern scm_char_t scm_mb_walk (const unsigned char **pp);
+
+/* Return the address of the character before/after P.  */
+extern const unsigned char *scm_mb_prev (const unsigned char *p);
+extern const unsigned char *scm_mb_next (const unsigned char *p);
+
+/* Return the location of the I'th character in LEN bytes at P.
+   The result points into P's own text, hence the const.  */
+extern const unsigned char *scm_mb_index (const unsigned char *p,
+                                          int len, int i);
+
+/* A cache of information about the positions of characters in
+   strings.  Initialize all elements to zero before using.  */
+struct scm_mb_cache {
+  int character;		/* a character index */
+  int byte;			/* its byte offset in the string */
+};
+
+/* Return the location of the I'th character in LEN bytes at P.
+   Use and update CACHE, if possible.  */
+#define scm_mb_index_cached(p, len, i, cache)				\
+  ((i) <= (cache)->character && (cache)->character == (cache)->byte	\
+   ? &(p)[(i)]								\
+   : scm_mb_index_cached_func ((p), (len), (i), (cache)))
+extern const unsigned char *scm_mb_index_cached_func (const unsigned char *p,
+                                                      int len, int i,
+                                                      struct scm_mb_cache *cache);
+
+/* Convert a multibyte string to an array of scm_char_t's.
+   The caller is responsible for freeing the result.  */
+extern scm_char_t *scm_mb_multibyte_to_fixed (const unsigned char *p, int len,
+					      int *result_len);
+
+/* Convert an array of scm_char_t's to a multibyte string.
+   The caller is responsible for freeing the result.  */
+extern unsigned char *scm_mb_fixed_to_multibyte (const scm_char_t *fixed,
+						 int len, int *result_len);
+
+/* Initialize the multibyte stuff.  */
+extern void scm_init_mb (void);
+
+#endif  /* SCM_MB_H */
author	Jim Blandy <jimb@red-bean.com>	1999-09-02 06:40:54 +0000
committer	Jim Blandy <jimb@red-bean.com>	1999-09-02 06:40:54 +0000
commit	755a7125790fab364e1f4747aa31b18c99adb398 (patch)
tree	580c9dd8bcc954e1b859ee24849d4f6796d1af7c
parent	b4b1660b473fb9d97020829485d32e243c1e587e (diff)
download	guile-755a7125790fab364e1f4747aa31b18c99adb398.tar.gz