Tcl 8.3 upgradeTCL_8_3

author: Ian Roxborough <irox@redhat.com> 2001-09-09 22:40:53 +0000
committer: Ian Roxborough <irox@redhat.com> 2001-09-09 22:40:53 +0000
commit: a850c17c374d03259483e799b09326d255e17487 (patch)
tree: f1d024951a993f0453aa49d4ba808d6c38fa4321 /tcl/generic/tclUtf.c
parent: 57e8350a3895a1579b77cc134d6d7d49b056678e (diff)
download: gdb-a850c17c374d03259483e799b09326d255e17487.tar.gz
1 files changed, 1586 insertions, 0 deletions
diff --git a/tcl/generic/tclUtf.c b/tcl/generic/tclUtf.c
new file mode 100644
index 00000000000..5f6826ddf01
--- /dev/null
+++ b/tcl/generic/tclUtf.c
@@ -0,0 +1,1586 @@
+/*
+ * tclUtf.c --
+ *
+ *	Routines for manipulating UTF-8 strings.
+ *
+ * Copyright (c) 1997-1998 Sun Microsystems, Inc.
+ *
+ * See the file "license.terms" for information on usage and redistribution
+ * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
+ *
+ * RCS: @(#) $Id$
+ */
+
+#include "tclInt.h"
+
+/*
+ * Include the static character classification tables and macros.
+ */
+
+#include "tclUniData.c"
+
+/*
+ * The following macros are used for fast character category tests.  The
+ * x_BITS values are shifted right by the category value to determine whether
+ * the given category is included in the set.
+ */ 
+
+#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
+    | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))
+
+#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
+
+#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
+    | (1 << PARAGRAPH_SEPARATOR))
+
+#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)
+
+#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
+	    (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
+	    (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
+	    (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
+	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
+	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
+	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
+	    (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
+	    (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
+
+#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
+	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
+	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
+	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
+
+/*
+ * Unicode characters less than this value are represented by themselves 
+ * in UTF-8 strings. 
+ */
+
+#define UNICODE_SELF	0x80
+
+/*
+ * The following structures are used when mapping between Unicode (UCS-2)
+ * and UTF-8.
+ */
+ 
+CONST unsigned char totalBytes[256] = {
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+#if TCL_UTF_MAX > 3
+    4,4,4,4,4,4,4,4,
+#else
+    1,1,1,1,1,1,1,1,
+#endif
+#if TCL_UTF_MAX > 4
+    5,5,5,5,
+#else
+    1,1,1,1,
+#endif
+#if TCL_UTF_MAX > 5
+    6,6,6,6
+#else
+    1,1,1,1
+#endif
+};
+
+/*
+ * Procedures used only in this module.
+ */
+
+static int UtfCount _ANSI_ARGS_((int ch));
+
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * UtfCount --
+ *
+ *	Find the number of bytes in the Utf character "ch".
+ *
+ * Results:
+ *	The return values is the number of bytes in the Utf character "ch".
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+ 
+static int
+UtfCount(ch)
+    int ch;			/* The Tcl_UniChar whose size is returned. */
+{
+    if ((ch > 0) && (ch < UNICODE_SELF)) {
+	return 1;
+    }
+    if (ch <= 0x7FF) {
+	return 2;
+    }
+    if (ch <= 0xFFFF) {
+	return 3;
+    }
+#if TCL_UTF_MAX > 3
+    if (ch <= 0x1FFFFF) {
+	return 4;
+    }
+    if (ch <= 0x3FFFFFF) {
+	return 5;
+    }
+    if (ch <= 0x7FFFFFFF) {
+	return 6;
+    }
+#endif
+    return 3;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UniCharToUtf --
+ *
+ *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
+ *	provided buffer.  Equivalent to Plan 9 runetochar().
+ *
+ * Results:
+ *	The return values is the number of bytes in the buffer that
+ *	were consumed.  
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+ 
+INLINE int
+Tcl_UniCharToUtf(ch, str)
+    int ch;			/* The Tcl_UniChar to be stored in the
+				 * buffer. */
+    char *str;			/* Buffer in which the UTF-8 representation
+				 * of the Tcl_UniChar is stored.  Buffer must
+				 * be large enough to hold the UTF-8 character
+				 * (at most TCL_UTF_MAX bytes). */
+{
+    if ((ch > 0) && (ch < UNICODE_SELF)) {
+	str[0] = (char) ch;
+	return 1;
+    }
+    if (ch <= 0x7FF) {
+	str[1] = (char) ((ch | 0x80) & 0xBF);
+	str[0] = (char) ((ch >> 6) | 0xC0);
+	return 2;
+    }
+    if (ch <= 0xFFFF) {
+	three:
+	str[2] = (char) ((ch | 0x80) & 0xBF);
+	str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	str[0] = (char) ((ch >> 12) | 0xE0);
+	return 3;
+    }
+
+#if TCL_UTF_MAX > 3
+    if (ch <= 0x1FFFFF) {
+	str[3] = (char) ((ch | 0x80) & 0xBF);
+	str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
+	str[0] = (char) ((ch >> 18) | 0xF0);
+	return 4;
+    }
+    if (ch <= 0x3FFFFFF) {
+	str[4] = (char) ((ch | 0x80) & 0xBF);
+	str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
+	str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
+	str[0] = (char) ((ch >> 24) | 0xF8);
+	return 5;
+    }
+    if (ch <= 0x7FFFFFFF) {
+	str[5] = (char) ((ch | 0x80) & 0xBF);
+	str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
+	str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
+	str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
+	str[0] = (char) ((ch >> 30) | 0xFC);
+	return 6;
+    }
+#endif
+
+    ch = 0xFFFD;
+    goto three;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UniCharToUtfDString --
+ *
+ *	Convert the given Unicode string to UTF-8.
+ *
+ * Results:
+ *	The return value is a pointer to the UTF-8 representation of the
+ *	Unicode string.  Storage for the return value is appended to the
+ *	end of dsPtr.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+ 
+char *
+Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
+    CONST Tcl_UniChar *wString;	/* Unicode string to convert to UTF-8. */
+    int numChars;		/* Length of Unicode string in Tcl_UniChars
+				 * (must be >= 0). */
+    Tcl_DString *dsPtr;		/* UTF-8 representation of string is
+				 * appended to this previously initialized
+				 * DString. */
+{
+    CONST Tcl_UniChar *w, *wEnd;
+    char *p, *string;
+    int oldLength;
+
+    /*
+     * UTF-8 string length in bytes will be <= Unicode string length *
+     * TCL_UTF_MAX.
+     */
+
+    oldLength = Tcl_DStringLength(dsPtr);
+    Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
+    string = Tcl_DStringValue(dsPtr) + oldLength;
+
+    p = string;
+    wEnd = wString + numChars;
+    for (w = wString; w < wEnd; ) {
+	p += Tcl_UniCharToUtf(*w, p);
+	w++;
+    }
+    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
+
+    return string;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UtfToUniChar --
+ *
+ *	Extract the Tcl_UniChar represented by the UTF-8 string.  Bad
+ *	UTF-8 sequences are converted to valid Tcl_UniChars and processing
+ *	continues.  Equivalent to Plan 9 chartorune().
+ *
+ *	The caller must ensure that the source buffer is long enough that
+ *	this routine does not run off the end and dereference non-existent
+ *	memory looking for trail bytes.  If the source buffer is known to
+ *	be '\0' terminated, this cannot happen.  Otherwise, the caller
+ *	should call Tcl_UtfCharComplete() before calling this routine to
+ *	ensure that enough bytes remain in the string.
+ *
+ * Results:
+ *	*chPtr is filled with the Tcl_UniChar, and the return value is the
+ *	number of bytes from the UTF-8 string that were consumed.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+ 
+int
+Tcl_UtfToUniChar(str, chPtr)
+    register CONST char *str;	 /* The UTF-8 string. */
+    register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
+				  * by the UTF-8 string. */
+{
+    register int byte;
+    
+    /*
+     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
+     */
+
+    byte = *((unsigned char *) str);
+    if (byte < 0xC0) {
+	/*
+	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
+	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
+	 * characters representing themselves.
+	 */
+	 
+	*chPtr = (Tcl_UniChar) byte;
+	return 1;
+    } else if (byte < 0xE0) {
+	if ((str[1] & 0xC0) == 0x80) {
+	    /*
+	     * Two-byte-character lead-byte followed by a trail-byte.
+	     */
+	     
+	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
+	    return 2;
+	}
+	/*
+	 * A two-byte-character lead-byte not followed by trail-byte
+	 * represents itself.
+	 */
+	 
+	*chPtr = (Tcl_UniChar) byte;
+	return 1;
+    } else if (byte < 0xF0) {
+	if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
+	    /*
+	     * Three-byte-character lead byte followed by two trail bytes.
+	     */
+
+	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) 
+		    | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
+	    return 3;
+	}
+	/*
+	 * A three-byte-character lead-byte not followed by two trail-bytes
+	 * represents itself.
+	 */
+
+	*chPtr = (Tcl_UniChar) byte;
+	return 1;
+    }
+#if TCL_UTF_MAX > 3
+    else {
+	int ch, total, trail;
+
+	total = totalBytes[byte];
+	trail = total - 1;
+	if (trail > 0) {
+	    ch = byte & (0x3F >> trail);
+	    do {
+		str++;
+		if ((*str & 0xC0) != 0x80) {
+		    *chPtr = byte;
+		    return 1;
+		}
+		ch <<= 6;
+		ch |= (*str & 0x3F);
+		trail--;
+	    } while (trail > 0);
+	    *chPtr = ch;
+	    return total;
+	}
+    }
+#endif
+
+    *chPtr = (Tcl_UniChar) byte;
+    return 1;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UtfToUniCharDString --
+ *
+ *	Convert the UTF-8 string to Unicode.
+ *
+ * Results:
+ *	The return value is a pointer to the Unicode representation of the
+ *	UTF-8 string.  Storage for the return value is appended to the
+ *	end of dsPtr.  The Unicode string is terminated with a Unicode
+ *	NULL character.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+Tcl_UniChar *
+Tcl_UtfToUniCharDString(string, length, dsPtr)
+    CONST char *string;		/* UTF-8 string to convert to Unicode. */
+    int length;			/* Length of UTF-8 string in bytes, or -1
+				 * for strlen(). */
+    Tcl_DString *dsPtr;		/* Unicode representation of string is
+				 * appended to this previously initialized
+				 * DString. */
+{
+    Tcl_UniChar *w, *wString;
+    CONST char *p, *end;
+    int oldLength;
+
+    if (length < 0) {
+	length = strlen(string);
+    }
+
+    /*
+     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
+     * in bytes.
+     */
+
+    oldLength = Tcl_DStringLength(dsPtr);
+    Tcl_DStringSetLength(dsPtr,
+	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
+    wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
+
+    w = wString;
+    end = string + length;
+    for (p = string; p < end; ) {
+	p += Tcl_UtfToUniChar(p, w);
+	w++;
+    }
+    *w = '\0';
+    Tcl_DStringSetLength(dsPtr,
+	    (oldLength + ((char *) w - (char *) wString)));
+
+    return wString;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UtfCharComplete --
+ *
+ *	Determine if the UTF-8 string of the given length is long enough
+ *	to be decoded by Tcl_UtfToUniChar().  This does not ensure that the
+ *	UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune().
+ *
+ * Results:
+ *	The return value is 0 if the string is not long enough, non-zero
+ *	otherwise.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+Tcl_UtfCharComplete(str, len)
+    CONST char *str;		/* String to check if first few bytes
+				 * contain a complete UTF-8 character. */
+    int len;			/* Length of above string in bytes. */
+{
+    int ch;
+
+    ch = *((unsigned char *) str);
+    return len >= totalBytes[ch];
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_NumUtfChars --
+ *
+ *	Returns the number of characters (not bytes) in the UTF-8 string,
+ *	not including the terminating NULL byte.  This is equivalent to
+ *	Plan 9 utflen() and utfnlen().
+ *
+ * Results:
+ *	As above.  
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+ 
+int 
+Tcl_NumUtfChars(str, len)
+    register CONST char *str;	/* The UTF-8 string to measure. */
+    int len;			/* The length of the string in bytes, or -1
+				 * for strlen(string). */
+{
+    Tcl_UniChar ch;
+    register Tcl_UniChar *chPtr = &ch;
+    register int n;
+    int i;
+
+    /*
+     * The separate implementations are faster.
+     */
+     
+    i = 0;
+    if (len < 0) {
+	while (1) {
+	    str += Tcl_UtfToUniChar(str, chPtr);
+	    if (ch == '\0') {
+		break;
+	    }
+	    i++;
+	}
+    } else {
+	while (len > 0) {
+	    n = Tcl_UtfToUniChar(str, chPtr);
+	    len -= n;
+	    str += n;
+	    i++;
+	}
+    }
+    return i;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UtfFindFirst --
+ *
+ *	Returns a pointer to the first occurance of the given Tcl_UniChar
+ *	in the NULL-terminated UTF-8 string.  The NULL terminator is
+ *	considered part of the UTF-8 string.  Equivalent to Plan 9
+ *	utfrune().
+ *
+ * Results:
+ *	As above.  If the Tcl_UniChar does not exist in the given string,
+ *	the return value is NULL.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+char *
+Tcl_UtfFindFirst(string, ch)
+    CONST char *string;		/* The UTF-8 string to be searched. */
+    int ch;			/* The Tcl_UniChar to search for. */
+{
+    int len;
+    Tcl_UniChar find;
+    
+    while (1) {
+	len = Tcl_UtfToUniChar(string, &find);
+	if (find == ch) {
+	    return (char *) string;
+	}
+	if (*string == '\0') {
+	    return NULL;
+	}
+	string += len;
+    }
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UtfFindLast --
+ *
+ *	Returns a pointer to the last occurance of the given Tcl_UniChar
+ *	in the NULL-terminated UTF-8 string.  The NULL terminator is
+ *	considered part of the UTF-8 string.  Equivalent to Plan 9
+ *	utfrrune().
+ *
+ * Results:
+ *	As above.  If the Tcl_UniChar does not exist in the given string,
+ *	the return value is NULL.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+char *
+Tcl_UtfFindLast(string, ch)
+    CONST char *string;		/* The UTF-8 string to be searched. */
+    int ch;			/* The Tcl_UniChar to search for. */
+{
+    int len;
+    Tcl_UniChar find;
+    CONST char *last;
+	
+    last = NULL;
+    while (1) {
+	len = Tcl_UtfToUniChar(string, &find);
+	if (find == ch) {
+	    last = string;
+	}
+	if (*string == '\0') {
+	    break;
+	}
+	string += len;
+    }
+    return (char *) last;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UtfNext --
+ *
+ *	Given a pointer to some current location in a UTF-8 string,
+ *	move forward one character.  The caller must ensure that they
+ *	are not asking for the next character after the last character
+ *	in the string.
+ *
+ * Results:
+ *	The return value is the pointer to the next character in
+ *	the UTF-8 string.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+ 
+char *
+Tcl_UtfNext(str) 
+    CONST char *str;		    /* The current location in the string. */
+{
+    Tcl_UniChar ch;
+
+    return (char *) str + Tcl_UtfToUniChar(str, &ch);
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UtfPrev --
+ *
+ *	Given a pointer to some current location in a UTF-8 string,
+ *	move backwards one character.
+ *
+ * Results:
+ *	The return value is a pointer to the previous character in the
+ *	UTF-8 string.  If the current location was already at the
+ *	beginning of the string, the return value will also be a
+ *	pointer to the beginning of the string.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+char *
+Tcl_UtfPrev(str, start)
+    CONST char *str;		    /* The current location in the string. */
+    CONST char *start;		    /* Pointer to the beginning of the
+				     * string, to avoid going backwards too
+				     * far. */
+{
+    CONST char *look;
+    int i, byte;
+    
+    str--;
+    look = str;
+    for (i = 0; i < TCL_UTF_MAX; i++) {
+	if (look < start) {
+	    if (str < start) {
+		str = start;
+	    }
+	    break;
+	}
+	byte = *((unsigned char *) look);
+	if (byte < 0x80) {
+	    break;
+	} 
+	if (byte >= 0xC0) {
+	    if (totalBytes[byte] != i + 1) {
+		break;
+	    }
+	    return (char *) look;
+	}
+	look--;
+    }
+    return (char *) str;
+}
+	
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UniCharAtIndex --
+ *
+ *	Returns the Unicode character represented at the specified
+ *	character (not byte) position in the UTF-8 string.
+ *
+ * Results:
+ *	As above.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+ 
+Tcl_UniChar
+Tcl_UniCharAtIndex(src, index)
+    register CONST char *src;	/* The UTF-8 string to dereference. */
+    register int index;		/* The position of the desired character. */
+{
+    Tcl_UniChar ch;
+
+    while (index >= 0) {
+	index--;
+	src += Tcl_UtfToUniChar(src, &ch);
+    }
+    return ch;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UtfAtIndex --
+ *
+ *	Returns a pointer to the specified character (not byte) position
+ *	in the UTF-8 string.
+ *
+ * Results:
+ *	As above.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+char *
+Tcl_UtfAtIndex(src, index)
+    register CONST char *src;	/* The UTF-8 string. */
+    register int index;		/* The position of the desired character. */
+{
+    Tcl_UniChar ch;
+    
+    while (index > 0) {
+	index--;
+	src += Tcl_UtfToUniChar(src, &ch);
+    }
+    return (char *) src;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Tcl_UtfBackslash --
+ *
+ *	Figure out how to handle a backslash sequence.
+ *
+ * Results:
+ *	Stores the bytes represented by the backslash sequence in dst and
+ *	returns the number of bytes written to dst.  At most TCL_UTF_MAX
+ *	bytes are written to dst; dst must have been large enough to accept
+ *	those bytes.  If readPtr isn't NULL then it is filled in with a
+ *	count of the number of bytes in the backslash sequence.  
+ *
+ * Side effects:
+ *	The maximum number of bytes it takes to represent a Unicode
+ *	character in UTF-8 is guaranteed to be less than the number of
+ *	bytes used to express the backslash sequence that represents
+ *	that Unicode character.  If the target buffer into which the
+ *	caller is going to store the bytes that represent the Unicode
+ *	character is at least as large as the source buffer from which
+ *	the backslashed sequence was extracted, no buffer overruns should
+ *	occur.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+Tcl_UtfBackslash(src, readPtr, dst)
+    CONST char *src;		/* Points to the backslash character of
+				 * a backslash sequence. */
+    int *readPtr;		/* Fill in with number of characters read
+				 * from src, unless NULL. */
+    char *dst;			/* Filled with the bytes represented by the
+				 * backslash sequence. */
+{
+    register CONST char *p = src+1;
+    int result, count, n;
+    char buf[TCL_UTF_MAX];
+
+    if (dst == NULL) {
+	dst = buf;
+    }
+
+    count = 2;
+    switch (*p) {
+	/*
+         * Note: in the conversions below, use absolute values (e.g.,
+         * 0xa) rather than symbolic values (e.g. \n) that get converted
+         * by the compiler.  It's possible that compilers on some
+         * platforms will do the symbolic conversions differently, which
+         * could result in non-portable Tcl scripts.
+         */
+
+        case 'a':
+            result = 0x7;
+            break;
+        case 'b':
+            result = 0x8;
+            break;
+        case 'f':
+            result = 0xc;
+            break;
+        case 'n':
+            result = 0xa;
+            break;
+        case 'r':
+            result = 0xd;
+            break;
+        case 't':
+            result = 0x9;
+            break;
+        case 'v':
+            result = 0xb;
+            break;
+        case 'x':
+            if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */
+                char *end;
+
+                result = (unsigned char) strtoul(p+1, &end, 16);
+                count = end - src;
+            } else {
+                count = 2;
+                result = 'x';
+            }
+            break;
+	case 'u':
+	    result = 0;
+	    for (count = 0; count < 4; count++) {
+		p++;
+		if (!isxdigit(UCHAR(*p))) { /* INTL: digit */
+		    break;
+		}
+		n = *p - '0';
+		if (n > 9) {
+		    n = n + '0' + 10 - 'A';
+		}
+		if (n > 16) {
+		    n = n + 'A' - 'a';
+		}
+		result = (result << 4) + n;
+	    }
+	    if (count == 0) {
+		result = 'u';
+	    }
+	    count += 2;
+	    break;
+		    
+        case '\n':
+            do {
+                p++;
+            } while ((*p == ' ') || (*p == '\t'));
+            result = ' ';
+            count = p - src;
+            break;
+        case 0:
+            result = '\\';
+            count = 1;
+            break;
+	default:
+	    /*
+	     * Check for an octal number \oo?o?
+	     */
+	    if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */
+		result = (unsigned char)(*p - '0');
+		p++;
+		if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
+		    break;
+		}
+		count = 3;
+		result = (unsigned char)((result << 3) + (*p - '0'));
+		p++;
+		if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
+		    break;
+		}
+		count = 4;
+		result = (unsigned char)((result << 3) + (*p - '0'));
+		break;
+	    }
+	    result = *p;
+	    count = 2;
+	    break;
+    }
+
+    if (readPtr != NULL) {
+	*readPtr = count;
+    }
+    return Tcl_UniCharToUtf(result, dst);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfToUpper --
+ *
+ *	Convert lowercase characters to uppercase characters in a UTF
+ *	string in place.  The conversion may shrink the UTF string.
+ *
+ * Results:
+ *	Returns the number of bytes in the resulting string
+ *	excluding the trailing null.
+ *
+ * Side effects:
+ *	Writes a terminating null after the last converted character.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UtfToUpper(str)
+    char *str;			/* String to convert in place. */
+{
+    Tcl_UniChar ch, upChar;
+    char *src, *dst;
+    int bytes;
+
+    /*
+     * Iterate over the string until we hit the terminating null.
+     */
+
+    src = dst = str;
+    while (*src) {
+        bytes = Tcl_UtfToUniChar(src, &ch);
+	upChar = Tcl_UniCharToUpper(ch);
+
+	/*
+	 * To keep badly formed Utf strings from getting inflated by
+	 * the conversion (thereby causing a segfault), only copy the
+	 * upper case char to dst if its size is <= the original char.
+	 */
+	
+	if (bytes < UtfCount(upChar)) {
+	    memcpy(dst, src, (size_t) bytes);
+	    dst += bytes;
+	} else {
+	    dst += Tcl_UniCharToUtf(upChar, dst);
+	}
+	src += bytes;
+    }
+    *dst = '\0';
+    return (dst - str);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfToLower --
+ *
+ *	Convert uppercase characters to lowercase characters in a UTF
+ *	string in place.  The conversion may shrink the UTF string.
+ *
+ * Results:
+ *	Returns the number of bytes in the resulting string
+ *	excluding the trailing null.
+ *
+ * Side effects:
+ *	Writes a terminating null after the last converted character.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UtfToLower(str)
+    char *str;			/* String to convert in place. */
+{
+    Tcl_UniChar ch, lowChar;
+    char *src, *dst;
+    int bytes;
+    
+    /*
+     * Iterate over the string until we hit the terminating null.
+     */
+
+    src = dst = str;
+    while (*src) {
+	bytes = Tcl_UtfToUniChar(src, &ch);
+	lowChar = Tcl_UniCharToLower(ch);
+
+	/*
+	 * To keep badly formed Utf strings from getting inflated by
+	 * the conversion (thereby causing a segfault), only copy the
+	 * lower case char to dst if its size is <= the original char.
+	 */
+	
+	if (bytes < UtfCount(lowChar)) {
+	    memcpy(dst, src, (size_t) bytes);
+	    dst += bytes;
+	} else {
+	    dst += Tcl_UniCharToUtf(lowChar, dst);
+	}
+	src += bytes;
+    }
+    *dst = '\0';
+    return (dst - str);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfToTitle --
+ *
+ *	Changes the first character of a UTF string to title case or
+ *	uppercase and the rest of the string to lowercase.  The
+ *	conversion happens in place and may shrink the UTF string.
+ *
+ * Results:
+ *	Returns the number of bytes in the resulting string
+ *	excluding the trailing null.
+ *
+ * Side effects:
+ *	Writes a terminating null after the last converted character.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UtfToTitle(str)
+    char *str;			/* String to convert in place. */
+{
+    Tcl_UniChar ch, titleChar, lowChar;
+    char *src, *dst;
+    int bytes;
+    
+    /*
+     * Capitalize the first character and then lowercase the rest of the
+     * characters until we get to a null.
+     */
+
+    src = dst = str;
+
+    if (*src) {
+	bytes = Tcl_UtfToUniChar(src, &ch);
+	titleChar = Tcl_UniCharToTitle(ch);
+
+	if (bytes < UtfCount(titleChar)) {
+	    memcpy(dst, src, (size_t) bytes);
+	    dst += bytes;
+	} else {
+	    dst += Tcl_UniCharToUtf(titleChar, dst);
+	}
+	src += bytes;
+    }
+    while (*src) {
+	bytes = Tcl_UtfToUniChar(src, &ch);
+	lowChar = Tcl_UniCharToLower(ch);
+
+	if (bytes < UtfCount(lowChar)) {
+	    memcpy(dst, src, (size_t) bytes);
+	    dst += bytes;
+	} else {
+	    dst += Tcl_UniCharToUtf(lowChar, dst);
+	}
+	src += bytes;
+    }
+    *dst = '\0';
+    return (dst - str);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfNcmp --
+ *
+ *	Compare at most n UTF chars of string cs to string ct.  Both cs
+ *	and ct are assumed to be at least n UTF chars long.
+ *
+ * Results:
+ *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UtfNcmp(cs, ct, n)
+    CONST char *cs;		/* UTF string to compare to ct. */
+    CONST char *ct;		/* UTF string cs is compared to. */
+    unsigned long n;		/* Number of UTF chars to compare. */
+{
+    Tcl_UniChar ch1, ch2;
+    /*
+     * Another approach that should work is:
+     *   return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));
+     * That assumes that ct is a properly formed UTF, so we will just
+     * be comparing the bytes that compromise those strings to the
+     * char length n.
+     */
+    while (n-- > 0) {
+	/*
+	 * n must be interpreted as chars, not bytes.
+	 * This should be called only when both strings are of
+	 * at least n chars long (no need for \0 check)
+	 */
+	cs += Tcl_UtfToUniChar(cs, &ch1);
+	ct += Tcl_UtfToUniChar(ct, &ch2);
+	if (ch1 != ch2) {
+	    return (ch1 - ch2);
+	}
+    }
+    return 0;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UtfNcasecmp --
+ *
+ *	Compare at most n UTF chars of string cs to string ct case
+ *	insensitive.  Both cs and ct are assumed to be at least n
+ *	UTF chars long.
+ *
+ * Results:
+ *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UtfNcasecmp(cs, ct, n)
+    CONST char *cs;		/* UTF string to compare to ct. */
+    CONST char *ct;		/* UTF string cs is compared to. */
+    unsigned long n;			/* Number of UTF chars to compare. */
+{
+    Tcl_UniChar ch1, ch2;
+    while (n-- > 0) {
+	/*
+	 * n must be interpreted as chars, not bytes.
+	 * This should be called only when both strings are of
+	 * at least n chars long (no need for \0 check)
+	 */
+	cs += Tcl_UtfToUniChar(cs, &ch1);
+	ct += Tcl_UtfToUniChar(ct, &ch2);
+	if (ch1 != ch2) {
+	    ch1 = Tcl_UniCharToLower(ch1);
+	    ch2 = Tcl_UniCharToLower(ch2);
+	    if (ch1 != ch2) {
+		return (ch1 - ch2);
+	    }
+	}
+    }
+    return 0;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharToUpper --
+ *
+ *	Compute the uppercase equivalent of the given Unicode character.
+ *
+ * Results:
+ *	Returns the uppercase Unicode character.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+Tcl_UniChar
+Tcl_UniCharToUpper(ch)
+    int ch;			/* Unicode character to convert. */
+{
+    int info = GetUniCharInfo(ch);
+
+    if (GetCaseType(info) & 0x04) {
+	return (Tcl_UniChar) (ch - GetDelta(info));
+    } else {
+	return ch;
+    }
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharToLower --
+ *
+ *	Compute the lowercase equivalent of the given Unicode character.
+ *
+ * Results:
+ *	Returns the lowercase Unicode character.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+Tcl_UniChar
+Tcl_UniCharToLower(ch)
+    int ch;			/* Unicode character to convert. */
+{
+    int info = GetUniCharInfo(ch);
+
+    if (GetCaseType(info) & 0x02) {
+	return (Tcl_UniChar) (ch + GetDelta(info));
+    } else {
+	return ch;
+    }
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharToTitle --
+ *
+ *	Compute the titlecase equivalent of the given Unicode character.
+ *
+ * Results:
+ *	Returns the titlecase Unicode character.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+Tcl_UniChar
+Tcl_UniCharToTitle(ch)
+    int ch;			/* Unicode character to convert. */
+{
+    int info = GetUniCharInfo(ch);
+    int mode = GetCaseType(info);
+
+    if (mode & 0x1) {
+	/*
+	 * Subtract or add one depending on the original case.
+	 */
+
+	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
+    } else if (mode == 0x4) {
+	return (Tcl_UniChar) (ch - GetDelta(info));
+    } else {
+	return ch;
+    }
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharLen --
+ *
+ *	Find the length of a UniChar string.  The str input must be null
+ *	terminated.
+ *
+ * Results:
+ *	Returns the length of str in UniChars (not bytes).
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharLen(str)
+    Tcl_UniChar *str;		/* Unicode string to find length of. */
+{
+    int len = 0;
+    
+    while (*str != '\0') {
+	len++;
+	str++;
+    }
+    return len;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharNcmp --
+ *
+ *	Compare at most n unichars of string cs to string ct.  Both cs
+ *	and ct are assumed to be at least n unichars long.
+ *
+ * Results:
+ *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharNcmp(cs, ct, n)
+    CONST Tcl_UniChar *cs;		/* Unicode string to compare to ct. */
+    CONST Tcl_UniChar *ct;		/* Unicode string cs is compared to. */
+    unsigned long n;			/* Number of unichars to compare. */
+{
+    for ( ; n != 0; n--, cs++, ct++) {
+	if (*cs != *ct) {
+	    return *cs - *ct;
+	}
+	if (*cs == '\0') {
+	    break;
+	}
+    }
+    return 0;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsAlnum --
+ *
+ *	Test if a character is an alphanumeric Unicode character.
+ *
+ * Results:
+ *	Returns 1 if character is alphanumeric.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsAlnum(ch)
+    int ch;			/* Unicode character to test. */
+{
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+
+    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsAlpha --
+ *
+ *	Test if a character is an alphabetic Unicode character.
+ *
+ * Results:
+ *	Returns 1 if character is alphabetic.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsAlpha(ch)
+    int ch;			/* Unicode character to test. */
+{
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+    return ((ALPHA_BITS >> category) & 1);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsControl --
+ *
+ *	Test if a character is a Unicode control character.
+ *
+ * Results:
+ *	Returns non-zero if character is a control.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsControl(ch)
+    int ch;			/* Unicode character to test. */
+{
+    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsDigit --
+ *
+ *	Test if a character is a numeric Unicode character.
+ *
+ * Results:
+ *	Returns non-zero if character is a digit.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsDigit(ch)
+    int ch;			/* Unicode character to test. */
+{
+    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
+	    == DECIMAL_DIGIT_NUMBER);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsGraph --
+ *
+ *	Test if a character is any Unicode print character except space.
+ *
+ * Results:
+ *	Returns non-zero if character is printable, but not space.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsGraph(ch)
+    int ch;			/* Unicode character to test. */
+{
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+    return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsLower --
+ *
+ *	Test if a character is a lowercase Unicode character.
+ *
+ * Results:
+ *	Returns non-zero if character is lowercase.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsLower(ch)
+    int ch;			/* Unicode character to test. */
+{
+    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsPrint --
+ *
+ *	Test if a character is a Unicode print character.
+ *
+ * Results:
+ *	Returns non-zero if character is printable.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsPrint(ch)
+    int ch;			/* Unicode character to test. */
+{
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+    return ((PRINT_BITS >> category) & 1);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsPunct --
+ *
+ *	Test if a character is a Unicode punctuation character.
+ *
+ * Results:
+ *	Returns non-zero if character is punct.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsPunct(ch)
+    int ch;			/* Unicode character to test. */
+{
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+    return ((PUNCT_BITS >> category) & 1);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsSpace --
+ *
+ *	Test if a character is a whitespace Unicode character.
+ *
+ * Results:
+ *	Returns non-zero if character is a space.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsSpace(ch)
+    int ch;			/* Unicode character to test. */
+{
+    register int category;
+
+    /*
+     * If the character is within the first 127 characters, just use the
+     * standard C function, otherwise consult the Unicode table.
+     */
+
+    if (ch < 0x80) {
+	return isspace(UCHAR(ch)); /* INTL: ISO space */
+    } else {
+	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+	return ((SPACE_BITS >> category) & 1);
+    }
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsUpper --
+ *
+ *	Test if a character is a uppercase Unicode character.
+ *
+ * Results:
+ *	Returns non-zero if character is uppercase.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsUpper(ch)
+    int ch;			/* Unicode character to test. */
+{
+    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
+ * Tcl_UniCharIsWordChar --
+ *
+ *	Test if a character is alphanumeric or a connector punctuation
+ *	mark.
+ *
+ * Results:
+ *	Returns 1 if character is a word character.
+ *
+ * Side effects:
+ *	None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+Tcl_UniCharIsWordChar(ch)
+    int ch;			/* Unicode character to test. */
+{
+    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
+
+    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
+}
author	Ian Roxborough <irox@redhat.com>	2001-09-09 22:40:53 +0000
committer	Ian Roxborough <irox@redhat.com>	2001-09-09 22:40:53 +0000
commit	a850c17c374d03259483e799b09326d255e17487 (patch)
tree	f1d024951a993f0453aa49d4ba808d6c38fa4321 /tcl/generic/tclUtf.c
parent	57e8350a3895a1579b77cc134d6d7d49b056678e (diff)
download	gdb-a850c17c374d03259483e799b09326d255e17487.tar.gz