diff options
author | Eric Blake <ebb9@byu.net> | 2002-02-18 20:07:17 +0000 |
---|---|---|
committer | Eric Blake <ebb9@byu.net> | 2002-02-18 20:07:17 +0000 |
commit | 26cbbcfda33083544d135e59ccfae8206557e2ab (patch) | |
tree | dd7029e930baad02fb7fe7a93aabecce1df6dada /java/lang/Character.java | |
parent | 45940e02e46d2404145e67629b8c2f9641ede6d0 (diff) | |
download | classpath-26cbbcfda33083544d135e59ccfae8206557e2ab.tar.gz |
2002-02-18 Eric Blake <ebb9@email.byu.edu>
* doc/unicode/unicode-blocks.pl: Minor updates, to avoid warnings.
* doc/unicode/unicode-muncher.pl: Rewrite, to incorporate ideas
from Artur Biesiadowski: use a 2-level lookup of fixed-length
blocks instead of a 1-level lookup of variable-length blocks, and
store the data in String literals in a Java interface instead of
in binary files.
* doc/unicode/unicode.database.format: Remove, as
gnu/java/lang/CharData is self-documenting.
* gnu/java/locale/block.uni: Remove, replaced by
gnu/java/lang/CharData.java.
* gnu/java/locale/character.uni: Ditto.
* gnu/java/locale/titlecase.uni: Ditto.
* gnu/java/lang/Makefile.am (EXTRA_DIST): Add CharData.java.
* gnu/java/lang/CharData.java: New file, holding the Unicode
database for java.lang.Character.
* java/lang/Character.java (blocks, data, numValue, upper, lower),
(direction, title): Replace blocks, tcs, and unicodeData as the
Unicode database used in all other methods; initialized by
gnu.java.lang.CharData.
(<clinit>, getBlock, class Block, class CharAttr): Delete;
character attribute lookup now uses char[] instead of objects.
(readChar): Update to use new database.
(Various others): Reduce multiple comparisons to just one when
checking a character's type.
Diffstat (limited to 'java/lang/Character.java')
-rw-r--r-- | java/lang/Character.java | 454 |
1 files changed, 186 insertions, 268 deletions
diff --git a/java/lang/Character.java b/java/lang/Character.java index fec84b683..732a26400 100644 --- a/java/lang/Character.java +++ b/java/lang/Character.java @@ -39,21 +39,15 @@ exception statement from your version. */ package java.lang; import java.io.Serializable; -import java.io.File; -import java.io.DataInputStream; -import java.io.FileInputStream; -import java.io.RandomAccessFile; -import java.io.IOException; -import gnu.java.lang.ClassLoaderHelper; +import gnu.java.lang.CharData; /** - * Wrapper class for the primitive char data type. In addition, - * this class allows one to retrieve property information and - * perform transformations on the 57,707 defined characters in the - * Unicode Standard, Version 3.0.0. java.lang.Character is designed - * to be very dynamic, and as such, it retrieves information on - * the Unicode character set from a separate database, which - * can be easily upgraded. + * Wrapper class for the primitive char data type. In addition, this class + * allows one to retrieve property information and perform transformations + * on the 57,707 defined characters in the Unicode Standard, Version 3.0.0. + * java.lang.Character is designed to be very dynamic, and as such, it + * retrieves information on the Unicode character set from a separate + * database, gnu.java.lang.CharData, which can be easily upgraded. * * <p>For predicates, boundaries are used to describe * the set of characters for which the method will return true. @@ -90,6 +84,7 @@ public final class Character implements Serializable, Comparable */ protected Subset(String name) { + // Note that name.toString() is name, unless name was null. this.name = name.toString(); } @@ -132,8 +127,9 @@ public final class Character implements Serializable, Comparable * A family of character subsets in the Unicode specification. A character * is in at most one of these blocks. * - * This class was generated by doc/unicode/unicode-blocks.pl. + * This inner class was generated by doc/unicode/unicode-blocks.pl. * + * @author doc/unicode/unicode-blocks.pl (written by Eric Blake) * @since 1.2 */ public static final class UnicodeBlock extends Subset @@ -1379,151 +1375,88 @@ public final class Character implements Serializable, Comparable */ public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; - /** Stores unicode blocks in gnu/java/locale/block.uni. */ - private static final Block blocks[]; - - /** Stores titlecase information in gnu/java/locale/titlecase.uni. */ - private static final char tcs[][]; + /** + * Stores unicode block offset lookup table. Exploit package visibility of + * String.value to avoid copying the array. + * @see #readChar(char) + * @see CharData#BLOCKS + */ + private static final char[] blocks = CharData.BLOCKS.value; - /** Stores unicode data in gnu/java/locale/character.uni. */ - private static final byte[] unicodeData; + /** + * Stores unicode attribute offset lookup table. Exploit package visibility + * of String.value to avoid copying the array. + * @see CharData#DATA + */ + private static final char[] data = CharData.DATA.value; - /** Caches the most recently used CharAttr. */ - private static CharAttr cachedCharAttr; + /** + * Stores unicode numeric value attribute table. Exploit package visibility + * of String.value to avoid copying the array. + * @see CharData#NUM_VALUE + */ + private static final char[] numValue = CharData.NUM_VALUE.value; - /** The size of a block in gnu/java/locale/block.uni. */ - private static final int BLOCK_SIZE = 6; + /** + * Stores unicode uppercase attribute table. Exploit package visibility + * of String.value to avoid copying the array. + * @see CharData#UPPER + */ + private static final char[] upper = CharData.UPPER.value; - /** The size of an attribute in gnu/java/locale/character.uni. */ - private static final int ATTR_SIZE = 8; + /** + * Stores unicode lowercase attribute table. Exploit package visibility + * of String.value to avoid copying the array. + * @see CharData#LOWER + */ + private static final char[] lower = CharData.LOWER.value; /** - * Open up the Unicode attribute database and read the index into memory. - * These files were generated by doc/unicode/unicode-muncher.pl. + * Stores unicode direction attribute table. Exploit package visibility + * of String.value to avoid copying the array. + * @see CharData#DIRECTION */ - static - { - File cFile = ClassLoaderHelper.getSystemResourceAsFile( - "/gnu/java/locale/character.uni"); - File blockFile = ClassLoaderHelper.getSystemResourceAsFile( - "/gnu/java/locale/block.uni"); - File tcFile = ClassLoaderHelper.getSystemResourceAsFile( - "/gnu/java/locale/titlecase.uni"); - if (cFile == null || blockFile == null || tcFile == null) - throw new InternalError("Cannot locate Unicode attribute database."); - - blocks = new Block[(int) blockFile.length() / BLOCK_SIZE]; - try - { - DataInputStream blockIS - = new DataInputStream(new FileInputStream(blockFile)); - for (int i = 0; i < blocks.length; i++) - { - char start = blockIS.readChar(); - char end = blockIS.readChar(); - short offset = blockIS.readShort(); - blocks[i] = new Block(start, end, offset); - } - } - catch (IOException e) - { - throw new InternalError("Error reading block file: " + e); - } + private static final char[] direction = CharData.DIRECTION.value; - tcs = new char[(int) tcFile.length() / 4][2]; - try - { - DataInputStream tcIS - = new DataInputStream(new FileInputStream(tcFile)); - for (int i = 0; i < tcs.length; i++) - { - tcs[i][0] = tcIS.readChar(); - tcs[i][1] = tcIS.readChar(); - } - } - catch (IOException e) - { - throw new InternalError("Error reading titlecase file: " + e); - } + /** + * Stores unicode titlecase table. Exploit package visibility of + * String.value to avoid copying the array. + * @see CharData#TITLE + */ + private static final char[] title = CharData.TITLE.value; - try - { - RandomAccessFile charFile = new RandomAccessFile(cFile, "r"); - unicodeData = new byte[(int) charFile.length()]; - charFile.readFully(unicodeData, 0, unicodeData.length); - } - catch (IOException e) - { - throw new InternalError("Error reading Unicode attribute file: " - + e); - } + /** + * Mask for grabbing the type out of the contents of data. + * @see CharData#DATA + */ + private static final int TYPE_MASK = 0x1F; - cachedCharAttr = new CharAttr('\0', false, CONTROL, -1, '\0', '\0', - DIRECTIONALITY_BOUNDARY_NEUTRAL, false); - } + /** Mask for grabbing the non-breaking space flag out of the contents of + * data. + * @see CharData#DATA + */ + private static final int NO_BREAK_MASK = 0x20; - /** - * Grabs a character out of the Unicode attribute database. See - * doc/unicode/unicode.database.format for details of the fields. - * - * @param ch the character to look up - * @return the character's CharAttr + /** Mask for grabbing the mirrored directionality flag out of the contents + * of data. + * @see CharData#DATA */ - private static CharAttr readChar(char ch) - { - CharAttr cached = cachedCharAttr; - if (cached.ch == ch) - return cached; - - Block b = getBlock(ch); - if (b == null) - return cachedCharAttr = new CharAttr(ch, false, UNASSIGNED, -1, ch, ch, - DIRECTIONALITY_UNDEFINED, false); - int offset = b.compressed ? b.offset - : (b.offset + (ch - b.start) * ATTR_SIZE); - byte flags = unicodeData[offset]; - boolean noBreakSpace = (flags & 0x20) == 0x20; - byte category = (byte) (flags & 0x1F); - // numericValue, uppercase, and lowercase are signed in unicode.uni. - int numericValue = (unicodeData[offset + 1] << 8) - | (unicodeData[offset + 2] & 0xFF); - char uppercase = (char) (ch - ((unicodeData[offset + 3] << 8) - | (unicodeData[offset + 4] & 0xFF))); - char lowercase = (char) (ch - ((unicodeData[offset + 5] << 8) - | (unicodeData[offset + 6] & 0xFF))); - if (b.compressed && numericValue >= 0) - numericValue += ch - b.start; - byte directionality = unicodeData[offset + 7]; - boolean mirrored = (flags & 0x40) == 0x40; - - return cachedCharAttr = new CharAttr(ch, noBreakSpace, category, - numericValue, uppercase, lowercase, - directionality, mirrored); - } + private static final int MIRROR_MASK = 0x40; /** - * Locates which block a character's information resides in. + * Grabs an attribute offset from the Unicode attribute database. The lower + * 5 bits are the character type, the next 2 bits are flags, and the top + * 9 bits are the offset into the attribute tables. * * @param ch the character to look up - * @return the block, or null + * @return the character's attribute offset and type + * @see CharData#DATA + * @see CharData#SHIFT */ - private static Block getBlock(char ch) + private static char readChar(char ch) { - // Simple binary search. - int low = 0; - int hi = blocks.length - 1; - while (low <= hi) - { - int mid = (low + hi) >> 1; - Block b = blocks[mid]; - if (ch < b.start) - hi = mid - 1; - else if (ch > b.end) - low = mid + 1; - else - return b; - } - return null; + return data[blocks[ch >> CharData.SHIFT] + + (ch & ~(-1 << CharData.SHIFT))]; } /** @@ -1577,6 +1510,8 @@ public final class Character implements Serializable, Comparable */ public String toString() { + // This assumes that String.valueOf(char) can create a single-character + // String more efficiently than through the public API. return String.valueOf(value); } @@ -1589,6 +1524,8 @@ public final class Character implements Serializable, Comparable */ public String toString(char ch) { + // This assumes that String.valueOf(char) can create a single-character + // String more efficiently than through the public API. return String.valueOf(ch); } @@ -1680,7 +1617,7 @@ public final class Character implements Serializable, Comparable */ public static boolean isDefined(char ch) { - return getBlock(ch) != null; + return getType(ch) == UNASSIGNED; } /** @@ -1703,8 +1640,12 @@ public final class Character implements Serializable, Comparable */ public static boolean isLetter(char ch) { - int category = getType(ch); - return category >= UPPERCASE_LETTER && category <= OTHER_LETTER; + return ((1 << getType(ch)) + & ((1 << UPPERCASE_LETTER) + | (1 << LOWERCASE_LETTER) + | (1 << TITLECASE_LETTER) + | (1 << MODIFIER_LETTER) + | (1 << OTHER_LETTER))) != 0; } /** @@ -1724,9 +1665,13 @@ public final class Character implements Serializable, Comparable */ public static boolean isLetterOrDigit(char ch) { - int category = getType(ch); - return (category <= OTHER_LETTER && category >= UPPERCASE_LETTER) - || category == DECIMAL_DIGIT_NUMBER; + return ((1 << getType(ch)) + & ((1 << UPPERCASE_LETTER) + | (1 << LOWERCASE_LETTER) + | (1 << TITLECASE_LETTER) + | (1 << MODIFIER_LETTER) + | (1 << OTHER_LETTER) + | (1 << DECIMAL_DIGIT_NUMBER))) != 0; } /** @@ -1790,10 +1735,15 @@ public final class Character implements Serializable, Comparable */ public static boolean isJavaIdentifierStart(char ch) { - int category = getType(ch); - return (category <= OTHER_LETTER && category >= UPPERCASE_LETTER) - || category == LETTER_NUMBER || category == CURRENCY_SYMBOL - || category == CONNECTOR_PUNCTUATION; + return ((1 << getType(ch)) + & ((1 << UPPERCASE_LETTER) + | (1 << LOWERCASE_LETTER) + | (1 << TITLECASE_LETTER) + | (1 << MODIFIER_LETTER) + | (1 << OTHER_LETTER) + | (1 << LETTER_NUMBER) + | (1 << CURRENCY_SYMBOL) + | (1 << CONNECTOR_PUNCTUATION))) != 0; } /** @@ -1818,12 +1768,20 @@ public final class Character implements Serializable, Comparable public static boolean isJavaIdentifierPart(char ch) { int category = getType(ch); - if (category == CONTROL) - return isIdentifierIgnorable(ch); - return (category <= NON_SPACING_MARK && category >= UPPERCASE_LETTER) - || (category <= LETTER_NUMBER && category >= COMBINING_SPACING_MARK) - || category == CURRENCY_SYMBOL || category == CONNECTOR_PUNCTUATION - || category == FORMAT; + return ((1 << category) + & ((1 << UPPERCASE_LETTER) + | (1 << LOWERCASE_LETTER) + | (1 << TITLECASE_LETTER) + | (1 << MODIFIER_LETTER) + | (1 << OTHER_LETTER) + | (1 << NON_SPACING_MARK) + | (1 << COMBINING_SPACING_MARK) + | (1 << DECIMAL_DIGIT_NUMBER) + | (1 << LETTER_NUMBER) + | (1 << CURRENCY_SYMBOL) + | (1 << CONNECTOR_PUNCTUATION) + | (1 << FORMAT))) != 0 + || (category == CONTROL && isIdentifierIgnorable(ch)); } /** @@ -1842,9 +1800,13 @@ public final class Character implements Serializable, Comparable */ public static boolean isUnicodeIdentifierStart(char ch) { - int category = getType(ch); - return (category <= OTHER_LETTER && category >= UPPERCASE_LETTER) - || category == LETTER_NUMBER; + return ((1 << getType(ch)) + & ((1 << UPPERCASE_LETTER) + | (1 << LOWERCASE_LETTER) + | (1 << TITLECASE_LETTER) + | (1 << MODIFIER_LETTER) + | (1 << OTHER_LETTER) + | (1 << LETTER_NUMBER))) != 0; } /** @@ -1868,11 +1830,19 @@ public final class Character implements Serializable, Comparable public static boolean isUnicodeIdentifierPart(char ch) { int category = getType(ch); - if (category == CONTROL) - return isIdentifierIgnorable(ch); - return (category <= NON_SPACING_MARK && category >= UPPERCASE_LETTER) - || (category <= LETTER_NUMBER && category >= COMBINING_SPACING_MARK) - || category == CONNECTOR_PUNCTUATION || category == FORMAT; + return ((1 << category) + & ((1 << UPPERCASE_LETTER) + | (1 << LOWERCASE_LETTER) + | (1 << TITLECASE_LETTER) + | (1 << MODIFIER_LETTER) + | (1 << OTHER_LETTER) + | (1 << NON_SPACING_MARK) + | (1 << COMBINING_SPACING_MARK) + | (1 << DECIMAL_DIGIT_NUMBER) + | (1 << LETTER_NUMBER) + | (1 << CONNECTOR_PUNCTUATION) + | (1 << FORMAT))) != 0 + || (category == CONTROL && isIdentifierIgnorable(ch)); } /** @@ -1893,8 +1863,9 @@ public final class Character implements Serializable, Comparable */ public static boolean isIdentifierIgnorable(char ch) { - return ch <= '\u0008' || (ch <= '\u001B' && ch >= '\u000E') - || (ch <= '\u009F' && ch >= '\u007F') || getType(ch) == FORMAT; + return (ch <= '\u009F' && (ch < '\t' || ch >= '\u007F' + || (ch <= '\u001B' && ch >= '\u000E'))) + || getType(ch) == FORMAT; } /** @@ -1912,7 +1883,8 @@ public final class Character implements Serializable, Comparable */ public static char toLowerCase(char ch) { - return readChar(ch).lowercase; + // Signedness doesn't matter, as result is cast back to char. + return (char) (ch + lower[readChar(ch) >> 7]); } /** @@ -1930,7 +1902,8 @@ public final class Character implements Serializable, Comparable */ public static char toUpperCase(char ch) { - return readChar(ch).uppercase; + // Signedness doesn't matter, as result is cast back to char. + return (char) (ch + upper[readChar(ch) >> 7]); } /** @@ -1947,9 +1920,10 @@ public final class Character implements Serializable, Comparable */ public static char toTitleCase(char ch) { - for (int i = 0; i < tcs.length; i++) - if (tcs[i][0] == ch) - return tcs[i][1]; + // As title is short, it doesn't hurt to exhaustively iterate over it. + for (int i = title.length - 2; i >= 0; i -= 2) + if (title[i] == ch) + return title[i + 1]; return toUpperCase(ch); } @@ -1975,12 +1949,14 @@ public final class Character implements Serializable, Comparable { if (radix < MIN_RADIX || radix > MAX_RADIX) return -1; - CharAttr attr = readChar(ch); - int category = attr.category; - if (category == DECIMAL_DIGIT_NUMBER || category == LOWERCASE_LETTER - || category == UPPERCASE_LETTER) + char attr = readChar(ch); + if (((1 << (attr & TYPE_MASK)) + & ((1 << UPPERCASE_LETTER) + | (1 << LOWERCASE_LETTER) + | (1 << DECIMAL_DIGIT_NUMBER))) != 0) { - int digit = attr.numericValue; + // Signedness doesn't matter; 0xffff vs. -1 are both rejected. + int digit = numValue[attr >> 7]; return (digit >= 0 && digit < radix) ? digit : -1; } return -1; @@ -2015,7 +1991,8 @@ public final class Character implements Serializable, Comparable */ public static int getNumericValue(char ch) { - return readChar(ch).numericValue; + // Treat numValue as signed. + return (short) numValue[readChar(ch) >> 7]; } /** @@ -2033,7 +2010,13 @@ public final class Character implements Serializable, Comparable */ public static boolean isSpace(char ch) { - return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\f'; + // Performing the subtraction up front alleviates need to compare longs. + return ch-- <= ' ' && ((1 << ch) + & ((1 << (' ' - 1)) + | (1 << ('\t' - 1)) + | (1 << ('\n' - 1)) + | (1 << ('\r' - 1)) + | (1 << ('\f' - 1)))) != 0; } /** @@ -2049,8 +2032,10 @@ public final class Character implements Serializable, Comparable */ public static boolean isSpaceChar(char ch) { - int category = readChar(ch).category; - return category >= SPACE_SEPARATOR && category <= PARAGRAPH_SEPARATOR; + return ((1 << getType(ch)) + & ((1 << SPACE_SEPARATOR) + | (1 << LINE_SEPARATOR) + | (1 << PARAGRAPH_SEPARATOR))) != 0; } /** @@ -2072,12 +2057,22 @@ public final class Character implements Serializable, Comparable */ public static boolean isWhitespace(char ch) { - if ((ch <= '\r' && ch >= '\u0009') || (ch <= '\u001F' && ch >= '\u001C')) - return true; - CharAttr attr = readChar(ch); - int category = attr.category; - return (category == SPACE_SEPARATOR && ! attr.noBreakSpace) - || category == LINE_SEPARATOR || category == PARAGRAPH_SEPARATOR; + int attr = readChar(ch); + return ((((1 << (attr & TYPE_MASK)) + & ((1 << SPACE_SEPARATOR) + | (1 << LINE_SEPARATOR) + | (1 << PARAGRAPH_SEPARATOR))) != 0) + && (attr & NO_BREAK_MASK) == 0) + || (ch <= '\u001F' && ((1 << ch) + & ((1 << '\t') + | (1 << '\n') + | (1 << '\u000B') + | (1 << '\u000C') + | (1 << '\r') + | (1 << '\u001C') + | (1 << '\u001D') + | (1 << '\u001E') + | (1 << '\u001F'))) != 0); } /** @@ -2135,7 +2130,7 @@ public final class Character implements Serializable, Comparable */ public static int getType(char ch) { - return readChar(ch).category; + return readChar(ch) & TYPE_MASK; } /** @@ -2191,7 +2186,8 @@ public final class Character implements Serializable, Comparable */ public static byte getDirectionality(char ch) { - return readChar(ch).directionality; + // The result will correctly be signed. + return (byte) direction[readChar(ch) >> 7]; } /** @@ -2205,7 +2201,7 @@ public final class Character implements Serializable, Comparable */ public static boolean isMirrored(char ch) { - return readChar(ch).mirrored; + return (readChar(ch) & MIRROR_MASK) != 0; } /** @@ -2239,82 +2235,4 @@ public final class Character implements Serializable, Comparable { return compareTo((Character) o); } - - /** - * Represents an entry in gnu/java/locale/block.uni. - */ - private static final class Block - { - /** The start character of a compressed block. */ - final char start; - /** The end character of a compressed block. */ - final char end; - /** True if the block represents multiple characters. */ - final boolean compressed; - /** The offset of the block. */ - final int offset; - - /** - * Construct a block. - * @param start the start character - * @param end the end character - * @param offset the offset and compression of the block: the most - * significant bit is the compression, the remainder are the offset - */ - Block(char start, char end, short offset) - { - this.start = start; - this.end = end; - this.compressed = offset < 0; - this.offset = offset & Short.MAX_VALUE; - } - } // class Block - - /** - * Represents all the attributes stored for a character. - */ - private static final class CharAttr - { - /** The character of this attribute. */ - final char ch; - /** If the character is a non-breaking space. */ - final boolean noBreakSpace; - /** The character category. */ - final int category; - /** The character value. */ - final int numericValue; - /** The uppercase version of the character. */ - final char uppercase; - /** The lowercase version of the character. */ - final char lowercase; - /** The directionality of the character. */ - final byte directionality; - /** Whether the character is mirrored. */ - final boolean mirrored; - - /** - * Construct the attributes for a character. - * @param ch the character - * @param noBreakSpace if it is a non-breaking space - * @param category its category - * @param numericValue its value, or -1 (none), -2 (not representable) - * @param uppercase the uppercase version, or '\0' - * @param lowercase the lowercase version, or '\0' - * @param directionality the directionality - * @param mirrored if it is mirrored - */ - CharAttr(char ch, boolean noBreakSpace, byte category, int numericValue, - char uppercase, char lowercase, byte directionality, - boolean mirrored) - { - this.ch = ch; - this.noBreakSpace = noBreakSpace; - this.category = category; - this.numericValue = numericValue; - this.uppercase = uppercase == '\0' ? ch : uppercase; - this.lowercase = lowercase == '\0' ? ch : lowercase; - this.directionality = directionality; - this.mirrored = mirrored; - } - } // class CharAttr } //class Character |