diff options
author | Aaron M. Renn <arenn@urbanophile.com> | 1998-10-21 01:58:39 +0000 |
---|---|---|
committer | Aaron M. Renn <arenn@urbanophile.com> | 1998-10-21 01:58:39 +0000 |
commit | 9ba75f0f14aea2be89299da929687837f680612a (patch) | |
tree | bd4253eb58fccb0d7d1132e7481a89eb5f2cecaa /java/text/RuleBasedCollator.java | |
parent | 97c4e0b3cef28e94eca3f63952f8221f449f86f9 (diff) | |
download | classpath-9ba75f0f14aea2be89299da929687837f680612a.tar.gz |
Initial Checkin
Diffstat (limited to 'java/text/RuleBasedCollator.java')
-rw-r--r-- | java/text/RuleBasedCollator.java | 545 |
1 files changed, 545 insertions, 0 deletions
diff --git a/java/text/RuleBasedCollator.java b/java/text/RuleBasedCollator.java new file mode 100644 index 000000000..31a3962f0 --- /dev/null +++ b/java/text/RuleBasedCollator.java @@ -0,0 +1,545 @@ +/************************************************************************* +/* RuleBasedCollator.java -- Concrete Collator Class +/* +/* Copyright (c) 1998 Free Software Foundation, Inc. +/* Written by Aaron M. Renn (arenn@urbanophile.com) +/* +/* This library is free software; you can redistribute it and/or modify +/* it under the terms of the GNU Library General Public License as published +/* by the Free Software Foundation, either version 2 of the License, or +/* (at your option) any later verion. +/* +/* This library is distributed in the hope that it will be useful, but +/* WITHOUT ANY WARRANTY; without even the implied warranty of +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +/* GNU Library General Public License for more details. +/* +/* You should have received a copy of the GNU Library General Public License +/* along with this library; if not, write to the Free Software Foundation +/* Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307 USA +/*************************************************************************/ + +package java.text; + +import java.util.Vector; + +/** + * This class is a concrete subclass of <code>Collator</code> suitable + * for string collation in a wide variety of languages. An instance of + * this class is normally returned by the <code>getInstance</code> method + * of <code>Collator</code> with rules predefined for the requested + * locale. However, an instance of this class can be created manually + * with any desired rules. + * <p> + * Rules take the form of a <code>String</code> with the following syntax + * <ul> + * <li> Modifier: '@' + * <li> Relation: '<' | ';' | ',' | '=' : <text> + * <li> Reset: '&' : <text> + * </ul> + * The modifier character indicates that accents sort backward as is the + * case with French. The relational operators specify how the text + * argument relates to the previous term. The relation characters have + * the following meanings: + * <ul> + * <li>'<' - The text argument is greater than the prior term at the primary + * difference level. + * <li>';' - The text argument is greater than the prior term at the secondary + * difference level. + * <li>',' - The text argument is greater than the prior term at the tertiary + * difference level. + * <li>'=' - The text argument is equal to the prior term + * </ul> + * <p> + * As for the text argument itself, this is any sequence of Unicode + * characters not in the following ranges: 0x0009-0x000D, 0x0020-0x002F, + * 0x003A-0x0040, 0x005B-0x0060, and 0x007B-0x007E. If these characters are + * desired, they must be enclosed in single quotes. If any whitespace is + * encountered, it is ignored. (For example, "a b" is equal to "ab"). + * <p> + * The reset operation inserts the following rule at the point where the + * text argument to it exists in the previously declared rule string. This + * makes it easy to add new rules to an existing string by simply including + * them in a reset sequence at the end. Note that the text argument, or + * at least the first character of it, must be present somewhere in the + * previously declared rules in order to be inserted properly. If this + * is not satisfied, a <code>ParseException</code> will be thrown. + * <p> + * This system of configuring <code>RuleBasedCollator</code> is needlessly + * complex and the people at Taligent who developed it (along with the folks + * at Sun who accepted it into the Java standard library) deserve a slow + * and agonizing death. + * <p> + * Here are a couple of example of rule strings: + * <p> + * "< a < b < c" - This string says that a is greater than b which is + * greater than c, with all differences being primary differences. + * <p> + * "< a,A < b,B < c,C" - This string says that 'A' is greater than 'a' with + * a tertiary strength comparison. Both 'b' and 'B' are greater than 'a' and + * 'A' during a primary strength comparison. But 'B' is greater than 'b' + * under a tertiary strength comparison. + * <p> + * "< a < c & a < b " - This sequence is identical in function to the + * "< a < b < c" rule string above. The '&' reset symbol indicates that + * the rule "< b" is to be inserted after the text argument "a" in the + * previous rule string segment. + * <p> + * "< a < b & y < z" - This is an error. The character 'y' does not appear + * anywhere in the previous rule string segment so the rule following the + * reset rule cannot be inserted. + * <p> + * For a description of the various comparison strength types, see the + * documentation for the <code>Collator</code> class. + * <p> + * As an additional complication to this already overly complex rule scheme, + * if any characters precede the first rule, these characters are considered + * ignorable. They will be treated as if they did not exist during + * comparisons. For example, "- < a < b ..." would make '-' an ignorable + * character such that the strings "high-tech" and "hightech" would + * be considered identical. + * <p> + * A <code>ParseException</code> will be thrown for any of the following + * conditions: + * <ul> + * <li>Unquoted punctuation characters in a text argument. + * <li>A relational or reset operator not followed by a text argument + * <li>A reset operator where the text argument is not present in + * the previous rule string section. + * </ul> + * + * @version 0.0 + * + * @author Aaron M. Renn (arenn@urbanophile.com) + */ +public class RuleBasedCollator extends Collator +{ + +/* + * Inner Classes + */ + +class CollationElement +{ +String char_seq; +int primary; +short secondary; +short tertiary; + +CollationElement(String char_seq, int primary, short secondary, short tertiary) +{ + this.char_seq = char_seq; + this.primary = primary; + this.secondary = secondary; + this.tertiary = tertiary; +} + +} // inner class CollationElement + +/*************************************************************************/ + +/* + * Instance Variables + */ + +/** + * This the the original rule string. + */ +private String rules; + +/** + * This is the table of collation element values + */ +private Object[] ce_table; + +/*************************************************************************/ + +/* + * Constructors + */ + +/** + * This method initializes a new instance of <code>RuleBasedCollator</code> + * with the specified collation rules. Note that an application normally + * obtains an instance of <code>RuleBasedCollator</code> by calling the + * <code>getInstance</code> method of <code>Collator</code>. That method + * automatically loads the proper set of rules for the desired locale. + * + * @param rules The collation rule string. + * + * @exception ParseException If the rule string contains syntax errors. + */ +public +RuleBasedCollator(String rules) throws ParseException +{ + this.rules = rules; + + if (rules.equals("")) + throw new IllegalArgumentException("Empty rule set"); + + Vector v = new Vector(); + boolean ignore_chars = true; + int primary_seq = 0; + short secondary_seq = 0; + short tertiary_seq = 0; + StringBuffer sb = new StringBuffer(""); + for (int i = 0; i < rules.length(); i++) + { + char c = rules.charAt(i); + + // Check if it is a whitespace character + if (((c >= 0x09) && (c <= 0x0D)) || (c == 0x20)) + continue; + + // Primary difference + if (c == '<') + { + ignore_chars = false; + secondary_seq = 0; + tertiary_seq = 0; + ++primary_seq; + + CollationElement e = new CollationElement(sb.toString(), primary_seq, + secondary_seq, + tertiary_seq); + v.add(e); + sb.setLength(0); + } + + // Secondary difference + if (c == ';') + { + if (primary_seq == 0) + throw new ParseException(rules, i); + + ++secondary_seq; + tertiary_seq = 0; + + CollationElement e = new CollationElement(sb.toString(), primary_seq, + secondary_seq, + tertiary_seq); + v.add(e); + sb.setLength(0); + } + + // Tertiary difference + if (c == ',') + { + if (primary_seq == 0) + throw new ParseException(rules, i); + + ++tertiary_seq; + + CollationElement e = new CollationElement(sb.toString(), primary_seq, + secondary_seq, + tertiary_seq); + v.add(e); + sb.setLength(0); + } + + // Is equal to + if (c == '=') + { + if (primary_seq == 0) + throw new ParseException(rules, i); + + CollationElement e = new CollationElement(sb.toString(), primary_seq, + secondary_seq, + tertiary_seq); + v.add(e); + sb.setLength(0); + } + + // Sort accents backwards + if (c == '@') + { + throw new ParseException("French style accents not implemented yet", 0); + } + + // Reset command + if (c == '&') + { + throw new ParseException("Reset not implemented yet", 0); + } + + // See if we are still reading characters to skip + if (ignore_chars == true) + { + CollationElement e = new CollationElement(c + "", 0, (short)0, + (short)0); + v.add(e); + } + + sb.append(c); + } + + ce_table = v.toArray(); +} + +/*************************************************************************/ + +/* + * Instance Methods + */ + +/** + * This method returns a <code>String</code> containing the collation rules + * for this object. + * + * @return The collation rules for this object. + */ +public String +getRules() +{ + return(rules); +} + +/*************************************************************************/ + +/** + * This method calculates the collation element value for the specified + * character(s). + */ +int +getCollationElementValue(String str) +{ + CollationElement e = null; + + // The table is sorted. Change to a binary search later. + for (int i = 0; i < ce_table.length; i++) + if (((CollationElement)ce_table[i]).char_seq.equals(str)) + { + e = (CollationElement)ce_table[i]; + break; + } + + if (e == null) + e = new CollationElement(str, 0xFFFF, (short)0xFF, (short)0xFF); + + int retval = (e.primary << 16) + (e.secondary << 8) + e.tertiary; + + return(retval); +} + +/*************************************************************************/ + +/** + * This method returns an instance for <code>CollationElementIterator</code> + * for the specified <code>String</code> under the collation rules for this + * object. + * + * @param str The <code>String</code> to return the <code>CollationElementIterator</code> instance for. + * + * @return A <code>CollationElementIterator</code> for the specified <code>String</code>. + */ +public CollationElementIterator +getCollationElementIterator(String str) +{ + return(new CollationElementIterator(this, str)); +} + +/*************************************************************************/ + +/** + * This method returns an instance of <code>CollationElementIterator</code> + * for the <code>String</code> represented by the specified + * <code>CharacterIterator</code>. + * + * @param ci The <code>CharacterIterator</code> with the desired <code>String</code>. + * + * @return A <code>CollationElementIterator</code> for the specified <code>String</code>. + */ +public CollationElementIterator +getCollationElementIterator(CharacterIterator ci) +{ + StringBuffer sb = new StringBuffer(""); + + // Right now we assume that we will read from the beginning of the string. + char c = ci.first(); + while (c != CharacterIterator.DONE) + { + sb.append(c); + c = ci.next(); + } + + return(getCollationElementIterator(sb.toString())); +} + +/*************************************************************************/ + +/** + * This method returns an integer which indicates whether the first + * specified <code>String</code> is less than, greater than, or equal to + * the second. The value depends not only on the collation rules in + * effect, but also the strength and decomposition settings of this object. + * + * @param s1 The first <code>String</code> to compare. + * @param s2 A second <code>String</code> to compare to the first. + * + * @return A negative integer if s1 < s2, a positive integer if s1 > s2, or 0 if s1 == s2. + */ +public int +compare(String s1, String s2) +{ + CollationElementIterator cei1 = getCollationElementIterator(s1); + CollationElementIterator cei2 = getCollationElementIterator(s2); + + for(;;) + { + int ord1 = cei1.next(); + int ord2 = cei2.next(); + + // Check for end of string + if (ord1 == CollationElementIterator.NULLORDER) + if (ord2 == CollationElementIterator.NULLORDER) + return(0); + else + return(-1); + else if (ord2 == CollationElementIterator.NULLORDER) + return(1); + + // We know chars are totally equal, so skip + if (ord1 == ord2) + continue; + + // Check for primary strength differences + int prim1 = cei1.primaryOrder(ord1); + int prim2 = cei2.primaryOrder(ord2); + + if (prim1 < prim2) + return(-1); + else if (prim1 > prim2) + return(1); + else if (getStrength() == PRIMARY) + continue; + + // Check for secondary strength differences + int sec1 = cei1.secondaryOrder(ord1); + int sec2 = cei2.secondaryOrder(ord2); + + if (sec1 < sec2) + return(-1); + else if (sec1 > sec2) + return(1); + else if (getStrength() == SECONDARY) + continue; + + // Check for tertiary differences + int tert1 = cei1.tertiaryOrder(ord1); + int tert2 = cei2.tertiaryOrder(ord1); + + if (tert1 < tert2) + return(-1); + else if (tert1 > tert2) + return(1); + } +} + +/*************************************************************************/ + +/** + * This method returns an instance of <code>CollationKey</code> for the + * specified <code>String</code>. The object returned will have a + * more efficient mechanism for its comparison function that could + * provide speed benefits if multiple comparisons are performed, such + * as during a sort. + * + * @param str The <code>String</code> to create a <code>CollationKey</code> for. + * + * @return A <code>CollationKey</code> for the specified <code>String</code>. + */ +public CollationKey +getCollationKey(String str) +{ + CollationElementIterator cei = getCollationElementIterator(str); + Vector vect = new Vector(25); + + int ord = cei.next(); + while (ord != CollationElementIterator.NULLORDER) + { + switch (getStrength()) + { + case PRIMARY: + ord = cei.primaryOrder(ord); + break; + + case SECONDARY: + ord = cei.secondaryOrder(ord); + + default: + break; + } + + vect.add(new Integer(ord)); + } + + Object[] objarr = vect.toArray(); + byte[] key = new byte[objarr.length * 4]; + + for (int i = 0; i < key.length; i++) + { + int j = ((Integer)objarr[i]).intValue(); + key[i++] = (byte)((j & 0xFF000000) >> 24); + key[i++] = (byte)((j & 0x00FF0000) >> 16); + key[i++] = (byte)((j & 0x0000FF00) >> 8); + key[i++] = (byte)(j & 0x000000FF); + } + + return(new CollationKey(this, str, key)); +} + +/*************************************************************************/ + +/** + * This method tests this object for equality against the specified + * object. This will be true if and only if the specified object is + * another reference to this object. + * + * @param obj The <code>Object</code> to compare against this object. + * + * @return <code>true</code> if the specified object is equal to this object, <code>false</code> otherwise. + */ +public boolean +equals(Object obj) +{ + if (obj == this) + return(true); + else + return(false); +} + +/*************************************************************************/ + +/** + * This method returns a hash value for this object. + * + * @return A hash value for this object. + */ +public int +hashCode() +{ + return(System.identityHashCode(this)); +} + +/*************************************************************************/ + +/** + * This method creates a copy of this object. + * + * @return A copy of this object. + */ +public Object +clone() +{ + try + { + return(super.clone()); + } + catch(CloneNotSupportedException e) + { + return(null); + } +} + +} // class RuleBasedCollator + |