Initial Checkin

author: Aaron M. Renn <arenn@urbanophile.com> 1998-10-21 01:58:39 +0000
committer: Aaron M. Renn <arenn@urbanophile.com> 1998-10-21 01:58:39 +0000
commit: 9ba75f0f14aea2be89299da929687837f680612a (patch)
tree: bd4253eb58fccb0d7d1132e7481a89eb5f2cecaa /java/text/RuleBasedCollator.java
parent: 97c4e0b3cef28e94eca3f63952f8221f449f86f9 (diff)
download: classpath-9ba75f0f14aea2be89299da929687837f680612a.tar.gz
1 files changed, 545 insertions, 0 deletions
diff --git a/java/text/RuleBasedCollator.java b/java/text/RuleBasedCollator.java
new file mode 100644
index 000000000..31a3962f0
--- /dev/null
+++ b/java/text/RuleBasedCollator.java
@@ -0,0 +1,545 @@
+/*************************************************************************
+/* RuleBasedCollator.java -- Concrete Collator Class
+/*
+/* Copyright (c) 1998 Free Software Foundation, Inc.
+/* Written by Aaron M. Renn (arenn@urbanophile.com)
+/*
+/* This library is free software; you can redistribute it and/or modify
+/* it under the terms of the GNU Library General Public License as published 
+/* by the Free Software Foundation, either version 2 of the License, or
+/* (at your option) any later version.
+/*
+/* This library is distributed in the hope that it will be useful, but
+/* WITHOUT ANY WARRANTY; without even the implied warranty of
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/* GNU Library General Public License for more details.
+/*
+/* You should have received a copy of the GNU Library General Public License
+/* along with this library; if not, write to the Free Software Foundation
+/* Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307 USA
+/*************************************************************************/
+
+package java.text;
+
+import java.util.Vector;
+
+/**
+  * This class is a concrete subclass of <code>Collator</code> suitable
+  * for string collation in a wide variety of languages.  An instance of
+  * this class is normally returned by the <code>getInstance</code> method
+  * of <code>Collator</code> with rules predefined for the requested
+  * locale.  However, an instance of this class can be created manually
+  * with any desired rules.
+  * <p>
+  * Rules take the form of a <code>String</code> with the following syntax
+  * <ul>
+  * <li> Modifier: '@' 
+  * <li> Relation: '&lt;' | ';' | ',' | '=' : &lt;text&gt;
+  * <li> Reset: '&amp;' : &lt;text&gt;
+  * </ul>
+  * The modifier character indicates that accents sort backward as is the
+  * case with French.  The relational operators specify how the text 
+  * argument relates to the previous term.  The relation characters have
+  * the following meanings:
+  * <ul>
+  * <li>'<' - The text argument is greater than the prior term at the primary
+  * difference level.
+  * <li>';' - The text argument is greater than the prior term at the secondary
+  * difference level.
+  * <li>',' - The text argument is greater than the prior term at the tertiary
+  * difference level.
+  * <li>'=' - The text argument is equal to the prior term
+  * </ul>
+  * <p>
+  * As for the text argument itself, this is any sequence of Unicode
+  * characters not in the following ranges: 0x0009-0x000D, 0x0020-0x002F,
+  * 0x003A-0x0040, 0x005B-0x0060, and 0x007B-0x007E. If these characters are 
+  * desired, they must be enclosed in single quotes.  If any whitespace is 
+  * encountered, it is ignored.  (For example, "a b" is equal to "ab").  
+  * <p>
+  * The reset operation inserts the following rule at the point where the
+  * text argument to it exists in the previously declared rule string.  This
+  * makes it easy to add new rules to an existing string by simply including
+  * them in a reset sequence at the end.  Note that the text argument, or
+  * at least the first character of it, must be present somewhere in the
+  * previously declared rules in order to be inserted properly.  If this
+  * is not satisfied, a <code>ParseException</code> will be thrown. 
+  * <p>
+  * This system of configuring <code>RuleBasedCollator</code> is needlessly
+  * complex and the people at Taligent who developed it (along with the folks
+  * at Sun who accepted it into the Java standard library) deserve a slow
+  * and agonizing death.
+  * <p>
+  * Here are a couple of examples of rule strings:
+  * <p>
+  * "< a < b < c" - This string says that b is greater than a and c is
+  * greater than b, with all differences being primary differences.
+  * <p>
+  * "< a,A < b,B < c,C" - This string says that 'A' is greater than 'a' with
+  * a tertiary strength comparison.  Both 'b' and 'B' are greater than 'a' and
+  * 'A' during a primary strength comparison.  But 'B' is greater than 'b'
+  * under a tertiary strength comparison.
+  * <p>
+  * "< a < c & a < b " - This sequence is identical in function to the 
+  * "< a < b < c" rule string above.  The '&' reset symbol indicates that
+  * the rule "< b" is to be inserted after the text argument "a" in the
+  * previous rule string segment.
+  * <p>
+  * "< a < b & y < z" - This is an error.  The character 'y' does not appear
+  * anywhere in the previous rule string segment so the rule following the
+  * reset rule cannot be inserted.
+  * <p>
+  * For a description of the various comparison strength types, see the
+  * documentation for the <code>Collator</code> class.
+  * <p>
+  * As an additional complication to this already overly complex rule scheme,
+  * if any characters precede the first rule, these characters are considered
+  * ignorable.  They will be treated as if they did not exist during 
+  * comparisons.  For example, "- < a < b ..." would make '-' an ignorable
+  * character such that the strings "high-tech" and "hightech" would
+  * be considered identical.
+  * <p>
+  * A <code>ParseException</code> will be thrown for any of the following
+  * conditions:
+  * <ul>
+  * <li>Unquoted punctuation characters in a text argument.
+  * <li>A relational or reset operator not followed by a text argument
+  * <li>A reset operator where the text argument is not present in
+  * the previous rule string section.
+  * </ul>
+  *
+  * @version 0.0
+  *
+  * @author Aaron M. Renn (arenn@urbanophile.com)
+  */
+public class RuleBasedCollator extends Collator
+{
+
+/*
+ * Inner Classes
+ */
+
+/**
+  * This class holds a single entry of the collation table: a character
+  * sequence together with the primary, secondary and tertiary ordering
+  * values assigned to that sequence.
+  */
+class CollationElement
+{
+
+// The character sequence this entry applies to.
+String char_seq;
+
+// The primary ordering value of the sequence.
+int primary;
+
+// The secondary ordering value of the sequence.
+short secondary;
+
+// The tertiary ordering value of the sequence.
+short tertiary;
+
+/**
+  * This method initializes a new entry from the specified values.
+  *
+  * @param seq The character sequence for this entry.
+  * @param primaryOrder The primary ordering value.
+  * @param secondaryOrder The secondary ordering value.
+  * @param tertiaryOrder The tertiary ordering value.
+  */
+CollationElement(String seq, int primaryOrder, short secondaryOrder,
+                 short tertiaryOrder)
+{
+  char_seq = seq;
+  primary = primaryOrder;
+  secondary = secondaryOrder;
+  tertiary = tertiaryOrder;
+}
+
+} // inner class CollationElement
+
+/*************************************************************************/
+
+/*
+ * Instance Variables
+ */
+
+/**
+  * This is the original rule string, stored exactly as it was passed to
+  * the constructor.
+  */
+private String rules;
+
+/**
+  * This is the table of collation element values.  It is filled in by the
+  * constructor and each entry is a <code>CollationElement</code>.
+  */
+private Object[] ce_table;
+
+/*************************************************************************/
+
+/*
+ * Constructors
+ */
+
+/**
+  * This method initializes a new instance of <code>RuleBasedCollator</code>
+  * with the specified collation rules.  Note that an application normally
+  * obtains an instance of <code>RuleBasedCollator</code> by calling the
+  * <code>getInstance</code> method of <code>Collator</code>.  That method
+  * automatically loads the proper set of rules for the desired locale.
+  * <p>
+  * The rule string is scanned once from left to right.  Whitespace is
+  * skipped.  Every character seen before the first '&lt;' becomes an
+  * ignorable entry with all ordering values set to zero.  After that, each
+  * relation operator updates the running primary, secondary and tertiary
+  * sequence numbers, and the text that follows the operator is stored with
+  * those numbers.  The modifier ('@') and reset ('&amp;') operators are not
+  * implemented yet and are rejected, and quoted text is not interpreted.
+  *
+  * @param rules The collation rule string.
+  *
+  * @exception ParseException If the rule string contains syntax errors:
+  * a ';', ',' or '=' relation before the first '&lt;', a relation that is
+  * not followed by a text argument, or an unsupported '@' or '&amp;'.
+  * @exception IllegalArgumentException If the rule string is empty.
+  */
+public
+RuleBasedCollator(String rules) throws ParseException
+{
+  this.rules = rules;
+
+  if (rules.equals(""))
+    throw new IllegalArgumentException("Empty rule set");
+
+  Vector v = new Vector();
+  boolean ignore_chars = true;
+  // True once a relation operator has been read whose text argument has
+  // not been stored in the table yet.
+  boolean pending = false;
+  int primary_seq = 0;
+  short secondary_seq = 0;
+  short tertiary_seq = 0;
+  StringBuffer sb = new StringBuffer("");
+  for (int i = 0; i < rules.length(); i++)
+    {
+      char c = rules.charAt(i);
+
+      // Check if it is a whitespace character
+      if (((c >= 0x09) && (c <= 0x0D)) || (c == 0x20))
+        continue;
+
+      // Sort accents backwards
+      if (c == '@')
+        throw new ParseException("French style accents not implemented yet", 0);
+
+      // Reset command
+      if (c == '&')
+        throw new ParseException("Reset not implemented yet", 0);
+
+      // Relation operators
+      if ((c == '<') || (c == ';') || (c == ',') || (c == '='))
+        {
+          // Store the text argument of the previous relation before the
+          // sequence numbers change.  An empty argument is a syntax error.
+          if (pending)
+            {
+              if (sb.length() == 0)
+                throw new ParseException(rules, i);
+
+              v.add(new CollationElement(sb.toString(), primary_seq,
+                                         secondary_seq, tertiary_seq));
+              sb.setLength(0);
+            }
+
+          // Only a primary relation may start the ordered section
+          if ((c != '<') && (primary_seq == 0))
+            throw new ParseException(rules, i);
+
+          if (c == '<')
+            {
+              // Primary difference
+              ++primary_seq;
+              secondary_seq = 0;
+              tertiary_seq = 0;
+            }
+          else if (c == ';')
+            {
+              // Secondary difference
+              ++secondary_seq;
+              tertiary_seq = 0;
+            }
+          else if (c == ',')
+            {
+              // Tertiary difference
+              ++tertiary_seq;
+            }
+          // '=' leaves all sequence numbers unchanged
+
+          ignore_chars = false;
+          pending = true;
+
+          // The operator itself is never part of a text argument
+          continue;
+        }
+
+      if (ignore_chars)
+        {
+          // Still before the first relation, so this character is ignorable
+          v.add(new CollationElement(c + "", 0, (short)0, (short)0));
+        }
+      else
+        {
+          sb.append(c);
+        }
+    }
+
+  // Store the text argument of the final relation
+  if (pending)
+    {
+      if (sb.length() == 0)
+        throw new ParseException(rules, rules.length());
+
+      v.add(new CollationElement(sb.toString(), primary_seq,
+                                 secondary_seq, tertiary_seq));
+    }
+
+  ce_table = v.toArray();
+}
+
+/*************************************************************************/
+
+/*
+ * Instance Methods
+ */
+
+/**
+  * This method returns the collation rule string that this object was
+  * created with.
+  *
+  * @return The collation rules for this object.
+  */
+public String
+getRules()
+{
+  String rule_string = this.rules;
+
+  return(rule_string);
+}
+
+/*************************************************************************/
+
+/**
+  * This method calculates the collation element value for the specified
+  * character(s).  The collation table is searched from the start for the
+  * first entry whose character sequence equals the argument.  The result
+  * is the sum of the primary value shifted left by 16 bits, the secondary
+  * value shifted left by 8 bits, and the tertiary value.  If no entry
+  * matches, 0xFFFF, 0xFF and 0xFF are used as the three values.
+  * <p>
+  * NOTE(review): the value for an unknown sequence therefore works out to
+  * 0xFFFFFFFF (-1).  Confirm this does not collide with
+  * <code>CollationElementIterator.NULLORDER</code>.
+  *
+  * @param str The character sequence to look up.
+  *
+  * @return The combined collation element value.
+  */
+int
+getCollationElementValue(String str)
+{
+  // Values used when the sequence is not present in the table
+  int prim = 0xFFFF;
+  short sec = (short)0xFF;
+  short tert = (short)0xFF;
+
+  // The table is sorted.  Change to a binary search later.
+  int idx = 0;
+  while (idx < ce_table.length)
+    {
+      CollationElement candidate = (CollationElement)ce_table[idx];
+      if (candidate.char_seq.equals(str))
+        {
+          prim = candidate.primary;
+          sec = candidate.secondary;
+          tert = candidate.tertiary;
+          break;
+        }
+      ++idx;
+    }
+
+  return((prim << 16) + (sec << 8) + tert);
+}
+
+/*************************************************************************/
+
+/**
+  * This method creates a <code>CollationElementIterator</code> that walks
+  * the specified <code>String</code> using the collation rules of this
+  * object.
+  *
+  * @param str The <code>String</code> to return the <code>CollationElementIterator</code> instance for.
+  *
+  * @return A <code>CollationElementIterator</code> for the specified <code>String</code>.
+  */
+public CollationElementIterator
+getCollationElementIterator(String str)
+{
+  CollationElementIterator iter = new CollationElementIterator(this, str);
+
+  return(iter);
+}
+
+/*************************************************************************/
+
+/**
+  * This method creates a <code>CollationElementIterator</code> for the
+  * text supplied by the specified <code>CharacterIterator</code>.  The
+  * characters are read from the first position of the iterator until it
+  * reports <code>CharacterIterator.DONE</code>, collected into a
+  * <code>String</code>, and passed to the <code>String</code> variant of
+  * this method.
+  *
+  * @param ci The <code>CharacterIterator</code> with the desired <code>String</code>.
+  *
+  * @return A <code>CollationElementIterator</code> for the specified <code>String</code>.
+  */
+public CollationElementIterator
+getCollationElementIterator(CharacterIterator ci)
+{
+  StringBuffer text = new StringBuffer("");
+
+  // Right now we assume that we will read from the beginning of the string.
+  for (char ch = ci.first(); ch != CharacterIterator.DONE; ch = ci.next())
+    text.append(ch);
+
+  String str = text.toString();
+
+  return(getCollationElementIterator(str));
+}
+
+/*************************************************************************/
+
+/**
+  * This method returns an integer which indicates whether the first
+  * specified <code>String</code> is less than, greater than, or equal to
+  * the second.  The value depends not only on the collation rules in
+  * effect, but also the strength setting of this object.
+  * <p>
+  * Both strings are walked in step, one collation element at a time.  The
+  * first pair of elements that differs at a level permitted by the current
+  * strength decides the result.  If one string runs out of elements first,
+  * it is considered the smaller one.
+  *
+  * @param s1 The first <code>String</code> to compare.
+  * @param s2 A second <code>String</code> to compare to the first.
+  *
+  * @return -1 if s1 &lt; s2, 1 if s1 &gt; s2, or 0 if s1 == s2.
+  */
+public int
+compare(String s1, String s2)
+{
+  CollationElementIterator cei1 = getCollationElementIterator(s1);
+  CollationElementIterator cei2 = getCollationElementIterator(s2);
+
+  for(;;)
+    {
+      int ord1 = cei1.next(); 
+      int ord2 = cei2.next(); 
+
+      // Check for end of string
+      if (ord1 == CollationElementIterator.NULLORDER)
+        {
+          if (ord2 == CollationElementIterator.NULLORDER)
+            return(0);
+          else
+            return(-1);
+        }
+      else if (ord2 == CollationElementIterator.NULLORDER)
+        return(1);
+
+      // We know chars are totally equal, so skip
+      if (ord1 == ord2)
+        continue;
+
+      // Check for primary strength differences
+      int prim1 = cei1.primaryOrder(ord1); 
+      int prim2 = cei2.primaryOrder(ord2); 
+
+      if (prim1 < prim2)
+        return(-1);
+      else if (prim1 > prim2)
+        return(1);
+      else if (getStrength() == PRIMARY)
+        continue;
+
+      // Check for secondary strength differences
+      int sec1 = cei1.secondaryOrder(ord1);
+      int sec2 = cei2.secondaryOrder(ord2);
+
+      if (sec1 < sec2)
+        return(-1);
+      else if (sec1 > sec2)
+        return(1);
+      else if (getStrength() == SECONDARY)
+        continue;
+
+      // Check for tertiary differences.  The second value must come from
+      // ord2; taking both from ord1 would make them always equal.
+      int tert1 = cei1.tertiaryOrder(ord1);
+      int tert2 = cei2.tertiaryOrder(ord2);
+
+      if (tert1 < tert2)
+        return(-1);
+      else if (tert1 > tert2)
+        return(1);
+    }
+}
+
+/*************************************************************************/
+
+/**
+  * This method returns an instance of <code>CollationKey</code> for the
+  * specified <code>String</code>.  The object returned will have a
+  * more efficient mechanism for its comparison function that could
+  * provide speed benefits if multiple comparisons are performed, such
+  * as during a sort.
+  * <p>
+  * The key holds four bytes per collation element of the string, most
+  * significant byte first.  At <code>PRIMARY</code> strength only the
+  * primary order of each element is stored, at <code>SECONDARY</code>
+  * strength only the secondary order, and otherwise the complete element
+  * value.
+  * <p>
+  * NOTE(review): storing only the secondary order at
+  * <code>SECONDARY</code> strength drops the primary order from the key.
+  * Confirm whether that is intended.
+  *
+  * @param str The <code>String</code> to create a <code>CollationKey</code> for.
+  *
+  * @return A <code>CollationKey</code> for the specified <code>String</code>.
+  */
+public CollationKey
+getCollationKey(String str)
+{
+  CollationElementIterator cei = getCollationElementIterator(str);
+  Vector vect = new Vector(25);
+
+  // Advance the iterator on every pass.  The raw element is kept in its
+  // own variable so the reduced value never reaches the end-of-string test.
+  for (int ord = cei.next();
+       ord != CollationElementIterator.NULLORDER;
+       ord = cei.next())
+    {
+      int value = ord;
+
+      switch (getStrength())
+        {
+          case PRIMARY:
+             value = cei.primaryOrder(ord);
+             break;
+
+          case SECONDARY:
+             value = cei.secondaryOrder(ord);
+             break;
+
+          default:
+             break;
+        }
+
+      vect.add(new Integer(value)); 
+    }
+
+  Object[] objarr = vect.toArray();
+  byte[] key = new byte[objarr.length * 4];
+
+  // The element index and the byte position advance independently:
+  // one element produces four bytes.
+  int pos = 0;
+  for (int i = 0; i < objarr.length; i++)
+    {
+      int j = ((Integer)objarr[i]).intValue();
+      key[pos++] = (byte)((j & 0xFF000000) >> 24);
+      key[pos++] = (byte)((j & 0x00FF0000) >> 16);
+      key[pos++] = (byte)((j & 0x0000FF00) >> 8);
+      key[pos++] = (byte)(j & 0x000000FF);
+    }
+
+  return(new CollationKey(this, str, key));
+}
+
+/*************************************************************************/
+
+/**
+  * This method tests this object for equality against the specified 
+  * object.  The two are equal only when the argument is a reference to
+  * this very object.
+  *
+  * @param obj The <code>Object</code> to compare against this object.
+  *
+  * @return <code>true</code> if the specified object is equal to this object, <code>false</code> otherwise.
+  */
+public boolean
+equals(Object obj)
+{
+  boolean same_reference = (obj == this);
+
+  return(same_reference);
+}
+
+/*************************************************************************/
+
+/**
+  * This method returns a hash value for this object.  The value is the
+  * identity hash code of the object, which matches the reference
+  * comparison performed by <code>equals</code>.
+  *
+  * @return A hash value for this object.
+  */
+public int
+hashCode()
+{
+  int identity = System.identityHashCode(this);
+
+  return(identity);
+}
+
+/*************************************************************************/
+
+/**
+  * This method creates a copy of this object by delegating to the
+  * superclass implementation of <code>clone</code>.
+  *
+  * @return A copy of this object, or <code>null</code> if the superclass
+  * reports that cloning is not supported.
+  */
+public Object
+clone()
+{
+  Object copy;
+
+  try 
+    {
+       copy = super.clone();
+    }
+  catch(CloneNotSupportedException e)
+    {
+       copy = null;
+    }
+
+  return(copy);
+}
+
+} // class RuleBasedCollator
+
author	Aaron M. Renn <arenn@urbanophile.com>	1998-10-21 01:58:39 +0000
committer	Aaron M. Renn <arenn@urbanophile.com>	1998-10-21 01:58:39 +0000
commit	9ba75f0f14aea2be89299da929687837f680612a (patch)
tree	bd4253eb58fccb0d7d1132e7481a89eb5f2cecaa /java/text/RuleBasedCollator.java
parent	97c4e0b3cef28e94eca3f63952f8221f449f86f9 (diff)
download	classpath-9ba75f0f14aea2be89299da929687837f680612a.tar.gz