summaryrefslogtreecommitdiff
path: root/gnu/xml/stream/UnicodeReader.java
diff options
context:
space:
mode:
author Chris Burdess <dog@bluezoo.org> 2005-12-29 09:14:19 +0000
committer Chris Burdess <dog@bluezoo.org> 2005-12-29 09:14:19 +0000
commit b91aa9e58f8db725da1b33461ffd38af23bedf33 (patch)
tree 0ab9de666a816a359618415b5a7b68d6ef9e8773 /gnu/xml/stream/UnicodeReader.java
parent d00f77c58e298185c68d13f6d8f7ce0fb94433ae (diff)
download classpath-b91aa9e58f8db725da1b33461ffd38af23bedf33.tar.gz
2005-12-28 Chris Burdess <dog@gnu.org>
* gnu/xml/stream/CRLFReader.java: Fixed bug where pos > 0.
* gnu/xml/stream/XMLParser.java, gnu/xml/stream/UnicodeReader.java: Use Unicode code points instead of UTF-16 chars, resolving Unicode surrogates.
* resource/META-INF/services/org.xml.sax.driver: Updated legacy SAX factory mechanism for new SAX driver.
Diffstat (limited to 'gnu/xml/stream/UnicodeReader.java')
-rw-r--r--gnu/xml/stream/UnicodeReader.java197
1 files changed, 197 insertions, 0 deletions
diff --git a/gnu/xml/stream/UnicodeReader.java b/gnu/xml/stream/UnicodeReader.java
new file mode 100644
index 000000000..e3c179cf7
--- /dev/null
+++ b/gnu/xml/stream/UnicodeReader.java
@@ -0,0 +1,197 @@
+/* UnicodeReader.java --
+ Copyright (C) 2005 Free Software Foundation, Inc.
+
+This file is part of GNU Classpath.
+
+GNU Classpath is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Classpath is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Classpath; see the file COPYING. If not, write to the
+Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301 USA.
+
+Linking this library statically or dynamically with other modules is
+making a combined work based on this library. Thus, the terms and
+conditions of the GNU General Public License cover the whole
+combination.
+
+As a special exception, the copyright holders of this library give you
+permission to link this library with independent modules to produce an
+executable, regardless of the license terms of these independent
+modules, and to copy and distribute the resulting executable under
+terms of your choice, provided that you also meet, for each linked
+independent module, the terms and conditions of the license of that
+module. An independent module is a module which is not derived from
+or based on this library. If you modify this library, you may extend
+this exception to your version of the library, but you are not
+obligated to do so. If you do not wish to do so, delete this
+exception statement from your version. */
+
+package gnu.xml.stream;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * A reader that converts UTF-16 characters to Unicode code points.
+ * <p>
+ * Code units are pulled from the wrapped {@link Reader}. A high surrogate
+ * (0xd800-0xdbff) immediately followed by a low surrogate (0xdc00-0xdfff)
+ * is combined into a single code point; every other unit, including an
+ * unpaired surrogate, is passed through unchanged.
+ * <p>
+ * Pairing a high surrogate may require reading one unit ahead. When that
+ * unit is not a low surrogate it is stored in <code>carry</code> (with
+ * <code>isCarry</code> set) and is consumed by the next read call. The
+ * carried value may be -1, which records that the underlying reader
+ * returned -1 during the look-ahead.
+ *
+ * @author <a href='mailto:dog@gnu.org'>Chris Burdess</a>
+ */
+class UnicodeReader
+{
+
+  /** The underlying source of UTF-16 code units. */
+  final Reader in;
+
+  /** Pending look-ahead unit, and the copy saved by {@link #mark(int)}. */
+  int carry, markCarry;
+
+  /** Whether <code>carry</code> / <code>markCarry</code> hold a pending unit. */
+  boolean isCarry, isMarkCarry;
+
+  UnicodeReader(Reader in)
+  {
+    this.in = in;
+  }
+
+  /**
+   * Marks the underlying reader and remembers the pending look-ahead
+   * state so that {@link #reset()} can restore both together.
+   *
+   * @param limit passed straight through to <code>Reader.mark</code>
+   * @throws IOException if the underlying reader throws it
+   */
+  public void mark(int limit)
+    throws IOException
+  {
+    in.mark(limit);
+    markCarry = carry;
+    isMarkCarry = isCarry;
+  }
+
+  /**
+   * Resets the underlying reader and restores the look-ahead state that
+   * was saved by the last call to {@link #mark(int)}.
+   *
+   * @throws IOException if the underlying reader throws it
+   */
+  public void reset()
+    throws IOException
+  {
+    in.reset();
+    carry = markCarry;
+    isCarry = isMarkCarry;
+  }
+
+  /**
+   * Reads a single code point.
+   *
+   * @return the next code point, an unpaired surrogate unit as-is, or -1
+   * if the underlying reader returned -1
+   * @throws IOException if the underlying reader throws it
+   */
+  public int read()
+    throws IOException
+  {
+    int ret;
+    if (isCarry)
+      {
+        // Take the pending unit, but still run it through the pairing
+        // logic below: it may itself be a high surrogate.
+        isCarry = false;
+        ret = carry;
+      }
+    else
+      ret = in.read();
+    if (ret == -1)
+      return ret;
+    if (isHighSurrogate(ret))
+      {
+        int low = in.read();
+        if (isLowSurrogate(low))
+          ret = Character.toCodePoint((char) ret, (char) low);
+        else
+          {
+            // Not a pair: keep the look-ahead unit (possibly -1) for the
+            // next call and return the high surrogate on its own.
+            carry = low;
+            isCarry = true;
+          }
+      }
+    return ret;
+  }
+
+  /**
+   * Reads code points into part of an array.
+   * <p>
+   * At most <code>len</code> UTF-16 units are requested from the
+   * underlying reader in one bulk read, plus at most one extra unit when
+   * the chunk ends on a high surrogate. If a look-ahead unit is pending
+   * from an earlier call, only that one code point is delivered.
+   *
+   * @param buf the destination array
+   * @param off the index in <code>buf</code> of the first code point stored
+   * @param len the maximum number of UTF-16 units to request
+   * @return the number of code points stored starting at <code>off</code>,
+   * 0 if <code>len</code> is 0, or the non-positive value returned by the
+   * underlying reader (-1 at end of stream)
+   * @throws IOException if the underlying reader throws it
+   */
+  public int read(int[] buf, int off, int len)
+    throws IOException
+  {
+    if (len == 0)
+      return 0;
+    if (isCarry)
+      {
+        // read() consumes the pending unit, pairs it if it is a high
+        // surrogate, and yields -1 if the pending unit was end of stream.
+        int c = read();
+        if (c == -1)
+          return -1;
+        buf[off] = c;
+        return 1;
+      }
+    char[] b2 = new char[len];
+    int ret = in.read(b2, 0, len);
+    if (ret <= 0)
+      return ret;
+    int count = 0;
+    for (int i = 0; i < ret; i++)
+      {
+        int c = b2[i];
+        if (isHighSurrogate(c))
+          {
+            if (i + 1 < ret)
+              {
+                // The partner, if any, is already in the chunk.
+                if (isLowSurrogate(b2[i + 1]))
+                  {
+                    c = Character.toCodePoint((char) c, b2[i + 1]);
+                    i++;
+                  }
+              }
+            else
+              {
+                // High surrogate is the last unit of the chunk: look one
+                // unit ahead in the underlying reader.
+                int low = in.read();
+                if (isLowSurrogate(low))
+                  c = Character.toCodePoint((char) c, (char) low);
+                else
+                  {
+                    carry = low;
+                    isCarry = true;
+                  }
+              }
+          }
+        buf[off + count++] = c;
+      }
+    return count;
+  }
+
+  /**
+   * Closes the underlying reader.
+   *
+   * @throws IOException if the underlying reader throws it
+   */
+  public void close()
+    throws IOException
+  {
+    in.close();
+  }
+
+  /**
+   * Converts a string to an array of code points, combining each
+   * high/low surrogate pair into one element and copying every other
+   * UTF-16 unit (including unpaired surrogates) unchanged.
+   *
+   * @param text the string to convert
+   * @return a new array whose length is the number of code points
+   */
+  public static int[] toCodePointArray(String text)
+  {
+    char[] b2 = text.toCharArray();
+    int[] buf = new int[b2.length];
+    int j = 0;
+    for (int i = 0; i < b2.length; i++)
+      {
+        int c = b2[i];
+        if (isHighSurrogate(c)
+            && i + 1 < b2.length
+            && isLowSurrogate(b2[i + 1]))
+          {
+            c = Character.toCodePoint((char) c, b2[i + 1]);
+            i++;
+          }
+        buf[j++] = c;
+      }
+    if (j < buf.length)
+      {
+        // Surrogate pairs were merged, so trim the unused tail.
+        int[] buf2 = new int[j];
+        System.arraycopy(buf, 0, buf2, 0, j);
+        buf = buf2;
+      }
+    return buf;
+  }
+
+  /** Tests whether c lies in the high surrogate range 0xd800-0xdbff. */
+  private static boolean isHighSurrogate(int c)
+  {
+    return c >= 0xd800 && c < 0xdc00;
+  }
+
+  /** Tests whether c lies in the low surrogate range 0xdc00-0xdfff. */
+  private static boolean isLowSurrogate(int c)
+  {
+    return c >= 0xdc00 && c < 0xe000;
+  }
+
+}