summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBryan Duxbury <bryanduxbury@apache.org>2010-04-25 15:20:02 +0000
committerBryan Duxbury <bryanduxbury@apache.org>2010-04-25 15:20:02 +0000
commit719ab84318ae1c7c59da5657ef0ad41dc4c3f921 (patch)
tree69d5e848ea56867b9c82e837ad3e9c109b233dc4
parent0137af6bf1e37762db319a08d2d6921d6897e21f (diff)
downloadthrift-719ab84318ae1c7c59da5657ef0ad41dc4c3f921.tar.gz
THRIFT-765. java: Improved string encoding and decoding performance
This change makes Java's string/utf8 encoding and decoding about 2x faster. git-svn-id: https://svn.apache.org/repos/asf/incubator/thrift/trunk@937812 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r--lib/java/src/org/apache/thrift/Utf8Helper.java86
-rw-r--r--lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java35
-rwxr-xr-xlib/java/src/org/apache/thrift/protocol/TCompactProtocol.java25
-rw-r--r--lib/java/test/org/apache/thrift/TestUtf8Helper.java58
4 files changed, 163 insertions, 41 deletions
diff --git a/lib/java/src/org/apache/thrift/Utf8Helper.java b/lib/java/src/org/apache/thrift/Utf8Helper.java
new file mode 100644
index 000000000..e754517de
--- /dev/null
+++ b/lib/java/src/org/apache/thrift/Utf8Helper.java
@@ -0,0 +1,86 @@
+package org.apache.thrift;
+
+/**
+ * Allocation-light UTF-8 encoder/decoder for Java strings.
+ *
+ * <p>Encoding produces standard UTF-8: a surrogate pair becomes a single
+ * 4-byte sequence, and an unpaired surrogate (which has no UTF-8 form) is
+ * written as a single '?' byte, the same substitution
+ * {@code String.getBytes("UTF-8")} performs.
+ *
+ * <p>Decoding never reads outside the requested window and never throws on
+ * malformed input; every malformed, truncated, overlong or out-of-range
+ * sequence is replaced by one U+FFFD REPLACEMENT CHARACTER.
+ */
+public final class Utf8Helper {
+  /** Substituted for each malformed byte sequence while decoding. */
+  private static final char REPLACEMENT_CHAR = (char) 0xFFFD;
+
+  /** Written in place of an unpaired surrogate while encoding. */
+  private static final byte UNMAPPABLE_BYTE = (byte) '?';
+
+  private Utf8Helper() {}
+
+  /**
+   * Computes how many bytes {@link #encode(String)} produces for {@code s}.
+   *
+   * @param s the string to measure; must not be null
+   * @return the length in bytes of the UTF-8 encoding of {@code s}
+   */
+  public static final int getByteLength(final String s) {
+    final int length = s.length();
+    int byteLength = 0;
+    for (int i = 0; i < length; i++) {
+      final char c = s.charAt(i);
+      if (c <= 0x007F) {
+        byteLength++;
+      } else if (c <= 0x07FF) {
+        byteLength += 2;
+      } else if (isSurrogatePairAt(s, i)) {
+        // Both halves of the pair collapse into one 4-byte sequence.
+        byteLength += 4;
+        i++;
+      } else if (isSurrogate(c)) {
+        // Unpaired surrogate: encoded as a single '?'.
+        byteLength++;
+      } else {
+        byteLength += 3;
+      }
+    }
+    return byteLength;
+  }
+
+  /**
+   * Encodes {@code s} into a freshly allocated, exactly sized byte array.
+   *
+   * @param s the string to encode; must not be null
+   * @return the UTF-8 bytes of {@code s}
+   */
+  public static byte[] encode(String s) {
+    byte[] buf = new byte[getByteLength(s)];
+    encode(s, buf, 0);
+    return buf;
+  }
+
+  /**
+   * Encodes {@code s} as UTF-8 into {@code buf}, starting at {@code offset}.
+   *
+   * <p>The caller must provide at least {@link #getByteLength(String)} bytes
+   * of room after {@code offset}; otherwise an
+   * {@code ArrayIndexOutOfBoundsException} is thrown after a partial write.
+   *
+   * @param s the string to encode; must not be null
+   * @param buf the destination buffer
+   * @param offset index in {@code buf} of the first byte to write
+   */
+  public static void encode(String s, byte[] buf, int offset) {
+    final int length = s.length();
+    int pos = offset;
+    for (int i = 0; i < length; i++) {
+      final char c = s.charAt(i);
+      if (c <= 0x007F) {
+        buf[pos++] = (byte) c;
+      } else if (c <= 0x07FF) {
+        buf[pos++] = (byte) (0xC0 | (c >> 6));
+        buf[pos++] = (byte) (0x80 | (c & 0x3F));
+      } else if (isSurrogatePairAt(s, i)) {
+        // Combine the pair into its supplementary code point and consume
+        // the low surrogate as well.
+        final int cp = Character.toCodePoint(c, s.charAt(++i));
+        buf[pos++] = (byte) (0xF0 | (cp >> 18));
+        buf[pos++] = (byte) (0x80 | ((cp >> 12) & 0x3F));
+        buf[pos++] = (byte) (0x80 | ((cp >> 6) & 0x3F));
+        buf[pos++] = (byte) (0x80 | (cp & 0x3F));
+      } else if (isSurrogate(c)) {
+        buf[pos++] = UNMAPPABLE_BYTE;
+      } else {
+        buf[pos++] = (byte) (0xE0 | (c >> 12));
+        buf[pos++] = (byte) (0x80 | ((c >> 6) & 0x3F));
+        buf[pos++] = (byte) (0x80 | (c & 0x3F));
+      }
+    }
+  }
+
+  /**
+   * Decodes the whole of {@code buf} as UTF-8.
+   *
+   * @param buf the bytes to decode; must not be null
+   * @return the decoded string
+   */
+  public static String decode(byte[] buf) {
+    return decode(buf, 0, buf.length);
+  }
+
+  /**
+   * Decodes {@code byteLength} bytes of {@code buf}, starting at
+   * {@code offset}, as UTF-8.
+   *
+   * <p>Bytes outside the window are never examined, so a sequence cut off by
+   * the end of the window is treated as malformed rather than completed from
+   * neighbouring data. Each malformed sequence yields one U+FFFD.
+   *
+   * @param buf the buffer holding the encoded text; must not be null
+   * @param offset index of the first byte to decode
+   * @param byteLength number of bytes to decode
+   * @return the decoded string
+   * @throws ArrayIndexOutOfBoundsException if the window does not lie
+   *         entirely inside {@code buf}
+   */
+  public static String decode(byte[] buf, int offset, int byteLength) {
+    if (offset < 0 || byteLength < 0 || offset > buf.length - byteLength) {
+      throw new ArrayIndexOutOfBoundsException(
+          "offset " + offset + ", length " + byteLength + ", buffer size " + buf.length);
+    }
+    final int end = offset + byteLength;
+    // Every char produced consumes at least one byte (a surrogate pair
+    // consumes four), so byteLength chars is always enough room.
+    final char[] chars = new char[byteLength];
+    int charCount = 0;
+    int pos = offset;
+    while (pos < end) {
+      final int lead = buf[pos++] & 0xFF;
+      if (lead < 0x80) {
+        chars[charCount++] = (char) lead;
+        continue;
+      }
+
+      // Classify the lead byte: how many continuation bytes follow, which
+      // payload bits it carries, and the smallest code point that legally
+      // needs this length (anything smaller is an overlong form).
+      int remaining;
+      int codePoint;
+      final int minCodePoint;
+      if ((lead & 0xE0) == 0xC0) {
+        remaining = 1;
+        codePoint = lead & 0x1F;
+        minCodePoint = 0x80;
+      } else if ((lead & 0xF0) == 0xE0) {
+        remaining = 2;
+        codePoint = lead & 0x0F;
+        minCodePoint = 0x800;
+      } else if ((lead & 0xF8) == 0xF0) {
+        remaining = 3;
+        codePoint = lead & 0x07;
+        minCodePoint = Character.MIN_SUPPLEMENTARY_CODE_POINT;
+      } else {
+        // Stray continuation byte (0x80-0xBF) or invalid lead (0xF8-0xFF).
+        chars[charCount++] = REPLACEMENT_CHAR;
+        continue;
+      }
+
+      // Take continuation bytes only while they are inside the window and
+      // carry the 10xxxxxx marker.
+      while (remaining > 0 && pos < end && (buf[pos] & 0xC0) == 0x80) {
+        codePoint = (codePoint << 6) | (buf[pos++] & 0x3F);
+        remaining--;
+      }
+
+      if (remaining > 0
+          || codePoint < minCodePoint
+          || codePoint > Character.MAX_CODE_POINT
+          || (codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE)) {
+        // Truncated, overlong, beyond U+10FFFF, or an encoded surrogate.
+        chars[charCount++] = REPLACEMENT_CHAR;
+      } else if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
+        chars[charCount++] = (char) codePoint;
+      } else {
+        // Supplementary code point: split into a UTF-16 surrogate pair.
+        final int v = codePoint - Character.MIN_SUPPLEMENTARY_CODE_POINT;
+        chars[charCount++] = (char) (0xD800 + (v >> 10));
+        chars[charCount++] = (char) (0xDC00 + (v & 0x3FF));
+      }
+    }
+    return new String(chars, 0, charCount);
+  }
+
+  /** Returns true if {@code c} is a high or low UTF-16 surrogate. */
+  private static boolean isSurrogate(char c) {
+    return c >= Character.MIN_SURROGATE && c <= Character.MAX_SURROGATE;
+  }
+
+  /** Returns true if a well-formed surrogate pair starts at index {@code i}. */
+  private static boolean isSurrogatePairAt(String s, int i) {
+    return i + 1 < s.length()
+        && Character.isHighSurrogate(s.charAt(i))
+        && Character.isLowSurrogate(s.charAt(i + 1));
+  }
+}
diff --git a/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java b/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
index 16c7567cf..3b4453dcd 100644
--- a/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
+++ b/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
@@ -19,9 +19,8 @@
package org.apache.thrift.protocol;
-import java.io.UnsupportedEncodingException;
-
import org.apache.thrift.TException;
+import org.apache.thrift.Utf8Helper;
import org.apache.thrift.transport.TTransport;
/**
@@ -170,13 +169,9 @@ public class TBinaryProtocol extends TProtocol {
}
public void writeString(String str) throws TException {
- try {
- byte[] dat = str.getBytes("UTF-8");
- writeI32(dat.length);
- trans_.write(dat, 0, dat.length);
- } catch (UnsupportedEncodingException uex) {
- throw new TException("JVM DOES NOT SUPPORT UTF-8");
- }
+ byte[] dat = Utf8Helper.encode(str);
+ writeI32(dat.length);
+ trans_.write(dat, 0, dat.length);
}
public void writeBinary(byte[] bin) throws TException {
@@ -323,27 +318,19 @@ public class TBinaryProtocol extends TProtocol {
int size = readI32();
if (trans_.getBytesRemainingInBuffer() >= size) {
- try {
- String s = new String(trans_.getBuffer(), trans_.getBufferPosition(), size, "UTF-8");
- trans_.consumeBuffer(size);
- return s;
- } catch (UnsupportedEncodingException e) {
- throw new TException("JVM DOES NOT SUPPORT UTF-8");
- }
+ String s = Utf8Helper.decode(trans_.getBuffer(), trans_.getBufferPosition(), size);
+ trans_.consumeBuffer(size);
+ return s;
}
return readStringBody(size);
}
public String readStringBody(int size) throws TException {
- try {
- checkReadLength(size);
- byte[] buf = new byte[size];
- trans_.readAll(buf, 0, size);
- return new String(buf, "UTF-8");
- } catch (UnsupportedEncodingException uex) {
- throw new TException("JVM DOES NOT SUPPORT UTF-8");
- }
+ checkReadLength(size);
+ byte[] buf = new byte[size];
+ trans_.readAll(buf, 0, size);
+ return Utf8Helper.decode(buf);
}
public byte[] readBinary() throws TException {
diff --git a/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java b/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
index f4979423e..f50ef1b06 100755
--- a/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
+++ b/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
@@ -20,10 +20,9 @@
package org.apache.thrift.protocol;
-import java.io.UnsupportedEncodingException;
-
import org.apache.thrift.ShortStack;
import org.apache.thrift.TException;
+import org.apache.thrift.Utf8Helper;
import org.apache.thrift.transport.TTransport;
/**
@@ -293,11 +292,7 @@ public final class TCompactProtocol extends TProtocol {
* Write a string to the wire with a varint size preceeding.
*/
public void writeString(String str) throws TException {
- try {
- writeBinary(str.getBytes("UTF-8"));
- } catch (UnsupportedEncodingException e) {
- throw new TException("UTF-8 not supported!");
- }
+ writeBinary(Utf8Helper.encode(str));
}
/**
@@ -610,16 +605,12 @@ public final class TCompactProtocol extends TProtocol {
return "";
}
- try {
- if (trans_.getBytesRemainingInBuffer() >= length) {
- String str = new String(trans_.getBuffer(), trans_.getBufferPosition(), length, "UTF-8");
- trans_.consumeBuffer(length);
- return str;
- } else {
- return new String(readBinary(length), "UTF-8");
- }
- } catch (UnsupportedEncodingException e) {
- throw new TException("UTF-8 not supported!");
+ if (trans_.getBytesRemainingInBuffer() >= length) {
+ String str = Utf8Helper.decode(trans_.getBuffer(), trans_.getBufferPosition(), length);
+ trans_.consumeBuffer(length);
+ return str;
+ } else {
+ return Utf8Helper.decode(readBinary(length));
}
}
diff --git a/lib/java/test/org/apache/thrift/TestUtf8Helper.java b/lib/java/test/org/apache/thrift/TestUtf8Helper.java
new file mode 100644
index 000000000..9d04d5af5
--- /dev/null
+++ b/lib/java/test/org/apache/thrift/TestUtf8Helper.java
@@ -0,0 +1,58 @@
+package org.apache.thrift;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+/**
+ * Compares {@link Utf8Helper} against the JDK's own UTF-8 codec for an
+ * ASCII-only string, a short string with CJK characters, and a longer
+ * mixed-script string.
+ */
+public class TestUtf8Helper extends TestCase {
+  /** Pure 7-bit text: every char encodes to exactly one byte. */
+  private static final String ASCII_TEXT = "here's some text";
+
+  /**
+   * Raw fixture bytes for the mixed-script string. Note the lone 0x83 after
+   * the 0xce 0xba pair: it is a continuation byte with no lead byte, so this
+   * array is not itself well-formed UTF-8. The tests therefore never compare
+   * against it directly; they use the string the JDK decodes from it and
+   * that string's JDK re-encoding.
+   */
+  private static final byte[] MIXED_SCRIPT_RAW = {
+    (byte)0xd3, (byte)0x80, (byte)0xe2, (byte)0x85, (byte)0xae, (byte)0xce,
+    (byte)0x9d, (byte)0x20, (byte)0xd0, (byte)0x9d, (byte)0xce, (byte)0xbf,
+    (byte)0xe2, (byte)0x85, (byte)0xbf, (byte)0xd0, (byte)0xbe, (byte)0xc9,
+    (byte)0xa1, (byte)0xd0, (byte)0xb3, (byte)0xd0, (byte)0xb0, (byte)0xcf,
+    (byte)0x81, (byte)0xe2, (byte)0x84, (byte)0x8e, (byte)0x20, (byte)0xce,
+    (byte)0x91, (byte)0x74, (byte)0x74, (byte)0xce, (byte)0xb1, (byte)0xe2,
+    (byte)0x85, (byte)0xbd, (byte)0xce, (byte)0xba, (byte)0x83, (byte)0xe2,
+    (byte)0x80, (byte)0xbc
+  };
+
+  /** ASCII prefix followed by two three-byte characters. */
+  private static final String CJK_TEXT = "abc\u5639\u563b";
+  private static final byte[] CJK_TEXT_BYTES = jdkEncode(CJK_TEXT);
+
+  /** The fixture as the JDK decodes it, and the JDK's encoding of that. */
+  private static final String MIXED_SCRIPT_TEXT = jdkDecode(MIXED_SCRIPT_RAW);
+  private static final byte[] MIXED_SCRIPT_TEXT_BYTES = jdkEncode(MIXED_SCRIPT_TEXT);
+
+  /** Reference encoder: the JDK's UTF-8 charset. */
+  private static byte[] jdkEncode(String text) {
+    try {
+      return text.getBytes("UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** Reference decoder: the JDK's UTF-8 charset. */
+  private static String jdkDecode(byte[] utf8) {
+    try {
+      return new String(utf8, "UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** Fails unless both arrays have the same length and contents. */
+  private static void assertSameBytes(byte[] expected, byte[] actual) {
+    assertTrue(Arrays.equals(expected, actual));
+  }
+
+  /** Utf8Helper.encode must produce the same bytes as the JDK. */
+  public void testEncode() throws Exception {
+    assertSameBytes(ASCII_TEXT.getBytes("UTF-8"), Utf8Helper.encode(ASCII_TEXT));
+    assertSameBytes(CJK_TEXT_BYTES, Utf8Helper.encode(CJK_TEXT));
+    assertSameBytes(MIXED_SCRIPT_TEXT_BYTES, Utf8Helper.encode(MIXED_SCRIPT_TEXT));
+  }
+
+  /** Utf8Helper.decode must recover the strings the JDK encoded. */
+  public void testDecode() throws Exception {
+    final byte[] asciiBytes = ASCII_TEXT.getBytes("UTF-8");
+    assertEquals(ASCII_TEXT, Utf8Helper.decode(asciiBytes));
+
+    assertEquals(CJK_TEXT, Utf8Helper.decode(CJK_TEXT_BYTES));
+    assertEquals(MIXED_SCRIPT_TEXT, Utf8Helper.decode(MIXED_SCRIPT_TEXT_BYTES));
+  }
+}