summaryrefslogtreecommitdiff
path: root/liboil/utf8
diff options
context:
space:
mode:
authorDavid Schleef <ds@schleef.org>2005-12-18 08:31:23 +0000
committerDavid Schleef <ds@schleef.org>2005-12-18 08:31:23 +0000
commit1fea44b4199bedcd028d43a6607a2f168fcab3ab (patch)
tree7be87e58144576a3630fb10d5685943b0b50382e /liboil/utf8
parenteb82d8fb7d4d2a14b03fb33eb2edbae112cae235 (diff)
downloadliboil-1fea44b4199bedcd028d43a6607a2f168fcab3ab.tar.gz
* liboil/utf8/utf8.c: (utf8_validate_test), (utf8_validate_ref):
* liboil/utf8/utf8_fast.c: (utf8_validate_fast), (utf8_validate_fast2), (utf8_validate_fast3), (utf8_validate_lookup): Some utf8 hacking. At least it works now.
Diffstat (limited to 'liboil/utf8')
-rw-r--r--liboil/utf8/utf8.c54
-rw-r--r--liboil/utf8/utf8_fast.c110
2 files changed, 157 insertions, 7 deletions
diff --git a/liboil/utf8/utf8.c b/liboil/utf8/utf8.c
index 8bbbdaf..e67cb65 100644
--- a/liboil/utf8/utf8.c
+++ b/liboil/utf8/utf8.c
@@ -35,15 +35,64 @@
#include "utf8.h"
+/*
+ * How the first byte of a UTF-8 sequence is classified:
+ * 0x00-0x7f ASCII, one-byte character
+ * 0x80-0xbf continuation byte, not a valid start byte
+ * 0xc0-0xdf start of a 2-byte character
+ * 0xe0-0xef start of a 3-byte character
+ * 0xf0-0xf7 start of a 4-byte character
+ * 0xf8-0xff reserved (illegal at the present time)
+ */
/*
 * Test-data generator: fills the OIL_ARG_SRC1 buffer of the given test
 * with test->n bytes.  Most bytes are ASCII; now and then a multibyte
 * start byte is written and the following positions receive continuation
 * bytes.  Only within the last 16 positions may a deliberately suspicious
 * byte be dropped in, so that a validator's failure index lands near the
 * end of the buffer.
 */
static void
utf8_validate_test (OilTest *test)
{
int i;
int n = test->n;
uint8_t *ptr = oil_test_get_source_data (test, OIL_ARG_SRC1);
+ int x;
+ int extra_chars = 0; /* continuation bytes still owed to the last start byte */
for (i=0;i<n;i++){
- OIL_GET(ptr, i, uint8_t) = oil_rand_u8() & 0x7f;
+ if (i >= n-16) {
+ /* if it's close to the end, we'll randomly drop in a bad
+ * byte from either the 0x80-0xbf or 0xf8-0xff segments */
+ x = oil_rand_u8();
+ if (x < 16) { /* presumably a 1-in-16 chance if oil_rand_u8() is uniform over 0..255 */
+ x = oil_rand_u8();
+ if (extra_chars>0) {
+ /* this might not actually be a bad char */
+ ptr[i] = x;
+ extra_chars--;
+ } else {
+ if (x & 0x80) {
+ ptr[i] = 0x80 | (x&0x3f); /* continuation byte where a start byte is expected */
+ } else {
+ ptr[i] = 0xf8 | (x&0x07); /* byte from the reserved 0xf8-0xff range */
+ }
+ }
+ continue; /* this position is done; skip the normal generation below */
+ }
+ }
+ if (extra_chars > 0) {
+ ptr[i] = 0x80 | (oil_rand_u8() & 0x3f); /* continuation byte, 0x80-0xbf */
+ extra_chars--;
+ } else {
+ /* otherwise, we'll do a low probability of a multibyte char */
+ x = oil_rand_u8() & 0xf; /* 0..15; the values 0, 1 and 2 each select a start byte */
+ if (x == 0) {
+ ptr[i] = 0xc0 | (oil_rand_u8() & 0x1f); /* 2-byte start byte, 0xc0-0xdf */
+ extra_chars = 1;
+ } else if (x == 1) {
+ ptr[i] = 0xe0 | (oil_rand_u8() & 0x0f); /* 3-byte start byte, 0xe0-0xef */
+ extra_chars = 2;
+ } else if (x == 2) {
+ ptr[i] = 0xf0 | (oil_rand_u8() & 0x07); /* 4-byte start byte, 0xf0-0xf7 */
+ extra_chars = 3;
+ } else {
+ ptr[i] = oil_rand_u8() & 0x7f; /* plain ASCII byte */
+ }
+ }
+ }
}
@@ -71,6 +120,7 @@ utf8_validate_ref (int32_t *d_1, uint8_t *s, int n)
int mask;
for(i=0;i<n;i++){
+ extra_bytes = 0;
if (s[i] < 128) continue;
if ((s[i] & 0xe0) == 0xc0) {
extra_bytes = 1;
@@ -86,8 +136,8 @@ utf8_validate_ref (int32_t *d_1, uint8_t *s, int n)
}
if (i + extra_bytes >= n) goto error;
while(extra_bytes--) {
- if ((s[i] & 0xc0) != 0x80) goto error;
i++;
+ if ((s[i] & 0xc0) != 0x80) goto error;
}
}
diff --git a/liboil/utf8/utf8_fast.c b/liboil/utf8/utf8_fast.c
index 73dfd1d..077c63a 100644
--- a/liboil/utf8/utf8_fast.c
+++ b/liboil/utf8/utf8_fast.c
@@ -64,8 +64,8 @@ utf8_validate_fast (int32_t *d_1, uint8_t *s, int n)
}
if (i + extra_bytes >= n) goto error;
while(extra_bytes--) {
- if ((s[i] & 0xc0) != 0x80) goto error;
i++;
+ if ((s[i] & 0xc0) != 0x80) goto error;
}
i++;
}
@@ -84,19 +84,23 @@ utf8_validate_fast2 (int32_t *d_1, uint8_t *s, int n)
i=0;
while (i<n) {
x = s[i];
- if (s[i] < 128) {
+ if (!(x & 0x80)) {
i++;
continue;
}
x <<= 1;
- if (s[i] < 128) {
+ if (!(x & 0x80)) {
+ goto error;
+ }
+ x <<= 1;
+ if (!(x & 0x80)) {
i++;
if ((s[i] & 0xc0) != 0x80) goto error;
i++;
continue;
}
x <<= 1;
- if (s[i] < 128) {
+ if (!(x & 0x80)) {
i++;
if ((s[i] & 0xc0) != 0x80) goto error;
i++;
@@ -105,7 +109,7 @@ utf8_validate_fast2 (int32_t *d_1, uint8_t *s, int n)
continue;
}
x <<= 1;
- if (s[i] < 128) {
+ if (!(x & 0x80)) {
i++;
if ((s[i] & 0xc0) != 0x80) goto error;
i++;
@@ -123,6 +127,102 @@ error:
}
OIL_DEFINE_IMPL (utf8_validate_fast2, utf8_validate);
/*
 * utf8_validate_fast3:
 * @d_1: d_1[0] receives the index at which scanning stopped
 * @s:   bytes to check
 * @n:   number of bytes available in @s
 *
 * Structural UTF-8 check with a four-bytes-at-a-time shortcut for ASCII.
 * A start byte must be followed by the right number of 0x80-0xbf
 * continuation bytes; code point ranges are not checked.
 *
 * On return d_1[0] is n when every byte was accepted.  Otherwise it is
 * the index of the offending byte: a stray continuation byte, a reserved
 * 0xf8-0xff byte, or a byte that should have been a continuation byte.
 * A sequence whose continuation bytes would extend past the end of the
 * buffer is rejected at its start byte, and no byte at or beyond s[n]
 * is ever read.
 */
static void
utf8_validate_fast3 (int32_t *d_1, uint8_t *s, int n)
{
  int i;
  int extra;
  uint8_t x;

  i = 0;
  while (i < n) {
    /* Fast path: skip four ASCII bytes at once.  OR-ing the bytes gives
     * the same answer as masking a 32-bit load with 0x80808080 but needs
     * neither an aligned address nor a cast to an incompatible type. */
    if (i < n-3 && ((s[i] | s[i+1] | s[i+2] | s[i+3]) & 0x80) == 0) {
      i += 4;
      continue;
    }

    x = s[i];
    if (!(x & 0x80)) {
      /* 0xxxxxxx: ASCII */
      i++;
      continue;
    }
    if (!(x & 0x40)) {
      /* 10xxxxxx: continuation byte where a start byte is required */
      goto error;
    }

    if (!(x & 0x20)) {
      extra = 1;        /* 110xxxxx */
    } else if (!(x & 0x10)) {
      extra = 2;        /* 1110xxxx */
    } else if (!(x & 0x08)) {
      extra = 3;        /* 11110xxx */
    } else {
      goto error;       /* 11111xxx: reserved */
    }

    /* Truncated sequence: fail at the start byte instead of reading
     * past the end of the buffer. */
    if (i + extra >= n) goto error;

    while (extra--) {
      i++;
      if ((s[i] & 0xc0) != 0x80) goto error;
    }
    i++;
  }

error:
  d_1[0] = i;
}
+OIL_DEFINE_IMPL (utf8_validate_fast3, utf8_validate);
+
/*
 * Classification of every possible first byte of a UTF-8 sequence:
 *   0        ASCII, no continuation bytes follow
 *   1, 2, 3  start byte, followed by that many continuation bytes
 *   8        not a valid start byte (continuation byte or reserved)
 * The table is read-only, hence const.
 */
static const uint8_t utf8_table[256] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8
};

/*
 * utf8_validate_lookup:
 * @d_1: d_1[0] receives the index at which scanning stopped
 * @s:   bytes to check
 * @n:   number of bytes available in @s
 *
 * Table-driven structural UTF-8 check.  On return d_1[0] is n when every
 * byte was accepted, otherwise the index of the offending byte.  A
 * sequence whose continuation bytes would extend past the end of the
 * buffer is rejected at its start byte, and no byte at or beyond s[n]
 * is ever read.
 */
static void
utf8_validate_lookup (int32_t *d_1, uint8_t *s, int n)
{
  int i;
  uint8_t x;

  i = 0;
  while (i < n) {
    x = utf8_table[s[i]];
    if (x > 0) {
      if (x == 8) goto error;
      /* Truncated sequence: fail at the start byte instead of reading
       * past the end of the buffer. */
      if (i + x >= n) goto error;
      while (x > 0) {
        i++;
        if ((s[i] & 0xc0) != 0x80) goto error;
        x--;
      }
    }
    i++;
  }

error:
  d_1[0] = i;
}
+OIL_DEFINE_IMPL (utf8_validate_lookup, utf8_validate);
+
#if 0
static void
utf8_validate_asm1 (int32_t *d_1, uint8_t *s, int n)