diff options
author | David Schleef <ds@schleef.org> | 2005-12-18 08:31:23 +0000 |
---|---|---|
committer | David Schleef <ds@schleef.org> | 2005-12-18 08:31:23 +0000 |
commit | 1fea44b4199bedcd028d43a6607a2f168fcab3ab (patch) | |
tree | 7be87e58144576a3630fb10d5685943b0b50382e /liboil/utf8 | |
parent | eb82d8fb7d4d2a14b03fb33eb2edbae112cae235 (diff) | |
download | liboil-1fea44b4199bedcd028d43a6607a2f168fcab3ab.tar.gz |
* liboil/utf8/utf8.c: (utf8_validate_test), (utf8_validate_ref):
* liboil/utf8/utf8_fast.c: (utf8_validate_fast),
(utf8_validate_fast2), (utf8_validate_fast3),
(utf8_validate_lookup):
Some utf8 hacking. At least it works now.
Diffstat (limited to 'liboil/utf8')
-rw-r--r-- | liboil/utf8/utf8.c | 54 | ||||
-rw-r--r-- | liboil/utf8/utf8_fast.c | 110 |
2 files changed, 157 insertions, 7 deletions
diff --git a/liboil/utf8/utf8.c b/liboil/utf8/utf8.c index 8bbbdaf..e67cb65 100644 --- a/liboil/utf8/utf8.c +++ b/liboil/utf8/utf8.c @@ -35,15 +35,64 @@ #include "utf8.h" +/* + * Little explanation: + * 0x00-0x7f ASCII, one byte character + * 0x80-0xbf continuation byte, not a valid start byte + * 0xc0-0xdf 2-byte character + * 0xe0-0xef 3-byte character + * 0xf0-0xf7 4-byte character + * 0xf8-0xff reserved (illegal at the present time) + */ static void utf8_validate_test (OilTest *test) { int i; int n = test->n; uint8_t *ptr = oil_test_get_source_data (test, OIL_ARG_SRC1); + int x; + int extra_chars = 0; for (i=0;i<n;i++){ - OIL_GET(ptr, i, uint8_t) = oil_rand_u8() & 0x7f; + if (i >= n-16) { + /* if it's close to the end, we'll randomly drop in a bad + * byte from either the 0x80-0xbf or 0xf8-0xff segments */ + x = oil_rand_u8(); + if (x < 16) { + x = oil_rand_u8(); + if (extra_chars>0) { + /* this might not actually be a bad char */ + ptr[i] = x; + extra_chars--; + } else { + if (x & 0x80) { + ptr[i] = 0x80 | (x&0x3f); + } else { + ptr[i] = 0xf8 | (x&0x07); + } + } + continue; + } + } + if (extra_chars > 0) { + ptr[i] = 0x80 | (oil_rand_u8() & 0x3f); + extra_chars--; + } else { + /* otherwise, we'll do a low probability of a multibyte char */ + x = oil_rand_u8() & 0xf; + if (x == 0) { + ptr[i] = 0xc0 | (oil_rand_u8() & 0x1f); + extra_chars = 1; + } else if (x == 1) { + ptr[i] = 0xe0 | (oil_rand_u8() & 0x0f); + extra_chars = 2; + } else if (x == 2) { + ptr[i] = 0xf0 | (oil_rand_u8() & 0x07); + extra_chars = 3; + } else { + ptr[i] = oil_rand_u8() & 0x7f; + } + } } } @@ -71,6 +120,7 @@ utf8_validate_ref (int32_t *d_1, uint8_t *s, int n) int mask; for(i=0;i<n;i++){ + extra_bytes = 0; if (s[i] < 128) continue; if ((s[i] & 0xe0) == 0xc0) { extra_bytes = 1; @@ -86,8 +136,8 @@ utf8_validate_ref (int32_t *d_1, uint8_t *s, int n) } if (i + extra_bytes >= n) goto error; while(extra_bytes--) { - if ((s[i] & 0xc0) != 0x80) goto error; i++; + if ((s[i] & 0xc0) != 0x80) goto error; } } diff --git a/liboil/utf8/utf8_fast.c b/liboil/utf8/utf8_fast.c index 73dfd1d..077c63a 100644 --- a/liboil/utf8/utf8_fast.c +++ b/liboil/utf8/utf8_fast.c @@ -64,8 +64,8 @@ utf8_validate_fast (int32_t *d_1, uint8_t *s, int n) } if (i + extra_bytes >= n) goto error; while(extra_bytes--) { - if ((s[i] & 0xc0) != 0x80) goto error; i++; + if ((s[i] & 0xc0) != 0x80) goto error; } i++; } @@ -84,19 +84,23 @@ utf8_validate_fast2 (int32_t *d_1, uint8_t *s, int n) i=0; while (i<n) { x = s[i]; - if (s[i] < 128) { + if (!(x & 0x80)) { i++; continue; } x <<= 1; - if (s[i] < 128) { + if (!(x & 0x80)) { + goto error; + } + x <<= 1; + if (!(x & 0x80)) { i++; if ((s[i] & 0xc0) != 0x80) goto error; i++; continue; } x <<= 1; - if (s[i] < 128) { + if (!(x & 0x80)) { i++; if ((s[i] & 0xc0) != 0x80) goto error; i++; @@ -105,7 +109,7 @@ utf8_validate_fast2 (int32_t *d_1, uint8_t *s, int n) continue; } x <<= 1; - if (s[i] < 128) { + if (!(x & 0x80)) { i++; if ((s[i] & 0xc0) != 0x80) goto error; i++; @@ -123,6 +127,102 @@ error: } OIL_DEFINE_IMPL (utf8_validate_fast2, utf8_validate); +static void +utf8_validate_fast3 (int32_t *d_1, uint8_t *s, int n) +{ + int i; + uint8_t x; + + i=0; + while (i<n) { + if (i < n-3 && (*(uint32_t *)(s+i) & 0x80808080) == 0) { + i+=4; + continue; + } + x = s[i]; + if (!(x & 0x80)) { + i++; + continue; + } + if (!(x & 0x40)) { + goto error; + } + if (!(x & 0x20)) { + i++; + if ((s[i] & 0xc0) != 0x80) goto error; + i++; + continue; + } + if (!(x & 0x10)) { + i++; + if ((s[i] & 0xc0) != 0x80) goto error; + i++; + if ((s[i] & 0xc0) != 0x80) goto error; + i++; + continue; + } + if (!(x & 0x08)) { + i++; + if ((s[i] & 0xc0) != 0x80) goto error; + i++; + if ((s[i] & 0xc0) != 0x80) goto error; + i++; + if ((s[i] & 0xc0) != 0x80) goto error; + i++; + continue; + } + goto error; + } + +error: + d_1[0] = i; +} +OIL_DEFINE_IMPL (utf8_validate_fast3, utf8_validate); + +static uint8_t utf8_table[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8 +}; + +static void +utf8_validate_lookup (int32_t *d_1, uint8_t *s, int n) +{ + int i; + uint8_t x; + + i=0; + while (i<n) { + x = utf8_table[s[i]]; + if (x > 0) { + if (x == 8) goto error; + while (x>0) { + i++; + if ((s[i] & 0xc0) != 0x80) goto error; + x--; + } + } + i++; + } + +error: + d_1[0] = i; +} +OIL_DEFINE_IMPL (utf8_validate_lookup, utf8_validate); + #if 0 static void utf8_validate_asm1 (int32_t *d_1, uint8_t *s, int n) |