diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-05 20:12:24 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-05 20:12:24 +0000 |
commit | a9839b968cee5828bf35dbcb05a31859a49ab7a2 (patch) | |
tree | 836125e6c0ea7958e295ccda9f7d060b05102430 /pcre_study.c | |
parent | 216818740b54b629e7bd59cd49f783c72e244e23 (diff) | |
download | pcre-a9839b968cee5828bf35dbcb05a31859a49ab7a2.tar.gz |
Improving UTF-16 support by fixing a lot of issues.
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@785 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_study.c')
-rw-r--r-- | pcre_study.c | 113 |
1 files changed, 98 insertions, 15 deletions
diff --git a/pcre_study.c b/pcre_study.c index 1e10397..3f25c3a 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -224,7 +224,7 @@ for (;;) case OP_NOTPOSPLUSI: branchlength++; cc += 2; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -245,7 +245,7 @@ for (;;) case OP_NOTEXACTI: branchlength += GET2(cc,1); cc += 2 + IMM2_SIZE; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -293,7 +293,7 @@ for (;;) appear, but leave the code, just in case.) */ case OP_ANYBYTE: -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf) return -1; #endif branchlength++; @@ -486,7 +486,7 @@ for (;;) case OP_NOTPOSQUERYI: cc += PRIV(OP_lengths)[op]; -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); #endif break; @@ -549,9 +549,10 @@ set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, { unsigned int c = *p; +#ifdef COMPILE_PCRE8 SET_BIT(c); -#ifdef SUPPORT_UTF8 +#ifdef SUPPORT_UTF if (utf && c > 127) { GETCHARINC(c, p); @@ -572,6 +573,33 @@ if (utf && c > 127) if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); return p + 1; +#endif + +#ifdef COMPILE_PCRE16 +if (c > 0xff) + c = 0xff; +SET_BIT(c); + +#ifdef SUPPORT_UTF +if (utf && c > 127) + { + GETCHARINC(c, p); +#ifdef SUPPORT_UCP + if (caseless) + { + c = UCD_OTHERCASE(c); + if (c > 0xff) + c = 0xff; + SET_BIT(c); + } +#endif + return p; + } +#endif + +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); +return p + 1; +#endif } @@ -602,7 +630,7 @@ set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit, { register int c; for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (table_limit == 32) return; for (c = 128; c < 256; c++) { @@ -644,7 +672,9 @@ set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit, { register int c; for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff; +#endif } @@ -679,7 +709,11 @@ set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf, { register int c; int yield = SSB_DONE; +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 int table_limit = utf? 16:32; +#else +int table_limit = 32; +#endif #if 0 /* ========================================================================= */ @@ -951,14 +985,23 @@ do case OP_HSPACE: SET_BIT(0x09); SET_BIT(0x20); +#ifdef SUPPORT_UTF if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+00A0 */ SET_BIT(0xE1); /* For U+1680, U+180E */ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ SET_BIT(0xE3); /* For U+3000 */ +#endif +#ifdef COMPILE_PCRE16 + SET_BIT(0xA0); + SET_BIT(0xFF); /* For characters > 255 */ +#endif } - else SET_BIT(0xA0); + else +#endif /* SUPPORT_UTF */ + SET_BIT(0xA0); try_next = FALSE; break; @@ -968,12 +1011,21 @@ do SET_BIT(0x0B); SET_BIT(0x0C); SET_BIT(0x0D); +#ifdef SUPPORT_UTF if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+0085 */ SET_BIT(0xE2); /* For U+2028, U+2029 */ +#endif +#ifdef COMPILE_PCRE16 + SET_BIT(0x85); + SET_BIT(0xFF); /* For characters > 255 */ +#endif } - else SET_BIT(0x85); + else +#endif /* SUPPORT_UTF */ + SET_BIT(0x85); try_next = FALSE; break; @@ -1058,14 +1110,23 @@ do case OP_HSPACE: SET_BIT(0x09); SET_BIT(0x20); +#ifdef COMPILE_PCRE8 if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+00A0 */ SET_BIT(0xE1); /* For U+1680, U+180E */ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */ SET_BIT(0xE3); /* For U+3000 */ +#endif +#ifdef COMPILE_PCRE16 + SET_BIT(0xA0); + SET_BIT(0xFF); /* For characters > 255 */ +#endif } - else SET_BIT(0xA0); + else +#endif /* SUPPORT_UTF */ + SET_BIT(0xA0); break; case OP_ANYNL: @@ -1074,12 +1135,21 @@ do SET_BIT(0x0B); SET_BIT(0x0C); SET_BIT(0x0D); +#ifdef COMPILE_PCRE8 if (utf) { +#ifdef COMPILE_PCRE8 SET_BIT(0xC2); /* For U+0085 */ SET_BIT(0xE2); /* For U+2028, U+2029 */ +#endif +#ifdef COMPILE_PCRE16 + SET_BIT(0x85); + SET_BIT(0xFF); /* For characters > 255 */ +#endif } - else SET_BIT(0x85); + else +#endif /* SUPPORT_UTF */ + SET_BIT(0x85); break; case OP_NOT_DIGIT: @@ -1126,13 +1196,16 @@ do character with a value > 255. */ case OP_NCLASS: -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (utf) { start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ } #endif +#ifdef COMPILE_PCRE16 + SET_BIT(0xFF); /* For characters > 255 */ +#endif /* Fall through */ case OP_CLASS: @@ -1147,7 +1220,7 @@ do value is > 127. In fact, there are only two possible starting bytes for characters in the range 128 - 255. */ -#ifdef SUPPORT_UTF8 +#if defined SUPPORT_UTF && defined COMPILE_PCRE8 if (utf) { for (c = 0; c < 16; c++) start_bits[c] |= map[c]; @@ -1161,12 +1234,10 @@ do } } } - - /* In non-UTF-8 mode, the two bit maps are completely compatible. */ - else #endif { + /* In non-UTF-8 mode, the two bit maps are completely compatible. */ for (c = 0; c < 32; c++) start_bits[c] |= map[c]; } @@ -1342,6 +1413,18 @@ if (bits_set || min > 0 memcpy(study->start_bits, start_bits, sizeof(start_bits)); } +#ifdef PCRE_DEBUG + if (bits_set) + { + pcre_uint8 *ptr = (pcre_uint32 *)start_bits; + int i; + + printf("Start bits:\n"); + for (i = 0; i < 32; i++) + printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n"); + } +#endif + /* Always set the minlength value in the block, because the JIT compiler makes use of it. However, don't set the bit unless the length is greater than zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time |