summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-26 21:23:17 +0000
committer zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2011-12-26 21:23:17 +0000
commit 5fa1a51657506bf743cd560f15726a3bafeb8e6d (patch)
tree ad2c4788be88a21fc7763e1ddf294b53593d331b
parent 1171fd96716f91c4bce421cd21aebf97b0bff6fb (diff)
download pcre-5fa1a51657506bf743cd560f15726a3bafeb8e6d.tar.gz
Supporting all newlines, horizontal and vertical spaces in 16 bit mode
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@825 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- Makefile.am 1
-rw-r--r-- pcre_exec.c 134
-rw-r--r-- pcre_jit_compile.c 18
-rw-r--r-- pcre_jit_test.c 144
-rw-r--r-- pcre_study.c 15
-rw-r--r-- testdata/testinput16 6
-rw-r--r-- testdata/testinput17 6
-rw-r--r-- testdata/testinput5 6
-rw-r--r-- testdata/testoutput16 24
-rw-r--r-- testdata/testoutput17 24
-rw-r--r-- testdata/testoutput5 24
11 files changed, 318 insertions, 84 deletions
diff --git a/Makefile.am b/Makefile.am
index 810c9d9..4598ca6 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -168,6 +168,7 @@ pcre_chartables.c: $(srcdir)/pcre_chartables.c.dist
endif # WITH_REBUILD_CHARTABLES
+BUILT_SOURCES = pcre_chartables.c
## The main pcre library
diff --git a/pcre_exec.c b/pcre_exec.c
index fa8bc77..ccbcfbd 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -4470,6 +4470,10 @@ for (;;)
case 0x000b:
case 0x000c:
case 0x0085:
+#ifdef COMPILE_PCRE16
+ case 0x2028:
+ case 0x2029:
+#endif
if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
break;
}
@@ -4490,6 +4494,24 @@ for (;;)
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
+#ifdef COMPILE_PCRE16
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+#endif
RRETURN(MATCH_NOMATCH);
}
}
@@ -4509,6 +4531,24 @@ for (;;)
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
+#ifdef COMPILE_PCRE16
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+#endif
break;
}
}
@@ -4530,6 +4570,10 @@ for (;;)
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
+#ifdef COMPILE_PCRE16
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+#endif
RRETURN(MATCH_NOMATCH);
}
}
@@ -4551,6 +4595,10 @@ for (;;)
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
+#ifdef COMPILE_PCRE16
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+#endif
break;
}
}
@@ -5063,6 +5111,10 @@ for (;;)
case 0x000b:
case 0x000c:
case 0x0085:
+#ifdef COMPILE_PCRE16
+ case 0x2028:
+ case 0x2029:
+#endif
if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
break;
}
@@ -5075,6 +5127,24 @@ for (;;)
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
+#ifdef COMPILE_PCRE16
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+#endif
RRETURN(MATCH_NOMATCH);
}
break;
@@ -5086,6 +5156,24 @@ for (;;)
case 0x09: /* HT */
case 0x20: /* SPACE */
case 0xa0: /* NBSP */
+#ifdef COMPILE_PCRE16
+ case 0x1680: /* OGHAM SPACE MARK */
+ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
+ case 0x2000: /* EN QUAD */
+ case 0x2001: /* EM QUAD */
+ case 0x2002: /* EN SPACE */
+ case 0x2003: /* EM SPACE */
+ case 0x2004: /* THREE-PER-EM SPACE */
+ case 0x2005: /* FOUR-PER-EM SPACE */
+ case 0x2006: /* SIX-PER-EM SPACE */
+ case 0x2007: /* FIGURE SPACE */
+ case 0x2008: /* PUNCTUATION SPACE */
+ case 0x2009: /* THIN SPACE */
+ case 0x200A: /* HAIR SPACE */
+ case 0x202f: /* NARROW NO-BREAK SPACE */
+ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
+ case 0x3000: /* IDEOGRAPHIC SPACE */
+#endif
break;
}
break;
@@ -5099,6 +5187,10 @@ for (;;)
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
+#ifdef COMPILE_PCRE16
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+#endif
RRETURN(MATCH_NOMATCH);
}
break;
@@ -5112,6 +5204,10 @@ for (;;)
case 0x0c: /* FF */
case 0x0d: /* CR */
case 0x85: /* NEL */
+#ifdef COMPILE_PCRE16
+ case 0x2028: /* LINE SEPARATOR */
+ case 0x2029: /* PARAGRAPH SEPARATOR */
+#endif
break;
}
break;
@@ -5708,10 +5804,12 @@ for (;;)
}
else
{
- if (c != 0x000a &&
- (md->bsr_anycrlf ||
- (c != 0x000b && c != 0x000c && c != 0x0085)))
- break;
+ if (c != 0x000a && (md->bsr_anycrlf ||
+ (c != 0x000b && c != 0x000c && c != 0x0085
+#ifdef COMPILE_PCRE16
+ && c != 0x2028 && c != 0x2029
+#endif
+ ))) break;
eptr++;
}
}
@@ -5726,7 +5824,12 @@ for (;;)
break;
}
c = *eptr;
- if (c == 0x09 || c == 0x20 || c == 0xa0) break;
+ if (c == 0x09 || c == 0x20 || c == 0xa0
+#ifdef COMPILE_PCRE16
+ || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
+ || c == 0x202f || c == 0x205f || c == 0x3000
+#endif
+ ) break;
eptr++;
}
break;
@@ -5740,7 +5843,12 @@ for (;;)
break;
}
c = *eptr;
- if (c != 0x09 && c != 0x20 && c != 0xa0) break;
+ if (c != 0x09 && c != 0x20 && c != 0xa0
+#ifdef COMPILE_PCRE16
+ && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
+ && c != 0x202f && c != 0x205f && c != 0x3000
+#endif
+ ) break;
eptr++;
}
break;
@@ -5754,8 +5862,11 @@ for (;;)
break;
}
c = *eptr;
- if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
- break;
+ if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
+#ifdef COMPILE_PCRE16
+ || c == 0x2028 || c == 0x2029
+#endif
+ ) break;
eptr++;
}
break;
@@ -5769,8 +5880,11 @@ for (;;)
break;
}
c = *eptr;
- if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
- break;
+ if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
+#ifdef COMPILE_PCRE16
+ && c != 0x2028 && c != 0x2029
+#endif
+ ) break;
eptr++;
}
break;
diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c
index 7f9df74..c24a8fc 100644
--- a/pcre_jit_compile.c
+++ b/pcre_jit_compile.c
@@ -2273,14 +2273,18 @@ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0a);
OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a);
COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_LESS_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a);
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
+#ifdef COMPILE_PCRE8
if (common->utf)
{
+#endif
COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x2029 - 0x0a);
+#ifdef COMPILE_PCRE8
}
#endif
+#endif /* SUPPORT_UTF || COMPILE_PCRE16 */
COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
}
@@ -2297,9 +2301,11 @@ COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x20);
COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0xa0);
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
+#ifdef COMPILE_PCRE8
if (common->utf)
{
+#endif
COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x1680);
COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
@@ -2313,8 +2319,10 @@ if (common->utf)
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x205f - 0x2000);
COND_VALUE(SLJIT_OR, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x3000 - 0x2000);
+#ifdef COMPILE_PCRE8
}
#endif
+#endif /* SUPPORT_UTF || COMPILE_PCRE16 */
COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
@@ -2331,14 +2339,18 @@ OP2(SLJIT_SUB, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x0a);
OP2(SLJIT_SUB | SLJIT_SET_U, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x0d - 0x0a);
COND_VALUE(SLJIT_MOV, TMP2, 0, SLJIT_C_LESS_EQUAL);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x85 - 0x0a);
-#ifdef SUPPORT_UTF
+#if defined SUPPORT_UTF || defined COMPILE_PCRE16
+#ifdef COMPILE_PCRE8
if (common->utf)
{
+#endif
COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
OP2(SLJIT_OR, TMP1, 0, TMP1, 0, SLJIT_IMM, 0x1);
OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, 0x2029 - 0x0a);
+#ifdef COMPILE_PCRE8
}
#endif
+#endif /* SUPPORT_UTF || COMPILE_PCRE16 */
COND_VALUE(SLJIT_OR | SLJIT_SET_E, TMP2, 0, SLJIT_C_EQUAL);
sljit_emit_fast_return(compiler, RETURN_ADDR, 0);
diff --git a/pcre_jit_test.c b/pcre_jit_test.c
index 50eb653..252c192 100644
--- a/pcre_jit_test.c
+++ b/pcre_jit_test.c
@@ -87,7 +87,11 @@ static int regression_tests(void);
int main(void)
{
int jit = 0;
+#ifdef SUPPORT_PCRE8
pcre_config(PCRE_CONFIG_JIT, &jit);
+#else
+ pcre16_config(PCRE_CONFIG_JIT, &jit);
+#endif
if (!jit) {
printf("JIT must be enabled to run pcre_jit_test\n");
return 1;
@@ -115,6 +119,7 @@ int main(void)
#define F_NOMATCH 0x040000
#define F_DIFF 0x080000
#define F_FORCECONV 0x100000
+#define F_PROPERTY 0x200000
struct regression_test_case {
int flags;
@@ -192,7 +197,7 @@ static struct regression_test_case regression_test_cases[] = {
{ PCRE_NOTBOL | PCRE_MULTILINE | PCRE_NEWLINE_ANY, 0, "^a", "aa\naa" },
{ PCRE_NOTEOL | PCRE_NEWLINE_ANY, 0 | F_NOMATCH, "a$", "aa\naa" },
{ PCRE_NOTEOL | PCRE_NEWLINE_ANY, 0 | F_NOMATCH, "a$", "aa\r\n" },
- { PCRE_UTF8 | PCRE_DOLLAR_ENDONLY | PCRE_NEWLINE_ANY, 0, "\\p{Any}{2,}$", "aa\r\n" },
+ { PCRE_UTF8 | PCRE_DOLLAR_ENDONLY | PCRE_NEWLINE_ANY, 0 | F_PROPERTY, "\\p{Any}{2,}$", "aa\r\n" },
{ PCRE_NOTEOL | PCRE_MULTILINE | PCRE_NEWLINE_ANY, 0, "a$", "aa\naa" },
{ PCRE_NEWLINE_CR, 0, ".\\Z", "aaa" },
{ PCRE_NEWLINE_CR | PCRE_UTF8, 0, "a\\Z", "aaa\r" },
@@ -305,24 +310,24 @@ static struct regression_test_case regression_test_cases[] = {
/* Unicode properties. */
{ MUAP, 0, "[1-5\xc3\xa9\\w]", "\xc3\xa1_" },
- { MUAP, 0, "[\xc3\x81\\p{Ll}]", "A_\xc3\x89\xc3\xa1" },
+ { MUAP, 0 | F_PROPERTY, "[\xc3\x81\\p{Ll}]", "A_\xc3\x89\xc3\xa1" },
{ MUAP, 0, "[\\Wd-h_x-z]+", "a\xc2\xa1#_yhzdxi" },
- { MUAP, 0 | F_NOMATCH, "[\\P{Any}]", "abc" },
- { MUAP, 0 | F_NOMATCH, "[^\\p{Any}]", "abc" },
- { MUAP, 0 | F_NOMATCH, "[\\P{Any}\xc3\xa1-\xc3\xa8]", "abc" },
- { MUAP, 0 | F_NOMATCH, "[^\\p{Any}\xc3\xa1-\xc3\xa8]", "abc" },
- { MUAP, 0 | F_NOMATCH, "[\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
- { MUAP, 0 | F_NOMATCH, "[^\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
- { MUAP, 0, "[\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
- { MUAP, 0, "[^\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
+ { MUAP, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}]", "abc" },
+ { MUAP, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}]", "abc" },
+ { MUAP, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}\xc3\xa1-\xc3\xa8]", "abc" },
+ { MUAP, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}\xc3\xa1-\xc3\xa8]", "abc" },
+ { MUAP, 0 | F_NOMATCH | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
+ { MUAP, 0 | F_NOMATCH | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
+ { MUAP, 0 | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
+ { MUAP, 0 | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
{ MUAP, 0, "[b-\xc3\xa9\\s]", "a\xc\xe6\x92\xad" },
{ CMUAP, 0, "[\xc2\x85-\xc2\x89\xc3\x89]", "\xc2\x84\xc3\xa9" },
{ MUAP, 0, "[^b-d^&\\s]{3,}", "db^ !a\xe2\x80\xa8_ae" },
- { MUAP, 0, "[^\\S\\P{Any}][\\sN]{1,3}[\\P{N}]{4}", "\xe2\x80\xaa\xa N\x9\xc3\xa9_0" },
- { MUA, 0, "[^\\P{L}\x9!D-F\xa]{2,3}", "\x9,.DF\xa.CG\xc3\x81" },
+ { MUAP, 0 | F_PROPERTY, "[^\\S\\P{Any}][\\sN]{1,3}[\\P{N}]{4}", "\xe2\x80\xaa\xa N\x9\xc3\xa9_0" },
+ { MUA, 0 | F_PROPERTY, "[^\\P{L}\x9!D-F\xa]{2,3}", "\x9,.DF\xa.CG\xc3\x81" },
{ CMUAP, 0, "[\xc3\xa1-\xc3\xa9_\xe2\x80\xa0-\xe2\x80\xaf]{1,5}[^\xe2\x80\xa0-\xe2\x80\xaf]", "\xc2\xa1\xc3\x89\xc3\x89\xe2\x80\xaf_\xe2\x80\xa0" },
- { MUAP, 0, "[\xc3\xa2-\xc3\xa6\xc3\x81-\xc3\x84\xe2\x80\xa8-\xe2\x80\xa9\xe6\x92\xad\\p{Zs}]{2,}", "\xe2\x80\xa7\xe2\x80\xa9\xe6\x92\xad \xe6\x92\xae" },
- { MUAP, 0, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
+ { MUAP, 0 | F_PROPERTY, "[\xc3\xa2-\xc3\xa6\xc3\x81-\xc3\x84\xe2\x80\xa8-\xe2\x80\xa9\xe6\x92\xad\\p{Zs}]{2,}", "\xe2\x80\xa7\xe2\x80\xa9\xe6\x92\xad \xe6\x92\xae" },
+ { MUAP, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
{ PCRE_UCP, 0, "[a-b\\s]{2,5}[^a]", "AB baaa" },
/* Possible empty brackets. */
@@ -406,13 +411,13 @@ static struct regression_test_case regression_test_cases[] = {
{ CMA, 0, "(?>((?>a{32}|b+|(a*))?(?>c+|d*)?\?)+e)+?f", "aaccebbdde bbdaaaccebbdee bbdaaaccebbdeef" },
{ MUA, 0, "(?>(?:(?>aa|a||x)+?b|(?>aa|a||(x))+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
{ MUA, 0, "(?>(?:(?>aa|a||(x))+?b|(?>aa|a||x)+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
- { MUA, 0 | F_NOMATCH, "\\X", "\xcc\x8d\xcc\x8d" },
- { MUA, 0, "\\X", "\xcc\x8d\xcc\x8d#\xcc\x8d\xcc\x8d" },
- { MUA, 0, "\\X+..", "\xcc\x8d#\xcc\x8d#\xcc\x8d\xcc\x8d" },
- { MUA, 0, "\\X{2,4}", "abcdef" },
- { MUA, 0, "\\X{2,4}?", "abcdef" },
- { MUA, 0 | F_NOMATCH, "\\X{2,4}..", "#\xcc\x8d##" },
- { MUA, 0, "\\X{2,4}..", "#\xcc\x8d#\xcc\x8d##" },
+ { MUA, 0 | F_NOMATCH | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d" },
+ { MUA, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d#\xcc\x8d\xcc\x8d" },
+ { MUA, 0 | F_PROPERTY, "\\X+..", "\xcc\x8d#\xcc\x8d#\xcc\x8d\xcc\x8d" },
+ { MUA, 0 | F_PROPERTY, "\\X{2,4}", "abcdef" },
+ { MUA, 0 | F_PROPERTY, "\\X{2,4}?", "abcdef" },
+ { MUA, 0 | F_NOMATCH | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d##" },
+ { MUA, 0 | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d#\xcc\x8d##" },
{ MUA, 0, "(c(ab)?+ab)+", "cabcababcab" },
{ MUA, 0, "(?>(a+)b)+aabab", "aaaabaaabaabab" },
@@ -471,11 +476,11 @@ static struct regression_test_case regression_test_cases[] = {
{ MUA, 0, "(?:(aa|bb)(\\1{0,3}?)){2}(dd|)(\\3{0,3}?)b(\\1{0,3}?)(\\1{0,3})", "aaaaaaaaaaaaaaabaaaaa" },
{ MUA, 0, "(a(?:\\1|)a){3}b", "aaaaaaaaaaab" },
{ MA, 0, "(a?)b(\\1\\1*\\1+\\1?\\1*?\\1+?\\1??\\1*+\\1++\\1?+\\1{4}\\1{3,5}\\1{4,}\\1{0,5}\\1{3,5}?\\1{4,}?\\1{0,5}?\\1{3,5}+\\1{4,}+\\1{0,5}+#){2}d", "bb#b##d" },
- { MUAP, 0, "(\\P{N})\\1{2,}", ".www." },
- { MUAP, 0, "(\\P{N})\\1{0,2}", "wwwww." },
- { MUAP, 0, "(\\P{N})\\1{1,2}ww", "wwww" },
- { MUAP, 0, "(\\P{N})\\1{1,2}ww", "wwwww" },
- { PCRE_UCP, 0, "(\\P{N})\\1{2,}", ".www." },
+ { MUAP, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
+ { MUAP, 0 | F_PROPERTY, "(\\P{N})\\1{0,2}", "wwwww." },
+ { MUAP, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwww" },
+ { MUAP, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwwww" },
+ { PCRE_UCP, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
{ CMUAP, 0, "(\xf0\x90\x90\x80)\\1", "\xf0\x90\x90\xa8\xf0\x90\x90\xa8" },
/* Assertions. */
@@ -569,8 +574,8 @@ static struct regression_test_case regression_test_cases[] = {
{ MUA | PCRE_NOTEMPTY_ATSTART, 0, "a\\K(*ACCEPT)b", "aa" },
/* First line. */
- { MUA | PCRE_FIRSTLINE, 0, "\\p{Any}a", "bb\naaa" },
- { MUA | PCRE_FIRSTLINE, 0 | F_NOMATCH, "\\p{Any}a", "bb\r\naaa" },
+ { MUA | PCRE_FIRSTLINE, 0 | F_PROPERTY, "\\p{Any}a", "bb\naaa" },
+ { MUA | PCRE_FIRSTLINE, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}a", "bb\r\naaa" },
{ MUA | PCRE_FIRSTLINE, 0, "(?<=a)", "a" },
{ MUA | PCRE_FIRSTLINE, 0 | F_NOMATCH, "[^a][^b]", "ab" },
{ MUA | PCRE_FIRSTLINE, 0 | F_NOMATCH, "a", "\na" },
@@ -580,11 +585,11 @@ static struct regression_test_case regression_test_cases[] = {
{ PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_ANY | PCRE_FIRSTLINE, 0 | F_NOMATCH, "#", "\xc2\x85#" },
{ PCRE_MULTILINE | PCRE_NEWLINE_ANY | PCRE_FIRSTLINE, 0 | F_NOMATCH, "#", "\x85#" },
{ PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_ANY | PCRE_FIRSTLINE, 0 | F_NOMATCH, "^#", "\xe2\x80\xa8#" },
- { PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_CRLF | PCRE_FIRSTLINE, 0, "\\p{Any}", "\r\na" },
+ { PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_CRLF | PCRE_FIRSTLINE, 0 | F_PROPERTY, "\\p{Any}", "\r\na" },
{ PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_CRLF | PCRE_FIRSTLINE, 0, ".", "\r" },
{ PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_CRLF | PCRE_FIRSTLINE, 0, "a", "\ra" },
{ PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_CRLF | PCRE_FIRSTLINE, 0 | F_NOMATCH, "ba", "bbb\r\nba" },
- { PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_CRLF | PCRE_FIRSTLINE, 0 | F_NOMATCH, "\\p{Any}{4}|a", "\r\na" },
+ { PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_CRLF | PCRE_FIRSTLINE, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}{4}|a", "\r\na" },
{ PCRE_MULTILINE | PCRE_UTF8 | PCRE_NEWLINE_CRLF | PCRE_FIRSTLINE, 1, ".", "\r\n" },
/* Recurse. */
@@ -640,6 +645,12 @@ static struct regression_test_case regression_test_cases[] = {
{ CMA | PCRE_EXTENDED, 0 | F_FORCECONV, "\xed\xa0\x80 \xed\xb0\x80 !", "\xed\xa0\x80\xed\xb0\x80!" },
{ CMA, 0 | F_FORCECONV, "\xed\xa0\x80+#[^#]+\xed\xa0\x80", "\xed\xa0\x80#a\xed\xa0\x80" },
{ CMA, 0 | F_FORCECONV, "(\xed\xa0\x80+)#\\1", "\xed\xa0\x80\xed\xa0\x80#\xed\xa0\x80\xed\xa0\x80" },
+ { PCRE_MULTILINE | PCRE_NEWLINE_ANY, 0 | F_NO8 | F_FORCECONV, "^-", "a--\xe2\x80\xa8--" },
+ { PCRE_BSR_UNICODE, 0 | F_NO8 | F_FORCECONV, "\\R", "ab\xe2\x80\xa8" },
+ { 0, 0 | F_NO8 | F_FORCECONV, "\\v", "ab\xe2\x80\xa9" },
+ { 0, 0 | F_NO8 | F_FORCECONV, "\\h", "ab\xe1\xa0\x8e" },
+ { 0, 0 | F_NO8 | F_FORCECONV, "\\v+?\\V+?#", "\xe2\x80\xa9\xe2\x80\xa9\xef\xbf\xbf\xef\xbf\xbf#" },
+ { 0, 0 | F_NO8 | F_FORCECONV, "\\h+?\\H+?#", "\xe1\xa0\x8e\xe1\xa0\x8e\xef\xbf\xbf\xef\xbf\xbf#" },
/* Deep recursion. */
{ MUA, 0, "((((?:(?:(?:\\w)+)?)*|(?>\\w)+?)+|(?>\\w)?\?)*)?\\s", "aaaaa+ " },
@@ -714,7 +725,8 @@ static pcre_jit_stack* callback(void *arg)
return (pcre_jit_stack *)arg;
}
-static void setstack(pcre_extra *extra)
+#ifdef SUPPORT_PCRE8
+static void setstack8(pcre_extra *extra)
{
static pcre_jit_stack *stack;
@@ -730,6 +742,26 @@ static void setstack(pcre_extra *extra)
/* Extra can be NULL. */
pcre_assign_jit_stack(extra, callback, stack);
}
+#endif /* SUPPORT_PCRE8 */
+
+#ifdef SUPPORT_PCRE16
+static void setstack16(pcre_extra *extra) /* 16 bit counterpart of setstack8(): frees the cached JIT stack when extra is NULL, otherwise assigns it to extra. */
+{
+ static pcre_jit_stack *stack; /* Kept across calls; allocated on demand below. */
+
+ if (!extra) { /* A NULL extra is the request to release the cached stack. */
+ if (stack)
+ pcre16_jit_stack_free(stack);
+ stack = NULL; /* Forces a fresh allocation on the next non-NULL call. */
+ return;
+ }
+
+ if (!stack) /* Allocate only when no stack is currently cached. */
+ stack = pcre16_jit_stack_alloc(1, 1024 * 1024);
+ /* extra is non-NULL here (NULL returned early above); register callback with the cached stack as its argument. */
+ pcre16_assign_jit_stack(extra, callback, stack);
+}
+#endif /* SUPPORT_PCRE16 */
#ifdef SUPPORT_PCRE16
@@ -803,11 +835,23 @@ static int regtest_offsetmap[REGTEST_MAX_LENGTH];
#endif /* SUPPORT_PCRE16 */
+static int check_ascii(const char *input) /* Returns 1 if every byte of the NUL-terminated input is <= 127 (including the empty string), otherwise 0. */
+{
+ const unsigned char *ptr = (unsigned char *)input; /* Read bytes as unsigned so values above 127 are seen as such. */
+ while (*ptr) { /* Stop at the terminating NUL. */
+ if (*ptr > 127)
+ return 0; /* First non-ASCII byte decides the result. */
+ ptr++;
+ }
+ return 1;
+}
+
static int regression_tests(void)
{
struct regression_test_case *current = regression_test_cases;
const char *error;
- int i, err_offs, is_successful;
+ int i, err_offs;
+ int is_successful, is_ascii_pattern, is_ascii_input;
int total = 0;
int successful = 0;
int counter = 0;
@@ -859,6 +903,13 @@ static int regression_tests(void)
while (current->pattern) {
/* printf("\nPattern: %s :\n", current->pattern); */
total++;
+ if (current->start_offset & F_PROPERTY) {
+ is_ascii_pattern = 0;
+ is_ascii_input = 0;
+ } else {
+ is_ascii_pattern = check_ascii(current->pattern);
+ is_ascii_input = check_ascii(current->input);
+ }
error = NULL;
#ifdef SUPPORT_PCRE8
@@ -883,7 +934,7 @@ static int regression_tests(void)
pcre_free(re8);
re8 = NULL;
}
- } else if (utf8 && ucp8 && !(current->start_offset & F_NO8))
+ } else if (((utf8 && ucp8) || is_ascii_pattern) && !(current->start_offset & F_NO8))
printf("\n8 bit: Cannot compile pattern: %s\n", current->pattern);
#endif
#ifdef SUPPORT_PCRE16
@@ -913,13 +964,19 @@ static int regression_tests(void)
pcre16_free(re16);
re16 = NULL;
}
- } else if (utf16 && ucp16 && !(current->start_offset & F_NO16))
+ } else if (((utf16 && ucp16) || is_ascii_pattern) && !(current->start_offset & F_NO16))
printf("\n16 bit: Cannot compile pattern: %s\n", current->pattern);
#endif
counter++;
- if ((counter & 0x3) != 0)
- setstack(NULL);
+ if ((counter & 0x3) != 0) {
+#ifdef SUPPORT_PCRE8
+ setstack8(NULL);
+#endif
+#ifdef SUPPORT_PCRE16
+ setstack16(NULL);
+#endif
+ }
#ifdef SUPPORT_PCRE8
return_value8_1 = -1000;
@@ -929,7 +986,7 @@ static int regression_tests(void)
for (i = 0; i < 32; ++i)
ovector8_2[i] = -2;
if (re8) {
- setstack(extra8);
+ setstack8(extra8);
return_value8_1 = pcre_exec(re8, extra8, current->input, strlen(current->input), current->start_offset & OFFSET_MASK,
current->flags & (PCRE_NOTBOL | PCRE_NOTEOL | PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART), ovector8_1, 32);
return_value8_2 = pcre_exec(re8, NULL, current->input, strlen(current->input), current->start_offset & OFFSET_MASK,
@@ -945,7 +1002,7 @@ static int regression_tests(void)
for (i = 0; i < 32; ++i)
ovector16_2[i] = -2;
if (re16) {
- setstack(extra16);
+ setstack16(extra16);
if ((current->flags & PCRE_UTF8) || (current->start_offset & F_FORCECONV))
length16 = convert_utf8_to_utf16(current->input, regtest_buf, regtest_offsetmap, REGTEST_MAX_LENGTH);
else
@@ -1032,7 +1089,7 @@ static int regression_tests(void)
if (is_successful) {
#ifdef SUPPORT_PCRE8
- if (!(current->start_offset & F_NO8)) {
+ if (!(current->start_offset & F_NO8) && ((utf8 && ucp8) || is_ascii_input)) {
if (return_value8_1 < 0 && !(current->start_offset & F_NOMATCH)) {
printf("8 bit: Test should match: [%d] '%s' @ '%s'\n",
total, current->pattern, current->input);
@@ -1047,7 +1104,7 @@ static int regression_tests(void)
}
#endif
#ifdef SUPPORT_PCRE16
- if (!(current->start_offset & F_NO16)) {
+ if (!(current->start_offset & F_NO16) && ((utf16 && ucp16) || is_ascii_input)) {
if (return_value16_1 < 0 && !(current->start_offset & F_NOMATCH)) {
printf("16 bit: Test should match: [%d] '%s' @ '%s'\n",
total, current->pattern, current->input);
@@ -1085,7 +1142,12 @@ static int regression_tests(void)
current++;
}
tables(1);
- setstack(NULL);
+#ifdef SUPPORT_PCRE8
+ setstack8(NULL);
+#endif
+#ifdef SUPPORT_PCRE16
+ setstack16(NULL);
+#endif
if (total == successful) {
printf("\nAll JIT regression tests are successfully passed.\n");
diff --git a/pcre_study.c b/pcre_study.c
index f1863b6..bf633cd 100644
--- a/pcre_study.c
+++ b/pcre_study.c
@@ -1004,7 +1004,12 @@ do
}
else
#endif /* SUPPORT_UTF */
+ {
SET_BIT(0xA0);
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
+ }
try_next = FALSE;
break;
@@ -1028,7 +1033,12 @@ do
}
else
#endif /* SUPPORT_UTF */
+ {
SET_BIT(0x85);
+#ifdef COMPILE_PCRE16
+ SET_BIT(0xFF); /* For characters > 255 */
+#endif
+ }
try_next = FALSE;
break;
@@ -1471,7 +1481,12 @@ if (bits_set || min > 0
if ((options & PCRE_STUDY_JIT_COMPILE) != 0) PRIV(jit_compile)(re, extra);
if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
{
+#ifdef COMPILE_PCRE8
pcre_free_study(extra);
+#endif
+#ifdef COMPILE_PCRE16
+ pcre16_free_study(extra);
+#endif
extra = NULL;
}
#endif
diff --git a/testdata/testinput16 b/testdata/testinput16
index e8435dd..26f53f8 100644
--- a/testdata/testinput16
+++ b/testdata/testinput16
@@ -24,4 +24,10 @@
/[^ⱥ]/8iBZ
+/\h/SI
+
+/\v/SI
+
+/\R/SI
+
/-- End of testinput16 --/
diff --git a/testdata/testinput17 b/testdata/testinput17
index 2479fe5..a9fc089 100644
--- a/testdata/testinput17
+++ b/testdata/testinput17
@@ -213,4 +213,10 @@
\) )* # optional trailing comment
/xSI
+/\h/SI
+
+/\v/SI
+
+/\R/SI
+
/-- End of testinput17 --/
diff --git a/testdata/testinput5 b/testdata/testinput5
index 8ec25cf..a172e0b 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -591,12 +591,6 @@
/X\W{3}X/8
\PX
-/\h/SI
-
-/\v/SI
-
-/\R/SI
-
/\sxxx\s/8T1
AB\x{85}xxx\x{a0}XYZ
AB\x{a0}xxx\x{85}XYZ
diff --git a/testdata/testoutput16 b/testdata/testoutput16
index 795376d..dde5399 100644
--- a/testdata/testoutput16
+++ b/testdata/testoutput16
@@ -86,4 +86,28 @@ Starting byte set: \xd0 \xd1
End
------------------------------------------------------------------
+/\h/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 \xa0
+
+/\v/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85
+
+/\R/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85
+
/-- End of testinput16 --/
diff --git a/testdata/testoutput17 b/testdata/testoutput17
index 0e754a3..9fc98e1 100644
--- a/testdata/testoutput17
+++ b/testdata/testoutput17
@@ -244,4 +244,28 @@ Starting byte set: \x09 \x20 ! " # $ % & ' ( * + - / 0 1 2 3 4 5 6 7 8
9 = ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ^ _ ` a b c d e
f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xff
+/\h/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x09 \x20 \xa0 \xff
+
+/\v/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff
+
+/\R/SI
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+Subject length lower bound = 1
+Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff
+
/-- End of testinput17 --/
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index ad51644..559ab7b 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -1405,30 +1405,6 @@ Partial match: abcde
\PX
Partial match: X
-/\h/SI
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x09 \x20 \xa0
-
-/\v/SI
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x0a \x0b \x0c \x0d \x85
-
-/\R/SI
-Capturing subpattern count = 0
-No options
-No first char
-No need char
-Subject length lower bound = 1
-Starting byte set: \x0a \x0b \x0c \x0d \x85
-
/\sxxx\s/8T1
AB\x{85}xxx\x{a0}XYZ
0: \x{85}xxx\x{a0}