summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-01 16:21:42 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2010-06-01 16:21:42 +0000
commit 1d63547e1bfd241769214b9f80fde16de397f51f (patch)
tree e31edef7db79dc7de47c9d9a18f436cedf86dcbc
parent ae22c8e3671a04fd2a4b1c823de25aa472f651d0 (diff)
download pcre-1d63547e1bfd241769214b9f80fde16de397f51f.tar.gz
Extend auto-possessify to handle some Unicode properties.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@532 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 6
-rw-r--r-- pcre_compile.c 208
-rw-r--r-- testdata/testinput12 18
-rw-r--r-- testdata/testoutput12 109
4 files changed, 295 insertions, 46 deletions
diff --git a/ChangeLog b/ChangeLog
index f1fdc6d..3656f65 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -58,7 +58,9 @@ Version 8.10 03 May-2010
14. pcre_study() now recognizes \h, \v, and \R when constructing a bit map of
possible starting bytes for non-anchored patterns.
-15. The "auto-possessify" feature of pcre_compile() now recognizes \R.
+15. Extended the "auto-possessify" feature of pcre_compile(). It now recognizes
+ \R, and also a number of cases that involve Unicode properties, both explicit
+ (\p, \P) and implicit (\d, \s, \w and their negations when PCRE_UCP is set).
16. If a repeated Unicode property match (e.g. \p{Lu}*) was used with non-UTF-8
input, it could crash or give wrong results if characters with values
@@ -70,7 +72,7 @@ Version 8.10 03 May-2010
18. Added a check for running out of memory when PCRE is compiled with
--disable-stack-for-recursion.
-
+
Version 8.02 19-Mar-2010
diff --git a/pcre_compile.c b/pcre_compile.c
index c0486b7..f8c673d 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -2392,6 +2392,69 @@ for (++c; c <= d; c++)
return TRUE;
}
+
+
+
+/*************************************************
+* Check a character and a property *
+*************************************************/
+
+/* This function is called by check_auto_possessive() when a property item
+is adjacent to a fixed character. It fetches the character's record with
+GET_UCD() and tests whether the character has the given property. The outcome
+of that test is then compared with "negated", so the function yields TRUE
+exactly when the character cannot match the property item: for a positive
+property (negated FALSE) when the character lacks the property, and for a
+negated property (negated TRUE) when the character has it.
+
+How each property type is tested:
+ PT_LAMP chartype is ucp_Lu, ucp_Ll, or ucp_Lt
+ PT_GC pdata equals _pcre_ucp_gentype[chartype]
+ PT_PC pdata equals chartype
+ PT_SC pdata equals the record's script
+ PT_ALNUM _pcre_ucp_gentype[chartype] is ucp_L or ucp_N
+ PT_SPACE _pcre_ucp_gentype[chartype] is ucp_Z, or the character is
+ CHAR_HT, CHAR_NL, CHAR_FF, or CHAR_CR
+ PT_PXSPACE as PT_SPACE, but CHAR_VT is accepted as well
+ PT_WORD as PT_ALNUM, but CHAR_UNDERSCORE is accepted as well
+
+Arguments:
+ c the character
+ ptype the property type (one of the PT_xxx values listed above)
+ pdata the data for the type; consulted only for PT_GC, PT_PC and PT_SC
+ negated TRUE if it's a negated property (\P or \p{^)
+
+Returns: TRUE if auto-possessifying is OK; FALSE if it is not, and also
+ when ptype is not one of the types listed above
+*/
+
+static BOOL
+check_char_prop(int c, int ptype, int pdata, BOOL negated)
+{
+const ucd_record *prop = GET_UCD(c);
+switch(ptype)
+ {
+ case PT_LAMP:
+ return (prop->chartype == ucp_Lu ||
+ prop->chartype == ucp_Ll ||
+ prop->chartype == ucp_Lt) == negated;
+
+ case PT_GC:
+ return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
+
+ case PT_PC:
+ return (pdata == prop->chartype) == negated;
+
+ case PT_SC:
+ return (pdata == prop->script) == negated;
+
+ /* These are specials */
+
+ case PT_ALNUM:
+ return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
+ _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
+
+ case PT_SPACE: /* Perl space */
+ return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+ == negated;
+
+ case PT_PXSPACE: /* POSIX space */
+ return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
+ c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+ c == CHAR_FF || c == CHAR_CR)
+ == negated;
+
+ case PT_WORD:
+ return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
+ _pcre_ucp_gentype[prop->chartype] == ucp_N ||
+ c == CHAR_UNDERSCORE) == negated;
+ }
+return FALSE;
+}
#endif /* SUPPORT_UCP */
@@ -2405,10 +2468,8 @@ whether the next thing could possibly match the repeated item. If not, it makes
sense to automatically possessify the repeated item.
Arguments:
- op_code the repeated op code
- this data for this item, depends on the opcode
+ previous pointer to the repeated opcode
utf8 TRUE in UTF-8 mode
- utf8_char used for utf8 character bytes, NULL if not relevant
ptr next character in pattern
options options bits
cd contains pointers to tables etc.
@@ -2417,10 +2478,11 @@ Returns: TRUE if possessifying is wanted
*/
static BOOL
-check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
- const uschar *ptr, int options, compile_data *cd)
+check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
+ int options, compile_data *cd)
{
-int next;
+int c, next;
+int op_code = *previous++;
/* Skip whitespace and comments in extended mode */
@@ -2481,33 +2543,30 @@ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
return FALSE;
-/* Now compare the next item with the previous opcode. If the previous is a
-positive single character match, "item" either contains the character or, if
-"item" is greater than 127 in utf8 mode, the character's bytes are in
-utf8_char. */
-
-
-/* Handle cases when the next item is a character. */
+/* Now compare the next item with the previous opcode. First, handle cases when
+the next item is a character. */
if (next >= 0) switch(op_code)
{
case OP_CHAR:
-#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
+#ifdef SUPPORT_UTF8
+ GETCHARTEST(c, previous);
#else
- (void)(utf8_char); /* Keep compiler happy by referencing function argument */
-#endif
- return item != next;
+ c = *previous;
+#endif
+ return c != next;
/* For CHARNC (caseless character) we must check the other case. If we have
Unicode property support, we can use it to test the other case of
high-valued characters. */
case OP_CHARNC:
-#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
-#endif
- if (item == next) return FALSE;
+#ifdef SUPPORT_UTF8
+ GETCHARTEST(c, previous);
+#else
+ c = *previous;
+#endif
+ if (c == next) return FALSE;
#ifdef SUPPORT_UTF8
if (utf8)
{
@@ -2518,16 +2577,16 @@ if (next >= 0) switch(op_code)
#else
othercase = NOTACHAR;
#endif
- return (unsigned int)item != othercase;
+ return (unsigned int)c != othercase;
}
else
#endif /* SUPPORT_UTF8 */
- return (item != cd->fcc[next]); /* Non-UTF-8 mode */
+ return (c != cd->fcc[next]); /* Non-UTF-8 mode */
- /* For OP_NOT, "item" must be a single-byte character. */
+ /* For OP_NOT, its data is always a single-byte character. */
case OP_NOT:
- if (item == next) return TRUE;
+ if ((c = *previous) == next) return TRUE;
if ((options & PCRE_CASELESS) == 0) return FALSE;
#ifdef SUPPORT_UTF8
if (utf8)
@@ -2539,11 +2598,11 @@ if (next >= 0) switch(op_code)
#else
othercase = NOTACHAR;
#endif
- return (unsigned int)item == othercase;
+ return (unsigned int)c == othercase;
}
else
#endif /* SUPPORT_UTF8 */
- return (item == cd->fcc[next]); /* Non-UTF-8 mode */
+ return (c == cd->fcc[next]); /* Non-UTF-8 mode */
/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
@@ -2611,6 +2670,14 @@ if (next >= 0) switch(op_code)
return op_code != OP_NOT_VSPACE;
}
+#ifdef SUPPORT_UCP
+ case OP_PROP:
+ return check_char_prop(next, previous[0], previous[1], FALSE);
+
+ case OP_NOTPROP:
+ return check_char_prop(next, previous[0], previous[1], TRUE);
+#endif
+
default:
return FALSE;
}
@@ -2619,38 +2686,41 @@ if (next >= 0) switch(op_code)
/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
generated only when PCRE_UCP is *not* set, that is, when only ASCII
-characteristics are recognized. */
+characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
+replaced by OP_PROP codes when PCRE_UCP is set. */
switch(op_code)
{
case OP_CHAR:
case OP_CHARNC:
-#ifdef SUPPORT_UTF8
- if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
-#endif
+#ifdef SUPPORT_UTF8
+ GETCHARTEST(c, previous);
+#else
+ c = *previous;
+#endif
switch(-next)
{
case ESC_d:
- return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
+ return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
case ESC_D:
- return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
+ return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
case ESC_s:
- return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
+ return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
case ESC_S:
- return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
+ return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
case ESC_w:
- return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
+ return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
case ESC_W:
- return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
+ return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
case ESC_h:
case ESC_H:
- switch(item)
+ switch(c)
{
case 0x09:
case 0x20:
@@ -2678,7 +2748,7 @@ switch(op_code)
case ESC_v:
case ESC_V:
- switch(item)
+ switch(c)
{
case 0x0a:
case 0x0b:
@@ -2691,10 +2761,61 @@ switch(op_code)
default:
return -next == ESC_v;
}
+
+ /* When PCRE_UCP is set, these values get generated for \d etc. Find
+ their substitutions and process them. The result will always be either
+ -ESC_p or -ESC_P. Then fall through to process those values. */
+
+#ifdef SUPPORT_UCP
+ case ESC_du:
+ case ESC_DU:
+ case ESC_wu:
+ case ESC_WU:
+ case ESC_su:
+ case ESC_SU:
+ {
+ int temperrorcode = 0;
+ ptr = substitutes[-next - ESC_DU];
+ next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
+ if (temperrorcode != 0) return FALSE;
+ ptr++; /* For compatibility */
+ }
+ /* Fall through */
+
+ case ESC_p:
+ case ESC_P:
+ {
+ int ptype, pdata, errorcodeptr;
+ BOOL negated;
+
+ ptr--; /* Make ptr point at the p or P */
+ ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
+ if (ptype < 0) return FALSE;
+ ptr++; /* Point past the final curly ket */
+
+ /* If the property item is optional, we have to give up. (When generated
+ from \d etc by PCRE_UCP, this test will have been applied much earlier,
+ to the original \d etc. At this point, ptr will point to a zero byte.) */
+
+ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
+ strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
+ return FALSE;
+
+ /* Do the property check. */
+
+ return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
+ }
+#endif
default:
return FALSE;
}
+
+ /* In principle, support for Unicode properties should be integrated here as
+ well. It means re-organizing the above code so as to get hold of the property
+ values before switching on the op-code. However, I wonder how many patterns
+ combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
+ these op-codes are never generated.) */
case OP_DIGIT:
return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
@@ -3998,8 +4119,7 @@ for (;; ptr++)
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
- options, cd))
+ check_auto_possessive(previous, utf8, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
@@ -4020,7 +4140,7 @@ for (;; ptr++)
c = previous[1];
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
+ check_auto_possessive(previous, utf8, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
@@ -4044,7 +4164,7 @@ for (;; ptr++)
if (!possessive_quantifier &&
repeat_max < 0 &&
- check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
+ check_auto_possessive(previous, utf8, ptr + 1, options, cd))
{
repeat_type = 0; /* Force greedy */
possessive_quantifier = TRUE;
diff --git a/testdata/testinput12 b/testdata/testinput12
index 42efabe..78ecf64 100644
--- a/testdata/testinput12
+++ b/testdata/testinput12
@@ -485,4 +485,22 @@ of case for anything other than the ASCII letters. --/
/\p{Xps}*/SI
+/\p{Lu}+9\p{Lu}+B\p{Lu}+b/BZ
+
+/\p{^Lu}+9\p{^Lu}+B\p{^Lu}+b/BZ
+
+/\P{Lu}+9\P{Lu}+B\P{Lu}+b/BZ
+
+/\p{Han}+X\p{Greek}+\x{370}/BZ8
+
+/\p{Xan}+!\p{Xan}+A/BZ
+
+/\p{Xsp}+!\p{Xsp}\t/BZ
+
+/\p{Xps}+!\p{Xps}\t/BZ
+
+/\p{Xwd}+!\p{Xwd}_/BZ
+
+/A+\p{N}A+\dB+\p{N}*B+\d*/WBZ
+
/-- End of testinput12 --/
diff --git a/testdata/testoutput12 b/testdata/testoutput12
index 5bd83ab..ab9dbfd 100644
--- a/testdata/testoutput12
+++ b/testdata/testoutput12
@@ -1067,4 +1067,113 @@ No need char
Subject length lower bound = 0
No set of starting bytes
+/\p{Lu}+9\p{Lu}+B\p{Lu}+b/BZ
+------------------------------------------------------------------
+ Bra
+ prop Lu ++
+ 9
+ prop Lu +
+ B
+ prop Lu ++
+ b
+ Ket
+ End
+------------------------------------------------------------------
+
+/\p{^Lu}+9\p{^Lu}+B\p{^Lu}+b/BZ
+------------------------------------------------------------------
+ Bra
+ notprop Lu +
+ 9
+ notprop Lu ++
+ B
+ notprop Lu +
+ b
+ Ket
+ End
+------------------------------------------------------------------
+
+/\P{Lu}+9\P{Lu}+B\P{Lu}+b/BZ
+------------------------------------------------------------------
+ Bra
+ notprop Lu +
+ 9
+ notprop Lu ++
+ B
+ notprop Lu +
+ b
+ Ket
+ End
+------------------------------------------------------------------
+
+/\p{Han}+X\p{Greek}+\x{370}/BZ8
+------------------------------------------------------------------
+ Bra
+ prop Han ++
+ X
+ prop Greek +
+ \x{370}
+ Ket
+ End
+------------------------------------------------------------------
+
+/\p{Xan}+!\p{Xan}+A/BZ
+------------------------------------------------------------------
+ Bra
+ prop Xan ++
+ !
+ prop Xan +
+ A
+ Ket
+ End
+------------------------------------------------------------------
+
+/\p{Xsp}+!\p{Xsp}\t/BZ
+------------------------------------------------------------------
+ Bra
+ prop Xsp ++
+ !
+ prop Xsp
+ \x09
+ Ket
+ End
+------------------------------------------------------------------
+
+/\p{Xps}+!\p{Xps}\t/BZ
+------------------------------------------------------------------
+ Bra
+ prop Xps ++
+ !
+ prop Xps
+ \x09
+ Ket
+ End
+------------------------------------------------------------------
+
+/\p{Xwd}+!\p{Xwd}_/BZ
+------------------------------------------------------------------
+ Bra
+ prop Xwd ++
+ !
+ prop Xwd
+ _
+ Ket
+ End
+------------------------------------------------------------------
+
+/A+\p{N}A+\dB+\p{N}*B+\d*/WBZ
+------------------------------------------------------------------
+ Bra
+ A++
+ prop N
+ A++
+ prop Nd
+ B+
+ prop N *+
+ B+
+ prop Nd *
+ Ket
+ End
+------------------------------------------------------------------
+
/-- End of testinput12 --/