summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2020-02-23 16:40:05 +0000
committerph10 <ph10@6239d852-aaf2-0410-a92c-79f79f948069>2020-02-23 16:40:05 +0000
commiteeeb059e46a07f10023f2313894159161504b664 (patch)
tree98719aa173603943d4a2a403724045bf8a40f19c /src
parent98c6677bd3ff37d50249b32297abdb6008b42d54 (diff)
downloadpcre2-eeeb059e46a07f10023f2313894159161504b664.tar.gz
Unicode upper/lower casing is now used when UCP is set, even if UTF is not set.
This is not yet documented, and is not yet implemented in JIT. git-svn-id: svn://vcs.exim.org/pcre2/code/trunk@1224 6239d852-aaf2-0410-a92c-79f79f948069
Diffstat (limited to 'src')
-rw-r--r--src/pcre2_auto_possess.c34
-rw-r--r--src/pcre2_compile.c46
-rw-r--r--src/pcre2_dfa_match.c40
-rw-r--r--src/pcre2_internal.h2
-rw-r--r--src/pcre2_match.c125
-rw-r--r--src/pcre2_study.c75
-rw-r--r--src/pcre2_substitute.c5
7 files changed, 219 insertions, 108 deletions
diff --git a/src/pcre2_auto_possess.c b/src/pcre2_auto_possess.c
index 5b95b9b..6c4925f 100644
--- a/src/pcre2_auto_possess.c
+++ b/src/pcre2_auto_possess.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2019 University of Cambridge
+ New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -292,6 +292,7 @@ possessification, and if so, fills a list with its properties.
Arguments:
code points to start of expression
utf TRUE if in UTF mode
+ ucp TRUE if in UCP mode
fcc points to the case-flipping table
list points to output list
list[0] will be filled with the opcode
@@ -304,7 +305,7 @@ Returns: points to the start of the next opcode if *code is accepted
*/
static PCRE2_SPTR
-get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
+get_chr_property_list(PCRE2_SPTR code, BOOL utf, BOOL ucp, const uint8_t *fcc,
uint32_t *list)
{
PCRE2_UCHAR c = *code;
@@ -316,7 +317,8 @@ uint32_t chr;
uint32_t *clist_dest;
const uint32_t *clist_src;
#else
-(void)utf; /* Suppress "unused parameter" compiler warning */
+(void)utf; /* Suppress "unused parameter" compiler warnings */
+(void)ucp;
#endif
list[0] = c;
@@ -396,7 +398,7 @@ switch(c)
list[2] = chr;
#ifdef SUPPORT_UNICODE
- if (chr < 128 || (chr < 256 && !utf))
+ if (chr < 128 || (chr < 256 && !utf && !ucp))
list[3] = fcc[chr];
else
list[3] = UCD_OTHERCASE(chr);
@@ -503,6 +505,7 @@ which case the base cannot be possessified.
Arguments:
code points to the byte code
utf TRUE in UTF mode
+ ucp TRUE in UCP mode
cb compile data block
base_list the data list of the base opcode
base_end the end of the base opcode
@@ -512,7 +515,7 @@ Returns: TRUE if the auto-possessification is possible
*/
static BOOL
-compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
+compare_opcodes(PCRE2_SPTR code, BOOL utf, BOOL ucp, const compile_block *cb,
const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
{
PCRE2_UCHAR c;
@@ -651,7 +654,7 @@ for(;;)
while (*next_code == OP_ALT)
{
- if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
+ if (!compare_opcodes(code, utf, ucp, cb, base_list, base_end, rec_limit))
return FALSE;
code = next_code + 1 + LINK_SIZE;
next_code += GET(next_code, 1);
@@ -672,7 +675,8 @@ for(;;)
/* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
next_code += 1 + LINK_SIZE;
- if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
+ if (!compare_opcodes(next_code, utf, ucp, cb, base_list, base_end,
+ rec_limit))
return FALSE;
code += PRIV(OP_lengths)[c];
@@ -688,7 +692,7 @@ for(;;)
/* We now have the next appropriate opcode to compare with the base. Check
for a supported opcode, and load its properties. */
- code = get_chr_property_list(code, utf, cb->fcc, list);
+ code = get_chr_property_list(code, utf, ucp, cb->fcc, list);
if (code == NULL) return FALSE; /* Unsupported */
/* If either opcode is a small character list, set pointers for comparing
@@ -1100,7 +1104,6 @@ leaving the remainder of the pattern unpossessified.
Arguments:
code points to start of the byte code
- utf TRUE in UTF mode
cb compile data block
Returns: 0 for success
@@ -1108,13 +1111,15 @@ Returns: 0 for success
*/
int
-PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
+PRIV(auto_possessify)(PCRE2_UCHAR *code, const compile_block *cb)
{
PCRE2_UCHAR c;
PCRE2_SPTR end;
PCRE2_UCHAR *repeat_opcode;
uint32_t list[8];
int rec_limit = 1000; /* Was 10,000 but clang+ASAN uses a lot of stack. */
+BOOL utf = (cb->external_options & PCRE2_UTF) != 0;
+BOOL ucp = (cb->external_options & PCRE2_UCP) != 0;
for (;;)
{
@@ -1126,10 +1131,11 @@ for (;;)
{
c -= get_repeat_base(c) - OP_STAR;
end = (c <= OP_MINUPTO) ?
- get_chr_property_list(code, utf, cb->fcc, list) : NULL;
+ get_chr_property_list(code, utf, ucp, cb->fcc, list) : NULL;
list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
- if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
+ if (end != NULL && compare_opcodes(end, utf, ucp, cb, list, end,
+ &rec_limit))
{
switch(c)
{
@@ -1181,11 +1187,11 @@ for (;;)
if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
{
/* end must not be NULL. */
- end = get_chr_property_list(code, utf, cb->fcc, list);
+ end = get_chr_property_list(code, utf, ucp, cb->fcc, list);
list[1] = (c & 1) == 0;
- if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
+ if (compare_opcodes(end, utf, ucp, cb, list, end, &rec_limit))
{
switch (c)
{
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index aa4869f..515f2aa 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -4904,7 +4904,7 @@ range. */
if ((options & PCRE2_CASELESS) != 0)
{
#ifdef SUPPORT_UNICODE
- if ((options & PCRE2_UTF) != 0)
+ if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)
{
int rc;
uint32_t oc, od;
@@ -5319,7 +5319,8 @@ dynamically as we process the pattern. */
#ifdef SUPPORT_UNICODE
BOOL utf = (options & PCRE2_UTF) != 0;
-#else /* No UTF support */
+BOOL ucp = (options & PCRE2_UCP) != 0;
+#else /* No Unicode support */
BOOL utf = FALSE;
#endif
@@ -5602,7 +5603,7 @@ for (;; pptr++)
uint32_t d;
#ifdef SUPPORT_UNICODE
- if (utf && c > 127) d = UCD_OTHERCASE(c); else
+ if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
#endif
{
#if PCRE2_CODE_UNIT_WIDTH != 8
@@ -9632,6 +9633,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
{
BOOL utf; /* Set TRUE for UTF mode */
+BOOL ucp; /* Set TRUE for UCP mode */
BOOL has_lookbehind = FALSE; /* Set TRUE if a lookbehind is found */
BOOL zero_terminated; /* Set TRUE for zero-terminated pattern */
pcre2_real_code *re = NULL; /* What we will return */
@@ -9919,8 +9921,8 @@ if (utf)
/* Check UCP lockout. */
-if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
- (PCRE2_UCP|PCRE2_NEVER_UCP))
+ucp = (cb.external_options & PCRE2_UCP) != 0;
+if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
{
errorcode = ERR75;
goto HAD_EARLY_ERROR;
@@ -10296,7 +10298,7 @@ function call. */
if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
{
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
- if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
+ if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
}
/* Failed to compile, or error while post-processing. */
@@ -10344,21 +10346,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if ((firstcuflags & REQ_CASELESS) != 0)
{
- if (firstcu < 128 || (!utf && firstcu < 255))
+ if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
{
if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
}
- /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In
- 8-bit UTF mode, codepoints in the range 128-255 are introductory code
- points and cannot have another case. In 16-bit and 32-bit modes, we can
- check wide characters when UTF (and therefore UCP) is supported. */
+ /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise.
+ In 8-bit UTF mode, codepoints in the range 128-255 are introductory code
+ points and cannot have another case, but if UCP is set they may do. */
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- else if (firstcu <= MAX_UTF_CODE_POINT &&
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
+ re->flags |= PCRE2_FIRSTCASELESS;
+#else
+ else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
UCD_OTHERCASE(firstcu) != firstcu)
re->flags |= PCRE2_FIRSTCASELESS;
#endif
+#endif /* SUPPORT_UNICODE */
}
}
@@ -10407,14 +10413,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if ((reqcuflags & REQ_CASELESS) != 0)
{
- if (reqcu < 128 || (!utf && reqcu < 255))
+ if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
{
if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
}
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
- re->flags |= PCRE2_LASTCASELESS;
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
+ re->flags |= PCRE2_LASTCASELESS;
+#else
+ else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
+ UCD_OTHERCASE(reqcu) != reqcu)
+ re->flags |= PCRE2_LASTCASELESS;
#endif
+#endif /* SUPPORT_UNICODE */
}
}
}
diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
index 7d8ffe8..b8bdd02 100644
--- a/src/pcre2_dfa_match.c
+++ b/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2019 University of Cambridge
+ New API code Copyright (c) 2016-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code;
#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
#else
BOOL utf = FALSE;
#endif
@@ -2190,7 +2191,7 @@ for (;;)
if (clen == 0) break;
#ifdef SUPPORT_UNICODE
- if (utf)
+ if (utf_or_ucp)
{
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
{
@@ -2204,7 +2205,7 @@ for (;;)
}
else
#endif /* SUPPORT_UNICODE */
- /* Not UTF mode */
+ /* Not UTF or UCP mode */
{
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
{ ADD_NEW(state_offset + 2, 0); }
@@ -2339,7 +2340,7 @@ for (;;)
{
uint32_t otherd;
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2374,7 +2375,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2417,7 +2418,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2458,7 +2459,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2491,7 +2492,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2531,7 +2532,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && first_cu > 127)
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
-#endif
+#else
+ if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
+ first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
+#endif
+#endif /* SUPPORT_UNICODE */
}
}
else
@@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
+ req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#else
+ if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
+ req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#endif
+#endif /* SUPPORT_UNICODE */
}
}
diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
index ac96d2d..9963d6f 100644
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@@ -1952,7 +1952,7 @@ is available. */
#define _pcre2_was_newline PCRE2_SUFFIX(_pcre2_was_newline_)
#define _pcre2_xclass PCRE2_SUFFIX(_pcre2_xclass_)
-extern int _pcre2_auto_possessify(PCRE2_UCHAR *, BOOL,
+extern int _pcre2_auto_possessify(PCRE2_UCHAR *,
const compile_block *);
extern int _pcre2_check_escape(PCRE2_SPTR *, PCRE2_SPTR, uint32_t *,
int *, uint32_t, uint32_t, BOOL, compile_block *);
diff --git a/src/pcre2_match.c b/src/pcre2_match.c
index a3fccc1..77c98f5 100644
--- a/src/pcre2_match.c
+++ b/src/pcre2_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2015-2019 University of Cambridge
+ New API code Copyright (c) 2015-2020 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -598,12 +598,13 @@ BOOL condition; /* Used in conditional groups */
BOOL cur_is_word; /* Used in "word" tests */
BOOL prev_is_word; /* Used in "word" tests */
-/* UTF flag */
+/* UTF and UCP flags */
#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL ucp = (mb->poptions & PCRE2_UCP) != 0;
#else
-BOOL utf = FALSE;
+BOOL utf = FALSE; /* Required for convenience even when no Unicode support */
#endif
/* This is the length of the last part of a backtracking frame that must be
@@ -928,6 +929,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
else
#endif
+
/* Not UTF mode */
{
if (mb->end_subject - Feptr < 1)
@@ -987,10 +989,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
}
}
+
+ /* If UCP is set without UTF we must do the same as above, but with one
+ character per code unit. */
+
+ else if (ucp)
+ {
+ uint32_t cc = UCHAR21(Feptr);
+ fc = Fecode[1];
+ if (fc < 128)
+ {
+ if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH);
+ }
+ else
+ {
+ if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH);
+ }
+ Feptr++;
+ Fecode += 2;
+ }
+
else
#endif /* SUPPORT_UNICODE */
- /* Not UTF mode; use the table for characters < 256. */
+ /* Not UTF or UCP mode; use the table for characters < 256. */
{
if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1])
!= TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH);
@@ -1010,6 +1032,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
+
#ifdef SUPPORT_UNICODE
if (utf)
{
@@ -1026,15 +1049,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (ch > 127)
ch = UCD_OTHERCASE(ch);
else
- ch = TABLE_GET(ch, mb->fcc, ch);
+ ch = (mb->fcc)[ch];
+ if (ch == fc) RRETURN(MATCH_NOMATCH);
+ }
+ }
+
+ /* UCP without UTF is as above, but with one character per code unit. */
+
+ else if (ucp)
+ {
+ uint32_t ch;
+ fc = UCHAR21INC(Feptr);
+ ch = Fecode[1];
+ Fecode += 2;
+
+ if (ch == fc)
+ {
+ RRETURN(MATCH_NOMATCH); /* Caseful match */
+ }
+ else if (Fop == OP_NOTI) /* If caseless */
+ {
+ if (ch > 127)
+ ch = UCD_OTHERCASE(ch);
+ else
+ ch = (mb->fcc)[ch];
if (ch == fc) RRETURN(MATCH_NOMATCH);
}
}
+
else
#endif /* SUPPORT_UNICODE */
+
+ /* Neither UTF nor UCP is set */
+
{
uint32_t ch = Fecode[1];
- fc = *Feptr++;
+ fc = UCHAR21INC(Feptr);
if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc))
RRETURN(MATCH_NOMATCH);
Fecode += 2;
@@ -1244,7 +1294,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
#endif /* SUPPORT_UNICODE */
/* When not in UTF mode, load a single-code-unit character. Then proceed as
- above. */
+ above, using Unicode casing if either UTF or UCP is set. */
Lc = *Fecode++;
@@ -1253,11 +1303,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Fop >= OP_STARI)
{
#if PCRE2_CODE_UNIT_WIDTH == 8
- /* Lc must be < 128 in UTF-8 mode. */
+#ifdef SUPPORT_UNICODE
+ if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+ else
+#endif /* SUPPORT_UNICODE */
+ /* Lc will be < 128 in UTF-8 mode. */
Loc = mb->fcc[Lc];
#else /* 16-bit & 32-bit */
#ifdef SUPPORT_UNICODE
- if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc);
+ if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
Loc = TABLE_GET(Lc, mb->fcc, Lc);
@@ -1490,7 +1544,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (Fop >= OP_NOTSTARI) /* Caseless */
{
#ifdef SUPPORT_UNICODE
- if (utf && Lc > 127)
+ if ((utf || ucp) && Lc > 127)
Loc = UCD_OTHERCASE(Lc);
else
#endif /* SUPPORT_UNICODE */
@@ -6045,7 +6099,6 @@ BOOL firstline;
BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE;
BOOL startline;
-BOOL utf;
#if PCRE2_CODE_UNIT_WIDTH == 8
BOOL memchr_not_found_first_cu = FALSE;
@@ -6069,13 +6122,19 @@ PCRE2_SPTR match_partial;
BOOL use_jit;
#endif
+/* This flag is needed even when Unicode is not supported for convenience
+(it is used by the IS_NEWLINE macro). */
+
+BOOL utf = FALSE;
+
#ifdef SUPPORT_UNICODE
+BOOL ucp = FALSE;
BOOL allow_invalid;
uint32_t fragment_options = 0;
#ifdef SUPPORT_JIT
BOOL jit_checked_utf = FALSE;
#endif
-#endif
+#endif /* SUPPORT_UNICODE */
PCRE2_SIZE frame_size;
@@ -6147,12 +6206,13 @@ use_jit = (re->executable_jit != NULL &&
(options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
#endif
-/* Initialize UTF parameters. */
+/* Initialize UTF/UCP parameters. */
-utf = (re->overall_options & PCRE2_UTF) != 0;
#ifdef SUPPORT_UNICODE
+utf = (re->overall_options & PCRE2_UTF) != 0;
allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
-#endif
+ucp = (re->overall_options & PCRE2_UCP) != 0;
+#endif /* SUPPORT_UNICODE */
/* Convert the partial matching flags into an integer. */
@@ -6589,9 +6649,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu);
+#else
+ if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu);
#endif
+#endif /* SUPPORT_UNICODE */
}
}
else
@@ -6607,9 +6671,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu);
+#else
+ if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu);
#endif
+#endif /* SUPPORT_UNICODE */
}
}
@@ -6756,15 +6824,16 @@ for(;;)
#endif
}
- /* If we can't find the required code unit, having reached the true end
- of the subject, break the bumpalong loop, to force a match failure,
- except when doing partial matching, when we let the next cycle run at
- the end of the subject. To see why, consider the pattern /(?<=abc)def/,
- which partially matches "abc", even though the string does not contain
- the starting character "d". If we have not reached the true end of the
- subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
- we also let the cycle run, because the matching string is legitimately
- allowed to start with the first code unit of a newline. */
+ /* If we can't find the required first code unit, having reached the
+ true end of the subject, break the bumpalong loop, to force a match
+ failure, except when doing partial matching, when we let the next cycle
+ run at the end of the subject. To see why, consider the pattern
+ /(?<=abc)def/, which partially matches "abc", even though the string
+ does not contain the starting character "d". If we have not reached the
+ true end of the subject (PCRE2_FIRSTLINE caused end_subject to be
+ temporarily modified) we also let the cycle run, because the matching
+ string is legitimately allowed to start with the first code unit of a
+ newline. */
if (mb->partial == 0 && start_match >= mb->end_subject)
{
diff --git a/src/pcre2_study.c b/src/pcre2_study.c
index 5af01b5..02d1c08 100644
--- a/src/pcre2_study.c
+++ b/src/pcre2_study.c
@@ -772,15 +772,19 @@ Arguments:
p points to the first code unit of the character
caseless TRUE if caseless
utf TRUE for UTF mode
+ ucp TRUE for UCP mode
Returns: pointer after the character
*/
static PCRE2_SPTR
-set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
+set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf,
+ BOOL ucp)
{
uint32_t c = *p++; /* First code unit */
-(void)utf; /* Stop compiler warning when UTF not supported */
+
+(void)utf; /* Stop compiler warnings when UTF not supported */
+(void)ucp;
/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
0xff. */
@@ -810,22 +814,26 @@ if (utf)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf)
+ if (utf || ucp)
{
-#if PCRE2_CODE_UNIT_WIDTH == 8
- PCRE2_UCHAR buff[6];
c = UCD_OTHERCASE(c);
- (void)PRIV(ord2utf)(c, buff);
- SET_BIT(buff[0]);
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (utf)
+ {
+ PCRE2_UCHAR buff[6];
+ (void)PRIV(ord2utf)(c, buff);
+ SET_BIT(buff[0]);
+ }
+ else SET_BIT(c);
#else /* 16-bit or 32-bit mode */
- c = UCD_OTHERCASE(c);
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
#endif
}
+
else
#endif /* SUPPORT_UNICODE */
- /* Not UTF */
+ /* Not UTF or UCP */
if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
}
@@ -931,6 +939,7 @@ Arguments:
re points to the compiled regex block
code points to an expression
utf TRUE if in UTF mode
+ ucp TRUE if in UCP mode
depthptr pointer to recurse depth
Returns: SSB_FAIL => Failed to find any starting code units
@@ -941,7 +950,8 @@ Returns: SSB_FAIL => Failed to find any starting code units
*/
static int
-set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, int *depthptr)
+set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf, BOOL ucp,
+ int *depthptr)
{
uint32_t c;
int yield = SSB_DONE;
@@ -1111,7 +1121,7 @@ do
case OP_SCRIPT_RUN:
case OP_ASSERT:
case OP_ASSERT_NA:
- rc = set_start_bits(re, tcode, utf, depthptr);
+ rc = set_start_bits(re, tcode, utf, ucp, depthptr);
if (rc == SSB_DONE)
{
try_next = FALSE;
@@ -1167,7 +1177,7 @@ do
case OP_BRAZERO:
case OP_BRAMINZERO:
case OP_BRAPOSZERO:
- rc = set_start_bits(re, ++tcode, utf, depthptr);
+ rc = set_start_bits(re, ++tcode, utf, ucp, depthptr);
if (rc == SSB_FAIL || rc == SSB_UNKNOWN || rc == SSB_TOODEEP) return rc;
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE;
@@ -1189,7 +1199,7 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
- tcode = set_table_bit(re, tcode + 1, FALSE, utf);
+ tcode = set_table_bit(re, tcode + 1, FALSE, utf, ucp);
break;
case OP_STARI:
@@ -1198,7 +1208,7 @@ do
case OP_QUERYI:
case OP_MINQUERYI:
case OP_POSQUERYI:
- tcode = set_table_bit(re, tcode + 1, TRUE, utf);
+ tcode = set_table_bit(re, tcode + 1, TRUE, utf, ucp);
break;
/* Single-char upto sets the bit and tries the next */
@@ -1206,13 +1216,13 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
+ tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf, ucp);
break;
case OP_UPTOI:
case OP_MINUPTOI:
case OP_POSUPTOI:
- tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
+ tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf, ucp);
break;
/* At least one single char sets the bit and stops */
@@ -1224,7 +1234,7 @@ do
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
- (void)set_table_bit(re, tcode + 1, FALSE, utf);
+ (void)set_table_bit(re, tcode + 1, FALSE, utf, ucp);
try_next = FALSE;
break;
@@ -1235,7 +1245,7 @@ do
case OP_PLUSI:
case OP_MINPLUSI:
case OP_POSPLUSI:
- (void)set_table_bit(re, tcode + 1, TRUE, utf);
+ (void)set_table_bit(re, tcode + 1, TRUE, utf, ucp);
try_next = FALSE;
break;
@@ -1664,6 +1674,7 @@ PRIV(study)(pcre2_real_code *re)
int count = 0;
PCRE2_UCHAR *code;
BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (re->overall_options & PCRE2_UCP) != 0;
/* Find start of compiled code */
@@ -1677,7 +1688,7 @@ code units. */
if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
{
int depth = 0;
- int rc = set_start_bits(re, code, utf, &depth);
+ int rc = set_start_bits(re, code, utf, ucp, &depth);
if (rc == SSB_UNKNOWN) return 1;
/* If a list of starting code units was set up, scan the list to see if only
@@ -1695,7 +1706,7 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
int b = -1;
uint8_t *p = re->start_bitmap;
uint32_t flags = PCRE2_FIRSTMAPSET;
-
+
for (i = 0; i < 256; p++, i += 8)
{
uint8_t x = *p;
@@ -1725,27 +1736,27 @@ if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
}
/* c contains the code unit value, in the range 0-255. In 8-bit UTF
- mode, only values < 128 can be used. */
+ mode, only values < 128 can be used. In all the other cases, c is a
+ character value. */
#if PCRE2_CODE_UNIT_WIDTH == 8
- if (c > 127) goto DONE;
+ if (utf && c > 127) goto DONE;
#endif
- if (a < 0) a = c; /* First one found */
+ if (a < 0) a = c; /* First one found, save in a */
else if (b < 0) /* Second one found */
{
int d = TABLE_GET((unsigned int)c, re->tables + fcc_offset, c);
-
+
#ifdef SUPPORT_UNICODE
-#if PCRE2_CODE_UNIT_WIDTH == 8
- if (utf && UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
-#else /* 16-bit or 32-bit */
- if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
- if (utf && c > 127) d = UCD_OTHERCASE(c);
-#endif /* Code width */
+ if (utf || ucp)
+ {
+ if (UCD_CASESET(c) != 0) goto DONE; /* Multiple case set */
+ if (c > 127) d = UCD_OTHERCASE(c);
+ }
#endif /* SUPPORT_UNICODE */
- if (d != a) goto DONE; /* Not other case of a */
- b = c;
+ if (d != a) goto DONE; /* Not the other case of a */
+ b = c; /* Save second in b */
}
else goto DONE; /* More than two characters found */
}
diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c
index 222cb32..981a106 100644
--- a/src/pcre2_substitute.c
+++ b/src/pcre2_substitute.c
@@ -236,6 +236,7 @@ BOOL use_existing_match;
BOOL replacement_only;
#ifdef SUPPORT_UNICODE
BOOL utf = (code->overall_options & PCRE2_UTF) != 0;
+BOOL ucp = (code->overall_options & PCRE2_UCP) != 0;
#endif
PCRE2_UCHAR temp[6];
PCRE2_SPTR ptr;
@@ -758,7 +759,7 @@ do
if (forcecase != 0)
{
#ifdef SUPPORT_UNICODE
- if (utf)
+ if (utf || ucp)
{
uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L &&
@@ -860,7 +861,7 @@ do
if (forcecase != 0)
{
#ifdef SUPPORT_UNICODE
- if (utf)
+ if (utf || ucp)
{
uint32_t type = UCD_CHARTYPE(ch);
if (PRIV(ucp_gentype)[type] == ucp_L &&