summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pod/perlreguts.pod39
-rw-r--r--regcomp.c53
-rw-r--r--regexec.c2
-rwxr-xr-xt/op/pat.t12
4 files changed, 80 insertions, 26 deletions
diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod
index 3ba0da0c69..d119dfe4f2 100644
--- a/pod/perlreguts.pod
+++ b/pod/perlreguts.pod
@@ -775,7 +775,7 @@ must be able to correctly build a regexp structure.
typedef struct regexp {
/* what engine created this regexp? */
- const struct regexp_engine* engine;
+ const struct regexp_engine* engine;
/* Information about the match that the perl core uses to manage things */
U32 extflags; /* Flags used both externally and internally */
@@ -829,10 +829,10 @@ to the subroutines that are to be used for performing a match. It
is the compiling routine's responsibility to populate this field before
returning the regexp object.
-=item C<precomp> C<prelen>
+=item C<precomp> C<prelen>
Used for debugging purposes. C<precomp> holds a copy of the pattern
-that was compiled.
+that was compiled.
=item C<extflags>
@@ -841,22 +841,22 @@ contains a \G or a ^ or $ symbol.
=item C<minlen> C<minlenret>
-C<minlen> is the minimum string length required for the pattern to match.
-This is used to prune the search space by not bothering to match any
-closer to the end of a string than would allow a match. For instance
-there is no point in even starting the regex engine if the minlen is
-10 but the string is only 5 characters long. There is no way that the
+C<minlen> is the minimum string length required for the pattern to match.
+This is used to prune the search space by not bothering to match any
+closer to the end of a string than would allow a match. For instance
+there is no point in even starting the regex engine if the minlen is
+10 but the string is only 5 characters long. There is no way that the
pattern can match.
C<minlenret> is the minimum length of the string that would be found
-in $& after a match.
+in $& after a match.
The difference between C<minlen> and C<minlenret> can be seen in the
following pattern:
/ns(?=\d)/
-where the C<minlen> would be 3 but the minlen ret would only be 2 as
+where the C<minlen> would be 3 but the minlen ret would only be 2 as
the \d is required to match but is not actually included in the matched
content. This distinction is particularly important as the substitution
logic uses the C<minlenret> to tell whether it can do in-place substition
@@ -889,7 +889,7 @@ occur at a floating offset from the start of the pattern. Used to do
Fast-Boyer-Moore searches on the string to find out if its worth using
the regex engine at all, and if so where in the string to search.
-=item C<startp>, C<endp>,
+=item C<startp>, C<endp>
These fields store arrays that are used to hold the offsets of the begining
and end of each capture group that has matched. -1 is used to indicate no match.
@@ -903,8 +903,8 @@ patterns.
=item C<seen_evals>
-This stores the number of eval groups in the pattern. This is used
-for security purposes when embedding compiled regexes into larger
+This stores the number of eval groups in the pattern. This is used
+for security purposes when embedding compiled regexes into larger
patterns.
=back
@@ -1028,6 +1028,17 @@ Compile the pattern between exp and xend using the flags contained in
pm and return a pointer to a prepared regexp structure that can perform
the match.
+The utf8'ness of the string can be found by testing
+
+ pm->op_pmdynflags & PMdf_CMP_UTF8
+
+Additional various flags reflecting the modifiers used are contained in
+
+ pm->op_pmflags
+
+some of these have exact equivelents in re->extflags. See regcomp.h and op.h
+for details of these values.
+
=item exec
I32 exec(regexp* prog,
@@ -1046,7 +1057,7 @@ Execute a regexp.
Find the start position where a regex match should be attempted,
or possibly whether the regex engine should not be run because the
pattern can't match. This is called as appropriate by the core
-depending on the values of the extflags member of the regexp
+depending on the values of the extflags member of the regexp
structure.
=item checkstr
diff --git a/regcomp.c b/regcomp.c
index 429b493286..7c088404af 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -124,7 +124,10 @@ typedef struct RExC_state_t {
regnode **open_parens; /* pointers to open parens */
regnode **close_parens; /* pointers to close parens */
regnode *opend; /* END node in program */
- I32 utf8;
+ I32 utf8; /* whether the pattern is utf8 or not */
+ I32 orig_utf8; /* whether the pattern was originally in utf8 */
+ /* XXX use this for future optimisation of case
+ * where pattern must be upgraded to utf8. */
HV *charnames; /* cache of named sequences */
HV *paren_names; /* Paren names */
@@ -168,6 +171,7 @@ typedef struct RExC_state_t {
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
+#define RExC_orig_utf8 (pRExC_state->orig_utf8)
#define RExC_charnames (pRExC_state->charnames)
#define RExC_open_parens (pRExC_state->open_parens)
#define RExC_close_parens (pRExC_state->close_parens)
@@ -1375,16 +1379,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
- STRLEN chars=0;
+ STRLEN chars = 0;
+ bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
if (OP(noper) == NOTHING) {
trie->minlen= 0;
continue;
}
- if (trie->bitmap) {
- TRIE_BITMAP_SET(trie,*uc);
- if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
- }
+ if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
+ regardless of encoding */
+
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
@@ -1396,6 +1401,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
TRIE_STORE_REVCHAR;
}
+ if ( set_bit ) {
+ /* store the codepoint in the bitmap, and if its ascii
+ also store its folded equivelent. */
+ TRIE_BITMAP_SET(trie,uvc);
+ if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+ set_bit = 0; /* We've done our bit :-) */
+ }
} else {
SV** svpp;
if ( !widecharmap )
@@ -4052,16 +4064,18 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
if (exp == NULL)
FAIL("NULL regexp argument");
- RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+ RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
- RExC_precomp = exp;
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
RE_PV_QUOTED_DECL(s, RExC_utf8,
- dsv, RExC_precomp, (xend - exp), 60);
+ dsv, exp, (xend - exp), 60);
PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
PL_colors[4],PL_colors[5],s);
});
+
+redo_first_pass:
+ RExC_precomp = exp;
RExC_flags = pm->op_pmflags;
RExC_sawback = 0;
@@ -4100,6 +4114,25 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
RExC_precomp = NULL;
return(NULL);
}
+ if (RExC_utf8 && !RExC_orig_utf8) {
+ /* It's possible to write a regexp in ascii that represents unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ XXX: somehow figure out how to make this less expensive...
+ -- dmq */
+ STRLEN len = xend-exp;
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+ exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
+ xend = exp + len;
+ RExC_orig_utf8 = RExC_utf8;
+ SAVEFREEPV(exp);
+ goto redo_first_pass;
+ }
DEBUG_PARSE_r({
PerlIO_printf(Perl_debug_log,
"Required size %"IVdf" nodes\n"
@@ -4956,7 +4989,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("reg ");
-
*flagp = 0; /* Tentatively. */
@@ -5796,6 +5828,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
I32 flags = 0, c = 0;
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("brnc");
+
if (first)
ret = NULL;
else {
diff --git a/regexec.c b/regexec.c
index dc0cd9ba3c..c9efaaef14 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1997,7 +1997,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
SV * const prop = sv_newmortal();
regprop(prog, prop, c);
{
- RE_PV_QUOTED_DECL(quoted,UTF,PERL_DEBUG_PAD_ZERO(1),
+ RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
s,strend-s,60);
PerlIO_printf(Perl_debug_log,
"Matching stclass %.*s against %s (%d chars)\n",
diff --git a/t/op/pat.t b/t/op/pat.t
index 5bc68d7776..423822abac 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4316,6 +4316,16 @@ sub kt
"Check that (?&..) to a buffer inside a (?|...) goes to the leftmost");
}
+{
+ use warnings;
+ local $Message = "ASCII pattern that really is utf8";
+ my @w;
+ local $SIG{__WARN__}=sub{push @w,"@_"};
+ my $c=qq(\x{DF});
+ ok($c=~/${c}|\x{100}/);
+ ok(@w==0);
+}
+
# Test counter is at bottom of file. Put new tests above here.
#-------------------------------------------------------------------
# Keep the following tests last -- they may crash perl
@@ -4385,7 +4395,7 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/);
iseq(0+$::test,$::TestCount,"Got the right number of tests!");
# Don't forget to update this!
BEGIN {
- $::TestCount = 1650;
+ $::TestCount = 1652;
print "1..$::TestCount\n";
}