summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.h14
-rwxr-xr-xembed.pl5
-rw-r--r--mg.c22
-rw-r--r--objXSUB.h4
-rw-r--r--pp_ctl.c11
-rw-r--r--pp_hot.c27
-rw-r--r--proto.h5
-rw-r--r--regcomp.c840
-rw-r--r--regcomp.h39
-rw-r--r--regcomp.sym19
-rw-r--r--regexec.c1313
-rw-r--r--regnodes.h301
-rw-r--r--sv.c2
-rw-r--r--t/op/utf8decode.t2
-rwxr-xr-xt/pragma/utf8.t407
15 files changed, 1409 insertions, 1602 deletions
diff --git a/embed.h b/embed.h
index 64c1eaf9ef..3b54154de1 100644
--- a/embed.h
+++ b/embed.h
@@ -543,6 +543,7 @@
#define ref Perl_ref
#define refkids Perl_refkids
#define regdump Perl_regdump
+#define regclass_swash Perl_regclass_swash
#define pregexec Perl_pregexec
#define pregfree Perl_pregfree
#define pregcomp Perl_pregcomp
@@ -995,7 +996,6 @@
#define regbranch S_regbranch
#define reguni S_reguni
#define regclass S_regclass
-#define regclassutf8 S_regclassutf8
#define regcurly S_regcurly
#define reg_node S_reg_node
#define regpiece S_regpiece
@@ -1025,7 +1025,6 @@
#define regrepeat_hard S_regrepeat_hard
#define regtry S_regtry
#define reginclass S_reginclass
-#define reginclassutf8 S_reginclassutf8
#define regcppush S_regcppush
#define regcppop S_regcppop
#define regcp_set_to S_regcp_set_to
@@ -2015,6 +2014,7 @@
#define ref(a,b) Perl_ref(aTHX_ a,b)
#define refkids(a,b) Perl_refkids(aTHX_ a,b)
#define regdump(a) Perl_regdump(aTHX_ a)
+#define regclass_swash(a,b,c) Perl_regclass_swash(aTHX_ a,b,c)
#define pregexec(a,b,c,d,e,f,g) Perl_pregexec(aTHX_ a,b,c,d,e,f,g)
#define pregfree(a) Perl_pregfree(aTHX_ a)
#define pregcomp(a,b,c) Perl_pregcomp(aTHX_ a,b,c)
@@ -2459,7 +2459,6 @@
#define regbranch(a,b,c) S_regbranch(aTHX_ a,b,c)
#define reguni(a,b,c,d) S_reguni(aTHX_ a,b,c,d)
#define regclass(a) S_regclass(aTHX_ a)
-#define regclassutf8(a) S_regclassutf8(aTHX_ a)
#define regcurly(a) S_regcurly(aTHX_ a)
#define reg_node(a,b) S_reg_node(aTHX_ a,b)
#define regpiece(a,b) S_regpiece(aTHX_ a,b)
@@ -2487,8 +2486,7 @@
#define regrepeat(a,b) S_regrepeat(aTHX_ a,b)
#define regrepeat_hard(a,b,c) S_regrepeat_hard(aTHX_ a,b,c)
#define regtry(a,b) S_regtry(aTHX_ a,b)
-#define reginclass(a,b) S_reginclass(aTHX_ a,b)
-#define reginclassutf8(a,b) S_reginclassutf8(aTHX_ a,b)
+#define reginclass(a,b,c) S_reginclass(aTHX_ a,b,c)
#define regcppush(a) S_regcppush(aTHX_ a)
#define regcppop() S_regcppop(aTHX)
#define regcp_set_to(a) S_regcp_set_to(aTHX_ a)
@@ -3950,6 +3948,8 @@
#define refkids Perl_refkids
#define Perl_regdump CPerlObj::Perl_regdump
#define regdump Perl_regdump
+#define Perl_regclass_swash CPerlObj::Perl_regclass_swash
+#define regclass_swash Perl_regclass_swash
#define Perl_pregexec CPerlObj::Perl_pregexec
#define pregexec Perl_pregexec
#define Perl_pregfree CPerlObj::Perl_pregfree
@@ -4787,8 +4787,6 @@
#define reguni S_reguni
#define S_regclass CPerlObj::S_regclass
#define regclass S_regclass
-#define S_regclassutf8 CPerlObj::S_regclassutf8
-#define regclassutf8 S_regclassutf8
#define S_regcurly CPerlObj::S_regcurly
#define regcurly S_regcurly
#define S_reg_node CPerlObj::S_reg_node
@@ -4845,8 +4843,6 @@
#define regtry S_regtry
#define S_reginclass CPerlObj::S_reginclass
#define reginclass S_reginclass
-#define S_reginclassutf8 CPerlObj::S_reginclassutf8
-#define reginclassutf8 S_reginclassutf8
#define S_regcppush CPerlObj::S_regcppush
#define regcppush S_regcppush
#define S_regcppop CPerlObj::S_regcppop
diff --git a/embed.pl b/embed.pl
index 9e2bd9c7e1..32f3ddc329 100755
--- a/embed.pl
+++ b/embed.pl
@@ -1873,6 +1873,7 @@ Ap |void |push_scope
p |OP* |ref |OP* o|I32 type
p |OP* |refkids |OP* o|I32 type
Ap |void |regdump |regexp* r
+Ap |SV* |regclass_swash |struct regnode *n|bool doinit|SV **initsvp
Ap |I32 |pregexec |regexp* prog|char* stringarg \
|char* strend|char* strbeg|I32 minend \
|SV* screamer|U32 nosave
@@ -2366,7 +2367,6 @@ s |regnode*|regatom |struct RExC_state_t*|I32 *
s |regnode*|regbranch |struct RExC_state_t*|I32 *|I32
s |void |reguni |struct RExC_state_t*|UV|char *|STRLEN*
s |regnode*|regclass |struct RExC_state_t*
-s |regnode*|regclassutf8 |struct RExC_state_t*
s |I32 |regcurly |char *
s |regnode*|reg_node |struct RExC_state_t*|U8
s |regnode*|regpiece |struct RExC_state_t*|I32 *
@@ -2401,8 +2401,7 @@ s |I32 |regmatch |regnode *prog
s |I32 |regrepeat |regnode *p|I32 max
s |I32 |regrepeat_hard |regnode *p|I32 max|I32 *lp
s |I32 |regtry |regexp *prog|char *startpos
-s |bool |reginclass |regnode *p|I32 c
-s |bool |reginclassutf8 |regnode *f|U8* p
+s |bool |reginclass |regnode *n|U8 *p|bool do_utf8sv_is_utf8
s |CHECKPOINT|regcppush |I32 parenfloor
s |char*|regcppop
s |char*|regcp_set_to |I32 ss
diff --git a/mg.c b/mg.c
index f97c6cedb0..a61d167279 100644
--- a/mg.c
+++ b/mg.c
@@ -391,7 +391,7 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg)
case '5': case '6': case '7': case '8': case '9': case '&':
if (PL_curpm && (rx = PL_curpm->op_pmregexp)) {
- paren = atoi(mg->mg_ptr);
+ paren = atoi(mg->mg_ptr); /* $& is in [0] */
getparen:
if (paren <= rx->nparens &&
(s1 = rx->startp[paren]) != -1 &&
@@ -399,17 +399,15 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg)
{
i = t1 - s1;
getlen:
- if (i > 0 && (PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) {
- char *s = rx->subbeg + s1;
+ if (i > 0 && DO_UTF8(PL_reg_sv)) {
+ char *s = rx->subbeg + s1;
char *send = rx->subbeg + t1;
- i = 0;
- while (s < send) {
- s += UTF8SKIP(s);
- i++;
- }
+
+ i = Perl_utf8_length((U8*)s, (U8*)send);
}
- if (i >= 0)
- return i;
+ if (i < 0)
+ Perl_croak(aTHX_ "panic: magic_len: %d", i);
+ return i;
}
}
return 0;
@@ -604,7 +602,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
* Pre-threads, this was paren = atoi(GvENAME((GV*)mg->mg_obj));
* XXX Does the new way break anything?
*/
- paren = atoi(mg->mg_ptr);
+ paren = atoi(mg->mg_ptr); /* $& is in [0] */
getparen:
if (paren <= rx->nparens &&
(s1 = rx->startp[paren]) != -1 &&
@@ -623,7 +621,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
PL_tainted = FALSE;
}
sv_setpvn(sv, s, i);
- if ((PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE)
+ if (DO_UTF8(PL_reg_sv))
SvUTF8_on(sv);
else
SvUTF8_off(sv);
diff --git a/objXSUB.h b/objXSUB.h
index 43537d30c9..60c6e9038b 100644
--- a/objXSUB.h
+++ b/objXSUB.h
@@ -1263,6 +1263,10 @@
#define Perl_regdump pPerl->Perl_regdump
#undef regdump
#define regdump Perl_regdump
+#undef Perl_regclass_swash
+#define Perl_regclass_swash pPerl->Perl_regclass_swash
+#undef regclass_swash
+#define regclass_swash Perl_regclass_swash
#undef Perl_pregexec
#define Perl_pregexec pPerl->Perl_pregexec
#undef pregexec
diff --git a/pp_ctl.c b/pp_ctl.c
index d079e4af22..aff58153ce 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -157,7 +157,7 @@ PP(pp_substcont)
register char *m = cx->sb_m;
char *orig = cx->sb_orig;
register REGEXP *rx = cx->sb_rx;
-
+
rxres_restore(&cx->sb_rxres, rx);
if (cx->sb_iters++) {
@@ -176,8 +176,8 @@ PP(pp_substcont)
: (REXEC_COPY_STR|REXEC_IGNOREPOS|REXEC_NOT_FIRST))))
{
SV *targ = cx->sb_targ;
- sv_catpvn(dstr, s, cx->sb_strend - s);
+ sv_catpvn(dstr, s, cx->sb_strend - s);
cx->sb_rxtainted |= RX_MATCH_TAINTED(rx);
(void)SvOOK_off(targ);
@@ -189,9 +189,11 @@ PP(pp_substcont)
sv_free(dstr);
TAINT_IF(cx->sb_rxtainted & 1);
+ if (pm->op_pmdynflags & PMdf_UTF8)
+ SvUTF8_on(targ); /* could also copy SvUTF8(dstr)? */
PUSHs(sv_2mortal(newSViv((I32)cx->sb_iters - 1)));
- (void)SvPOK_only(targ);
+ (void)SvPOK_only_UTF8(targ);
TAINT_IF(cx->sb_rxtainted);
SvSETMAGIC(targ);
SvTAINT(targ);
@@ -209,7 +211,8 @@ PP(pp_substcont)
cx->sb_strend = s + (cx->sb_strend - m);
}
cx->sb_m = m = rx->startp[0] + orig;
- sv_catpvn(dstr, s, m-s);
+ if (m > s)
+ sv_catpvn(dstr, s, m-s);
cx->sb_s = rx->endp[0] + orig;
{ /* Update the pos() information. */
SV *sv = cx->sb_targ;
diff --git a/pp_hot.c b/pp_hot.c
index 6a5b96fe1a..2904d9f6e2 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -1179,6 +1179,7 @@ PP(pp_match)
TARG = DEFSV;
EXTEND(SP,1);
}
+ PL_reg_sv = TARG;
PUTBACK; /* EVAL blocks need stack_sp. */
s = SvPV(TARG, len);
strend = s + len;
@@ -1268,27 +1269,25 @@ play_it_again:
RX_MATCH_TAINTED_on(rx);
TAINT_IF(RX_MATCH_TAINTED(rx));
if (gimme == G_ARRAY) {
- I32 iters, i, len;
+ I32 nparens, i, len;
- iters = rx->nparens;
- if (global && !iters)
+ nparens = rx->nparens;
+ if (global && !nparens)
i = 1;
else
i = 0;
SPAGAIN; /* EVAL blocks could move the stack. */
- EXTEND(SP, iters + i);
- EXTEND_MORTAL(iters + i);
- for (i = !i; i <= iters; i++) {
+ EXTEND(SP, nparens + i);
+ EXTEND_MORTAL(nparens + i);
+ for (i = !i; i <= nparens; i++) {
PUSHs(sv_newmortal());
/*SUPPRESS 560*/
if ((rx->startp[i] != -1) && rx->endp[i] != -1 ) {
len = rx->endp[i] - rx->startp[i];
s = rx->startp[i] + truebase;
sv_setpvn(*SP, s, len);
- if ((pm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) {
+ if (DO_UTF8(TARG))
SvUTF8_on(*SP);
- sv_utf8_downgrade(*SP, TRUE);
- }
}
}
if (global) {
@@ -1298,7 +1297,7 @@ play_it_again:
r_flags |= REXEC_IGNOREPOS | REXEC_NOT_FIRST;
goto play_it_again;
}
- else if (!iters)
+ else if (!nparens)
XPUSHs(&PL_sv_yes);
LEAVE_SCOPE(oldsave);
RETURN;
@@ -1831,6 +1830,7 @@ PP(pp_subst)
TARG = DEFSV;
EXTEND(SP,1);
}
+ PL_reg_sv = TARG;
if (SvFAKE(TARG) && SvREADONLY(TARG))
sv_force_normal(TARG);
if (SvREADONLY(TARG)
@@ -1847,7 +1847,7 @@ PP(pp_subst)
if (PL_tainted)
rxtainted |= 2;
TAINT_NOT;
-
+
force_it:
if (!pm || !s)
DIE(aTHX_ "panic: do_subst");
@@ -2004,6 +2004,8 @@ PP(pp_subst)
rxtainted |= RX_MATCH_TAINTED(rx);
dstr = NEWSV(25, len);
sv_setpvn(dstr, m, s-m);
+ if (DO_UTF8(TARG))
+ SvUTF8_on(dstr);
PL_curpm = pm;
if (!c) {
register PERL_CONTEXT *cx;
@@ -2030,7 +2032,8 @@ PP(pp_subst)
sv_catpvn(dstr, c, clen);
if (once)
break;
- } while (CALLREGEXEC(aTHX_ rx, s, strend, orig, s == m, TARG, NULL, r_flags));
+ } while (CALLREGEXEC(aTHX_ rx, s, strend, orig, s == m,
+ TARG, NULL, r_flags));
sv_catpvn(dstr, s, strend - s);
(void)SvOOK_off(TARG);
diff --git a/proto.h b/proto.h
index 4fc260ea3a..1bcb5cdd91 100644
--- a/proto.h
+++ b/proto.h
@@ -616,6 +616,7 @@ PERL_CALLCONV void Perl_push_scope(pTHX);
PERL_CALLCONV OP* Perl_ref(pTHX_ OP* o, I32 type);
PERL_CALLCONV OP* Perl_refkids(pTHX_ OP* o, I32 type);
PERL_CALLCONV void Perl_regdump(pTHX_ regexp* r);
+PERL_CALLCONV SV* Perl_regclass_swash(pTHX_ struct regnode *n, bool doinit, SV **initsvp);
PERL_CALLCONV I32 Perl_pregexec(pTHX_ regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, U32 nosave);
PERL_CALLCONV void Perl_pregfree(pTHX_ struct regexp* r);
PERL_CALLCONV regexp* Perl_pregcomp(pTHX_ char* exp, char* xend, PMOP* pm);
@@ -1111,7 +1112,6 @@ STATIC regnode* S_regatom(pTHX_ struct RExC_state_t*, I32 *);
STATIC regnode* S_regbranch(pTHX_ struct RExC_state_t*, I32 *, I32);
STATIC void S_reguni(pTHX_ struct RExC_state_t*, UV, char *, STRLEN*);
STATIC regnode* S_regclass(pTHX_ struct RExC_state_t*);
-STATIC regnode* S_regclassutf8(pTHX_ struct RExC_state_t*);
STATIC I32 S_regcurly(pTHX_ char *);
STATIC regnode* S_reg_node(pTHX_ struct RExC_state_t*, U8);
STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t*, I32 *);
@@ -1141,8 +1141,7 @@ STATIC I32 S_regmatch(pTHX_ regnode *prog);
STATIC I32 S_regrepeat(pTHX_ regnode *p, I32 max);
STATIC I32 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp);
STATIC I32 S_regtry(pTHX_ regexp *prog, char *startpos);
-STATIC bool S_reginclass(pTHX_ regnode *p, I32 c);
-STATIC bool S_reginclassutf8(pTHX_ regnode *f, U8* p);
+STATIC bool S_reginclass(pTHX_ regnode *n, U8 *p, bool do_utf8sv_is_utf8);
STATIC CHECKPOINT S_regcppush(pTHX_ I32 parenfloor);
STATIC char* S_regcppop(pTHX);
STATIC char* S_regcp_set_to(pTHX_ I32 ss);
diff --git a/regcomp.c b/regcomp.c
index aae2ceda5f..69a9f917b6 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -118,7 +118,7 @@ typedef struct RExC_state_t {
char *end; /* End of input for compile */
char *parse; /* Input-scan pointer. */
I32 whilem_seen; /* number of WHILEM in this expr */
- regnode *emit; /* Code-emit pointer; &regdummy = don't */
+ regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
@@ -234,8 +234,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define LOC (RExC_flags16 & PMf_LOCALE)
#define FOLD (RExC_flags16 & PMf_FOLD)
-#define OOB_CHAR8 1234
-#define OOB_UTF8 123456
+#define OOB_UNICODE 12345678
#define OOB_NAMEDCLASS -1
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
@@ -1196,7 +1195,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
break;
}
}
- else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) {
+ else if (strchr((char*)PL_simple,OP(scan))) {
int value;
if (flags & SCF_DO_SUBSTR) {
@@ -1210,20 +1209,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
/* Some of the logic below assumes that switching
locale on will only add false positives. */
switch (PL_regkind[(U8)OP(scan)]) {
- case ANYUTF8:
case SANY:
- case SANYUTF8:
- case ALNUMUTF8:
- case ANYOFUTF8:
- case ALNUMLUTF8:
- case NALNUMUTF8:
- case NALNUMLUTF8:
- case SPACEUTF8:
- case NSPACEUTF8:
- case SPACELUTF8:
- case NSPACELUTF8:
- case DIGITUTF8:
- case NDIGITUTF8:
default:
do_default:
/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
@@ -1750,7 +1736,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
/* turn .* into ^.* with an implied $*=1 */
int type = OP(NEXTOPER(first));
- if (type == REG_ANY || type == ANYUTF8)
+ if (type == REG_ANY)
type = ROPT_ANCH_MBOL;
else
type = ROPT_ANCH_SBOL;
@@ -1850,8 +1836,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
longest_fixed_length = 0;
}
if (r->regstclass
- && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8
- || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY))
+ && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
r->regstclass = NULL;
if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
&& !(data.start_class->flags & ANYOF_EOS)
@@ -1866,6 +1851,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
struct regnode_charclass_class);
r->regstclass = (regnode*)RExC_rx->data->data[n];
r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
+ PL_regdata = r->data; /* for regprop() */
DEBUG_r((sv = sv_newmortal(),
regprop(sv, (regnode*)data.start_class),
PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
@@ -1933,7 +1919,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
r->reganch |= ROPT_EVAL_SEEN;
Newz(1002, r->startp, RExC_npar, I32);
Newz(1002, r->endp, RExC_npar, I32);
- PL_regdata = r->data; /* for regprop() ANYOFUTF8 */
+ PL_regdata = r->data; /* for regprop() */
DEBUG_r(regdump(r));
return(r);
}
@@ -2556,26 +2542,17 @@ tryagain:
break;
case '.':
nextchar(pRExC_state);
- if (UTF) {
- if (RExC_flags16 & PMf_SINGLELINE)
- ret = reg_node(pRExC_state, SANYUTF8);
- else
- ret = reg_node(pRExC_state, ANYUTF8);
- *flagp |= HASWIDTH;
- }
- else {
- if (RExC_flags16 & PMf_SINGLELINE)
- ret = reg_node(pRExC_state, SANY);
- else
- ret = reg_node(pRExC_state, REG_ANY);
- *flagp |= HASWIDTH|SIMPLE;
- }
+ if (RExC_flags16 & PMf_SINGLELINE)
+ ret = reg_node(pRExC_state, SANY);
+ else
+ ret = reg_node(pRExC_state, REG_ANY);
+ *flagp |= HASWIDTH|SIMPLE;
RExC_naughty++;
break;
case '[':
{
char *oregcomp_parse = ++RExC_parse;
- ret = (UTF ? regclassutf8(pRExC_state) : regclass(pRExC_state));
+ ret = regclass(pRExC_state);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched [");
@@ -2659,20 +2636,14 @@ tryagain:
is_utf8_mark((U8*)"~"); /* preload table */
break;
case 'w':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? ALNUMLUTF8 : ALNUMUTF8)
- : (LOC ? ALNUML : ALNUM));
+ ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_alnum)
is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'W':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? NALNUMLUTF8 : NALNUMUTF8)
- : (LOC ? NALNUML : NALNUM));
+ ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_alnum)
@@ -2681,10 +2652,7 @@ tryagain:
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? BOUNDLUTF8 : BOUNDUTF8)
- : (LOC ? BOUNDL : BOUND));
+ ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_alnum)
@@ -2693,44 +2661,35 @@ tryagain:
case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8)
- : (LOC ? NBOUNDL : NBOUND));
+ ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_alnum)
is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 's':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? SPACELUTF8 : SPACEUTF8)
- : (LOC ? SPACEL : SPACE));
+ ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_space)
is_utf8_space((U8*)" "); /* preload table */
break;
case 'S':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? NSPACELUTF8 : NSPACEUTF8)
- : (LOC ? NSPACEL : NSPACE));
+ ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_space)
is_utf8_space((U8*)" "); /* preload table */
break;
case 'd':
- ret = reg_node(pRExC_state, UTF ? DIGITUTF8 : DIGIT);
+ ret = reg_node(pRExC_state, DIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_digit)
is_utf8_digit((U8*)"1"); /* preload table */
break;
case 'D':
- ret = reg_node(pRExC_state, UTF ? NDIGITUTF8 : NDIGIT);
+ ret = reg_node(pRExC_state, NDIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_digit)
@@ -2754,7 +2713,7 @@ tryagain:
RExC_end = RExC_parse + 2;
RExC_parse--;
- ret = regclassutf8(pRExC_state);
+ ret = regclass(pRExC_state);
RExC_end = oldregxend;
RExC_parse--;
@@ -3194,58 +3153,108 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state)
{
- register U32 value;
- register I32 lastvalue = OOB_CHAR8;
- register I32 range = 0;
+ register UV value;
+ register IV lastvalue = OOB_UNICODE;
+ register IV range = 0;
register regnode *ret;
STRLEN numlen;
- I32 namedclass;
+ IV namedclass;
char *rangebegin;
bool need_class = 0;
+ SV *listsv;
+ register char *e;
+ UV n;
+
+ ret = reganode(pRExC_state, ANYOF, 0);
+
+ if (!SIZE_ONLY)
+ ANYOF_FLAGS(ret) = 0;
+
+ if (*RExC_parse == '^') { /* Complement of range. */
+ RExC_naughty++;
+ RExC_parse++;
+ if (!SIZE_ONLY)
+ ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+ }
- ret = reg_node(pRExC_state, ANYOF);
if (SIZE_ONLY)
RExC_size += ANYOF_SKIP;
else {
- ret->flags = 0;
- ANYOF_BITMAP_ZERO(ret);
RExC_emit += ANYOF_SKIP;
if (FOLD)
ANYOF_FLAGS(ret) |= ANYOF_FOLD;
if (LOC)
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
- }
- if (*RExC_parse == '^') { /* Complement of range. */
- RExC_naughty++;
- RExC_parse++;
- if (!SIZE_ONLY)
- ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+ ANYOF_BITMAP_ZERO(ret);
+ listsv = newSVpvn("# comment\n", 10);
}
if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
checkposixcc(pRExC_state);
if (*RExC_parse == ']' || *RExC_parse == '-')
- goto skipcond; /* allow 1st char to be ] or - */
+ goto charclassloop; /* allow 1st char to be ] or - */
+
while (RExC_parse < RExC_end && *RExC_parse != ']') {
- skipcond:
- namedclass = OOB_NAMEDCLASS;
+
+ charclassloop:
+
+ namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
+
if (!range)
rangebegin = RExC_parse;
- value = UCHARAT(RExC_parse++);
+ if (UTF) {
+ value = utf8_to_uv((U8*)RExC_parse,
+ RExC_end - RExC_parse,
+ &numlen, 0);
+ RExC_parse += numlen;
+ }
+ else
+ value = UCHARAT(RExC_parse++);
if (value == '[')
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
- value = UCHARAT(RExC_parse++);
+ if (UTF) {
+ value = utf8_to_uv((U8*)RExC_parse,
+ RExC_end - RExC_parse,
+ &numlen, 0);
+ RExC_parse += numlen;
+ }
+ else
+ value = UCHARAT(RExC_parse++);
/* Some compilers cannot handle switching on 64-bit integer
- * values, therefore the 'value' cannot be an UV. --jhi */
- switch (value) {
+ * values, therefore value cannot be an UV. Yes, this will
+ * be a problem later if we want switch on Unicode. --jhi */
+ switch ((I32)value) {
case 'w': namedclass = ANYOF_ALNUM; break;
case 'W': namedclass = ANYOF_NALNUM; break;
case 's': namedclass = ANYOF_SPACE; break;
case 'S': namedclass = ANYOF_NSPACE; break;
case 'd': namedclass = ANYOF_DIGIT; break;
case 'D': namedclass = ANYOF_NDIGIT; break;
+ case 'p':
+ case 'P':
+ if (*RExC_parse == '{') {
+ e = strchr(RExC_parse++, '}');
+ if (!e)
+ vFAIL("Missing right brace on \\p{}");
+ n = e - RExC_parse;
+ }
+ else {
+ e = RExC_parse;
+ n = 1;
+ }
+ if (!SIZE_ONLY) {
+ if (value == 'p')
+ Perl_sv_catpvf(aTHX_ listsv,
+ "+utf8::%.*s\n", (int)n, RExC_parse);
+ else
+ Perl_sv_catpvf(aTHX_ listsv,
+ "!utf8::%.*s\n", (int)n, RExC_parse);
+ }
+ RExC_parse = e + 1;
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ continue;
case 'n': value = '\n'; break;
case 'r': value = '\r'; break;
case 't': value = '\t'; break;
@@ -3259,9 +3268,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
case 'a': value = '\057'; break;
#endif
case 'x':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(RExC_parse, 2, &numlen);
- RExC_parse += numlen;
+ if (*RExC_parse == '{') {
+ e = strchr(RExC_parse++, '}');
+ if (!e)
+ vFAIL("Missing right brace on \\x{}");
+ numlen = 1; /* allow underscores */
+ value = (UV)scan_hex(RExC_parse,
+ e - RExC_parse,
+ &numlen);
+ RExC_parse = e + 1;
+ }
+ else {
+ numlen = 0; /* disallow underscores */
+ value = (UV)scan_hex(RExC_parse, 2, &numlen);
+ RExC_parse += numlen;
+ }
break;
case 'c':
value = UCHARAT(RExC_parse++);
@@ -3275,16 +3296,22 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
break;
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
-
- vWARN2(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value);
+ vWARN2(RExC_parse,
+ "Unrecognized escape \\%c in character class passed through",
+ (int)value);
break;
}
- }
- if (namedclass > OOB_NAMEDCLASS) {
- if (!need_class && !SIZE_ONLY)
+ } /* end of \blah */
+
+ if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
+
+ if (!SIZE_ONLY && !need_class)
ANYOF_CLASS_ZERO(ret);
+
need_class = 1;
- if (range) { /* a-\d, a-[:digit:] */
+
+ /* a bad range like a-\d, a-[:digit:] ? */
+ if (range) {
if (!SIZE_ONLY) {
if (ckWARN(WARN_REGEXP))
vWARN4(RExC_parse,
@@ -3292,11 +3319,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
RExC_parse - rangebegin,
RExC_parse - rangebegin,
rangebegin);
- ANYOF_BITMAP_SET(ret, lastvalue);
- ANYOF_BITMAP_SET(ret, '-');
+ if (lastvalue < 256) {
+ ANYOF_BITMAP_SET(ret, lastvalue);
+ ANYOF_BITMAP_SET(ret, '-');
+ }
+ else {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ Perl_sv_catpvf(aTHX_ listsv,
+ /* 0x002D is Unicode for '-' */
+ "%04"UVxf"\n002D\n", (UV)lastvalue);
+ }
}
- range = 0; /* this is not a true range */
+
+ range = 0; /* this was not a true range */
}
+
if (!SIZE_ONLY) {
switch (namedclass) {
case ANYOF_ALNUM:
@@ -3307,6 +3344,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
break;
case ANYOF_NALNUM:
if (LOC)
@@ -3316,42 +3354,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
break;
- case ANYOF_SPACE:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_SPACE);
- else {
- for (value = 0; value < 256; value++)
- if (isSPACE(value))
- ANYOF_BITMAP_SET(ret, value);
- }
- break;
- case ANYOF_NSPACE:
+ case ANYOF_ALNUMC:
if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+ ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
else {
for (value = 0; value < 256; value++)
- if (!isSPACE(value))
+ if (isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- break;
- case ANYOF_DIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
- else {
- for (value = '0'; value <= '9'; value++)
- ANYOF_BITMAP_SET(ret, value);
- }
- break;
- case ANYOF_NDIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
- else {
- for (value = 0; value < '0'; value++)
- ANYOF_BITMAP_SET(ret, value);
- for (value = '9' + 1; value < 256; value++)
- ANYOF_BITMAP_SET(ret, value);
- }
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
break;
case ANYOF_NALNUMC:
if (LOC)
@@ -3361,15 +3374,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- break;
- case ANYOF_ALNUMC:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
- else {
- for (value = 0; value < 256; value++)
- if (isALNUMC(value))
- ANYOF_BITMAP_SET(ret, value);
- }
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
break;
case ANYOF_ALPHA:
if (LOC)
@@ -3379,6 +3384,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
break;
case ANYOF_NALPHA:
if (LOC)
@@ -3388,6 +3394,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
break;
case ANYOF_ASCII:
if (LOC)
@@ -3402,6 +3409,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
ANYOF_BITMAP_SET(ret, value);
#endif /* EBCDIC */
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
break;
case ANYOF_NASCII:
if (LOC)
@@ -3416,6 +3424,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
ANYOF_BITMAP_SET(ret, value);
#endif /* EBCDIC */
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
break;
case ANYOF_BLANK:
if (LOC)
@@ -3425,6 +3434,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
break;
case ANYOF_NBLANK:
if (LOC)
@@ -3434,6 +3444,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
break;
case ANYOF_CNTRL:
if (LOC)
@@ -3443,7 +3454,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
- lastvalue = OOB_CHAR8;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
break;
case ANYOF_NCNTRL:
if (LOC)
@@ -3453,6 +3464,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
+ break;
+ case ANYOF_DIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
+ else {
+ /* consecutive digits assumed */
+ for (value = '0'; value <= '9'; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
+ break;
+ case ANYOF_NDIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
+ else {
+ /* consecutive digits assumed */
+ for (value = 0; value < '0'; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ for (value = '9' + 1; value < 256; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
break;
case ANYOF_GRAPH:
if (LOC)
@@ -3462,6 +3496,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
break;
case ANYOF_NGRAPH:
if (LOC)
@@ -3471,6 +3506,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
break;
case ANYOF_LOWER:
if (LOC)
@@ -3480,6 +3516,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
break;
case ANYOF_NLOWER:
if (LOC)
@@ -3489,6 +3526,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
break;
case ANYOF_PRINT:
if (LOC)
@@ -3498,6 +3536,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
break;
case ANYOF_NPRINT:
if (LOC)
@@ -3507,6 +3546,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
break;
case ANYOF_PSXSPC:
if (LOC)
@@ -3516,6 +3556,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
break;
case ANYOF_NPSXSPC:
if (LOC)
@@ -3525,6 +3566,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
break;
case ANYOF_PUNCT:
if (LOC)
@@ -3534,6 +3576,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
break;
case ANYOF_NPUNCT:
if (LOC)
@@ -3543,6 +3586,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
+ break;
+ case ANYOF_SPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_SPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isSPACE(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
+ break;
+ case ANYOF_NSPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isSPACE(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
break;
case ANYOF_UPPER:
if (LOC)
@@ -3552,6 +3616,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
break;
case ANYOF_NUPPER:
if (LOC)
@@ -3561,6 +3626,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
break;
case ANYOF_XDIGIT:
if (LOC)
@@ -3570,6 +3636,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
break;
case ANYOF_NXDIGIT:
if (LOC)
@@ -3579,6 +3646,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
break;
default:
vFAIL("Invalid [::] class");
@@ -3588,7 +3656,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
continue;
}
- }
+ } /* end of namedclass \blah */
+
if (range) {
if (lastvalue > value) /* b-a */ {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
@@ -3596,14 +3665,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
RExC_parse - rangebegin,
rangebegin);
}
- range = 0;
+ range = 0; /* not a true range */
}
else {
- lastvalue = value;
+ lastvalue = value; /* save the beginning of the range */
if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
RExC_parse[1] != ']') {
RExC_parse++;
- if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
+
+ /* a bad range like \w-, [:word:]- ? */
+ if (namedclass > OOB_NAMEDCLASS) {
if (ckWARN(WARN_REGEXP))
vWARN4(RExC_parse,
"False [] range \"%*.*s\"",
@@ -3613,325 +3684,89 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!SIZE_ONLY)
ANYOF_BITMAP_SET(ret, '-');
} else
- range = 1;
- continue; /* do it next time */
+ range = 1; /* yeah, it's a range! */
+ continue; /* but do it the next time */
}
}
+
/* now is the next time */
if (!SIZE_ONLY) {
+ if (lastvalue < 256 && value < 256) {
#ifndef ASCIIish /* EBCDIC, for example. */
- if ((isLOWER(lastvalue) && isLOWER(value)) ||
- (isUPPER(lastvalue) && isUPPER(value)))
- {
- I32 i;
- if (isLOWER(lastvalue)) {
- for (i = lastvalue; i <= value; i++)
- if (isLOWER(i))
- ANYOF_BITMAP_SET(ret, i);
- } else {
- for (i = lastvalue; i <= value; i++)
- if (isUPPER(i))
- ANYOF_BITMAP_SET(ret, i);
+ if ((isLOWER(lastvalue) && isLOWER(value)) ||
+ (isUPPER(lastvalue) && isUPPER(value)))
+ {
+ IV i;
+ if (isLOWER(lastvalue)) {
+ for (i = lastvalue; i <= value; i++)
+ if (isLOWER(i))
+ ANYOF_BITMAP_SET(ret, i);
+ } else {
+ for (i = lastvalue; i <= value; i++)
+ if (isUPPER(i))
+ ANYOF_BITMAP_SET(ret, i);
+ }
}
- }
- else
+ else
#endif
- for ( ; lastvalue <= value; lastvalue++)
- ANYOF_BITMAP_SET(ret, lastvalue);
+ for ( ; lastvalue <= value; lastvalue++)
+ ANYOF_BITMAP_SET(ret, lastvalue);
+ } else {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ if (lastvalue < value)
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+ (UV)lastvalue, (UV)value);
+ else
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)value);
+ }
}
- range = 0;
+
+ range = 0; /* this range (if it was one) is done now */
}
+
if (need_class) {
if (SIZE_ONLY)
RExC_size += ANYOF_CLASS_ADD_SKIP;
else
RExC_emit += ANYOF_CLASS_ADD_SKIP;
}
+
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
if (!SIZE_ONLY &&
- (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) {
+ (ANYOF_FLAGS(ret) &
+ /* If the only flag is folding (plus possibly inversion). */
+ (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
for (value = 0; value < 256; ++value) {
if (ANYOF_BITMAP_TEST(ret, value)) {
- I32 cf = PL_fold[value];
- ANYOF_BITMAP_SET(ret, cf);
+ IV fold = PL_fold[value];
+
+ if (fold != value)
+ ANYOF_BITMAP_SET(ret, fold);
}
}
ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
}
+
/* optimize inverted simple patterns (e.g. [^a-z]) */
- if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+ if (!SIZE_ONLY &&
+ /* If the only flag is inversion. */
+ (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
ANYOF_FLAGS(ret) = 0;
}
- return ret;
-}
-STATIC regnode *
-S_regclassutf8(pTHX_ RExC_state_t *pRExC_state)
-{
- register char *e;
- register U32 value;
- register U32 lastvalue = OOB_UTF8;
- register I32 range = 0;
- register regnode *ret;
- STRLEN numlen;
- I32 n;
- SV *listsv;
- U8 flags = 0;
- I32 namedclass;
- char *rangebegin;
-
- if (*RExC_parse == '^') { /* Complement of range. */
- RExC_naughty++;
- RExC_parse++;
- if (!SIZE_ONLY)
- flags |= ANYOF_INVERT;
- }
- if (!SIZE_ONLY) {
- if (FOLD)
- flags |= ANYOF_FOLD;
- if (LOC)
- flags |= ANYOF_LOCALE;
- listsv = newSVpvn("# comment\n", 10);
- }
-
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
- checkposixcc(pRExC_state);
-
- if (*RExC_parse == ']' || *RExC_parse == '-')
- goto skipcond; /* allow 1st char to be ] or - */
-
- while (RExC_parse < RExC_end && *RExC_parse != ']') {
- skipcond:
- namedclass = OOB_NAMEDCLASS;
- if (!range)
- rangebegin = RExC_parse;
- value = utf8_to_uv((U8*)RExC_parse,
- RExC_end - RExC_parse,
- &numlen, 0);
- RExC_parse += numlen;
- if (value == '[')
- namedclass = regpposixcc(pRExC_state, value);
- else if (value == '\\') {
- value = (U32)utf8_to_uv((U8*)RExC_parse,
- RExC_end - RExC_parse,
- &numlen, 0);
- RExC_parse += numlen;
- /* Some compilers cannot handle switching on 64-bit integer
- * values, therefore value cannot be an UV. Yes, this will
- * be a problem later if we want switch on Unicode. --jhi */
- switch (value) {
- case 'w': namedclass = ANYOF_ALNUM; break;
- case 'W': namedclass = ANYOF_NALNUM; break;
- case 's': namedclass = ANYOF_SPACE; break;
- case 'S': namedclass = ANYOF_NSPACE; break;
- case 'd': namedclass = ANYOF_DIGIT; break;
- case 'D': namedclass = ANYOF_NDIGIT; break;
- case 'p':
- case 'P':
- if (*RExC_parse == '{') {
- e = strchr(RExC_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\p{}");
- n = e - RExC_parse;
- }
- else {
- e = RExC_parse;
- n = 1;
- }
- if (!SIZE_ONLY) {
- if (value == 'p')
- Perl_sv_catpvf(aTHX_ listsv,
- "+utf8::%.*s\n", (int)n, RExC_parse);
- else
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::%.*s\n", (int)n, RExC_parse);
- }
- RExC_parse = e + 1;
- lastvalue = OOB_UTF8;
- continue;
- case 'n': value = '\n'; break;
- case 'r': value = '\r'; break;
- case 't': value = '\t'; break;
- case 'f': value = '\f'; break;
- case 'b': value = '\b'; break;
-#ifdef ASCIIish
- case 'e': value = '\033'; break;
- case 'a': value = '\007'; break;
-#else
- case 'e': value = '\047'; break;
- case 'a': value = '\057'; break;
-#endif
- case 'x':
- if (*RExC_parse == '{') {
- e = strchr(RExC_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\x{}");
- numlen = 1; /* allow underscores */
- value = (UV)scan_hex(RExC_parse,
- e - RExC_parse,
- &numlen);
- RExC_parse = e + 1;
- }
- else {
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(RExC_parse, 2, &numlen);
- RExC_parse += numlen;
- }
- break;
- case 'c':
- value = UCHARAT(RExC_parse++);
- value = toCTRL(value);
- break;
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_oct(--RExC_parse, 3, &numlen);
- RExC_parse += numlen;
- break;
- default:
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
- vWARN2(RExC_parse,
- "Unrecognized escape \\%c in character class passed through",
- (int)value);
- break;
- }
- }
- if (namedclass > OOB_NAMEDCLASS) {
- if (range) { /* a-\d, a-[:digit:] */
- if (!SIZE_ONLY) {
- if (ckWARN(WARN_REGEXP))
- vWARN4(RExC_parse,
- "False [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
- Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "%04"UVxf"\n002D\n", (UV)lastvalue);
- }
- range = 0;
- }
- if (!SIZE_ONLY) {
- switch (namedclass) {
- case ANYOF_ALNUM:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break;
- case ANYOF_NALNUM:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break;
- case ANYOF_ALNUMC:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break;
- case ANYOF_NALNUMC:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break;
- case ANYOF_ALPHA:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break;
- case ANYOF_NALPHA:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break;
- case ANYOF_ASCII:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break;
- case ANYOF_NASCII:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break;
- case ANYOF_CNTRL:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break;
- case ANYOF_NCNTRL:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break;
- case ANYOF_GRAPH:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break;
- case ANYOF_NGRAPH:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break;
- case ANYOF_DIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break;
- case ANYOF_NDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break;
- case ANYOF_LOWER:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break;
- case ANYOF_NLOWER:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break;
- case ANYOF_PRINT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break;
- case ANYOF_NPRINT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break;
- case ANYOF_PUNCT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break;
- case ANYOF_NPUNCT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break;
- case ANYOF_SPACE:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break;
- case ANYOF_NSPACE:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break;
- case ANYOF_BLANK:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break;
- case ANYOF_NBLANK:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break;
- case ANYOF_PSXSPC:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break;
- case ANYOF_NPSXSPC:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break;
- case ANYOF_UPPER:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break;
- case ANYOF_NUPPER:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break;
- case ANYOF_XDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break;
- case ANYOF_NXDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break;
- }
- continue;
- }
- }
- if (range) {
- if (lastvalue > value) { /* b-a */
- Simple_vFAIL4("Invalid [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
- }
- range = 0;
- }
- else {
- lastvalue = value;
- if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
- RExC_parse[1] != ']') {
- RExC_parse++;
- if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
- if (ckWARN(WARN_REGEXP))
- vWARN4(RExC_parse,
- "False [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "002D\n");
- } else
- range = 1;
- continue; /* do it next time */
- }
- }
- /* now is the next time */
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- (UV)lastvalue, (UV)value);
- range = 0;
- }
-
- ret = reganode(pRExC_state, ANYOFUTF8, 0);
-
- if (!SIZE_ONLY) {
- SV *rv = swash_init("utf8", "", listsv, 1, 0);
-#ifdef DEBUGGING
+ if (!SIZE_ONLY) {
AV *av = newAV();
- av_push(av, rv);
- av_push(av, listsv);
- rv = newRV_inc((SV*)av);
-#else
- SvREFCNT_dec(listsv);
-#endif
+ SV *rv;
+
+ av_store(av, 0, listsv);
+ av_store(av, 1, NULL);
+ rv = newRV_noinc((SV*)av);
n = add_data(pRExC_state, 1, "s");
RExC_rx->data->data[n] = (void*)rv;
- ARG1_SET(ret, flags);
- ARG2_SET(ret, n);
+ ARG_SET(ret, n);
}
return ret;
@@ -4269,7 +4104,7 @@ Perl_regdump(pTHX_ regexp *r)
STATIC void
S_put_byte(pTHX_ SV *sv, int c)
{
- if (isCNTRL(c) || c == 127 || c == 255)
+ if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
else if (c == '-' || c == ']' || c == '\\' || c == '^')
Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
@@ -4311,8 +4146,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == ANYOF) {
int i, rangestart = -1;
- bool anyofutf8 = OP(o) == ANYOFUTF8;
- U8 flags = anyofutf8 ? ARG1(o) : o->flags;
+ U8 flags = ANYOF_FLAGS(o);
const char * const anyofs[] = { /* Should be syncronized with
* ANYOF_ #xdefines in regcomp.h */
"\\w",
@@ -4354,78 +4188,93 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
sv_catpv(sv, "^");
- if (OP(o) == ANYOF) {
- for (i = 0; i <= 256; i++) {
- if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++)
- put_byte(sv, rangestart);
- else {
+ for (i = 0; i <= 256; i++) {
+ if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ if (i <= rangestart + 3)
+ for (; rangestart < i; rangestart++)
put_byte(sv, rangestart);
- sv_catpv(sv, "-");
- put_byte(sv, i - 1);
- }
- rangestart = -1;
+ else {
+ put_byte(sv, rangestart);
+ sv_catpv(sv, "-");
+ put_byte(sv, i - 1);
}
+ rangestart = -1;
}
- if (o->flags & ANYOF_CLASS)
- for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
- if (ANYOF_CLASS_TEST(o,i))
- sv_catpv(sv, anyofs[i]);
}
- else {
- SV *rv = (SV*)PL_regdata->data[ARG2(o)];
- AV *av = (AV*)SvRV((SV*)rv);
- SV *sw = *av_fetch(av, 0, FALSE);
- SV *lv = *av_fetch(av, 1, FALSE);
- UV i;
- U8 s[UTF8_MAXLEN+1];
- for (i = 0; i <= 256; i++) { /* just the first 256 */
- U8 *e = uv_to_utf8(s, i);
- if (i < 256 && swash_fetch(sw, s)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- U8 *p;
-
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++) {
- for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
- put_byte(sv, *p);
+
+ if (o->flags & ANYOF_CLASS)
+ for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
+ if (ANYOF_CLASS_TEST(o,i))
+ sv_catpv(sv, anyofs[i]);
+
+ if (flags & ANYOF_UNICODE)
+ sv_catpv(sv, "{unicode}");
+
+ {
+ SV *lv;
+ SV *sw = regclass_swash(o, FALSE, &lv);
+
+ if (lv) {
+ if (sw) {
+ UV i;
+ U8 s[UTF8_MAXLEN+1];
+
+ for (i = 0; i <= 256; i++) { /* just the first 256 */
+ U8 *e = uv_to_utf8(s, i);
+
+ if (i < 256 && swash_fetch(sw, s)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ U8 *p;
+
+ if (i <= rangestart + 3)
+ for (; rangestart < i; rangestart++) {
+ for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+ put_byte(sv, *p);
+ }
+ else {
+ for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+ put_byte(sv, *p);
+ sv_catpv(sv, "-");
+ for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
+ put_byte(sv, *p);
+ }
+ rangestart = -1;
+ }
}
- else {
- for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
- put_byte(sv, *p);
- sv_catpv(sv, "-");
- for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
- put_byte(sv, *p);
- }
- rangestart = -1;
+
+ sv_catpv(sv, "..."); /* et cetera */
}
- }
- sv_catpv(sv, "...");
- {
- char *s = savepv(SvPVX(lv));
-
- while(*s && *s != '\n') s++;
- if (*s == '\n') {
- char *t = ++s;
- while (*s) {
- if (*s == '\n')
- *s = ' ';
- s++;
+ {
+ char *s = savepv(SvPVX(lv));
+ char *origs = s;
+
+ while(*s && *s != '\n') s++;
+
+ if (*s == '\n') {
+ char *t = ++s;
+
+ while (*s) {
+ if (*s == '\n')
+ *s = ' ';
+ s++;
+ }
+ if (s[-1] == ' ')
+ s[-1] = 0;
+
+ sv_catpv(sv, t);
}
- if (s[-1] == ' ')
- s[-1] = 0;
-
- sv_catpv(sv, t);
+
+ Safefree(origs);
}
}
}
+
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
@@ -4486,16 +4335,6 @@ Perl_pregfree(pTHX_ struct regexp *r)
while (--n >= 0) {
switch (r->data->what[n]) {
case 's':
-#ifdef DEBUGGING
- {
- SV *rv = (SV*)r->data->data[n];
- AV *av = (AV*)SvRV((SV*)rv);
- SV *sw = *av_fetch(av, 0, FALSE);
- SV *lv = *av_fetch(av, 1, FALSE);
- SvREFCNT_dec(sw);
- SvREFCNT_dec(lv);
- }
-#endif
SvREFCNT_dec((SV*)r->data->data[n]);
break;
case 'f':
@@ -4657,4 +4496,3 @@ clear_re(pTHXo_ void *r)
{
ReREFCNT_dec((regexp *)r);
}
-
diff --git a/regcomp.h b/regcomp.h
index 284cf2fff8..c8094e14a4 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -88,12 +88,13 @@ struct regnode_2 {
};
#define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */
-#define ANYOF_CLASSBITMAP_SIZE 4
+#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */
struct regnode_charclass {
U8 flags;
U8 type;
U16 next_off;
+ U32 arg1;
char bitmap[ANYOF_BITMAP_SIZE];
};
@@ -101,6 +102,7 @@ struct regnode_charclass_class {
U8 flags;
U8 type;
U16 next_off;
+ U32 arg1;
char bitmap[ANYOF_BITMAP_SIZE];
char classflags[ANYOF_CLASSBITMAP_SIZE];
};
@@ -180,13 +182,21 @@ struct regnode_charclass_class {
/* Flags for node->flags of ANYOF */
-#define ANYOF_CLASS 0x08
-#define ANYOF_INVERT 0x04
-#define ANYOF_FOLD 0x02
-#define ANYOF_LOCALE 0x01
+#define ANYOF_CLASS 0x08
+#define ANYOF_INVERT 0x04
+#define ANYOF_FOLD 0x02
+#define ANYOF_LOCALE 0x01
/* Used for regstclass only */
-#define ANYOF_EOS 0x10 /* Can match an empty string too */
+#define ANYOF_EOS 0x10 /* Can match an empty string too */
+
+/* There is a character or a range past 0xff */
+#define ANYOF_UNICODE 0x20
+
+/* Are there any runtime flags on in this node? */
+#define ANYOF_RUNTIME(s) (ANYOF_FLAGS(s) & 0x0f)
+
+#define ANYOF_FLAGS_ALL 0xff
/* Character classes for node->classflags of ANYOF */
/* Should be synchronized with a table in regprop() */
@@ -220,7 +230,7 @@ struct regnode_charclass_class {
#define ANYOF_NXDIGIT 25
#define ANYOF_PSXSPC 26 /* POSIX space: \s plus the vertical tab */
#define ANYOF_NPSXSPC 27
-#define ANYOF_BLANK 28 /* GNU extension: space and tab */
+#define ANYOF_BLANK 28 /* GNU extension: space and tab: non-vertical space */
#define ANYOF_NBLANK 29
#define ANYOF_MAX 32
@@ -238,7 +248,6 @@ struct regnode_charclass_class {
#define ANYOF_CLASS_SIZE (sizeof(struct regnode_charclass_class))
#define ANYOF_FLAGS(p) ((p)->flags)
-#define ANYOF_FLAGS_ALL 0xff
#define ANYOF_BIT(c) (1 << ((c) & 7))
@@ -300,12 +309,14 @@ EXTCONST U8 PL_varies[] = {
EXTCONST U8 PL_simple[];
#else
EXTCONST U8 PL_simple[] = {
- REG_ANY, ANYUTF8, SANY, SANYUTF8, ANYOF, ANYOFUTF8,
- ALNUM, ALNUMUTF8, ALNUML, ALNUMLUTF8,
- NALNUM, NALNUMUTF8, NALNUML, NALNUMLUTF8,
- SPACE, SPACEUTF8, SPACEL, SPACELUTF8,
- NSPACE, NSPACEUTF8, NSPACEL, NSPACELUTF8,
- DIGIT, DIGITUTF8, NDIGIT, NDIGITUTF8, 0
+ REG_ANY, SANY,
+ ANYOF,
+ ALNUM, ALNUML,
+ NALNUM, NALNUML,
+ SPACE, SPACEL,
+ NSPACE, NSPACEL,
+ DIGIT, NDIGIT,
+ 0
};
#endif
diff --git a/regcomp.sym b/regcomp.sym
index bb5f8f8482..59284f4b21 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -16,46 +16,27 @@ EOL EOL, no Match "" at end of line.
MEOL EOL, no Same, assuming multiline.
SEOL EOL, no Same, assuming singleline.
BOUND BOUND, no Match "" at any word boundary
-BOUNDUTF8 BOUND, no Match "" at any word boundary
BOUNDL BOUND, no Match "" at any word boundary
-BOUNDLUTF8 BOUND, no Match "" at any word boundary
NBOUND NBOUND, no Match "" at any word non-boundary
-NBOUNDUTF8 NBOUND, no Match "" at any word non-boundary
NBOUNDL NBOUND, no Match "" at any word non-boundary
-NBOUNDLUTF8 NBOUND, no Match "" at any word non-boundary
GPOS GPOS, no Matches where last m//g left off.
# [Special] alternatives
REG_ANY REG_ANY, no Match any one character (except newline).
-ANYUTF8 REG_ANY, no Match any one Unicode character (except newline).
SANY REG_ANY, no Match any one character.
-SANYUTF8 REG_ANY, no Match any one Unicode character.
ANYOF ANYOF, sv Match character in (or not in) this class.
-ANYOFUTF8 ANYOF, sv 1 Match character in (or not in) this class.
ALNUM ALNUM, no Match any alphanumeric character
-ALNUMUTF8 ALNUM, no Match any alphanumeric character in utf8
ALNUML ALNUM, no Match any alphanumeric char in locale
-ALNUMLUTF8 ALNUM, no Match any alphanumeric char in locale+utf8
NALNUM NALNUM, no Match any non-alphanumeric character
-NALNUMUTF8 NALNUM, no Match any non-alphanumeric character in utf8
NALNUML NALNUM, no Match any non-alphanumeric char in locale
-NALNUMLUTF8 NALNUM, no Match any non-alphanumeric char in locale+utf8
SPACE SPACE, no Match any whitespace character
-SPACEUTF8 SPACE, no Match any whitespace character in utf8
SPACEL SPACE, no Match any whitespace char in locale
-SPACELUTF8 SPACE, no Match any whitespace char in locale+utf8
NSPACE NSPACE, no Match any non-whitespace character
-NSPACEUTF8 NSPACE, no Match any non-whitespace character in utf8
NSPACEL NSPACE, no Match any non-whitespace char in locale
-NSPACELUTF8 NSPACE, no Match any non-whitespace char in locale+utf8
DIGIT DIGIT, no Match any numeric character
-DIGITUTF8 DIGIT, no Match any numeric character in utf8
DIGITL DIGIT, no Match any numeric character in locale
-DIGITLUTF8 DIGIT, no Match any numeric character in locale+utf8
NDIGIT NDIGIT, no Match any non-numeric character
-NDIGITUTF8 NDIGIT, no Match any non-numeric character in utf8
NDIGITL NDIGIT, no Match any non-numeric character in locale
-NDIGITLUTF8 NDIGIT, no Match any non-numeric character in locale+utf8
CLUMP CLUMP, no Match any combining character sequence
# BRANCH The set of branches constituting a single choice are hooked
diff --git a/regexec.c b/regexec.c
index 5e821ba3f0..ac91beaf75 100644
--- a/regexec.c
+++ b/regexec.c
@@ -105,13 +105,6 @@
* Forwards.
*/
-#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c))
-#ifdef DEBUGGING
-# define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch(*av_fetch((AV*)SvRV((SV*)PL_regdata->data[ARG2(f)]),0,FALSE),p))
-#else
-# define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p))
-#endif
-
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
@@ -738,7 +731,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
t = s;
if (prog->reganch & ROPT_UTF8) {
- PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */
+ PL_regdata = prog->data;
PL_bostr = startpos;
}
s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
@@ -840,25 +833,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
unsigned int c2;
char *e;
register I32 tmp = 1; /* Scratch variable? */
+ register bool do_utf8 = DO_UTF8(PL_reg_sv);
/* We know what class it must start with. */
switch (OP(c)) {
- case ANYOFUTF8:
- while (s < strend) {
- if (REGINCLASSUTF8(c, (U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
- else
- tmp = doevery;
- }
- else
- tmp = 1;
- s += UTF8SKIP(s);
- }
- break;
case ANYOF:
while (s < strend) {
- if (REGINCLASS(c, *(U8*)s)) {
+ if (reginclass(c, (U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
else
@@ -866,7 +847,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
}
else
tmp = 1;
- s++;
+ s += do_utf8 ? UTF8SKIP(s) : 1;
}
break;
case EXACTF:
@@ -912,42 +893,40 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
case BOUND:
- tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
- tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
- while (s < strend) {
- if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
- tmp = !tmp;
- if ((norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ if (s == startpos)
+ tmp = '\n';
+ else {
+ U8 *r = reghop((U8*)s, -1);
+
+ tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
+ }
+ tmp = ((OP(c) == BOUND ?
+ isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
+ while (s < strend) {
+ if (tmp == !(OP(c) == BOUND ?
+ swash_fetch(PL_utf8_alnum, (U8*)s) :
+ isALNUM_LC_utf8((U8*)s)))
+ {
+ tmp = !tmp;
+ if ((norun || regtry(prog, s)))
+ goto got_it;
+ }
+ s += UTF8SKIP(s);
}
- s++;
}
- if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
- goto got_it;
- break;
- case BOUNDLUTF8:
- PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
- case BOUNDUTF8:
- if (s == startpos)
- tmp = '\n';
else {
- U8 *r = reghop((U8*)s, -1);
-
- tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
- }
- tmp = ((OP(c) == BOUNDUTF8 ?
- isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
- while (s < strend) {
- if (tmp == !(OP(c) == BOUNDUTF8 ?
- swash_fetch(PL_utf8_alnum, (U8*)s) :
- isALNUM_LC_utf8((U8*)s)))
- {
- tmp = !tmp;
- if ((norun || regtry(prog, s)))
- goto got_it;
+ tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
+ tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+ while (s < strend) {
+ if (tmp ==
+ !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
+ tmp = !tmp;
+ if ((norun || regtry(prog, s)))
+ goto got_it;
+ }
+ s++;
}
- s += UTF8SKIP(s);
}
if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
goto got_it;
@@ -956,365 +935,382 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
case NBOUND:
- tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
- tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
- while (s < strend) {
- if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
- tmp = !tmp;
- else if ((norun || regtry(prog, s)))
- goto got_it;
- s++;
+ if (do_utf8) {
+ if (s == startpos)
+ tmp = '\n';
+ else {
+ U8 *r = reghop((U8*)s, -1);
+
+ tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
+ }
+ tmp = ((OP(c) == NBOUND ?
+ isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
+ while (s < strend) {
+ if (tmp == !(OP(c) == NBOUND ?
+ swash_fetch(PL_utf8_alnum, (U8*)s) :
+ isALNUM_LC_utf8((U8*)s)))
+ tmp = !tmp;
+ else if ((norun || regtry(prog, s)))
+ goto got_it;
+ s += UTF8SKIP(s);
+ }
}
- if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
- goto got_it;
- break;
- case NBOUNDLUTF8:
- PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
- case NBOUNDUTF8:
- if (s == startpos)
- tmp = '\n';
else {
- U8 *r = reghop((U8*)s, -1);
-
- tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
- }
- tmp = ((OP(c) == NBOUNDUTF8 ?
- isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
- while (s < strend) {
- if (tmp == !(OP(c) == NBOUNDUTF8 ?
- swash_fetch(PL_utf8_alnum, (U8*)s) :
- isALNUM_LC_utf8((U8*)s)))
- tmp = !tmp;
- else if ((norun || regtry(prog, s)))
- goto got_it;
- s += UTF8SKIP(s);
+ tmp = (s != startpos) ? UCHARAT(s - 1) : '\n';
+ tmp = ((OP(c) == NBOUND ?
+ isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
+ while (s < strend) {
+ if (tmp ==
+ !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
+ tmp = !tmp;
+ else if ((norun || regtry(prog, s)))
+ goto got_it;
+ s++;
+ }
}
if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
goto got_it;
break;
case ALNUM:
- while (s < strend) {
- if (isALNUM(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case ALNUMUTF8:
- while (s < strend) {
- if (swash_fetch(PL_utf8_alnum, (U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (isALNUM(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case ALNUML:
PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (isALNUM_LC(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (isALNUM_LC_utf8((U8*)s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case ALNUMLUTF8:
- PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (isALNUM_LC_utf8((U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (isALNUM_LC(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case NALNUM:
- while (s < strend) {
- if (!isALNUM(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case NALNUMUTF8:
- while (s < strend) {
- if (!swash_fetch(PL_utf8_alnum, (U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (!isALNUM(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case NALNUML:
PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (!isALNUM_LC(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (!isALNUM_LC_utf8((U8*)s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case NALNUMLUTF8:
- PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (!isALNUM_LC_utf8((U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (!isALNUM_LC(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case SPACE:
- while (s < strend) {
- if (isSPACE(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case SPACEUTF8:
- while (s < strend) {
- if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (isSPACE(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case SPACEL:
PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (isSPACE_LC(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case SPACELUTF8:
- PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (isSPACE_LC(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case NSPACE:
- while (s < strend) {
- if (!isSPACE(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case NSPACEUTF8:
- while (s < strend) {
- if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (!isSPACE(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case NSPACEL:
PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (!isSPACE_LC(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case NSPACELUTF8:
- PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (!isSPACE_LC(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case DIGIT:
- while (s < strend) {
- if (isDIGIT(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (swash_fetch(PL_utf8_digit,(U8*)s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case DIGITUTF8:
- while (s < strend) {
- if (swash_fetch(PL_utf8_digit,(U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (isDIGIT(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case DIGITL:
PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (isDIGIT_LC(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (isDIGIT_LC_utf8((U8*)s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case DIGITLUTF8:
- PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (isDIGIT_LC_utf8((U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (isDIGIT_LC(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case NDIGIT:
- while (s < strend) {
- if (!isDIGIT(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case NDIGITUTF8:
- while (s < strend) {
- if (!swash_fetch(PL_utf8_digit,(U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (!isDIGIT(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
case NDIGITL:
PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (!isDIGIT_LC(*s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ if (do_utf8) {
+ while (s < strend) {
+ if (!isDIGIT_LC_utf8((U8*)s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s += UTF8SKIP(s);
}
- else
- tmp = 1;
- s++;
}
- break;
- case NDIGITLUTF8:
- PL_reg_flags |= RF_tainted;
- while (s < strend) {
- if (!isDIGIT_LC_utf8((U8*)s)) {
- if (tmp && (norun || regtry(prog, s)))
- goto got_it;
+ else {
+ while (s < strend) {
+ if (!isDIGIT_LC(*s)) {
+ if (tmp && (norun || regtry(prog, s)))
+ goto got_it;
+ else
+ tmp = doevery;
+ }
else
- tmp = doevery;
+ tmp = 1;
+ s++;
}
- else
- tmp = 1;
- s += UTF8SKIP(s);
}
break;
default:
@@ -1606,6 +1602,11 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
/* don't bother with what can't match */
strend = HOPc(strend, -(minlen - 1));
+ DEBUG_r({
+ SV *prop = sv_newmortal();
+ regprop(prop, c);
+ PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s);
+ });
if (find_byclass(prog, c, s, strend, startpos, 0))
goto got_it;
DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
@@ -1619,7 +1620,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
last = screaminstr(sv, prog->float_substr, s - strbeg,
end_shift, &scream_pos, 1); /* last one */
if (!last)
- last = scream_olds; /* Only one occurence. */
+ last = scream_olds; /* Only one occurrence. */
}
else {
STRLEN len;
@@ -1891,6 +1892,7 @@ S_regmatch(pTHX_ regnode *prog)
int minmod = 0, sw = 0, logical = 0;
I32 unwind = 0;
I32 firstcp = PL_savestack_ix;
+ register bool do_utf8 = DO_UTF8(PL_reg_sv);
#ifdef DEBUGGING
PL_regindent++;
@@ -2009,8 +2011,8 @@ S_regmatch(pTHX_ regnode *prog)
if (PL_regeol != locinput)
sayNO;
break;
- case SANYUTF8:
- if (nextchr & 0x80) {
+ case SANY:
+ if (DO_UTF8(PL_reg_sv)) {
locinput += PL_utf8skip[nextchr];
if (locinput > PL_regeol)
sayNO;
@@ -2021,13 +2023,8 @@ S_regmatch(pTHX_ regnode *prog)
sayNO;
nextchr = UCHARAT(++locinput);
break;
- case SANY:
- if (!nextchr && locinput >= PL_regeol)
- sayNO;
- nextchr = UCHARAT(++locinput);
- break;
- case ANYUTF8:
- if (nextchr & 0x80) {
+ case REG_ANY:
+ if (DO_UTF8(PL_reg_sv)) {
locinput += PL_utf8skip[nextchr];
if (locinput > PL_regeol)
sayNO;
@@ -2038,11 +2035,6 @@ S_regmatch(pTHX_ regnode *prog)
sayNO;
nextchr = UCHARAT(++locinput);
break;
- case REG_ANY:
- if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
- sayNO;
- nextchr = UCHARAT(++locinput);
- break;
case EXACT:
s = STRING(scan);
ln = STR_LEN(scan);
@@ -2099,22 +2091,24 @@ S_regmatch(pTHX_ regnode *prog)
locinput += ln;
nextchr = UCHARAT(locinput);
break;
- case ANYOFUTF8:
- if (!REGINCLASSUTF8(scan, (U8*)locinput))
- sayNO;
- if (locinput >= PL_regeol)
- sayNO;
- locinput += PL_utf8skip[nextchr];
- nextchr = UCHARAT(locinput);
- break;
case ANYOF:
- if (nextchr < 0)
+ if (do_utf8) {
+ if (!reginclass(scan, (U8*)locinput, do_utf8))
+ sayNO;
+ if (locinput >= PL_regeol)
+ sayNO;
+ locinput += PL_utf8skip[nextchr];
nextchr = UCHARAT(locinput);
- if (!REGINCLASS(scan, nextchr))
- sayNO;
- if (!nextchr && locinput >= PL_regeol)
- sayNO;
- nextchr = UCHARAT(++locinput);
+ }
+ else {
+ if (nextchr < 0)
+ nextchr = UCHARAT(locinput);
+ if (!reginclass(scan, (U8*)locinput, do_utf8))
+ sayNO;
+ if (!nextchr && locinput >= PL_regeol)
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ }
break;
case ALNUML:
PL_reg_flags |= RF_tainted;
@@ -2122,19 +2116,8 @@ S_regmatch(pTHX_ regnode *prog)
case ALNUM:
if (!nextchr)
sayNO;
- if (!(OP(scan) == ALNUM
- ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
- sayNO;
- nextchr = UCHARAT(++locinput);
- break;
- case ALNUMLUTF8:
- PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
- case ALNUMUTF8:
- if (!nextchr)
- sayNO;
- if (nextchr & 0x80) {
- if (!(OP(scan) == ALNUMUTF8
+ if (do_utf8) {
+ if (!(OP(scan) == ALNUM
? swash_fetch(PL_utf8_alnum, (U8*)locinput)
: isALNUM_LC_utf8((U8*)locinput)))
{
@@ -2144,7 +2127,7 @@ S_regmatch(pTHX_ regnode *prog)
nextchr = UCHARAT(locinput);
break;
}
- if (!(OP(scan) == ALNUMUTF8
+ if (!(OP(scan) == ALNUM
? isALNUM(nextchr) : isALNUM_LC(nextchr)))
sayNO;
nextchr = UCHARAT(++locinput);
@@ -2155,19 +2138,8 @@ S_regmatch(pTHX_ regnode *prog)
case NALNUM:
if (!nextchr && locinput >= PL_regeol)
sayNO;
- if (OP(scan) == NALNUM
- ? isALNUM(nextchr) : isALNUM_LC(nextchr))
- sayNO;
- nextchr = UCHARAT(++locinput);
- break;
- case NALNUMLUTF8:
- PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
- case NALNUMUTF8:
- if (!nextchr && locinput >= PL_regeol)
- sayNO;
- if (nextchr & 0x80) {
- if (OP(scan) == NALNUMUTF8
+ if (do_utf8) {
+ if (OP(scan) == NALNUM
? swash_fetch(PL_utf8_alnum, (U8*)locinput)
: isALNUM_LC_utf8((U8*)locinput))
{
@@ -2177,7 +2149,7 @@ S_regmatch(pTHX_ regnode *prog)
nextchr = UCHARAT(locinput);
break;
}
- if (OP(scan) == NALNUMUTF8
+ if (OP(scan) == NALNUM
? isALNUM(nextchr) : isALNUM_LC(nextchr))
sayNO;
nextchr = UCHARAT(++locinput);
@@ -2189,42 +2161,38 @@ S_regmatch(pTHX_ regnode *prog)
case BOUND:
case NBOUND:
/* was last char in word? */
- ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev;
- if (OP(scan) == BOUND || OP(scan) == NBOUND) {
- ln = isALNUM(ln);
- n = isALNUM(nextchr);
- }
- else {
- ln = isALNUM_LC(ln);
- n = isALNUM_LC(nextchr);
- }
- if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL))
- sayNO;
- break;
- case BOUNDLUTF8:
- case NBOUNDLUTF8:
- PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
- case BOUNDUTF8:
- case NBOUNDUTF8:
- /* was last char in word? */
- if (locinput == PL_regbol)
- ln = PL_regprev;
- else {
- U8 *r = reghop((U8*)locinput, -1);
-
- ln = utf8_to_uv(r, s - (char*)r, 0, 0);
- }
- if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
- ln = isALNUM_uni(ln);
- n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
+ if (do_utf8) {
+ if (locinput == PL_regbol)
+ ln = PL_regprev;
+ else {
+ U8 *r = reghop((U8*)locinput, -1);
+
+ ln = utf8_to_uv(r, s - (char*)r, 0, 0);
+ }
+ if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+ ln = isALNUM_uni(ln);
+ n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
+ }
+ else {
+ ln = isALNUM_LC_uni(ln);
+ n = isALNUM_LC_utf8((U8*)locinput);
+ }
}
else {
- ln = isALNUM_LC_uni(ln);
- n = isALNUM_LC_utf8((U8*)locinput);
+ ln = (locinput != PL_regbol) ?
+ UCHARAT(locinput - 1) : PL_regprev;
+ if (OP(scan) == BOUND || OP(scan) == NBOUND) {
+ ln = isALNUM(ln);
+ n = isALNUM(nextchr);
+ }
+ else {
+ ln = isALNUM_LC(ln);
+ n = isALNUM_LC(nextchr);
+ }
}
- if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8))
- sayNO;
+ if (((!ln) == (!n)) == (OP(scan) == BOUND ||
+ OP(scan) == BOUNDL))
+ sayNO;
break;
case SPACEL:
PL_reg_flags |= RF_tainted;
@@ -2232,32 +2200,29 @@ S_regmatch(pTHX_ regnode *prog)
case SPACE:
if (!nextchr)
sayNO;
- if (!(OP(scan) == SPACE
- ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
- sayNO;
- nextchr = UCHARAT(++locinput);
- break;
- case SPACELUTF8:
- PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
- case SPACEUTF8:
- if (!nextchr)
- sayNO;
- if (nextchr & 0x80) {
- if (!(OP(scan) == SPACEUTF8
- ? swash_fetch(PL_utf8_space, (U8*)locinput)
- : isSPACE_LC_utf8((U8*)locinput)))
- {
- sayNO;
+ if (DO_UTF8(PL_reg_sv)) {
+ if (nextchr & 0x80) {
+ if (!(OP(scan) == SPACE
+ ? swash_fetch(PL_utf8_space, (U8*)locinput)
+ : isSPACE_LC_utf8((U8*)locinput)))
+ {
+ sayNO;
+ }
+ locinput += PL_utf8skip[nextchr];
+ nextchr = UCHARAT(locinput);
+ break;
}
- locinput += PL_utf8skip[nextchr];
- nextchr = UCHARAT(locinput);
- break;
+ if (!(OP(scan) == SPACE
+ ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
+ }
+ else {
+ if (!(OP(scan) == SPACE
+ ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
+ sayNO;
+ nextchr = UCHARAT(++locinput);
}
- if (!(OP(scan) == SPACEUTF8
- ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
- sayNO;
- nextchr = UCHARAT(++locinput);
break;
case NSPACEL:
PL_reg_flags |= RF_tainted;
@@ -2265,19 +2230,8 @@ S_regmatch(pTHX_ regnode *prog)
case NSPACE:
if (!nextchr && locinput >= PL_regeol)
sayNO;
- if (OP(scan) == NSPACE
- ? isSPACE(nextchr) : isSPACE_LC(nextchr))
- sayNO;
- nextchr = UCHARAT(++locinput);
- break;
- case NSPACELUTF8:
- PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
- case NSPACEUTF8:
- if (!nextchr && locinput >= PL_regeol)
- sayNO;
- if (nextchr & 0x80) {
- if (OP(scan) == NSPACEUTF8
+ if (DO_UTF8(PL_reg_sv)) {
+ if (OP(scan) == NSPACE
? swash_fetch(PL_utf8_space, (U8*)locinput)
: isSPACE_LC_utf8((U8*)locinput))
{
@@ -2287,7 +2241,7 @@ S_regmatch(pTHX_ regnode *prog)
nextchr = UCHARAT(locinput);
break;
}
- if (OP(scan) == NSPACEUTF8
+ if (OP(scan) == NSPACE
? isSPACE(nextchr) : isSPACE_LC(nextchr))
sayNO;
nextchr = UCHARAT(++locinput);
@@ -2298,19 +2252,8 @@ S_regmatch(pTHX_ regnode *prog)
case DIGIT:
if (!nextchr)
sayNO;
- if (!(OP(scan) == DIGIT
- ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
- sayNO;
- nextchr = UCHARAT(++locinput);
- break;
- case DIGITLUTF8:
- PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
- case DIGITUTF8:
- if (!nextchr)
- sayNO;
- if (nextchr & 0x80) {
- if (!(OP(scan) == DIGITUTF8
+ if (DO_UTF8(PL_reg_sv)) {
+ if (!(OP(scan) == DIGIT
? swash_fetch(PL_utf8_digit, (U8*)locinput)
: isDIGIT_LC_utf8((U8*)locinput)))
{
@@ -2320,7 +2263,7 @@ S_regmatch(pTHX_ regnode *prog)
nextchr = UCHARAT(locinput);
break;
}
- if (!(OP(scan) == DIGITUTF8
+ if (!(OP(scan) == DIGIT
? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
sayNO;
nextchr = UCHARAT(++locinput);
@@ -2331,19 +2274,8 @@ S_regmatch(pTHX_ regnode *prog)
case NDIGIT:
if (!nextchr && locinput >= PL_regeol)
sayNO;
- if (OP(scan) == NDIGIT
- ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
- sayNO;
- nextchr = UCHARAT(++locinput);
- break;
- case NDIGITLUTF8:
- PL_reg_flags |= RF_tainted;
- /* FALL THROUGH */
- case NDIGITUTF8:
- if (!nextchr && locinput >= PL_regeol)
- sayNO;
- if (nextchr & 0x80) {
- if (OP(scan) == NDIGITUTF8
+ if (DO_UTF8(PL_reg_sv)) {
+ if (OP(scan) == NDIGIT
? swash_fetch(PL_utf8_digit, (U8*)locinput)
: isDIGIT_LC_utf8((U8*)locinput))
{
@@ -2353,7 +2285,7 @@ S_regmatch(pTHX_ regnode *prog)
nextchr = UCHARAT(locinput);
break;
}
- if (OP(scan) == NDIGITUTF8
+ if (OP(scan) == NDIGIT
? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
sayNO;
nextchr = UCHARAT(++locinput);
@@ -3461,30 +3393,33 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
register I32 c;
register char *loceol = PL_regeol;
register I32 hardcount = 0;
+ register bool do_utf8 = DO_UTF8(PL_reg_sv);
scan = PL_reginput;
if (max != REG_INFTY && max < loceol - scan)
loceol = scan + max;
switch (OP(p)) {
case REG_ANY:
- while (scan < loceol && *scan != '\n')
- scan++;
- break;
- case SANY:
- scan = loceol;
- break;
- case ANYUTF8:
- loceol = PL_regeol;
- while (scan < loceol && *scan != '\n') {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol && *scan != '\n') {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && *scan != '\n')
+ scan++;
}
break;
- case SANYUTF8:
- loceol = PL_regeol;
- while (scan < loceol) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ case SANY:
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ scan = loceol;
}
break;
case EXACT: /* length of string is 1 */
@@ -3505,135 +3440,144 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
(UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
scan++;
break;
- case ANYOFUTF8:
- loceol = PL_regeol;
- while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- break;
case ANYOF:
- while (scan < loceol && REGINCLASS(p, *scan))
- scan++;
+ if (do_utf8) {
+ loceol = PL_regeol;
+ while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && reginclass(p, (U8*)scan, do_utf8))
+ scan++;
+ }
break;
case ALNUM:
- while (scan < loceol && isALNUM(*scan))
- scan++;
- break;
- case ALNUMUTF8:
- loceol = PL_regeol;
- while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && isALNUM(*scan))
+ scan++;
}
break;
case ALNUML:
PL_reg_flags |= RF_tainted;
- while (scan < loceol && isALNUM_LC(*scan))
- scan++;
- break;
- case ALNUMLUTF8:
- PL_reg_flags |= RF_tainted;
- loceol = PL_regeol;
- while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && isALNUM_LC(*scan))
+ scan++;
}
break;
- break;
case NALNUM:
- while (scan < loceol && !isALNUM(*scan))
- scan++;
- break;
- case NALNUMUTF8:
- loceol = PL_regeol;
- while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !isALNUM(*scan))
+ scan++;
}
break;
case NALNUML:
PL_reg_flags |= RF_tainted;
- while (scan < loceol && !isALNUM_LC(*scan))
- scan++;
- break;
- case NALNUMLUTF8:
- PL_reg_flags |= RF_tainted;
- loceol = PL_regeol;
- while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !isALNUM_LC(*scan))
+ scan++;
}
break;
case SPACE:
- while (scan < loceol && isSPACE(*scan))
- scan++;
- break;
- case SPACEUTF8:
- loceol = PL_regeol;
- while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol &&
+ (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && isSPACE(*scan))
+ scan++;
}
break;
case SPACEL:
PL_reg_flags |= RF_tainted;
- while (scan < loceol && isSPACE_LC(*scan))
- scan++;
- break;
- case SPACELUTF8:
- PL_reg_flags |= RF_tainted;
- loceol = PL_regeol;
- while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol &&
+ (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && isSPACE_LC(*scan))
+ scan++;
}
break;
case NSPACE:
- while (scan < loceol && !isSPACE(*scan))
- scan++;
- break;
- case NSPACEUTF8:
- loceol = PL_regeol;
- while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol &&
+ !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !isSPACE(*scan))
+ scan++;
+ break;
}
- break;
case NSPACEL:
PL_reg_flags |= RF_tainted;
- while (scan < loceol && !isSPACE_LC(*scan))
- scan++;
- break;
- case NSPACELUTF8:
- PL_reg_flags |= RF_tainted;
- loceol = PL_regeol;
- while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol &&
+ !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !isSPACE_LC(*scan))
+ scan++;
}
break;
case DIGIT:
- while (scan < loceol && isDIGIT(*scan))
- scan++;
- break;
- case DIGITUTF8:
- loceol = PL_regeol;
- while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && isDIGIT(*scan))
+ scan++;
}
break;
- break;
case NDIGIT:
- while (scan < loceol && !isDIGIT(*scan))
- scan++;
- break;
- case NDIGITUTF8:
- loceol = PL_regeol;
- while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
- scan += UTF8SKIP(scan);
- hardcount++;
+ if (DO_UTF8(PL_reg_sv)) {
+ loceol = PL_regeol;
+ while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ } else {
+ while (scan < loceol && !isDIGIT(*scan))
+ scan++;
}
break;
default: /* Called on something of 0 width. */
@@ -3712,102 +3656,139 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
}
/*
+- regclass_swash - prepare the utf8 swash
+*/
+
+SV *
+Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp)
+{
+ SV *sw = NULL;
+ SV *si = NULL;
+
+ if (PL_regdata && PL_regdata->count) {
+ U32 n = ARG(node);
+
+ if (PL_regdata->what[n] == 's') {
+ SV *rv = (SV*)PL_regdata->data[n];
+ AV *av = (AV*)SvRV((SV*)rv);
+ SV **a;
+
+ si = *av_fetch(av, 0, FALSE);
+ a = av_fetch(av, 1, FALSE);
+
+ if (a)
+ sw = *a;
+ else if (si && doinit) {
+ sw = swash_init("utf8", "", si, 1, 0);
+ (void)av_store(av, 1, sw);
+ }
+ }
+ }
+
+ if (initsvp)
+ *initsvp = si;
+
+ return sw;
+}
+
+/*
- reginclass - determine if a character falls into a character class
*/
STATIC bool
-S_reginclass(pTHX_ register regnode *p, register I32 c)
+S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
{
- char flags = ANYOF_FLAGS(p);
+ char flags = ANYOF_FLAGS(n);
bool match = FALSE;
- c &= 0xFF;
- if (ANYOF_BITMAP_TEST(p, c))
- match = TRUE;
- else if (flags & ANYOF_FOLD) {
- I32 cf;
- if (flags & ANYOF_LOCALE) {
- PL_reg_flags |= RF_tainted;
- cf = PL_fold_locale[c];
+ if (do_utf8 || (flags & ANYOF_UNICODE)) {
+ if (do_utf8 && !ANYOF_RUNTIME(n)) {
+ STRLEN len;
+ UV c = utf8_to_uv_simple(p, &len);
+
+ if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
+ match = TRUE;
}
- else
- cf = PL_fold[c];
- if (ANYOF_BITMAP_TEST(p, cf))
- match = TRUE;
- }
- if (!match && (flags & ANYOF_CLASS)) {
- PL_reg_flags |= RF_tainted;
- if (
- (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC) && isPSXSPC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_BLANK) && isBLANK(c)) ||
- (ANYOF_CLASS_TEST(p, ANYOF_NBLANK) && !isBLANK(c))
- ) /* How's that for a conditional? */
- {
- match = TRUE;
+ if (!match) {
+ SV *sw = regclass_swash(n, TRUE, 0);
+
+ if (sw) {
+ if (swash_fetch(sw, p))
+ match = TRUE;
+ else if (flags & ANYOF_FOLD) {
+ U8 tmpbuf[UTF8_MAXLEN+1];
+
+ if (flags & ANYOF_LOCALE) {
+ PL_reg_flags |= RF_tainted;
+ uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
+ }
+ else
+ uv_to_utf8(tmpbuf, toLOWER_utf8(p));
+ if (swash_fetch(sw, tmpbuf))
+ match = TRUE;
+ }
+ }
}
}
+ else {
+ U8 c = *p;
- return (flags & ANYOF_INVERT) ? !match : match;
-}
-
-STATIC bool
-S_reginclassutf8(pTHX_ regnode *f, U8 *p)
-{
- char flags = ARG1(f);
- bool match = FALSE;
-#ifdef DEBUGGING
- SV *rv = (SV*)PL_regdata->data[ARG2(f)];
- AV *av = (AV*)SvRV((SV*)rv);
- SV *sw = *av_fetch(av, 0, FALSE);
- SV *lv = *av_fetch(av, 1, FALSE);
-#else
- SV *sw = (SV*)PL_regdata->data[ARG2(f)];
-#endif
+ if (ANYOF_BITMAP_TEST(n, c))
+ match = TRUE;
+ else if (flags & ANYOF_FOLD) {
+ I32 f;
- if (swash_fetch(sw, p))
- match = TRUE;
- else if (flags & ANYOF_FOLD) {
- U8 tmpbuf[UTF8_MAXLEN+1];
- if (flags & ANYOF_LOCALE) {
+ if (flags & ANYOF_LOCALE) {
+ PL_reg_flags |= RF_tainted;
+ f = PL_fold_locale[c];
+ }
+ else
+ f = PL_fold[c];
+ if (f != c && ANYOF_BITMAP_TEST(n, f))
+ match = TRUE;
+ }
+
+ if (!match && (flags & ANYOF_CLASS)) {
PL_reg_flags |= RF_tainted;
- uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
+ if (
+ (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
+ (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
+ ) /* How's that for a conditional? */
+ {
+ match = TRUE;
+ }
}
- else
- uv_to_utf8(tmpbuf, toLOWER_utf8(p));
- if (swash_fetch(sw, tmpbuf))
- match = TRUE;
}
- /* UTF8 combined with ANYOF_CLASS is ill-defined. */
-
return (flags & ANYOF_INVERT) ? !match : match;
}
@@ -3815,17 +3796,20 @@ STATIC U8 *
S_reghop(pTHX_ U8 *s, I32 off)
{
if (off >= 0) {
- while (off-- && s < (U8*)PL_regeol)
+ while (off-- && s < (U8*)PL_regeol) {
+ /* XXX could check well-formedness here */
s += UTF8SKIP(s);
+ }
}
else {
while (off++) {
if (s > (U8*)PL_bostr) {
s--;
- if (*s & 0x80) {
- while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
+ if (UTF8_IS_CONTINUED(*s)) {
+ while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
s--;
- } /* XXX could check well-formedness here */
+ }
+ /* XXX could check well-formedness here */
}
}
}
@@ -3836,8 +3820,10 @@ STATIC U8 *
S_reghopmaybe(pTHX_ U8* s, I32 off)
{
if (off >= 0) {
- while (off-- && s < (U8*)PL_regeol)
+ while (off-- && s < (U8*)PL_regeol) {
+ /* XXX could check well-formedness here */
s += UTF8SKIP(s);
+ }
if (off >= 0)
return 0;
}
@@ -3845,10 +3831,11 @@ S_reghopmaybe(pTHX_ U8* s, I32 off)
while (off++) {
if (s > (U8*)PL_bostr) {
s--;
- if (*s & 0x80) {
- while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80)
+ if (UTF8_IS_CONTINUED(*s)) {
+ while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
s--;
- } /* XXX could check well-formedness here */
+ }
+ /* XXX could check well-formedness here */
}
else
break;
diff --git a/regnodes.h b/regnodes.h
index 89c78e6bac..00dc0ecaec 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -13,76 +13,57 @@
#define MEOL 7 /* 0x7 Same, assuming multiline. */
#define SEOL 8 /* 0x8 Same, assuming singleline. */
#define BOUND 9 /* 0x9 Match "" at any word boundary */
-#define BOUNDUTF8 10 /* 0xa Match "" at any word boundary */
-#define BOUNDL 11 /* 0xb Match "" at any word boundary */
-#define BOUNDLUTF8 12 /* 0xc Match "" at any word boundary */
-#define NBOUND 13 /* 0xd Match "" at any word non-boundary */
-#define NBOUNDUTF8 14 /* 0xe Match "" at any word non-boundary */
-#define NBOUNDL 15 /* 0xf Match "" at any word non-boundary */
-#define NBOUNDLUTF8 16 /* 0x10 Match "" at any word non-boundary */
-#define GPOS 17 /* 0x11 Matches where last m//g left off. */
-#define REG_ANY 18 /* 0x12 Match any one character (except newline). */
-#define ANYUTF8 19 /* 0x13 Match any one Unicode character (except newline). */
-#define SANY 20 /* 0x14 Match any one character. */
-#define SANYUTF8 21 /* 0x15 Match any one Unicode character. */
-#define ANYOF 22 /* 0x16 Match character in (or not in) this class. */
-#define ANYOFUTF8 23 /* 0x17 Match character in (or not in) this class. */
-#define ALNUM 24 /* 0x18 Match any alphanumeric character */
-#define ALNUMUTF8 25 /* 0x19 Match any alphanumeric character in utf8 */
-#define ALNUML 26 /* 0x1a Match any alphanumeric char in locale */
-#define ALNUMLUTF8 27 /* 0x1b Match any alphanumeric char in locale+utf8 */
-#define NALNUM 28 /* 0x1c Match any non-alphanumeric character */
-#define NALNUMUTF8 29 /* 0x1d Match any non-alphanumeric character in utf8 */
-#define NALNUML 30 /* 0x1e Match any non-alphanumeric char in locale */
-#define NALNUMLUTF8 31 /* 0x1f Match any non-alphanumeric char in locale+utf8 */
-#define SPACE 32 /* 0x20 Match any whitespace character */
-#define SPACEUTF8 33 /* 0x21 Match any whitespace character in utf8 */
-#define SPACEL 34 /* 0x22 Match any whitespace char in locale */
-#define SPACELUTF8 35 /* 0x23 Match any whitespace char in locale+utf8 */
-#define NSPACE 36 /* 0x24 Match any non-whitespace character */
-#define NSPACEUTF8 37 /* 0x25 Match any non-whitespace character in utf8 */
-#define NSPACEL 38 /* 0x26 Match any non-whitespace char in locale */
-#define NSPACELUTF8 39 /* 0x27 Match any non-whitespace char in locale+utf8 */
-#define DIGIT 40 /* 0x28 Match any numeric character */
-#define DIGITUTF8 41 /* 0x29 Match any numeric character in utf8 */
-#define DIGITL 42 /* 0x2a Match any numeric character in locale */
-#define DIGITLUTF8 43 /* 0x2b Match any numeric character in locale+utf8 */
-#define NDIGIT 44 /* 0x2c Match any non-numeric character */
-#define NDIGITUTF8 45 /* 0x2d Match any non-numeric character in utf8 */
-#define NDIGITL 46 /* 0x2e Match any non-numeric character in locale */
-#define NDIGITLUTF8 47 /* 0x2f Match any non-numeric character in locale+utf8 */
-#define CLUMP 48 /* 0x30 Match any combining character sequence */
-#define BRANCH 49 /* 0x31 Match this alternative, or the next... */
-#define BACK 50 /* 0x32 Match "", "next" ptr points backward. */
-#define EXACT 51 /* 0x33 Match this string (preceded by length). */
-#define EXACTF 52 /* 0x34 Match this string, folded (prec. by length). */
-#define EXACTFL 53 /* 0x35 Match this string, folded in locale (w/len). */
-#define NOTHING 54 /* 0x36 Match empty string. */
-#define TAIL 55 /* 0x37 Match empty string. Can jump here from outside. */
-#define STAR 56 /* 0x38 Match this (simple) thing 0 or more times. */
-#define PLUS 57 /* 0x39 Match this (simple) thing 1 or more times. */
-#define CURLY 58 /* 0x3a Match this simple thing {n,m} times. */
-#define CURLYN 59 /* 0x3b Match next-after-this simple thing */
-#define CURLYM 60 /* 0x3c Match this medium-complex thing {n,m} times. */
-#define CURLYX 61 /* 0x3d Match this complex thing {n,m} times. */
-#define WHILEM 62 /* 0x3e Do curly processing and see if rest matches. */
-#define OPEN 63 /* 0x3f Mark this point in input as start of #n. */
-#define CLOSE 64 /* 0x40 Analogous to OPEN. */
-#define REF 65 /* 0x41 Match some already matched string */
-#define REFF 66 /* 0x42 Match already matched string, folded */
-#define REFFL 67 /* 0x43 Match already matched string, folded in loc. */
-#define IFMATCH 68 /* 0x44 Succeeds if the following matches. */
-#define UNLESSM 69 /* 0x45 Fails if the following matches. */
-#define SUSPEND 70 /* 0x46 "Independent" sub-RE. */
-#define IFTHEN 71 /* 0x47 Switch, should be preceeded by switcher . */
-#define GROUPP 72 /* 0x48 Whether the group matched. */
-#define LONGJMP 73 /* 0x49 Jump far away. */
-#define BRANCHJ 74 /* 0x4a BRANCH with long offset. */
-#define EVAL 75 /* 0x4b Execute some Perl code. */
-#define MINMOD 76 /* 0x4c Next operator is not greedy. */
-#define LOGICAL 77 /* 0x4d Next opcode should set the flag only. */
-#define RENUM 78 /* 0x4e Group with independently numbered parens. */
-#define OPTIMIZED 79 /* 0x4f Placeholder for dump. */
+#define BOUNDL 10 /* 0xa Match "" at any word boundary */
+#define NBOUND 11 /* 0xb Match "" at any word non-boundary */
+#define NBOUNDL 12 /* 0xc Match "" at any word non-boundary */
+#define GPOS 13 /* 0xd Matches where last m//g left off. */
+#define REG_ANY 14 /* 0xe Match any one character (except newline). */
+#define SANY 15 /* 0xf Match any one character. */
+#define ANYOF 16 /* 0x10 Match character in (or not in) this class. */
+#define ALNUM 17 /* 0x11 Match any alphanumeric character */
+#define ALNUML 18 /* 0x12 Match any alphanumeric char in locale */
+#define NALNUM 19 /* 0x13 Match any non-alphanumeric character */
+#define NALNUML 20 /* 0x14 Match any non-alphanumeric char in locale */
+#define SPACE 21 /* 0x15 Match any whitespace character */
+#define SPACEL 22 /* 0x16 Match any whitespace char in locale */
+#define NSPACE 23 /* 0x17 Match any non-whitespace character */
+#define NSPACEL 24 /* 0x18 Match any non-whitespace char in locale */
+#define DIGIT 25 /* 0x19 Match any numeric character */
+#define DIGITL 26 /* 0x1a Match any numeric character in locale */
+#define NDIGIT 27 /* 0x1b Match any non-numeric character */
+#define NDIGITL 28 /* 0x1c Match any non-numeric character in locale */
+#define CLUMP 29 /* 0x1d Match any combining character sequence */
+#define BRANCH 30 /* 0x1e Match this alternative, or the next... */
+#define BACK 31 /* 0x1f Match "", "next" ptr points backward. */
+#define EXACT 32 /* 0x20 Match this string (preceded by length). */
+#define EXACTF 33 /* 0x21 Match this string, folded (prec. by length). */
+#define EXACTFL 34 /* 0x22 Match this string, folded in locale (w/len). */
+#define NOTHING 35 /* 0x23 Match empty string. */
+#define TAIL 36 /* 0x24 Match empty string. Can jump here from outside. */
+#define STAR 37 /* 0x25 Match this (simple) thing 0 or more times. */
+#define PLUS 38 /* 0x26 Match this (simple) thing 1 or more times. */
+#define CURLY 39 /* 0x27 Match this simple thing {n,m} times. */
+#define CURLYN 40 /* 0x28 Match next-after-this simple thing */
+#define CURLYM 41 /* 0x29 Match this medium-complex thing {n,m} times. */
+#define CURLYX 42 /* 0x2a Match this complex thing {n,m} times. */
+#define WHILEM 43 /* 0x2b Do curly processing and see if rest matches. */
+#define OPEN 44 /* 0x2c Mark this point in input as start of #n. */
+#define CLOSE 45 /* 0x2d Analogous to OPEN. */
+#define REF 46 /* 0x2e Match some already matched string */
+#define REFF 47 /* 0x2f Match already matched string, folded */
+#define REFFL 48 /* 0x30 Match already matched string, folded in loc. */
+#define IFMATCH 49 /* 0x31 Succeeds if the following matches. */
+#define UNLESSM 50 /* 0x32 Fails if the following matches. */
+#define SUSPEND 51 /* 0x33 "Independent" sub-RE. */
+#define IFTHEN 52 /* 0x34 Switch, should be preceeded by switcher . */
+#define GROUPP 53 /* 0x35 Whether the group matched. */
+#define LONGJMP 54 /* 0x36 Jump far away. */
+#define BRANCHJ 55 /* 0x37 BRANCH with long offset. */
+#define EVAL 56 /* 0x38 Execute some Perl code. */
+#define MINMOD 57 /* 0x39 Next operator is not greedy. */
+#define LOGICAL 58 /* 0x3a Next opcode should set the flag only. */
+#define RENUM 59 /* 0x3b Group with independently numbered parens. */
+#define OPTIMIZED 60 /* 0x3c Placeholder for dump. */
#ifndef DOINIT
EXTCONST U8 PL_regkind[];
@@ -98,44 +79,25 @@ EXTCONST U8 PL_regkind[] = {
EOL, /* MEOL */
EOL, /* SEOL */
BOUND, /* BOUND */
- BOUND, /* BOUNDUTF8 */
BOUND, /* BOUNDL */
- BOUND, /* BOUNDLUTF8 */
NBOUND, /* NBOUND */
- NBOUND, /* NBOUNDUTF8 */
NBOUND, /* NBOUNDL */
- NBOUND, /* NBOUNDLUTF8 */
GPOS, /* GPOS */
REG_ANY, /* REG_ANY */
- REG_ANY, /* ANYUTF8 */
REG_ANY, /* SANY */
- REG_ANY, /* SANYUTF8 */
ANYOF, /* ANYOF */
- ANYOF, /* ANYOFUTF8 */
ALNUM, /* ALNUM */
- ALNUM, /* ALNUMUTF8 */
ALNUM, /* ALNUML */
- ALNUM, /* ALNUMLUTF8 */
NALNUM, /* NALNUM */
- NALNUM, /* NALNUMUTF8 */
NALNUM, /* NALNUML */
- NALNUM, /* NALNUMLUTF8 */
SPACE, /* SPACE */
- SPACE, /* SPACEUTF8 */
SPACE, /* SPACEL */
- SPACE, /* SPACELUTF8 */
NSPACE, /* NSPACE */
- NSPACE, /* NSPACEUTF8 */
NSPACE, /* NSPACEL */
- NSPACE, /* NSPACELUTF8 */
DIGIT, /* DIGIT */
- DIGIT, /* DIGITUTF8 */
DIGIT, /* DIGITL */
- DIGIT, /* DIGITLUTF8 */
NDIGIT, /* NDIGIT */
- NDIGIT, /* NDIGITUTF8 */
NDIGIT, /* NDIGITL */
- NDIGIT, /* NDIGITLUTF8 */
CLUMP, /* CLUMP */
BRANCH, /* BRANCH */
BACK, /* BACK */
@@ -184,44 +146,25 @@ static const U8 regarglen[] = {
0, /* MEOL */
0, /* SEOL */
0, /* BOUND */
- 0, /* BOUNDUTF8 */
0, /* BOUNDL */
- 0, /* BOUNDLUTF8 */
0, /* NBOUND */
- 0, /* NBOUNDUTF8 */
0, /* NBOUNDL */
- 0, /* NBOUNDLUTF8 */
0, /* GPOS */
0, /* REG_ANY */
- 0, /* ANYUTF8 */
0, /* SANY */
- 0, /* SANYUTF8 */
0, /* ANYOF */
- EXTRA_SIZE(struct regnode_1), /* ANYOFUTF8 */
0, /* ALNUM */
- 0, /* ALNUMUTF8 */
0, /* ALNUML */
- 0, /* ALNUMLUTF8 */
0, /* NALNUM */
- 0, /* NALNUMUTF8 */
0, /* NALNUML */
- 0, /* NALNUMLUTF8 */
0, /* SPACE */
- 0, /* SPACEUTF8 */
0, /* SPACEL */
- 0, /* SPACELUTF8 */
0, /* NSPACE */
- 0, /* NSPACEUTF8 */
0, /* NSPACEL */
- 0, /* NSPACELUTF8 */
0, /* DIGIT */
- 0, /* DIGITUTF8 */
0, /* DIGITL */
- 0, /* DIGITLUTF8 */
0, /* NDIGIT */
- 0, /* NDIGITUTF8 */
0, /* NDIGITL */
- 0, /* NDIGITLUTF8 */
0, /* CLUMP */
0, /* BRANCH */
0, /* BACK */
@@ -267,44 +210,25 @@ static const char reg_off_by_arg[] = {
0, /* MEOL */
0, /* SEOL */
0, /* BOUND */
- 0, /* BOUNDUTF8 */
0, /* BOUNDL */
- 0, /* BOUNDLUTF8 */
0, /* NBOUND */
- 0, /* NBOUNDUTF8 */
0, /* NBOUNDL */
- 0, /* NBOUNDLUTF8 */
0, /* GPOS */
0, /* REG_ANY */
- 0, /* ANYUTF8 */
0, /* SANY */
- 0, /* SANYUTF8 */
0, /* ANYOF */
- 0, /* ANYOFUTF8 */
0, /* ALNUM */
- 0, /* ALNUMUTF8 */
0, /* ALNUML */
- 0, /* ALNUMLUTF8 */
0, /* NALNUM */
- 0, /* NALNUMUTF8 */
0, /* NALNUML */
- 0, /* NALNUMLUTF8 */
0, /* SPACE */
- 0, /* SPACEUTF8 */
0, /* SPACEL */
- 0, /* SPACELUTF8 */
0, /* NSPACE */
- 0, /* NSPACEUTF8 */
0, /* NSPACEL */
- 0, /* NSPACELUTF8 */
0, /* DIGIT */
- 0, /* DIGITUTF8 */
0, /* DIGITL */
- 0, /* DIGITLUTF8 */
0, /* NDIGIT */
- 0, /* NDIGITUTF8 */
0, /* NDIGITL */
- 0, /* NDIGITLUTF8 */
0, /* CLUMP */
0, /* BRANCH */
0, /* BACK */
@@ -351,79 +275,60 @@ static const char * const reg_name[] = {
"MEOL", /* 0x7 */
"SEOL", /* 0x8 */
"BOUND", /* 0x9 */
- "BOUNDUTF8", /* 0xa */
- "BOUNDL", /* 0xb */
- "BOUNDLUTF8", /* 0xc */
- "NBOUND", /* 0xd */
- "NBOUNDUTF8", /* 0xe */
- "NBOUNDL", /* 0xf */
- "NBOUNDLUTF8", /* 0x10 */
- "GPOS", /* 0x11 */
- "REG_ANY", /* 0x12 */
- "ANYUTF8", /* 0x13 */
- "SANY", /* 0x14 */
- "SANYUTF8", /* 0x15 */
- "ANYOF", /* 0x16 */
- "ANYOFUTF8", /* 0x17 */
- "ALNUM", /* 0x18 */
- "ALNUMUTF8", /* 0x19 */
- "ALNUML", /* 0x1a */
- "ALNUMLUTF8", /* 0x1b */
- "NALNUM", /* 0x1c */
- "NALNUMUTF8", /* 0x1d */
- "NALNUML", /* 0x1e */
- "NALNUMLUTF8", /* 0x1f */
- "SPACE", /* 0x20 */
- "SPACEUTF8", /* 0x21 */
- "SPACEL", /* 0x22 */
- "SPACELUTF8", /* 0x23 */
- "NSPACE", /* 0x24 */
- "NSPACEUTF8", /* 0x25 */
- "NSPACEL", /* 0x26 */
- "NSPACELUTF8", /* 0x27 */
- "DIGIT", /* 0x28 */
- "DIGITUTF8", /* 0x29 */
- "DIGITL", /* 0x2a */
- "DIGITLUTF8", /* 0x2b */
- "NDIGIT", /* 0x2c */
- "NDIGITUTF8", /* 0x2d */
- "NDIGITL", /* 0x2e */
- "NDIGITLUTF8", /* 0x2f */
- "CLUMP", /* 0x30 */
- "BRANCH", /* 0x31 */
- "BACK", /* 0x32 */
- "EXACT", /* 0x33 */
- "EXACTF", /* 0x34 */
- "EXACTFL", /* 0x35 */
- "NOTHING", /* 0x36 */
- "TAIL", /* 0x37 */
- "STAR", /* 0x38 */
- "PLUS", /* 0x39 */
- "CURLY", /* 0x3a */
- "CURLYN", /* 0x3b */
- "CURLYM", /* 0x3c */
- "CURLYX", /* 0x3d */
- "WHILEM", /* 0x3e */
- "OPEN", /* 0x3f */
- "CLOSE", /* 0x40 */
- "REF", /* 0x41 */
- "REFF", /* 0x42 */
- "REFFL", /* 0x43 */
- "IFMATCH", /* 0x44 */
- "UNLESSM", /* 0x45 */
- "SUSPEND", /* 0x46 */
- "IFTHEN", /* 0x47 */
- "GROUPP", /* 0x48 */
- "LONGJMP", /* 0x49 */
- "BRANCHJ", /* 0x4a */
- "EVAL", /* 0x4b */
- "MINMOD", /* 0x4c */
- "LOGICAL", /* 0x4d */
- "RENUM", /* 0x4e */
- "OPTIMIZED", /* 0x4f */
+ "BOUNDL", /* 0xa */
+ "NBOUND", /* 0xb */
+ "NBOUNDL", /* 0xc */
+ "GPOS", /* 0xd */
+ "REG_ANY", /* 0xe */
+ "SANY", /* 0xf */
+ "ANYOF", /* 0x10 */
+ "ALNUM", /* 0x11 */
+ "ALNUML", /* 0x12 */
+ "NALNUM", /* 0x13 */
+ "NALNUML", /* 0x14 */
+ "SPACE", /* 0x15 */
+ "SPACEL", /* 0x16 */
+ "NSPACE", /* 0x17 */
+ "NSPACEL", /* 0x18 */
+ "DIGIT", /* 0x19 */
+ "DIGITL", /* 0x1a */
+ "NDIGIT", /* 0x1b */
+ "NDIGITL", /* 0x1c */
+ "CLUMP", /* 0x1d */
+ "BRANCH", /* 0x1e */
+ "BACK", /* 0x1f */
+ "EXACT", /* 0x20 */
+ "EXACTF", /* 0x21 */
+ "EXACTFL", /* 0x22 */
+ "NOTHING", /* 0x23 */
+ "TAIL", /* 0x24 */
+ "STAR", /* 0x25 */
+ "PLUS", /* 0x26 */
+ "CURLY", /* 0x27 */
+ "CURLYN", /* 0x28 */
+ "CURLYM", /* 0x29 */
+ "CURLYX", /* 0x2a */
+ "WHILEM", /* 0x2b */
+ "OPEN", /* 0x2c */
+ "CLOSE", /* 0x2d */
+ "REF", /* 0x2e */
+ "REFF", /* 0x2f */
+ "REFFL", /* 0x30 */
+ "IFMATCH", /* 0x31 */
+ "UNLESSM", /* 0x32 */
+ "SUSPEND", /* 0x33 */
+ "IFTHEN", /* 0x34 */
+ "GROUPP", /* 0x35 */
+ "LONGJMP", /* 0x36 */
+ "BRANCHJ", /* 0x37 */
+ "EVAL", /* 0x38 */
+ "MINMOD", /* 0x39 */
+ "LOGICAL", /* 0x3a */
+ "RENUM", /* 0x3b */
+ "OPTIMIZED", /* 0x3c */
};
-static const int reg_num = 80;
+static const int reg_num = 61;
#endif /* DEBUGGING */
#endif /* REG_COMP_C */
diff --git a/sv.c b/sv.c
index 1dafbf6ceb..1fbf83fb2a 100644
--- a/sv.c
+++ b/sv.c
@@ -4522,11 +4522,9 @@ Perl_sv_len_utf8(pTHX_ register SV *sv)
if (!sv)
return 0;
-#ifdef NOTYET
if (SvGMAGICAL(sv))
return mg_length(sv);
else
-#endif
{
STRLEN len;
U8 *s = (U8*)SvPV(sv, len);
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t
index ac42b85577..cd9d56a5c4 100644
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -5,6 +5,8 @@ BEGIN {
@INC = '../lib';
}
+no utf8; # this test contains raw 8-bit data on purpose; don't switch to \x{}
+
print "1..78\n";
my $test = 1;
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t
index 6986720aab..89416dcfab 100755
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
}
}
-print "1..90\n";
+print "1..104\n";
my $test = 1;
@@ -42,6 +42,7 @@ sub nok_bytes {
{
use utf8;
+
$_ = ">\x{263A}<";
s/([\x{80}-\x{10ffff}])/"&#".ord($1).";"/eg;
ok $_, '>&#9786;<';
@@ -106,212 +107,191 @@ sub nok_bytes {
}
{
- use utf8;
-
- $_ = "\x{263A}>\x{263A}\x{263A}";
-
- ok length, 4;
- $test++; # 13
-
- ok length((m/>(.)/)[0]), 1;
- $test++; # 14
-
- ok length($&), 2;
- $test++; # 15
+ # no use utf8 needed
+ $_ = "\x{263A}\x{263A}x\x{263A}y\x{263A}";
+
+ ok length($_), 6; # 13
+ $test++;
- ok length($'), 1;
- $test++; # 16
+ ($a) = m/x(.)/;
- ok length($`), 1;
- $test++; # 17
+ ok length($a), 1; # 14
+ $test++;
- ok length($1), 1;
- $test++; # 18
+ ok length($`), 2; # 15
+ $test++;
+ ok length($&), 2; # 16
+ $test++;
+ ok length($'), 2; # 17
+ $test++;
- ok length($tmp=$&), 2;
- $test++; # 19
+ ok length($1), 1; # 18
+ $test++;
- ok length($tmp=$'), 1;
- $test++; # 20
+ ok length($b=$`), 2; # 19
+ $test++;
- ok length($tmp=$`), 1;
- $test++; # 21
+ ok length($b=$&), 2; # 20
+ $test++;
- ok length($tmp=$1), 1;
- $test++; # 22
+ ok length($b=$'), 2; # 21
+ $test++;
- {
- use bytes;
+ ok length($b=$1), 1; # 22
+ $test++;
- my $tmp = $&;
- ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
- $test++; # 23
+ ok $a, "\x{263A}"; # 23
+ $test++;
- $tmp = $';
- ok $tmp, pack("C*", 0342, 0230, 0272);
- $test++; # 24
+ ok $`, "\x{263A}\x{263A}"; # 24
+ $test++;
- $tmp = $`;
- ok $tmp, pack("C*", 0342, 0230, 0272);
- $test++; # 25
+ ok $&, "x\x{263A}"; # 25
+ $test++;
- $tmp = $1;
- ok $tmp, pack("C*", 0342, 0230, 0272);
- $test++; # 26
- }
+ ok $', "y\x{263A}"; # 26
+ $test++;
- ok_bytes $&, pack("C*", ord(">"), 0342, 0230, 0272);
- $test++; # 27
+ ok $1, "\x{263A}"; # 27
+ $test++;
- ok_bytes $', pack("C*", 0342, 0230, 0272);
- $test++; # 28
+ ok_bytes $a, "\342\230\272"; # 28
+ $test++;
- ok_bytes $`, pack("C*", 0342, 0230, 0272);
- $test++; # 29
+ ok_bytes $1, "\342\230\272"; # 29
+ $test++;
- ok_bytes $1, pack("C*", 0342, 0230, 0272);
- $test++; # 30
+ ok_bytes $&, "x\342\230\272"; # 30
+ $test++;
{
- use bytes;
- no utf8;
-
- ok length, 10;
- $test++; # 31
+ use utf8; # required
+ $_ = chr(0x263A) . chr(0x263A) . 'x' . chr(0x263A) . 'y' . chr(0x263A);
+ }
- ok length((m/>(.)/)[0]), 1;
- $test++; # 32
+ ok length($_), 6; # 31
+ $test++;
- ok length($&), 2;
- $test++; # 33
+ ($a) = m/x(.)/;
- ok length($'), 5;
- $test++; # 34
+ ok length($a), 1; # 32
+ $test++;
- ok length($`), 3;
- $test++; # 35
+ ok length($`), 2; # 33
+ $test++;
- ok length($1), 1;
- $test++; # 36
+ ok length($&), 2; # 34
+ $test++;
- ok $&, pack("C*", ord(">"), 0342);
- $test++; # 37
+ ok length($'), 2; # 35
+ $test++;
- ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
- $test++; # 38
+ ok length($1), 1; # 36
+ $test++;
- ok $`, pack("C*", 0342, 0230, 0272);
- $test++; # 39
+ ok length($b=$`), 2; # 37
+ $test++;
- ok $1, pack("C*", 0342);
- $test++; # 40
- }
+ ok length($b=$&), 2; # 38
+ $test++;
- {
- no utf8;
- $_="\342\230\272>\342\230\272\342\230\272";
- }
+ ok length($b=$'), 2; # 39
+ $test++;
- ok length, 10;
- $test++; # 41
+ ok length($b=$1), 1; # 40
+ $test++;
- ok length((m/>(.)/)[0]), 1;
- $test++; # 42
+ ok $a, "\x{263A}"; # 41
+ $test++;
- ok length($&), 2;
- $test++; # 43
+ ok $`, "\x{263A}\x{263A}"; # 42
+ $test++;
- ok length($'), 1;
- $test++; # 44
+ ok $&, "x\x{263A}"; # 43
+ $test++;
- ok length($`), 1;
- $test++; # 45
+ ok $', "y\x{263A}"; # 44
+ $test++;
- ok length($1), 1;
- $test++; # 46
+ ok $1, "\x{263A}"; # 45
+ $test++;
- ok length($tmp=$&), 2;
- $test++; # 47
+ ok_bytes $a, "\342\230\272"; # 46
+ $test++;
- ok length($tmp=$'), 1;
- $test++; # 48
+ ok_bytes $1, "\342\230\272"; # 47
+ $test++;
- ok length($tmp=$`), 1;
- $test++; # 49
+ ok_bytes $&, "x\342\230\272"; # 48
+ $test++;
- ok length($tmp=$1), 1;
- $test++; # 50
+ $_ = "\342\230\272\342\230\272x\342\230\272y\342\230\272";
- {
- use bytes;
+ ok length($_), 14; # 49
+ $test++;
- my $tmp = $&;
- ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272);
- $test++; # 51
+ ($a) = m/x(.)/;
- $tmp = $';
- ok $tmp, pack("C*", 0342, 0230, 0272);
- $test++; # 52
+ ok length($a), 1; # 50
+ $test++;
- $tmp = $`;
- ok $tmp, pack("C*", 0342, 0230, 0272);
- $test++; # 53
+ ok length($`), 6; # 51
+ $test++;
- $tmp = $1;
- ok $tmp, pack("C*", 0342, 0230, 0272);
- $test++; # 54
- }
+ ok length($&), 2; # 52
+ $test++;
- {
- use bytes;
- no utf8;
+ ok length($'), 6; # 53
+ $test++;
- ok length, 10;
- $test++; # 55
+ ok length($1), 1; # 54
+ $test++;
- ok length((m/>(.)/)[0]), 1;
- $test++; # 56
+ ok length($b=$`), 6; # 55
+ $test++;
- ok length($&), 2;
- $test++; # 57
+ ok length($b=$&), 2; # 56
+ $test++;
- ok length($'), 5;
- $test++; # 58
+ ok length($b=$'), 6; # 57
+ $test++;
- ok length($`), 3;
- $test++; # 59
+ ok length($b=$1), 1; # 58
+ $test++;
- ok length($1), 1;
- $test++; # 60
+ ok $a, "\342"; # 59
+ $test++;
- ok $&, pack("C*", ord(">"), 0342);
- $test++; # 61
+ ok $`, "\342\230\272\342\230\272"; # 60
+ $test++;
- ok $', pack("C*", 0230, 0272, 0342, 0230, 0272);
- $test++; # 62
+ ok $&, "x\342"; # 61
+ $test++;
- ok $`, pack("C*", 0342, 0230, 0272);
- $test++; # 63
+ ok $', "\230\272y\342\230\272"; # 62
+ $test++;
- ok $1, pack("C*", 0342);
- $test++; # 64
- }
+ ok $1, "\342"; # 63
+ $test++;
+}
+{
+ use utf8;
ok "\x{ab}" =~ /^\x{ab}$/, 1;
- $test++; # 65
+ $test++; # 64
}
{
use utf8;
ok_bytes chr(0xe2), pack("C*", 0xc3, 0xa2);
- $test++; # 66
+ $test++; # 65
}
{
use utf8;
my @a = map ord, split(//, join("", map chr, (1234, 123, 2345)));
ok "@a", "1234 123 2345";
- $test++; # 67
+ $test++; # 66
}
{
@@ -319,7 +299,7 @@ sub nok_bytes {
my $x = chr(123);
my @a = map ord, split(/$x/, join("", map chr, (1234, 123, 2345)));
ok "@a", "1234 2345";
- $test++; # 68
+ $test++; # 67
}
{
@@ -331,10 +311,10 @@ sub nok_bytes {
{ use utf8; $b = "\xe4" } # \xXX must not produce UTF-8
print "not " if $a eq $b;
- print "ok $test\n"; $test++;
+ print "ok $test\n"; $test++; # 68
{ use utf8; print "not " if $a eq $b; }
- print "ok $test\n"; $test++;
+ print "ok $test\n"; $test++; # 69
}
{
@@ -344,7 +324,7 @@ sub nok_bytes {
for (@x) {
s/(\d+)\s*([\w\-]+)/$1 . uc $2/e;
my($latin) = /^(.+)(?:\s+\d)/;
- print $latin eq "stra\337e" ? "ok $test\n" :
+ print $latin eq "stra\337e" ? "ok $test\n" : # 70, 71
"#latin[$latin]\nnot ok $test\n";
$test++;
$latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a
@@ -369,7 +349,7 @@ sub nok_bytes {
}
print "not " unless $r eq " U+B36C U+5A8C U+FF5B U+5079 U+505B";
- print "ok $test\n";
+ print "ok $test\n"; # 72
$test++;
}
@@ -384,27 +364,27 @@ sub nok_bytes {
print "not "
unless $a eq "\x20" && $b eq "\x{80}\x{100}\x{80}" && $c eq $a;
print "ok $test\n";
- $test++;
+ $test++; # 73
my ($a, $b) = split(/\x{100}/, $s);
print "not " unless $a eq "\x20\x40\x{80}" && $b eq "\x{80}\x40\x20";
print "ok $test\n";
- $test++;
+ $test++; # 74
my ($a, $b) = split(/\x{80}\x{100}\x{80}/, $s);
print "not " unless $a eq "\x20\x40" && $b eq "\x40\x20";
print "ok $test\n";
- $test++;
+ $test++; # 75
my ($a, $b) = split(/\x40\x{80}/, $s);
print "not " unless $a eq "\x20" && $b eq "\x{100}\x{80}\x40\x20";
print "ok $test\n";
- $test++;
+ $test++; # 76
my ($a, $b, $c) = split(/[\x40\x{80}]+/, $s);
print "not " unless $a eq "\x20" && $b eq "\x{100}" && $c eq "\x20";
print "ok $test\n";
- $test++;
+ $test++; # 77
}
{
@@ -414,14 +394,14 @@ sub nok_bytes {
my $smiley = "\x{263a}";
- for my $s ("\x{263a}", # 1
- $smiley, # 2
+ for my $s ("\x{263a}", # 78
+ $smiley, # 79
- "" . $smiley, # 3
- "" . "\x{263a}", # 4
+ "" . $smiley, # 80
+ "" . "\x{263a}", # 81
- $smiley . "", # 5
- "\x{263a}" . "", # 6
+ $smiley . "", # 82
+ "\x{263a}" . "", # 83
) {
my $length_chars = length($s);
my $length_bytes;
@@ -437,14 +417,14 @@ sub nok_bytes {
$test++;
}
- for my $s ("\x{263a}" . "\x{263a}", # 7
- $smiley . $smiley, # 8
+ for my $s ("\x{263a}" . "\x{263a}", # 84
+ $smiley . $smiley, # 85
- "\x{263a}\x{263a}", # 9
- "$smiley$smiley", # 10
+ "\x{263a}\x{263a}", # 86
+ "$smiley$smiley", # 87
- "\x{263a}" x 2, # 11
- $smiley x 2, # 12
+ "\x{263a}" x 2, # 88
+ $smiley x 2, # 89
) {
my $length_chars = length($s);
my $length_bytes;
@@ -460,3 +440,106 @@ sub nok_bytes {
$test++;
}
}
+
+{
+ use utf8;
+
+ print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\xd4";
+ print "ok $test\n";
+ $test++; # 90
+
+ print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}";
+ print "ok $test\n";
+ $test++; # 91
+
+ print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}";
+ print "ok $test\n";
+ $test++; # 92
+
+ print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\xd4";
+ print "ok $test\n";
+ $test++; # 93
+
+ print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4";
+ print "ok $test\n";
+ $test++; # 94
+
+ print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}";
+ print "ok $test\n";
+ $test++; # 95
+
+ print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}";
+ print "ok $test\n";
+ $test++; # 96
+
+ print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4";
+ print "ok $test\n";
+ $test++; # 97
+}
+
+{
+ # the first half of 20001028.003
+
+ my $X = chr(1448);
+ my ($Y) = $X =~ /(.*)/;
+ print "not " unless length $Y == 1;
+ print "ok $test\n";
+ $test++; # 98
+}
+
+{
+ # 20001108.001
+
+ use utf8;
+ my $X = "Szab\x{f3},Bal\x{e1}zs";
+ my $Y = $X;
+ $Y =~ s/(B)/$1/ for 0..3;
+ print "not " unless $Y eq $X;
+ print "ok $test\n";
+ $test++; # 99
+}
+
+{
+ # 20001114.001
+
+ use utf8;
+ use charnames ':full';
+ my $text = "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}";
+ print "not " unless ord($text) == 0xc4;
+ print "ok $test\n";
+ $test++; # 100
+}
+
+{
+ # 20001205.014
+
+ use utf8;
+
+ my $a = "ABC\x{263A}";
+
+ my @b = split( //, $a );
+
+ print "not " unless @b == 4;
+ print "ok $test\n";
+ $test++; # 101
+
+ print "not " unless length($b[3]) == 1;
+ print "ok $test\n";
+ $test++; # 102
+
+ $a =~ s/^A/Z/;
+ print "not " unless length($a) == 4;
+ print "ok $test\n";
+ $test++; # 103
+}
+
+{
+ # the second half of 20001028.003
+
+ use utf8;
+ $X =~ s/^/chr(1488)/e;
+ print "not " unless length $X == 1;
+ print "ok $test\n";
+ $test++; # 104
+}
+