diff options
-rw-r--r-- | embed.h | 14 | ||||
-rwxr-xr-x | embed.pl | 5 | ||||
-rw-r--r-- | mg.c | 22 | ||||
-rw-r--r-- | objXSUB.h | 4 | ||||
-rw-r--r-- | pp_ctl.c | 11 | ||||
-rw-r--r-- | pp_hot.c | 27 | ||||
-rw-r--r-- | proto.h | 5 | ||||
-rw-r--r-- | regcomp.c | 840 | ||||
-rw-r--r-- | regcomp.h | 39 | ||||
-rw-r--r-- | regcomp.sym | 19 | ||||
-rw-r--r-- | regexec.c | 1313 | ||||
-rw-r--r-- | regnodes.h | 301 | ||||
-rw-r--r-- | sv.c | 2 | ||||
-rw-r--r-- | t/op/utf8decode.t | 2 | ||||
-rwxr-xr-x | t/pragma/utf8.t | 407 |
15 files changed, 1409 insertions, 1602 deletions
@@ -543,6 +543,7 @@ #define ref Perl_ref #define refkids Perl_refkids #define regdump Perl_regdump +#define regclass_swash Perl_regclass_swash #define pregexec Perl_pregexec #define pregfree Perl_pregfree #define pregcomp Perl_pregcomp @@ -995,7 +996,6 @@ #define regbranch S_regbranch #define reguni S_reguni #define regclass S_regclass -#define regclassutf8 S_regclassutf8 #define regcurly S_regcurly #define reg_node S_reg_node #define regpiece S_regpiece @@ -1025,7 +1025,6 @@ #define regrepeat_hard S_regrepeat_hard #define regtry S_regtry #define reginclass S_reginclass -#define reginclassutf8 S_reginclassutf8 #define regcppush S_regcppush #define regcppop S_regcppop #define regcp_set_to S_regcp_set_to @@ -2015,6 +2014,7 @@ #define ref(a,b) Perl_ref(aTHX_ a,b) #define refkids(a,b) Perl_refkids(aTHX_ a,b) #define regdump(a) Perl_regdump(aTHX_ a) +#define regclass_swash(a,b,c) Perl_regclass_swash(aTHX_ a,b,c) #define pregexec(a,b,c,d,e,f,g) Perl_pregexec(aTHX_ a,b,c,d,e,f,g) #define pregfree(a) Perl_pregfree(aTHX_ a) #define pregcomp(a,b,c) Perl_pregcomp(aTHX_ a,b,c) @@ -2459,7 +2459,6 @@ #define regbranch(a,b,c) S_regbranch(aTHX_ a,b,c) #define reguni(a,b,c,d) S_reguni(aTHX_ a,b,c,d) #define regclass(a) S_regclass(aTHX_ a) -#define regclassutf8(a) S_regclassutf8(aTHX_ a) #define regcurly(a) S_regcurly(aTHX_ a) #define reg_node(a,b) S_reg_node(aTHX_ a,b) #define regpiece(a,b) S_regpiece(aTHX_ a,b) @@ -2487,8 +2486,7 @@ #define regrepeat(a,b) S_regrepeat(aTHX_ a,b) #define regrepeat_hard(a,b,c) S_regrepeat_hard(aTHX_ a,b,c) #define regtry(a,b) S_regtry(aTHX_ a,b) -#define reginclass(a,b) S_reginclass(aTHX_ a,b) -#define reginclassutf8(a,b) S_reginclassutf8(aTHX_ a,b) +#define reginclass(a,b,c) S_reginclass(aTHX_ a,b,c) #define regcppush(a) S_regcppush(aTHX_ a) #define regcppop() S_regcppop(aTHX) #define regcp_set_to(a) S_regcp_set_to(aTHX_ a) @@ -3950,6 +3948,8 @@ #define refkids Perl_refkids #define Perl_regdump CPerlObj::Perl_regdump #define regdump Perl_regdump +#define Perl_regclass_swash CPerlObj::Perl_regclass_swash +#define regclass_swash Perl_regclass_swash #define Perl_pregexec CPerlObj::Perl_pregexec #define pregexec Perl_pregexec #define Perl_pregfree CPerlObj::Perl_pregfree @@ -4787,8 +4787,6 @@ #define reguni S_reguni #define S_regclass CPerlObj::S_regclass #define regclass S_regclass -#define S_regclassutf8 CPerlObj::S_regclassutf8 -#define regclassutf8 S_regclassutf8 #define S_regcurly CPerlObj::S_regcurly #define regcurly S_regcurly #define S_reg_node CPerlObj::S_reg_node @@ -4845,8 +4843,6 @@ #define regtry S_regtry #define S_reginclass CPerlObj::S_reginclass #define reginclass S_reginclass -#define S_reginclassutf8 CPerlObj::S_reginclassutf8 -#define reginclassutf8 S_reginclassutf8 #define S_regcppush CPerlObj::S_regcppush #define regcppush S_regcppush #define S_regcppop CPerlObj::S_regcppop @@ -1873,6 +1873,7 @@ Ap |void |push_scope p |OP* |ref |OP* o|I32 type p |OP* |refkids |OP* o|I32 type Ap |void |regdump |regexp* r +Ap |SV* |regclass_swash |struct regnode *n|bool doinit|SV **initsvp Ap |I32 |pregexec |regexp* prog|char* stringarg \ |char* strend|char* strbeg|I32 minend \ |SV* screamer|U32 nosave @@ -2366,7 +2367,6 @@ s |regnode*|regatom |struct RExC_state_t*|I32 * s |regnode*|regbranch |struct RExC_state_t*|I32 *|I32 s |void |reguni |struct RExC_state_t*|UV|char *|STRLEN* s |regnode*|regclass |struct RExC_state_t* -s |regnode*|regclassutf8 |struct RExC_state_t* s |I32 |regcurly |char * s |regnode*|reg_node |struct RExC_state_t*|U8 s |regnode*|regpiece |struct RExC_state_t*|I32 * @@ -2401,8 +2401,7 @@ s |I32 |regmatch |regnode *prog s |I32 |regrepeat |regnode *p|I32 max s |I32 |regrepeat_hard |regnode *p|I32 max|I32 *lp s |I32 |regtry |regexp *prog|char *startpos -s |bool |reginclass |regnode *p|I32 c -s |bool |reginclassutf8 |regnode *f|U8* p +s |bool |reginclass |regnode *n|U8 *p|bool do_utf8sv_is_utf8 s |CHECKPOINT|regcppush |I32 parenfloor s |char*|regcppop s |char*|regcp_set_to |I32 ss @@ -391,7 +391,7 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg) case '5': case '6': case '7': case '8': case '9': case '&': if (PL_curpm && (rx = PL_curpm->op_pmregexp)) { - paren = atoi(mg->mg_ptr); + paren = atoi(mg->mg_ptr); /* $& is in [0] */ getparen: if (paren <= rx->nparens && (s1 = rx->startp[paren]) != -1 && @@ -399,17 +399,15 @@ Perl_magic_len(pTHX_ SV *sv, MAGIC *mg) { i = t1 - s1; getlen: - if (i > 0 && (PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) { - char *s = rx->subbeg + s1; + if (i > 0 && DO_UTF8(PL_reg_sv)) { + char *s = rx->subbeg + s1; char *send = rx->subbeg + t1; - i = 0; - while (s < send) { - s += UTF8SKIP(s); - i++; - } + + i = Perl_utf8_length((U8*)s, (U8*)send); } - if (i >= 0) - return i; + if (i < 0) + Perl_croak(aTHX_ "panic: magic_len: %d", i); + return i; } } return 0; @@ -604,7 +602,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) * Pre-threads, this was paren = atoi(GvENAME((GV*)mg->mg_obj)); * XXX Does the new way break anything? */ - paren = atoi(mg->mg_ptr); + paren = atoi(mg->mg_ptr); /* $& is in [0] */ getparen: if (paren <= rx->nparens && (s1 = rx->startp[paren]) != -1 && @@ -623,7 +621,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg) PL_tainted = FALSE; } sv_setpvn(sv, s, i); - if ((PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) + if (DO_UTF8(PL_reg_sv)) SvUTF8_on(sv); else SvUTF8_off(sv); @@ -1263,6 +1263,10 @@ #define Perl_regdump pPerl->Perl_regdump #undef regdump #define regdump Perl_regdump +#undef Perl_regclass_swash +#define Perl_regclass_swash pPerl->Perl_regclass_swash +#undef regclass_swash +#define regclass_swash Perl_regclass_swash #undef Perl_pregexec #define Perl_pregexec pPerl->Perl_pregexec #undef pregexec @@ -157,7 +157,7 @@ PP(pp_substcont) register char *m = cx->sb_m; char *orig = cx->sb_orig; register REGEXP *rx = cx->sb_rx; - + rxres_restore(&cx->sb_rxres, rx); if (cx->sb_iters++) { @@ -176,8 +176,8 @@ PP(pp_substcont) : (REXEC_COPY_STR|REXEC_IGNOREPOS|REXEC_NOT_FIRST)))) { SV *targ = cx->sb_targ; - sv_catpvn(dstr, s, cx->sb_strend - s); + sv_catpvn(dstr, s, cx->sb_strend - s); cx->sb_rxtainted |= RX_MATCH_TAINTED(rx); (void)SvOOK_off(targ); @@ -189,9 +189,11 @@ PP(pp_substcont) sv_free(dstr); TAINT_IF(cx->sb_rxtainted & 1); + if (pm->op_pmdynflags & PMdf_UTF8) + SvUTF8_on(targ); /* could also copy SvUTF8(dstr)? */ PUSHs(sv_2mortal(newSViv((I32)cx->sb_iters - 1))); - (void)SvPOK_only(targ); + (void)SvPOK_only_UTF8(targ); TAINT_IF(cx->sb_rxtainted); SvSETMAGIC(targ); SvTAINT(targ); @@ -209,7 +211,8 @@ PP(pp_substcont) cx->sb_strend = s + (cx->sb_strend - m); } cx->sb_m = m = rx->startp[0] + orig; - sv_catpvn(dstr, s, m-s); + if (m > s) + sv_catpvn(dstr, s, m-s); cx->sb_s = rx->endp[0] + orig; { /* Update the pos() information. */ SV *sv = cx->sb_targ; @@ -1179,6 +1179,7 @@ PP(pp_match) TARG = DEFSV; EXTEND(SP,1); } + PL_reg_sv = TARG; PUTBACK; /* EVAL blocks need stack_sp. */ s = SvPV(TARG, len); strend = s + len; @@ -1268,27 +1269,25 @@ play_it_again: RX_MATCH_TAINTED_on(rx); TAINT_IF(RX_MATCH_TAINTED(rx)); if (gimme == G_ARRAY) { - I32 iters, i, len; + I32 nparens, i, len; - iters = rx->nparens; - if (global && !iters) + nparens = rx->nparens; + if (global && !nparens) i = 1; else i = 0; SPAGAIN; /* EVAL blocks could move the stack. */ - EXTEND(SP, iters + i); - EXTEND_MORTAL(iters + i); - for (i = !i; i <= iters; i++) { + EXTEND(SP, nparens + i); + EXTEND_MORTAL(nparens + i); + for (i = !i; i <= nparens; i++) { PUSHs(sv_newmortal()); /*SUPPRESS 560*/ if ((rx->startp[i] != -1) && rx->endp[i] != -1 ) { len = rx->endp[i] - rx->startp[i]; s = rx->startp[i] + truebase; sv_setpvn(*SP, s, len); - if ((pm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE) { + if (DO_UTF8(TARG)) SvUTF8_on(*SP); - sv_utf8_downgrade(*SP, TRUE); - } } } if (global) { @@ -1298,7 +1297,7 @@ play_it_again: r_flags |= REXEC_IGNOREPOS | REXEC_NOT_FIRST; goto play_it_again; } - else if (!iters) + else if (!nparens) XPUSHs(&PL_sv_yes); LEAVE_SCOPE(oldsave); RETURN; @@ -1831,6 +1830,7 @@ PP(pp_subst) TARG = DEFSV; EXTEND(SP,1); } + PL_reg_sv = TARG; if (SvFAKE(TARG) && SvREADONLY(TARG)) sv_force_normal(TARG); if (SvREADONLY(TARG) @@ -1847,7 +1847,7 @@ PP(pp_subst) if (PL_tainted) rxtainted |= 2; TAINT_NOT; - + force_it: if (!pm || !s) DIE(aTHX_ "panic: do_subst"); @@ -2004,6 +2004,8 @@ PP(pp_subst) rxtainted |= RX_MATCH_TAINTED(rx); dstr = NEWSV(25, len); sv_setpvn(dstr, m, s-m); + if (DO_UTF8(TARG)) + SvUTF8_on(dstr); PL_curpm = pm; if (!c) { register PERL_CONTEXT *cx; @@ -2030,7 +2032,8 @@ PP(pp_subst) sv_catpvn(dstr, c, clen); if (once) break; - } while (CALLREGEXEC(aTHX_ rx, s, strend, orig, s == m, TARG, NULL, r_flags)); + } while (CALLREGEXEC(aTHX_ rx, s, strend, orig, s == m, + TARG, NULL, r_flags)); sv_catpvn(dstr, s, strend - s); (void)SvOOK_off(TARG); @@ -616,6 +616,7 @@ PERL_CALLCONV void Perl_push_scope(pTHX); PERL_CALLCONV OP* Perl_ref(pTHX_ OP* o, I32 type); PERL_CALLCONV OP* Perl_refkids(pTHX_ OP* o, I32 type); PERL_CALLCONV void Perl_regdump(pTHX_ regexp* r); +PERL_CALLCONV SV* Perl_regclass_swash(pTHX_ struct regnode *n, bool doinit, SV **initsvp); PERL_CALLCONV I32 Perl_pregexec(pTHX_ regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, U32 nosave); PERL_CALLCONV void Perl_pregfree(pTHX_ struct regexp* r); PERL_CALLCONV regexp* Perl_pregcomp(pTHX_ char* exp, char* xend, PMOP* pm); @@ -1111,7 +1112,6 @@ STATIC regnode* S_regatom(pTHX_ struct RExC_state_t*, I32 *); STATIC regnode* S_regbranch(pTHX_ struct RExC_state_t*, I32 *, I32); STATIC void S_reguni(pTHX_ struct RExC_state_t*, UV, char *, STRLEN*); STATIC regnode* S_regclass(pTHX_ struct RExC_state_t*); -STATIC regnode* S_regclassutf8(pTHX_ struct RExC_state_t*); STATIC I32 S_regcurly(pTHX_ char *); STATIC regnode* S_reg_node(pTHX_ struct RExC_state_t*, U8); STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t*, I32 *); @@ -1141,8 +1141,7 @@ STATIC I32 S_regmatch(pTHX_ regnode *prog); STATIC I32 S_regrepeat(pTHX_ regnode *p, I32 max); STATIC I32 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp); STATIC I32 S_regtry(pTHX_ regexp *prog, char *startpos); -STATIC bool S_reginclass(pTHX_ regnode *p, I32 c); -STATIC bool S_reginclassutf8(pTHX_ regnode *f, U8* p); +STATIC bool S_reginclass(pTHX_ regnode *n, U8 *p, bool do_utf8sv_is_utf8); STATIC CHECKPOINT S_regcppush(pTHX_ I32 parenfloor); STATIC char* S_regcppop(pTHX); STATIC char* S_regcp_set_to(pTHX_ I32 ss); @@ -118,7 +118,7 @@ typedef struct RExC_state_t { char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ I32 whilem_seen; /* number of WHILEM in this expr */ - regnode *emit; /* Code-emit pointer; ®dummy = don't */ + regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ U32 seen; @@ -234,8 +234,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define LOC (RExC_flags16 & PMf_LOCALE) #define FOLD (RExC_flags16 & PMf_FOLD) -#define OOB_CHAR8 1234 -#define OOB_UTF8 123456 +#define OOB_UNICODE 12345678 #define OOB_NAMEDCLASS -1 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) @@ -1196,7 +1195,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg break; } } - else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) { + else if (strchr((char*)PL_simple,OP(scan))) { int value; if (flags & SCF_DO_SUBSTR) { @@ -1210,20 +1209,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* Some of the logic below assumes that switching locale on will only add false positives. */ switch (PL_regkind[(U8)OP(scan)]) { - case ANYUTF8: case SANY: - case SANYUTF8: - case ALNUMUTF8: - case ANYOFUTF8: - case ALNUMLUTF8: - case NALNUMUTF8: - case NALNUMLUTF8: - case SPACEUTF8: - case NSPACEUTF8: - case SPACELUTF8: - case NSPACELUTF8: - case DIGITUTF8: - case NDIGITUTF8: default: do_default: /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ @@ -1750,7 +1736,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* turn .* into ^.* with an implied $*=1 */ int type = OP(NEXTOPER(first)); - if (type == REG_ANY || type == ANYUTF8) + if (type == REG_ANY) type = ROPT_ANCH_MBOL; else type = ROPT_ANCH_SBOL; @@ -1850,8 +1836,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) longest_fixed_length = 0; } if (r->regstclass - && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8 - || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY)) + && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY)) r->regstclass = NULL; if ((!r->anchored_substr || r->anchored_offset) && stclass_flag && !(data.start_class->flags & ANYOF_EOS) @@ -1866,6 +1851,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) struct regnode_charclass_class); r->regstclass = (regnode*)RExC_rx->data->data[n]; r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ + PL_regdata = r->data; /* for regprop() */ DEBUG_r((sv = sv_newmortal(), regprop(sv, (regnode*)data.start_class), PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", @@ -1933,7 +1919,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->reganch |= ROPT_EVAL_SEEN; Newz(1002, r->startp, RExC_npar, I32); Newz(1002, r->endp, RExC_npar, I32); - PL_regdata = r->data; /* for regprop() ANYOFUTF8 */ + PL_regdata = r->data; /* for regprop() */ DEBUG_r(regdump(r)); return(r); } @@ -2556,26 +2542,17 @@ tryagain: break; case '.': nextchar(pRExC_state); - if (UTF) { - if (RExC_flags16 & PMf_SINGLELINE) - ret = reg_node(pRExC_state, SANYUTF8); - else - ret = reg_node(pRExC_state, ANYUTF8); - *flagp |= HASWIDTH; - } - else { - if (RExC_flags16 & PMf_SINGLELINE) - ret = reg_node(pRExC_state, SANY); - else - ret = reg_node(pRExC_state, REG_ANY); - *flagp |= HASWIDTH|SIMPLE; - } + if (RExC_flags16 & PMf_SINGLELINE) + ret = reg_node(pRExC_state, SANY); + else + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; break; case '[': { char *oregcomp_parse = ++RExC_parse; - ret = (UTF ? regclassutf8(pRExC_state) : regclass(pRExC_state)); + ret = regclass(pRExC_state); if (*RExC_parse != ']') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ["); @@ -2659,20 +2636,14 @@ tryagain: is_utf8_mark((U8*)"~"); /* preload table */ break; case 'w': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? ALNUMLUTF8 : ALNUMUTF8) - : (LOC ? ALNUML : ALNUM)); + ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) is_utf8_alnum((U8*)"a"); /* preload table */ break; case 'W': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? NALNUMLUTF8 : NALNUMUTF8) - : (LOC ? NALNUML : NALNUM)); + ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) @@ -2681,10 +2652,7 @@ tryagain: case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, - UTF - ? (LOC ? BOUNDLUTF8 : BOUNDUTF8) - : (LOC ? BOUNDL : BOUND)); + ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND); *flagp |= SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) @@ -2693,44 +2661,35 @@ tryagain: case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, - UTF - ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8) - : (LOC ? NBOUNDL : NBOUND)); + ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND); *flagp |= SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) is_utf8_alnum((U8*)"a"); /* preload table */ break; case 's': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? SPACELUTF8 : SPACEUTF8) - : (LOC ? SPACEL : SPACE)); + ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_space) is_utf8_space((U8*)" "); /* preload table */ break; case 'S': - ret = reg_node(pRExC_state, - UTF - ? (LOC ? NSPACELUTF8 : NSPACEUTF8) - : (LOC ? NSPACEL : NSPACE)); + ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_space) is_utf8_space((U8*)" "); /* preload table */ break; case 'd': - ret = reg_node(pRExC_state, UTF ? DIGITUTF8 : DIGIT); + ret = reg_node(pRExC_state, DIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_digit) is_utf8_digit((U8*)"1"); /* preload table */ break; case 'D': - ret = reg_node(pRExC_state, UTF ? NDIGITUTF8 : NDIGIT); + ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); if (UTF && !PL_utf8_digit) @@ -2754,7 +2713,7 @@ tryagain: RExC_end = RExC_parse + 2; RExC_parse--; - ret = regclassutf8(pRExC_state); + ret = regclass(pRExC_state); RExC_end = oldregxend; RExC_parse--; @@ -3194,58 +3153,108 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state) { - register U32 value; - register I32 lastvalue = OOB_CHAR8; - register I32 range = 0; + register UV value; + register IV lastvalue = OOB_UNICODE; + register IV range = 0; register regnode *ret; STRLEN numlen; - I32 namedclass; + IV namedclass; char *rangebegin; bool need_class = 0; + SV *listsv; + register char *e; + UV n; + + ret = reganode(pRExC_state, ANYOF, 0); + + if (!SIZE_ONLY) + ANYOF_FLAGS(ret) = 0; + + if (*RExC_parse == '^') { /* Complement of range. */ + RExC_naughty++; + RExC_parse++; + if (!SIZE_ONLY) + ANYOF_FLAGS(ret) |= ANYOF_INVERT; + } - ret = reg_node(pRExC_state, ANYOF); if (SIZE_ONLY) RExC_size += ANYOF_SKIP; else { - ret->flags = 0; - ANYOF_BITMAP_ZERO(ret); RExC_emit += ANYOF_SKIP; if (FOLD) ANYOF_FLAGS(ret) |= ANYOF_FOLD; if (LOC) ANYOF_FLAGS(ret) |= ANYOF_LOCALE; - } - if (*RExC_parse == '^') { /* Complement of range. */ - RExC_naughty++; - RExC_parse++; - if (!SIZE_ONLY) - ANYOF_FLAGS(ret) |= ANYOF_INVERT; + ANYOF_BITMAP_ZERO(ret); + listsv = newSVpvn("# comment\n", 10); } if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) checkposixcc(pRExC_state); if (*RExC_parse == ']' || *RExC_parse == '-') - goto skipcond; /* allow 1st char to be ] or - */ + goto charclassloop; /* allow 1st char to be ] or - */ + while (RExC_parse < RExC_end && *RExC_parse != ']') { - skipcond: - namedclass = OOB_NAMEDCLASS; + + charclassloop: + + namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ + if (!range) rangebegin = RExC_parse; - value = UCHARAT(RExC_parse++); + if (UTF) { + value = utf8_to_uv((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, 0); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); if (value == '[') namedclass = regpposixcc(pRExC_state, value); else if (value == '\\') { - value = UCHARAT(RExC_parse++); + if (UTF) { + value = utf8_to_uv((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, 0); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); /* Some compilers cannot handle switching on 64-bit integer - * values, therefore the 'value' cannot be an UV. --jhi */ - switch (value) { + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. --jhi */ + switch ((I32)value) { case 'w': namedclass = ANYOF_ALNUM; break; case 'W': namedclass = ANYOF_NALNUM; break; case 's': namedclass = ANYOF_SPACE; break; case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'p': + case 'P': + if (*RExC_parse == '{') { + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL("Missing right brace on \\p{}"); + n = e - RExC_parse; + } + else { + e = RExC_parse; + n = 1; + } + if (!SIZE_ONLY) { + if (value == 'p') + Perl_sv_catpvf(aTHX_ listsv, + "+utf8::%.*s\n", (int)n, RExC_parse); + else + Perl_sv_catpvf(aTHX_ listsv, + "!utf8::%.*s\n", (int)n, RExC_parse); + } + RExC_parse = e + 1; + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + continue; case 'n': value = '\n'; break; case 'r': value = '\r'; break; case 't': value = '\t'; break; @@ -3259,9 +3268,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) case 'a': value = '\057'; break; #endif case 'x': - numlen = 0; /* disallow underscores */ - value = (UV)scan_hex(RExC_parse, 2, &numlen); - RExC_parse += numlen; + if (*RExC_parse == '{') { + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL("Missing right brace on \\x{}"); + numlen = 1; /* allow underscores */ + value = (UV)scan_hex(RExC_parse, + e - RExC_parse, + &numlen); + RExC_parse = e + 1; + } + else { + numlen = 0; /* disallow underscores */ + value = (UV)scan_hex(RExC_parse, 2, &numlen); + RExC_parse += numlen; + } break; case 'c': value = UCHARAT(RExC_parse++); @@ -3275,16 +3296,22 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) break; default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) - - vWARN2(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value); + vWARN2(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); break; } - } - if (namedclass > OOB_NAMEDCLASS) { - if (!need_class && !SIZE_ONLY) + } /* end of \blah */ + + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + + if (!SIZE_ONLY && !need_class) ANYOF_CLASS_ZERO(ret); + need_class = 1; - if (range) { /* a-\d, a-[:digit:] */ + + /* a bad range like a-\d, a-[:digit:] ? */ + if (range) { if (!SIZE_ONLY) { if (ckWARN(WARN_REGEXP)) vWARN4(RExC_parse, @@ -3292,11 +3319,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) RExC_parse - rangebegin, RExC_parse - rangebegin, rangebegin); - ANYOF_BITMAP_SET(ret, lastvalue); - ANYOF_BITMAP_SET(ret, '-'); + if (lastvalue < 256) { + ANYOF_BITMAP_SET(ret, lastvalue); + ANYOF_BITMAP_SET(ret, '-'); + } + else { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + Perl_sv_catpvf(aTHX_ listsv, + /* 0x002D is Unicode for '-' */ + "%04"UVxf"\n002D\n", (UV)lastvalue); + } } - range = 0; /* this is not a true range */ + + range = 0; /* this was not a true range */ } + if (!SIZE_ONLY) { switch (namedclass) { case ANYOF_ALNUM: @@ -3307,6 +3344,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; case ANYOF_NALNUM: if (LOC) @@ -3316,42 +3354,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; - case ANYOF_SPACE: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_SPACE); - else { - for (value = 0; value < 256; value++) - if (isSPACE(value)) - ANYOF_BITMAP_SET(ret, value); - } - break; - case ANYOF_NSPACE: + case ANYOF_ALNUMC: if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NSPACE); + ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); else { for (value = 0; value < 256; value++) - if (!isSPACE(value)) + if (isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - break; - case ANYOF_DIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_DIGIT); - else { - for (value = '0'; value <= '9'; value++) - ANYOF_BITMAP_SET(ret, value); - } - break; - case ANYOF_NDIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); - else { - for (value = 0; value < '0'; value++) - ANYOF_BITMAP_SET(ret, value); - for (value = '9' + 1; value < 256; value++) - ANYOF_BITMAP_SET(ret, value); - } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; case ANYOF_NALNUMC: if (LOC) @@ -3361,15 +3374,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - break; - case ANYOF_ALNUMC: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); - else { - for (value = 0; value < 256; value++) - if (isALNUMC(value)) - ANYOF_BITMAP_SET(ret, value); - } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; case ANYOF_ALPHA: if (LOC) @@ -3379,6 +3384,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; case ANYOF_NALPHA: if (LOC) @@ -3388,6 +3394,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; case ANYOF_ASCII: if (LOC) @@ -3402,6 +3409,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_BITMAP_SET(ret, value); #endif /* EBCDIC */ } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; case ANYOF_NASCII: if (LOC) @@ -3416,6 +3424,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_BITMAP_SET(ret, value); #endif /* EBCDIC */ } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; case ANYOF_BLANK: if (LOC) @@ -3425,6 +3434,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; case ANYOF_NBLANK: if (LOC) @@ -3434,6 +3444,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; case ANYOF_CNTRL: if (LOC) @@ -3443,7 +3454,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } - lastvalue = OOB_CHAR8; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; case ANYOF_NCNTRL: if (LOC) @@ -3453,6 +3464,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); + break; + case ANYOF_DIGIT: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_DIGIT); + else { + /* consecutive digits assumed */ + for (value = '0'; value <= '9'; value++) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); + break; + case ANYOF_NDIGIT: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); + else { + /* consecutive digits assumed */ + for (value = 0; value < '0'; value++) + ANYOF_BITMAP_SET(ret, value); + for (value = '9' + 1; value < 256; value++) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; case ANYOF_GRAPH: if (LOC) @@ -3462,6 +3496,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; case ANYOF_NGRAPH: if (LOC) @@ -3471,6 +3506,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; case ANYOF_LOWER: if (LOC) @@ -3480,6 +3516,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; case ANYOF_NLOWER: if (LOC) @@ -3489,6 +3526,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; case ANYOF_PRINT: if (LOC) @@ -3498,6 +3536,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; case ANYOF_NPRINT: if (LOC) @@ -3507,6 +3546,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; case ANYOF_PSXSPC: if (LOC) @@ -3516,6 +3556,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; case ANYOF_NPSXSPC: if (LOC) @@ -3525,6 +3566,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_PUNCT: if (LOC) @@ -3534,6 +3576,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; case ANYOF_NPUNCT: if (LOC) @@ -3543,6 +3586,27 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); + break; + case ANYOF_SPACE: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_SPACE); + else { + for (value = 0; value < 256; value++) + if (isSPACE(value)) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n"); + break; + case ANYOF_NSPACE: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NSPACE); + else { + for (value = 0; value < 256; value++) + if (!isSPACE(value)) + ANYOF_BITMAP_SET(ret, value); + } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n"); break; case ANYOF_UPPER: if (LOC) @@ -3552,6 +3616,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; case ANYOF_NUPPER: if (LOC) @@ -3561,6 +3626,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; case ANYOF_XDIGIT: if (LOC) @@ -3570,6 +3636,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; case ANYOF_NXDIGIT: if (LOC) @@ -3579,6 +3646,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; default: vFAIL("Invalid [::] class"); @@ -3588,7 +3656,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_FLAGS(ret) |= ANYOF_CLASS; continue; } - } + } /* end of namedclass \blah */ + if (range) { if (lastvalue > value) /* b-a */ { Simple_vFAIL4("Invalid [] range \"%*.*s\"", @@ -3596,14 +3665,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) RExC_parse - rangebegin, rangebegin); } - range = 0; + range = 0; /* not a true range */ } else { - lastvalue = value; + lastvalue = value; /* save the beginning of the range */ if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && RExC_parse[1] != ']') { RExC_parse++; - if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ + + /* a bad range like \w-, [:word:]- ? */ + if (namedclass > OOB_NAMEDCLASS) { if (ckWARN(WARN_REGEXP)) vWARN4(RExC_parse, "False [] range \"%*.*s\"", @@ -3613,325 +3684,89 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!SIZE_ONLY) ANYOF_BITMAP_SET(ret, '-'); } else - range = 1; - continue; /* do it next time */ + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ } } + /* now is the next time */ if (!SIZE_ONLY) { + if (lastvalue < 256 && value < 256) { #ifndef ASCIIish /* EBCDIC, for example. */ - if ((isLOWER(lastvalue) && isLOWER(value)) || - (isUPPER(lastvalue) && isUPPER(value))) - { - I32 i; - if (isLOWER(lastvalue)) { - for (i = lastvalue; i <= value; i++) - if (isLOWER(i)) - ANYOF_BITMAP_SET(ret, i); - } else { - for (i = lastvalue; i <= value; i++) - if (isUPPER(i)) - ANYOF_BITMAP_SET(ret, i); + if ((isLOWER(lastvalue) && isLOWER(value)) || + (isUPPER(lastvalue) && isUPPER(value))) + { + IV i; + if (isLOWER(lastvalue)) { + for (i = lastvalue; i <= value; i++) + if (isLOWER(i)) + ANYOF_BITMAP_SET(ret, i); + } else { + for (i = lastvalue; i <= value; i++) + if (isUPPER(i)) + ANYOF_BITMAP_SET(ret, i); + } } - } - else + else #endif - for ( ; lastvalue <= value; lastvalue++) - ANYOF_BITMAP_SET(ret, lastvalue); + for ( ; lastvalue <= value; lastvalue++) + ANYOF_BITMAP_SET(ret, lastvalue); + } else { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + if (lastvalue < value) + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", + (UV)lastvalue, (UV)value); + else + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", + (UV)value); + } } - range = 0; + + range = 0; /* this range (if it was one) is done now */ } + if (need_class) { if (SIZE_ONLY) RExC_size += ANYOF_CLASS_ADD_SKIP; else RExC_emit += ANYOF_CLASS_ADD_SKIP; } + /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ if (!SIZE_ONLY && - (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) { + (ANYOF_FLAGS(ret) & + /* If the only flag is folding (plus possibly inversion). */ + (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) { for (value = 0; value < 256; ++value) { if (ANYOF_BITMAP_TEST(ret, value)) { - I32 cf = PL_fold[value]; - ANYOF_BITMAP_SET(ret, cf); + IV fold = PL_fold[value]; + + if (fold != value) + ANYOF_BITMAP_SET(ret, fold); } } ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; } + /* optimize inverted simple patterns (e.g. [^a-z]) */ - if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { + if (!SIZE_ONLY && + /* If the only flag is inversion. */ + (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; ANYOF_FLAGS(ret) = 0; } - return ret; -} -STATIC regnode * -S_regclassutf8(pTHX_ RExC_state_t *pRExC_state) -{ - register char *e; - register U32 value; - register U32 lastvalue = OOB_UTF8; - register I32 range = 0; - register regnode *ret; - STRLEN numlen; - I32 n; - SV *listsv; - U8 flags = 0; - I32 namedclass; - char *rangebegin; - - if (*RExC_parse == '^') { /* Complement of range. */ - RExC_naughty++; - RExC_parse++; - if (!SIZE_ONLY) - flags |= ANYOF_INVERT; - } - if (!SIZE_ONLY) { - if (FOLD) - flags |= ANYOF_FOLD; - if (LOC) - flags |= ANYOF_LOCALE; - listsv = newSVpvn("# comment\n", 10); - } - - if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) - checkposixcc(pRExC_state); - - if (*RExC_parse == ']' || *RExC_parse == '-') - goto skipcond; /* allow 1st char to be ] or - */ - - while (RExC_parse < RExC_end && *RExC_parse != ']') { - skipcond: - namedclass = OOB_NAMEDCLASS; - if (!range) - rangebegin = RExC_parse; - value = utf8_to_uv((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, 0); - RExC_parse += numlen; - if (value == '[') - namedclass = regpposixcc(pRExC_state, value); - else if (value == '\\') { - value = (U32)utf8_to_uv((U8*)RExC_parse, - RExC_end - RExC_parse, - &numlen, 0); - RExC_parse += numlen; - /* Some compilers cannot handle switching on 64-bit integer - * values, therefore value cannot be an UV. Yes, this will - * be a problem later if we want switch on Unicode. --jhi */ - switch (value) { - case 'w': namedclass = ANYOF_ALNUM; break; - case 'W': namedclass = ANYOF_NALNUM; break; - case 's': namedclass = ANYOF_SPACE; break; - case 'S': namedclass = ANYOF_NSPACE; break; - case 'd': namedclass = ANYOF_DIGIT; break; - case 'D': namedclass = ANYOF_NDIGIT; break; - case 'p': - case 'P': - if (*RExC_parse == '{') { - e = strchr(RExC_parse++, '}'); - if (!e) - vFAIL("Missing right brace on \\p{}"); - n = e - RExC_parse; - } - else { - e = RExC_parse; - n = 1; - } - if (!SIZE_ONLY) { - if (value == 'p') - Perl_sv_catpvf(aTHX_ listsv, - "+utf8::%.*s\n", (int)n, RExC_parse); - else - Perl_sv_catpvf(aTHX_ listsv, - "!utf8::%.*s\n", (int)n, RExC_parse); - } - RExC_parse = e + 1; - lastvalue = OOB_UTF8; - continue; - case 'n': value = '\n'; break; - case 'r': value = '\r'; break; - case 't': value = '\t'; break; - case 'f': value = '\f'; break; - case 'b': value = '\b'; break; -#ifdef ASCIIish - case 'e': value = '\033'; break; - case 'a': value = '\007'; break; -#else - case 'e': value = '\047'; break; - case 'a': value = '\057'; break; -#endif - case 'x': - if (*RExC_parse == '{') { - e = strchr(RExC_parse++, '}'); - if (!e) - vFAIL("Missing right brace on \\x{}"); - numlen = 1; /* allow underscores */ - value = (UV)scan_hex(RExC_parse, - e - RExC_parse, - &numlen); - RExC_parse = e + 1; - } - else { - numlen = 0; /* disallow underscores */ - value = (UV)scan_hex(RExC_parse, 2, &numlen); - RExC_parse += numlen; - } - break; - case 'c': - value = UCHARAT(RExC_parse++); - value = toCTRL(value); - break; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - numlen = 0; /* disallow underscores */ - value = (UV)scan_oct(--RExC_parse, 3, &numlen); - RExC_parse += numlen; - break; - default: - if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) - vWARN2(RExC_parse, - "Unrecognized escape \\%c in character class passed through", - (int)value); - break; - } - } - if (namedclass > OOB_NAMEDCLASS) { - if (range) { /* a-\d, a-[:digit:] */ - if (!SIZE_ONLY) { - if (ckWARN(WARN_REGEXP)) - vWARN4(RExC_parse, - "False [] range \"%*.*s\"", - RExC_parse - rangebegin, - RExC_parse - rangebegin, - rangebegin); - Perl_sv_catpvf(aTHX_ listsv, - /* 0x002D is Unicode for '-' */ - "%04"UVxf"\n002D\n", (UV)lastvalue); - } - range = 0; - } - if (!SIZE_ONLY) { - switch (namedclass) { - case ANYOF_ALNUM: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; - case ANYOF_NALNUM: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; - case ANYOF_ALNUMC: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; - case ANYOF_NALNUMC: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; - case ANYOF_ALPHA: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; - case ANYOF_NALPHA: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; - case ANYOF_ASCII: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; - case ANYOF_NASCII: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; - case ANYOF_CNTRL: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; - case ANYOF_NCNTRL: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break; - case ANYOF_GRAPH: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; - case ANYOF_NGRAPH: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; - case ANYOF_DIGIT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break; - case ANYOF_NDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; - case ANYOF_LOWER: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; - case ANYOF_NLOWER: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; - case ANYOF_PRINT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; - case ANYOF_NPRINT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; - case ANYOF_PUNCT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; - case ANYOF_NPUNCT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; - case ANYOF_SPACE: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break; - case ANYOF_NSPACE: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break; - case ANYOF_BLANK: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; - case ANYOF_NBLANK: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; - case ANYOF_PSXSPC: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; - case ANYOF_NPSXSPC: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; - case ANYOF_UPPER: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; - case ANYOF_NUPPER: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; - case ANYOF_XDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; - case ANYOF_NXDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; - } - continue; - } - } - if (range) { - if (lastvalue > value) { /* b-a */ - Simple_vFAIL4("Invalid [] range \"%*.*s\"", - RExC_parse - rangebegin, - RExC_parse - rangebegin, - rangebegin); - } - range = 0; - } - else { - lastvalue = value; - if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && - RExC_parse[1] != ']') { - RExC_parse++; - if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ - if (ckWARN(WARN_REGEXP)) - vWARN4(RExC_parse, - "False [] range \"%*.*s\"", - RExC_parse - rangebegin, - RExC_parse - rangebegin, - rangebegin); - if (!SIZE_ONLY) - Perl_sv_catpvf(aTHX_ listsv, - /* 0x002D is Unicode for '-' */ - "002D\n"); - } else - range = 1; - continue; /* do it next time */ - } - } - /* now is the next time */ - if (!SIZE_ONLY) - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - (UV)lastvalue, (UV)value); - range = 0; - } - - ret = reganode(pRExC_state, ANYOFUTF8, 0); - - if (!SIZE_ONLY) { - SV *rv = swash_init("utf8", "", listsv, 1, 0); -#ifdef DEBUGGING + if (!SIZE_ONLY) { AV *av = newAV(); - av_push(av, rv); - av_push(av, listsv); - rv = newRV_inc((SV*)av); -#else - SvREFCNT_dec(listsv); -#endif + SV *rv; + + av_store(av, 0, listsv); + av_store(av, 1, NULL); + rv = newRV_noinc((SV*)av); n = add_data(pRExC_state, 1, "s"); RExC_rx->data->data[n] = (void*)rv; - ARG1_SET(ret, flags); - ARG2_SET(ret, n); + ARG_SET(ret, n); } return ret; @@ -4269,7 +4104,7 @@ Perl_regdump(pTHX_ regexp *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (isCNTRL(c) || c == 127 || c == 255) + if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); else if (c == '-' || c == ']' || c == '\\' || c == '^') Perl_sv_catpvf(aTHX_ sv, "\\%c", c); @@ -4311,8 +4146,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) { int i, rangestart = -1; - bool anyofutf8 = OP(o) == ANYOFUTF8; - U8 flags = anyofutf8 ? ARG1(o) : o->flags; + U8 flags = ANYOF_FLAGS(o); const char * const anyofs[] = { /* Should be syncronized with * ANYOF_ #xdefines in regcomp.h */ "\\w", @@ -4354,78 +4188,93 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) sv_catpv(sv, "^"); - if (OP(o) == ANYOF) { - for (i = 0; i <= 256; i++) { - if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) - put_byte(sv, rangestart); - else { + for (i = 0; i <= 256; i++) { + if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) put_byte(sv, rangestart); - sv_catpv(sv, "-"); - put_byte(sv, i - 1); - } - rangestart = -1; + else { + put_byte(sv, rangestart); + sv_catpv(sv, "-"); + put_byte(sv, i - 1); } + rangestart = -1; } - if (o->flags & ANYOF_CLASS) - for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++) - if (ANYOF_CLASS_TEST(o,i)) - sv_catpv(sv, anyofs[i]); } - else { - SV *rv = (SV*)PL_regdata->data[ARG2(o)]; - AV *av = (AV*)SvRV((SV*)rv); - SV *sw = *av_fetch(av, 0, FALSE); - SV *lv = *av_fetch(av, 1, FALSE); - UV i; - U8 s[UTF8_MAXLEN+1]; - for (i = 0; i <= 256; i++) { /* just the first 256 */ - U8 *e = uv_to_utf8(s, i); - if (i < 256 && swash_fetch(sw, s)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - U8 *p; - - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) { - for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++) - put_byte(sv, *p); + + if (o->flags & ANYOF_CLASS) + for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++) + if (ANYOF_CLASS_TEST(o,i)) + sv_catpv(sv, anyofs[i]); + + if (flags & ANYOF_UNICODE) + sv_catpv(sv, "{unicode}"); + + { + SV *lv; + SV *sw = regclass_swash(o, FALSE, &lv); + + if (lv) { + if (sw) { + UV i; + U8 s[UTF8_MAXLEN+1]; + + for (i = 0; i <= 256; i++) { /* just the first 256 */ + U8 *e = uv_to_utf8(s, i); + + if (i < 256 && swash_fetch(sw, s)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + U8 *p; + + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) { + for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++) + put_byte(sv, *p); + } + else { + for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++) + put_byte(sv, *p); + sv_catpv(sv, "-"); + for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++) + put_byte(sv, *p); + } + rangestart = -1; + } } - else { - for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++) - put_byte(sv, *p); - sv_catpv(sv, "-"); - for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++) - put_byte(sv, *p); - } - rangestart = -1; + + sv_catpv(sv, "..."); /* et cetera */ } - } - sv_catpv(sv, "..."); - { - char *s = savepv(SvPVX(lv)); - - while(*s && *s != '\n') s++; - if (*s == '\n') { - char *t = ++s; - while (*s) { - if (*s == '\n') - *s = ' '; - s++; + { + char *s = savepv(SvPVX(lv)); + char *origs = s; + + while(*s && *s != '\n') s++; + + if (*s == '\n') { + char *t = ++s; + + while (*s) { + if (*s == '\n') + *s = ' '; + s++; + } + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); } - if (s[-1] == ' ') - s[-1] = 0; - - sv_catpv(sv, t); + + Safefree(origs); } } } + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -4486,16 +4335,6 @@ Perl_pregfree(pTHX_ struct regexp *r) while (--n >= 0) { switch (r->data->what[n]) { case 's': -#ifdef DEBUGGING - { - SV *rv = (SV*)r->data->data[n]; - AV *av = (AV*)SvRV((SV*)rv); - SV *sw = *av_fetch(av, 0, FALSE); - SV *lv = *av_fetch(av, 1, FALSE); - SvREFCNT_dec(sw); - SvREFCNT_dec(lv); - } -#endif SvREFCNT_dec((SV*)r->data->data[n]); break; case 'f': @@ -4657,4 +4496,3 @@ clear_re(pTHXo_ void *r) { ReREFCNT_dec((regexp *)r); } - @@ -88,12 +88,13 @@ struct regnode_2 { }; #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ -#define ANYOF_CLASSBITMAP_SIZE 4 +#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ struct regnode_charclass { U8 flags; U8 type; U16 next_off; + U32 arg1; char bitmap[ANYOF_BITMAP_SIZE]; }; @@ -101,6 +102,7 @@ struct regnode_charclass_class { U8 flags; U8 type; U16 next_off; + U32 arg1; char bitmap[ANYOF_BITMAP_SIZE]; char classflags[ANYOF_CLASSBITMAP_SIZE]; }; @@ -180,13 +182,21 @@ struct regnode_charclass_class { /* Flags for node->flags of ANYOF */ -#define ANYOF_CLASS 0x08 -#define ANYOF_INVERT 0x04 -#define ANYOF_FOLD 0x02 -#define ANYOF_LOCALE 0x01 +#define ANYOF_CLASS 0x08 +#define ANYOF_INVERT 0x04 +#define ANYOF_FOLD 0x02 +#define ANYOF_LOCALE 0x01 /* Used for regstclass only */ -#define ANYOF_EOS 0x10 /* Can match an empty string too */ +#define ANYOF_EOS 0x10 /* Can match an empty string too */ + +/* There is a character or a range past 0xff */ +#define ANYOF_UNICODE 0x20 + +/* Are there any runtime flags on in this node? */ +#define ANYOF_RUNTIME(s) (ANYOF_FLAGS(s) & 0x0f) + +#define ANYOF_FLAGS_ALL 0xff /* Character classes for node->classflags of ANYOF */ /* Should be synchronized with a table in regprop() */ @@ -220,7 +230,7 @@ struct regnode_charclass_class { #define ANYOF_NXDIGIT 25 #define ANYOF_PSXSPC 26 /* POSIX space: \s plus the vertical tab */ #define ANYOF_NPSXSPC 27 -#define ANYOF_BLANK 28 /* GNU extension: space and tab */ +#define ANYOF_BLANK 28 /* GNU extension: space and tab: non-vertical space */ #define ANYOF_NBLANK 29 #define ANYOF_MAX 32 @@ -238,7 +248,6 @@ struct regnode_charclass_class { #define ANYOF_CLASS_SIZE (sizeof(struct regnode_charclass_class)) #define ANYOF_FLAGS(p) ((p)->flags) -#define ANYOF_FLAGS_ALL 0xff #define ANYOF_BIT(c) (1 << ((c) & 7)) @@ -300,12 +309,14 @@ EXTCONST U8 PL_varies[] = { EXTCONST U8 PL_simple[]; #else EXTCONST U8 PL_simple[] = { - REG_ANY, ANYUTF8, SANY, SANYUTF8, ANYOF, ANYOFUTF8, - ALNUM, ALNUMUTF8, ALNUML, ALNUMLUTF8, - NALNUM, NALNUMUTF8, NALNUML, NALNUMLUTF8, - SPACE, SPACEUTF8, SPACEL, SPACELUTF8, - NSPACE, NSPACEUTF8, NSPACEL, NSPACELUTF8, - DIGIT, DIGITUTF8, NDIGIT, NDIGITUTF8, 0 + REG_ANY, SANY, + ANYOF, + ALNUM, ALNUML, + NALNUM, NALNUML, + SPACE, SPACEL, + NSPACE, NSPACEL, + DIGIT, NDIGIT, + 0 }; #endif diff --git a/regcomp.sym b/regcomp.sym index bb5f8f8482..59284f4b21 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -16,46 +16,27 @@ EOL EOL, no Match "" at end of line. MEOL EOL, no Same, assuming multiline. SEOL EOL, no Same, assuming singleline. BOUND BOUND, no Match "" at any word boundary -BOUNDUTF8 BOUND, no Match "" at any word boundary BOUNDL BOUND, no Match "" at any word boundary -BOUNDLUTF8 BOUND, no Match "" at any word boundary NBOUND NBOUND, no Match "" at any word non-boundary -NBOUNDUTF8 NBOUND, no Match "" at any word non-boundary NBOUNDL NBOUND, no Match "" at any word non-boundary -NBOUNDLUTF8 NBOUND, no Match "" at any word non-boundary GPOS GPOS, no Matches where last m//g left off. # [Special] alternatives REG_ANY REG_ANY, no Match any one character (except newline). -ANYUTF8 REG_ANY, no Match any one Unicode character (except newline). SANY REG_ANY, no Match any one character. -SANYUTF8 REG_ANY, no Match any one Unicode character. ANYOF ANYOF, sv Match character in (or not in) this class. -ANYOFUTF8 ANYOF, sv 1 Match character in (or not in) this class. ALNUM ALNUM, no Match any alphanumeric character -ALNUMUTF8 ALNUM, no Match any alphanumeric character in utf8 ALNUML ALNUM, no Match any alphanumeric char in locale -ALNUMLUTF8 ALNUM, no Match any alphanumeric char in locale+utf8 NALNUM NALNUM, no Match any non-alphanumeric character -NALNUMUTF8 NALNUM, no Match any non-alphanumeric character in utf8 NALNUML NALNUM, no Match any non-alphanumeric char in locale -NALNUMLUTF8 NALNUM, no Match any non-alphanumeric char in locale+utf8 SPACE SPACE, no Match any whitespace character -SPACEUTF8 SPACE, no Match any whitespace character in utf8 SPACEL SPACE, no Match any whitespace char in locale -SPACELUTF8 SPACE, no Match any whitespace char in locale+utf8 NSPACE NSPACE, no Match any non-whitespace character -NSPACEUTF8 NSPACE, no Match any non-whitespace character in utf8 NSPACEL NSPACE, no Match any non-whitespace char in locale -NSPACELUTF8 NSPACE, no Match any non-whitespace char in locale+utf8 DIGIT DIGIT, no Match any numeric character -DIGITUTF8 DIGIT, no Match any numeric character in utf8 DIGITL DIGIT, no Match any numeric character in locale -DIGITLUTF8 DIGIT, no Match any numeric character in locale+utf8 NDIGIT NDIGIT, no Match any non-numeric character -NDIGITUTF8 NDIGIT, no Match any non-numeric character in utf8 NDIGITL NDIGIT, no Match any non-numeric character in locale -NDIGITLUTF8 NDIGIT, no Match any non-numeric character in locale+utf8 CLUMP CLUMP, no Match any combining character sequence # BRANCH The set of branches constituting a single choice are hooked @@ -105,13 +105,6 @@ * Forwards. */ -#define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c) : ANYOF_BITMAP_TEST(p,c)) -#ifdef DEBUGGING -# define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch(*av_fetch((AV*)SvRV((SV*)PL_regdata->data[ARG2(f)]),0,FALSE),p)) -#else -# define REGINCLASSUTF8(f,p) (ARG1(f) ? reginclassutf8(f,p) : swash_fetch((SV*)PL_regdata->data[ARG2(f)],p)) -#endif - #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) @@ -738,7 +731,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, t = s; if (prog->reganch & ROPT_UTF8) { - PL_regdata = prog->data; /* Used by REGINCLASS UTF logic */ + PL_regdata = prog->data; PL_bostr = startpos; } s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1); @@ -840,25 +833,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta unsigned int c2; char *e; register I32 tmp = 1; /* Scratch variable? */ + register bool do_utf8 = DO_UTF8(PL_reg_sv); /* We know what class it must start with. */ switch (OP(c)) { - case ANYOFUTF8: - while (s < strend) { - if (REGINCLASSUTF8(c, (U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += UTF8SKIP(s); - } - break; case ANYOF: while (s < strend) { - if (REGINCLASS(c, *(U8*)s)) { + if (reginclass(c, (U8*)s, do_utf8)) { if (tmp && (norun || regtry(prog, s))) goto got_it; else @@ -866,7 +847,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else tmp = 1; - s++; + s += do_utf8 ? UTF8SKIP(s) : 1; } break; case EXACTF: @@ -912,42 +893,40 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUND: - tmp = (s != startpos) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); - while (s < strend) { - if (tmp == !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { - tmp = !tmp; - if ((norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + if (s == startpos) + tmp = '\n'; + else { + U8 *r = reghop((U8*)s, -1); + + tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); + } + tmp = ((OP(c) == BOUND ? + isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); + while (s < strend) { + if (tmp == !(OP(c) == BOUND ? + swash_fetch(PL_utf8_alnum, (U8*)s) : + isALNUM_LC_utf8((U8*)s))) + { + tmp = !tmp; + if ((norun || regtry(prog, s))) + goto got_it; + } + s += UTF8SKIP(s); } - s++; } - if ((!prog->minlen && tmp) && (norun || regtry(prog, s))) - goto got_it; - break; - case BOUNDLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case BOUNDUTF8: - if (s == startpos) - tmp = '\n'; else { - U8 *r = reghop((U8*)s, -1); - - tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); - } - tmp = ((OP(c) == BOUNDUTF8 ? - isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); - while (s < strend) { - if (tmp == !(OP(c) == BOUNDUTF8 ? - swash_fetch(PL_utf8_alnum, (U8*)s) : - isALNUM_LC_utf8((U8*)s))) - { - tmp = !tmp; - if ((norun || regtry(prog, s))) - goto got_it; + tmp = (s != startpos) ? UCHARAT(s - 1) : '\n'; + tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); + while (s < strend) { + if (tmp == + !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { + tmp = !tmp; + if ((norun || regtry(prog, s))) + goto got_it; + } + s++; } - s += UTF8SKIP(s); } if ((!prog->minlen && tmp) && (norun || regtry(prog, s))) goto got_it; @@ -956,365 +935,382 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case NBOUND: - tmp = (s != startpos) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == NBOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); - while (s < strend) { - if (tmp == !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) - tmp = !tmp; - else if ((norun || regtry(prog, s))) - goto got_it; - s++; + if (do_utf8) { + if (s == startpos) + tmp = '\n'; + else { + U8 *r = reghop((U8*)s, -1); + + tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); + } + tmp = ((OP(c) == NBOUND ? + isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); + while (s < strend) { + if (tmp == !(OP(c) == NBOUND ? + swash_fetch(PL_utf8_alnum, (U8*)s) : + isALNUM_LC_utf8((U8*)s))) + tmp = !tmp; + else if ((norun || regtry(prog, s))) + goto got_it; + s += UTF8SKIP(s); + } } - if ((!prog->minlen && !tmp) && (norun || regtry(prog, s))) - goto got_it; - break; - case NBOUNDLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NBOUNDUTF8: - if (s == startpos) - tmp = '\n'; else { - U8 *r = reghop((U8*)s, -1); - - tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); - } - tmp = ((OP(c) == NBOUNDUTF8 ? - isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0); - while (s < strend) { - if (tmp == !(OP(c) == NBOUNDUTF8 ? - swash_fetch(PL_utf8_alnum, (U8*)s) : - isALNUM_LC_utf8((U8*)s))) - tmp = !tmp; - else if ((norun || regtry(prog, s))) - goto got_it; - s += UTF8SKIP(s); + tmp = (s != startpos) ? UCHARAT(s - 1) : '\n'; + tmp = ((OP(c) == NBOUND ? + isALNUM(tmp) : isALNUM_LC(tmp)) != 0); + while (s < strend) { + if (tmp == + !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) + tmp = !tmp; + else if ((norun || regtry(prog, s))) + goto got_it; + s++; + } } if ((!prog->minlen && !tmp) && (norun || regtry(prog, s))) goto got_it; break; case ALNUM: - while (s < strend) { - if (isALNUM(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (swash_fetch(PL_utf8_alnum, (U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case ALNUMUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_alnum, (U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isALNUM(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case ALNUML: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isALNUM_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (isALNUM_LC_utf8((U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case ALNUMLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isALNUM_LC_utf8((U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isALNUM_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NALNUM: - while (s < strend) { - if (!isALNUM(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!swash_fetch(PL_utf8_alnum, (U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NALNUMUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_alnum, (U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isALNUM(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NALNUML: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isALNUM_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!isALNUM_LC_utf8((U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NALNUMLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isALNUM_LC_utf8((U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isALNUM_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case SPACE: - while (s < strend) { - if (isSPACE(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case SPACEUTF8: - while (s < strend) { - if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isSPACE(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case SPACEL: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isSPACE_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case SPACELUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isSPACE_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NSPACE: - while (s < strend) { - if (!isSPACE(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NSPACEUTF8: - while (s < strend) { - if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s))) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isSPACE(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NSPACEL: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isSPACE_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NSPACELUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isSPACE_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case DIGIT: - while (s < strend) { - if (isDIGIT(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (swash_fetch(PL_utf8_digit,(U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case DIGITUTF8: - while (s < strend) { - if (swash_fetch(PL_utf8_digit,(U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isDIGIT(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case DIGITL: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isDIGIT_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (isDIGIT_LC_utf8((U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case DIGITLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (isDIGIT_LC_utf8((U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (isDIGIT_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NDIGIT: - while (s < strend) { - if (!isDIGIT(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!swash_fetch(PL_utf8_digit,(U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NDIGITUTF8: - while (s < strend) { - if (!swash_fetch(PL_utf8_digit,(U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isDIGIT(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; case NDIGITL: PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isDIGIT_LC(*s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + if (do_utf8) { + while (s < strend) { + if (!isDIGIT_LC_utf8((U8*)s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s += UTF8SKIP(s); } - else - tmp = 1; - s++; } - break; - case NDIGITLUTF8: - PL_reg_flags |= RF_tainted; - while (s < strend) { - if (!isDIGIT_LC_utf8((U8*)s)) { - if (tmp && (norun || regtry(prog, s))) - goto got_it; + else { + while (s < strend) { + if (!isDIGIT_LC(*s)) { + if (tmp && (norun || regtry(prog, s))) + goto got_it; + else + tmp = doevery; + } else - tmp = doevery; + tmp = 1; + s++; } - else - tmp = 1; - s += UTF8SKIP(s); } break; default: @@ -1606,6 +1602,11 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT) /* don't bother with what can't match */ strend = HOPc(strend, -(minlen - 1)); + DEBUG_r({ + SV *prop = sv_newmortal(); + regprop(prop, c); + PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), s); + }); if (find_byclass(prog, c, s, strend, startpos, 0)) goto got_it; DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n")); @@ -1619,7 +1620,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * last = screaminstr(sv, prog->float_substr, s - strbeg, end_shift, &scream_pos, 1); /* last one */ if (!last) - last = scream_olds; /* Only one occurence. */ + last = scream_olds; /* Only one occurrence. */ } else { STRLEN len; @@ -1891,6 +1892,7 @@ S_regmatch(pTHX_ regnode *prog) int minmod = 0, sw = 0, logical = 0; I32 unwind = 0; I32 firstcp = PL_savestack_ix; + register bool do_utf8 = DO_UTF8(PL_reg_sv); #ifdef DEBUGGING PL_regindent++; @@ -2009,8 +2011,8 @@ S_regmatch(pTHX_ regnode *prog) if (PL_regeol != locinput) sayNO; break; - case SANYUTF8: - if (nextchr & 0x80) { + case SANY: + if (DO_UTF8(PL_reg_sv)) { locinput += PL_utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; @@ -2021,13 +2023,8 @@ S_regmatch(pTHX_ regnode *prog) sayNO; nextchr = UCHARAT(++locinput); break; - case SANY: - if (!nextchr && locinput >= PL_regeol) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case ANYUTF8: - if (nextchr & 0x80) { + case REG_ANY: + if (DO_UTF8(PL_reg_sv)) { locinput += PL_utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; @@ -2038,11 +2035,6 @@ S_regmatch(pTHX_ regnode *prog) sayNO; nextchr = UCHARAT(++locinput); break; - case REG_ANY: - if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') - sayNO; - nextchr = UCHARAT(++locinput); - break; case EXACT: s = STRING(scan); ln = STR_LEN(scan); @@ -2099,22 +2091,24 @@ S_regmatch(pTHX_ regnode *prog) locinput += ln; nextchr = UCHARAT(locinput); break; - case ANYOFUTF8: - if (!REGINCLASSUTF8(scan, (U8*)locinput)) - sayNO; - if (locinput >= PL_regeol) - sayNO; - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; case ANYOF: - if (nextchr < 0) + if (do_utf8) { + if (!reginclass(scan, (U8*)locinput, do_utf8)) + sayNO; + if (locinput >= PL_regeol) + sayNO; + locinput += PL_utf8skip[nextchr]; nextchr = UCHARAT(locinput); - if (!REGINCLASS(scan, nextchr)) - sayNO; - if (!nextchr && locinput >= PL_regeol) - sayNO; - nextchr = UCHARAT(++locinput); + } + else { + if (nextchr < 0) + nextchr = UCHARAT(locinput); + if (!reginclass(scan, (U8*)locinput, do_utf8)) + sayNO; + if (!nextchr && locinput >= PL_regeol) + sayNO; + nextchr = UCHARAT(++locinput); + } break; case ALNUML: PL_reg_flags |= RF_tainted; @@ -2122,19 +2116,8 @@ S_regmatch(pTHX_ regnode *prog) case ALNUM: if (!nextchr) sayNO; - if (!(OP(scan) == ALNUM - ? isALNUM(nextchr) : isALNUM_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case ALNUMLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case ALNUMUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == ALNUMUTF8 + if (do_utf8) { + if (!(OP(scan) == ALNUM ? swash_fetch(PL_utf8_alnum, (U8*)locinput) : isALNUM_LC_utf8((U8*)locinput))) { @@ -2144,7 +2127,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (!(OP(scan) == ALNUMUTF8 + if (!(OP(scan) == ALNUM ? isALNUM(nextchr) : isALNUM_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); @@ -2155,19 +2138,8 @@ S_regmatch(pTHX_ regnode *prog) case NALNUM: if (!nextchr && locinput >= PL_regeol) sayNO; - if (OP(scan) == NALNUM - ? isALNUM(nextchr) : isALNUM_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NALNUMLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NALNUMUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (OP(scan) == NALNUMUTF8 + if (do_utf8) { + if (OP(scan) == NALNUM ? swash_fetch(PL_utf8_alnum, (U8*)locinput) : isALNUM_LC_utf8((U8*)locinput)) { @@ -2177,7 +2149,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (OP(scan) == NALNUMUTF8 + if (OP(scan) == NALNUM ? isALNUM(nextchr) : isALNUM_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); @@ -2189,42 +2161,38 @@ S_regmatch(pTHX_ regnode *prog) case BOUND: case NBOUND: /* was last char in word? */ - ln = (locinput != PL_regbol) ? UCHARAT(locinput - 1) : PL_regprev; - if (OP(scan) == BOUND || OP(scan) == NBOUND) { - ln = isALNUM(ln); - n = isALNUM(nextchr); - } - else { - ln = isALNUM_LC(ln); - n = isALNUM_LC(nextchr); - } - if (((!ln) == (!n)) == (OP(scan) == BOUND || OP(scan) == BOUNDL)) - sayNO; - break; - case BOUNDLUTF8: - case NBOUNDLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case BOUNDUTF8: - case NBOUNDUTF8: - /* was last char in word? */ - if (locinput == PL_regbol) - ln = PL_regprev; - else { - U8 *r = reghop((U8*)locinput, -1); - - ln = utf8_to_uv(r, s - (char*)r, 0, 0); - } - if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) { - ln = isALNUM_uni(ln); - n = swash_fetch(PL_utf8_alnum, (U8*)locinput); + if (do_utf8) { + if (locinput == PL_regbol) + ln = PL_regprev; + else { + U8 *r = reghop((U8*)locinput, -1); + + ln = utf8_to_uv(r, s - (char*)r, 0, 0); + } + if (OP(scan) == BOUND || OP(scan) == NBOUND) { + ln = isALNUM_uni(ln); + n = swash_fetch(PL_utf8_alnum, (U8*)locinput); + } + else { + ln = isALNUM_LC_uni(ln); + n = isALNUM_LC_utf8((U8*)locinput); + } } else { - ln = isALNUM_LC_uni(ln); - n = isALNUM_LC_utf8((U8*)locinput); + ln = (locinput != PL_regbol) ? + UCHARAT(locinput - 1) : PL_regprev; + if (OP(scan) == BOUND || OP(scan) == NBOUND) { + ln = isALNUM(ln); + n = isALNUM(nextchr); + } + else { + ln = isALNUM_LC(ln); + n = isALNUM_LC(nextchr); + } } - if (((!ln) == (!n)) == (OP(scan) == BOUNDUTF8 || OP(scan) == BOUNDLUTF8)) - sayNO; + if (((!ln) == (!n)) == (OP(scan) == BOUND || + OP(scan) == BOUNDL)) + sayNO; break; case SPACEL: PL_reg_flags |= RF_tainted; @@ -2232,32 +2200,29 @@ S_regmatch(pTHX_ regnode *prog) case SPACE: if (!nextchr) sayNO; - if (!(OP(scan) == SPACE - ? isSPACE(nextchr) : isSPACE_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case SPACELUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case SPACEUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == SPACEUTF8 - ? swash_fetch(PL_utf8_space, (U8*)locinput) - : isSPACE_LC_utf8((U8*)locinput))) - { - sayNO; + if (DO_UTF8(PL_reg_sv)) { + if (nextchr & 0x80) { + if (!(OP(scan) == SPACE + ? swash_fetch(PL_utf8_space, (U8*)locinput) + : isSPACE_LC_utf8((U8*)locinput))) + { + sayNO; + } + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + break; } - locinput += PL_utf8skip[nextchr]; - nextchr = UCHARAT(locinput); - break; + if (!(OP(scan) == SPACE + ? isSPACE(nextchr) : isSPACE_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); + } + else { + if (!(OP(scan) == SPACE + ? isSPACE(nextchr) : isSPACE_LC(nextchr))) + sayNO; + nextchr = UCHARAT(++locinput); } - if (!(OP(scan) == SPACEUTF8 - ? isSPACE(nextchr) : isSPACE_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); break; case NSPACEL: PL_reg_flags |= RF_tainted; @@ -2265,19 +2230,8 @@ S_regmatch(pTHX_ regnode *prog) case NSPACE: if (!nextchr && locinput >= PL_regeol) sayNO; - if (OP(scan) == NSPACE - ? isSPACE(nextchr) : isSPACE_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NSPACELUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NSPACEUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (OP(scan) == NSPACEUTF8 + if (DO_UTF8(PL_reg_sv)) { + if (OP(scan) == NSPACE ? swash_fetch(PL_utf8_space, (U8*)locinput) : isSPACE_LC_utf8((U8*)locinput)) { @@ -2287,7 +2241,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (OP(scan) == NSPACEUTF8 + if (OP(scan) == NSPACE ? isSPACE(nextchr) : isSPACE_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); @@ -2298,19 +2252,8 @@ S_regmatch(pTHX_ regnode *prog) case DIGIT: if (!nextchr) sayNO; - if (!(OP(scan) == DIGIT - ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case DIGITLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case DIGITUTF8: - if (!nextchr) - sayNO; - if (nextchr & 0x80) { - if (!(OP(scan) == DIGITUTF8 + if (DO_UTF8(PL_reg_sv)) { + if (!(OP(scan) == DIGIT ? swash_fetch(PL_utf8_digit, (U8*)locinput) : isDIGIT_LC_utf8((U8*)locinput))) { @@ -2320,7 +2263,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (!(OP(scan) == DIGITUTF8 + if (!(OP(scan) == DIGIT ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) sayNO; nextchr = UCHARAT(++locinput); @@ -2331,19 +2274,8 @@ S_regmatch(pTHX_ regnode *prog) case NDIGIT: if (!nextchr && locinput >= PL_regeol) sayNO; - if (OP(scan) == NDIGIT - ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)) - sayNO; - nextchr = UCHARAT(++locinput); - break; - case NDIGITLUTF8: - PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NDIGITUTF8: - if (!nextchr && locinput >= PL_regeol) - sayNO; - if (nextchr & 0x80) { - if (OP(scan) == NDIGITUTF8 + if (DO_UTF8(PL_reg_sv)) { + if (OP(scan) == NDIGIT ? swash_fetch(PL_utf8_digit, (U8*)locinput) : isDIGIT_LC_utf8((U8*)locinput)) { @@ -2353,7 +2285,7 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(locinput); break; } - if (OP(scan) == NDIGITUTF8 + if (OP(scan) == NDIGIT ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)) sayNO; nextchr = UCHARAT(++locinput); @@ -3461,30 +3393,33 @@ S_regrepeat(pTHX_ regnode *p, I32 max) register I32 c; register char *loceol = PL_regeol; register I32 hardcount = 0; + register bool do_utf8 = DO_UTF8(PL_reg_sv); scan = PL_reginput; if (max != REG_INFTY && max < loceol - scan) loceol = scan + max; switch (OP(p)) { case REG_ANY: - while (scan < loceol && *scan != '\n') - scan++; - break; - case SANY: - scan = loceol; - break; - case ANYUTF8: - loceol = PL_regeol; - while (scan < loceol && *scan != '\n') { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && *scan != '\n') { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && *scan != '\n') + scan++; } break; - case SANYUTF8: - loceol = PL_regeol; - while (scan < loceol) { - scan += UTF8SKIP(scan); - hardcount++; + case SANY: + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + scan = loceol; } break; case EXACT: /* length of string is 1 */ @@ -3505,135 +3440,144 @@ S_regrepeat(pTHX_ regnode *p, I32 max) (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) scan++; break; - case ANYOFUTF8: - loceol = PL_regeol; - while (scan < loceol && REGINCLASSUTF8(p, (U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; - } - break; case ANYOF: - while (scan < loceol && REGINCLASS(p, *scan)) - scan++; + if (do_utf8) { + loceol = PL_regeol; + while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) + scan++; + } break; case ALNUM: - while (scan < loceol && isALNUM(*scan)) - scan++; - break; - case ALNUMUTF8: - loceol = PL_regeol; - while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isALNUM(*scan)) + scan++; } break; case ALNUML: PL_reg_flags |= RF_tainted; - while (scan < loceol && isALNUM_LC(*scan)) - scan++; - break; - case ALNUMLUTF8: - PL_reg_flags |= RF_tainted; - loceol = PL_regeol; - while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isALNUM_LC(*scan)) + scan++; } break; - break; case NALNUM: - while (scan < loceol && !isALNUM(*scan)) - scan++; - break; - case NALNUMUTF8: - loceol = PL_regeol; - while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isALNUM(*scan)) + scan++; } break; case NALNUML: PL_reg_flags |= RF_tainted; - while (scan < loceol && !isALNUM_LC(*scan)) - scan++; - break; - case NALNUMLUTF8: - PL_reg_flags |= RF_tainted; - loceol = PL_regeol; - while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isALNUM_LC(*scan)) + scan++; } break; case SPACE: - while (scan < loceol && isSPACE(*scan)) - scan++; - break; - case SPACEUTF8: - loceol = PL_regeol; - while (scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && + (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isSPACE(*scan)) + scan++; } break; case SPACEL: PL_reg_flags |= RF_tainted; - while (scan < loceol && isSPACE_LC(*scan)) - scan++; - break; - case SPACELUTF8: - PL_reg_flags |= RF_tainted; - loceol = PL_regeol; - while (scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && + (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isSPACE_LC(*scan)) + scan++; } break; case NSPACE: - while (scan < loceol && !isSPACE(*scan)) - scan++; - break; - case NSPACEUTF8: - loceol = PL_regeol; - while (scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && + !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isSPACE(*scan)) + scan++; + break; } - break; case NSPACEL: PL_reg_flags |= RF_tainted; - while (scan < loceol && !isSPACE_LC(*scan)) - scan++; - break; - case NSPACELUTF8: - PL_reg_flags |= RF_tainted; - loceol = PL_regeol; - while (scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && + !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isSPACE_LC(*scan)) + scan++; } break; case DIGIT: - while (scan < loceol && isDIGIT(*scan)) - scan++; - break; - case DIGITUTF8: - loceol = PL_regeol; - while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isDIGIT(*scan)) + scan++; } break; - break; case NDIGIT: - while (scan < loceol && !isDIGIT(*scan)) - scan++; - break; - case NDIGITUTF8: - loceol = PL_regeol; - while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) { - scan += UTF8SKIP(scan); - hardcount++; + if (DO_UTF8(PL_reg_sv)) { + loceol = PL_regeol; + while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isDIGIT(*scan)) + scan++; } break; default: /* Called on something of 0 width. */ @@ -3712,102 +3656,139 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp) } /* +- regclass_swash - prepare the utf8 swash +*/ + +SV * +Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp) +{ + SV *sw = NULL; + SV *si = NULL; + + if (PL_regdata && PL_regdata->count) { + U32 n = ARG(node); + + if (PL_regdata->what[n] == 's') { + SV *rv = (SV*)PL_regdata->data[n]; + AV *av = (AV*)SvRV((SV*)rv); + SV **a; + + si = *av_fetch(av, 0, FALSE); + a = av_fetch(av, 1, FALSE); + + if (a) + sw = *a; + else if (si && doinit) { + sw = swash_init("utf8", "", si, 1, 0); + (void)av_store(av, 1, sw); + } + } + } + + if (initsvp) + *initsvp = si; + + return sw; +} + +/* - reginclass - determine if a character falls into a character class */ STATIC bool -S_reginclass(pTHX_ register regnode *p, register I32 c) +S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) { - char flags = ANYOF_FLAGS(p); + char flags = ANYOF_FLAGS(n); bool match = FALSE; - c &= 0xFF; - if (ANYOF_BITMAP_TEST(p, c)) - match = TRUE; - else if (flags & ANYOF_FOLD) { - I32 cf; - if (flags & ANYOF_LOCALE) { - PL_reg_flags |= RF_tainted; - cf = PL_fold_locale[c]; + if (do_utf8 || (flags & ANYOF_UNICODE)) { + if (do_utf8 && !ANYOF_RUNTIME(n)) { + STRLEN len; + UV c = utf8_to_uv_simple(p, &len); + + if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) + match = TRUE; } - else - cf = PL_fold[c]; - if (ANYOF_BITMAP_TEST(p, cf)) - match = TRUE; - } - if (!match && (flags & ANYOF_CLASS)) { - PL_reg_flags |= RF_tainted; - if ( - (ANYOF_CLASS_TEST(p, ANYOF_ALNUM) && isALNUM_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NALNUM) && !isALNUM_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_SPACE) && isSPACE_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NSPACE) && !isSPACE_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_DIGIT) && isDIGIT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_ALNUMC) && isALNUMC_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_ALPHA) && isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NALPHA) && !isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_ASCII) && isASCII(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NASCII) && !isASCII(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_CNTRL) && isCNTRL_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_GRAPH) && isGRAPH_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_LOWER) && isLOWER_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NLOWER) && !isLOWER_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_PRINT) && isPRINT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NPRINT) && !isPRINT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_PUNCT) && isPUNCT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_UPPER) && isUPPER_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NUPPER) && !isUPPER_LC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_XDIGIT) && isXDIGIT(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NXDIGIT) && !isXDIGIT(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_PSXSPC) && isPSXSPC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NPSXSPC) && !isPSXSPC(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_BLANK) && isBLANK(c)) || - (ANYOF_CLASS_TEST(p, ANYOF_NBLANK) && !isBLANK(c)) - ) /* How's that for a conditional? */ - { - match = TRUE; + if (!match) { + SV *sw = regclass_swash(n, TRUE, 0); + + if (sw) { + if (swash_fetch(sw, p)) + match = TRUE; + else if (flags & ANYOF_FOLD) { + U8 tmpbuf[UTF8_MAXLEN+1]; + + if (flags & ANYOF_LOCALE) { + PL_reg_flags |= RF_tainted; + uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p)); + } + else + uv_to_utf8(tmpbuf, toLOWER_utf8(p)); + if (swash_fetch(sw, tmpbuf)) + match = TRUE; + } + } } } + else { + U8 c = *p; - return (flags & ANYOF_INVERT) ? !match : match; -} - -STATIC bool -S_reginclassutf8(pTHX_ regnode *f, U8 *p) -{ - char flags = ARG1(f); - bool match = FALSE; -#ifdef DEBUGGING - SV *rv = (SV*)PL_regdata->data[ARG2(f)]; - AV *av = (AV*)SvRV((SV*)rv); - SV *sw = *av_fetch(av, 0, FALSE); - SV *lv = *av_fetch(av, 1, FALSE); -#else - SV *sw = (SV*)PL_regdata->data[ARG2(f)]; -#endif + if (ANYOF_BITMAP_TEST(n, c)) + match = TRUE; + else if (flags & ANYOF_FOLD) { + I32 f; - if (swash_fetch(sw, p)) - match = TRUE; - else if (flags & ANYOF_FOLD) { - U8 tmpbuf[UTF8_MAXLEN+1]; - if (flags & ANYOF_LOCALE) { + if (flags & ANYOF_LOCALE) { + PL_reg_flags |= RF_tainted; + f = PL_fold_locale[c]; + } + else + f = PL_fold[c]; + if (f != c && ANYOF_BITMAP_TEST(n, f)) + match = TRUE; + } + + if (!match && (flags & ANYOF_CLASS)) { PL_reg_flags |= RF_tainted; - uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p)); + if ( + (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) + ) /* How's that for a conditional? */ + { + match = TRUE; + } } - else - uv_to_utf8(tmpbuf, toLOWER_utf8(p)); - if (swash_fetch(sw, tmpbuf)) - match = TRUE; } - /* UTF8 combined with ANYOF_CLASS is ill-defined. */ - return (flags & ANYOF_INVERT) ? !match : match; } @@ -3815,17 +3796,20 @@ STATIC U8 * S_reghop(pTHX_ U8 *s, I32 off) { if (off >= 0) { - while (off-- && s < (U8*)PL_regeol) + while (off-- && s < (U8*)PL_regeol) { + /* XXX could check well-formedness here */ s += UTF8SKIP(s); + } } else { while (off++) { if (s > (U8*)PL_bostr) { s--; - if (*s & 0x80) { - while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80) + if (UTF8_IS_CONTINUED(*s)) { + while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s)) s--; - } /* XXX could check well-formedness here */ + } + /* XXX could check well-formedness here */ } } } @@ -3836,8 +3820,10 @@ STATIC U8 * S_reghopmaybe(pTHX_ U8* s, I32 off) { if (off >= 0) { - while (off-- && s < (U8*)PL_regeol) + while (off-- && s < (U8*)PL_regeol) { + /* XXX could check well-formedness here */ s += UTF8SKIP(s); + } if (off >= 0) return 0; } @@ -3845,10 +3831,11 @@ S_reghopmaybe(pTHX_ U8* s, I32 off) while (off++) { if (s > (U8*)PL_bostr) { s--; - if (*s & 0x80) { - while (s > (U8*)PL_bostr && (*s & 0xc0) == 0x80) + if (UTF8_IS_CONTINUED(*s)) { + while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s)) s--; - } /* XXX could check well-formedness here */ + } + /* XXX could check well-formedness here */ } else break; diff --git a/regnodes.h b/regnodes.h index 89c78e6bac..00dc0ecaec 100644 --- a/regnodes.h +++ b/regnodes.h @@ -13,76 +13,57 @@ #define MEOL 7 /* 0x7 Same, assuming multiline. */ #define SEOL 8 /* 0x8 Same, assuming singleline. */ #define BOUND 9 /* 0x9 Match "" at any word boundary */ -#define BOUNDUTF8 10 /* 0xa Match "" at any word boundary */ -#define BOUNDL 11 /* 0xb Match "" at any word boundary */ -#define BOUNDLUTF8 12 /* 0xc Match "" at any word boundary */ -#define NBOUND 13 /* 0xd Match "" at any word non-boundary */ -#define NBOUNDUTF8 14 /* 0xe Match "" at any word non-boundary */ -#define NBOUNDL 15 /* 0xf Match "" at any word non-boundary */ -#define NBOUNDLUTF8 16 /* 0x10 Match "" at any word non-boundary */ -#define GPOS 17 /* 0x11 Matches where last m//g left off. */ -#define REG_ANY 18 /* 0x12 Match any one character (except newline). */ -#define ANYUTF8 19 /* 0x13 Match any one Unicode character (except newline). */ -#define SANY 20 /* 0x14 Match any one character. */ -#define SANYUTF8 21 /* 0x15 Match any one Unicode character. */ -#define ANYOF 22 /* 0x16 Match character in (or not in) this class. */ -#define ANYOFUTF8 23 /* 0x17 Match character in (or not in) this class. */ -#define ALNUM 24 /* 0x18 Match any alphanumeric character */ -#define ALNUMUTF8 25 /* 0x19 Match any alphanumeric character in utf8 */ -#define ALNUML 26 /* 0x1a Match any alphanumeric char in locale */ -#define ALNUMLUTF8 27 /* 0x1b Match any alphanumeric char in locale+utf8 */ -#define NALNUM 28 /* 0x1c Match any non-alphanumeric character */ -#define NALNUMUTF8 29 /* 0x1d Match any non-alphanumeric character in utf8 */ -#define NALNUML 30 /* 0x1e Match any non-alphanumeric char in locale */ -#define NALNUMLUTF8 31 /* 0x1f Match any non-alphanumeric char in locale+utf8 */ -#define SPACE 32 /* 0x20 Match any whitespace character */ -#define SPACEUTF8 33 /* 0x21 Match any whitespace character in utf8 */ -#define SPACEL 34 /* 0x22 Match any whitespace char in locale */ -#define SPACELUTF8 35 /* 0x23 Match any whitespace char in locale+utf8 */ -#define NSPACE 36 /* 0x24 Match any non-whitespace character */ -#define NSPACEUTF8 37 /* 0x25 Match any non-whitespace character in utf8 */ -#define NSPACEL 38 /* 0x26 Match any non-whitespace char in locale */ -#define NSPACELUTF8 39 /* 0x27 Match any non-whitespace char in locale+utf8 */ -#define DIGIT 40 /* 0x28 Match any numeric character */ -#define DIGITUTF8 41 /* 0x29 Match any numeric character in utf8 */ -#define DIGITL 42 /* 0x2a Match any numeric character in locale */ -#define DIGITLUTF8 43 /* 0x2b Match any numeric character in locale+utf8 */ -#define NDIGIT 44 /* 0x2c Match any non-numeric character */ -#define NDIGITUTF8 45 /* 0x2d Match any non-numeric character in utf8 */ -#define NDIGITL 46 /* 0x2e Match any non-numeric character in locale */ -#define NDIGITLUTF8 47 /* 0x2f Match any non-numeric character in locale+utf8 */ -#define CLUMP 48 /* 0x30 Match any combining character sequence */ -#define BRANCH 49 /* 0x31 Match this alternative, or the next... */ -#define BACK 50 /* 0x32 Match "", "next" ptr points backward. */ -#define EXACT 51 /* 0x33 Match this string (preceded by length). */ -#define EXACTF 52 /* 0x34 Match this string, folded (prec. by length). */ -#define EXACTFL 53 /* 0x35 Match this string, folded in locale (w/len). */ -#define NOTHING 54 /* 0x36 Match empty string. */ -#define TAIL 55 /* 0x37 Match empty string. Can jump here from outside. */ -#define STAR 56 /* 0x38 Match this (simple) thing 0 or more times. */ -#define PLUS 57 /* 0x39 Match this (simple) thing 1 or more times. */ -#define CURLY 58 /* 0x3a Match this simple thing {n,m} times. */ -#define CURLYN 59 /* 0x3b Match next-after-this simple thing */ -#define CURLYM 60 /* 0x3c Match this medium-complex thing {n,m} times. */ -#define CURLYX 61 /* 0x3d Match this complex thing {n,m} times. */ -#define WHILEM 62 /* 0x3e Do curly processing and see if rest matches. */ -#define OPEN 63 /* 0x3f Mark this point in input as start of #n. */ -#define CLOSE 64 /* 0x40 Analogous to OPEN. */ -#define REF 65 /* 0x41 Match some already matched string */ -#define REFF 66 /* 0x42 Match already matched string, folded */ -#define REFFL 67 /* 0x43 Match already matched string, folded in loc. */ -#define IFMATCH 68 /* 0x44 Succeeds if the following matches. */ -#define UNLESSM 69 /* 0x45 Fails if the following matches. */ -#define SUSPEND 70 /* 0x46 "Independent" sub-RE. */ -#define IFTHEN 71 /* 0x47 Switch, should be preceeded by switcher . */ -#define GROUPP 72 /* 0x48 Whether the group matched. */ -#define LONGJMP 73 /* 0x49 Jump far away. */ -#define BRANCHJ 74 /* 0x4a BRANCH with long offset. */ -#define EVAL 75 /* 0x4b Execute some Perl code. */ -#define MINMOD 76 /* 0x4c Next operator is not greedy. */ -#define LOGICAL 77 /* 0x4d Next opcode should set the flag only. */ -#define RENUM 78 /* 0x4e Group with independently numbered parens. */ -#define OPTIMIZED 79 /* 0x4f Placeholder for dump. */ +#define BOUNDL 10 /* 0xa Match "" at any word boundary */ +#define NBOUND 11 /* 0xb Match "" at any word non-boundary */ +#define NBOUNDL 12 /* 0xc Match "" at any word non-boundary */ +#define GPOS 13 /* 0xd Matches where last m//g left off. */ +#define REG_ANY 14 /* 0xe Match any one character (except newline). */ +#define SANY 15 /* 0xf Match any one character. */ +#define ANYOF 16 /* 0x10 Match character in (or not in) this class. */ +#define ALNUM 17 /* 0x11 Match any alphanumeric character */ +#define ALNUML 18 /* 0x12 Match any alphanumeric char in locale */ +#define NALNUM 19 /* 0x13 Match any non-alphanumeric character */ +#define NALNUML 20 /* 0x14 Match any non-alphanumeric char in locale */ +#define SPACE 21 /* 0x15 Match any whitespace character */ +#define SPACEL 22 /* 0x16 Match any whitespace char in locale */ +#define NSPACE 23 /* 0x17 Match any non-whitespace character */ +#define NSPACEL 24 /* 0x18 Match any non-whitespace char in locale */ +#define DIGIT 25 /* 0x19 Match any numeric character */ +#define DIGITL 26 /* 0x1a Match any numeric character in locale */ +#define NDIGIT 27 /* 0x1b Match any non-numeric character */ +#define NDIGITL 28 /* 0x1c Match any non-numeric character in locale */ +#define CLUMP 29 /* 0x1d Match any combining character sequence */ +#define BRANCH 30 /* 0x1e Match this alternative, or the next... */ +#define BACK 31 /* 0x1f Match "", "next" ptr points backward. */ +#define EXACT 32 /* 0x20 Match this string (preceded by length). */ +#define EXACTF 33 /* 0x21 Match this string, folded (prec. by length). */ +#define EXACTFL 34 /* 0x22 Match this string, folded in locale (w/len). */ +#define NOTHING 35 /* 0x23 Match empty string. */ +#define TAIL 36 /* 0x24 Match empty string. Can jump here from outside. */ +#define STAR 37 /* 0x25 Match this (simple) thing 0 or more times. */ +#define PLUS 38 /* 0x26 Match this (simple) thing 1 or more times. */ +#define CURLY 39 /* 0x27 Match this simple thing {n,m} times. */ +#define CURLYN 40 /* 0x28 Match next-after-this simple thing */ +#define CURLYM 41 /* 0x29 Match this medium-complex thing {n,m} times. */ +#define CURLYX 42 /* 0x2a Match this complex thing {n,m} times. */ +#define WHILEM 43 /* 0x2b Do curly processing and see if rest matches. */ +#define OPEN 44 /* 0x2c Mark this point in input as start of #n. */ +#define CLOSE 45 /* 0x2d Analogous to OPEN. */ +#define REF 46 /* 0x2e Match some already matched string */ +#define REFF 47 /* 0x2f Match already matched string, folded */ +#define REFFL 48 /* 0x30 Match already matched string, folded in loc. */ +#define IFMATCH 49 /* 0x31 Succeeds if the following matches. */ +#define UNLESSM 50 /* 0x32 Fails if the following matches. */ +#define SUSPEND 51 /* 0x33 "Independent" sub-RE. */ +#define IFTHEN 52 /* 0x34 Switch, should be preceeded by switcher . */ +#define GROUPP 53 /* 0x35 Whether the group matched. */ +#define LONGJMP 54 /* 0x36 Jump far away. */ +#define BRANCHJ 55 /* 0x37 BRANCH with long offset. */ +#define EVAL 56 /* 0x38 Execute some Perl code. */ +#define MINMOD 57 /* 0x39 Next operator is not greedy. */ +#define LOGICAL 58 /* 0x3a Next opcode should set the flag only. */ +#define RENUM 59 /* 0x3b Group with independently numbered parens. */ +#define OPTIMIZED 60 /* 0x3c Placeholder for dump. */ #ifndef DOINIT EXTCONST U8 PL_regkind[]; @@ -98,44 +79,25 @@ EXTCONST U8 PL_regkind[] = { EOL, /* MEOL */ EOL, /* SEOL */ BOUND, /* BOUND */ - BOUND, /* BOUNDUTF8 */ BOUND, /* BOUNDL */ - BOUND, /* BOUNDLUTF8 */ NBOUND, /* NBOUND */ - NBOUND, /* NBOUNDUTF8 */ NBOUND, /* NBOUNDL */ - NBOUND, /* NBOUNDLUTF8 */ GPOS, /* GPOS */ REG_ANY, /* REG_ANY */ - REG_ANY, /* ANYUTF8 */ REG_ANY, /* SANY */ - REG_ANY, /* SANYUTF8 */ ANYOF, /* ANYOF */ - ANYOF, /* ANYOFUTF8 */ ALNUM, /* ALNUM */ - ALNUM, /* ALNUMUTF8 */ ALNUM, /* ALNUML */ - ALNUM, /* ALNUMLUTF8 */ NALNUM, /* NALNUM */ - NALNUM, /* NALNUMUTF8 */ NALNUM, /* NALNUML */ - NALNUM, /* NALNUMLUTF8 */ SPACE, /* SPACE */ - SPACE, /* SPACEUTF8 */ SPACE, /* SPACEL */ - SPACE, /* SPACELUTF8 */ NSPACE, /* NSPACE */ - NSPACE, /* NSPACEUTF8 */ NSPACE, /* NSPACEL */ - NSPACE, /* NSPACELUTF8 */ DIGIT, /* DIGIT */ - DIGIT, /* DIGITUTF8 */ DIGIT, /* DIGITL */ - DIGIT, /* DIGITLUTF8 */ NDIGIT, /* NDIGIT */ - NDIGIT, /* NDIGITUTF8 */ NDIGIT, /* NDIGITL */ - NDIGIT, /* NDIGITLUTF8 */ CLUMP, /* CLUMP */ BRANCH, /* BRANCH */ BACK, /* BACK */ @@ -184,44 +146,25 @@ static const U8 regarglen[] = { 0, /* MEOL */ 0, /* SEOL */ 0, /* BOUND */ - 0, /* BOUNDUTF8 */ 0, /* BOUNDL */ - 0, /* BOUNDLUTF8 */ 0, /* NBOUND */ - 0, /* NBOUNDUTF8 */ 0, /* NBOUNDL */ - 0, /* NBOUNDLUTF8 */ 0, /* GPOS */ 0, /* REG_ANY */ - 0, /* ANYUTF8 */ 0, /* SANY */ - 0, /* SANYUTF8 */ 0, /* ANYOF */ - EXTRA_SIZE(struct regnode_1), /* ANYOFUTF8 */ 0, /* ALNUM */ - 0, /* ALNUMUTF8 */ 0, /* ALNUML */ - 0, /* ALNUMLUTF8 */ 0, /* NALNUM */ - 0, /* NALNUMUTF8 */ 0, /* NALNUML */ - 0, /* NALNUMLUTF8 */ 0, /* SPACE */ - 0, /* SPACEUTF8 */ 0, /* SPACEL */ - 0, /* SPACELUTF8 */ 0, /* NSPACE */ - 0, /* NSPACEUTF8 */ 0, /* NSPACEL */ - 0, /* NSPACELUTF8 */ 0, /* DIGIT */ - 0, /* DIGITUTF8 */ 0, /* DIGITL */ - 0, /* DIGITLUTF8 */ 0, /* NDIGIT */ - 0, /* NDIGITUTF8 */ 0, /* NDIGITL */ - 0, /* NDIGITLUTF8 */ 0, /* CLUMP */ 0, /* BRANCH */ 0, /* BACK */ @@ -267,44 +210,25 @@ static const char reg_off_by_arg[] = { 0, /* MEOL */ 0, /* SEOL */ 0, /* BOUND */ - 0, /* BOUNDUTF8 */ 0, /* BOUNDL */ - 0, /* BOUNDLUTF8 */ 0, /* NBOUND */ - 0, /* NBOUNDUTF8 */ 0, /* NBOUNDL */ - 0, /* NBOUNDLUTF8 */ 0, /* GPOS */ 0, /* REG_ANY */ - 0, /* ANYUTF8 */ 0, /* SANY */ - 0, /* SANYUTF8 */ 0, /* ANYOF */ - 0, /* ANYOFUTF8 */ 0, /* ALNUM */ - 0, /* ALNUMUTF8 */ 0, /* ALNUML */ - 0, /* ALNUMLUTF8 */ 0, /* NALNUM */ - 0, /* NALNUMUTF8 */ 0, /* NALNUML */ - 0, /* NALNUMLUTF8 */ 0, /* SPACE */ - 0, /* SPACEUTF8 */ 0, /* SPACEL */ - 0, /* SPACELUTF8 */ 0, /* NSPACE */ - 0, /* NSPACEUTF8 */ 0, /* NSPACEL */ - 0, /* NSPACELUTF8 */ 0, /* DIGIT */ - 0, /* DIGITUTF8 */ 0, /* DIGITL */ - 0, /* DIGITLUTF8 */ 0, /* NDIGIT */ - 0, /* NDIGITUTF8 */ 0, /* NDIGITL */ - 0, /* NDIGITLUTF8 */ 0, /* CLUMP */ 0, /* BRANCH */ 0, /* BACK */ @@ -351,79 +275,60 @@ static const char * const reg_name[] = { "MEOL", /* 0x7 */ "SEOL", /* 0x8 */ "BOUND", /* 0x9 */ - "BOUNDUTF8", /* 0xa */ - "BOUNDL", /* 0xb */ - "BOUNDLUTF8", /* 0xc */ - "NBOUND", /* 0xd */ - "NBOUNDUTF8", /* 0xe */ - "NBOUNDL", /* 0xf */ - "NBOUNDLUTF8", /* 0x10 */ - "GPOS", /* 0x11 */ - "REG_ANY", /* 0x12 */ - "ANYUTF8", /* 0x13 */ - "SANY", /* 0x14 */ - "SANYUTF8", /* 0x15 */ - "ANYOF", /* 0x16 */ - "ANYOFUTF8", /* 0x17 */ - "ALNUM", /* 0x18 */ - "ALNUMUTF8", /* 0x19 */ - "ALNUML", /* 0x1a */ - "ALNUMLUTF8", /* 0x1b */ - "NALNUM", /* 0x1c */ - "NALNUMUTF8", /* 0x1d */ - "NALNUML", /* 0x1e */ - "NALNUMLUTF8", /* 0x1f */ - "SPACE", /* 0x20 */ - "SPACEUTF8", /* 0x21 */ - "SPACEL", /* 0x22 */ - "SPACELUTF8", /* 0x23 */ - "NSPACE", /* 0x24 */ - "NSPACEUTF8", /* 0x25 */ - "NSPACEL", /* 0x26 */ - "NSPACELUTF8", /* 0x27 */ - "DIGIT", /* 0x28 */ - "DIGITUTF8", /* 0x29 */ - "DIGITL", /* 0x2a */ - "DIGITLUTF8", /* 0x2b */ - "NDIGIT", /* 0x2c */ - "NDIGITUTF8", /* 0x2d */ - "NDIGITL", /* 0x2e */ - "NDIGITLUTF8", /* 0x2f */ - "CLUMP", /* 0x30 */ - "BRANCH", /* 0x31 */ - "BACK", /* 0x32 */ - "EXACT", /* 0x33 */ - "EXACTF", /* 0x34 */ - "EXACTFL", /* 0x35 */ - "NOTHING", /* 0x36 */ - "TAIL", /* 0x37 */ - "STAR", /* 0x38 */ - "PLUS", /* 0x39 */ - "CURLY", /* 0x3a */ - "CURLYN", /* 0x3b */ - "CURLYM", /* 0x3c */ - "CURLYX", /* 0x3d */ - "WHILEM", /* 0x3e */ - "OPEN", /* 0x3f */ - "CLOSE", /* 0x40 */ - "REF", /* 0x41 */ - "REFF", /* 0x42 */ - "REFFL", /* 0x43 */ - "IFMATCH", /* 0x44 */ - "UNLESSM", /* 0x45 */ - "SUSPEND", /* 0x46 */ - "IFTHEN", /* 0x47 */ - "GROUPP", /* 0x48 */ - "LONGJMP", /* 0x49 */ - "BRANCHJ", /* 0x4a */ - "EVAL", /* 0x4b */ - "MINMOD", /* 0x4c */ - "LOGICAL", /* 0x4d */ - "RENUM", /* 0x4e */ - "OPTIMIZED", /* 0x4f */ + "BOUNDL", /* 0xa */ + "NBOUND", /* 0xb */ + "NBOUNDL", /* 0xc */ + "GPOS", /* 0xd */ + "REG_ANY", /* 0xe */ + "SANY", /* 0xf */ + "ANYOF", /* 0x10 */ + "ALNUM", /* 0x11 */ + "ALNUML", /* 0x12 */ + "NALNUM", /* 0x13 */ + "NALNUML", /* 0x14 */ + "SPACE", /* 0x15 */ + "SPACEL", /* 0x16 */ + "NSPACE", /* 0x17 */ + "NSPACEL", /* 0x18 */ + "DIGIT", /* 0x19 */ + "DIGITL", /* 0x1a */ + "NDIGIT", /* 0x1b */ + "NDIGITL", /* 0x1c */ + "CLUMP", /* 0x1d */ + "BRANCH", /* 0x1e */ + "BACK", /* 0x1f */ + "EXACT", /* 0x20 */ + "EXACTF", /* 0x21 */ + "EXACTFL", /* 0x22 */ + "NOTHING", /* 0x23 */ + "TAIL", /* 0x24 */ + "STAR", /* 0x25 */ + "PLUS", /* 0x26 */ + "CURLY", /* 0x27 */ + "CURLYN", /* 0x28 */ + "CURLYM", /* 0x29 */ + "CURLYX", /* 0x2a */ + "WHILEM", /* 0x2b */ + "OPEN", /* 0x2c */ + "CLOSE", /* 0x2d */ + "REF", /* 0x2e */ + "REFF", /* 0x2f */ + "REFFL", /* 0x30 */ + "IFMATCH", /* 0x31 */ + "UNLESSM", /* 0x32 */ + "SUSPEND", /* 0x33 */ + "IFTHEN", /* 0x34 */ + "GROUPP", /* 0x35 */ + "LONGJMP", /* 0x36 */ + "BRANCHJ", /* 0x37 */ + "EVAL", /* 0x38 */ + "MINMOD", /* 0x39 */ + "LOGICAL", /* 0x3a */ + "RENUM", /* 0x3b */ + "OPTIMIZED", /* 0x3c */ }; -static const int reg_num = 80; +static const int reg_num = 61; #endif /* DEBUGGING */ #endif /* REG_COMP_C */ @@ -4522,11 +4522,9 @@ Perl_sv_len_utf8(pTHX_ register SV *sv) if (!sv) return 0; -#ifdef NOTYET if (SvGMAGICAL(sv)) return mg_length(sv); else -#endif { STRLEN len; U8 *s = (U8*)SvPV(sv, len); diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t index ac42b85577..cd9d56a5c4 100644 --- a/t/op/utf8decode.t +++ b/t/op/utf8decode.t @@ -5,6 +5,8 @@ BEGIN { @INC = '../lib'; } +no utf8; # this test contains raw 8-bit data on purpose; don't switch to \x{} + print "1..78\n"; my $test = 1; diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index 6986720aab..89416dcfab 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -10,7 +10,7 @@ BEGIN { } } -print "1..90\n"; +print "1..104\n"; my $test = 1; @@ -42,6 +42,7 @@ sub nok_bytes { { use utf8; + $_ = ">\x{263A}<"; s/([\x{80}-\x{10ffff}])/"&#".ord($1).";"/eg; ok $_, '>☺<'; @@ -106,212 +107,191 @@ sub nok_bytes { } { - use utf8; - - $_ = "\x{263A}>\x{263A}\x{263A}"; - - ok length, 4; - $test++; # 13 - - ok length((m/>(.)/)[0]), 1; - $test++; # 14 - - ok length($&), 2; - $test++; # 15 + # no use utf8 needed + $_ = "\x{263A}\x{263A}x\x{263A}y\x{263A}"; + + ok length($_), 6; # 13 + $test++; - ok length($'), 1; - $test++; # 16 + ($a) = m/x(.)/; - ok length($`), 1; - $test++; # 17 + ok length($a), 1; # 14 + $test++; - ok length($1), 1; - $test++; # 18 + ok length($`), 2; # 15 + $test++; + ok length($&), 2; # 16 + $test++; + ok length($'), 2; # 17 + $test++; - ok length($tmp=$&), 2; - $test++; # 19 + ok length($1), 1; # 18 + $test++; - ok length($tmp=$'), 1; - $test++; # 20 + ok length($b=$`), 2; # 19 + $test++; - ok length($tmp=$`), 1; - $test++; # 21 + ok length($b=$&), 2; # 20 + $test++; - ok length($tmp=$1), 1; - $test++; # 22 + ok length($b=$'), 2; # 21 + $test++; - { - use bytes; + ok length($b=$1), 1; # 22 + $test++; - my $tmp = $&; - ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272); - $test++; # 23 + ok $a, "\x{263A}"; # 23 + $test++; - $tmp = $'; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 24 + ok $`, "\x{263A}\x{263A}"; # 24 + $test++; - $tmp = $`; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 25 + ok $&, "x\x{263A}"; # 25 + $test++; - $tmp = $1; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 26 - } + ok $', "y\x{263A}"; # 26 + $test++; - ok_bytes $&, pack("C*", ord(">"), 0342, 0230, 0272); - $test++; # 27 + ok $1, "\x{263A}"; # 27 + $test++; - ok_bytes $', pack("C*", 0342, 0230, 0272); - $test++; # 28 + ok_bytes $a, "\342\230\272"; # 28 + $test++; - ok_bytes $`, pack("C*", 0342, 0230, 0272); - $test++; # 29 + ok_bytes $1, "\342\230\272"; # 29 + $test++; - ok_bytes $1, pack("C*", 0342, 0230, 0272); - $test++; # 30 + ok_bytes $&, "x\342\230\272"; # 30 + $test++; { - use bytes; - no utf8; - - ok length, 10; - $test++; # 31 + use utf8; # required + $_ = chr(0x263A) . chr(0x263A) . 'x' . chr(0x263A) . 'y' . chr(0x263A); + } - ok length((m/>(.)/)[0]), 1; - $test++; # 32 + ok length($_), 6; # 31 + $test++; - ok length($&), 2; - $test++; # 33 + ($a) = m/x(.)/; - ok length($'), 5; - $test++; # 34 + ok length($a), 1; # 32 + $test++; - ok length($`), 3; - $test++; # 35 + ok length($`), 2; # 33 + $test++; - ok length($1), 1; - $test++; # 36 + ok length($&), 2; # 34 + $test++; - ok $&, pack("C*", ord(">"), 0342); - $test++; # 37 + ok length($'), 2; # 35 + $test++; - ok $', pack("C*", 0230, 0272, 0342, 0230, 0272); - $test++; # 38 + ok length($1), 1; # 36 + $test++; - ok $`, pack("C*", 0342, 0230, 0272); - $test++; # 39 + ok length($b=$`), 2; # 37 + $test++; - ok $1, pack("C*", 0342); - $test++; # 40 - } + ok length($b=$&), 2; # 38 + $test++; - { - no utf8; - $_="\342\230\272>\342\230\272\342\230\272"; - } + ok length($b=$'), 2; # 39 + $test++; - ok length, 10; - $test++; # 41 + ok length($b=$1), 1; # 40 + $test++; - ok length((m/>(.)/)[0]), 1; - $test++; # 42 + ok $a, "\x{263A}"; # 41 + $test++; - ok length($&), 2; - $test++; # 43 + ok $`, "\x{263A}\x{263A}"; # 42 + $test++; - ok length($'), 1; - $test++; # 44 + ok $&, "x\x{263A}"; # 43 + $test++; - ok length($`), 1; - $test++; # 45 + ok $', "y\x{263A}"; # 44 + $test++; - ok length($1), 1; - $test++; # 46 + ok $1, "\x{263A}"; # 45 + $test++; - ok length($tmp=$&), 2; - $test++; # 47 + ok_bytes $a, "\342\230\272"; # 46 + $test++; - ok length($tmp=$'), 1; - $test++; # 48 + ok_bytes $1, "\342\230\272"; # 47 + $test++; - ok length($tmp=$`), 1; - $test++; # 49 + ok_bytes $&, "x\342\230\272"; # 48 + $test++; - ok length($tmp=$1), 1; - $test++; # 50 + $_ = "\342\230\272\342\230\272x\342\230\272y\342\230\272"; - { - use bytes; + ok length($_), 14; # 49 + $test++; - my $tmp = $&; - ok $tmp, pack("C*", ord(">"), 0342, 0230, 0272); - $test++; # 51 + ($a) = m/x(.)/; - $tmp = $'; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 52 + ok length($a), 1; # 50 + $test++; - $tmp = $`; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 53 + ok length($`), 6; # 51 + $test++; - $tmp = $1; - ok $tmp, pack("C*", 0342, 0230, 0272); - $test++; # 54 - } + ok length($&), 2; # 52 + $test++; - { - use bytes; - no utf8; + ok length($'), 6; # 53 + $test++; - ok length, 10; - $test++; # 55 + ok length($1), 1; # 54 + $test++; - ok length((m/>(.)/)[0]), 1; - $test++; # 56 + ok length($b=$`), 6; # 55 + $test++; - ok length($&), 2; - $test++; # 57 + ok length($b=$&), 2; # 56 + $test++; - ok length($'), 5; - $test++; # 58 + ok length($b=$'), 6; # 57 + $test++; - ok length($`), 3; - $test++; # 59 + ok length($b=$1), 1; # 58 + $test++; - ok length($1), 1; - $test++; # 60 + ok $a, "\342"; # 59 + $test++; - ok $&, pack("C*", ord(">"), 0342); - $test++; # 61 + ok $`, "\342\230\272\342\230\272"; # 60 + $test++; - ok $', pack("C*", 0230, 0272, 0342, 0230, 0272); - $test++; # 62 + ok $&, "x\342"; # 61 + $test++; - ok $`, pack("C*", 0342, 0230, 0272); - $test++; # 63 + ok $', "\230\272y\342\230\272"; # 62 + $test++; - ok $1, pack("C*", 0342); - $test++; # 64 - } + ok $1, "\342"; # 63 + $test++; +} +{ + use utf8; ok "\x{ab}" =~ /^\x{ab}$/, 1; - $test++; # 65 + $test++; # 64 } { use utf8; ok_bytes chr(0xe2), pack("C*", 0xc3, 0xa2); - $test++; # 66 + $test++; # 65 } { use utf8; my @a = map ord, split(//, join("", map chr, (1234, 123, 2345))); ok "@a", "1234 123 2345"; - $test++; # 67 + $test++; # 66 } { @@ -319,7 +299,7 @@ sub nok_bytes { my $x = chr(123); my @a = map ord, split(/$x/, join("", map chr, (1234, 123, 2345))); ok "@a", "1234 2345"; - $test++; # 68 + $test++; # 67 } { @@ -331,10 +311,10 @@ sub nok_bytes { { use utf8; $b = "\xe4" } # \xXX must not produce UTF-8 print "not " if $a eq $b; - print "ok $test\n"; $test++; + print "ok $test\n"; $test++; # 68 { use utf8; print "not " if $a eq $b; } - print "ok $test\n"; $test++; + print "ok $test\n"; $test++; # 69 } { @@ -344,7 +324,7 @@ sub nok_bytes { for (@x) { s/(\d+)\s*([\w\-]+)/$1 . uc $2/e; my($latin) = /^(.+)(?:\s+\d)/; - print $latin eq "stra\337e" ? "ok $test\n" : + print $latin eq "stra\337e" ? "ok $test\n" : # 70, 71 "#latin[$latin]\nnot ok $test\n"; $test++; $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a @@ -369,7 +349,7 @@ sub nok_bytes { } print "not " unless $r eq " U+B36C U+5A8C U+FF5B U+5079 U+505B"; - print "ok $test\n"; + print "ok $test\n"; # 72 $test++; } @@ -384,27 +364,27 @@ sub nok_bytes { print "not " unless $a eq "\x20" && $b eq "\x{80}\x{100}\x{80}" && $c eq $a; print "ok $test\n"; - $test++; + $test++; # 73 my ($a, $b) = split(/\x{100}/, $s); print "not " unless $a eq "\x20\x40\x{80}" && $b eq "\x{80}\x40\x20"; print "ok $test\n"; - $test++; + $test++; # 74 my ($a, $b) = split(/\x{80}\x{100}\x{80}/, $s); print "not " unless $a eq "\x20\x40" && $b eq "\x40\x20"; print "ok $test\n"; - $test++; + $test++; # 75 my ($a, $b) = split(/\x40\x{80}/, $s); print "not " unless $a eq "\x20" && $b eq "\x{100}\x{80}\x40\x20"; print "ok $test\n"; - $test++; + $test++; # 76 my ($a, $b, $c) = split(/[\x40\x{80}]+/, $s); print "not " unless $a eq "\x20" && $b eq "\x{100}" && $c eq "\x20"; print "ok $test\n"; - $test++; + $test++; # 77 } { @@ -414,14 +394,14 @@ sub nok_bytes { my $smiley = "\x{263a}"; - for my $s ("\x{263a}", # 1 - $smiley, # 2 + for my $s ("\x{263a}", # 78 + $smiley, # 79 - "" . $smiley, # 3 - "" . "\x{263a}", # 4 + "" . $smiley, # 80 + "" . "\x{263a}", # 81 - $smiley . "", # 5 - "\x{263a}" . "", # 6 + $smiley . "", # 82 + "\x{263a}" . "", # 83 ) { my $length_chars = length($s); my $length_bytes; @@ -437,14 +417,14 @@ sub nok_bytes { $test++; } - for my $s ("\x{263a}" . "\x{263a}", # 7 - $smiley . $smiley, # 8 + for my $s ("\x{263a}" . "\x{263a}", # 84 + $smiley . $smiley, # 85 - "\x{263a}\x{263a}", # 9 - "$smiley$smiley", # 10 + "\x{263a}\x{263a}", # 86 + "$smiley$smiley", # 87 - "\x{263a}" x 2, # 11 - $smiley x 2, # 12 + "\x{263a}" x 2, # 88 + $smiley x 2, # 89 ) { my $length_chars = length($s); my $length_bytes; @@ -460,3 +440,106 @@ sub nok_bytes { $test++; } } + +{ + use utf8; + + print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\xd4"; + print "ok $test\n"; + $test++; # 90 + + print "not " unless "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}"; + print "ok $test\n"; + $test++; # 91 + + print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}"; + print "ok $test\n"; + $test++; # 92 + + print "not " unless "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\xd4"; + print "ok $test\n"; + $test++; # 93 + + print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4"; + print "ok $test\n"; + $test++; # 94 + + print "not " unless "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}"; + print "ok $test\n"; + $test++; # 95 + + print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}"; + print "ok $test\n"; + $test++; # 96 + + print "not " unless "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4"; + print "ok $test\n"; + $test++; # 97 +} + +{ + # the first half of 20001028.003 + + my $X = chr(1448); + my ($Y) = $X =~ /(.*)/; + print "not " unless length $Y == 1; + print "ok $test\n"; + $test++; # 98 +} + +{ + # 20001108.001 + + use utf8; + my $X = "Szab\x{f3},Bal\x{e1}zs"; + my $Y = $X; + $Y =~ s/(B)/$1/ for 0..3; + print "not " unless $Y eq $X; + print "ok $test\n"; + $test++; # 99 +} + +{ + # 20001114.001 + + use utf8; + use charnames ':full'; + my $text = "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}"; + print "not " unless ord($text) == 0xc4; + print "ok $test\n"; + $test++; # 100 +} + +{ + # 20001205.014 + + use utf8; + + my $a = "ABC\x{263A}"; + + my @b = split( //, $a ); + + print "not " unless @b == 4; + print "ok $test\n"; + $test++; # 101 + + print "not " unless length($b[3]) == 1; + print "ok $test\n"; + $test++; # 102 + + $a =~ s/^A/Z/; + print "not " unless length($a) == 4; + print "ok $test\n"; + $test++; # 103 +} + +{ + # the second half of 20001028.003 + + use utf8; + $X =~ s/^/chr(1488)/e; + print "not " unless length $X == 1; + print "ok $test\n"; + $test++; # 104 +} + |