diff options
author | Karl Williamson <public@khwilliamson.com> | 2010-09-23 23:36:40 -0600 |
---|---|---|
committer | Jesse Vincent <jesse@bestpractical.com> | 2010-10-15 23:14:29 +0900 |
commit | a12cf05f80a65e40fe339b086ab2d10e18d838c1 (patch) | |
tree | bd1254d24bac6bb121801a2a06d01c7e17703b92 /regexec.c | |
parent | bdc22dd52e899130c8c4111c985fcbd7eec164a5 (diff) | |
download | perl-a12cf05f80a65e40fe339b086ab2d10e18d838c1.tar.gz |
Subject: [perl #58182] partial: Add uni \s,\w matching
This commit causes regex sequences \b, \s, and \w (and complements) to
match in the latin1 range in the scope of feature 'unicode_strings' or
with the /u regex modifier.
It uses the previously unused flags field in the respective regnodes to
indicate the type of matching, and in regexec.c, uses that to decide
which of the handy.h macros to use, native or Latin1.
I chose this for now rather than create new nodes for each type of
match. An earlier version of this patch did that, and in every case the
switch case: statements were adjacent, offering no performance
advantage. If regexec were modified to use in-line functions or more
macros for various short section of it, then it would be faster to have
new nodes rather than using the flags field. But, using that field
simplified things, as this change flies under the radar in a number of
places where it would not if separate nodes were used.
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 102 |
1 files changed, 73 insertions, 29 deletions
@@ -202,7 +202,7 @@ nextchr = UCHARAT(locinput); \ break; \ } \ - /* Finished up by macro calling this one */ + /* Drops through to the macro that calls this one */ #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ @@ -1528,12 +1528,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } ); } - else { + else { /* Not utf8 */ tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); + tmp = cBOOL((OP(c) == BOUNDL) + ? isALNUM_LC(tmp) + : (isWORDCHAR_L1(tmp) + && (isASCII(tmp) || (FLAGS(c) & USE_UNI)))); REXEC_FBC_SCAN( if (tmp == - !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { + !((OP(c) == BOUNDL) + ? isALNUM_LC(*s) + : (isWORDCHAR_L1((U8) *s) + && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))))) + { tmp = !tmp; REXEC_FBC_TRYIT; } @@ -1566,12 +1573,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } else { tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == NBOUND ? - isALNUM(tmp) : isALNUM_LC(tmp)) != 0); + tmp = cBOOL((OP(c) == NBOUNDL) + ? isALNUM_LC(tmp) + : (isWORDCHAR_L1(tmp) + && (isASCII(tmp) || (FLAGS(c) & USE_UNI)))); REXEC_FBC_SCAN( - if (tmp == - !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) + if (tmp == ! cBOOL( + (OP(c) == NBOUNDL) + ? isALNUM_LC(*s) + : (isWORDCHAR_L1((U8) *s) + && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))))) + { tmp = !tmp; + } else REXEC_FBC_TRYIT; ); } @@ -1582,7 +1596,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_WORD(), swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target), - isALNUM(*s) + (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s) ); case ALNUML: REXEC_FBC_CSCAN_TAINT( @@ -1593,7 +1607,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_WORD(), !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target), - !isALNUM(*s) + ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)) ); case NALNUML: REXEC_FBC_CSCAN_TAINT( @@ -1604,7 +1618,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_SPACE(), *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target), - isSPACE(*s) + isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)) ); case SPACEL: REXEC_FBC_CSCAN_TAINT( @@ -1615,7 +1629,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_SPACE(), !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)), - !isSPACE(*s) + !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))) ); case NSPACEL: REXEC_FBC_CSCAN_TAINT( @@ -3591,7 +3605,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) else { ln = (locinput != PL_bostr) ? UCHARAT(locinput - 1) : '\n'; - if (OP(scan) == BOUND || OP(scan) == NBOUND) { + if (FLAGS(scan) & USE_UNI) { + + /* Here, can't be BOUNDL or NBOUNDL because they never set + * the flags to USE_UNI */ + ln = isWORDCHAR_L1(ln); + n = isWORDCHAR_L1(nextchr); + } + else if (OP(scan) == BOUND || OP(scan) == NBOUND) { ln = isALNUM(ln); n = isALNUM(nextchr); } @@ -3638,11 +3659,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) sayNO; break; /* Special char classes - The defines start on line 129 or so */ - CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); - CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); + CCC_TRY_AFF_U( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC); + CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC); - CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); - CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); + CCC_TRY_AFF_U( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC); + CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC); CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); @@ -5765,13 +5786,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && - swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) { + swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && isWORDCHAR_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && isALNUM(*scan)) - scan++; + while (scan < loceol && isALNUM((U8) *scan)) { + scan++; + } } break; case ALNUML: @@ -5793,13 +5820,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && - !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) { + !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && !isALNUM(*scan)) - scan++; + while (scan < loceol && ! isALNUM((U8) *scan)) { + scan++; + } } break; case NALNUML: @@ -5822,13 +5855,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && (*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) { + swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && isSPACE_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && isSPACE(*scan)) - scan++; + while (scan < loceol && isSPACE((U8) *scan)) + scan++; } break; case SPACEL: @@ -5851,13 +5889,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && !(*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) { + swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && ! isSPACE_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && !isSPACE(*scan)) - scan++; + while (scan < loceol && ! isSPACE((U8) *scan)) { + scan++; + } } break; case NSPACEL: |