diff options
author | Yves Orton <demerphq@gmail.com> | 2007-04-23 01:34:55 +0200 |
---|---|---|
committer | Rafael Garcia-Suarez <rgarciasuarez@gmail.com> | 2007-04-23 09:04:31 +0000 |
commit | e1d1eefb8c88e0dcaf2bb9e6c04d7f6192be966f (patch) | |
tree | 6930eda70a9e751e8789ca568d015204ca292552 /regexec.c | |
parent | 8638e433f90c5046b38f1d145c125bad5dbfdb7e (diff) | |
download | perl-e1d1eefb8c88e0dcaf2bb9e6c04d7f6192be966f.tar.gz |
Change meaning of \v, \V, and add \h, \H to match Perl6, add \R to match PCRE and unicode tr18
Message-ID: <9b18b3110704221434g43457742p28cab00289f83639@mail.gmail.com>
p4raw-id: //depot/perl@31026
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 136 |
1 files changed, 135 insertions, 1 deletions
@@ -1110,6 +1110,15 @@ REXEC_FBC_SCAN( \ if ((!reginfo || regtry(reginfo, &s))) \ goto got_it +#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ + if (do_utf8) { \ + REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ + } \ + else { \ + REXEC_FBC_CLASS_SCAN(CoNd); \ + } \ + break + #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \ if (do_utf8) { \ UtFpReLoAd; \ @@ -1425,6 +1434,31 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, !isDIGIT_LC_utf8((U8*)s), !isDIGIT_LC(*s) ); + case LNBREAK: + REXEC_FBC_CSCAN( + is_LNBREAK_utf8(s), + is_LNBREAK_latin1(s) + ); + case VERTWS: + REXEC_FBC_CSCAN( + is_VERTWS_utf8(s), + is_VERTWS_latin1(s) + ); + case NVERTWS: + REXEC_FBC_CSCAN( + !is_VERTWS_utf8(s), + !is_VERTWS_latin1(s) + ); + case HORIZWS: + REXEC_FBC_CSCAN( + is_HORIZWS_utf8(s), + is_HORIZWS_latin1(s) + ); + case NHORIZWS: + REXEC_FBC_CSCAN( + !is_HORIZWS_utf8(s), + !is_HORIZWS_latin1(s) + ); case AHOCORASICKC: case AHOCORASICK: { @@ -3207,8 +3241,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * pack("U0U*", 0xDF) =~ /ss/i, * the 0xC3 0x9F are the UTF-8 * byte sequence for the U+00DF. */ + if (!(do_utf8 && - toLOWER(s[0]) == 's' && + toLOWER(s[0]) == 's' && ln >= 2 && toLOWER(s[1]) == 's' && (U8)l[0] == 0xC3 && @@ -4972,6 +5007,35 @@ NULL /* NOTREACHED */ #undef ST + case LNBREAK: + if ((n=is_LNBREAK(locinput,do_utf8))) { + locinput += n; + nextchr = UCHARAT(locinput); + } else + sayNO; + break; + +#define CASE_CLASS(nAmE) \ + case nAmE: \ + if ((n=is_##nAmE(locinput,do_utf8))) { \ + locinput += n; \ + nextchr = UCHARAT(locinput); \ + } else \ + sayNO; \ + break; \ + case N##nAmE: \ + if ((n=is_##nAmE(locinput,do_utf8))) { \ + sayNO; \ + } else { \ + locinput += UTF8SKIP(locinput); \ + nextchr = UCHARAT(locinput); \ + } \ + break + + CASE_CLASS(VERTWS); + CASE_CLASS(HORIZWS); +#undef CASE_CLASS + default: PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", PTR2UV(scan), OP(scan)); @@ -5382,7 +5446,77 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) while (scan < loceol && !isDIGIT(*scan)) scan++; } + case LNBREAK: + if (do_utf8) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) { + scan += c; + hardcount++; + } + } else { + /* + LNBREAK can match two latin chars, which is ok, + because we have a null terminated string, but we + have to use hardcount in this situation + */ + while (scan < loceol && (c=is_LNBREAK_latin1(scan))) { + scan+=c; + hardcount++; + } + } + break; + case HORIZWS: + if (do_utf8) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) { + scan += c; + hardcount++; + } + } else { + while (scan < loceol && is_HORIZWS_latin1(scan)) + scan++; + } break; + case NHORIZWS: + if (do_utf8) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !is_HORIZWS_latin1(scan)) + scan++; + + } + break; + case VERTWS: + if (do_utf8) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) { + scan += c; + hardcount++; + } + } else { + while (scan < loceol && is_VERTWS_latin1(scan)) + scan++; + + } + break; + case NVERTWS: + if (do_utf8) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !is_VERTWS_latin1(scan)) + scan++; + + } + break; + default: /* Called on something of 0 width. */ break; /* So match right here or not at all. */ } |