diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-02-12 19:23:34 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-02-14 08:41:39 -0700 |
commit | fac1af778e9882ea26e6b17438a0fa0b41157116 (patch) | |
tree | 8bb8b8fdd667f0a21318e9bb49ce38c08c3c01b5 /regexec.c | |
parent | a33c29bcc3e80d893a599165c1ee5ad27029365e (diff) | |
download | perl-fac1af778e9882ea26e6b17438a0fa0b41157116.tar.gz |
regexec.c: refactor find-by-class EXACTish code
This code is way out-of-date, using upper and lower case instead of fold-case.
Diffstat (limited to 'regexec.c')
-rw-r--r-- | regexec.c | 126 |
1 file changed, 112 insertions, 14 deletions
@@ -1256,8 +1256,8 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ if ( f != c \ && (f == c1 || f == c2) \ && (ln == len || \ - foldEQ_utf8(s, &my_strend, 0, utf8_target,\ - m, NULL, ln, cBOOL(UTF_PATTERN)))\ + foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,\ + m, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags))\ && (!reginfo || regtry(reginfo, &s)) ) \ goto got_it; \ } \ @@ -1266,17 +1266,9 @@ s += len #define REXEC_FBC_EXACTISH_SCAN(CoNd) \ STMT_START { \ - re_fold_t folder; \ - switch (OP(c)) { \ - case EXACTFU: folder = foldEQ_latin1; break; \ - case EXACTFL: folder = foldEQ_locale; break; \ - case EXACTF: folder = foldEQ; break; \ - default: \ - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \ - } \ while (s <= e) { \ if ( (CoNd) \ - && (ln == 1 || folder(s, m, ln)) \ + && (ln == 1 || folder(s, pat_string, ln)) \ && (!reginfo || regtry(reginfo, &s)) ) \ goto got_it; \ s++; \ @@ -1447,15 +1439,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, { dVAR; const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; - char *m; + char *pat_string; /* The pattern's exactish string */ + char *pat_end; /* ptr to end char of pat_string */ + re_fold_t folder; /* Function for computing non-utf8 folds */ + const U8 *fold_array; /* array for folding ords < 256 */ STRLEN ln; STRLEN lnc; register STRLEN uskip; - unsigned int c1; - unsigned int c2; + U8 c1; + U8 c2; char *e; register I32 tmp = 1; /* Scratch variable? */ register const bool utf8_target = PL_reg_match_utf8; + UV utf8_fold_flags; RXi_GET_DECL(prog,progi); PERL_ARGS_ASSERT_FIND_BYCLASS; @@ -1498,7 +1494,108 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; case EXACTFU: + if (UTF_PATTERN || utf8_target) { + utf8_fold_flags = 0; + goto do_exactf_utf8; + } + fold_array = PL_fold_latin1; + folder = foldEQ_latin1; + /* XXX This uses the full utf8 fold because if the pattern contains + * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string. 
+ * There could be a new node type, say EXACTFU_SS, which is + * generated by regcomp only if there is an 'ss', and then every + * other case could goto do_exactf_non_utf8;*/ + goto do_exactf_utf8; + case EXACTF: + if (UTF_PATTERN || utf8_target) { + utf8_fold_flags = 0; + goto do_exactf_utf8; + } + fold_array = PL_fold; + folder = foldEQ; + goto do_exactf_non_utf8; + + case EXACTFL: + if (UTF_PATTERN || utf8_target) { + utf8_fold_flags = 0; /* XXX, add new flag for locale */ + goto do_exactf_utf8; + } + fold_array = PL_fold_locale; + folder = foldEQ_locale; + + /* FALL THROUGH */ + + do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */ + + /* The idea in the non-utf8 EXACTF* cases is to first find the + * first character of the EXACTF* node and then, if necessary, + * case-insensitively compare the full text of the node. c1 is the + * first character. c2 is its fold. This logic will not work for + * Unicode semantics and the german sharp ss, which hence should + * not be compiled into a node that gets here. */ + pat_string = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + + e = HOP3c(strend, -((I32)ln), s); + + if (!reginfo && e < s) { + e = s; /* Due to minlen logic of intuit() */ + } + + c1 = *pat_string; + c2 = fold_array[c1]; + if (c1 == c2) { /* If char and fold are the same */ + REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1); + } + else { + REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2); + } + break; + + do_exactf_utf8: + + /* If one of the operands is in utf8, we can't use the simpler + * folding above, due to the fact that many different characters + * can have the same fold, or portion of a fold, or different- + * length fold */ + pat_string = STRING(c); + ln = STR_LEN(c); /* length to match in octets/bytes */ + pat_end = pat_string + ln; + lnc = (UTF_PATTERN) /* length to match in characters */ + ? 
utf8_length((U8 *) pat_string, (U8 *) pat_end) + : ln; + + e = HOP3c(strend, -((I32)lnc), s); + + if (!reginfo && e < s) { + e = s; /* Due to minlen logic of intuit() */ + } + + while (s <= e) { + char *my_strend= (char *)strend; + if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, + pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags) + && (!reginfo || regtry(reginfo, &s)) ) + { + goto got_it; + } + s += UTF8SKIP(s); + } + break; + + +#if 0 + case EXACTFA: + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf_non_locale; + + case EXACTFU: + case EXACTF: + utf8_fold_flags = 0; + + do_exactf_non_locale: + m = STRING(c); ln = STR_LEN(c); /* length to match in octets/bytes */ lnc = (I32) ln; /* length to match in characters */ @@ -1625,6 +1722,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2); } break; +#endif case BOUNDL: PL_reg_flags |= RF_tainted; FBC_BOUND(isALNUM_LC, |