summaryrefslogtreecommitdiff
path: root/regexec.c
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2011-02-12 19:23:34 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-02-14 08:41:39 -0700
commit: fac1af778e9882ea26e6b17438a0fa0b41157116 (patch)
tree: 8bb8b8fdd667f0a21318e9bb49ce38c08c3c01b5 /regexec.c
parent: a33c29bcc3e80d893a599165c1ee5ad27029365e (diff)
download: perl-fac1af778e9882ea26e6b17438a0fa0b41157116.tar.gz
regexec.c: refactor find-by-class EXACTish code
This code is way out-of-date, using upper and lower case instead of fold-case.
Diffstat (limited to 'regexec.c')
-rw-r--r-- regexec.c 126
1 files changed, 112 insertions, 14 deletions
diff --git a/regexec.c b/regexec.c
index e5e6e276f6..a7f55267c7 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1256,8 +1256,8 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
if ( f != c \
&& (f == c1 || f == c2) \
&& (ln == len || \
- foldEQ_utf8(s, &my_strend, 0, utf8_target,\
- m, NULL, ln, cBOOL(UTF_PATTERN)))\
+ foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,\
+ m, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags))\
&& (!reginfo || regtry(reginfo, &s)) ) \
goto got_it; \
} \
@@ -1266,17 +1266,9 @@ s += len
#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
STMT_START { \
- re_fold_t folder; \
- switch (OP(c)) { \
- case EXACTFU: folder = foldEQ_latin1; break; \
- case EXACTFL: folder = foldEQ_locale; break; \
- case EXACTF: folder = foldEQ; break; \
- default: \
- Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \
- } \
while (s <= e) { \
if ( (CoNd) \
- && (ln == 1 || folder(s, m, ln)) \
+ && (ln == 1 || folder(s, pat_string, ln)) \
&& (!reginfo || regtry(reginfo, &s)) ) \
goto got_it; \
s++; \
@@ -1447,15 +1439,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
{
dVAR;
const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
- char *m;
+ char *pat_string; /* The pattern's exactish string */
+ char *pat_end; /* ptr to just past the last byte of pat_string */
+ re_fold_t folder; /* Function for computing non-utf8 folds */
+ const U8 *fold_array; /* array for folding ords < 256 */
STRLEN ln;
STRLEN lnc;
register STRLEN uskip;
- unsigned int c1;
- unsigned int c2;
+ U8 c1;
+ U8 c2;
char *e;
register I32 tmp = 1; /* Scratch variable? */
register const bool utf8_target = PL_reg_match_utf8;
+ UV utf8_fold_flags;
RXi_GET_DECL(prog,progi);
PERL_ARGS_ASSERT_FIND_BYCLASS;
@@ -1498,7 +1494,108 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
);
break;
case EXACTFU:
+ if (UTF_PATTERN || utf8_target) {
+ utf8_fold_flags = 0;
+ goto do_exactf_utf8;
+ }
+ fold_array = PL_fold_latin1;
+ folder = foldEQ_latin1;
+ /* XXX This uses the full utf8 fold because if the pattern contains
+ * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
+ * There could be a new node type, say EXACTFU_SS, which is
+ * generated by regcomp only if there is an 'ss', and then every
+ * other case could goto do_exactf_non_utf8;*/
+ goto do_exactf_utf8;
+
case EXACTF:
+ if (UTF_PATTERN || utf8_target) {
+ utf8_fold_flags = 0;
+ goto do_exactf_utf8;
+ }
+ fold_array = PL_fold;
+ folder = foldEQ;
+ goto do_exactf_non_utf8;
+
+ case EXACTFL:
+ if (UTF_PATTERN || utf8_target) {
+ utf8_fold_flags = 0; /* XXX, add new flag for locale */
+ goto do_exactf_utf8;
+ }
+ fold_array = PL_fold_locale;
+ folder = foldEQ_locale;
+
+ /* FALL THROUGH */
+
+ do_exactf_non_utf8: /* Neither pattern nor string is UTF8 */
+
+ /* The idea in the non-utf8 EXACTF* cases is to first find the
+ * first character of the EXACTF* node and then, if necessary,
+ * case-insensitively compare the full text of the node. c1 is the
+ * first character. c2 is its fold. This logic will not work for
+ * Unicode semantics and the German sharp s (which can match 'ss'),
+ * which hence should not be compiled into a node that gets here. */
+ pat_string = STRING(c);
+ ln = STR_LEN(c); /* length to match in octets/bytes */
+
+ e = HOP3c(strend, -((I32)ln), s);
+
+ if (!reginfo && e < s) {
+ e = s; /* Due to minlen logic of intuit() */
+ }
+
+ c1 = *pat_string;
+ c2 = fold_array[c1];
+ if (c1 == c2) { /* If char and fold are the same */
+ REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
+ }
+ else {
+ REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
+ }
+ break;
+
+ do_exactf_utf8:
+
+ /* If one of the operands is in utf8, we can't use the simpler
+ * folding above, due to the fact that many different characters
+ * can have the same fold, or portion of a fold, or different-
+ * length fold */
+ pat_string = STRING(c);
+ ln = STR_LEN(c); /* length to match in octets/bytes */
+ pat_end = pat_string + ln;
+ lnc = (UTF_PATTERN) /* length to match in characters */
+ ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
+ : ln;
+
+ e = HOP3c(strend, -((I32)lnc), s);
+
+ if (!reginfo && e < s) {
+ e = s; /* Due to minlen logic of intuit() */
+ }
+
+ while (s <= e) {
+ char *my_strend= (char *)strend;
+ if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
+ pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
+ && (!reginfo || regtry(reginfo, &s)) )
+ {
+ goto got_it;
+ }
+ s += UTF8SKIP(s);
+ }
+ break;
+
+
+#if 0
+ case EXACTFA:
+ utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
+ goto do_exactf_non_locale;
+
+ case EXACTFU:
+ case EXACTF:
+ utf8_fold_flags = 0;
+
+ do_exactf_non_locale:
+
m = STRING(c);
ln = STR_LEN(c); /* length to match in octets/bytes */
lnc = (I32) ln; /* length to match in characters */
@@ -1625,6 +1722,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
}
break;
+#endif
case BOUNDL:
PL_reg_flags |= RF_tainted;
FBC_BOUND(isALNUM_LC,