summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2012-02-14 00:46:10 +0100
committerYves Orton <demerphq@gmail.com>2012-02-14 00:46:10 +0100
commit62d9fc4e33d0b7108f75401989e52db8b0c37710 (patch)
tree2796289fbc7415d298c9b05a254326a536573cd6
parenta58a85fab78d767203f1dac26cbf0717d0c47e87 (diff)
downloadperl-smoke-me/trie-exactfu.tar.gz
Minimal patch to allow the trie logic to handle EXACTFU nodes (branch: smoke-me/trie-exactfu)
The old logic was that we would trie EXACTF nodes only when they were UTF-8. When /u support was added, things were changed so that EXACTFU nodes are produced in this case; however, they are also produced when we are using /u on a non-UTF-8 pattern. This patch teaches the trie logic to handle this case, so such a pattern is now also converted to a trie. Note: this patch is deliberately minimal; a future patch should review this logic for further improvements.
-rw-r--r--embed.fnc4
-rw-r--r--embed.h4
-rw-r--r--proto.h14
-rw-r--r--regcomp.c13
4 files changed, 23 insertions, 12 deletions
diff --git a/embed.fnc b/embed.fnc
index 34aa251d4e..57ab4b71e1 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -604,7 +604,9 @@ Ap |UV |to_uni_upper |UV c|NN U8 *p|NN STRLEN *lenp
Ap |UV |to_uni_title |UV c|NN U8 *p|NN STRLEN *lenp
#ifdef PERL_IN_UTF8_C
sR |U8 |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp
-p |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
+#endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C)
+EXp |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
#endif
#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
p |UV |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
diff --git a/embed.h b/embed.h
index d429c8d0ea..e20081cb94 100644
--- a/embed.h
+++ b/embed.h
@@ -980,6 +980,9 @@
#define reghop4 S_reghop4
# endif
# endif
+# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C)
+#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d)
+# endif
# if defined(PERL_OLD_COPY_ON_WRITE)
#define sv_setsv_cow(a,b) Perl_sv_setsv_cow(aTHX_ a,b)
# endif
@@ -1597,7 +1600,6 @@
#define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d)
# endif
# if defined(PERL_IN_UTF8_C)
-#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d)
#define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
#define is_utf8_char_slow S_is_utf8_char_slow
#define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index 84bfbf4982..e2a79d7435 100644
--- a/proto.h
+++ b/proto.h
@@ -7115,12 +7115,6 @@ STATIC bool S_isa_lookup(pTHX_ HV *stash, const char * const name, STRLEN len, U
#endif
#if defined(PERL_IN_UTF8_C)
-PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags)
- __attribute__nonnull__(pTHX_2)
- __attribute__nonnull__(pTHX_3);
-#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \
- assert(p); assert(lenp)
-
STATIC UV S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
__attribute__warn_unused_result__
__attribute__nonnull__(pTHX_1)
@@ -7161,6 +7155,14 @@ PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *le
assert(p); assert(lenp)
#endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C)
+PERL_CALLCONV UV Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const bool flags)
+ __attribute__nonnull__(pTHX_2)
+ __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \
+ assert(p); assert(lenp)
+
+#endif
#if defined(PERL_IN_UTIL_C)
STATIC bool S_ckwarn_common(pTHX_ U32 w);
STATIC const COP* S_closest_cop(pTHX_ const COP *cop, const OP *o)
diff --git a/regcomp.c b/regcomp.c
index b33eddac88..985279c0e4 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1380,7 +1380,7 @@ is the recommended Unicode-aware way of saying
#define TRIE_READ_CHAR STMT_START { \
wordlen++; \
- if ( UTF ) { \
+ if ( UTF || flags == EXACTFU ) { \
if ( folder ) { \
if ( foldlen > 0 ) { \
uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
@@ -1388,8 +1388,13 @@ is the recommended Unicode-aware way of saying
scan += len; \
len = 0; \
} else { \
- len = UTF8SKIP(uc);\
- uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
+ if (UTF) { \
+ len = UTF8SKIP(uc); \
+ uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
+ } else { \
+ len = 1; \
+ uvc = _to_fold_latin1((U32)*uc, foldbuf, &foldlen, 1); \
+ } \
foldlen -= UNISKIP( uvc ); \
scan = foldbuf + UNISKIP( uvc ); \
} \
@@ -3289,7 +3294,7 @@ Note that join_exact() assumes that the other types of EXACTFish nodes are not
used in tries, so that would have to be updated if this changed
*/
-#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
+#define TRIE_TYPE_IS_SAFE (optype == EXACTFU || optype==EXACT)
if ( last && TRIE_TYPE_IS_SAFE ) {
make_trie( pRExC_state,